From 349c004e3d31fda23ad225b61861be38047fff16 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sat, 12 Mar 2011 12:50:10 +0100 Subject: x86: A fast way to check capabilities of the current cpu Add this_cpu_has() which determines if the current cpu has a certain ability using a segment prefix and a bit test operation. For that we need to add bit operations to x86's percpu.h. Many uses of cpu_has use a pointer passed to a function to determine the current flags. That is no longer necessary after this patch. However, this patch only converts the straightforward cases where cpu_has is used with this_cpu_ptr. The rest is work for later. -tj: Rolled up patch to add x86_ prefix and use percpu_read() instead of percpu_read_stable(). Signed-off-by: Christoph Lameter Acked-by: Tejun Heo Signed-off-by: Tejun Heo --- arch/x86/include/asm/cpufeature.h | 13 +++++++++---- arch/x86/include/asm/percpu.h | 27 +++++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 4 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 91f3e087cf2..50c0d30e676 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -207,8 +207,7 @@ extern const char * const x86_power_flags[32]; #define test_cpu_cap(c, bit) \ test_bit(bit, (unsigned long *)((c)->x86_capability)) -#define cpu_has(c, bit) \ - (__builtin_constant_p(bit) && \ +#define REQUIRED_MASK_BIT_SET(bit) \ ( (((bit)>>5)==0 && (1UL<<((bit)&31) & REQUIRED_MASK0)) || \ (((bit)>>5)==1 && (1UL<<((bit)&31) & REQUIRED_MASK1)) || \ (((bit)>>5)==2 && (1UL<<((bit)&31) & REQUIRED_MASK2)) || \ @@ -218,10 +217,16 @@ extern const char * const x86_power_flags[32]; (((bit)>>5)==6 && (1UL<<((bit)&31) & REQUIRED_MASK6)) || \ (((bit)>>5)==7 && (1UL<<((bit)&31) & REQUIRED_MASK7)) || \ (((bit)>>5)==8 && (1UL<<((bit)&31) & REQUIRED_MASK8)) || \ - (((bit)>>5)==9 && (1UL<<((bit)&31) & REQUIRED_MASK9)) ) \ - ? 1 : \ + (((bit)>>5)==9 && (1UL<<((bit)&31) & REQUIRED_MASK9)) ) + +#define cpu_has(c, bit) \ + (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \ test_cpu_cap(c, bit)) +#define this_cpu_has(bit) \ + (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \ + x86_this_cpu_test_bit(bit, (unsigned long *)&cpu_info.x86_capability)) + #define boot_cpu_has(bit) cpu_has(&boot_cpu_data, bit) #define set_cpu_cap(c, bit) set_bit(bit, (unsigned long *)((c)->x86_capability)) diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index d475b4398d8..76042d98159 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -542,6 +542,33 @@ do { \ old__; \ }) +static __always_inline int x86_this_cpu_constant_test_bit(unsigned int nr, + const unsigned long __percpu *addr) +{ + unsigned long __percpu *a = (unsigned long *)addr + nr / BITS_PER_LONG; + + return ((1UL << (nr % BITS_PER_LONG)) & percpu_read(*a)) != 0; +} + +static inline int x86_this_cpu_variable_test_bit(int nr, + const unsigned long __percpu *addr) +{ + int oldbit; + + asm volatile("bt "__percpu_arg(2)",%1\n\t" + "sbb %0,%0" + : "=r" (oldbit) + : "m" (*(unsigned long *)addr), "Ir" (nr)); + + return oldbit; +} + +#define x86_this_cpu_test_bit(nr, addr) \ + (__builtin_constant_p((nr)) \ + ? x86_this_cpu_constant_test_bit((nr), (addr)) \ + : x86_this_cpu_variable_test_bit((nr), (addr))) + + #include /* We can use this directly for local CPU (faster). */
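For illustration, a minimal usage sketch (a hypothetical caller, not part of the patch, assuming a non-preemptible context): when the feature bit is a compile-time constant covered by the REQUIRED_MASK* words, this_cpu_has() folds to 1 at compile time; otherwise it becomes a single gs-prefixed access to the percpu cpu_info capability words, either a masked read (constant bit) or a bt instruction whose carry flag "sbb %0,%0" turns into 0 or -1 (variable bit).

/* Hypothetical illustration only -- not part of the patch above. */
#include <linux/kernel.h>
#include <linux/smp.h>
#include <asm/processor.h>
#include <asm/cpufeature.h>

static bool have_aperfmperf(void)
{
	/* Old style: look up this CPU's cpuinfo and pass the pointer around. */
	struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
	bool old_way = cpu_has(c, X86_FEATURE_APERFMPERF);

	/* New style: one segment-prefixed bit test against percpu cpu_info. */
	bool new_way = this_cpu_has(X86_FEATURE_APERFMPERF);

	WARN_ON(old_way != new_way);	/* the two tests must agree */
	return new_way;
}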
-- cgit v1.2.3 From 052936080c8fb2f791103995b21bd4018c8df886 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 1 Apr 2011 11:15:12 +0200 Subject: x86-64, NUMA: Remove custom phys_to_nid() implementation phys_to_nid() maps a physical address to a NUMA node id. This is implemented by building a perfect hash in compute_hash_shift() during initialization. However, with the SPARSE memory model, the nid is encoded in page flags. The perfect hash implementation was for the DISCONTIG memory model, which got removed years ago by b263295dbf (x86: 64-bit, make sparsemem vmemmap the only memory model). So, the perfect hash ends up being used only during initialization, when the core SPARSE code already provides a perfectly acceptable generic early_pfn_to_nid() implementation. Drop phys_to_nid() and use the generic early_pfn_to_nid() instead. Signed-off-by: Tejun Heo Reviewed-by: Christoph Lameter Acked-by: Yinghai Lu Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner --- arch/x86/Kconfig | 4 -- arch/x86/include/asm/mmzone_64.h | 23 -------- arch/x86/mm/numa_64.c | 123 +--------------------------------------- 3 files changed, 1 insertion(+), 149 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cc6c53a95bf..b034814bf97 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1703,10 +1703,6 @@ config ARCH_ENABLE_MEMORY_HOTREMOVE def_bool y depends on MEMORY_HOTPLUG -config HAVE_ARCH_EARLY_PFN_TO_NID - def_bool X86_64 - depends on NUMA - config USE_PERCPU_NUMA_NODE_ID def_bool y depends on NUMA diff --git a/arch/x86/include/asm/mmzone_64.h b/arch/x86/include/asm/mmzone_64.h index 288b96f815a..b3f88d7867c 100644 --- a/arch/x86/include/asm/mmzone_64.h +++ b/arch/x86/include/asm/mmzone_64.h @@ -4,36 +4,13 @@ #ifndef _ASM_X86_MMZONE_64_H #define _ASM_X86_MMZONE_64_H - #ifdef CONFIG_NUMA #include - #include -/* Simple perfect hash to map physical addresses to node numbers */ -struct memnode { - int shift; - unsigned int mapsize; - s16 *map; - s16 embedded_map[64 - 8]; -} ____cacheline_aligned; /* total size = 128 bytes */ -extern struct memnode memnode; -#define memnode_shift memnode.shift -#define memnodemap memnode.map -#define memnodemapsize memnode.mapsize - extern struct pglist_data *node_data[]; -static inline __attribute__((pure)) int phys_to_nid(unsigned long addr) -{ - unsigned nid; - VIRTUAL_BUG_ON(!memnodemap); - nid = memnodemap[addr >> memnode_shift]; - VIRTUAL_BUG_ON(nid >= MAX_NUMNODES || !node_data[nid]); - return nid; -} - #define NODE_DATA(nid) (node_data[nid]) #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index e8c00cc7203..3951ee6eade 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -28,125 +28,10 @@ EXPORT_SYMBOL(node_data); nodemask_t numa_nodes_parsed __initdata; -struct memnode memnode; - -static unsigned long __initdata nodemap_addr; -static unsigned long __initdata nodemap_size; - static struct numa_meminfo numa_meminfo __initdata; - static int numa_distance_cnt; static u8 *numa_distance; -/* - * Given a shift value, try to populate memnodemap[] - * Returns : - * 1 if OK - * 0 if memnodmap[] too small (of shift too small) - * -1 if node overlap or lost ram (shift too big) - */ -static int __init populate_memnodemap(const struct numa_meminfo *mi, int shift) -{ - unsigned long addr, end; - int i, res = -1; - - memset(memnodemap, 0xff, sizeof(s16)*memnodemapsize); - for (i = 0; i < mi->nr_blks; i++) { - addr = mi->blk[i].start; - end = mi->blk[i].end; -
if (addr >= end) - continue; - if ((end >> shift) >= memnodemapsize) - return 0; - do { - if (memnodemap[addr >> shift] != NUMA_NO_NODE) - return -1; - memnodemap[addr >> shift] = mi->blk[i].nid; - addr += (1UL << shift); - } while (addr < end); - res = 1; - } - return res; -} - -static int __init allocate_cachealigned_memnodemap(void) -{ - unsigned long addr; - - memnodemap = memnode.embedded_map; - if (memnodemapsize <= ARRAY_SIZE(memnode.embedded_map)) - return 0; - - addr = 0x8000; - nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES); - nodemap_addr = memblock_find_in_range(addr, get_max_mapped(), - nodemap_size, L1_CACHE_BYTES); - if (nodemap_addr == MEMBLOCK_ERROR) { - printk(KERN_ERR - "NUMA: Unable to allocate Memory to Node hash map\n"); - nodemap_addr = nodemap_size = 0; - return -1; - } - memnodemap = phys_to_virt(nodemap_addr); - memblock_x86_reserve_range(nodemap_addr, nodemap_addr + nodemap_size, "MEMNODEMAP"); - - printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n", - nodemap_addr, nodemap_addr + nodemap_size); - return 0; -} - -/* - * The LSB of all start and end addresses in the node map is the value of the - * maximum possible shift. - */ -static int __init extract_lsb_from_nodes(const struct numa_meminfo *mi) -{ - int i, nodes_used = 0; - unsigned long start, end; - unsigned long bitfield = 0, memtop = 0; - - for (i = 0; i < mi->nr_blks; i++) { - start = mi->blk[i].start; - end = mi->blk[i].end; - if (start >= end) - continue; - bitfield |= start; - nodes_used++; - if (end > memtop) - memtop = end; - } - if (nodes_used <= 1) - i = 63; - else - i = find_first_bit(&bitfield, sizeof(unsigned long)*8); - memnodemapsize = (memtop >> i)+1; - return i; -} - -static int __init compute_hash_shift(const struct numa_meminfo *mi) -{ - int shift; - - shift = extract_lsb_from_nodes(mi); - if (allocate_cachealigned_memnodemap()) - return -1; - printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n", - shift); - - if (populate_memnodemap(mi, shift) != 1) { - printk(KERN_INFO "Your memory is not aligned you need to " - "rebuild your kernel with a bigger NODEMAPSIZE " - "shift=%d\n", shift); - return -1; - } - return shift; -} - -int __meminit __early_pfn_to_nid(unsigned long pfn) -{ - return phys_to_nid(pfn << PAGE_SHIFT); -} - static void * __init early_node_mem(int nodeid, unsigned long start, unsigned long end, unsigned long size, unsigned long align) @@ -270,7 +155,7 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) memblock_x86_reserve_range(nodedata_phys, nodedata_phys + pgdat_size, "NODE_DATA"); printk(KERN_INFO " NODE_DATA [%016lx - %016lx]\n", nodedata_phys, nodedata_phys + pgdat_size - 1); - nid = phys_to_nid(nodedata_phys); + nid = early_pfn_to_nid(nodedata_phys >> PAGE_SHIFT); if (nid != nodeid) printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nodeid, nid); @@ -527,12 +412,6 @@ static int __init numa_register_memblks(struct numa_meminfo *mi) if (WARN_ON(nodes_empty(node_possible_map))) return -EINVAL; - memnode_shift = compute_hash_shift(mi); - if (memnode_shift < 0) { - printk(KERN_ERR "NUMA: No NUMA node hash function found. 
Contact maintainer\n"); - return -EINVAL; - } - for (i = 0; i < mi->nr_blks; i++) memblock_x86_register_active_regions(mi->blk[i].nid, mi->blk[i].start >> PAGE_SHIFT, -- cgit v1.2.3 From 7210cf9217937e470a9acbc113a590f476b9c047 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 5 Apr 2011 00:23:53 +0200 Subject: x86-32, numa: Calculate remap size in common code Only pgdat and memmap use the remap area and there isn't much benefit in allowing per-node override. In addition, the use of node_remap_size[] is confusing in that it contains the number of bytes before remap initialization and then the number of pages afterwards. Move remap size calculation for memmap from specific NUMA config implementations to init_alloc_remap() and make node_remap_size[] static. The only behavior difference is that, before this patch, numaq_32 didn't consider max_pfn when calculating the memmap size but it's enforced after this patch, which is the right thing to do. Signed-off-by: Tejun Heo Link: http://lkml.kernel.org/r/1301955840-7246-8-git-send-email-tj@kernel.org Acked-by: Yinghai Lu Cc: David Rientjes Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/topology.h | 1 - arch/x86/kernel/apic/numaq_32.c | 4 ---- arch/x86/mm/numa_32.c | 10 ++++------ arch/x86/mm/srat_32.c | 1 - 4 files changed, 4 insertions(+), 12 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 910a7084f7f..8dba76972fd 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -95,7 +95,6 @@ extern void setup_node_to_cpumask_map(void); #ifdef CONFIG_X86_32 extern unsigned long node_start_pfn[]; extern unsigned long node_end_pfn[]; -extern unsigned long node_remap_size[]; #define node_has_online_mem(nid) (node_start_pfn[nid] != node_end_pfn[nid]) # define SD_CACHE_NICE_TRIES 1 diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index 6273eee5134..0aced70815f 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -93,10 +93,6 @@ static inline void numaq_register_node(int node, struct sys_cfg_data *scd) node_end_pfn[node]); memory_present(node, node_start_pfn[node], node_end_pfn[node]); - - node_remap_size[node] = node_memmap_size_bytes(node, - node_start_pfn[node], - node_end_pfn[node]); } /* diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 99310d26fe3..9a7336550f0 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -104,7 +104,7 @@ extern unsigned long highend_pfn, highstart_pfn; #define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE) -unsigned long node_remap_size[MAX_NUMNODES]; +static unsigned long node_remap_size[MAX_NUMNODES]; static void *node_remap_start_vaddr[MAX_NUMNODES]; void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); @@ -129,7 +129,6 @@ int __init get_memcfg_numa_flat(void) node_end_pfn[0] = max_pfn; memblock_x86_register_active_regions(0, 0, max_pfn); memory_present(0, 0, max_pfn); - node_remap_size[0] = node_memmap_size_bytes(0, 0, max_pfn); /* Indicate there is one node available. */ nodes_clear(node_online_map); @@ -282,11 +281,10 @@ static __init unsigned long init_alloc_remap(int nid, unsigned long offset) if (node_end_pfn[nid] > max_pfn) node_end_pfn[nid] = max_pfn; - /* ensure the remap includes space for the pgdat.
*/ - size = node_remap_size[nid]; + /* calculate the necessary space aligned to large page size */ + size = node_memmap_size_bytes(nid, node_start_pfn[nid], + min(node_end_pfn[nid], max_pfn)); size += ALIGN(sizeof(pg_data_t), PAGE_SIZE); - - /* align to large page */ size = ALIGN(size, LARGE_PAGE_BYTES); node_pa = memblock_find_in_range(node_start_pfn[nid] << PAGE_SHIFT, diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c index 48651c6f657..1b9e82c96dc 100644 --- a/arch/x86/mm/srat_32.c +++ b/arch/x86/mm/srat_32.c @@ -276,7 +276,6 @@ int __init get_memcfg_from_srat(void) unsigned long end = min(node_end_pfn[nid], max_pfn); memory_present(nid, start, end); - node_remap_size[nid] = node_memmap_size_bytes(nid, start, end); } return 1; out_fail: -- cgit v1.2.3 From 6bd262731bf7559bab8c749786e8652e2df1fb4e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 2 May 2011 14:18:52 +0200 Subject: x86, NUMA: Unify 32/64bit numa_cpu_node() implementation Currently, the only meaningful user of apic->x86_32_numa_cpu_node() is NUMAQ which returns valid mapping only after CPU is initialized during SMP bringup; thus, the previous patch to set apicid -> node in setup_local_APIC() makes __apicid_to_node[] always contain the correct mapping whether custom apic->x86_32_numa_cpu_node() is used or not. So, there is no reason to keep separate 32bit implementation. We can always consult __apicid_to_node[]. Move 64bit implementation from numa_64.c to numa.c and remove 32bit implementation from numa_32.c. Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Yinghai Lu Cc: David Rientjes Cc: Thomas Gleixner Cc: "H. Peter Anvin" --- arch/x86/include/asm/numa.h | 10 ++++++++++ arch/x86/include/asm/numa_32.h | 6 ------ arch/x86/include/asm/numa_64.h | 3 --- arch/x86/mm/numa.c | 9 +++++++++ arch/x86/mm/numa_32.c | 5 ----- arch/x86/mm/numa_64.c | 9 --------- 6 files changed, 19 insertions(+), 23 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h index a50fc9f493b..5982d418c35 100644 --- a/arch/x86/include/asm/numa.h +++ b/arch/x86/include/asm/numa.h @@ -1,6 +1,8 @@ #ifndef _ASM_X86_NUMA_H #define _ASM_X86_NUMA_H +#include + #include #include @@ -22,10 +24,18 @@ static inline void set_apicid_to_node(int apicid, s16 node) { __apicid_to_node[apicid] = node; } + +extern int __cpuinit numa_cpu_node(int cpu); + #else /* CONFIG_NUMA */ static inline void set_apicid_to_node(int apicid, s16 node) { } + +static inline int numa_cpu_node(int cpu) +{ + return NUMA_NO_NODE; +} #endif /* CONFIG_NUMA */ #ifdef CONFIG_X86_32 diff --git a/arch/x86/include/asm/numa_32.h b/arch/x86/include/asm/numa_32.h index c6beed1ef10..242522fe9f8 100644 --- a/arch/x86/include/asm/numa_32.h +++ b/arch/x86/include/asm/numa_32.h @@ -5,12 +5,6 @@ extern int numa_off; extern int pxm_to_nid(int pxm); -#ifdef CONFIG_NUMA -extern int __cpuinit numa_cpu_node(int cpu); -#else /* CONFIG_NUMA */ -static inline int numa_cpu_node(int cpu) { return NUMA_NO_NODE; } -#endif /* CONFIG_NUMA */ - #ifdef CONFIG_HIGHMEM extern void set_highmem_pages_init(void); #else diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h index 344eb1790b4..12461ebec70 100644 --- a/arch/x86/include/asm/numa_64.h +++ b/arch/x86/include/asm/numa_64.h @@ -26,7 +26,6 @@ extern void setup_node_bootmem(int nodeid, unsigned long start, extern nodemask_t numa_nodes_parsed __initdata; -extern int __cpuinit numa_cpu_node(int cpu); extern int __init numa_add_memblk(int nodeid, u64 start, u64 end); extern void __init 
numa_set_distance(int from, int to, int distance); @@ -35,8 +34,6 @@ extern void __init numa_set_distance(int from, int to, int distance); #define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL)) void numa_emu_cmdline(char *); #endif /* CONFIG_NUMA_EMU */ -#else -static inline int numa_cpu_node(int cpu) { return NUMA_NO_NODE; } #endif #endif /* _ASM_X86_NUMA_64_H */ diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 745258dfc4d..e9005c4ea29 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -32,6 +32,15 @@ s16 __apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE }; +int __cpuinit numa_cpu_node(int cpu) +{ + int apicid = early_per_cpu(x86_cpu_to_apicid, cpu); + + if (apicid != BAD_APICID) + return __apicid_to_node[apicid]; + return NUMA_NO_NODE; +} + cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; EXPORT_SYMBOL(node_to_cpumask_map); diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index c757c0a3b52..e0d9716ab38 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -107,11 +107,6 @@ extern unsigned long highend_pfn, highstart_pfn; static void *node_remap_start_vaddr[MAX_NUMNODES]; void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); -int __cpuinit numa_cpu_node(int cpu) -{ - return apic->x86_32_numa_cpu_node(cpu); -} - /* * FLAT - support for basic PC memory model with discontig enabled, essentially * a single node with all available processors in it with a flat diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index b4fd25e753c..7f83adec048 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -512,15 +512,6 @@ unsigned long __init numa_free_all_bootmem(void) return pages; } -int __cpuinit numa_cpu_node(int cpu) -{ - int apicid = early_per_cpu(x86_cpu_to_apicid, cpu); - - if (apicid != BAD_APICID) - return __apicid_to_node[apicid]; - return NUMA_NO_NODE; -} - #ifdef CONFIG_MEMORY_HOTPLUG int memory_add_physaddr_to_nid(u64 start) { -- cgit v1.2.3 From 84914ed0ec6787d38e84b510f92ad4ca3a572fd8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 2 May 2011 14:18:52 +0200 Subject: x86-32, NUMA: Make apic->x86_32_numa_cpu_node() optional NUMAQ is the only meaningful user of this callback and setup_local_APIC() the only callsite. Stop torturing everyone else by making the callback optional and removing all the boilerplate implementations and assignments. Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Yinghai Lu Cc: David Rientjes Cc: Thomas Gleixner Cc: "H. Peter Anvin" --- arch/x86/include/asm/apic.h | 9 ++++++--- arch/x86/kernel/apic/apic.c | 20 +++----------------- arch/x86/kernel/apic/apic_noop.c | 9 --------- arch/x86/kernel/apic/bigsmp_32.c | 1 - arch/x86/kernel/apic/es7000_32.c | 7 ------- arch/x86/kernel/apic/probe_32.c | 1 - arch/x86/kernel/apic/summit_32.c | 1 - 7 files changed, 9 insertions(+), 39 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 2b7d573be54..a0c46f06121 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -363,7 +363,12 @@ struct apic { */ int (*x86_32_early_logical_apicid)(int cpu); - /* determine CPU -> NUMA node mapping */ + /* + * Optional method called from setup_local_APIC() after logical + * apicid is guaranteed to be known to initialize apicid -> node + * mapping if NUMA initialization hasn't done so already. Don't + * add new users. 
+ */ int (*x86_32_numa_cpu_node)(int cpu); #endif }; @@ -537,8 +542,6 @@ static inline int default_phys_pkg_id(int cpuid_apic, int index_msb) return cpuid_apic >> index_msb; } -extern int default_x86_32_numa_cpu_node(int cpu); - #endif static inline unsigned int diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index a6cd02a9268..0c67b4fc25b 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1245,8 +1245,9 @@ void __cpuinit setup_local_APIC(void) * a bit too late - percpu allocation has already happened without * proper NUMA affinity. */ - set_apicid_to_node(early_per_cpu(x86_cpu_to_apicid, cpu), - apic->x86_32_numa_cpu_node(cpu)); + if (apic->x86_32_numa_cpu_node) + set_apicid_to_node(early_per_cpu(x86_cpu_to_apicid, cpu), + apic->x86_32_numa_cpu_node(cpu)); #endif /* @@ -2013,21 +2014,6 @@ void default_init_apic_ldr(void) apic_write(APIC_LDR, val); } -#ifdef CONFIG_X86_32 -int default_x86_32_numa_cpu_node(int cpu) -{ -#ifdef CONFIG_NUMA - int apicid = early_per_cpu(x86_cpu_to_apicid, cpu); - - if (apicid != BAD_APICID) - return __apicid_to_node[apicid]; - return NUMA_NO_NODE; -#else - return 0; -#endif -} -#endif - /* * Power management */ diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index f1baa2dc087..775b82bc655 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -119,14 +119,6 @@ static void noop_apic_write(u32 reg, u32 v) WARN_ON_ONCE(cpu_has_apic && !disable_apic); } -#ifdef CONFIG_X86_32 -static int noop_x86_32_numa_cpu_node(int cpu) -{ - /* we're always on node 0 */ - return 0; -} -#endif - struct apic apic_noop = { .name = "noop", .probe = noop_probe, @@ -195,6 +187,5 @@ struct apic apic_noop = { #ifdef CONFIG_X86_32 .x86_32_early_logical_apicid = noop_x86_32_early_logical_apicid, - .x86_32_numa_cpu_node = noop_x86_32_numa_cpu_node, #endif }; diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index 541a2e43165..d84ac5a584b 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -253,5 +253,4 @@ struct apic apic_bigsmp = { .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, .x86_32_early_logical_apicid = bigsmp_early_logical_apicid, - .x86_32_numa_cpu_node = default_x86_32_numa_cpu_node, }; diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index 3e9de4854c5..70533de5bd2 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -510,11 +510,6 @@ static void es7000_setup_apic_routing(void) nr_ioapics, cpumask_bits(es7000_target_cpus())[0]); } -static int es7000_numa_cpu_node(int cpu) -{ - return 0; -} - static int es7000_cpu_present_to_apicid(int mps_cpu) { if (!mps_cpu) @@ -688,7 +683,6 @@ struct apic __refdata apic_es7000_cluster = { .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, .x86_32_early_logical_apicid = es7000_early_logical_apicid, - .x86_32_numa_cpu_node = es7000_numa_cpu_node, }; struct apic __refdata apic_es7000 = { @@ -752,5 +746,4 @@ struct apic __refdata apic_es7000 = { .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, .x86_32_early_logical_apicid = es7000_early_logical_apicid, - .x86_32_numa_cpu_node = es7000_numa_cpu_node, }; diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index fc84c7b6110..6541e471fd9 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -172,7 +172,6 @@ struct apic apic_default = { .safe_wait_icr_idle = 
native_safe_apic_wait_icr_idle, .x86_32_early_logical_apicid = default_x86_32_early_logical_apicid, - .x86_32_numa_cpu_node = default_x86_32_numa_cpu_node, }; extern struct apic apic_numaq; diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index e4b8059b414..35bcd7d995a 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c @@ -551,5 +551,4 @@ struct apic apic_summit = { .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, .x86_32_early_logical_apicid = summit_early_logical_apicid, - .x86_32_numa_cpu_node = default_x86_32_numa_cpu_node, }; -- cgit v1.2.3 From 1201e10a092adc9c88a6ce5f27740cc5cd0d26e5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 2 May 2011 14:18:52 +0200 Subject: x86, NUMA: trivial cleanups * Kill no longer used struct bootnode. * Kill dangling declaration of pxm_to_nid() in numa_32.h. * Make setup_node_bootmem() static. Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Yinghai Lu Cc: David Rientjes Cc: Thomas Gleixner Cc: "H. Peter Anvin" --- arch/x86/include/asm/acpi.h | 2 -- arch/x86/include/asm/amd_nb.h | 1 - arch/x86/include/asm/numa_32.h | 2 -- arch/x86/include/asm/numa_64.h | 7 ------- arch/x86/mm/numa_64.c | 2 +- 5 files changed, 1 insertion(+), 13 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 12e0e7dd869..416d865eae3 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -183,8 +183,6 @@ static inline void disable_acpi(void) { } #define ARCH_HAS_POWER_INIT 1 -struct bootnode; - #ifdef CONFIG_ACPI_NUMA extern int acpi_numa; extern int x86_acpi_numa_init(void); diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h index 331682231bb..67f87f25761 100644 --- a/arch/x86/include/asm/amd_nb.h +++ b/arch/x86/include/asm/amd_nb.h @@ -11,7 +11,6 @@ struct amd_nb_bus_dev_range { extern const struct pci_device_id amd_nb_misc_ids[]; extern const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[]; -struct bootnode; extern bool early_is_amd_nb(u32 value); extern int amd_cache_northbridges(void); diff --git a/arch/x86/include/asm/numa_32.h b/arch/x86/include/asm/numa_32.h index 242522fe9f8..7e54b64623a 100644 --- a/arch/x86/include/asm/numa_32.h +++ b/arch/x86/include/asm/numa_32.h @@ -3,8 +3,6 @@ extern int numa_off; -extern int pxm_to_nid(int pxm); - #ifdef CONFIG_HIGHMEM extern void set_highmem_pages_init(void); #else diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h index 12461ebec70..794da6de689 100644 --- a/arch/x86/include/asm/numa_64.h +++ b/arch/x86/include/asm/numa_64.h @@ -3,18 +3,11 @@ #include -struct bootnode { - u64 start; - u64 end; -}; - #define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT)) extern int numa_off; extern unsigned long numa_free_all_bootmem(void); -extern void setup_node_bootmem(int nodeid, unsigned long start, - unsigned long end); #ifdef CONFIG_NUMA /* diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 7f83adec048..287ae798935 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -95,7 +95,7 @@ int __init numa_add_memblk(int nid, u64 start, u64 end) } /* Initialize bootmem allocator for a node */ -void __init +static void __init setup_node_bootmem(int nid, unsigned long start, unsigned long end) { const u64 nd_low = (u64)MAX_DMA_PFN << PAGE_SHIFT; -- cgit v1.2.3 From daf4f480ae24270bac06db4293908d36b4834e21 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 2 May 2011 14:18:53 +0200 Subject: x86-32, NUMA: Move 
get_memcfg_numa() into numa_32.c There's no reason for get_memcfg_numa() to be implemented inline in mmzone_32.h. Move it to numa_32.c and also make get_memcfg_numa_flat() static. Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Yinghai Lu Cc: David Rientjes Cc: Thomas Gleixner Cc: "H. Peter Anvin" --- arch/x86/include/asm/mmzone_32.h | 18 ------------------ arch/x86/mm/numa_32.c | 11 ++++++++++- 2 files changed, 10 insertions(+), 19 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h index 91df7c51806..73e5745aef3 100644 --- a/arch/x86/include/asm/mmzone_32.h +++ b/arch/x86/include/asm/mmzone_32.h @@ -16,28 +16,10 @@ extern struct pglist_data *node_data[]; /* summit or generic arch */ #include -extern int get_memcfg_numa_flat(void); -/* - * This allows any one NUMA architecture to be compiled - * for, and still fall back to the flat function if it - * fails. - */ -static inline void get_memcfg_numa(void) -{ - - if (get_memcfg_numaq()) - return; - if (get_memcfg_from_srat()) - return; - get_memcfg_numa_flat(); -} - extern void resume_map_numa_kva(pgd_t *pgd); #else /* !CONFIG_NUMA */ -#define get_memcfg_numa get_memcfg_numa_flat - static inline void resume_map_numa_kva(pgd_t *pgd) {} #endif /* CONFIG_NUMA */ diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index f847fa1e02d..abf1247a4c3 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -112,7 +112,7 @@ void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); * a single node with all available processors in it with a flat * memory map. */ -int __init get_memcfg_numa_flat(void) +static int __init get_memcfg_numa_flat(void) { printk(KERN_DEBUG "NUMA - single node, flat memory mode\n"); @@ -332,6 +332,15 @@ static __init void init_alloc_remap(int nid) nid, node_pa, node_pa + size, remap_va, remap_va + size); } +static void get_memcfg_numa(void) +{ + if (get_memcfg_numaq()) + return; + if (get_memcfg_from_srat()) + return; + get_memcfg_numa_flat(); +} + void __init initmem_init(void) { int nid; -- cgit v1.2.3 From e6df595b37c7c033ef7400b4fdd382a2dc4f4131 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 2 May 2011 14:18:53 +0200 Subject: x86, NUMA: Move numa_nodes_parsed to numa.[hc] Move numa_nodes_parsed from numa_64.[hc] to numa.[hc] to prepare for NUMA init path unification. Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Yinghai Lu Cc: David Rientjes Cc: Thomas Gleixner Cc: "H. Peter Anvin" --- arch/x86/include/asm/numa.h | 1 + arch/x86/include/asm/numa_64.h | 4 ---- arch/x86/mm/numa.c | 1 + arch/x86/mm/numa_64.c | 2 -- 4 files changed, 2 insertions(+), 6 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h index 5982d418c35..c24306cd150 100644 --- a/arch/x86/include/asm/numa.h +++ b/arch/x86/include/asm/numa.h @@ -19,6 +19,7 @@ * numa_cpu_node().
*/ extern s16 __apicid_to_node[MAX_LOCAL_APIC]; +extern nodemask_t numa_nodes_parsed __initdata; static inline void set_apicid_to_node(int apicid, s16 node) { diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h index 794da6de689..e84113f8fff 100644 --- a/arch/x86/include/asm/numa_64.h +++ b/arch/x86/include/asm/numa_64.h @@ -1,8 +1,6 @@ #ifndef _ASM_X86_NUMA_64_H #define _ASM_X86_NUMA_64_H -#include - #define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT)) extern int numa_off; @@ -17,8 +15,6 @@ extern unsigned long numa_free_all_bootmem(void); */ #define NODE_MIN_SIZE (4*1024*1024) -extern nodemask_t numa_nodes_parsed __initdata; - extern int __init numa_add_memblk(int nodeid, u64 start, u64 end); extern void __init numa_set_distance(int from, int to, int distance); diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index e9005c4ea29..cce174109ca 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -6,6 +6,7 @@ #include int __initdata numa_off; +nodemask_t numa_nodes_parsed __initdata; static __init int numa_setup(char *opt) { diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 287ae798935..70bd8221f92 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -26,8 +26,6 @@ struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; EXPORT_SYMBOL(node_data); -nodemask_t numa_nodes_parsed __initdata; - static struct numa_meminfo numa_meminfo #ifndef CONFIG_MEMORY_HOTPLUG __initdata -- cgit v1.2.3 From b0d310801a4c1f95b44357e4ebc22a9903e3bf3d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 2 May 2011 14:18:53 +0200 Subject: x86-32, NUMA: implement temporary NUMA init shims To help transition to common NUMA init, implement temporary 32bit shims for numa_add_memblk() and numa_set_distance(). numa_add_memblk() registers the memblk and adjusts node_start/end_pfn[]. numa_set_distance() is noop. These shims will allow using 64bit NUMA init functions on 32bit and gradual transition to common NUMA init path. For detailed description, please read description of commits which make use of the shim functions. Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Yinghai Lu Cc: David Rientjes Cc: Thomas Gleixner Cc: "H. 
Peter Anvin" --- arch/x86/include/asm/numa.h | 3 +++ arch/x86/include/asm/numa_64.h | 3 --- arch/x86/mm/numa_32.c | 34 ++++++++++++++++++++++++++++++++++ 3 files changed, 37 insertions(+), 3 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h index c24306cd150..db449c7d89b 100644 --- a/arch/x86/include/asm/numa.h +++ b/arch/x86/include/asm/numa.h @@ -21,6 +21,9 @@ extern s16 __apicid_to_node[MAX_LOCAL_APIC]; extern nodemask_t numa_nodes_parsed __initdata; +extern int __init numa_add_memblk(int nodeid, u64 start, u64 end); +extern void __init numa_set_distance(int from, int to, int distance); + static inline void set_apicid_to_node(int apicid, s16 node) { __apicid_to_node[apicid] = node; diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h index e84113f8fff..506dd050ff3 100644 --- a/arch/x86/include/asm/numa_64.h +++ b/arch/x86/include/asm/numa_64.h @@ -15,9 +15,6 @@ extern unsigned long numa_free_all_bootmem(void); */ #define NODE_MIN_SIZE (4*1024*1024) -extern int __init numa_add_memblk(int nodeid, u64 start, u64 end); -extern void __init numa_set_distance(int from, int to, int distance); - #ifdef CONFIG_NUMA_EMU #define FAKE_NODE_MIN_SIZE ((u64)32 << 20) #define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL)) diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index abf1247a4c3..d0369a56f84 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -414,3 +414,37 @@ int memory_add_physaddr_to_nid(u64 addr) EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); #endif +/* temporary shim, will go away soon */ +int __init numa_add_memblk(int nid, u64 start, u64 end) +{ + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long end_pfn = end >> PAGE_SHIFT; + + printk(KERN_DEBUG "nid %d start_pfn %08lx end_pfn %08lx\n", + nid, start_pfn, end_pfn); + + if (start >= (u64)max_pfn << PAGE_SHIFT) { + printk(KERN_INFO "Ignoring SRAT pfns: %08lx - %08lx\n", + start_pfn, end_pfn); + return 0; + } + + node_set_online(nid); + memblock_x86_register_active_regions(nid, start_pfn, + min(end_pfn, max_pfn)); + + if (!node_has_online_mem(nid)) { + node_start_pfn[nid] = start_pfn; + node_end_pfn[nid] = end_pfn; + } else { + node_start_pfn[nid] = min(node_start_pfn[nid], start_pfn); + node_end_pfn[nid] = max(node_end_pfn[nid], end_pfn); + } + return 0; +} + +/* temporary shim, will go away soon */ +void __init numa_set_distance(int from, int to, int distance) +{ + /* nada */ +} -- cgit v1.2.3 From 5acd91ab837c9d066af7345aea6462dc55695db7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 2 May 2011 14:18:53 +0200 Subject: x86-32, NUMA: Replace srat_32.c with srat.c SRAT support implementation in srat_32.c and srat.c are generally similar; however, there are some differences. First of all, 64bit implementation supports more types of SRAT entries. 64bit supports x2apic, affinity, memory and SLIT. 32bit only supports processor and memory. Most other differences stem from different initialization protocols employed by 64bit and 32bit NUMA init paths. On 64bit, * Mappings among PXM, node and apicid are directly done in each SRAT entry callback. * Memory affinity information is passed to numa_add_memblk() which takes care of all interfacing with NUMA init. * Doesn't directly initialize NUMA configurations. All the information is recorded in numa_nodes_parsed and memblks. On 32bit, * Checks numa_off. 
* Things go through one more level of indirection via private tables but eventually end up initializing the same mappings. * node_start/end_pfn[] are initialized and memblock_x86_register_active_regions() is called for each memory chunk. * node_set_online() is called for each online node. * sort_node_map() is called. There are also other minor differences in sanity checking and messages but taking the 64bit version should be good enough. This patch drops the 32bit specific implementation and makes the 64bit implementation common for both 32 and 64bit. The init protocol differences are dealt with in two places - the numa_add_memblk() shim added in the previous patch and the new temporary numa_32.c:get_memcfg_from_srat() which wraps invocation of x86_acpi_numa_init(). The shim numa_add_memblk() handles the following: * node_start/end_pfn[] initialization. * node_set_online() for memory nodes. * Invocation of memblock_x86_register_active_regions(). The shim get_memcfg_from_srat() handles the following: * numa_off check. * node_set_online() for CPU nodes. * sort_node_map() invocation. * Clearing of numa_nodes_parsed and active_ranges on failure. The shims are temporary and will be removed as the generic NUMA init path in 32bit is replaced with the 64bit one. Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Yinghai Lu Cc: David Rientjes Cc: Thomas Gleixner Cc: "H. Peter Anvin" --- arch/x86/include/asm/mmzone_32.h | 2 - arch/x86/include/asm/srat.h | 39 ------ arch/x86/mm/Makefile | 5 +- arch/x86/mm/numa_32.c | 23 ++++ arch/x86/mm/srat_32.c | 281 --------------------------------------- 5 files changed, 24 insertions(+), 326 deletions(-) delete mode 100644 arch/x86/include/asm/srat.h delete mode 100644 arch/x86/mm/srat_32.c (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h index 73e5745aef3..5e83a416eca 100644 --- a/arch/x86/include/asm/mmzone_32.h +++ b/arch/x86/include/asm/mmzone_32.h @@ -13,8 +13,6 @@ extern struct pglist_data *node_data[]; #define NODE_DATA(nid) (node_data[nid]) #include -/* summit or generic arch */ -#include extern void resume_map_numa_kva(pgd_t *pgd); diff --git a/arch/x86/include/asm/srat.h b/arch/x86/include/asm/srat.h deleted file mode 100644 index b508d639d1a..00000000000 --- a/arch/x86/include/asm/srat.h +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Some of the code in this file has been gleaned from the 64 bit - * discontigmem support code base. - * - * Copyright (C) 2002, IBM Corp. - * - * All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- * - * Send feedback to Pat Gaughen - */ - -#ifndef _ASM_X86_SRAT_H -#define _ASM_X86_SRAT_H - -#ifdef CONFIG_ACPI_NUMA -extern int get_memcfg_from_srat(void); -#else -static inline int get_memcfg_from_srat(void) -{ - return 0; -} -#endif - -#endif /* _ASM_X86_SRAT_H */ diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 37e7043362a..62997be3307 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -24,10 +24,7 @@ obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o obj-$(CONFIG_AMD_NUMA) += amdtopology_64.o -ifeq ($(CONFIG_ACPI_NUMA),y) -obj-$(CONFIG_X86_64) += srat.o -obj-$(CONFIG_X86_32) += srat_32.o -endif +obj-$(CONFIG_ACPI_NUMA) += srat.o obj-$(CONFIG_NUMA_EMU) += numa_emulation.o obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index d0369a56f84..8641239a066 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -332,6 +332,29 @@ static __init void init_alloc_remap(int nid) nid, node_pa, node_pa + size, remap_va, remap_va + size); } +static int get_memcfg_from_srat(void) +{ +#ifdef CONFIG_ACPI_NUMA + int nid; + + if (numa_off) + return 0; + + if (x86_acpi_numa_init() < 0) { + nodes_clear(numa_nodes_parsed); + remove_all_active_ranges(); + return 0; + } + + for_each_node_mask(nid, numa_nodes_parsed) + node_set_online(nid); + sort_node_map(); + return 1; +#else + return 0; +#endif +} + static void get_memcfg_numa(void) { if (get_memcfg_numaq()) diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c deleted file mode 100644 index 6b9bfd78bc3..00000000000 --- a/arch/x86/mm/srat_32.c +++ /dev/null @@ -1,281 +0,0 @@ -/* - * Some of the code in this file has been gleaned from the 64 bit - * discontigmem support code base. - * - * Copyright (C) 2002, IBM Corp. - * - * All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - * Send feedback to Pat Gaughen - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * proximity macros and definitions - */ -#define NODE_ARRAY_INDEX(x) ((x) / 8) /* 8 bits/char */ -#define NODE_ARRAY_OFFSET(x) ((x) % 8) /* 8 bits/char */ -#define BMAP_SET(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] |= 1 << NODE_ARRAY_OFFSET(bit)) -#define BMAP_TEST(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] & (1 << NODE_ARRAY_OFFSET(bit))) -/* bitmap length; _PXM is at most 255 */ -#define PXM_BITMAP_LEN (MAX_PXM_DOMAINS / 8) -static u8 __initdata pxm_bitmap[PXM_BITMAP_LEN]; /* bitmap of proximity domains */ - -#define MAX_CHUNKS_PER_NODE 3 -#define MAXCHUNKS (MAX_CHUNKS_PER_NODE * MAX_NUMNODES) -struct node_memory_chunk_s { - unsigned long start_pfn; - unsigned long end_pfn; - u8 pxm; // proximity domain of node - u8 nid; // which cnode contains this chunk? 
- u8 bank; // which mem bank on this node -}; -static struct node_memory_chunk_s __initdata node_memory_chunk[MAXCHUNKS]; - -static int __initdata num_memory_chunks; /* total number of memory chunks */ -static u8 __initdata apicid_to_pxm[MAX_LOCAL_APIC]; - -int acpi_numa __initdata; - -static __init void bad_srat(void) -{ - printk(KERN_ERR "SRAT: SRAT not used.\n"); - acpi_numa = -1; - num_memory_chunks = 0; -} - -static __init inline int srat_disabled(void) -{ - return numa_off || acpi_numa < 0; -} - -/* Identify CPU proximity domains */ -void __init -acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *cpu_affinity) -{ - if (srat_disabled()) - return; - if (cpu_affinity->header.length != - sizeof(struct acpi_srat_cpu_affinity)) { - bad_srat(); - return; - } - - if ((cpu_affinity->flags & ACPI_SRAT_CPU_ENABLED) == 0) - return; /* empty entry */ - - /* mark this node as "seen" in node bitmap */ - BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain_lo); - - /* don't need to check apic_id here, because it is always 8 bits */ - apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain_lo; - - printk(KERN_DEBUG "CPU %02x in proximity domain %02x\n", - cpu_affinity->apic_id, cpu_affinity->proximity_domain_lo); -} - -/* - * Identify memory proximity domains and hot-remove capabilities. - * Fill node memory chunk list structure. - */ -void __init -acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *memory_affinity) -{ - unsigned long long paddr, size; - unsigned long start_pfn, end_pfn; - u8 pxm; - struct node_memory_chunk_s *p, *q, *pend; - - if (srat_disabled()) - return; - if (memory_affinity->header.length != - sizeof(struct acpi_srat_mem_affinity)) { - bad_srat(); - return; - } - - if ((memory_affinity->flags & ACPI_SRAT_MEM_ENABLED) == 0) - return; /* empty entry */ - - pxm = memory_affinity->proximity_domain & 0xff; - - /* mark this node as "seen" in node bitmap */ - BMAP_SET(pxm_bitmap, pxm); - - /* calculate info for memory chunk structure */ - paddr = memory_affinity->base_address; - size = memory_affinity->length; - - start_pfn = paddr >> PAGE_SHIFT; - end_pfn = (paddr + size) >> PAGE_SHIFT; - - - if (num_memory_chunks >= MAXCHUNKS) { - printk(KERN_WARNING "Too many mem chunks in SRAT." - " Ignoring %lld MBytes at %llx\n", - size/(1024*1024), paddr); - return; - } - - /* Insertion sort based on base address */ - pend = &node_memory_chunk[num_memory_chunks]; - for (p = &node_memory_chunk[0]; p < pend; p++) { - if (start_pfn < p->start_pfn) - break; - } - if (p < pend) { - for (q = pend; q >= p; q--) - *(q + 1) = *q; - } - p->start_pfn = start_pfn; - p->end_pfn = end_pfn; - p->pxm = pxm; - - num_memory_chunks++; - - printk(KERN_DEBUG "Memory range %08lx to %08lx" - " in proximity domain %02x %s\n", - start_pfn, end_pfn, - pxm, - ((memory_affinity->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) ? - "enabled and removable" : "enabled" ) ); -} - -/* Callback for SLIT parsing */ -void __init acpi_numa_slit_init(struct acpi_table_slit *slit) -{ -} - -void acpi_numa_arch_fixup(void) -{ -} -/* - * The SRAT table always lists ascending addresses, so can always - * assume that the first "start" address that you see is the real - * start of the node, and that the current "end" address is after - * the previous one. - */ -static __init int node_read_chunk(int nid, struct node_memory_chunk_s *memory_chunk) -{ - /* - * Only add present memory as told by the e820. 
* There is no guarantee from the SRAT that the memory it - * enumerates is present at boot time because it represents - * *possible* memory hotplug areas the same as normal RAM. - */ - if (memory_chunk->start_pfn >= max_pfn) { - printk(KERN_INFO "Ignoring SRAT pfns: %08lx - %08lx\n", - memory_chunk->start_pfn, memory_chunk->end_pfn); - return -1; - } - if (memory_chunk->nid != nid) - return -1; - - if (!node_has_online_mem(nid)) - node_start_pfn[nid] = memory_chunk->start_pfn; - - if (node_start_pfn[nid] > memory_chunk->start_pfn) - node_start_pfn[nid] = memory_chunk->start_pfn; - - if (node_end_pfn[nid] < memory_chunk->end_pfn) - node_end_pfn[nid] = memory_chunk->end_pfn; - - return 0; -} - -int __init get_memcfg_from_srat(void) -{ - int i, j; - - if (srat_disabled()) - goto out_fail; - - if (acpi_numa_init() < 0) - goto out_fail; - - if (num_memory_chunks == 0) { - printk(KERN_DEBUG - "could not find any ACPI SRAT memory areas.\n"); - goto out_fail; - } - - /* Calculate total number of nodes in system from PXM bitmap and create - * a set of sequential node IDs starting at zero. (ACPI doesn't seem - * to specify the range of _PXM values.) - */ - /* - * MCD - we no longer HAVE to number nodes sequentially. PXM domain - * numbers could go as high as 256, and MAX_NUMNODES for i386 is typically - * 32, so we will continue numbering them in this manner until MAX_NUMNODES - * approaches MAX_PXM_DOMAINS for i386. - */ - nodes_clear(node_online_map); - for (i = 0; i < MAX_PXM_DOMAINS; i++) { - if (BMAP_TEST(pxm_bitmap, i)) { - int nid = acpi_map_pxm_to_node(i); - node_set_online(nid); - } - } - BUG_ON(num_online_nodes() == 0); - - /* set cnode id in memory chunk structure */ - for (i = 0; i < num_memory_chunks; i++) - node_memory_chunk[i].nid = pxm_to_node(node_memory_chunk[i].pxm); - - printk(KERN_DEBUG "pxm bitmap: "); - for (i = 0; i < sizeof(pxm_bitmap); i++) { - printk(KERN_CONT "%02x ", pxm_bitmap[i]); - } - printk(KERN_CONT "\n"); - printk(KERN_DEBUG "Number of logical nodes in system = %d\n", - num_online_nodes()); - printk(KERN_DEBUG "Number of memory chunks in system = %d\n", - num_memory_chunks); - - for (i = 0; i < MAX_LOCAL_APIC; i++) - set_apicid_to_node(i, pxm_to_node(apicid_to_pxm[i])); - - for (j = 0; j < num_memory_chunks; j++){ - struct node_memory_chunk_s * chunk = &node_memory_chunk[j]; - printk(KERN_DEBUG - "chunk %d nid %d start_pfn %08lx end_pfn %08lx\n", - j, chunk->nid, chunk->start_pfn, chunk->end_pfn); - if (node_read_chunk(chunk->nid, chunk)) - continue; - - memblock_x86_register_active_regions(chunk->nid, chunk->start_pfn, - min(chunk->end_pfn, max_pfn)); - } - /* for out of order entries in SRAT */ - sort_node_map(); - - return 1; -out_fail: - printk(KERN_DEBUG "failed to get NUMA memory information from SRAT" - " table\n"); - return 0; -} -- cgit v1.2.3 From 299a180aec6a8ee3069cf0fe90d722ac20c1f837 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 2 May 2011 14:18:53 +0200 Subject: x86-32, NUMA: Update numaq to use new NUMA init protocol Update numaq such that it calls numa_add_memblk() and sets numa_nodes_parsed instead of directly diddling with NUMA states. The original get_memcfg_numaq() is renamed to numaq_numa_init() and a new get_memcfg_numaq() is created in numa_32.c. The shim numa_add_memblk() implementation handles node_start/end_pfn[] and node_set_online() for nodes with memory. The new get_memcfg_numaq() is exactly the same as get_memcfg_from_srat() other than calling the numaq init function.
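Schematically, the two wrappers introduced by this and the previous patch share one shape; a sketch distilled from their diffs (with "xxx" standing in for numaq or the ACPI SRAT path):

/* Sketch only: the common shape of get_memcfg_numaq()/get_memcfg_from_srat() */
static int get_memcfg_xxx(void)
{
	int nid;

	if (numa_off)
		return 0;

	/* numaq_numa_init() or x86_acpi_numa_init(), respectively */
	if (xxx_numa_init() < 0) {
		nodes_clear(numa_nodes_parsed);
		remove_all_active_ranges();
		return 0;
	}

	for_each_node_mask(nid, numa_nodes_parsed)
		node_set_online(nid);
	sort_node_map();
	return 1;
}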
The things get_memcfg_numaq() does are not strictly necessary for numaq but are added for consistency and to help unify NUMA init handling. Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Yinghai Lu Cc: David Rientjes Cc: Thomas Gleixner Cc: "H. Peter Anvin" --- arch/x86/include/asm/numaq.h | 7 +------ arch/x86/kernel/apic/numaq_32.c | 28 ++++++++++------------------ arch/x86/mm/numa_32.c | 23 +++++++++++++++++++++++ 3 files changed, 34 insertions(+), 24 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/numaq.h b/arch/x86/include/asm/numaq.h index 37c516545ec..c3b3c322fd8 100644 --- a/arch/x86/include/asm/numaq.h +++ b/arch/x86/include/asm/numaq.h @@ -29,7 +29,7 @@ #ifdef CONFIG_X86_NUMAQ extern int found_numaq; -extern int get_memcfg_numaq(void); +extern int numaq_numa_init(void); extern int pci_numaq_init(void); extern void *xquad_portio; @@ -166,11 +166,6 @@ struct sys_cfg_data { void numaq_tsc_disable(void); -#else -static inline int get_memcfg_numaq(void) -{ - return 0; -} #endif /* CONFIG_X86_NUMAQ */ #endif /* _ASM_X86_NUMAQ_H */ diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index 41b8b29d36f..30f13319e24 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -48,8 +48,6 @@ #include #include -#define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT)) - int found_numaq; /* @@ -79,25 +77,20 @@ int quad_local_to_mp_bus_id[NR_CPUS/4][4]; static inline void numaq_register_node(int node, struct sys_cfg_data *scd) { struct eachquadmem *eq = scd->eq + node; + u64 start = (u64)(eq->hi_shrd_mem_start - eq->priv_mem_size) << 20; + u64 end = (u64)(eq->hi_shrd_mem_start + eq->hi_shrd_mem_size) << 20; + int ret; - node_set_online(node); - - /* Convert to pages */ - node_start_pfn[node] = - MB_TO_PAGES(eq->hi_shrd_mem_start - eq->priv_mem_size); - - node_end_pfn[node] = - MB_TO_PAGES(eq->hi_shrd_mem_start + eq->hi_shrd_mem_size); - - memblock_x86_register_active_regions(node, node_start_pfn[node], - node_end_pfn[node]); + node_set(node, numa_nodes_parsed); + ret = numa_add_memblk(node, start, end); + BUG_ON(ret < 0); } /* * Function: smp_dump_qct() * * Description: gets memory layout from the quad config table. This - * function also updates node_online_map with the nodes (quads) present. + * function also updates numa_nodes_parsed with the nodes (quads) present.
*/ static void __init smp_dump_qct(void) { @@ -106,7 +99,6 @@ static void __init smp_dump_qct(void) scd = (void *)__va(SYS_CFG_DATA_PRIV_ADDR); - nodes_clear(node_online_map); for_each_node(node) { if (scd->quads_present31_0 & (1 << node)) numaq_register_node(node, scd); @@ -276,14 +268,14 @@ static __init void early_check_numaq(void) } } -int __init get_memcfg_numaq(void) +int __init numaq_numa_init(void) { early_check_numaq(); if (!found_numaq) - return 0; + return -ENOENT; smp_dump_qct(); - return 1; + return 0; } #define NUMAQ_APIC_DFR_VALUE (APIC_DFR_CLUSTER) diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 8641239a066..14135e52cef 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -332,6 +332,29 @@ static __init void init_alloc_remap(int nid) nid, node_pa, node_pa + size, remap_va, remap_va + size); } +static int get_memcfg_numaq(void) +{ +#ifdef CONFIG_X86_NUMAQ + int nid; + + if (numa_off) + return 0; + + if (numaq_numa_init() < 0) { + nodes_clear(numa_nodes_parsed); + remove_all_active_ranges(); + return 0; + } + + for_each_node_mask(nid, numa_nodes_parsed) + node_set_online(nid); + sort_node_map(); + return 1; +#else + return 0; +#endif +} + static int get_memcfg_from_srat(void) { #ifdef CONFIG_ACPI_NUMA -- cgit v1.2.3 From a4106eae650a4d5d30fcdd36d998edfa5ccb0ec4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 2 May 2011 14:18:53 +0200 Subject: x86, NUMA: Move NUMA init logic from numa_64.c to numa.c Move the generic 64bit NUMA init machinery from numa_64.c to numa.c. * node_data[], numa_mem_info and numa_distance * numa_add_memblk[_to](), numa_remove_memblk[_from]() * numa_set_distance() and friends * numa_init() and all the numa_meminfo handling helpers called from it * dummy_numa_init() * memory_add_physaddr_to_nid() A new function x86_numa_init() is added and the content of numa_64.c::initmem_init() is moved into it. initmem_init() now simply calls x86_numa_init(). Constants and numa_off declaration are moved from numa_{32|64}.h to numa.h. This is code reorganization and doesn't involve any functional change. Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Yinghai Lu Cc: David Rientjes Cc: Thomas Gleixner Cc: "H. Peter Anvin" --- arch/x86/include/asm/numa.h | 16 ++ arch/x86/include/asm/numa_32.h | 2 - arch/x86/include/asm/numa_64.h | 19 -- arch/x86/mm/numa.c | 523 ++++++++++++++++++++++++++++++++++++++++- arch/x86/mm/numa_64.c | 503 +-------------------------------------- arch/x86/mm/numa_internal.h | 2 + 6 files changed, 539 insertions(+), 526 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h index db449c7d89b..74540865dd8 100644 --- a/arch/x86/include/asm/numa.h +++ b/arch/x86/include/asm/numa.h @@ -9,6 +9,16 @@ #ifdef CONFIG_NUMA #define NR_NODE_MEMBLKS (MAX_NUMNODES*2) +#define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT)) + +/* + * Too small node sizes may confuse the VM badly. Usually they + * result from BIOS bugs. 
So dont recognize nodes as standalone + * NUMA entities that have less than this amount of RAM listed: + */ +#define NODE_MIN_SIZE (4*1024*1024) + +extern int numa_off; /* * __apicid_to_node[] stores the raw mapping between physical apicid and @@ -68,4 +78,10 @@ static inline void numa_remove_cpu(int cpu) { } void debug_cpumask_set_cpu(int cpu, int node, bool enable); #endif +#ifdef CONFIG_NUMA_EMU +#define FAKE_NODE_MIN_SIZE ((u64)32 << 20) +#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL)) +void numa_emu_cmdline(char *); +#endif /* CONFIG_NUMA_EMU */ + #endif /* _ASM_X86_NUMA_H */ diff --git a/arch/x86/include/asm/numa_32.h b/arch/x86/include/asm/numa_32.h index 7e54b64623a..e7d6b825474 100644 --- a/arch/x86/include/asm/numa_32.h +++ b/arch/x86/include/asm/numa_32.h @@ -1,8 +1,6 @@ #ifndef _ASM_X86_NUMA_32_H #define _ASM_X86_NUMA_32_H -extern int numa_off; - #ifdef CONFIG_HIGHMEM extern void set_highmem_pages_init(void); #else diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h index 506dd050ff3..0c05f7ae46e 100644 --- a/arch/x86/include/asm/numa_64.h +++ b/arch/x86/include/asm/numa_64.h @@ -1,25 +1,6 @@ #ifndef _ASM_X86_NUMA_64_H #define _ASM_X86_NUMA_64_H -#define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT)) - -extern int numa_off; - extern unsigned long numa_free_all_bootmem(void); -#ifdef CONFIG_NUMA -/* - * Too small node sizes may confuse the VM badly. Usually they - * result from BIOS bugs. So dont recognize nodes as standalone - * NUMA entities that have less than this amount of RAM listed: - */ -#define NODE_MIN_SIZE (4*1024*1024) - -#ifdef CONFIG_NUMA_EMU -#define FAKE_NODE_MIN_SIZE ((u64)32 << 20) -#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL)) -void numa_emu_cmdline(char *); -#endif /* CONFIG_NUMA_EMU */ -#endif - #endif /* _ASM_X86_NUMA_64_H */ diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index cce174109ca..ed1daba5490 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -1,13 +1,42 @@ /* Common code for 32 and 64-bit NUMA */ -#include -#include +#include +#include +#include +#include #include -#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include #include +#include + +#include "numa_internal.h" int __initdata numa_off; nodemask_t numa_nodes_parsed __initdata; +#ifdef CONFIG_X86_64 +struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; +EXPORT_SYMBOL(node_data); + +static struct numa_meminfo numa_meminfo +#ifndef CONFIG_MEMORY_HOTPLUG +__initdata +#endif +; + +static int numa_distance_cnt; +static u8 *numa_distance; +#endif + static __init int numa_setup(char *opt) { if (!opt) @@ -105,6 +134,392 @@ void __init setup_node_to_cpumask_map(void) pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids); } +#ifdef CONFIG_X86_64 +static int __init numa_add_memblk_to(int nid, u64 start, u64 end, + struct numa_meminfo *mi) +{ + /* ignore zero length blks */ + if (start == end) + return 0; + + /* whine about and ignore invalid blks */ + if (start > end || nid < 0 || nid >= MAX_NUMNODES) { + pr_warning("NUMA: Warning: invalid memblk node %d (%Lx-%Lx)\n", + nid, start, end); + return 0; + } + + if (mi->nr_blks >= NR_NODE_MEMBLKS) { + pr_err("NUMA: too many memblk ranges\n"); + return -EINVAL; + } + + mi->blk[mi->nr_blks].start = start; + mi->blk[mi->nr_blks].end = end; + mi->blk[mi->nr_blks].nid = nid; + mi->nr_blks++; + return 0; +} + +/** + * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo + * @idx: Index of memblk to remove + * @mi: 
numa_meminfo to remove memblk from + * + * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and + * decrementing @mi->nr_blks. + */ +void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi) +{ + mi->nr_blks--; + memmove(&mi->blk[idx], &mi->blk[idx + 1], + (mi->nr_blks - idx) * sizeof(mi->blk[0])); +} + +/** + * numa_add_memblk - Add one numa_memblk to numa_meminfo + * @nid: NUMA node ID of the new memblk + * @start: Start address of the new memblk + * @end: End address of the new memblk + * + * Add a new memblk to the default numa_meminfo. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +int __init numa_add_memblk(int nid, u64 start, u64 end) +{ + return numa_add_memblk_to(nid, start, end, &numa_meminfo); +} + +/* Initialize bootmem allocator for a node */ +static void __init +setup_node_bootmem(int nid, unsigned long start, unsigned long end) +{ + const u64 nd_low = (u64)MAX_DMA_PFN << PAGE_SHIFT; + const u64 nd_high = (u64)max_pfn_mapped << PAGE_SHIFT; + const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE); + unsigned long nd_pa; + int tnid; + + /* + * Don't confuse VM with a node that doesn't have the + * minimum amount of memory: + */ + if (end && (end - start) < NODE_MIN_SIZE) + return; + + start = roundup(start, ZONE_ALIGN); + + printk(KERN_INFO "Initmem setup node %d %016lx-%016lx\n", + nid, start, end); + + /* + * Try to allocate node data on local node and then fall back to + * all nodes. Never allocate in DMA zone. + */ + nd_pa = memblock_x86_find_in_range_node(nid, nd_low, nd_high, + nd_size, SMP_CACHE_BYTES); + if (nd_pa == MEMBLOCK_ERROR) + nd_pa = memblock_find_in_range(nd_low, nd_high, + nd_size, SMP_CACHE_BYTES); + if (nd_pa == MEMBLOCK_ERROR) { + pr_err("Cannot find %lu bytes in node %d\n", nd_size, nid); + return; + } + memblock_x86_reserve_range(nd_pa, nd_pa + nd_size, "NODE_DATA"); + + /* report and initialize */ + printk(KERN_INFO " NODE_DATA [%016lx - %016lx]\n", + nd_pa, nd_pa + nd_size - 1); + tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT); + if (tnid != nid) + printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nid, tnid); + + node_data[nid] = __va(nd_pa); + memset(NODE_DATA(nid), 0, sizeof(pg_data_t)); + NODE_DATA(nid)->node_id = nid; + NODE_DATA(nid)->node_start_pfn = start >> PAGE_SHIFT; + NODE_DATA(nid)->node_spanned_pages = (end - start) >> PAGE_SHIFT; + + node_set_online(nid); +} + +/** + * numa_cleanup_meminfo - Cleanup a numa_meminfo + * @mi: numa_meminfo to clean up + * + * Sanitize @mi by merging and removing unncessary memblks. Also check for + * conflicts and clear unused memblks. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +int __init numa_cleanup_meminfo(struct numa_meminfo *mi) +{ + const u64 low = 0; + const u64 high = (u64)max_pfn << PAGE_SHIFT; + int i, j, k; + + for (i = 0; i < mi->nr_blks; i++) { + struct numa_memblk *bi = &mi->blk[i]; + + /* make sure all blocks are inside the limits */ + bi->start = max(bi->start, low); + bi->end = min(bi->end, high); + + /* and there's no empty block */ + if (bi->start >= bi->end) { + numa_remove_memblk_from(i--, mi); + continue; + } + + for (j = i + 1; j < mi->nr_blks; j++) { + struct numa_memblk *bj = &mi->blk[j]; + unsigned long start, end; + + /* + * See whether there are overlapping blocks. Whine + * about but allow overlaps of the same nid. They + * will be merged below. 
+ */ + if (bi->end > bj->start && bi->start < bj->end) { + if (bi->nid != bj->nid) { + pr_err("NUMA: node %d (%Lx-%Lx) overlaps with node %d (%Lx-%Lx)\n", + bi->nid, bi->start, bi->end, + bj->nid, bj->start, bj->end); + return -EINVAL; + } + pr_warning("NUMA: Warning: node %d (%Lx-%Lx) overlaps with itself (%Lx-%Lx)\n", + bi->nid, bi->start, bi->end, + bj->start, bj->end); + } + + /* + * Join together blocks on the same node, holes + * between which don't overlap with memory on other + * nodes. + */ + if (bi->nid != bj->nid) + continue; + start = max(min(bi->start, bj->start), low); + end = min(max(bi->end, bj->end), high); + for (k = 0; k < mi->nr_blks; k++) { + struct numa_memblk *bk = &mi->blk[k]; + + if (bi->nid == bk->nid) + continue; + if (start < bk->end && end > bk->start) + break; + } + if (k < mi->nr_blks) + continue; + printk(KERN_INFO "NUMA: Node %d [%Lx,%Lx) + [%Lx,%Lx) -> [%lx,%lx)\n", + bi->nid, bi->start, bi->end, bj->start, bj->end, + start, end); + bi->start = start; + bi->end = end; + numa_remove_memblk_from(j--, mi); + } + } + + for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) { + mi->blk[i].start = mi->blk[i].end = 0; + mi->blk[i].nid = NUMA_NO_NODE; + } + + return 0; +} + +/* + * Set nodes, which have memory in @mi, in *@nodemask. + */ +static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask, + const struct numa_meminfo *mi) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(mi->blk); i++) + if (mi->blk[i].start != mi->blk[i].end && + mi->blk[i].nid != NUMA_NO_NODE) + node_set(mi->blk[i].nid, *nodemask); +} + +/** + * numa_reset_distance - Reset NUMA distance table + * + * The current table is freed. The next numa_set_distance() call will + * create a new one. + */ +void __init numa_reset_distance(void) +{ + size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]); + + /* numa_distance could be 1LU marking allocation failure, test cnt */ + if (numa_distance_cnt) + memblock_x86_free_range(__pa(numa_distance), + __pa(numa_distance) + size); + numa_distance_cnt = 0; + numa_distance = NULL; /* enable table creation */ +} + +static int __init numa_alloc_distance(void) +{ + nodemask_t nodes_parsed; + size_t size; + int i, j, cnt = 0; + u64 phys; + + /* size the new table and allocate it */ + nodes_parsed = numa_nodes_parsed; + numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo); + + for_each_node_mask(i, nodes_parsed) + cnt = i; + cnt++; + size = cnt * cnt * sizeof(numa_distance[0]); + + phys = memblock_find_in_range(0, (u64)max_pfn_mapped << PAGE_SHIFT, + size, PAGE_SIZE); + if (phys == MEMBLOCK_ERROR) { + pr_warning("NUMA: Warning: can't allocate distance table!\n"); + /* don't retry until explicitly reset */ + numa_distance = (void *)1LU; + return -ENOMEM; + } + memblock_x86_reserve_range(phys, phys + size, "NUMA DIST"); + + numa_distance = __va(phys); + numa_distance_cnt = cnt; + + /* fill with the default distances */ + for (i = 0; i < cnt; i++) + for (j = 0; j < cnt; j++) + numa_distance[i * cnt + j] = i == j ? + LOCAL_DISTANCE : REMOTE_DISTANCE; + printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt); + + return 0; +} + +/** + * numa_set_distance - Set NUMA distance from one NUMA to another + * @from: the 'from' node to set distance + * @to: the 'to' node to set distance + * @distance: NUMA distance + * + * Set the distance from node @from to @to to @distance. If distance table + * doesn't exist, one which is large enough to accommodate all the currently + * known nodes will be created. 
+ * + * If such table cannot be allocated, a warning is printed and further + * calls are ignored until the distance table is reset with + * numa_reset_distance(). + * + * If @from or @to is higher than the highest known node at the time of + * table creation or @distance doesn't make sense, the call is ignored. + * This is to allow simplification of specific NUMA config implementations. + */ +void __init numa_set_distance(int from, int to, int distance) +{ + if (!numa_distance && numa_alloc_distance() < 0) + return; + + if (from >= numa_distance_cnt || to >= numa_distance_cnt) { + printk_once(KERN_DEBUG "NUMA: Debug: distance out of bound, from=%d to=%d distance=%d\n", + from, to, distance); + return; + } + + if ((u8)distance != distance || + (from == to && distance != LOCAL_DISTANCE)) { + pr_warn_once("NUMA: Warning: invalid distance parameter, from=%d to=%d distance=%d\n", + from, to, distance); + return; + } + + numa_distance[from * numa_distance_cnt + to] = distance; +} + +int __node_distance(int from, int to) +{ + if (from >= numa_distance_cnt || to >= numa_distance_cnt) + return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE; + return numa_distance[from * numa_distance_cnt + to]; +} +EXPORT_SYMBOL(__node_distance); + +/* + * Sanity check to catch more bad NUMA configurations (they are amazingly + * common). Make sure the nodes cover all memory. + */ +static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi) +{ + unsigned long numaram, e820ram; + int i; + + numaram = 0; + for (i = 0; i < mi->nr_blks; i++) { + unsigned long s = mi->blk[i].start >> PAGE_SHIFT; + unsigned long e = mi->blk[i].end >> PAGE_SHIFT; + numaram += e - s; + numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e); + if ((long)numaram < 0) + numaram = 0; + } + + e820ram = max_pfn - (memblock_x86_hole_size(0, + max_pfn << PAGE_SHIFT) >> PAGE_SHIFT); + /* We seem to lose 3 pages somewhere. Allow 1M of slack. */ + if ((long)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) { + printk(KERN_ERR "NUMA: nodes only cover %luMB of your %luMB e820 RAM. Not used.\n", + (numaram << PAGE_SHIFT) >> 20, + (e820ram << PAGE_SHIFT) >> 20); + return false; + } + return true; +} + +static int __init numa_register_memblks(struct numa_meminfo *mi) +{ + int i, nid; + + /* Account for nodes with cpus and no memory */ + node_possible_map = numa_nodes_parsed; + numa_nodemask_from_meminfo(&node_possible_map, mi); + if (WARN_ON(nodes_empty(node_possible_map))) + return -EINVAL; + + for (i = 0; i < mi->nr_blks; i++) + memblock_x86_register_active_regions(mi->blk[i].nid, + mi->blk[i].start >> PAGE_SHIFT, + mi->blk[i].end >> PAGE_SHIFT); + + /* for out of order entries */ + sort_node_map(); + if (!numa_meminfo_cover_memory(mi)) + return -EINVAL; + + /* Finally register nodes. */ + for_each_node_mask(nid, node_possible_map) { + u64 start = (u64)max_pfn << PAGE_SHIFT; + u64 end = 0; + + for (i = 0; i < mi->nr_blks; i++) { + if (nid != mi->blk[i].nid) + continue; + start = min(mi->blk[i].start, start); + end = max(mi->blk[i].end, end); + } + + if (start < end) + setup_node_bootmem(nid, start, end); + } + + return 0; +} +#endif + /* * There are unfortunately some poorly designed mainboards around that * only connect memory to a single CPU. 
This breaks the 1:1 cpu->node @@ -127,6 +542,93 @@ void __init numa_init_array(void) } } +#ifdef CONFIG_X86_64 +static int __init numa_init(int (*init_func)(void)) +{ + int i; + int ret; + + for (i = 0; i < MAX_LOCAL_APIC; i++) + set_apicid_to_node(i, NUMA_NO_NODE); + + nodes_clear(numa_nodes_parsed); + nodes_clear(node_possible_map); + nodes_clear(node_online_map); + memset(&numa_meminfo, 0, sizeof(numa_meminfo)); + remove_all_active_ranges(); + numa_reset_distance(); + + ret = init_func(); + if (ret < 0) + return ret; + ret = numa_cleanup_meminfo(&numa_meminfo); + if (ret < 0) + return ret; + + numa_emulation(&numa_meminfo, numa_distance_cnt); + + ret = numa_register_memblks(&numa_meminfo); + if (ret < 0) + return ret; + + for (i = 0; i < nr_cpu_ids; i++) { + int nid = early_cpu_to_node(i); + + if (nid == NUMA_NO_NODE) + continue; + if (!node_online(nid)) + numa_clear_node(i); + } + numa_init_array(); + return 0; +} + +/** + * dummy_numa_init - Fallback dummy NUMA init + * + * Used if there's no underlying NUMA architecture, NUMA initialization + * fails, or NUMA is disabled on the command line. + * + * Must online at least one node and add memory blocks that cover all + * allowed memory. This function must not fail. + */ +static int __init dummy_numa_init(void) +{ + printk(KERN_INFO "%s\n", + numa_off ? "NUMA turned off" : "No NUMA configuration found"); + printk(KERN_INFO "Faking a node at %016lx-%016lx\n", + 0LU, max_pfn << PAGE_SHIFT); + + node_set(0, numa_nodes_parsed); + numa_add_memblk(0, 0, (u64)max_pfn << PAGE_SHIFT); + + return 0; +} + +/** + * x86_numa_init - Initialize NUMA + * + * Try each configured NUMA initialization method until one succeeds. The + * last fallback is dummy single node config encomapssing whole memory and + * never fails. + */ +void __init x86_numa_init(void) +{ + if (!numa_off) { +#ifdef CONFIG_ACPI_NUMA + if (!numa_init(x86_acpi_numa_init)) + return; +#endif +#ifdef CONFIG_AMD_NUMA + if (!numa_init(amd_numa_init)) + return; +#endif + } + + numa_init(dummy_numa_init); +} +#endif + static __init int find_near_online_node(int node) { int n, val; @@ -292,3 +794,18 @@ const struct cpumask *cpumask_of_node(int node) EXPORT_SYMBOL(cpumask_of_node); #endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ + +#if defined(CONFIG_X86_64) && defined(CONFIG_MEMORY_HOTPLUG) +int memory_add_physaddr_to_nid(u64 start) +{ + struct numa_meminfo *mi = &numa_meminfo; + int nid = mi->blk[0].nid; + int i; + + for (i = 0; i < mi->nr_blks; i++) + if (mi->blk[i].start <= start && mi->blk[i].end > start) + nid = mi->blk[i].nid; + return nid; +} +EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); +#endif diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 70bd8221f92..dd27f401f0a 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -2,499 +2,13 @@ * Generic VM initialization for x86-64 NUMA setups. * Copyright 2002,2003 Andi Kleen, SuSE Labs. 
*/ -#include -#include -#include -#include #include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include #include "numa_internal.h" -struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; -EXPORT_SYMBOL(node_data); - -static struct numa_meminfo numa_meminfo -#ifndef CONFIG_MEMORY_HOTPLUG -__initdata -#endif -; - -static int numa_distance_cnt; -static u8 *numa_distance; - -static int __init numa_add_memblk_to(int nid, u64 start, u64 end, - struct numa_meminfo *mi) -{ - /* ignore zero length blks */ - if (start == end) - return 0; - - /* whine about and ignore invalid blks */ - if (start > end || nid < 0 || nid >= MAX_NUMNODES) { - pr_warning("NUMA: Warning: invalid memblk node %d (%Lx-%Lx)\n", - nid, start, end); - return 0; - } - - if (mi->nr_blks >= NR_NODE_MEMBLKS) { - pr_err("NUMA: too many memblk ranges\n"); - return -EINVAL; - } - - mi->blk[mi->nr_blks].start = start; - mi->blk[mi->nr_blks].end = end; - mi->blk[mi->nr_blks].nid = nid; - mi->nr_blks++; - return 0; -} - -/** - * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo - * @idx: Index of memblk to remove - * @mi: numa_meminfo to remove memblk from - * - * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and - * decrementing @mi->nr_blks. - */ -void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi) -{ - mi->nr_blks--; - memmove(&mi->blk[idx], &mi->blk[idx + 1], - (mi->nr_blks - idx) * sizeof(mi->blk[0])); -} - -/** - * numa_add_memblk - Add one numa_memblk to numa_meminfo - * @nid: NUMA node ID of the new memblk - * @start: Start address of the new memblk - * @end: End address of the new memblk - * - * Add a new memblk to the default numa_meminfo. - * - * RETURNS: - * 0 on success, -errno on failure. - */ -int __init numa_add_memblk(int nid, u64 start, u64 end) -{ - return numa_add_memblk_to(nid, start, end, &numa_meminfo); -} - -/* Initialize bootmem allocator for a node */ -static void __init -setup_node_bootmem(int nid, unsigned long start, unsigned long end) -{ - const u64 nd_low = (u64)MAX_DMA_PFN << PAGE_SHIFT; - const u64 nd_high = (u64)max_pfn_mapped << PAGE_SHIFT; - const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE); - unsigned long nd_pa; - int tnid; - - /* - * Don't confuse VM with a node that doesn't have the - * minimum amount of memory: - */ - if (end && (end - start) < NODE_MIN_SIZE) - return; - - start = roundup(start, ZONE_ALIGN); - - printk(KERN_INFO "Initmem setup node %d %016lx-%016lx\n", - nid, start, end); - - /* - * Try to allocate node data on local node and then fall back to - * all nodes. Never allocate in DMA zone. 
- */ - nd_pa = memblock_x86_find_in_range_node(nid, nd_low, nd_high, - nd_size, SMP_CACHE_BYTES); - if (nd_pa == MEMBLOCK_ERROR) - nd_pa = memblock_find_in_range(nd_low, nd_high, - nd_size, SMP_CACHE_BYTES); - if (nd_pa == MEMBLOCK_ERROR) { - pr_err("Cannot find %lu bytes in node %d\n", nd_size, nid); - return; - } - memblock_x86_reserve_range(nd_pa, nd_pa + nd_size, "NODE_DATA"); - - /* report and initialize */ - printk(KERN_INFO " NODE_DATA [%016lx - %016lx]\n", - nd_pa, nd_pa + nd_size - 1); - tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT); - if (tnid != nid) - printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nid, tnid); - - node_data[nid] = __va(nd_pa); - memset(NODE_DATA(nid), 0, sizeof(pg_data_t)); - NODE_DATA(nid)->node_id = nid; - NODE_DATA(nid)->node_start_pfn = start >> PAGE_SHIFT; - NODE_DATA(nid)->node_spanned_pages = (end - start) >> PAGE_SHIFT; - - node_set_online(nid); -} - -/** - * numa_cleanup_meminfo - Cleanup a numa_meminfo - * @mi: numa_meminfo to clean up - * - * Sanitize @mi by merging and removing unncessary memblks. Also check for - * conflicts and clear unused memblks. - * - * RETURNS: - * 0 on success, -errno on failure. - */ -int __init numa_cleanup_meminfo(struct numa_meminfo *mi) -{ - const u64 low = 0; - const u64 high = (u64)max_pfn << PAGE_SHIFT; - int i, j, k; - - for (i = 0; i < mi->nr_blks; i++) { - struct numa_memblk *bi = &mi->blk[i]; - - /* make sure all blocks are inside the limits */ - bi->start = max(bi->start, low); - bi->end = min(bi->end, high); - - /* and there's no empty block */ - if (bi->start >= bi->end) { - numa_remove_memblk_from(i--, mi); - continue; - } - - for (j = i + 1; j < mi->nr_blks; j++) { - struct numa_memblk *bj = &mi->blk[j]; - unsigned long start, end; - - /* - * See whether there are overlapping blocks. Whine - * about but allow overlaps of the same nid. They - * will be merged below. - */ - if (bi->end > bj->start && bi->start < bj->end) { - if (bi->nid != bj->nid) { - pr_err("NUMA: node %d (%Lx-%Lx) overlaps with node %d (%Lx-%Lx)\n", - bi->nid, bi->start, bi->end, - bj->nid, bj->start, bj->end); - return -EINVAL; - } - pr_warning("NUMA: Warning: node %d (%Lx-%Lx) overlaps with itself (%Lx-%Lx)\n", - bi->nid, bi->start, bi->end, - bj->start, bj->end); - } - - /* - * Join together blocks on the same node, holes - * between which don't overlap with memory on other - * nodes. - */ - if (bi->nid != bj->nid) - continue; - start = max(min(bi->start, bj->start), low); - end = min(max(bi->end, bj->end), high); - for (k = 0; k < mi->nr_blks; k++) { - struct numa_memblk *bk = &mi->blk[k]; - - if (bi->nid == bk->nid) - continue; - if (start < bk->end && end > bk->start) - break; - } - if (k < mi->nr_blks) - continue; - printk(KERN_INFO "NUMA: Node %d [%Lx,%Lx) + [%Lx,%Lx) -> [%lx,%lx)\n", - bi->nid, bi->start, bi->end, bj->start, bj->end, - start, end); - bi->start = start; - bi->end = end; - numa_remove_memblk_from(j--, mi); - } - } - - for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) { - mi->blk[i].start = mi->blk[i].end = 0; - mi->blk[i].nid = NUMA_NO_NODE; - } - - return 0; -} - -/* - * Set nodes, which have memory in @mi, in *@nodemask. - */ -static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask, - const struct numa_meminfo *mi) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(mi->blk); i++) - if (mi->blk[i].start != mi->blk[i].end && - mi->blk[i].nid != NUMA_NO_NODE) - node_set(mi->blk[i].nid, *nodemask); -} - -/** - * numa_reset_distance - Reset NUMA distance table - * - * The current table is freed. 
The next numa_set_distance() call will - * create a new one. - */ -void __init numa_reset_distance(void) -{ - size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]); - - /* numa_distance could be 1LU marking allocation failure, test cnt */ - if (numa_distance_cnt) - memblock_x86_free_range(__pa(numa_distance), - __pa(numa_distance) + size); - numa_distance_cnt = 0; - numa_distance = NULL; /* enable table creation */ -} - -static int __init numa_alloc_distance(void) -{ - nodemask_t nodes_parsed; - size_t size; - int i, j, cnt = 0; - u64 phys; - - /* size the new table and allocate it */ - nodes_parsed = numa_nodes_parsed; - numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo); - - for_each_node_mask(i, nodes_parsed) - cnt = i; - cnt++; - size = cnt * cnt * sizeof(numa_distance[0]); - - phys = memblock_find_in_range(0, (u64)max_pfn_mapped << PAGE_SHIFT, - size, PAGE_SIZE); - if (phys == MEMBLOCK_ERROR) { - pr_warning("NUMA: Warning: can't allocate distance table!\n"); - /* don't retry until explicitly reset */ - numa_distance = (void *)1LU; - return -ENOMEM; - } - memblock_x86_reserve_range(phys, phys + size, "NUMA DIST"); - - numa_distance = __va(phys); - numa_distance_cnt = cnt; - - /* fill with the default distances */ - for (i = 0; i < cnt; i++) - for (j = 0; j < cnt; j++) - numa_distance[i * cnt + j] = i == j ? - LOCAL_DISTANCE : REMOTE_DISTANCE; - printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt); - - return 0; -} - -/** - * numa_set_distance - Set NUMA distance from one NUMA to another - * @from: the 'from' node to set distance - * @to: the 'to' node to set distance - * @distance: NUMA distance - * - * Set the distance from node @from to @to to @distance. If distance table - * doesn't exist, one which is large enough to accommodate all the currently - * known nodes will be created. - * - * If such table cannot be allocated, a warning is printed and further - * calls are ignored until the distance table is reset with - * numa_reset_distance(). - * - * If @from or @to is higher than the highest known node at the time of - * table creation or @distance doesn't make sense, the call is ignored. - * This is to allow simplification of specific NUMA config implementations. - */ -void __init numa_set_distance(int from, int to, int distance) -{ - if (!numa_distance && numa_alloc_distance() < 0) - return; - - if (from >= numa_distance_cnt || to >= numa_distance_cnt) { - printk_once(KERN_DEBUG "NUMA: Debug: distance out of bound, from=%d to=%d distance=%d\n", - from, to, distance); - return; - } - - if ((u8)distance != distance || - (from == to && distance != LOCAL_DISTANCE)) { - pr_warn_once("NUMA: Warning: invalid distance parameter, from=%d to=%d distance=%d\n", - from, to, distance); - return; - } - - numa_distance[from * numa_distance_cnt + to] = distance; -} - -int __node_distance(int from, int to) -{ - if (from >= numa_distance_cnt || to >= numa_distance_cnt) - return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE; - return numa_distance[from * numa_distance_cnt + to]; -} -EXPORT_SYMBOL(__node_distance); - -/* - * Sanity check to catch more bad NUMA configurations (they are amazingly - * common). Make sure the nodes cover all memory. 
- */ -static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi) -{ - unsigned long numaram, e820ram; - int i; - - numaram = 0; - for (i = 0; i < mi->nr_blks; i++) { - unsigned long s = mi->blk[i].start >> PAGE_SHIFT; - unsigned long e = mi->blk[i].end >> PAGE_SHIFT; - numaram += e - s; - numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e); - if ((long)numaram < 0) - numaram = 0; - } - - e820ram = max_pfn - (memblock_x86_hole_size(0, - max_pfn << PAGE_SHIFT) >> PAGE_SHIFT); - /* We seem to lose 3 pages somewhere. Allow 1M of slack. */ - if ((long)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) { - printk(KERN_ERR "NUMA: nodes only cover %luMB of your %luMB e820 RAM. Not used.\n", - (numaram << PAGE_SHIFT) >> 20, - (e820ram << PAGE_SHIFT) >> 20); - return false; - } - return true; -} - -static int __init numa_register_memblks(struct numa_meminfo *mi) -{ - int i, nid; - - /* Account for nodes with cpus and no memory */ - node_possible_map = numa_nodes_parsed; - numa_nodemask_from_meminfo(&node_possible_map, mi); - if (WARN_ON(nodes_empty(node_possible_map))) - return -EINVAL; - - for (i = 0; i < mi->nr_blks; i++) - memblock_x86_register_active_regions(mi->blk[i].nid, - mi->blk[i].start >> PAGE_SHIFT, - mi->blk[i].end >> PAGE_SHIFT); - - /* for out of order entries */ - sort_node_map(); - if (!numa_meminfo_cover_memory(mi)) - return -EINVAL; - - /* Finally register nodes. */ - for_each_node_mask(nid, node_possible_map) { - u64 start = (u64)max_pfn << PAGE_SHIFT; - u64 end = 0; - - for (i = 0; i < mi->nr_blks; i++) { - if (nid != mi->blk[i].nid) - continue; - start = min(mi->blk[i].start, start); - end = max(mi->blk[i].end, end); - } - - if (start < end) - setup_node_bootmem(nid, start, end); - } - - return 0; -} - -/** - * dummy_numma_init - Fallback dummy NUMA init - * - * Used if there's no underlying NUMA architecture, NUMA initialization - * fails, or NUMA is disabled on the command line. - * - * Must online at least one node and add memory blocks that cover all - * allowed memory. This function must not fail. - */ -static int __init dummy_numa_init(void) -{ - printk(KERN_INFO "%s\n", - numa_off ? 
"NUMA turned off" : "No NUMA configuration found"); - printk(KERN_INFO "Faking a node at %016lx-%016lx\n", - 0LU, max_pfn << PAGE_SHIFT); - - node_set(0, numa_nodes_parsed); - numa_add_memblk(0, 0, (u64)max_pfn << PAGE_SHIFT); - - return 0; -} - -static int __init numa_init(int (*init_func)(void)) -{ - int i; - int ret; - - for (i = 0; i < MAX_LOCAL_APIC; i++) - set_apicid_to_node(i, NUMA_NO_NODE); - - nodes_clear(numa_nodes_parsed); - nodes_clear(node_possible_map); - nodes_clear(node_online_map); - memset(&numa_meminfo, 0, sizeof(numa_meminfo)); - remove_all_active_ranges(); - numa_reset_distance(); - - ret = init_func(); - if (ret < 0) - return ret; - ret = numa_cleanup_meminfo(&numa_meminfo); - if (ret < 0) - return ret; - - numa_emulation(&numa_meminfo, numa_distance_cnt); - - ret = numa_register_memblks(&numa_meminfo); - if (ret < 0) - return ret; - - for (i = 0; i < nr_cpu_ids; i++) { - int nid = early_cpu_to_node(i); - - if (nid == NUMA_NO_NODE) - continue; - if (!node_online(nid)) - numa_clear_node(i); - } - numa_init_array(); - return 0; -} - void __init initmem_init(void) { - if (!numa_off) { -#ifdef CONFIG_ACPI_NUMA - if (!numa_init(x86_acpi_numa_init)) - return; -#endif -#ifdef CONFIG_AMD_NUMA - if (!numa_init(amd_numa_init)) - return; -#endif - } - - numa_init(dummy_numa_init); + x86_numa_init(); } unsigned long __init numa_free_all_bootmem(void) @@ -509,18 +23,3 @@ unsigned long __init numa_free_all_bootmem(void) return pages; } - -#ifdef CONFIG_MEMORY_HOTPLUG -int memory_add_physaddr_to_nid(u64 start) -{ - struct numa_meminfo *mi = &numa_meminfo; - int nid = mi->blk[0].nid; - int i; - - for (i = 0; i < mi->nr_blks; i++) - if (mi->blk[i].start <= start && mi->blk[i].end > start) - nid = mi->blk[i].nid; - return nid; -} -EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); -#endif diff --git a/arch/x86/mm/numa_internal.h b/arch/x86/mm/numa_internal.h index ef2d97377d7..ad86ec91e64 100644 --- a/arch/x86/mm/numa_internal.h +++ b/arch/x86/mm/numa_internal.h @@ -19,6 +19,8 @@ void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi); int __init numa_cleanup_meminfo(struct numa_meminfo *mi); void __init numa_reset_distance(void); +void __init x86_numa_init(void); + #ifdef CONFIG_NUMA_EMU void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt); -- cgit v1.2.3 From 744baba0c4072b04664952a89292e4708eaf949a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 2 May 2011 14:18:53 +0200 Subject: x86, NUMA: Enable build of generic NUMA init code on 32bit Generic NUMA init code was moved to numa.c from numa_64.c but is still guaraded by CONFIG_X86_64. This patch removes the compile guard and enables compiling on 32bit. * numa_add_memblk() and numa_set_distance() clash with the shim implementation in numa_32.c and are left out. * memory_add_physaddr_to_nid() clashes with 32bit implementation and is left out. * MAX_DMA_PFN definition in dma.h moved out of !CONFIG_X86_32. * node_data definition in numa_32.c removed in favor of the one in numa.c. There are places where ulong is assumed to be 64bit. The next patch will fix them up. Note that although the code is compiled it isn't used yet and this patch doesn't cause any functional change. Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Yinghai Lu Cc: David Rientjes Cc: Thomas Gleixner Cc: "H. 
Peter Anvin" --- arch/x86/include/asm/dma.h | 6 +++--- arch/x86/mm/numa.c | 10 ++++------ arch/x86/mm/numa_32.c | 3 --- 3 files changed, 7 insertions(+), 12 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/dma.h b/arch/x86/include/asm/dma.h index 057099e5fab..d1a314b610f 100644 --- a/arch/x86/include/asm/dma.h +++ b/arch/x86/include/asm/dma.h @@ -69,6 +69,9 @@ #define MAX_DMA_CHANNELS 8 +/* 16MB ISA DMA zone */ +#define MAX_DMA_PFN ((16 * 1024 * 1024) >> PAGE_SHIFT) + #ifdef CONFIG_X86_32 /* The maximum address that we can perform a DMA transfer to on this platform */ @@ -76,9 +79,6 @@ #else -/* 16MB ISA DMA zone */ -#define MAX_DMA_PFN ((16 * 1024 * 1024) >> PAGE_SHIFT) - /* 4GB broken PCI/AGP hardware bus master zone */ #define MAX_DMA32_PFN ((4UL * 1024 * 1024 * 1024) >> PAGE_SHIFT) diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index ed1daba5490..c400f3b2b93 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -23,7 +23,6 @@ int __initdata numa_off; nodemask_t numa_nodes_parsed __initdata; -#ifdef CONFIG_X86_64 struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; EXPORT_SYMBOL(node_data); @@ -35,7 +34,6 @@ __initdata static int numa_distance_cnt; static u8 *numa_distance; -#endif static __init int numa_setup(char *opt) { @@ -134,7 +132,6 @@ void __init setup_node_to_cpumask_map(void) pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids); } -#ifdef CONFIG_X86_64 static int __init numa_add_memblk_to(int nid, u64 start, u64 end, struct numa_meminfo *mi) { @@ -176,6 +173,7 @@ void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi) (mi->nr_blks - idx) * sizeof(mi->blk[0])); } +#ifdef CONFIG_X86_64 /** * numa_add_memblk - Add one numa_memblk to numa_meminfo * @nid: NUMA node ID of the new memblk @@ -191,6 +189,7 @@ int __init numa_add_memblk(int nid, u64 start, u64 end) { return numa_add_memblk_to(nid, start, end, &numa_meminfo); } +#endif /* Initialize bootmem allocator for a node */ static void __init @@ -402,6 +401,7 @@ static int __init numa_alloc_distance(void) return 0; } +#ifdef CONFIG_X86_64 /** * numa_set_distance - Set NUMA distance from one NUMA to another * @from: the 'from' node to set distance @@ -440,6 +440,7 @@ void __init numa_set_distance(int from, int to, int distance) numa_distance[from * numa_distance_cnt + to] = distance; } +#endif int __node_distance(int from, int to) { @@ -518,7 +519,6 @@ static int __init numa_register_memblks(struct numa_meminfo *mi) return 0; } -#endif /* * There are unfortunately some poorly designed mainboards around that @@ -542,7 +542,6 @@ void __init numa_init_array(void) } } -#ifdef CONFIG_X86_64 static int __init numa_init(int (*init_func)(void)) { int i; @@ -627,7 +626,6 @@ void __init x86_numa_init(void) numa_init(dummy_numa_init); } -#endif static __init int find_near_online_node(int node) { diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 14135e52cef..975a76f622b 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -41,9 +41,6 @@ #include #include -struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; -EXPORT_SYMBOL(node_data); - /* * numa interface - we expect the numa architecture specific code to have * populated the following initialisation. 
-- cgit v1.2.3 From bd6709a91a593d8fe35d08da542e9f93bb74a304 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 2 May 2011 17:24:48 +0200 Subject: x86, NUMA: Make 32bit use common NUMA init path With both _numa_init() methods converted and the rest of init code adjusted, numa_32.c now can switch from the 32bit only init code to the common one in numa.c. * Shim get_memcfg_*()'s are dropped and initmem_init() calls x86_numa_init(), which is updated to handle NUMAQ. * All boilerplate operations including node range limiting, pgdat alloc/init are handled by numa_init(). 32bit only implementation is removed. * 32bit numa_add_memblk(), numa_set_distance() and memory_add_physaddr_to_nid() removed and common versions in numa_32.c enabled for 32bit. This change causes the following behavior changes. * NODE_DATA()->node_start_pfn/node_spanned_pages properly initialized for 32bit too. * Much more sanity checks and configuration cleanups. * Proper handling of node distances. * The same NUMA init messages as 64bit. Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Yinghai Lu Cc: David Rientjes Cc: Thomas Gleixner Cc: "H. Peter Anvin" --- arch/x86/include/asm/topology.h | 7 -- arch/x86/mm/numa.c | 10 +- arch/x86/mm/numa_32.c | 231 +--------------------------------------- 3 files changed, 7 insertions(+), 241 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 8dba76972fd..c00692476e9 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -93,18 +93,11 @@ extern void setup_node_to_cpumask_map(void); #define pcibus_to_node(bus) __pcibus_to_node(bus) #ifdef CONFIG_X86_32 -extern unsigned long node_start_pfn[]; -extern unsigned long node_end_pfn[]; -#define node_has_online_mem(nid) (node_start_pfn[nid] != node_end_pfn[nid]) - # define SD_CACHE_NICE_TRIES 1 # define SD_IDLE_IDX 1 - #else - # define SD_CACHE_NICE_TRIES 2 # define SD_IDLE_IDX 2 - #endif /* sched_domains SD_NODE_INIT for NUMA machines */ diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index a72317ae74c..56ed714ed24 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -173,7 +173,6 @@ void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi) (mi->nr_blks - idx) * sizeof(mi->blk[0])); } -#ifdef CONFIG_X86_64 /** * numa_add_memblk - Add one numa_memblk to numa_meminfo * @nid: NUMA node ID of the new memblk @@ -189,7 +188,6 @@ int __init numa_add_memblk(int nid, u64 start, u64 end) { return numa_add_memblk_to(nid, start, end, &numa_meminfo); } -#endif /* Initialize bootmem allocator for a node */ static void __init setup_node_bootmem(int nid, u64 start, u64 end) @@ -413,7 +411,6 @@ static int __init numa_alloc_distance(void) return 0; } -#ifdef CONFIG_X86_64 /** * numa_set_distance - Set NUMA distance from one NUMA to another * @from: the 'from' node to set distance @@ -452,7 +449,6 @@ void __init numa_set_distance(int from, int to, int distance) numa_distance[from * numa_distance_cnt + to] = distance; } -#endif int __node_distance(int from, int to) { @@ -626,6 +622,10 @@ static int __init dummy_numa_init(void) void __init x86_numa_init(void) { if (!numa_off) { +#ifdef CONFIG_X86_NUMAQ + if (!numa_init(numaq_numa_init)) + return; +#endif #ifdef CONFIG_ACPI_NUMA if (!numa_init(x86_acpi_numa_init)) return; @@ -805,7 +805,7 @@ EXPORT_SYMBOL(cpumask_of_node); #endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ -#if defined(CONFIG_X86_64) && defined(CONFIG_MEMORY_HOTPLUG) +#ifdef CONFIG_MEMORY_HOTPLUG int memory_add_physaddr_to_nid(u64 
start) { struct numa_meminfo *mi = &numa_meminfo; diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index fbd558fe10b..849a975d3fa 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -22,36 +22,11 @@ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -#include #include #include -#include -#include -#include -#include #include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -/* - * numa interface - we expect the numa architecture specific code to have - * populated the following initialisation. - * - * 1) node_online_map - the map of all nodes configured (online) in the system - * 2) node_start_pfn - the starting page frame number for a node - * 3) node_end_pfn - the ending page fram number for a node - */ -unsigned long node_start_pfn[MAX_NUMNODES] __read_mostly; -unsigned long node_end_pfn[MAX_NUMNODES] __read_mostly; +#include "numa_internal.h" #ifdef CONFIG_DISCONTIGMEM /* @@ -96,7 +71,6 @@ unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn, } #endif -extern unsigned long find_max_low_pfn(void); extern unsigned long highend_pfn, highstart_pfn; #define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE) @@ -104,68 +78,6 @@ extern unsigned long highend_pfn, highstart_pfn; static void *node_remap_start_vaddr[MAX_NUMNODES]; void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); -/* - * FLAT - support for basic PC memory model with discontig enabled, essentially - * a single node with all available processors in it with a flat - * memory map. - */ -static int __init get_memcfg_numa_flat(void) -{ - printk(KERN_DEBUG "NUMA - single node, flat memory mode\n"); - - node_start_pfn[0] = 0; - node_end_pfn[0] = max_pfn; - memblock_x86_register_active_regions(0, 0, max_pfn); - - /* Indicate there is one node available. */ - nodes_clear(node_online_map); - node_set_online(0); - return 1; -} - -/* - * Find the highest page frame number we have available for the node - */ -static void __init propagate_e820_map_node(int nid) -{ - if (node_end_pfn[nid] > max_pfn) - node_end_pfn[nid] = max_pfn; - /* - * if a user has given mem=XXXX, then we need to make sure - * that the node _starts_ before that, too, not just ends - */ - if (node_start_pfn[nid] > max_pfn) - node_start_pfn[nid] = max_pfn; - BUG_ON(node_start_pfn[nid] > node_end_pfn[nid]); -} - -/* - * Allocate memory for the pg_data_t for this node via a crude pre-bootmem - * method. For node zero take this from the bottom of memory, for - * subsequent nodes place them at node_remap_start_vaddr which contains - * node local data in physically node local memory. See setup_memory() - * for details. 
- */ -static void __init allocate_pgdat(int nid) -{ - char buf[16]; - - NODE_DATA(nid) = alloc_remap(nid, ALIGN(sizeof(pg_data_t), PAGE_SIZE)); - if (!NODE_DATA(nid)) { - unsigned long pgdat_phys; - pgdat_phys = memblock_find_in_range(min_low_pfn<>PAGE_SHIFT)); - memset(buf, 0, sizeof(buf)); - sprintf(buf, "NODE_DATA %d", nid); - memblock_x86_reserve_range(pgdat_phys, pgdat_phys + sizeof(pg_data_t), buf); - } - printk(KERN_DEBUG "allocate_pgdat: node %d NODE_DATA %08lx\n", - nid, (unsigned long)NODE_DATA(nid)); -} - /* * Remap memory allocator */ @@ -322,76 +234,9 @@ void __init init_alloc_remap(int nid, u64 start, u64 end) nid, node_pa, node_pa + size, remap_va, remap_va + size); } -static int get_memcfg_numaq(void) -{ -#ifdef CONFIG_X86_NUMAQ - int nid; - - if (numa_off) - return 0; - - if (numaq_numa_init() < 0) { - nodes_clear(numa_nodes_parsed); - remove_all_active_ranges(); - return 0; - } - - for_each_node_mask(nid, numa_nodes_parsed) - node_set_online(nid); - sort_node_map(); - return 1; -#else - return 0; -#endif -} - -static int get_memcfg_from_srat(void) -{ -#ifdef CONFIG_ACPI_NUMA - int nid; - - if (numa_off) - return 0; - - if (x86_acpi_numa_init() < 0) { - nodes_clear(numa_nodes_parsed); - remove_all_active_ranges(); - return 0; - } - - for_each_node_mask(nid, numa_nodes_parsed) - node_set_online(nid); - sort_node_map(); - return 1; -#else - return 0; -#endif -} - -static void get_memcfg_numa(void) -{ - if (get_memcfg_numaq()) - return; - if (get_memcfg_from_srat()) - return; - get_memcfg_numa_flat(); -} - void __init initmem_init(void) { - int nid; - - get_memcfg_numa(); - numa_init_array(); - - for_each_online_node(nid) { - u64 start = (u64)node_start_pfn[nid] << PAGE_SHIFT; - u64 end = min((u64)node_end_pfn[nid] << PAGE_SHIFT, - (u64)max_pfn << PAGE_SHIFT); - - if (start < end) - init_alloc_remap(nid, start, end); - } + x86_numa_init(); #ifdef CONFIG_HIGHMEM highstart_pfn = highend_pfn = max_pfn; @@ -412,81 +257,9 @@ void __init initmem_init(void) printk(KERN_DEBUG "Low memory ends at vaddr %08lx\n", (ulong) pfn_to_kaddr(max_low_pfn)); - for_each_online_node(nid) - allocate_pgdat(nid); printk(KERN_DEBUG "High memory starts at vaddr %08lx\n", (ulong) pfn_to_kaddr(highstart_pfn)); - for_each_online_node(nid) - propagate_e820_map_node(nid); - - for_each_online_node(nid) { - memset(NODE_DATA(nid), 0, sizeof(struct pglist_data)); - NODE_DATA(nid)->node_id = nid; - } setup_bootmem_allocator(); } - -#ifdef CONFIG_MEMORY_HOTPLUG -static int paddr_to_nid(u64 addr) -{ - int nid; - unsigned long pfn = PFN_DOWN(addr); - - for_each_node(nid) - if (node_start_pfn[nid] <= pfn && - pfn < node_end_pfn[nid]) - return nid; - - return -1; -} - -/* - * This function is used to ask node id BEFORE memmap and mem_section's - * initialization (pfn_to_nid() can't be used yet). - * If _PXM is not defined on ACPI's DSDT, node id must be found by this. - */ -int memory_add_physaddr_to_nid(u64 addr) -{ - int nid = paddr_to_nid(addr); - return (nid >= 0) ? 
nid : 0; -} - -EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); -#endif - -/* temporary shim, will go away soon */ -int __init numa_add_memblk(int nid, u64 start, u64 end) -{ - unsigned long start_pfn = start >> PAGE_SHIFT; - unsigned long end_pfn = end >> PAGE_SHIFT; - - printk(KERN_DEBUG "nid %d start_pfn %08lx end_pfn %08lx\n", - nid, start_pfn, end_pfn); - - if (start >= (u64)max_pfn << PAGE_SHIFT) { - printk(KERN_INFO "Ignoring SRAT pfns: %08lx - %08lx\n", - start_pfn, end_pfn); - return 0; - } - - node_set_online(nid); - memblock_x86_register_active_regions(nid, start_pfn, - min(end_pfn, max_pfn)); - - if (!node_has_online_mem(nid)) { - node_start_pfn[nid] = start_pfn; - node_end_pfn[nid] = end_pfn; - } else { - node_start_pfn[nid] = min(node_start_pfn[nid], start_pfn); - node_end_pfn[nid] = max(node_end_pfn[nid], end_pfn); - } - return 0; -} - -/* temporary shim, will go away soon */ -void __init numa_set_distance(int from, int to, int distance) -{ - /* nada */ -} -- cgit v1.2.3 From 752d4f372f90a2f6eb562aaffb639957890cbcab Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 2 May 2011 17:24:48 +0200 Subject: x86, NUMA: Make numa_init_array() static numa_init_array() no longer has users outside of numa.c. Make it static. Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Yinghai Lu Cc: David Rientjes Cc: Thomas Gleixner Cc: "H. Peter Anvin" --- arch/x86/include/asm/numa.h | 2 -- arch/x86/mm/numa.c | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h index 74540865dd8..bfacd2ccf65 100644 --- a/arch/x86/include/asm/numa.h +++ b/arch/x86/include/asm/numa.h @@ -61,14 +61,12 @@ static inline int numa_cpu_node(int cpu) #ifdef CONFIG_NUMA extern void __cpuinit numa_set_node(int cpu, int node); extern void __cpuinit numa_clear_node(int cpu); -extern void __init numa_init_array(void); extern void __init init_cpu_to_node(void); extern void __cpuinit numa_add_cpu(int cpu); extern void __cpuinit numa_remove_cpu(int cpu); #else /* CONFIG_NUMA */ static inline void numa_set_node(int cpu, int node) { } static inline void numa_clear_node(int cpu) { } -static inline void numa_init_array(void) { } static inline void init_cpu_to_node(void) { } static inline void numa_add_cpu(int cpu) { } static inline void numa_remove_cpu(int cpu) { } diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 56ed714ed24..ecb5685d7d2 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -535,7 +535,7 @@ static int __init numa_register_memblks(struct numa_meminfo *mi) * as the number of CPUs is not known yet. We round robin the existing * nodes. */ -void __init numa_init_array(void) +static void __init numa_init_array(void) { int rr, i; -- cgit v1.2.3 From 1b7e03ef7570568d2fb9e6640d7006a0edd728f6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 2 May 2011 17:24:48 +0200 Subject: x86, NUMA: Enable emulation on 32bit too Now that NUMA init path is unified, NUMA emulation can be enabled on 32bit. Make numa_emluation.c safe on 32bit by doing the followings. * Define MAX_DMA32_PFN on 32bit too. * Include bootmem.h for max_pfn declaration. * Use u64 explicitly and always use PFN_PHYS() when converting page number to address. * Avoid __udivdi3() generation on 32bit by doing number of pages calculation instead in split_nodes_interleave(). And drop X86_64 dependency from Kconfig. Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Yinghai Lu Cc: David Rientjes Cc: Thomas Gleixner Cc: "H. 
Peter Anvin" --- arch/x86/Kconfig | 2 +- arch/x86/include/asm/dma.h | 10 +++------- arch/x86/mm/numa_emulation.c | 16 +++++++++++----- 3 files changed, 15 insertions(+), 13 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 50cb68d2901..648fca42ae6 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1201,7 +1201,7 @@ config NODES_SPAN_OTHER_NODES config NUMA_EMU bool "NUMA emulation" - depends on X86_64 && NUMA + depends on NUMA ---help--- Enable NUMA emulation. A flat machine will be split into virtual nodes when booted with "numa=fake=N", where N is the diff --git a/arch/x86/include/asm/dma.h b/arch/x86/include/asm/dma.h index d1a314b610f..0bdb0c54d9a 100644 --- a/arch/x86/include/asm/dma.h +++ b/arch/x86/include/asm/dma.h @@ -72,19 +72,15 @@ /* 16MB ISA DMA zone */ #define MAX_DMA_PFN ((16 * 1024 * 1024) >> PAGE_SHIFT) -#ifdef CONFIG_X86_32 +/* 4GB broken PCI/AGP hardware bus master zone */ +#define MAX_DMA32_PFN ((4UL * 1024 * 1024 * 1024) >> PAGE_SHIFT) +#ifdef CONFIG_X86_32 /* The maximum address that we can perform a DMA transfer to on this platform */ #define MAX_DMA_ADDRESS (PAGE_OFFSET + 0x1000000) - #else - -/* 4GB broken PCI/AGP hardware bus master zone */ -#define MAX_DMA32_PFN ((4UL * 1024 * 1024 * 1024) >> PAGE_SHIFT) - /* Compat define for old dma zone */ #define MAX_DMA_ADDRESS ((unsigned long)__va(MAX_DMA_PFN << PAGE_SHIFT)) - #endif /* 8237 DMA controllers */ diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c index de84cc14037..d0ed086b624 100644 --- a/arch/x86/mm/numa_emulation.c +++ b/arch/x86/mm/numa_emulation.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include "numa_internal.h" @@ -84,7 +85,13 @@ static int __init split_nodes_interleave(struct numa_meminfo *ei, nr_nodes = MAX_NUMNODES; } - size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) / nr_nodes; + /* + * Calculate target node size. x86_32 freaks on __udivdi3() so do + * the division in ulong number of pages and convert back. + */ + size = max_addr - addr - memblock_x86_hole_size(addr, max_addr); + size = PFN_PHYS((unsigned long)(size >> PAGE_SHIFT) / nr_nodes); + /* * Calculate the number of big nodes that can be allocated as a result * of consolidating the remainder. @@ -226,7 +233,7 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei, */ while (nodes_weight(physnode_mask)) { for_each_node_mask(i, physnode_mask) { - u64 dma32_end = MAX_DMA32_PFN << PAGE_SHIFT; + u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN); u64 start, limit, end; int phys_blk; @@ -298,7 +305,7 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) { static struct numa_meminfo ei __initdata; static struct numa_meminfo pi __initdata; - const u64 max_addr = max_pfn << PAGE_SHIFT; + const u64 max_addr = PFN_PHYS(max_pfn); u8 *phys_dist = NULL; size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]); int max_emu_nid, dfl_phys_nid; @@ -342,8 +349,7 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) if (numa_dist_cnt) { u64 phys; - phys = memblock_find_in_range(0, - (u64)max_pfn_mapped << PAGE_SHIFT, + phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), phys_size, PAGE_SIZE); if (phys == MEMBLOCK_ERROR) { pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n"); -- cgit v1.2.3