From cc1a9d86ce989083703c4bdc11b75a87e1cc404a Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 8 Jun 2008 19:39:16 -0700 Subject: mm, x86: shrink_active_range() should check all Now we are using register_e820_active_regions() instead of add_active_range() directly. So end_pfn could be different between the value in early_node_map to node_end_pfn. So we need to make shrink_active_range() smarter. shrink_active_range() is a generic MM function in mm/page_alloc.c but it is only used on 32-bit x86. Should we move it back to some file in arch/x86? Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- include/linux/mm.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index c31a9cd2a30e..7cbd949f2516 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -997,8 +997,7 @@ extern void free_area_init_node(int nid, pg_data_t *pgdat, extern void free_area_init_nodes(unsigned long *max_zone_pfn); extern void add_active_range(unsigned int nid, unsigned long start_pfn, unsigned long end_pfn); -extern void shrink_active_range(unsigned int nid, unsigned long old_end_pfn, - unsigned long new_end_pfn); +extern void shrink_active_range(unsigned int nid, unsigned long new_end_pfn); extern void push_node_boundaries(unsigned int nid, unsigned long start_pfn, unsigned long end_pfn); extern void remove_all_active_ranges(void); -- cgit v1.2.3 From cc1050bafebfb1d7935331282e948b5016318192 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 13 Jun 2008 19:08:52 -0700 Subject: x86: replace shrink_active_range() with remove_active_range() in case we have kva before ramdisk on a node, we still need to use those ranges. v2: reserve_early kva ram area, in case there are holes in highmem, to avoid those area could be treat as free high pages. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/discontig_32.c | 45 ++++++++++++++++++++++++--------------------- include/linux/mm.h | 3 ++- mm/page_alloc.c | 29 +++++++++++++++++++++++------ 3 files changed, 49 insertions(+), 28 deletions(-) (limited to 'include/linux/mm.h') diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index accc7c6c57fc..c3f119e99e0d 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -230,8 +230,8 @@ static unsigned long calculate_numa_remap_pages(void) unsigned long size, reserve_pages = 0; for_each_online_node(nid) { - u64 node_end_target; - u64 node_end_final; + u64 node_kva_target; + u64 node_kva_final; /* * The acpi/srat node info can show hot-add memroy zones @@ -254,42 +254,45 @@ static unsigned long calculate_numa_remap_pages(void) /* now the roundup is correct, convert to PAGE_SIZE pages */ size = size * PTRS_PER_PTE; - node_end_target = round_down(node_end_pfn[nid] - size, + node_kva_target = round_down(node_end_pfn[nid] - size, PTRS_PER_PTE); - node_end_target <<= PAGE_SHIFT; + node_kva_target <<= PAGE_SHIFT; do { - node_end_final = find_e820_area(node_end_target, + node_kva_final = find_e820_area(node_kva_target, ((u64)node_end_pfn[nid])<>PAGE_SHIFT) > (node_start_pfn[nid])); + node_kva_target -= LARGE_PAGE_BYTES; + } while (node_kva_final == -1ULL && + (node_kva_target>>PAGE_SHIFT) > (node_start_pfn[nid])); - if (node_end_final == -1ULL) + if (node_kva_final == -1ULL) panic("Can not get kva ram\n"); - printk("Reserving %ld pages of KVA for lmem_map of node %d\n", - size, nid); node_remap_size[nid] = size; node_remap_offset[nid] = reserve_pages; reserve_pages += size; - printk("Shrinking node %d from %ld pages to %lld pages\n", - nid, node_end_pfn[nid], node_end_final>>PAGE_SHIFT); + printk("Reserving %ld pages of KVA for lmem_map of node %d at %llx\n", + size, nid, node_kva_final>>PAGE_SHIFT); /* * prevent kva address below max_low_pfn want it on system * with less memory later. * layout will be: KVA address , KVA RAM + * + * we are supposed to only record the one less then max_low_pfn + * but we could have some hole in high memory, and it will only + * check page_is_ram(pfn) && !page_is_reserved_early(pfn) to decide + * to use it as free. + * So reserve_early here, hope we don't run out of that array */ - if ((node_end_final>>PAGE_SHIFT) < max_low_pfn) - reserve_early(node_end_final, - node_end_final+(((u64)size)<>PAGE_SHIFT; - node_remap_start_pfn[nid] = node_end_pfn[nid]; - shrink_active_range(nid, node_end_pfn[nid]); + reserve_early(node_kva_final, + node_kva_final+(((u64)size)<>PAGE_SHIFT; + remove_active_range(nid, node_remap_start_pfn[nid], + node_remap_start_pfn[nid] + size); } printk("Reserving total of %ld pages for numa KVA remap\n", reserve_pages); diff --git a/include/linux/mm.h b/include/linux/mm.h index ce8e397a61f6..034a3156d2f0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -998,7 +998,8 @@ extern void free_area_init_node(int nid, pg_data_t *pgdat, extern void free_area_init_nodes(unsigned long *max_zone_pfn); extern void add_active_range(unsigned int nid, unsigned long start_pfn, unsigned long end_pfn); -extern void shrink_active_range(unsigned int nid, unsigned long new_end_pfn); +extern void remove_active_range(unsigned int nid, unsigned long start_pfn, + unsigned long end_pfn); extern void push_node_boundaries(unsigned int nid, unsigned long start_pfn, unsigned long end_pfn); extern void remove_all_active_ranges(void); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index eee5ba7509c1..d80e1868e570 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3552,30 +3552,47 @@ void __init add_active_range(unsigned int nid, unsigned long start_pfn, } /** - * shrink_active_range - Shrink an existing registered range of PFNs + * remove_active_range - Shrink an existing registered range of PFNs * @nid: The node id the range is on that should be shrunk - * @new_end_pfn: The new PFN of the range + * @start_pfn: The new PFN of the range + * @end_pfn: The new PFN of the range * * i386 with NUMA use alloc_remap() to store a node_mem_map on a local node. * The map is kept near the end physical page range that has already been * registered. This function allows an arch to shrink an existing registered * range. */ -void __init shrink_active_range(unsigned int nid, unsigned long new_end_pfn) +void __init remove_active_range(unsigned int nid, unsigned long start_pfn, + unsigned long end_pfn) { int i, j; int removed = 0; + printk(KERN_DEBUG "remove_active_range (%d, %lu, %lu)\n", + nid, start_pfn, end_pfn); + /* Find the old active region end and shrink */ for_each_active_range_index_in_nid(i, nid) { - if (early_node_map[i].start_pfn >= new_end_pfn) { + if (early_node_map[i].start_pfn >= start_pfn && + early_node_map[i].end_pfn <= end_pfn) { /* clear it */ + early_node_map[i].start_pfn = 0; early_node_map[i].end_pfn = 0; removed = 1; continue; } - if (early_node_map[i].end_pfn > new_end_pfn) { - early_node_map[i].end_pfn = new_end_pfn; + if (early_node_map[i].start_pfn < start_pfn && + early_node_map[i].end_pfn > start_pfn) { + unsigned long temp_end_pfn = early_node_map[i].end_pfn; + early_node_map[i].end_pfn = start_pfn; + if (temp_end_pfn > end_pfn) + add_active_range(nid, end_pfn, temp_end_pfn); + continue; + } + if (early_node_map[i].start_pfn >= start_pfn && + early_node_map[i].end_pfn > end_pfn && + early_node_map[i].start_pfn < end_pfn) { + early_node_map[i].start_pfn = end_pfn; continue; } } -- cgit v1.2.3 From b5bc6c0e55000dab86b73f838f5ad02908b23755 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 14 Jun 2008 18:32:52 -0700 Subject: x86, mm: use add_highpages_with_active_regions() for high pages init v2 use early_node_map to init high pages, so we can remove page_is_ram() and page_is_reserved_early() in the big loop with add_one_highpage also remove page_is_reserved_early(), it is not needed anymore. v2: fix the build of other platforms Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/e820.c | 11 -------- arch/x86/mm/discontig_32.c | 19 ++++++-------- arch/x86/mm/init_32.c | 62 ++++++++++++++++++++++++++++++++++++++-------- include/asm-x86/e820.h | 1 - include/asm-x86/highmem.h | 3 +++ include/linux/mm.h | 2 ++ mm/page_alloc.c | 8 ++++++ 7 files changed, 71 insertions(+), 35 deletions(-) (limited to 'include/linux/mm.h') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 5051ce744b4e..ed46b7a6bc13 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -612,17 +612,6 @@ void __init free_early(u64 start, u64 end) early_res[j - 1].end = 0; } -int __init page_is_reserved_early(unsigned long pagenr) -{ - u64 start = (u64)pagenr << PAGE_SHIFT; - int i; - struct early_res *r; - - i = find_overlapped_early(start, start + PAGE_SIZE); - r = &early_res[i]; - return (i < MAX_EARLY_RES && r->end); -} - void __init early_res_to_bootmem(u64 start, u64 end) { int i; diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index c3f119e99e0d..7c4d0255f8d8 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -100,7 +100,6 @@ unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn, #endif extern unsigned long find_max_low_pfn(void); -extern void add_one_highpage_init(struct page *, int, int); extern unsigned long highend_pfn, highstart_pfn; #define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE) @@ -432,10 +431,10 @@ void __init set_highmem_pages_init(int bad_ppro) { #ifdef CONFIG_HIGHMEM struct zone *zone; - struct page *page; + int nid; for_each_zone(zone) { - unsigned long node_pfn, zone_start_pfn, zone_end_pfn; + unsigned long zone_start_pfn, zone_end_pfn; if (!is_highmem(zone)) continue; @@ -443,16 +442,12 @@ void __init set_highmem_pages_init(int bad_ppro) zone_start_pfn = zone->zone_start_pfn; zone_end_pfn = zone_start_pfn + zone->spanned_pages; + nid = zone_to_nid(zone); printk("Initializing %s for node %d (%08lx:%08lx)\n", - zone->name, zone_to_nid(zone), - zone_start_pfn, zone_end_pfn); - - for (node_pfn = zone_start_pfn; node_pfn < zone_end_pfn; node_pfn++) { - if (!pfn_valid(node_pfn)) - continue; - page = pfn_to_page(node_pfn); - add_one_highpage_init(page, node_pfn, bad_ppro); - } + zone->name, nid, zone_start_pfn, zone_end_pfn); + + add_highpages_with_active_regions(nid, zone_start_pfn, + zone_end_pfn, bad_ppro); } totalram_pages += totalhigh_pages; #endif diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index abadb1da70df..ba07a489230e 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -287,10 +287,10 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base) pkmap_page_table = pte; } -void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro) +static void __init +add_one_highpage_init(struct page *page, int pfn, int bad_ppro) { - if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn)) && - !page_is_reserved_early(pfn)) { + if (!(bad_ppro && page_kills_ppro(pfn))) { ClearPageReserved(page); init_page_count(page); __free_page(page); @@ -299,18 +299,58 @@ void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro) SetPageReserved(page); } +struct add_highpages_data { + unsigned long start_pfn; + unsigned long end_pfn; + int bad_ppro; +}; + +static void __init add_highpages_work_fn(unsigned long start_pfn, + unsigned long end_pfn, void *datax) +{ + int node_pfn; + struct page *page; + unsigned long final_start_pfn, final_end_pfn; + struct add_highpages_data *data; + int bad_ppro; + + data = (struct add_highpages_data *)datax; + bad_ppro = data->bad_ppro; + + final_start_pfn = max(start_pfn, data->start_pfn); + final_end_pfn = min(end_pfn, data->end_pfn); + if (final_start_pfn >= final_end_pfn) + return; + + for (node_pfn = final_start_pfn; node_pfn < final_end_pfn; + node_pfn++) { + if (!pfn_valid(node_pfn)) + continue; + page = pfn_to_page(node_pfn); + add_one_highpage_init(page, node_pfn, bad_ppro); + } + +} + +void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn, + unsigned long end_pfn, + int bad_ppro) +{ + struct add_highpages_data data; + + data.start_pfn = start_pfn; + data.end_pfn = end_pfn; + data.bad_ppro = bad_ppro; + + work_with_active_regions(nid, add_highpages_work_fn, &data); +} + #ifndef CONFIG_NUMA static void __init set_highmem_pages_init(int bad_ppro) { - int pfn; + add_highpages_with_active_regions(0, highstart_pfn, highend_pfn, + bad_ppro); - for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) { - /* - * Holes under sparsemem might not have no mem_map[]: - */ - if (pfn_valid(pfn)) - add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro); - } totalram_pages += totalhigh_pages; } #endif /* !CONFIG_NUMA */ diff --git a/include/asm-x86/e820.h b/include/asm-x86/e820.h index 6b0ce745a60c..55d310596907 100644 --- a/include/asm-x86/e820.h +++ b/include/asm-x86/e820.h @@ -86,7 +86,6 @@ extern u64 find_e820_area_size(u64 start, u64 *sizep, u64 align); extern void reserve_early(u64 start, u64 end, char *name); extern void free_early(u64 start, u64 end); extern void early_res_to_bootmem(u64 start, u64 end); -extern int page_is_reserved_early(unsigned long pagenr); extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align); extern unsigned long e820_end_of_ram(void); diff --git a/include/asm-x86/highmem.h b/include/asm-x86/highmem.h index e153f3b44774..85c4fea41ff6 100644 --- a/include/asm-x86/highmem.h +++ b/include/asm-x86/highmem.h @@ -74,6 +74,9 @@ struct page *kmap_atomic_to_page(void *ptr); #define flush_cache_kmaps() do { } while (0) +extern void add_highpages_with_active_regions(int nid, unsigned long start_pfn, + unsigned long end_pfn, int bad_ppro); + #endif /* __KERNEL__ */ #endif /* _ASM_HIGHMEM_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 034a3156d2f0..e4de460907c1 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1011,6 +1011,8 @@ extern unsigned long find_min_pfn_with_active_regions(void); extern unsigned long find_max_pfn_with_active_regions(void); extern void free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn); +typedef void (*work_fn_t)(unsigned long, unsigned long, void *); +extern void work_with_active_regions(int nid, work_fn_t work_fn, void *data); extern void sparse_memory_present_with_active_regions(int nid); #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID extern int early_pfn_to_nid(unsigned long pfn); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d80e1868e570..41c6e3aa059f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2929,6 +2929,14 @@ void __init free_bootmem_with_active_regions(int nid, } } +void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data) +{ + int i; + + for_each_active_range_index_in_nid(i, nid) + work_fn(early_node_map[i].start_pfn, early_node_map[i].end_pfn, + data); +} /** * sparse_memory_present_with_active_regions - Call memory_present for each active range * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. -- cgit v1.2.3 From 3461b0af025251bbc6b3d56c821c6ac2de6f7209 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Mon, 12 May 2008 21:21:13 +0200 Subject: x86: remove static boot_cpu_pda array v2 * Remove the boot_cpu_pda array and pointer table from the data section. Allocate the pointer table and array during init. do_boot_cpu() will reallocate the pda in node local memory and if the cpu is being brought up before the bootmem array is released (after_bootmem = 0), then it will free the initial pda. This will happen for all cpus present at system startup. This removes 512k + 32k bytes from the data section. For inclusion into sched-devel/latest tree. Based on: git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6.git + sched-devel/latest .../mingo/linux-2.6-sched-devel.git Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/head64.c | 26 +++++++++++++++-- arch/x86/kernel/setup.c | 73 ++++++++++++++++++++++++++++++++++++----------- arch/x86/kernel/setup64.c | 8 ++++-- arch/x86/kernel/smpboot.c | 59 +++++++++++++++++++++++++++++--------- include/asm-x86/pda.h | 6 ++-- include/linux/mm.h | 1 + 6 files changed, 135 insertions(+), 38 deletions(-) (limited to 'include/linux/mm.h') diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index e25c57b8aa84..0ab59edd7067 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -25,6 +25,24 @@ #include #include +/* boot cpu pda */ +static struct x8664_pda _boot_cpu_pda __read_mostly; + +#ifdef CONFIG_SMP +#ifdef CONFIG_DEBUG_PER_CPU_MAPS +/* + * We install an empty cpu_pda pointer table to trap references before + * the actual cpu_pda pointer table is created in setup_cpu_pda_map(). + */ +static struct x8664_pda *__cpu_pda[NR_CPUS] __initdata; +#else +static struct x8664_pda *__cpu_pda[1] __read_mostly; +#endif + +#else /* !CONFIG_SMP (NR_CPUS will be 1) */ +static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly; +#endif + static void __init zap_identity_mappings(void) { pgd_t *pgd = pgd_offset_k(0UL); @@ -156,10 +174,12 @@ void __init x86_64_start_kernel(char * real_mode_data) early_printk("Kernel alive\n"); - for (i = 0; i < NR_CPUS; i++) - cpu_pda(i) = &boot_cpu_pda[i]; - + _cpu_pda = __cpu_pda; + cpu_pda(0) = &_boot_cpu_pda; pda_init(0); + + early_printk("Kernel really alive\n"); + copy_bootdata(__va(real_mode_data)); reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS"); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 913af838c3c5..dd12c1c84a8f 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -101,6 +101,50 @@ static inline void setup_cpumask_of_cpu(void) { } */ unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; EXPORT_SYMBOL(__per_cpu_offset); +static inline void setup_cpu_pda_map(void) { } + +#elif !defined(CONFIG_SMP) +static inline void setup_cpu_pda_map(void) { } + +#else /* CONFIG_SMP && CONFIG_X86_64 */ + +/* + * Allocate cpu_pda pointer table and array via alloc_bootmem. + */ +static void __init setup_cpu_pda_map(void) +{ + char *pda; + struct x8664_pda **new_cpu_pda; + unsigned long size; + int cpu; + + size = roundup(sizeof(struct x8664_pda), cache_line_size()); + + /* allocate cpu_pda array and pointer table */ + { + unsigned long tsize = nr_cpu_ids * sizeof(void *); + unsigned long asize = size * (nr_cpu_ids - 1); + + tsize = roundup(tsize, cache_line_size()); + new_cpu_pda = alloc_bootmem(tsize + asize); + pda = (char *)new_cpu_pda + tsize; + } + + /* initialize pointer table to static pda's */ + for_each_possible_cpu(cpu) { + if (cpu == 0) { + /* leave boot cpu pda in place */ + new_cpu_pda[0] = cpu_pda(0); + continue; + } + new_cpu_pda[cpu] = (struct x8664_pda *)pda; + new_cpu_pda[cpu]->in_bootmem = 1; + pda += size; + } + + /* point to new pointer table */ + _cpu_pda = new_cpu_pda; +} #endif /* @@ -110,46 +154,43 @@ EXPORT_SYMBOL(__per_cpu_offset); */ void __init setup_per_cpu_areas(void) { - int i, highest_cpu = 0; - unsigned long size; + ssize_t size = PERCPU_ENOUGH_ROOM; + char *ptr; + int cpu; #ifdef CONFIG_HOTPLUG_CPU prefill_possible_map(); +#else + nr_cpu_ids = num_processors; #endif + /* Setup cpu_pda map */ + setup_cpu_pda_map(); + /* Copy section for each CPU (we discard the original) */ size = PERCPU_ENOUGH_ROOM; printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n", size); - for_each_possible_cpu(i) { - char *ptr; + for_each_possible_cpu(cpu) { #ifndef CONFIG_NEED_MULTIPLE_NODES ptr = alloc_bootmem_pages(size); #else - int node = early_cpu_to_node(i); + int node = early_cpu_to_node(cpu); if (!node_online(node) || !NODE_DATA(node)) { ptr = alloc_bootmem_pages(size); printk(KERN_INFO "cpu %d has no node %d or node-local memory\n", - i, node); + cpu, node); } else ptr = alloc_bootmem_pages_node(NODE_DATA(node), size); #endif - if (!ptr) - panic("Cannot allocate cpu data for CPU %d\n", i); -#ifdef CONFIG_X86_64 - cpu_pda(i)->data_offset = ptr - __per_cpu_start; -#else - __per_cpu_offset[i] = ptr - __per_cpu_start; -#endif + per_cpu_offset(cpu) = ptr - __per_cpu_start; memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); - highest_cpu = i; } - nr_cpu_ids = highest_cpu + 1; printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n", NR_CPUS, nr_cpu_ids, nr_node_ids); @@ -199,7 +240,7 @@ void __cpuinit numa_set_node(int cpu, int node) { int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); - if (node != NUMA_NO_NODE) + if (cpu_pda(cpu) && node != NUMA_NO_NODE) cpu_pda(cpu)->nodenumber = node; if (cpu_to_node_map) diff --git a/arch/x86/kernel/setup64.c b/arch/x86/kernel/setup64.c index aee0e8200777..631ea6cc01d8 100644 --- a/arch/x86/kernel/setup64.c +++ b/arch/x86/kernel/setup64.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -34,9 +35,8 @@ struct boot_params boot_params; cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; -struct x8664_pda *_cpu_pda[NR_CPUS] __read_mostly; +struct x8664_pda **_cpu_pda __read_mostly; EXPORT_SYMBOL(_cpu_pda); -struct x8664_pda boot_cpu_pda[NR_CPUS] __cacheline_aligned; struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; @@ -114,8 +114,10 @@ void pda_init(int cpu) __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER); if (!pda->irqstackptr) panic("cannot allocate irqstack for cpu %d", cpu); - } + if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE) + pda->nodenumber = cpu_to_node(cpu); + } pda->irqstackptr += IRQSTACKSIZE-64; } diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 036604d3daed..bf0833487455 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -816,6 +816,43 @@ static void __cpuinit do_fork_idle(struct work_struct *work) complete(&c_idle->done); } +/* + * Allocate node local memory for the AP pda. + * + * Must be called after the _cpu_pda pointer table is initialized. + */ +static int __cpuinit get_local_pda(int cpu) +{ + struct x8664_pda *oldpda, *newpda; + unsigned long size = sizeof(struct x8664_pda); + int node = cpu_to_node(cpu); + + if (cpu_pda(cpu) && !cpu_pda(cpu)->in_bootmem) + return 0; + + oldpda = cpu_pda(cpu); + newpda = kmalloc_node(size, GFP_ATOMIC, node); + if (!newpda) { + printk(KERN_ERR "Could not allocate node local PDA " + "for CPU %d on node %d\n", cpu, node); + + if (oldpda) + return 0; /* have a usable pda */ + else + return -1; + } + + if (oldpda) { + memcpy(newpda, oldpda, size); + if (!after_bootmem) + free_bootmem((unsigned long)oldpda, size); + } + + newpda->in_bootmem = 0; + cpu_pda(cpu) = newpda; + return 0; +} + static int __cpuinit do_boot_cpu(int apicid, int cpu) /* * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad @@ -841,19 +878,11 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu) } /* Allocate node local memory for AP pdas */ - if (cpu_pda(cpu) == &boot_cpu_pda[cpu]) { - struct x8664_pda *newpda, *pda; - int node = cpu_to_node(cpu); - pda = cpu_pda(cpu); - newpda = kmalloc_node(sizeof(struct x8664_pda), GFP_ATOMIC, - node); - if (newpda) { - memcpy(newpda, pda, sizeof(struct x8664_pda)); - cpu_pda(cpu) = newpda; - } else - printk(KERN_ERR - "Could not allocate node local PDA for CPU %d on node %d\n", - cpu, node); + if (cpu > 0) { + boot_error = get_local_pda(cpu); + if (boot_error) + goto restore_state; + /* if can't get pda memory, can't start cpu */ } #endif @@ -972,6 +1001,8 @@ do_rest: } } +restore_state: + if (boot_error) { /* Try to put things back the way they were before ... */ unmap_cpu_to_logical_apicid(cpu); @@ -1347,6 +1378,8 @@ __init void prefill_possible_map(void) for (i = 0; i < possible; i++) cpu_set(i, cpu_possible_map); + + nr_cpu_ids = possible; } static void __ref remove_cpu_from_maps(int cpu) diff --git a/include/asm-x86/pda.h b/include/asm-x86/pda.h index de2ad9ac35a9..b34e9a7cc80b 100644 --- a/include/asm-x86/pda.h +++ b/include/asm-x86/pda.h @@ -22,7 +22,8 @@ struct x8664_pda { offset 40!!! */ #endif char *irqstackptr; - int nodenumber; /* number of current node */ + short nodenumber; /* number of current node (32k max) */ + short in_bootmem; /* pda lives in bootmem */ unsigned int __softirq_pending; unsigned int __nmi_count; /* number of NMI on this CPUs */ short mmu_state; @@ -38,8 +39,7 @@ struct x8664_pda { unsigned irq_spurious_count; } ____cacheline_aligned_in_smp; -extern struct x8664_pda *_cpu_pda[]; -extern struct x8664_pda boot_cpu_pda[]; +extern struct x8664_pda **_cpu_pda; extern void pda_init(int); #define cpu_pda(i) (_cpu_pda[i]) diff --git a/include/linux/mm.h b/include/linux/mm.h index 586a943cab01..0ea48a5af823 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1024,6 +1024,7 @@ extern void mem_init(void); extern void show_mem(void); extern void si_meminfo(struct sysinfo * val); extern void si_meminfo_node(struct sysinfo *val, int nid); +extern int after_bootmem; #ifdef CONFIG_NUMA extern void setup_per_cpu_pageset(void); -- cgit v1.2.3 From d52d53b8a5b258bfaab9223a5e7284fcfdd48577 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 16 Jun 2008 20:10:55 -0700 Subject: RFC x86: try to remove arch_get_ram_range want to remove arch_get_ram_range, and use early_node_map instead. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 6 ++++-- drivers/pci/intel-iommu.c | 51 ++++++++++++++++++++++++++++++++++------------- include/linux/mm.h | 2 +- mm/page_alloc.c | 10 +++++++--- 4 files changed, 49 insertions(+), 20 deletions(-) (limited to 'include/linux/mm.h') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 65d55056b6e7..a0484adbf59d 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -298,7 +298,7 @@ struct add_highpages_data { unsigned long end_pfn; }; -static void __init add_highpages_work_fn(unsigned long start_pfn, +static int __init add_highpages_work_fn(unsigned long start_pfn, unsigned long end_pfn, void *datax) { int node_pfn; @@ -311,7 +311,7 @@ static void __init add_highpages_work_fn(unsigned long start_pfn, final_start_pfn = max(start_pfn, data->start_pfn); final_end_pfn = min(end_pfn, data->end_pfn); if (final_start_pfn >= final_end_pfn) - return; + return 0; for (node_pfn = final_start_pfn; node_pfn < final_end_pfn; node_pfn++) { @@ -321,6 +321,8 @@ static void __init add_highpages_work_fn(unsigned long start_pfn, add_one_highpage_init(page, node_pfn); } + return 0; + } void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn, diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index 66c0fd21894b..bb0642318a95 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -1637,12 +1637,43 @@ static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr, } #ifdef CONFIG_DMAR_GFX_WA -extern int arch_get_ram_range(int slot, u64 *addr, u64 *size); +struct iommu_prepare_data { + struct pci_dev *pdev; + int ret; +}; + +static int __init iommu_prepare_work_fn(unsigned long start_pfn, + unsigned long end_pfn, void *datax) +{ + struct iommu_prepare_data *data; + + data = (struct iommu_prepare_data *)datax; + + data->ret = iommu_prepare_identity_map(data->pdev, + start_pfn<ret; + +} + +static int __init iommu_prepare_with_active_regions(struct pci_dev *pdev) +{ + int nid; + struct iommu_prepare_data data; + + data.pdev = pdev; + data.ret = 0; + + for_each_online_node(nid) { + work_with_active_regions(nid, iommu_prepare_work_fn, &data); + if (data.ret) + return data.ret; + } + return data.ret; +} + static void __init iommu_prepare_gfx_mapping(void) { struct pci_dev *pdev = NULL; - u64 base, size; - int slot; int ret; for_each_pci_dev(pdev) { @@ -1651,17 +1682,9 @@ static void __init iommu_prepare_gfx_mapping(void) continue; printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n", pci_name(pdev)); - slot = arch_get_ram_range(0, &base, &size); - while (slot >= 0) { - ret = iommu_prepare_identity_map(pdev, - base, base + size); - if (ret) - goto error; - slot = arch_get_ram_range(slot, &base, &size); - } - continue; -error: - printk(KERN_ERR "IOMMU: mapping reserved region failed\n"); + ret = iommu_prepare_with_active_regions(pdev); + if (ret) + printk(KERN_ERR "IOMMU: mapping reserved region failed\n"); } } #endif diff --git a/include/linux/mm.h b/include/linux/mm.h index 3d647b24041f..cf1cd3a2ed78 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1011,7 +1011,7 @@ extern unsigned long find_min_pfn_with_active_regions(void); extern unsigned long find_max_pfn_with_active_regions(void); extern void free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn); -typedef void (*work_fn_t)(unsigned long, unsigned long, void *); +typedef int (*work_fn_t)(unsigned long, unsigned long, void *); extern void work_with_active_regions(int nid, work_fn_t work_fn, void *data); extern void sparse_memory_present_with_active_regions(int nid); #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 41c6e3aa059f..e25b6b24f844 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2932,10 +2932,14 @@ void __init free_bootmem_with_active_regions(int nid, void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data) { int i; + int ret; - for_each_active_range_index_in_nid(i, nid) - work_fn(early_node_map[i].start_pfn, early_node_map[i].end_pfn, - data); + for_each_active_range_index_in_nid(i, nid) { + ret = work_fn(early_node_map[i].start_pfn, + early_node_map[i].end_pfn, data); + if (ret) + break; + } } /** * sparse_memory_present_with_active_regions - Call memory_present for each active range -- cgit v1.2.3 From aba46c5027cb59d98052231b36efcbbde9c77a1d Mon Sep 17 00:00:00 2001 From: Dave Kleikamp Date: Tue, 8 Jul 2008 00:28:52 +1000 Subject: powerpc/mm: Define flags for Strong Access Ordering This patch defines: - PROT_SAO, which is passed into mmap() and mprotect() in the prot field - VM_SAO in vma->vm_flags, and - _PAGE_SAO, the combination of WIMG bits in the pte that enables strong access ordering for the page. Signed-off-by: Dave Kleikamp Signed-off-by: Benjamin Herrenschmidt --- include/asm-powerpc/mman.h | 2 ++ include/asm-powerpc/pgtable-ppc64.h | 3 +++ include/linux/mm.h | 1 + 3 files changed, 6 insertions(+) (limited to 'include/linux/mm.h') diff --git a/include/asm-powerpc/mman.h b/include/asm-powerpc/mman.h index 24cf664a8295..0c46bf2c7d5f 100644 --- a/include/asm-powerpc/mman.h +++ b/include/asm-powerpc/mman.h @@ -10,6 +10,8 @@ * 2 of the License, or (at your option) any later version. */ +#define PROT_SAO 0x10 /* Strong Access Ordering */ + #define MAP_RENAME MAP_ANONYMOUS /* In SunOS terminology */ #define MAP_NORESERVE 0x40 /* don't reserve swap pages */ #define MAP_LOCKED 0x80 diff --git a/include/asm-powerpc/pgtable-ppc64.h b/include/asm-powerpc/pgtable-ppc64.h index b2754d46be44..d09599cccb35 100644 --- a/include/asm-powerpc/pgtable-ppc64.h +++ b/include/asm-powerpc/pgtable-ppc64.h @@ -93,6 +93,9 @@ #define _PAGE_RW 0x0200 /* software: user write access allowed */ #define _PAGE_BUSY 0x0800 /* software: PTE & hash are busy */ +/* Strong Access Ordering */ +#define _PAGE_SAO (_PAGE_WRITETHRU | _PAGE_NO_CACHE | _PAGE_COHERENT) + #define _PAGE_BASE (_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_COHERENT) #define _PAGE_WRENABLE (_PAGE_RW | _PAGE_DIRTY) diff --git a/include/linux/mm.h b/include/linux/mm.h index 586a943cab01..689184446fc6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -108,6 +108,7 @@ extern unsigned int kobjsize(const void *objp); #define VM_CAN_NONLINEAR 0x08000000 /* Has ->fault & does nonlinear pages */ #define VM_MIXEDMAP 0x10000000 /* Can contain "struct page" and pure PFN pages */ +#define VM_SAO 0x20000000 /* Strong Access Ordering (powerpc) */ #ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */ #define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS -- cgit v1.2.3