From 692ae74aaf226a557d88d5412a1764c09e63a193 Mon Sep 17 00:00:00 2001 From: Byongho Lee Date: Wed, 31 Jan 2018 16:15:36 -0800 Subject: mm/slab_common.c: make calculate_alignment() static calculate_alignment() function is only used inside slab_common.c. So make it static and let the compiler do more optimizations. After this patch there's a small improvement in text and data size. $ gcc --version gcc (GCC) 7.2.1 20171128 Before: text data bss dec hex filename 9890457 3828702 1212364 14931523 e3d643 vmlinux After: text data bss dec hex filename 9890437 3828670 1212364 14931471 e3d60f vmlinux Also I fixed a style problem reported by checkpatch. WARNING: Missing a blank line after declarations #53: FILE: mm/slab_common.c:286: + unsigned long ralign = cache_line_size(); + while (size <= ralign / 2) Link: http://lkml.kernel.org/r/20171210080132.406-1-bhlee.kernel@gmail.com Signed-off-by: Byongho Lee Acked-by: Michal Hocko Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.h | 3 --- mm/slab_common.c | 56 +++++++++++++++++++++++++++++--------------------------- 2 files changed, 29 insertions(+), 30 deletions(-) (limited to 'mm') diff --git a/mm/slab.h b/mm/slab.h index ad657ffa44e5..e8e2095a6185 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -78,9 +78,6 @@ extern const struct kmalloc_info_struct { unsigned long size; } kmalloc_info[]; -unsigned long calculate_alignment(slab_flags_t flags, - unsigned long align, unsigned long size); - #ifndef CONFIG_SLOB /* Kmalloc array related functions */ void setup_kmalloc_cache_index_table(void); diff --git a/mm/slab_common.c b/mm/slab_common.c index c8cb36774ba1..deeddf95cdcf 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -267,6 +267,35 @@ static inline void memcg_unlink_cache(struct kmem_cache *s) } #endif /* CONFIG_MEMCG && !CONFIG_SLOB */ +/* + * Figure out what the alignment of the objects will be given a set of + * flags, a user specified alignment and the size of the objects. + */ +static unsigned long calculate_alignment(unsigned long flags, + unsigned long align, unsigned long size) +{ + /* + * If the user wants hardware cache aligned objects then follow that + * suggestion if the object is sufficiently large. + * + * The hardware cache alignment cannot override the specified + * alignment though. If that is greater then use it. + */ + if (flags & SLAB_HWCACHE_ALIGN) { + unsigned long ralign; + + ralign = cache_line_size(); + while (size <= ralign / 2) + ralign /= 2; + align = max(align, ralign); + } + + if (align < ARCH_SLAB_MINALIGN) + align = ARCH_SLAB_MINALIGN; + + return ALIGN(align, sizeof(void *)); +} + /* * Find a mergeable slab cache */ @@ -337,33 +366,6 @@ struct kmem_cache *find_mergeable(size_t size, size_t align, return NULL; } -/* - * Figure out what the alignment of the objects will be given a set of - * flags, a user specified alignment and the size of the objects. - */ -unsigned long calculate_alignment(slab_flags_t flags, - unsigned long align, unsigned long size) -{ - /* - * If the user wants hardware cache aligned objects then follow that - * suggestion if the object is sufficiently large. - * - * The hardware cache alignment cannot override the specified - * alignment though. If that is greater then use it. - */ - if (flags & SLAB_HWCACHE_ALIGN) { - unsigned long ralign = cache_line_size(); - while (size <= ralign / 2) - ralign /= 2; - align = max(align, ralign); - } - - if (align < ARCH_SLAB_MINALIGN) - align = ARCH_SLAB_MINALIGN; - - return ALIGN(align, sizeof(void *)); -} - static struct kmem_cache *create_cache(const char *name, size_t object_size, size_t size, size_t align, slab_flags_t flags, void (*ctor)(void *), -- cgit v1.2.3 From 84ebb5827d015c1045429d018bf9a48f95f082a6 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Wed, 31 Jan 2018 16:15:39 -0800 Subject: mm/slab.c: remove redundant assignments for slab_state slab_state is being set to "UP" in create_kmalloc_caches(), and later on we set it again in kmem_cache_init_late(), but slab_state does not change in the meantime. Remove the redundant assignment from kmem_cache_init_late(). And unless I overlooked anything, the same goes for "slab_state = FULL". slab_state is set to "FULL" in kmem_cache_init_late(), but it is later being set again in cpucache_init(), which gets called from do_initcall_level(). So remove the assignment from cpucache_init() as well. Link: http://lkml.kernel.org/r/20171215134452.GA1920@techadventures.net Signed-off-by: Oscar Salvador Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 4e51ef954026..226906294183 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1316,8 +1316,6 @@ void __init kmem_cache_init_late(void) { struct kmem_cache *cachep; - slab_state = UP; - /* 6) resize the head arrays to their final sizes */ mutex_lock(&slab_mutex); list_for_each_entry(cachep, &slab_caches, list) @@ -1353,8 +1351,6 @@ static int __init cpucache_init(void) slab_online_cpu, slab_offline_cpu); WARN_ON(ret < 0); - /* Done! */ - slab_state = FULL; return 0; } __initcall(cpucache_init); -- cgit v1.2.3 From 5d682681f8a2bd127748d707243661fcb00f7acb Mon Sep 17 00:00:00 2001 From: Balasubramani Vivekanandan Date: Wed, 31 Jan 2018 16:15:43 -0800 Subject: mm/slub.c: fix wrong address during slab padding restoration Start address calculated for slab padding restoration was wrong. Wrong address would point to some section before padding and could cause corruption Link: http://lkml.kernel.org/r/1516604578-4577-1-git-send-email-balasubramani_vivekanandan@mentor.com Signed-off-by: Balasubramani Vivekanandan Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index cfd56e5a35fb..733ba32c031b 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -838,6 +838,7 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page) u8 *start; u8 *fault; u8 *end; + u8 *pad; int length; int remainder; @@ -851,8 +852,9 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page) if (!remainder) return 1; + pad = end - remainder; metadata_access_enable(); - fault = memchr_inv(end - remainder, POISON_INUSE, remainder); + fault = memchr_inv(pad, POISON_INUSE, remainder); metadata_access_disable(); if (!fault) return 1; @@ -860,9 +862,9 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page) end--; slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1); - print_section(KERN_ERR, "Padding ", end - remainder, remainder); + print_section(KERN_ERR, "Padding ", pad, remainder); - restore_bytes(s, "slab padding", POISON_INUSE, end - remainder, end); + restore_bytes(s, "slab padding", POISON_INUSE, fault, end); return 0; } -- cgit v1.2.3 From 0d2d5d40deb49314b6f701589e1cae3bca3aa94c Mon Sep 17 00:00:00 2001 From: Miles Chen Date: Wed, 31 Jan 2018 16:15:47 -0800 Subject: slub: remove obsolete comments of put_cpu_partial() Commit d6e0b7fa1186 ("slub: make dead caches discard free slabs immediately") makes put_cpu_partial() run with preemption disabled and interrupts disabled when calling unfreeze_partials(). The comment: "put_cpu_partial() is done without interrupts disabled and without preemption disabled" looks obsolete, so remove it. Link: http://lkml.kernel.org/r/1516968550-1520-1-git-send-email-miles.chen@mediatek.com Signed-off-by: Miles Chen Reviewed-by: Andrew Morton Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 733ba32c031b..693b7074bc53 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2222,9 +2222,7 @@ static void unfreeze_partials(struct kmem_cache *s, /* * Put a page that was just frozen (in __slab_free) into a partial page - * slot if available. This is done without interrupts disabled and without - * preemption disabled. The cmpxchg is racy and may put the partial page - * onto a random cpus partial slot. + * slot if available. * * If we did not find a slot then simply move all the partials to the * per node partial list. -- cgit v1.2.3 From 4a01768e9e91082efc9a6384b1ef579fdcbce828 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Wed, 31 Jan 2018 16:15:55 -0800 Subject: mm: kmemleak: remove unused hardirq.h Preempt counter APIs have been split out, currently, hardirq.h just includes irq_enter/exit APIs which are not used by kmemleak at all. So, remove the unused hardirq.h. Link: http://lkml.kernel.org/r/1510959741-31109-1-git-send-email-yang.s@alibaba-inc.com Signed-off-by: Yang Shi Cc: Michal Hocko Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kmemleak.c | 1 - 1 file changed, 1 deletion(-) (limited to 'mm') diff --git a/mm/kmemleak.c b/mm/kmemleak.c index f656ca27f6c2..e83987c55a08 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -91,7 +91,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.3 From a85f878b443f8d2b91ba76f09da21ac0af22e07f Mon Sep 17 00:00:00 2001 From: Srividya Desireddy Date: Wed, 31 Jan 2018 16:15:59 -0800 Subject: zswap: same-filled pages handling Zswap is a cache which compresses the pages that are being swapped out and stores them into a dynamically allocated RAM-based memory pool. Experiments have shown that around 10-20% of pages stored in zswap are same-filled pages (i.e. contents of the page are all same), but these pages are handled as normal pages by compressing and allocating memory in the pool. This patch adds a check in zswap_frontswap_store() to identify same-filled page before compression of the page. If the page is a same-filled page, set zswap_entry.length to zero, save the same-filled value and skip the compression of the page and alloction of memory in zpool. In zswap_frontswap_load(), check if value of zswap_entry.length is zero corresponding to the page to be loaded. If zswap_entry.length is zero, fill the page with same-filled value. This saves the decompression time during load. On a ARM Quad Core 32-bit device with 1.5GB RAM by launching and relaunching different applications, out of ~64000 pages stored in zswap, ~11000 pages were same-value filled pages (including zero-filled pages) and ~9000 pages were zero-filled pages. An average of 17% of pages(including zero-filled pages) in zswap are same-value filled pages and 14% pages are zero-filled pages. An average of 3% of pages are same-filled non-zero pages. The below table shows the execution time profiling with the patch. Baseline With patch % Improvement ----------------------------------------------------------------- *Zswap Store Time 26.5ms 18ms 32% (of same value pages) *Zswap Load Time (of same value pages) 25.5ms 13ms 49% ----------------------------------------------------------------- On Ubuntu PC with 2GB RAM, while executing kernel build and other test scripts and running multimedia applications, out of 360000 pages stored in zswap 78000(~22%) of pages were found to be same-value filled pages (including zero-filled pages) and 64000(~17%) are zero-filled pages. So an average of %5 of pages are same-filled non-zero pages. The below table shows the execution time profiling with the patch. Baseline With patch % Improvement ----------------------------------------------------------------- *Zswap Store Time 91ms 74ms 19% (of same value pages) *Zswap Load Time 50ms 7.5ms 85% (of same value pages) ----------------------------------------------------------------- *The execution times may vary with test device used. Dan said: : I did test this patch out this week, and I added some instrumentation to : check the performance impact, and tested with a small program to try to : check the best and worst cases. : : When doing a lot of swap where all (or almost all) pages are same-value, I : found this patch does save both time and space, significantly. The exact : improvement in time and space depends on which compressor is being used, : but roughly agrees with the numbers you listed. : : In the worst case situation, where all (or almost all) pages have the : same-value *except* the final long (meaning, zswap will check each long on : the entire page but then still have to pass the page to the compressor), : the same-value check is around 10-15% of the total time spent in : zswap_frontswap_store(). That's a not-insignificant amount of time, but : it's not huge. Considering that most systems will probably be swapping : pages that aren't similar to the worst case (although I don't have any : data to know that), I'd say the improvement is worth the possible : worst-case performance impact. [srividya.dr@samsung.com: add memset_l instead of for loop] Link: http://lkml.kernel.org/r/20171018104832epcms5p1b2232e2236258de3d03d1344dde9fce0@epcms5p1 Signed-off-by: Srividya Desireddy Acked-by: Dan Streetman Cc: Seth Jennings Cc: Pekka Enberg Cc: Dinakar Reddy Pathireddy Cc: SHARAN ALLUR Cc: RAJIB BASU Cc: JUHUN KIM Cc: Matthew Wilcox Cc: Timofey Titovets Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zswap.c | 71 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 66 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/zswap.c b/mm/zswap.c index d39581a076c3..1133b4ceb72e 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -49,6 +49,8 @@ static u64 zswap_pool_total_size; /* The number of compressed pages currently stored in zswap */ static atomic_t zswap_stored_pages = ATOMIC_INIT(0); +/* The number of same-value filled pages currently stored in zswap */ +static atomic_t zswap_same_filled_pages = ATOMIC_INIT(0); /* * The statistics below are not protected from concurrent access for @@ -116,6 +118,11 @@ module_param_cb(zpool, &zswap_zpool_param_ops, &zswap_zpool_type, 0644); static unsigned int zswap_max_pool_percent = 20; module_param_named(max_pool_percent, zswap_max_pool_percent, uint, 0644); +/* Enable/disable handling same-value filled pages (enabled by default) */ +static bool zswap_same_filled_pages_enabled = true; +module_param_named(same_filled_pages_enabled, zswap_same_filled_pages_enabled, + bool, 0644); + /********************************* * data structures **********************************/ @@ -145,9 +152,10 @@ struct zswap_pool { * be held while changing the refcount. Since the lock must * be held, there is no reason to also make refcount atomic. * length - the length in bytes of the compressed page data. Needed during - * decompression + * decompression. For a same value filled page length is 0. * pool - the zswap_pool the entry's data is in * handle - zpool allocation handle that stores the compressed page data + * value - value of the same-value filled pages which have same content */ struct zswap_entry { struct rb_node rbnode; @@ -155,7 +163,10 @@ struct zswap_entry { int refcount; unsigned int length; struct zswap_pool *pool; - unsigned long handle; + union { + unsigned long handle; + unsigned long value; + }; }; struct zswap_header { @@ -320,8 +331,12 @@ static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry) */ static void zswap_free_entry(struct zswap_entry *entry) { - zpool_free(entry->pool->zpool, entry->handle); - zswap_pool_put(entry->pool); + if (!entry->length) + atomic_dec(&zswap_same_filled_pages); + else { + zpool_free(entry->pool->zpool, entry->handle); + zswap_pool_put(entry->pool); + } zswap_entry_cache_free(entry); atomic_dec(&zswap_stored_pages); zswap_update_total_size(); @@ -953,6 +968,28 @@ static int zswap_shrink(void) return ret; } +static int zswap_is_page_same_filled(void *ptr, unsigned long *value) +{ + unsigned int pos; + unsigned long *page; + + page = (unsigned long *)ptr; + for (pos = 1; pos < PAGE_SIZE / sizeof(*page); pos++) { + if (page[pos] != page[0]) + return 0; + } + *value = page[0]; + return 1; +} + +static void zswap_fill_page(void *ptr, unsigned long value) +{ + unsigned long *page; + + page = (unsigned long *)ptr; + memset_l(page, value, PAGE_SIZE / sizeof(unsigned long)); +} + /********************************* * frontswap hooks **********************************/ @@ -965,7 +1002,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, struct crypto_comp *tfm; int ret; unsigned int dlen = PAGE_SIZE, len; - unsigned long handle; + unsigned long handle, value; char *buf; u8 *src, *dst; struct zswap_header *zhdr; @@ -993,6 +1030,19 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, goto reject; } + if (zswap_same_filled_pages_enabled) { + src = kmap_atomic(page); + if (zswap_is_page_same_filled(src, &value)) { + kunmap_atomic(src); + entry->offset = offset; + entry->length = 0; + entry->value = value; + atomic_inc(&zswap_same_filled_pages); + goto insert_entry; + } + kunmap_atomic(src); + } + /* if entry is successfully added, it keeps the reference */ entry->pool = zswap_pool_current_get(); if (!entry->pool) { @@ -1037,6 +1087,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, entry->handle = handle; entry->length = dlen; +insert_entry: /* map */ spin_lock(&tree->lock); do { @@ -1089,6 +1140,13 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, } spin_unlock(&tree->lock); + if (!entry->length) { + dst = kmap_atomic(page); + zswap_fill_page(dst, entry->value); + kunmap_atomic(dst); + goto freeentry; + } + /* decompress */ dlen = PAGE_SIZE; src = (u8 *)zpool_map_handle(entry->pool->zpool, entry->handle, @@ -1101,6 +1159,7 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, zpool_unmap_handle(entry->pool->zpool, entry->handle); BUG_ON(ret); +freeentry: spin_lock(&tree->lock); zswap_entry_put(tree, entry); spin_unlock(&tree->lock); @@ -1209,6 +1268,8 @@ static int __init zswap_debugfs_init(void) zswap_debugfs_root, &zswap_pool_total_size); debugfs_create_atomic_t("stored_pages", S_IRUGO, zswap_debugfs_root, &zswap_stored_pages); + debugfs_create_atomic_t("same_filled_pages", 0444, + zswap_debugfs_root, &zswap_same_filled_pages); return 0; } -- cgit v1.2.3 From 2e3ca40f03bb13709df40eff2f7fc157803fa5a3 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Wed, 31 Jan 2018 16:16:02 -0800 Subject: mm: relax deferred struct page requirements There is no need to have ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT, as all the page initialization code is in common code. Also, there is no need to depend on MEMORY_HOTPLUG, as initialization code does not really use hotplug memory functionality. So, we can remove this requirement as well. This patch allows to use deferred struct page initialization on all platforms with memblock allocator. Tested on x86, arm64, and sparc. Also, verified that code compiles on PPC with CONFIG_MEMORY_HOTPLUG disabled. Link: http://lkml.kernel.org/r/20171117014601.31606-1-pasha.tatashin@oracle.com Signed-off-by: Pavel Tatashin Acked-by: Heiko Carstens [s390] Reviewed-by: Khalid Aziz Acked-by: Michael Ellerman Acked-by: Michal Hocko Cc: Steven Sistare Cc: Daniel Jordan Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Kirill A. Shutemov Cc: Reza Arbab Cc: Martin Schwidefsky Cc: Thomas Gleixner Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/Kconfig | 1 - arch/s390/Kconfig | 1 - arch/x86/Kconfig | 1 - mm/Kconfig | 7 +------ 4 files changed, 1 insertion(+), 9 deletions(-) (limited to 'mm') diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index e92432ae9737..73fcf592ee91 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -151,7 +151,6 @@ config PPC select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_MIGHT_HAVE_PC_SERIO select ARCH_SUPPORTS_ATOMIC_RMW - select ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_CMPXCHG_LOCKREF if PPC64 select ARCH_WANT_IPC_PARSE_VERSION diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 9376637229c9..0105ce28e246 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -108,7 +108,6 @@ config S390 select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE select ARCH_SAVE_PAGE_KEYS if HIBERNATION select ARCH_SUPPORTS_ATOMIC_RMW - select ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT select ARCH_SUPPORTS_NUMA_BALANCING select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_CMPXCHG_LOCKREF diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index dbe5542a6666..7a1c51198af1 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -69,7 +69,6 @@ config X86 select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_MIGHT_HAVE_PC_SERIO select ARCH_SUPPORTS_ATOMIC_RMW - select ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT select ARCH_SUPPORTS_NUMA_BALANCING if X86_64 select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_QUEUED_RWLOCKS diff --git a/mm/Kconfig b/mm/Kconfig index 03ff7703d322..c782e8fb7235 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -639,15 +639,10 @@ config MAX_STACK_SIZE_MB A sane initial value is 80 MB. -# For architectures that support deferred memory initialisation -config ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT - bool - config DEFERRED_STRUCT_PAGE_INIT bool "Defer initialisation of struct pages to kthreads" default n - depends on ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT - depends on NO_BOOTMEM && MEMORY_HOTPLUG + depends on NO_BOOTMEM depends on !FLATMEM help Ordinarily all struct pages are initialised during early boot in a -- cgit v1.2.3 From 66f308ed7dab1b3460d186a794e1f9c2d229f709 Mon Sep 17 00:00:00 2001 From: Yisheng Xie Date: Wed, 31 Jan 2018 16:16:07 -0800 Subject: mm/mempolicy: remove redundant check in get_nodes We have already checked whether maxnode is a page worth of bits, by: maxnode > PAGE_SIZE*BITS_PER_BYTE So no need to check it once more. Link: http://lkml.kernel.org/r/1510882624-44342-2-git-send-email-xieyisheng1@huawei.com Signed-off-by: Yisheng Xie Acked-by: Vlastimil Babka Acked-by: David Rientjes Cc: Ingo Molnar Cc: David Rientjes Cc: Naoya Horiguchi Cc: Chris Salls Cc: Andi Kleen Cc: Christopher Lameter Cc: Tan Xiaojun Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4ce44d3ff03d..6e867a8dcca9 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1282,8 +1282,6 @@ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask, /* When the user specified more nodes than supported just check if the non supported part is all zero. */ if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) { - if (nlongs > PAGE_SIZE/sizeof(long)) - return -EINVAL; for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) { unsigned long t; if (get_user(t, nmask + k)) -- cgit v1.2.3 From 56521e7a02b7b84a5e72691a1fb15570e6055545 Mon Sep 17 00:00:00 2001 From: Yisheng Xie Date: Wed, 31 Jan 2018 16:16:11 -0800 Subject: mm/mempolicy: fix the check of nodemask from user As Xiaojun reported the ltp of migrate_pages01 will fail on arm64 system which has 4 nodes[0...3], all have memory and CONFIG_NODES_SHIFT=2: migrate_pages01 0 TINFO : test_invalid_nodes migrate_pages01 14 TFAIL : migrate_pages_common.c:45: unexpected failure - returned value = 0, expected: -1 migrate_pages01 15 TFAIL : migrate_pages_common.c:55: call succeeded unexpectedly In this case the test_invalid_nodes of migrate_pages01 will call: SYSC_migrate_pages as: migrate_pages(0, , {0x0000000000000001}, 64, , {0x0000000000000010}, 64) = 0 The new nodes specifies one or more node IDs that are greater than the maximum supported node ID, however, the errno is not set to EINVAL as expected. As man pages of set_mempolicy[1], mbind[2], and migrate_pages[3] mentioned, when nodemask specifies one or more node IDs that are greater than the maximum supported node ID, the errno should set to EINVAL. However, get_nodes only check whether the part of bits [BITS_PER_LONG*BITS_TO_LONGS(MAX_NUMNODES), maxnode) is zero or not, and remain [MAX_NUMNODES, BITS_PER_LONG*BITS_TO_LONGS(MAX_NUMNODES) unchecked. This patch is to check the bits of [MAX_NUMNODES, maxnode) in get_nodes to let migrate_pages set the errno to EINVAL when nodemask specifies one or more node IDs that are greater than the maximum supported node ID, which follows the manpage's guide. [1] http://man7.org/linux/man-pages/man2/set_mempolicy.2.html [2] http://man7.org/linux/man-pages/man2/mbind.2.html [3] http://man7.org/linux/man-pages/man2/migrate_pages.2.html Link: http://lkml.kernel.org/r/1510882624-44342-3-git-send-email-xieyisheng1@huawei.com Signed-off-by: Yisheng Xie Reported-by: Tan Xiaojun Acked-by: Vlastimil Babka Cc: Andi Kleen Cc: Chris Salls Cc: Christopher Lameter Cc: David Rientjes Cc: Ingo Molnar Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 6e867a8dcca9..65df28d7cc89 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1263,6 +1263,7 @@ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask, unsigned long maxnode) { unsigned long k; + unsigned long t; unsigned long nlongs; unsigned long endmask; @@ -1279,11 +1280,17 @@ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask, else endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1; - /* When the user specified more nodes than supported just check - if the non supported part is all zero. */ + /* + * When the user specified more nodes than supported just check + * if the non supported part is all zero. + * + * If maxnode have more longs than MAX_NUMNODES, check + * the bits in that area first. And then go through to + * check the rest bits which equal or bigger than MAX_NUMNODES. + * Otherwise, just check bits [MAX_NUMNODES, maxnode). + */ if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) { for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) { - unsigned long t; if (get_user(t, nmask + k)) return -EFAULT; if (k == nlongs - 1) { @@ -1296,6 +1303,16 @@ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask, endmask = ~0UL; } + if (maxnode > MAX_NUMNODES && MAX_NUMNODES % BITS_PER_LONG != 0) { + unsigned long valid_mask = endmask; + + valid_mask &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1); + if (get_user(t, nmask + nlongs - 1)) + return -EFAULT; + if (t & valid_mask) + return -EINVAL; + } + if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long))) return -EFAULT; nodes_addr(*nodes)[nlongs-1] &= endmask; -- cgit v1.2.3 From 0486a38bcc4749808edbc848f1bcf232042770fc Mon Sep 17 00:00:00 2001 From: Yisheng Xie Date: Wed, 31 Jan 2018 16:16:15 -0800 Subject: mm/mempolicy: add nodes_empty check in SYSC_migrate_pages As in manpage of migrate_pages, the errno should be set to EINVAL when none of the node IDs specified by new_nodes are on-line and allowed by the process's current cpuset context, or none of the specified nodes contain memory. However, when test by following case: new_nodes = 0; old_nodes = 0xf; ret = migrate_pages(pid, old_nodes, new_nodes, MAX); The ret will be 0 and no errno is set. As the new_nodes is empty, we should expect EINVAL as documented. To fix the case like above, this patch check whether target nodes AND current task_nodes is empty, and then check whether AND node_states[N_MEMORY] is empty. Link: http://lkml.kernel.org/r/1510882624-44342-4-git-send-email-xieyisheng1@huawei.com Signed-off-by: Yisheng Xie Acked-by: Vlastimil Babka Cc: Andi Kleen Cc: Chris Salls Cc: Christopher Lameter Cc: David Rientjes Cc: Ingo Molnar Cc: Naoya Horiguchi Cc: Tan Xiaojun Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 65df28d7cc89..f604b22ebb65 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1433,10 +1433,14 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, goto out_put; } - if (!nodes_subset(*new, node_states[N_MEMORY])) { - err = -EINVAL; + task_nodes = cpuset_mems_allowed(current); + nodes_and(*new, *new, task_nodes); + if (nodes_empty(*new)) + goto out_put; + + nodes_and(*new, *new, node_states[N_MEMORY]); + if (nodes_empty(*new)) goto out_put; - } err = security_task_movememory(task); if (err) -- cgit v1.2.3 From 9852a7212324fd25f896932f4f4607ce47b0a22f Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 31 Jan 2018 16:16:19 -0800 Subject: mm: drop hotplug lock from lru_add_drain_all() Pulling cpu hotplug locks inside the mm core function like lru_add_drain_all just asks for problems and the recent lockdep splat [1] just proves this. While the usage in that particular case might be wrong we should avoid the locking as lru_add_drain_all() is used in many places. It seems that this is not all that hard to achieve actually. We have done the same thing for drain_all_pages which is analogous by commit a459eeb7b852 ("mm, page_alloc: do not depend on cpu hotplug locks inside the allocator"). All we have to care about is to handle - the work item might be executed on a different cpu in worker from unbound pool so it doesn't run on pinned on the cpu - we have to make sure that we do not race with page_alloc_cpu_dead calling lru_add_drain_cpu the first part is already handled because the worker calls lru_add_drain which disables preemption when calling lru_add_drain_cpu on the local cpu it is draining. The later is true because page_alloc_cpu_dead is called on the controlling CPU after the hotplugged CPU vanished completely. [1] http://lkml.kernel.org/r/089e0825eec8955c1f055c83d476@google.com [add a cpu hotplug locking interaction as per tglx] Link: http://lkml.kernel.org/r/20171116120535.23765-1-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Thomas Gleixner Cc: Tejun Heo Cc: Peter Zijlstra Cc: Johannes Weiner Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 1 - mm/memory_hotplug.c | 2 +- mm/swap.c | 16 ++++++++-------- 3 files changed, 9 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/include/linux/swap.h b/include/linux/swap.h index c2b8128799c1..0bd4c25016f9 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -332,7 +332,6 @@ extern void mark_page_accessed(struct page *); extern void lru_add_drain(void); extern void lru_add_drain_cpu(int cpu); extern void lru_add_drain_all(void); -extern void lru_add_drain_all_cpuslocked(void); extern void rotate_reclaimable_page(struct page *page); extern void deactivate_file_page(struct page *page); extern void mark_page_lazyfree(struct page *page); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index c52aa05b106c..999ce3af809d 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1637,7 +1637,7 @@ repeat: goto failed_removal; cond_resched(); - lru_add_drain_all_cpuslocked(); + lru_add_drain_all(); drain_all_pages(zone); pfn = scan_movable_pages(start_pfn, end_pfn); diff --git a/mm/swap.c b/mm/swap.c index 38e1b6374a97..e824c800adca 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -688,7 +688,14 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy) static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work); -void lru_add_drain_all_cpuslocked(void) +/* + * Doesn't need any cpu hotplug locking because we do rely on per-cpu + * kworkers being shut down before our page_alloc_cpu_dead callback is + * executed on the offlined cpu. + * Calling this function with cpu hotplug locks held can actually lead + * to obscure indirect dependencies via WQ context. + */ +void lru_add_drain_all(void) { static DEFINE_MUTEX(lock); static struct cpumask has_work; @@ -724,13 +731,6 @@ void lru_add_drain_all_cpuslocked(void) mutex_unlock(&lock); } -void lru_add_drain_all(void) -{ - get_online_cpus(); - lru_add_drain_all_cpuslocked(); - put_online_cpus(); -} - /** * release_pages - batched put_page() * @pages: array of pages to release -- cgit v1.2.3 From fcb2b0c577f145c7616843c9d4dcb4f9e5d88e29 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Wed, 31 Jan 2018 16:16:22 -0800 Subject: mm: show total hugetlb memory consumption in /proc/meminfo Currently we display some hugepage statistics (total, free, etc) in /proc/meminfo, but only for default hugepage size (e.g. 2Mb). If hugepages of different sizes are used (like 2Mb and 1Gb on x86-64), /proc/meminfo output can be confusing, as non-default sized hugepages are not reflected at all, and there are no signs that they are existing and consuming system memory. To solve this problem, let's display the total amount of memory, consumed by hugetlb pages of all sized (both free and used). Let's call it "Hugetlb", and display size in kB to match generic /proc/meminfo style. For example, (1024 2Mb pages and 2 1Gb pages are pre-allocated): $ cat /proc/meminfo MemTotal: 8168984 kB MemFree: 3789276 kB <...> CmaFree: 0 kB HugePages_Total: 1024 HugePages_Free: 1024 HugePages_Rsvd: 0 HugePages_Surp: 0 Hugepagesize: 2048 kB Hugetlb: 4194304 kB DirectMap4k: 32632 kB DirectMap2M: 4161536 kB DirectMap1G: 6291456 kB Also, this patch updates corresponding docs to reflect Hugetlb entry meaning and difference between Hugetlb and HugePages_Total * Hugepagesize. Link: http://lkml.kernel.org/r/20171115231409.12131-1-guro@fb.com Signed-off-by: Roman Gushchin Acked-by: Michal Hocko Acked-by: Johannes Weiner Acked-by: David Rientjes Cc: Mike Kravetz Cc: "Aneesh Kumar K.V" Cc: Andrea Arcangeli Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/hugetlbpage.txt | 27 ++++++++++++++++++--------- mm/hugetlb.c | 36 ++++++++++++++++++++++++------------ 2 files changed, 42 insertions(+), 21 deletions(-) (limited to 'mm') diff --git a/Documentation/vm/hugetlbpage.txt b/Documentation/vm/hugetlbpage.txt index 59cbc803aad6..faf077d50d42 100644 --- a/Documentation/vm/hugetlbpage.txt +++ b/Documentation/vm/hugetlbpage.txt @@ -20,19 +20,20 @@ options. The /proc/meminfo file provides information about the total number of persistent hugetlb pages in the kernel's huge page pool. It also displays -information about the number of free, reserved and surplus huge pages and the -default huge page size. The huge page size is needed for generating the -proper alignment and size of the arguments to system calls that map huge page -regions. +default huge page size and information about the number of free, reserved +and surplus huge pages in the pool of huge pages of default size. +The huge page size is needed for generating the proper alignment and +size of the arguments to system calls that map huge page regions. The output of "cat /proc/meminfo" will include lines like: ..... -HugePages_Total: vvv -HugePages_Free: www -HugePages_Rsvd: xxx -HugePages_Surp: yyy -Hugepagesize: zzz kB +HugePages_Total: uuu +HugePages_Free: vvv +HugePages_Rsvd: www +HugePages_Surp: xxx +Hugepagesize: yyy kB +Hugetlb: zzz kB where: HugePages_Total is the size of the pool of huge pages. @@ -47,6 +48,14 @@ HugePages_Surp is short for "surplus," and is the number of huge pages in the pool above the value in /proc/sys/vm/nr_hugepages. The maximum number of surplus huge pages is controlled by /proc/sys/vm/nr_overcommit_hugepages. +Hugepagesize is the default hugepage size (in Kb). +Hugetlb is the total amount of memory (in kB), consumed by huge + pages of all sizes. + If huge pages of different sizes are in use, this number + will exceed HugePages_Total * Hugepagesize. To get more + detailed information, please, refer to + /sys/kernel/mm/hugepages (described below). + /proc/filesystems should also show a filesystem of type "hugetlbfs" configured in the kernel. diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 9a334f5fb730..1e6a5ad0d420 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2975,20 +2975,32 @@ out: void hugetlb_report_meminfo(struct seq_file *m) { - struct hstate *h = &default_hstate; + struct hstate *h; + unsigned long total = 0; + if (!hugepages_supported()) return; - seq_printf(m, - "HugePages_Total: %5lu\n" - "HugePages_Free: %5lu\n" - "HugePages_Rsvd: %5lu\n" - "HugePages_Surp: %5lu\n" - "Hugepagesize: %8lu kB\n", - h->nr_huge_pages, - h->free_huge_pages, - h->resv_huge_pages, - h->surplus_huge_pages, - 1UL << (huge_page_order(h) + PAGE_SHIFT - 10)); + + for_each_hstate(h) { + unsigned long count = h->nr_huge_pages; + + total += (PAGE_SIZE << huge_page_order(h)) * count; + + if (h == &default_hstate) + seq_printf(m, + "HugePages_Total: %5lu\n" + "HugePages_Free: %5lu\n" + "HugePages_Rsvd: %5lu\n" + "HugePages_Surp: %5lu\n" + "Hugepagesize: %8lu kB\n", + count, + h->free_huge_pages, + h->resv_huge_pages, + h->surplus_huge_pages, + (PAGE_SIZE << huge_page_order(h)) / 1024); + } + + seq_printf(m, "Hugetlb: %8lu kB\n", total / 1024); } int hugetlb_report_node_meminfo(int nid, char *buf) -- cgit v1.2.3 From 9092c71bb724dba2ecba849eae69e5c9d39bd3d2 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 31 Jan 2018 16:16:26 -0800 Subject: mm: use sc->priority for slab shrink targets Previously we were using the ratio of the number of lru pages scanned to the number of eligible lru pages to determine the number of slab objects to scan. The problem with this is that these two things have nothing to do with each other, so in slab heavy work loads where there is little to no page cache we can end up with the pages scanned being a very low number. This means that we reclaim next to no slab pages and waste a lot of time reclaiming small amounts of space. Consider the following scenario, where we have the following values and the rest of the memory usage is in slab Active: 58840 kB Inactive: 46860 kB Every time we do a get_scan_count() we do this scan = size >> sc->priority where sc->priority starts at DEF_PRIORITY, which is 12. The first loop through reclaim would result in a scan target of 2 pages to 11715 total inactive pages, and 3 pages to 14710 total active pages. This is a really really small target for a system that is entirely slab pages. And this is super optimistic, this assumes we even get to scan these pages. We don't increment sc->nr_scanned unless we 1) isolate the page, which assumes it's not in use, and 2) can lock the page. Under pressure these numbers could probably go down, I'm sure there's some random pages from daemons that aren't actually in use, so the targets get even smaller. Instead use sc->priority in the same way we use it to determine scan amounts for the lru's. This generally equates to pages. Consider the following slab_pages = (nr_objects * object_size) / PAGE_SIZE What we would like to do is scan = slab_pages >> sc->priority but we don't know the number of slab pages each shrinker controls, only the objects. However say that theoretically we knew how many pages a shrinker controlled, we'd still have to convert this to objects, which would look like the following scan = shrinker_pages >> sc->priority scan_objects = (PAGE_SIZE / object_size) * scan or written another way scan_objects = (shrinker_pages >> sc->priority) * (PAGE_SIZE / object_size) which can thus be written scan_objects = ((shrinker_pages * PAGE_SIZE) / object_size) >> sc->priority which is just scan_objects = nr_objects >> sc->priority We don't need to know exactly how many pages each shrinker represents, it's objects are all the information we need. Making this change allows us to place an appropriate amount of pressure on the shrinker pools for their relative size. Link: http://lkml.kernel.org/r/1510780549-6812-1-git-send-email-josef@toxicpanda.com Signed-off-by: Josef Bacik Acked-by: Johannes Weiner Acked-by: Dave Chinner Acked-by: Andrey Ryabinin Cc: Michal Hocko Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/vmscan.h | 23 +++++++++------------ mm/vmscan.c | 47 ++++++++++++------------------------------- 2 files changed, 23 insertions(+), 47 deletions(-) (limited to 'mm') diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index d70b53e65f43..e0b8b9173e1c 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -192,12 +192,12 @@ DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_memcg_softlimit_re TRACE_EVENT(mm_shrink_slab_start, TP_PROTO(struct shrinker *shr, struct shrink_control *sc, - long nr_objects_to_shrink, unsigned long pgs_scanned, - unsigned long lru_pgs, unsigned long cache_items, - unsigned long long delta, unsigned long total_scan), + long nr_objects_to_shrink, unsigned long cache_items, + unsigned long long delta, unsigned long total_scan, + int priority), - TP_ARGS(shr, sc, nr_objects_to_shrink, pgs_scanned, lru_pgs, - cache_items, delta, total_scan), + TP_ARGS(shr, sc, nr_objects_to_shrink, cache_items, delta, total_scan, + priority), TP_STRUCT__entry( __field(struct shrinker *, shr) @@ -205,11 +205,10 @@ TRACE_EVENT(mm_shrink_slab_start, __field(int, nid) __field(long, nr_objects_to_shrink) __field(gfp_t, gfp_flags) - __field(unsigned long, pgs_scanned) - __field(unsigned long, lru_pgs) __field(unsigned long, cache_items) __field(unsigned long long, delta) __field(unsigned long, total_scan) + __field(int, priority) ), TP_fast_assign( @@ -218,24 +217,22 @@ TRACE_EVENT(mm_shrink_slab_start, __entry->nid = sc->nid; __entry->nr_objects_to_shrink = nr_objects_to_shrink; __entry->gfp_flags = sc->gfp_mask; - __entry->pgs_scanned = pgs_scanned; - __entry->lru_pgs = lru_pgs; __entry->cache_items = cache_items; __entry->delta = delta; __entry->total_scan = total_scan; + __entry->priority = priority; ), - TP_printk("%pF %p: nid: %d objects to shrink %ld gfp_flags %s pgs_scanned %ld lru_pgs %ld cache items %ld delta %lld total_scan %ld", + TP_printk("%pF %p: nid: %d objects to shrink %ld gfp_flags %s cache items %ld delta %lld total_scan %ld priority %d", __entry->shrink, __entry->shr, __entry->nid, __entry->nr_objects_to_shrink, show_gfp_flags(__entry->gfp_flags), - __entry->pgs_scanned, - __entry->lru_pgs, __entry->cache_items, __entry->delta, - __entry->total_scan) + __entry->total_scan, + __entry->priority) ); TRACE_EVENT(mm_shrink_slab_end, diff --git a/mm/vmscan.c b/mm/vmscan.c index 47d5ced51f2d..e73274a60b22 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -310,9 +310,7 @@ EXPORT_SYMBOL(unregister_shrinker); #define SHRINK_BATCH 128 static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, - struct shrinker *shrinker, - unsigned long nr_scanned, - unsigned long nr_eligible) + struct shrinker *shrinker, int priority) { unsigned long freed = 0; unsigned long long delta; @@ -337,9 +335,9 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0); total_scan = nr; - delta = (4 * nr_scanned) / shrinker->seeks; - delta *= freeable; - do_div(delta, nr_eligible + 1); + delta = freeable >> priority; + delta *= 4; + do_div(delta, shrinker->seeks); total_scan += delta; if (total_scan < 0) { pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n", @@ -373,8 +371,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, total_scan = freeable * 2; trace_mm_shrink_slab_start(shrinker, shrinkctl, nr, - nr_scanned, nr_eligible, - freeable, delta, total_scan); + freeable, delta, total_scan, priority); /* * Normally, we should not scan less than batch_size objects in one @@ -434,8 +431,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, * @gfp_mask: allocation context * @nid: node whose slab caches to target * @memcg: memory cgroup whose slab caches to target - * @nr_scanned: pressure numerator - * @nr_eligible: pressure denominator + * @priority: the reclaim priority * * Call the shrink functions to age shrinkable caches. * @@ -447,20 +443,14 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, * objects from the memory cgroup specified. Otherwise, only unaware * shrinkers are called. * - * @nr_scanned and @nr_eligible form a ratio that indicate how much of - * the available objects should be scanned. Page reclaim for example - * passes the number of pages scanned and the number of pages on the - * LRU lists that it considered on @nid, plus a bias in @nr_scanned - * when it encountered mapped pages. The ratio is further biased by - * the ->seeks setting of the shrink function, which indicates the - * cost to recreate an object relative to that of an LRU page. + * @priority is sc->priority, we take the number of objects and >> by priority + * in order to get the scan target. * * Returns the number of reclaimed slab objects. */ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, - unsigned long nr_scanned, - unsigned long nr_eligible) + int priority) { struct shrinker *shrinker; unsigned long freed = 0; @@ -468,9 +458,6 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, if (memcg && (!memcg_kmem_enabled() || !mem_cgroup_online(memcg))) return 0; - if (nr_scanned == 0) - nr_scanned = SWAP_CLUSTER_MAX; - if (!down_read_trylock(&shrinker_rwsem)) { /* * If we would return 0, our callers would understand that we @@ -501,7 +488,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) sc.nid = 0; - freed += do_shrink_slab(&sc, shrinker, nr_scanned, nr_eligible); + freed += do_shrink_slab(&sc, shrinker, priority); } up_read(&shrinker_rwsem); @@ -519,8 +506,7 @@ void drop_slab_node(int nid) freed = 0; do { - freed += shrink_slab(GFP_KERNEL, nid, memcg, - 1000, 1000); + freed += shrink_slab(GFP_KERNEL, nid, memcg, 0); } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); } while (freed > 10); } @@ -2615,14 +2601,12 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) reclaimed = sc->nr_reclaimed; scanned = sc->nr_scanned; - shrink_node_memcg(pgdat, memcg, sc, &lru_pages); node_lru_pages += lru_pages; if (memcg) shrink_slab(sc->gfp_mask, pgdat->node_id, - memcg, sc->nr_scanned - scanned, - lru_pages); + memcg, sc->priority); /* Record the group's reclaim efficiency */ vmpressure(sc->gfp_mask, memcg, false, @@ -2646,14 +2630,9 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) } } while ((memcg = mem_cgroup_iter(root, memcg, &reclaim))); - /* - * Shrink the slab caches in the same proportion that - * the eligible LRU pages were scanned. - */ if (global_reclaim(sc)) shrink_slab(sc->gfp_mask, pgdat->node_id, NULL, - sc->nr_scanned - nr_scanned, - node_lru_pages); + sc->priority); if (reclaim_state) { sc->nr_reclaimed += reclaim_state->reclaimed_slab; -- cgit v1.2.3 From 80b1f41c0957a9da3bab4fb9ae76dc886753a59b Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Wed, 31 Jan 2018 16:16:30 -0800 Subject: mm: split deferred_init_range into initializing and freeing parts In deferred_init_range() we initialize struct pages, and also free them to buddy allocator. We do it in separate loops, because buddy page is computed ahead, so we do not want to access a struct page that has not been initialized yet. There is still, however, a corner case where it is potentially possible to access uninitialized struct page: this is when buddy page is from the next memblock range. This patch fixes this problem by splitting deferred_init_range() into two functions: one to initialize struct pages, and another to free them. In addition, this patch brings the following improvements: - Get rid of __def_free() helper function. And simplifies loop logic by adding a new pfn validity check function: deferred_pfn_valid(). - Reduces number of variables that we track. So, there is a higher chance that we will avoid using stack to store/load variables inside hot loops. - Enables future multi-threading of these functions: do initialization in multiple threads, wait for all threads to finish, do freeing part in multithreading. Tested on x86 with 1T of memory to make sure no regressions are introduced. [akpm@linux-foundation.org: fix spello in comment] Link: http://lkml.kernel.org/r/20171107150446.32055-2-pasha.tatashin@oracle.com Signed-off-by: Pavel Tatashin Acked-by: Michal Hocko Cc: Steven Sistare Cc: Daniel Jordan Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 146 +++++++++++++++++++++++++++++--------------------------- 1 file changed, 76 insertions(+), 70 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 76c9688b6a0a..a73cffe287a5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1457,92 +1457,87 @@ static inline void __init pgdat_init_report_one_done(void) } /* - * Helper for deferred_init_range, free the given range, reset the counters, and - * return number of pages freed. + * Returns true if page needs to be initialized or freed to buddy allocator. + * + * First we check if pfn is valid on architectures where it is possible to have + * holes within pageblock_nr_pages. On systems where it is not possible, this + * function is optimized out. + * + * Then, we check if a current large page is valid by only checking the validity + * of the head pfn. + * + * Finally, meminit_pfn_in_nid is checked on systems where pfns can interleave + * within a node: a pfn is between start and end of a node, but does not belong + * to this memory node. */ -static inline unsigned long __init __def_free(unsigned long *nr_free, - unsigned long *free_base_pfn, - struct page **page) +static inline bool __init +deferred_pfn_valid(int nid, unsigned long pfn, + struct mminit_pfnnid_cache *nid_init_state) { - unsigned long nr = *nr_free; + if (!pfn_valid_within(pfn)) + return false; + if (!(pfn & (pageblock_nr_pages - 1)) && !pfn_valid(pfn)) + return false; + if (!meminit_pfn_in_nid(pfn, nid, nid_init_state)) + return false; + return true; +} - deferred_free_range(*free_base_pfn, nr); - *free_base_pfn = 0; - *nr_free = 0; - *page = NULL; +/* + * Free pages to buddy allocator. Try to free aligned pages in + * pageblock_nr_pages sizes. + */ +static void __init deferred_free_pages(int nid, int zid, unsigned long pfn, + unsigned long end_pfn) +{ + struct mminit_pfnnid_cache nid_init_state = { }; + unsigned long nr_pgmask = pageblock_nr_pages - 1; + unsigned long nr_free = 0; - return nr; + for (; pfn < end_pfn; pfn++) { + if (!deferred_pfn_valid(nid, pfn, &nid_init_state)) { + deferred_free_range(pfn - nr_free, nr_free); + nr_free = 0; + } else if (!(pfn & nr_pgmask)) { + deferred_free_range(pfn - nr_free, nr_free); + nr_free = 1; + cond_resched(); + } else { + nr_free++; + } + } + /* Free the last block of pages to allocator */ + deferred_free_range(pfn - nr_free, nr_free); } -static unsigned long __init deferred_init_range(int nid, int zid, - unsigned long start_pfn, - unsigned long end_pfn) +/* + * Initialize struct pages. We minimize pfn page lookups and scheduler checks + * by performing it only once every pageblock_nr_pages. + * Return number of pages initialized. + */ +static unsigned long __init deferred_init_pages(int nid, int zid, + unsigned long pfn, + unsigned long end_pfn) { struct mminit_pfnnid_cache nid_init_state = { }; unsigned long nr_pgmask = pageblock_nr_pages - 1; - unsigned long free_base_pfn = 0; unsigned long nr_pages = 0; - unsigned long nr_free = 0; struct page *page = NULL; - unsigned long pfn; - /* - * First we check if pfn is valid on architectures where it is possible - * to have holes within pageblock_nr_pages. On systems where it is not - * possible, this function is optimized out. - * - * Then, we check if a current large page is valid by only checking the - * validity of the head pfn. - * - * meminit_pfn_in_nid is checked on systems where pfns can interleave - * within a node: a pfn is between start and end of a node, but does not - * belong to this memory node. - * - * Finally, we minimize pfn page lookups and scheduler checks by - * performing it only once every pageblock_nr_pages. - * - * We do it in two loops: first we initialize struct page, than free to - * buddy allocator, becuse while we are freeing pages we can access - * pages that are ahead (computing buddy page in __free_one_page()). - */ - for (pfn = start_pfn; pfn < end_pfn; pfn++) { - if (!pfn_valid_within(pfn)) + for (; pfn < end_pfn; pfn++) { + if (!deferred_pfn_valid(nid, pfn, &nid_init_state)) { + page = NULL; continue; - if ((pfn & nr_pgmask) || pfn_valid(pfn)) { - if (meminit_pfn_in_nid(pfn, nid, &nid_init_state)) { - if (page && (pfn & nr_pgmask)) - page++; - else - page = pfn_to_page(pfn); - __init_single_page(page, pfn, zid, nid); - cond_resched(); - } - } - } - - page = NULL; - for (pfn = start_pfn; pfn < end_pfn; pfn++) { - if (!pfn_valid_within(pfn)) { - nr_pages += __def_free(&nr_free, &free_base_pfn, &page); - } else if (!(pfn & nr_pgmask) && !pfn_valid(pfn)) { - nr_pages += __def_free(&nr_free, &free_base_pfn, &page); - } else if (!meminit_pfn_in_nid(pfn, nid, &nid_init_state)) { - nr_pages += __def_free(&nr_free, &free_base_pfn, &page); - } else if (page && (pfn & nr_pgmask)) { - page++; - nr_free++; - } else { - nr_pages += __def_free(&nr_free, &free_base_pfn, &page); + } else if (!page || !(pfn & nr_pgmask)) { page = pfn_to_page(pfn); - free_base_pfn = pfn; - nr_free = 1; cond_resched(); + } else { + page++; } + __init_single_page(page, pfn, zid, nid); + nr_pages++; } - /* Free the last block of pages to allocator */ - nr_pages += __def_free(&nr_free, &free_base_pfn, &page); - - return nr_pages; + return (nr_pages); } /* Initialise remaining memory on a node */ @@ -1582,10 +1577,21 @@ static int __init deferred_init_memmap(void *data) } first_init_pfn = max(zone->zone_start_pfn, first_init_pfn); + /* + * Initialize and free pages. We do it in two loops: first we initialize + * struct page, than free to buddy allocator, because while we are + * freeing pages we can access pages that are ahead (computing buddy + * page in __free_one_page()). + */ + for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) { + spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa)); + epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa)); + nr_pages += deferred_init_pages(nid, zid, spfn, epfn); + } for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) { spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa)); epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa)); - nr_pages += deferred_init_range(nid, zid, spfn, epfn); + deferred_free_pages(nid, zid, spfn, epfn); } /* Sanity check that the next zone really is unpopulated */ -- cgit v1.2.3 From 2b9fceb3b47b7c44fb04eef068f441e7b18daa68 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Wed, 31 Jan 2018 16:16:34 -0800 Subject: mm/filemap.c: remove include of hardirq.h in_atomic() has been moved to include/linux/preempt.h, and the filemap.c doesn't use in_atomic() directly at all, so it sounds unnecessary to include hardirq.h. Link: http://lkml.kernel.org/r/1509985319-38633-1-git-send-email-yang.s@alibaba-inc.com Signed-off-by: Yang Shi Reviewed-by: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 1 - 1 file changed, 1 deletion(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index ee83baaf855d..693f62212a59 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -31,7 +31,6 @@ #include #include #include -#include /* for BUG_ON(!in_atomic()) only */ #include #include #include -- cgit v1.2.3 From c9019e9bf42e66d028d70d2da6206cad4dd9250d Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 31 Jan 2018 16:16:37 -0800 Subject: mm: memcontrol: eliminate raw access to stat and event counters Replace all raw 'this_cpu_' modifications of the stat and event per-cpu counters with API functions such as mod_memcg_state(). This makes the code easier to read, but is also in preparation for the next patch, which changes the per-cpu implementation of those counters. Link: http://lkml.kernel.org/r/20171103153336.24044-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Vladimir Davydov Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 31 +++++++++++++++--------- mm/memcontrol.c | 59 ++++++++++++++++++++-------------------------- 2 files changed, 45 insertions(+), 45 deletions(-) (limited to 'mm') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 69966c461d1c..2c80b69dd266 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -272,13 +272,6 @@ static inline bool mem_cgroup_disabled(void) return !cgroup_subsys_enabled(memory_cgrp_subsys); } -static inline void mem_cgroup_event(struct mem_cgroup *memcg, - enum memcg_event_item event) -{ - this_cpu_inc(memcg->stat->events[event]); - cgroup_file_notify(&memcg->events_file); -} - bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg); int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, @@ -627,15 +620,23 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, gfp_t gfp_mask, unsigned long *total_scanned); +/* idx can be of type enum memcg_event_item or vm_event_item */ +static inline void __count_memcg_events(struct mem_cgroup *memcg, + int idx, unsigned long count) +{ + if (!mem_cgroup_disabled()) + __this_cpu_add(memcg->stat->events[idx], count); +} + +/* idx can be of type enum memcg_event_item or vm_event_item */ static inline void count_memcg_events(struct mem_cgroup *memcg, - enum vm_event_item idx, - unsigned long count) + int idx, unsigned long count) { if (!mem_cgroup_disabled()) this_cpu_add(memcg->stat->events[idx], count); } -/* idx can be of type enum memcg_stat_item or node_stat_item */ +/* idx can be of type enum memcg_event_item or vm_event_item */ static inline void count_memcg_page_event(struct page *page, int idx) { @@ -654,12 +655,20 @@ static inline void count_memcg_event_mm(struct mm_struct *mm, rcu_read_lock(); memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); if (likely(memcg)) { - this_cpu_inc(memcg->stat->events[idx]); + count_memcg_events(memcg, idx, 1); if (idx == OOM_KILL) cgroup_file_notify(&memcg->events_file); } rcu_read_unlock(); } + +static inline void mem_cgroup_event(struct mem_cgroup *memcg, + enum memcg_event_item event) +{ + count_memcg_events(memcg, event, 1); + cgroup_file_notify(&memcg->events_file); +} + #ifdef CONFIG_TRANSPARENT_HUGEPAGE void mem_cgroup_split_huge_fixup(struct page *head); #endif diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9011997d8a5c..23841af1d756 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -586,23 +586,23 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, * counted as CACHE even if it's on ANON LRU. */ if (PageAnon(page)) - __this_cpu_add(memcg->stat->count[MEMCG_RSS], nr_pages); + __mod_memcg_state(memcg, MEMCG_RSS, nr_pages); else { - __this_cpu_add(memcg->stat->count[MEMCG_CACHE], nr_pages); + __mod_memcg_state(memcg, MEMCG_CACHE, nr_pages); if (PageSwapBacked(page)) - __this_cpu_add(memcg->stat->count[NR_SHMEM], nr_pages); + __mod_memcg_state(memcg, NR_SHMEM, nr_pages); } if (compound) { VM_BUG_ON_PAGE(!PageTransHuge(page), page); - __this_cpu_add(memcg->stat->count[MEMCG_RSS_HUGE], nr_pages); + __mod_memcg_state(memcg, MEMCG_RSS_HUGE, nr_pages); } /* pagein of a big page is an event. So, ignore page size */ if (nr_pages > 0) - __this_cpu_inc(memcg->stat->events[PGPGIN]); + __count_memcg_events(memcg, PGPGIN, 1); else { - __this_cpu_inc(memcg->stat->events[PGPGOUT]); + __count_memcg_events(memcg, PGPGOUT, 1); nr_pages = -nr_pages; /* for event */ } @@ -2415,18 +2415,11 @@ void mem_cgroup_split_huge_fixup(struct page *head) for (i = 1; i < HPAGE_PMD_NR; i++) head[i].mem_cgroup = head->mem_cgroup; - __this_cpu_sub(head->mem_cgroup->stat->count[MEMCG_RSS_HUGE], - HPAGE_PMD_NR); + __mod_memcg_state(head->mem_cgroup, MEMCG_RSS_HUGE, -HPAGE_PMD_NR); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #ifdef CONFIG_MEMCG_SWAP -static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, - int nr_entries) -{ - this_cpu_add(memcg->stat->count[MEMCG_SWAP], nr_entries); -} - /** * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. * @entry: swap entry to be moved @@ -2450,8 +2443,8 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry, new_id = mem_cgroup_id(to); if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { - mem_cgroup_swap_statistics(from, -1); - mem_cgroup_swap_statistics(to, 1); + mod_memcg_state(from, MEMCG_SWAP, -1); + mod_memcg_state(to, MEMCG_SWAP, 1); return 0; } return -EINVAL; @@ -4584,8 +4577,8 @@ static int mem_cgroup_move_account(struct page *page, spin_lock_irqsave(&from->move_lock, flags); if (!anon && page_mapped(page)) { - __this_cpu_sub(from->stat->count[NR_FILE_MAPPED], nr_pages); - __this_cpu_add(to->stat->count[NR_FILE_MAPPED], nr_pages); + __mod_memcg_state(from, NR_FILE_MAPPED, -nr_pages); + __mod_memcg_state(to, NR_FILE_MAPPED, nr_pages); } /* @@ -4597,16 +4590,14 @@ static int mem_cgroup_move_account(struct page *page, struct address_space *mapping = page_mapping(page); if (mapping_cap_account_dirty(mapping)) { - __this_cpu_sub(from->stat->count[NR_FILE_DIRTY], - nr_pages); - __this_cpu_add(to->stat->count[NR_FILE_DIRTY], - nr_pages); + __mod_memcg_state(from, NR_FILE_DIRTY, -nr_pages); + __mod_memcg_state(to, NR_FILE_DIRTY, nr_pages); } } if (PageWriteback(page)) { - __this_cpu_sub(from->stat->count[NR_WRITEBACK], nr_pages); - __this_cpu_add(to->stat->count[NR_WRITEBACK], nr_pages); + __mod_memcg_state(from, NR_WRITEBACK, -nr_pages); + __mod_memcg_state(to, NR_WRITEBACK, nr_pages); } /* @@ -5642,11 +5633,11 @@ static void uncharge_batch(const struct uncharge_gather *ug) } local_irq_save(flags); - __this_cpu_sub(ug->memcg->stat->count[MEMCG_RSS], ug->nr_anon); - __this_cpu_sub(ug->memcg->stat->count[MEMCG_CACHE], ug->nr_file); - __this_cpu_sub(ug->memcg->stat->count[MEMCG_RSS_HUGE], ug->nr_huge); - __this_cpu_sub(ug->memcg->stat->count[NR_SHMEM], ug->nr_shmem); - __this_cpu_add(ug->memcg->stat->events[PGPGOUT], ug->pgpgout); + __mod_memcg_state(ug->memcg, MEMCG_RSS, -ug->nr_anon); + __mod_memcg_state(ug->memcg, MEMCG_CACHE, -ug->nr_file); + __mod_memcg_state(ug->memcg, MEMCG_RSS_HUGE, -ug->nr_huge); + __mod_memcg_state(ug->memcg, NR_SHMEM, -ug->nr_shmem); + __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout); __this_cpu_add(ug->memcg->stat->nr_page_events, nr_pages); memcg_check_events(ug->memcg, ug->dummy_page); local_irq_restore(flags); @@ -5874,7 +5865,7 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) if (in_softirq()) gfp_mask = GFP_NOWAIT; - this_cpu_add(memcg->stat->count[MEMCG_SOCK], nr_pages); + mod_memcg_state(memcg, MEMCG_SOCK, nr_pages); if (try_charge(memcg, gfp_mask, nr_pages) == 0) return true; @@ -5895,7 +5886,7 @@ void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) return; } - this_cpu_sub(memcg->stat->count[MEMCG_SOCK], nr_pages); + mod_memcg_state(memcg, MEMCG_SOCK, -nr_pages); refill_stock(memcg, nr_pages); } @@ -6019,7 +6010,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg), nr_entries); VM_BUG_ON_PAGE(oldid, page); - mem_cgroup_swap_statistics(swap_memcg, nr_entries); + mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries); page->mem_cgroup = NULL; @@ -6085,7 +6076,7 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) mem_cgroup_id_get_many(memcg, nr_pages - 1); oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages); VM_BUG_ON_PAGE(oldid, page); - mem_cgroup_swap_statistics(memcg, nr_pages); + mod_memcg_state(memcg, MEMCG_SWAP, nr_pages); return 0; } @@ -6113,7 +6104,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) else page_counter_uncharge(&memcg->memsw, nr_pages); } - mem_cgroup_swap_statistics(memcg, -nr_pages); + mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages); mem_cgroup_id_put_many(memcg, nr_pages); } rcu_read_unlock(); -- cgit v1.2.3 From a983b5ebee57209c99f68c8327072f25e0e6e3da Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 31 Jan 2018 16:16:45 -0800 Subject: mm: memcontrol: fix excessive complexity in memory.stat reporting We've seen memory.stat reads in top-level cgroups take up to fourteen seconds during a userspace bug that created tens of thousands of ghost cgroups pinned by lingering page cache. Even with a more reasonable number of cgroups, aggregating memory.stat is unnecessarily heavy. The complexity is this: nr_cgroups * nr_stat_items * nr_possible_cpus where the stat items are ~70 at this point. With 128 cgroups and 128 CPUs - decent, not enormous setups - reading the top-level memory.stat has to aggregate over a million per-cpu counters. This doesn't scale. Instead of spreading the source of truth across all CPUs, use the per-cpu counters merely to batch updates to shared atomic counters. This is the same as the per-cpu stocks we use for charging memory to the shared atomic page_counters, and also the way the global vmstat counters are implemented. Vmstat has elaborate spilling thresholds that depend on the number of CPUs, amount of memory, and memory pressure - carefully balancing the cost of counter updates with the amount of per-cpu error. That's because the vmstat counters are system-wide, but also used for decisions inside the kernel (e.g. NR_FREE_PAGES in the allocator). Neither is true for the memory controller. Use the same static batch size we already use for page_counter updates during charging. The per-cpu error in the stats will be 128k, which is an acceptable ratio of cores to memory accounting granularity. [hannes@cmpxchg.org: fix warning in __this_cpu_xchg() calls] Link: http://lkml.kernel.org/r/20171201135750.GB8097@cmpxchg.org Link: http://lkml.kernel.org/r/20171103153336.24044-3-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Vladimir Davydov Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 96 +++++++++++++++++++++++++++--------------- mm/memcontrol.c | 101 +++++++++++++++++++++++---------------------- 2 files changed, 113 insertions(+), 84 deletions(-) (limited to 'mm') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 1ffc54ac4cc9..882046863581 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -108,7 +108,10 @@ struct lruvec_stat { */ struct mem_cgroup_per_node { struct lruvec lruvec; - struct lruvec_stat __percpu *lruvec_stat; + + struct lruvec_stat __percpu *lruvec_stat_cpu; + atomic_long_t lruvec_stat[NR_VM_NODE_STAT_ITEMS]; + unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS]; struct mem_cgroup_reclaim_iter iter[DEF_PRIORITY + 1]; @@ -227,10 +230,10 @@ struct mem_cgroup { spinlock_t move_lock; struct task_struct *move_lock_task; unsigned long move_lock_flags; - /* - * percpu counter. - */ - struct mem_cgroup_stat_cpu __percpu *stat; + + struct mem_cgroup_stat_cpu __percpu *stat_cpu; + atomic_long_t stat[MEMCG_NR_STAT]; + atomic_long_t events[MEMCG_NR_EVENTS]; unsigned long socket_pressure; @@ -265,6 +268,12 @@ struct mem_cgroup { /* WARNING: nodeinfo must be the last member here */ }; +/* + * size of first charge trial. "32" comes from vmscan.c's magic value. + * TODO: maybe necessary to use big numbers in big irons. + */ +#define MEMCG_CHARGE_BATCH 32U + extern struct mem_cgroup *root_mem_cgroup; static inline bool mem_cgroup_disabled(void) @@ -485,32 +494,38 @@ void unlock_page_memcg(struct page *page); static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) { - long val = 0; - int cpu; - - for_each_possible_cpu(cpu) - val += per_cpu(memcg->stat->count[idx], cpu); - - if (val < 0) - val = 0; - - return val; + long x = atomic_long_read(&memcg->stat[idx]); +#ifdef CONFIG_SMP + if (x < 0) + x = 0; +#endif + return x; } /* idx can be of type enum memcg_stat_item or node_stat_item */ static inline void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val) { - if (!mem_cgroup_disabled()) - __this_cpu_add(memcg->stat->count[idx], val); + long x; + + if (mem_cgroup_disabled()) + return; + + x = val + __this_cpu_read(memcg->stat_cpu->count[idx]); + if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) { + atomic_long_add(x, &memcg->stat[idx]); + x = 0; + } + __this_cpu_write(memcg->stat_cpu->count[idx], x); } /* idx can be of type enum memcg_stat_item or node_stat_item */ static inline void mod_memcg_state(struct mem_cgroup *memcg, int idx, int val) { - if (!mem_cgroup_disabled()) - this_cpu_add(memcg->stat->count[idx], val); + preempt_disable(); + __mod_memcg_state(memcg, idx, val); + preempt_enable(); } /** @@ -548,26 +563,25 @@ static inline unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx) { struct mem_cgroup_per_node *pn; - long val = 0; - int cpu; + long x; if (mem_cgroup_disabled()) return node_page_state(lruvec_pgdat(lruvec), idx); pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); - for_each_possible_cpu(cpu) - val += per_cpu(pn->lruvec_stat->count[idx], cpu); - - if (val < 0) - val = 0; - - return val; + x = atomic_long_read(&pn->lruvec_stat[idx]); +#ifdef CONFIG_SMP + if (x < 0) + x = 0; +#endif + return x; } static inline void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val) { struct mem_cgroup_per_node *pn; + long x; /* Update node */ __mod_node_page_state(lruvec_pgdat(lruvec), idx, val); @@ -581,7 +595,12 @@ static inline void __mod_lruvec_state(struct lruvec *lruvec, __mod_memcg_state(pn->memcg, idx, val); /* Update lruvec */ - __this_cpu_add(pn->lruvec_stat->count[idx], val); + x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]); + if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) { + atomic_long_add(x, &pn->lruvec_stat[idx]); + x = 0; + } + __this_cpu_write(pn->lruvec_stat_cpu->count[idx], x); } static inline void mod_lruvec_state(struct lruvec *lruvec, @@ -624,16 +643,25 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, static inline void __count_memcg_events(struct mem_cgroup *memcg, int idx, unsigned long count) { - if (!mem_cgroup_disabled()) - __this_cpu_add(memcg->stat->events[idx], count); + unsigned long x; + + if (mem_cgroup_disabled()) + return; + + x = count + __this_cpu_read(memcg->stat_cpu->events[idx]); + if (unlikely(x > MEMCG_CHARGE_BATCH)) { + atomic_long_add(x, &memcg->events[idx]); + x = 0; + } + __this_cpu_write(memcg->stat_cpu->events[idx], x); } -/* idx can be of type enum memcg_event_item or vm_event_item */ static inline void count_memcg_events(struct mem_cgroup *memcg, int idx, unsigned long count) { - if (!mem_cgroup_disabled()) - this_cpu_add(memcg->stat->events[idx], count); + preempt_disable(); + __count_memcg_events(memcg, idx, count); + preempt_enable(); } /* idx can be of type enum memcg_event_item or vm_event_item */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 23841af1d756..51d398f1363c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -542,39 +542,10 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz) return mz; } -/* - * Return page count for single (non recursive) @memcg. - * - * Implementation Note: reading percpu statistics for memcg. - * - * Both of vmstat[] and percpu_counter has threshold and do periodic - * synchronization to implement "quick" read. There are trade-off between - * reading cost and precision of value. Then, we may have a chance to implement - * a periodic synchronization of counter in memcg's counter. - * - * But this _read() function is used for user interface now. The user accounts - * memory usage by memory cgroup and he _always_ requires exact value because - * he accounts memory. Even if we provide quick-and-fuzzy read, we always - * have to visit all online cpus and make sum. So, for now, unnecessary - * synchronization is not implemented. (just implemented for cpu hotplug) - * - * If there are kernel internal actions which can make use of some not-exact - * value, and reading all cpu value can be performance bottleneck in some - * common workload, threshold and synchronization as vmstat[] should be - * implemented. - * - * The parameter idx can be of type enum memcg_event_item or vm_event_item. - */ - static unsigned long memcg_sum_events(struct mem_cgroup *memcg, int event) { - unsigned long val = 0; - int cpu; - - for_each_possible_cpu(cpu) - val += per_cpu(memcg->stat->events[event], cpu); - return val; + return atomic_long_read(&memcg->events[event]); } static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, @@ -606,7 +577,7 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, nr_pages = -nr_pages; /* for event */ } - __this_cpu_add(memcg->stat->nr_page_events, nr_pages); + __this_cpu_add(memcg->stat_cpu->nr_page_events, nr_pages); } unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, @@ -642,8 +613,8 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, { unsigned long val, next; - val = __this_cpu_read(memcg->stat->nr_page_events); - next = __this_cpu_read(memcg->stat->targets[target]); + val = __this_cpu_read(memcg->stat_cpu->nr_page_events); + next = __this_cpu_read(memcg->stat_cpu->targets[target]); /* from time_after() in jiffies.h */ if ((long)(next - val) < 0) { switch (target) { @@ -659,7 +630,7 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, default: break; } - __this_cpu_write(memcg->stat->targets[target], next); + __this_cpu_write(memcg->stat_cpu->targets[target], next); return true; } return false; @@ -1707,11 +1678,6 @@ void unlock_page_memcg(struct page *page) } EXPORT_SYMBOL(unlock_page_memcg); -/* - * size of first charge trial. "32" comes from vmscan.c's magic value. - * TODO: maybe necessary to use big numbers in big irons. - */ -#define CHARGE_BATCH 32U struct memcg_stock_pcp { struct mem_cgroup *cached; /* this never be root cgroup */ unsigned int nr_pages; @@ -1739,7 +1705,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) unsigned long flags; bool ret = false; - if (nr_pages > CHARGE_BATCH) + if (nr_pages > MEMCG_CHARGE_BATCH) return ret; local_irq_save(flags); @@ -1808,7 +1774,7 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) } stock->nr_pages += nr_pages; - if (stock->nr_pages > CHARGE_BATCH) + if (stock->nr_pages > MEMCG_CHARGE_BATCH) drain_stock(stock); local_irq_restore(flags); @@ -1858,9 +1824,44 @@ static void drain_all_stock(struct mem_cgroup *root_memcg) static int memcg_hotplug_cpu_dead(unsigned int cpu) { struct memcg_stock_pcp *stock; + struct mem_cgroup *memcg; stock = &per_cpu(memcg_stock, cpu); drain_stock(stock); + + for_each_mem_cgroup(memcg) { + int i; + + for (i = 0; i < MEMCG_NR_STAT; i++) { + int nid; + long x; + + x = this_cpu_xchg(memcg->stat_cpu->count[i], 0); + if (x) + atomic_long_add(x, &memcg->stat[i]); + + if (i >= NR_VM_NODE_STAT_ITEMS) + continue; + + for_each_node(nid) { + struct mem_cgroup_per_node *pn; + + pn = mem_cgroup_nodeinfo(memcg, nid); + x = this_cpu_xchg(pn->lruvec_stat_cpu->count[i], 0); + if (x) + atomic_long_add(x, &pn->lruvec_stat[i]); + } + } + + for (i = 0; i < MEMCG_NR_EVENTS; i++) { + long x; + + x = this_cpu_xchg(memcg->stat_cpu->events[i], 0); + if (x) + atomic_long_add(x, &memcg->events[i]); + } + } + return 0; } @@ -1881,7 +1882,7 @@ static void high_work_func(struct work_struct *work) struct mem_cgroup *memcg; memcg = container_of(work, struct mem_cgroup, high_work); - reclaim_high(memcg, CHARGE_BATCH, GFP_KERNEL); + reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL); } /* @@ -1905,7 +1906,7 @@ void mem_cgroup_handle_over_high(void) static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, unsigned int nr_pages) { - unsigned int batch = max(CHARGE_BATCH, nr_pages); + unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages); int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; struct mem_cgroup *mem_over_limit; struct page_counter *counter; @@ -4161,8 +4162,8 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) if (!pn) return 1; - pn->lruvec_stat = alloc_percpu(struct lruvec_stat); - if (!pn->lruvec_stat) { + pn->lruvec_stat_cpu = alloc_percpu(struct lruvec_stat); + if (!pn->lruvec_stat_cpu) { kfree(pn); return 1; } @@ -4180,7 +4181,7 @@ static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) { struct mem_cgroup_per_node *pn = memcg->nodeinfo[node]; - free_percpu(pn->lruvec_stat); + free_percpu(pn->lruvec_stat_cpu); kfree(pn); } @@ -4190,7 +4191,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) for_each_node(node) free_mem_cgroup_per_node_info(memcg, node); - free_percpu(memcg->stat); + free_percpu(memcg->stat_cpu); kfree(memcg); } @@ -4219,8 +4220,8 @@ static struct mem_cgroup *mem_cgroup_alloc(void) if (memcg->id.id < 0) goto fail; - memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu); - if (!memcg->stat) + memcg->stat_cpu = alloc_percpu(struct mem_cgroup_stat_cpu); + if (!memcg->stat_cpu) goto fail; for_each_node(node) @@ -5638,7 +5639,7 @@ static void uncharge_batch(const struct uncharge_gather *ug) __mod_memcg_state(ug->memcg, MEMCG_RSS_HUGE, -ug->nr_huge); __mod_memcg_state(ug->memcg, NR_SHMEM, -ug->nr_shmem); __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout); - __this_cpu_add(ug->memcg->stat->nr_page_events, nr_pages); + __this_cpu_add(ug->memcg->stat_cpu->nr_page_events, nr_pages); memcg_check_events(ug->memcg, ug->dummy_page); local_irq_restore(flags); -- cgit v1.2.3 From 8e33771ca41245a7c7f7a3c84f5cbd6625620a89 Mon Sep 17 00:00:00 2001 From: Vasyl Gomonovych Date: Wed, 31 Jan 2018 16:16:48 -0800 Subject: mm/page_owner.c: use PTR_ERR_OR_ZERO() Fix ptr_ret.cocci warnings: mm/page_owner.c:639:1-3: WARNING: PTR_ERR_OR_ZERO can be used Use PTR_ERR_OR_ZERO rather than if(IS_ERR(...)) + PTR_ERR Generated by: scripts/coccinelle/api/ptr_ret.cocci Link: http://lkml.kernel.org/r/1511824101-9597-1-git-send-email-gomonovych@gmail.com Signed-off-by: Vasyl Gomonovych Acked-by: Vlastimil Babka Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_owner.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page_owner.c b/mm/page_owner.c index 270a8219ccd0..06a0055f45a6 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -635,9 +635,7 @@ static int __init pageowner_init(void) dentry = debugfs_create_file("page_owner", S_IRUSR, NULL, NULL, &proc_page_owner_operations); - if (IS_ERR(dentry)) - return PTR_ERR(dentry); - return 0; + return PTR_ERR_OR_ZERO(dentry); } late_initcall(pageowner_init) -- cgit v1.2.3 From 48128397b04679717cfd419d55ec86456b84eb61 Mon Sep 17 00:00:00 2001 From: Jiankang Chen Date: Wed, 31 Jan 2018 16:16:52 -0800 Subject: mm/page_alloc.c: fix comment in __get_free_pages() __get_free_pages() will return a virtual address, but it is not just a 32-bit address, for example on a 64-bit system. And this comment really confuses new readers of mm. Link: http://lkml.kernel.org/r/1511780964-64864-1-git-send-email-chenjiankang1@huawei.com Signed-off-by: Jiankang Chen Reported-by: Hanjun Guo Cc: Mel Gorman Cc: Johannes Weiner Cc: Yisheng Xie Cc: Kefeng Wang Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a73cffe287a5..b411f97dfb25 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4278,7 +4278,7 @@ unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) struct page *page; /* - * __get_free_pages() returns a 32-bit address, which cannot represent + * __get_free_pages() returns a virtual address, which cannot represent * a highmem page */ VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); -- cgit v1.2.3 From e496612c5130567fc9d5f1969ca4b86665aa3cbb Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 31 Jan 2018 16:16:55 -0800 Subject: mm: do not stall register_shrinker() Shakeel Butt reported he has observed in production systems that the job loader gets stuck for 10s of seconds while doing a mount operation. It turns out that it was stuck in register_shrinker() because some unrelated job was under memory pressure and was spending time in shrink_slab(). Machines have a lot of shrinkers registered and jobs under memory pressure have to traverse all of those memcg-aware shrinkers and affect unrelated jobs which want to register their own shrinkers. To solve the issue, this patch simply bails out slab shrinking if it is found that someone wants to register a shrinker in parallel. A downside is it could cause unfair shrinking between shrinkers. However, it should be rare and we can add compilcated logic if we find it's not enough. [akpm@linux-foundation.org: tweak code comment] Link: http://lkml.kernel.org/r/20171115005602.GB23810@bbox Link: http://lkml.kernel.org/r/1511481899-20335-1-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Signed-off-by: Shakeel Butt Reported-by: Shakeel Butt Tested-by: Shakeel Butt Acked-by: Johannes Weiner Acked-by: Michal Hocko Cc: Tetsuo Handa Cc: Anshuman Khandual Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index e73274a60b22..153e0795f4f0 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -489,6 +489,15 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, sc.nid = 0; freed += do_shrink_slab(&sc, shrinker, priority); + /* + * Bail out if someone want to register a new shrinker to + * prevent the regsitration from being stalled for long periods + * by parallel ongoing shrinking. + */ + if (rwsem_is_contended(&shrinker_rwsem)) { + freed = freed ? : 1; + break; + } } up_read(&shrinker_rwsem); -- cgit v1.2.3 From e025f059a32085d76768e46eac344cba203a6a71 Mon Sep 17 00:00:00 2001 From: Vasyl Gomonovych Date: Wed, 31 Jan 2018 16:17:03 -0800 Subject: mm/interval_tree.c: use vma_pages() helper Use vma_pages function on vma object instead of explicit computation. mm/interval_tree.c:21:27-33: WARNING: Consider using vma_pages helper Generated by: scripts/coccinelle/api/vma_pages.cocci Link: http://lkml.kernel.org/r/1511364410-13499-1-git-send-email-gomonovych@gmail.com Signed-off-by: Vasyl Gomonovych Acked-by: Michael S. Tsirkin Acked-by: Davidlohr Bueso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/interval_tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/interval_tree.c b/mm/interval_tree.c index b47664358796..27ddfd29112a 100644 --- a/mm/interval_tree.c +++ b/mm/interval_tree.c @@ -18,7 +18,7 @@ static inline unsigned long vma_start_pgoff(struct vm_area_struct *v) static inline unsigned long vma_last_pgoff(struct vm_area_struct *v) { - return v->vm_pgoff + ((v->vm_end - v->vm_start) >> PAGE_SHIFT) - 1; + return v->vm_pgoff + vma_pages(v) - 1; } INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.rb, -- cgit v1.2.3 From a4ef87684108e5fef38cf289ee360f9b87a53cfd Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 31 Jan 2018 16:17:06 -0800 Subject: mm: remove unused pgdat_reclaimable_pages() Remove unused function pgdat_reclaimable_pages() and node_page_state_snapshot() which becomes unused as well. Link: http://lkml.kernel.org/r/20171122094416.26019-1-jack@suse.cz Signed-off-by: Jan Kara Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 1 - include/linux/vmstat.h | 17 ----------------- mm/vmscan.c | 16 ---------------- 3 files changed, 34 deletions(-) (limited to 'mm') diff --git a/include/linux/swap.h b/include/linux/swap.h index 0bd4c25016f9..7b6a59f722a3 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -344,7 +344,6 @@ extern void lru_cache_add_active_or_unevictable(struct page *page, /* linux/mm/vmscan.c */ extern unsigned long zone_reclaimable_pages(struct zone *zone); -extern unsigned long pgdat_reclaimable_pages(struct pglist_data *pgdat); extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *mask); extern int __isolate_lru_page(struct page *page, isolate_mode_t mode); diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 1779c9817b39..a4c2317d8b9f 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -216,23 +216,6 @@ static inline unsigned long zone_page_state_snapshot(struct zone *zone, return x; } -static inline unsigned long node_page_state_snapshot(pg_data_t *pgdat, - enum node_stat_item item) -{ - long x = atomic_long_read(&pgdat->vm_stat[item]); - -#ifdef CONFIG_SMP - int cpu; - for_each_online_cpu(cpu) - x += per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->vm_node_stat_diff[item]; - - if (x < 0) - x = 0; -#endif - return x; -} - - #ifdef CONFIG_NUMA extern void __inc_numa_state(struct zone *zone, enum numa_stat_item item); extern unsigned long sum_zone_node_page_state(int node, diff --git a/mm/vmscan.c b/mm/vmscan.c index 153e0795f4f0..1a33c8e1e758 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -220,22 +220,6 @@ unsigned long zone_reclaimable_pages(struct zone *zone) return nr; } -unsigned long pgdat_reclaimable_pages(struct pglist_data *pgdat) -{ - unsigned long nr; - - nr = node_page_state_snapshot(pgdat, NR_ACTIVE_FILE) + - node_page_state_snapshot(pgdat, NR_INACTIVE_FILE) + - node_page_state_snapshot(pgdat, NR_ISOLATED_FILE); - - if (get_nr_swap_pages() > 0) - nr += node_page_state_snapshot(pgdat, NR_ACTIVE_ANON) + - node_page_state_snapshot(pgdat, NR_INACTIVE_ANON) + - node_page_state_snapshot(pgdat, NR_ISOLATED_ANON); - - return nr; -} - /** * lruvec_lru_size - Returns the number of pages on the given LRU list. * @lruvec: lru vector -- cgit v1.2.3 From d6cb41cc44c63492702281b1d329955ca767d399 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 31 Jan 2018 16:17:10 -0800 Subject: mm, hugetlb: remove hugepages_treat_as_movable sysctl hugepages_treat_as_movable has been introduced by 396faf0303d2 ("Allow huge page allocations to use GFP_HIGH_MOVABLE") to allow hugetlb allocations from ZONE_MOVABLE even when hugetlb pages were not migrateable. The purpose of the movable zone was different at the time. It aimed at reducing memory fragmentation and hugetlb pages being long lived and large werre not contributing to the fragmentation so it was acceptable to use the zone back then. Things have changed though and the primary purpose of the zone became migratability guarantee. If we allow non migrateable hugetlb pages to be in ZONE_MOVABLE memory hotplug might fail to offline the memory. Remove the knob and only rely on hugepage_migration_supported to allow movable zones. Mel said: : Primarily it was aimed at allowing the hugetlb pool to safely shrink with : the ability to grow it again. The use case was for batched jobs, some of : which needed huge pages and others that did not but didn't want the memory : useless pinned in the huge pages pool. : : I suspect that more users rely on THP than hugetlbfs for flexible use of : huge pages with fallback options so I think that removing the option : should be ok. Link: http://lkml.kernel.org/r/20171003072619.8654-1-mhocko@kernel.org Signed-off-by: Michal Hocko Reported-by: Alexandru Moise <00moses.alexander00@gmail.com> Acked-by: Mel Gorman Cc: Alexandru Moise <00moses.alexander00@gmail.com> Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/vm.txt | 25 ------------------------- include/linux/hugetlb.h | 1 - kernel/sysctl.c | 7 ------- mm/hugetlb.c | 4 +--- 4 files changed, 1 insertion(+), 36 deletions(-) (limited to 'mm') diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 5025ff9307e6..ff234d229cbb 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -30,7 +30,6 @@ Currently, these files are in /proc/sys/vm: - dirty_writeback_centisecs - drop_caches - extfrag_threshold -- hugepages_treat_as_movable - hugetlb_shm_group - laptop_mode - legacy_va_layout @@ -261,30 +260,6 @@ any throttling. ============================================================== -hugepages_treat_as_movable - -This parameter controls whether we can allocate hugepages from ZONE_MOVABLE -or not. If set to non-zero, hugepages can be allocated from ZONE_MOVABLE. -ZONE_MOVABLE is created when kernel boot parameter kernelcore= is specified, -so this parameter has no effect if used without kernelcore=. - -Hugepage migration is now available in some situations which depend on the -architecture and/or the hugepage size. If a hugepage supports migration, -allocation from ZONE_MOVABLE is always enabled for the hugepage regardless -of the value of this parameter. -IOW, this parameter affects only non-migratable hugepages. - -Assuming that hugepages are not migratable in your system, one usecase of -this parameter is that users can make hugepage pool more extensible by -enabling the allocation from ZONE_MOVABLE. This is because on ZONE_MOVABLE -page reclaim/migration/compaction work more and you can get contiguous -memory more likely. Note that using ZONE_MOVABLE for non-migratable -hugepages can do harm to other features like memory hotremove (because -memory hotremove expects that memory blocks on ZONE_MOVABLE are always -removable,) so it's a trade-off responsible for the users. - -============================================================== - hugetlb_shm_group hugetlb_shm_group contains group id that is allowed to create SysV diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 82a25880714a..6fcf140188d0 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -129,7 +129,6 @@ u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm, pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud); -extern int hugepages_treat_as_movable; extern int sysctl_hugetlb_shm_group; extern struct list_head huge_boot_pages; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 557d46728577..2fb4e27c636a 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1374,13 +1374,6 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { - .procname = "hugepages_treat_as_movable", - .data = &hugepages_treat_as_movable, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, { .procname = "nr_overcommit_hugepages", .data = NULL, diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 1e6a5ad0d420..4137fb67cd79 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -36,8 +36,6 @@ #include #include "internal.h" -int hugepages_treat_as_movable; - int hugetlb_max_hstate __read_mostly; unsigned int default_hstate_idx; struct hstate hstates[HUGE_MAX_HSTATE]; @@ -926,7 +924,7 @@ retry_cpuset: /* Movability of hugepages depends on migration support. */ static inline gfp_t htlb_alloc_mask(struct hstate *h) { - if (hugepages_treat_as_movable || hugepage_migration_supported(h)) + if (hugepage_migration_supported(h)) return GFP_HIGHUSER_MOVABLE; else return GFP_HIGHUSER; -- cgit v1.2.3 From dc88c88904b8c5eb749874aecc278146b6ae02f3 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Wed, 31 Jan 2018 16:17:14 -0800 Subject: mm/memory_hotplug.c: remove unnecesary check from register_page_bootmem_info_section() When we call register_page_bootmem_info_section() having CONFIG_SPARSEMEM_VMEMMAP enabled, we check if the pfn is valid. This check is redundant as we already checked this in register_page_bootmem_info_node() before calling register_page_bootmem_info_section(), so let's get rid of it. Link: http://lkml.kernel.org/r/20171205143422.GA31458@techadventures.net Signed-off-by: Oscar Salvador Acked-by: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 999ce3af809d..9646e5d63648 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -200,9 +200,6 @@ static void register_page_bootmem_info_section(unsigned long start_pfn) struct mem_section *ms; struct page *page, *memmap; - if (!pfn_valid(start_pfn)) - return; - section_nr = pfn_to_section_nr(start_pfn); ms = __nr_to_section(section_nr); -- cgit v1.2.3 From ef549e13cf62733097eb1f7a9f44b2cea1611007 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 31 Jan 2018 16:17:17 -0800 Subject: mm: update comment describing tlb_gather_mmu The comment describes @fullmm argument, but the function has no such parameter. Update the comment to match the code and convert it to kernel-doc markup. Link: http://lkml.kernel.org/r/1512394531-2264-1-git-send-email-rppt@linux.vnet.ibm.com Signed-off-by: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 793004608332..82a0577933aa 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -400,10 +400,17 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table) #endif /* CONFIG_HAVE_RCU_TABLE_FREE */ -/* tlb_gather_mmu - * Called to initialize an (on-stack) mmu_gather structure for page-table - * tear-down from @mm. The @fullmm argument is used when @mm is without - * users and we're going to destroy the full address space (exit/execve). +/** + * tlb_gather_mmu - initialize an mmu_gather structure for page-table tear-down + * @tlb: the mmu_gather structure to initialize + * @mm: the mm_struct of the target address space + * @start: start of the region that will be removed from the page-table + * @end: end of the region that will be removed from the page-table + * + * Called to initialize an (on-stack) mmu_gather structure for page-table + * tear-down from @mm. The @start and @end are set to 0 and -1 + * respectively when @mm is without users and we're going to destroy + * the full address space (exit/execve). */ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end) -- cgit v1.2.3 From 9ac9322d7cfa35b5381a08c7eaed56eb2297377e Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Wed, 31 Jan 2018 16:17:25 -0800 Subject: mm: memory_hotplug: remove second __nr_to_section in register_page_bootmem_info_section() In register_page_bootmem_info_section() we call __nr_to_section() in order to get the mem_section struct at the beginning of the function. Since we already got it, there is no need for a second call to __nr_to_section(). Link: http://lkml.kernel.org/r/20171207102914.GA12396@techadventures.net Signed-off-by: Oscar Salvador Acked-by: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 9646e5d63648..9bbd6982d4e4 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -184,7 +184,7 @@ static void register_page_bootmem_info_section(unsigned long start_pfn) for (i = 0; i < mapsize; i++, page++) get_page_bootmem(section_nr, page, SECTION_INFO); - usemap = __nr_to_section(section_nr)->pageblock_flags; + usemap = ms->pageblock_flags; page = virt_to_page(usemap); mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT; @@ -207,7 +207,7 @@ static void register_page_bootmem_info_section(unsigned long start_pfn) register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION); - usemap = __nr_to_section(section_nr)->pageblock_flags; + usemap = ms->pageblock_flags; page = virt_to_page(usemap); mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT; -- cgit v1.2.3 From 9bebc09fcf4fb25e36cf86af764c038b92f64057 Mon Sep 17 00:00:00 2001 From: Yisheng Xie Date: Wed, 31 Jan 2018 16:17:29 -0800 Subject: mm/huge_memory.c: fix comment in __split_huge_pmd_locked pmd_trans_splitting() was removed after THP refcounting redesign, therefore related comment should be updated. Link: http://lkml.kernel.org/r/1512625745-59451-1-git-send-email-xieyisheng1@huawei.com Signed-off-by: Yisheng Xie Acked-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 0e7ded98d114..0d3ae51ce4f7 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2205,10 +2205,9 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, * for the same virtual address to be loaded simultaneously. So instead * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the * current pmd notpresent (atomically because here the pmd_trans_huge - * and pmd_trans_splitting must remain set at all times on the pmd - * until the split is complete for this pmd), then we flush the SMP TLB - * and finally we write the non-huge version of the pmd entry with - * pmd_populate. + * must remain set at all times on the pmd until the split is complete + * for this pmd), then we flush the SMP TLB and finally we write the + * non-huge version of the pmd entry with pmd_populate. */ pmdp_invalidate(vma, haddr, pmd); pmd_populate(mm, pmd, pgtable); -- cgit v1.2.3 From 977fbdcd5986c9ff700bf276644d2b1973a53348 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 31 Jan 2018 16:17:36 -0800 Subject: mm: add unmap_mapping_pages() Several users of unmap_mapping_range() would prefer to express their range in pages rather than bytes. Unfortuately, on a 32-bit kernel, you have to remember to cast your page number to a 64-bit type before shifting it, and four places in the current tree didn't remember to do that. That's a sign of a bad interface. Conveniently, unmap_mapping_range() actually converts from bytes into pages, so hoist the guts of unmap_mapping_range() into a new function unmap_mapping_pages() and convert the callers which want to use pages. Link: http://lkml.kernel.org/r/20171206142627.GD32044@bombadil.infradead.org Signed-off-by: Matthew Wilcox Reported-by: "zhangyi (F)" Reviewed-by: Ross Zwisler Acked-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 19 ++++++------------- include/linux/mm.h | 26 ++++++++++++++++---------- mm/khugepaged.c | 3 +-- mm/memory.c | 43 +++++++++++++++++++++++++++++++------------ mm/nommu.c | 7 ------- mm/truncate.c | 23 +++++++---------------- 6 files changed, 61 insertions(+), 60 deletions(-) (limited to 'mm') diff --git a/fs/dax.c b/fs/dax.c index c2ebf10b70da..6ee6f7e24f5a 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -44,6 +44,7 @@ /* The 'colour' (ie low bits) within a PMD of a page offset. */ #define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) +#define PG_PMD_NR (PMD_SIZE >> PAGE_SHIFT) static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES]; @@ -375,8 +376,8 @@ restart: * unmapped. */ if (pmd_downgrade && dax_is_zero_entry(entry)) - unmap_mapping_range(mapping, - (index << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0); + unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR, + PG_PMD_NR, false); err = radix_tree_preload( mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM); @@ -538,12 +539,10 @@ static void *dax_insert_mapping_entry(struct address_space *mapping, if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_ZERO_PAGE)) { /* we are replacing a zero page with block mapping */ if (dax_is_pmd_entry(entry)) - unmap_mapping_range(mapping, - (vmf->pgoff << PAGE_SHIFT) & PMD_MASK, - PMD_SIZE, 0); + unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR, + PG_PMD_NR, false); else /* pte entry */ - unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT, - PAGE_SIZE, 0); + unmap_mapping_pages(mapping, vmf->pgoff, 1, false); } spin_lock_irq(&mapping->tree_lock); @@ -1269,12 +1268,6 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, } #ifdef CONFIG_FS_DAX_PMD -/* - * The 'colour' (ie low bits) within a PMD of a page offset. This comes up - * more often than one might expect in the below functions. - */ -#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) - static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, void *entry) { diff --git a/include/linux/mm.h b/include/linux/mm.h index 7fc92384977e..173d2484f6e3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1312,8 +1312,6 @@ void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling); int copy_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma); -void unmap_mapping_range(struct address_space *mapping, - loff_t const holebegin, loff_t const holelen, int even_cows); int follow_pte_pmd(struct mm_struct *mm, unsigned long address, unsigned long *start, unsigned long *end, pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp); @@ -1324,12 +1322,6 @@ int follow_phys(struct vm_area_struct *vma, unsigned long address, int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, void *buf, int len, int write); -static inline void unmap_shared_mapping_range(struct address_space *mapping, - loff_t const holebegin, loff_t const holelen) -{ - unmap_mapping_range(mapping, holebegin, holelen, 0); -} - extern void truncate_pagecache(struct inode *inode, loff_t new); extern void truncate_setsize(struct inode *inode, loff_t newsize); void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to); @@ -1344,6 +1336,10 @@ extern int handle_mm_fault(struct vm_area_struct *vma, unsigned long address, extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, unsigned long address, unsigned int fault_flags, bool *unlocked); +void unmap_mapping_pages(struct address_space *mapping, + pgoff_t start, pgoff_t nr, bool even_cows); +void unmap_mapping_range(struct address_space *mapping, + loff_t const holebegin, loff_t const holelen, int even_cows); #else static inline int handle_mm_fault(struct vm_area_struct *vma, unsigned long address, unsigned int flags) @@ -1360,10 +1356,20 @@ static inline int fixup_user_fault(struct task_struct *tsk, BUG(); return -EFAULT; } +static inline void unmap_mapping_pages(struct address_space *mapping, + pgoff_t start, pgoff_t nr, bool even_cows) { } +static inline void unmap_mapping_range(struct address_space *mapping, + loff_t const holebegin, loff_t const holelen, int even_cows) { } #endif -extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, - unsigned int gup_flags); +static inline void unmap_shared_mapping_range(struct address_space *mapping, + loff_t const holebegin, loff_t const holelen) +{ + unmap_mapping_range(mapping, holebegin, holelen, 0); +} + +extern int access_process_vm(struct task_struct *tsk, unsigned long addr, + void *buf, int len, unsigned int gup_flags); extern int access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags); extern int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, diff --git a/mm/khugepaged.c b/mm/khugepaged.c index ea4ff259b671..1cd18e4347fe 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1399,8 +1399,7 @@ static void collapse_shmem(struct mm_struct *mm, } if (page_mapped(page)) - unmap_mapping_range(mapping, index << PAGE_SHIFT, - PAGE_SIZE, 0); + unmap_mapping_pages(mapping, index, 1, false); spin_lock_irq(&mapping->tree_lock); diff --git a/mm/memory.c b/mm/memory.c index 82a0577933aa..a6e5d6ac5d24 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2798,9 +2798,38 @@ static inline void unmap_mapping_range_tree(struct rb_root_cached *root, } } +/** + * unmap_mapping_pages() - Unmap pages from processes. + * @mapping: The address space containing pages to be unmapped. + * @start: Index of first page to be unmapped. + * @nr: Number of pages to be unmapped. 0 to unmap to end of file. + * @even_cows: Whether to unmap even private COWed pages. + * + * Unmap the pages in this address space from any userspace process which + * has them mmaped. Generally, you want to remove COWed pages as well when + * a file is being truncated, but not when invalidating pages from the page + * cache. + */ +void unmap_mapping_pages(struct address_space *mapping, pgoff_t start, + pgoff_t nr, bool even_cows) +{ + struct zap_details details = { }; + + details.check_mapping = even_cows ? NULL : mapping; + details.first_index = start; + details.last_index = start + nr - 1; + if (details.last_index < details.first_index) + details.last_index = ULONG_MAX; + + i_mmap_lock_write(mapping); + if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))) + unmap_mapping_range_tree(&mapping->i_mmap, &details); + i_mmap_unlock_write(mapping); +} + /** * unmap_mapping_range - unmap the portion of all mmaps in the specified - * address_space corresponding to the specified page range in the underlying + * address_space corresponding to the specified byte range in the underlying * file. * * @mapping: the address space containing mmaps to be unmapped. @@ -2818,7 +2847,6 @@ static inline void unmap_mapping_range_tree(struct rb_root_cached *root, void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows) { - struct zap_details details = { }; pgoff_t hba = holebegin >> PAGE_SHIFT; pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; @@ -2830,16 +2858,7 @@ void unmap_mapping_range(struct address_space *mapping, hlen = ULONG_MAX - hba + 1; } - details.check_mapping = even_cows ? NULL : mapping; - details.first_index = hba; - details.last_index = hba + hlen - 1; - if (details.last_index < details.first_index) - details.last_index = ULONG_MAX; - - i_mmap_lock_write(mapping); - if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))) - unmap_mapping_range_tree(&mapping->i_mmap, &details); - i_mmap_unlock_write(mapping); + unmap_mapping_pages(mapping, hba, hlen, even_cows); } EXPORT_SYMBOL(unmap_mapping_range); diff --git a/mm/nommu.c b/mm/nommu.c index 17c00d93de2e..4b9864b17cb0 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1788,13 +1788,6 @@ unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr, return -ENOMEM; } -void unmap_mapping_range(struct address_space *mapping, - loff_t const holebegin, loff_t const holelen, - int even_cows) -{ -} -EXPORT_SYMBOL(unmap_mapping_range); - int filemap_fault(struct vm_fault *vmf) { BUG(); diff --git a/mm/truncate.c b/mm/truncate.c index e4b4cf0f4070..c34e2fd4f583 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -179,12 +179,8 @@ static void truncate_cleanup_page(struct address_space *mapping, struct page *page) { if (page_mapped(page)) { - loff_t holelen; - - holelen = PageTransHuge(page) ? HPAGE_PMD_SIZE : PAGE_SIZE; - unmap_mapping_range(mapping, - (loff_t)page->index << PAGE_SHIFT, - holelen, 0); + pgoff_t nr = PageTransHuge(page) ? HPAGE_PMD_NR : 1; + unmap_mapping_pages(mapping, page->index, nr, false); } if (page_has_private(page)) @@ -715,19 +711,15 @@ int invalidate_inode_pages2_range(struct address_space *mapping, /* * Zap the rest of the file in one hit. */ - unmap_mapping_range(mapping, - (loff_t)index << PAGE_SHIFT, - (loff_t)(1 + end - index) - << PAGE_SHIFT, - 0); + unmap_mapping_pages(mapping, index, + (1 + end - index), false); did_range_unmap = 1; } else { /* * Just zap this page */ - unmap_mapping_range(mapping, - (loff_t)index << PAGE_SHIFT, - PAGE_SIZE, 0); + unmap_mapping_pages(mapping, index, + 1, false); } } BUG_ON(page_mapped(page)); @@ -753,8 +745,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, * get remapped later. */ if (dax_mapping(mapping)) { - unmap_mapping_range(mapping, (loff_t)start << PAGE_SHIFT, - (loff_t)(end - start + 1) << PAGE_SHIFT, 0); + unmap_mapping_pages(mapping, start, end - start + 1, false); } out: cleancache_invalidate_inode(mapping); -- cgit v1.2.3 From d52605d7cb306aaf86d0e6dede275dbf8a020072 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 31 Jan 2018 16:18:16 -0800 Subject: mm: do not lose dirty and accessed bits in pmdp_invalidate() Vlastimil noted that pmdp_invalidate() is not atomic and we can lose dirty and access bits if CPU sets them after pmdp dereference, but before set_pmd_at(). The patch change pmdp_invalidate() to make the entry non-present atomically and return previous value of the entry. This value can be used to check if CPU set dirty/accessed bits under us. The race window is very small and I haven't seen any reports that can be attributed to the bug. For this reason, I don't think backporting to stable trees needed. Link: http://lkml.kernel.org/r/20171213105756.69879-11-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Reported-by: Vlastimil Babka Cc: Hugh Dickins Cc: Andrea Arcangeli Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: David Daney Cc: David Miller Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Martin Schwidefsky Cc: Michal Hocko Cc: Nitin Gupta Cc: Ralf Baechle Cc: Thomas Gleixner Cc: Vineet Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/pgtable.h | 2 +- mm/pgtable-generic.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 118ca2eb7a32..51eebd7546b2 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -325,7 +325,7 @@ static inline pmd_t generic_pmdp_establish(struct vm_area_struct *vma, #endif #ifndef __HAVE_ARCH_PMDP_INVALIDATE -extern void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, +extern pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); #endif diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 1e4ee763c190..cf2af04b34b9 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -181,12 +181,12 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) #endif #ifndef __HAVE_ARCH_PMDP_INVALIDATE -void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, +pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { - pmd_t entry = *pmdp; - set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(entry)); + pmd_t old = pmdp_establish(vma, address, pmdp, pmd_mknotpresent(*pmdp)); flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + return old; } #endif -- cgit v1.2.3 From a3cf988fcb88301912f95ecf66913502bcb90200 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 31 Jan 2018 16:18:20 -0800 Subject: mm: use updated pmdp_invalidate() interface to track dirty/accessed bits Use the modifed pmdp_invalidate() that returns the previous value of pmd to transfer dirty and accessed bits. Link: http://lkml.kernel.org/r/20171213105756.69879-12-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: David Daney Cc: David Miller Cc: H. Peter Anvin Cc: Hugh Dickins Cc: Ingo Molnar Cc: Martin Schwidefsky Cc: Michal Hocko Cc: Nitin Gupta Cc: Ralf Baechle Cc: Thomas Gleixner Cc: Vineet Gupta Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 8 ++++---- mm/huge_memory.c | 29 ++++++++++++----------------- 2 files changed, 16 insertions(+), 21 deletions(-) (limited to 'mm') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 4691f5aca00e..ec6d2983a5cb 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -982,14 +982,14 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { - pmd_t pmd = *pmdp; + pmd_t old, pmd = *pmdp; if (pmd_present(pmd)) { /* See comment in change_huge_pmd() */ - pmdp_invalidate(vma, addr, pmdp); - if (pmd_dirty(*pmdp)) + old = pmdp_invalidate(vma, addr, pmdp); + if (pmd_dirty(old)) pmd = pmd_mkdirty(pmd); - if (pmd_young(*pmdp)) + if (pmd_young(old)) pmd = pmd_mkyoung(pmd); pmd = pmd_wrprotect(pmd); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 0d3ae51ce4f7..2a79a6b7d19b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1910,17 +1910,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, * pmdp_invalidate() is required to make sure we don't miss * dirty/young flags set by hardware. */ - entry = *pmd; - pmdp_invalidate(vma, addr, pmd); - - /* - * Recover dirty/young flags. It relies on pmdp_invalidate to not - * corrupt them. - */ - if (pmd_dirty(*pmd)) - entry = pmd_mkdirty(entry); - if (pmd_young(*pmd)) - entry = pmd_mkyoung(entry); + entry = pmdp_invalidate(vma, addr, pmd); entry = pmd_modify(entry, newprot); if (preserve_write) @@ -2073,8 +2063,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, struct mm_struct *mm = vma->vm_mm; struct page *page; pgtable_t pgtable; - pmd_t _pmd; - bool young, write, dirty, soft_dirty, pmd_migration = false; + pmd_t old, _pmd; + bool young, write, soft_dirty, pmd_migration = false; unsigned long addr; int i; @@ -2130,7 +2120,6 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, page_ref_add(page, HPAGE_PMD_NR - 1); write = pmd_write(*pmd); young = pmd_young(*pmd); - dirty = pmd_dirty(*pmd); soft_dirty = pmd_soft_dirty(*pmd); pmdp_huge_split_prepare(vma, haddr, pmd); @@ -2160,8 +2149,6 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, if (soft_dirty) entry = pte_mksoft_dirty(entry); } - if (dirty) - SetPageDirty(page + i); pte = pte_offset_map(&_pmd, addr); BUG_ON(!pte_none(*pte)); set_pte_at(mm, addr, pte, entry); @@ -2209,7 +2196,15 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, * for this pmd), then we flush the SMP TLB and finally we write the * non-huge version of the pmd entry with pmd_populate. */ - pmdp_invalidate(vma, haddr, pmd); + old = pmdp_invalidate(vma, haddr, pmd); + + /* + * Transfer dirty bit using value returned by pmd_invalidate() to be + * sure we don't race with CPU that can set the bit under us. + */ + if (pmd_dirty(old)) + SetPageDirty(page); + pmd_populate(mm, pmd, pgtable); if (freeze) { -- cgit v1.2.3 From 423ac9af3ceff967a77b0714781033629593b077 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 31 Jan 2018 16:18:24 -0800 Subject: mm/thp: remove pmd_huge_split_prepare() Instead of marking the pmd ready for split, invalidate the pmd. This should take care of powerpc requirement. Only side effect is that we mark the pmd invalid early. This can result in us blocking access to the page a bit longer if we race against a thp split. [kirill.shutemov@linux.intel.com: rebased, dirty THP once] Link: http://lkml.kernel.org/r/20171213105756.69879-13-kirill.shutemov@linux.intel.com Signed-off-by: Aneesh Kumar K.V Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: Catalin Marinas Cc: David Daney Cc: David Miller Cc: H. Peter Anvin Cc: Hugh Dickins Cc: Ingo Molnar Cc: Martin Schwidefsky Cc: Michal Hocko Cc: Nitin Gupta Cc: Ralf Baechle Cc: Thomas Gleixner Cc: Vineet Gupta Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/include/asm/book3s/64/hash-4k.h | 2 - arch/powerpc/include/asm/book3s/64/hash-64k.h | 2 - arch/powerpc/include/asm/book3s/64/pgtable.h | 9 ---- arch/powerpc/include/asm/book3s/64/radix.h | 6 --- arch/powerpc/mm/pgtable-hash64.c | 22 -------- include/asm-generic/pgtable.h | 8 --- mm/huge_memory.c | 72 +++++++++++++-------------- 7 files changed, 35 insertions(+), 86 deletions(-) (limited to 'mm') diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h index 197ced1eaaa0..2d9df40446f6 100644 --- a/arch/powerpc/include/asm/book3s/64/hash-4k.h +++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h @@ -101,8 +101,6 @@ extern pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma, extern void hash__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, pgtable_t pgtable); extern pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); -extern void hash__pmdp_huge_split_prepare(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp); extern pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp); extern int hash__has_transparent_hugepage(void); diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h index 8d40cf03cb67..cb46d1034f33 100644 --- a/arch/powerpc/include/asm/book3s/64/hash-64k.h +++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h @@ -203,8 +203,6 @@ extern pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma, extern void hash__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, pgtable_t pgtable); extern pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); -extern void hash__pmdp_huge_split_prepare(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp); extern pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp); extern int hash__has_transparent_hugepage(void); diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index ee19d5bbee06..6ca1208cedcb 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -1140,15 +1140,6 @@ static inline pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, extern pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); -#define __HAVE_ARCH_PMDP_HUGE_SPLIT_PREPARE -static inline void pmdp_huge_split_prepare(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp) -{ - if (radix_enabled()) - return radix__pmdp_huge_split_prepare(vma, address, pmdp); - return hash__pmdp_huge_split_prepare(vma, address, pmdp); -} - #define pmd_move_must_withdraw pmd_move_must_withdraw struct spinlock; static inline int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl, diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h index 19c44e1495ae..365010f66570 100644 --- a/arch/powerpc/include/asm/book3s/64/radix.h +++ b/arch/powerpc/include/asm/book3s/64/radix.h @@ -269,12 +269,6 @@ static inline pmd_t radix__pmd_mkhuge(pmd_t pmd) return __pmd(pmd_val(pmd) | _PAGE_PTE | R_PAGE_LARGE); return __pmd(pmd_val(pmd) | _PAGE_PTE); } -static inline void radix__pmdp_huge_split_prepare(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp) -{ - /* Nothing to do for radix. */ - return; -} extern unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, unsigned long clr, diff --git a/arch/powerpc/mm/pgtable-hash64.c b/arch/powerpc/mm/pgtable-hash64.c index ec277913e01b..469808e77e58 100644 --- a/arch/powerpc/mm/pgtable-hash64.c +++ b/arch/powerpc/mm/pgtable-hash64.c @@ -296,28 +296,6 @@ pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) return pgtable; } -void hash__pmdp_huge_split_prepare(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp) -{ - VM_BUG_ON(address & ~HPAGE_PMD_MASK); - VM_BUG_ON(REGION_ID(address) != USER_REGION_ID); - VM_BUG_ON(pmd_devmap(*pmdp)); - - /* - * We can't mark the pmd none here, because that will cause a race - * against exit_mmap. We need to continue mark pmd TRANS HUGE, while - * we spilt, but at the same time we wan't rest of the ppc64 code - * not to insert hash pte on this, because we will be modifying - * the deposited pgtable in the caller of this function. Hence - * clear the _PAGE_USER so that we move the fault handling to - * higher level function and that will serialize against ptl. - * We need to flush existing hash pte entries here even though, - * the translation is still valid, because we will withdraw - * pgtable_t after this. - */ - pmd_hugepage_update(vma->vm_mm, address, pmdp, 0, _PAGE_PRIVILEGED); -} - /* * A linux hugepage PMD was changed and the corresponding hash table entries * neesd to be flushed. diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 51eebd7546b2..2cfa3075d148 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -329,14 +329,6 @@ extern pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); #endif -#ifndef __HAVE_ARCH_PMDP_HUGE_SPLIT_PREPARE -static inline void pmdp_huge_split_prepare(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp) -{ - -} -#endif - #ifndef __HAVE_ARCH_PTE_SAME static inline int pte_same(pte_t pte_a, pte_t pte_b) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 2a79a6b7d19b..87ab9b8f56b5 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2063,7 +2063,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, struct mm_struct *mm = vma->vm_mm; struct page *page; pgtable_t pgtable; - pmd_t old, _pmd; + pmd_t old_pmd, _pmd; bool young, write, soft_dirty, pmd_migration = false; unsigned long addr; int i; @@ -2106,23 +2106,50 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, return __split_huge_zero_page_pmd(vma, haddr, pmd); } + /* + * Up to this point the pmd is present and huge and userland has the + * whole access to the hugepage during the split (which happens in + * place). If we overwrite the pmd with the not-huge version pointing + * to the pte here (which of course we could if all CPUs were bug + * free), userland could trigger a small page size TLB miss on the + * small sized TLB while the hugepage TLB entry is still established in + * the huge TLB. Some CPU doesn't like that. + * See http://support.amd.com/us/Processor_TechDocs/41322.pdf, Erratum + * 383 on page 93. Intel should be safe but is also warns that it's + * only safe if the permission and cache attributes of the two entries + * loaded in the two TLB is identical (which should be the case here). + * But it is generally safer to never allow small and huge TLB entries + * for the same virtual address to be loaded simultaneously. So instead + * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the + * current pmd notpresent (atomically because here the pmd_trans_huge + * must remain set at all times on the pmd until the split is complete + * for this pmd), then we flush the SMP TLB and finally we write the + * non-huge version of the pmd entry with pmd_populate. + */ + old_pmd = pmdp_invalidate(vma, haddr, pmd); + #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION - pmd_migration = is_pmd_migration_entry(*pmd); + pmd_migration = is_pmd_migration_entry(old_pmd); if (pmd_migration) { swp_entry_t entry; - entry = pmd_to_swp_entry(*pmd); + entry = pmd_to_swp_entry(old_pmd); page = pfn_to_page(swp_offset(entry)); } else #endif - page = pmd_page(*pmd); + page = pmd_page(old_pmd); VM_BUG_ON_PAGE(!page_count(page), page); page_ref_add(page, HPAGE_PMD_NR - 1); - write = pmd_write(*pmd); - young = pmd_young(*pmd); - soft_dirty = pmd_soft_dirty(*pmd); + if (pmd_dirty(old_pmd)) + SetPageDirty(page); + write = pmd_write(old_pmd); + young = pmd_young(old_pmd); + soft_dirty = pmd_soft_dirty(old_pmd); - pmdp_huge_split_prepare(vma, haddr, pmd); + /* + * Withdraw the table only after we mark the pmd entry invalid. + * This's critical for some architectures (Power). + */ pgtable = pgtable_trans_huge_withdraw(mm, pmd); pmd_populate(mm, &_pmd, pgtable); @@ -2176,35 +2203,6 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, } smp_wmb(); /* make pte visible before pmd */ - /* - * Up to this point the pmd is present and huge and userland has the - * whole access to the hugepage during the split (which happens in - * place). If we overwrite the pmd with the not-huge version pointing - * to the pte here (which of course we could if all CPUs were bug - * free), userland could trigger a small page size TLB miss on the - * small sized TLB while the hugepage TLB entry is still established in - * the huge TLB. Some CPU doesn't like that. - * See http://support.amd.com/us/Processor_TechDocs/41322.pdf, Erratum - * 383 on page 93. Intel should be safe but is also warns that it's - * only safe if the permission and cache attributes of the two entries - * loaded in the two TLB is identical (which should be the case here). - * But it is generally safer to never allow small and huge TLB entries - * for the same virtual address to be loaded simultaneously. So instead - * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the - * current pmd notpresent (atomically because here the pmd_trans_huge - * must remain set at all times on the pmd until the split is complete - * for this pmd), then we flush the SMP TLB and finally we write the - * non-huge version of the pmd entry with pmd_populate. - */ - old = pmdp_invalidate(vma, haddr, pmd); - - /* - * Transfer dirty bit using value returned by pmd_invalidate() to be - * sure we don't race with CPU that can set the bit under us. - */ - if (pmd_dirty(old)) - SetPageDirty(page); - pmd_populate(mm, pmd, pgtable); if (freeze) { -- cgit v1.2.3 From 3b454ad35043dfbd3b5d2bb92b0991d6342afb44 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Wed, 31 Jan 2018 16:18:28 -0800 Subject: mm: thp: use down_read_trylock() in khugepaged to avoid long block MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the current design, khugepaged needs to acquire mmap_sem before scanning an mm. But in some corner cases, khugepaged may scan a process which is modifying its memory mapping, so khugepaged blocks in uninterruptible state. But the process might hold the mmap_sem for a long time when modifying a huge memory space and it may trigger the below khugepaged hung issue: INFO: task khugepaged:270 blocked for more than 120 seconds. Tainted: G E 4.9.65-006.ali3000.alios7.x86_64 #1 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. khugepaged D 0 270 2 0x00000000  ffff883f3deae4c0 0000000000000000 ffff883f610596c0 ffff883f7d359440 ffff883f63818000 ffffc90019adfc78 ffffffff817079a5 d67e5aa8c1860a64 0000000000000246 ffff883f7d359440 ffffc90019adfc88 ffff883f610596c0 Call Trace: schedule+0x36/0x80 rwsem_down_read_failed+0xf0/0x150 call_rwsem_down_read_failed+0x18/0x30 down_read+0x20/0x40 khugepaged+0x476/0x11d0 kthread+0xe6/0x100 ret_from_fork+0x25/0x30 So it sounds pointless to just block khugepaged waiting for the semaphore so replace down_read() with down_read_trylock() to move to scan the next mm quickly instead of just blocking on the semaphore so that other processes can get more chances to install THP. Then khugepaged can come back to scan the skipped mm when it has finished the current round full_scan. And it appears that the change can improve khugepaged efficiency a little bit. Below is the test result when running LTP on a 24 cores 4GB memory 2 nodes NUMA VM: pristine w/ trylock full_scan 197 187 pages_collapsed 21 26 thp_fault_alloc 40818 44466 thp_fault_fallback 18413 16679 thp_collapse_alloc 21 150 thp_collapse_alloc_failed 14 16 thp_file_alloc 369 369 [akpm@linux-foundation.org: coding-style fixes] [akpm@linux-foundation.org: tweak comment] [arnd@arndb.de: avoid uninitialized variable use] Link: http://lkml.kernel.org/r/20171215125129.2948634-1-arnd@arndb.de Link: http://lkml.kernel.org/r/1513281203-54878-1-git-send-email-yang.s@alibaba-inc.com Signed-off-by: Yang Shi Acked-by: Kirill A. Shutemov Acked-by: Michal Hocko Cc: Hugh Dickins Cc: Andrea Arcangeli Signed-off-by: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/khugepaged.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 1cd18e4347fe..b7e2268dfc9a 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1673,10 +1673,14 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, spin_unlock(&khugepaged_mm_lock); mm = mm_slot->mm; - down_read(&mm->mmap_sem); - if (unlikely(khugepaged_test_exit(mm))) - vma = NULL; - else + /* + * Don't wait for semaphore (to avoid long wait times). Just move to + * the next mm on the list. + */ + vma = NULL; + if (unlikely(!down_read_trylock(&mm->mmap_sem))) + goto breakouterloop_mmap_sem; + if (likely(!khugepaged_test_exit(mm))) vma = find_vma(mm, khugepaged_scan.address); progress++; -- cgit v1.2.3 From 5ff7091f5a2ca1b7b642ca0dbdede8f693a56926 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 31 Jan 2018 16:18:32 -0800 Subject: mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 4d4bbd8526a8 ("mm, oom_reaper: skip mm structs with mmu notifiers") prevented the oom reaper from unmapping private anonymous memory with the oom reaper when the oom victim mm had mmu notifiers registered. The rationale is that doing mmu_notifier_invalidate_range_{start,end}() around the unmap_page_range(), which is needed, can block and the oom killer will stall forever waiting for the victim to exit, which may not be possible without reaping. That concern is real, but only true for mmu notifiers that have blockable invalidate_range_{start,end}() callbacks. This patch adds a "flags" field to mmu notifier ops that can set a bit to indicate that these callbacks do not block. The implementation is steered toward an expensive slowpath, such as after the oom reaper has grabbed mm->mmap_sem of a still alive oom victim. [rientjes@google.com: mmu_notifier_invalidate_range_end() can also call the invalidate_range() must not block, fix comment] Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1801091339570.240101@chino.kir.corp.google.com [akpm@linux-foundation.org: make mm_has_blockable_invalidate_notifiers() return bool, use rwsem_is_locked()] Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1712141329500.74052@chino.kir.corp.google.com Signed-off-by: David Rientjes Acked-by: Michal Hocko Acked-by: Paolo Bonzini Acked-by: Christian König Acked-by: Dimitri Sivanich Cc: Andrea Arcangeli Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Oded Gabbay Cc: Alex Deucher Cc: David Airlie Cc: Joerg Roedel Cc: Doug Ledford Cc: Jani Nikula Cc: Mike Marciniszyn Cc: Sean Hefty Cc: Boris Ostrovsky Cc: Jérôme Glisse Cc: Radim Krčmář Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/infiniband/hw/hfi1/mmu_rb.c | 1 + drivers/iommu/amd_iommu_v2.c | 1 + drivers/iommu/intel-svm.c | 1 + drivers/misc/sgi-gru/grutlbpurge.c | 1 + include/linux/mmu_notifier.h | 30 +++++++++++++++++++++++++++--- mm/mmu_notifier.c | 31 +++++++++++++++++++++++++++++++ virt/kvm/kvm_main.c | 1 + 7 files changed, 63 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c index e7b3ce123da6..70aceefe14d5 100644 --- a/drivers/infiniband/hw/hfi1/mmu_rb.c +++ b/drivers/infiniband/hw/hfi1/mmu_rb.c @@ -77,6 +77,7 @@ static void do_remove(struct mmu_rb_handler *handler, static void handle_remove(struct work_struct *work); static const struct mmu_notifier_ops mn_opts = { + .flags = MMU_INVALIDATE_DOES_NOT_BLOCK, .invalidate_range_start = mmu_notifier_range_start, }; diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c index 7d94e1d39e5e..df72493a0f13 100644 --- a/drivers/iommu/amd_iommu_v2.c +++ b/drivers/iommu/amd_iommu_v2.c @@ -427,6 +427,7 @@ static void mn_release(struct mmu_notifier *mn, struct mm_struct *mm) } static const struct mmu_notifier_ops iommu_mn = { + .flags = MMU_INVALIDATE_DOES_NOT_BLOCK, .release = mn_release, .clear_flush_young = mn_clear_flush_young, .invalidate_range = mn_invalidate_range, diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c index ed1cf7c5a43b..0a826eb7fe48 100644 --- a/drivers/iommu/intel-svm.c +++ b/drivers/iommu/intel-svm.c @@ -276,6 +276,7 @@ static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm) } static const struct mmu_notifier_ops intel_mmuops = { + .flags = MMU_INVALIDATE_DOES_NOT_BLOCK, .release = intel_mm_release, .change_pte = intel_change_pte, .invalidate_range = intel_invalidate_range, diff --git a/drivers/misc/sgi-gru/grutlbpurge.c b/drivers/misc/sgi-gru/grutlbpurge.c index 9918eda0e05f..a3454eb56fbf 100644 --- a/drivers/misc/sgi-gru/grutlbpurge.c +++ b/drivers/misc/sgi-gru/grutlbpurge.c @@ -258,6 +258,7 @@ static void gru_release(struct mmu_notifier *mn, struct mm_struct *mm) static const struct mmu_notifier_ops gru_mmuops = { + .flags = MMU_INVALIDATE_DOES_NOT_BLOCK, .invalidate_range_start = gru_invalidate_range_start, .invalidate_range_end = gru_invalidate_range_end, .release = gru_release, diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index b25dc9db19fc..2d07a1ed5a31 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -2,6 +2,7 @@ #ifndef _LINUX_MMU_NOTIFIER_H #define _LINUX_MMU_NOTIFIER_H +#include #include #include #include @@ -10,6 +11,9 @@ struct mmu_notifier; struct mmu_notifier_ops; +/* mmu_notifier_ops flags */ +#define MMU_INVALIDATE_DOES_NOT_BLOCK (0x01) + #ifdef CONFIG_MMU_NOTIFIER /* @@ -26,6 +30,15 @@ struct mmu_notifier_mm { }; struct mmu_notifier_ops { + /* + * Flags to specify behavior of callbacks for this MMU notifier. + * Used to determine which context an operation may be called. + * + * MMU_INVALIDATE_DOES_NOT_BLOCK: invalidate_range_* callbacks do not + * block + */ + int flags; + /* * Called either by mmu_notifier_unregister or when the mm is * being destroyed by exit_mmap, always before all pages are @@ -137,6 +150,10 @@ struct mmu_notifier_ops { * page. Pages will no longer be referenced by the linux * address space but may still be referenced by sptes until * the last refcount is dropped. + * + * If both of these callbacks cannot block, and invalidate_range + * cannot block, mmu_notifier_ops.flags should have + * MMU_INVALIDATE_DOES_NOT_BLOCK set. */ void (*invalidate_range_start)(struct mmu_notifier *mn, struct mm_struct *mm, @@ -159,12 +176,13 @@ struct mmu_notifier_ops { * external TLB range needs to be flushed. For more in depth * discussion on this see Documentation/vm/mmu_notifier.txt * - * The invalidate_range() function is called under the ptl - * spin-lock and not allowed to sleep. - * * Note that this function might be called with just a sub-range * of what was passed to invalidate_range_start()/end(), if * called between those functions. + * + * If this callback cannot block, and invalidate_range_{start,end} + * cannot block, mmu_notifier_ops.flags should have + * MMU_INVALIDATE_DOES_NOT_BLOCK set. */ void (*invalidate_range)(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, unsigned long end); @@ -218,6 +236,7 @@ extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, bool only_end); extern void __mmu_notifier_invalidate_range(struct mm_struct *mm, unsigned long start, unsigned long end); +extern bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm); static inline void mmu_notifier_release(struct mm_struct *mm) { @@ -457,6 +476,11 @@ static inline void mmu_notifier_invalidate_range(struct mm_struct *mm, { } +static inline bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm) +{ + return false; +} + static inline void mmu_notifier_mm_init(struct mm_struct *mm) { } diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 96edb33fd09a..eff6b88a993f 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -236,6 +236,37 @@ void __mmu_notifier_invalidate_range(struct mm_struct *mm, } EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range); +/* + * Must be called while holding mm->mmap_sem for either read or write. + * The result is guaranteed to be valid until mm->mmap_sem is dropped. + */ +bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm) +{ + struct mmu_notifier *mn; + int id; + bool ret = false; + + WARN_ON_ONCE(!rwsem_is_locked(&mm->mmap_sem)); + + if (!mm_has_notifiers(mm)) + return ret; + + id = srcu_read_lock(&srcu); + hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) { + if (!mn->ops->invalidate_range && + !mn->ops->invalidate_range_start && + !mn->ops->invalidate_range_end) + continue; + + if (!(mn->ops->flags & MMU_INVALIDATE_DOES_NOT_BLOCK)) { + ret = true; + break; + } + } + srcu_read_unlock(&srcu, id); + return ret; +} + static int do_mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm, int take_mmap_sem) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index d6b9370806f8..35db929f92f0 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -476,6 +476,7 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn, } static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { + .flags = MMU_INVALIDATE_DOES_NOT_BLOCK, .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start, .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end, .clear_flush_young = kvm_mmu_notifier_clear_flush_young, -- cgit v1.2.3 From f340ff820345b179b697f66ec6743c70416bf93f Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 31 Jan 2018 16:18:36 -0800 Subject: mm, oom: avoid reaping only for mm's with blockable invalidate callbacks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This uses the new annotation to determine if an mm has mmu notifiers with blockable invalidate range callbacks to avoid oom reaping. Otherwise, the callbacks are used around unmap_page_range(). Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1712141330120.74052@chino.kir.corp.google.com Signed-off-by: David Rientjes Acked-by: Michal Hocko Cc: Paolo Bonzini Cc: Christian König Cc: Dimitri Sivanich Cc: Andrea Arcangeli Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Oded Gabbay Cc: Alex Deucher Cc: David Airlie Cc: Joerg Roedel Cc: Doug Ledford Cc: Jani Nikula Cc: Mike Marciniszyn Cc: Sean Hefty Cc: Boris Ostrovsky Cc: Jérôme Glisse Cc: Radim Krčmář Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 29f855551efe..f2e7dfb81eee 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -514,15 +514,12 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) } /* - * If the mm has notifiers then we would need to invalidate them around - * unmap_page_range and that is risky because notifiers can sleep and - * what they do is basically undeterministic. So let's have a short + * If the mm has invalidate_{start,end}() notifiers that could block, * sleep to give the oom victim some more time. * TODO: we really want to get rid of this ugly hack and make sure that - * notifiers cannot block for unbounded amount of time and add - * mmu_notifier_invalidate_range_{start,end} around unmap_page_range + * notifiers cannot block for unbounded amount of time */ - if (mm_has_notifiers(mm)) { + if (mm_has_blockable_invalidate_notifiers(mm)) { up_read(&mm->mmap_sem); schedule_timeout_idle(HZ); goto unlock_oom; @@ -565,10 +562,14 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) * count elevated without a good reason. */ if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) { - tlb_gather_mmu(&tlb, mm, vma->vm_start, vma->vm_end); - unmap_page_range(&tlb, vma, vma->vm_start, vma->vm_end, - NULL); - tlb_finish_mmu(&tlb, vma->vm_start, vma->vm_end); + const unsigned long start = vma->vm_start; + const unsigned long end = vma->vm_end; + + tlb_gather_mmu(&tlb, mm, start, end); + mmu_notifier_invalidate_range_start(mm, start, end); + unmap_page_range(&tlb, vma, start, end, NULL); + mmu_notifier_invalidate_range_end(mm, start, end); + tlb_finish_mmu(&tlb, start, end); } } pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n", -- cgit v1.2.3 From 93144ca35041b05a4b23528d3bdf0d6414f43002 Mon Sep 17 00:00:00 2001 From: Aliaksei Karaliou Date: Wed, 31 Jan 2018 16:18:40 -0800 Subject: mm/zsmalloc: simplify shrinker init/destroy Structure zs_pool has special flag to indicate success of shrinker initialization. unregister_shrinker() has improved and can detect by itself whether actual deinitialization should be performed or not, so extra flag becomes redundant. [akpm@linux-foundation.org: update comment (Aliaksei), remove unneeded cast] Link: http://lkml.kernel.org/r/1513680552-9798-1-git-send-email-akaraliou.dev@gmail.com Signed-off-by: Aliaksei Karaliou Reviewed-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zsmalloc.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 683c0651098c..e136a8e72c48 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include #include @@ -257,11 +258,7 @@ struct zs_pool { /* Compact classes */ struct shrinker shrinker; - /* - * To signify that register_shrinker() was successful - * and unregister_shrinker() will not Oops. - */ - bool shrinker_enabled; + #ifdef CONFIG_ZSMALLOC_STAT struct dentry *stat_dentry; #endif @@ -2324,10 +2321,7 @@ static unsigned long zs_shrinker_count(struct shrinker *shrinker, static void zs_unregister_shrinker(struct zs_pool *pool) { - if (pool->shrinker_enabled) { - unregister_shrinker(&pool->shrinker); - pool->shrinker_enabled = false; - } + unregister_shrinker(&pool->shrinker); } static int zs_register_shrinker(struct zs_pool *pool) @@ -2426,11 +2420,13 @@ struct zs_pool *zs_create_pool(const char *name) goto err; /* - * Not critical, we still can use the pool - * and user can trigger compaction manually. + * Not critical since shrinker is only used to trigger internal + * defragmentation of the pool which is pretty optional thing. If + * registration fails we still can use the pool normally and user can + * trigger compaction manually. Thus, ignore return code. */ - if (zs_register_shrinker(pool) == 0) - pool->shrinker_enabled = true; + zs_register_shrinker(pool); + return pool; err: -- cgit v1.2.3 From e9d586a8217882eb4068e3ed94a5234ba6dead34 Mon Sep 17 00:00:00 2001 From: Marc-André Lureau Date: Wed, 31 Jan 2018 16:19:14 -0800 Subject: shmem: unexport shmem_add_seals()/shmem_get_seals() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "memfd: add sealing to hugetlb-backed memory", v3. Recently, Mike Kravetz added hugetlbfs support to memfd. However, he didn't add sealing support. One of the reasons to use memfd is to have shared memory sealing when doing IPC or sharing memory with another process with some extra safety. qemu uses shared memory & hugetables with vhost-user (used by dpdk), so it is reasonable to use memfd now instead for convenience and security reasons. This patch (of 9): The functions are called through shmem_fcntl() only. And no danger in removing the EXPORTs as the routines only work with shmem file structs. Link: http://lkml.kernel.org/r/20171107122800.25517-2-marcandre.lureau@redhat.com Signed-off-by: Marc-André Lureau Reviewed-by: Mike Kravetz Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Michal Hocko Cc: David Herrmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/shmem_fs.h | 2 -- mm/shmem.c | 6 ++---- 2 files changed, 2 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 06b295bec00d..e464815a7e4c 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -112,8 +112,6 @@ extern void shmem_uncharge(struct inode *inode, long pages); #ifdef CONFIG_TMPFS -extern int shmem_add_seals(struct file *file, unsigned int seals); -extern int shmem_get_seals(struct file *file); extern long shmem_fcntl(struct file *file, unsigned int cmd, unsigned long arg); #else diff --git a/mm/shmem.c b/mm/shmem.c index 7fbe67be86fa..975efd81621f 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2722,7 +2722,7 @@ continue_resched: F_SEAL_GROW | \ F_SEAL_WRITE) -int shmem_add_seals(struct file *file, unsigned int seals) +static int shmem_add_seals(struct file *file, unsigned int seals) { struct inode *inode = file_inode(file); struct shmem_inode_info *info = SHMEM_I(inode); @@ -2791,16 +2791,14 @@ unlock: inode_unlock(inode); return error; } -EXPORT_SYMBOL_GPL(shmem_add_seals); -int shmem_get_seals(struct file *file) +static int shmem_get_seals(struct file *file) { if (file->f_op != &shmem_file_operations) return -EINVAL; return SHMEM_I(file_inode(file))->seals; } -EXPORT_SYMBOL_GPL(shmem_get_seals); long shmem_fcntl(struct file *file, unsigned int cmd, unsigned long arg) { -- cgit v1.2.3 From 5aadc431a593ac1f3a026dfbceaa16cc4d5e15ca Mon Sep 17 00:00:00 2001 From: Marc-André Lureau Date: Wed, 31 Jan 2018 16:19:18 -0800 Subject: shmem: rename functions that are memfd-related MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Those functions are called for memfd files, backed by shmem or hugetlb (the next patches will handle hugetlb). Link: http://lkml.kernel.org/r/20171107122800.25517-3-marcandre.lureau@redhat.com Signed-off-by: Marc-André Lureau Reviewed-by: Mike Kravetz Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Michal Hocko Cc: David Herrmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fcntl.c | 2 +- include/linux/shmem_fs.h | 4 ++-- mm/shmem.c | 10 +++++----- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/fs/fcntl.c b/fs/fcntl.c index c7b9e0948107..e95fa0a352ea 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -418,7 +418,7 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, break; case F_ADD_SEALS: case F_GET_SEALS: - err = shmem_fcntl(filp, cmd, arg); + err = memfd_fcntl(filp, cmd, arg); break; case F_GET_RW_HINT: case F_SET_RW_HINT: diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index e464815a7e4c..73b5e655a76e 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -112,11 +112,11 @@ extern void shmem_uncharge(struct inode *inode, long pages); #ifdef CONFIG_TMPFS -extern long shmem_fcntl(struct file *file, unsigned int cmd, unsigned long arg); +extern long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg); #else -static inline long shmem_fcntl(struct file *f, unsigned int c, unsigned long a) +static inline long memfd_fcntl(struct file *f, unsigned int c, unsigned long a) { return -EINVAL; } diff --git a/mm/shmem.c b/mm/shmem.c index 975efd81621f..86d7e06ee855 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2722,7 +2722,7 @@ continue_resched: F_SEAL_GROW | \ F_SEAL_WRITE) -static int shmem_add_seals(struct file *file, unsigned int seals) +static int memfd_add_seals(struct file *file, unsigned int seals) { struct inode *inode = file_inode(file); struct shmem_inode_info *info = SHMEM_I(inode); @@ -2792,7 +2792,7 @@ unlock: return error; } -static int shmem_get_seals(struct file *file) +static int memfd_get_seals(struct file *file) { if (file->f_op != &shmem_file_operations) return -EINVAL; @@ -2800,7 +2800,7 @@ static int shmem_get_seals(struct file *file) return SHMEM_I(file_inode(file))->seals; } -long shmem_fcntl(struct file *file, unsigned int cmd, unsigned long arg) +long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg) { long error; @@ -2810,10 +2810,10 @@ long shmem_fcntl(struct file *file, unsigned int cmd, unsigned long arg) if (arg > UINT_MAX) return -EINVAL; - error = shmem_add_seals(file, arg); + error = memfd_add_seals(file, arg); break; case F_GET_SEALS: - error = shmem_get_seals(file); + error = memfd_get_seals(file); break; default: error = -EINVAL; -- cgit v1.2.3 From 47b9012ecdc747f6936395265e677d41e11a31ff Mon Sep 17 00:00:00 2001 From: Marc-André Lureau Date: Wed, 31 Jan 2018 16:19:29 -0800 Subject: shmem: add sealing support to hugetlb-backed memfd MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adapt add_seals()/get_seals() to work with hugetbfs-backed memory. Teach memfd_create() to allow sealing operations on MFD_HUGETLB. Link: http://lkml.kernel.org/r/20171107122800.25517-6-marcandre.lureau@redhat.com Signed-off-by: Marc-André Lureau Reviewed-by: Mike Kravetz Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Michal Hocko Cc: David Herrmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 47 ++++++++++++++++++++++++++++------------------- 1 file changed, 28 insertions(+), 19 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 86d7e06ee855..1907688b75ee 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2717,6 +2717,19 @@ continue_resched: return error; } +static unsigned int *memfd_file_seals_ptr(struct file *file) +{ + if (file->f_op == &shmem_file_operations) + return &SHMEM_I(file_inode(file))->seals; + +#ifdef CONFIG_HUGETLBFS + if (file->f_op == &hugetlbfs_file_operations) + return &HUGETLBFS_I(file_inode(file))->seals; +#endif + + return NULL; +} + #define F_ALL_SEALS (F_SEAL_SEAL | \ F_SEAL_SHRINK | \ F_SEAL_GROW | \ @@ -2725,7 +2738,7 @@ continue_resched: static int memfd_add_seals(struct file *file, unsigned int seals) { struct inode *inode = file_inode(file); - struct shmem_inode_info *info = SHMEM_I(inode); + unsigned int *file_seals; int error; /* @@ -2758,8 +2771,6 @@ static int memfd_add_seals(struct file *file, unsigned int seals) * other file types. */ - if (file->f_op != &shmem_file_operations) - return -EINVAL; if (!(file->f_mode & FMODE_WRITE)) return -EPERM; if (seals & ~(unsigned int)F_ALL_SEALS) @@ -2767,12 +2778,18 @@ static int memfd_add_seals(struct file *file, unsigned int seals) inode_lock(inode); - if (info->seals & F_SEAL_SEAL) { + file_seals = memfd_file_seals_ptr(file); + if (!file_seals) { + error = -EINVAL; + goto unlock; + } + + if (*file_seals & F_SEAL_SEAL) { error = -EPERM; goto unlock; } - if ((seals & F_SEAL_WRITE) && !(info->seals & F_SEAL_WRITE)) { + if ((seals & F_SEAL_WRITE) && !(*file_seals & F_SEAL_WRITE)) { error = mapping_deny_writable(file->f_mapping); if (error) goto unlock; @@ -2784,7 +2801,7 @@ static int memfd_add_seals(struct file *file, unsigned int seals) } } - info->seals |= seals; + *file_seals |= seals; error = 0; unlock: @@ -2794,10 +2811,9 @@ unlock: static int memfd_get_seals(struct file *file) { - if (file->f_op != &shmem_file_operations) - return -EINVAL; + unsigned int *seals = memfd_file_seals_ptr(file); - return SHMEM_I(file_inode(file))->seals; + return seals ? *seals : -EINVAL; } long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg) @@ -3655,7 +3671,7 @@ SYSCALL_DEFINE2(memfd_create, const char __user *, uname, unsigned int, flags) { - struct shmem_inode_info *info; + unsigned int *file_seals; struct file *file; int fd, error; char *name; @@ -3665,9 +3681,6 @@ SYSCALL_DEFINE2(memfd_create, if (flags & ~(unsigned int)MFD_ALL_FLAGS) return -EINVAL; } else { - /* Sealing not supported in hugetlbfs (MFD_HUGETLB) */ - if (flags & MFD_ALLOW_SEALING) - return -EINVAL; /* Allow huge page size encoding in flags. */ if (flags & ~(unsigned int)(MFD_ALL_FLAGS | (MFD_HUGE_MASK << MFD_HUGE_SHIFT))) @@ -3720,12 +3733,8 @@ SYSCALL_DEFINE2(memfd_create, file->f_flags |= O_RDWR | O_LARGEFILE; if (flags & MFD_ALLOW_SEALING) { - /* - * flags check at beginning of function ensures - * this is not a hugetlbfs (MFD_HUGETLB) file. - */ - info = SHMEM_I(file_inode(file)); - info->seals &= ~F_SEAL_SEAL; + file_seals = memfd_file_seals_ptr(file); + *file_seals &= ~F_SEAL_SEAL; } fd_install(fd, file); -- cgit v1.2.3 From 69d763fc6d3aee787a3e8c8c35092b4f4960fa5d Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 31 Jan 2018 16:19:52 -0800 Subject: mm: pin address_space before dereferencing it while isolating an LRU page Minchan Kim asked the following question -- what locks protects address_space destroying when race happens between inode trauncation and __isolate_lru_page? Jan Kara clarified by describing the race as follows CPU1 CPU2 truncate(inode) __isolate_lru_page() ... truncate_inode_page(mapping, page); delete_from_page_cache(page) spin_lock_irqsave(&mapping->tree_lock, flags); __delete_from_page_cache(page, NULL) page_cache_tree_delete(..) ... mapping = page_mapping(page); page->mapping = NULL; ... spin_unlock_irqrestore(&mapping->tree_lock, flags); page_cache_free_page(mapping, page) put_page(page) if (put_page_testzero(page)) -> false - inode now has no pages and can be freed including embedded address_space if (mapping && !mapping->a_ops->migratepage) - we've dereferenced mapping which is potentially already free. The race is theoretically possible but unlikely. Before the delete_from_page_cache, truncate_cleanup_page is called so the page is likely to be !PageDirty or PageWriteback which gets skipped by the only caller that checks the mappping in __isolate_lru_page. Even if the race occurs, a substantial amount of work has to happen during a tiny window with no preemption but it could potentially be done using a virtual machine to artifically slow one CPU or halt it during the critical window. This patch should eliminate the race with truncation by try-locking the page before derefencing mapping and aborting if the lock was not acquired. There was a suggestion from Huang Ying to use RCU as a side-effect to prevent mapping being freed. However, I do not like the solution as it's an unconventional means of preserving a mapping and it's not a context where rcu_read_lock is obviously protecting rcu data. Link: http://lkml.kernel.org/r/20180104102512.2qos3h5vqzeisrek@techsingularity.net Fixes: c82449352854 ("mm: compaction: make isolate_lru_page() filter-aware again") Signed-off-by: Mel Gorman Acked-by: Minchan Kim Cc: "Huang, Ying" Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 1a33c8e1e758..fdd3fc6be862 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1415,14 +1415,24 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode) if (PageDirty(page)) { struct address_space *mapping; + bool migrate_dirty; /* * Only pages without mappings or that have a * ->migratepage callback are possible to migrate - * without blocking + * without blocking. However, we can be racing with + * truncation so it's necessary to lock the page + * to stabilise the mapping as truncation holds + * the page lock until after the page is removed + * from the page cache. */ + if (!trylock_page(page)) + return ret; + mapping = page_mapping(page); - if (mapping && !mapping->a_ops->migratepage) + migrate_dirty = mapping && mapping->a_ops->migratepage; + unlock_page(page); + if (!migrate_dirty) return ret; } } -- cgit v1.2.3 From a7ab400d6fe73d0119fdc234e9982a6f80faea9f Mon Sep 17 00:00:00 2001 From: "shidao.ytt" Date: Wed, 31 Jan 2018 16:19:55 -0800 Subject: mm/fadvise: discard partial page if endbyte is also EOF During our recent testing with fadvise(FADV_DONTNEED), we find that if given offset/length is not page-aligned, the last page will not be discarded. The tool we use is vmtouch (https://hoytech.com/vmtouch/), we map a 10KB-sized file into memory and then try to run this tool to evict the whole file mapping, but the last single page always remains staying in the memory: $./vmtouch -e test_10K Files: 1 Directories: 0 Evicted Pages: 3 (12K) Elapsed: 2.1e-05 seconds $./vmtouch test_10K Files: 1 Directories: 0 Resident Pages: 1/3 4K/12K 33.3% Elapsed: 5.5e-05 seconds However when we test with an older kernel, say 3.10, this problem is gone. So we wonder if this is a regression: $./vmtouch -e test_10K Files: 1 Directories: 0 Evicted Pages: 3 (12K) Elapsed: 8.2e-05 seconds $./vmtouch test_10K Files: 1 Directories: 0 Resident Pages: 0/3 0/12K 0% <-- partial page also discarded Elapsed: 5e-05 seconds After digging a little bit into this problem, we find it seems not a regression. Not discarding partial page is likely to be on purpose according to commit 441c228f817f ("mm: fadvise: document the fadvise(FADV_DONTNEED) behaviour for partial pages") written by Mel Gorman. He explained why partial pages should be preserved instead of being discarded when using fadvise(FADV_DONTNEED). However, the interesting part is that the actual code did NOT work as the same as it was described, the partial page was still discarded anyway, due to a calculation mistake of `end_index' passed to invalidate_mapping_pages(). This mistake has not been fixed until recently, that's why we fail to reproduce our problem in old kernels. The fix is done in commit 18aba41cbf ("mm/fadvise.c: do not discard partial pages with POSIX_FADV_DONTNEED") by Oleg Drokin. Back to the original testing, our problem becomes that there is a special case that, if the page-unaligned `endbyte' is also the end of file, it is not necessary at all to preserve the last partial page, as we all know no one else will use the rest of it. It should be safe enough if we just discard the whole page. So we add an EOF check in this patch. We also find a poosbile real world issue in mainline kernel. Assume such scenario: A userspace backup application want to backup a huge amount of small files (<4k) at once, the developer might (I guess) want to use fadvise(FADV_DONTNEED) to save memory. However, FADV_DONTNEED won't really happen since the only page mapped is a partial page, and kernel will preserve it. Our patch also fixes this problem, since we know the endbyte is EOF, so we discard it. Here is a simple reproducer to reproduce and verify each scenario we described above: test_fadvise.c ============================== #include #include #include #include #include #include #include int main(int argc, char **argv) { int i, fd, ret, len; struct stat buf; void *addr; unsigned char *vec; char *strbuf; ssize_t pagesize = getpagesize(); ssize_t filesize; fd = open(argv[1], O_RDWR|O_CREAT, S_IRUSR|S_IWUSR); if (fd < 0) return -1; filesize = strtoul(argv[2], NULL, 10); strbuf = malloc(filesize); memset(strbuf, 42, filesize); write(fd, strbuf, filesize); free(strbuf); fsync(fd); len = (filesize + pagesize - 1) / pagesize; printf("length of pages: %d\n", len); addr = mmap(NULL, filesize, PROT_READ, MAP_SHARED, fd, 0); if (addr == MAP_FAILED) return -1; ret = posix_fadvise(fd, 0, filesize, POSIX_FADV_DONTNEED); if (ret < 0) return -1; vec = malloc(len); ret = mincore(addr, filesize, (void *)vec); if (ret < 0) return -1; for (i = 0; i < len; i++) printf("pages[%d]: %x\n", i, vec[i] & 0x1); free(vec); close(fd); return 0; } ============================== Test 1: running on kernel with commit 18aba41cbf reverted: [root@caspar ~]# uname -r 4.15.0-rc6.revert+ [root@caspar ~]# ./test_fadvise file1 1024 length of pages: 1 pages[0]: 0 # <-- partial page discarded [root@caspar ~]# ./test_fadvise file2 8192 length of pages: 2 pages[0]: 0 pages[1]: 0 [root@caspar ~]# ./test_fadvise file3 10240 length of pages: 3 pages[0]: 0 pages[1]: 0 pages[2]: 0 # <-- partial page discarded Test 2: running on mainline kernel: [root@caspar ~]# uname -r 4.15.0-rc6+ [root@caspar ~]# ./test_fadvise test1 1024 length of pages: 1 pages[0]: 1 # <-- partial and the only page not discarded [root@caspar ~]# ./test_fadvise test2 8192 length of pages: 2 pages[0]: 0 pages[1]: 0 [root@caspar ~]# ./test_fadvise test3 10240 length of pages: 3 pages[0]: 0 pages[1]: 0 pages[2]: 1 # <-- partial page not discarded Test 3: running on kernel with this patch: [root@caspar ~]# uname -r 4.15.0-rc6.patched+ [root@caspar ~]# ./test_fadvise test1 1024 length of pages: 1 pages[0]: 0 # <-- partial page and EOF, discarded [root@caspar ~]# ./test_fadvise test2 8192 length of pages: 2 pages[0]: 0 pages[1]: 0 [root@caspar ~]# ./test_fadvise test3 10240 length of pages: 3 pages[0]: 0 pages[1]: 0 pages[2]: 0 # <-- partial page and EOF, discarded [akpm@linux-foundation.org: tweak code comment] Link: http://lkml.kernel.org/r/5222da9ee20e1695eaabb69f631f200d6e6b8876.1515132470.git.jinli.zjl@alibaba-inc.com Signed-off-by: shidao.ytt Signed-off-by: Caspar Zhang Reviewed-by: Oliver Yang Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/fadvise.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/fadvise.c b/mm/fadvise.c index ec70d6e4b86d..767887f5f3bf 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -127,7 +127,15 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) */ start_index = (offset+(PAGE_SIZE-1)) >> PAGE_SHIFT; end_index = (endbyte >> PAGE_SHIFT); - if ((endbyte & ~PAGE_MASK) != ~PAGE_MASK) { + /* + * The page at end_index will be inclusively discarded according + * by invalidate_mapping_pages(), so subtracting 1 from + * end_index means we will skip the last page. But if endbyte + * is page aligned or is at the end of file, we should not skip + * that page - discarding the last page is safe enough. + */ + if ((endbyte & ~PAGE_MASK) != ~PAGE_MASK && + endbyte != inode->i_size - 1) { /* First page is tricky as 0 - 1 = -1, but pgoff_t * is unsigned, so the end_index >= start_index * check below would be true and we'll discard the whole -- cgit v1.2.3 From 9c3760eb80880f3e02546e0a2ef479e1454986b3 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Wed, 31 Jan 2018 16:19:59 -0800 Subject: zswap: only save zswap header when necessary We waste sizeof(swp_entry_t) for zswap header when using zsmalloc as zpool driver because zsmalloc doesn't support eviction. Add zpool_evictable() to detect if zpool is potentially evictable, and use it in zswap to avoid waste memory for zswap header. [yuzhao@google.com: The zpool->" prefix is a result of copy & paste] Link: http://lkml.kernel.org/r/20180110225626.110330-1-yuzhao@google.com Link: http://lkml.kernel.org/r/20180110224741.83751-1-yuzhao@google.com Signed-off-by: Yu Zhao Acked-by: Dan Streetman Reviewed-by: Sergey Senozhatsky Cc: Seth Jennings Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/zpool.h | 2 ++ mm/zpool.c | 25 +++++++++++++++++++++++-- mm/zsmalloc.c | 7 ------- mm/zswap.c | 20 ++++++++++---------- 4 files changed, 35 insertions(+), 19 deletions(-) (limited to 'mm') diff --git a/include/linux/zpool.h b/include/linux/zpool.h index 004ba807df96..7238865e75b0 100644 --- a/include/linux/zpool.h +++ b/include/linux/zpool.h @@ -108,4 +108,6 @@ void zpool_register_driver(struct zpool_driver *driver); int zpool_unregister_driver(struct zpool_driver *driver); +bool zpool_evictable(struct zpool *pool); + #endif diff --git a/mm/zpool.c b/mm/zpool.c index fd3ff719c32c..e1e7aa6d1d06 100644 --- a/mm/zpool.c +++ b/mm/zpool.c @@ -21,6 +21,7 @@ struct zpool { struct zpool_driver *driver; void *pool; const struct zpool_ops *ops; + bool evictable; struct list_head list; }; @@ -142,7 +143,7 @@ EXPORT_SYMBOL(zpool_has_pool); * * This creates a new zpool of the specified type. The gfp flags will be * used when allocating memory, if the implementation supports it. If the - * ops param is NULL, then the created zpool will not be shrinkable. + * ops param is NULL, then the created zpool will not be evictable. * * Implementations must guarantee this to be thread-safe. * @@ -180,6 +181,7 @@ struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp, zpool->driver = driver; zpool->pool = driver->create(name, gfp, ops, zpool); zpool->ops = ops; + zpool->evictable = driver->shrink && ops && ops->evict; if (!zpool->pool) { pr_err("couldn't create %s pool\n", type); @@ -296,7 +298,8 @@ void zpool_free(struct zpool *zpool, unsigned long handle) int zpool_shrink(struct zpool *zpool, unsigned int pages, unsigned int *reclaimed) { - return zpool->driver->shrink(zpool->pool, pages, reclaimed); + return zpool->driver->shrink ? + zpool->driver->shrink(zpool->pool, pages, reclaimed) : -EINVAL; } /** @@ -355,6 +358,24 @@ u64 zpool_get_total_size(struct zpool *zpool) return zpool->driver->total_size(zpool->pool); } +/** + * zpool_evictable() - Test if zpool is potentially evictable + * @pool The zpool to test + * + * Zpool is only potentially evictable when it's created with struct + * zpool_ops.evict and its driver implements struct zpool_driver.shrink. + * + * However, it doesn't necessarily mean driver will use zpool_ops.evict + * in its implementation of zpool_driver.shrink. It could do internal + * defragmentation instead. + * + * Returns: true if potentially evictable; false otherwise. + */ +bool zpool_evictable(struct zpool *zpool) +{ + return zpool->evictable; +} + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Dan Streetman "); MODULE_DESCRIPTION("Common API for compressed memory storage"); diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index e136a8e72c48..f797d8b0d820 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -404,12 +404,6 @@ static void zs_zpool_free(void *pool, unsigned long handle) zs_free(pool, handle); } -static int zs_zpool_shrink(void *pool, unsigned int pages, - unsigned int *reclaimed) -{ - return -EINVAL; -} - static void *zs_zpool_map(void *pool, unsigned long handle, enum zpool_mapmode mm) { @@ -447,7 +441,6 @@ static struct zpool_driver zs_zpool_driver = { .destroy = zs_zpool_destroy, .malloc = zs_zpool_malloc, .free = zs_zpool_free, - .shrink = zs_zpool_shrink, .map = zs_zpool_map, .unmap = zs_zpool_unmap, .total_size = zs_zpool_total_size, diff --git a/mm/zswap.c b/mm/zswap.c index 1133b4ceb72e..c004aa4fd3f4 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1001,11 +1001,11 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, struct zswap_entry *entry, *dupentry; struct crypto_comp *tfm; int ret; - unsigned int dlen = PAGE_SIZE, len; + unsigned int hlen, dlen = PAGE_SIZE; unsigned long handle, value; char *buf; u8 *src, *dst; - struct zswap_header *zhdr; + struct zswap_header zhdr = { .swpentry = swp_entry(type, offset) }; if (!zswap_enabled || !tree) { ret = -ENODEV; @@ -1063,8 +1063,8 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, } /* store */ - len = dlen + sizeof(struct zswap_header); - ret = zpool_malloc(entry->pool->zpool, len, + hlen = zpool_evictable(entry->pool->zpool) ? sizeof(zhdr) : 0; + ret = zpool_malloc(entry->pool->zpool, hlen + dlen, __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM, &handle); if (ret == -ENOSPC) { @@ -1075,10 +1075,9 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, zswap_reject_alloc_fail++; goto put_dstmem; } - zhdr = zpool_map_handle(entry->pool->zpool, handle, ZPOOL_MM_RW); - zhdr->swpentry = swp_entry(type, offset); - buf = (u8 *)(zhdr + 1); - memcpy(buf, dst, dlen); + buf = zpool_map_handle(entry->pool->zpool, handle, ZPOOL_MM_RW); + memcpy(buf, &zhdr, hlen); + memcpy(buf + hlen, dst, dlen); zpool_unmap_handle(entry->pool->zpool, handle); put_cpu_var(zswap_dstmem); @@ -1149,8 +1148,9 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, /* decompress */ dlen = PAGE_SIZE; - src = (u8 *)zpool_map_handle(entry->pool->zpool, entry->handle, - ZPOOL_MM_RO) + sizeof(struct zswap_header); + src = zpool_map_handle(entry->pool->zpool, entry->handle, ZPOOL_MM_RO); + if (zpool_evictable(entry->pool->zpool)) + src += sizeof(struct zswap_header); dst = kmap_atomic(page); tfm = *get_cpu_ptr(entry->pool->tfm); ret = crypto_comp_decompress(tfm, src, entry->length, dst, &dlen); -- cgit v1.2.3 From c054a78c66c7a5aa218220d8949ebcf13a86b796 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Wed, 31 Jan 2018 16:20:02 -0800 Subject: memcg: refactor mem_cgroup_resize_limit() mem_cgroup_resize_limit() and mem_cgroup_resize_memsw_limit() have identical logics. Refactor code so we don't need to keep two pieces of code that does same thing. Link: http://lkml.kernel.org/r/20180108224238.14583-1-yuzhao@google.com Signed-off-by: Yu Zhao Acked-by: Vladimir Davydov Acked-by: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 77 +++++++++++++-------------------------------------------- 1 file changed, 17 insertions(+), 60 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 51d398f1363c..695d9f10906e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2461,13 +2461,15 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry, static DEFINE_MUTEX(memcg_limit_mutex); static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, - unsigned long limit) + unsigned long limit, bool memsw) { unsigned long curusage; unsigned long oldusage; bool enlarge = false; int retry_count; int ret; + bool limits_invariant; + struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory; /* * For keeping hierarchical_reclaim simple, how long we should retry @@ -2477,7 +2479,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, retry_count = MEM_CGROUP_RECLAIM_RETRIES * mem_cgroup_count_children(memcg); - oldusage = page_counter_read(&memcg->memory); + oldusage = page_counter_read(counter); do { if (signal_pending(current)) { @@ -2486,73 +2488,28 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, } mutex_lock(&memcg_limit_mutex); - if (limit > memcg->memsw.limit) { - mutex_unlock(&memcg_limit_mutex); - ret = -EINVAL; - break; - } - if (limit > memcg->memory.limit) - enlarge = true; - ret = page_counter_limit(&memcg->memory, limit); - mutex_unlock(&memcg_limit_mutex); - - if (!ret) - break; - - try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true); - - curusage = page_counter_read(&memcg->memory); - /* Usage is reduced ? */ - if (curusage >= oldusage) - retry_count--; - else - oldusage = curusage; - } while (retry_count); - - if (!ret && enlarge) - memcg_oom_recover(memcg); - - return ret; -} - -static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, - unsigned long limit) -{ - unsigned long curusage; - unsigned long oldusage; - bool enlarge = false; - int retry_count; - int ret; - - /* see mem_cgroup_resize_res_limit */ - retry_count = MEM_CGROUP_RECLAIM_RETRIES * - mem_cgroup_count_children(memcg); - - oldusage = page_counter_read(&memcg->memsw); - - do { - if (signal_pending(current)) { - ret = -EINTR; - break; - } - - mutex_lock(&memcg_limit_mutex); - if (limit < memcg->memory.limit) { + /* + * Make sure that the new limit (memsw or memory limit) doesn't + * break our basic invariant rule memory.limit <= memsw.limit. + */ + limits_invariant = memsw ? limit >= memcg->memory.limit : + limit <= memcg->memsw.limit; + if (!limits_invariant) { mutex_unlock(&memcg_limit_mutex); ret = -EINVAL; break; } - if (limit > memcg->memsw.limit) + if (limit > counter->limit) enlarge = true; - ret = page_counter_limit(&memcg->memsw, limit); + ret = page_counter_limit(counter, limit); mutex_unlock(&memcg_limit_mutex); if (!ret) break; - try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false); + try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, !memsw); - curusage = page_counter_read(&memcg->memsw); + curusage = page_counter_read(counter); /* Usage is reduced ? */ if (curusage >= oldusage) retry_count--; @@ -3014,10 +2971,10 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of, } switch (MEMFILE_TYPE(of_cft(of)->private)) { case _MEM: - ret = mem_cgroup_resize_limit(memcg, nr_pages); + ret = mem_cgroup_resize_limit(memcg, nr_pages, false); break; case _MEMSWAP: - ret = mem_cgroup_resize_memsw_limit(memcg, nr_pages); + ret = mem_cgroup_resize_limit(memcg, nr_pages, true); break; case _KMEM: ret = memcg_update_kmem_limit(memcg, nr_pages); -- cgit v1.2.3 From 3c2c648842843326f8c6ace425810eb47864c6b4 Mon Sep 17 00:00:00 2001 From: Shile Zhang Date: Wed, 31 Jan 2018 16:20:07 -0800 Subject: mm/page_alloc.c: fix typos in comments Link: http://lkml.kernel.org/r/1515485774-4768-1-git-send-email-zhangshile@gmail.com Signed-off-by: Shile Zhang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b411f97dfb25..a6972750e7c5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -293,7 +293,7 @@ int page_group_by_mobility_disabled __read_mostly; #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT /* - * Determine how many pages need to be initialized durig early boot + * Determine how many pages need to be initialized during early boot * (non-deferred initialization). * The value of first_deferred_pfn will be set later, once non-deferred pages * are initialized, but for now set it ULONG_MAX. @@ -344,7 +344,7 @@ static inline bool update_defer_init(pg_data_t *pgdat, unsigned long pfn, unsigned long zone_end, unsigned long *nr_initialised) { - /* Always populate low zones for address-contrained allocations */ + /* Always populate low zones for address-constrained allocations */ if (zone_end < pgdat_end_pfn(pgdat)) return true; (*nr_initialised)++; @@ -3397,7 +3397,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, if (gfp_mask & __GFP_THISNODE) goto out; - /* Exhausted what can be done so it's blamo time */ + /* Exhausted what can be done so it's blame time */ if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) { *did_some_progress = 1; -- cgit v1.2.3 From 6787c1dab1724ca0d92110d83485c8c72dbf83f4 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Wed, 31 Jan 2018 16:20:11 -0800 Subject: mm/page_owner.c: clean up init_pages_in_zone() Remove two redundant assignments in init_pages_in_zone(). [osalvador@techadventures.net: v3] Link: http://lkml.kernel.org/r/20180117124513.GA876@techadventures.net [akpm@linux-foundation.org: coding style tweaks] Link: http://lkml.kernel.org/r/20180110084355.GA22822@techadventures.net Signed-off-by: Oscar Salvador Acked-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_owner.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/page_owner.c b/mm/page_owner.c index 06a0055f45a6..9886c6073828 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -528,21 +528,18 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) { - struct page *page; - struct page_ext *page_ext; - unsigned long pfn = zone->zone_start_pfn, block_end_pfn; - unsigned long end_pfn = pfn + zone->spanned_pages; + unsigned long pfn = zone->zone_start_pfn; + unsigned long end_pfn = zone_end_pfn(zone); unsigned long count = 0; - /* Scan block by block. First and last block may be incomplete */ - pfn = zone->zone_start_pfn; - /* * Walk the zone in pageblock_nr_pages steps. If a page block spans * a zone boundary, it will be double counted between zones. This does * not matter as the mixed block count will still be correct */ for (; pfn < end_pfn; ) { + unsigned long block_end_pfn; + if (!pfn_valid(pfn)) { pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES); continue; @@ -551,9 +548,10 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); block_end_pfn = min(block_end_pfn, end_pfn); - page = pfn_to_page(pfn); - for (; pfn < block_end_pfn; pfn++) { + struct page *page; + struct page_ext *page_ext; + if (!pfn_valid_within(pfn)) continue; -- cgit v1.2.3 From 01a6ad9ac80c9b861f63087f81e696f47b481168 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Wed, 31 Jan 2018 16:20:15 -0800 Subject: zsmalloc: use U suffix for negative literals being shifted Fix warning about shifting unsigned literals being undefined behavior. Link: http://lkml.kernel.org/r/1515642078-4259-1-git-send-email-nick.desaulniers@gmail.com Signed-off-by: Nick Desaulniers Suggested-by: Minchan Kim Reviewed-by: Sergey Senozhatsky Cc: Andy Shevchenko Cc: Matthew Wilcox Cc: Nick Desaulniers Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zsmalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index f797d8b0d820..c3013505c305 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1047,7 +1047,7 @@ static void init_zspage(struct size_class *class, struct zspage *zspage) * Reset OBJ_TAG_BITS bit to last link to tell * whether it's allocated object or not. */ - link->next = -1 << OBJ_TAG_BITS; + link->next = -1UL << OBJ_TAG_BITS; } kunmap_atomic(vaddr); page = next_page; -- cgit v1.2.3 From 3a45acc0869748d7a650e36377839d849c28a52c Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Wed, 31 Jan 2018 16:20:19 -0800 Subject: mm/page_ext.c: make page_ext_init a noop when CONFIG_PAGE_EXTENSION but nothing uses it static struct page_ext_operations *page_ext_ops[] always contains debug_guardpage_ops, static struct page_ext_operations *page_ext_ops[] = { &debug_guardpage_ops, #ifdef CONFIG_PAGE_OWNER &page_owner_ops, #endif ... } but for it to work, CONFIG_DEBUG_PAGEALLOC must be enabled first. If someone has CONFIG_PAGE_EXTENSION, but has none of its users, eg: (CONFIG_PAGE_OWNER, CONFIG_DEBUG_PAGEALLOC, CONFIG_IDLE_PAGE_TRACKING), we can shrink page_ext_init() to a simple retq. $ size vmlinux (before patch) text data bss dec hex filename 14356698 5681582 1687748 21726028 14b834c vmlinux $ size vmlinux (after patch) text data bss dec hex filename 14356008 5681538 1687748 21725294 14b806e vmlinux On the other hand, it might does not even make sense, since if someone enables CONFIG_PAGE_EXTENSION, I would expect him to enable also at least one of its users. Link: http://lkml.kernel.org/r/20180105130235.GA21241@techadventures.net Signed-off-by: Oscar Salvador Cc: Michal Hocko Cc: Vlastimil Babka Cc: Jaewon Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_ext.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/page_ext.c b/mm/page_ext.c index 2c16216c29b6..5295ef331165 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -59,7 +59,9 @@ */ static struct page_ext_operations *page_ext_ops[] = { +#ifdef CONFIG_DEBUG_PAGEALLOC &debug_guardpage_ops, +#endif #ifdef CONFIG_PAGE_OWNER &page_owner_ops, #endif -- cgit v1.2.3 From 112d2d29fc087d3078f60db220c4f31f25e59cf0 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Wed, 31 Jan 2018 16:20:23 -0800 Subject: mm/compaction.c: fix comment for try_to_compact_pages() "mode" argument is not used by try_to_compact_pages() and sub functions anymore, it has been replaced by "prio". Fix the comment to explain the use of "prio" argument. Link: http://lkml.kernel.org/r/1515801336-20611-1-git-send-email-yang.shi@linux.alibaba.com Signed-off-by: Yang Shi Acked-by: Vlastimil Babka Cc: Mel Gorman Cc: David Rientjes Cc: Joonsoo Kim Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 10cd757f1006..2c8999d027ab 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1738,7 +1738,7 @@ int sysctl_extfrag_threshold = 500; * @order: The order of the current allocation * @alloc_flags: The allocation flags of the current allocation * @ac: The context of current allocation - * @mode: The migration mode for async, sync light, or sync migration + * @prio: Determines how hard direct compaction should try to succeed * * This is the main entry point for direct page compaction. */ -- cgit v1.2.3 From def9b71ee651a6fee93a10734b94f93a69cdb2d4 Mon Sep 17 00:00:00 2001 From: Petr Tesarik Date: Wed, 31 Jan 2018 16:20:26 -0800 Subject: include/linux/mmzone.h: fix explanation of lower bits in the SPARSEMEM mem_map pointer The comment is confusing. On the one hand, it refers to 32-bit alignment (struct page alignment on 32-bit platforms), but this would only guarantee that the 2 lowest bits must be zero. On the other hand, it claims that at least 3 bits are available, and 3 bits are actually used. This is not broken, because there is a stronger alignment guarantee, just less obvious. Let's fix the comment to make it clear how many bits are available and why. Although memmap arrays are allocated in various places, the resulting pointer is encoded eventually, so I am adding a BUG_ON() here to enforce at runtime that all expected bits are indeed available. I have also added a BUILD_BUG_ON to check that PFN_SECTION_SHIFT is sufficient, because this part of the calculation can be easily checked at build time. [ptesarik@suse.com: v2] Link: http://lkml.kernel.org/r/20180125100516.589ea6af@ezekiel.suse.cz Link: http://lkml.kernel.org/r/20180119080908.3a662e6f@ezekiel.suse.cz Signed-off-by: Petr Tesarik Acked-by: Michal Hocko Cc: Vlastimil Babka Cc: Mel Gorman Cc: Johannes Weiner Cc: Kemi Wang Cc: YASUAKI ISHIMATSU Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 12 ++++++++++-- mm/sparse.c | 6 +++++- 2 files changed, 15 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 67f2e3c38939..7522a6987595 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1166,8 +1166,16 @@ extern unsigned long usemap_size(void); /* * We use the lower bits of the mem_map pointer to store - * a little bit of information. There should be at least - * 3 bits here due to 32-bit alignment. + * a little bit of information. The pointer is calculated + * as mem_map - section_nr_to_pfn(pnum). The result is + * aligned to the minimum alignment of the two values: + * 1. All mem_map arrays are page-aligned. + * 2. section_nr_to_pfn() always clears PFN_SECTION_SHIFT + * lowest bits. PFN_SECTION_SHIFT is arch-specific + * (equal SECTION_SIZE_BITS - PAGE_SHIFT), and the + * worst combination is powerpc with 256k pages, + * which results in PFN_SECTION_SHIFT equal 6. + * To sum it up, at least 6 bits are available. */ #define SECTION_MARKED_PRESENT (1UL<<0) #define SECTION_HAS_MEM_MAP (1UL<<1) diff --git a/mm/sparse.c b/mm/sparse.c index 2609aba121e8..6b8b5e91ceef 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -264,7 +264,11 @@ unsigned long __init node_memmap_size_bytes(int nid, unsigned long start_pfn, */ static unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long pnum) { - return (unsigned long)(mem_map - (section_nr_to_pfn(pnum))); + unsigned long coded_mem_map = + (unsigned long)(mem_map - (section_nr_to_pfn(pnum))); + BUILD_BUG_ON(SECTION_MAP_LAST_BIT > (1UL< Date: Wed, 31 Jan 2018 16:20:30 -0800 Subject: mm/hmm: fix uninitialized use of 'entry' in hmm_vma_walk_pmd() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The variable 'entry' is used before being initialized in hmm_vma_walk_pmd(). No bad effect (beside performance hit) so !non_swap_entry(0) evaluate to true which trigger a fault as if CPU was trying to access migrated memory and migrate memory back from device memory to regular memory. This function (hmm_vma_walk_pmd()) is called when a device driver tries to populate its own page table. For migrated memory it should not happen as the device driver should already have populated its page table correctly during the migration. Only case I can think of is multi-GPU where a second GPU triggers migration back to regular memory. Again this would just result in a performance hit, nothing bad would happen. Link: http://lkml.kernel.org/r/20180122185759.26286-1-jglisse@redhat.com Signed-off-by: Ralph Campbell Signed-off-by: Jérôme Glisse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hmm.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/hmm.c b/mm/hmm.c index ea19742a5d60..979211c7ccc8 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -418,7 +418,7 @@ again: } if (!pte_present(pte)) { - swp_entry_t entry; + swp_entry_t entry = pte_to_swp_entry(pte); if (!non_swap_entry(entry)) { if (hmm_vma_walk->fault) @@ -426,8 +426,6 @@ again: continue; } - entry = pte_to_swp_entry(pte); - /* * This is a special swap entry, ignore migration, use * device and report anything else as error. -- cgit v1.2.3 From 8ad6e404efa294b848782cf14f3d298762674e58 Mon Sep 17 00:00:00 2001 From: Christopher Díaz Riveros Date: Wed, 31 Jan 2018 16:20:33 -0800 Subject: mm/memcontrol.c: make local symbol static MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix the following sparse warning: mm/memcontrol.c:1097:14: warning: symbol 'memcg1_stats' was not declared. Should it be static? Link: http://lkml.kernel.org/r/20180118193327.14200-1-chrisadr@gentoo.org Signed-off-by: Christopher Díaz Riveros Cc: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 695d9f10906e..3d7a3d02b168 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1095,7 +1095,7 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg) return false; } -unsigned int memcg1_stats[] = { +static const unsigned int memcg1_stats[] = { MEMCG_CACHE, MEMCG_RSS, MEMCG_RSS_HUGE, -- cgit v1.2.3 From 1ab5c05695bd514119a15f74d2e43456fe94b0e5 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Wed, 31 Jan 2018 16:20:37 -0800 Subject: mm/memcontrol.c: try harder to decrease [memory,memsw].limit_in_bytes mem_cgroup_resize_[memsw]_limit() tries to free only 32 (SWAP_CLUSTER_MAX) pages on each iteration. This makes it practically impossible to decrease limit of memory cgroup. Tasks could easily allocate back 32 pages, so we can't reduce memory usage, and once retry_count reaches zero we return -EBUSY. Easy to reproduce the problem by running the following commands: mkdir /sys/fs/cgroup/memory/test echo $$ >> /sys/fs/cgroup/memory/test/tasks cat big_file > /dev/null & sleep 1 && echo $((100*1024*1024)) > /sys/fs/cgroup/memory/test/memory.limit_in_bytes -bash: echo: write error: Device or resource busy Instead of relying on retry_count, keep retrying the reclaim until the desired limit is reached or fail if the reclaim doesn't make any progress or a signal is pending. Link: http://lkml.kernel.org/r/20180119132544.19569-1-aryabinin@virtuozzo.com Signed-off-by: Andrey Ryabinin Acked-by: Michal Hocko Reviewed-by: Andrew Morton Cc: Shakeel Butt Cc: Johannes Weiner Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 42 ++++++------------------------------------ 1 file changed, 6 insertions(+), 36 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3d7a3d02b168..0ae2dc3a1748 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1176,20 +1176,6 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) } } -/* - * This function returns the number of memcg under hierarchy tree. Returns - * 1(self count) if no children. - */ -static int mem_cgroup_count_children(struct mem_cgroup *memcg) -{ - int num = 0; - struct mem_cgroup *iter; - - for_each_mem_cgroup_tree(iter, memcg) - num++; - return num; -} - /* * Return the memory (and swap, if configured) limit for a memcg. */ @@ -2463,24 +2449,11 @@ static DEFINE_MUTEX(memcg_limit_mutex); static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, unsigned long limit, bool memsw) { - unsigned long curusage; - unsigned long oldusage; bool enlarge = false; - int retry_count; int ret; bool limits_invariant; struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory; - /* - * For keeping hierarchical_reclaim simple, how long we should retry - * is depends on callers. We set our retry-count to be function - * of # of children which we should visit in this loop. - */ - retry_count = MEM_CGROUP_RECLAIM_RETRIES * - mem_cgroup_count_children(memcg); - - oldusage = page_counter_read(counter); - do { if (signal_pending(current)) { ret = -EINTR; @@ -2507,15 +2480,12 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, if (!ret) break; - try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, !memsw); - - curusage = page_counter_read(counter); - /* Usage is reduced ? */ - if (curusage >= oldusage) - retry_count--; - else - oldusage = curusage; - } while (retry_count); + if (!try_to_free_mem_cgroup_pages(memcg, 1, + GFP_KERNEL, !memsw)) { + ret = -EBUSY; + break; + } + } while (true); if (!ret && enlarge) memcg_oom_recover(memcg); -- cgit v1.2.3 From af0fb9df784174f8cb02c57b33728a6a4f1de9fb Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 31 Jan 2018 16:20:41 -0800 Subject: mm, hugetlb: unify core page allocation accounting and initialization Patch series "mm, hugetlb: allocation API and migration improvements" Motivation: this is a follow up for [3] for the allocation API and [4] for the hugetlb migration. It wasn't really easy to split those into two separate patch series as they share some code. My primary motivation to touch this code is to make the gigantic pages migration working. The giga pages allocation code is just too fragile and hacked into the hugetlb code now. This series tries to move giga pages closer to the first class citizen. We are not there yet but having 5 patches is quite a lot already and it will already make the code much easier to follow. I will come with other changes on top after this sees some review. The first two patches should be trivial to review. The third patch changes the way how we migrate huge pages. Newly allocated pages are a subject of the overcommit check and they participate surplus accounting which is quite unfortunate as the changelog explains. This patch doesn't change anything wrt. giga pages. Patch #4 removes the surplus accounting hack from __alloc_surplus_huge_page. I hope I didn't miss anything there and a deeper review is really due there. Patch #5 finally unifies allocation paths and giga pages shouldn't be any special anymore. There is also some renaming going on as well. This patch (of 6): hugetlb allocator has two entry points to the page allocator - alloc_fresh_huge_page_node - __hugetlb_alloc_buddy_huge_page The two differ very subtly in two aspects. The first one doesn't care about HTLB_BUDDY_* stats and it doesn't initialize the huge page. prep_new_huge_page is not used because it not only initializes hugetlb specific stuff but because it also put_page and releases the page to the hugetlb pool which is not what is required in some contexts. This makes things more complicated than necessary. Simplify things by a) removing the page allocator entry point duplicity and only keep __hugetlb_alloc_buddy_huge_page and b) make prep_new_huge_page more reusable by removing the put_page which moves the page to the allocator pool. All current callers are updated to call put_page explicitly. Later patches will add new callers which won't need it. This patch shouldn't introduce any functional change. Link: http://lkml.kernel.org/r/20180103093213.26329-2-mhocko@kernel.org Signed-off-by: Michal Hocko Reviewed-by: Mike Kravetz Reviewed-by: Naoya Horiguchi Cc: Andrea Reale Cc: Anshuman Khandual Cc: Kirill A. Shutemov Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 61 +++++++++++++++++++++++++++++------------------------------- 1 file changed, 29 insertions(+), 32 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 4137fb67cd79..a8959667f539 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1157,6 +1157,7 @@ static struct page *alloc_fresh_gigantic_page_node(struct hstate *h, int nid) if (page) { prep_compound_gigantic_page(page, huge_page_order(h)); prep_new_huge_page(h, page, nid); + put_page(page); /* free it into the hugepage allocator */ } return page; @@ -1304,7 +1305,6 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) h->nr_huge_pages++; h->nr_huge_pages_node[nid]++; spin_unlock(&hugetlb_lock); - put_page(page); /* free it into the hugepage allocator */ } static void prep_compound_gigantic_page(struct page *page, unsigned int order) @@ -1381,41 +1381,49 @@ pgoff_t __basepage_index(struct page *page) return (index << compound_order(page_head)) + compound_idx; } -static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) +static struct page *__hugetlb_alloc_buddy_huge_page(struct hstate *h, + gfp_t gfp_mask, int nid, nodemask_t *nmask) { + int order = huge_page_order(h); struct page *page; - page = __alloc_pages_node(nid, - htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE| - __GFP_RETRY_MAYFAIL|__GFP_NOWARN, - huge_page_order(h)); - if (page) { - prep_new_huge_page(h, page, nid); - } + gfp_mask |= __GFP_COMP|__GFP_RETRY_MAYFAIL|__GFP_NOWARN; + if (nid == NUMA_NO_NODE) + nid = numa_mem_id(); + page = __alloc_pages_nodemask(gfp_mask, order, nid, nmask); + if (page) + __count_vm_event(HTLB_BUDDY_PGALLOC); + else + __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); return page; } +/* + * Allocates a fresh page to the hugetlb allocator pool in the node interleaved + * manner. + */ static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed) { struct page *page; int nr_nodes, node; - int ret = 0; + gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { - page = alloc_fresh_huge_page_node(h, node); - if (page) { - ret = 1; + page = __hugetlb_alloc_buddy_huge_page(h, gfp_mask, + node, nodes_allowed); + if (page) break; - } + } - if (ret) - count_vm_event(HTLB_BUDDY_PGALLOC); - else - count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); + if (!page) + return 0; - return ret; + prep_new_huge_page(h, page, page_to_nid(page)); + put_page(page); /* free it into the hugepage allocator */ + + return 1; } /* @@ -1523,17 +1531,6 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) return rc; } -static struct page *__hugetlb_alloc_buddy_huge_page(struct hstate *h, - gfp_t gfp_mask, int nid, nodemask_t *nmask) -{ - int order = huge_page_order(h); - - gfp_mask |= __GFP_COMP|__GFP_RETRY_MAYFAIL|__GFP_NOWARN; - if (nid == NUMA_NO_NODE) - nid = numa_mem_id(); - return __alloc_pages_nodemask(gfp_mask, order, nid, nmask); -} - static struct page *__alloc_buddy_huge_page(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask) { @@ -1589,11 +1586,9 @@ static struct page *__alloc_buddy_huge_page(struct hstate *h, gfp_t gfp_mask, */ h->nr_huge_pages_node[r_nid]++; h->surplus_huge_pages_node[r_nid]++; - __count_vm_event(HTLB_BUDDY_PGALLOC); } else { h->nr_huge_pages--; h->surplus_huge_pages--; - __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); } spin_unlock(&hugetlb_lock); @@ -2148,6 +2143,8 @@ static void __init gather_bootmem_prealloc(void) prep_compound_huge_page(page, h->order); WARN_ON(PageReserved(page)); prep_new_huge_page(h, page, page_to_nid(page)); + put_page(page); /* free it into the hugepage allocator */ + /* * If we had gigantic hugepages allocated at boot time, we need * to restore the 'stolen' pages to totalram_pages in order to -- cgit v1.2.3 From d9cc948f6fa1c3384037f500e0acd35f03850d15 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 31 Jan 2018 16:20:44 -0800 Subject: mm, hugetlb: integrate giga hugetlb more naturally to the allocation path Gigantic hugetlb pages were ingrown to the hugetlb code as an alien specie with a lot of special casing. The allocation path is not an exception. Unnecessarily so to be honest. It is true that the underlying allocator is different but that is an implementation detail. This patch unifies the hugetlb allocation path that a prepares fresh pool pages. alloc_fresh_gigantic_page basically copies alloc_fresh_huge_page logic so we can move everything there. This will simplify set_max_huge_pages which doesn't have to care about what kind of huge page we allocate. Link: http://lkml.kernel.org/r/20180103093213.26329-3-mhocko@kernel.org Signed-off-by: Michal Hocko Reviewed-by: Mike Kravetz Reviewed-by: Naoya Horiguchi Cc: Andrea Reale Cc: Anshuman Khandual Cc: Kirill A. Shutemov Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 55 ++++++++++++++----------------------------------------- 1 file changed, 14 insertions(+), 41 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a8959667f539..360765156c7c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1106,7 +1106,8 @@ static bool zone_spans_last_pfn(const struct zone *zone, return zone_spans_pfn(zone, last_pfn); } -static struct page *alloc_gigantic_page(int nid, struct hstate *h) +static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, + int nid, nodemask_t *nodemask) { unsigned int order = huge_page_order(h); unsigned long nr_pages = 1 << order; @@ -1114,11 +1115,9 @@ static struct page *alloc_gigantic_page(int nid, struct hstate *h) struct zonelist *zonelist; struct zone *zone; struct zoneref *z; - gfp_t gfp_mask; - gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; zonelist = node_zonelist(nid, gfp_mask); - for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), NULL) { + for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nodemask) { spin_lock_irqsave(&zone->lock, flags); pfn = ALIGN(zone->zone_start_pfn, nr_pages); @@ -1149,42 +1148,13 @@ static struct page *alloc_gigantic_page(int nid, struct hstate *h) static void prep_new_huge_page(struct hstate *h, struct page *page, int nid); static void prep_compound_gigantic_page(struct page *page, unsigned int order); -static struct page *alloc_fresh_gigantic_page_node(struct hstate *h, int nid) -{ - struct page *page; - - page = alloc_gigantic_page(nid, h); - if (page) { - prep_compound_gigantic_page(page, huge_page_order(h)); - prep_new_huge_page(h, page, nid); - put_page(page); /* free it into the hugepage allocator */ - } - - return page; -} - -static int alloc_fresh_gigantic_page(struct hstate *h, - nodemask_t *nodes_allowed) -{ - struct page *page = NULL; - int nr_nodes, node; - - for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { - page = alloc_fresh_gigantic_page_node(h, node); - if (page) - return 1; - } - - return 0; -} - #else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */ static inline bool gigantic_page_supported(void) { return false; } +static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, + int nid, nodemask_t *nodemask) { return NULL; } static inline void free_gigantic_page(struct page *page, unsigned int order) { } static inline void destroy_compound_gigantic_page(struct page *page, unsigned int order) { } -static inline int alloc_fresh_gigantic_page(struct hstate *h, - nodemask_t *nodes_allowed) { return 0; } #endif static void update_and_free_page(struct hstate *h, struct page *page) @@ -1410,8 +1380,12 @@ static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed) gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { - page = __hugetlb_alloc_buddy_huge_page(h, gfp_mask, - node, nodes_allowed); + if (hstate_is_gigantic(h)) + page = alloc_gigantic_page(h, gfp_mask, + node, nodes_allowed); + else + page = __hugetlb_alloc_buddy_huge_page(h, gfp_mask, + node, nodes_allowed); if (page) break; @@ -1420,6 +1394,8 @@ static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed) if (!page) return 0; + if (hstate_is_gigantic(h)) + prep_compound_gigantic_page(page, huge_page_order(h)); prep_new_huge_page(h, page, page_to_nid(page)); put_page(page); /* free it into the hugepage allocator */ @@ -2307,10 +2283,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, /* yield cpu to avoid soft lockup */ cond_resched(); - if (hstate_is_gigantic(h)) - ret = alloc_fresh_gigantic_page(h, nodes_allowed); - else - ret = alloc_fresh_huge_page(h, nodes_allowed); + ret = alloc_fresh_huge_page(h, nodes_allowed); spin_lock(&hugetlb_lock); if (!ret) goto out; -- cgit v1.2.3 From ab5ac90aecf5685eb630c42c396f5f14726b0afd Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 31 Jan 2018 16:20:48 -0800 Subject: mm, hugetlb: do not rely on overcommit limit during migration hugepage migration relies on __alloc_buddy_huge_page to get a new page. This has 2 main disadvantages. 1) it doesn't allow to migrate any huge page if the pool is used completely which is not an exceptional case as the pool is static and unused memory is just wasted. 2) it leads to a weird semantic when migration between two numa nodes might increase the pool size of the destination NUMA node while the page is in use. The issue is caused by per NUMA node surplus pages tracking (see free_huge_page). Address both issues by changing the way how we allocate and account pages allocated for migration. Those should temporal by definition. So we mark them that way (we will abuse page flags in the 3rd page) and update free_huge_page to free such pages to the page allocator. Page migration path then just transfers the temporal status from the new page to the old one which will be freed on the last reference. The global surplus count will never change during this path but we still have to be careful when migrating a per-node suprlus page. This is now handled in move_hugetlb_state which is called from the migration path and it copies the hugetlb specific page state and fixes up the accounting when needed Rename __alloc_buddy_huge_page to __alloc_surplus_huge_page to better reflect its purpose. The new allocation routine for the migration path is __alloc_migrate_huge_page. The user visible effect of this patch is that migrated pages are really temporal and they travel between NUMA nodes as per the migration request: Before migration /sys/devices/system/node/node0/hugepages/hugepages-2048kB/free_hugepages:0 /sys/devices/system/node/node0/hugepages/hugepages-2048kB/nr_hugepages:1 /sys/devices/system/node/node0/hugepages/hugepages-2048kB/surplus_hugepages:0 /sys/devices/system/node/node1/hugepages/hugepages-2048kB/free_hugepages:0 /sys/devices/system/node/node1/hugepages/hugepages-2048kB/nr_hugepages:0 /sys/devices/system/node/node1/hugepages/hugepages-2048kB/surplus_hugepages:0 After /sys/devices/system/node/node0/hugepages/hugepages-2048kB/free_hugepages:0 /sys/devices/system/node/node0/hugepages/hugepages-2048kB/nr_hugepages:0 /sys/devices/system/node/node0/hugepages/hugepages-2048kB/surplus_hugepages:0 /sys/devices/system/node/node1/hugepages/hugepages-2048kB/free_hugepages:0 /sys/devices/system/node/node1/hugepages/hugepages-2048kB/nr_hugepages:1 /sys/devices/system/node/node1/hugepages/hugepages-2048kB/surplus_hugepages:0 with the previous implementation, both nodes would have nr_hugepages:1 until the page is freed. Link: http://lkml.kernel.org/r/20180103093213.26329-4-mhocko@kernel.org Signed-off-by: Michal Hocko Reviewed-by: Mike Kravetz Reviewed-by: Naoya Horiguchi Cc: Andrea Reale Cc: Anshuman Khandual Cc: Kirill A. Shutemov Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 3 ++ mm/hugetlb.c | 111 +++++++++++++++++++++++++++++++++++++++++------- mm/migrate.c | 3 +- 3 files changed, 99 insertions(+), 18 deletions(-) (limited to 'mm') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 944e6e8bd572..66992348531e 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -119,6 +119,7 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end, long freed); bool isolate_huge_page(struct page *page, struct list_head *list); void putback_active_hugepage(struct page *page); +void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason); void free_huge_page(struct page *page); void hugetlb_fix_reserve_counts(struct inode *inode); extern struct mutex *hugetlb_fault_mutex_table; @@ -157,6 +158,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot); bool is_hugetlb_entry_migration(pte_t pte); + #else /* !CONFIG_HUGETLB_PAGE */ static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma) @@ -197,6 +199,7 @@ static inline bool isolate_huge_page(struct page *page, struct list_head *list) return false; } #define putback_active_hugepage(p) do {} while (0) +#define move_hugetlb_state(old, new, reason) do {} while (0) static inline unsigned long hugetlb_change_protection(struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 360765156c7c..f260ffa26363 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -34,6 +34,7 @@ #include #include #include +#include #include "internal.h" int hugetlb_max_hstate __read_mostly; @@ -1219,6 +1220,28 @@ static void clear_page_huge_active(struct page *page) ClearPagePrivate(&page[1]); } +/* + * Internal hugetlb specific page flag. Do not use outside of the hugetlb + * code + */ +static inline bool PageHugeTemporary(struct page *page) +{ + if (!PageHuge(page)) + return false; + + return (unsigned long)page[2].mapping == -1U; +} + +static inline void SetPageHugeTemporary(struct page *page) +{ + page[2].mapping = (void *)-1U; +} + +static inline void ClearPageHugeTemporary(struct page *page) +{ + page[2].mapping = NULL; +} + void free_huge_page(struct page *page) { /* @@ -1253,7 +1276,11 @@ void free_huge_page(struct page *page) if (restore_reserve) h->resv_huge_pages++; - if (h->surplus_huge_pages_node[nid]) { + if (PageHugeTemporary(page)) { + list_del(&page->lru); + ClearPageHugeTemporary(page); + update_and_free_page(h, page); + } else if (h->surplus_huge_pages_node[nid]) { /* remove the page from active list */ list_del(&page->lru); update_and_free_page(h, page); @@ -1507,7 +1534,10 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) return rc; } -static struct page *__alloc_buddy_huge_page(struct hstate *h, gfp_t gfp_mask, +/* + * Allocates a fresh surplus page from the page allocator. + */ +static struct page *__alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask) { struct page *page; @@ -1571,6 +1601,28 @@ static struct page *__alloc_buddy_huge_page(struct hstate *h, gfp_t gfp_mask, return page; } +static struct page *__alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, + int nid, nodemask_t *nmask) +{ + struct page *page; + + if (hstate_is_gigantic(h)) + return NULL; + + page = __hugetlb_alloc_buddy_huge_page(h, gfp_mask, nid, nmask); + if (!page) + return NULL; + + /* + * We do not account these pages as surplus because they are only + * temporary and will be released properly on the last reference + */ + prep_new_huge_page(h, page, page_to_nid(page)); + SetPageHugeTemporary(page); + + return page; +} + /* * Use the VMA's mpolicy to allocate a huge page from the buddy. */ @@ -1585,17 +1637,13 @@ struct page *__alloc_buddy_huge_page_with_mpol(struct hstate *h, nodemask_t *nodemask; nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask); - page = __alloc_buddy_huge_page(h, gfp_mask, nid, nodemask); + page = __alloc_surplus_huge_page(h, gfp_mask, nid, nodemask); mpol_cond_put(mpol); return page; } -/* - * This allocation function is useful in the context where vma is irrelevant. - * E.g. soft-offlining uses this function because it only cares physical - * address of error page. - */ +/* page migration callback function */ struct page *alloc_huge_page_node(struct hstate *h, int nid) { gfp_t gfp_mask = htlb_alloc_mask(h); @@ -1610,12 +1658,12 @@ struct page *alloc_huge_page_node(struct hstate *h, int nid) spin_unlock(&hugetlb_lock); if (!page) - page = __alloc_buddy_huge_page(h, gfp_mask, nid, NULL); + page = __alloc_migrate_huge_page(h, gfp_mask, nid, NULL); return page; } - +/* page migration callback function */ struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask) { @@ -1633,9 +1681,7 @@ struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, } spin_unlock(&hugetlb_lock); - /* No reservations, try to overcommit */ - - return __alloc_buddy_huge_page(h, gfp_mask, preferred_nid, nmask); + return __alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask); } /* @@ -1663,7 +1709,7 @@ static int gather_surplus_pages(struct hstate *h, int delta) retry: spin_unlock(&hugetlb_lock); for (i = 0; i < needed; i++) { - page = __alloc_buddy_huge_page(h, htlb_alloc_mask(h), + page = __alloc_surplus_huge_page(h, htlb_alloc_mask(h), NUMA_NO_NODE, NULL); if (!page) { alloc_ok = false; @@ -2260,7 +2306,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, * First take pages out of surplus state. Then make up the * remaining difference by allocating fresh huge pages. * - * We might race with __alloc_buddy_huge_page() here and be unable + * We might race with __alloc_surplus_huge_page() here and be unable * to convert a surplus huge page to a normal huge page. That is * not critical, though, it just means the overall size of the * pool might be one hugepage larger than it needs to be, but @@ -2303,7 +2349,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, * By placing pages into the surplus state independent of the * overcommit value, we are allowing the surplus pool size to * exceed overcommit. There are few sane options here. Since - * __alloc_buddy_huge_page() is checking the global counter, + * __alloc_surplus_huge_page() is checking the global counter, * though, we'll note that we're not allowed to exceed surplus * and won't grow the pool anywhere else. Not until one of the * sysctls are changed, or the surplus pages go out of use. @@ -4779,3 +4825,36 @@ void putback_active_hugepage(struct page *page) spin_unlock(&hugetlb_lock); put_page(page); } + +void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason) +{ + struct hstate *h = page_hstate(oldpage); + + hugetlb_cgroup_migrate(oldpage, newpage); + set_page_owner_migrate_reason(newpage, reason); + + /* + * transfer temporary state of the new huge page. This is + * reverse to other transitions because the newpage is going to + * be final while the old one will be freed so it takes over + * the temporary status. + * + * Also note that we have to transfer the per-node surplus state + * here as well otherwise the global surplus count will not match + * the per-node's. + */ + if (PageHugeTemporary(newpage)) { + int old_nid = page_to_nid(oldpage); + int new_nid = page_to_nid(newpage); + + SetPageHugeTemporary(oldpage); + ClearPageHugeTemporary(newpage); + + spin_lock(&hugetlb_lock); + if (h->surplus_huge_pages_node[old_nid]) { + h->surplus_huge_pages_node[old_nid]--; + h->surplus_huge_pages_node[new_nid]++; + } + spin_unlock(&hugetlb_lock); + } +} diff --git a/mm/migrate.c b/mm/migrate.c index 4d0be47a322a..1e5525a25691 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1323,9 +1323,8 @@ put_anon: put_anon_vma(anon_vma); if (rc == MIGRATEPAGE_SUCCESS) { - hugetlb_cgroup_migrate(hpage, new_hpage); + move_hugetlb_state(hpage, new_hpage, reason); put_new_page = NULL; - set_page_owner_migrate_reason(new_hpage, reason); } unlock_page(hpage); -- cgit v1.2.3 From 9980d744a04281c65a8849c437c8ab9fec2db17b Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 31 Jan 2018 16:20:52 -0800 Subject: mm, hugetlb: get rid of surplus page accounting tricks alloc_surplus_huge_page increases the pool size and the number of surplus pages opportunistically to prevent from races with the pool size change. See commit d1c3fb1f8f29 ("hugetlb: introduce nr_overcommit_hugepages sysctl") for more details. The resulting code is unnecessarily hairy, cause code duplication and doesn't allow to share the allocation paths. Moreover pool size changes tend to be very seldom so optimizing for them is not really reasonable. Simplify the code and allow to allocate a fresh surplus page as long as we are under the overcommit limit and then recheck the condition after the allocation and drop the new page if the situation has changed. This should provide a reasonable guarantee that an abrupt allocation requests will not go way off the limit. If we consider races with the pool shrinking and enlarging then we should be reasonably safe as well. In the first case we are off by one in the worst case and the second case should work OK because the page is not yet visible. We can waste CPU cycles for the allocation but that should be acceptable for a relatively rare condition. Link: http://lkml.kernel.org/r/20180103093213.26329-5-mhocko@kernel.org Signed-off-by: Michal Hocko Reviewed-by: Mike Kravetz Reviewed-by: Naoya Horiguchi Cc: Andrea Reale Cc: Anshuman Khandual Cc: Kirill A. Shutemov Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 62 ++++++++++++++++++++++-------------------------------------- 1 file changed, 23 insertions(+), 39 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f260ffa26363..7dc80cbe8e89 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1540,62 +1540,46 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) static struct page *__alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask) { - struct page *page; - unsigned int r_nid; + struct page *page = NULL; if (hstate_is_gigantic(h)) return NULL; - /* - * Assume we will successfully allocate the surplus page to - * prevent racing processes from causing the surplus to exceed - * overcommit - * - * This however introduces a different race, where a process B - * tries to grow the static hugepage pool while alloc_pages() is - * called by process A. B will only examine the per-node - * counters in determining if surplus huge pages can be - * converted to normal huge pages in adjust_pool_surplus(). A - * won't be able to increment the per-node counter, until the - * lock is dropped by B, but B doesn't drop hugetlb_lock until - * no more huge pages can be converted from surplus to normal - * state (and doesn't try to convert again). Thus, we have a - * case where a surplus huge page exists, the pool is grown, and - * the surplus huge page still exists after, even though it - * should just have been converted to a normal huge page. This - * does not leak memory, though, as the hugepage will be freed - * once it is out of use. It also does not allow the counters to - * go out of whack in adjust_pool_surplus() as we don't modify - * the node values until we've gotten the hugepage and only the - * per-node value is checked there. - */ spin_lock(&hugetlb_lock); - if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) { - spin_unlock(&hugetlb_lock); - return NULL; - } else { - h->nr_huge_pages++; - h->surplus_huge_pages++; - } + if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) + goto out_unlock; spin_unlock(&hugetlb_lock); page = __hugetlb_alloc_buddy_huge_page(h, gfp_mask, nid, nmask); + if (!page) + goto out_unlock; spin_lock(&hugetlb_lock); - if (page) { + /* + * We could have raced with the pool size change. + * Double check that and simply deallocate the new page + * if we would end up overcommiting the surpluses. Abuse + * temporary page to workaround the nasty free_huge_page + * codeflow + */ + if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) { + SetPageHugeTemporary(page); + put_page(page); + page = NULL; + } else { + int r_nid; + + h->surplus_huge_pages++; + h->nr_huge_pages++; INIT_LIST_HEAD(&page->lru); r_nid = page_to_nid(page); set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); set_hugetlb_cgroup(page, NULL); - /* - * We incremented the global counters already - */ h->nr_huge_pages_node[r_nid]++; h->surplus_huge_pages_node[r_nid]++; - } else { - h->nr_huge_pages--; - h->surplus_huge_pages--; } + +out_unlock: spin_unlock(&hugetlb_lock); return page; -- cgit v1.2.3 From 0c397daea1d456f304e00413ee9e90a1830868a5 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 31 Jan 2018 16:20:56 -0800 Subject: mm, hugetlb: further simplify hugetlb allocation API Hugetlb allocator has several layer of allocation functions depending and the purpose of the allocation. There are two allocators depending on whether the page can be allocated from the page allocator or we need a contiguous allocator. This is currently opencoded in alloc_fresh_huge_page which is the only path that might allocate giga pages which require the later allocator. Create alloc_fresh_huge_page which hides this implementation detail and use it in all callers which hardcoded the buddy allocator path (__hugetlb_alloc_buddy_huge_page). This shouldn't introduce any funtional change because both migration and surplus allocators exlude giga pages explicitly. While we are at it let's do some renaming. The current scheme is not consistent and overly painfull to read and understand. Get rid of prefix underscores from most functions. There is no real reason to make names longer. * alloc_fresh_huge_page is the new layer to abstract underlying allocator * __hugetlb_alloc_buddy_huge_page becomes shorter and neater alloc_buddy_huge_page. * Former alloc_fresh_huge_page becomes alloc_pool_huge_page because we put the new page directly to the pool * alloc_surplus_huge_page can drop the opencoded prep_new_huge_page code as it uses alloc_fresh_huge_page now * others lose their excessive prefix underscores to make names shorter [dan.carpenter@oracle.com: fix double unlock bug in alloc_surplus_huge_page()] Link: http://lkml.kernel.org/r/20180109200559.g3iz5kvbdrz7yydp@mwanda Link: http://lkml.kernel.org/r/20180103093213.26329-6-mhocko@kernel.org Signed-off-by: Michal Hocko Reviewed-by: Mike Kravetz Reviewed-by: Naoya Horiguchi Cc: Andrea Reale Cc: Anshuman Khandual Cc: Kirill A. Shutemov Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Dan Carpenter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 80 ++++++++++++++++++++++++++++++++---------------------------- 1 file changed, 43 insertions(+), 37 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 7dc80cbe8e89..b55886af82aa 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1378,7 +1378,7 @@ pgoff_t __basepage_index(struct page *page) return (index << compound_order(page_head)) + compound_idx; } -static struct page *__hugetlb_alloc_buddy_huge_page(struct hstate *h, +static struct page *alloc_buddy_huge_page(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask) { int order = huge_page_order(h); @@ -1396,34 +1396,49 @@ static struct page *__hugetlb_alloc_buddy_huge_page(struct hstate *h, return page; } +/* + * Common helper to allocate a fresh hugetlb page. All specific allocators + * should use this function to get new hugetlb pages + */ +static struct page *alloc_fresh_huge_page(struct hstate *h, + gfp_t gfp_mask, int nid, nodemask_t *nmask) +{ + struct page *page; + + if (hstate_is_gigantic(h)) + page = alloc_gigantic_page(h, gfp_mask, nid, nmask); + else + page = alloc_buddy_huge_page(h, gfp_mask, + nid, nmask); + if (!page) + return NULL; + + if (hstate_is_gigantic(h)) + prep_compound_gigantic_page(page, huge_page_order(h)); + prep_new_huge_page(h, page, page_to_nid(page)); + + return page; +} + /* * Allocates a fresh page to the hugetlb allocator pool in the node interleaved * manner. */ -static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed) +static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed) { struct page *page; int nr_nodes, node; gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { - if (hstate_is_gigantic(h)) - page = alloc_gigantic_page(h, gfp_mask, - node, nodes_allowed); - else - page = __hugetlb_alloc_buddy_huge_page(h, gfp_mask, - node, nodes_allowed); + page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed); if (page) break; - } if (!page) return 0; - if (hstate_is_gigantic(h)) - prep_compound_gigantic_page(page, huge_page_order(h)); - prep_new_huge_page(h, page, page_to_nid(page)); put_page(page); /* free it into the hugepage allocator */ return 1; @@ -1537,7 +1552,7 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) /* * Allocates a fresh surplus page from the page allocator. */ -static struct page *__alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, +static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask) { struct page *page = NULL; @@ -1550,9 +1565,9 @@ static struct page *__alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, goto out_unlock; spin_unlock(&hugetlb_lock); - page = __hugetlb_alloc_buddy_huge_page(h, gfp_mask, nid, nmask); + page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask); if (!page) - goto out_unlock; + return NULL; spin_lock(&hugetlb_lock); /* @@ -1567,16 +1582,8 @@ static struct page *__alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, put_page(page); page = NULL; } else { - int r_nid; - h->surplus_huge_pages++; - h->nr_huge_pages++; - INIT_LIST_HEAD(&page->lru); - r_nid = page_to_nid(page); - set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); - set_hugetlb_cgroup(page, NULL); - h->nr_huge_pages_node[r_nid]++; - h->surplus_huge_pages_node[r_nid]++; + h->nr_huge_pages_node[page_to_nid(page)]++; } out_unlock: @@ -1585,7 +1592,7 @@ out_unlock: return page; } -static struct page *__alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, +static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask) { struct page *page; @@ -1593,7 +1600,7 @@ static struct page *__alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, if (hstate_is_gigantic(h)) return NULL; - page = __hugetlb_alloc_buddy_huge_page(h, gfp_mask, nid, nmask); + page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask); if (!page) return NULL; @@ -1601,7 +1608,6 @@ static struct page *__alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, * We do not account these pages as surplus because they are only * temporary and will be released properly on the last reference */ - prep_new_huge_page(h, page, page_to_nid(page)); SetPageHugeTemporary(page); return page; @@ -1611,7 +1617,7 @@ static struct page *__alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, * Use the VMA's mpolicy to allocate a huge page from the buddy. */ static -struct page *__alloc_buddy_huge_page_with_mpol(struct hstate *h, +struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { struct page *page; @@ -1621,7 +1627,7 @@ struct page *__alloc_buddy_huge_page_with_mpol(struct hstate *h, nodemask_t *nodemask; nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask); - page = __alloc_surplus_huge_page(h, gfp_mask, nid, nodemask); + page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask); mpol_cond_put(mpol); return page; @@ -1642,7 +1648,7 @@ struct page *alloc_huge_page_node(struct hstate *h, int nid) spin_unlock(&hugetlb_lock); if (!page) - page = __alloc_migrate_huge_page(h, gfp_mask, nid, NULL); + page = alloc_migrate_huge_page(h, gfp_mask, nid, NULL); return page; } @@ -1665,7 +1671,7 @@ struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, } spin_unlock(&hugetlb_lock); - return __alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask); + return alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask); } /* @@ -1693,7 +1699,7 @@ static int gather_surplus_pages(struct hstate *h, int delta) retry: spin_unlock(&hugetlb_lock); for (i = 0; i < needed; i++) { - page = __alloc_surplus_huge_page(h, htlb_alloc_mask(h), + page = alloc_surplus_huge_page(h, htlb_alloc_mask(h), NUMA_NO_NODE, NULL); if (!page) { alloc_ok = false; @@ -2030,7 +2036,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg); if (!page) { spin_unlock(&hugetlb_lock); - page = __alloc_buddy_huge_page_with_mpol(h, vma, addr); + page = alloc_buddy_huge_page_with_mpol(h, vma, addr); if (!page) goto out_uncharge_cgroup; if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) { @@ -2170,7 +2176,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) if (hstate_is_gigantic(h)) { if (!alloc_bootmem_huge_page(h)) break; - } else if (!alloc_fresh_huge_page(h, + } else if (!alloc_pool_huge_page(h, &node_states[N_MEMORY])) break; cond_resched(); @@ -2290,7 +2296,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, * First take pages out of surplus state. Then make up the * remaining difference by allocating fresh huge pages. * - * We might race with __alloc_surplus_huge_page() here and be unable + * We might race with alloc_surplus_huge_page() here and be unable * to convert a surplus huge page to a normal huge page. That is * not critical, though, it just means the overall size of the * pool might be one hugepage larger than it needs to be, but @@ -2313,7 +2319,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, /* yield cpu to avoid soft lockup */ cond_resched(); - ret = alloc_fresh_huge_page(h, nodes_allowed); + ret = alloc_pool_huge_page(h, nodes_allowed); spin_lock(&hugetlb_lock); if (!ret) goto out; @@ -2333,7 +2339,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, * By placing pages into the surplus state independent of the * overcommit value, we are allowing the surplus pool size to * exceed overcommit. There are few sane options here. Since - * __alloc_surplus_huge_page() is checking the global counter, + * alloc_surplus_huge_page() is checking the global counter, * though, we'll note that we're not allowed to exceed surplus * and won't grow the pool anywhere else. Not until one of the * sysctls are changed, or the surplus pages go out of use. -- cgit v1.2.3 From ebd637235890a3fa6a6d4bb57522098f2f59c693 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 31 Jan 2018 16:21:00 -0800 Subject: hugetlb, mempolicy: fix the mbind hugetlb migration do_mbind migration code relies on alloc_huge_page_noerr for hugetlb pages. alloc_huge_page_noerr uses alloc_huge_page which is a highlevel allocation function which has to take care of reserves, overcommit or hugetlb cgroup accounting. None of that is really required for the page migration because the new page is only temporal and either will replace the original page or it will be dropped. This is essentially as for other migration call paths and there shouldn't be any reason to handle mbind in a special way. The current implementation is even suboptimal because the migration might fail just because the hugetlb cgroup limit is reached, or the overcommit is saturated. Fix this by making mbind like other hugetlb migration paths. Add a new migration helper alloc_huge_page_vma as a wrapper around alloc_huge_page_nodemask with additional mempolicy handling. alloc_huge_page_noerr has no more users and it can go. Link: http://lkml.kernel.org/r/20180103093213.26329-7-mhocko@kernel.org Signed-off-by: Michal Hocko Reviewed-by: Mike Kravetz Reviewed-by: Naoya Horiguchi Cc: Andrea Reale Cc: Anshuman Khandual Cc: Kirill A. Shutemov Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 5 ++--- mm/hugetlb.c | 33 +++++++++++++++++++-------------- mm/mempolicy.c | 3 +-- 3 files changed, 22 insertions(+), 19 deletions(-) (limited to 'mm') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 66992348531e..612a29b7f6c6 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -356,10 +356,9 @@ struct huge_bootmem_page { struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve); struct page *alloc_huge_page_node(struct hstate *h, int nid); -struct page *alloc_huge_page_noerr(struct vm_area_struct *vma, - unsigned long addr, int avoid_reserve); struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask); +struct page *alloc_huge_page_vma(struct vm_area_struct *vma, unsigned long address); int huge_add_to_page_cache(struct page *page, struct address_space *mapping, pgoff_t idx); @@ -537,7 +536,7 @@ struct hstate {}; #define alloc_huge_page(v, a, r) NULL #define alloc_huge_page_node(h, nid) NULL #define alloc_huge_page_nodemask(h, preferred_nid, nmask) NULL -#define alloc_huge_page_noerr(v, a, r) NULL +#define alloc_huge_page_vma(vma, address) NULL #define alloc_bootmem_huge_page(h) NULL #define hstate_file(f) NULL #define hstate_sizelog(s) NULL diff --git a/mm/hugetlb.c b/mm/hugetlb.c index b55886af82aa..742a929f2311 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1674,6 +1674,25 @@ struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, return alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask); } +/* mempolicy aware migration callback */ +struct page *alloc_huge_page_vma(struct vm_area_struct *vma, unsigned long address) +{ + struct mempolicy *mpol; + nodemask_t *nodemask; + struct page *page; + struct hstate *h; + gfp_t gfp_mask; + int node; + + h = hstate_vma(vma); + gfp_mask = htlb_alloc_mask(h); + node = huge_node(vma, address, gfp_mask, &mpol, &nodemask); + page = alloc_huge_page_nodemask(h, node, nodemask); + mpol_cond_put(mpol); + + return page; +} + /* * Increase the hugetlb pool such that it can accommodate a reservation * of size 'delta'. @@ -2079,20 +2098,6 @@ out_subpool_put: return ERR_PTR(-ENOSPC); } -/* - * alloc_huge_page()'s wrapper which simply returns the page if allocation - * succeeds, otherwise NULL. This function is called from new_vma_page(), - * where no ERR_VALUE is expected to be returned. - */ -struct page *alloc_huge_page_noerr(struct vm_area_struct *vma, - unsigned long addr, int avoid_reserve) -{ - struct page *page = alloc_huge_page(vma, addr, avoid_reserve); - if (IS_ERR(page)) - page = NULL; - return page; -} - int alloc_bootmem_huge_page(struct hstate *h) __attribute__ ((weak, alias("__alloc_bootmem_huge_page"))); int __alloc_bootmem_huge_page(struct hstate *h) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index f604b22ebb65..96823fa07f38 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1121,8 +1121,7 @@ static struct page *new_page(struct page *page, unsigned long start, int **x) } if (PageHuge(page)) { - BUG_ON(!vma); - return alloc_huge_page_noerr(vma, address, 1); + return alloc_huge_page_vma(vma, address); } else if (thp_migration_supported() && PageTransHuge(page)) { struct page *thp; -- cgit v1.2.3 From 389c8178d0904f944887ccca2256ff9d79c12e8e Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 31 Jan 2018 16:21:03 -0800 Subject: hugetlb, mbind: fall back to default policy if vma is NULL Dan Carpenter has noticed that mbind migration callback (new_page) can get a NULL vma pointer and choke on it inside alloc_huge_page_vma which relies on the VMA to get the hstate. We used to BUG_ON this case but the BUG_+ON has been removed recently by "hugetlb, mempolicy: fix the mbind hugetlb migration". The proper way to handle this is to get the hstate from the migrated page and rely on huge_node (resp. get_vma_policy) do the right thing with null VMA. We are currently falling back to the default mempolicy in that case which is in line what THP path is doing here. Link: http://lkml.kernel.org/r/20180110104712.GR1732@dhcp22.suse.cz Signed-off-by: Michal Hocko Reported-by: Dan Carpenter Cc: Naoya Horiguchi Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 5 +++-- mm/hugetlb.c | 5 ++--- mm/mempolicy.c | 3 ++- 3 files changed, 7 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 612a29b7f6c6..36fa6a2a82e3 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -358,7 +358,8 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, struct page *alloc_huge_page_node(struct hstate *h, int nid); struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask); -struct page *alloc_huge_page_vma(struct vm_area_struct *vma, unsigned long address); +struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, + unsigned long address); int huge_add_to_page_cache(struct page *page, struct address_space *mapping, pgoff_t idx); @@ -536,7 +537,7 @@ struct hstate {}; #define alloc_huge_page(v, a, r) NULL #define alloc_huge_page_node(h, nid) NULL #define alloc_huge_page_nodemask(h, preferred_nid, nmask) NULL -#define alloc_huge_page_vma(vma, address) NULL +#define alloc_huge_page_vma(h, vma, address) NULL #define alloc_bootmem_huge_page(h) NULL #define hstate_file(f) NULL #define hstate_sizelog(s) NULL diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 742a929f2311..7c204e3d132b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1675,16 +1675,15 @@ struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, } /* mempolicy aware migration callback */ -struct page *alloc_huge_page_vma(struct vm_area_struct *vma, unsigned long address) +struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, + unsigned long address) { struct mempolicy *mpol; nodemask_t *nodemask; struct page *page; - struct hstate *h; gfp_t gfp_mask; int node; - h = hstate_vma(vma); gfp_mask = htlb_alloc_mask(h); node = huge_node(vma, address, gfp_mask, &mpol, &nodemask); page = alloc_huge_page_nodemask(h, node, nodemask); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 96823fa07f38..d879f1d8a44a 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1121,7 +1121,8 @@ static struct page *new_page(struct page *page, unsigned long start, int **x) } if (PageHuge(page)) { - return alloc_huge_page_vma(vma, address); + return alloc_huge_page_vma(page_hstate(compound_head(page)), + vma, address); } else if (thp_migration_supported() && PageTransHuge(page)) { struct page *thp; -- cgit v1.2.3 From 859d4adc3415a64ccb8b0c50dc4e3a888dcb5805 Mon Sep 17 00:00:00 2001 From: Henry Willard Date: Wed, 31 Jan 2018 16:21:07 -0800 Subject: mm: numa: do not trap faults on shared data section pages. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Workloads consisting of a large number of processes running the same program with a very large shared data segment may experience performance problems when numa balancing attempts to migrate the shared cow pages. This manifests itself with many processes or tasks in TASK_UNINTERRUPTIBLE state waiting for the shared pages to be migrated. The program listed below simulates the conditions with these results when run with 288 processes on a 144 core/8 socket machine. Average throughput Average throughput Average throughput with numa_balancing=0 with numa_balancing=1 with numa_balancing=1 without the patch with the patch --------------------- --------------------- --------------------- 2118782 2021534 2107979 Complex production environments show less variability and fewer poorly performing outliers accompanied with a smaller number of processes waiting on NUMA page migration with this patch applied. In some cases, %iowait drops from 16%-26% to 0. // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2017 Oracle and/or its affiliates. All rights reserved. */ #include #include #include #include int a[1000000] = {13}; int main(int argc, const char **argv) { int n = 0; int i; pid_t pid; int stat; int *count_array; int cpu_count = 288; long total = 0; struct timeval t1, t2 = {(argc > 1 ? atoi(argv[1]) : 10), 0}; if (argc > 2) cpu_count = atoi(argv[2]); count_array = mmap(NULL, cpu_count * sizeof(int), (PROT_READ|PROT_WRITE), (MAP_SHARED|MAP_ANONYMOUS), 0, 0); if (count_array == MAP_FAILED) { perror("mmap:"); return 0; } for (i = 0; i < cpu_count; ++i) { pid = fork(); if (pid <= 0) break; if ((i & 0xf) == 0) usleep(2); } if (pid != 0) { if (i == 0) { perror("fork:"); return 0; } for (;;) { pid = wait(&stat); if (pid < 0) break; } for (i = 0; i < cpu_count; ++i) total += count_array[i]; printf("Total %ld\n", total); munmap(count_array, cpu_count * sizeof(int)); return 0; } gettimeofday(&t1, 0); timeradd(&t1, &t2, &t1); while (timercmp(&t2, &t1, <)) { int b = 0; int j; for (j = 0; j < 1000000; j++) b += a[j]; gettimeofday(&t2, 0); n++; } count_array[i] = n; return 0; } This patch changes change_pte_range() to skip shared copy-on-write pages when called from change_prot_numa(). NOTE: change_prot_numa() is nominally called from task_numa_work() and queue_pages_test_walk(). task_numa_work() is the auto NUMA balancing path, and queue_pages_test_walk() is part of explicit NUMA policy management. However, queue_pages_test_walk() only calls change_prot_numa() when MPOL_MF_LAZY is specified and currently that is not allowed, so change_prot_numa() is only called from auto NUMA balancing. In the case of explicit NUMA policy management, shared pages are not migrated unless MPOL_MF_MOVE_ALL is specified, and MPOL_MF_MOVE_ALL depends on CAP_SYS_NICE. Currently, there is no way to pass information about MPOL_MF_MOVE_ALL to change_pte_range. This will have to be fixed if MPOL_MF_LAZY is enabled and MPOL_MF_MOVE_ALL is to be honored in lazy migration mode. task_numa_work() skips the read-only VMAs of programs and shared libraries. Link: http://lkml.kernel.org/r/1516751617-7369-1-git-send-email-henry.willard@oracle.com Signed-off-by: Henry Willard Reviewed-by: Håkon Bugge Reviewed-by: Steve Sistare Acked-by: Mel Gorman Cc: Kate Stewart Cc: Zi Yan Cc: Philippe Ombredanne Cc: Andrea Arcangeli Cc: Greg Kroah-Hartman Cc: Aneesh Kumar K.V Cc: Kirill A. Shutemov Cc: "Jérôme Glisse" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mprotect.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'mm') diff --git a/mm/mprotect.c b/mm/mprotect.c index 58b629bb70de..e3309fcf586b 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -84,6 +84,11 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, if (!page || PageKsm(page)) continue; + /* Also skip shared copy-on-write pages */ + if (is_cow_mapping(vma->vm_flags) && + page_mapcount(page) != 1) + continue; + /* Avoid TLB flush if possible */ if (pte_protnone(oldpte)) continue; -- cgit v1.2.3 From da391d640c528bc5bb227ea5b39c882b75ac3167 Mon Sep 17 00:00:00 2001 From: William Kucharski Date: Wed, 31 Jan 2018 16:21:11 -0800 Subject: mm: correct comments regarding do_fault_around() There are multiple comments surrounding do_fault_around that memtion fault_around_pages() and fault_around_mask(), two routines that do not exist. These comments should be reworded to reference fault_around_bytes, the value which is used to determine how much do_fault_around() will attempt to read when processing a fault. These comments should have been updated when fault_around_pages() and fault_around_mask() were removed in commit aecd6f44266c ("mm: close race between do_fault_around() and fault_around_bytes_set()"). Fixes: aecd6f44266c1 ("mm: close race between do_fault_around() and fault_around_bytes_set()") Link: http://lkml.kernel.org/r/302D0B14-C7E9-44C6-8BED-033F9ACBD030@oracle.com Signed-off-by: William Kucharski Reviewed-by: Larry Bassel Cc: Michal Hocko Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index a6e5d6ac5d24..53373b7a1512 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3511,9 +3511,8 @@ static int fault_around_bytes_get(void *data, u64 *val) } /* - * fault_around_pages() and fault_around_mask() expects fault_around_bytes - * rounded down to nearest page order. It's what do_fault_around() expects to - * see. + * fault_around_bytes must be rounded down to the nearest page order as it's + * what do_fault_around() expects to see. */ static int fault_around_bytes_set(void *data, u64 val) { @@ -3556,13 +3555,14 @@ late_initcall(fault_around_debugfs); * This function doesn't cross the VMA boundaries, in order to call map_pages() * only once. * - * fault_around_pages() defines how many pages we'll try to map. - * do_fault_around() expects it to return a power of two less than or equal to - * PTRS_PER_PTE. + * fault_around_bytes defines how many bytes we'll try to map. + * do_fault_around() expects it to be set to a power of two less than or equal + * to PTRS_PER_PTE. * - * The virtual address of the area that we map is naturally aligned to the - * fault_around_pages() value (and therefore to page order). This way it's - * easier to guarantee that we don't cross page table boundaries. + * The virtual address of the area that we map is naturally aligned to + * fault_around_bytes rounded down to the machine page size + * (and therefore to page order). This way it's easier to guarantee + * that we don't cross page table boundaries. */ static int do_fault_around(struct vm_fault *vmf) { @@ -3579,8 +3579,8 @@ static int do_fault_around(struct vm_fault *vmf) start_pgoff -= off; /* - * end_pgoff is either end of page table or end of vma - * or fault_around_pages() from start_pgoff, depending what is nearest. + * end_pgoff is either the end of the page table, the end of + * the vma or nr_pages from start_pgoff, depending what is nearest. */ end_pgoff = start_pgoff - ((vmf->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + -- cgit v1.2.3 From 9bb5a391f9a5707e04763cf14298fc4cc29bfecd Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 31 Jan 2018 16:21:14 -0800 Subject: mm, memory_hotplug: fix memmap initialization Bharata has noticed that onlining a newly added memory doesn't increase the total memory, pointing to commit f7f99100d8d9 ("mm: stop zeroing memory during allocation in vmemmap") as a culprit. This commit has changed the way how the memory for memmaps is initialized and moves it from the allocation time to the initialization time. This works properly for the early memmap init path. It doesn't work for the memory hotplug though because we need to mark page as reserved when the sparsemem section is created and later initialize it completely during onlining. memmap_init_zone is called in the early stage of onlining. With the current code it calls __init_single_page and as such it clears up the whole stage and therefore online_pages_range skips those pages. Fix this by skipping mm_zero_struct_page in __init_single_page for memory hotplug path. This is quite uggly but unifying both early init and memory hotplug init paths is a large project. Make sure we plug the regression at least. Link: http://lkml.kernel.org/r/20180130101141.GW21609@dhcp22.suse.cz Fixes: f7f99100d8d9 ("mm: stop zeroing memory during allocation in vmemmap") Signed-off-by: Michal Hocko Reported-by: Bharata B Rao Tested-by: Bharata B Rao Reviewed-by: Pavel Tatashin Cc: Steven Sistare Cc: Daniel Jordan Cc: Bob Picco Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a6972750e7c5..c7dd9c86e353 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1177,9 +1177,10 @@ static void free_one_page(struct zone *zone, } static void __meminit __init_single_page(struct page *page, unsigned long pfn, - unsigned long zone, int nid) + unsigned long zone, int nid, bool zero) { - mm_zero_struct_page(page); + if (zero) + mm_zero_struct_page(page); set_page_links(page, zone, nid, pfn); init_page_count(page); page_mapcount_reset(page); @@ -1194,9 +1195,9 @@ static void __meminit __init_single_page(struct page *page, unsigned long pfn, } static void __meminit __init_single_pfn(unsigned long pfn, unsigned long zone, - int nid) + int nid, bool zero) { - return __init_single_page(pfn_to_page(pfn), pfn, zone, nid); + return __init_single_page(pfn_to_page(pfn), pfn, zone, nid, zero); } #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT @@ -1217,7 +1218,7 @@ static void __meminit init_reserved_page(unsigned long pfn) if (pfn >= zone->zone_start_pfn && pfn < zone_end_pfn(zone)) break; } - __init_single_pfn(pfn, zid, nid); + __init_single_pfn(pfn, zid, nid, true); } #else static inline void init_reserved_page(unsigned long pfn) @@ -1534,7 +1535,7 @@ static unsigned long __init deferred_init_pages(int nid, int zid, } else { page++; } - __init_single_page(page, pfn, zid, nid); + __init_single_page(page, pfn, zid, nid, true); nr_pages++; } return (nr_pages); @@ -5399,15 +5400,20 @@ not_early: * can be created for invalid pages (for alignment) * check here not to call set_pageblock_migratetype() against * pfn out of zone. + * + * Please note that MEMMAP_HOTPLUG path doesn't clear memmap + * because this is done early in sparse_add_one_section */ if (!(pfn & (pageblock_nr_pages - 1))) { struct page *page = pfn_to_page(pfn); - __init_single_page(page, pfn, zone, nid); + __init_single_page(page, pfn, zone, nid, + context != MEMMAP_HOTPLUG); set_pageblock_migratetype(page, MIGRATE_MOVABLE); cond_resched(); } else { - __init_single_pfn(pfn, zone, nid); + __init_single_pfn(pfn, zone, nid, + context != MEMMAP_HOTPLUG); } } } -- cgit v1.2.3 From e02a9f048ef79a411904bef075fd3ce4204052a9 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 31 Jan 2018 16:21:19 -0800 Subject: mm/swap.c: make functions and their kernel-doc agree Fix some basic kernel-doc notation in mm/swap.c: - for function lru_cache_add_anon(), make its kernel-doc function name match its function name and change colon to hyphen following the function name - for function pagevec_lookup_entries(), change the function parameter name from nr_pages to nr_entries since that is more descriptive of what the parameter actually is and then it matches the kernel-doc comments also Fix function kernel-doc to match the change in commit 67fd707f4681: - drop the kernel-doc notation for @nr_pages from pagevec_lookup_range() and correct the function description for that change Link: http://lkml.kernel.org/r/3b42ee3e-04a9-a6ca-6be4-f00752a114fe@infradead.org Fixes: 67fd707f4681 ("mm: remove nr_pages argument from pagevec_lookup_{,range}_tag()") Signed-off-by: Randy Dunlap Reviewed-by: Andrew Morton Cc: Jan Kara Cc: Matthew Wilcox Cc: Hugh Dickins Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/swap.c b/mm/swap.c index e824c800adca..10568b1548d4 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -411,7 +411,7 @@ static void __lru_cache_add(struct page *page) } /** - * lru_cache_add: add a page to the page lists + * lru_cache_add_anon - add a page to the page lists * @page: the page to add */ void lru_cache_add_anon(struct page *page) @@ -930,10 +930,10 @@ EXPORT_SYMBOL(__pagevec_lru_add); */ unsigned pagevec_lookup_entries(struct pagevec *pvec, struct address_space *mapping, - pgoff_t start, unsigned nr_pages, + pgoff_t start, unsigned nr_entries, pgoff_t *indices) { - pvec->nr = find_get_entries(mapping, start, nr_pages, + pvec->nr = find_get_entries(mapping, start, nr_entries, pvec->pages, indices); return pagevec_count(pvec); } @@ -965,9 +965,8 @@ void pagevec_remove_exceptionals(struct pagevec *pvec) * @mapping: The address_space to search * @start: The starting page index * @end: The final page index - * @nr_pages: The maximum number of pages * - * pagevec_lookup_range() will search for and return a group of up to @nr_pages + * pagevec_lookup_range() will search for & return a group of up to PAGEVEC_SIZE * pages in the mapping starting from index @start and upto index @end * (inclusive). The pages are placed in @pvec. pagevec_lookup() takes a * reference against the pages in @pvec. @@ -977,7 +976,7 @@ void pagevec_remove_exceptionals(struct pagevec *pvec) * also update @start to index the next page for the traversal. * * pagevec_lookup_range() returns the number of pages which were found. If this - * number is smaller than @nr_pages, the end of specified range has been + * number is smaller than PAGEVEC_SIZE, the end of specified range has been * reached. */ unsigned pagevec_lookup_range(struct pagevec *pvec, -- cgit v1.2.3