Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig            |   68
-rw-r--r--  mm/Kconfig.debug      |   27
-rw-r--r--  mm/Makefile           |   11
-rw-r--r--  mm/allocpercpu.c      |   36
-rw-r--r--  mm/backing-dev.c      |  424
-rw-r--r--  mm/bootmem.c          |   61
-rw-r--r--  mm/bounce.c           |   10
-rw-r--r--  mm/debug-pagealloc.c  |  129
-rw-r--r--  mm/dmapool.c          |    2
-rw-r--r--  mm/fadvise.c          |    2
-rw-r--r--  mm/failslab.c         |    1
-rw-r--r--  mm/filemap.c          |  377
-rw-r--r--  mm/filemap_xip.c      |    4
-rw-r--r--  mm/fremap.c           |    2
-rw-r--r--  mm/highmem.c          |  111
-rw-r--r--  mm/hugetlb.c          |  185
-rw-r--r--  mm/init-mm.c          |   20
-rw-r--r--  mm/internal.h         |   41
-rw-r--r--  mm/kmemcheck.c        |  122
-rw-r--r--  mm/kmemleak-test.c    |  111
-rw-r--r--  mm/kmemleak.c         | 1686
-rw-r--r--  mm/maccess.c          |    2
-rw-r--r--  mm/madvise.c          |   26
-rw-r--r--  mm/memcontrol.c       |  906
-rw-r--r--  mm/memory.c           |  362
-rw-r--r--  mm/memory_hotplug.c   |    6
-rw-r--r--  mm/mempolicy.c        |  187
-rw-r--r--  mm/mempool.c          |    4
-rw-r--r--  mm/migrate.c          |   18
-rw-r--r--  mm/mlock.c            |  116
-rw-r--r--  mm/mmap.c             |  135
-rw-r--r--  mm/mmzone.c           |   15
-rw-r--r--  mm/mprotect.c         |    7
-rw-r--r--  mm/nommu.c            |  133
-rw-r--r--  mm/oom_kill.c         |   81
-rw-r--r--  mm/page-writeback.c   |  275
-rw-r--r--  mm/page_alloc.c       |  963
-rw-r--r--  mm/page_cgroup.c      |   73
-rw-r--r--  mm/page_io.c          |    4
-rw-r--r--  mm/pdflush.c          |  251
-rw-r--r--  mm/percpu.c           | 1293
-rw-r--r--  mm/quicklist.c        |    2
-rw-r--r--  mm/readahead.c        |  210
-rw-r--r--  mm/rmap.c             |   51
-rw-r--r--  mm/shmem.c            |   97
-rw-r--r--  mm/shmem_acl.c        |   40
-rw-r--r--  mm/slab.c             |  352
-rw-r--r--  mm/slob.c             |  101
-rw-r--r--  mm/slub.c             |  354
-rw-r--r--  mm/sparse.c           |    4
-rw-r--r--  mm/swap.c             |   73
-rw-r--r--  mm/swap_state.c       |   20
-rw-r--r--  mm/swapfile.c         |  293
-rw-r--r--  mm/thrash.c           |   32
-rw-r--r--  mm/truncate.c         |   50
-rw-r--r--  mm/util.c             |   89
-rw-r--r--  mm/vmalloc.c          |  170
-rw-r--r--  mm/vmscan.c           |  539
-rw-r--r--  mm/vmstat.c           |   56
59 files changed, 8005 insertions, 2815 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index a5b77811fdf..fe5f674d7a7 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -128,11 +128,11 @@ config SPARSEMEM_VMEMMAP config MEMORY_HOTPLUG bool "Allow for memory hot-add" depends on SPARSEMEM || X86_64_ACPI_NUMA - depends on HOTPLUG && !HIBERNATION && ARCH_ENABLE_MEMORY_HOTPLUG + depends on HOTPLUG && !(HIBERNATION && !S390) && ARCH_ENABLE_MEMORY_HOTPLUG depends on (IA64 || X86 || PPC64 || SUPERH || S390) comment "Memory hotplug is currently incompatible with Software Suspend" - depends on SPARSEMEM && HOTPLUG && HIBERNATION + depends on SPARSEMEM && HOTPLUG && HIBERNATION && !S390 config MEMORY_HOTPLUG_SPARSE def_bool y @@ -203,16 +203,60 @@ config VIRT_TO_BUS def_bool y depends on !ARCH_NO_VIRT_TO_BUS -config UNEVICTABLE_LRU - bool "Add LRU list to track non-evictable pages" - default y - depends on MMU - help - Keeps unevictable pages off of the active and inactive pageout - lists, so kswapd will not waste CPU time or have its balancing - algorithms thrown off by scanning these pages. Selecting this - will use one page flag and increase the code size a little, - say Y unless you know what you are doing. +config HAVE_MLOCK + bool + default y if MMU=y + +config HAVE_MLOCKED_PAGE_BIT + bool + default y if HAVE_MLOCK=y config MMU_NOTIFIER bool + +config DEFAULT_MMAP_MIN_ADDR + int "Low address space to protect from user allocation" + default 4096 + help + This is the portion of low virtual memory which should be protected + from userspace allocation. Keeping a user from writing to low pages + can help reduce the impact of kernel NULL pointer bugs. + + For most ia64, ppc64 and x86 users with lots of address space + a value of 65536 is reasonable and should cause no problems. + On arm and other archs it should not be higher than 32768. + Programs which use vm86 functionality or have some need to map + this low address space will need CAP_SYS_RAWIO or disable this + protection by setting the value to 0. + + This value can be changed after boot using the + /proc/sys/vm/mmap_min_addr tunable. + + +config NOMMU_INITIAL_TRIM_EXCESS + int "Turn on mmap() excess space trimming before booting" + depends on !MMU + default 1 + help + The NOMMU mmap() frequently needs to allocate large contiguous chunks + of memory on which to store mappings, but it can only ask the system + allocator for chunks in 2^N*PAGE_SIZE amounts - which is frequently + more than it requires. To deal with this, mmap() is able to trim off + the excess and return it to the allocator. + + If trimming is enabled, the excess is trimmed off and returned to the + system allocator, which can cause extra fragmentation, particularly + if there are a lot of transient processes. + + If trimming is disabled, the excess is kept, but not used, which for + long-term mappings means that the space is wasted. + + Trimming can be dynamically controlled through a sysctl option + (/proc/sys/vm/nr_trim_pages) which specifies the minimum number of + excess pages there must be before trimming should occur, or zero if + no trimming is to occur. + + This option specifies the initial value of this option. The default + of 1 says that all excess pages should be trimmed. + + See Documentation/nommu-mmap.txt for more information. 
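The mm/Kconfig hunk above only sets boot-time defaults; both new options are backed by the procfs tunables named in their help texts (/proc/sys/vm/mmap_min_addr and, on !MMU kernels, /proc/sys/vm/nr_trim_pages), so the values can be inspected or changed at runtime. A minimal userspace sketch (illustrative only, not part of this patch) that reads both tunables:

    /* Illustration only -- not part of the patch above. */
    #include <stdio.h>

    static long read_tunable(const char *path)
    {
            FILE *f = fopen(path, "r");
            long val = -1;

            if (f) {
                    if (fscanf(f, "%ld", &val) != 1)
                            val = -1;
                    fclose(f);
            }
            return val;
    }

    int main(void)
    {
            /* boot-time default comes from CONFIG_DEFAULT_MMAP_MIN_ADDR */
            printf("mmap_min_addr: %ld\n",
                   read_tunable("/proc/sys/vm/mmap_min_addr"));
            /* !MMU kernels only; default from CONFIG_NOMMU_INITIAL_TRIM_EXCESS */
            printf("nr_trim_pages: %ld\n",
                   read_tunable("/proc/sys/vm/nr_trim_pages"));
            return 0;
    }

Writing a new value back (for example the 65536 suggested in the help text, or 0 to disable the protection) requires root, as the help text notes.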
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug new file mode 100644 index 00000000000..aa99fd1f710 --- /dev/null +++ b/mm/Kconfig.debug @@ -0,0 +1,27 @@ +config DEBUG_PAGEALLOC + bool "Debug page memory allocations" + depends on DEBUG_KERNEL && ARCH_SUPPORTS_DEBUG_PAGEALLOC + depends on !HIBERNATION || !PPC && !SPARC + depends on !KMEMCHECK + ---help--- + Unmap pages from the kernel linear mapping after free_pages(). + This results in a large slowdown, but helps to find certain types + of memory corruptions. + +config WANT_PAGE_DEBUG_FLAGS + bool + +config PAGE_POISONING + bool "Debug page memory allocations" + depends on DEBUG_KERNEL && !ARCH_SUPPORTS_DEBUG_PAGEALLOC + depends on !HIBERNATION + select DEBUG_PAGEALLOC + select WANT_PAGE_DEBUG_FLAGS + help + Fill the pages with poison patterns after free_pages() and verify + the patterns before alloc_pages(). This results in a large slowdown, + but helps to find certain types of memory corruptions. + + This option cannot enalbe with hibernation. Otherwise, it will get + wrong messages for memory corruption because the free pages are not + saved to the suspend image. diff --git a/mm/Makefile b/mm/Makefile index 72255be57f8..147a7a7873c 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -8,10 +8,11 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ vmalloc.o obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ - maccess.o page_alloc.o page-writeback.o pdflush.o \ + maccess.o page_alloc.o page-writeback.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ page_isolation.o mm_init.o $(mmu-y) +obj-y += init-mm.o obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o obj-$(CONFIG_BOUNCE) += bounce.o @@ -24,12 +25,20 @@ obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o obj-$(CONFIG_SLOB) += slob.o obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o +obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o obj-$(CONFIG_SLAB) += slab.o obj-$(CONFIG_SLUB) += slub.o +obj-$(CONFIG_KMEMCHECK) += kmemcheck.o obj-$(CONFIG_FAILSLAB) += failslab.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-$(CONFIG_FS_XIP) += filemap_xip.o obj-$(CONFIG_MIGRATION) += migrate.o +ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA +obj-$(CONFIG_SMP) += percpu.o +else obj-$(CONFIG_SMP) += allocpercpu.o +endif obj-$(CONFIG_QUICKLIST) += quicklist.o obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o +obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o +obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c index 4297bc41bfd..dfdee6a4735 100644 --- a/mm/allocpercpu.c +++ b/mm/allocpercpu.c @@ -31,7 +31,7 @@ static void percpu_depopulate(void *__pdata, int cpu) * @__pdata: per-cpu data to depopulate * @mask: depopulate per-cpu data for cpu's selected through mask bits */ -static void __percpu_depopulate_mask(void *__pdata, cpumask_t *mask) +static void __percpu_depopulate_mask(void *__pdata, const cpumask_t *mask) { int cpu; for_each_cpu_mask_nr(cpu, *mask) @@ -99,45 +99,51 @@ static int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp, __percpu_populate_mask((__pdata), (size), (gfp), &(mask)) /** - * percpu_alloc_mask - initial setup of per-cpu data + * alloc_percpu - initial setup of per-cpu data * @size: size of per-cpu object - * @gfp: may sleep or not etc. 
- * @mask: populate per-data for cpu's selected through mask bits + * @align: alignment * - * Populating per-cpu data for all online cpu's would be a typical use case, - * which is simplified by the percpu_alloc() wrapper. - * Per-cpu objects are populated with zeroed buffers. + * Allocate dynamic percpu area. Percpu objects are populated with + * zeroed buffers. */ -void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask) +void *__alloc_percpu(size_t size, size_t align) { /* * We allocate whole cache lines to avoid false sharing */ size_t sz = roundup(nr_cpu_ids * sizeof(void *), cache_line_size()); - void *pdata = kzalloc(sz, gfp); + void *pdata = kzalloc(sz, GFP_KERNEL); void *__pdata = __percpu_disguise(pdata); + /* + * Can't easily make larger alignment work with kmalloc. WARN + * on it. Larger alignment should only be used for module + * percpu sections on SMP for which this path isn't used. + */ + WARN_ON_ONCE(align > SMP_CACHE_BYTES); + if (unlikely(!pdata)) return NULL; - if (likely(!__percpu_populate_mask(__pdata, size, gfp, mask))) + if (likely(!__percpu_populate_mask(__pdata, size, GFP_KERNEL, + &cpu_possible_map))) return __pdata; kfree(pdata); return NULL; } -EXPORT_SYMBOL_GPL(__percpu_alloc_mask); +EXPORT_SYMBOL_GPL(__alloc_percpu); /** - * percpu_free - final cleanup of per-cpu data + * free_percpu - final cleanup of per-cpu data * @__pdata: object to clean up * * We simply clean up any per-cpu object left. No need for the client to * track and specify through a bis mask which per-cpu objects are to free. */ -void percpu_free(void *__pdata) +void free_percpu(void *__pdata) { if (unlikely(!__pdata)) return; - __percpu_depopulate_mask(__pdata, &cpu_possible_map); + __percpu_depopulate_mask(__pdata, cpu_possible_mask); kfree(__percpu_disguise(__pdata)); } -EXPORT_SYMBOL_GPL(percpu_free); +EXPORT_SYMBOL_GPL(free_percpu); diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 8e858744413..d3ca0dac111 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -1,14 +1,43 @@ #include <linux/wait.h> #include <linux/backing-dev.h> +#include <linux/kthread.h> +#include <linux/freezer.h> #include <linux/fs.h> +#include <linux/pagemap.h> +#include <linux/mm.h> #include <linux/sched.h> #include <linux/module.h> #include <linux/writeback.h> #include <linux/device.h> +void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) +{ +} +EXPORT_SYMBOL(default_unplug_io_fn); + +struct backing_dev_info default_backing_dev_info = { + .name = "default", + .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE, + .state = 0, + .capabilities = BDI_CAP_MAP_COPY, + .unplug_io_fn = default_unplug_io_fn, +}; +EXPORT_SYMBOL_GPL(default_backing_dev_info); static struct class *bdi_class; +DEFINE_SPINLOCK(bdi_lock); +LIST_HEAD(bdi_list); +LIST_HEAD(bdi_pending_list); + +static struct task_struct *sync_supers_tsk; +static struct timer_list sync_supers_timer; + +static int bdi_sync_supers(void *); +static void sync_supers_timer_fn(unsigned long); +static void arm_supers_timer(void); + +static void bdi_add_default_flusher_task(struct backing_dev_info *bdi); #ifdef CONFIG_DEBUG_FS #include <linux/debugfs.h> @@ -24,9 +53,29 @@ static void bdi_debug_init(void) static int bdi_debug_stats_show(struct seq_file *m, void *v) { struct backing_dev_info *bdi = m->private; + struct bdi_writeback *wb; unsigned long background_thresh; unsigned long dirty_thresh; unsigned long bdi_thresh; + unsigned long nr_dirty, nr_io, nr_more_io, nr_wb; + struct inode *inode; + + /* + * inode lock is enough here, the 
bdi->wb_list is protected by + * RCU on the reader side + */ + nr_wb = nr_dirty = nr_io = nr_more_io = 0; + spin_lock(&inode_lock); + list_for_each_entry(wb, &bdi->wb_list, list) { + nr_wb++; + list_for_each_entry(inode, &wb->b_dirty, i_list) + nr_dirty++; + list_for_each_entry(inode, &wb->b_io, i_list) + nr_io++; + list_for_each_entry(inode, &wb->b_more_io, i_list) + nr_more_io++; + } + spin_unlock(&inode_lock); get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi); @@ -36,12 +85,22 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) "BdiReclaimable: %8lu kB\n" "BdiDirtyThresh: %8lu kB\n" "DirtyThresh: %8lu kB\n" - "BackgroundThresh: %8lu kB\n", + "BackgroundThresh: %8lu kB\n" + "WriteBack threads:%8lu\n" + "b_dirty: %8lu\n" + "b_io: %8lu\n" + "b_more_io: %8lu\n" + "bdi_list: %8u\n" + "state: %8lx\n" + "wb_mask: %8lx\n" + "wb_list: %8u\n" + "wb_cnt: %8u\n", (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)), - K(bdi_thresh), - K(dirty_thresh), - K(background_thresh)); + K(bdi_thresh), K(dirty_thresh), + K(background_thresh), nr_wb, nr_dirty, nr_io, nr_more_io, + !list_empty(&bdi->bdi_list), bdi->state, bdi->wb_mask, + !list_empty(&bdi->wb_list), bdi->wb_cnt); #undef K return 0; @@ -166,9 +225,269 @@ static __init int bdi_class_init(void) bdi_debug_init(); return 0; } - postcore_initcall(bdi_class_init); +static int __init default_bdi_init(void) +{ + int err; + + sync_supers_tsk = kthread_run(bdi_sync_supers, NULL, "sync_supers"); + BUG_ON(IS_ERR(sync_supers_tsk)); + + init_timer(&sync_supers_timer); + setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0); + arm_supers_timer(); + + err = bdi_init(&default_backing_dev_info); + if (!err) + bdi_register(&default_backing_dev_info, NULL, "default"); + + return err; +} +subsys_initcall(default_bdi_init); + +static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) +{ + memset(wb, 0, sizeof(*wb)); + + wb->bdi = bdi; + wb->last_old_flush = jiffies; + INIT_LIST_HEAD(&wb->b_dirty); + INIT_LIST_HEAD(&wb->b_io); + INIT_LIST_HEAD(&wb->b_more_io); +} + +static void bdi_task_init(struct backing_dev_info *bdi, + struct bdi_writeback *wb) +{ + struct task_struct *tsk = current; + + spin_lock(&bdi->wb_lock); + list_add_tail_rcu(&wb->list, &bdi->wb_list); + spin_unlock(&bdi->wb_lock); + + tsk->flags |= PF_FLUSHER | PF_SWAPWRITE; + set_freezable(); + + /* + * Our parent may run at a different priority, just set us to normal + */ + set_user_nice(tsk, 0); +} + +static int bdi_start_fn(void *ptr) +{ + struct bdi_writeback *wb = ptr; + struct backing_dev_info *bdi = wb->bdi; + int ret; + + /* + * Add us to the active bdi_list + */ + spin_lock(&bdi_lock); + list_add(&bdi->bdi_list, &bdi_list); + spin_unlock(&bdi_lock); + + bdi_task_init(bdi, wb); + + /* + * Clear pending bit and wakeup anybody waiting to tear us down + */ + clear_bit(BDI_pending, &bdi->state); + smp_mb__after_clear_bit(); + wake_up_bit(&bdi->state, BDI_pending); + + ret = bdi_writeback_task(wb); + + /* + * Remove us from the list + */ + spin_lock(&bdi->wb_lock); + list_del_rcu(&wb->list); + spin_unlock(&bdi->wb_lock); + + /* + * Flush any work that raced with us exiting. No new work + * will be added, since this bdi isn't discoverable anymore. 
+ */ + if (!list_empty(&bdi->work_list)) + wb_do_writeback(wb, 1); + + wb->task = NULL; + return ret; +} + +int bdi_has_dirty_io(struct backing_dev_info *bdi) +{ + return wb_has_dirty_io(&bdi->wb); +} + +static void bdi_flush_io(struct backing_dev_info *bdi) +{ + struct writeback_control wbc = { + .bdi = bdi, + .sync_mode = WB_SYNC_NONE, + .older_than_this = NULL, + .range_cyclic = 1, + .nr_to_write = 1024, + }; + + writeback_inodes_wbc(&wbc); +} + +/* + * kupdated() used to do this. We cannot do it from the bdi_forker_task() + * or we risk deadlocking on ->s_umount. The longer term solution would be + * to implement sync_supers_bdi() or similar and simply do it from the + * bdi writeback tasks individually. + */ +static int bdi_sync_supers(void *unused) +{ + set_user_nice(current, 0); + + while (!kthread_should_stop()) { + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + + /* + * Do this periodically, like kupdated() did before. + */ + sync_supers(); + } + + return 0; +} + +static void arm_supers_timer(void) +{ + unsigned long next; + + next = msecs_to_jiffies(dirty_writeback_interval * 10) + jiffies; + mod_timer(&sync_supers_timer, round_jiffies_up(next)); +} + +static void sync_supers_timer_fn(unsigned long unused) +{ + wake_up_process(sync_supers_tsk); + arm_supers_timer(); +} + +static int bdi_forker_task(void *ptr) +{ + struct bdi_writeback *me = ptr; + + bdi_task_init(me->bdi, me); + + for (;;) { + struct backing_dev_info *bdi, *tmp; + struct bdi_writeback *wb; + + /* + * Temporary measure, we want to make sure we don't see + * dirty data on the default backing_dev_info + */ + if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) + wb_do_writeback(me, 0); + + spin_lock(&bdi_lock); + + /* + * Check if any existing bdi's have dirty data without + * a thread registered. If so, set that up. + */ + list_for_each_entry_safe(bdi, tmp, &bdi_list, bdi_list) { + if (bdi->wb.task) + continue; + if (list_empty(&bdi->work_list) && + !bdi_has_dirty_io(bdi)) + continue; + + bdi_add_default_flusher_task(bdi); + } + + set_current_state(TASK_INTERRUPTIBLE); + + if (list_empty(&bdi_pending_list)) { + unsigned long wait; + + spin_unlock(&bdi_lock); + wait = msecs_to_jiffies(dirty_writeback_interval * 10); + schedule_timeout(wait); + try_to_freeze(); + continue; + } + + __set_current_state(TASK_RUNNING); + + /* + * This is our real job - check for pending entries in + * bdi_pending_list, and create the tasks that got added + */ + bdi = list_entry(bdi_pending_list.next, struct backing_dev_info, + bdi_list); + list_del_init(&bdi->bdi_list); + spin_unlock(&bdi_lock); + + wb = &bdi->wb; + wb->task = kthread_run(bdi_start_fn, wb, "flush-%s", + dev_name(bdi->dev)); + /* + * If task creation fails, then readd the bdi to + * the pending list and force writeout of the bdi + * from this forker thread. That will free some memory + * and we can try again. + */ + if (IS_ERR(wb->task)) { + wb->task = NULL; + + /* + * Add this 'bdi' to the back, so we get + * a chance to flush other bdi's to free + * memory. 
+ */ + spin_lock(&bdi_lock); + list_add_tail(&bdi->bdi_list, &bdi_pending_list); + spin_unlock(&bdi_lock); + + bdi_flush_io(bdi); + } + } + + return 0; +} + +/* + * Add the default flusher task that gets created for any bdi + * that has dirty data pending writeout + */ +void static bdi_add_default_flusher_task(struct backing_dev_info *bdi) +{ + if (!bdi_cap_writeback_dirty(bdi)) + return; + + if (WARN_ON(!test_bit(BDI_registered, &bdi->state))) { + printk(KERN_ERR "bdi %p/%s is not registered!\n", + bdi, bdi->name); + return; + } + + /* + * Check with the helper whether to proceed adding a task. Will only + * abort if we two or more simultanous calls to + * bdi_add_default_flusher_task() occured, further additions will block + * waiting for previous additions to finish. + */ + if (!test_and_set_bit(BDI_pending, &bdi->state)) { + list_move_tail(&bdi->bdi_list, &bdi_pending_list); + + /* + * We are now on the pending list, wake up bdi_forker_task() + * to finish the job and add us back to the active bdi_list + */ + wake_up_process(default_backing_dev_info.wb.task); + } +} + int bdi_register(struct backing_dev_info *bdi, struct device *parent, const char *fmt, ...) { @@ -187,9 +506,35 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent, goto exit; } + spin_lock(&bdi_lock); + list_add_tail(&bdi->bdi_list, &bdi_list); + spin_unlock(&bdi_lock); + bdi->dev = dev; - bdi_debug_register(bdi, dev_name(dev)); + /* + * Just start the forker thread for our default backing_dev_info, + * and add other bdi's to the list. They will get a thread created + * on-demand when they need it. + */ + if (bdi_cap_flush_forker(bdi)) { + struct bdi_writeback *wb = &bdi->wb; + + wb->task = kthread_run(bdi_forker_task, wb, "bdi-%s", + dev_name(dev)); + if (IS_ERR(wb->task)) { + wb->task = NULL; + ret = -ENOMEM; + + spin_lock(&bdi_lock); + list_del(&bdi->bdi_list); + spin_unlock(&bdi_lock); + goto exit; + } + } + + bdi_debug_register(bdi, dev_name(dev)); + set_bit(BDI_registered, &bdi->state); exit: return ret; } @@ -201,9 +546,42 @@ int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev) } EXPORT_SYMBOL(bdi_register_dev); +/* + * Remove bdi from the global list and shutdown any threads we have running + */ +static void bdi_wb_shutdown(struct backing_dev_info *bdi) +{ + struct bdi_writeback *wb; + + if (!bdi_cap_writeback_dirty(bdi)) + return; + + /* + * If setup is pending, wait for that to complete first + */ + wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait, + TASK_UNINTERRUPTIBLE); + + /* + * Make sure nobody finds us on the bdi_list anymore + */ + spin_lock(&bdi_lock); + list_del(&bdi->bdi_list); + spin_unlock(&bdi_lock); + + /* + * Finally, kill the kernel threads. We don't need to be RCU + * safe anymore, since the bdi is gone from visibility. 
+ */ + list_for_each_entry(wb, &bdi->wb_list, list) + kthread_stop(wb->task); +} + void bdi_unregister(struct backing_dev_info *bdi) { if (bdi->dev) { + if (!bdi_cap_flush_forker(bdi)) + bdi_wb_shutdown(bdi); bdi_debug_unregister(bdi); device_unregister(bdi->dev); bdi->dev = NULL; @@ -213,14 +591,25 @@ EXPORT_SYMBOL(bdi_unregister); int bdi_init(struct backing_dev_info *bdi) { - int i; - int err; + int i, err; bdi->dev = NULL; bdi->min_ratio = 0; bdi->max_ratio = 100; bdi->max_prop_frac = PROP_FRAC_BASE; + spin_lock_init(&bdi->wb_lock); + INIT_LIST_HEAD(&bdi->bdi_list); + INIT_LIST_HEAD(&bdi->wb_list); + INIT_LIST_HEAD(&bdi->work_list); + + bdi_wb_init(&bdi->wb, bdi); + + /* + * Just one thread support for now, hard code mask and count + */ + bdi->wb_mask = 1; + bdi->wb_cnt = 1; for (i = 0; i < NR_BDI_STAT_ITEMS; i++) { err = percpu_counter_init(&bdi->bdi_stat[i], 0); @@ -245,6 +634,8 @@ void bdi_destroy(struct backing_dev_info *bdi) { int i; + WARN_ON(bdi_has_dirty_io(bdi)); + bdi_unregister(bdi); for (i = 0; i < NR_BDI_STAT_ITEMS; i++) @@ -259,13 +650,12 @@ static wait_queue_head_t congestion_wqh[2] = { __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1]) }; - -void clear_bdi_congested(struct backing_dev_info *bdi, int rw) +void clear_bdi_congested(struct backing_dev_info *bdi, int sync) { enum bdi_state bit; - wait_queue_head_t *wqh = &congestion_wqh[rw]; + wait_queue_head_t *wqh = &congestion_wqh[sync]; - bit = (rw == WRITE) ? BDI_write_congested : BDI_read_congested; + bit = sync ? BDI_sync_congested : BDI_async_congested; clear_bit(bit, &bdi->state); smp_mb__after_clear_bit(); if (waitqueue_active(wqh)) @@ -273,29 +663,29 @@ void clear_bdi_congested(struct backing_dev_info *bdi, int rw) } EXPORT_SYMBOL(clear_bdi_congested); -void set_bdi_congested(struct backing_dev_info *bdi, int rw) +void set_bdi_congested(struct backing_dev_info *bdi, int sync) { enum bdi_state bit; - bit = (rw == WRITE) ? BDI_write_congested : BDI_read_congested; + bit = sync ? BDI_sync_congested : BDI_async_congested; set_bit(bit, &bdi->state); } EXPORT_SYMBOL(set_bdi_congested); /** * congestion_wait - wait for a backing_dev to become uncongested - * @rw: READ or WRITE + * @sync: SYNC or ASYNC IO * @timeout: timeout in jiffies * * Waits for up to @timeout jiffies for a backing_dev (any backing_dev) to exit * write congestion. If no backing_devs are congested then just wait for the * next write to be completed. 
*/ -long congestion_wait(int rw, long timeout) +long congestion_wait(int sync, long timeout) { long ret; DEFINE_WAIT(wait); - wait_queue_head_t *wqh = &congestion_wqh[rw]; + wait_queue_head_t *wqh = &congestion_wqh[sync]; prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); ret = io_schedule_timeout(timeout); diff --git a/mm/bootmem.c b/mm/bootmem.c index 51a0ccf61e0..555d5d2731c 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -12,6 +12,7 @@ #include <linux/pfn.h> #include <linux/bootmem.h> #include <linux/module.h> +#include <linux/kmemleak.h> #include <asm/bug.h> #include <asm/io.h> @@ -335,6 +336,8 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, { unsigned long start, end; + kmemleak_free_part(__va(physaddr), size); + start = PFN_UP(physaddr); end = PFN_DOWN(physaddr + size); @@ -354,6 +357,8 @@ void __init free_bootmem(unsigned long addr, unsigned long size) { unsigned long start, end; + kmemleak_free_part(__va(addr), size); + start = PFN_UP(addr); end = PFN_DOWN(addr + size); @@ -382,7 +387,6 @@ int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, return mark_bootmem_node(pgdat->bdata, start, end, 1, flags); } -#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE /** * reserve_bootmem - mark a page range as usable * @addr: starting address of the range @@ -403,7 +407,6 @@ int __init reserve_bootmem(unsigned long addr, unsigned long size, return mark_bootmem(start, end, 1, flags); } -#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ static unsigned long align_idx(struct bootmem_data *bdata, unsigned long idx, unsigned long step) @@ -429,8 +432,8 @@ static unsigned long align_off(struct bootmem_data *bdata, unsigned long off, } static void * __init alloc_bootmem_core(struct bootmem_data *bdata, - unsigned long size, unsigned long align, - unsigned long goal, unsigned long limit) + unsigned long size, unsigned long align, + unsigned long goal, unsigned long limit) { unsigned long fallback = 0; unsigned long min, max, start, sidx, midx, step; @@ -518,6 +521,11 @@ find_block: region = phys_to_virt(PFN_PHYS(bdata->node_min_pfn) + start_off); memset(region, 0, size); + /* + * The min_count is set to 0 so that bootmem allocated blocks + * are never reported as leaks. 
+ */ + kmemleak_alloc(region, size, 0, 0); return region; } @@ -530,17 +538,41 @@ find_block: return NULL; } +static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata, + unsigned long size, unsigned long align, + unsigned long goal, unsigned long limit) +{ + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc(size, GFP_NOWAIT); + +#ifdef CONFIG_HAVE_ARCH_BOOTMEM + { + bootmem_data_t *p_bdata; + + p_bdata = bootmem_arch_preferred_node(bdata, size, align, + goal, limit); + if (p_bdata) + return alloc_bootmem_core(p_bdata, size, align, + goal, limit); + } +#endif + return NULL; +} + static void * __init ___alloc_bootmem_nopanic(unsigned long size, unsigned long align, unsigned long goal, unsigned long limit) { bootmem_data_t *bdata; + void *region; restart: - list_for_each_entry(bdata, &bdata_list, list) { - void *region; + region = alloc_arch_preferred_bootmem(NULL, size, align, goal, limit); + if (region) + return region; + list_for_each_entry(bdata, &bdata_list, list) { if (goal && bdata->node_low_pfn <= PFN_DOWN(goal)) continue; if (limit && bdata->node_min_pfn >= PFN_DOWN(limit)) @@ -618,6 +650,10 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, { void *ptr; + ptr = alloc_arch_preferred_bootmem(bdata, size, align, goal, limit); + if (ptr) + return ptr; + ptr = alloc_bootmem_core(bdata, size, align, goal, limit); if (ptr) return ptr; @@ -643,6 +679,9 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal) { + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); + return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0); } @@ -674,6 +713,13 @@ void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, { void *ptr; + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); + + ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0); + if (ptr) + return ptr; + ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); if (ptr) return ptr; @@ -722,6 +768,9 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal) { + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); + return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, ARCH_LOW_ADDRESS_LIMIT); } diff --git a/mm/bounce.c b/mm/bounce.c index e590272fe7a..a2b76a588e3 100644 --- a/mm/bounce.c +++ b/mm/bounce.c @@ -13,17 +13,15 @@ #include <linux/init.h> #include <linux/hash.h> #include <linux/highmem.h> -#include <linux/blktrace_api.h> -#include <trace/block.h> #include <asm/tlbflush.h> +#include <trace/events/block.h> + #define POOL_SIZE 64 #define ISA_POOL_SIZE 16 static mempool_t *page_pool, *isa_page_pool; -DEFINE_TRACE(block_bio_bounce); - #ifdef CONFIG_HIGHMEM static __init int init_emergency_pool(void) { @@ -192,7 +190,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, /* * is destination page below bounce pfn? 
*/ - if (page_to_pfn(page) <= q->bounce_pfn) + if (page_to_pfn(page) <= queue_bounce_pfn(q)) continue; /* @@ -284,7 +282,7 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) * don't waste time iterating over bio segments */ if (!(q->bounce_gfp & GFP_DMA)) { - if (q->bounce_pfn >= blk_max_pfn) + if (queue_bounce_pfn(q) >= blk_max_pfn) return; pool = page_pool; } else { diff --git a/mm/debug-pagealloc.c b/mm/debug-pagealloc.c new file mode 100644 index 00000000000..a1e3324de2b --- /dev/null +++ b/mm/debug-pagealloc.c @@ -0,0 +1,129 @@ +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/page-debug-flags.h> +#include <linux/poison.h> + +static inline void set_page_poison(struct page *page) +{ + __set_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags); +} + +static inline void clear_page_poison(struct page *page) +{ + __clear_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags); +} + +static inline bool page_poison(struct page *page) +{ + return test_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags); +} + +static void poison_highpage(struct page *page) +{ + /* + * Page poisoning for highmem pages is not implemented. + * + * This can be called from interrupt contexts. + * So we need to create a new kmap_atomic slot for this + * application and it will need interrupt protection. + */ +} + +static void poison_page(struct page *page) +{ + void *addr; + + if (PageHighMem(page)) { + poison_highpage(page); + return; + } + set_page_poison(page); + addr = page_address(page); + memset(addr, PAGE_POISON, PAGE_SIZE); +} + +static void poison_pages(struct page *page, int n) +{ + int i; + + for (i = 0; i < n; i++) + poison_page(page + i); +} + +static bool single_bit_flip(unsigned char a, unsigned char b) +{ + unsigned char error = a ^ b; + + return error && !(error & (error - 1)); +} + +static void check_poison_mem(unsigned char *mem, size_t bytes) +{ + unsigned char *start; + unsigned char *end; + + for (start = mem; start < mem + bytes; start++) { + if (*start != PAGE_POISON) + break; + } + if (start == mem + bytes) + return; + + for (end = mem + bytes - 1; end > start; end--) { + if (*end != PAGE_POISON) + break; + } + + if (!printk_ratelimit()) + return; + else if (start == end && single_bit_flip(*start, PAGE_POISON)) + printk(KERN_ERR "pagealloc: single bit error\n"); + else + printk(KERN_ERR "pagealloc: memory corruption\n"); + + print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1, start, + end - start + 1, 1); + dump_stack(); +} + +static void unpoison_highpage(struct page *page) +{ + /* + * See comment in poison_highpage(). 
+ * Highmem pages should not be poisoned for now + */ + BUG_ON(page_poison(page)); +} + +static void unpoison_page(struct page *page) +{ + if (PageHighMem(page)) { + unpoison_highpage(page); + return; + } + if (page_poison(page)) { + void *addr = page_address(page); + + check_poison_mem(addr, PAGE_SIZE); + clear_page_poison(page); + } +} + +static void unpoison_pages(struct page *page, int n) +{ + int i; + + for (i = 0; i < n; i++) + unpoison_page(page + i); +} + +void kernel_map_pages(struct page *page, int numpages, int enable) +{ + if (!debug_pagealloc_enabled) + return; + + if (enable) + unpoison_pages(page, numpages); + else + poison_pages(page, numpages); +} diff --git a/mm/dmapool.c b/mm/dmapool.c index b1f0885dda2..3df063706f5 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -86,10 +86,12 @@ show_pools(struct device *dev, struct device_attribute *attr, char *buf) unsigned pages = 0; unsigned blocks = 0; + spin_lock_irq(&pool->lock); list_for_each_entry(page, &pool->page_list, page_list) { pages++; blocks += page->in_use; } + spin_unlock_irq(&pool->lock); /* per-pool info, no real statistics yet */ temp = scnprintf(next, size, "%-16s %4u %4Zu %4Zu %2u\n", diff --git a/mm/fadvise.c b/mm/fadvise.c index 54a0f8040af..e43359214f6 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -101,7 +101,7 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) ret = force_page_cache_readahead(mapping, file, start_index, - max_sane_readahead(nrpages)); + nrpages); if (ret > 0) ret = 0; break; diff --git a/mm/failslab.c b/mm/failslab.c index 7c6ea6493f8..9339de5f0a9 100644 --- a/mm/failslab.c +++ b/mm/failslab.c @@ -1,4 +1,5 @@ #include <linux/fault-inject.h> +#include <linux/gfp.h> static struct { struct fault_attr attr; diff --git a/mm/filemap.c b/mm/filemap.c index 23acefe5180..dd51c68e2b8 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -39,11 +39,10 @@ /* * FIXME: remove all knowledge of the buffer layer from the core VM */ -#include <linux/buffer_head.h> /* for generic_osync_inode */ +#include <linux/buffer_head.h> /* for try_to_free_buffers */ #include <asm/mman.h> - /* * Shared mappings implemented 30.11.1994. It's not fully working yet, * though. @@ -121,7 +120,6 @@ void __remove_from_page_cache(struct page *page) mapping->nrpages--; __dec_zone_page_state(page, NR_FILE_PAGES); BUG_ON(page_mapped(page)); - mem_cgroup_uncharge_cache_page(page); /* * Some filesystems seem to re-dirty the page even after @@ -145,6 +143,7 @@ void remove_from_page_cache(struct page *page) spin_lock_irq(&mapping->tree_lock); __remove_from_page_cache(page); spin_unlock_irq(&mapping->tree_lock); + mem_cgroup_uncharge_cache_page(page); } static int sync_page(void *word) @@ -307,68 +306,24 @@ int wait_on_page_writeback_range(struct address_space *mapping, } /** - * sync_page_range - write and wait on all pages in the passed range - * @inode: target inode - * @mapping: target address_space - * @pos: beginning offset in pages to write - * @count: number of bytes to write - * - * Write and wait upon all the pages in the passed range. This is a "data - * integrity" operation. It waits upon in-flight writeout before starting and - * waiting upon new writeout. If there was an IO error, return it. 
+ * filemap_fdatawait_range - wait for all under-writeback pages to complete in a given range + * @mapping: address space structure to wait for + * @start: offset in bytes where the range starts + * @end: offset in bytes where the range ends (inclusive) * - * We need to re-take i_mutex during the generic_osync_inode list walk because - * it is otherwise livelockable. - */ -int sync_page_range(struct inode *inode, struct address_space *mapping, - loff_t pos, loff_t count) -{ - pgoff_t start = pos >> PAGE_CACHE_SHIFT; - pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT; - int ret; - - if (!mapping_cap_writeback_dirty(mapping) || !count) - return 0; - ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1); - if (ret == 0) { - mutex_lock(&inode->i_mutex); - ret = generic_osync_inode(inode, mapping, OSYNC_METADATA); - mutex_unlock(&inode->i_mutex); - } - if (ret == 0) - ret = wait_on_page_writeback_range(mapping, start, end); - return ret; -} -EXPORT_SYMBOL(sync_page_range); - -/** - * sync_page_range_nolock - write & wait on all pages in the passed range without locking - * @inode: target inode - * @mapping: target address_space - * @pos: beginning offset in pages to write - * @count: number of bytes to write + * Walk the list of under-writeback pages of the given address space + * in the given range and wait for all of them. * - * Note: Holding i_mutex across sync_page_range_nolock() is not a good idea - * as it forces O_SYNC writers to different parts of the same file - * to be serialised right until io completion. + * This is just a simple wrapper so that callers don't have to convert offsets + * to page indexes themselves */ -int sync_page_range_nolock(struct inode *inode, struct address_space *mapping, - loff_t pos, loff_t count) +int filemap_fdatawait_range(struct address_space *mapping, loff_t start, + loff_t end) { - pgoff_t start = pos >> PAGE_CACHE_SHIFT; - pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT; - int ret; - - if (!mapping_cap_writeback_dirty(mapping) || !count) - return 0; - ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1); - if (ret == 0) - ret = generic_osync_inode(inode, mapping, OSYNC_METADATA); - if (ret == 0) - ret = wait_on_page_writeback_range(mapping, start, end); - return ret; + return wait_on_page_writeback_range(mapping, start >> PAGE_CACHE_SHIFT, + end >> PAGE_CACHE_SHIFT); } -EXPORT_SYMBOL(sync_page_range_nolock); +EXPORT_SYMBOL(filemap_fdatawait_range); /** * filemap_fdatawait - wait for all under-writeback pages to complete @@ -441,6 +396,7 @@ int filemap_write_and_wait_range(struct address_space *mapping, } return err; } +EXPORT_SYMBOL(filemap_write_and_wait_range); /** * add_to_page_cache_locked - add a locked page to the pagecache @@ -475,13 +431,13 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, if (likely(!error)) { mapping->nrpages++; __inc_zone_page_state(page, NR_FILE_PAGES); + spin_unlock_irq(&mapping->tree_lock); } else { page->mapping = NULL; + spin_unlock_irq(&mapping->tree_lock); mem_cgroup_uncharge_cache_page(page); page_cache_release(page); } - - spin_unlock_irq(&mapping->tree_lock); radix_tree_preload_end(); } else mem_cgroup_uncharge_cache_page(page); @@ -513,13 +469,14 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping, } return ret; } +EXPORT_SYMBOL_GPL(add_to_page_cache_lru); #ifdef CONFIG_NUMA struct page *__page_cache_alloc(gfp_t gfp) { if (cpuset_do_page_mem_spread()) { int n = cpuset_mem_spread_node(); - return alloc_pages_node(n, gfp, 0); + return 
alloc_pages_exact_node(n, gfp, 0); } return alloc_pages(gfp, 0); } @@ -565,6 +522,24 @@ void wait_on_page_bit(struct page *page, int bit_nr) EXPORT_SYMBOL(wait_on_page_bit); /** + * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue + * @page: Page defining the wait queue of interest + * @waiter: Waiter to add to the queue + * + * Add an arbitrary @waiter to the wait queue for the nominated @page. + */ +void add_page_wait_queue(struct page *page, wait_queue_t *waiter) +{ + wait_queue_head_t *q = page_waitqueue(page); + unsigned long flags; + + spin_lock_irqsave(&q->lock, flags); + __add_wait_queue(q, waiter); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL_GPL(add_page_wait_queue); + +/** * unlock_page - unlock a locked page * @page: the page * @@ -627,6 +602,7 @@ int __lock_page_killable(struct page *page) return __wait_on_bit_lock(page_waitqueue(page), &wait, sync_page_killable, TASK_KILLABLE); } +EXPORT_SYMBOL_GPL(__lock_page_killable); /** * __lock_page_nosync - get a lock on the page, without calling sync_page() @@ -983,9 +959,6 @@ EXPORT_SYMBOL(grab_cache_page_nowait); static void shrink_readahead_size_eio(struct file *filp, struct file_ra_state *ra) { - if (!ra->ra_pages) - return; - ra->ra_pages /= 4; } @@ -1369,8 +1342,7 @@ do_readahead(struct address_space *mapping, struct file *filp, if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) return -EINVAL; - force_page_cache_readahead(mapping, filp, index, - max_sane_readahead(nr)); + force_page_cache_readahead(mapping, filp, index, nr); return 0; } @@ -1436,6 +1408,73 @@ static int page_cache_read(struct file *file, pgoff_t offset) #define MMAP_LOTSAMISS (100) +/* + * Synchronous readahead happens when we don't even find + * a page in the page cache at all. + */ +static void do_sync_mmap_readahead(struct vm_area_struct *vma, + struct file_ra_state *ra, + struct file *file, + pgoff_t offset) +{ + unsigned long ra_pages; + struct address_space *mapping = file->f_mapping; + + /* If we don't want any read-ahead, don't bother */ + if (VM_RandomReadHint(vma)) + return; + + if (VM_SequentialReadHint(vma) || + offset - 1 == (ra->prev_pos >> PAGE_CACHE_SHIFT)) { + page_cache_sync_readahead(mapping, ra, file, offset, + ra->ra_pages); + return; + } + + if (ra->mmap_miss < INT_MAX) + ra->mmap_miss++; + + /* + * Do we miss much more than hit in this file? If so, + * stop bothering with read-ahead. It will only hurt. + */ + if (ra->mmap_miss > MMAP_LOTSAMISS) + return; + + /* + * mmap read-around + */ + ra_pages = max_sane_readahead(ra->ra_pages); + if (ra_pages) { + ra->start = max_t(long, 0, offset - ra_pages/2); + ra->size = ra_pages; + ra->async_size = 0; + ra_submit(ra, mapping, file); + } +} + +/* + * Asynchronous readahead happens when we find the page and PG_readahead, + * so we want to possibly extend the readahead further.. 
+ */ +static void do_async_mmap_readahead(struct vm_area_struct *vma, + struct file_ra_state *ra, + struct file *file, + struct page *page, + pgoff_t offset) +{ + struct address_space *mapping = file->f_mapping; + + /* If we don't want any read-ahead, don't bother */ + if (VM_RandomReadHint(vma)) + return; + if (ra->mmap_miss > 0) + ra->mmap_miss--; + if (PageReadahead(page)) + page_cache_async_readahead(mapping, ra, file, + page, offset, ra->ra_pages); +} + /** * filemap_fault - read in file data for page fault handling * @vma: vma in which the fault was taken @@ -1455,78 +1494,44 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) struct address_space *mapping = file->f_mapping; struct file_ra_state *ra = &file->f_ra; struct inode *inode = mapping->host; + pgoff_t offset = vmf->pgoff; struct page *page; pgoff_t size; - int did_readaround = 0; int ret = 0; size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - if (vmf->pgoff >= size) + if (offset >= size) return VM_FAULT_SIGBUS; - /* If we don't want any read-ahead, don't bother */ - if (VM_RandomReadHint(vma)) - goto no_cached_page; - /* * Do we have something in the page cache already? */ -retry_find: - page = find_lock_page(mapping, vmf->pgoff); - /* - * For sequential accesses, we use the generic readahead logic. - */ - if (VM_SequentialReadHint(vma)) { - if (!page) { - page_cache_sync_readahead(mapping, ra, file, - vmf->pgoff, 1); - page = find_lock_page(mapping, vmf->pgoff); - if (!page) - goto no_cached_page; - } - if (PageReadahead(page)) { - page_cache_async_readahead(mapping, ra, file, page, - vmf->pgoff, 1); - } - } - - if (!page) { - unsigned long ra_pages; - - ra->mmap_miss++; - + page = find_get_page(mapping, offset); + if (likely(page)) { /* - * Do we miss much more than hit in this file? If so, - * stop bothering with read-ahead. It will only hurt. + * We found the page, so try async readahead before + * waiting for the lock. */ - if (ra->mmap_miss > MMAP_LOTSAMISS) - goto no_cached_page; + do_async_mmap_readahead(vma, ra, file, page, offset); + lock_page(page); - /* - * To keep the pgmajfault counter straight, we need to - * check did_readaround, as this is an inner loop. - */ - if (!did_readaround) { - ret = VM_FAULT_MAJOR; - count_vm_event(PGMAJFAULT); - } - did_readaround = 1; - ra_pages = max_sane_readahead(file->f_ra.ra_pages); - if (ra_pages) { - pgoff_t start = 0; - - if (vmf->pgoff > ra_pages / 2) - start = vmf->pgoff - ra_pages / 2; - do_page_cache_readahead(mapping, file, start, ra_pages); + /* Did it get truncated? */ + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + put_page(page); + goto no_cached_page; } - page = find_lock_page(mapping, vmf->pgoff); + } else { + /* No page in the page cache at all */ + do_sync_mmap_readahead(vma, ra, file, offset); + count_vm_event(PGMAJFAULT); + ret = VM_FAULT_MAJOR; +retry_find: + page = find_lock_page(mapping, offset); if (!page) goto no_cached_page; } - if (!did_readaround) - ra->mmap_miss--; - /* * We have a locked page in the page cache, now we need to check * that it's up-to-date. If not, it is going to be due to an error. @@ -1534,18 +1539,18 @@ retry_find: if (unlikely(!PageUptodate(page))) goto page_not_uptodate; - /* Must recheck i_size under page lock */ + /* + * Found the page and have a reference on it. + * We must recheck i_size under page lock. 
+ */ size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - if (unlikely(vmf->pgoff >= size)) { + if (unlikely(offset >= size)) { unlock_page(page); page_cache_release(page); return VM_FAULT_SIGBUS; } - /* - * Found the page and have a reference on it. - */ - ra->prev_pos = (loff_t)page->index << PAGE_CACHE_SHIFT; + ra->prev_pos = (loff_t)offset << PAGE_CACHE_SHIFT; vmf->page = page; return ret | VM_FAULT_LOCKED; @@ -1554,7 +1559,7 @@ no_cached_page: * We're only likely to ever get here if MADV_RANDOM is in * effect. */ - error = page_cache_read(file, vmf->pgoff); + error = page_cache_read(file, offset); /* * The page we want has now been added to the page cache. @@ -1574,12 +1579,6 @@ no_cached_page: return VM_FAULT_SIGBUS; page_not_uptodate: - /* IO error path */ - if (!did_readaround) { - ret = VM_FAULT_MAJOR; - count_vm_event(PGMAJFAULT); - } - /* * Umm, take care of errors if the page isn't up-to-date. * Try to re-read it _once_. We do this synchronously, @@ -1823,7 +1822,7 @@ static size_t __iovec_copy_from_user_inatomic(char *vaddr, int copy = min(bytes, iov->iov_len - base); base = 0; - left = __copy_from_user_inatomic_nocache(vaddr, buf, copy); + left = __copy_from_user_inatomic(vaddr, buf, copy); copied += copy; bytes -= copy; vaddr += copy; @@ -1851,8 +1850,7 @@ size_t iov_iter_copy_from_user_atomic(struct page *page, if (likely(i->nr_segs == 1)) { int left; char __user *buf = i->iov->iov_base + i->iov_offset; - left = __copy_from_user_inatomic_nocache(kaddr + offset, - buf, bytes); + left = __copy_from_user_inatomic(kaddr + offset, buf, bytes); copied = bytes - left; } else { copied = __iovec_copy_from_user_inatomic(kaddr + offset, @@ -1880,7 +1878,7 @@ size_t iov_iter_copy_from_user(struct page *page, if (likely(i->nr_segs == 1)) { int left; char __user *buf = i->iov->iov_base + i->iov_offset; - left = __copy_from_user_nocache(kaddr + offset, buf, bytes); + left = __copy_from_user(kaddr + offset, buf, bytes); copied = bytes - left; } else { copied = __iovec_copy_from_user_inatomic(kaddr + offset, @@ -2124,20 +2122,7 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, } *ppos = end; } - - /* - * Sync the fs metadata but not the minor inode changes and - * of course not the data as we did direct DMA for the IO. - * i_mutex is held, which protects generic_osync_inode() from - * livelocking. AIO O_DIRECT ops attempt to sync metadata here. 
- */ out: - if ((written >= 0 || written == -EIOCBQUEUED) && - ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { - int err = generic_osync_inode(inode, mapping, OSYNC_METADATA); - if (err < 0) - written = err; - } return written; } EXPORT_SYMBOL(generic_file_direct_write); @@ -2229,6 +2214,7 @@ again: pagefault_enable(); flush_dcache_page(page); + mark_page_accessed(page); status = a_ops->write_end(file, mapping, pos, bytes, copied, page, fsdata); if (unlikely(status < 0)) @@ -2268,8 +2254,6 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; - const struct address_space_operations *a_ops = mapping->a_ops; - struct inode *inode = mapping->host; ssize_t status; struct iov_iter i; @@ -2279,16 +2263,6 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, if (likely(status >= 0)) { written += status; *ppos = pos + status; - - /* - * For now, when the user asks for O_SYNC, we'll actually give - * O_DSYNC - */ - if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) { - if (!a_ops->writepage || !is_sync_kiocb(iocb)) - status = generic_osync_inode(inode, mapping, - OSYNC_METADATA|OSYNC_DATA); - } } /* @@ -2304,9 +2278,27 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, } EXPORT_SYMBOL(generic_file_buffered_write); -static ssize_t -__generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t *ppos) +/** + * __generic_file_aio_write - write data to a file + * @iocb: IO state structure (file, offset, etc.) + * @iov: vector with data to write + * @nr_segs: number of segments in the vector + * @ppos: position where to write + * + * This function does all the work needed for actually writing data to a + * file. It does all basic checks, removes SUID from the file, updates + * modification times and calls proper subroutines depending on whether we + * do direct IO or a standard buffered write. + * + * It expects i_mutex to be grabbed unless we work on a block device or similar + * object which does not need locking at all. + * + * This function does *not* take care of syncing data in case of O_SYNC write. + * A caller has to handle it. This is mainly due to the fact that we want to + * avoid syncing under i_mutex. + */ +ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t *ppos) { struct file *file = iocb->ki_filp; struct address_space * mapping = file->f_mapping; @@ -2403,51 +2395,37 @@ out: current->backing_dev_info = NULL; return written ? 
written : err; } +EXPORT_SYMBOL(__generic_file_aio_write); -ssize_t generic_file_aio_write_nolock(struct kiocb *iocb, - const struct iovec *iov, unsigned long nr_segs, loff_t pos) -{ - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; - ssize_t ret; - - BUG_ON(iocb->ki_pos != pos); - - ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs, - &iocb->ki_pos); - - if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { - ssize_t err; - - err = sync_page_range_nolock(inode, mapping, pos, ret); - if (err < 0) - ret = err; - } - return ret; -} -EXPORT_SYMBOL(generic_file_aio_write_nolock); - +/** + * generic_file_aio_write - write data to a file + * @iocb: IO state structure + * @iov: vector with data to write + * @nr_segs: number of segments in the vector + * @pos: position in file where to write + * + * This is a wrapper around __generic_file_aio_write() to be used by most + * filesystems. It takes care of syncing the file in case of O_SYNC file + * and acquires i_mutex as needed. + */ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; + struct inode *inode = file->f_mapping->host; ssize_t ret; BUG_ON(iocb->ki_pos != pos); mutex_lock(&inode->i_mutex); - ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs, - &iocb->ki_pos); + ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); mutex_unlock(&inode->i_mutex); - if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { + if (ret > 0 || ret == -EIOCBQUEUED) { ssize_t err; - err = sync_page_range(inode, mapping, pos, ret); - if (err < 0) + err = generic_write_sync(file, pos, ret); + if (err < 0 && ret > 0) ret = err; } return ret; @@ -2464,6 +2442,9 @@ EXPORT_SYMBOL(generic_file_aio_write); * (presumably at page->private). If the release was successful, return `1'. * Otherwise return zero. * + * This may also be called if PG_fscache is set on a page, indicating that the + * page is known to the local caching routines. + * * The @gfp_mask argument specifies whether I/O may be performed to release * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS). 
* diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 0c04615651b..427dfe3ce78 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -89,8 +89,8 @@ do_xip_mapping_read(struct address_space *mapping, } } nr = nr - offset; - if (nr > len) - nr = len; + if (nr > len - copied) + nr = len - copied; error = mapping->a_ops->get_xip_mem(mapping, index, 0, &xip_mem, &xip_pfn); diff --git a/mm/fremap.c b/mm/fremap.c index 736ba7f3306..b6ec85abbb3 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -198,7 +198,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, flags &= MAP_NONBLOCK; get_file(file); addr = mmap_region(file, start, size, - flags, vma->vm_flags, pgoff, 1); + flags, vma->vm_flags, pgoff); fput(file); if (IS_ERR_VALUE(addr)) { err = addr; diff --git a/mm/highmem.c b/mm/highmem.c index b36b83b920f..25878cc49da 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -26,7 +26,6 @@ #include <linux/init.h> #include <linux/hash.h> #include <linux/highmem.h> -#include <linux/blktrace_api.h> #include <asm/tlbflush.h> /* @@ -67,6 +66,25 @@ pte_t * pkmap_page_table; static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait); +/* + * Most architectures have no use for kmap_high_get(), so let's abstract + * the disabling of IRQ out of the locking in that case to save on a + * potential useless overhead. + */ +#ifdef ARCH_NEEDS_KMAP_HIGH_GET +#define lock_kmap() spin_lock_irq(&kmap_lock) +#define unlock_kmap() spin_unlock_irq(&kmap_lock) +#define lock_kmap_any(flags) spin_lock_irqsave(&kmap_lock, flags) +#define unlock_kmap_any(flags) spin_unlock_irqrestore(&kmap_lock, flags) +#else +#define lock_kmap() spin_lock(&kmap_lock) +#define unlock_kmap() spin_unlock(&kmap_lock) +#define lock_kmap_any(flags) \ + do { spin_lock(&kmap_lock); (void)(flags); } while (0) +#define unlock_kmap_any(flags) \ + do { spin_unlock(&kmap_lock); (void)(flags); } while (0) +#endif + static void flush_all_zero_pkmaps(void) { int i; @@ -113,9 +131,9 @@ static void flush_all_zero_pkmaps(void) */ void kmap_flush_unused(void) { - spin_lock(&kmap_lock); + lock_kmap(); flush_all_zero_pkmaps(); - spin_unlock(&kmap_lock); + unlock_kmap(); } static inline unsigned long map_new_virtual(struct page *page) @@ -145,10 +163,10 @@ start: __set_current_state(TASK_UNINTERRUPTIBLE); add_wait_queue(&pkmap_map_wait, &wait); - spin_unlock(&kmap_lock); + unlock_kmap(); schedule(); remove_wait_queue(&pkmap_map_wait, &wait); - spin_lock(&kmap_lock); + lock_kmap(); /* Somebody else might have mapped it while we slept */ if (page_address(page)) @@ -184,29 +202,59 @@ void *kmap_high(struct page *page) * For highmem pages, we can't trust "virtual" until * after we have the lock. */ - spin_lock(&kmap_lock); + lock_kmap(); vaddr = (unsigned long)page_address(page); if (!vaddr) vaddr = map_new_virtual(page); pkmap_count[PKMAP_NR(vaddr)]++; BUG_ON(pkmap_count[PKMAP_NR(vaddr)] < 2); - spin_unlock(&kmap_lock); + unlock_kmap(); return (void*) vaddr; } EXPORT_SYMBOL(kmap_high); +#ifdef ARCH_NEEDS_KMAP_HIGH_GET +/** + * kmap_high_get - pin a highmem page into memory + * @page: &struct page to pin + * + * Returns the page's current virtual memory address, or NULL if no mapping + * exists. When and only when a non null address is returned then a + * matching call to kunmap_high() is necessary. + * + * This can be called from any context. 
+ */ +void *kmap_high_get(struct page *page) +{ + unsigned long vaddr, flags; + + lock_kmap_any(flags); + vaddr = (unsigned long)page_address(page); + if (vaddr) { + BUG_ON(pkmap_count[PKMAP_NR(vaddr)] < 1); + pkmap_count[PKMAP_NR(vaddr)]++; + } + unlock_kmap_any(flags); + return (void*) vaddr; +} +#endif + /** * kunmap_high - map a highmem page into memory * @page: &struct page to unmap + * + * If ARCH_NEEDS_KMAP_HIGH_GET is not defined then this may be called + * only from user context. */ void kunmap_high(struct page *page) { unsigned long vaddr; unsigned long nr; + unsigned long flags; int need_wakeup; - spin_lock(&kmap_lock); + lock_kmap_any(flags); vaddr = (unsigned long)page_address(page); BUG_ON(!vaddr); nr = PKMAP_NR(vaddr); @@ -232,7 +280,7 @@ void kunmap_high(struct page *page) */ need_wakeup = waitqueue_active(&pkmap_map_wait); } - spin_unlock(&kmap_lock); + unlock_kmap_any(flags); /* do wake-up, if needed, race-free outside of the spin lock */ if (need_wakeup) @@ -373,3 +421,48 @@ void __init page_address_init(void) } #endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */ + +#if defined(CONFIG_DEBUG_HIGHMEM) && defined(CONFIG_TRACE_IRQFLAGS_SUPPORT) + +void debug_kmap_atomic(enum km_type type) +{ + static unsigned warn_count = 10; + + if (unlikely(warn_count == 0)) + return; + + if (unlikely(in_interrupt())) { + if (in_irq()) { + if (type != KM_IRQ0 && type != KM_IRQ1 && + type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ && + type != KM_BOUNCE_READ) { + WARN_ON(1); + warn_count--; + } + } else if (!irqs_disabled()) { /* softirq */ + if (type != KM_IRQ0 && type != KM_IRQ1 && + type != KM_SOFTIRQ0 && type != KM_SOFTIRQ1 && + type != KM_SKB_SUNRPC_DATA && + type != KM_SKB_DATA_SOFTIRQ && + type != KM_BOUNCE_READ) { + WARN_ON(1); + warn_count--; + } + } + } + + if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ || + type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ) { + if (!irqs_disabled()) { + WARN_ON(1); + warn_count--; + } + } else if (type == KM_SOFTIRQ0 || type == KM_SOFTIRQ1) { + if (irq_count() == 0 && !irqs_disabled()) { + WARN_ON(1); + warn_count--; + } + } +} + +#endif diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 618e9830408..cafdcee154e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -316,7 +316,7 @@ static void resv_map_release(struct kref *ref) static struct resv_map *vma_resv_map(struct vm_area_struct *vma) { VM_BUG_ON(!is_vm_hugetlb_page(vma)); - if (!(vma->vm_flags & VM_SHARED)) + if (!(vma->vm_flags & VM_MAYSHARE)) return (struct resv_map *)(get_vma_private_data(vma) & ~HPAGE_RESV_MASK); return NULL; @@ -325,7 +325,7 @@ static struct resv_map *vma_resv_map(struct vm_area_struct *vma) static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) { VM_BUG_ON(!is_vm_hugetlb_page(vma)); - VM_BUG_ON(vma->vm_flags & VM_SHARED); + VM_BUG_ON(vma->vm_flags & VM_MAYSHARE); set_vma_private_data(vma, (get_vma_private_data(vma) & HPAGE_RESV_MASK) | (unsigned long)map); @@ -334,7 +334,7 @@ static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) { VM_BUG_ON(!is_vm_hugetlb_page(vma)); - VM_BUG_ON(vma->vm_flags & VM_SHARED); + VM_BUG_ON(vma->vm_flags & VM_MAYSHARE); set_vma_private_data(vma, get_vma_private_data(vma) | flags); } @@ -353,7 +353,7 @@ static void decrement_hugepage_resv_vma(struct hstate *h, if (vma->vm_flags & VM_NORESERVE) return; - if (vma->vm_flags & VM_SHARED) { + if (vma->vm_flags & VM_MAYSHARE) { /* Shared 
mappings always use reserves */ h->resv_huge_pages--; } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { @@ -369,14 +369,14 @@ static void decrement_hugepage_resv_vma(struct hstate *h, void reset_vma_resv_huge_pages(struct vm_area_struct *vma) { VM_BUG_ON(!is_vm_hugetlb_page(vma)); - if (!(vma->vm_flags & VM_SHARED)) + if (!(vma->vm_flags & VM_MAYSHARE)) vma->vm_private_data = (void *)0; } /* Returns true if the VMA has associated reserve pages */ static int vma_has_reserves(struct vm_area_struct *vma) { - if (vma->vm_flags & VM_SHARED) + if (vma->vm_flags & VM_MAYSHARE) return 1; if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) return 1; @@ -578,41 +578,6 @@ static void free_huge_page(struct page *page) hugetlb_put_quota(mapping, 1); } -/* - * Increment or decrement surplus_huge_pages. Keep node-specific counters - * balanced by operating on them in a round-robin fashion. - * Returns 1 if an adjustment was made. - */ -static int adjust_pool_surplus(struct hstate *h, int delta) -{ - static int prev_nid; - int nid = prev_nid; - int ret = 0; - - VM_BUG_ON(delta != -1 && delta != 1); - do { - nid = next_node(nid, node_online_map); - if (nid == MAX_NUMNODES) - nid = first_node(node_online_map); - - /* To shrink on this node, there must be a surplus page */ - if (delta < 0 && !h->surplus_huge_pages_node[nid]) - continue; - /* Surplus cannot exceed the total number of pages */ - if (delta > 0 && h->surplus_huge_pages_node[nid] >= - h->nr_huge_pages_node[nid]) - continue; - - h->surplus_huge_pages += delta; - h->surplus_huge_pages_node[nid] += delta; - ret = 1; - break; - } while (nid != prev_nid); - - prev_nid = nid; - return ret; -} - static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) { set_compound_page_dtor(page, free_huge_page); @@ -623,6 +588,34 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) put_page(page); /* free it into the hugepage allocator */ } +static void prep_compound_gigantic_page(struct page *page, unsigned long order) +{ + int i; + int nr_pages = 1 << order; + struct page *p = page + 1; + + /* we rely on prep_new_huge_page to set the destructor */ + set_compound_order(page, order); + __SetPageHead(page); + for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { + __SetPageTail(p); + p->first_page = page; + } +} + +int PageHuge(struct page *page) +{ + compound_page_dtor *dtor; + + if (!PageCompound(page)) + return 0; + + page = compound_head(page); + dtor = get_compound_page_dtor(page); + + return dtor == free_huge_page; +} + static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) { struct page *page; @@ -630,7 +623,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) if (h->order >= MAX_ORDER) return NULL; - page = alloc_pages_node(nid, + page = alloc_pages_exact_node(nid, htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); @@ -649,7 +642,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) * Use a helper variable to find the next node and then * copy it back to hugetlb_next_nid afterwards: * otherwise there's a window in which a racer might - * pass invalid nid MAX_NUMNODES to alloc_pages_node. + * pass invalid nid MAX_NUMNODES to alloc_pages_exact_node. * But we don't need to use a spin_lock here: it really * doesn't matter if occasionally a racer chooses the * same nid as we do. 
Move nid forward in the mask even @@ -875,7 +868,7 @@ static void return_unused_surplus_pages(struct hstate *h, * can no longer free unreserved surplus pages. This occurs when * the nodes with surplus pages have no free pages. */ - unsigned long remaining_iterations = num_online_nodes(); + unsigned long remaining_iterations = nr_online_nodes; /* Uncommit the reservation */ h->resv_huge_pages -= unused_resv_pages; @@ -904,7 +897,7 @@ static void return_unused_surplus_pages(struct hstate *h, h->surplus_huge_pages--; h->surplus_huge_pages_node[nid]--; nr_pages--; - remaining_iterations = num_online_nodes(); + remaining_iterations = nr_online_nodes; } } } @@ -918,13 +911,13 @@ static void return_unused_surplus_pages(struct hstate *h, * an instantiated the change should be committed via vma_commit_reservation. * No action is required on failure. */ -static int vma_needs_reservation(struct hstate *h, +static long vma_needs_reservation(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { struct address_space *mapping = vma->vm_file->f_mapping; struct inode *inode = mapping->host; - if (vma->vm_flags & VM_SHARED) { + if (vma->vm_flags & VM_MAYSHARE) { pgoff_t idx = vma_hugecache_offset(h, vma, addr); return region_chg(&inode->i_mapping->private_list, idx, idx + 1); @@ -933,7 +926,7 @@ static int vma_needs_reservation(struct hstate *h, return 1; } else { - int err; + long err; pgoff_t idx = vma_hugecache_offset(h, vma, addr); struct resv_map *reservations = vma_resv_map(vma); @@ -949,7 +942,7 @@ static void vma_commit_reservation(struct hstate *h, struct address_space *mapping = vma->vm_file->f_mapping; struct inode *inode = mapping->host; - if (vma->vm_flags & VM_SHARED) { + if (vma->vm_flags & VM_MAYSHARE) { pgoff_t idx = vma_hugecache_offset(h, vma, addr); region_add(&inode->i_mapping->private_list, idx, idx + 1); @@ -969,7 +962,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, struct page *page; struct address_space *mapping = vma->vm_file->f_mapping; struct inode *inode = mapping->host; - unsigned int chg; + long chg; /* * Processes that did not create the mapping will have no reserves and @@ -1140,6 +1133,41 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count) } #endif +/* + * Increment or decrement surplus_huge_pages. Keep node-specific counters + * balanced by operating on them in a round-robin fashion. + * Returns 1 if an adjustment was made. + */ +static int adjust_pool_surplus(struct hstate *h, int delta) +{ + static int prev_nid; + int nid = prev_nid; + int ret = 0; + + VM_BUG_ON(delta != -1 && delta != 1); + do { + nid = next_node(nid, node_online_map); + if (nid == MAX_NUMNODES) + nid = first_node(node_online_map); + + /* To shrink on this node, there must be a surplus page */ + if (delta < 0 && !h->surplus_huge_pages_node[nid]) + continue; + /* Surplus cannot exceed the total number of pages */ + if (delta > 0 && h->surplus_huge_pages_node[nid] >= + h->nr_huge_pages_node[nid]) + continue; + + h->surplus_huge_pages += delta; + h->surplus_huge_pages_node[nid] += delta; + ret = 1; + break; + } while (nid != prev_nid); + + prev_nid = nid; + return ret; +} + #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) { @@ -1893,7 +1921,7 @@ retry_avoidcopy: * at the time of fork() could consume its reserves on COW instead * of the full address range. 
*/ - if (!(vma->vm_flags & VM_SHARED) && + if (!(vma->vm_flags & VM_MAYSHARE) && is_vma_resv_set(vma, HPAGE_RESV_OWNER) && old_page != pagecache_page) outside_reserve = 1; @@ -1957,7 +1985,7 @@ static struct page *hugetlbfs_pagecache_page(struct hstate *h, } static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pte_t *ptep, int write_access) + unsigned long address, pte_t *ptep, unsigned int flags) { struct hstate *h = hstate_vma(vma); int ret = VM_FAULT_SIGBUS; @@ -2000,7 +2028,7 @@ retry: clear_huge_page(page, address, huge_page_size(h)); __SetPageUptodate(page); - if (vma->vm_flags & VM_SHARED) { + if (vma->vm_flags & VM_MAYSHARE) { int err; struct inode *inode = mapping->host; @@ -2025,7 +2053,7 @@ retry: * any allocations necessary to record that reservation occur outside * the spinlock. */ - if (write_access && !(vma->vm_flags & VM_SHARED)) + if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) if (vma_needs_reservation(h, vma, address) < 0) { ret = VM_FAULT_OOM; goto backout_unlocked; @@ -2044,7 +2072,7 @@ retry: && (vma->vm_flags & VM_SHARED))); set_huge_pte_at(mm, address, ptep, new_pte); - if (write_access && !(vma->vm_flags & VM_SHARED)) { + if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { /* Optimization, do the COW without a second fault */ ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page); } @@ -2063,7 +2091,7 @@ backout_unlocked: } int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, int write_access) + unsigned long address, unsigned int flags) { pte_t *ptep; pte_t entry; @@ -2084,7 +2112,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, mutex_lock(&hugetlb_instantiation_mutex); entry = huge_ptep_get(ptep); if (huge_pte_none(entry)) { - ret = hugetlb_no_page(mm, vma, address, ptep, write_access); + ret = hugetlb_no_page(mm, vma, address, ptep, flags); goto out_mutex; } @@ -2098,13 +2126,13 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * page now as it is used to determine if a reservation has been * consumed. */ - if (write_access && !pte_write(entry)) { + if ((flags & FAULT_FLAG_WRITE) && !pte_write(entry)) { if (vma_needs_reservation(h, vma, address) < 0) { ret = VM_FAULT_OOM; goto out_mutex; } - if (!(vma->vm_flags & VM_SHARED)) + if (!(vma->vm_flags & VM_MAYSHARE)) pagecache_page = hugetlbfs_pagecache_page(h, vma, address); } @@ -2115,7 +2143,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, goto out_page_table_lock; - if (write_access) { + if (flags & FAULT_FLAG_WRITE) { if (!pte_write(entry)) { ret = hugetlb_cow(mm, vma, address, ptep, entry, pagecache_page); @@ -2124,7 +2152,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, entry = pte_mkdirty(entry); } entry = pte_mkyoung(entry); - if (huge_ptep_set_access_flags(vma, address, ptep, entry, write_access)) + if (huge_ptep_set_access_flags(vma, address, ptep, entry, + flags & FAULT_FLAG_WRITE)) update_mmu_cache(vma, address, entry); out_page_table_lock: @@ -2269,12 +2298,18 @@ void hugetlb_change_protection(struct vm_area_struct *vma, int hugetlb_reserve_pages(struct inode *inode, long from, long to, - struct vm_area_struct *vma) + struct vm_area_struct *vma, + int acctflag) { long ret, chg; struct hstate *h = hstate_inode(inode); - if (vma && vma->vm_flags & VM_NORESERVE) + /* + * Only apply hugepage reservation if asked. 
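[Editor's note on the recurring VM_SHARED -> VM_MAYSHARE conversion in these hugetlb hunks; an illustration, not part of the patch, assuming the standard vm_flags setup in mmap(): VM_SHARED is cleared again when the backing file was not opened for writing, while VM_MAYSHARE survives, so VM_MAYSHARE is the test that reliably identifies a MAP_SHARED mapping for reservation bookkeeping. The helper below is hypothetical.]

#include <linux/mm.h>

/* hypothetical helper making the intended test explicit */
static inline int hugetlb_mapping_is_shared(struct vm_area_struct *vma)
{
	/*
	 * A read-only MAP_SHARED mapping loses VM_SHARED but keeps
	 * VM_MAYSHARE, and its reservations must still be tracked in the
	 * shared (per-inode) region list rather than per-VMA.
	 */
	return !!(vma->vm_flags & VM_MAYSHARE);
}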
At fault time, an + * attempt will be made for VM_NORESERVE to allocate a page + * and filesystem quota without using reserves + */ + if (acctflag & VM_NORESERVE) return 0; /* @@ -2283,7 +2318,7 @@ int hugetlb_reserve_pages(struct inode *inode, * to reserve the full area even if read-only as mprotect() may be * called to make the mapping read-write. Assume !vma is a shm mapping */ - if (!vma || vma->vm_flags & VM_SHARED) + if (!vma || vma->vm_flags & VM_MAYSHARE) chg = region_chg(&inode->i_mapping->private_list, from, to); else { struct resv_map *resv_map = resv_map_alloc(); @@ -2299,14 +2334,32 @@ int hugetlb_reserve_pages(struct inode *inode, if (chg < 0) return chg; + /* There must be enough filesystem quota for the mapping */ if (hugetlb_get_quota(inode->i_mapping, chg)) return -ENOSPC; + + /* + * Check enough hugepages are available for the reservation. + * Hand back the quota if there are not + */ ret = hugetlb_acct_memory(h, chg); if (ret < 0) { hugetlb_put_quota(inode->i_mapping, chg); return ret; } - if (!vma || vma->vm_flags & VM_SHARED) + + /* + * Account for the reservations made. Shared mappings record regions + * that have reservations as they are shared by multiple VMAs. + * When the last VMA disappears, the region map says how much + * the reservation was and the page cache tells how much of + * the reservation was consumed. Private mappings are per-VMA and + * only the consumed reservations are tracked. When the VMA + * disappears, the original reservation is the VMA size and the + * consumed reservations are stored in the map. Hence, nothing + * else has to be done for private mappings here + */ + if (!vma || vma->vm_flags & VM_MAYSHARE) region_add(&inode->i_mapping->private_list, from, to); return 0; } @@ -2317,7 +2370,7 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) long chg = region_truncate(&inode->i_mapping->private_list, offset); spin_lock(&inode->i_lock); - inode->i_blocks -= blocks_per_huge_page(h); + inode->i_blocks -= (blocks_per_huge_page(h) * freed); spin_unlock(&inode->i_lock); hugetlb_put_quota(inode->i_mapping, (chg - freed)); diff --git a/mm/init-mm.c b/mm/init-mm.c new file mode 100644 index 00000000000..57aba0da966 --- /dev/null +++ b/mm/init-mm.c @@ -0,0 +1,20 @@ +#include <linux/mm_types.h> +#include <linux/rbtree.h> +#include <linux/rwsem.h> +#include <linux/spinlock.h> +#include <linux/list.h> +#include <linux/cpumask.h> + +#include <asm/atomic.h> +#include <asm/pgtable.h> + +struct mm_struct init_mm = { + .mm_rb = RB_ROOT, + .pgd = swapper_pg_dir, + .mm_users = ATOMIC_INIT(2), + .mm_count = ATOMIC_INIT(1), + .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem), + .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), + .mmlist = LIST_HEAD_INIT(init_mm.mmlist), + .cpu_vm_mask = CPU_MASK_ALL, +}; diff --git a/mm/internal.h b/mm/internal.h index 478223b73a2..f290c4db528 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -16,9 +16,6 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long floor, unsigned long ceiling); -extern void prep_compound_page(struct page *page, unsigned long order); -extern void prep_compound_gigantic_page(struct page *page, unsigned long order); - static inline void set_page_count(struct page *page, int v) { atomic_set(&page->_count, v); @@ -51,6 +48,8 @@ extern void putback_lru_page(struct page *page); */ extern unsigned long highest_memmap_pfn; extern void __free_pages_bootmem(struct page *page, unsigned int order); +extern void 
prep_compound_page(struct page *page, unsigned long order); + /* * function for dealing with page's order in buddy system. @@ -63,6 +62,7 @@ static inline unsigned long page_order(struct page *page) return page_private(page); } +#ifdef CONFIG_HAVE_MLOCK extern long mlock_vma_pages_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); extern void munlock_vma_pages_range(struct vm_area_struct *vma, @@ -71,8 +71,8 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma) { munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end); } +#endif -#ifdef CONFIG_UNEVICTABLE_LRU /* * unevictable_migrate_page() called only from migrate_page_copy() to * migrate unevictable flag to new page. @@ -84,13 +84,8 @@ static inline void unevictable_migrate_page(struct page *new, struct page *old) if (TestClearPageUnevictable(old)) SetPageUnevictable(new); } -#else -static inline void unevictable_migrate_page(struct page *new, struct page *old) -{ -} -#endif -#ifdef CONFIG_UNEVICTABLE_LRU +#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT /* * Called only in fault path via page_evictable() for a new page * to determine if it's being mapped into a LOCKED vma. @@ -148,24 +143,7 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page) } } -/* - * free_page_mlock() -- clean up attempts to free and mlocked() page. - * Page should not be on lru, so no need to fix that up. - * free_pages_check() will verify... - */ -static inline void free_page_mlock(struct page *page) -{ - if (unlikely(TestClearPageMlocked(page))) { - unsigned long flags; - - local_irq_save(flags); - __dec_zone_page_state(page, NR_MLOCK); - __count_vm_event(UNEVICTABLE_MLOCKFREED); - local_irq_restore(flags); - } -} - -#else /* CONFIG_UNEVICTABLE_LRU */ +#else /* CONFIG_HAVE_MLOCKED_PAGE_BIT */ static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) { return 0; @@ -173,9 +151,8 @@ static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) static inline void clear_page_mlock(struct page *page) { } static inline void mlock_vma_page(struct page *page) { } static inline void mlock_migrate_page(struct page *new, struct page *old) { } -static inline void free_page_mlock(struct page *page) { } -#endif /* CONFIG_UNEVICTABLE_LRU */ +#endif /* CONFIG_HAVE_MLOCKED_PAGE_BIT */ /* * Return the mem_map entry representing the 'offset' subpage within @@ -282,4 +259,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int len, int flags, struct page **pages, struct vm_area_struct **vmas); +#define ZONE_RECLAIM_NOSCAN -2 +#define ZONE_RECLAIM_FULL -1 +#define ZONE_RECLAIM_SOME 0 +#define ZONE_RECLAIM_SUCCESS 1 #endif diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c new file mode 100644 index 00000000000..fd814fd6131 --- /dev/null +++ b/mm/kmemcheck.c @@ -0,0 +1,122 @@ +#include <linux/gfp.h> +#include <linux/mm_types.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/kmemcheck.h> + +void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node) +{ + struct page *shadow; + int pages; + int i; + + pages = 1 << order; + + /* + * With kmemcheck enabled, we need to allocate a memory area for the + * shadow bits as well. 
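[Editor's note: a small usage illustration for the kmemcheck hooks added in this file; not part of the patch. A slab cache whose objects must never trigger kmemcheck page faults can opt out entirely with SLAB_NOTRACK, which kmemcheck_slab_alloc() below honours. The cache name and object size are placeholders.]

#include <linux/slab.h>

static struct kmem_cache *example_cachep;

static int __init example_cache_init(void)
{
	/* objects from this cache are neither shadowed nor checked */
	example_cachep = kmem_cache_create("example_notrack", 128, 0,
					   SLAB_NOTRACK, NULL);
	return example_cachep ? 0 : -ENOMEM;
}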
+ */ + shadow = alloc_pages_node(node, flags | __GFP_NOTRACK, order); + if (!shadow) { + if (printk_ratelimit()) + printk(KERN_ERR "kmemcheck: failed to allocate " + "shadow bitmap\n"); + return; + } + + for(i = 0; i < pages; ++i) + page[i].shadow = page_address(&shadow[i]); + + /* + * Mark it as non-present for the MMU so that our accesses to + * this memory will trigger a page fault and let us analyze + * the memory accesses. + */ + kmemcheck_hide_pages(page, pages); +} + +void kmemcheck_free_shadow(struct page *page, int order) +{ + struct page *shadow; + int pages; + int i; + + if (!kmemcheck_page_is_tracked(page)) + return; + + pages = 1 << order; + + kmemcheck_show_pages(page, pages); + + shadow = virt_to_page(page[0].shadow); + + for(i = 0; i < pages; ++i) + page[i].shadow = NULL; + + __free_pages(shadow, order); +} + +void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object, + size_t size) +{ + /* + * Has already been memset(), which initializes the shadow for us + * as well. + */ + if (gfpflags & __GFP_ZERO) + return; + + /* No need to initialize the shadow of a non-tracked slab. */ + if (s->flags & SLAB_NOTRACK) + return; + + if (!kmemcheck_enabled || gfpflags & __GFP_NOTRACK) { + /* + * Allow notracked objects to be allocated from + * tracked caches. Note however that these objects + * will still get page faults on access, they just + * won't ever be flagged as uninitialized. If page + * faults are not acceptable, the slab cache itself + * should be marked NOTRACK. + */ + kmemcheck_mark_initialized(object, size); + } else if (!s->ctor) { + /* + * New objects should be marked uninitialized before + * they're returned to the called. + */ + kmemcheck_mark_uninitialized(object, size); + } +} + +void kmemcheck_slab_free(struct kmem_cache *s, void *object, size_t size) +{ + /* TODO: RCU freeing is unsupported for now; hide false positives. */ + if (!s->ctor && !(s->flags & SLAB_DESTROY_BY_RCU)) + kmemcheck_mark_freed(object, size); +} + +void kmemcheck_pagealloc_alloc(struct page *page, unsigned int order, + gfp_t gfpflags) +{ + int pages; + + if (gfpflags & (__GFP_HIGHMEM | __GFP_NOTRACK)) + return; + + pages = 1 << order; + + /* + * NOTE: We choose to track GFP_ZERO pages too; in fact, they + * can become uninitialized by copying uninitialized memory + * into them. + */ + + /* XXX: Can use zone->node for node? */ + kmemcheck_alloc_shadow(page, order, gfpflags, -1); + + if (gfpflags & __GFP_ZERO) + kmemcheck_mark_initialized_pages(page, pages); + else + kmemcheck_mark_uninitialized_pages(page, pages); +} diff --git a/mm/kmemleak-test.c b/mm/kmemleak-test.c new file mode 100644 index 00000000000..d5292fc6f52 --- /dev/null +++ b/mm/kmemleak-test.c @@ -0,0 +1,111 @@ +/* + * mm/kmemleak-test.c + * + * Copyright (C) 2008 ARM Limited + * Written by Catalin Marinas <catalin.marinas@arm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/list.h> +#include <linux/percpu.h> +#include <linux/fdtable.h> + +#include <linux/kmemleak.h> + +struct test_node { + long header[25]; + struct list_head list; + long footer[25]; +}; + +static LIST_HEAD(test_list); +static DEFINE_PER_CPU(void *, test_pointer); + +/* + * Some very simple testing. This function needs to be extended for + * proper testing. + */ +static int __init kmemleak_test_init(void) +{ + struct test_node *elem; + int i; + + printk(KERN_INFO "Kmemleak testing\n"); + + /* make some orphan objects */ + pr_info("kmemleak: kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL)); + pr_info("kmemleak: kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL)); + pr_info("kmemleak: kmalloc(1024) = %p\n", kmalloc(1024, GFP_KERNEL)); + pr_info("kmemleak: kmalloc(1024) = %p\n", kmalloc(1024, GFP_KERNEL)); + pr_info("kmemleak: kmalloc(2048) = %p\n", kmalloc(2048, GFP_KERNEL)); + pr_info("kmemleak: kmalloc(2048) = %p\n", kmalloc(2048, GFP_KERNEL)); + pr_info("kmemleak: kmalloc(4096) = %p\n", kmalloc(4096, GFP_KERNEL)); + pr_info("kmemleak: kmalloc(4096) = %p\n", kmalloc(4096, GFP_KERNEL)); +#ifndef CONFIG_MODULES + pr_info("kmemleak: kmem_cache_alloc(files_cachep) = %p\n", + kmem_cache_alloc(files_cachep, GFP_KERNEL)); + pr_info("kmemleak: kmem_cache_alloc(files_cachep) = %p\n", + kmem_cache_alloc(files_cachep, GFP_KERNEL)); +#endif + pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64)); + pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64)); + pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64)); + pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64)); + pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64)); + + /* + * Add elements to a list. They should only appear as orphan + * after the module is removed. + */ + for (i = 0; i < 10; i++) { + elem = kmalloc(sizeof(*elem), GFP_KERNEL); + pr_info("kmemleak: kmalloc(sizeof(*elem)) = %p\n", elem); + if (!elem) + return -ENOMEM; + memset(elem, 0, sizeof(*elem)); + INIT_LIST_HEAD(&elem->list); + + list_add_tail(&elem->list, &test_list); + } + + for_each_possible_cpu(i) { + per_cpu(test_pointer, i) = kmalloc(129, GFP_KERNEL); + pr_info("kmemleak: kmalloc(129) = %p\n", + per_cpu(test_pointer, i)); + } + + return 0; +} +module_init(kmemleak_test_init); + +static void __exit kmemleak_test_exit(void) +{ + struct test_node *elem, *tmp; + + /* + * Remove the list elements without actually freeing the + * memory. + */ + list_for_each_entry_safe(elem, tmp, &test_list, list) + list_del(&elem->list); +} +module_exit(kmemleak_test_exit); + +MODULE_LICENSE("GPL"); diff --git a/mm/kmemleak.c b/mm/kmemleak.c new file mode 100644 index 00000000000..4ea4510e299 --- /dev/null +++ b/mm/kmemleak.c @@ -0,0 +1,1686 @@ +/* + * mm/kmemleak.c + * + * Copyright (C) 2008 ARM Limited + * Written by Catalin Marinas <catalin.marinas@arm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * + * For more information on the algorithm and kmemleak usage, please see + * Documentation/kmemleak.txt. + * + * Notes on locking + * ---------------- + * + * The following locks and mutexes are used by kmemleak: + * + * - kmemleak_lock (rwlock): protects the object_list modifications and + * accesses to the object_tree_root. The object_list is the main list + * holding the metadata (struct kmemleak_object) for the allocated memory + * blocks. The object_tree_root is a priority search tree used to look-up + * metadata based on a pointer to the corresponding memory block. The + * kmemleak_object structures are added to the object_list and + * object_tree_root in the create_object() function called from the + * kmemleak_alloc() callback and removed in delete_object() called from the + * kmemleak_free() callback + * - kmemleak_object.lock (spinlock): protects a kmemleak_object. Accesses to + * the metadata (e.g. count) are protected by this lock. Note that some + * members of this structure may be protected by other means (atomic or + * kmemleak_lock). This lock is also held when scanning the corresponding + * memory block to avoid the kernel freeing it via the kmemleak_free() + * callback. This is less heavyweight than holding a global lock like + * kmemleak_lock during scanning + * - scan_mutex (mutex): ensures that only one thread may scan the memory for + * unreferenced objects at a time. The gray_list contains the objects which + * are already referenced or marked as false positives and need to be + * scanned. This list is only modified during a scanning episode when the + * scan_mutex is held. At the end of a scan, the gray_list is always empty. + * Note that the kmemleak_object.use_count is incremented when an object is + * added to the gray_list and therefore cannot be freed. This mutex also + * prevents multiple users of the "kmemleak" debugfs file together with + * modifications to the memory scanning parameters including the scan_thread + * pointer + * + * The kmemleak_object structures have a use_count incremented or decremented + * using the get_object()/put_object() functions. When the use_count becomes + * 0, this count can no longer be incremented and put_object() schedules the + * kmemleak_object freeing via an RCU callback. All calls to the get_object() + * function must be protected by rcu_read_lock() to avoid accessing a freed + * structure. 
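[Editor's note: to make the reference-counting rule in the locking notes above concrete; a sketch, not part of the patch. The walker function is hypothetical but mirrors the pattern kmemleak_scan() uses later in this file: get_object() only under rcu_read_lock(), and every successful get balanced by put_object().]

static void example_walk_object_list(void)
{
	struct kmemleak_object *object;

	rcu_read_lock();
	list_for_each_entry_rcu(object, &object_list, object_list) {
		if (!get_object(object))
			continue;	/* use_count hit 0, object is being freed */
		/*
		 * The object is now guaranteed to stay allocated; take
		 * object->lock before reading mutable fields such as count.
		 */
		put_object(object);	/* balance the successful get */
	}
	rcu_read_unlock();
}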
+ */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/list.h> +#include <linux/sched.h> +#include <linux/jiffies.h> +#include <linux/delay.h> +#include <linux/module.h> +#include <linux/kthread.h> +#include <linux/prio_tree.h> +#include <linux/gfp.h> +#include <linux/fs.h> +#include <linux/debugfs.h> +#include <linux/seq_file.h> +#include <linux/cpumask.h> +#include <linux/spinlock.h> +#include <linux/mutex.h> +#include <linux/rcupdate.h> +#include <linux/stacktrace.h> +#include <linux/cache.h> +#include <linux/percpu.h> +#include <linux/hardirq.h> +#include <linux/mmzone.h> +#include <linux/slab.h> +#include <linux/thread_info.h> +#include <linux/err.h> +#include <linux/uaccess.h> +#include <linux/string.h> +#include <linux/nodemask.h> +#include <linux/mm.h> +#include <linux/workqueue.h> + +#include <asm/sections.h> +#include <asm/processor.h> +#include <asm/atomic.h> + +#include <linux/kmemcheck.h> +#include <linux/kmemleak.h> + +/* + * Kmemleak configuration and common defines. + */ +#define MAX_TRACE 16 /* stack trace length */ +#define MSECS_MIN_AGE 5000 /* minimum object age for reporting */ +#define SECS_FIRST_SCAN 60 /* delay before the first scan */ +#define SECS_SCAN_WAIT 600 /* subsequent auto scanning delay */ +#define GRAY_LIST_PASSES 25 /* maximum number of gray list scans */ +#define MAX_SCAN_SIZE 4096 /* maximum size of a scanned block */ + +#define BYTES_PER_POINTER sizeof(void *) + +/* GFP bitmask for kmemleak internal allocations */ +#define GFP_KMEMLEAK_MASK (GFP_KERNEL | GFP_ATOMIC) + +/* scanning area inside a memory block */ +struct kmemleak_scan_area { + struct hlist_node node; + unsigned long offset; + size_t length; +}; + +#define KMEMLEAK_GREY 0 +#define KMEMLEAK_BLACK -1 + +/* + * Structure holding the metadata for each allocated memory block. + * Modifications to such objects should be made while holding the + * object->lock. Insertions or deletions from object_list, gray_list or + * tree_node are already protected by the corresponding locks or mutex (see + * the notes on locking above). These objects are reference-counted + * (use_count) and freed using the RCU mechanism. 
+ */ +struct kmemleak_object { + spinlock_t lock; + unsigned long flags; /* object status flags */ + struct list_head object_list; + struct list_head gray_list; + struct prio_tree_node tree_node; + struct rcu_head rcu; /* object_list lockless traversal */ + /* object usage count; object freed when use_count == 0 */ + atomic_t use_count; + unsigned long pointer; + size_t size; + /* minimum number of a pointers found before it is considered leak */ + int min_count; + /* the total number of pointers found pointing to this object */ + int count; + /* memory ranges to be scanned inside an object (empty for all) */ + struct hlist_head area_list; + unsigned long trace[MAX_TRACE]; + unsigned int trace_len; + unsigned long jiffies; /* creation timestamp */ + pid_t pid; /* pid of the current task */ + char comm[TASK_COMM_LEN]; /* executable name */ +}; + +/* flag representing the memory block allocation status */ +#define OBJECT_ALLOCATED (1 << 0) +/* flag set after the first reporting of an unreference object */ +#define OBJECT_REPORTED (1 << 1) +/* flag set to not scan the object */ +#define OBJECT_NO_SCAN (1 << 2) +/* flag set on newly allocated objects */ +#define OBJECT_NEW (1 << 3) + +/* number of bytes to print per line; must be 16 or 32 */ +#define HEX_ROW_SIZE 16 +/* number of bytes to print at a time (1, 2, 4, 8) */ +#define HEX_GROUP_SIZE 1 +/* include ASCII after the hex output */ +#define HEX_ASCII 1 +/* max number of lines to be printed */ +#define HEX_MAX_LINES 2 + +/* the list of all allocated objects */ +static LIST_HEAD(object_list); +/* the list of gray-colored objects (see color_gray comment below) */ +static LIST_HEAD(gray_list); +/* prio search tree for object boundaries */ +static struct prio_tree_root object_tree_root; +/* rw_lock protecting the access to object_list and prio_tree_root */ +static DEFINE_RWLOCK(kmemleak_lock); + +/* allocation caches for kmemleak internal data */ +static struct kmem_cache *object_cache; +static struct kmem_cache *scan_area_cache; + +/* set if tracing memory operations is enabled */ +static atomic_t kmemleak_enabled = ATOMIC_INIT(0); +/* set in the late_initcall if there were no errors */ +static atomic_t kmemleak_initialized = ATOMIC_INIT(0); +/* enables or disables early logging of the memory operations */ +static atomic_t kmemleak_early_log = ATOMIC_INIT(1); +/* set if a fata kmemleak error has occurred */ +static atomic_t kmemleak_error = ATOMIC_INIT(0); + +/* minimum and maximum address that may be valid pointers */ +static unsigned long min_addr = ULONG_MAX; +static unsigned long max_addr; + +static struct task_struct *scan_thread; +/* used to avoid reporting of recently allocated objects */ +static unsigned long jiffies_min_age; +static unsigned long jiffies_last_scan; +/* delay between automatic memory scannings */ +static signed long jiffies_scan_wait; +/* enables or disables the task stacks scanning */ +static int kmemleak_stack_scan = 1; +/* protects the memory scanning, parameters and debug/kmemleak file access */ +static DEFINE_MUTEX(scan_mutex); + +/* + * Early object allocation/freeing logging. Kmemleak is initialized after the + * kernel allocator. However, both the kernel allocator and kmemleak may + * allocate memory blocks which need to be tracked. Kmemleak defines an + * arbitrary buffer to hold the allocation/freeing information before it is + * fully initialized. 
+ */ + +/* kmemleak operation type for early logging */ +enum { + KMEMLEAK_ALLOC, + KMEMLEAK_FREE, + KMEMLEAK_FREE_PART, + KMEMLEAK_NOT_LEAK, + KMEMLEAK_IGNORE, + KMEMLEAK_SCAN_AREA, + KMEMLEAK_NO_SCAN +}; + +/* + * Structure holding the information passed to kmemleak callbacks during the + * early logging. + */ +struct early_log { + int op_type; /* kmemleak operation type */ + const void *ptr; /* allocated/freed memory block */ + size_t size; /* memory block size */ + int min_count; /* minimum reference count */ + unsigned long offset; /* scan area offset */ + size_t length; /* scan area length */ + unsigned long trace[MAX_TRACE]; /* stack trace */ + unsigned int trace_len; /* stack trace length */ +}; + +/* early logging buffer and current position */ +static struct early_log + early_log[CONFIG_DEBUG_KMEMLEAK_EARLY_LOG_SIZE] __initdata; +static int crt_early_log __initdata; + +static void kmemleak_disable(void); + +/* + * Print a warning and dump the stack trace. + */ +#define kmemleak_warn(x...) do { \ + pr_warning(x); \ + dump_stack(); \ +} while (0) + +/* + * Macro invoked when a serious kmemleak condition occured and cannot be + * recovered from. Kmemleak will be disabled and further allocation/freeing + * tracing no longer available. + */ +#define kmemleak_stop(x...) do { \ + kmemleak_warn(x); \ + kmemleak_disable(); \ +} while (0) + +/* + * Printing of the objects hex dump to the seq file. The number of lines to be + * printed is limited to HEX_MAX_LINES to prevent seq file spamming. The + * actual number of printed bytes depends on HEX_ROW_SIZE. It must be called + * with the object->lock held. + */ +static void hex_dump_object(struct seq_file *seq, + struct kmemleak_object *object) +{ + const u8 *ptr = (const u8 *)object->pointer; + int i, len, remaining; + unsigned char linebuf[HEX_ROW_SIZE * 5]; + + /* limit the number of lines to HEX_MAX_LINES */ + remaining = len = + min(object->size, (size_t)(HEX_MAX_LINES * HEX_ROW_SIZE)); + + seq_printf(seq, " hex dump (first %d bytes):\n", len); + for (i = 0; i < len; i += HEX_ROW_SIZE) { + int linelen = min(remaining, HEX_ROW_SIZE); + + remaining -= HEX_ROW_SIZE; + hex_dump_to_buffer(ptr + i, linelen, HEX_ROW_SIZE, + HEX_GROUP_SIZE, linebuf, sizeof(linebuf), + HEX_ASCII); + seq_printf(seq, " %s\n", linebuf); + } +} + +/* + * Object colors, encoded with count and min_count: + * - white - orphan object, not enough references to it (count < min_count) + * - gray - not orphan, not marked as false positive (min_count == 0) or + * sufficient references to it (count >= min_count) + * - black - ignore, it doesn't contain references (e.g. text section) + * (min_count == -1). No function defined for this color. + * Newly created objects don't have any color assigned (object->count == -1) + * before the next memory scan when they become white. + */ +static bool color_white(const struct kmemleak_object *object) +{ + return object->count != KMEMLEAK_BLACK && + object->count < object->min_count; +} + +static bool color_gray(const struct kmemleak_object *object) +{ + return object->min_count != KMEMLEAK_BLACK && + object->count >= object->min_count; +} + +static bool color_black(const struct kmemleak_object *object) +{ + return object->min_count == KMEMLEAK_BLACK; +} + +/* + * Objects are considered unreferenced only if their color is white, they have + * not be deleted and have a minimum age to avoid false positives caused by + * pointers temporarily stored in CPU registers. 
+ */ +static bool unreferenced_object(struct kmemleak_object *object) +{ + return (object->flags & OBJECT_ALLOCATED) && color_white(object) && + time_before_eq(object->jiffies + jiffies_min_age, + jiffies_last_scan); +} + +/* + * Printing of the unreferenced objects information to the seq file. The + * print_unreferenced function must be called with the object->lock held. + */ +static void print_unreferenced(struct seq_file *seq, + struct kmemleak_object *object) +{ + int i; + + seq_printf(seq, "unreferenced object 0x%08lx (size %zu):\n", + object->pointer, object->size); + seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu\n", + object->comm, object->pid, object->jiffies); + hex_dump_object(seq, object); + seq_printf(seq, " backtrace:\n"); + + for (i = 0; i < object->trace_len; i++) { + void *ptr = (void *)object->trace[i]; + seq_printf(seq, " [<%p>] %pS\n", ptr, ptr); + } +} + +/* + * Print the kmemleak_object information. This function is used mainly for + * debugging special cases when kmemleak operations. It must be called with + * the object->lock held. + */ +static void dump_object_info(struct kmemleak_object *object) +{ + struct stack_trace trace; + + trace.nr_entries = object->trace_len; + trace.entries = object->trace; + + pr_notice("Object 0x%08lx (size %zu):\n", + object->tree_node.start, object->size); + pr_notice(" comm \"%s\", pid %d, jiffies %lu\n", + object->comm, object->pid, object->jiffies); + pr_notice(" min_count = %d\n", object->min_count); + pr_notice(" count = %d\n", object->count); + pr_notice(" flags = 0x%lx\n", object->flags); + pr_notice(" backtrace:\n"); + print_stack_trace(&trace, 4); +} + +/* + * Look-up a memory block metadata (kmemleak_object) in the priority search + * tree based on a pointer value. If alias is 0, only values pointing to the + * beginning of the memory block are allowed. The kmemleak_lock must be held + * when calling this function. + */ +static struct kmemleak_object *lookup_object(unsigned long ptr, int alias) +{ + struct prio_tree_node *node; + struct prio_tree_iter iter; + struct kmemleak_object *object; + + prio_tree_iter_init(&iter, &object_tree_root, ptr, ptr); + node = prio_tree_next(&iter); + if (node) { + object = prio_tree_entry(node, struct kmemleak_object, + tree_node); + if (!alias && object->pointer != ptr) { + kmemleak_warn("Found object by alias"); + object = NULL; + } + } else + object = NULL; + + return object; +} + +/* + * Increment the object use_count. Return 1 if successful or 0 otherwise. Note + * that once an object's use_count reached 0, the RCU freeing was already + * registered and the object should no longer be used. This function must be + * called under the protection of rcu_read_lock(). + */ +static int get_object(struct kmemleak_object *object) +{ + return atomic_inc_not_zero(&object->use_count); +} + +/* + * RCU callback to free a kmemleak_object. + */ +static void free_object_rcu(struct rcu_head *rcu) +{ + struct hlist_node *elem, *tmp; + struct kmemleak_scan_area *area; + struct kmemleak_object *object = + container_of(rcu, struct kmemleak_object, rcu); + + /* + * Once use_count is 0 (guaranteed by put_object), there is no other + * code accessing this object, hence no need for locking. + */ + hlist_for_each_entry_safe(area, elem, tmp, &object->area_list, node) { + hlist_del(elem); + kmem_cache_free(scan_area_cache, area); + } + kmem_cache_free(object_cache, object); +} + +/* + * Decrement the object use_count. Once the count is 0, free the object using + * an RCU callback. 
Since put_object() may be called via the kmemleak_free() -> + * delete_object() path, the delayed RCU freeing ensures that there is no + * recursive call to the kernel allocator. Lock-less RCU object_list traversal + * is also possible. + */ +static void put_object(struct kmemleak_object *object) +{ + if (!atomic_dec_and_test(&object->use_count)) + return; + + /* should only get here after delete_object was called */ + WARN_ON(object->flags & OBJECT_ALLOCATED); + + call_rcu(&object->rcu, free_object_rcu); +} + +/* + * Look up an object in the prio search tree and increase its use_count. + */ +static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias) +{ + unsigned long flags; + struct kmemleak_object *object = NULL; + + rcu_read_lock(); + read_lock_irqsave(&kmemleak_lock, flags); + if (ptr >= min_addr && ptr < max_addr) + object = lookup_object(ptr, alias); + read_unlock_irqrestore(&kmemleak_lock, flags); + + /* check whether the object is still available */ + if (object && !get_object(object)) + object = NULL; + rcu_read_unlock(); + + return object; +} + +/* + * Save stack trace to the given array of MAX_TRACE size. + */ +static int __save_stack_trace(unsigned long *trace) +{ + struct stack_trace stack_trace; + + stack_trace.max_entries = MAX_TRACE; + stack_trace.nr_entries = 0; + stack_trace.entries = trace; + stack_trace.skip = 2; + save_stack_trace(&stack_trace); + + return stack_trace.nr_entries; +} + +/* + * Create the metadata (struct kmemleak_object) corresponding to an allocated + * memory block and add it to the object_list and object_tree_root. + */ +static struct kmemleak_object *create_object(unsigned long ptr, size_t size, + int min_count, gfp_t gfp) +{ + unsigned long flags; + struct kmemleak_object *object; + struct prio_tree_node *node; + + object = kmem_cache_alloc(object_cache, gfp & GFP_KMEMLEAK_MASK); + if (!object) { + kmemleak_stop("Cannot allocate a kmemleak_object structure\n"); + return NULL; + } + + INIT_LIST_HEAD(&object->object_list); + INIT_LIST_HEAD(&object->gray_list); + INIT_HLIST_HEAD(&object->area_list); + spin_lock_init(&object->lock); + atomic_set(&object->use_count, 1); + object->flags = OBJECT_ALLOCATED | OBJECT_NEW; + object->pointer = ptr; + object->size = size; + object->min_count = min_count; + object->count = -1; /* no color initially */ + object->jiffies = jiffies; + + /* task information */ + if (in_irq()) { + object->pid = 0; + strncpy(object->comm, "hardirq", sizeof(object->comm)); + } else if (in_softirq()) { + object->pid = 0; + strncpy(object->comm, "softirq", sizeof(object->comm)); + } else { + object->pid = current->pid; + /* + * There is a small chance of a race with set_task_comm(), + * however using get_task_comm() here may cause locking + * dependency issues with current->alloc_lock. In the worst + * case, the command line is not correct. + */ + strncpy(object->comm, current->comm, sizeof(object->comm)); + } + + /* kernel backtrace */ + object->trace_len = __save_stack_trace(object->trace); + + INIT_PRIO_TREE_NODE(&object->tree_node); + object->tree_node.start = ptr; + object->tree_node.last = ptr + size - 1; + + write_lock_irqsave(&kmemleak_lock, flags); + + min_addr = min(min_addr, ptr); + max_addr = max(max_addr, ptr + size); + node = prio_tree_insert(&object_tree_root, &object->tree_node); + /* + * The code calling the kernel does not yet have the pointer to the + * memory block to be able to free it. 
However, we still hold the + * kmemleak_lock here in case parts of the kernel started freeing + * random memory blocks. + */ + if (node != &object->tree_node) { + kmemleak_stop("Cannot insert 0x%lx into the object search tree " + "(already existing)\n", ptr); + object = lookup_object(ptr, 1); + spin_lock(&object->lock); + dump_object_info(object); + spin_unlock(&object->lock); + + goto out; + } + list_add_tail_rcu(&object->object_list, &object_list); +out: + write_unlock_irqrestore(&kmemleak_lock, flags); + return object; +} + +/* + * Remove the metadata (struct kmemleak_object) for a memory block from the + * object_list and object_tree_root and decrement its use_count. + */ +static void __delete_object(struct kmemleak_object *object) +{ + unsigned long flags; + + write_lock_irqsave(&kmemleak_lock, flags); + prio_tree_remove(&object_tree_root, &object->tree_node); + list_del_rcu(&object->object_list); + write_unlock_irqrestore(&kmemleak_lock, flags); + + WARN_ON(!(object->flags & OBJECT_ALLOCATED)); + WARN_ON(atomic_read(&object->use_count) < 2); + + /* + * Locking here also ensures that the corresponding memory block + * cannot be freed when it is being scanned. + */ + spin_lock_irqsave(&object->lock, flags); + object->flags &= ~OBJECT_ALLOCATED; + spin_unlock_irqrestore(&object->lock, flags); + put_object(object); +} + +/* + * Look up the metadata (struct kmemleak_object) corresponding to ptr and + * delete it. + */ +static void delete_object_full(unsigned long ptr) +{ + struct kmemleak_object *object; + + object = find_and_get_object(ptr, 0); + if (!object) { +#ifdef DEBUG + kmemleak_warn("Freeing unknown object at 0x%08lx\n", + ptr); +#endif + return; + } + __delete_object(object); + put_object(object); +} + +/* + * Look up the metadata (struct kmemleak_object) corresponding to ptr and + * delete it. If the memory block is partially freed, the function may create + * additional metadata for the remaining parts of the block. + */ +static void delete_object_part(unsigned long ptr, size_t size) +{ + struct kmemleak_object *object; + unsigned long start, end; + + object = find_and_get_object(ptr, 1); + if (!object) { +#ifdef DEBUG + kmemleak_warn("Partially freeing unknown object at 0x%08lx " + "(size %zu)\n", ptr, size); +#endif + return; + } + __delete_object(object); + + /* + * Create one or two objects that may result from the memory block + * split. Note that partial freeing is only done by free_bootmem() and + * this happens before kmemleak_init() is called. The path below is + * only executed during early log recording in kmemleak_init(), so + * GFP_KERNEL is enough. 
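[Editor's note: for concreteness, the partial-free split described above works out as follows; a worked example with made-up addresses, not part of the patch.]

	/*
	 * Tracked object:        [0x1000, 0x1400)   size 0x400
	 * free_bootmem() frees:  [0x1100, 0x1200)   size 0x100
	 *
	 * delete_object_part() removes the original object, then
	 * create_object() re-creates metadata for the two survivors:
	 *   [0x1000, 0x1100)   because ptr > start
	 *   [0x1200, 0x1400)   because ptr + size < end
	 */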
+ */ + start = object->pointer; + end = object->pointer + object->size; + if (ptr > start) + create_object(start, ptr - start, object->min_count, + GFP_KERNEL); + if (ptr + size < end) + create_object(ptr + size, end - ptr - size, object->min_count, + GFP_KERNEL); + + put_object(object); +} + +static void __paint_it(struct kmemleak_object *object, int color) +{ + object->min_count = color; + if (color == KMEMLEAK_BLACK) + object->flags |= OBJECT_NO_SCAN; +} + +static void paint_it(struct kmemleak_object *object, int color) +{ + unsigned long flags; + + spin_lock_irqsave(&object->lock, flags); + __paint_it(object, color); + spin_unlock_irqrestore(&object->lock, flags); +} + +static void paint_ptr(unsigned long ptr, int color) +{ + struct kmemleak_object *object; + + object = find_and_get_object(ptr, 0); + if (!object) { + kmemleak_warn("Trying to color unknown object " + "at 0x%08lx as %s\n", ptr, + (color == KMEMLEAK_GREY) ? "Grey" : + (color == KMEMLEAK_BLACK) ? "Black" : "Unknown"); + return; + } + paint_it(object, color); + put_object(object); +} + +/* + * Make a object permanently as gray-colored so that it can no longer be + * reported as a leak. This is used in general to mark a false positive. + */ +static void make_gray_object(unsigned long ptr) +{ + paint_ptr(ptr, KMEMLEAK_GREY); +} + +/* + * Mark the object as black-colored so that it is ignored from scans and + * reporting. + */ +static void make_black_object(unsigned long ptr) +{ + paint_ptr(ptr, KMEMLEAK_BLACK); +} + +/* + * Add a scanning area to the object. If at least one such area is added, + * kmemleak will only scan these ranges rather than the whole memory block. + */ +static void add_scan_area(unsigned long ptr, unsigned long offset, + size_t length, gfp_t gfp) +{ + unsigned long flags; + struct kmemleak_object *object; + struct kmemleak_scan_area *area; + + object = find_and_get_object(ptr, 0); + if (!object) { + kmemleak_warn("Adding scan area to unknown object at 0x%08lx\n", + ptr); + return; + } + + area = kmem_cache_alloc(scan_area_cache, gfp & GFP_KMEMLEAK_MASK); + if (!area) { + kmemleak_warn("Cannot allocate a scan area\n"); + goto out; + } + + spin_lock_irqsave(&object->lock, flags); + if (offset + length > object->size) { + kmemleak_warn("Scan area larger than object 0x%08lx\n", ptr); + dump_object_info(object); + kmem_cache_free(scan_area_cache, area); + goto out_unlock; + } + + INIT_HLIST_NODE(&area->node); + area->offset = offset; + area->length = length; + + hlist_add_head(&area->node, &object->area_list); +out_unlock: + spin_unlock_irqrestore(&object->lock, flags); +out: + put_object(object); +} + +/* + * Set the OBJECT_NO_SCAN flag for the object corresponding to the give + * pointer. Such object will not be scanned by kmemleak but references to it + * are searched. + */ +static void object_no_scan(unsigned long ptr) +{ + unsigned long flags; + struct kmemleak_object *object; + + object = find_and_get_object(ptr, 0); + if (!object) { + kmemleak_warn("Not scanning unknown object at 0x%08lx\n", ptr); + return; + } + + spin_lock_irqsave(&object->lock, flags); + object->flags |= OBJECT_NO_SCAN; + spin_unlock_irqrestore(&object->lock, flags); + put_object(object); +} + +/* + * Log an early kmemleak_* call to the early_log buffer. These calls will be + * processed later once kmemleak is fully initialized. 
+ */ +static void __init log_early(int op_type, const void *ptr, size_t size, + int min_count, unsigned long offset, size_t length) +{ + unsigned long flags; + struct early_log *log; + + if (crt_early_log >= ARRAY_SIZE(early_log)) { + pr_warning("Early log buffer exceeded, " + "please increase DEBUG_KMEMLEAK_EARLY_LOG_SIZE\n"); + kmemleak_disable(); + return; + } + + /* + * There is no need for locking since the kernel is still in UP mode + * at this stage. Disabling the IRQs is enough. + */ + local_irq_save(flags); + log = &early_log[crt_early_log]; + log->op_type = op_type; + log->ptr = ptr; + log->size = size; + log->min_count = min_count; + log->offset = offset; + log->length = length; + if (op_type == KMEMLEAK_ALLOC) + log->trace_len = __save_stack_trace(log->trace); + crt_early_log++; + local_irq_restore(flags); +} + +/* + * Log an early allocated block and populate the stack trace. + */ +static void early_alloc(struct early_log *log) +{ + struct kmemleak_object *object; + unsigned long flags; + int i; + + if (!atomic_read(&kmemleak_enabled) || !log->ptr || IS_ERR(log->ptr)) + return; + + /* + * RCU locking needed to ensure object is not freed via put_object(). + */ + rcu_read_lock(); + object = create_object((unsigned long)log->ptr, log->size, + log->min_count, GFP_KERNEL); + spin_lock_irqsave(&object->lock, flags); + for (i = 0; i < log->trace_len; i++) + object->trace[i] = log->trace[i]; + object->trace_len = log->trace_len; + spin_unlock_irqrestore(&object->lock, flags); + rcu_read_unlock(); +} + +/* + * Memory allocation function callback. This function is called from the + * kernel allocators when a new block is allocated (kmem_cache_alloc, kmalloc, + * vmalloc etc.). + */ +void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count, + gfp_t gfp) +{ + pr_debug("%s(0x%p, %zu, %d)\n", __func__, ptr, size, min_count); + + if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) + create_object((unsigned long)ptr, size, min_count, gfp); + else if (atomic_read(&kmemleak_early_log)) + log_early(KMEMLEAK_ALLOC, ptr, size, min_count, 0, 0); +} +EXPORT_SYMBOL_GPL(kmemleak_alloc); + +/* + * Memory freeing function callback. This function is called from the kernel + * allocators when a block is freed (kmem_cache_free, kfree, vfree etc.). + */ +void __ref kmemleak_free(const void *ptr) +{ + pr_debug("%s(0x%p)\n", __func__, ptr); + + if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) + delete_object_full((unsigned long)ptr); + else if (atomic_read(&kmemleak_early_log)) + log_early(KMEMLEAK_FREE, ptr, 0, 0, 0, 0); +} +EXPORT_SYMBOL_GPL(kmemleak_free); + +/* + * Partial memory freeing function callback. This function is usually called + * from bootmem allocator when (part of) a memory block is freed. + */ +void __ref kmemleak_free_part(const void *ptr, size_t size) +{ + pr_debug("%s(0x%p)\n", __func__, ptr); + + if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) + delete_object_part((unsigned long)ptr, size); + else if (atomic_read(&kmemleak_early_log)) + log_early(KMEMLEAK_FREE_PART, ptr, size, 0, 0, 0); +} +EXPORT_SYMBOL_GPL(kmemleak_free_part); + +/* + * Mark an already allocated memory block as a false positive. This will cause + * the block to no longer be reported as leak and always be scanned. 
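[Editor's note: a typical caller of this annotation API, shown as an illustration; the driver-style names are hypothetical and not part of the patch. When the only long-lived reference to an allocation lives somewhere kmemleak cannot scan (for example a physical address programmed into a device), the owner marks the block as a known false positive. kmemleak_ignore(), further below, additionally excludes the block from scanning.]

#include <linux/slab.h>
#include <linux/kmemleak.h>

static int example_setup_dma_table(void)
{
	void *table = kmalloc(PAGE_SIZE, GFP_KERNEL);

	if (!table)
		return -ENOMEM;
	/*
	 * Assume only the table's physical address is handed to the
	 * hardware and the virtual pointer is discarded, so no scannable
	 * reference remains; tell kmemleak this is not a real leak.
	 */
	kmemleak_not_leak(table);
	return 0;
}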
+ */ +void __ref kmemleak_not_leak(const void *ptr) +{ + pr_debug("%s(0x%p)\n", __func__, ptr); + + if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) + make_gray_object((unsigned long)ptr); + else if (atomic_read(&kmemleak_early_log)) + log_early(KMEMLEAK_NOT_LEAK, ptr, 0, 0, 0, 0); +} +EXPORT_SYMBOL(kmemleak_not_leak); + +/* + * Ignore a memory block. This is usually done when it is known that the + * corresponding block is not a leak and does not contain any references to + * other allocated memory blocks. + */ +void __ref kmemleak_ignore(const void *ptr) +{ + pr_debug("%s(0x%p)\n", __func__, ptr); + + if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) + make_black_object((unsigned long)ptr); + else if (atomic_read(&kmemleak_early_log)) + log_early(KMEMLEAK_IGNORE, ptr, 0, 0, 0, 0); +} +EXPORT_SYMBOL(kmemleak_ignore); + +/* + * Limit the range to be scanned in an allocated memory block. + */ +void __ref kmemleak_scan_area(const void *ptr, unsigned long offset, + size_t length, gfp_t gfp) +{ + pr_debug("%s(0x%p)\n", __func__, ptr); + + if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) + add_scan_area((unsigned long)ptr, offset, length, gfp); + else if (atomic_read(&kmemleak_early_log)) + log_early(KMEMLEAK_SCAN_AREA, ptr, 0, 0, offset, length); +} +EXPORT_SYMBOL(kmemleak_scan_area); + +/* + * Inform kmemleak not to scan the given memory block. + */ +void __ref kmemleak_no_scan(const void *ptr) +{ + pr_debug("%s(0x%p)\n", __func__, ptr); + + if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) + object_no_scan((unsigned long)ptr); + else if (atomic_read(&kmemleak_early_log)) + log_early(KMEMLEAK_NO_SCAN, ptr, 0, 0, 0, 0); +} +EXPORT_SYMBOL(kmemleak_no_scan); + +/* + * Memory scanning is a long process and it needs to be interruptable. This + * function checks whether such interrupt condition occured. + */ +static int scan_should_stop(void) +{ + if (!atomic_read(&kmemleak_enabled)) + return 1; + + /* + * This function may be called from either process or kthread context, + * hence the need to check for both stop conditions. + */ + if (current->mm) + return signal_pending(current); + else + return kthread_should_stop(); + + return 0; +} + +/* + * Scan a memory block (exclusive range) for valid pointers and add those + * found to the gray list. + */ +static void scan_block(void *_start, void *_end, + struct kmemleak_object *scanned, int allow_resched) +{ + unsigned long *ptr; + unsigned long *start = PTR_ALIGN(_start, BYTES_PER_POINTER); + unsigned long *end = _end - (BYTES_PER_POINTER - 1); + + for (ptr = start; ptr < end; ptr++) { + struct kmemleak_object *object; + unsigned long flags; + unsigned long pointer; + + if (allow_resched) + cond_resched(); + if (scan_should_stop()) + break; + + /* don't scan uninitialized memory */ + if (!kmemcheck_is_obj_initialized((unsigned long)ptr, + BYTES_PER_POINTER)) + continue; + + pointer = *ptr; + + object = find_and_get_object(pointer, 1); + if (!object) + continue; + if (object == scanned) { + /* self referenced, ignore */ + put_object(object); + continue; + } + + /* + * Avoid the lockdep recursive warning on object->lock being + * previously acquired in scan_object(). These locks are + * enclosed by scan_mutex. 
+ */ + spin_lock_irqsave_nested(&object->lock, flags, + SINGLE_DEPTH_NESTING); + if (!color_white(object)) { + /* non-orphan, ignored or new */ + spin_unlock_irqrestore(&object->lock, flags); + put_object(object); + continue; + } + + /* + * Increase the object's reference count (number of pointers + * to the memory block). If this count reaches the required + * minimum, the object's color will become gray and it will be + * added to the gray_list. + */ + object->count++; + if (color_gray(object)) + list_add_tail(&object->gray_list, &gray_list); + else + put_object(object); + spin_unlock_irqrestore(&object->lock, flags); + } +} + +/* + * Scan a memory block corresponding to a kmemleak_object. A condition is + * that object->use_count >= 1. + */ +static void scan_object(struct kmemleak_object *object) +{ + struct kmemleak_scan_area *area; + struct hlist_node *elem; + unsigned long flags; + + /* + * Once the object->lock is aquired, the corresponding memory block + * cannot be freed (the same lock is aquired in delete_object). + */ + spin_lock_irqsave(&object->lock, flags); + if (object->flags & OBJECT_NO_SCAN) + goto out; + if (!(object->flags & OBJECT_ALLOCATED)) + /* already freed object */ + goto out; + if (hlist_empty(&object->area_list)) { + void *start = (void *)object->pointer; + void *end = (void *)(object->pointer + object->size); + + while (start < end && (object->flags & OBJECT_ALLOCATED) && + !(object->flags & OBJECT_NO_SCAN)) { + scan_block(start, min(start + MAX_SCAN_SIZE, end), + object, 0); + start += MAX_SCAN_SIZE; + + spin_unlock_irqrestore(&object->lock, flags); + cond_resched(); + spin_lock_irqsave(&object->lock, flags); + } + } else + hlist_for_each_entry(area, elem, &object->area_list, node) + scan_block((void *)(object->pointer + area->offset), + (void *)(object->pointer + area->offset + + area->length), object, 0); +out: + spin_unlock_irqrestore(&object->lock, flags); +} + +/* + * Scan data sections and all the referenced memory blocks allocated via the + * kernel's standard allocators. This function must be called with the + * scan_mutex held. + */ +static void kmemleak_scan(void) +{ + unsigned long flags; + struct kmemleak_object *object, *tmp; + int i; + int new_leaks = 0; + int gray_list_pass = 0; + + jiffies_last_scan = jiffies; + + /* prepare the kmemleak_object's */ + rcu_read_lock(); + list_for_each_entry_rcu(object, &object_list, object_list) { + spin_lock_irqsave(&object->lock, flags); +#ifdef DEBUG + /* + * With a few exceptions there should be a maximum of + * 1 reference to any object at this point. + */ + if (atomic_read(&object->use_count) > 1) { + pr_debug("object->use_count = %d\n", + atomic_read(&object->use_count)); + dump_object_info(object); + } +#endif + /* reset the reference count (whiten the object) */ + object->count = 0; + object->flags &= ~OBJECT_NEW; + if (color_gray(object) && get_object(object)) + list_add_tail(&object->gray_list, &gray_list); + + spin_unlock_irqrestore(&object->lock, flags); + } + rcu_read_unlock(); + + /* data/bss scanning */ + scan_block(_sdata, _edata, NULL, 1); + scan_block(__bss_start, __bss_stop, NULL, 1); + +#ifdef CONFIG_SMP + /* per-cpu sections scanning */ + for_each_possible_cpu(i) + scan_block(__per_cpu_start + per_cpu_offset(i), + __per_cpu_end + per_cpu_offset(i), NULL, 1); +#endif + + /* + * Struct page scanning for each node. The code below is not yet safe + * with MEMORY_HOTPLUG. 
+ */ + for_each_online_node(i) { + pg_data_t *pgdat = NODE_DATA(i); + unsigned long start_pfn = pgdat->node_start_pfn; + unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages; + unsigned long pfn; + + for (pfn = start_pfn; pfn < end_pfn; pfn++) { + struct page *page; + + if (!pfn_valid(pfn)) + continue; + page = pfn_to_page(pfn); + /* only scan if page is in use */ + if (page_count(page) == 0) + continue; + scan_block(page, page + 1, NULL, 1); + } + } + + /* + * Scanning the task stacks (may introduce false negatives). + */ + if (kmemleak_stack_scan) { + struct task_struct *p, *g; + + read_lock(&tasklist_lock); + do_each_thread(g, p) { + scan_block(task_stack_page(p), task_stack_page(p) + + THREAD_SIZE, NULL, 0); + } while_each_thread(g, p); + read_unlock(&tasklist_lock); + } + + /* + * Scan the objects already referenced from the sections scanned + * above. More objects will be referenced and, if there are no memory + * leaks, all the objects will be scanned. The list traversal is safe + * for both tail additions and removals from inside the loop. The + * kmemleak objects cannot be freed from outside the loop because their + * use_count was increased. + */ +repeat: + object = list_entry(gray_list.next, typeof(*object), gray_list); + while (&object->gray_list != &gray_list) { + cond_resched(); + + /* may add new objects to the list */ + if (!scan_should_stop()) + scan_object(object); + + tmp = list_entry(object->gray_list.next, typeof(*object), + gray_list); + + /* remove the object from the list and release it */ + list_del(&object->gray_list); + put_object(object); + + object = tmp; + } + + if (scan_should_stop() || ++gray_list_pass >= GRAY_LIST_PASSES) + goto scan_end; + + /* + * Check for new objects allocated during this scanning and add them + * to the gray list. + */ + rcu_read_lock(); + list_for_each_entry_rcu(object, &object_list, object_list) { + spin_lock_irqsave(&object->lock, flags); + if ((object->flags & OBJECT_NEW) && !color_black(object) && + get_object(object)) { + object->flags &= ~OBJECT_NEW; + list_add_tail(&object->gray_list, &gray_list); + } + spin_unlock_irqrestore(&object->lock, flags); + } + rcu_read_unlock(); + + if (!list_empty(&gray_list)) + goto repeat; + +scan_end: + WARN_ON(!list_empty(&gray_list)); + + /* + * If scanning was stopped or new objects were being allocated at a + * higher rate than gray list scanning, do not report any new + * unreferenced objects. + */ + if (scan_should_stop() || gray_list_pass >= GRAY_LIST_PASSES) + return; + + /* + * Scanning result reporting. + */ + rcu_read_lock(); + list_for_each_entry_rcu(object, &object_list, object_list) { + spin_lock_irqsave(&object->lock, flags); + if (unreferenced_object(object) && + !(object->flags & OBJECT_REPORTED)) { + object->flags |= OBJECT_REPORTED; + new_leaks++; + } + spin_unlock_irqrestore(&object->lock, flags); + } + rcu_read_unlock(); + + if (new_leaks) + pr_info("%d new suspected memory leaks (see " + "/sys/kernel/debug/kmemleak)\n", new_leaks); + +} + +/* + * Thread function performing automatic memory scanning. Unreferenced objects + * at the end of a memory scan are reported but only the first time. + */ +static int kmemleak_scan_thread(void *arg) +{ + static int first_run = 1; + + pr_info("Automatic memory scanning thread started\n"); + set_user_nice(current, 10); + + /* + * Wait before the first scan to allow the system to fully initialize. 
+ */ + if (first_run) { + first_run = 0; + ssleep(SECS_FIRST_SCAN); + } + + while (!kthread_should_stop()) { + signed long timeout = jiffies_scan_wait; + + mutex_lock(&scan_mutex); + kmemleak_scan(); + mutex_unlock(&scan_mutex); + + /* wait before the next scan */ + while (timeout && !kthread_should_stop()) + timeout = schedule_timeout_interruptible(timeout); + } + + pr_info("Automatic memory scanning thread ended\n"); + + return 0; +} + +/* + * Start the automatic memory scanning thread. This function must be called + * with the scan_mutex held. + */ +static void start_scan_thread(void) +{ + if (scan_thread) + return; + scan_thread = kthread_run(kmemleak_scan_thread, NULL, "kmemleak"); + if (IS_ERR(scan_thread)) { + pr_warning("Failed to create the scan thread\n"); + scan_thread = NULL; + } +} + +/* + * Stop the automatic memory scanning thread. This function must be called + * with the scan_mutex held. + */ +static void stop_scan_thread(void) +{ + if (scan_thread) { + kthread_stop(scan_thread); + scan_thread = NULL; + } +} + +/* + * Iterate over the object_list and return the first valid object at or after + * the required position with its use_count incremented. The function triggers + * a memory scanning when the pos argument points to the first position. + */ +static void *kmemleak_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct kmemleak_object *object; + loff_t n = *pos; + int err; + + err = mutex_lock_interruptible(&scan_mutex); + if (err < 0) + return ERR_PTR(err); + + rcu_read_lock(); + list_for_each_entry_rcu(object, &object_list, object_list) { + if (n-- > 0) + continue; + if (get_object(object)) + goto out; + } + object = NULL; +out: + return object; +} + +/* + * Return the next object in the object_list. The function decrements the + * use_count of the previous object and increases that of the next one. + */ +static void *kmemleak_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct kmemleak_object *prev_obj = v; + struct kmemleak_object *next_obj = NULL; + struct list_head *n = &prev_obj->object_list; + + ++(*pos); + + list_for_each_continue_rcu(n, &object_list) { + next_obj = list_entry(n, struct kmemleak_object, object_list); + if (get_object(next_obj)) + break; + } + + put_object(prev_obj); + return next_obj; +} + +/* + * Decrement the use_count of the last object required, if any. + */ +static void kmemleak_seq_stop(struct seq_file *seq, void *v) +{ + if (!IS_ERR(v)) { + /* + * kmemleak_seq_start may return ERR_PTR if the scan_mutex + * waiting was interrupted, so only release it if !IS_ERR. + */ + rcu_read_unlock(); + mutex_unlock(&scan_mutex); + if (v) + put_object(v); + } +} + +/* + * Print the information for an unreferenced object to the seq file. 
+ */ +static int kmemleak_seq_show(struct seq_file *seq, void *v) +{ + struct kmemleak_object *object = v; + unsigned long flags; + + spin_lock_irqsave(&object->lock, flags); + if ((object->flags & OBJECT_REPORTED) && unreferenced_object(object)) + print_unreferenced(seq, object); + spin_unlock_irqrestore(&object->lock, flags); + return 0; +} + +static const struct seq_operations kmemleak_seq_ops = { + .start = kmemleak_seq_start, + .next = kmemleak_seq_next, + .stop = kmemleak_seq_stop, + .show = kmemleak_seq_show, +}; + +static int kmemleak_open(struct inode *inode, struct file *file) +{ + if (!atomic_read(&kmemleak_enabled)) + return -EBUSY; + + return seq_open(file, &kmemleak_seq_ops); +} + +static int kmemleak_release(struct inode *inode, struct file *file) +{ + return seq_release(inode, file); +} + +static int dump_str_object_info(const char *str) +{ + unsigned long flags; + struct kmemleak_object *object; + unsigned long addr; + + addr= simple_strtoul(str, NULL, 0); + object = find_and_get_object(addr, 0); + if (!object) { + pr_info("Unknown object at 0x%08lx\n", addr); + return -EINVAL; + } + + spin_lock_irqsave(&object->lock, flags); + dump_object_info(object); + spin_unlock_irqrestore(&object->lock, flags); + + put_object(object); + return 0; +} + +/* + * We use grey instead of black to ensure we can do future scans on the same + * objects. If we did not do future scans these black objects could + * potentially contain references to newly allocated objects in the future and + * we'd end up with false positives. + */ +static void kmemleak_clear(void) +{ + struct kmemleak_object *object; + unsigned long flags; + + rcu_read_lock(); + list_for_each_entry_rcu(object, &object_list, object_list) { + spin_lock_irqsave(&object->lock, flags); + if ((object->flags & OBJECT_REPORTED) && + unreferenced_object(object)) + __paint_it(object, KMEMLEAK_GREY); + spin_unlock_irqrestore(&object->lock, flags); + } + rcu_read_unlock(); +} + +/* + * File write operation to configure kmemleak at run-time. The following + * commands can be written to the /sys/kernel/debug/kmemleak file: + * off - disable kmemleak (irreversible) + * stack=on - enable the task stacks scanning + * stack=off - disable the tasks stacks scanning + * scan=on - start the automatic memory scanning thread + * scan=off - stop the automatic memory scanning thread + * scan=... - set the automatic memory scanning period in seconds (0 to + * disable it) + * scan - trigger a memory scan + * clear - mark all current reported unreferenced kmemleak objects as + * grey to ignore printing them + * dump=... 
- dump information about the object found at the given address + */ +static ssize_t kmemleak_write(struct file *file, const char __user *user_buf, + size_t size, loff_t *ppos) +{ + char buf[64]; + int buf_size; + int ret; + + buf_size = min(size, (sizeof(buf) - 1)); + if (strncpy_from_user(buf, user_buf, buf_size) < 0) + return -EFAULT; + buf[buf_size] = 0; + + ret = mutex_lock_interruptible(&scan_mutex); + if (ret < 0) + return ret; + + if (strncmp(buf, "off", 3) == 0) + kmemleak_disable(); + else if (strncmp(buf, "stack=on", 8) == 0) + kmemleak_stack_scan = 1; + else if (strncmp(buf, "stack=off", 9) == 0) + kmemleak_stack_scan = 0; + else if (strncmp(buf, "scan=on", 7) == 0) + start_scan_thread(); + else if (strncmp(buf, "scan=off", 8) == 0) + stop_scan_thread(); + else if (strncmp(buf, "scan=", 5) == 0) { + unsigned long secs; + + ret = strict_strtoul(buf + 5, 0, &secs); + if (ret < 0) + goto out; + stop_scan_thread(); + if (secs) { + jiffies_scan_wait = msecs_to_jiffies(secs * 1000); + start_scan_thread(); + } + } else if (strncmp(buf, "scan", 4) == 0) + kmemleak_scan(); + else if (strncmp(buf, "clear", 5) == 0) + kmemleak_clear(); + else if (strncmp(buf, "dump=", 5) == 0) + ret = dump_str_object_info(buf + 5); + else + ret = -EINVAL; + +out: + mutex_unlock(&scan_mutex); + if (ret < 0) + return ret; + + /* ignore the rest of the buffer, only one command at a time */ + *ppos += size; + return size; +} + +static const struct file_operations kmemleak_fops = { + .owner = THIS_MODULE, + .open = kmemleak_open, + .read = seq_read, + .write = kmemleak_write, + .llseek = seq_lseek, + .release = kmemleak_release, +}; + +/* + * Perform the freeing of the kmemleak internal objects after waiting for any + * current memory scan to complete. + */ +static void kmemleak_do_cleanup(struct work_struct *work) +{ + struct kmemleak_object *object; + + mutex_lock(&scan_mutex); + stop_scan_thread(); + + rcu_read_lock(); + list_for_each_entry_rcu(object, &object_list, object_list) + delete_object_full(object->pointer); + rcu_read_unlock(); + mutex_unlock(&scan_mutex); +} + +static DECLARE_WORK(cleanup_work, kmemleak_do_cleanup); + +/* + * Disable kmemleak. No memory allocation/freeing will be traced once this + * function is called. Disabling kmemleak is an irreversible operation. + */ +static void kmemleak_disable(void) +{ + /* atomically check whether it was already invoked */ + if (atomic_cmpxchg(&kmemleak_error, 0, 1)) + return; + + /* stop any memory operation tracing */ + atomic_set(&kmemleak_early_log, 0); + atomic_set(&kmemleak_enabled, 0); + + /* check whether it is too early for a kernel thread */ + if (atomic_read(&kmemleak_initialized)) + schedule_work(&cleanup_work); + + pr_info("Kernel memory leak detector disabled\n"); +} + +/* + * Allow boot-time kmemleak disabling (enabled by default). + */ +static int kmemleak_boot_config(char *str) +{ + if (!str) + return -EINVAL; + if (strcmp(str, "off") == 0) + kmemleak_disable(); + else if (strcmp(str, "on") != 0) + return -EINVAL; + return 0; +} +early_param("kmemleak", kmemleak_boot_config); + +/* + * Kmemleak initialization. 
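As a point of reference for the command strings listed above, the debugfs file is normally driven from userspace with something as simple as "echo scan > /sys/kernel/debug/kmemleak". A hypothetical programmatic equivalent, which triggers a scan and then reads back any reported leaks (it assumes debugfs is mounted at /sys/kernel/debug), could look like this:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char line[256];
	FILE *f;
	int fd;

	fd = open("/sys/kernel/debug/kmemleak", O_WRONLY);
	if (fd < 0)
		return 1;
	if (write(fd, "scan", 4) != 4)	/* any of the commands above works here */
		perror("kmemleak");
	close(fd);

	f = fopen("/sys/kernel/debug/kmemleak", "r");
	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))	/* dump the currently reported leaks */
		fputs(line, stdout);
	fclose(f);
	return 0;
}

Reading the file goes through the seq_file operations added earlier, so only objects already flagged as unreferenced are printed.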
+ */ +void __init kmemleak_init(void) +{ + int i; + unsigned long flags; + + jiffies_min_age = msecs_to_jiffies(MSECS_MIN_AGE); + jiffies_scan_wait = msecs_to_jiffies(SECS_SCAN_WAIT * 1000); + + object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE); + scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE); + INIT_PRIO_TREE_ROOT(&object_tree_root); + + /* the kernel is still in UP mode, so disabling the IRQs is enough */ + local_irq_save(flags); + if (!atomic_read(&kmemleak_error)) { + atomic_set(&kmemleak_enabled, 1); + atomic_set(&kmemleak_early_log, 0); + } + local_irq_restore(flags); + + /* + * This is the point where tracking allocations is safe. Automatic + * scanning is started during the late initcall. Add the early logged + * callbacks to the kmemleak infrastructure. + */ + for (i = 0; i < crt_early_log; i++) { + struct early_log *log = &early_log[i]; + + switch (log->op_type) { + case KMEMLEAK_ALLOC: + early_alloc(log); + break; + case KMEMLEAK_FREE: + kmemleak_free(log->ptr); + break; + case KMEMLEAK_FREE_PART: + kmemleak_free_part(log->ptr, log->size); + break; + case KMEMLEAK_NOT_LEAK: + kmemleak_not_leak(log->ptr); + break; + case KMEMLEAK_IGNORE: + kmemleak_ignore(log->ptr); + break; + case KMEMLEAK_SCAN_AREA: + kmemleak_scan_area(log->ptr, log->offset, log->length, + GFP_KERNEL); + break; + case KMEMLEAK_NO_SCAN: + kmemleak_no_scan(log->ptr); + break; + default: + WARN_ON(1); + } + } +} + +/* + * Late initialization function. + */ +static int __init kmemleak_late_init(void) +{ + struct dentry *dentry; + + atomic_set(&kmemleak_initialized, 1); + + if (atomic_read(&kmemleak_error)) { + /* + * Some error occured and kmemleak was disabled. There is a + * small chance that kmemleak_disable() was called immediately + * after setting kmemleak_initialized and we may end up with + * two clean-up threads but serialized by scan_mutex. + */ + schedule_work(&cleanup_work); + return -ENOMEM; + } + + dentry = debugfs_create_file("kmemleak", S_IRUGO, NULL, NULL, + &kmemleak_fops); + if (!dentry) + pr_warning("Failed to create the debugfs kmemleak file\n"); + mutex_lock(&scan_mutex); + start_scan_thread(); + mutex_unlock(&scan_mutex); + + pr_info("Kernel memory leak detector initialized\n"); + + return 0; +} +late_initcall(kmemleak_late_init); diff --git a/mm/maccess.c b/mm/maccess.c index ac40796cfb1..9073695ff25 100644 --- a/mm/maccess.c +++ b/mm/maccess.c @@ -39,7 +39,7 @@ EXPORT_SYMBOL_GPL(probe_kernel_read); * Safely write to address @dst from the buffer at @src. If a kernel fault * happens, handle that and return -EFAULT. 
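Since the kerneldoc here spells out the probe_kernel_write() contract (the mm/maccess.c change itself only marks the function notrace and __weak so that architectures can override it), a short illustrative caller, not taken from the tree, would be:

#include <linux/kernel.h>
#include <linux/uaccess.h>

/* illustrative caller: poke a value into a possibly invalid kernel address
 * without risking an oops; a plain memcpy() here could fault fatally */
static long poke_kernel_long(void *dst, long val)
{
	long err = probe_kernel_write(dst, &val, sizeof(val));

	if (err)
		pr_debug("%p not writable (%ld)\n", dst, err);
	return err;
}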
*/ -long probe_kernel_write(void *dst, void *src, size_t size) +long notrace __weak probe_kernel_write(void *dst, void *src, size_t size) { long ret; mm_segment_t old_fs = get_fs(); diff --git a/mm/madvise.c b/mm/madvise.c index b9ce574827c..76eb4193acd 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -123,8 +123,7 @@ static long madvise_willneed(struct vm_area_struct * vma, end = vma->vm_end; end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; - force_page_cache_readahead(file->f_mapping, - file, start, max_sane_readahead(end - start)); + force_page_cache_readahead(file->f_mapping, file, start, end - start); return 0; } @@ -239,12 +238,30 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, break; default: - error = -EINVAL; + BUG(); break; } return error; } +static int +madvise_behavior_valid(int behavior) +{ + switch (behavior) { + case MADV_DOFORK: + case MADV_DONTFORK: + case MADV_NORMAL: + case MADV_SEQUENTIAL: + case MADV_RANDOM: + case MADV_REMOVE: + case MADV_WILLNEED: + case MADV_DONTNEED: + return 1; + + default: + return 0; + } +} /* * The madvise(2) system call. * @@ -290,6 +307,9 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) int write; size_t len; + if (!madvise_behavior_valid(behavior)) + return error; + write = madvise_need_mmap_write(behavior); if (write) down_write(¤t->mm->mmap_sem); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4d0ea3ceba6..fd4529d86de 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -27,6 +27,7 @@ #include <linux/backing-dev.h> #include <linux/bit_spinlock.h> #include <linux/rcupdate.h> +#include <linux/limits.h> #include <linux/mutex.h> #include <linux/slab.h> #include <linux/swap.h> @@ -44,7 +45,7 @@ struct cgroup_subsys mem_cgroup_subsys __read_mostly; #define MEM_CGROUP_RECLAIM_RETRIES 5 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP -/* Turned on only when memory cgroup is enabled && really_do_swap_account = 0 */ +/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ int do_swap_account __read_mostly; static int really_do_swap_account __initdata = 1; /* for remember boot option*/ #else @@ -61,7 +62,8 @@ enum mem_cgroup_stat_index { * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss. */ MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ - MEM_CGROUP_STAT_RSS, /* # of pages charged as rss */ + MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ + MEM_CGROUP_STAT_MAPPED_FILE, /* # of pages charged as file rss */ MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ @@ -95,6 +97,15 @@ static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat, return ret; } +static s64 mem_cgroup_local_usage(struct mem_cgroup_stat *stat) +{ + s64 ret; + + ret = mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_CACHE); + ret += mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_RSS); + return ret; +} + /* * per-zone information in memory controller. */ @@ -154,9 +165,9 @@ struct mem_cgroup { /* * While reclaiming in a hiearchy, we cache the last child we - * reclaimed from. Protected by hierarchy_mutex + * reclaimed from. */ - struct mem_cgroup *last_scanned_child; + int last_scanned_child; /* * Should the accounting and control be hierarchical, per subtree? */ @@ -166,6 +177,9 @@ struct mem_cgroup { unsigned int swappiness; + /* set when res.limit == memsw.limit */ + bool memsw_is_minimum; + /* * statistics. This must be placed at the end of memcg. 
*/ @@ -178,6 +192,7 @@ enum charge_type { MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */ MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */ MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */ + MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */ NR_CHARGE_TYPE, }; @@ -202,6 +217,7 @@ pcg_default_flags[NR_CHARGE_TYPE] = { static void mem_cgroup_get(struct mem_cgroup *mem); static void mem_cgroup_put(struct mem_cgroup *mem); +static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, struct page_cgroup *pc, @@ -246,7 +262,7 @@ page_cgroup_zoneinfo(struct page_cgroup *pc) return mem_cgroup_zoneinfo(mem, nid, zid); } -static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem, +static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, enum lru_list idx) { int nid, zid; @@ -285,6 +301,9 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) { struct mem_cgroup *mem = NULL; + + if (!mm) + return NULL; /* * Because we have no locks, mm->owner's may be being moved to other * cgroup. We use css_tryget() here even if this looks @@ -300,11 +319,39 @@ static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) return mem; } -static bool mem_cgroup_is_obsolete(struct mem_cgroup *mem) +/* + * Call callback function against all cgroup under hierarchy tree. + */ +static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data, + int (*func)(struct mem_cgroup *, void *)) { - if (!mem) - return true; - return css_is_removed(&mem->css); + int found, ret, nextid; + struct cgroup_subsys_state *css; + struct mem_cgroup *mem; + + if (!root->use_hierarchy) + return (*func)(root, data); + + nextid = 1; + do { + ret = 0; + mem = NULL; + + rcu_read_lock(); + css = css_get_next(&mem_cgroup_subsys, nextid, &root->css, + &found); + if (css && css_tryget(css)) + mem = container_of(css, struct mem_cgroup, css); + rcu_read_unlock(); + + if (mem) { + ret = (*func)(mem, data); + css_put(&mem->css); + } + nextid = found + 1; + } while (!ret && css); + + return ret; } /* @@ -440,31 +487,24 @@ void mem_cgroup_move_lists(struct page *page, int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) { int ret; + struct mem_cgroup *curr = NULL; task_lock(task); - ret = task->mm && mm_match_cgroup(task->mm, mem); + rcu_read_lock(); + curr = try_get_mem_cgroup_from_mm(task->mm); + rcu_read_unlock(); task_unlock(task); + if (!curr) + return 0; + if (curr->use_hierarchy) + ret = css_is_ancestor(&curr->css, &mem->css); + else + ret = (curr == mem); + css_put(&curr->css); return ret; } /* - * Calculate mapped_ratio under memory controller. This will be used in - * vmscan.c for deteremining we have to reclaim mapped pages. - */ -int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem) -{ - long total, rss; - - /* - * usage is recorded in bytes. But, here, we assume the number of - * physical pages can be represented by "long" on any arch. - */ - total = (long) (mem->res.usage >> PAGE_SHIFT) + 1L; - rss = (long)mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS); - return (int)((rss * 100L) / total); -} - -/* * prev_priority control...this will be used in memory reclaim path. 
*/ int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem) @@ -500,8 +540,8 @@ static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_ unsigned long gb; unsigned long inactive_ratio; - inactive = mem_cgroup_get_all_zonestat(memcg, LRU_INACTIVE_ANON); - active = mem_cgroup_get_all_zonestat(memcg, LRU_ACTIVE_ANON); + inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_ANON); + active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_ANON); gb = (inactive + active) >> (30 - PAGE_SHIFT); if (gb) @@ -535,6 +575,17 @@ int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg) return 0; } +int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg) +{ + unsigned long active; + unsigned long inactive; + + inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_FILE); + active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_FILE); + + return (active > inactive); +} + unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, struct zone *zone, enum lru_list lru) @@ -598,6 +649,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, int zid = zone_idx(z); struct mem_cgroup_per_zone *mz; int lru = LRU_FILE * !!file + !!active; + int ret; BUG_ON(!mem_cont); mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); @@ -615,9 +667,19 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, continue; scan++; - if (__isolate_lru_page(page, mode, file) == 0) { + ret = __isolate_lru_page(page, mode, file); + switch (ret) { + case 0: list_move(&page->lru, dst); + mem_cgroup_del_lru(page); nr_taken++; + break; + case -EBUSY: + /* we don't affect global LRU but rotate in our LRU */ + mem_cgroup_rotate_lru_list(page, page_lru(page)); + break; + default: + break; } } @@ -628,172 +690,206 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, #define mem_cgroup_from_res_counter(counter, member) \ container_of(counter, struct mem_cgroup, member) -/* - * This routine finds the DFS walk successor. This routine should be - * called with hierarchy_mutex held - */ -static struct mem_cgroup * -__mem_cgroup_get_next_node(struct mem_cgroup *curr, struct mem_cgroup *root_mem) +static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem) { - struct cgroup *cgroup, *curr_cgroup, *root_cgroup; - - curr_cgroup = curr->css.cgroup; - root_cgroup = root_mem->css.cgroup; + if (do_swap_account) { + if (res_counter_check_under_limit(&mem->res) && + res_counter_check_under_limit(&mem->memsw)) + return true; + } else + if (res_counter_check_under_limit(&mem->res)) + return true; + return false; +} - if (!list_empty(&curr_cgroup->children)) { - /* - * Walk down to children - */ - cgroup = list_entry(curr_cgroup->children.next, - struct cgroup, sibling); - curr = mem_cgroup_from_cont(cgroup); - goto done; - } +static unsigned int get_swappiness(struct mem_cgroup *memcg) +{ + struct cgroup *cgrp = memcg->css.cgroup; + unsigned int swappiness; -visit_parent: - if (curr_cgroup == root_cgroup) { - /* caller handles NULL case */ - curr = NULL; - goto done; - } + /* root ? 
*/ + if (cgrp->parent == NULL) + return vm_swappiness; - /* - * Goto next sibling - */ - if (curr_cgroup->sibling.next != &curr_cgroup->parent->children) { - cgroup = list_entry(curr_cgroup->sibling.next, struct cgroup, - sibling); - curr = mem_cgroup_from_cont(cgroup); - goto done; - } + spin_lock(&memcg->reclaim_param_lock); + swappiness = memcg->swappiness; + spin_unlock(&memcg->reclaim_param_lock); - /* - * Go up to next parent and next parent's sibling if need be - */ - curr_cgroup = curr_cgroup->parent; - goto visit_parent; + return swappiness; +} -done: - return curr; +static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data) +{ + int *val = data; + (*val)++; + return 0; } -/* - * Visit the first child (need not be the first child as per the ordering - * of the cgroup list, since we track last_scanned_child) of @mem and use - * that to reclaim free pages from. +/** + * mem_cgroup_print_mem_info: Called from OOM with tasklist_lock held in read mode. + * @memcg: The memory cgroup that went over limit + * @p: Task that is going to be killed + * + * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is + * enabled */ -static struct mem_cgroup * -mem_cgroup_get_next_node(struct mem_cgroup *root_mem) +void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) { - struct cgroup *cgroup; - struct mem_cgroup *orig, *next; - bool obsolete; - + struct cgroup *task_cgrp; + struct cgroup *mem_cgrp; /* - * Scan all children under the mem_cgroup mem + * Need a buffer in BSS, can't rely on allocations. The code relies + * on the assumption that OOM is serialized for memory controller. + * If this assumption is broken, revisit this code. */ - mutex_lock(&mem_cgroup_subsys.hierarchy_mutex); + static char memcg_name[PATH_MAX]; + int ret; + + if (!memcg) + return; - orig = root_mem->last_scanned_child; - obsolete = mem_cgroup_is_obsolete(orig); - if (list_empty(&root_mem->css.cgroup->children)) { + rcu_read_lock(); + + mem_cgrp = memcg->css.cgroup; + task_cgrp = task_cgroup(p, mem_cgroup_subsys_id); + + ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX); + if (ret < 0) { /* - * root_mem might have children before and last_scanned_child - * may point to one of them. We put it later. + * Unfortunately, we are unable to convert to a useful name + * But we'll still print out the usage information */ - if (orig) - VM_BUG_ON(!obsolete); - next = NULL; + rcu_read_unlock(); goto done; } + rcu_read_unlock(); - if (!orig || obsolete) { - cgroup = list_first_entry(&root_mem->css.cgroup->children, - struct cgroup, sibling); - next = mem_cgroup_from_cont(cgroup); - } else - next = __mem_cgroup_get_next_node(orig, root_mem); + printk(KERN_INFO "Task in %s killed", memcg_name); + + rcu_read_lock(); + ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX); + if (ret < 0) { + rcu_read_unlock(); + goto done; + } + rcu_read_unlock(); + /* + * Continues from above, so we don't need an KERN_ level + */ + printk(KERN_CONT " as a result of limit of %s\n", memcg_name); done: - if (next) - mem_cgroup_get(next); - root_mem->last_scanned_child = next; - if (orig) - mem_cgroup_put(orig); - mutex_unlock(&mem_cgroup_subsys.hierarchy_mutex); - return (next) ? 
next : root_mem; + + printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n", + res_counter_read_u64(&memcg->res, RES_USAGE) >> 10, + res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10, + res_counter_read_u64(&memcg->res, RES_FAILCNT)); + printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, " + "failcnt %llu\n", + res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10, + res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10, + res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); } -static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem) +/* + * This function returns the number of memcg under hierarchy tree. Returns + * 1(self count) if no children. + */ +static int mem_cgroup_count_children(struct mem_cgroup *mem) { - if (do_swap_account) { - if (res_counter_check_under_limit(&mem->res) && - res_counter_check_under_limit(&mem->memsw)) - return true; - } else - if (res_counter_check_under_limit(&mem->res)) - return true; - return false; + int num = 0; + mem_cgroup_walk_tree(mem, &num, mem_cgroup_count_children_cb); + return num; } -static unsigned int get_swappiness(struct mem_cgroup *memcg) +/* + * Visit the first child (need not be the first child as per the ordering + * of the cgroup list, since we track last_scanned_child) of @mem and use + * that to reclaim free pages from. + */ +static struct mem_cgroup * +mem_cgroup_select_victim(struct mem_cgroup *root_mem) { - struct cgroup *cgrp = memcg->css.cgroup; - unsigned int swappiness; + struct mem_cgroup *ret = NULL; + struct cgroup_subsys_state *css; + int nextid, found; - /* root ? */ - if (cgrp->parent == NULL) - return vm_swappiness; + if (!root_mem->use_hierarchy) { + css_get(&root_mem->css); + ret = root_mem; + } - spin_lock(&memcg->reclaim_param_lock); - swappiness = memcg->swappiness; - spin_unlock(&memcg->reclaim_param_lock); + while (!ret) { + rcu_read_lock(); + nextid = root_mem->last_scanned_child + 1; + css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css, + &found); + if (css && css_tryget(css)) + ret = container_of(css, struct mem_cgroup, css); + + rcu_read_unlock(); + /* Updates scanning parameter */ + spin_lock(&root_mem->reclaim_param_lock); + if (!css) { + /* this means start scan from ID:1 */ + root_mem->last_scanned_child = 0; + } else + root_mem->last_scanned_child = found; + spin_unlock(&root_mem->reclaim_param_lock); + } - return swappiness; + return ret; } /* - * Dance down the hierarchy if needed to reclaim memory. We remember the - * last child we reclaimed from, so that we don't end up penalizing - * one child extensively based on its position in the children list. + * Scan the hierarchy if needed to reclaim memory. We remember the last child + * we reclaimed from, so that we don't end up penalizing one child extensively + * based on its position in the children list. * * root_mem is the original ancestor that we've been reclaim from. + * + * We give up and return to the caller when we visit root_mem twice. + * (other groups can be removed while we're walking....) + * + * If shrink==true, for avoiding to free too much, this returns immedieately. */ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, - gfp_t gfp_mask, bool noswap) -{ - struct mem_cgroup *next_mem; - int ret = 0; - - /* - * Reclaim unconditionally and don't check for return value. - * We need to reclaim in the current group and down the tree. - * One might think about checking for children before reclaiming, - * but there might be left over accounting, even after children - * have left. 
- */ - ret += try_to_free_mem_cgroup_pages(root_mem, gfp_mask, noswap, - get_swappiness(root_mem)); - if (mem_cgroup_check_under_limit(root_mem)) - return 1; /* indicate reclaim has succeeded */ - if (!root_mem->use_hierarchy) - return ret; - - next_mem = mem_cgroup_get_next_node(root_mem); - - while (next_mem != root_mem) { - if (mem_cgroup_is_obsolete(next_mem)) { - next_mem = mem_cgroup_get_next_node(root_mem); + gfp_t gfp_mask, bool noswap, bool shrink) +{ + struct mem_cgroup *victim; + int ret, total = 0; + int loop = 0; + + /* If memsw_is_minimum==1, swap-out is of-no-use. */ + if (root_mem->memsw_is_minimum) + noswap = true; + + while (loop < 2) { + victim = mem_cgroup_select_victim(root_mem); + if (victim == root_mem) + loop++; + if (!mem_cgroup_local_usage(&victim->stat)) { + /* this cgroup's local usage == 0 */ + css_put(&victim->css); continue; } - ret += try_to_free_mem_cgroup_pages(next_mem, gfp_mask, noswap, - get_swappiness(next_mem)); + /* we use swappiness of local cgroup */ + ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, noswap, + get_swappiness(victim)); + css_put(&victim->css); + /* + * At shrinking usage, we can't check we should stop here or + * reclaim more. It's depends on callers. last_scanned_child + * will work enough for keeping fairness under tree. + */ + if (shrink) + return ret; + total += ret; if (mem_cgroup_check_under_limit(root_mem)) - return 1; /* indicate reclaim has succeeded */ - next_mem = mem_cgroup_get_next_node(root_mem); + return 1 + total; } - return ret; + return total; } bool mem_cgroup_oom_called(struct task_struct *task) @@ -812,6 +908,57 @@ bool mem_cgroup_oom_called(struct task_struct *task) rcu_read_unlock(); return ret; } + +static int record_last_oom_cb(struct mem_cgroup *mem, void *data) +{ + mem->last_oom_jiffies = jiffies; + return 0; +} + +static void record_last_oom(struct mem_cgroup *mem) +{ + mem_cgroup_walk_tree(mem, NULL, record_last_oom_cb); +} + +/* + * Currently used to update mapped file statistics, but the routine can be + * generalized to update other statistics as well. + */ +void mem_cgroup_update_mapped_file_stat(struct page *page, int val) +{ + struct mem_cgroup *mem; + struct mem_cgroup_stat *stat; + struct mem_cgroup_stat_cpu *cpustat; + int cpu; + struct page_cgroup *pc; + + if (!page_is_file_cache(page)) + return; + + pc = lookup_page_cgroup(page); + if (unlikely(!pc)) + return; + + lock_page_cgroup(pc); + mem = pc->mem_cgroup; + if (!mem) + goto done; + + if (!PageCgroupUsed(pc)) + goto done; + + /* + * Preemption is already disabled, we don't need get_cpu() + */ + cpu = smp_processor_id(); + stat = &mem->stat; + cpustat = &stat->cpustat[cpu]; + + __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE, val); +done: + unlock_page_cgroup(pc); +} + /* * Unlike exported interface, "oom" parameter is added. if oom==true, * oom-killer can be invoked. 
@@ -846,7 +993,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, if (unlikely(!mem)) return 0; - VM_BUG_ON(mem_cgroup_is_obsolete(mem)); + VM_BUG_ON(css_is_removed(&mem->css)); while (1) { int ret; @@ -874,7 +1021,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, goto nomem; ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, gfp_mask, - noswap); + noswap, false); if (ret) continue; @@ -894,7 +1041,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, mutex_lock(&memcg_tasklist); mem_cgroup_out_of_memory(mem_over_limit, gfp_mask); mutex_unlock(&memcg_tasklist); - mem_over_limit->last_oom_jiffies = jiffies; + record_last_oom(mem_over_limit); } goto nomem; } @@ -905,20 +1052,54 @@ nomem: return -ENOMEM; } + +/* + * A helper function to get mem_cgroup from ID. must be called under + * rcu_read_lock(). The caller must check css_is_removed() or some if + * it's concern. (dropping refcnt from swap can be called against removed + * memcg.) + */ +static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) +{ + struct cgroup_subsys_state *css; + + /* ID 0 is unused ID */ + if (!id) + return NULL; + css = css_lookup(&mem_cgroup_subsys, id); + if (!css) + return NULL; + return container_of(css, struct mem_cgroup, css); +} + static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page) { struct mem_cgroup *mem; + struct page_cgroup *pc; + unsigned short id; swp_entry_t ent; + VM_BUG_ON(!PageLocked(page)); + if (!PageSwapCache(page)) return NULL; - ent.val = page_private(page); - mem = lookup_swap_cgroup(ent); - if (!mem) - return NULL; - if (!css_tryget(&mem->css)) - return NULL; + pc = lookup_page_cgroup(page); + lock_page_cgroup(pc); + if (PageCgroupUsed(pc)) { + mem = pc->mem_cgroup; + if (mem && !css_tryget(&mem->css)) + mem = NULL; + } else { + ent.val = page_private(page); + id = lookup_swap_cgroup(ent); + rcu_read_lock(); + mem = mem_cgroup_lookup(id); + if (mem && !css_tryget(&mem->css)) + mem = NULL; + rcu_read_unlock(); + } + unlock_page_cgroup(pc); return mem; } @@ -975,6 +1156,10 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, struct mem_cgroup_per_zone *from_mz, *to_mz; int nid, zid; int ret = -EBUSY; + struct page *page; + int cpu; + struct mem_cgroup_stat *stat; + struct mem_cgroup_stat_cpu *cpustat; VM_BUG_ON(from == to); VM_BUG_ON(PageLRU(pc->page)); @@ -995,6 +1180,23 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, res_counter_uncharge(&from->res, PAGE_SIZE); mem_cgroup_charge_statistics(from, pc, false); + + page = pc->page; + if (page_is_file_cache(page) && page_mapped(page)) { + cpu = smp_processor_id(); + /* Update mapped_file data for mem_cgroup "from" */ + stat = &from->stat; + cpustat = &stat->cpustat[cpu]; + __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE, + -1); + + /* Update mapped_file data for mem_cgroup "to" */ + stat = &to->stat; + cpustat = &stat->cpustat[cpu]; + __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE, + 1); + } + if (do_swap_account) res_counter_uncharge(&from->memsw, PAGE_SIZE); css_put(&from->css); @@ -1005,6 +1207,12 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, ret = 0; out: unlock_page_cgroup(pc); + /* + * We charges against "to" which may not have any tasks. Then, "to" + * can be under rmdir(). But in current implementation, caller of + * this function is just force_empty() and it's garanteed that + * "to" is never removed. So, we don't check rmdir status here. 
+ */ return ret; } @@ -1117,6 +1325,10 @@ int mem_cgroup_newpage_charge(struct page *page, MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL); } +static void +__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, + enum charge_type ctype); + int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { @@ -1153,16 +1365,6 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, unlock_page_cgroup(pc); } - if (do_swap_account && PageSwapCache(page)) { - mem = try_get_mem_cgroup_from_swapcache(page); - if (mem) - mm = NULL; - else - mem = NULL; - /* SwapCache may be still linked to LRU now. */ - mem_cgroup_lru_del_before_commit_swapcache(page); - } - if (unlikely(!mm && !mem)) mm = &init_mm; @@ -1170,22 +1372,16 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, return mem_cgroup_charge_common(page, mm, gfp_mask, MEM_CGROUP_CHARGE_TYPE_CACHE, NULL); - ret = mem_cgroup_charge_common(page, mm, gfp_mask, - MEM_CGROUP_CHARGE_TYPE_SHMEM, mem); - if (mem) - css_put(&mem->css); - if (PageSwapCache(page)) - mem_cgroup_lru_add_after_commit_swapcache(page); + /* shmem */ + if (PageSwapCache(page)) { + ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); + if (!ret) + __mem_cgroup_commit_charge_swapin(page, mem, + MEM_CGROUP_CHARGE_TYPE_SHMEM); + } else + ret = mem_cgroup_charge_common(page, mm, gfp_mask, + MEM_CGROUP_CHARGE_TYPE_SHMEM, mem); - if (do_swap_account && !ret && PageSwapCache(page)) { - swp_entry_t ent = {.val = page_private(page)}; - /* avoid double counting */ - mem = swap_cgroup_record(ent, NULL); - if (mem) { - res_counter_uncharge(&mem->memsw, PAGE_SIZE); - mem_cgroup_put(mem); - } - } return ret; } @@ -1228,7 +1424,9 @@ charge_cur_mm: return __mem_cgroup_try_charge(mm, mask, ptr, true); } -void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) +static void +__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, + enum charge_type ctype) { struct page_cgroup *pc; @@ -1236,9 +1434,10 @@ void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) return; if (!ptr) return; + cgroup_exclude_rmdir(&ptr->css); pc = lookup_page_cgroup(page); mem_cgroup_lru_del_before_commit_swapcache(page); - __mem_cgroup_commit_charge(ptr, pc, MEM_CGROUP_CHARGE_TYPE_MAPPED); + __mem_cgroup_commit_charge(ptr, pc, ctype); mem_cgroup_lru_add_after_commit_swapcache(page); /* * Now swap is on-memory. This means this page may be @@ -1249,16 +1448,34 @@ void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) */ if (do_swap_account && PageSwapCache(page)) { swp_entry_t ent = {.val = page_private(page)}; + unsigned short id; struct mem_cgroup *memcg; - memcg = swap_cgroup_record(ent, NULL); + + id = swap_cgroup_record(ent, 0); + rcu_read_lock(); + memcg = mem_cgroup_lookup(id); if (memcg) { + /* + * This recorded memcg can be obsolete one. So, avoid + * calling css_tryget + */ res_counter_uncharge(&memcg->memsw, PAGE_SIZE); mem_cgroup_put(memcg); } - + rcu_read_unlock(); } - /* add this page(page_cgroup) to the LRU we want. */ + /* + * At swapin, we may charge account against cgroup which has no tasks. + * So, rmdir()->pre_destroy() can be called while we do this charge. + * In that case, we need to call pre_destroy() again. check it here. 
+ */ + cgroup_release_and_wakeup_rmdir(&ptr->css); +} +void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) +{ + __mem_cgroup_commit_charge_swapin(page, ptr, + MEM_CGROUP_CHARGE_TYPE_MAPPED); } void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) @@ -1306,6 +1523,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) switch (ctype) { case MEM_CGROUP_CHARGE_TYPE_MAPPED: + case MEM_CGROUP_CHARGE_TYPE_DROP: if (page_mapped(page)) goto unlock_out; break; @@ -1323,8 +1541,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) res_counter_uncharge(&mem->res, PAGE_SIZE); if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)) res_counter_uncharge(&mem->memsw, PAGE_SIZE); - mem_cgroup_charge_statistics(mem, pc, false); + ClearPageCgroupUsed(pc); /* * pc->mem_cgroup is not cleared here. It will be accessed when it's @@ -1364,24 +1582,31 @@ void mem_cgroup_uncharge_cache_page(struct page *page) __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); } +#ifdef CONFIG_SWAP /* - * called from __delete_from_swap_cache() and drop "page" account. + * called after __delete_from_swap_cache() and drop "page" account. * memcg information is recorded to swap_cgroup of "ent" */ -void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent) +void +mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) { struct mem_cgroup *memcg; + int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT; + + if (!swapout) /* this was a swap cache but the swap is unused ! */ + ctype = MEM_CGROUP_CHARGE_TYPE_DROP; + + memcg = __mem_cgroup_uncharge_common(page, ctype); - memcg = __mem_cgroup_uncharge_common(page, - MEM_CGROUP_CHARGE_TYPE_SWAPOUT); /* record memcg information */ - if (do_swap_account && memcg) { - swap_cgroup_record(ent, memcg); + if (do_swap_account && swapout && memcg) { + swap_cgroup_record(ent, css_id(&memcg->css)); mem_cgroup_get(memcg); } - if (memcg) + if (swapout && memcg) css_put(&memcg->css); } +#endif #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP /* @@ -1391,15 +1616,23 @@ void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent) void mem_cgroup_uncharge_swap(swp_entry_t ent) { struct mem_cgroup *memcg; + unsigned short id; if (!do_swap_account) return; - memcg = swap_cgroup_record(ent, NULL); + id = swap_cgroup_record(ent, 0); + rcu_read_lock(); + memcg = mem_cgroup_lookup(id); if (memcg) { + /* + * We uncharge this because swap is freed. + * This memcg can be obsolete one. We avoid calling css_tryget + */ res_counter_uncharge(&memcg->memsw, PAGE_SIZE); mem_cgroup_put(memcg); } + rcu_read_unlock(); } #endif @@ -1442,7 +1675,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem, if (!mem) return; - + cgroup_exclude_rmdir(&mem->css); /* at migration success, oldpage->mapping is NULL. */ if (oldpage->mapping) { target = oldpage; @@ -1482,39 +1715,37 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem, */ if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) mem_cgroup_uncharge_page(target); + /* + * At migration, we may charge account against cgroup which has no tasks + * So, rmdir()->pre_destroy() can be called while we do this charge. + * In that case, we need to call pre_destroy() again. check it here. + */ + cgroup_release_and_wakeup_rmdir(&mem->css); } /* - * A call to try to shrink memory usage under specified resource controller. - * This is typically used for page reclaiming for shmem for reducing side - * effect of page allocation from shmem, which is used by some mem_cgroup. 
+ * A call to try to shrink memory usage on charge failure at shmem's swapin. + * Calling hierarchical_reclaim is not enough because we should update + * last_oom_jiffies to prevent pagefault_out_of_memory from invoking global OOM. + * Moreover considering hierarchy, we should reclaim from the mem_over_limit, + * not from the memcg which this page would be charged to. + * try_charge_swapin does all of these works properly. */ -int mem_cgroup_shrink_usage(struct page *page, +int mem_cgroup_shmem_charge_fallback(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { struct mem_cgroup *mem = NULL; - int progress = 0; - int retry = MEM_CGROUP_RECLAIM_RETRIES; + int ret; if (mem_cgroup_disabled()) return 0; - if (page) - mem = try_get_mem_cgroup_from_swapcache(page); - if (!mem && mm) - mem = try_get_mem_cgroup_from_mm(mm); - if (unlikely(!mem)) - return 0; - do { - progress = mem_cgroup_hierarchical_reclaim(mem, gfp_mask, true); - progress += mem_cgroup_check_under_limit(mem); - } while (!progress && --retry); + ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); + if (!ret) + mem_cgroup_cancel_charge_swapin(mem); /* it does !mem check */ - css_put(&mem->css); - if (!retry) - return -ENOMEM; - return 0; + return ret; } static DEFINE_MUTEX(set_limit_mutex); @@ -1522,11 +1753,21 @@ static DEFINE_MUTEX(set_limit_mutex); static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, unsigned long long val) { - - int retry_count = MEM_CGROUP_RECLAIM_RETRIES; + int retry_count; int progress; u64 memswlimit; int ret = 0; + int children = mem_cgroup_count_children(memcg); + u64 curusage, oldusage; + + /* + * For keeping hierarchical_reclaim simple, how long we should retry + * is depends on callers. We set our retry-count to be function + * of # of children which we should visit in this loop. + */ + retry_count = MEM_CGROUP_RECLAIM_RETRIES * children; + + oldusage = res_counter_read_u64(&memcg->res, RES_USAGE); while (retry_count) { if (signal_pending(current)) { @@ -1546,29 +1787,41 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, break; } ret = res_counter_set_limit(&memcg->res, val); + if (!ret) { + if (memswlimit == val) + memcg->memsw_is_minimum = true; + else + memcg->memsw_is_minimum = false; + } mutex_unlock(&set_limit_mutex); if (!ret) break; progress = mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL, - false); - if (!progress) retry_count--; + false, true); + curusage = res_counter_read_u64(&memcg->res, RES_USAGE); + /* Usage is reduced ? 
*/ + if (curusage >= oldusage) + retry_count--; + else + oldusage = curusage; } return ret; } -int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, - unsigned long long val) +static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, + unsigned long long val) { - int retry_count = MEM_CGROUP_RECLAIM_RETRIES; + int retry_count; u64 memlimit, oldusage, curusage; - int ret; - - if (!do_swap_account) - return -EINVAL; + int children = mem_cgroup_count_children(memcg); + int ret = -EBUSY; + /* see mem_cgroup_resize_res_limit */ + retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; + oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); while (retry_count) { if (signal_pending(current)) { ret = -EINTR; @@ -1587,16 +1840,24 @@ int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, break; } ret = res_counter_set_limit(&memcg->memsw, val); + if (!ret) { + if (memlimit == val) + memcg->memsw_is_minimum = true; + else + memcg->memsw_is_minimum = false; + } mutex_unlock(&set_limit_mutex); if (!ret) break; - oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); - mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL, true); + mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL, true, true); curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); + /* Usage is reduced ? */ if (curusage >= oldusage) retry_count--; + else + oldusage = curusage; } return ret; } @@ -1684,7 +1945,7 @@ move_account: /* This is for making all *used* pages to be on LRU. */ lru_add_drain_all(); ret = 0; - for_each_node_state(node, N_POSSIBLE) { + for_each_node_state(node, N_HIGH_MEMORY) { for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { enum lru_list l; for_each_lru(l) { @@ -1729,7 +1990,7 @@ try_to_free: if (!progress) { nr_retries--; /* maybe some writeback is necessary */ - congestion_wait(WRITE, HZ/10); + congestion_wait(BLK_RW_ASYNC, HZ/10); } } @@ -1798,8 +2059,7 @@ static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) val = res_counter_read_u64(&mem->res, name); break; case _MEMSWAP: - if (do_swap_account) - val = res_counter_read_u64(&mem->memsw, name); + val = res_counter_read_u64(&mem->memsw, name); break; default: BUG(); @@ -1892,54 +2152,94 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) return 0; } -static const struct mem_cgroup_stat_desc { - const char *msg; - u64 unit; -} mem_cgroup_stat_desc[] = { - [MEM_CGROUP_STAT_CACHE] = { "cache", PAGE_SIZE, }, - [MEM_CGROUP_STAT_RSS] = { "rss", PAGE_SIZE, }, - [MEM_CGROUP_STAT_PGPGIN_COUNT] = {"pgpgin", 1, }, - [MEM_CGROUP_STAT_PGPGOUT_COUNT] = {"pgpgout", 1, }, + +/* For read statistics */ +enum { + MCS_CACHE, + MCS_RSS, + MCS_MAPPED_FILE, + MCS_PGPGIN, + MCS_PGPGOUT, + MCS_INACTIVE_ANON, + MCS_ACTIVE_ANON, + MCS_INACTIVE_FILE, + MCS_ACTIVE_FILE, + MCS_UNEVICTABLE, + NR_MCS_STAT, +}; + +struct mcs_total_stat { + s64 stat[NR_MCS_STAT]; +}; + +struct { + char *local_name; + char *total_name; +} memcg_stat_strings[NR_MCS_STAT] = { + {"cache", "total_cache"}, + {"rss", "total_rss"}, + {"mapped_file", "total_mapped_file"}, + {"pgpgin", "total_pgpgin"}, + {"pgpgout", "total_pgpgout"}, + {"inactive_anon", "total_inactive_anon"}, + {"active_anon", "total_active_anon"}, + {"inactive_file", "total_inactive_file"}, + {"active_file", "total_active_file"}, + {"unevictable", "total_unevictable"} }; + +static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data) +{ + struct mcs_total_stat *s = data; + s64 val; + + /* per cpu stat */ + val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_CACHE); + 
s->stat[MCS_CACHE] += val * PAGE_SIZE; + val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS); + s->stat[MCS_RSS] += val * PAGE_SIZE; + val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_MAPPED_FILE); + s->stat[MCS_MAPPED_FILE] += val * PAGE_SIZE; + val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGIN_COUNT); + s->stat[MCS_PGPGIN] += val; + val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT); + s->stat[MCS_PGPGOUT] += val; + + /* per zone stat */ + val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON); + s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE; + val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_ANON); + s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE; + val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_FILE); + s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE; + val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_FILE); + s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE; + val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE); + s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE; + return 0; +} + +static void +mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) +{ + mem_cgroup_walk_tree(mem, s, mem_cgroup_get_local_stat); +} + static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, struct cgroup_map_cb *cb) { struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); - struct mem_cgroup_stat *stat = &mem_cont->stat; + struct mcs_total_stat mystat; int i; - for (i = 0; i < ARRAY_SIZE(stat->cpustat[0].count); i++) { - s64 val; + memset(&mystat, 0, sizeof(mystat)); + mem_cgroup_get_local_stat(mem_cont, &mystat); - val = mem_cgroup_read_stat(stat, i); - val *= mem_cgroup_stat_desc[i].unit; - cb->fill(cb, mem_cgroup_stat_desc[i].msg, val); - } - /* showing # of active pages */ - { - unsigned long active_anon, inactive_anon; - unsigned long active_file, inactive_file; - unsigned long unevictable; - - inactive_anon = mem_cgroup_get_all_zonestat(mem_cont, - LRU_INACTIVE_ANON); - active_anon = mem_cgroup_get_all_zonestat(mem_cont, - LRU_ACTIVE_ANON); - inactive_file = mem_cgroup_get_all_zonestat(mem_cont, - LRU_INACTIVE_FILE); - active_file = mem_cgroup_get_all_zonestat(mem_cont, - LRU_ACTIVE_FILE); - unevictable = mem_cgroup_get_all_zonestat(mem_cont, - LRU_UNEVICTABLE); - - cb->fill(cb, "active_anon", (active_anon) * PAGE_SIZE); - cb->fill(cb, "inactive_anon", (inactive_anon) * PAGE_SIZE); - cb->fill(cb, "active_file", (active_file) * PAGE_SIZE); - cb->fill(cb, "inactive_file", (inactive_file) * PAGE_SIZE); - cb->fill(cb, "unevictable", unevictable * PAGE_SIZE); + for (i = 0; i < NR_MCS_STAT; i++) + cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]); - } + /* Hierarchical information */ { unsigned long long limit, memsw_limit; memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit); @@ -1948,6 +2248,12 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, cb->fill(cb, "hierarchical_memsw_limit", memsw_limit); } + memset(&mystat, 0, sizeof(mystat)); + mem_cgroup_get_total_stat(mem_cont, &mystat); + for (i = 0; i < NR_MCS_STAT; i++) + cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]); + + #ifdef CONFIG_DEBUG_VM cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL)); @@ -2177,6 +2483,8 @@ static void __mem_cgroup_free(struct mem_cgroup *mem) { int node; + free_css_id(&mem_cgroup_subsys, &mem->css); + for_each_node_state(node, N_POSSIBLE) free_mem_cgroup_per_zone_info(mem, node); @@ -2193,10 +2501,23 @@ static void mem_cgroup_get(struct mem_cgroup *mem) 
static void mem_cgroup_put(struct mem_cgroup *mem) { - if (atomic_dec_and_test(&mem->refcnt)) + if (atomic_dec_and_test(&mem->refcnt)) { + struct mem_cgroup *parent = parent_mem_cgroup(mem); __mem_cgroup_free(mem); + if (parent) + mem_cgroup_put(parent); + } } +/* + * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. + */ +static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem) +{ + if (!mem->res.parent) + return NULL; + return mem_cgroup_from_res_counter(mem->res.parent, res); +} #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP static void __init enable_swap_cgroup(void) @@ -2214,11 +2535,12 @@ static struct cgroup_subsys_state * __ref mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) { struct mem_cgroup *mem, *parent; + long error = -ENOMEM; int node; mem = mem_cgroup_alloc(); if (!mem) - return ERR_PTR(-ENOMEM); + return ERR_PTR(error); for_each_node_state(node, N_POSSIBLE) if (alloc_mem_cgroup_per_zone_info(mem, node)) @@ -2235,11 +2557,18 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) if (parent && parent->use_hierarchy) { res_counter_init(&mem->res, &parent->res); res_counter_init(&mem->memsw, &parent->memsw); + /* + * We increment refcnt of the parent to ensure that we can + * safely access it on res_counter_charge/uncharge. + * This refcnt will be decremented when freeing this + * mem_cgroup(see mem_cgroup_put). + */ + mem_cgroup_get(parent); } else { res_counter_init(&mem->res, NULL); res_counter_init(&mem->memsw, NULL); } - mem->last_scanned_child = NULL; + mem->last_scanned_child = 0; spin_lock_init(&mem->reclaim_param_lock); if (parent) @@ -2248,26 +2577,22 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) return &mem->css; free_out: __mem_cgroup_free(mem); - return ERR_PTR(-ENOMEM); + return ERR_PTR(error); } -static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss, +static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss, struct cgroup *cont) { struct mem_cgroup *mem = mem_cgroup_from_cont(cont); - mem_cgroup_force_empty(mem, false); + + return mem_cgroup_force_empty(mem, false); } static void mem_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cont) { struct mem_cgroup *mem = mem_cgroup_from_cont(cont); - struct mem_cgroup *last_scanned_child = mem->last_scanned_child; - if (last_scanned_child) { - VM_BUG_ON(!mem_cgroup_is_obsolete(last_scanned_child)); - mem_cgroup_put(last_scanned_child); - } mem_cgroup_put(mem); } @@ -2306,6 +2631,7 @@ struct cgroup_subsys mem_cgroup_subsys = { .populate = mem_cgroup_populate, .attach = mem_cgroup_move_task, .early_init = 0, + .use_id = 1, }; #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP diff --git a/mm/memory.c b/mm/memory.c index 22bfa7a47a0..aede2ce3aba 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -135,11 +135,12 @@ void pmd_clear_bad(pmd_t *pmd) * Note: this doesn't free the actual pages themselves. That * has been handled earlier when unmapping all the memory regions. 
*/ -static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd) +static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd, + unsigned long addr) { pgtable_t token = pmd_pgtable(*pmd); pmd_clear(pmd); - pte_free_tlb(tlb, token); + pte_free_tlb(tlb, token, addr); tlb->mm->nr_ptes--; } @@ -157,7 +158,7 @@ static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, next = pmd_addr_end(addr, end); if (pmd_none_or_clear_bad(pmd)) continue; - free_pte_range(tlb, pmd); + free_pte_range(tlb, pmd, addr); } while (pmd++, addr = next, addr != end); start &= PUD_MASK; @@ -173,7 +174,7 @@ static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, pmd = pmd_offset(pud, start); pud_clear(pud); - pmd_free_tlb(tlb, pmd); + pmd_free_tlb(tlb, pmd, start); } static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, @@ -206,7 +207,7 @@ static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, pud = pud_offset(pgd, start); pgd_clear(pgd); - pud_free_tlb(tlb, pud); + pud_free_tlb(tlb, pud, start); } /* @@ -1151,6 +1152,11 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, if ((flags & FOLL_WRITE) && !pte_dirty(pte) && !PageDirty(page)) set_page_dirty(page); + /* + * pte_mkyoung() would be more correct here, but atomic care + * is needed to avoid losing the dirty bit: it is easier to use + * mark_page_accessed(). + */ mark_page_accessed(page); } unlock: @@ -1202,8 +1208,8 @@ static inline int use_zero_page(struct vm_area_struct *vma) int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, int len, int flags, - struct page **pages, struct vm_area_struct **vmas) + unsigned long start, int nr_pages, int flags, + struct page **pages, struct vm_area_struct **vmas) { int i; unsigned int vm_flags = 0; @@ -1212,7 +1218,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS); int ignore_sigkill = !!(flags & GUP_FLAGS_IGNORE_SIGKILL); - if (len <= 0) + if (nr_pages <= 0) return 0; /* * Require read or write permissions. @@ -1264,7 +1270,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, vmas[i] = gate_vma; i++; start += PAGE_SIZE; - len--; + nr_pages--; continue; } @@ -1275,7 +1281,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, if (is_vm_hugetlb_page(vma)) { i = follow_hugetlb_page(mm, vma, pages, vmas, - &start, &len, i, write); + &start, &nr_pages, i, write); continue; } @@ -1305,8 +1311,11 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, cond_resched(); while (!(page = follow_page(vma, start, foll_flags))) { int ret; + ret = handle_mm_fault(mm, vma, start, - foll_flags & FOLL_WRITE); + (foll_flags & FOLL_WRITE) ? + FAULT_FLAG_WRITE : 0); + if (ret & VM_FAULT_ERROR) { if (ret & VM_FAULT_OOM) return i ? i : -ENOMEM; @@ -1349,14 +1358,64 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, vmas[i] = vma; i++; start += PAGE_SIZE; - len--; - } while (len && start < vma->vm_end); - } while (len); + nr_pages--; + } while (nr_pages && start < vma->vm_end); + } while (nr_pages); return i; } +/** + * get_user_pages() - pin user pages in memory + * @tsk: task_struct of target task + * @mm: mm_struct of target mm + * @start: starting user address + * @nr_pages: number of pages from start to pin + * @write: whether pages will be written to by the caller + * @force: whether to force write access even if user mapping is + * readonly. 
This will result in the page being COWed even + * in MAP_SHARED mappings. You do not want this. + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_pages long. Or NULL, if caller + * only intends to ensure the pages are faulted in. + * @vmas: array of pointers to vmas corresponding to each page. + * Or NULL if the caller does not require them. + * + * Returns number of pages pinned. This may be fewer than the number + * requested. If nr_pages is 0 or negative, returns 0. If no pages + * were pinned, returns -errno. Each page returned must be released + * with a put_page() call when it is finished with. vmas will only + * remain valid while mmap_sem is held. + * + * Must be called with mmap_sem held for read or write. + * + * get_user_pages walks a process's page tables and takes a reference to + * each struct page that each user address corresponds to at a given + * instant. That is, it takes the page that would be accessed if a user + * thread accesses the given user virtual address at that instant. + * + * This does not guarantee that the page exists in the user mappings when + * get_user_pages returns, and there may even be a completely different + * page there in some cases (eg. if mmapped pagecache has been invalidated + * and subsequently re faulted). However it does guarantee that the page + * won't be freed completely. And mostly callers simply care that the page + * contains data that was valid *at some point in time*. Typically, an IO + * or similar operation cannot guarantee anything stronger anyway because + * locks can't be held over the syscall boundary. + * + * If write=0, the page must not be written to. If the page is written to, + * set_page_dirty (or set_page_dirty_lock, as appropriate) must be called + * after the page is finished with, and before put_page is called. + * + * get_user_pages is typically used for fewer-copy IO operations, to get a + * handle on the memory by some means other than accesses via the user virtual + * addresses. The pages may be submitted for DMA to devices or accessed via + * their kernel linear mapping (via the kmap APIs). Care should be taken to + * use the correct cache flushing APIs. + * + * See also get_user_pages_fast, for performance critical applications. + */ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, int len, int write, int force, + unsigned long start, int nr_pages, int write, int force, struct page **pages, struct vm_area_struct **vmas) { int flags = 0; @@ -1366,9 +1425,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, if (force) flags |= GUP_FLAGS_FORCE; - return __get_user_pages(tsk, mm, - start, len, flags, - pages, vmas); + return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas); } EXPORT_SYMBOL(get_user_pages); @@ -1665,9 +1722,10 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, * behaviour that some programs depend on. We mark the "original" * un-COW'ed pages by matching them up with "vma->vm_pgoff". 
*/ - if (addr == vma->vm_start && end == vma->vm_end) + if (addr == vma->vm_start && end == vma->vm_end) { vma->vm_pgoff = pfn; - else if (is_cow_mapping(vma->vm_flags)) + vma->vm_flags |= VM_PFN_AT_MMAP; + } else if (is_cow_mapping(vma->vm_flags)) return -EINVAL; vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; @@ -1679,6 +1737,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, * needed from higher level routine calling unmap_vmas */ vma->vm_flags &= ~(VM_IO | VM_RESERVED | VM_PFNMAP); + vma->vm_flags &= ~VM_PFN_AT_MMAP; return -EINVAL; } @@ -1938,6 +1997,15 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, * get_user_pages(.write=1, .force=1). */ if (vma->vm_ops && vma->vm_ops->page_mkwrite) { + struct vm_fault vmf; + int tmp; + + vmf.virtual_address = (void __user *)(address & + PAGE_MASK); + vmf.pgoff = old_page->index; + vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; + vmf.page = old_page; + /* * Notify the address space that the page is about to * become writable so that it can prohibit this or wait @@ -1949,8 +2017,21 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, page_cache_get(old_page); pte_unmap_unlock(page_table, ptl); - if (vma->vm_ops->page_mkwrite(vma, old_page) < 0) + tmp = vma->vm_ops->page_mkwrite(vma, &vmf); + if (unlikely(tmp & + (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { + ret = tmp; goto unwritable_page; + } + if (unlikely(!(tmp & VM_FAULT_LOCKED))) { + lock_page(old_page); + if (!old_page->mapping) { + ret = 0; /* retry the fault */ + unlock_page(old_page); + goto unwritable_page; + } + } else + VM_BUG_ON(!PageLocked(old_page)); /* * Since we dropped the lock we need to revalidate @@ -1960,9 +2041,11 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, */ page_table = pte_offset_map_lock(mm, pmd, address, &ptl); - page_cache_release(old_page); - if (!pte_same(*page_table, orig_pte)) + if (!pte_same(*page_table, orig_pte)) { + unlock_page(old_page); + page_cache_release(old_page); goto unlock; + } page_mkwrite = 1; } @@ -1999,7 +2082,7 @@ gotten: * Don't let another task, with possibly unlocked vma, * keep the mlocked page. */ - if (vma->vm_flags & VM_LOCKED) { + if ((vma->vm_flags & VM_LOCKED) && old_page) { lock_page(old_page); /* for LRU manipulation */ clear_page_mlock(old_page); unlock_page(old_page); @@ -2074,9 +2157,6 @@ gotten: unlock: pte_unmap_unlock(page_table, ptl); if (dirty_page) { - if (vma->vm_file) - file_update_time(vma->vm_file); - /* * Yes, Virginia, this is actually required to prevent a race * with clear_page_dirty_for_io() from clearing the page dirty @@ -2085,21 +2165,46 @@ unlock: * * do_no_page is protected similarly. 
*/ - wait_on_page_locked(dirty_page); - set_page_dirty_balance(dirty_page, page_mkwrite); + if (!page_mkwrite) { + wait_on_page_locked(dirty_page); + set_page_dirty_balance(dirty_page, page_mkwrite); + } put_page(dirty_page); + if (page_mkwrite) { + struct address_space *mapping = dirty_page->mapping; + + set_page_dirty(dirty_page); + unlock_page(dirty_page); + page_cache_release(dirty_page); + if (mapping) { + /* + * Some device drivers do not set page.mapping + * but still dirty their pages + */ + balance_dirty_pages_ratelimited(mapping); + } + } + + /* file_update_time outside page_lock */ + if (vma->vm_file) + file_update_time(vma->vm_file); } return ret; oom_free_new: page_cache_release(new_page); oom: - if (old_page) + if (old_page) { + if (page_mkwrite) { + unlock_page(old_page); + page_cache_release(old_page); + } page_cache_release(old_page); + } return VM_FAULT_OOM; unwritable_page: page_cache_release(old_page); - return VM_FAULT_SIGBUS; + return ret; } /* @@ -2393,7 +2498,7 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) */ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, - int write_access, pte_t orig_pte) + unsigned int flags, pte_t orig_pte) { spinlock_t *ptl; struct page *page; @@ -2413,7 +2518,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, delayacct_set_flag(DELAYACCT_PF_SWAPIN); page = lookup_swap_cache(entry); if (!page) { - grab_swap_token(); /* Contend for token _before_ read-in */ + grab_swap_token(mm); /* Contend for token _before_ read-in */ page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vma, address); if (!page) { @@ -2433,15 +2538,12 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, count_vm_event(PGMAJFAULT); } - mark_page_accessed(page); - lock_page(page); delayacct_clear_flag(DELAYACCT_PF_SWAPIN); if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) { ret = VM_FAULT_OOM; - unlock_page(page); - goto out; + goto out_page; } /* @@ -2472,9 +2574,9 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, inc_mm_counter(mm, anon_rss); pte = mk_pte(page, vma->vm_page_prot); - if (write_access && reuse_swap_page(page)) { + if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); - write_access = 0; + flags &= ~FAULT_FLAG_WRITE; } flush_icache_page(vma, page); set_pte_at(mm, address, page_table, pte); @@ -2487,7 +2589,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, try_to_free_swap(page); unlock_page(page); - if (write_access) { + if (flags & FAULT_FLAG_WRITE) { ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte); if (ret & VM_FAULT_ERROR) ret &= VM_FAULT_ERROR; @@ -2503,6 +2605,7 @@ out: out_nomap: mem_cgroup_cancel_charge_swapin(ptr); pte_unmap_unlock(page_table, ptl); +out_page: unlock_page(page); page_cache_release(page); return ret; @@ -2515,7 +2618,7 @@ out_nomap: */ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, - int write_access) + unsigned int flags) { struct page *page; spinlock_t *ptl; @@ -2643,25 +2746,25 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, * to become writable */ if (vma->vm_ops->page_mkwrite) { + int tmp; + unlock_page(page); - if (vma->vm_ops->page_mkwrite(vma, page) < 0) { - ret = VM_FAULT_SIGBUS; - anon = 1; /* no anon but release vmf.page */ - goto 
out_unlocked; - } - lock_page(page); - /* - * XXX: this is not quite right (racy vs - * invalidate) to unlock and relock the page - * like this, however a better fix requires - * reworking page_mkwrite locking API, which - * is better done later. - */ - if (!page->mapping) { - ret = 0; - anon = 1; /* no anon but release vmf.page */ - goto out; + vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; + tmp = vma->vm_ops->page_mkwrite(vma, &vmf); + if (unlikely(tmp & + (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { + ret = tmp; + goto unwritable_page; } + if (unlikely(!(tmp & VM_FAULT_LOCKED))) { + lock_page(page); + if (!page->mapping) { + ret = 0; /* retry the fault */ + unlock_page(page); + goto unwritable_page; + } + } else + VM_BUG_ON(!PageLocked(page)); page_mkwrite = 1; } } @@ -2675,7 +2778,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, * due to the bad i386 page protection. But it's valid * for other architectures too. * - * Note that if write_access is true, we either now have + * Note that if FAULT_FLAG_WRITE is set, we either now have * an exclusive copy of the page, or this is a shared mapping, * so we can make it writable and dirty to avoid having to * handle that later. @@ -2713,28 +2816,43 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, pte_unmap_unlock(page_table, ptl); out: - unlock_page(vmf.page); -out_unlocked: - if (anon) - page_cache_release(vmf.page); - else if (dirty_page) { - if (vma->vm_file) - file_update_time(vma->vm_file); + if (dirty_page) { + struct address_space *mapping = page->mapping; - set_page_dirty_balance(dirty_page, page_mkwrite); + if (set_page_dirty(dirty_page)) + page_mkwrite = 1; + unlock_page(dirty_page); put_page(dirty_page); + if (page_mkwrite && mapping) { + /* + * Some device drivers do not set page.mapping but still + * dirty their pages + */ + balance_dirty_pages_ratelimited(mapping); + } + + /* file_update_time outside page_lock */ + if (vma->vm_file) + file_update_time(vma->vm_file); + } else { + unlock_page(vmf.page); + if (anon) + page_cache_release(vmf.page); } return ret; + +unwritable_page: + page_cache_release(page); + return ret; } static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, - int write_access, pte_t orig_pte) + unsigned int flags, pte_t orig_pte) { pgoff_t pgoff = (((address & PAGE_MASK) - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; - unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0); pte_unmap(page_table); return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); @@ -2751,12 +2869,12 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, */ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, - int write_access, pte_t orig_pte) + unsigned int flags, pte_t orig_pte) { - unsigned int flags = FAULT_FLAG_NONLINEAR | - (write_access ? 
FAULT_FLAG_WRITE : 0); pgoff_t pgoff; + flags |= FAULT_FLAG_NONLINEAR; + if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) return 0; @@ -2787,7 +2905,7 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, */ static inline int handle_pte_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, - pte_t *pte, pmd_t *pmd, int write_access) + pte_t *pte, pmd_t *pmd, unsigned int flags) { pte_t entry; spinlock_t *ptl; @@ -2798,30 +2916,30 @@ static inline int handle_pte_fault(struct mm_struct *mm, if (vma->vm_ops) { if (likely(vma->vm_ops->fault)) return do_linear_fault(mm, vma, address, - pte, pmd, write_access, entry); + pte, pmd, flags, entry); } return do_anonymous_page(mm, vma, address, - pte, pmd, write_access); + pte, pmd, flags); } if (pte_file(entry)) return do_nonlinear_fault(mm, vma, address, - pte, pmd, write_access, entry); + pte, pmd, flags, entry); return do_swap_page(mm, vma, address, - pte, pmd, write_access, entry); + pte, pmd, flags, entry); } ptl = pte_lockptr(mm, pmd); spin_lock(ptl); if (unlikely(!pte_same(*pte, entry))) goto unlock; - if (write_access) { + if (flags & FAULT_FLAG_WRITE) { if (!pte_write(entry)) return do_wp_page(mm, vma, address, pte, pmd, ptl, entry); entry = pte_mkdirty(entry); } entry = pte_mkyoung(entry); - if (ptep_set_access_flags(vma, address, pte, entry, write_access)) { + if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) { update_mmu_cache(vma, address, entry); } else { /* @@ -2830,7 +2948,7 @@ static inline int handle_pte_fault(struct mm_struct *mm, * This still avoids useless tlb flushes for .text page faults * with threads. */ - if (write_access) + if (flags & FAULT_FLAG_WRITE) flush_tlb_page(vma, address); } unlock: @@ -2842,7 +2960,7 @@ unlock: * By the time we get here, we already hold the mm semaphore */ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, int write_access) + unsigned long address, unsigned int flags) { pgd_t *pgd; pud_t *pud; @@ -2854,7 +2972,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, count_vm_event(PGFAULT); if (unlikely(is_vm_hugetlb_page(vma))) - return hugetlb_fault(mm, vma, address, write_access); + return hugetlb_fault(mm, vma, address, flags); pgd = pgd_offset(mm, address); pud = pud_alloc(mm, pgd, address); @@ -2867,7 +2985,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (!pte) return VM_FAULT_OOM; - return handle_pte_fault(mm, vma, address, pte, pmd, write_access); + return handle_pte_fault(mm, vma, address, pte, pmd, flags); } #ifndef __PAGETABLE_PUD_FOLDED @@ -2986,22 +3104,13 @@ int in_gate_area_no_task(unsigned long addr) #endif /* __HAVE_ARCH_GATE_AREA */ -#ifdef CONFIG_HAVE_IOREMAP_PROT -int follow_phys(struct vm_area_struct *vma, - unsigned long address, unsigned int flags, - unsigned long *prot, resource_size_t *phys) +static int follow_pte(struct mm_struct *mm, unsigned long address, + pte_t **ptepp, spinlock_t **ptlp) { pgd_t *pgd; pud_t *pud; pmd_t *pmd; - pte_t *ptep, pte; - spinlock_t *ptl; - resource_size_t phys_addr = 0; - struct mm_struct *mm = vma->vm_mm; - int ret = -EINVAL; - - if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) - goto out; + pte_t *ptep; pgd = pgd_offset(mm, address); if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) @@ -3019,22 +3128,71 @@ int follow_phys(struct vm_area_struct *vma, if (pmd_huge(*pmd)) goto out; - ptep = pte_offset_map_lock(mm, pmd, address, &ptl); + ptep = pte_offset_map_lock(mm, pmd, 
address, ptlp); if (!ptep) goto out; + if (!pte_present(*ptep)) + goto unlock; + *ptepp = ptep; + return 0; +unlock: + pte_unmap_unlock(ptep, *ptlp); +out: + return -EINVAL; +} + +/** + * follow_pfn - look up PFN at a user virtual address + * @vma: memory mapping + * @address: user virtual address + * @pfn: location to store found PFN + * + * Only IO mappings and raw PFN mappings are allowed. + * + * Returns zero and the pfn at @pfn on success, -ve otherwise. + */ +int follow_pfn(struct vm_area_struct *vma, unsigned long address, + unsigned long *pfn) +{ + int ret = -EINVAL; + spinlock_t *ptl; + pte_t *ptep; + + if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) + return ret; + + ret = follow_pte(vma->vm_mm, address, &ptep, &ptl); + if (ret) + return ret; + *pfn = pte_pfn(*ptep); + pte_unmap_unlock(ptep, ptl); + return 0; +} +EXPORT_SYMBOL(follow_pfn); + +#ifdef CONFIG_HAVE_IOREMAP_PROT +int follow_phys(struct vm_area_struct *vma, + unsigned long address, unsigned int flags, + unsigned long *prot, resource_size_t *phys) +{ + int ret = -EINVAL; + pte_t *ptep, pte; + spinlock_t *ptl; + + if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) + goto out; + if (follow_pte(vma->vm_mm, address, &ptep, &ptl)) + goto out; pte = *ptep; - if (!pte_present(pte)) - goto unlock; + if ((flags & FOLL_WRITE) && !pte_write(pte)) goto unlock; - phys_addr = pte_pfn(pte); - phys_addr <<= PAGE_SHIFT; /* Shift here to avoid overflow on PAE */ *prot = pgprot_val(pte_pgprot(pte)); - *phys = phys_addr; - ret = 0; + *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT; + ret = 0; unlock: pte_unmap_unlock(ptep, ptl); out: diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index c083cf5fd6d..e4412a676c8 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -422,7 +422,8 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) zone->present_pages += onlined_pages; zone->zone_pgdat->node_present_pages += onlined_pages; - setup_per_zone_pages_min(); + setup_per_zone_wmarks(); + calculate_zone_inactive_ratio(zone); if (onlined_pages) { kswapd_run(zone_to_nid(zone)); node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); @@ -832,6 +833,9 @@ repeat: totalram_pages -= offlined_pages; num_physpages -= offlined_pages; + setup_per_zone_wmarks(); + calculate_zone_inactive_ratio(zone); + vm_total_pages = nr_free_pagecache_pages(); writeback_set_ratelimit(); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 3eb4a6fdc04..7dd9d9f8069 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -182,13 +182,58 @@ static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes) return 0; } -/* Create a new policy */ +/* + * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if + * any, for the new policy. mpol_new() has already validated the nodes + * parameter with respect to the policy mode and flags. But, we need to + * handle an empty nodemask with MPOL_PREFERRED here. + * + * Must be called holding task's alloc_lock to protect task's mems_allowed + * and mempolicy. May also be called holding the mmap_semaphore for write. + */ +static int mpol_set_nodemask(struct mempolicy *pol, + const nodemask_t *nodes, struct nodemask_scratch *nsc) +{ + int ret; + + /* if mode is MPOL_DEFAULT, pol is NULL. This is right. 
*/ + if (pol == NULL) + return 0; + /* Check N_HIGH_MEMORY */ + nodes_and(nsc->mask1, + cpuset_current_mems_allowed, node_states[N_HIGH_MEMORY]); + + VM_BUG_ON(!nodes); + if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes)) + nodes = NULL; /* explicit local allocation */ + else { + if (pol->flags & MPOL_F_RELATIVE_NODES) + mpol_relative_nodemask(&nsc->mask2, nodes,&nsc->mask1); + else + nodes_and(nsc->mask2, *nodes, nsc->mask1); + + if (mpol_store_user_nodemask(pol)) + pol->w.user_nodemask = *nodes; + else + pol->w.cpuset_mems_allowed = + cpuset_current_mems_allowed; + } + + if (nodes) + ret = mpol_ops[pol->mode].create(pol, &nsc->mask2); + else + ret = mpol_ops[pol->mode].create(pol, NULL); + return ret; +} + +/* + * This function just creates a new policy, does some check and simple + * initialization. You must invoke mpol_set_nodemask() to set nodes. + */ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, nodemask_t *nodes) { struct mempolicy *policy; - nodemask_t cpuset_context_nmask; - int ret; pr_debug("setting mode %d flags %d nodes[0] %lx\n", mode, flags, nodes ? nodes_addr(*nodes)[0] : -1); @@ -210,7 +255,6 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, if (((flags & MPOL_F_STATIC_NODES) || (flags & MPOL_F_RELATIVE_NODES))) return ERR_PTR(-EINVAL); - nodes = NULL; /* flag local alloc */ } } else if (nodes_empty(*nodes)) return ERR_PTR(-EINVAL); @@ -221,30 +265,6 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, policy->mode = mode; policy->flags = flags; - if (nodes) { - /* - * cpuset related setup doesn't apply to local allocation - */ - cpuset_update_task_memory_state(); - if (flags & MPOL_F_RELATIVE_NODES) - mpol_relative_nodemask(&cpuset_context_nmask, nodes, - &cpuset_current_mems_allowed); - else - nodes_and(cpuset_context_nmask, *nodes, - cpuset_current_mems_allowed); - if (mpol_store_user_nodemask(policy)) - policy->w.user_nodemask = *nodes; - else - policy->w.cpuset_mems_allowed = - cpuset_mems_allowed(current); - } - - ret = mpol_ops[mode].create(policy, - nodes ? &cpuset_context_nmask : NULL); - if (ret < 0) { - kmem_cache_free(policy_cache, policy); - return ERR_PTR(ret); - } return policy; } @@ -324,6 +344,8 @@ static void mpol_rebind_policy(struct mempolicy *pol, /* * Wrapper for mpol_rebind_policy() that just requires task * pointer, and updates task mempolicy. + * + * Called with task's alloc_lock held. */ void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) @@ -600,13 +622,19 @@ static void mpol_set_task_struct_flag(void) static long do_set_mempolicy(unsigned short mode, unsigned short flags, nodemask_t *nodes) { - struct mempolicy *new; + struct mempolicy *new, *old; struct mm_struct *mm = current->mm; + NODEMASK_SCRATCH(scratch); + int ret; - new = mpol_new(mode, flags, nodes); - if (IS_ERR(new)) - return PTR_ERR(new); + if (!scratch) + return -ENOMEM; + new = mpol_new(mode, flags, nodes); + if (IS_ERR(new)) { + ret = PTR_ERR(new); + goto out; + } /* * prevent changing our mempolicy while show_numa_maps() * is using it. 
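The do_set_mempolicy() rework in the hunk above moves nodemask contextualization out of mpol_new() and into mpol_set_nodemask(), which runs under task_lock() with a NODEMASK_SCRATCH buffer and intersects the caller's mask with the cpuset's mems_allowed; an empty MPOL_PREFERRED mask is treated as explicit local allocation. From userspace this path is reached through set_mempolicy(2). A minimal sketch, assuming a NUMA-capable kernel, libnuma's numaif.h, and that node 0 exists (build with -lnuma):

#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <numaif.h>

int main(void)
{
	unsigned long nodemask = 1UL << 0;	/* node 0 only (assumed to exist) */

	/* Prefer node 0 for this task's future allocations. */
	if (set_mempolicy(MPOL_PREFERRED, &nodemask, 8 * sizeof(nodemask)))
		fprintf(stderr, "MPOL_PREFERRED node 0: %s\n", strerror(errno));

	/* An empty/NULL mask with MPOL_PREFERRED means "explicit local
	 * allocation", the case mpol_set_nodemask() handles specially. */
	if (set_mempolicy(MPOL_PREFERRED, NULL, 0))
		fprintf(stderr, "local allocation: %s\n", strerror(errno));

	/* Restore the default policy. */
	set_mempolicy(MPOL_DEFAULT, NULL, 0);
	return 0;
}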
@@ -615,20 +643,36 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, */ if (mm) down_write(&mm->mmap_sem); - mpol_put(current->mempolicy); + task_lock(current); + ret = mpol_set_nodemask(new, nodes, scratch); + if (ret) { + task_unlock(current); + if (mm) + up_write(&mm->mmap_sem); + mpol_put(new); + goto out; + } + old = current->mempolicy; current->mempolicy = new; mpol_set_task_struct_flag(); if (new && new->mode == MPOL_INTERLEAVE && nodes_weight(new->v.nodes)) current->il_next = first_node(new->v.nodes); + task_unlock(current); if (mm) up_write(&mm->mmap_sem); - return 0; + mpol_put(old); + ret = 0; +out: + NODEMASK_SCRATCH_FREE(scratch); + return ret; } /* * Return nodemask for policy for get_mempolicy() query + * + * Called with task's alloc_lock held */ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes) { @@ -674,7 +718,6 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, struct vm_area_struct *vma = NULL; struct mempolicy *pol = current->mempolicy; - cpuset_update_task_memory_state(); if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED)) return -EINVAL; @@ -683,7 +726,9 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, if (flags & (MPOL_F_NODE|MPOL_F_ADDR)) return -EINVAL; *policy = 0; /* just so it's initialized */ + task_lock(current); *nmask = cpuset_current_mems_allowed; + task_unlock(current); return 0; } @@ -738,8 +783,11 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, } err = 0; - if (nmask) + if (nmask) { + task_lock(current); get_policy_nodemask(pol, nmask); + task_unlock(current); + } out: mpol_cond_put(pol); @@ -767,7 +815,7 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist, static struct page *new_node_page(struct page *page, unsigned long node, int **x) { - return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0); + return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0); } /* @@ -978,7 +1026,23 @@ static long do_mbind(unsigned long start, unsigned long len, if (err) return err; } - down_write(&mm->mmap_sem); + { + NODEMASK_SCRATCH(scratch); + if (scratch) { + down_write(&mm->mmap_sem); + task_lock(current); + err = mpol_set_nodemask(new, nmask, scratch); + task_unlock(current); + if (err) + up_write(&mm->mmap_sem); + } else + err = -ENOMEM; + NODEMASK_SCRATCH_FREE(scratch); + } + if (err) { + mpol_put(new); + return err; + } vma = check_range(mm, start, end, nmask, flags | MPOL_MF_INVERT, &pagelist); @@ -1545,8 +1609,6 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) struct mempolicy *pol = get_vma_policy(current, vma, addr); struct zonelist *zl; - cpuset_update_task_memory_state(); - if (unlikely(pol->mode == MPOL_INTERLEAVE)) { unsigned nid; @@ -1593,8 +1655,6 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) { struct mempolicy *pol = current->mempolicy; - if ((gfp & __GFP_WAIT) && !in_interrupt()) - cpuset_update_task_memory_state(); if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) pol = &default_policy; @@ -1851,27 +1911,46 @@ restart: * Install non-NULL @mpol in inode's shared policy rb-tree. * On entry, the current task has a reference on a non-NULL @mpol. * This must be released on exit. + * This is called at get_inode() calls and we can use GFP_KERNEL. 
*/ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) { + int ret; + sp->root = RB_ROOT; /* empty tree == default mempolicy */ spin_lock_init(&sp->lock); if (mpol) { struct vm_area_struct pvma; struct mempolicy *new; + NODEMASK_SCRATCH(scratch); + if (!scratch) + return; /* contextualize the tmpfs mount point mempolicy */ new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); - mpol_put(mpol); /* drop our ref on sb mpol */ - if (IS_ERR(new)) + if (IS_ERR(new)) { + mpol_put(mpol); /* drop our ref on sb mpol */ + NODEMASK_SCRATCH_FREE(scratch); return; /* no valid nodemask intersection */ + } + + task_lock(current); + ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch); + task_unlock(current); + mpol_put(mpol); /* drop our ref on sb mpol */ + if (ret) { + NODEMASK_SCRATCH_FREE(scratch); + mpol_put(new); + return; + } /* Create pseudo-vma that contains just the policy */ memset(&pvma, 0, sizeof(struct vm_area_struct)); pvma.vm_end = TASK_SIZE; /* policy covers entire file */ mpol_set_shared_policy(sp, &pvma, new); /* adds ref */ mpol_put(new); /* drop initial ref */ + NODEMASK_SCRATCH_FREE(scratch); } } @@ -2086,8 +2165,24 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) new = mpol_new(mode, mode_flags, &nodes); if (IS_ERR(new)) err = 1; - else if (no_context) - new->w.user_nodemask = nodes; /* save for contextualization */ + else { + int ret; + NODEMASK_SCRATCH(scratch); + if (scratch) { + task_lock(current); + ret = mpol_set_nodemask(new, &nodes, scratch); + task_unlock(current); + } else + ret = -ENOMEM; + NODEMASK_SCRATCH_FREE(scratch); + if (ret) { + err = 1; + mpol_put(new); + } else if (no_context) { + /* save for contextualization */ + new->w.user_nodemask = nodes; + } + } out: /* Restore string for error message */ diff --git a/mm/mempool.c b/mm/mempool.c index a46eb1b4bb6..32e75d40050 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -303,14 +303,14 @@ EXPORT_SYMBOL(mempool_free_slab); */ void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data) { - size_t size = (size_t)(long)pool_data; + size_t size = (size_t)pool_data; return kmalloc(size, gfp_mask); } EXPORT_SYMBOL(mempool_kmalloc); void *mempool_kzalloc(gfp_t gfp_mask, void *pool_data) { - size_t size = (size_t) pool_data; + size_t size = (size_t)pool_data; return kzalloc(size, gfp_mask); } EXPORT_SYMBOL(mempool_kzalloc); diff --git a/mm/migrate.c b/mm/migrate.c index 2bb4e1d6352..939888f9dda 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -250,7 +250,7 @@ out: * The number of remaining references must be: * 1 for anonymous pages without a mapping * 2 for pages with a mapping - * 3 for pages with a mapping and PagePrivate set. + * 3 for pages with a mapping and PagePrivate/PagePrivate2 set. */ static int migrate_page_move_mapping(struct address_space *mapping, struct page *newpage, struct page *page) @@ -270,7 +270,7 @@ static int migrate_page_move_mapping(struct address_space *mapping, pslot = radix_tree_lookup_slot(&mapping->page_tree, page_index(page)); - expected_count = 2 + !!PagePrivate(page); + expected_count = 2 + !!page_has_private(page); if (page_count(page) != expected_count || (struct page *)radix_tree_deref_slot(pslot) != page) { spin_unlock_irq(&mapping->tree_lock); @@ -386,7 +386,7 @@ EXPORT_SYMBOL(fail_migrate_page); /* * Common logic to directly migrate a single page suitable for - * pages that do not use PagePrivate. + * pages that do not use PagePrivate/PagePrivate2. * * Pages are locked upon entry and exit. 
*/ @@ -522,7 +522,7 @@ static int fallback_migrate_page(struct address_space *mapping, * Buffers may be managed in a filesystem specific way. * We must have no buffers or drop them. */ - if (PagePrivate(page) && + if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) return -EAGAIN; @@ -655,7 +655,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, * free the metadata, so the page can be freed. */ if (!page->mapping) { - if (!PageAnon(page) && PagePrivate(page)) { + if (!PageAnon(page) && page_has_private(page)) { /* * Go direct to try_to_free_buffers() here because * a) that's what try_to_release_page() would do anyway @@ -802,7 +802,7 @@ static struct page *new_page_node(struct page *p, unsigned long private, *result = &pm->status; - return alloc_pages_node(pm->node, + return alloc_pages_exact_node(pm->node, GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0); } @@ -820,7 +820,6 @@ static int do_move_page_to_node_array(struct mm_struct *mm, struct page_to_node *pp; LIST_HEAD(pagelist); - migrate_prep(); down_read(&mm->mmap_sem); /* @@ -907,6 +906,9 @@ static int do_pages_move(struct mm_struct *mm, struct task_struct *task, pm = (struct page_to_node *)__get_free_page(GFP_KERNEL); if (!pm) goto out; + + migrate_prep(); + /* * Store a chunk of page_to_node array in a page, * but keep the last one as a marker @@ -1129,7 +1131,7 @@ int migrate_vmas(struct mm_struct *mm, const nodemask_t *to, struct vm_area_struct *vma; int err = 0; - for(vma = mm->mmap; vma->vm_next && !err; vma = vma->vm_next) { + for (vma = mm->mmap; vma && !err; vma = vma->vm_next) { if (vma->vm_ops && vma->vm_ops->migrate) { err = vma->vm_ops->migrate(vma, to, from, flags); if (err) diff --git a/mm/mlock.c b/mm/mlock.c index 2904a347e47..45eb650b965 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -31,7 +31,6 @@ int can_do_mlock(void) } EXPORT_SYMBOL(can_do_mlock); -#ifdef CONFIG_UNEVICTABLE_LRU /* * Mlocked pages are marked with PageMlocked() flag for efficient testing * in vmscan and, possibly, the fault path; and to support semi-accurate @@ -261,27 +260,6 @@ static int __mlock_posix_error_return(long retval) return retval; } -#else /* CONFIG_UNEVICTABLE_LRU */ - -/* - * Just make pages present if VM_LOCKED. No-op if unlocking. - */ -static long __mlock_vma_pages_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end, - int mlock) -{ - if (mlock && (vma->vm_flags & VM_LOCKED)) - return make_pages_present(start, end); - return 0; -} - -static inline int __mlock_posix_error_return(long retval) -{ - return 0; -} - -#endif /* CONFIG_UNEVICTABLE_LRU */ - /** * mlock_vma_pages_range() - mlock pages in specified vma range. * @vma - the vma containing the specfied address range @@ -294,14 +272,10 @@ static inline int __mlock_posix_error_return(long retval) * * return number of pages [> 0] to be removed from locked_vm on success * of "special" vmas. - * - * return negative error if vma spanning @start-@range disappears while - * mmap semaphore is dropped. Unlikely? 
*/ long mlock_vma_pages_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) { - struct mm_struct *mm = vma->vm_mm; int nr_pages = (end - start) / PAGE_SIZE; BUG_ON(!(vma->vm_flags & VM_LOCKED)); @@ -314,20 +288,11 @@ long mlock_vma_pages_range(struct vm_area_struct *vma, if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) || is_vm_hugetlb_page(vma) || vma == get_gate_vma(current))) { - long error; - downgrade_write(&mm->mmap_sem); - - error = __mlock_vma_pages_range(vma, start, end, 1); - up_read(&mm->mmap_sem); - /* vma can change or disappear */ - down_write(&mm->mmap_sem); - vma = find_vma(mm, start); - /* non-NULL vma must contain @start, but need to check @end */ - if (!vma || end > vma->vm_end) - return -ENOMEM; + __mlock_vma_pages_range(vma, start, end, 1); - return 0; /* hide other errors from mmap(), et al */ + /* Hide errors from mmap() and other callers */ + return 0; } /* @@ -438,41 +403,14 @@ success: vma->vm_flags = newflags; if (lock) { - /* - * mmap_sem is currently held for write. Downgrade the write - * lock to a read lock so that other faults, mmap scans, ... - * while we fault in all pages. - */ - downgrade_write(&mm->mmap_sem); - ret = __mlock_vma_pages_range(vma, start, end, 1); - /* - * Need to reacquire mmap sem in write mode, as our callers - * expect this. We have no support for atomically upgrading - * a sem to write, so we need to check for ranges while sem - * is unlocked. - */ - up_read(&mm->mmap_sem); - /* vma can change or disappear */ - down_write(&mm->mmap_sem); - *prev = find_vma(mm, start); - /* non-NULL *prev must contain @start, but need to check @end */ - if (!(*prev) || end > (*prev)->vm_end) - ret = -ENOMEM; - else if (ret > 0) { + if (ret > 0) { mm->locked_vm -= ret; ret = 0; } else ret = __mlock_posix_error_return(ret); /* translate if needed */ } else { - /* - * TODO: for unlocking, pages will already be resident, so - * we don't need to wait for allocations/reclaim/pagein, ... - * However, unlocking a very large region can still take a - * while. Should we downgrade the semaphore for both lock - * AND unlock ? 
- */ __mlock_vma_pages_range(vma, start, end, 0); } @@ -669,47 +607,43 @@ void user_shm_unlock(size_t size, struct user_struct *user) free_uid(user); } -void *alloc_locked_buffer(size_t size) +int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim, + size_t size) { - unsigned long rlim, vm, pgsz; - void *buffer = NULL; + unsigned long lim, vm, pgsz; + int error = -ENOMEM; pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; - down_write(¤t->mm->mmap_sem); - - rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; - vm = current->mm->total_vm + pgsz; - if (rlim < vm) - goto out; + down_write(&mm->mmap_sem); - rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; - vm = current->mm->locked_vm + pgsz; - if (rlim < vm) + lim = rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; + vm = mm->total_vm + pgsz; + if (lim < vm) goto out; - buffer = kzalloc(size, GFP_KERNEL); - if (!buffer) + lim = rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; + vm = mm->locked_vm + pgsz; + if (lim < vm) goto out; - current->mm->total_vm += pgsz; - current->mm->locked_vm += pgsz; + mm->total_vm += pgsz; + mm->locked_vm += pgsz; + error = 0; out: - up_write(¤t->mm->mmap_sem); - return buffer; + up_write(&mm->mmap_sem); + return error; } -void free_locked_buffer(void *buffer, size_t size) +void refund_locked_memory(struct mm_struct *mm, size_t size) { unsigned long pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; - down_write(¤t->mm->mmap_sem); - - current->mm->total_vm -= pgsz; - current->mm->locked_vm -= pgsz; + down_write(&mm->mmap_sem); - up_write(¤t->mm->mmap_sem); + mm->total_vm -= pgsz; + mm->locked_vm -= pgsz; - kfree(buffer); + up_write(&mm->mmap_sem); } diff --git a/mm/mmap.c b/mm/mmap.c index 8d95902e9a3..8101de490c7 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -20,6 +20,7 @@ #include <linux/fs.h> #include <linux/personality.h> #include <linux/security.h> +#include <linux/ima.h> #include <linux/hugetlb.h> #include <linux/profile.h> #include <linux/module.h> @@ -27,6 +28,7 @@ #include <linux/mempolicy.h> #include <linux/rmap.h> #include <linux/mmu_notifier.h> +#include <linux/perf_counter.h> #include <asm/uaccess.h> #include <asm/cacheflush.h> @@ -84,7 +86,7 @@ EXPORT_SYMBOL(vm_get_page_prot); int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ int sysctl_overcommit_ratio = 50; /* default is 50% */ int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; -atomic_long_t vm_committed_space = ATOMIC_LONG_INIT(0); +struct percpu_counter vm_committed_as; /* * Check that a process has enough memory to allocate a new virtual @@ -178,11 +180,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) if (mm) allowed -= mm->total_vm / 32; - /* - * cast `allowed' as a signed long because vm_committed_space - * sometimes has a negative value - */ - if (atomic_long_read(&vm_committed_space) < (long)allowed) + if (percpu_counter_read_positive(&vm_committed_as) < allowed) return 0; error: vm_unacct_memory(pages); @@ -658,6 +656,9 @@ again: remove_next = 1 + (end > next->vm_end); validate_mm(mm); } +/* Flags that can be inherited from an existing mapping when merging */ +#define VM_MERGEABLE_FLAGS (VM_CAN_NONLINEAR) + /* * If the vma has a ->close operation then the driver probably needs to release * per-vma resources, so we don't attempt to merge those. 
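The mmap.c hunk that follows loosens is_mergeable_vma(): instead of requiring vm_flags to match exactly, flags named in VM_MERGEABLE_FLAGS (currently just VM_CAN_NONLINEAR) are ignored when deciding whether two mappings can merge. A standalone sketch of just that bit test; the flag values below are illustrative stand-ins rather than the kernel's definitions:

#include <stdio.h>

#define VM_READ			0x0001UL	/* illustrative values */
#define VM_WRITE		0x0002UL
#define VM_CAN_NONLINEAR	0x0800UL
#define VM_MERGEABLE_FLAGS	(VM_CAN_NONLINEAR)

static int flags_mergeable(unsigned long a, unsigned long b)
{
	/* Old test: a == b.  New test: ignore the inheritable bits. */
	return ((a ^ b) & ~VM_MERGEABLE_FLAGS) == 0;
}

int main(void)
{
	unsigned long vma_flags = VM_READ | VM_WRITE | VM_CAN_NONLINEAR;
	unsigned long new_flags = VM_READ | VM_WRITE;

	printf("old test (exact match): %s\n",
	       vma_flags == new_flags ? "merge" : "no merge");
	printf("new test (masked):      %s\n",
	       flags_mergeable(vma_flags, new_flags) ? "merge" : "no merge");
	return 0;
}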
@@ -665,7 +666,7 @@ again: remove_next = 1 + (end > next->vm_end); static inline int is_mergeable_vma(struct vm_area_struct *vma, struct file *file, unsigned long vm_flags) { - if (vma->vm_flags != vm_flags) + if ((vma->vm_flags ^ vm_flags) & ~VM_MERGEABLE_FLAGS) return 0; if (vma->vm_file != file) return 0; @@ -915,7 +916,6 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, struct inode *inode; unsigned int vm_flags; int error; - int accountable = 1; unsigned long reqprot = prot; /* @@ -1016,8 +1016,6 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, return -EPERM; vm_flags &= ~VM_MAYEXEC; } - if (is_file_hugepages(file)) - accountable = 0; if (!file->f_op || !file->f_op->mmap) return -ENODEV; @@ -1049,9 +1047,11 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, error = security_file_mmap(file, reqprot, prot, flags, addr, 0); if (error) return error; + error = ima_file_mmap(file, prot); + if (error) + return error; - return mmap_region(file, addr, len, flags, vm_flags, pgoff, - accountable); + return mmap_region(file, addr, len, flags, vm_flags, pgoff); } EXPORT_SYMBOL(do_mmap_pgoff); @@ -1087,10 +1087,25 @@ int vma_wants_writenotify(struct vm_area_struct *vma) mapping_cap_account_dirty(vma->vm_file->f_mapping); } +/* + * We account for memory if it's a private writeable mapping, + * not hugepages and VM_NORESERVE wasn't set. + */ +static inline int accountable_mapping(struct file *file, unsigned int vm_flags) +{ + /* + * hugetlb has its own accounting separate from the core VM + * VM_HUGETLB may not be set yet so we cannot check for that flag. + */ + if (file && is_file_hugepages(file)) + return 0; + + return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE; +} + unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long len, unsigned long flags, - unsigned int vm_flags, unsigned long pgoff, - int accountable) + unsigned int vm_flags, unsigned long pgoff) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; @@ -1114,38 +1129,38 @@ munmap_back: if (!may_expand_vm(mm, len >> PAGE_SHIFT)) return -ENOMEM; - if (flags & MAP_NORESERVE) - vm_flags |= VM_NORESERVE; + /* + * Set 'VM_NORESERVE' if we should not account for the + * memory use of this mapping. + */ + if ((flags & MAP_NORESERVE)) { + /* We honor MAP_NORESERVE if allowed to overcommit */ + if (sysctl_overcommit_memory != OVERCOMMIT_NEVER) + vm_flags |= VM_NORESERVE; - if (accountable && (!(flags & MAP_NORESERVE) || - sysctl_overcommit_memory == OVERCOMMIT_NEVER)) { - if (vm_flags & VM_SHARED) { - /* Check memory availability in shmem_file_setup? */ - vm_flags |= VM_ACCOUNT; - } else if (vm_flags & VM_WRITE) { - /* - * Private writable mapping: check memory availability - */ - charged = len >> PAGE_SHIFT; - if (security_vm_enough_memory(charged)) - return -ENOMEM; - vm_flags |= VM_ACCOUNT; - } + /* hugetlb applies strict overcommit unless MAP_NORESERVE */ + if (file && is_file_hugepages(file)) + vm_flags |= VM_NORESERVE; } /* - * Can we just expand an old private anonymous mapping? - * The VM_SHARED test is necessary because shmem_zero_setup - * will create the file object for a shared anonymous map below. 
+ * Private writable mapping: check memory availability */ - if (!file && !(vm_flags & VM_SHARED)) { - vma = vma_merge(mm, prev, addr, addr + len, vm_flags, - NULL, NULL, pgoff, NULL); - if (vma) - goto out; + if (accountable_mapping(file, vm_flags)) { + charged = len >> PAGE_SHIFT; + if (security_vm_enough_memory(charged)) + return -ENOMEM; + vm_flags |= VM_ACCOUNT; } /* + * Can we just expand an old mapping? + */ + vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, NULL); + if (vma) + goto out; + + /* * Determine the object being mapped and call the appropriate * specific mapper. the address has already been validated, but * not unmapped, but the maps are removed from the list. @@ -1186,14 +1201,6 @@ munmap_back: goto free_vma; } - /* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform - * shmem_zero_setup (perhaps called through /dev/zero's ->mmap) - * that memory reservation must be checked; but that reservation - * belongs to shared memory object, not to vma: so now clear it. - */ - if ((vm_flags & (VM_SHARED|VM_ACCOUNT)) == (VM_SHARED|VM_ACCOUNT)) - vma->vm_flags &= ~VM_ACCOUNT; - /* Can addr have changed?? * * Answer: Yes, several device drivers can do it in their @@ -1206,22 +1213,15 @@ munmap_back: if (vma_wants_writenotify(vma)) vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED); - if (file && vma_merge(mm, prev, addr, vma->vm_end, - vma->vm_flags, NULL, file, pgoff, vma_policy(vma))) { - mpol_put(vma_policy(vma)); - kmem_cache_free(vm_area_cachep, vma); - fput(file); - if (vm_flags & VM_EXECUTABLE) - removed_exe_file_vma(mm); - } else { - vma_link(mm, vma, prev, rb_link, rb_parent); - file = vma->vm_file; - } + vma_link(mm, vma, prev, rb_link, rb_parent); + file = vma->vm_file; /* Once vma denies write, undo our temporary denial count */ if (correct_wcount) atomic_inc(&inode->i_writecount); out: + perf_counter_mmap(vma); + mm->total_vm += len >> PAGE_SHIFT; vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); if (vm_flags & VM_LOCKED) { @@ -1574,7 +1574,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns * Overcommit.. This must be the final test, as it will * update security statistics. 
*/ - if (security_vm_enough_memory(grow)) + if (security_vm_enough_memory_mm(mm, grow)) return -ENOMEM; /* Ok, everything looks good - let it rip */ @@ -2087,12 +2087,8 @@ void exit_mmap(struct mm_struct *mm) unsigned long end; /* mm's last user has gone, and its about to be pulled down */ - arch_exit_mmap(mm); mmu_notifier_release(mm); - if (!mm->mmap) /* Can happen if dup_mmap() received an OOM */ - return; - if (mm->locked_vm) { vma = mm->mmap; while (vma) { @@ -2101,7 +2097,13 @@ void exit_mmap(struct mm_struct *mm) vma = vma->vm_next; } } + + arch_exit_mmap(mm); + vma = mm->mmap; + if (!vma) /* Can happen if dup_mmap() received an OOM */ + return; + lru_add_drain(); flush_cache_mm(mm); tlb = tlb_gather_mmu(mm, 1); @@ -2306,6 +2308,8 @@ int install_special_mapping(struct mm_struct *mm, mm->total_vm += len >> PAGE_SHIFT; + perf_counter_mmap(vma); + return 0; } @@ -2478,7 +2482,8 @@ void mm_drop_all_locks(struct mm_struct *mm) */ void __init mmap_init(void) { - vm_area_cachep = kmem_cache_create("vm_area_struct", - sizeof(struct vm_area_struct), 0, - SLAB_PANIC, NULL); + int ret; + + ret = percpu_counter_init(&vm_committed_as, 0); + VM_BUG_ON(ret); } diff --git a/mm/mmzone.c b/mm/mmzone.c index 16ce8b955dc..f5b7d176021 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -6,6 +6,7 @@ #include <linux/stddef.h> +#include <linux/mm.h> #include <linux/mmzone.h> #include <linux/module.h> @@ -72,3 +73,17 @@ struct zoneref *next_zones_zonelist(struct zoneref *z, *zone = zonelist_zone(z); return z; } + +#ifdef CONFIG_ARCH_HAS_HOLES_MEMORYMODEL +int memmap_valid_within(unsigned long pfn, + struct page *page, struct zone *zone) +{ + if (page_to_pfn(page) != pfn) + return 0; + + if (page_zone(page) != zone) + return 0; + + return 1; +} +#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ diff --git a/mm/mprotect.c b/mm/mprotect.c index abe2694e13f..d80311baeb2 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -23,6 +23,7 @@ #include <linux/swapops.h> #include <linux/mmu_notifier.h> #include <linux/migrate.h> +#include <linux/perf_counter.h> #include <asm/uaccess.h> #include <asm/pgtable.h> #include <asm/cacheflush.h> @@ -151,10 +152,11 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, /* * If we make a private mapping writable we increase our commit; * but (without finer accounting) cannot reduce our commit if we - * make it unwritable again. + * make it unwritable again. 
hugetlb mapping were accounted for + * even if read-only so there is no need to account for them here */ if (newflags & VM_WRITE) { - if (!(oldflags & (VM_ACCOUNT|VM_WRITE| + if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB| VM_SHARED|VM_NORESERVE))) { charged = nrpages; if (security_vm_enough_memory(charged)) @@ -298,6 +300,7 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, error = mprotect_fixup(vma, &prev, nstart, tmp, newflags); if (error) goto out; + perf_counter_mmap(vma); nstart = tmp; if (nstart < prev->vm_end) diff --git a/mm/nommu.c b/mm/nommu.c index 8cee8c8ff0f..66e81e7e9fe 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -10,7 +10,7 @@ * Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com> * Copyright (c) 2000-2001 D Jeff Dionne <jeff@uClinux.org> * Copyright (c) 2002 Greg Ungerer <gerg@snapgear.com> - * Copyright (c) 2007-2008 Paul Mundt <lethal@linux-sh.org> + * Copyright (c) 2007-2009 Paul Mundt <lethal@linux-sh.org> */ #include <linux/module.h> @@ -62,14 +62,14 @@ void *high_memory; struct page *mem_map; unsigned long max_mapnr; unsigned long num_physpages; -atomic_long_t vm_committed_space = ATOMIC_LONG_INIT(0); +struct percpu_counter vm_committed_as; int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ int sysctl_overcommit_ratio = 50; /* default is 50% */ int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; -int sysctl_nr_trim_pages = 1; /* page trimming behaviour */ +int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; int heap_stack_gap = 0; -atomic_t mmap_pages_allocated; +atomic_long_t mmap_pages_allocated; EXPORT_SYMBOL(mem_map); EXPORT_SYMBOL(num_physpages); @@ -170,8 +170,8 @@ unsigned int kobjsize(const void *objp) } int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, int len, int flags, - struct page **pages, struct vm_area_struct **vmas) + unsigned long start, int nr_pages, int flags, + struct page **pages, struct vm_area_struct **vmas) { struct vm_area_struct *vma; unsigned long vm_flags; @@ -186,7 +186,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); - for (i = 0; i < len; i++) { + for (i = 0; i < nr_pages; i++) { vma = find_vma(mm, start); if (!vma) goto finish_or_fault; @@ -221,7 +221,7 @@ finish_or_fault: * - don't permit access to VMAs that don't support it, such as I/O mappings */ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, int len, int write, int force, + unsigned long start, int nr_pages, int write, int force, struct page **pages, struct vm_area_struct **vmas) { int flags = 0; @@ -231,12 +231,31 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, if (force) flags |= GUP_FLAGS_FORCE; - return __get_user_pages(tsk, mm, - start, len, flags, - pages, vmas); + return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas); } EXPORT_SYMBOL(get_user_pages); +/** + * follow_pfn - look up PFN at a user virtual address + * @vma: memory mapping + * @address: user virtual address + * @pfn: location to store found PFN + * + * Only IO mappings and raw PFN mappings are allowed. + * + * Returns zero and the pfn at @pfn on success, -ve otherwise. 
+ */ +int follow_pfn(struct vm_area_struct *vma, unsigned long address, + unsigned long *pfn) +{ + if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) + return -EINVAL; + + *pfn = address >> PAGE_SHIFT; + return 0; +} +EXPORT_SYMBOL(follow_pfn); + DEFINE_RWLOCK(vmlist_lock); struct vm_struct *vmlist; @@ -394,6 +413,24 @@ void vunmap(const void *addr) } EXPORT_SYMBOL(vunmap); +void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot) +{ + BUG(); + return NULL; +} +EXPORT_SYMBOL(vm_map_ram); + +void vm_unmap_ram(const void *mem, unsigned int count) +{ + BUG(); +} +EXPORT_SYMBOL(vm_unmap_ram); + +void vm_unmap_aliases(void) +{ +} +EXPORT_SYMBOL_GPL(vm_unmap_aliases); + /* * Implement a stub for vmalloc_sync_all() if the architecture chose not to * have one. @@ -445,12 +482,11 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) */ void __init mmap_init(void) { - vm_region_jar = kmem_cache_create("vm_region_jar", - sizeof(struct vm_region), 0, - SLAB_PANIC, NULL); - vm_area_cachep = kmem_cache_create("vm_area_struct", - sizeof(struct vm_area_struct), 0, - SLAB_PANIC, NULL); + int ret; + + ret = percpu_counter_init(&vm_committed_as, 0); + VM_BUG_ON(ret); + vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC); } /* @@ -468,27 +504,24 @@ static noinline void validate_nommu_regions(void) return; last = rb_entry(lastp, struct vm_region, vm_rb); - if (unlikely(last->vm_end <= last->vm_start)) - BUG(); - if (unlikely(last->vm_top < last->vm_end)) - BUG(); + BUG_ON(unlikely(last->vm_end <= last->vm_start)); + BUG_ON(unlikely(last->vm_top < last->vm_end)); while ((p = rb_next(lastp))) { region = rb_entry(p, struct vm_region, vm_rb); last = rb_entry(lastp, struct vm_region, vm_rb); - if (unlikely(region->vm_end <= region->vm_start)) - BUG(); - if (unlikely(region->vm_top < region->vm_end)) - BUG(); - if (unlikely(region->vm_start < last->vm_top)) - BUG(); + BUG_ON(unlikely(region->vm_end <= region->vm_start)); + BUG_ON(unlikely(region->vm_top < region->vm_end)); + BUG_ON(unlikely(region->vm_start < last->vm_top)); lastp = p; } } #else -#define validate_nommu_regions() do {} while(0) +static void validate_nommu_regions(void) +{ +} #endif /* @@ -501,8 +534,6 @@ static void add_nommu_region(struct vm_region *region) validate_nommu_regions(); - BUG_ON(region->vm_start & ~PAGE_MASK); - parent = NULL; p = &nommu_region_tree.rb_node; while (*p) { @@ -545,16 +576,17 @@ static void free_page_series(unsigned long from, unsigned long to) struct page *page = virt_to_page(from); kdebug("- free %lx", from); - atomic_dec(&mmap_pages_allocated); + atomic_long_dec(&mmap_pages_allocated); if (page_count(page) != 1) - kdebug("free page %p [%d]", page, page_count(page)); + kdebug("free page %p: refcount not one: %d", + page, page_count(page)); put_page(page); } } /* * release a reference to a region - * - the caller must hold the region semaphore, which this releases + * - the caller must hold the region semaphore for writing, which this releases * - the region may not have been added to the tree yet, in which case vm_top * will equal vm_start */ @@ -887,6 +919,10 @@ static int validate_mmap_request(struct file *file, if (!file->f_op->read) capabilities &= ~BDI_CAP_MAP_COPY; + /* The file shall have been opened with read permission. 
*/ + if (!(file->f_mode & FMODE_READ)) + return -EACCES; + if (flags & MAP_SHARED) { /* do checks for writing, appending and locking */ if ((prot & PROT_WRITE) && @@ -1078,7 +1114,7 @@ static int do_mmap_private(struct vm_area_struct *vma, goto enomem; total = 1 << order; - atomic_add(total, &mmap_pages_allocated); + atomic_long_add(total, &mmap_pages_allocated); point = rlen >> PAGE_SHIFT; @@ -1089,7 +1125,7 @@ static int do_mmap_private(struct vm_area_struct *vma, order = ilog2(total - point); n = 1 << order; kdebug("shave %lu/%lu @%lu", n, total - point, total); - atomic_sub(n, &mmap_pages_allocated); + atomic_long_sub(n, &mmap_pages_allocated); total -= n; set_page_refcounted(pages + total); __free_pages(pages + total, order); @@ -1143,8 +1179,8 @@ error_free: return ret; enomem: - printk("Allocation of length %lu from process %d failed\n", - len, current->pid); + printk("Allocation of length %lu from process %d (%s) failed\n", + len, current->pid, current->comm); show_free_areas(); return -ENOMEM; } @@ -1316,6 +1352,7 @@ unsigned long do_mmap_pgoff(struct file *file, } vma->vm_region = region; + add_nommu_region(region); /* set up the mapping */ if (file && vma->vm_flags & VM_SHARED) @@ -1325,8 +1362,6 @@ unsigned long do_mmap_pgoff(struct file *file, if (ret < 0) goto error_put_region; - add_nommu_region(region); - /* okay... we have a mapping; now we have to register it */ result = vma->vm_start; @@ -1518,10 +1553,15 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) /* find the first potentially overlapping VMA */ vma = find_vma(mm, start); if (!vma) { - printk(KERN_WARNING - "munmap of memory not mmapped by process %d (%s):" - " 0x%lx-0x%lx\n", - current->pid, current->comm, start, start + len - 1); + static int limit = 0; + if (limit < 5) { + printk(KERN_WARNING + "munmap of memory not mmapped by process %d" + " (%s): 0x%lx-0x%lx\n", + current->pid, current->comm, + start, start + len - 1); + limit++; + } return -EINVAL; } @@ -1831,12 +1871,9 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) if (mm) allowed -= mm->total_vm / 32; - /* - * cast `allowed' as a signed long because vm_committed_space - * sometimes has a negative value - */ - if (atomic_long_read(&vm_committed_space) < (long)allowed) + if (percpu_counter_read_positive(&vm_committed_as) < allowed) return 0; + error: vm_unacct_memory(pages); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 40ba05061a4..a7b2460e922 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -55,7 +55,7 @@ static DEFINE_SPINLOCK(zone_scan_lock); unsigned long badness(struct task_struct *p, unsigned long uptime) { - unsigned long points, cpu_time, run_time, s; + unsigned long points, cpu_time, run_time; struct mm_struct *mm; struct task_struct *child; @@ -110,12 +110,10 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) else run_time = 0; - s = int_sqrt(cpu_time); - if (s) - points /= s; - s = int_sqrt(int_sqrt(run_time)); - if (s) - points /= s; + if (cpu_time) + points /= int_sqrt(cpu_time); + if (run_time) + points /= int_sqrt(int_sqrt(run_time)); /* * Niced processes are most likely less important, so double @@ -286,22 +284,28 @@ static void dump_tasks(const struct mem_cgroup *mem) printk(KERN_INFO "[ pid ] uid tgid total_vm rss cpu oom_adj " "name\n"); do_each_thread(g, p) { - /* - * total_vm and rss sizes do not exist for tasks with a - * detached mm so there's no need to report them. 
- */ - if (!p->mm) - continue; + struct mm_struct *mm; + if (mem && !task_in_mem_cgroup(p, mem)) continue; if (!thread_group_leader(p)) continue; task_lock(p); + mm = p->mm; + if (!mm) { + /* + * total_vm and rss sizes do not exist for tasks with no + * mm so there's no need to report them; they can't be + * oom killed anyway. + */ + task_unlock(p); + continue; + } printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", - p->pid, __task_cred(p)->uid, p->tgid, - p->mm->total_vm, get_mm_rss(p->mm), (int)task_cpu(p), - p->oomkilladj, p->comm); + p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm, + get_mm_rss(mm), (int)task_cpu(p), p->oomkilladj, + p->comm); task_unlock(p); } while_each_thread(g, p); } @@ -396,6 +400,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, cpuset_print_task_mems_allowed(current); task_unlock(current); dump_stack(); + mem_cgroup_print_oom_info(mem, current); show_mem(); if (sysctl_oom_dump_tasks) dump_tasks(mem); @@ -515,34 +520,32 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask) */ static void __out_of_memory(gfp_t gfp_mask, int order) { - if (sysctl_oom_kill_allocating_task) { - oom_kill_process(current, gfp_mask, order, 0, NULL, - "Out of memory (oom_kill_allocating_task)"); - - } else { - unsigned long points; - struct task_struct *p; - -retry: - /* - * Rambo mode: Shoot down a process and hope it solves whatever - * issues we may have. - */ - p = select_bad_process(&points, NULL); + struct task_struct *p; + unsigned long points; - if (PTR_ERR(p) == -1UL) + if (sysctl_oom_kill_allocating_task) + if (!oom_kill_process(current, gfp_mask, order, 0, NULL, + "Out of memory (oom_kill_allocating_task)")) return; +retry: + /* + * Rambo mode: Shoot down a process and hope it solves whatever + * issues we may have. + */ + p = select_bad_process(&points, NULL); - /* Found nothing?!?! Either we hang forever, or we panic. */ - if (!p) { - read_unlock(&tasklist_lock); - panic("Out of memory and no killable processes...\n"); - } + if (PTR_ERR(p) == -1UL) + return; - if (oom_kill_process(p, gfp_mask, order, points, NULL, - "Out of memory")) - goto retry; + /* Found nothing?!?! Either we hang forever, or we panic. */ + if (!p) { + read_unlock(&tasklist_lock); + panic("Out of memory and no killable processes...\n"); } + + if (oom_kill_process(p, gfp_mask, order, points, NULL, + "Out of memory")) + goto retry; } /* diff --git a/mm/page-writeback.c b/mm/page-writeback.c index b493db7841d..25e7770309b 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -36,15 +36,6 @@ #include <linux/pagevec.h> /* - * The maximum number of pages to writeout in a single bdflush/kupdate - * operation. We do this so we don't hold I_SYNC against an inode for - * enormous amounts of time, which would block a userspace task which has - * been forced to throttle against that inode. Also, the code reevaluates - * the dirty each time it has written this many pages. - */ -#define MAX_WRITEBACK_PAGES 1024 - -/* * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited * will look to see if it needs to force writeback or throttling. 
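For the badness() hunk earlier in this patch, the effect is easier to see in isolation: the score is divided by the square root of the accumulated CPU time and by the fourth root of the run time, and the divisions are now simply skipped when either value is zero instead of going through a temporary. A rough userspace sketch; int_sqrt() is re-implemented here and the sample numbers are arbitrary.

#include <stdio.h>

/* crude integer square root, standing in for the kernel's int_sqrt() */
static unsigned long int_sqrt_u(unsigned long x)
{
	unsigned long r = 0;
	while ((r + 1) * (r + 1) <= x)
		r++;
	return r;
}

int main(void)
{
	unsigned long points = 100000;	/* arbitrary starting score */
	unsigned long cpu_time = 400;	/* arbitrary accumulated CPU time */
	unsigned long run_time = 10000;	/* arbitrary run time */

	/* long-running, CPU-hungry tasks get their score scaled down */
	if (cpu_time)
		points /= int_sqrt_u(cpu_time);
	if (run_time)
		points /= int_sqrt_u(int_sqrt_u(run_time));

	printf("badness points: %lu\n", points);
	return 0;
}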
*/ @@ -66,7 +57,7 @@ static inline long sync_writeback_pages(void) /* * Start background writeback (via pdflush) at this percentage */ -int dirty_background_ratio = 5; +int dirty_background_ratio = 10; /* * dirty_background_bytes starts at 0 (disabled) so that it is a function of @@ -83,7 +74,7 @@ int vm_highmem_is_dirtyable; /* * The generator of dirty data starts writeback at this percentage */ -int vm_dirty_ratio = 10; +int vm_dirty_ratio = 20; /* * vm_dirty_bytes starts at 0 (disabled) so that it is a function of @@ -92,14 +83,14 @@ int vm_dirty_ratio = 10; unsigned long vm_dirty_bytes; /* - * The interval between `kupdate'-style writebacks, in jiffies + * The interval between `kupdate'-style writebacks */ -int dirty_writeback_interval = 5 * HZ; +unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */ /* - * The longest number of jiffies for which data is allowed to remain dirty + * The longest time for which data is allowed to remain dirty */ -int dirty_expire_interval = 30 * HZ; +unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */ /* * Flag that makes the machine dump writes/reads and block dirtyings. @@ -117,8 +108,6 @@ EXPORT_SYMBOL(laptop_mode); /* End of sysctl-exported parameters */ -static void background_writeout(unsigned long _min_pages); - /* * Scale the writeback cache size proportional to the relative writeout speeds. * @@ -209,7 +198,7 @@ int dirty_bytes_handler(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { - int old_bytes = vm_dirty_bytes; + unsigned long old_bytes = vm_dirty_bytes; int ret; ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); @@ -240,7 +229,7 @@ void bdi_writeout_inc(struct backing_dev_info *bdi) } EXPORT_SYMBOL_GPL(bdi_writeout_inc); -static inline void task_dirty_inc(struct task_struct *tsk) +void task_dirty_inc(struct task_struct *tsk) { prop_inc_single(&vm_dirties, &tsk->dirties); } @@ -265,18 +254,19 @@ static void bdi_writeout_fraction(struct backing_dev_info *bdi, * This avoids exceeding the total dirty_limit when the floating averages * fluctuate too quickly. 
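The two interval sysctls above now hold centiseconds rather than jiffies, so consumers convert at the point of use (the kernel does the equivalent with helpers such as msecs_to_jiffies()). A small sketch of the conversion; the HZ value here is just an example, it varies by configuration.

#include <stdio.h>

#define HZ 250	/* assumption: example tick rate */

/* centiseconds -> jiffies, the conversion consumers now perform */
static unsigned long cs_to_jiffies(unsigned int centisecs)
{
	return (unsigned long)centisecs * HZ / 100;
}

int main(void)
{
	unsigned int dirty_writeback_interval = 5 * 100;   /* 5 seconds */
	unsigned int dirty_expire_interval    = 30 * 100;  /* 30 seconds */

	printf("writeback every %lu jiffies\n",
	       cs_to_jiffies(dirty_writeback_interval));
	printf("expire after %lu jiffies\n",
	       cs_to_jiffies(dirty_expire_interval));
	return 0;
}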
*/ -static void -clip_bdi_dirty_limit(struct backing_dev_info *bdi, long dirty, long *pbdi_dirty) +static void clip_bdi_dirty_limit(struct backing_dev_info *bdi, + unsigned long dirty, unsigned long *pbdi_dirty) { - long avail_dirty; + unsigned long avail_dirty; - avail_dirty = dirty - - (global_page_state(NR_FILE_DIRTY) + + avail_dirty = global_page_state(NR_FILE_DIRTY) + global_page_state(NR_WRITEBACK) + global_page_state(NR_UNSTABLE_NFS) + - global_page_state(NR_WRITEBACK_TEMP)); + global_page_state(NR_WRITEBACK_TEMP); - if (avail_dirty < 0) + if (avail_dirty < dirty) + avail_dirty = dirty - avail_dirty; + else avail_dirty = 0; avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) + @@ -299,10 +289,10 @@ static inline void task_dirties_fraction(struct task_struct *tsk, * * dirty -= (dirty/8) * p_{t} */ -static void task_dirty_limit(struct task_struct *tsk, long *pdirty) +static void task_dirty_limit(struct task_struct *tsk, unsigned long *pdirty) { long numerator, denominator; - long dirty = *pdirty; + unsigned long dirty = *pdirty; u64 inv = dirty >> 3; task_dirties_fraction(tsk, &numerator, &denominator); @@ -319,15 +309,13 @@ static void task_dirty_limit(struct task_struct *tsk, long *pdirty) /* * */ -static DEFINE_SPINLOCK(bdi_lock); static unsigned int bdi_min_ratio; int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) { int ret = 0; - unsigned long flags; - spin_lock_irqsave(&bdi_lock, flags); + spin_lock(&bdi_lock); if (min_ratio > bdi->max_ratio) { ret = -EINVAL; } else { @@ -339,27 +327,26 @@ int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) ret = -EINVAL; } } - spin_unlock_irqrestore(&bdi_lock, flags); + spin_unlock(&bdi_lock); return ret; } int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio) { - unsigned long flags; int ret = 0; if (max_ratio > 100) return -EINVAL; - spin_lock_irqsave(&bdi_lock, flags); + spin_lock(&bdi_lock); if (bdi->min_ratio > max_ratio) { ret = -EINVAL; } else { bdi->max_ratio = max_ratio; bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100; } - spin_unlock_irqrestore(&bdi_lock, flags); + spin_unlock(&bdi_lock); return ret; } @@ -540,9 +527,12 @@ static void balance_dirty_pages(struct address_space *mapping) * filesystems (i.e. NFS) in which data may have been * written to the server's write cache, but has not yet * been flushed to permanent storage. + * Only move pages to writeback if this bdi is over its + * threshold otherwise wait until the disk writes catch + * up. */ - if (bdi_nr_reclaimable) { - writeback_inodes(&wbc); + if (bdi_nr_reclaimable > bdi_thresh) { + writeback_inodes_wbc(&wbc); pages_written += write_chunk - wbc.nr_to_write; get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi); @@ -571,7 +561,7 @@ static void balance_dirty_pages(struct address_space *mapping) if (pages_written >= write_chunk) break; /* We've done our duty */ - congestion_wait(WRITE, HZ/10); + schedule_timeout(1); } if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh && @@ -590,10 +580,18 @@ static void balance_dirty_pages(struct address_space *mapping) * background_thresh, to keep the amount of dirty memory low. 
*/ if ((laptop_mode && pages_written) || - (!laptop_mode && (global_page_state(NR_FILE_DIRTY) - + global_page_state(NR_UNSTABLE_NFS) - > background_thresh))) - pdflush_operation(background_writeout, 0); + (!laptop_mode && ((nr_writeback = global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS)) + > background_thresh))) { + struct writeback_control wbc = { + .bdi = bdi, + .sync_mode = WB_SYNC_NONE, + .nr_to_write = nr_writeback, + }; + + + bdi_start_writeback(&wbc); + } } void set_page_dirty_balance(struct page *page, int page_mkwrite) @@ -665,7 +663,7 @@ void throttle_vm_writeout(gfp_t gfp_mask) if (global_page_state(NR_UNSTABLE_NFS) + global_page_state(NR_WRITEBACK) <= dirty_thresh) break; - congestion_wait(WRITE, HZ/10); + congestion_wait(BLK_RW_ASYNC, HZ/10); /* * The caller might hold locks which can prevent IO completion @@ -677,152 +675,35 @@ void throttle_vm_writeout(gfp_t gfp_mask) } } -/* - * writeback at least _min_pages, and keep writing until the amount of dirty - * memory is less than the background threshold, or until we're all clean. - */ -static void background_writeout(unsigned long _min_pages) -{ - long min_pages = _min_pages; - struct writeback_control wbc = { - .bdi = NULL, - .sync_mode = WB_SYNC_NONE, - .older_than_this = NULL, - .nr_to_write = 0, - .nonblocking = 1, - .range_cyclic = 1, - }; - - for ( ; ; ) { - unsigned long background_thresh; - unsigned long dirty_thresh; - - get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); - if (global_page_state(NR_FILE_DIRTY) + - global_page_state(NR_UNSTABLE_NFS) < background_thresh - && min_pages <= 0) - break; - wbc.more_io = 0; - wbc.encountered_congestion = 0; - wbc.nr_to_write = MAX_WRITEBACK_PAGES; - wbc.pages_skipped = 0; - writeback_inodes(&wbc); - min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; - if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) { - /* Wrote less than expected */ - if (wbc.encountered_congestion || wbc.more_io) - congestion_wait(WRITE, HZ/10); - else - break; - } - } -} - -/* - * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back - * the whole world. Returns 0 if a pdflush thread was dispatched. Returns - * -1 if all pdflush threads were busy. - */ -int wakeup_pdflush(long nr_pages) -{ - if (nr_pages == 0) - nr_pages = global_page_state(NR_FILE_DIRTY) + - global_page_state(NR_UNSTABLE_NFS); - return pdflush_operation(background_writeout, nr_pages); -} - -static void wb_timer_fn(unsigned long unused); static void laptop_timer_fn(unsigned long unused); -static DEFINE_TIMER(wb_timer, wb_timer_fn, 0, 0); static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0); /* - * Periodic writeback of "old" data. - * - * Define "old": the first time one of an inode's pages is dirtied, we mark the - * dirtying-time in the inode's address_space. So this periodic writeback code - * just walks the superblock inode list, writing back any inodes which are - * older than a specific point in time. - * - * Try to run once per dirty_writeback_interval. But if a writeback event - * takes longer than a dirty_writeback_interval interval, then leave a - * one-second gap. - * - * older_than_this takes precedence over nr_to_write. So we'll only write back - * all dirty pages if they are all attached to "old" mappings. 
- */ -static void wb_kupdate(unsigned long arg) -{ - unsigned long oldest_jif; - unsigned long start_jif; - unsigned long next_jif; - long nr_to_write; - struct writeback_control wbc = { - .bdi = NULL, - .sync_mode = WB_SYNC_NONE, - .older_than_this = &oldest_jif, - .nr_to_write = 0, - .nonblocking = 1, - .for_kupdate = 1, - .range_cyclic = 1, - }; - - sync_supers(); - - oldest_jif = jiffies - dirty_expire_interval; - start_jif = jiffies; - next_jif = start_jif + dirty_writeback_interval; - nr_to_write = global_page_state(NR_FILE_DIRTY) + - global_page_state(NR_UNSTABLE_NFS) + - (inodes_stat.nr_inodes - inodes_stat.nr_unused); - while (nr_to_write > 0) { - wbc.more_io = 0; - wbc.encountered_congestion = 0; - wbc.nr_to_write = MAX_WRITEBACK_PAGES; - writeback_inodes(&wbc); - if (wbc.nr_to_write > 0) { - if (wbc.encountered_congestion || wbc.more_io) - congestion_wait(WRITE, HZ/10); - else - break; /* All the old data is written */ - } - nr_to_write -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; - } - if (time_before(next_jif, jiffies + HZ)) - next_jif = jiffies + HZ; - if (dirty_writeback_interval) - mod_timer(&wb_timer, next_jif); -} - -/* * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs */ int dirty_writeback_centisecs_handler(ctl_table *table, int write, struct file *file, void __user *buffer, size_t *length, loff_t *ppos) { - proc_dointvec_userhz_jiffies(table, write, file, buffer, length, ppos); - if (dirty_writeback_interval) - mod_timer(&wb_timer, jiffies + dirty_writeback_interval); - else - del_timer(&wb_timer); + proc_dointvec(table, write, file, buffer, length, ppos); return 0; } -static void wb_timer_fn(unsigned long unused) -{ - if (pdflush_operation(wb_kupdate, 0) < 0) - mod_timer(&wb_timer, jiffies + HZ); /* delay 1 second */ -} - -static void laptop_flush(unsigned long unused) +static void do_laptop_sync(struct work_struct *work) { - sys_sync(); + wakeup_flusher_threads(0); + kfree(work); } static void laptop_timer_fn(unsigned long unused) { - pdflush_operation(laptop_flush, 0); + struct work_struct *work; + + work = kmalloc(sizeof(*work), GFP_ATOMIC); + if (work) { + INIT_WORK(work, do_laptop_sync); + schedule_work(work); + } } /* @@ -905,7 +786,6 @@ void __init page_writeback_init(void) { int shift; - mod_timer(&wb_timer, jiffies + dirty_writeback_interval); writeback_set_ratelimit(); register_cpu_notifier(&ratelimit_nb); @@ -1051,13 +931,25 @@ continue_unlock: } } - if (wbc->sync_mode == WB_SYNC_NONE) { - wbc->nr_to_write--; - if (wbc->nr_to_write <= 0) { + if (nr_to_write > 0) { + nr_to_write--; + if (nr_to_write == 0 && + wbc->sync_mode == WB_SYNC_NONE) { + /* + * We stop writing back only if we are + * not doing integrity sync. In case of + * integrity sync we have to keep going + * because someone may be concurrently + * dirtying pages, and we might have + * synced a lot of newly appeared dirty + * pages, but have not synced all of the + * old dirty pages. + */ done = 1; break; } } + if (wbc->nonblocking && bdi_write_congested(bdi)) { wbc->encountered_congestion = 1; done = 1; @@ -1067,7 +959,7 @@ continue_unlock: pagevec_release(&pvec); cond_resched(); } - if (!cycled) { + if (!cycled && !done) { /* * range_cyclic: * We hit the last page and there is more work to be done: wrap @@ -1186,6 +1078,20 @@ int __set_page_dirty_no_writeback(struct page *page) } /* + * Helper function for set_page_dirty family. + * NOTE: This relies on being atomic wrt interrupts. 
+ */ +void account_page_dirtied(struct page *page, struct address_space *mapping) +{ + if (mapping_cap_account_dirty(mapping)) { + __inc_zone_page_state(page, NR_FILE_DIRTY); + __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); + task_dirty_inc(current); + task_io_account_write(PAGE_CACHE_SIZE); + } +} + +/* * For address_spaces which do not use buffers. Just tag the page as dirty in * its radix tree. * @@ -1214,12 +1120,7 @@ int __set_page_dirty_nobuffers(struct page *page) if (mapping2) { /* Race with truncate? */ BUG_ON(mapping2 != mapping); WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); - if (mapping_cap_account_dirty(mapping)) { - __inc_zone_page_state(page, NR_FILE_DIRTY); - __inc_bdi_stat(mapping->backing_dev_info, - BDI_RECLAIMABLE); - task_io_account_write(PAGE_CACHE_SIZE); - } + account_page_dirtied(page, mapping); radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); } @@ -1250,7 +1151,7 @@ EXPORT_SYMBOL(redirty_page_for_writepage); * If the mapping doesn't provide a set_page_dirty a_op, then * just fall through and assume that it wants buffer_heads. */ -static int __set_page_dirty(struct page *page) +int set_page_dirty(struct page *page) { struct address_space *mapping = page_mapping(page); @@ -1268,14 +1169,6 @@ static int __set_page_dirty(struct page *page) } return 0; } - -int set_page_dirty(struct page *page) -{ - int ret = __set_page_dirty(page); - if (ret) - task_dirty_inc(current); - return ret; -} EXPORT_SYMBOL(set_page_dirty); /* diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5675b307385..a0de15f4698 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -23,6 +23,7 @@ #include <linux/bootmem.h> #include <linux/compiler.h> #include <linux/kernel.h> +#include <linux/kmemcheck.h> #include <linux/module.h> #include <linux/suspend.h> #include <linux/pagevec.h> @@ -46,6 +47,7 @@ #include <linux/page-isolation.h> #include <linux/page_cgroup.h> #include <linux/debugobjects.h> +#include <linux/kmemleak.h> #include <asm/tlbflush.h> #include <asm/div64.h> @@ -71,6 +73,7 @@ unsigned long totalram_pages __read_mostly; unsigned long totalreserve_pages __read_mostly; unsigned long highest_memmap_pfn __read_mostly; int percpu_pagelist_fraction; +gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE int pageblock_order __read_mostly; @@ -149,10 +152,6 @@ static unsigned long __meminitdata dma_reserve; static int __meminitdata nr_nodemap_entries; static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; -#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE - static unsigned long __meminitdata node_boundary_start_pfn[MAX_NUMNODES]; - static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES]; -#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ static unsigned long __initdata required_kernelcore; static unsigned long __initdata required_movablecore; static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; @@ -164,17 +163,25 @@ static unsigned long __meminitdata dma_reserve; #if MAX_NUMNODES > 1 int nr_node_ids __read_mostly = MAX_NUMNODES; +int nr_online_nodes __read_mostly = 1; EXPORT_SYMBOL(nr_node_ids); +EXPORT_SYMBOL(nr_online_nodes); #endif int page_group_by_mobility_disabled __read_mostly; static void set_pageblock_migratetype(struct page *page, int migratetype) { + + if (unlikely(page_group_by_mobility_disabled)) + migratetype = MIGRATE_UNMOVABLE; + set_pageblock_flags_group(page, (unsigned 
long)migratetype, PB_migrate, PB_migrate_end); } +bool oom_killer_disabled __read_mostly; + #ifdef CONFIG_DEBUG_VM static int page_outside_zone_boundaries(struct zone *zone, struct page *page) { @@ -297,23 +304,6 @@ void prep_compound_page(struct page *page, unsigned long order) } } -#ifdef CONFIG_HUGETLBFS -void prep_compound_gigantic_page(struct page *page, unsigned long order) -{ - int i; - int nr_pages = 1 << order; - struct page *p = page + 1; - - set_compound_page_dtor(page, free_compound_page); - set_compound_order(page, order); - __SetPageHead(page); - for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { - __SetPageTail(p); - p->first_page = page; - } -} -#endif - static int destroy_compound_page(struct page *page, unsigned long order) { int i; @@ -331,7 +321,7 @@ static int destroy_compound_page(struct page *page, unsigned long order) for (i = 1; i < nr_pages; i++) { struct page *p = page + i; - if (unlikely(!PageTail(p) | (p->first_page != page))) { + if (unlikely(!PageTail(p) || (p->first_page != page))) { bad_page(page); bad++; } @@ -420,7 +410,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, return 0; if (PageBuddy(buddy) && page_order(buddy) == order) { - BUG_ON(page_count(buddy) != 0); + VM_BUG_ON(page_count(buddy) != 0); return 1; } return 0; @@ -451,22 +441,22 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, */ static inline void __free_one_page(struct page *page, - struct zone *zone, unsigned int order) + struct zone *zone, unsigned int order, + int migratetype) { unsigned long page_idx; - int order_size = 1 << order; - int migratetype = get_pageblock_migratetype(page); if (unlikely(PageCompound(page))) if (unlikely(destroy_compound_page(page, order))) return; + VM_BUG_ON(migratetype == -1); + page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); - VM_BUG_ON(page_idx & (order_size - 1)); + VM_BUG_ON(page_idx & ((1 << order) - 1)); VM_BUG_ON(bad_range(zone, page)); - __mod_zone_page_state(zone, NR_FREE_PAGES, order_size); while (order < MAX_ORDER-1) { unsigned long combined_idx; struct page *buddy; @@ -490,12 +480,26 @@ static inline void __free_one_page(struct page *page, zone->free_area[order].nr_free++; } +#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT +/* + * free_page_mlock() -- clean up attempts to free and mlocked() page. + * Page should not be on lru, so no need to fix that up. + * free_pages_check() will verify... 
+ */ +static inline void free_page_mlock(struct page *page) +{ + __dec_zone_page_state(page, NR_MLOCK); + __count_vm_event(UNEVICTABLE_MLOCKFREED); +} +#else +static void free_page_mlock(struct page *page) { } +#endif + static inline int free_pages_check(struct page *page) { - free_page_mlock(page); if (unlikely(page_mapcount(page) | (page->mapping != NULL) | - (page_count(page) != 0) | + (atomic_read(&page->_count) != 0) | (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) { bad_page(page); return 1; @@ -522,6 +526,8 @@ static void free_pages_bulk(struct zone *zone, int count, spin_lock(&zone->lock); zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); zone->pages_scanned = 0; + + __mod_zone_page_state(zone, NR_FREE_PAGES, count << order); while (count--) { struct page *page; @@ -529,17 +535,20 @@ static void free_pages_bulk(struct zone *zone, int count, page = list_entry(list->prev, struct page, lru); /* have to delete it as __free_one_page list manipulates */ list_del(&page->lru); - __free_one_page(page, zone, order); + __free_one_page(page, zone, order, page_private(page)); } spin_unlock(&zone->lock); } -static void free_one_page(struct zone *zone, struct page *page, int order) +static void free_one_page(struct zone *zone, struct page *page, int order, + int migratetype) { spin_lock(&zone->lock); zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); zone->pages_scanned = 0; - __free_one_page(page, zone, order); + + __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); + __free_one_page(page, zone, order, migratetype); spin_unlock(&zone->lock); } @@ -548,6 +557,9 @@ static void __free_pages_ok(struct page *page, unsigned int order) unsigned long flags; int i; int bad = 0; + int wasMlocked = TestClearPageMlocked(page); + + kmemcheck_free_shadow(page, order); for (i = 0 ; i < (1 << order) ; ++i) bad += free_pages_check(page + i); @@ -563,8 +575,11 @@ static void __free_pages_ok(struct page *page, unsigned int order) kernel_map_pages(page, 1 << order, 0); local_irq_save(flags); + if (unlikely(wasMlocked)) + free_page_mlock(page); __count_vm_events(PGFREE, 1 << order); - free_one_page(page_zone(page), page, order); + free_one_page(page_zone(page), page, order, + get_pageblock_migratetype(page)); local_irq_restore(flags); } @@ -635,7 +650,7 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) { if (unlikely(page_mapcount(page) | (page->mapping != NULL) | - (page_count(page) != 0) | + (atomic_read(&page->_count) != 0) | (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) { bad_page(page); return 1; @@ -660,7 +675,8 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) * Go through the free lists for the given migratetype and remove * the smallest available page from the freelists */ -static struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, +static inline +struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, int migratetype) { unsigned int current_order; @@ -678,7 +694,6 @@ static struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, list_del(&page->lru); rmv_page_order(page); area->nr_free--; - __mod_zone_page_state(zone, NR_FREE_PAGES, - (1UL << order)); expand(zone, page, order, current_order, area, migratetype); return page; } @@ -769,8 +784,8 @@ static int move_freepages_block(struct zone *zone, struct page *page, } /* Remove an element from the buddy allocator from the fallback list */ -static struct page *__rmqueue_fallback(struct zone *zone, int order, - int start_migratetype) +static inline struct page * 
+__rmqueue_fallback(struct zone *zone, int order, int start_migratetype) { struct free_area * area; int current_order; @@ -802,13 +817,15 @@ static struct page *__rmqueue_fallback(struct zone *zone, int order, * agressive about taking ownership of free pages */ if (unlikely(current_order >= (pageblock_order >> 1)) || - start_migratetype == MIGRATE_RECLAIMABLE) { + start_migratetype == MIGRATE_RECLAIMABLE || + page_group_by_mobility_disabled) { unsigned long pages; pages = move_freepages_block(zone, page, start_migratetype); /* Claim the whole block if over half of it is free */ - if (pages >= (1 << (pageblock_order-1))) + if (pages >= (1 << (pageblock_order-1)) || + page_group_by_mobility_disabled) set_pageblock_migratetype(page, start_migratetype); @@ -818,8 +835,6 @@ static struct page *__rmqueue_fallback(struct zone *zone, int order, /* Remove the page from the freelists */ list_del(&page->lru); rmv_page_order(page); - __mod_zone_page_state(zone, NR_FREE_PAGES, - -(1UL << order)); if (current_order == pageblock_order) set_pageblock_migratetype(page, @@ -830,8 +845,7 @@ static struct page *__rmqueue_fallback(struct zone *zone, int order, } } - /* Use MIGRATE_RESERVE rather than fail an allocation */ - return __rmqueue_smallest(zone, order, MIGRATE_RESERVE); + return NULL; } /* @@ -843,11 +857,23 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order, { struct page *page; +retry_reserve: page = __rmqueue_smallest(zone, order, migratetype); - if (unlikely(!page)) + if (unlikely(!page) && migratetype != MIGRATE_RESERVE) { page = __rmqueue_fallback(zone, order, migratetype); + /* + * Use MIGRATE_RESERVE rather than fail an allocation. goto + * is used because __rmqueue_smallest is an inline function + * and we want just one call site + */ + if (!page) { + migratetype = MIGRATE_RESERVE; + goto retry_reserve; + } + } + return page; } @@ -858,7 +884,7 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order, */ static int rmqueue_bulk(struct zone *zone, unsigned int order, unsigned long count, struct list_head *list, - int migratetype) + int migratetype, int cold) { int i; @@ -877,10 +903,14 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, * merge IO requests if the physical pages are ordered * properly. 
*/ - list_add(&page->lru, list); + if (likely(cold == 0)) + list_add(&page->lru, list); + else + list_add_tail(&page->lru, list); set_page_private(page, migratetype); list = &page->lru; } + __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); spin_unlock(&zone->lock); return i; } @@ -922,13 +952,10 @@ static void drain_pages(unsigned int cpu) unsigned long flags; struct zone *zone; - for_each_zone(zone) { + for_each_populated_zone(zone) { struct per_cpu_pageset *pset; struct per_cpu_pages *pcp; - if (!populated_zone(zone)) - continue; - pset = zone_pcp(zone, cpu); pcp = &pset->pcp; @@ -999,6 +1026,9 @@ static void free_hot_cold_page(struct page *page, int cold) struct zone *zone = page_zone(page); struct per_cpu_pages *pcp; unsigned long flags; + int wasMlocked = TestClearPageMlocked(page); + + kmemcheck_free_shadow(page, 0); if (PageAnon(page)) page->mapping = NULL; @@ -1013,13 +1043,16 @@ static void free_hot_cold_page(struct page *page, int cold) kernel_map_pages(page, 1, 0); pcp = &zone_pcp(zone, get_cpu())->pcp; + set_page_private(page, get_pageblock_migratetype(page)); local_irq_save(flags); + if (unlikely(wasMlocked)) + free_page_mlock(page); __count_vm_event(PGFREE); + if (cold) list_add_tail(&page->lru, &pcp->list); else list_add(&page->lru, &pcp->list); - set_page_private(page, get_pageblock_migratetype(page)); pcp->count++; if (pcp->count >= pcp->high) { free_pages_bulk(zone, pcp->batch, &pcp->list, 0); @@ -1053,6 +1086,16 @@ void split_page(struct page *page, unsigned int order) VM_BUG_ON(PageCompound(page)); VM_BUG_ON(!page_count(page)); + +#ifdef CONFIG_KMEMCHECK + /* + * Split shadow pages too, because free(page[0]) would + * otherwise free the whole shadow. + */ + if (kmemcheck_page_is_tracked(page)) + split_page(virt_to_page(page[0].shadow), order); +#endif + for (i = 1; i < (1 << order); i++) set_page_refcounted(page + i); } @@ -1062,14 +1105,15 @@ void split_page(struct page *page, unsigned int order) * we cheat by calling it from here, in the order > 0 path. Saves a branch * or two. */ -static struct page *buffered_rmqueue(struct zone *preferred_zone, - struct zone *zone, int order, gfp_t gfp_flags) +static inline +struct page *buffered_rmqueue(struct zone *preferred_zone, + struct zone *zone, int order, gfp_t gfp_flags, + int migratetype) { unsigned long flags; struct page *page; int cold = !!(gfp_flags & __GFP_COLD); int cpu; - int migratetype = allocflags_to_migratetype(gfp_flags); again: cpu = get_cpu(); @@ -1080,7 +1124,8 @@ again: local_irq_save(flags); if (!pcp->count) { pcp->count = rmqueue_bulk(zone, 0, - pcp->batch, &pcp->list, migratetype); + pcp->batch, &pcp->list, + migratetype, cold); if (unlikely(!pcp->count)) goto failed; } @@ -1099,15 +1144,30 @@ again: /* Allocate more to the pcp list if necessary */ if (unlikely(&page->lru == &pcp->list)) { pcp->count += rmqueue_bulk(zone, 0, - pcp->batch, &pcp->list, migratetype); + pcp->batch, &pcp->list, + migratetype, cold); page = list_entry(pcp->list.next, struct page, lru); } list_del(&page->lru); pcp->count--; } else { + if (unlikely(gfp_flags & __GFP_NOFAIL)) { + /* + * __GFP_NOFAIL is not to be used in new code. + * + * All __GFP_NOFAIL callers should be fixed so that they + * properly detect and handle allocation failures. + * + * We most definitely don't want callers attempting to + * allocate greater than order-1 page units with + * __GFP_NOFAIL. 
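The cold argument threaded through rmqueue_bulk() just above decides which end of the per-cpu list refilled pages land on: hot refills use list_add() at the head, cold refills use list_add_tail(). A toy sketch of that ordering with a minimal list in place of struct list_head; the page frame numbers are invented.

#include <stdio.h>
#include <stdlib.h>

/* a minimal circular doubly-linked list, standing in for struct list_head */
struct node { int pfn; struct node *prev, *next; };

static void list_add(struct node *n, struct node *head)	/* at head */
{
	n->next = head->next; n->prev = head;
	head->next->prev = n; head->next = n;
}

static void list_add_tail(struct node *n, struct node *head)	/* at tail */
{
	n->prev = head->prev; n->next = head;
	head->prev->next = n; head->prev = n;
}

int main(void)
{
	struct node list = { .pfn = -1, .prev = &list, .next = &list };
	int cold = 0;	/* flip to 1 to see tail insertion */

	for (int pfn = 100; pfn < 104; pfn++) {
		struct node *n = malloc(sizeof(*n));
		n->pfn = pfn;
		if (!cold)
			list_add(n, &list);		/* hot: head */
		else
			list_add_tail(n, &list);	/* cold: tail */
	}

	/* hot allocations pop from the head, so they reuse the most
	 * recently added (cache-warm) pages first */
	printf("next page from head: %d\n", list.next->pfn);
	return 0;
}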
+ */ + WARN_ON_ONCE(order > 1); + } spin_lock_irqsave(&zone->lock, flags); page = __rmqueue(zone, order, migratetype); + __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order)); spin_unlock(&zone->lock); if (!page) goto failed; @@ -1129,10 +1189,15 @@ failed: return NULL; } -#define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */ -#define ALLOC_WMARK_MIN 0x02 /* use pages_min watermark */ -#define ALLOC_WMARK_LOW 0x04 /* use pages_low watermark */ -#define ALLOC_WMARK_HIGH 0x08 /* use pages_high watermark */ +/* The ALLOC_WMARK bits are used as an index to zone->watermark */ +#define ALLOC_WMARK_MIN WMARK_MIN +#define ALLOC_WMARK_LOW WMARK_LOW +#define ALLOC_WMARK_HIGH WMARK_HIGH +#define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */ + +/* Mask to get the watermark bits */ +#define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1) + #define ALLOC_HARDER 0x10 /* try to alloc harder */ #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ @@ -1390,23 +1455,18 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) */ static struct page * get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, - struct zonelist *zonelist, int high_zoneidx, int alloc_flags) + struct zonelist *zonelist, int high_zoneidx, int alloc_flags, + struct zone *preferred_zone, int migratetype) { struct zoneref *z; struct page *page = NULL; int classzone_idx; - struct zone *zone, *preferred_zone; + struct zone *zone; nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ int zlc_active = 0; /* set if using zonelist_cache */ int did_zlc_setup = 0; /* just call zlc_setup() one time */ - (void)first_zones_zonelist(zonelist, high_zoneidx, nodemask, - &preferred_zone); - if (!preferred_zone) - return NULL; - classzone_idx = zone_idx(preferred_zone); - zonelist_scan: /* * Scan zonelist, looking for a zone with enough free. 
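The ALLOC_WMARK rework above turns the old if/else-if chain over pages_min/pages_low/pages_high into a single array lookup: the low ALLOC_* bits now double as an index into zone->watermark[]. A compact userspace sketch mirroring those macros; the watermark page counts are made up.

#include <stdio.h>

/* the three per-zone watermarks, mirroring the WMARK_* indices */
enum { WMARK_MIN, WMARK_LOW, WMARK_HIGH, NR_WMARK };

/* the low ALLOC_* bits double as an index into the watermark array */
#define ALLOC_WMARK_MIN		WMARK_MIN
#define ALLOC_WMARK_LOW		WMARK_LOW
#define ALLOC_WMARK_HIGH	WMARK_HIGH
#define ALLOC_NO_WATERMARKS	0x04
#define ALLOC_WMARK_MASK	(ALLOC_NO_WATERMARKS - 1)
#define ALLOC_HARDER		0x10
#define ALLOC_HIGH		0x20

int main(void)
{
	unsigned long watermark[NR_WMARK] = { 128, 160, 192 };	/* example pages */
	int alloc_flags = ALLOC_WMARK_LOW | ALLOC_HIGH;

	if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
		/* one array lookup replaces the old if/else-if chain */
		unsigned long mark = watermark[alloc_flags & ALLOC_WMARK_MASK];
		printf("checking against watermark of %lu pages\n", mark);
	}
	return 0;
}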
@@ -1421,31 +1481,49 @@ zonelist_scan: !cpuset_zone_allowed_softwall(zone, gfp_mask)) goto try_next_zone; + BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { unsigned long mark; - if (alloc_flags & ALLOC_WMARK_MIN) - mark = zone->pages_min; - else if (alloc_flags & ALLOC_WMARK_LOW) - mark = zone->pages_low; - else - mark = zone->pages_high; - if (!zone_watermark_ok(zone, order, mark, - classzone_idx, alloc_flags)) { - if (!zone_reclaim_mode || - !zone_reclaim(zone, gfp_mask, order)) + int ret; + + mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; + if (zone_watermark_ok(zone, order, mark, + classzone_idx, alloc_flags)) + goto try_this_zone; + + if (zone_reclaim_mode == 0) + goto this_zone_full; + + ret = zone_reclaim(zone, gfp_mask, order); + switch (ret) { + case ZONE_RECLAIM_NOSCAN: + /* did not scan */ + goto try_next_zone; + case ZONE_RECLAIM_FULL: + /* scanned but unreclaimable */ + goto this_zone_full; + default: + /* did we reclaim enough */ + if (!zone_watermark_ok(zone, order, mark, + classzone_idx, alloc_flags)) goto this_zone_full; } } - page = buffered_rmqueue(preferred_zone, zone, order, gfp_mask); +try_this_zone: + page = buffered_rmqueue(preferred_zone, zone, order, + gfp_mask, migratetype); if (page) break; this_zone_full: if (NUMA_BUILD) zlc_mark_zone_full(zonelist, z); try_next_zone: - if (NUMA_BUILD && !did_zlc_setup) { - /* we do zlc_setup after the first zone is tried */ + if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) { + /* + * we do zlc_setup after the first zone is tried but only + * if there are multiple nodes make it worthwhile + */ allowednodes = zlc_setup(zonelist, alloc_flags); zlc_active = 1; did_zlc_setup = 1; @@ -1460,45 +1538,219 @@ try_next_zone: return page; } +static inline int +should_alloc_retry(gfp_t gfp_mask, unsigned int order, + unsigned long pages_reclaimed) +{ + /* Do not loop if specifically requested */ + if (gfp_mask & __GFP_NORETRY) + return 0; + + /* + * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER + * means __GFP_NOFAIL, but that may not be true in other + * implementations. + */ + if (order <= PAGE_ALLOC_COSTLY_ORDER) + return 1; + + /* + * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is + * specified, then we retry until we no longer reclaim any pages + * (above), or we've reclaimed an order of pages at least as + * large as the allocation's order. In both cases, if the + * allocation still fails, we stop retrying. + */ + if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order)) + return 1; + + /* + * Don't let big-order allocations loop unless the caller + * explicitly requests that. + */ + if (gfp_mask & __GFP_NOFAIL) + return 1; + + return 0; +} + +static inline struct page * +__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist, enum zone_type high_zoneidx, + nodemask_t *nodemask, struct zone *preferred_zone, + int migratetype) +{ + struct page *page; + + /* Acquire the OOM killer lock for the zones in zonelist */ + if (!try_set_zone_oom(zonelist, gfp_mask)) { + schedule_timeout_uninterruptible(1); + return NULL; + } + + /* + * Go through the zonelist yet one more time, keep very high watermark + * here, this is only to catch a parallel oom killing, we must fail if + * we're still under heavy pressure. 
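Since the new should_alloc_retry() above is pure flag-and-counter logic, it is easy to exercise in userspace. The sketch below copies the decision logic from the hunk; the GFP bit values are stand-ins for illustration only (the real ones live in gfp.h).

#include <stdio.h>

/* stand-in flag bits for illustration only */
#define __GFP_NORETRY	0x1u
#define __GFP_REPEAT	0x2u
#define __GFP_NOFAIL	0x4u
#define PAGE_ALLOC_COSTLY_ORDER 3

/* same decision logic as the should_alloc_retry() hunk above */
static int should_alloc_retry(unsigned int gfp_mask, unsigned int order,
			      unsigned long pages_reclaimed)
{
	if (gfp_mask & __GFP_NORETRY)
		return 0;
	if (order <= PAGE_ALLOC_COSTLY_ORDER)
		return 1;
	if ((gfp_mask & __GFP_REPEAT) && pages_reclaimed < (1UL << order))
		return 1;
	if (gfp_mask & __GFP_NOFAIL)
		return 1;
	return 0;
}

int main(void)
{
	printf("order-0, no flags:      %d\n", should_alloc_retry(0, 0, 0));
	printf("order-5, no flags:      %d\n", should_alloc_retry(0, 5, 0));
	printf("order-5, __GFP_REPEAT:  %d\n",
	       should_alloc_retry(__GFP_REPEAT, 5, 8));
	printf("order-5, __GFP_NORETRY: %d\n",
	       should_alloc_retry(__GFP_NORETRY, 5, 0));
	return 0;
}

Small orders keep retrying, costly orders retry only while reclaim is still making headway under __GFP_REPEAT, and __GFP_NORETRY always bails out.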
+ */ + page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, + order, zonelist, high_zoneidx, + ALLOC_WMARK_HIGH|ALLOC_CPUSET, + preferred_zone, migratetype); + if (page) + goto out; + + /* The OOM killer will not help higher order allocs */ + if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_NOFAIL)) + goto out; + + /* Exhausted what can be done so it's blamo time */ + out_of_memory(zonelist, gfp_mask, order); + +out: + clear_zonelist_oom(zonelist, gfp_mask); + return page; +} + +/* The really slow allocator path where we enter direct reclaim */ +static inline struct page * +__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist, enum zone_type high_zoneidx, + nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, + int migratetype, unsigned long *did_some_progress) +{ + struct page *page = NULL; + struct reclaim_state reclaim_state; + struct task_struct *p = current; + + cond_resched(); + + /* We now go into synchronous reclaim */ + cpuset_memory_pressure_bump(); + + /* + * The task's cpuset might have expanded its set of allowable nodes + */ + p->flags |= PF_MEMALLOC; + lockdep_set_current_reclaim_state(gfp_mask); + reclaim_state.reclaimed_slab = 0; + p->reclaim_state = &reclaim_state; + + *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); + + p->reclaim_state = NULL; + lockdep_clear_current_reclaim_state(); + p->flags &= ~PF_MEMALLOC; + + cond_resched(); + + if (order != 0) + drain_all_pages(); + + if (likely(*did_some_progress)) + page = get_page_from_freelist(gfp_mask, nodemask, order, + zonelist, high_zoneidx, + alloc_flags, preferred_zone, + migratetype); + return page; +} + /* - * This is the 'heart' of the zoned buddy allocator. + * This is called in the allocator slow-path if the allocation request is of + * sufficient urgency to ignore watermarks and take other desperate measures */ -struct page * -__alloc_pages_internal(gfp_t gfp_mask, unsigned int order, - struct zonelist *zonelist, nodemask_t *nodemask) +static inline struct page * +__alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist, enum zone_type high_zoneidx, + nodemask_t *nodemask, struct zone *preferred_zone, + int migratetype) +{ + struct page *page; + + do { + page = get_page_from_freelist(gfp_mask, nodemask, order, + zonelist, high_zoneidx, ALLOC_NO_WATERMARKS, + preferred_zone, migratetype); + + if (!page && gfp_mask & __GFP_NOFAIL) + congestion_wait(BLK_RW_ASYNC, HZ/50); + } while (!page && (gfp_mask & __GFP_NOFAIL)); + + return page; +} + +static inline +void wake_all_kswapd(unsigned int order, struct zonelist *zonelist, + enum zone_type high_zoneidx) { - const gfp_t wait = gfp_mask & __GFP_WAIT; - enum zone_type high_zoneidx = gfp_zone(gfp_mask); struct zoneref *z; struct zone *zone; - struct page *page; - struct reclaim_state reclaim_state; - struct task_struct *p = current; - int do_retry; - int alloc_flags; - unsigned long did_some_progress; - unsigned long pages_reclaimed = 0; - might_sleep_if(wait); + for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) + wakeup_kswapd(zone, order); +} - if (should_fail_alloc_page(gfp_mask, order)) - return NULL; +static inline int +gfp_to_alloc_flags(gfp_t gfp_mask) +{ + struct task_struct *p = current; + int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; + const gfp_t wait = gfp_mask & __GFP_WAIT; -restart: - z = zonelist->_zonerefs; /* the list of zones suitable for gfp_mask */ + /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to 
save a branch. */ + BUILD_BUG_ON(__GFP_HIGH != ALLOC_HIGH); + + /* + * The caller may dip into page reserves a bit more if the caller + * cannot run direct reclaim, or if the caller has realtime scheduling + * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will + * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). + */ + alloc_flags |= (gfp_mask & __GFP_HIGH); - if (unlikely(!z->zone)) { + if (!wait) { + alloc_flags |= ALLOC_HARDER; /* - * Happens if we have an empty zonelist as a result of - * GFP_THISNODE being used on a memoryless node + * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. + * See also cpuset_zone_allowed() comment in kernel/cpuset.c. */ - return NULL; + alloc_flags &= ~ALLOC_CPUSET; + } else if (unlikely(rt_task(p))) + alloc_flags |= ALLOC_HARDER; + + if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { + if (!in_interrupt() && + ((p->flags & PF_MEMALLOC) || + unlikely(test_thread_flag(TIF_MEMDIE)))) + alloc_flags |= ALLOC_NO_WATERMARKS; } - page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, - zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET); - if (page) - goto got_pg; + return alloc_flags; +} + +static inline struct page * +__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist, enum zone_type high_zoneidx, + nodemask_t *nodemask, struct zone *preferred_zone, + int migratetype) +{ + const gfp_t wait = gfp_mask & __GFP_WAIT; + struct page *page = NULL; + int alloc_flags; + unsigned long pages_reclaimed = 0; + unsigned long did_some_progress; + struct task_struct *p = current; + + /* + * In the slowpath, we sanity check order to avoid ever trying to + * reclaim >= MAX_ORDER areas which will never succeed. Callers may + * be using allocators in order of preference for an area that is + * too large. + */ + if (order >= MAX_ORDER) { + WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN)); + return NULL; + } /* * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and @@ -1511,151 +1763,88 @@ restart: if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) goto nopage; - for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) - wakeup_kswapd(zone, order); + wake_all_kswapd(order, zonelist, high_zoneidx); /* * OK, we're below the kswapd watermark and have kicked background * reclaim. Now things get more complex, so set up alloc_flags according * to how we want to proceed. - * - * The caller may dip into page reserves a bit more if the caller - * cannot run direct reclaim, or if the caller has realtime scheduling - * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will - * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). */ - alloc_flags = ALLOC_WMARK_MIN; - if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait) - alloc_flags |= ALLOC_HARDER; - if (gfp_mask & __GFP_HIGH) - alloc_flags |= ALLOC_HIGH; - if (wait) - alloc_flags |= ALLOC_CPUSET; + alloc_flags = gfp_to_alloc_flags(gfp_mask); - /* - * Go through the zonelist again. Let __GFP_HIGH and allocations - * coming from realtime tasks go deeper into reserves. - * - * This is the last chance, in general, before the goto nopage. - * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. - * See also cpuset_zone_allowed() comment in kernel/cpuset.c. - */ +restart: + /* This is the last chance, in general, before the goto nopage. 
*/ page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, - high_zoneidx, alloc_flags); + high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, + preferred_zone, migratetype); if (page) goto got_pg; - /* This allocation should allow future memory freeing. */ - rebalance: - if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) - && !in_interrupt()) { - if (!(gfp_mask & __GFP_NOMEMALLOC)) { -nofail_alloc: - /* go through the zonelist yet again, ignoring mins */ - page = get_page_from_freelist(gfp_mask, nodemask, order, - zonelist, high_zoneidx, ALLOC_NO_WATERMARKS); - if (page) - goto got_pg; - if (gfp_mask & __GFP_NOFAIL) { - congestion_wait(WRITE, HZ/50); - goto nofail_alloc; - } - } - goto nopage; + /* Allocate without watermarks if the context allows */ + if (alloc_flags & ALLOC_NO_WATERMARKS) { + page = __alloc_pages_high_priority(gfp_mask, order, + zonelist, high_zoneidx, nodemask, + preferred_zone, migratetype); + if (page) + goto got_pg; } /* Atomic allocations - we can't balance anything */ if (!wait) goto nopage; - cond_resched(); - - /* We now go into synchronous reclaim */ - cpuset_memory_pressure_bump(); - /* - * The task's cpuset might have expanded its set of allowable nodes - */ - cpuset_update_task_memory_state(); - p->flags |= PF_MEMALLOC; - reclaim_state.reclaimed_slab = 0; - p->reclaim_state = &reclaim_state; + /* Avoid recursion of direct reclaim */ + if (p->flags & PF_MEMALLOC) + goto nopage; - did_some_progress = try_to_free_pages(zonelist, order, gfp_mask); + /* Avoid allocations with no watermarks from looping endlessly */ + if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) + goto nopage; - p->reclaim_state = NULL; - p->flags &= ~PF_MEMALLOC; + /* Try direct reclaim and then allocating */ + page = __alloc_pages_direct_reclaim(gfp_mask, order, + zonelist, high_zoneidx, + nodemask, + alloc_flags, preferred_zone, + migratetype, &did_some_progress); + if (page) + goto got_pg; - cond_resched(); + /* + * If we failed to make any progress reclaiming, then we are + * running out of options and have to consider going OOM + */ + if (!did_some_progress) { + if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { + if (oom_killer_disabled) + goto nopage; + page = __alloc_pages_may_oom(gfp_mask, order, + zonelist, high_zoneidx, + nodemask, preferred_zone, + migratetype); + if (page) + goto got_pg; - if (order != 0) - drain_all_pages(); + /* + * The OOM killer does not trigger for high-order + * ~__GFP_NOFAIL allocations so if no progress is being + * made, there are no other options and retrying is + * unlikely to help. + */ + if (order > PAGE_ALLOC_COSTLY_ORDER && + !(gfp_mask & __GFP_NOFAIL)) + goto nopage; - if (likely(did_some_progress)) { - page = get_page_from_freelist(gfp_mask, nodemask, order, - zonelist, high_zoneidx, alloc_flags); - if (page) - goto got_pg; - } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { - if (!try_set_zone_oom(zonelist, gfp_mask)) { - schedule_timeout_uninterruptible(1); goto restart; } - - /* - * Go through the zonelist yet one more time, keep - * very high watermark here, this is only to catch - * a parallel oom killing, we must fail if we're still - * under heavy pressure. 
- */ - page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, - order, zonelist, high_zoneidx, - ALLOC_WMARK_HIGH|ALLOC_CPUSET); - if (page) { - clear_zonelist_oom(zonelist, gfp_mask); - goto got_pg; - } - - /* The OOM killer will not help higher order allocs so fail */ - if (order > PAGE_ALLOC_COSTLY_ORDER) { - clear_zonelist_oom(zonelist, gfp_mask); - goto nopage; - } - - out_of_memory(zonelist, gfp_mask, order); - clear_zonelist_oom(zonelist, gfp_mask); - goto restart; } - /* - * Don't let big-order allocations loop unless the caller explicitly - * requests that. Wait for some write requests to complete then retry. - * - * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER - * means __GFP_NOFAIL, but that may not be true in other - * implementations. - * - * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is - * specified, then we retry until we no longer reclaim any pages - * (above), or we've reclaimed an order of pages at least as - * large as the allocation's order. In both cases, if the - * allocation still fails, we stop retrying. - */ + /* Check if we should retry the allocation */ pages_reclaimed += did_some_progress; - do_retry = 0; - if (!(gfp_mask & __GFP_NORETRY)) { - if (order <= PAGE_ALLOC_COSTLY_ORDER) { - do_retry = 1; - } else { - if (gfp_mask & __GFP_REPEAT && - pages_reclaimed < (1 << order)) - do_retry = 1; - } - if (gfp_mask & __GFP_NOFAIL) - do_retry = 1; - } - if (do_retry) { - congestion_wait(WRITE, HZ/50); + if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) { + /* Wait for some write requests to complete then retry */ + congestion_wait(BLK_RW_ASYNC, HZ/50); goto rebalance; } @@ -1667,10 +1856,60 @@ nopage: dump_stack(); show_mem(); } + return page; got_pg: + if (kmemcheck_enabled) + kmemcheck_pagealloc_alloc(page, order, gfp_mask); + return page; + +} + +/* + * This is the 'heart' of the zoned buddy allocator. + */ +struct page * +__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist, nodemask_t *nodemask) +{ + enum zone_type high_zoneidx = gfp_zone(gfp_mask); + struct zone *preferred_zone; + struct page *page; + int migratetype = allocflags_to_migratetype(gfp_mask); + + gfp_mask &= gfp_allowed_mask; + + lockdep_trace_alloc(gfp_mask); + + might_sleep_if(gfp_mask & __GFP_WAIT); + + if (should_fail_alloc_page(gfp_mask, order)) + return NULL; + + /* + * Check the zones suitable for the gfp_mask contain at least one + * valid zone. It's possible to have an empty zonelist as a result + * of GFP_THISNODE and a memoryless node + */ + if (unlikely(!zonelist->_zonerefs->zone)) + return NULL; + + /* The preferred zone is used for statistics later */ + first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone); + if (!preferred_zone) + return NULL; + + /* First allocation attempt */ + page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, + zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET, + preferred_zone, migratetype); + if (unlikely(!page)) + page = __alloc_pages_slowpath(gfp_mask, order, + zonelist, high_zoneidx, nodemask, + preferred_zone, migratetype); + return page; } -EXPORT_SYMBOL(__alloc_pages_internal); +EXPORT_SYMBOL(__alloc_pages_nodemask); /* * Common helper functions. 
@@ -1757,7 +1996,7 @@ void *alloc_pages_exact(size_t size, gfp_t gfp_mask) unsigned long alloc_end = addr + (PAGE_SIZE << order); unsigned long used = addr + PAGE_ALIGN(size); - split_page(virt_to_page(addr), order); + split_page(virt_to_page((void *)addr), order); while (used < alloc_end) { free_page(used); used += PAGE_SIZE; @@ -1799,7 +2038,7 @@ static unsigned int nr_free_zone_pages(int offset) for_each_zone_zonelist(zone, z, zonelist, offset) { unsigned long size = zone->present_pages; - unsigned long high = zone->pages_high; + unsigned long high = high_wmark_pages(zone); if (size > high) sum += size - high; } @@ -1874,10 +2113,7 @@ void show_free_areas(void) int cpu; struct zone *zone; - for_each_zone(zone) { - if (!populated_zone(zone)) - continue; - + for_each_populated_zone(zone) { show_node(zone); printk("%s per-cpu:\n", zone->name); @@ -1894,19 +2130,14 @@ void show_free_areas(void) printk("Active_anon:%lu active_file:%lu inactive_anon:%lu\n" " inactive_file:%lu" -//TODO: check/adjust line lengths -#ifdef CONFIG_UNEVICTABLE_LRU " unevictable:%lu" -#endif " dirty:%lu writeback:%lu unstable:%lu\n" " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n", global_page_state(NR_ACTIVE_ANON), global_page_state(NR_ACTIVE_FILE), global_page_state(NR_INACTIVE_ANON), global_page_state(NR_INACTIVE_FILE), -#ifdef CONFIG_UNEVICTABLE_LRU global_page_state(NR_UNEVICTABLE), -#endif global_page_state(NR_FILE_DIRTY), global_page_state(NR_WRITEBACK), global_page_state(NR_UNSTABLE_NFS), @@ -1917,12 +2148,9 @@ void show_free_areas(void) global_page_state(NR_PAGETABLE), global_page_state(NR_BOUNCE)); - for_each_zone(zone) { + for_each_populated_zone(zone) { int i; - if (!populated_zone(zone)) - continue; - show_node(zone); printk("%s" " free:%lukB" @@ -1933,25 +2161,21 @@ void show_free_areas(void) " inactive_anon:%lukB" " active_file:%lukB" " inactive_file:%lukB" -#ifdef CONFIG_UNEVICTABLE_LRU " unevictable:%lukB" -#endif " present:%lukB" " pages_scanned:%lu" " all_unreclaimable? %s" "\n", zone->name, K(zone_page_state(zone, NR_FREE_PAGES)), - K(zone->pages_min), - K(zone->pages_low), - K(zone->pages_high), + K(min_wmark_pages(zone)), + K(low_wmark_pages(zone)), + K(high_wmark_pages(zone)), K(zone_page_state(zone, NR_ACTIVE_ANON)), K(zone_page_state(zone, NR_INACTIVE_ANON)), K(zone_page_state(zone, NR_ACTIVE_FILE)), K(zone_page_state(zone, NR_INACTIVE_FILE)), -#ifdef CONFIG_UNEVICTABLE_LRU K(zone_page_state(zone, NR_UNEVICTABLE)), -#endif K(zone->present_pages), zone->pages_scanned, (zone_is_all_unreclaimable(zone) ? 
"yes" : "no") @@ -1962,12 +2186,9 @@ void show_free_areas(void) printk("\n"); } - for_each_zone(zone) { + for_each_populated_zone(zone) { unsigned long nr[MAX_ORDER], flags, order, total = 0; - if (!populated_zone(zone)) - continue; - show_node(zone); printk("%s: ", zone->name); @@ -2112,7 +2333,7 @@ int numa_zonelist_order_handler(ctl_table *table, int write, } -#define MAX_NODE_LOAD (num_online_nodes()) +#define MAX_NODE_LOAD (nr_online_nodes) static int node_load[MAX_NUMNODES]; /** @@ -2134,7 +2355,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask) int n, val; int min_val = INT_MAX; int best_node = -1; - node_to_cpumask_ptr(tmp, 0); + const struct cpumask *tmp = cpumask_of_node(0); /* Use the local node if we haven't already */ if (!node_isset(node, *used_node_mask)) { @@ -2155,8 +2376,8 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask) val += (n < node); /* Give preference to headless and unused nodes */ - node_to_cpumask_ptr_next(tmp, n); - if (!cpus_empty(*tmp)) + tmp = cpumask_of_node(n); + if (!cpumask_empty(tmp)) val += PENALTY_FOR_NODE_WITH_CPUS; /* Slight preference for less loaded node */ @@ -2321,11 +2542,10 @@ static void build_zonelists(pg_data_t *pgdat) /* NUMA-aware ordering of nodes */ local_node = pgdat->node_id; - load = num_online_nodes(); + load = nr_online_nodes; prev_node = local_node; nodes_clear(used_mask); - memset(node_load, 0, sizeof(node_load)); memset(node_order, 0, sizeof(node_order)); j = 0; @@ -2434,6 +2654,9 @@ static int __build_all_zonelists(void *dummy) { int nid; +#ifdef CONFIG_NUMA + memset(node_load, 0, sizeof(node_load)); +#endif for_each_online_node(nid) { pg_data_t *pgdat = NODE_DATA(nid); @@ -2472,7 +2695,7 @@ void build_all_zonelists(void) printk("Built %i zonelists in %s order, mobility grouping %s. " "Total pages: %ld\n", - num_online_nodes(), + nr_online_nodes, zonelist_order_name[current_zonelist_order], page_group_by_mobility_disabled ? "off" : "on", vm_total_pages); @@ -2551,8 +2774,8 @@ static inline unsigned long wait_table_bits(unsigned long size) /* * Mark a number of pageblocks as MIGRATE_RESERVE. The number - * of blocks reserved is based on zone->pages_min. The memory within the - * reserve will tend to store contiguous free pages. Setting min_free_kbytes + * of blocks reserved is based on min_wmark_pages(zone). The memory within + * the reserve will tend to store contiguous free pages. Setting min_free_kbytes * higher will lead to a bigger reserve which will get freed as contiguous * blocks as reclaim kicks in */ @@ -2565,7 +2788,7 @@ static void setup_zone_migrate_reserve(struct zone *zone) /* Get the start pfn, end pfn and the number of blocks to reserve */ start_pfn = zone->zone_start_pfn; end_pfn = start_pfn + zone->spanned_pages; - reserve = roundup(zone->pages_min, pageblock_nr_pages) >> + reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> pageblock_order; for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { @@ -2687,6 +2910,7 @@ static void __meminit zone_init_free_lists(struct zone *zone) static int zone_batchsize(struct zone *zone) { +#ifdef CONFIG_MMU int batch; /* @@ -2712,9 +2936,26 @@ static int zone_batchsize(struct zone *zone) * of pages of one half of the possible page colors * and the other with pages of the other colors. */ - batch = (1 << (fls(batch + batch/2)-1)) - 1; + batch = rounddown_pow_of_two(batch + batch/2) - 1; return batch; + +#else + /* The deferral and batching of frees should be suppressed under NOMMU + * conditions. 
+ * + * The problem is that NOMMU needs to be able to allocate large chunks + * of contiguous memory as there's no hardware page translation to + * assemble apparent contiguous memory from discontiguous pages. + * + * Queueing large contiguous runs of pages for batching, however, + * causes the pages to actually be freed in smaller chunks. As there + * can be a significant delay between the individual batches being + * recycled, this leads to the once large chunks of space being + * fragmented and becoming unavailable for high-order allocations. + */ + return 0; +#endif } static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) @@ -2779,11 +3020,7 @@ static int __cpuinit process_zones(int cpu) node_set_state(node, N_CPU); /* this node has a cpu */ - for_each_zone(zone) { - - if (!populated_zone(zone)) - continue; - + for_each_populated_zone(zone) { zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), GFP_KERNEL, node); if (!zone_pcp(zone, cpu)) @@ -2804,7 +3041,7 @@ bad: if (dzone == zone) break; kfree(zone_pcp(dzone, cpu)); - zone_pcp(dzone, cpu) = NULL; + zone_pcp(dzone, cpu) = &boot_pageset[cpu]; } return -ENOMEM; } @@ -2819,7 +3056,7 @@ static inline void free_zone_pagesets(int cpu) /* Free per_cpu_pageset if it is slab allocated */ if (pset != &boot_pageset[cpu]) kfree(pset); - zone_pcp(zone, cpu) = NULL; + zone_pcp(zone, cpu) = &boot_pageset[cpu]; } } @@ -2989,7 +3226,7 @@ static int __meminit next_active_region_index_in_nid(int index, int nid) * was used and there are no special requirements, this is a convenient * alternative */ -int __meminit early_pfn_to_nid(unsigned long pfn) +int __meminit __early_pfn_to_nid(unsigned long pfn) { int i; @@ -3000,10 +3237,33 @@ int __meminit early_pfn_to_nid(unsigned long pfn) if (start_pfn <= pfn && pfn < end_pfn) return early_node_map[i].nid; } + /* This is a memory hole */ + return -1; +} +#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ +int __meminit early_pfn_to_nid(unsigned long pfn) +{ + int nid; + + nid = __early_pfn_to_nid(pfn); + if (nid >= 0) + return nid; + /* just returns 0 */ return 0; } -#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ + +#ifdef CONFIG_NODES_SPAN_OTHER_NODES +bool __meminit early_pfn_in_nid(unsigned long pfn, int node) +{ + int nid; + + nid = __early_pfn_to_nid(pfn); + if (nid >= 0 && nid != node) + return false; + return true; +} +#endif /* Basic iterator support to walk early_node_map[] */ #define for_each_active_range_index_in_nid(i, nid) \ @@ -3072,64 +3332,6 @@ void __init sparse_memory_present_with_active_regions(int nid) } /** - * push_node_boundaries - Push node boundaries to at least the requested boundary - * @nid: The nid of the node to push the boundary for - * @start_pfn: The start pfn of the node - * @end_pfn: The end pfn of the node - * - * In reserve-based hot-add, mem_map is allocated that is unused until hotadd - * time. Specifically, on x86_64, SRAT will report ranges that can potentially - * be hotplugged even though no physical memory exists. This function allows - * an arch to push out the node boundaries so mem_map is allocated that can - * be used later. 
- */ -#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE -void __init push_node_boundaries(unsigned int nid, - unsigned long start_pfn, unsigned long end_pfn) -{ - mminit_dprintk(MMINIT_TRACE, "zoneboundary", - "Entering push_node_boundaries(%u, %lu, %lu)\n", - nid, start_pfn, end_pfn); - - /* Initialise the boundary for this node if necessary */ - if (node_boundary_end_pfn[nid] == 0) - node_boundary_start_pfn[nid] = -1UL; - - /* Update the boundaries */ - if (node_boundary_start_pfn[nid] > start_pfn) - node_boundary_start_pfn[nid] = start_pfn; - if (node_boundary_end_pfn[nid] < end_pfn) - node_boundary_end_pfn[nid] = end_pfn; -} - -/* If necessary, push the node boundary out for reserve hotadd */ -static void __meminit account_node_boundary(unsigned int nid, - unsigned long *start_pfn, unsigned long *end_pfn) -{ - mminit_dprintk(MMINIT_TRACE, "zoneboundary", - "Entering account_node_boundary(%u, %lu, %lu)\n", - nid, *start_pfn, *end_pfn); - - /* Return if boundary information has not been provided */ - if (node_boundary_end_pfn[nid] == 0) - return; - - /* Check the boundaries and update if necessary */ - if (node_boundary_start_pfn[nid] < *start_pfn) - *start_pfn = node_boundary_start_pfn[nid]; - if (node_boundary_end_pfn[nid] > *end_pfn) - *end_pfn = node_boundary_end_pfn[nid]; -} -#else -void __init push_node_boundaries(unsigned int nid, - unsigned long start_pfn, unsigned long end_pfn) {} - -static void __meminit account_node_boundary(unsigned int nid, - unsigned long *start_pfn, unsigned long *end_pfn) {} -#endif - - -/** * get_pfn_range_for_nid - Return the start and end page frames for a node * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned. * @start_pfn: Passed by reference. On return, it will have the node start_pfn. 
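The __early_pfn_to_nid()/early_pfn_to_nid() split a little earlier changes the calling convention: the double-underscore helper now reports a memory hole as -1, the wrapper keeps the old fall-back to node 0, and early_pfn_in_nid() only rejects a pfn when a different node is positively identified. A userspace sketch of that convention, with an early_node_map[] invented purely for illustration:

#include <stdbool.h>
#include <stdio.h>

struct range { unsigned long start, end; int nid; };

/* made-up node map: 0x10000-0x1ffff is a hole */
static const struct range early_node_map[] = {
        { 0x00000, 0x10000, 0 },
        { 0x20000, 0x30000, 1 },
};

static int __early_pfn_to_nid(unsigned long pfn)
{
        unsigned int i;

        for (i = 0; i < sizeof(early_node_map) / sizeof(early_node_map[0]); i++)
                if (pfn >= early_node_map[i].start && pfn < early_node_map[i].end)
                        return early_node_map[i].nid;
        return -1;                      /* memory hole */
}

static int early_pfn_to_nid(unsigned long pfn)
{
        int nid = __early_pfn_to_nid(pfn);

        return nid >= 0 ? nid : 0;      /* holes fall back to node 0 */
}

static bool early_pfn_in_nid(unsigned long pfn, int node)
{
        int nid = __early_pfn_to_nid(pfn);

        return !(nid >= 0 && nid != node);
}

int main(void)
{
        printf("pfn 0x25000 -> nid %d\n", early_pfn_to_nid(0x25000));
        printf("pfn 0x15000 -> nid %d (hole)\n", early_pfn_to_nid(0x15000));
        printf("hole accepted for node 0? %d\n", early_pfn_in_nid(0x15000, 0));
        return 0;
}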
@@ -3154,9 +3356,6 @@ void __meminit get_pfn_range_for_nid(unsigned int nid, if (*start_pfn == -1UL) *start_pfn = 0; - - /* Push the node boundaries out if requested */ - account_node_boundary(nid, start_pfn, end_pfn); } /* @@ -3521,7 +3720,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, zone_pcp_init(zone); for_each_lru(l) { INIT_LIST_HEAD(&zone->lru[l].list); - zone->lru[l].nr_scan = 0; + zone->lru[l].nr_saved_scan = 0; } zone->reclaim_stat.recent_rotated[0] = 0; zone->reclaim_stat.recent_rotated[1] = 0; @@ -3762,10 +3961,6 @@ void __init remove_all_active_ranges(void) { memset(early_node_map, 0, sizeof(early_node_map)); nr_nodemap_entries = 0; -#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE - memset(node_boundary_start_pfn, 0, sizeof(node_boundary_start_pfn)); - memset(node_boundary_end_pfn, 0, sizeof(node_boundary_end_pfn)); -#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ } /* Compare two active node_active_regions */ @@ -3852,6 +4047,8 @@ static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) int i, nid; unsigned long usable_startpfn; unsigned long kernelcore_node, kernelcore_remaining; + /* save the state before borrow the nodemask */ + nodemask_t saved_node_state = node_states[N_HIGH_MEMORY]; unsigned long totalpages = early_calculate_totalpages(); int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); @@ -3879,7 +4076,7 @@ static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) /* If kernelcore was not specified, there is no ZONE_MOVABLE */ if (!required_kernelcore) - return; + goto out; /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ find_usable_zone_for_movable(); @@ -3978,6 +4175,10 @@ restart: for (nid = 0; nid < MAX_NUMNODES; nid++) zone_movable_pfn[nid] = roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); + +out: + /* restore the node_state */ + node_states[N_HIGH_MEMORY] = saved_node_state; } /* Any regular memory on that node ? */ @@ -4196,8 +4397,8 @@ static void calculate_totalreserve_pages(void) max = zone->lowmem_reserve[j]; } - /* we treat pages_high as reserved pages. */ - max += zone->pages_high; + /* we treat the high watermark as reserved pages. */ + max += high_wmark_pages(zone); if (max > zone->present_pages) max = zone->present_pages; @@ -4247,12 +4448,13 @@ static void setup_per_zone_lowmem_reserve(void) } /** - * setup_per_zone_pages_min - called when min_free_kbytes changes. + * setup_per_zone_wmarks - called when min_free_kbytes changes + * or when memory is hot-{added|removed} * - * Ensures that the pages_{min,low,high} values for each zone are set correctly - * with respect to min_free_kbytes. + * Ensures that the watermark[min,low,high] values for each zone are set + * correctly with respect to min_free_kbytes. */ -void setup_per_zone_pages_min(void) +void setup_per_zone_wmarks(void) { unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); unsigned long lowmem_pages = 0; @@ -4277,7 +4479,7 @@ void setup_per_zone_pages_min(void) * need highmem pages, so cap pages_min to a small * value here. * - * The (pages_high-pages_low) and (pages_low-pages_min) + * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN) * deltas controls asynch page reclaim, and so should * not be capped for highmem. 
*/ @@ -4288,17 +4490,17 @@ void setup_per_zone_pages_min(void) min_pages = SWAP_CLUSTER_MAX; if (min_pages > 128) min_pages = 128; - zone->pages_min = min_pages; + zone->watermark[WMARK_MIN] = min_pages; } else { /* * If it's a lowmem zone, reserve a number of pages * proportionate to the zone's size. */ - zone->pages_min = tmp; + zone->watermark[WMARK_MIN] = tmp; } - zone->pages_low = zone->pages_min + (tmp >> 2); - zone->pages_high = zone->pages_min + (tmp >> 1); + zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); + zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); setup_zone_migrate_reserve(zone); spin_unlock_irqrestore(&zone->lock, flags); } @@ -4308,8 +4510,6 @@ void setup_per_zone_pages_min(void) } /** - * setup_per_zone_inactive_ratio - called when min_free_kbytes changes. - * * The inactive anon list should be small enough that the VM never has to * do too much work, but large enough that each inactive page has a chance * to be referenced again before it is swapped out. @@ -4330,21 +4530,26 @@ void setup_per_zone_pages_min(void) * 1TB 101 10GB * 10TB 320 32GB */ -static void setup_per_zone_inactive_ratio(void) +void calculate_zone_inactive_ratio(struct zone *zone) { - struct zone *zone; + unsigned int gb, ratio; - for_each_zone(zone) { - unsigned int gb, ratio; - - /* Zone size in gigabytes */ - gb = zone->present_pages >> (30 - PAGE_SHIFT); + /* Zone size in gigabytes */ + gb = zone->present_pages >> (30 - PAGE_SHIFT); + if (gb) ratio = int_sqrt(10 * gb); - if (!ratio) - ratio = 1; + else + ratio = 1; - zone->inactive_ratio = ratio; - } + zone->inactive_ratio = ratio; +} + +static void __init setup_per_zone_inactive_ratio(void) +{ + struct zone *zone; + + for_each_zone(zone) + calculate_zone_inactive_ratio(zone); } /* @@ -4371,7 +4576,7 @@ static void setup_per_zone_inactive_ratio(void) * 8192MB: 11584k * 16384MB: 16384k */ -static int __init init_per_zone_pages_min(void) +static int __init init_per_zone_wmark_min(void) { unsigned long lowmem_kbytes; @@ -4382,12 +4587,12 @@ static int __init init_per_zone_pages_min(void) min_free_kbytes = 128; if (min_free_kbytes > 65536) min_free_kbytes = 65536; - setup_per_zone_pages_min(); + setup_per_zone_wmarks(); setup_per_zone_lowmem_reserve(); setup_per_zone_inactive_ratio(); return 0; } -module_init(init_per_zone_pages_min) +module_init(init_per_zone_wmark_min) /* * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so @@ -4399,7 +4604,7 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write, { proc_dointvec(table, write, file, buffer, length, ppos); if (write) - setup_per_zone_pages_min(); + setup_per_zone_wmarks(); return 0; } @@ -4443,7 +4648,7 @@ int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, * whenever sysctl_lowmem_reserve_ratio changes. * * The reserve ratio obviously has absolutely no relation with the - * pages_min watermarks. The lowmem reserve ratio can only make sense + * minimum watermarks. The lowmem reserve ratio can only make sense * if in function of the boot time zone sizes. 
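To make the renamed watermark code above concrete: init_per_zone_wmark_min() picks min_free_kbytes as sqrt(lowmem_kbytes * 16) clamped to [128, 65536], and setup_per_zone_wmarks() then places WMARK_LOW and WMARK_HIGH 25% and 50% above each zone's WMARK_MIN. A userspace sketch of that arithmetic for an invented single-zone 4 GiB configuration (all memory treated as free lowmem, which is an approximation; isqrt() stands in for int_sqrt()):

#include <stdio.h>

#define PAGE_SHIFT      12

static unsigned long isqrt(unsigned long x)     /* crude stand-in for int_sqrt() */
{
        unsigned long r = 0;

        while ((r + 1) * (r + 1) <= x)
                r++;
        return r;
}

int main(void)
{
        unsigned long lowmem_pages = 1UL << 20;         /* 4 GiB of 4 KiB pages */
        unsigned long lowmem_kbytes = lowmem_pages * 4;
        unsigned long min_free_kbytes = isqrt(lowmem_kbytes * 16);
        unsigned long pages_min, wmark_min, wmark_low, wmark_high;

        if (min_free_kbytes < 128)
                min_free_kbytes = 128;
        if (min_free_kbytes > 65536)
                min_free_kbytes = 65536;

        /* a single zone holding all of lowmem gets the whole budget */
        pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
        wmark_min = pages_min;
        wmark_low = wmark_min + (pages_min >> 2);
        wmark_high = wmark_min + (pages_min >> 1);

        printf("min_free_kbytes = %lu kB\n", min_free_kbytes);
        printf("WMARK_MIN  = %lu pages\n", wmark_min);
        printf("WMARK_LOW  = %lu pages\n", wmark_low);
        printf("WMARK_HIGH = %lu pages\n", wmark_high);
        return 0;
}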
*/ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, @@ -4470,7 +4675,7 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos); if (!write || (ret == -EINVAL)) return ret; - for_each_zone(zone) { + for_each_populated_zone(zone) { for_each_online_cpu(cpu) { unsigned long high; high = zone->present_pages / percpu_pagelist_fraction; @@ -4550,22 +4755,14 @@ void *__init alloc_large_system_hash(const char *tablename, else if (hashdist) table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); else { - unsigned long order = get_order(size); - table = (void*) __get_free_pages(GFP_ATOMIC, order); /* * If bucketsize is not a power-of-two, we may free - * some pages at the end of hash table. + * some pages at the end of hash table which + * alloc_pages_exact() automatically does */ - if (table) { - unsigned long alloc_end = (unsigned long)table + - (PAGE_SIZE << order); - unsigned long used = (unsigned long)table + - PAGE_ALIGN(size); - split_page(virt_to_page(table), order); - while (used < alloc_end) { - free_page(used); - used += PAGE_SIZE; - } + if (get_order(size) < MAX_ORDER) { + table = alloc_pages_exact(size, GFP_ATOMIC); + kmemleak_alloc(table, size, 1, GFP_ATOMIC); } } } while (!table && size > PAGE_SIZE && --log2qty); diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 7006a11350c..f22b4ebbd8d 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -69,7 +69,7 @@ static int __init alloc_node_page_cgroup(int nid) return 0; } -void __init page_cgroup_init(void) +void __init page_cgroup_init_flatmem(void) { int nid, fail; @@ -83,12 +83,12 @@ void __init page_cgroup_init(void) goto fail; } printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage); - printk(KERN_INFO "please try cgroup_disable=memory option if you" - " don't want\n"); + printk(KERN_INFO "please try 'cgroup_disable=memory' option if you" + " don't want memory cgroups\n"); return; fail: - printk(KERN_CRIT "allocation of page_cgroup was failed.\n"); - printk(KERN_CRIT "please try cgroup_disable=memory boot option\n"); + printk(KERN_CRIT "allocation of page_cgroup failed.\n"); + printk(KERN_CRIT "please try 'cgroup_disable=memory' boot option\n"); panic("Out of memory"); } @@ -99,6 +99,8 @@ struct page_cgroup *lookup_page_cgroup(struct page *page) unsigned long pfn = page_to_pfn(page); struct mem_section *section = __pfn_to_section(pfn); + if (!section->page_cgroup) + return NULL; return section->page_cgroup + pfn; } @@ -113,15 +115,11 @@ static int __init_refok init_section_page_cgroup(unsigned long pfn) if (!section->page_cgroup) { nid = page_to_nid(pfn_to_page(pfn)); table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; - if (slab_is_available()) { - base = kmalloc_node(table_size, GFP_KERNEL, nid); - if (!base) - base = vmalloc_node(table_size, nid); - } else { - base = __alloc_bootmem_node_nopanic(NODE_DATA(nid), - table_size, - PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); - } + VM_BUG_ON(!slab_is_available()); + base = kmalloc_node(table_size, + GFP_KERNEL | __GFP_NOWARN, nid); + if (!base) + base = vmalloc_node(table_size, nid); } else { /* * We don't have to allocate page_cgroup again, but @@ -256,14 +254,14 @@ void __init page_cgroup_init(void) fail = init_section_page_cgroup(pfn); } if (fail) { - printk(KERN_CRIT "try cgroup_disable=memory boot option\n"); + printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n"); panic("Out of memory"); } else { hotplug_memory_notifier(page_cgroup_callback, 0); } 
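init_section_page_cgroup() above now insists slab is up (the VM_BUG_ON) and uses the usual "try kmalloc_node() quietly, fall back to vmalloc_node()" pattern, since the per-section table is a large allocation that may well fail once memory is fragmented. A hedged, module-style sketch of that pattern in isolation; alloc_section_table()/free_section_table() are invented names, not part of the patch:

#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/mm.h>

/* hypothetical helper mirroring the allocation pattern above */
static void *alloc_section_table(size_t table_size, int nid)
{
        void *base;

        /* a failed (possibly high-order) kmalloc is expected, hence __GFP_NOWARN */
        base = kmalloc_node(table_size, GFP_KERNEL | __GFP_NOWARN, nid);
        if (!base)
                base = vmalloc_node(table_size, nid);
        return base;
}

/* the matching free picks kfree() or vfree() by address */
static void free_section_table(void *base)
{
        if (is_vmalloc_addr(base))
                vfree(base);
        else
                kfree(base);
}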
printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage); - printk(KERN_INFO "please try cgroup_disable=memory option if you don't" - " want\n"); + printk(KERN_INFO "please try 'cgroup_disable=memory' option if you don't" + " want memory cgroups\n"); } void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat) @@ -284,12 +282,8 @@ struct swap_cgroup_ctrl { struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES]; -/* - * This 8bytes seems big..maybe we can reduce this when we can use "id" for - * cgroup rather than pointer. - */ struct swap_cgroup { - struct mem_cgroup *val; + unsigned short id; }; #define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup)) #define SC_POS_MASK (SC_PER_PAGE - 1) @@ -317,8 +311,6 @@ static int swap_cgroup_prepare(int type) struct swap_cgroup_ctrl *ctrl; unsigned long idx, max; - if (!do_swap_account) - return 0; ctrl = &swap_cgroup_ctrl[type]; for (idx = 0; idx < ctrl->length; idx++) { @@ -341,10 +333,10 @@ not_enough_page: * @ent: swap entry to be recorded into * @mem: mem_cgroup to be recorded * - * Returns old value at success, NULL at failure. - * (Of course, old value can be NULL.) + * Returns old value at success, 0 at failure. + * (Of course, old value can be 0.) */ -struct mem_cgroup *swap_cgroup_record(swp_entry_t ent, struct mem_cgroup *mem) +unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id) { int type = swp_type(ent); unsigned long offset = swp_offset(ent); @@ -353,18 +345,15 @@ struct mem_cgroup *swap_cgroup_record(swp_entry_t ent, struct mem_cgroup *mem) struct swap_cgroup_ctrl *ctrl; struct page *mappage; struct swap_cgroup *sc; - struct mem_cgroup *old; - - if (!do_swap_account) - return NULL; + unsigned short old; ctrl = &swap_cgroup_ctrl[type]; mappage = ctrl->map[idx]; sc = page_address(mappage); sc += pos; - old = sc->val; - sc->val = mem; + old = sc->id; + sc->id = id; return old; } @@ -373,9 +362,9 @@ struct mem_cgroup *swap_cgroup_record(swp_entry_t ent, struct mem_cgroup *mem) * lookup_swap_cgroup - lookup mem_cgroup tied to swap entry * @ent: swap entry to be looked up. * - * Returns pointer to mem_cgroup at success. NULL at failure. + * Returns CSS ID of mem_cgroup at success. 0 at failure. 
(0 is invalid ID) */ -struct mem_cgroup *lookup_swap_cgroup(swp_entry_t ent) +unsigned short lookup_swap_cgroup(swp_entry_t ent) { int type = swp_type(ent); unsigned long offset = swp_offset(ent); @@ -384,16 +373,13 @@ struct mem_cgroup *lookup_swap_cgroup(swp_entry_t ent) struct swap_cgroup_ctrl *ctrl; struct page *mappage; struct swap_cgroup *sc; - struct mem_cgroup *ret; - - if (!do_swap_account) - return NULL; + unsigned short ret; ctrl = &swap_cgroup_ctrl[type]; mappage = ctrl->map[idx]; sc = page_address(mappage); sc += pos; - ret = sc->val; + ret = sc->id; return ret; } @@ -429,13 +415,6 @@ int swap_cgroup_swapon(int type, unsigned long max_pages) } mutex_unlock(&swap_cgroup_mutex); - printk(KERN_INFO - "swap_cgroup: uses %ld bytes of vmalloc for pointer array space" - " and %ld bytes to hold mem_cgroup pointers on swap\n", - array_size, length * PAGE_SIZE); - printk(KERN_INFO - "swap_cgroup can be disabled by noswapaccount boot option.\n"); - return 0; nomem: printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n"); diff --git a/mm/page_io.c b/mm/page_io.c index dc6ce0afbde..c6f3e5071de 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -111,7 +111,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) goto out; } if (wbc->sync_mode == WB_SYNC_ALL) - rw |= (1 << BIO_RW_SYNC); + rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG); count_vm_event(PSWPOUT); set_page_writeback(page); unlock_page(page); @@ -120,7 +120,7 @@ out: return ret; } -int swap_readpage(struct file *file, struct page *page) +int swap_readpage(struct page *page) { struct bio *bio; int ret = 0; diff --git a/mm/pdflush.c b/mm/pdflush.c deleted file mode 100644 index 15de509b68f..00000000000 --- a/mm/pdflush.c +++ /dev/null @@ -1,251 +0,0 @@ -/* - * mm/pdflush.c - worker threads for writing back filesystem data - * - * Copyright (C) 2002, Linus Torvalds. - * - * 09Apr2002 Andrew Morton - * Initial version - * 29Feb2004 kaos@sgi.com - * Move worker thread creation to kthread to avoid chewing - * up stack space with nested calls to kernel_thread. - */ - -#include <linux/sched.h> -#include <linux/list.h> -#include <linux/signal.h> -#include <linux/spinlock.h> -#include <linux/gfp.h> -#include <linux/init.h> -#include <linux/module.h> -#include <linux/fs.h> /* Needed by writeback.h */ -#include <linux/writeback.h> /* Prototypes pdflush_operation() */ -#include <linux/kthread.h> -#include <linux/cpuset.h> -#include <linux/freezer.h> - - -/* - * Minimum and maximum number of pdflush instances - */ -#define MIN_PDFLUSH_THREADS 2 -#define MAX_PDFLUSH_THREADS 8 - -static void start_one_pdflush_thread(void); - - -/* - * The pdflush threads are worker threads for writing back dirty data. - * Ideally, we'd like one thread per active disk spindle. But the disk - * topology is very hard to divine at this level. Instead, we take - * care in various places to prevent more than one pdflush thread from - * performing writeback against a single filesystem. pdflush threads - * have the PF_FLUSHER flag set in current->flags to aid in this. - */ - -/* - * All the pdflush threads. Protected by pdflush_lock - */ -static LIST_HEAD(pdflush_list); -static DEFINE_SPINLOCK(pdflush_lock); - -/* - * The count of currently-running pdflush threads. Protected - * by pdflush_lock. - * - * Readable by sysctl, but not writable. Published to userspace at - * /proc/sys/vm/nr_pdflush_threads. 
- */ -int nr_pdflush_threads = 0; - -/* - * The time at which the pdflush thread pool last went empty - */ -static unsigned long last_empty_jifs; - -/* - * The pdflush thread. - * - * Thread pool management algorithm: - * - * - The minimum and maximum number of pdflush instances are bound - * by MIN_PDFLUSH_THREADS and MAX_PDFLUSH_THREADS. - * - * - If there have been no idle pdflush instances for 1 second, create - * a new one. - * - * - If the least-recently-went-to-sleep pdflush thread has been asleep - * for more than one second, terminate a thread. - */ - -/* - * A structure for passing work to a pdflush thread. Also for passing - * state information between pdflush threads. Protected by pdflush_lock. - */ -struct pdflush_work { - struct task_struct *who; /* The thread */ - void (*fn)(unsigned long); /* A callback function */ - unsigned long arg0; /* An argument to the callback */ - struct list_head list; /* On pdflush_list, when idle */ - unsigned long when_i_went_to_sleep; -}; - -static int __pdflush(struct pdflush_work *my_work) -{ - current->flags |= PF_FLUSHER | PF_SWAPWRITE; - set_freezable(); - my_work->fn = NULL; - my_work->who = current; - INIT_LIST_HEAD(&my_work->list); - - spin_lock_irq(&pdflush_lock); - nr_pdflush_threads++; - for ( ; ; ) { - struct pdflush_work *pdf; - - set_current_state(TASK_INTERRUPTIBLE); - list_move(&my_work->list, &pdflush_list); - my_work->when_i_went_to_sleep = jiffies; - spin_unlock_irq(&pdflush_lock); - schedule(); - try_to_freeze(); - spin_lock_irq(&pdflush_lock); - if (!list_empty(&my_work->list)) { - /* - * Someone woke us up, but without removing our control - * structure from the global list. swsusp will do this - * in try_to_freeze()->refrigerator(). Handle it. - */ - my_work->fn = NULL; - continue; - } - if (my_work->fn == NULL) { - printk("pdflush: bogus wakeup\n"); - continue; - } - spin_unlock_irq(&pdflush_lock); - - (*my_work->fn)(my_work->arg0); - - /* - * Thread creation: For how long have there been zero - * available threads? - */ - if (time_after(jiffies, last_empty_jifs + 1 * HZ)) { - /* unlocked list_empty() test is OK here */ - if (list_empty(&pdflush_list)) { - /* unlocked test is OK here */ - if (nr_pdflush_threads < MAX_PDFLUSH_THREADS) - start_one_pdflush_thread(); - } - } - - spin_lock_irq(&pdflush_lock); - my_work->fn = NULL; - - /* - * Thread destruction: For how long has the sleepiest - * thread slept? - */ - if (list_empty(&pdflush_list)) - continue; - if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS) - continue; - pdf = list_entry(pdflush_list.prev, struct pdflush_work, list); - if (time_after(jiffies, pdf->when_i_went_to_sleep + 1 * HZ)) { - /* Limit exit rate */ - pdf->when_i_went_to_sleep = jiffies; - break; /* exeunt */ - } - } - nr_pdflush_threads--; - spin_unlock_irq(&pdflush_lock); - return 0; -} - -/* - * Of course, my_work wants to be just a local in __pdflush(). It is - * separated out in this manner to hopefully prevent the compiler from - * performing unfortunate optimisations against the auto variables. Because - * these are visible to other tasks and CPUs. (No problem has actually - * been observed. This is just paranoia). - */ -static int pdflush(void *dummy) -{ - struct pdflush_work my_work; - cpumask_var_t cpus_allowed; - - /* - * Since the caller doesn't even check kthread_run() worked, let's not - * freak out too much if this fails. 
- */ - if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { - printk(KERN_WARNING "pdflush failed to allocate cpumask\n"); - return 0; - } - - /* - * pdflush can spend a lot of time doing encryption via dm-crypt. We - * don't want to do that at keventd's priority. - */ - set_user_nice(current, 0); - - /* - * Some configs put our parent kthread in a limited cpuset, - * which kthread() overrides, forcing cpus_allowed == CPU_MASK_ALL. - * Our needs are more modest - cut back to our cpusets cpus_allowed. - * This is needed as pdflush's are dynamically created and destroyed. - * The boottime pdflush's are easily placed w/o these 2 lines. - */ - cpuset_cpus_allowed(current, cpus_allowed); - set_cpus_allowed_ptr(current, cpus_allowed); - free_cpumask_var(cpus_allowed); - - return __pdflush(&my_work); -} - -/* - * Attempt to wake up a pdflush thread, and get it to do some work for you. - * Returns zero if it indeed managed to find a worker thread, and passed your - * payload to it. - */ -int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0) -{ - unsigned long flags; - int ret = 0; - - BUG_ON(fn == NULL); /* Hard to diagnose if it's deferred */ - - spin_lock_irqsave(&pdflush_lock, flags); - if (list_empty(&pdflush_list)) { - ret = -1; - } else { - struct pdflush_work *pdf; - - pdf = list_entry(pdflush_list.next, struct pdflush_work, list); - list_del_init(&pdf->list); - if (list_empty(&pdflush_list)) - last_empty_jifs = jiffies; - pdf->fn = fn; - pdf->arg0 = arg0; - wake_up_process(pdf->who); - } - spin_unlock_irqrestore(&pdflush_lock, flags); - - return ret; -} - -static void start_one_pdflush_thread(void) -{ - kthread_run(pdflush, NULL, "pdflush"); -} - -static int __init pdflush_init(void) -{ - int i; - - for (i = 0; i < MIN_PDFLUSH_THREADS; i++) - start_one_pdflush_thread(); - return 0; -} - -module_init(pdflush_init); diff --git a/mm/percpu.c b/mm/percpu.c new file mode 100644 index 00000000000..3311c8919f3 --- /dev/null +++ b/mm/percpu.c @@ -0,0 +1,1293 @@ +/* + * linux/mm/percpu.c - percpu memory allocator + * + * Copyright (C) 2009 SUSE Linux Products GmbH + * Copyright (C) 2009 Tejun Heo <tj@kernel.org> + * + * This file is released under the GPLv2. + * + * This is percpu allocator which can handle both static and dynamic + * areas. Percpu areas are allocated in chunks in vmalloc area. Each + * chunk is consisted of nr_cpu_ids units and the first chunk is used + * for static percpu variables in the kernel image (special boot time + * alloc/init handling necessary as these areas need to be brought up + * before allocation services are running). Unit grows as necessary + * and all units grow or shrink in unison. When a chunk is filled up, + * another chunk is allocated. ie. in vmalloc area + * + * c0 c1 c2 + * ------------------- ------------------- ------------ + * | u0 | u1 | u2 | u3 | | u0 | u1 | u2 | u3 | | u0 | u1 | u + * ------------------- ...... ------------------- .... ------------ + * + * Allocation is done in offset-size areas of single unit space. Ie, + * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0, + * c1:u1, c1:u2 and c1:u3. Percpu access can be done by configuring + * percpu base registers pcpu_unit_size apart. + * + * There are usually many small percpu allocations many of them as + * small as 4 bytes. The allocator organizes chunks into lists + * according to free size and tries to allocate from the fullest one. 
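Illustrating the unit layout described above: a CPU's copy of an allocation lives at the same offset inside its own unit, so translating (cpu, offset) into an address is a single multiply-and-add, which is why per-cpu access only needs per-cpu base registers spaced pcpu_unit_size apart. A userspace sketch with invented numbers (64 KiB units, 4 CPUs, a made-up chunk base address):

#include <stdio.h>

int main(void)
{
        unsigned long long base = 0xffffc90000000000ULL;  /* invented chunk address */
        unsigned long long unit_size = 64 << 10;          /* pcpu_unit_size */
        unsigned long long off = 6 << 10;                 /* "512 bytes at 6k" */
        unsigned int cpu;

        for (cpu = 0; cpu < 4; cpu++)
                printf("cpu%u's copy lives at 0x%llx\n",
                       cpu, base + cpu * unit_size + off);
        return 0;
}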
+ * Each chunk keeps the maximum contiguous area size hint which is + * guaranteed to be eqaul to or larger than the maximum contiguous + * area in the chunk. This helps the allocator not to iterate the + * chunk maps unnecessarily. + * + * Allocation state in each chunk is kept using an array of integers + * on chunk->map. A positive value in the map represents a free + * region and negative allocated. Allocation inside a chunk is done + * by scanning this map sequentially and serving the first matching + * entry. This is mostly copied from the percpu_modalloc() allocator. + * Chunks can be determined from the address using the index field + * in the page struct. The index field contains a pointer to the chunk. + * + * To use this allocator, arch code should do the followings. + * + * - define CONFIG_HAVE_DYNAMIC_PER_CPU_AREA + * + * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate + * regular address to percpu pointer and back if they need to be + * different from the default + * + * - use pcpu_setup_first_chunk() during percpu area initialization to + * setup the first chunk containing the kernel static percpu area + */ + +#include <linux/bitmap.h> +#include <linux/bootmem.h> +#include <linux/list.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/percpu.h> +#include <linux/pfn.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/vmalloc.h> +#include <linux/workqueue.h> + +#include <asm/cacheflush.h> +#include <asm/sections.h> +#include <asm/tlbflush.h> + +#define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */ +#define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ + +/* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */ +#ifndef __addr_to_pcpu_ptr +#define __addr_to_pcpu_ptr(addr) \ + (void *)((unsigned long)(addr) - (unsigned long)pcpu_base_addr \ + + (unsigned long)__per_cpu_start) +#endif +#ifndef __pcpu_ptr_to_addr +#define __pcpu_ptr_to_addr(ptr) \ + (void *)((unsigned long)(ptr) + (unsigned long)pcpu_base_addr \ + - (unsigned long)__per_cpu_start) +#endif + +struct pcpu_chunk { + struct list_head list; /* linked to pcpu_slot lists */ + int free_size; /* free bytes in the chunk */ + int contig_hint; /* max contiguous size hint */ + struct vm_struct *vm; /* mapped vmalloc region */ + int map_used; /* # of map entries used */ + int map_alloc; /* # of map entries allocated */ + int *map; /* allocation map */ + bool immutable; /* no [de]population allowed */ + struct page **page; /* points to page array */ + struct page *page_ar[]; /* #cpus * UNIT_PAGES */ +}; + +static int pcpu_unit_pages __read_mostly; +static int pcpu_unit_size __read_mostly; +static int pcpu_chunk_size __read_mostly; +static int pcpu_nr_slots __read_mostly; +static size_t pcpu_chunk_struct_size __read_mostly; + +/* the address of the first chunk which starts with the kernel static area */ +void *pcpu_base_addr __read_mostly; +EXPORT_SYMBOL_GPL(pcpu_base_addr); + +/* + * The first chunk which always exists. Note that unlike other + * chunks, this one can be allocated and mapped in several different + * ways and thus often doesn't live in the vmalloc area. + */ +static struct pcpu_chunk *pcpu_first_chunk; + +/* + * Optional reserved chunk. This chunk reserves part of the first + * chunk and serves it for reserved allocations. The amount of + * reserved offset is in pcpu_reserved_chunk_limit. When reserved + * area doesn't exist, the following variables contain NULL and 0 + * respectively. 
+ */ +static struct pcpu_chunk *pcpu_reserved_chunk; +static int pcpu_reserved_chunk_limit; + +/* + * Synchronization rules. + * + * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former + * protects allocation/reclaim paths, chunks and chunk->page arrays. + * The latter is a spinlock and protects the index data structures - + * chunk slots, chunks and area maps in chunks. + * + * During allocation, pcpu_alloc_mutex is kept locked all the time and + * pcpu_lock is grabbed and released as necessary. All actual memory + * allocations are done using GFP_KERNEL with pcpu_lock released. + * + * Free path accesses and alters only the index data structures, so it + * can be safely called from atomic context. When memory needs to be + * returned to the system, free path schedules reclaim_work which + * grabs both pcpu_alloc_mutex and pcpu_lock, unlinks chunks to be + * reclaimed, release both locks and frees the chunks. Note that it's + * necessary to grab both locks to remove a chunk from circulation as + * allocation path might be referencing the chunk with only + * pcpu_alloc_mutex locked. + */ +static DEFINE_MUTEX(pcpu_alloc_mutex); /* protects whole alloc and reclaim */ +static DEFINE_SPINLOCK(pcpu_lock); /* protects index data structures */ + +static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ + +/* reclaim work to release fully free chunks, scheduled from free path */ +static void pcpu_reclaim(struct work_struct *work); +static DECLARE_WORK(pcpu_reclaim_work, pcpu_reclaim); + +static int __pcpu_size_to_slot(int size) +{ + int highbit = fls(size); /* size is in bytes */ + return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1); +} + +static int pcpu_size_to_slot(int size) +{ + if (size == pcpu_unit_size) + return pcpu_nr_slots - 1; + return __pcpu_size_to_slot(size); +} + +static int pcpu_chunk_slot(const struct pcpu_chunk *chunk) +{ + if (chunk->free_size < sizeof(int) || chunk->contig_hint < sizeof(int)) + return 0; + + return pcpu_size_to_slot(chunk->free_size); +} + +static int pcpu_page_idx(unsigned int cpu, int page_idx) +{ + return cpu * pcpu_unit_pages + page_idx; +} + +static struct page **pcpu_chunk_pagep(struct pcpu_chunk *chunk, + unsigned int cpu, int page_idx) +{ + return &chunk->page[pcpu_page_idx(cpu, page_idx)]; +} + +static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk, + unsigned int cpu, int page_idx) +{ + return (unsigned long)chunk->vm->addr + + (pcpu_page_idx(cpu, page_idx) << PAGE_SHIFT); +} + +static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk, + int page_idx) +{ + /* + * Any possible cpu id can be used here, so there's no need to + * worry about preemption or cpu hotplug. + */ + return *pcpu_chunk_pagep(chunk, raw_smp_processor_id(), + page_idx) != NULL; +} + +/* set the pointer to a chunk in a page struct */ +static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu) +{ + page->index = (unsigned long)pcpu; +} + +/* obtain pointer to a chunk from a page struct */ +static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page) +{ + return (struct pcpu_chunk *)page->index; +} + +/** + * pcpu_mem_alloc - allocate memory + * @size: bytes to allocate + * + * Allocate @size bytes. If @size is smaller than PAGE_SIZE, + * kzalloc() is used; otherwise, vmalloc() is used. The returned + * memory is always zeroed. + * + * CONTEXT: + * Does GFP_KERNEL allocation. + * + * RETURNS: + * Pointer to the allocated area on success, NULL on failure. 
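The slot bucketing above (__pcpu_size_to_slot() with PCPU_SLOT_BASE_SHIFT = 5) groups chunks by the magnitude of their free space, so an allocation can start searching at the smallest slot that could still satisfy it; pcpu_size_to_slot() additionally parks completely free chunks in the last slot. A userspace sketch of the bucket computation, where my_fls() stands in for the kernel's fls():

#include <stdio.h>

#define PCPU_SLOT_BASE_SHIFT 5

static int my_fls(unsigned int x)
{
        return x ? 32 - __builtin_clz(x) : 0;
}

static int size_to_slot(int size)
{
        int slot = my_fls(size) - PCPU_SLOT_BASE_SHIFT + 2;

        return slot > 1 ? slot : 1;
}

int main(void)
{
        int sizes[] = { 4, 16, 64, 256, 1024, 4096, 16384, 65536 };
        unsigned int i;

        for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
                printf("free size %6d -> slot %d\n", sizes[i], size_to_slot(sizes[i]));
        return 0;
}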
+ */ +static void *pcpu_mem_alloc(size_t size) +{ + if (size <= PAGE_SIZE) + return kzalloc(size, GFP_KERNEL); + else { + void *ptr = vmalloc(size); + if (ptr) + memset(ptr, 0, size); + return ptr; + } +} + +/** + * pcpu_mem_free - free memory + * @ptr: memory to free + * @size: size of the area + * + * Free @ptr. @ptr should have been allocated using pcpu_mem_alloc(). + */ +static void pcpu_mem_free(void *ptr, size_t size) +{ + if (size <= PAGE_SIZE) + kfree(ptr); + else + vfree(ptr); +} + +/** + * pcpu_chunk_relocate - put chunk in the appropriate chunk slot + * @chunk: chunk of interest + * @oslot: the previous slot it was on + * + * This function is called after an allocation or free changed @chunk. + * New slot according to the changed state is determined and @chunk is + * moved to the slot. Note that the reserved chunk is never put on + * chunk slots. + * + * CONTEXT: + * pcpu_lock. + */ +static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot) +{ + int nslot = pcpu_chunk_slot(chunk); + + if (chunk != pcpu_reserved_chunk && oslot != nslot) { + if (oslot < nslot) + list_move(&chunk->list, &pcpu_slot[nslot]); + else + list_move_tail(&chunk->list, &pcpu_slot[nslot]); + } +} + +/** + * pcpu_chunk_addr_search - determine chunk containing specified address + * @addr: address for which the chunk needs to be determined. + * + * RETURNS: + * The address of the found chunk. + */ +static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) +{ + void *first_start = pcpu_first_chunk->vm->addr; + + /* is it in the first chunk? */ + if (addr >= first_start && addr < first_start + pcpu_chunk_size) { + /* is it in the reserved area? */ + if (addr < first_start + pcpu_reserved_chunk_limit) + return pcpu_reserved_chunk; + return pcpu_first_chunk; + } + + /* + * The address is relative to unit0 which might be unused and + * thus unmapped. Offset the address to the unit space of the + * current processor before looking it up in the vmalloc + * space. Note that any possible cpu id can be used here, so + * there's no need to worry about preemption or cpu hotplug. + */ + addr += raw_smp_processor_id() * pcpu_unit_size; + return pcpu_get_page_chunk(vmalloc_to_page(addr)); +} + +/** + * pcpu_extend_area_map - extend area map for allocation + * @chunk: target chunk + * + * Extend area map of @chunk so that it can accomodate an allocation. + * A single allocation can split an area into three areas, so this + * function makes sure that @chunk->map has at least two extra slots. + * + * CONTEXT: + * pcpu_alloc_mutex, pcpu_lock. pcpu_lock is released and reacquired + * if area map is extended. + * + * RETURNS: + * 0 if noop, 1 if successfully extended, -errno on failure. + */ +static int pcpu_extend_area_map(struct pcpu_chunk *chunk) +{ + int new_alloc; + int *new; + size_t size; + + /* has enough? */ + if (chunk->map_alloc >= chunk->map_used + 2) + return 0; + + spin_unlock_irq(&pcpu_lock); + + new_alloc = PCPU_DFL_MAP_ALLOC; + while (new_alloc < chunk->map_used + 2) + new_alloc *= 2; + + new = pcpu_mem_alloc(new_alloc * sizeof(new[0])); + if (!new) { + spin_lock_irq(&pcpu_lock); + return -ENOMEM; + } + + /* + * Acquire pcpu_lock and switch to new area map. Only free + * could have happened inbetween, so map_used couldn't have + * grown. 
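The growth policy in pcpu_extend_area_map() above is worth spelling out: because a single allocation can split one free area into three map entries, the map is regrown, doubling from PCPU_DFL_MAP_ALLOC (16), until at least two spare entries remain. A small userspace sketch of that sizing rule:

#include <stdio.h>

#define PCPU_DFL_MAP_ALLOC 16

static int next_map_alloc(int map_used)
{
        int new_alloc = PCPU_DFL_MAP_ALLOC;

        while (new_alloc < map_used + 2)
                new_alloc *= 2;
        return new_alloc;
}

int main(void)
{
        int used;

        for (used = 10; used <= 130; used += 30)
                printf("map_used %3d -> map_alloc %d\n", used, next_map_alloc(used));
        return 0;
}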
+ */ + spin_lock_irq(&pcpu_lock); + BUG_ON(new_alloc < chunk->map_used + 2); + + size = chunk->map_alloc * sizeof(chunk->map[0]); + memcpy(new, chunk->map, size); + + /* + * map_alloc < PCPU_DFL_MAP_ALLOC indicates that the chunk is + * one of the first chunks and still using static map. + */ + if (chunk->map_alloc >= PCPU_DFL_MAP_ALLOC) + pcpu_mem_free(chunk->map, size); + + chunk->map_alloc = new_alloc; + chunk->map = new; + return 0; +} + +/** + * pcpu_split_block - split a map block + * @chunk: chunk of interest + * @i: index of map block to split + * @head: head size in bytes (can be 0) + * @tail: tail size in bytes (can be 0) + * + * Split the @i'th map block into two or three blocks. If @head is + * non-zero, @head bytes block is inserted before block @i moving it + * to @i+1 and reducing its size by @head bytes. + * + * If @tail is non-zero, the target block, which can be @i or @i+1 + * depending on @head, is reduced by @tail bytes and @tail byte block + * is inserted after the target block. + * + * @chunk->map must have enough free slots to accomodate the split. + * + * CONTEXT: + * pcpu_lock. + */ +static void pcpu_split_block(struct pcpu_chunk *chunk, int i, + int head, int tail) +{ + int nr_extra = !!head + !!tail; + + BUG_ON(chunk->map_alloc < chunk->map_used + nr_extra); + + /* insert new subblocks */ + memmove(&chunk->map[i + nr_extra], &chunk->map[i], + sizeof(chunk->map[0]) * (chunk->map_used - i)); + chunk->map_used += nr_extra; + + if (head) { + chunk->map[i + 1] = chunk->map[i] - head; + chunk->map[i++] = head; + } + if (tail) { + chunk->map[i++] -= tail; + chunk->map[i] = tail; + } +} + +/** + * pcpu_alloc_area - allocate area from a pcpu_chunk + * @chunk: chunk of interest + * @size: wanted size in bytes + * @align: wanted align + * + * Try to allocate @size bytes area aligned at @align from @chunk. + * Note that this function only allocates the offset. It doesn't + * populate or map the area. + * + * @chunk->map must have at least two free slots. + * + * CONTEXT: + * pcpu_lock. + * + * RETURNS: + * Allocated offset in @chunk on success, -1 if no matching area is + * found. + */ +static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) +{ + int oslot = pcpu_chunk_slot(chunk); + int max_contig = 0; + int i, off; + + for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++])) { + bool is_last = i + 1 == chunk->map_used; + int head, tail; + + /* extra for alignment requirement */ + head = ALIGN(off, align) - off; + BUG_ON(i == 0 && head != 0); + + if (chunk->map[i] < 0) + continue; + if (chunk->map[i] < head + size) { + max_contig = max(chunk->map[i], max_contig); + continue; + } + + /* + * If head is small or the previous block is free, + * merge'em. Note that 'small' is defined as smaller + * than sizeof(int), which is very small but isn't too + * uncommon for percpu allocations. 
+ */ + if (head && (head < sizeof(int) || chunk->map[i - 1] > 0)) { + if (chunk->map[i - 1] > 0) + chunk->map[i - 1] += head; + else { + chunk->map[i - 1] -= head; + chunk->free_size -= head; + } + chunk->map[i] -= head; + off += head; + head = 0; + } + + /* if tail is small, just keep it around */ + tail = chunk->map[i] - head - size; + if (tail < sizeof(int)) + tail = 0; + + /* split if warranted */ + if (head || tail) { + pcpu_split_block(chunk, i, head, tail); + if (head) { + i++; + off += head; + max_contig = max(chunk->map[i - 1], max_contig); + } + if (tail) + max_contig = max(chunk->map[i + 1], max_contig); + } + + /* update hint and mark allocated */ + if (is_last) + chunk->contig_hint = max_contig; /* fully scanned */ + else + chunk->contig_hint = max(chunk->contig_hint, + max_contig); + + chunk->free_size -= chunk->map[i]; + chunk->map[i] = -chunk->map[i]; + + pcpu_chunk_relocate(chunk, oslot); + return off; + } + + chunk->contig_hint = max_contig; /* fully scanned */ + pcpu_chunk_relocate(chunk, oslot); + + /* tell the upper layer that this chunk has no matching area */ + return -1; +} + +/** + * pcpu_free_area - free area to a pcpu_chunk + * @chunk: chunk of interest + * @freeme: offset of area to free + * + * Free area starting from @freeme to @chunk. Note that this function + * only modifies the allocation map. It doesn't depopulate or unmap + * the area. + * + * CONTEXT: + * pcpu_lock. + */ +static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme) +{ + int oslot = pcpu_chunk_slot(chunk); + int i, off; + + for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++])) + if (off == freeme) + break; + BUG_ON(off != freeme); + BUG_ON(chunk->map[i] > 0); + + chunk->map[i] = -chunk->map[i]; + chunk->free_size += chunk->map[i]; + + /* merge with previous? */ + if (i > 0 && chunk->map[i - 1] >= 0) { + chunk->map[i - 1] += chunk->map[i]; + chunk->map_used--; + memmove(&chunk->map[i], &chunk->map[i + 1], + (chunk->map_used - i) * sizeof(chunk->map[0])); + i--; + } + /* merge with next? */ + if (i + 1 < chunk->map_used && chunk->map[i + 1] >= 0) { + chunk->map[i] += chunk->map[i + 1]; + chunk->map_used--; + memmove(&chunk->map[i + 1], &chunk->map[i + 2], + (chunk->map_used - (i + 1)) * sizeof(chunk->map[0])); + } + + chunk->contig_hint = max(chunk->map[i], chunk->contig_hint); + pcpu_chunk_relocate(chunk, oslot); +} + +/** + * pcpu_unmap - unmap pages out of a pcpu_chunk + * @chunk: chunk of interest + * @page_start: page index of the first page to unmap + * @page_end: page index of the last page to unmap + 1 + * @flush_tlb: whether to flush tlb or not + * + * For each cpu, unmap pages [@page_start,@page_end) out of @chunk. + * If @flush is true, vcache is flushed before unmapping and tlb + * after. + */ +static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end, + bool flush_tlb) +{ + unsigned int last = nr_cpu_ids - 1; + unsigned int cpu; + + /* unmap must not be done on immutable chunk */ + WARN_ON(chunk->immutable); + + /* + * Each flushing trial can be very expensive, issue flush on + * the whole region at once rather than doing it for each cpu. + * This could be an overkill but is more scalable. 
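To see the chunk->map encoding used by pcpu_alloc_area() and pcpu_free_area() above in action: positive entries are free runs, negative entries are allocated runs, allocation is first fit, and freeing coalesces with free neighbours. The toy userspace model below illustrates only that encoding; it deliberately omits alignment handling, contig_hint maintenance and slot relocation, so it is an illustration rather than a reimplementation:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int map[64] = { 1024 };          /* one fully free 1024-byte unit */
static int map_used = 1;

static void dump(const char *what)
{
        int i;

        printf("%-10s:", what);
        for (i = 0; i < map_used; i++)
                printf(" %5d", map[i]);
        printf("\n");
}

/* first-fit allocation, no alignment handling; returns offset or -1 */
static int alloc(int size)
{
        int i, off;

        for (i = 0, off = 0; i < map_used; off += abs(map[i]), i++) {
                if (map[i] < size)      /* allocated (negative) or too small */
                        continue;
                if (map[i] > size) {    /* split off the remainder as a free run */
                        memmove(&map[i + 1], &map[i],
                                (map_used - i) * sizeof(map[0]));
                        map_used++;
                        map[i + 1] = map[i] - size;
                }
                map[i] = -size;         /* negative == allocated */
                return off;
        }
        return -1;
}

static void free_at(int freeme)
{
        int i, off;

        for (i = 0, off = 0; i < map_used; off += abs(map[i]), i++)
                if (off == freeme)
                        break;
        if (i == map_used || map[i] >= 0)
                return;                 /* not the start of an allocated run */

        map[i] = -map[i];
        if (i + 1 < map_used && map[i + 1] > 0) {       /* merge with next */
                map[i] += map[i + 1];
                memmove(&map[i + 1], &map[i + 2],
                        (map_used - i - 2) * sizeof(map[0]));
                map_used--;
        }
        if (i > 0 && map[i - 1] > 0) {                  /* merge with previous */
                map[i - 1] += map[i];
                memmove(&map[i], &map[i + 1],
                        (map_used - i - 1) * sizeof(map[0]));
                map_used--;
        }
}

int main(void)
{
        int a, b, c;

        dump("initial");
        a = alloc(128); dump("alloc 128");
        b = alloc(64);  dump("alloc 64");
        c = alloc(256); dump("alloc 256");
        free_at(b);     dump("free b");
        free_at(a);     dump("free a");         /* coalesces with b's hole */
        free_at(c);     dump("free c");         /* back to one free run */
        return 0;
}

The real pcpu_alloc_area() additionally honours alignment by splitting off a head block and keeps contig_hint up to date so chunks whose largest free run is too small can be skipped without scanning their maps.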
+ */ + flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start), + pcpu_chunk_addr(chunk, last, page_end)); + + for_each_possible_cpu(cpu) + unmap_kernel_range_noflush( + pcpu_chunk_addr(chunk, cpu, page_start), + (page_end - page_start) << PAGE_SHIFT); + + /* ditto as flush_cache_vunmap() */ + if (flush_tlb) + flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start), + pcpu_chunk_addr(chunk, last, page_end)); +} + +/** + * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk + * @chunk: chunk to depopulate + * @off: offset to the area to depopulate + * @size: size of the area to depopulate in bytes + * @flush: whether to flush cache and tlb or not + * + * For each cpu, depopulate and unmap pages [@page_start,@page_end) + * from @chunk. If @flush is true, vcache is flushed before unmapping + * and tlb after. + * + * CONTEXT: + * pcpu_alloc_mutex. + */ +static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size, + bool flush) +{ + int page_start = PFN_DOWN(off); + int page_end = PFN_UP(off + size); + int unmap_start = -1; + int uninitialized_var(unmap_end); + unsigned int cpu; + int i; + + for (i = page_start; i < page_end; i++) { + for_each_possible_cpu(cpu) { + struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i); + + if (!*pagep) + continue; + + __free_page(*pagep); + + /* + * If it's partial depopulation, it might get + * populated or depopulated again. Mark the + * page gone. + */ + *pagep = NULL; + + unmap_start = unmap_start < 0 ? i : unmap_start; + unmap_end = i + 1; + } + } + + if (unmap_start >= 0) + pcpu_unmap(chunk, unmap_start, unmap_end, flush); +} + +/** + * pcpu_map - map pages into a pcpu_chunk + * @chunk: chunk of interest + * @page_start: page index of the first page to map + * @page_end: page index of the last page to map + 1 + * + * For each cpu, map pages [@page_start,@page_end) into @chunk. + * vcache is flushed afterwards. + */ +static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end) +{ + unsigned int last = nr_cpu_ids - 1; + unsigned int cpu; + int err; + + /* map must not be done on immutable chunk */ + WARN_ON(chunk->immutable); + + for_each_possible_cpu(cpu) { + err = map_kernel_range_noflush( + pcpu_chunk_addr(chunk, cpu, page_start), + (page_end - page_start) << PAGE_SHIFT, + PAGE_KERNEL, + pcpu_chunk_pagep(chunk, cpu, page_start)); + if (err < 0) + return err; + } + + /* flush at once, please read comments in pcpu_unmap() */ + flush_cache_vmap(pcpu_chunk_addr(chunk, 0, page_start), + pcpu_chunk_addr(chunk, last, page_end)); + return 0; +} + +/** + * pcpu_populate_chunk - populate and map an area of a pcpu_chunk + * @chunk: chunk of interest + * @off: offset to the area to populate + * @size: size of the area to populate in bytes + * + * For each cpu, populate and map pages [@page_start,@page_end) into + * @chunk. The area is cleared on return. + * + * CONTEXT: + * pcpu_alloc_mutex, does GFP_KERNEL allocation. + */ +static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) +{ + const gfp_t alloc_mask = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD; + int page_start = PFN_DOWN(off); + int page_end = PFN_UP(off + size); + int map_start = -1; + int uninitialized_var(map_end); + unsigned int cpu; + int i; + + for (i = page_start; i < page_end; i++) { + if (pcpu_chunk_page_occupied(chunk, i)) { + if (map_start >= 0) { + if (pcpu_map(chunk, map_start, map_end)) + goto err; + map_start = -1; + } + continue; + } + + map_start = map_start < 0 ? 
i : map_start; + map_end = i + 1; + + for_each_possible_cpu(cpu) { + struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i); + + *pagep = alloc_pages_node(cpu_to_node(cpu), + alloc_mask, 0); + if (!*pagep) + goto err; + pcpu_set_page_chunk(*pagep, chunk); + } + } + + if (map_start >= 0 && pcpu_map(chunk, map_start, map_end)) + goto err; + + for_each_possible_cpu(cpu) + memset(chunk->vm->addr + cpu * pcpu_unit_size + off, 0, + size); + + return 0; +err: + /* likely under heavy memory pressure, give memory back */ + pcpu_depopulate_chunk(chunk, off, size, true); + return -ENOMEM; +} + +static void free_pcpu_chunk(struct pcpu_chunk *chunk) +{ + if (!chunk) + return; + if (chunk->vm) + free_vm_area(chunk->vm); + pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0])); + kfree(chunk); +} + +static struct pcpu_chunk *alloc_pcpu_chunk(void) +{ + struct pcpu_chunk *chunk; + + chunk = kzalloc(pcpu_chunk_struct_size, GFP_KERNEL); + if (!chunk) + return NULL; + + chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0])); + chunk->map_alloc = PCPU_DFL_MAP_ALLOC; + chunk->map[chunk->map_used++] = pcpu_unit_size; + chunk->page = chunk->page_ar; + + chunk->vm = get_vm_area(pcpu_chunk_size, VM_ALLOC); + if (!chunk->vm) { + free_pcpu_chunk(chunk); + return NULL; + } + + INIT_LIST_HEAD(&chunk->list); + chunk->free_size = pcpu_unit_size; + chunk->contig_hint = pcpu_unit_size; + + return chunk; +} + +/** + * pcpu_alloc - the percpu allocator + * @size: size of area to allocate in bytes + * @align: alignment of area (max PAGE_SIZE) + * @reserved: allocate from the reserved chunk if available + * + * Allocate percpu area of @size bytes aligned at @align. + * + * CONTEXT: + * Does GFP_KERNEL allocation. + * + * RETURNS: + * Percpu pointer to the allocated area on success, NULL on failure. + */ +static void *pcpu_alloc(size_t size, size_t align, bool reserved) +{ + struct pcpu_chunk *chunk; + int slot, off; + + if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) { + WARN(true, "illegal size (%zu) or align (%zu) for " + "percpu allocation\n", size, align); + return NULL; + } + + mutex_lock(&pcpu_alloc_mutex); + spin_lock_irq(&pcpu_lock); + + /* serve reserved allocations from the reserved chunk if available */ + if (reserved && pcpu_reserved_chunk) { + chunk = pcpu_reserved_chunk; + if (size > chunk->contig_hint || + pcpu_extend_area_map(chunk) < 0) + goto fail_unlock; + off = pcpu_alloc_area(chunk, size, align); + if (off >= 0) + goto area_found; + goto fail_unlock; + } + +restart: + /* search through normal chunks */ + for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) { + list_for_each_entry(chunk, &pcpu_slot[slot], list) { + if (size > chunk->contig_hint) + continue; + + switch (pcpu_extend_area_map(chunk)) { + case 0: + break; + case 1: + goto restart; /* pcpu_lock dropped, restart */ + default: + goto fail_unlock; + } + + off = pcpu_alloc_area(chunk, size, align); + if (off >= 0) + goto area_found; + } + } + + /* hmmm... 
no space left, create a new chunk */ + spin_unlock_irq(&pcpu_lock); + + chunk = alloc_pcpu_chunk(); + if (!chunk) + goto fail_unlock_mutex; + + spin_lock_irq(&pcpu_lock); + pcpu_chunk_relocate(chunk, -1); + goto restart; + +area_found: + spin_unlock_irq(&pcpu_lock); + + /* populate, map and clear the area */ + if (pcpu_populate_chunk(chunk, off, size)) { + spin_lock_irq(&pcpu_lock); + pcpu_free_area(chunk, off); + goto fail_unlock; + } + + mutex_unlock(&pcpu_alloc_mutex); + + return __addr_to_pcpu_ptr(chunk->vm->addr + off); + +fail_unlock: + spin_unlock_irq(&pcpu_lock); +fail_unlock_mutex: + mutex_unlock(&pcpu_alloc_mutex); + return NULL; +} + +/** + * __alloc_percpu - allocate dynamic percpu area + * @size: size of area to allocate in bytes + * @align: alignment of area (max PAGE_SIZE) + * + * Allocate percpu area of @size bytes aligned at @align. Might + * sleep. Might trigger writeouts. + * + * CONTEXT: + * Does GFP_KERNEL allocation. + * + * RETURNS: + * Percpu pointer to the allocated area on success, NULL on failure. + */ +void *__alloc_percpu(size_t size, size_t align) +{ + return pcpu_alloc(size, align, false); +} +EXPORT_SYMBOL_GPL(__alloc_percpu); + +/** + * __alloc_reserved_percpu - allocate reserved percpu area + * @size: size of area to allocate in bytes + * @align: alignment of area (max PAGE_SIZE) + * + * Allocate percpu area of @size bytes aligned at @align from reserved + * percpu area if arch has set it up; otherwise, allocation is served + * from the same dynamic area. Might sleep. Might trigger writeouts. + * + * CONTEXT: + * Does GFP_KERNEL allocation. + * + * RETURNS: + * Percpu pointer to the allocated area on success, NULL on failure. + */ +void *__alloc_reserved_percpu(size_t size, size_t align) +{ + return pcpu_alloc(size, align, true); +} + +/** + * pcpu_reclaim - reclaim fully free chunks, workqueue function + * @work: unused + * + * Reclaim all fully free chunks except for the first one. + * + * CONTEXT: + * workqueue context. + */ +static void pcpu_reclaim(struct work_struct *work) +{ + LIST_HEAD(todo); + struct list_head *head = &pcpu_slot[pcpu_nr_slots - 1]; + struct pcpu_chunk *chunk, *next; + + mutex_lock(&pcpu_alloc_mutex); + spin_lock_irq(&pcpu_lock); + + list_for_each_entry_safe(chunk, next, head, list) { + WARN_ON(chunk->immutable); + + /* spare the first one */ + if (chunk == list_first_entry(head, struct pcpu_chunk, list)) + continue; + + list_move(&chunk->list, &todo); + } + + spin_unlock_irq(&pcpu_lock); + mutex_unlock(&pcpu_alloc_mutex); + + list_for_each_entry_safe(chunk, next, &todo, list) { + pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false); + free_pcpu_chunk(chunk); + } +} + +/** + * free_percpu - free percpu area + * @ptr: pointer to area to free + * + * Free percpu area @ptr. + * + * CONTEXT: + * Can be called from atomic context. 
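For reference, a module-style sketch of how the dynamic interface added here is meant to be consumed; struct pktstats and the pktstats_* functions are invented for illustration, while alloc_percpu()/__alloc_percpu(), per_cpu_ptr(), get_cpu()/put_cpu() and free_percpu() are the actual interfaces:

#include <linux/percpu.h>
#include <linux/cpumask.h>
#include <linux/smp.h>
#include <linux/errno.h>

struct pktstats {                       /* invented example structure */
        unsigned long rx;
        unsigned long tx;
};

static struct pktstats *stats;

static int pktstats_init(void)
{
        /* alloc_percpu() wraps __alloc_percpu(sizeof(type), __alignof__(type)) */
        stats = alloc_percpu(struct pktstats);
        return stats ? 0 : -ENOMEM;
}

static void pktstats_account_rx(void)
{
        /* pin this CPU so we only ever touch our own copy; no lock needed */
        per_cpu_ptr(stats, get_cpu())->rx++;
        put_cpu();
}

static unsigned long pktstats_total_rx(void)
{
        unsigned long sum = 0;
        int cpu;

        for_each_possible_cpu(cpu)
                sum += per_cpu_ptr(stats, cpu)->rx;
        return sum;
}

static void pktstats_exit(void)
{
        free_percpu(stats);             /* safe even from atomic context */
}

Each CPU increments only its own copy, so the hot path needs no locking; the cross-CPU sum tolerates the slight raciness that statistics counters usually accept.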
+ */ +void free_percpu(void *ptr) +{ + void *addr = __pcpu_ptr_to_addr(ptr); + struct pcpu_chunk *chunk; + unsigned long flags; + int off; + + if (!ptr) + return; + + spin_lock_irqsave(&pcpu_lock, flags); + + chunk = pcpu_chunk_addr_search(addr); + off = addr - chunk->vm->addr; + + pcpu_free_area(chunk, off); + + /* if there are more than one fully free chunks, wake up grim reaper */ + if (chunk->free_size == pcpu_unit_size) { + struct pcpu_chunk *pos; + + list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list) + if (pos != chunk) { + schedule_work(&pcpu_reclaim_work); + break; + } + } + + spin_unlock_irqrestore(&pcpu_lock, flags); +} +EXPORT_SYMBOL_GPL(free_percpu); + +/** + * pcpu_setup_first_chunk - initialize the first percpu chunk + * @get_page_fn: callback to fetch page pointer + * @static_size: the size of static percpu area in bytes + * @reserved_size: the size of reserved percpu area in bytes + * @dyn_size: free size for dynamic allocation in bytes, -1 for auto + * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto + * @base_addr: mapped address, NULL for auto + * @populate_pte_fn: callback to allocate pagetable, NULL if unnecessary + * + * Initialize the first percpu chunk which contains the kernel static + * perpcu area. This function is to be called from arch percpu area + * setup path. The first two parameters are mandatory. The rest are + * optional. + * + * @get_page_fn() should return pointer to percpu page given cpu + * number and page number. It should at least return enough pages to + * cover the static area. The returned pages for static area should + * have been initialized with valid data. If @unit_size is specified, + * it can also return pages after the static area. NULL return + * indicates end of pages for the cpu. Note that @get_page_fn() must + * return the same number of pages for all cpus. + * + * @reserved_size, if non-zero, specifies the amount of bytes to + * reserve after the static area in the first chunk. This reserves + * the first chunk such that it's available only through reserved + * percpu allocation. This is primarily used to serve module percpu + * static areas on architectures where the addressing model has + * limited offset range for symbol relocations to guarantee module + * percpu symbols fall inside the relocatable range. + * + * @dyn_size, if non-negative, determines the number of bytes + * available for dynamic allocation in the first chunk. Specifying + * non-negative value makes percpu leave alone the area beyond + * @static_size + @reserved_size + @dyn_size. + * + * @unit_size, if non-negative, specifies unit size and must be + * aligned to PAGE_SIZE and equal to or larger than @static_size + + * @reserved_size + if non-negative, @dyn_size. + * + * Non-null @base_addr means that the caller already allocated virtual + * region for the first chunk and mapped it. percpu must not mess + * with the chunk. Note that @base_addr with 0 @unit_size or non-NULL + * @populate_pte_fn doesn't make any sense. + * + * @populate_pte_fn is used to populate the pagetable. NULL means the + * caller already populated the pagetable. + * + * If the first chunk ends up with both reserved and dynamic areas, it + * is served by two chunks - one to serve the core static and reserved + * areas and the other for the dynamic area. They share the same vm + * and page map but uses different area allocation map to stay away + * from each other. 
The latter chunk is circulated in the chunk slots + * and available for dynamic allocation like any other chunks. + * + * RETURNS: + * The determined pcpu_unit_size which can be used to initialize + * percpu access. + */ +size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, + size_t static_size, size_t reserved_size, + ssize_t dyn_size, ssize_t unit_size, + void *base_addr, + pcpu_populate_pte_fn_t populate_pte_fn) +{ + static struct vm_struct first_vm; + static int smap[2], dmap[2]; + size_t size_sum = static_size + reserved_size + + (dyn_size >= 0 ? dyn_size : 0); + struct pcpu_chunk *schunk, *dchunk = NULL; + unsigned int cpu; + int nr_pages; + int err, i; + + /* santiy checks */ + BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC || + ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC); + BUG_ON(!static_size); + if (unit_size >= 0) { + BUG_ON(unit_size < size_sum); + BUG_ON(unit_size & ~PAGE_MASK); + BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE); + } else + BUG_ON(base_addr); + BUG_ON(base_addr && populate_pte_fn); + + if (unit_size >= 0) + pcpu_unit_pages = unit_size >> PAGE_SHIFT; + else + pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT, + PFN_UP(size_sum)); + + pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; + pcpu_chunk_size = nr_cpu_ids * pcpu_unit_size; + pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) + + nr_cpu_ids * pcpu_unit_pages * sizeof(struct page *); + + if (dyn_size < 0) + dyn_size = pcpu_unit_size - static_size - reserved_size; + + /* + * Allocate chunk slots. The additional last slot is for + * empty chunks. + */ + pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2; + pcpu_slot = alloc_bootmem(pcpu_nr_slots * sizeof(pcpu_slot[0])); + for (i = 0; i < pcpu_nr_slots; i++) + INIT_LIST_HEAD(&pcpu_slot[i]); + + /* + * Initialize static chunk. If reserved_size is zero, the + * static chunk covers static area + dynamic allocation area + * in the first chunk. If reserved_size is not zero, it + * covers static area + reserved area (mostly used for module + * static percpu allocation). + */ + schunk = alloc_bootmem(pcpu_chunk_struct_size); + INIT_LIST_HEAD(&schunk->list); + schunk->vm = &first_vm; + schunk->map = smap; + schunk->map_alloc = ARRAY_SIZE(smap); + schunk->page = schunk->page_ar; + + if (reserved_size) { + schunk->free_size = reserved_size; + pcpu_reserved_chunk = schunk; + pcpu_reserved_chunk_limit = static_size + reserved_size; + } else { + schunk->free_size = dyn_size; + dyn_size = 0; /* dynamic area covered */ + } + schunk->contig_hint = schunk->free_size; + + schunk->map[schunk->map_used++] = -static_size; + if (schunk->free_size) + schunk->map[schunk->map_used++] = schunk->free_size; + + /* init dynamic chunk if necessary */ + if (dyn_size) { + dchunk = alloc_bootmem(sizeof(struct pcpu_chunk)); + INIT_LIST_HEAD(&dchunk->list); + dchunk->vm = &first_vm; + dchunk->map = dmap; + dchunk->map_alloc = ARRAY_SIZE(dmap); + dchunk->page = schunk->page_ar; /* share page map with schunk */ + + dchunk->contig_hint = dchunk->free_size = dyn_size; + dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit; + dchunk->map[dchunk->map_used++] = dchunk->free_size; + } + + /* allocate vm address */ + first_vm.flags = VM_ALLOC; + first_vm.size = pcpu_chunk_size; + + if (!base_addr) + vm_area_register_early(&first_vm, PAGE_SIZE); + else { + /* + * Pages already mapped. No need to remap into + * vmalloc area. In this case the first chunks can't + * be mapped or unmapped by percpu and are marked + * immutable. 
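A worked example of how pcpu_setup_first_chunk() above carves the first chunk when both a reserved and a dynamic area are wanted. The sizes are invented (20 KiB static, 8 KiB reserved, dyn_size and unit_size on auto) and PCPU_MIN_UNIT_SIZE is assumed to be 32 KiB for the sketch; the map layout follows the patch, with negative entries meaning "allocated":

#include <stdio.h>

#define PAGE_SIZE               4096UL
#define PCPU_MIN_UNIT_SIZE      (32UL << 10)    /* value assumed for the sketch */

int main(void)
{
        unsigned long static_size = 20UL << 10;         /* invented static area */
        unsigned long reserved_size = 8UL << 10;        /* invented module reserve */
        unsigned long size_sum = static_size + reserved_size;  /* dyn_size on auto */
        unsigned long unit_pages, unit_size, dyn_size;

        unit_pages = (size_sum + PAGE_SIZE - 1) / PAGE_SIZE;
        if (unit_pages < PCPU_MIN_UNIT_SIZE / PAGE_SIZE)
                unit_pages = PCPU_MIN_UNIT_SIZE / PAGE_SIZE;
        unit_size = unit_pages * PAGE_SIZE;
        dyn_size = unit_size - static_size - reserved_size;

        printf("unit_size = %lu bytes (%lu pages)\n", unit_size, unit_pages);
        printf("static chunk map : { %ld, %lu }  /* static used, reserved free */\n",
               -(long)static_size, reserved_size);
        printf("dynamic chunk map: { %ld, %lu }  /* static+reserved used, dynamic free */\n",
               -(long)size_sum, dyn_size);
        return 0;
}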
+ */ + first_vm.addr = base_addr; + schunk->immutable = true; + if (dchunk) + dchunk->immutable = true; + } + + /* assign pages */ + nr_pages = -1; + for_each_possible_cpu(cpu) { + for (i = 0; i < pcpu_unit_pages; i++) { + struct page *page = get_page_fn(cpu, i); + + if (!page) + break; + *pcpu_chunk_pagep(schunk, cpu, i) = page; + } + + BUG_ON(i < PFN_UP(static_size)); + + if (nr_pages < 0) + nr_pages = i; + else + BUG_ON(nr_pages != i); + } + + /* map them */ + if (populate_pte_fn) { + for_each_possible_cpu(cpu) + for (i = 0; i < nr_pages; i++) + populate_pte_fn(pcpu_chunk_addr(schunk, + cpu, i)); + + err = pcpu_map(schunk, 0, nr_pages); + if (err) + panic("failed to setup static percpu area, err=%d\n", + err); + } + + /* link the first chunk in */ + pcpu_first_chunk = dchunk ?: schunk; + pcpu_chunk_relocate(pcpu_first_chunk, -1); + + /* we're done */ + pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0); + return pcpu_unit_size; +} + +/* + * Embedding first chunk setup helper. + */ +static void *pcpue_ptr __initdata; +static size_t pcpue_size __initdata; +static size_t pcpue_unit_size __initdata; + +static struct page * __init pcpue_get_page(unsigned int cpu, int pageno) +{ + size_t off = (size_t)pageno << PAGE_SHIFT; + + if (off >= pcpue_size) + return NULL; + + return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size + off); +} + +/** + * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem + * @static_size: the size of static percpu area in bytes + * @reserved_size: the size of reserved percpu area in bytes + * @dyn_size: free size for dynamic allocation in bytes, -1 for auto + * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto + * + * This is a helper to ease setting up embedded first percpu chunk and + * can be called where pcpu_setup_first_chunk() is expected. + * + * If this function is used to setup the first chunk, it is allocated + * as a contiguous area using bootmem allocator and used as-is without + * being mapped into vmalloc area. This enables the first chunk to + * piggy back on the linear physical mapping which often uses larger + * page size. + * + * When @dyn_size is positive, dynamic area might be larger than + * specified to fill page alignment. Also, when @dyn_size is auto, + * @dyn_size does not fill the whole first chunk but only what's + * necessary for page alignment after static and reserved areas. + * + * If the needed size is smaller than the minimum or specified unit + * size, the leftover is returned to the bootmem allocator. + * + * RETURNS: + * The determined pcpu_unit_size which can be used to initialize + * percpu access on success, -errno on failure. + */ +ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, + ssize_t dyn_size, ssize_t unit_size) +{ + size_t chunk_size; + unsigned int cpu; + + /* determine parameters and allocate */ + pcpue_size = PFN_ALIGN(static_size + reserved_size + + (dyn_size >= 0 ? 
dyn_size : 0)); + if (dyn_size != 0) + dyn_size = pcpue_size - static_size - reserved_size; + + if (unit_size >= 0) { + BUG_ON(unit_size < pcpue_size); + pcpue_unit_size = unit_size; + } else + pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE); + + chunk_size = pcpue_unit_size * nr_cpu_ids; + + pcpue_ptr = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE, + __pa(MAX_DMA_ADDRESS)); + if (!pcpue_ptr) { + pr_warning("PERCPU: failed to allocate %zu bytes for " + "embedding\n", chunk_size); + return -ENOMEM; + } + + /* return the leftover and copy */ + for (cpu = 0; cpu < nr_cpu_ids; cpu++) { + void *ptr = pcpue_ptr + cpu * pcpue_unit_size; + + if (cpu_possible(cpu)) { + free_bootmem(__pa(ptr + pcpue_size), + pcpue_unit_size - pcpue_size); + memcpy(ptr, __per_cpu_load, static_size); + } else + free_bootmem(__pa(ptr), pcpue_unit_size); + } + + /* we're ready, commit */ + pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n", + pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size); + + return pcpu_setup_first_chunk(pcpue_get_page, static_size, + reserved_size, dyn_size, + pcpue_unit_size, pcpue_ptr, NULL); +} diff --git a/mm/quicklist.c b/mm/quicklist.c index 8dbb6805ef3..e66d07d1b4f 100644 --- a/mm/quicklist.c +++ b/mm/quicklist.c @@ -29,7 +29,7 @@ static unsigned long max_pages(unsigned long min_pages) int node = numa_node_id(); struct zone *zones = NODE_DATA(node)->node_zones; int num_cpus_on_node; - node_to_cpumask_ptr(cpumask_on_node, node); + const struct cpumask *cpumask_on_node = cpumask_of_node(node); node_free_pages = #ifdef CONFIG_ZONE_DMA diff --git a/mm/readahead.c b/mm/readahead.c index bec83c15a78..aa1aa234523 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -17,19 +17,6 @@ #include <linux/pagevec.h> #include <linux/pagemap.h> -void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) -{ -} -EXPORT_SYMBOL(default_unplug_io_fn); - -struct backing_dev_info default_backing_dev_info = { - .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE, - .state = 0, - .capabilities = BDI_CAP_MAP_COPY, - .unplug_io_fn = default_unplug_io_fn, -}; -EXPORT_SYMBOL_GPL(default_backing_dev_info); - /* * Initialise a struct file's readahead state. Assumes that the caller has * memset *ra to zero. 
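Stepping back to the percpu embedding helper above: to make the sizing rules of pcpu_embed_first_chunk() concrete, here is a minimal userspace sketch of the same arithmetic — page-align static + reserved (+ dyn when given), let an auto dyn_size absorb the alignment slack, round the unit up to the minimum unit size, and compute the per-CPU leftover that the helper hands back to the bootmem allocator. The numeric values (4 KiB pages, a 64 KiB minimum unit, 4 CPUs, the example static/reserved sizes) are assumptions for illustration only; they are not taken from the patch.

/*
 * Stand-alone sketch of the pcpu_embed_first_chunk() sizing rules.
 * All constants below are illustrative assumptions, not kernel values.
 */
#include <stdio.h>
#include <stddef.h>

#define PAGE_SIZE       4096UL
#define PAGE_ALIGN(x)   (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))
#define MIN_UNIT_SIZE   (64UL * 1024)   /* assumed stand-in for PCPU_MIN_UNIT_SIZE */

int main(void)
{
        size_t static_size = 45000;     /* example .data..percpu image size */
        size_t reserved_size = 8192;    /* example module reserve */
        long dyn_size = -1;             /* -1: auto, as in the helper */
        unsigned int nr_cpus = 4;

        /* pcpue_size: static + reserved (+ dyn), rounded up to a page */
        size_t pcpue_size = PAGE_ALIGN(static_size + reserved_size +
                                       (dyn_size >= 0 ? dyn_size : 0));
        if (dyn_size != 0)
                dyn_size = pcpue_size - static_size - reserved_size;

        /* unit covers pcpue_size but never drops below the minimum unit */
        size_t unit = pcpue_size > MIN_UNIT_SIZE ? pcpue_size : MIN_UNIT_SIZE;
        size_t chunk = unit * nr_cpus;          /* one contiguous bootmem block */
        size_t leftover = unit - pcpue_size;    /* returned to bootmem per CPU */

        printf("pcpue_size=%zu dyn=%ld unit=%zu chunk=%zu leftover/cpu=%zu\n",
               pcpue_size, dyn_size, unit, chunk, leftover);
        return 0;
}

With these example inputs (a ~44 KiB static image plus an 8 KiB reserve), the unit rounds up to the assumed 64 KiB minimum, the whole first chunk is a single 256 KiB bootmem block, and roughly 12 KiB per CPU goes back to the bootmem allocator — the same shape of result the helper reports in its "Embedded %zu pages" message.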
@@ -44,6 +31,42 @@ EXPORT_SYMBOL_GPL(file_ra_state_init); #define list_to_page(head) (list_entry((head)->prev, struct page, lru)) +/* + * see if a page needs releasing upon read_cache_pages() failure + * - the caller of read_cache_pages() may have set PG_private or PG_fscache + * before calling, such as the NFS fs marking pages that are cached locally + * on disk, thus we need to give the fs a chance to clean up in the event of + * an error + */ +static void read_cache_pages_invalidate_page(struct address_space *mapping, + struct page *page) +{ + if (page_has_private(page)) { + if (!trylock_page(page)) + BUG(); + page->mapping = mapping; + do_invalidatepage(page, 0); + page->mapping = NULL; + unlock_page(page); + } + page_cache_release(page); +} + +/* + * release a list of pages, invalidating them first if need be + */ +static void read_cache_pages_invalidate_pages(struct address_space *mapping, + struct list_head *pages) +{ + struct page *victim; + + while (!list_empty(pages)) { + victim = list_to_page(pages); + list_del(&victim->lru); + read_cache_pages_invalidate_page(mapping, victim); + } +} + /** * read_cache_pages - populate an address space with some pages & start reads against them * @mapping: the address_space @@ -65,14 +88,14 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages, list_del(&page->lru); if (add_to_page_cache_lru(page, mapping, page->index, GFP_KERNEL)) { - page_cache_release(page); + read_cache_pages_invalidate_page(mapping, page); continue; } page_cache_release(page); ret = filler(data, page); if (unlikely(ret)) { - put_pages_list(pages); + read_cache_pages_invalidate_pages(mapping, pages); break; } task_io_account_read(PAGE_CACHE_SIZE); @@ -110,15 +133,12 @@ out: } /* - * do_page_cache_readahead actually reads a chunk of disk. It allocates all + * __do_page_cache_readahead() actually reads a chunk of disk. It allocates all * the pages first, then submits them all for I/O. This avoids the very bad * behaviour which would occur if page allocations are causing VM writeback. * We really don't want to intermingle reads and writes like that. * * Returns the number of pages requested, or the maximum amount of I/O allowed. - * - * do_page_cache_readahead() returns -1 if it encountered request queue - * congestion. */ static int __do_page_cache_readahead(struct address_space *mapping, struct file *filp, @@ -187,6 +207,7 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp, if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages)) return -EINVAL; + nr_to_read = max_sane_readahead(nr_to_read); while (nr_to_read) { int err; @@ -208,22 +229,6 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp, } /* - * This version skips the IO if the queue is read-congested, and will tell the - * block layer to abandon the readahead if request allocation would block. - * - * force_page_cache_readahead() will ignore queue congestion and will block on - * request queues. - */ -int do_page_cache_readahead(struct address_space *mapping, struct file *filp, - pgoff_t offset, unsigned long nr_to_read) -{ - if (bdi_read_congested(mapping->backing_dev_info)) - return -1; - - return __do_page_cache_readahead(mapping, filp, offset, nr_to_read, 0); -} - -/* * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a * sensible upper limit. 
*/ @@ -233,22 +238,10 @@ unsigned long max_sane_readahead(unsigned long nr) + node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2); } -static int __init readahead_init(void) -{ - int err; - - err = bdi_init(&default_backing_dev_info); - if (!err) - bdi_register(&default_backing_dev_info, NULL, "default"); - - return err; -} -subsys_initcall(readahead_init); - /* * Submit IO for the read-ahead request in file_ra_state. */ -static unsigned long ra_submit(struct file_ra_state *ra, +unsigned long ra_submit(struct file_ra_state *ra, struct address_space *mapping, struct file *filp) { int actual; @@ -337,6 +330,59 @@ static unsigned long get_next_ra_size(struct file_ra_state *ra, */ /* + * Count contiguously cached pages from @offset-1 to @offset-@max, + * this count is a conservative estimation of + * - length of the sequential read sequence, or + * - thrashing threshold in memory tight systems + */ +static pgoff_t count_history_pages(struct address_space *mapping, + struct file_ra_state *ra, + pgoff_t offset, unsigned long max) +{ + pgoff_t head; + + rcu_read_lock(); + head = radix_tree_prev_hole(&mapping->page_tree, offset - 1, max); + rcu_read_unlock(); + + return offset - 1 - head; +} + +/* + * page cache context based read-ahead + */ +static int try_context_readahead(struct address_space *mapping, + struct file_ra_state *ra, + pgoff_t offset, + unsigned long req_size, + unsigned long max) +{ + pgoff_t size; + + size = count_history_pages(mapping, ra, offset, max); + + /* + * no history pages: + * it could be a random read + */ + if (!size) + return 0; + + /* + * starts from beginning of file: + * it is a strong indication of long-run stream (or whole-file-read) + */ + if (size >= offset) + size *= 2; + + ra->start = offset; + ra->size = get_init_ra_size(size + req_size, max); + ra->async_size = ra->size; + + return 1; +} + +/* * A minimal readahead algorithm for trivial sequential/random reads. */ static unsigned long @@ -345,34 +391,26 @@ ondemand_readahead(struct address_space *mapping, bool hit_readahead_marker, pgoff_t offset, unsigned long req_size) { - int max = ra->ra_pages; /* max readahead pages */ - pgoff_t prev_offset; - int sequential; + unsigned long max = max_sane_readahead(ra->ra_pages); + + /* + * start of file + */ + if (!offset) + goto initial_readahead; /* * It's the expected callback offset, assume sequential access. * Ramp up sizes, and push forward the readahead window. */ - if (offset && (offset == (ra->start + ra->size - ra->async_size) || - offset == (ra->start + ra->size))) { + if ((offset == (ra->start + ra->size - ra->async_size) || + offset == (ra->start + ra->size))) { ra->start += ra->size; ra->size = get_next_ra_size(ra, max); ra->async_size = ra->size; goto readit; } - prev_offset = ra->prev_pos >> PAGE_CACHE_SHIFT; - sequential = offset - prev_offset <= 1UL || req_size > max; - - /* - * Standalone, small read. - * Read as is, and do not pollute the readahead state. - */ - if (!hit_readahead_marker && !sequential) { - return __do_page_cache_readahead(mapping, filp, - offset, req_size, 0); - } - /* * Hit a marked page without valid readahead state. * E.g. interleaved reads. 
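The context readahead added in the hunks above infers a sequential stream from what is already cached: count_history_pages() walks backwards from offset-1 counting contiguously cached pages, no history is taken as a random read, and a history reaching the start of the file is treated as a strong whole-file-read hint and doubled before seeding the window. The sketch below models that heuristic in plain userspace C; the cached[] array stands in for the page-cache radix tree (which the kernel walks under rcu_read_lock() via radix_tree_prev_hole()), and init_ra_size() is a simplified stand-in for get_init_ra_size(), so treat it as an illustration of the sizing rules rather than the kernel code.

/*
 * Toy model of try_context_readahead().  cached[] replaces the radix
 * tree and init_ra_size() is a crude stand-in for get_init_ra_size().
 */
#include <stdio.h>

#define FILE_PAGES 64

static int cached[FILE_PAGES];          /* 1 = page already in page cache */

/* count contiguously cached pages ending at offset-1, up to max */
static unsigned long count_history(unsigned long offset, unsigned long max)
{
        unsigned long n = 0;

        while (n < max && offset >= n + 1 && cached[offset - 1 - n])
                n++;
        return n;
}

/* simplified stand-in for get_init_ra_size(): just clamp to max */
static unsigned long init_ra_size(unsigned long size, unsigned long max)
{
        return size < max ? size : max;
}

static void context_readahead(unsigned long offset, unsigned long req_size,
                              unsigned long max)
{
        unsigned long size = count_history(offset, max);

        if (!size) {                    /* no history: likely a random read */
                printf("offset %lu: no history, no readahead\n", offset);
                return;
        }
        if (size >= offset)             /* history reaches start of file */
                size *= 2;

        printf("offset %lu: estimated stream=%lu -> window start=%lu size=%lu\n",
               offset, size, offset, init_ra_size(size + req_size, max));
}

int main(void)
{
        unsigned long i;

        for (i = 0; i < 8; i++)         /* pages 0..7 already cached */
                cached[i] = 1;

        context_readahead(8, 1, 32);    /* sequential stream resuming at page 8 */
        context_readahead(40, 1, 32);   /* cold region: treated as random */
        return 0;
}

Running it, the read at page 8 sits behind eight cached pages whose history reaches offset 0, so the estimate is doubled and seeds a readahead window of 17 pages starting at the current offset, while the read at page 40 finds no history and falls through — the same split between the initial_readahead/context path and the standalone small-random-read path in the reworked ondemand_readahead().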
@@ -383,7 +421,7 @@ ondemand_readahead(struct address_space *mapping, pgoff_t start; rcu_read_lock(); - start = radix_tree_next_hole(&mapping->page_tree, offset,max+1); + start = radix_tree_next_hole(&mapping->page_tree, offset+1,max); rcu_read_unlock(); if (!start || start - offset > max) @@ -391,23 +429,53 @@ ondemand_readahead(struct address_space *mapping, ra->start = start; ra->size = start - offset; /* old async_size */ + ra->size += req_size; ra->size = get_next_ra_size(ra, max); ra->async_size = ra->size; goto readit; } /* - * It may be one of - * - first read on start of file - * - sequential cache miss - * - oversize random read - * Start readahead for it. + * oversize read + */ + if (req_size > max) + goto initial_readahead; + + /* + * sequential cache miss */ + if (offset - (ra->prev_pos >> PAGE_CACHE_SHIFT) <= 1UL) + goto initial_readahead; + + /* + * Query the page cache and look for the traces(cached history pages) + * that a sequential stream would leave behind. + */ + if (try_context_readahead(mapping, ra, offset, req_size, max)) + goto readit; + + /* + * standalone, small random read + * Read as is, and do not pollute the readahead state. + */ + return __do_page_cache_readahead(mapping, filp, offset, req_size, 0); + +initial_readahead: ra->start = offset; ra->size = get_init_ra_size(req_size, max); ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size; readit: + /* + * Will this read hit the readahead marker made by itself? + * If so, trigger the readahead marker hit now, and merge + * the resulted next readahead window into the current one. + */ + if (offset == ra->start && ra->size == ra->async_size) { + ra->async_size = get_next_ra_size(ra, max); + ra->size += ra->async_size; + } + return ra_submit(ra, mapping, filp); } diff --git a/mm/rmap.c b/mm/rmap.c index ac4af8cffbf..0895b5c7cbf 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -14,7 +14,7 @@ * Original design by Rik van Riel <riel@conectiva.com.br> 2001 * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004 * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004 - * Contributions by Hugh Dickins <hugh@veritas.com> 2003, 2004 + * Contributions by Hugh Dickins 2003, 2004 */ /* @@ -333,7 +333,9 @@ static int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) * repeatedly from either page_referenced_anon or page_referenced_file. 
*/ static int page_referenced_one(struct page *page, - struct vm_area_struct *vma, unsigned int *mapcount) + struct vm_area_struct *vma, + unsigned int *mapcount, + unsigned long *vm_flags) { struct mm_struct *mm = vma->vm_mm; unsigned long address; @@ -356,6 +358,7 @@ static int page_referenced_one(struct page *page, */ if (vma->vm_flags & VM_LOCKED) { *mapcount = 1; /* break early from loop */ + *vm_flags |= VM_LOCKED; goto out_unmap; } @@ -381,11 +384,14 @@ out_unmap: (*mapcount)--; pte_unmap_unlock(pte, ptl); out: + if (referenced) + *vm_flags |= vma->vm_flags; return referenced; } static int page_referenced_anon(struct page *page, - struct mem_cgroup *mem_cont) + struct mem_cgroup *mem_cont, + unsigned long *vm_flags) { unsigned int mapcount; struct anon_vma *anon_vma; @@ -405,7 +411,8 @@ static int page_referenced_anon(struct page *page, */ if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) continue; - referenced += page_referenced_one(page, vma, &mapcount); + referenced += page_referenced_one(page, vma, + &mapcount, vm_flags); if (!mapcount) break; } @@ -418,6 +425,7 @@ static int page_referenced_anon(struct page *page, * page_referenced_file - referenced check for object-based rmap * @page: the page we're checking references on. * @mem_cont: target memory controller + * @vm_flags: collect encountered vma->vm_flags who actually referenced the page * * For an object-based mapped page, find all the places it is mapped and * check/clear the referenced flag. This is done by following the page->mapping @@ -427,7 +435,8 @@ static int page_referenced_anon(struct page *page, * This function is only called from page_referenced for object-based pages. */ static int page_referenced_file(struct page *page, - struct mem_cgroup *mem_cont) + struct mem_cgroup *mem_cont, + unsigned long *vm_flags) { unsigned int mapcount; struct address_space *mapping = page->mapping; @@ -467,7 +476,8 @@ static int page_referenced_file(struct page *page, */ if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) continue; - referenced += page_referenced_one(page, vma, &mapcount); + referenced += page_referenced_one(page, vma, + &mapcount, vm_flags); if (!mapcount) break; } @@ -481,29 +491,35 @@ static int page_referenced_file(struct page *page, * @page: the page to test * @is_locked: caller holds lock on the page * @mem_cont: target memory controller + * @vm_flags: collect encountered vma->vm_flags who actually referenced the page * * Quick test_and_clear_referenced for all mappings to a page, * returns the number of ptes which referenced the page. 
*/ -int page_referenced(struct page *page, int is_locked, - struct mem_cgroup *mem_cont) +int page_referenced(struct page *page, + int is_locked, + struct mem_cgroup *mem_cont, + unsigned long *vm_flags) { int referenced = 0; if (TestClearPageReferenced(page)) referenced++; + *vm_flags = 0; if (page_mapped(page) && page->mapping) { if (PageAnon(page)) - referenced += page_referenced_anon(page, mem_cont); + referenced += page_referenced_anon(page, mem_cont, + vm_flags); else if (is_locked) - referenced += page_referenced_file(page, mem_cont); + referenced += page_referenced_file(page, mem_cont, + vm_flags); else if (!trylock_page(page)) referenced++; else { if (page->mapping) - referenced += - page_referenced_file(page, mem_cont); + referenced += page_referenced_file(page, + mem_cont, vm_flags); unlock_page(page); } } @@ -688,8 +704,10 @@ void page_add_new_anon_rmap(struct page *page, */ void page_add_file_rmap(struct page *page) { - if (atomic_inc_and_test(&page->_mapcount)) + if (atomic_inc_and_test(&page->_mapcount)) { __inc_zone_page_state(page, NR_FILE_MAPPED); + mem_cgroup_update_mapped_file_stat(page, 1); + } } #ifdef CONFIG_DEBUG_VM @@ -738,6 +756,7 @@ void page_remove_rmap(struct page *page) mem_cgroup_uncharge_page(page); __dec_zone_page_state(page, PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED); + mem_cgroup_update_mapped_file_stat(page, -1); /* * It would be tidy to reset the PageAnon mapping here, * but that might overwrite a racing page_add_anon_rmap @@ -1072,7 +1091,8 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration) spin_lock(&mapping->i_mmap_lock); vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { if (MLOCK_PAGES && unlikely(unlock)) { - if (!(vma->vm_flags & VM_LOCKED)) + if (!((vma->vm_flags & VM_LOCKED) && + page_mapped_in_vma(page, vma))) continue; /* must visit all vmas */ ret = SWAP_MLOCK; } else { @@ -1201,7 +1221,6 @@ int try_to_unmap(struct page *page, int migration) return ret; } -#ifdef CONFIG_UNEVICTABLE_LRU /** * try_to_munlock - try to munlock a page * @page: the page to be munlocked @@ -1225,4 +1244,4 @@ int try_to_munlock(struct page *page) else return try_to_unmap_file(page, 1, 0); } -#endif + diff --git a/mm/shmem.c b/mm/shmem.c index 5d0de96c978..5a0b3d4055f 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -24,10 +24,12 @@ #include <linux/init.h> #include <linux/vfs.h> #include <linux/mount.h> +#include <linux/pagemap.h> #include <linux/file.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/swap.h> +#include <linux/ima.h> static struct vfsmount *shm_mnt; @@ -42,7 +44,6 @@ static struct vfsmount *shm_mnt; #include <linux/exportfs.h> #include <linux/generic_acl.h> #include <linux/mman.h> -#include <linux/pagemap.h> #include <linux/string.h> #include <linux/slab.h> #include <linux/backing-dev.h> @@ -64,13 +65,28 @@ static struct vfsmount *shm_mnt; #include <asm/div64.h> #include <asm/pgtable.h> +/* + * The maximum size of a shmem/tmpfs file is limited by the maximum size of + * its triple-indirect swap vector - see illustration at shmem_swp_entry(). + * + * With 4kB page size, maximum file size is just over 2TB on a 32-bit kernel, + * but one eighth of that on a 64-bit kernel. With 8kB page size, maximum + * file size is just over 4TB on a 64-bit kernel, but 16TB on a 32-bit kernel, + * MAX_LFS_FILESIZE being then more restrictive than swap vector layout. + * + * We use / and * instead of shifts in the definitions below, so that the swap + * vector can be tested with small even values (e.g. 
20) for ENTRIES_PER_PAGE. + */ #define ENTRIES_PER_PAGE (PAGE_CACHE_SIZE/sizeof(unsigned long)) -#define ENTRIES_PER_PAGEPAGE (ENTRIES_PER_PAGE*ENTRIES_PER_PAGE) -#define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512) +#define ENTRIES_PER_PAGEPAGE ((unsigned long long)ENTRIES_PER_PAGE*ENTRIES_PER_PAGE) + +#define SHMSWP_MAX_INDEX (SHMEM_NR_DIRECT + (ENTRIES_PER_PAGEPAGE/2) * (ENTRIES_PER_PAGE+1)) +#define SHMSWP_MAX_BYTES (SHMSWP_MAX_INDEX << PAGE_CACHE_SHIFT) -#define SHMEM_MAX_INDEX (SHMEM_NR_DIRECT + (ENTRIES_PER_PAGEPAGE/2) * (ENTRIES_PER_PAGE+1)) -#define SHMEM_MAX_BYTES ((unsigned long long)SHMEM_MAX_INDEX << PAGE_CACHE_SHIFT) +#define SHMEM_MAX_BYTES min_t(unsigned long long, SHMSWP_MAX_BYTES, MAX_LFS_FILESIZE) +#define SHMEM_MAX_INDEX ((unsigned long)((SHMEM_MAX_BYTES+1) >> PAGE_CACHE_SHIFT)) +#define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512) #define VM_ACCT(size) (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT) /* info->flags needs VM_flags to handle pagein/truncate races efficiently */ @@ -169,13 +185,13 @@ static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) */ static inline int shmem_acct_size(unsigned long flags, loff_t size) { - return (flags & VM_ACCOUNT) ? - security_vm_enough_memory_kern(VM_ACCT(size)) : 0; + return (flags & VM_NORESERVE) ? + 0 : security_vm_enough_memory_kern(VM_ACCT(size)); } static inline void shmem_unacct_size(unsigned long flags, loff_t size) { - if (flags & VM_ACCOUNT) + if (!(flags & VM_NORESERVE)) vm_unacct_memory(VM_ACCT(size)); } @@ -187,13 +203,13 @@ static inline void shmem_unacct_size(unsigned long flags, loff_t size) */ static inline int shmem_acct_block(unsigned long flags) { - return (flags & VM_ACCOUNT) ? - 0 : security_vm_enough_memory_kern(VM_ACCT(PAGE_CACHE_SIZE)); + return (flags & VM_NORESERVE) ? + security_vm_enough_memory_kern(VM_ACCT(PAGE_CACHE_SIZE)) : 0; } static inline void shmem_unacct_blocks(unsigned long flags, long pages) { - if (!(flags & VM_ACCOUNT)) + if (flags & VM_NORESERVE) vm_unacct_memory(pages * VM_ACCT(PAGE_CACHE_SIZE)); } @@ -1067,8 +1083,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) swap_duplicate(swap); BUG_ON(page_mapped(page)); page_cache_release(page); /* pagecache ref */ - set_page_dirty(page); - unlock_page(page); + swap_writepage(page, wbc); if (inode) { mutex_lock(&shmem_swaplist_mutex); /* move instead of add in case we're racing */ @@ -1082,7 +1097,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) shmem_swp_unmap(entry); unlock: spin_unlock(&info->lock); - swap_free(swap); + swapcache_free(swap, NULL); redirty: set_page_dirty(page); if (wbc->for_reclaim) @@ -1325,8 +1340,12 @@ repeat: shmem_swp_unmap(entry); spin_unlock(&info->lock); if (error == -ENOMEM) { - /* allow reclaim from this memory cgroup */ - error = mem_cgroup_shrink_usage(swappage, + /* + * reclaim from proper memory cgroup and + * call memcg's OOM if needed. 
+ */ + error = mem_cgroup_shmem_charge_fallback( + swappage, current->mm, gfp); if (error) { @@ -1515,8 +1534,8 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma) return 0; } -static struct inode * -shmem_get_inode(struct super_block *sb, int mode, dev_t dev) +static struct inode *shmem_get_inode(struct super_block *sb, int mode, + dev_t dev, unsigned long flags) { struct inode *inode; struct shmem_inode_info *info; @@ -1537,7 +1556,9 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev) info = SHMEM_I(inode); memset(info, 0, (char *)inode - (char *)info); spin_lock_init(&info->lock); + info->flags = flags & VM_NORESERVE; INIT_LIST_HEAD(&info->swaplist); + cache_no_acl(inode); switch (mode & S_IFMT) { default: @@ -1779,9 +1800,10 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) static int shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) { - struct inode *inode = shmem_get_inode(dir->i_sb, mode, dev); + struct inode *inode; int error = -ENOSPC; + inode = shmem_get_inode(dir->i_sb, mode, dev, VM_NORESERVE); if (inode) { error = security_inode_init_security(inode, dir, NULL, NULL, NULL); @@ -1920,7 +1942,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s if (len > PAGE_CACHE_SIZE) return -ENAMETOOLONG; - inode = shmem_get_inode(dir->i_sb, S_IFLNK|S_IRWXUGO, 0); + inode = shmem_get_inode(dir->i_sb, S_IFLNK|S_IRWXUGO, 0, VM_NORESERVE); if (!inode) return -ENOSPC; @@ -2332,7 +2354,7 @@ static int shmem_fill_super(struct super_block *sb, sb->s_flags |= MS_POSIXACL; #endif - inode = shmem_get_inode(sb, S_IFDIR | sbinfo->mode, 0); + inode = shmem_get_inode(sb, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE); if (!inode) goto failed; inode->i_uid = sbinfo->uid; @@ -2367,7 +2389,6 @@ static void shmem_destroy_inode(struct inode *inode) /* only struct inode is valid if it's an inline symlink */ mpol_free_shared_policy(&SHMEM_I(inode)->policy); } - shmem_acl_destroy_inode(inode); kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); } @@ -2376,10 +2397,6 @@ static void init_once(void *foo) struct shmem_inode_info *p = (struct shmem_inode_info *) foo; inode_init_once(&p->vfs_inode); -#ifdef CONFIG_TMPFS_POSIX_ACL - p->i_acl = NULL; - p->i_default_acl = NULL; -#endif } static int init_inodecache(void) @@ -2429,7 +2446,7 @@ static const struct inode_operations shmem_inode_operations = { .getxattr = generic_getxattr, .listxattr = generic_listxattr, .removexattr = generic_removexattr, - .permission = shmem_permission, + .check_acl = shmem_check_acl, #endif }; @@ -2452,7 +2469,7 @@ static const struct inode_operations shmem_dir_inode_operations = { .getxattr = generic_getxattr, .listxattr = generic_listxattr, .removexattr = generic_removexattr, - .permission = shmem_permission, + .check_acl = shmem_check_acl, #endif }; @@ -2463,7 +2480,7 @@ static const struct inode_operations shmem_special_inode_operations = { .getxattr = generic_getxattr, .listxattr = generic_listxattr, .removexattr = generic_removexattr, - .permission = shmem_permission, + .check_acl = shmem_check_acl, #endif }; @@ -2574,12 +2591,12 @@ int shmem_unuse(swp_entry_t entry, struct page *page) return 0; } -#define shmem_file_operations ramfs_file_operations -#define shmem_vm_ops generic_file_vm_ops -#define shmem_get_inode ramfs_get_inode -#define shmem_acct_size(a, b) 0 -#define shmem_unacct_size(a, b) do {} while (0) -#define SHMEM_MAX_BYTES LLONG_MAX +#define shmem_vm_ops generic_file_vm_ops +#define shmem_file_operations 
ramfs_file_operations +#define shmem_get_inode(sb, mode, dev, flags) ramfs_get_inode(sb, mode, dev) +#define shmem_acct_size(flags, size) 0 +#define shmem_unacct_size(flags, size) do {} while (0) +#define SHMEM_MAX_BYTES MAX_LFS_FILESIZE #endif /* CONFIG_SHMEM */ @@ -2589,9 +2606,9 @@ int shmem_unuse(swp_entry_t entry, struct page *page) * shmem_file_setup - get an unlinked file living in tmpfs * @name: name for dentry (to be seen in /proc/<pid>/maps * @size: size to be set for the file - * @flags: vm_flags + * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size */ -struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) +struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags) { int error; struct file *file; @@ -2623,13 +2640,10 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) goto put_dentry; error = -ENOSPC; - inode = shmem_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0); + inode = shmem_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0, flags); if (!inode) goto close_file; -#ifdef CONFIG_SHMEM - SHMEM_I(inode)->flags = flags & VM_ACCOUNT; -#endif d_instantiate(dentry, inode); inode->i_size = size; inode->i_nlink = 0; /* It is unlinked */ @@ -2641,6 +2655,7 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) if (error) goto close_file; #endif + ima_counts_get(file); return file; close_file: diff --git a/mm/shmem_acl.c b/mm/shmem_acl.c index 8e5aadd7dcd..df2c87fdae5 100644 --- a/mm/shmem_acl.c +++ b/mm/shmem_acl.c @@ -22,11 +22,11 @@ shmem_get_acl(struct inode *inode, int type) spin_lock(&inode->i_lock); switch(type) { case ACL_TYPE_ACCESS: - acl = posix_acl_dup(SHMEM_I(inode)->i_acl); + acl = posix_acl_dup(inode->i_acl); break; case ACL_TYPE_DEFAULT: - acl = posix_acl_dup(SHMEM_I(inode)->i_default_acl); + acl = posix_acl_dup(inode->i_default_acl); break; } spin_unlock(&inode->i_lock); @@ -45,13 +45,13 @@ shmem_set_acl(struct inode *inode, int type, struct posix_acl *acl) spin_lock(&inode->i_lock); switch(type) { case ACL_TYPE_ACCESS: - free = SHMEM_I(inode)->i_acl; - SHMEM_I(inode)->i_acl = posix_acl_dup(acl); + free = inode->i_acl; + inode->i_acl = posix_acl_dup(acl); break; case ACL_TYPE_DEFAULT: - free = SHMEM_I(inode)->i_default_acl; - SHMEM_I(inode)->i_default_acl = posix_acl_dup(acl); + free = inode->i_default_acl; + inode->i_default_acl = posix_acl_dup(acl); break; } spin_unlock(&inode->i_lock); @@ -155,26 +155,9 @@ shmem_acl_init(struct inode *inode, struct inode *dir) } /** - * shmem_acl_destroy_inode - destroy acls hanging off the in-memory inode - * - * This is done before destroying the actual inode. 
- */ - -void -shmem_acl_destroy_inode(struct inode *inode) -{ - if (SHMEM_I(inode)->i_acl) - posix_acl_release(SHMEM_I(inode)->i_acl); - SHMEM_I(inode)->i_acl = NULL; - if (SHMEM_I(inode)->i_default_acl) - posix_acl_release(SHMEM_I(inode)->i_default_acl); - SHMEM_I(inode)->i_default_acl = NULL; -} - -/** * shmem_check_acl - check_acl() callback for generic_permission() */ -static int +int shmem_check_acl(struct inode *inode, int mask) { struct posix_acl *acl = shmem_get_acl(inode, ACL_TYPE_ACCESS); @@ -186,12 +169,3 @@ shmem_check_acl(struct inode *inode, int mask) } return -EAGAIN; } - -/** - * shmem_permission - permission() inode operation - */ -int -shmem_permission(struct inode *inode, int mask) -{ - return generic_permission(inode, mask, shmem_check_acl); -} diff --git a/mm/slab.c b/mm/slab.c index ddc41f337d5..7b5d4deacfc 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -102,16 +102,19 @@ #include <linux/cpu.h> #include <linux/sysctl.h> #include <linux/module.h> +#include <linux/kmemtrace.h> #include <linux/rcupdate.h> #include <linux/string.h> #include <linux/uaccess.h> #include <linux/nodemask.h> +#include <linux/kmemleak.h> #include <linux/mempolicy.h> #include <linux/mutex.h> #include <linux/fault-inject.h> #include <linux/rtmutex.h> #include <linux/reciprocal_div.h> #include <linux/debugobjects.h> +#include <linux/kmemcheck.h> #include <asm/cacheflush.h> #include <asm/tlbflush.h> @@ -177,13 +180,13 @@ SLAB_STORE_USER | \ SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ - SLAB_DEBUG_OBJECTS) + SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK) #else # define CREATE_MASK (SLAB_HWCACHE_ALIGN | \ SLAB_CACHE_DMA | \ SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ - SLAB_DEBUG_OBJECTS) + SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK) #endif /* @@ -314,7 +317,7 @@ static int drain_freelist(struct kmem_cache *cache, struct kmem_list3 *l3, int tofree); static void free_block(struct kmem_cache *cachep, void **objpp, int len, int node); -static int enable_cpucache(struct kmem_cache *cachep); +static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp); static void cache_reap(struct work_struct *unused); /* @@ -372,87 +375,6 @@ static void kmem_list3_init(struct kmem_list3 *parent) MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ } while (0) -/* - * struct kmem_cache - * - * manages a cache. - */ - -struct kmem_cache { -/* 1) per-cpu data, touched during every alloc/free */ - struct array_cache *array[NR_CPUS]; -/* 2) Cache tunables. Protected by cache_chain_mutex */ - unsigned int batchcount; - unsigned int limit; - unsigned int shared; - - unsigned int buffer_size; - u32 reciprocal_buffer_size; -/* 3) touched by every alloc & free from the backend */ - - unsigned int flags; /* constant flags */ - unsigned int num; /* # of objs per slab */ - -/* 4) cache_grow/shrink */ - /* order of pgs per slab (2^n) */ - unsigned int gfporder; - - /* force GFP flags, e.g. 
GFP_DMA */ - gfp_t gfpflags; - - size_t colour; /* cache colouring range */ - unsigned int colour_off; /* colour offset */ - struct kmem_cache *slabp_cache; - unsigned int slab_size; - unsigned int dflags; /* dynamic flags */ - - /* constructor func */ - void (*ctor)(void *obj); - -/* 5) cache creation/removal */ - const char *name; - struct list_head next; - -/* 6) statistics */ -#if STATS - unsigned long num_active; - unsigned long num_allocations; - unsigned long high_mark; - unsigned long grown; - unsigned long reaped; - unsigned long errors; - unsigned long max_freeable; - unsigned long node_allocs; - unsigned long node_frees; - unsigned long node_overflow; - atomic_t allochit; - atomic_t allocmiss; - atomic_t freehit; - atomic_t freemiss; -#endif -#if DEBUG - /* - * If debugging is enabled, then the allocator can add additional - * fields and/or padding to every object. buffer_size contains the total - * object size including these internal fields, the following two - * variables contain the offset to the user object and its size. - */ - int obj_offset; - int obj_size; -#endif - /* - * We put nodelists[] at the end of kmem_cache, because we want to size - * this array to nr_node_ids slots instead of MAX_NUMNODES - * (see kmem_cache_init()) - * We still use [MAX_NUMNODES] and not [1] or [0] because cache_cache - * is statically defined, so we reserve the max number of nodes. - */ - struct kmem_list3 *nodelists[MAX_NUMNODES]; - /* - * Do not add fields after nodelists[] - */ -}; - #define CFLGS_OFF_SLAB (0x80000000UL) #define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) @@ -568,6 +490,14 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp) #endif +#ifdef CONFIG_KMEMTRACE +size_t slab_buffer_size(struct kmem_cache *cachep) +{ + return cachep->buffer_size; +} +EXPORT_SYMBOL(slab_buffer_size); +#endif + /* * Do not go above this order unless 0 objects fit into the slab. */ @@ -743,6 +673,7 @@ static enum { NONE, PARTIAL_AC, PARTIAL_L3, + EARLY, FULL } g_cpucache_up; @@ -751,7 +682,7 @@ static enum { */ int slab_is_available(void) { - return g_cpucache_up == FULL; + return g_cpucache_up >= EARLY; } static DEFINE_PER_CPU(struct delayed_work, reap_work); @@ -881,7 +812,6 @@ static void __slab_error(const char *function, struct kmem_cache *cachep, */ static int use_alien_caches __read_mostly = 1; -static int numa_platform __read_mostly = 1; static int __init noaliencache_setup(char *s) { use_alien_caches = 0; @@ -949,12 +879,20 @@ static void __cpuinit start_cpu_timer(int cpu) } static struct array_cache *alloc_arraycache(int node, int entries, - int batchcount) + int batchcount, gfp_t gfp) { int memsize = sizeof(void *) * entries + sizeof(struct array_cache); struct array_cache *nc = NULL; - nc = kmalloc_node(memsize, GFP_KERNEL, node); + nc = kmalloc_node(memsize, gfp, node); + /* + * The array_cache structures contain pointers to free object. + * However, when such objects are allocated or transfered to another + * cache the pointers are not cleared and they could be counted as + * valid references during a kmemleak scan. Therefore, kmemleak must + * not scan such objects. 
+ */ + kmemleak_no_scan(nc); if (nc) { nc->avail = 0; nc->limit = entries; @@ -994,7 +932,7 @@ static int transfer_objects(struct array_cache *to, #define drain_alien_cache(cachep, alien) do { } while (0) #define reap_alien(cachep, l3) do { } while (0) -static inline struct array_cache **alloc_alien_cache(int node, int limit) +static inline struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) { return (struct array_cache **)BAD_ALIEN_MAGIC; } @@ -1025,7 +963,7 @@ static inline void *____cache_alloc_node(struct kmem_cache *cachep, static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int); static void *alternate_node_alloc(struct kmem_cache *, gfp_t); -static struct array_cache **alloc_alien_cache(int node, int limit) +static struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) { struct array_cache **ac_ptr; int memsize = sizeof(void *) * nr_node_ids; @@ -1033,14 +971,14 @@ static struct array_cache **alloc_alien_cache(int node, int limit) if (limit > 1) limit = 12; - ac_ptr = kmalloc_node(memsize, GFP_KERNEL, node); + ac_ptr = kmalloc_node(memsize, gfp, node); if (ac_ptr) { for_each_node(i) { if (i == node || !node_online(i)) { ac_ptr[i] = NULL; continue; } - ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d); + ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d, gfp); if (!ac_ptr[i]) { for (i--; i >= 0; i--) kfree(ac_ptr[i]); @@ -1160,7 +1098,7 @@ static void __cpuinit cpuup_canceled(long cpu) struct kmem_cache *cachep; struct kmem_list3 *l3 = NULL; int node = cpu_to_node(cpu); - node_to_cpumask_ptr(mask, node); + const struct cpumask *mask = cpumask_of_node(node); list_for_each_entry(cachep, &cache_chain, next) { struct array_cache *nc; @@ -1273,20 +1211,20 @@ static int __cpuinit cpuup_prepare(long cpu) struct array_cache **alien = NULL; nc = alloc_arraycache(node, cachep->limit, - cachep->batchcount); + cachep->batchcount, GFP_KERNEL); if (!nc) goto bad; if (cachep->shared) { shared = alloc_arraycache(node, cachep->shared * cachep->batchcount, - 0xbaadf00d); + 0xbaadf00d, GFP_KERNEL); if (!shared) { kfree(nc); goto bad; } } if (use_alien_caches) { - alien = alloc_alien_cache(node, cachep->limit); + alien = alloc_alien_cache(node, cachep->limit, GFP_KERNEL); if (!alien) { kfree(shared); kfree(nc); @@ -1390,10 +1328,9 @@ static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list, { struct kmem_list3 *ptr; - ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, nodeid); + ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_NOWAIT, nodeid); BUG_ON(!ptr); - local_irq_disable(); memcpy(ptr, list, sizeof(struct kmem_list3)); /* * Do not assume that spinlocks can be initialized via memcpy: @@ -1402,7 +1339,6 @@ static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list, MAKE_ALL_LISTS(cachep, ptr, nodeid); cachep->nodelists[nodeid] = ptr; - local_irq_enable(); } /* @@ -1434,10 +1370,8 @@ void __init kmem_cache_init(void) int order; int node; - if (num_possible_nodes() == 1) { + if (num_possible_nodes() == 1) use_alien_caches = 0; - numa_platform = 0; - } for (i = 0; i < NUM_INIT_LISTS; i++) { kmem_list3_init(&initkmem_list3[i]); @@ -1566,9 +1500,8 @@ void __init kmem_cache_init(void) { struct array_cache *ptr; - ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); + ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); - local_irq_disable(); BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache); memcpy(ptr, cpu_cache_get(&cache_cache), sizeof(struct arraycache_init)); @@ -1578,11 +1511,9 @@ void 
__init kmem_cache_init(void) spin_lock_init(&ptr->lock); cache_cache.array[smp_processor_id()] = ptr; - local_irq_enable(); - ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); + ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); - local_irq_disable(); BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep) != &initarray_generic.cache); memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep), @@ -1594,7 +1525,6 @@ void __init kmem_cache_init(void) malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] = ptr; - local_irq_enable(); } /* 5) Replace the bootstrap kmem_list3's */ { @@ -1613,23 +1543,26 @@ void __init kmem_cache_init(void) } } - /* 6) resize the head arrays to their final sizes */ - { - struct kmem_cache *cachep; - mutex_lock(&cache_chain_mutex); - list_for_each_entry(cachep, &cache_chain, next) - if (enable_cpucache(cachep)) - BUG(); - mutex_unlock(&cache_chain_mutex); - } + g_cpucache_up = EARLY; +} - /* Annotate slab for lockdep -- annotate the malloc caches */ - init_lock_keys(); +void __init kmem_cache_init_late(void) +{ + struct kmem_cache *cachep; + /* 6) resize the head arrays to their final sizes */ + mutex_lock(&cache_chain_mutex); + list_for_each_entry(cachep, &cache_chain, next) + if (enable_cpucache(cachep, GFP_NOWAIT)) + BUG(); + mutex_unlock(&cache_chain_mutex); /* Done! */ g_cpucache_up = FULL; + /* Annotate slab for lockdep -- annotate the malloc caches */ + init_lock_keys(); + /* * Register a cpu startup notifier callback that initializes * cpu_cache_get for all new cpus @@ -1680,7 +1613,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) if (cachep->flags & SLAB_RECLAIM_ACCOUNT) flags |= __GFP_RECLAIMABLE; - page = alloc_pages_node(nodeid, flags, cachep->gfporder); + page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder); if (!page) return NULL; @@ -1693,6 +1626,16 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) NR_SLAB_UNRECLAIMABLE, nr_pages); for (i = 0; i < nr_pages; i++) __SetPageSlab(page + i); + + if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { + kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); + + if (cachep->ctor) + kmemcheck_mark_uninitialized_pages(page, nr_pages); + else + kmemcheck_mark_unallocated_pages(page, nr_pages); + } + return page_address(page); } @@ -1705,6 +1648,8 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr) struct page *page = virt_to_page(addr); const unsigned long nr_freed = i; + kmemcheck_free_shadow(page, cachep->gfporder); + if (cachep->flags & SLAB_RECLAIM_ACCOUNT) sub_zone_page_state(page_zone(page), NR_SLAB_RECLAIMABLE, nr_freed); @@ -2055,10 +2000,10 @@ static size_t calculate_slab_order(struct kmem_cache *cachep, return left_over; } -static int __init_refok setup_cpu_cache(struct kmem_cache *cachep) +static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) { if (g_cpucache_up == FULL) - return enable_cpucache(cachep); + return enable_cpucache(cachep, gfp); if (g_cpucache_up == NONE) { /* @@ -2080,7 +2025,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep) g_cpucache_up = PARTIAL_AC; } else { cachep->array[smp_processor_id()] = - kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); + kmalloc(sizeof(struct arraycache_init), gfp); if (g_cpucache_up == PARTIAL_AC) { set_up_list3s(cachep, SIZE_L3); @@ -2090,7 +2035,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep) for_each_online_node(node) { 
cachep->nodelists[node] = kmalloc_node(sizeof(struct kmem_list3), - GFP_KERNEL, node); + gfp, node); BUG_ON(!cachep->nodelists[node]); kmem_list3_init(cachep->nodelists[node]); } @@ -2144,6 +2089,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, { size_t left_over, slab_size, ralign; struct kmem_cache *cachep = NULL, *pc; + gfp_t gfp; /* * Sanity checks... these are all serious usage bugs. @@ -2159,8 +2105,10 @@ kmem_cache_create (const char *name, size_t size, size_t align, * We use cache_chain_mutex to ensure a consistent view of * cpu_online_mask as well. Please see cpuup_callback */ - get_online_cpus(); - mutex_lock(&cache_chain_mutex); + if (slab_is_available()) { + get_online_cpus(); + mutex_lock(&cache_chain_mutex); + } list_for_each_entry(pc, &cache_chain, next) { char tmp; @@ -2269,8 +2217,13 @@ kmem_cache_create (const char *name, size_t size, size_t align, */ align = ralign; + if (slab_is_available()) + gfp = GFP_KERNEL; + else + gfp = GFP_NOWAIT; + /* Get cache's description obj. */ - cachep = kmem_cache_zalloc(&cache_cache, GFP_KERNEL); + cachep = kmem_cache_zalloc(&cache_cache, gfp); if (!cachep) goto oops; @@ -2344,6 +2297,15 @@ kmem_cache_create (const char *name, size_t size, size_t align, /* really off slab. No need for manual alignment */ slab_size = cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab); + +#ifdef CONFIG_PAGE_POISONING + /* If we're going to use the generic kernel_map_pages() + * poisoning, then it's going to smash the contents of + * the redzone and userword anyhow, so switch them off. + */ + if (size % PAGE_SIZE == 0 && flags & SLAB_POISON) + flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); +#endif } cachep->colour_off = cache_line_size(); @@ -2373,7 +2335,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, cachep->ctor = ctor; cachep->name = name; - if (setup_cpu_cache(cachep)) { + if (setup_cpu_cache(cachep, gfp)) { __kmem_cache_destroy(cachep); cachep = NULL; goto oops; @@ -2385,8 +2347,10 @@ oops: if (!cachep && (flags & SLAB_PANIC)) panic("kmem_cache_create(): failed to create slab `%s'\n", name); - mutex_unlock(&cache_chain_mutex); - put_online_cpus(); + if (slab_is_available()) { + mutex_unlock(&cache_chain_mutex); + put_online_cpus(); + } return cachep; } EXPORT_SYMBOL(kmem_cache_create); @@ -2583,7 +2547,7 @@ void kmem_cache_destroy(struct kmem_cache *cachep) } if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) - synchronize_rcu(); + rcu_barrier(); __kmem_cache_destroy(cachep); mutex_unlock(&cache_chain_mutex); @@ -2612,6 +2576,14 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp, /* Slab management obj is off-slab. */ slabp = kmem_cache_alloc_node(cachep->slabp_cache, local_flags, nodeid); + /* + * If the first object in the slab is leaked (it's allocated + * but no one has a reference to it), we want to make sure + * kmemleak does not treat the ->s_mem pointer as a reference + * to the object. Otherwise we will not report the leak. + */ + kmemleak_scan_area(slabp, offsetof(struct slab, list), + sizeof(struct list_head), local_flags); if (!slabp) return NULL; } else { @@ -3132,6 +3104,12 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) STATS_INC_ALLOCMISS(cachep); objp = cache_alloc_refill(cachep, flags); } + /* + * To avoid a false negative, if an object that is in one of the + * per-CPU caches is leaked, we need to make sure kmemleak doesn't + * treat the array pointers as a reference to the object. 
+ */ + kmemleak_erase(&ac->entry[ac->avail]); return objp; } @@ -3210,7 +3188,7 @@ retry: if (local_flags & __GFP_WAIT) local_irq_enable(); kmem_flagcheck(cache, flags); - obj = kmem_getpages(cache, local_flags, -1); + obj = kmem_getpages(cache, local_flags, numa_node_id()); if (local_flags & __GFP_WAIT) local_irq_disable(); if (obj) { @@ -3318,6 +3296,10 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, unsigned long save_flags; void *ptr; + flags &= gfp_allowed_mask; + + lockdep_trace_alloc(flags); + if (slab_should_failslab(cachep, flags)) return NULL; @@ -3349,6 +3331,11 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, out: local_irq_restore(save_flags); ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); + kmemleak_alloc_recursive(ptr, obj_size(cachep), 1, cachep->flags, + flags); + + if (likely(ptr)) + kmemcheck_slab_alloc(cachep, flags, ptr, obj_size(cachep)); if (unlikely((flags & __GFP_ZERO) && ptr)) memset(ptr, 0, obj_size(cachep)); @@ -3394,6 +3381,10 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) unsigned long save_flags; void *objp; + flags &= gfp_allowed_mask; + + lockdep_trace_alloc(flags); + if (slab_should_failslab(cachep, flags)) return NULL; @@ -3402,8 +3393,13 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) objp = __do_cache_alloc(cachep, flags); local_irq_restore(save_flags); objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); + kmemleak_alloc_recursive(objp, obj_size(cachep), 1, cachep->flags, + flags); prefetchw(objp); + if (likely(objp)) + kmemcheck_slab_alloc(cachep, flags, objp, obj_size(cachep)); + if (unlikely((flags & __GFP_ZERO) && objp)) memset(objp, 0, obj_size(cachep)); @@ -3517,8 +3513,11 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp) struct array_cache *ac = cpu_cache_get(cachep); check_irq_off(); + kmemleak_free_recursive(objp, cachep->flags); objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); + kmemcheck_slab_free(cachep, objp, obj_size(cachep)); + /* * Skip calling cache_free_alien() when the platform is not numa. * This will avoid cache misses that happen while accessing slabp (which @@ -3526,7 +3525,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp) * variable to skip the call, which is mostly likely to be present in * the cache. */ - if (numa_platform && cache_free_alien(cachep, objp)) + if (nr_online_nodes > 1 && cache_free_alien(cachep, objp)) return; if (likely(ac->avail < ac->limit)) { @@ -3550,10 +3549,23 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp) */ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) { - return __cache_alloc(cachep, flags, __builtin_return_address(0)); + void *ret = __cache_alloc(cachep, flags, __builtin_return_address(0)); + + trace_kmem_cache_alloc(_RET_IP_, ret, + obj_size(cachep), cachep->buffer_size, flags); + + return ret; } EXPORT_SYMBOL(kmem_cache_alloc); +#ifdef CONFIG_KMEMTRACE +void *kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags) +{ + return __cache_alloc(cachep, flags, __builtin_return_address(0)); +} +EXPORT_SYMBOL(kmem_cache_alloc_notrace); +#endif + /** * kmem_ptr_validate - check if an untrusted pointer might be a slab entry. 
* @cachep: the cache we're checking against @@ -3598,23 +3610,46 @@ out: #ifdef CONFIG_NUMA void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) { - return __cache_alloc_node(cachep, flags, nodeid, - __builtin_return_address(0)); + void *ret = __cache_alloc_node(cachep, flags, nodeid, + __builtin_return_address(0)); + + trace_kmem_cache_alloc_node(_RET_IP_, ret, + obj_size(cachep), cachep->buffer_size, + flags, nodeid); + + return ret; } EXPORT_SYMBOL(kmem_cache_alloc_node); +#ifdef CONFIG_KMEMTRACE +void *kmem_cache_alloc_node_notrace(struct kmem_cache *cachep, + gfp_t flags, + int nodeid) +{ + return __cache_alloc_node(cachep, flags, nodeid, + __builtin_return_address(0)); +} +EXPORT_SYMBOL(kmem_cache_alloc_node_notrace); +#endif + static __always_inline void * __do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller) { struct kmem_cache *cachep; + void *ret; cachep = kmem_find_general_cachep(size, flags); if (unlikely(ZERO_OR_NULL_PTR(cachep))) return cachep; - return kmem_cache_alloc_node(cachep, flags, node); + ret = kmem_cache_alloc_node_notrace(cachep, flags, node); + + trace_kmalloc_node((unsigned long) caller, ret, + size, cachep->buffer_size, flags, node); + + return ret; } -#ifdef CONFIG_DEBUG_SLAB +#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_KMEMTRACE) void *__kmalloc_node(size_t size, gfp_t flags, int node) { return __do_kmalloc_node(size, flags, node, @@ -3647,6 +3682,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, void *caller) { struct kmem_cache *cachep; + void *ret; /* If you want to save a few bytes .text space: replace * __ with kmem_. @@ -3656,11 +3692,16 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, cachep = __find_general_cachep(size, flags); if (unlikely(ZERO_OR_NULL_PTR(cachep))) return cachep; - return __cache_alloc(cachep, flags, caller); + ret = __cache_alloc(cachep, flags, caller); + + trace_kmalloc((unsigned long) caller, ret, + size, cachep->buffer_size, flags); + + return ret; } -#ifdef CONFIG_DEBUG_SLAB +#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_KMEMTRACE) void *__kmalloc(size_t size, gfp_t flags) { return __do_kmalloc(size, flags, __builtin_return_address(0)); @@ -3699,6 +3740,8 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp) debug_check_no_obj_freed(objp, obj_size(cachep)); __cache_free(cachep, objp); local_irq_restore(flags); + + trace_kmem_cache_free(_RET_IP_, objp); } EXPORT_SYMBOL(kmem_cache_free); @@ -3716,6 +3759,8 @@ void kfree(const void *objp) struct kmem_cache *c; unsigned long flags; + trace_kfree(_RET_IP_, objp); + if (unlikely(ZERO_OR_NULL_PTR(objp))) return; local_irq_save(flags); @@ -3743,7 +3788,7 @@ EXPORT_SYMBOL_GPL(kmem_cache_name); /* * This initializes kmem_list3 or resizes various caches for all nodes. 
*/ -static int alloc_kmemlist(struct kmem_cache *cachep) +static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp) { int node; struct kmem_list3 *l3; @@ -3753,7 +3798,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep) for_each_online_node(node) { if (use_alien_caches) { - new_alien = alloc_alien_cache(node, cachep->limit); + new_alien = alloc_alien_cache(node, cachep->limit, gfp); if (!new_alien) goto fail; } @@ -3762,7 +3807,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep) if (cachep->shared) { new_shared = alloc_arraycache(node, cachep->shared*cachep->batchcount, - 0xbaadf00d); + 0xbaadf00d, gfp); if (!new_shared) { free_alien_cache(new_alien); goto fail; @@ -3791,7 +3836,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep) free_alien_cache(new_alien); continue; } - l3 = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, node); + l3 = kmalloc_node(sizeof(struct kmem_list3), gfp, node); if (!l3) { free_alien_cache(new_alien); kfree(new_shared); @@ -3847,18 +3892,18 @@ static void do_ccupdate_local(void *info) /* Always called with the cache_chain_mutex held */ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, - int batchcount, int shared) + int batchcount, int shared, gfp_t gfp) { struct ccupdate_struct *new; int i; - new = kzalloc(sizeof(*new), GFP_KERNEL); + new = kzalloc(sizeof(*new), gfp); if (!new) return -ENOMEM; for_each_online_cpu(i) { new->new[i] = alloc_arraycache(cpu_to_node(i), limit, - batchcount); + batchcount, gfp); if (!new->new[i]) { for (i--; i >= 0; i--) kfree(new->new[i]); @@ -3885,11 +3930,11 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, kfree(ccold); } kfree(new); - return alloc_kmemlist(cachep); + return alloc_kmemlist(cachep, gfp); } /* Called with cache_chain_mutex held always */ -static int enable_cpucache(struct kmem_cache *cachep) +static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) { int err; int limit, shared; @@ -3935,7 +3980,7 @@ static int enable_cpucache(struct kmem_cache *cachep) if (limit > 32) limit = 32; #endif - err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared); + err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared, gfp); if (err) printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", cachep->name, -err); @@ -3988,8 +4033,7 @@ static void cache_reap(struct work_struct *w) struct kmem_cache *searchp; struct kmem_list3 *l3; int node = numa_node_id(); - struct delayed_work *work = - container_of(w, struct delayed_work, work); + struct delayed_work *work = to_delayed_work(w); if (!mutex_trylock(&cache_chain_mutex)) /* Give up. Setup the next iteration. */ @@ -4242,7 +4286,8 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer, res = 0; } else { res = do_tune_cpucache(cachep, limit, - batchcount, shared); + batchcount, shared, + GFP_KERNEL); } break; } @@ -4457,3 +4502,4 @@ size_t ksize(const void *objp) return obj_size(virt_to_cache(objp)); } +EXPORT_SYMBOL(ksize); diff --git a/mm/slob.c b/mm/slob.c index bf7e8fc3aed..9641da3d5e5 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -46,7 +46,7 @@ * NUMA support in SLOB is fairly simplistic, pushing most of the real * logic down to the page allocator, and simply doing the node accounting * on the upper levels. In the event that a node id is explicitly - * provided, alloc_pages_node() with the specified node id is used + * provided, alloc_pages_exact_node() with the specified node id is used * instead. 
The common case (or when the node id isn't explicitly provided) * will default to the current node, as per numa_node_id(). * @@ -60,11 +60,14 @@ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/mm.h> +#include <linux/swap.h> /* struct reclaim_state */ #include <linux/cache.h> #include <linux/init.h> #include <linux/module.h> #include <linux/rcupdate.h> #include <linux/list.h> +#include <linux/kmemtrace.h> +#include <linux/kmemleak.h> #include <asm/atomic.h> /* @@ -126,21 +129,26 @@ static LIST_HEAD(free_slob_medium); static LIST_HEAD(free_slob_large); /* - * slob_page: True for all slob pages (false for bigblock pages) + * is_slob_page: True for all slob pages (false for bigblock pages) */ -static inline int slob_page(struct slob_page *sp) +static inline int is_slob_page(struct slob_page *sp) { - return PageSlobPage((struct page *)sp); + return PageSlab((struct page *)sp); } static inline void set_slob_page(struct slob_page *sp) { - __SetPageSlobPage((struct page *)sp); + __SetPageSlab((struct page *)sp); } static inline void clear_slob_page(struct slob_page *sp) { - __ClearPageSlobPage((struct page *)sp); + __ClearPageSlab((struct page *)sp); +} + +static inline struct slob_page *slob_page(const void *addr) +{ + return (struct slob_page *)virt_to_page(addr); } /* @@ -230,13 +238,13 @@ static int slob_last(slob_t *s) return !((unsigned long)slob_next(s) & ~PAGE_MASK); } -static void *slob_new_page(gfp_t gfp, int order, int node) +static void *slob_new_pages(gfp_t gfp, int order, int node) { void *page; #ifdef CONFIG_NUMA if (node != -1) - page = alloc_pages_node(node, gfp, order); + page = alloc_pages_exact_node(node, gfp, order); else #endif page = alloc_pages(gfp, order); @@ -247,12 +255,19 @@ static void *slob_new_page(gfp_t gfp, int order, int node) return page_address(page); } +static void slob_free_pages(void *b, int order) +{ + if (current->reclaim_state) + current->reclaim_state->reclaimed_slab += 1 << order; + free_pages((unsigned long)b, order); +} + /* * Allocate a slob block within a given slob_page sp. */ static void *slob_page_alloc(struct slob_page *sp, size_t size, int align) { - slob_t *prev, *cur, *aligned = 0; + slob_t *prev, *cur, *aligned = NULL; int delta = 0, units = SLOB_UNITS(size); for (prev = NULL, cur = sp->free; ; prev = cur, cur = slob_next(cur)) { @@ -349,10 +364,10 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) /* Not enough space: must allocate a new page */ if (!b) { - b = slob_new_page(gfp & ~__GFP_ZERO, 0, node); + b = slob_new_pages(gfp & ~__GFP_ZERO, 0, node); if (!b) - return 0; - sp = (struct slob_page *)virt_to_page(b); + return NULL; + sp = slob_page(b); set_slob_page(sp); spin_lock_irqsave(&slob_lock, flags); @@ -384,7 +399,7 @@ static void slob_free(void *block, int size) return; BUG_ON(!size); - sp = (struct slob_page *)virt_to_page(block); + sp = slob_page(block); units = SLOB_UNITS(size); spin_lock_irqsave(&slob_lock, flags); @@ -393,10 +408,11 @@ static void slob_free(void *block, int size) /* Go directly to page allocator. 
Do not pass slob allocator */ if (slob_page_free(sp)) clear_slob_page_free(sp); + spin_unlock_irqrestore(&slob_lock, flags); clear_slob_page(sp); free_slob_page(sp); - free_page((unsigned long)b); - goto out; + slob_free_pages(b, 0); + return; } if (!slob_page_free(sp)) { @@ -463,27 +479,39 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node) { unsigned int *m; int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); + void *ret; + + lockdep_trace_alloc(gfp); if (size < PAGE_SIZE - align) { if (!size) return ZERO_SIZE_PTR; m = slob_alloc(size + align, gfp, align, node); + if (!m) return NULL; *m = size; - return (void *)m + align; + ret = (void *)m + align; + + trace_kmalloc_node(_RET_IP_, ret, + size, size + align, gfp, node); } else { - void *ret; + unsigned int order = get_order(size); - ret = slob_new_page(gfp | __GFP_COMP, get_order(size), node); + ret = slob_new_pages(gfp | __GFP_COMP, get_order(size), node); if (ret) { struct page *page; page = virt_to_page(ret); page->private = size; } - return ret; + + trace_kmalloc_node(_RET_IP_, ret, + size, PAGE_SIZE << order, gfp, node); } + + kmemleak_alloc(ret, size, 1, gfp); + return ret; } EXPORT_SYMBOL(__kmalloc_node); @@ -491,11 +519,14 @@ void kfree(const void *block) { struct slob_page *sp; + trace_kfree(_RET_IP_, block); + if (unlikely(ZERO_OR_NULL_PTR(block))) return; + kmemleak_free(block); - sp = (struct slob_page *)virt_to_page(block); - if (slob_page(sp)) { + sp = slob_page(block); + if (is_slob_page(sp)) { int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); unsigned int *m = (unsigned int *)(block - align); slob_free(m, *m + align); @@ -513,14 +544,15 @@ size_t ksize(const void *block) if (unlikely(block == ZERO_SIZE_PTR)) return 0; - sp = (struct slob_page *)virt_to_page(block); - if (slob_page(sp)) { + sp = slob_page(block); + if (is_slob_page(sp)) { int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); unsigned int *m = (unsigned int *)(block - align); return SLOB_UNITS(*m) * SLOB_UNIT; } else return sp->page.private; } +EXPORT_SYMBOL(ksize); struct kmem_cache { unsigned int size, align; @@ -555,12 +587,16 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, } else if (flags & SLAB_PANIC) panic("Cannot create slab cache %s\n", name); + kmemleak_alloc(c, sizeof(struct kmem_cache), 1, GFP_KERNEL); return c; } EXPORT_SYMBOL(kmem_cache_create); void kmem_cache_destroy(struct kmem_cache *c) { + kmemleak_free(c); + if (c->flags & SLAB_DESTROY_BY_RCU) + rcu_barrier(); slob_free(c, sizeof(struct kmem_cache)); } EXPORT_SYMBOL(kmem_cache_destroy); @@ -569,14 +605,22 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node) { void *b; - if (c->size < PAGE_SIZE) + if (c->size < PAGE_SIZE) { b = slob_alloc(c->size, flags, c->align, node); - else - b = slob_new_page(flags, get_order(c->size), node); + trace_kmem_cache_alloc_node(_RET_IP_, b, c->size, + SLOB_UNITS(c->size) * SLOB_UNIT, + flags, node); + } else { + b = slob_new_pages(flags, get_order(c->size), node); + trace_kmem_cache_alloc_node(_RET_IP_, b, c->size, + PAGE_SIZE << get_order(c->size), + flags, node); + } if (c->ctor) c->ctor(b); + kmemleak_alloc_recursive(b, c->size, 1, c->flags, flags); return b; } EXPORT_SYMBOL(kmem_cache_alloc_node); @@ -586,7 +630,7 @@ static void __kmem_cache_free(void *b, int size) if (size < PAGE_SIZE) slob_free(b, size); else - free_pages((unsigned long)b, get_order(size)); + slob_free_pages(b, get_order(size)); } static void kmem_rcu_free(struct rcu_head *head) @@ -599,6 +643,7 @@ static 
void kmem_rcu_free(struct rcu_head *head) void kmem_cache_free(struct kmem_cache *c, void *b) { + kmemleak_free_recursive(b, c->flags); if (unlikely(c->flags & SLAB_DESTROY_BY_RCU)) { struct slob_rcu *slob_rcu; slob_rcu = b + (c->size - sizeof(struct slob_rcu)); @@ -608,6 +653,8 @@ void kmem_cache_free(struct kmem_cache *c, void *b) } else { __kmem_cache_free(b, c->size); } + + trace_kmem_cache_free(_RET_IP_, b); } EXPORT_SYMBOL(kmem_cache_free); diff --git a/mm/slub.c b/mm/slub.c index 6392ae5cc6b..b6276753626 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -9,6 +9,7 @@ */ #include <linux/mm.h> +#include <linux/swap.h> /* struct reclaim_state */ #include <linux/module.h> #include <linux/bit_spinlock.h> #include <linux/interrupt.h> @@ -16,6 +17,8 @@ #include <linux/slab.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> +#include <linux/kmemtrace.h> +#include <linux/kmemcheck.h> #include <linux/cpu.h> #include <linux/cpuset.h> #include <linux/mempolicy.h> @@ -141,10 +144,10 @@ * Set of flags that will prevent slab merging */ #define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ - SLAB_TRACE | SLAB_DESTROY_BY_RCU) + SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE) #define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ - SLAB_CACHE_DMA) + SLAB_CACHE_DMA | SLAB_NOTRACK) #ifndef ARCH_KMALLOC_MINALIGN #define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) @@ -374,14 +377,8 @@ static struct track *get_track(struct kmem_cache *s, void *object, static void set_track(struct kmem_cache *s, void *object, enum track_item alloc, unsigned long addr) { - struct track *p; - - if (s->offset) - p = object + s->offset + sizeof(void *); - else - p = object + s->inuse; + struct track *p = get_track(s, object, alloc); - p += alloc; if (addr) { p->addr = addr; p->cpu = smp_processor_id(); @@ -836,6 +833,11 @@ static inline unsigned long slabs_node(struct kmem_cache *s, int node) return atomic_long_read(&n->nr_slabs); } +static inline unsigned long node_nr_slabs(struct kmem_cache_node *n) +{ + return atomic_long_read(&n->nr_slabs); +} + static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects) { struct kmem_cache_node *n = get_node(s, node); @@ -1054,6 +1056,8 @@ static inline unsigned long kmem_cache_flags(unsigned long objsize, static inline unsigned long slabs_node(struct kmem_cache *s, int node) { return 0; } +static inline unsigned long node_nr_slabs(struct kmem_cache_node *n) + { return 0; } static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects) {} static inline void dec_slabs_node(struct kmem_cache *s, int node, @@ -1068,6 +1072,8 @@ static inline struct page *alloc_slab_page(gfp_t flags, int node, { int order = oo_order(oo); + flags |= __GFP_NOTRACK; + if (node == -1) return alloc_pages(flags, order); else @@ -1078,11 +1084,17 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) { struct page *page; struct kmem_cache_order_objects oo = s->oo; + gfp_t alloc_gfp; flags |= s->allocflags; - page = alloc_slab_page(flags | __GFP_NOWARN | __GFP_NORETRY, node, - oo); + /* + * Let the initial higher-order allocation fail under memory pressure + * so we fall-back to the minimum order allocation. 
+ */ + alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL; + + page = alloc_slab_page(alloc_gfp, node, oo); if (unlikely(!page)) { oo = s->min; /* @@ -1095,6 +1107,24 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) stat(get_cpu_slab(s, raw_smp_processor_id()), ORDER_FALLBACK); } + + if (kmemcheck_enabled + && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) + { + int pages = 1 << oo_order(oo); + + kmemcheck_alloc_shadow(page, oo_order(oo), flags, node); + + /* + * Objects from caches that have a constructor don't get + * cleared when they're allocated, so we need to do it here. + */ + if (s->ctor) + kmemcheck_mark_uninitialized_pages(page, pages); + else + kmemcheck_mark_unallocated_pages(page, pages); + } + page->objects = oo_objects(oo); mod_zone_page_state(page_zone(page), (s->flags & SLAB_RECLAIM_ACCOUNT) ? @@ -1168,6 +1198,8 @@ static void __free_slab(struct kmem_cache *s, struct page *page) __ClearPageSlubDebug(page); } + kmemcheck_free_shadow(page, compound_order(page)); + mod_zone_page_state(page_zone(page), (s->flags & SLAB_RECLAIM_ACCOUNT) ? NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, @@ -1175,6 +1207,8 @@ static void __free_slab(struct kmem_cache *s, struct page *page) __ClearPageSlab(page); reset_page_mapcount(page); + if (current->reclaim_state) + current->reclaim_state->reclaimed_slab += pages; __free_pages(page, order); } @@ -1335,7 +1369,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) n = get_node(s, zone_to_nid(zone)); if (n && cpuset_zone_allowed_hardwall(zone, flags) && - n->nr_partial > n->min_partial) { + n->nr_partial > s->min_partial) { page = get_partial_node(n); if (page) return page; @@ -1387,7 +1421,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) slab_unlock(page); } else { stat(c, DEACTIVATE_EMPTY); - if (n->nr_partial < n->min_partial) { + if (n->nr_partial < s->min_partial) { /* * Adding an empty slab to the partial slabs in order * to avoid page allocator overhead. 
This slab needs @@ -1486,6 +1520,65 @@ static inline int node_match(struct kmem_cache_cpu *c, int node) return 1; } +static int count_free(struct page *page) +{ + return page->objects - page->inuse; +} + +static unsigned long count_partial(struct kmem_cache_node *n, + int (*get_count)(struct page *)) +{ + unsigned long flags; + unsigned long x = 0; + struct page *page; + + spin_lock_irqsave(&n->list_lock, flags); + list_for_each_entry(page, &n->partial, lru) + x += get_count(page); + spin_unlock_irqrestore(&n->list_lock, flags); + return x; +} + +static inline unsigned long node_nr_objs(struct kmem_cache_node *n) +{ +#ifdef CONFIG_SLUB_DEBUG + return atomic_long_read(&n->total_objects); +#else + return 0; +#endif +} + +static noinline void +slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) +{ + int node; + + printk(KERN_WARNING + "SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n", + nid, gfpflags); + printk(KERN_WARNING " cache: %s, object size: %d, buffer size: %d, " + "default order: %d, min order: %d\n", s->name, s->objsize, + s->size, oo_order(s->oo), oo_order(s->min)); + + for_each_online_node(node) { + struct kmem_cache_node *n = get_node(s, node); + unsigned long nr_slabs; + unsigned long nr_objs; + unsigned long nr_free; + + if (!n) + continue; + + nr_free = count_partial(n, count_free); + nr_slabs = node_nr_slabs(n); + nr_objs = node_nr_objs(n); + + printk(KERN_WARNING + " node %d: slabs: %ld, objs: %ld, free: %ld\n", + node, nr_slabs, nr_objs, nr_free); + } +} + /* * Slow path. The lockless freelist is empty or we need to perform * debugging duties. @@ -1567,6 +1660,8 @@ new_slab: c->page = new; goto load_freelist; } + if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) + slab_out_of_memory(s, gfpflags, node); return NULL; debug: if (!alloc_debug_processing(s, c->page, object, addr)) @@ -1596,6 +1691,9 @@ static __always_inline void *slab_alloc(struct kmem_cache *s, unsigned long flags; unsigned int objsize; + gfpflags &= gfp_allowed_mask; + + lockdep_trace_alloc(gfpflags); might_sleep_if(gfpflags & __GFP_WAIT); if (should_failslab(s->objsize, gfpflags)) @@ -1618,23 +1716,53 @@ static __always_inline void *slab_alloc(struct kmem_cache *s, if (unlikely((gfpflags & __GFP_ZERO) && object)) memset(object, 0, objsize); + kmemcheck_slab_alloc(s, gfpflags, object, c->objsize); + kmemleak_alloc_recursive(object, objsize, 1, s->flags, gfpflags); + return object; } void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) { - return slab_alloc(s, gfpflags, -1, _RET_IP_); + void *ret = slab_alloc(s, gfpflags, -1, _RET_IP_); + + trace_kmem_cache_alloc(_RET_IP_, ret, s->objsize, s->size, gfpflags); + + return ret; } EXPORT_SYMBOL(kmem_cache_alloc); +#ifdef CONFIG_KMEMTRACE +void *kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags) +{ + return slab_alloc(s, gfpflags, -1, _RET_IP_); +} +EXPORT_SYMBOL(kmem_cache_alloc_notrace); +#endif + #ifdef CONFIG_NUMA void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) { - return slab_alloc(s, gfpflags, node, _RET_IP_); + void *ret = slab_alloc(s, gfpflags, node, _RET_IP_); + + trace_kmem_cache_alloc_node(_RET_IP_, ret, + s->objsize, s->size, gfpflags, node); + + return ret; } EXPORT_SYMBOL(kmem_cache_alloc_node); #endif +#ifdef CONFIG_KMEMTRACE +void *kmem_cache_alloc_node_notrace(struct kmem_cache *s, + gfp_t gfpflags, + int node) +{ + return slab_alloc(s, gfpflags, node, _RET_IP_); +} +EXPORT_SYMBOL(kmem_cache_alloc_node_notrace); +#endif + /* * Slow patch handling. 
This may still be called frequently since objects * have a longer lifetime than the cpu slabs in most processing loads. @@ -1720,11 +1848,13 @@ static __always_inline void slab_free(struct kmem_cache *s, struct kmem_cache_cpu *c; unsigned long flags; + kmemleak_free_recursive(x, s->flags); local_irq_save(flags); c = get_cpu_slab(s, smp_processor_id()); + kmemcheck_slab_free(s, object, c->objsize); debug_check_no_locks_freed(object, c->objsize); if (!(s->flags & SLAB_DEBUG_OBJECTS)) - debug_check_no_obj_freed(object, s->objsize); + debug_check_no_obj_freed(object, c->objsize); if (likely(page == c->page && c->node >= 0)) { object[c->offset] = c->freelist; c->freelist = object; @@ -1742,6 +1872,8 @@ void kmem_cache_free(struct kmem_cache *s, void *x) page = virt_to_head_page(x); slab_free(s, page, x, _RET_IP_); + + trace_kmem_cache_free(_RET_IP_, x); } EXPORT_SYMBOL(kmem_cache_free); @@ -1844,6 +1976,7 @@ static inline int calculate_order(int size) int order; int min_objects; int fraction; + int max_objects; /* * Attempt to find best configuration for a slab. This @@ -1856,6 +1989,9 @@ static inline int calculate_order(int size) min_objects = slub_min_objects; if (!min_objects) min_objects = 4 * (fls(nr_cpu_ids) + 1); + max_objects = (PAGE_SIZE << slub_max_order)/size; + min_objects = min(min_objects, max_objects); + while (min_objects > 1) { fraction = 16; while (fraction >= 4) { @@ -1865,7 +2001,7 @@ static inline int calculate_order(int size) return order; fraction /= 2; } - min_objects /= 2; + min_objects --; } /* @@ -1880,7 +2016,7 @@ static inline int calculate_order(int size) * Doh this slab cannot be placed using slub_max_order. */ order = slab_order(size, 1, MAX_ORDER, 1); - if (order <= MAX_ORDER) + if (order < MAX_ORDER) return order; return -ENOSYS; } @@ -1928,17 +2064,6 @@ static void init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) { n->nr_partial = 0; - - /* - * The larger the object size is, the more pages we want on the partial - * list to avoid pounding the page allocator excessively. - */ - n->min_partial = ilog2(s->size); - if (n->min_partial < MIN_PARTIAL) - n->min_partial = MIN_PARTIAL; - else if (n->min_partial > MAX_PARTIAL) - n->min_partial = MAX_PARTIAL; - spin_lock_init(&n->list_lock); INIT_LIST_HEAD(&n->partial); #ifdef CONFIG_SLUB_DEBUG @@ -1996,7 +2121,7 @@ static struct kmem_cache_cpu *alloc_kmem_cache_cpu(struct kmem_cache *s, static void free_kmem_cache_cpu(struct kmem_cache_cpu *c, int cpu) { if (c < per_cpu(kmem_cache_cpu, cpu) || - c > per_cpu(kmem_cache_cpu, cpu) + NR_KMEM_CACHE_CPU) { + c >= per_cpu(kmem_cache_cpu, cpu) + NR_KMEM_CACHE_CPU) { kfree(c); return; } @@ -2181,6 +2306,15 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) } #endif +static void set_min_partial(struct kmem_cache *s, unsigned long min) +{ + if (min < MIN_PARTIAL) + min = MIN_PARTIAL; + else if (min > MAX_PARTIAL) + min = MAX_PARTIAL; + s->min_partial = min; +} + /* * calculate_sizes() determines the order and the distribution of data within * a slab object. @@ -2319,6 +2453,11 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, if (!calculate_sizes(s, -1)) goto error; + /* + * The larger the object size is, the more pages we want on the partial + * list to avoid pounding the page allocator excessively. 
+ */ + set_min_partial(s, ilog2(s->size)); s->refcount = 1; #ifdef CONFIG_NUMA s->remote_node_defrag_ratio = 1000; @@ -2465,6 +2604,8 @@ void kmem_cache_destroy(struct kmem_cache *s) "still has objects.\n", s->name, __func__); dump_stack(); } + if (s->flags & SLAB_DESTROY_BY_RCU) + rcu_barrier(); sysfs_slab_remove(s); } else up_write(&slub_lock); @@ -2475,7 +2616,7 @@ EXPORT_SYMBOL(kmem_cache_destroy); * Kmalloc subsystem *******************************************************************/ -struct kmem_cache kmalloc_caches[PAGE_SHIFT + 1] __cacheline_aligned; +struct kmem_cache kmalloc_caches[SLUB_PAGE_SHIFT] __cacheline_aligned; EXPORT_SYMBOL(kmalloc_caches); static int __init setup_slub_min_order(char *str) @@ -2490,6 +2631,7 @@ __setup("slub_min_order=", setup_slub_min_order); static int __init setup_slub_max_order(char *str) { get_option(&str, &slub_max_order); + slub_max_order = min(slub_max_order, MAX_ORDER - 1); return 1; } @@ -2521,13 +2663,16 @@ static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s, if (gfp_flags & SLUB_DMA) flags = SLAB_CACHE_DMA; - down_write(&slub_lock); + /* + * This function is called with IRQs disabled during early-boot on + * single CPU so there's no need to take slub_lock here. + */ if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN, flags, NULL)) goto panic; list_add(&s->list, &slab_caches); - up_write(&slub_lock); + if (sysfs_slab_add(s)) goto panic; return s; @@ -2537,7 +2682,7 @@ panic: } #ifdef CONFIG_ZONE_DMA -static struct kmem_cache *kmalloc_caches_dma[PAGE_SHIFT + 1]; +static struct kmem_cache *kmalloc_caches_dma[SLUB_PAGE_SHIFT]; static void sysfs_add_func(struct work_struct *w) { @@ -2560,6 +2705,7 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags) struct kmem_cache *s; char *text; size_t realsize; + unsigned long slabflags; s = kmalloc_caches_dma[index]; if (s) @@ -2581,9 +2727,18 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags) (unsigned int)realsize); s = kmalloc(kmem_size, flags & ~SLUB_DMA); + /* + * Must defer sysfs creation to a workqueue because we don't know + * what context we are called from. Before sysfs comes up, we don't + * need to do anything because our sysfs initcall will start by + * adding all existing slabs to sysfs. 
+ */ + slabflags = SLAB_CACHE_DMA|SLAB_NOTRACK; + if (slab_state >= SYSFS) + slabflags |= __SYSFS_ADD_DEFERRED; + if (!s || !text || !kmem_cache_open(s, flags, text, - realsize, ARCH_KMALLOC_MINALIGN, - SLAB_CACHE_DMA|__SYSFS_ADD_DEFERRED, NULL)) { + realsize, ARCH_KMALLOC_MINALIGN, slabflags, NULL)) { kfree(s); kfree(text); goto unlock_out; @@ -2592,7 +2747,8 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags) list_add(&s->list, &slab_caches); kmalloc_caches_dma[index] = s; - schedule_work(&sysfs_add_work); + if (slab_state >= SYSFS) + schedule_work(&sysfs_add_work); unlock_out: up_write(&slub_lock); @@ -2657,8 +2813,9 @@ static struct kmem_cache *get_slab(size_t size, gfp_t flags) void *__kmalloc(size_t size, gfp_t flags) { struct kmem_cache *s; + void *ret; - if (unlikely(size > PAGE_SIZE)) + if (unlikely(size > SLUB_MAX_SIZE)) return kmalloc_large(size, flags); s = get_slab(size, flags); @@ -2666,35 +2823,54 @@ void *__kmalloc(size_t size, gfp_t flags) if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - return slab_alloc(s, flags, -1, _RET_IP_); + ret = slab_alloc(s, flags, -1, _RET_IP_); + + trace_kmalloc(_RET_IP_, ret, size, s->size, flags); + + return ret; } EXPORT_SYMBOL(__kmalloc); static void *kmalloc_large_node(size_t size, gfp_t flags, int node) { - struct page *page = alloc_pages_node(node, flags | __GFP_COMP, - get_order(size)); + struct page *page; + void *ptr = NULL; + flags |= __GFP_COMP | __GFP_NOTRACK; + page = alloc_pages_node(node, flags, get_order(size)); if (page) - return page_address(page); - else - return NULL; + ptr = page_address(page); + + kmemleak_alloc(ptr, size, 1, flags); + return ptr; } #ifdef CONFIG_NUMA void *__kmalloc_node(size_t size, gfp_t flags, int node) { struct kmem_cache *s; + void *ret; + + if (unlikely(size > SLUB_MAX_SIZE)) { + ret = kmalloc_large_node(size, flags, node); - if (unlikely(size > PAGE_SIZE)) - return kmalloc_large_node(size, flags, node); + trace_kmalloc_node(_RET_IP_, ret, + size, PAGE_SIZE << get_order(size), + flags, node); + + return ret; + } s = get_slab(size, flags); if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - return slab_alloc(s, flags, node, _RET_IP_); + ret = slab_alloc(s, flags, node, _RET_IP_); + + trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node); + + return ret; } EXPORT_SYMBOL(__kmalloc_node); #endif @@ -2736,18 +2912,22 @@ size_t ksize(const void *object) */ return s->size; } +EXPORT_SYMBOL(ksize); void kfree(const void *x) { struct page *page; void *object = (void *)x; + trace_kfree(_RET_IP_, x); + if (unlikely(ZERO_OR_NULL_PTR(x))) return; page = virt_to_head_page(x); if (unlikely(!PageSlab(page))) { BUG_ON(!PageCompound(page)); + kmemleak_free(x); put_page(page); return; } @@ -2965,7 +3145,7 @@ void __init kmem_cache_init(void) * kmem_cache_open for slab_state == DOWN. 
*/ create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node", - sizeof(struct kmem_cache_node), GFP_KERNEL); + sizeof(struct kmem_cache_node), GFP_NOWAIT); kmalloc_caches[0].refcount = -1; caches++; @@ -2978,16 +3158,16 @@ void __init kmem_cache_init(void) /* Caches that are not of the two-to-the-power-of size */ if (KMALLOC_MIN_SIZE <= 64) { create_kmalloc_cache(&kmalloc_caches[1], - "kmalloc-96", 96, GFP_KERNEL); + "kmalloc-96", 96, GFP_NOWAIT); caches++; create_kmalloc_cache(&kmalloc_caches[2], - "kmalloc-192", 192, GFP_KERNEL); + "kmalloc-192", 192, GFP_NOWAIT); caches++; } - for (i = KMALLOC_SHIFT_LOW; i <= PAGE_SHIFT; i++) { + for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) { create_kmalloc_cache(&kmalloc_caches[i], - "kmalloc", 1 << i, GFP_KERNEL); + "kmalloc", 1 << i, GFP_NOWAIT); caches++; } @@ -3022,9 +3202,9 @@ void __init kmem_cache_init(void) slab_state = UP; /* Provide the correct kmalloc names now that the caches are up */ - for (i = KMALLOC_SHIFT_LOW; i <= PAGE_SHIFT; i++) + for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) kmalloc_caches[i]. name = - kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i); + kasprintf(GFP_NOWAIT, "kmalloc-%d", 1 << i); #ifdef CONFIG_SMP register_cpu_notifier(&slab_notifier); @@ -3042,6 +3222,10 @@ void __init kmem_cache_init(void) nr_cpu_ids, nr_node_ids); } +void __init kmem_cache_init_late(void) +{ +} + /* * Find a mergeable slab cache */ @@ -3221,8 +3405,9 @@ static struct notifier_block __cpuinitdata slab_notifier = { void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) { struct kmem_cache *s; + void *ret; - if (unlikely(size > PAGE_SIZE)) + if (unlikely(size > SLUB_MAX_SIZE)) return kmalloc_large(size, gfpflags); s = get_slab(size, gfpflags); @@ -3230,15 +3415,21 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - return slab_alloc(s, gfpflags, -1, caller); + ret = slab_alloc(s, gfpflags, -1, caller); + + /* Honor the call site pointer we recieved. */ + trace_kmalloc(caller, ret, size, s->size, gfpflags); + + return ret; } void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, int node, unsigned long caller) { struct kmem_cache *s; + void *ret; - if (unlikely(size > PAGE_SIZE)) + if (unlikely(size > SLUB_MAX_SIZE)) return kmalloc_large_node(size, gfpflags, node); s = get_slab(size, gfpflags); @@ -3246,24 +3437,15 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - return slab_alloc(s, gfpflags, node, caller); -} + ret = slab_alloc(s, gfpflags, node, caller); -#ifdef CONFIG_SLUB_DEBUG -static unsigned long count_partial(struct kmem_cache_node *n, - int (*get_count)(struct page *)) -{ - unsigned long flags; - unsigned long x = 0; - struct page *page; + /* Honor the call site pointer we recieved. 
*/ + trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node); - spin_lock_irqsave(&n->list_lock, flags); - list_for_each_entry(page, &n->partial, lru) - x += get_count(page); - spin_unlock_irqrestore(&n->list_lock, flags); - return x; + return ret; } +#ifdef CONFIG_SLUB_DEBUG static int count_inuse(struct page *page) { return page->inuse; @@ -3274,11 +3456,6 @@ static int count_total(struct page *page) return page->objects; } -static int count_free(struct page *page) -{ - return page->objects - page->inuse; -} - static int validate_slab(struct kmem_cache *s, struct page *page, unsigned long *map) { @@ -3647,7 +3824,7 @@ static int list_locations(struct kmem_cache *s, char *buf, to_cpumask(l->cpus)); } - if (num_online_nodes() > 1 && !nodes_empty(l->nodes) && + if (nr_online_nodes > 1 && !nodes_empty(l->nodes) && len < PAGE_SIZE - 60) { len += sprintf(buf + len, " nodes="); len += nodelist_scnprintf(buf + len, PAGE_SIZE - len - 50, @@ -3835,6 +4012,26 @@ static ssize_t order_show(struct kmem_cache *s, char *buf) } SLAB_ATTR(order); +static ssize_t min_partial_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%lu\n", s->min_partial); +} + +static ssize_t min_partial_store(struct kmem_cache *s, const char *buf, + size_t length) +{ + unsigned long min; + int err; + + err = strict_strtoul(buf, 10, &min); + if (err) + return err; + + set_min_partial(s, min); + return length; +} +SLAB_ATTR(min_partial); + static ssize_t ctor_show(struct kmem_cache *s, char *buf) { if (s->ctor) { @@ -4150,6 +4347,7 @@ static struct attribute *slab_attrs[] = { &object_size_attr.attr, &objs_per_slab_attr.attr, &order_attr.attr, + &min_partial_attr.attr, &objects_attr.attr, &objects_partial_attr.attr, &total_objects_attr.attr, @@ -4301,6 +4499,8 @@ static char *create_unique_id(struct kmem_cache *s) *p++ = 'a'; if (s->flags & SLAB_DEBUG_FREE) *p++ = 'F'; + if (!(s->flags & SLAB_NOTRACK)) + *p++ = 't'; if (p != name + 1) *p++ = '-'; p += sprintf(p, "%07d", s->size); diff --git a/mm/sparse.c b/mm/sparse.c index 083f5b63e7a..da432d9f0ae 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -164,9 +164,7 @@ void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn, WARN_ON_ONCE(1); *start_pfn = max_sparsemem_pfn; *end_pfn = max_sparsemem_pfn; - } - - if (*end_pfn > max_sparsemem_pfn) { + } else if (*end_pfn > max_sparsemem_pfn) { mminit_dprintk(MMINIT_WARNING, "pfnvalidation", "End of range %lu -> %lu exceeds SPARSEMEM max %lu\n", *start_pfn, *end_pfn, max_sparsemem_pfn); diff --git a/mm/swap.c b/mm/swap.c index 8adb9feb61e..cb29ae5d33a 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -448,8 +448,8 @@ void pagevec_strip(struct pagevec *pvec) for (i = 0; i < pagevec_count(pvec); i++) { struct page *page = pvec->pages[i]; - if (PagePrivate(page) && trylock_page(page)) { - if (PagePrivate(page)) + if (page_has_private(page) && trylock_page(page)) { + if (page_has_private(page)) try_to_release_page(page, 0); unlock_page(page); } @@ -457,29 +457,6 @@ void pagevec_strip(struct pagevec *pvec) } /** - * pagevec_swap_free - try to free swap space from the pages in a pagevec - * @pvec: pagevec with swapcache pages to free the swap space of - * - * The caller needs to hold an extra reference to each page and - * not hold the page lock on the pages. This function uses a - * trylock on the page lock so it may not always free the swap - * space associated with a page. 
- */ -void pagevec_swap_free(struct pagevec *pvec) -{ - int i; - - for (i = 0; i < pagevec_count(pvec); i++) { - struct page *page = pvec->pages[i]; - - if (PageSwapCache(page) && trylock_page(page)) { - try_to_free_swap(page); - unlock_page(page); - } - } -} - -/** * pagevec_lookup - gang pagecache lookup * @pvec: Where the resulting pages are placed * @mapping: The address_space to search @@ -514,49 +491,6 @@ unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping, EXPORT_SYMBOL(pagevec_lookup_tag); -#ifdef CONFIG_SMP -/* - * We tolerate a little inaccuracy to avoid ping-ponging the counter between - * CPUs - */ -#define ACCT_THRESHOLD max(16, NR_CPUS * 2) - -static DEFINE_PER_CPU(long, committed_space); - -void vm_acct_memory(long pages) -{ - long *local; - - preempt_disable(); - local = &__get_cpu_var(committed_space); - *local += pages; - if (*local > ACCT_THRESHOLD || *local < -ACCT_THRESHOLD) { - atomic_long_add(*local, &vm_committed_space); - *local = 0; - } - preempt_enable(); -} - -#ifdef CONFIG_HOTPLUG_CPU - -/* Drop the CPU's cached committed space back into the central pool. */ -static int cpu_swap_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) -{ - long *committed; - - committed = &per_cpu(committed_space, (long)hcpu); - if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { - atomic_long_add(*committed, &vm_committed_space); - *committed = 0; - drain_cpu_pagevecs((long)hcpu); - } - return NOTIFY_OK; -} -#endif /* CONFIG_HOTPLUG_CPU */ -#endif /* CONFIG_SMP */ - /* * Perform any setup for the swap system */ @@ -577,7 +511,4 @@ void __init swap_setup(void) * Right now other parts of the system means that we * _really_ don't want to cluster much more */ -#ifdef CONFIG_HOTPLUG_CPU - hotcpu_notifier(cpu_swap_callback, 0); -#endif } diff --git a/mm/swap_state.c b/mm/swap_state.c index 3ecea98ecb4..5ae6b8b78c8 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -34,6 +34,7 @@ static const struct address_space_operations swap_aops = { }; static struct backing_dev_info swap_backing_dev_info = { + .name = "swap", .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, .unplug_io_fn = swap_unplug_io_fn, }; @@ -109,8 +110,6 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) */ void __delete_from_swap_cache(struct page *page) { - swp_entry_t ent = {.val = page_private(page)}; - VM_BUG_ON(!PageLocked(page)); VM_BUG_ON(!PageSwapCache(page)); VM_BUG_ON(PageWriteback(page)); @@ -121,13 +120,11 @@ void __delete_from_swap_cache(struct page *page) total_swapcache_pages--; __dec_zone_page_state(page, NR_FILE_PAGES); INC_CACHE_INFO(del_total); - mem_cgroup_uncharge_swapcache(page, ent); } /** * add_to_swap - allocate swap space for a page * @page: page we want to move to swap - * @gfp_mask: memory allocation flags * * Allocate swap space for the page and add the page to the * swap cache. Caller needs to hold the page lock. 
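An aside on the mm/swap.c hunk above that deletes vm_acct_memory() and its CPU-hotplug callback: the removed code implemented threshold-batched accounting, where each CPU accumulates a private delta and only folds it into the shared vm_committed_space counter once the delta exceeds ACCT_THRESHOLD, tolerating "a little inaccuracy" to avoid ping-ponging the shared counter between CPUs. The stand-alone program below is a minimal user-space model of that pattern for readers unfamiliar with it; the fixed NR_CPUS value and the plain array standing in for the per-CPU variables are assumptions of the sketch, which is illustrative only and not part of the patch.

#include <stdio.h>

#define NR_CPUS         4
#define ACCT_THRESHOLD  ((NR_CPUS * 2) > 16 ? (NR_CPUS * 2) : 16)

static long vm_committed_total;         /* models the shared atomic counter */
static long committed_local[NR_CPUS];   /* models DEFINE_PER_CPU(long, committed_space) */

static void acct_memory(int cpu, long pages)
{
        long *local = &committed_local[cpu];

        *local += pages;
        /* Fold the batched delta into the shared counter only once it grows
         * large, trading a little accuracy for fewer contended updates. */
        if (*local > ACCT_THRESHOLD || *local < -ACCT_THRESHOLD) {
                vm_committed_total += *local;   /* atomic_long_add() in the kernel */
                *local = 0;
        }
}

int main(void)
{
        int i;

        for (i = 0; i < 1000; i++)
                acct_memory(i % NR_CPUS, (i % 3) - 1);  /* mix of -1, 0, +1 deltas */

        printf("shared total: %ld (per-CPU remainders still unfolded)\n",
               vm_committed_total);
        return 0;
}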
@@ -165,11 +162,11 @@ int add_to_swap(struct page *page) return 1; case -EEXIST: /* Raced with "speculative" read_swap_cache_async */ - swap_free(entry); + swapcache_free(entry, NULL); continue; default: /* -ENOMEM radix-tree allocation failure */ - swap_free(entry); + swapcache_free(entry, NULL); return 0; } } @@ -191,7 +188,7 @@ void delete_from_swap_cache(struct page *page) __delete_from_swap_cache(page); spin_unlock_irq(&swapper_space.tree_lock); - swap_free(entry); + swapcache_free(entry, page); page_cache_release(page); } @@ -295,7 +292,10 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, /* * Swap entry may have been freed since our caller observed it. */ - if (!swap_duplicate(entry)) + err = swapcache_prepare(entry); + if (err == -EEXIST) /* seems racy */ + continue; + if (err) /* swp entry is obsolete ? */ break; /* @@ -314,12 +314,12 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * Initiate read into locked page and return. */ lru_cache_add_anon(new_page); - swap_readpage(NULL, new_page); + swap_readpage(new_page); return new_page; } ClearPageSwapBacked(new_page); __clear_page_locked(new_page); - swap_free(entry); + swapcache_free(entry, NULL); } while (err != -ENOMEM); if (new_page) diff --git a/mm/swapfile.c b/mm/swapfile.c index f48b831e5e5..8ffdc0d23c5 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -53,6 +53,59 @@ static struct swap_info_struct swap_info[MAX_SWAPFILES]; static DEFINE_MUTEX(swapon_mutex); +/* For reference count accounting in swap_map */ +/* enum for swap_map[] handling. internal use only */ +enum { + SWAP_MAP = 0, /* ops for reference from swap users */ + SWAP_CACHE, /* ops for reference from swap cache */ +}; + +static inline int swap_count(unsigned short ent) +{ + return ent & SWAP_COUNT_MASK; +} + +static inline bool swap_has_cache(unsigned short ent) +{ + return !!(ent & SWAP_HAS_CACHE); +} + +static inline unsigned short encode_swapmap(int count, bool has_cache) +{ + unsigned short ret = count; + + if (has_cache) + return SWAP_HAS_CACHE | ret; + return ret; +} + +/* returnes 1 if swap entry is freed */ +static int +__try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset) +{ + int type = si - swap_info; + swp_entry_t entry = swp_entry(type, offset); + struct page *page; + int ret = 0; + + page = find_get_page(&swapper_space, entry.val); + if (!page) + return 0; + /* + * This function is called from scan_swap_map() and it's called + * by vmscan.c at reclaiming pages. So, we hold a lock on a page, here. + * We have to use trylock for avoiding deadlock. This is a special + * case and you should use try_to_free_swap() with explicit lock_page() + * in usual operations. + */ + if (trylock_page(page)) { + ret = try_to_free_swap(page); + unlock_page(page); + } + page_cache_release(page); + return ret; +} + /* * We need this because the bdev->unplug_fn can sleep and we cannot * hold swap_lock while calling the unplug_fn. And swap_lock @@ -167,7 +220,8 @@ static int wait_for_discard(void *word) #define SWAPFILE_CLUSTER 256 #define LATENCY_LIMIT 256 -static inline unsigned long scan_swap_map(struct swap_info_struct *si) +static inline unsigned long scan_swap_map(struct swap_info_struct *si, + int cache) { unsigned long offset; unsigned long scan_base; @@ -273,6 +327,19 @@ checks: goto no_page; if (offset > si->highest_bit) scan_base = offset = si->lowest_bit; + + /* reuse swap entry of cache-only swap if not busy. 
*/ + if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { + int swap_was_freed; + spin_unlock(&swap_lock); + swap_was_freed = __try_to_reclaim_swap(si, offset); + spin_lock(&swap_lock); + /* entry was freed successfully, try to use this again */ + if (swap_was_freed) + goto checks; + goto scan; /* check next one */ + } + if (si->swap_map[offset]) goto scan; @@ -285,7 +352,10 @@ checks: si->lowest_bit = si->max; si->highest_bit = 0; } - si->swap_map[offset] = 1; + if (cache == SWAP_CACHE) /* at usual swap-out via vmscan.c */ + si->swap_map[offset] = encode_swapmap(0, true); + else /* at suspend */ + si->swap_map[offset] = encode_swapmap(1, false); si->cluster_next = offset + 1; si->flags -= SWP_SCANNING; @@ -351,6 +421,10 @@ scan: spin_lock(&swap_lock); goto checks; } + if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { + spin_lock(&swap_lock); + goto checks; + } if (unlikely(--latency_ration < 0)) { cond_resched(); latency_ration = LATENCY_LIMIT; @@ -362,6 +436,10 @@ scan: spin_lock(&swap_lock); goto checks; } + if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { + spin_lock(&swap_lock); + goto checks; + } if (unlikely(--latency_ration < 0)) { cond_resched(); latency_ration = LATENCY_LIMIT; @@ -401,7 +479,8 @@ swp_entry_t get_swap_page(void) continue; swap_list.next = next; - offset = scan_swap_map(si); + /* This is called for allocating swap entry for cache */ + offset = scan_swap_map(si, SWAP_CACHE); if (offset) { spin_unlock(&swap_lock); return swp_entry(type, offset); @@ -415,6 +494,7 @@ noswap: return (swp_entry_t) {0}; } +/* The only caller of this function is now susupend routine */ swp_entry_t get_swap_page_of_type(int type) { struct swap_info_struct *si; @@ -424,7 +504,8 @@ swp_entry_t get_swap_page_of_type(int type) si = swap_info + type; if (si->flags & SWP_WRITEOK) { nr_swap_pages--; - offset = scan_swap_map(si); + /* This is called for allocating swap entry, not cache */ + offset = scan_swap_map(si, SWAP_MAP); if (offset) { spin_unlock(&swap_lock); return swp_entry(type, offset); @@ -471,26 +552,40 @@ out: return NULL; } -static int swap_entry_free(struct swap_info_struct *p, swp_entry_t ent) +static int swap_entry_free(struct swap_info_struct *p, + swp_entry_t ent, int cache) { unsigned long offset = swp_offset(ent); - int count = p->swap_map[offset]; - - if (count < SWAP_MAP_MAX) { - count--; - p->swap_map[offset] = count; - if (!count) { - if (offset < p->lowest_bit) - p->lowest_bit = offset; - if (offset > p->highest_bit) - p->highest_bit = offset; - if (p->prio > swap_info[swap_list.next].prio) - swap_list.next = p - swap_info; - nr_swap_pages++; - p->inuse_pages--; - mem_cgroup_uncharge_swap(ent); + int count = swap_count(p->swap_map[offset]); + bool has_cache; + + has_cache = swap_has_cache(p->swap_map[offset]); + + if (cache == SWAP_MAP) { /* dropping usage count of swap */ + if (count < SWAP_MAP_MAX) { + count--; + p->swap_map[offset] = encode_swapmap(count, has_cache); } + } else { /* dropping swap cache flag */ + VM_BUG_ON(!has_cache); + p->swap_map[offset] = encode_swapmap(count, false); + + } + /* return code. 
*/ + count = p->swap_map[offset]; + /* free if no reference */ + if (!count) { + if (offset < p->lowest_bit) + p->lowest_bit = offset; + if (offset > p->highest_bit) + p->highest_bit = offset; + if (p->prio > swap_info[swap_list.next].prio) + swap_list.next = p - swap_info; + nr_swap_pages++; + p->inuse_pages--; } + if (!swap_count(count)) + mem_cgroup_uncharge_swap(ent); return count; } @@ -504,9 +599,33 @@ void swap_free(swp_entry_t entry) p = swap_info_get(entry); if (p) { - swap_entry_free(p, entry); + swap_entry_free(p, entry, SWAP_MAP); + spin_unlock(&swap_lock); + } +} + +/* + * Called after dropping swapcache to decrease refcnt to swap entries. + */ +void swapcache_free(swp_entry_t entry, struct page *page) +{ + struct swap_info_struct *p; + int ret; + + p = swap_info_get(entry); + if (p) { + ret = swap_entry_free(p, entry, SWAP_CACHE); + if (page) { + bool swapout; + if (ret) + swapout = true; /* the end of swap out */ + else + swapout = false; /* no more swap users! */ + mem_cgroup_uncharge_swapcache(page, entry, swapout); + } spin_unlock(&swap_lock); } + return; } /* @@ -521,8 +640,7 @@ static inline int page_swapcount(struct page *page) entry.val = page_private(page); p = swap_info_get(entry); if (p) { - /* Subtract the 1 for the swap cache itself */ - count = p->swap_map[swp_offset(entry)] - 1; + count = swap_count(p->swap_map[swp_offset(entry)]); spin_unlock(&swap_lock); } return count; @@ -584,7 +702,7 @@ int free_swap_and_cache(swp_entry_t entry) p = swap_info_get(entry); if (p) { - if (swap_entry_free(p, entry) == 1) { + if (swap_entry_free(p, entry, SWAP_MAP) == SWAP_HAS_CACHE) { page = find_get_page(&swapper_space, entry.val); if (page && !trylock_page(page)) { page_cache_release(page); @@ -635,7 +753,7 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) if (!bdev) { if (bdev_p) - *bdev_p = sis->bdev; + *bdev_p = bdgrab(sis->bdev); spin_unlock(&swap_lock); return i; @@ -647,7 +765,7 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) struct swap_extent, list); if (se->start_block == offset) { if (bdev_p) - *bdev_p = sis->bdev; + *bdev_p = bdgrab(sis->bdev); spin_unlock(&swap_lock); bdput(bdev); @@ -698,8 +816,10 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, pte_t *pte; int ret = 1; - if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, GFP_KERNEL, &ptr)) + if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, GFP_KERNEL, &ptr)) { ret = -ENOMEM; + goto out_nolock; + } pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { @@ -723,6 +843,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, activate_page(page); out: pte_unmap_unlock(pte, ptl); +out_nolock: return ret; } @@ -888,7 +1009,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, i = 1; } count = si->swap_map[i]; - if (count && count != SWAP_MAP_BAD) + if (count && swap_count(count) != SWAP_MAP_BAD) break; } return i; @@ -992,13 +1113,13 @@ static int try_to_unuse(unsigned int type) */ shmem = 0; swcount = *swap_map; - if (swcount > 1) { + if (swap_count(swcount)) { if (start_mm == &init_mm) shmem = shmem_unuse(entry, page); else retval = unuse_mm(start_mm, entry, page); } - if (*swap_map > 1) { + if (swap_count(*swap_map)) { int set_start_mm = (*swap_map >= swcount); struct list_head *p = &start_mm->mmlist; struct mm_struct *new_start_mm = start_mm; @@ -1008,7 +1129,7 @@ static int try_to_unuse(unsigned int type) atomic_inc(&new_start_mm->mm_users); 
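A brief aside on the swap_map layout these mm/swapfile.c hunks introduce: each unsigned short slot now carries both a reference count and a flag recording whether the swap cache holds a reference, manipulated through the swap_count(), swap_has_cache() and encode_swapmap() helpers added earlier in this file's diff. The stand-alone sketch below mirrors those three helpers so the encoding can be seen in isolation; the SWAP_HAS_CACHE and SWAP_COUNT_MASK values are illustrative assumptions, since their real definitions live in the swap headers rather than in this diff, and the program is not part of the patch.

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

#define SWAP_HAS_CACHE   0x8000  /* assumed flag bit: swap cache holds a reference */
#define SWAP_COUNT_MASK  0x7fff  /* assumed mask for the map count */

static int swap_count(unsigned short ent)
{
        return ent & SWAP_COUNT_MASK;
}

static bool swap_has_cache(unsigned short ent)
{
        return !!(ent & SWAP_HAS_CACHE);
}

static unsigned short encode_swapmap(int count, bool has_cache)
{
        unsigned short ret = count;

        if (has_cache)
                return SWAP_HAS_CACHE | ret;
        return ret;
}

int main(void)
{
        /* Two user references plus a swap-cache reference... */
        unsigned short ent = encode_swapmap(2, true);

        assert(swap_count(ent) == 2 && swap_has_cache(ent));

        /* ...then the swap cache drops its reference, as swapcache_free() does. */
        ent = encode_swapmap(swap_count(ent), false);
        assert(swap_count(ent) == 2 && !swap_has_cache(ent));

        printf("encoded value without the cache flag: 0x%04x\n", (unsigned)ent);
        return 0;
}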
atomic_inc(&prev_mm->mm_users); spin_lock(&mmlist_lock); - while (*swap_map > 1 && !retval && !shmem && + while (swap_count(*swap_map) && !retval && !shmem && (p = p->next) != &start_mm->mmlist) { mm = list_entry(p, struct mm_struct, mmlist); if (!atomic_inc_not_zero(&mm->mm_users)) @@ -1020,14 +1141,16 @@ static int try_to_unuse(unsigned int type) cond_resched(); swcount = *swap_map; - if (swcount <= 1) + if (!swap_count(swcount)) /* any usage ? */ ; else if (mm == &init_mm) { set_start_mm = 1; shmem = shmem_unuse(entry, page); } else retval = unuse_mm(mm, entry, page); - if (set_start_mm && *swap_map < swcount) { + + if (set_start_mm && + swap_count(*swap_map) < swcount) { mmput(new_start_mm); atomic_inc(&mm->mm_users); new_start_mm = mm; @@ -1054,21 +1177,25 @@ static int try_to_unuse(unsigned int type) } /* - * How could swap count reach 0x7fff when the maximum - * pid is 0x7fff, and there's no way to repeat a swap - * page within an mm (except in shmem, where it's the - * shared object which takes the reference count)? - * We believe SWAP_MAP_MAX cannot occur in Linux 2.4. - * + * How could swap count reach 0x7ffe ? + * There's no way to repeat a swap page within an mm + * (except in shmem, where it's the shared object which takes + * the reference count)? + * We believe SWAP_MAP_MAX cannot occur.(if occur, unsigned + * short is too small....) * If that's wrong, then we should worry more about * exit_mmap() and do_munmap() cases described above: * we might be resetting SWAP_MAP_MAX too early here. * We know "Undead"s can happen, they're okay, so don't * report them; but do report if we reset SWAP_MAP_MAX. */ - if (*swap_map == SWAP_MAP_MAX) { + /* We might release the lock_page() in unuse_mm(). */ + if (!PageSwapCache(page) || page_private(page) != entry.val) + goto retry; + + if (swap_count(*swap_map) == SWAP_MAP_MAX) { spin_lock(&swap_lock); - *swap_map = 1; + *swap_map = encode_swapmap(0, true); spin_unlock(&swap_lock); reset_overflow = 1; } @@ -1086,7 +1213,8 @@ static int try_to_unuse(unsigned int type) * pages would be incorrect if swap supported "shared * private" pages, but they are handled by tmpfs files. */ - if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) { + if (swap_count(*swap_map) && + PageDirty(page) && PageSwapCache(page)) { struct writeback_control wbc = { .sync_mode = WB_SYNC_NONE, }; @@ -1113,6 +1241,7 @@ static int try_to_unuse(unsigned int type) * mark page dirty so shrink_page_list will preserve it. */ SetPageDirty(page); +retry: unlock_page(page); page_cache_release(page); @@ -1939,15 +2068,23 @@ void si_swapinfo(struct sysinfo *val) * * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as * "permanent", but will be reclaimed by the next swapoff. + * Returns error code in following case. + * - success -> 0 + * - swp_entry is invalid -> EINVAL + * - swp_entry is migration entry -> EINVAL + * - swap-cache reference is requested but there is already one. -> EEXIST + * - swap-cache reference is requested but the entry is not used. 
-> ENOENT */ -int swap_duplicate(swp_entry_t entry) +static int __swap_duplicate(swp_entry_t entry, bool cache) { struct swap_info_struct * p; unsigned long offset, type; - int result = 0; + int result = -EINVAL; + int count; + bool has_cache; if (is_migration_entry(entry)) - return 1; + return -EINVAL; type = swp_type(entry); if (type >= nr_swapfiles) @@ -1956,17 +2093,40 @@ offset = swp_offset(entry); spin_lock(&swap_lock); - if (offset < p->max && p->swap_map[offset]) { - if (p->swap_map[offset] < SWAP_MAP_MAX - 1) { - p->swap_map[offset]++; - result = 1; - } else if (p->swap_map[offset] <= SWAP_MAP_MAX) { + + if (unlikely(offset >= p->max)) + goto unlock_out; + + count = swap_count(p->swap_map[offset]); + has_cache = swap_has_cache(p->swap_map[offset]); + + if (cache == SWAP_CACHE) { /* called for swapcache/swapin-readahead */ + + /* set SWAP_HAS_CACHE if there is no cache and entry is used */ + if (!has_cache && count) { + p->swap_map[offset] = encode_swapmap(count, true); + result = 0; + } else if (has_cache) /* someone added cache */ + result = -EEXIST; + else if (!count) /* no users */ + result = -ENOENT; + + } else if (count || has_cache) { + if (count < SWAP_MAP_MAX - 1) { + p->swap_map[offset] = encode_swapmap(count + 1, + has_cache); + result = 0; + } else if (count <= SWAP_MAP_MAX) { if (swap_overflow++ < 5) - printk(KERN_WARNING "swap_dup: swap entry overflow\n"); - p->swap_map[offset] = SWAP_MAP_MAX; - result = 1; + printk(KERN_WARNING + "swap_dup: swap entry overflow\n"); + p->swap_map[offset] = encode_swapmap(SWAP_MAP_MAX, + has_cache); + result = 0; } - } + } else + result = -ENOENT; /* unused swap entry */ +unlock_out: spin_unlock(&swap_lock); out: return result; @@ -1975,6 +2135,27 @@ bad_file: printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val); goto out; } +/* + * increase reference count of swap entry by 1. + */ +void swap_duplicate(swp_entry_t entry) +{ + __swap_duplicate(entry, SWAP_MAP); +} + +/* + * @entry: swap entry for which we allocate swap cache. + * + * Called when allocating swap cache for an existing swap entry. + * This can return error codes. Returns 0 on success. + * -EEXIST means there is already a swap cache entry for it. + * Note: return code is different from swap_duplicate().
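Given the return-code contract spelled out above, the expected call pattern is the one read_swap_cache_async() adopts earlier in this diff: retry when swapcache_prepare() reports -EEXIST (another task is racing to install the page) and give up on any other error. The program below is a user-space model of that loop with swapcache_prepare() replaced by a canned stub; the stub's name and its fake result sequence are assumptions made purely for illustration.

#include <errno.h>
#include <stdio.h>

/* Canned results standing in for swapcache_prepare(): two races, then success. */
static const int fake_results[] = { -EEXIST, -EEXIST, 0 };
static unsigned int ncalls;

static int swapcache_prepare_stub(void)
{
        return fake_results[ncalls++];
}

int main(void)
{
        int err;

        do {
                err = swapcache_prepare_stub();
                if (err == -EEXIST)
                        continue;       /* raced with another swap-in, try again */
                if (err)
                        break;          /* entry went away (-ENOENT etc.), give up */

                printf("SWAP_HAS_CACHE reference taken; safe to add the page\n");
                return 0;
        } while (1);

        printf("giving up, error %d\n", err);
        return 1;
}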
+ */ +int swapcache_prepare(swp_entry_t entry) +{ + return __swap_duplicate(entry, SWAP_CACHE); +} + struct swap_info_struct * get_swap_info_struct(unsigned type) @@ -2013,7 +2194,7 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset) /* Don't read in free or bad pages */ if (!si->swap_map[toff]) break; - if (si->swap_map[toff] == SWAP_MAP_BAD) + if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD) break; } /* Count contiguous allocated slots below our target */ @@ -2021,7 +2202,7 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset) /* Don't read in free or bad pages */ if (!si->swap_map[toff]) break; - if (si->swap_map[toff] == SWAP_MAP_BAD) + if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD) break; } spin_unlock(&swap_lock); diff --git a/mm/thrash.c b/mm/thrash.c index c4c5205a9c3..2372d4ed5dd 100644 --- a/mm/thrash.c +++ b/mm/thrash.c @@ -26,47 +26,45 @@ static DEFINE_SPINLOCK(swap_token_lock); struct mm_struct *swap_token_mm; static unsigned int global_faults; -void grab_swap_token(void) +void grab_swap_token(struct mm_struct *mm) { int current_interval; global_faults++; - current_interval = global_faults - current->mm->faultstamp; + current_interval = global_faults - mm->faultstamp; if (!spin_trylock(&swap_token_lock)) return; /* First come first served */ if (swap_token_mm == NULL) { - current->mm->token_priority = current->mm->token_priority + 2; - swap_token_mm = current->mm; + mm->token_priority = mm->token_priority + 2; + swap_token_mm = mm; goto out; } - if (current->mm != swap_token_mm) { - if (current_interval < current->mm->last_interval) - current->mm->token_priority++; + if (mm != swap_token_mm) { + if (current_interval < mm->last_interval) + mm->token_priority++; else { - if (likely(current->mm->token_priority > 0)) - current->mm->token_priority--; + if (likely(mm->token_priority > 0)) + mm->token_priority--; } /* Check if we deserve the token */ - if (current->mm->token_priority > - swap_token_mm->token_priority) { - current->mm->token_priority += 2; - swap_token_mm = current->mm; + if (mm->token_priority > swap_token_mm->token_priority) { + mm->token_priority += 2; + swap_token_mm = mm; } } else { /* Token holder came in again! */ - current->mm->token_priority += 2; + mm->token_priority += 2; } out: - current->mm->faultstamp = global_faults; - current->mm->last_interval = current_interval; + mm->faultstamp = global_faults; + mm->last_interval = current_interval; spin_unlock(&swap_token_lock); -return; } /* Called on process exit. 
*/ diff --git a/mm/truncate.c b/mm/truncate.c index 1229211104f..ccc3ecf7cb9 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -50,7 +50,7 @@ void do_invalidatepage(struct page *page, unsigned long offset) static inline void truncate_partial_page(struct page *page, unsigned partial) { zero_user_segment(page, partial, PAGE_CACHE_SIZE); - if (PagePrivate(page)) + if (page_has_private(page)) do_invalidatepage(page, partial); } @@ -99,7 +99,7 @@ truncate_complete_page(struct address_space *mapping, struct page *page) if (page->mapping != mapping) return; - if (PagePrivate(page)) + if (page_has_private(page)) do_invalidatepage(page, 0); cancel_dirty_page(page, PAGE_CACHE_SIZE); @@ -126,7 +126,7 @@ invalidate_complete_page(struct address_space *mapping, struct page *page) if (page->mapping != mapping) return 0; - if (PagePrivate(page) && !try_to_release_page(page, 0)) + if (page_has_private(page) && !try_to_release_page(page, 0)) return 0; clear_page_mlock(page); @@ -267,8 +267,21 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart) } EXPORT_SYMBOL(truncate_inode_pages); -unsigned long __invalidate_mapping_pages(struct address_space *mapping, - pgoff_t start, pgoff_t end, bool be_atomic) +/** + * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode + * @mapping: the address_space which holds the pages to invalidate + * @start: the offset 'from' which to invalidate + * @end: the offset 'to' which to invalidate (inclusive) + * + * This function only removes the unlocked pages, if you want to + * remove all the pages of one inode, you must call truncate_inode_pages. + * + * invalidate_mapping_pages() will not block on IO activity. It will not + * invalidate pages which are dirty, locked, under writeback or mapped into + * pagetables. + */ +unsigned long invalidate_mapping_pages(struct address_space *mapping, + pgoff_t start, pgoff_t end) { struct pagevec pvec; pgoff_t next = start; @@ -309,30 +322,10 @@ unlock: break; } pagevec_release(&pvec); - if (likely(!be_atomic)) - cond_resched(); + cond_resched(); } return ret; } - -/** - * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode - * @mapping: the address_space which holds the pages to invalidate - * @start: the offset 'from' which to invalidate - * @end: the offset 'to' which to invalidate (inclusive) - * - * This function only removes the unlocked pages, if you want to - * remove all the pages of one inode, you must call truncate_inode_pages. - * - * invalidate_mapping_pages() will not block on IO activity. It will not - * invalidate pages which are dirty, locked, under writeback or mapped into - * pagetables. 
- */ -unsigned long invalidate_mapping_pages(struct address_space *mapping, - pgoff_t start, pgoff_t end) -{ - return __invalidate_mapping_pages(mapping, start, end, false); -} EXPORT_SYMBOL(invalidate_mapping_pages); /* @@ -348,7 +341,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) if (page->mapping != mapping) return 0; - if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL)) + if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) return 0; spin_lock_irq(&mapping->tree_lock); @@ -356,9 +349,10 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) goto failed; clear_page_mlock(page); - BUG_ON(PagePrivate(page)); + BUG_ON(page_has_private(page)); __remove_from_page_cache(page); spin_unlock_irq(&mapping->tree_lock); + mem_cgroup_uncharge_cache_page(page); page_cache_release(page); /* pagecache ref */ return 1; failed: diff --git a/mm/util.c b/mm/util.c index cb00b748ce4..7c35ad95f92 100644 --- a/mm/util.c +++ b/mm/util.c @@ -6,6 +6,9 @@ #include <linux/sched.h> #include <asm/uaccess.h> +#define CREATE_TRACE_POINTS +#include <trace/events/kmem.h> + /** * kstrdup - allocate space for and copy an existing string * @s: the string to duplicate @@ -70,6 +73,36 @@ void *kmemdup(const void *src, size_t len, gfp_t gfp) EXPORT_SYMBOL(kmemdup); /** + * memdup_user - duplicate memory region from user space + * + * @src: source address in user space + * @len: number of bytes to copy + * + * Returns an ERR_PTR() on failure. + */ +void *memdup_user(const void __user *src, size_t len) +{ + void *p; + + /* + * Always use GFP_KERNEL, since copy_from_user() can sleep and + * cause pagefault, which makes it pointless to use GFP_NOFS + * or GFP_ATOMIC. + */ + p = kmalloc_track_caller(len, GFP_KERNEL); + if (!p) + return ERR_PTR(-ENOMEM); + + if (copy_from_user(p, src, len)) { + kfree(p); + return ERR_PTR(-EFAULT); + } + + return p; +} +EXPORT_SYMBOL(memdup_user); + +/** * __krealloc - like krealloc() but don't free @p. * @p: object to reallocate memory for. * @new_size: how many bytes of memory are required. @@ -129,6 +162,30 @@ void *krealloc(const void *p, size_t new_size, gfp_t flags) } EXPORT_SYMBOL(krealloc); +/** + * kzfree - like kfree but zero memory + * @p: object to free memory of + * + * The memory of the object @p points to is zeroed before freed. + * If @p is %NULL, kzfree() does nothing. + * + * Note: this function zeroes the whole allocated buffer which can be a good + * deal bigger than the requested buffer size passed to kmalloc(). So be + * careful when using this function in performance sensitive code. + */ +void kzfree(const void *p) +{ + size_t ks; + void *mem = (void *)p; + + if (unlikely(ZERO_OR_NULL_PTR(mem))) + return; + ks = ksize(mem); + memset(mem, 0, ks); + kfree(mem); +} +EXPORT_SYMBOL(kzfree); + /* * strndup_user - duplicate an existing string from user space * @s: The string to duplicate @@ -172,6 +229,30 @@ void arch_pick_mmap_layout(struct mm_struct *mm) } #endif +/** + * get_user_pages_fast() - pin user pages in memory + * @start: starting user address + * @nr_pages: number of pages from start to pin + * @write: whether pages will be written to + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_pages long. + * + * Returns number of pages pinned. This may be fewer than the number + * requested. If nr_pages is 0 or negative, returns 0. If no pages + * were pinned, returns -errno. 
+ * + * get_user_pages_fast provides equivalent functionality to get_user_pages, + * operating on current and current->mm, with force=0 and vma=NULL. However + * unlike get_user_pages, it must be called without mmap_sem held. + * + * get_user_pages_fast may take mmap_sem and page table locks, so no + * assumptions can be made about lack of locking. get_user_pages_fast is to be + * implemented in a way that is advantageous (vs get_user_pages()) when the + * user memory area is already faulted in and present in ptes. However if the + * pages have to be faulted in, it may turn out to be slightly slower so + * callers need to carefully consider what to use. On many architectures, + * get_user_pages_fast simply falls back to get_user_pages. + */ int __attribute__((weak)) get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages) { @@ -186,3 +267,11 @@ int __attribute__((weak)) get_user_pages_fast(unsigned long start, return ret; } EXPORT_SYMBOL_GPL(get_user_pages_fast); + +/* Tracepoints definitions. */ +EXPORT_TRACEPOINT_SYMBOL(kmalloc); +EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); +EXPORT_TRACEPOINT_SYMBOL(kmalloc_node); +EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc_node); +EXPORT_TRACEPOINT_SYMBOL(kfree); +EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 75f49d312e8..f8189a4b3e1 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -23,7 +23,8 @@ #include <linux/rbtree.h> #include <linux/radix-tree.h> #include <linux/rcupdate.h> -#include <linux/bootmem.h> +#include <linux/pfn.h> +#include <linux/kmemleak.h> #include <asm/atomic.h> #include <asm/uaccess.h> @@ -152,8 +153,8 @@ static int vmap_pud_range(pgd_t *pgd, unsigned long addr, * * Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N] */ -static int vmap_page_range(unsigned long start, unsigned long end, - pgprot_t prot, struct page **pages) +static int vmap_page_range_noflush(unsigned long start, unsigned long end, + pgprot_t prot, struct page **pages) { pgd_t *pgd; unsigned long next; @@ -169,13 +170,22 @@ static int vmap_page_range(unsigned long start, unsigned long end, if (err) break; } while (pgd++, addr = next, addr != end); - flush_cache_vmap(start, end); if (unlikely(err)) return err; return nr; } +static int vmap_page_range(unsigned long start, unsigned long end, + pgprot_t prot, struct page **pages) +{ + int ret; + + ret = vmap_page_range_noflush(start, end, prot, pages); + flush_cache_vmap(start, end); + return ret; +} + static inline int is_vmalloc_or_module_addr(const void *x) { /* @@ -323,6 +333,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, unsigned long addr; int purged = 0; + BUG_ON(!size); BUG_ON(size & ~PAGE_MASK); va = kmalloc_node(sizeof(struct vmap_area), @@ -334,6 +345,9 @@ retry: addr = ALIGN(vstart, align); spin_lock(&vmap_area_lock); + if (addr + size - 1 < addr) + goto overflow; + /* XXX: could have a last_hole cache */ n = vmap_area_root.rb_node; if (n) { @@ -365,6 +379,8 @@ retry: while (addr + size > first->va_start && addr + size <= vend) { addr = ALIGN(first->va_end + PAGE_SIZE, align); + if (addr + size - 1 < addr) + goto overflow; n = rb_next(&first->rb_node); if (n) @@ -375,6 +391,7 @@ retry: } found: if (addr + size > vend) { +overflow: spin_unlock(&vmap_area_lock); if (!purged) { purge_vmap_area_lazy(); @@ -385,6 +402,7 @@ found: printk(KERN_WARNING "vmap allocation for size %lu failed: " "use vmalloc=<size> to increase size.\n", size); + kfree(va); return ERR_PTR(-EBUSY); } @@ -498,6 +516,7 @@ static 
void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, static DEFINE_SPINLOCK(purge_lock); LIST_HEAD(valist); struct vmap_area *va; + struct vmap_area *n_va; int nr = 0; /* @@ -537,7 +556,7 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, if (nr) { spin_lock(&vmap_area_lock); - list_for_each_entry(va, &valist, purge_list) + list_for_each_entry_safe(va, n_va, &valist, purge_list) __free_vmap_area(va); spin_unlock(&vmap_area_lock); } @@ -653,10 +672,7 @@ struct vmap_block { DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS); DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS); union { - struct { - struct list_head free_list; - struct list_head dirty_list; - }; + struct list_head free_list; struct rcu_head rcu_head; }; }; @@ -723,7 +739,6 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask) bitmap_zero(vb->alloc_map, VMAP_BBMAP_BITS); bitmap_zero(vb->dirty_map, VMAP_BBMAP_BITS); INIT_LIST_HEAD(&vb->free_list); - INIT_LIST_HEAD(&vb->dirty_list); vb_idx = addr_to_vb_idx(va->va_start); spin_lock(&vmap_block_tree_lock); @@ -754,12 +769,7 @@ static void free_vmap_block(struct vmap_block *vb) struct vmap_block *tmp; unsigned long vb_idx; - spin_lock(&vb->vbq->lock); - if (!list_empty(&vb->free_list)) - list_del(&vb->free_list); - if (!list_empty(&vb->dirty_list)) - list_del(&vb->dirty_list); - spin_unlock(&vb->vbq->lock); + BUG_ON(!list_empty(&vb->free_list)); vb_idx = addr_to_vb_idx(vb->va->va_start); spin_lock(&vmap_block_tree_lock); @@ -844,11 +854,7 @@ static void vb_free(const void *addr, unsigned long size) spin_lock(&vb->lock); bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order); - if (!vb->dirty) { - spin_lock(&vb->vbq->lock); - list_add(&vb->dirty_list, &vb->vbq->dirty); - spin_unlock(&vb->vbq->lock); - } + vb->dirty += 1UL << order; if (vb->dirty == VMAP_BBMAP_BITS) { BUG_ON(vb->free || !list_empty(&vb->free_list)); @@ -982,6 +988,32 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t pro } EXPORT_SYMBOL(vm_map_ram); +/** + * vm_area_register_early - register vmap area early during boot + * @vm: vm_struct to register + * @align: requested alignment + * + * This function is used to register kernel vm area before + * vmalloc_init() is called. @vm->size and @vm->flags should contain + * proper values on entry and other fields should be zero. On return, + * vm->addr contains the allocated address. + * + * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING. + */ +void __init vm_area_register_early(struct vm_struct *vm, size_t align) +{ + static size_t vm_init_off __initdata; + unsigned long addr; + + addr = ALIGN(VMALLOC_START + vm_init_off, align); + vm_init_off = PFN_ALIGN(addr + vm->size) - VMALLOC_START; + + vm->addr = (void *)addr; + + vm->next = vmlist; + vmlist = vm; +} + void __init vmalloc_init(void) { struct vmap_area *va; @@ -1000,7 +1032,7 @@ void __init vmalloc_init(void) /* Import existing vmlist entries. 
*/ for (tmp = vmlist; tmp; tmp = tmp->next) { - va = alloc_bootmem(sizeof(struct vmap_area)); + va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT); va->flags = tmp->flags | VM_VM_AREA; va->va_start = (unsigned long)tmp->addr; va->va_end = va->va_start + tmp->size; @@ -1009,9 +1041,63 @@ void __init vmalloc_init(void) vmap_initialized = true; } +/** + * map_kernel_range_noflush - map kernel VM area with the specified pages + * @addr: start of the VM area to map + * @size: size of the VM area to map + * @prot: page protection flags to use + * @pages: pages to map + * + * Map PFN_UP(@size) pages at @addr. The VM area @addr and @size + * specify should have been allocated using get_vm_area() and its + * friends. + * + * NOTE: + * This function does NOT do any cache flushing. The caller is + * responsible for calling flush_cache_vmap() on to-be-mapped areas + * before calling this function. + * + * RETURNS: + * The number of pages mapped on success, -errno on failure. + */ +int map_kernel_range_noflush(unsigned long addr, unsigned long size, + pgprot_t prot, struct page **pages) +{ + return vmap_page_range_noflush(addr, addr + size, prot, pages); +} + +/** + * unmap_kernel_range_noflush - unmap kernel VM area + * @addr: start of the VM area to unmap + * @size: size of the VM area to unmap + * + * Unmap PFN_UP(@size) pages at @addr. The VM area @addr and @size + * specify should have been allocated using get_vm_area() and its + * friends. + * + * NOTE: + * This function does NOT do any cache flushing. The caller is + * responsible for calling flush_cache_vunmap() on to-be-mapped areas + * before calling this function and flush_tlb_kernel_range() after. + */ +void unmap_kernel_range_noflush(unsigned long addr, unsigned long size) +{ + vunmap_page_range(addr, addr + size); +} + +/** + * unmap_kernel_range - unmap kernel VM area and flush cache and TLB + * @addr: start of the VM area to unmap + * @size: size of the VM area to unmap + * + * Similar to unmap_kernel_range_noflush() but flushes vcache before + * the unmapping and tlb after. 
+ */ void unmap_kernel_range(unsigned long addr, unsigned long size) { unsigned long end = addr + size; + + flush_cache_vunmap(addr, end); vunmap_page_range(addr, end); flush_tlb_kernel_range(addr, end); } @@ -1106,6 +1192,14 @@ struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, } EXPORT_SYMBOL_GPL(__get_vm_area); +struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, + unsigned long start, unsigned long end, + void *caller) +{ + return __get_vm_area_node(size, flags, start, end, -1, GFP_KERNEL, + caller); +} + /** * get_vm_area - reserve a contiguous kernel virtual area * @size: size of the area @@ -1233,6 +1327,9 @@ static void __vunmap(const void *addr, int deallocate_pages) void vfree(const void *addr) { BUG_ON(in_interrupt()); + + kmemleak_free(addr); + __vunmap(addr, 1); } EXPORT_SYMBOL(vfree); @@ -1249,6 +1346,7 @@ EXPORT_SYMBOL(vfree); void vunmap(const void *addr) { BUG_ON(in_interrupt()); + might_sleep(); __vunmap(addr, 0); } EXPORT_SYMBOL(vunmap); @@ -1268,6 +1366,8 @@ void *vmap(struct page **pages, unsigned int count, { struct vm_struct *area; + might_sleep(); + if (count > num_physpages) return NULL; @@ -1342,8 +1442,17 @@ fail: void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) { - return __vmalloc_area_node(area, gfp_mask, prot, -1, - __builtin_return_address(0)); + void *addr = __vmalloc_area_node(area, gfp_mask, prot, -1, + __builtin_return_address(0)); + + /* + * A ref_count = 3 is needed because the vm_struct and vmap_area + * structures allocated in the __get_vm_area_node() function contain + * references to the virtual address of the vmalloc'ed block. + */ + kmemleak_alloc(addr, area->size - PAGE_SIZE, 3, gfp_mask); + + return addr; } /** @@ -1362,6 +1471,8 @@ static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, int node, void *caller) { struct vm_struct *area; + void *addr; + unsigned long real_size = size; size = PAGE_ALIGN(size); if (!size || (size >> PAGE_SHIFT) > num_physpages) @@ -1373,7 +1484,16 @@ static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, if (!area) return NULL; - return __vmalloc_area_node(area, gfp_mask, prot, node, caller); + addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller); + + /* + * A ref_count = 3 is needed because the vm_struct and vmap_area + * structures allocated in the __get_vm_area_node() function contain + * references to the virtual address of the vmalloc'ed block. + */ + kmemleak_alloc(addr, real_size, 3, gfp_mask); + + return addr; } void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) diff --git a/mm/vmscan.c b/mm/vmscan.c index 9a27c44aa32..ba8228e0a80 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -60,6 +60,9 @@ struct scan_control { int may_writepage; + /* Can mapped pages be reclaimed? */ + int may_unmap; + /* Can pages be swapped as part of reclaim? */ int may_swap; @@ -78,6 +81,12 @@ struct scan_control { /* Which cgroup do we reclaim from */ struct mem_cgroup *mem_cgroup; + /* + * Nodemask of nodes allowed by the caller. If NULL, all nodes + * are scanned. 
+ */ + nodemask_t *nodemask; + /* Pluggable isolate pages callback */ unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst, unsigned long *scanned, int order, int mode, @@ -214,8 +223,9 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, do_div(delta, lru_pages + 1); shrinker->nr += delta; if (shrinker->nr < 0) { - printk(KERN_ERR "%s: nr=%ld\n", - __func__, shrinker->nr); + printk(KERN_ERR "shrink_slab: %pF negative objects to " + "delete nr=%ld\n", + shrinker->shrink, shrinker->nr); shrinker->nr = max_pass; } @@ -276,7 +286,7 @@ static inline int page_mapping_inuse(struct page *page) static inline int is_page_cache_freeable(struct page *page) { - return page_count(page) - !!PagePrivate(page) == 2; + return page_count(page) - !!page_has_private(page) == 2; } static int may_write_to_queue(struct backing_dev_info *bdi) @@ -360,7 +370,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, * Some data journaling orphaned pages can have * page->mapping == NULL while being dirty with clean buffers. */ - if (PagePrivate(page)) { + if (page_has_private(page)) { if (try_to_free_buffers(page)) { ClearPageDirty(page); printk("%s: orphaned page\n", __func__); @@ -460,10 +470,11 @@ static int __remove_mapping(struct address_space *mapping, struct page *page) swp_entry_t swap = { .val = page_private(page) }; __delete_from_swap_cache(page); spin_unlock_irq(&mapping->tree_lock); - swap_free(swap); + swapcache_free(swap, page); } else { __remove_from_page_cache(page); spin_unlock_irq(&mapping->tree_lock); + mem_cgroup_uncharge_cache_page(page); } return 1; @@ -502,7 +513,6 @@ int remove_mapping(struct address_space *mapping, struct page *page) * * lru_lock must not be held, interrupts must be enabled. */ -#ifdef CONFIG_UNEVICTABLE_LRU void putback_lru_page(struct page *page) { int lru; @@ -556,20 +566,6 @@ redo: put_page(page); /* drop ref from isolate */ } -#else /* CONFIG_UNEVICTABLE_LRU */ - -void putback_lru_page(struct page *page) -{ - int lru; - VM_BUG_ON(PageLRU(page)); - - lru = !!TestClearPageActive(page) + page_is_file_cache(page); - lru_cache_add_lru(page, lru); - put_page(page); -} -#endif /* CONFIG_UNEVICTABLE_LRU */ - - /* * shrink_page_list() returns the number of reclaimed pages */ @@ -581,6 +577,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, struct pagevec freed_pvec; int pgactivate = 0; unsigned long nr_reclaimed = 0; + unsigned long vm_flags; cond_resched(); @@ -606,7 +603,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, if (unlikely(!page_evictable(page, NULL))) goto cull_mlocked; - if (!sc->may_swap && page_mapped(page)) + if (!sc->may_unmap && page_mapped(page)) goto keep_locked; /* Double the slab pressure for mapped and swapcache pages */ @@ -631,10 +628,16 @@ static unsigned long shrink_page_list(struct list_head *page_list, goto keep_locked; } - referenced = page_referenced(page, 1, sc->mem_cgroup); - /* In active use or really unfreeable? Activate it. */ + referenced = page_referenced(page, 1, + sc->mem_cgroup, &vm_flags); + /* + * In active use or really unfreeable? Activate it. 
+ * If page which have PG_mlocked lost isoltation race, + * try_to_unmap moves it to unevictable list + */ if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && - referenced && page_mapping_inuse(page)) + referenced && page_mapping_inuse(page) + && !(vm_flags & VM_LOCKED)) goto activate_locked; /* @@ -720,7 +723,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, * process address space (page_count == 1) it can be freed. * Otherwise, leave the page on the LRU so it is swappable. */ - if (PagePrivate(page)) { + if (page_has_private(page)) { if (!try_to_release_page(page, sc->gfp_mask)) goto activate_locked; if (!mapping && page_count(page) == 1) { @@ -839,7 +842,6 @@ int __isolate_lru_page(struct page *page, int mode, int file) */ ClearPageLRU(page); ret = 0; - mem_cgroup_del_lru(page); } return ret; @@ -887,12 +889,14 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, switch (__isolate_lru_page(page, mode, file)) { case 0: list_move(&page->lru, dst); + mem_cgroup_del_lru(page); nr_taken++; break; case -EBUSY: /* else it is being freed elsewhere */ list_move(&page->lru, src); + mem_cgroup_rotate_lru_list(page, page_lru(page)); continue; default: @@ -931,18 +935,11 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, /* Check that we have not crossed a zone boundary. */ if (unlikely(page_zone_id(cursor_page) != zone_id)) continue; - switch (__isolate_lru_page(cursor_page, mode, file)) { - case 0: + if (__isolate_lru_page(cursor_page, mode, file) == 0) { list_move(&cursor_page->lru, dst); + mem_cgroup_del_lru(cursor_page); nr_taken++; scan++; - break; - - case -EBUSY: - /* else it is being freed elsewhere */ - list_move(&cursor_page->lru, src); - default: - break; /* ! on LRU or wrong list */ } } } @@ -1049,6 +1046,19 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, unsigned long nr_scanned = 0; unsigned long nr_reclaimed = 0; struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); + int lumpy_reclaim = 0; + + /* + * If we need a large contiguous chunk of memory, or have + * trouble getting a small set of contiguous pages, we + * will reclaim both active and inactive pages. + * + * We use the same threshold as pageout congestion_wait below. + */ + if (sc->order > PAGE_ALLOC_COSTLY_ORDER) + lumpy_reclaim = 1; + else if (sc->order && priority < DEF_PRIORITY - 2) + lumpy_reclaim = 1; pagevec_init(&pvec, 1); @@ -1061,19 +1071,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, unsigned long nr_freed; unsigned long nr_active; unsigned int count[NR_LRU_LISTS] = { 0, }; - int mode = ISOLATE_INACTIVE; - - /* - * If we need a large contiguous chunk of memory, or have - * trouble getting a small set of contiguous pages, we - * will reclaim both active and inactive pages. - * - * We use the same threshold as pageout congestion_wait below. - */ - if (sc->order > PAGE_ALLOC_COSTLY_ORDER) - mode = ISOLATE_BOTH; - else if (sc->order && priority < DEF_PRIORITY - 2) - mode = ISOLATE_BOTH; + int mode = lumpy_reclaim ? 
ISOLATE_BOTH : ISOLATE_INACTIVE; nr_taken = sc->isolate_pages(sc->swap_cluster_max, &page_list, &nr_scan, sc->order, mode, @@ -1110,8 +1108,8 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, * but that should be acceptable to the caller */ if (nr_freed < nr_taken && !current_is_kswapd() && - sc->order > PAGE_ALLOC_COSTLY_ORDER) { - congestion_wait(WRITE, HZ/10); + lumpy_reclaim) { + congestion_wait(BLK_RW_ASYNC, HZ/10); /* * The attempt at page out may have made some @@ -1205,18 +1203,54 @@ static inline void note_zone_scanning_priority(struct zone *zone, int priority) * But we had to alter page->flags anyway. */ +static void move_active_pages_to_lru(struct zone *zone, + struct list_head *list, + enum lru_list lru) +{ + unsigned long pgmoved = 0; + struct pagevec pvec; + struct page *page; + + pagevec_init(&pvec, 1); + + while (!list_empty(list)) { + page = lru_to_page(list); + prefetchw_prev_lru_page(page, list, flags); + + VM_BUG_ON(PageLRU(page)); + SetPageLRU(page); + + VM_BUG_ON(!PageActive(page)); + if (!is_active_lru(lru)) + ClearPageActive(page); /* we are de-activating */ + + list_move(&page->lru, &zone->lru[lru].list); + mem_cgroup_add_lru_list(page, lru); + pgmoved++; + + if (!pagevec_add(&pvec, page) || list_empty(list)) { + spin_unlock_irq(&zone->lru_lock); + if (buffer_heads_over_limit) + pagevec_strip(&pvec); + __pagevec_release(&pvec); + spin_lock_irq(&zone->lru_lock); + } + } + __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); + if (!is_active_lru(lru)) + __count_vm_events(PGDEACTIVATE, pgmoved); +} static void shrink_active_list(unsigned long nr_pages, struct zone *zone, struct scan_control *sc, int priority, int file) { unsigned long pgmoved; - int pgdeactivate = 0; unsigned long pgscanned; + unsigned long vm_flags; LIST_HEAD(l_hold); /* The pages which were snipped off */ + LIST_HEAD(l_active); LIST_HEAD(l_inactive); struct page *page; - struct pagevec pvec; - enum lru_list lru; struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); lru_add_drain(); @@ -1233,13 +1267,14 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, } reclaim_stat->recent_scanned[!!file] += pgmoved; + __count_zone_vm_events(PGREFILL, zone, pgscanned); if (file) __mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved); else __mod_zone_page_state(zone, NR_ACTIVE_ANON, -pgmoved); spin_unlock_irq(&zone->lru_lock); - pgmoved = 0; + pgmoved = 0; /* count referenced (mapping) mapped pages */ while (!list_empty(&l_hold)) { cond_resched(); page = lru_to_page(&l_hold); @@ -1252,64 +1287,44 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, /* page_referenced clears PageReferenced */ if (page_mapping_inuse(page) && - page_referenced(page, 0, sc->mem_cgroup)) + page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) { pgmoved++; + /* + * Identify referenced, file-backed active pages and + * give them one more trip around the active list. So + * that executable code get better chances to stay in + * memory under moderate memory pressure. Anon pages + * are not likely to be evicted by use-once streaming + * IO, plus JVM can create lots of anon VM_EXEC pages, + * so we ignore them here. + */ + if ((vm_flags & VM_EXEC) && !PageAnon(page)) { + list_add(&page->lru, &l_active); + continue; + } + } list_add(&page->lru, &l_inactive); } /* - * Move the pages to the [file or anon] inactive list. + * Move pages back to the lru list. 
*/ - pagevec_init(&pvec, 1); - pgmoved = 0; - lru = LRU_BASE + file * LRU_FILE; - spin_lock_irq(&zone->lru_lock); /* - * Count referenced pages from currently used mappings as - * rotated, even though they are moved to the inactive list. - * This helps balance scan pressure between file and anonymous - * pages in get_scan_ratio. + * Count referenced pages from currently used mappings as rotated, + * even though only some of them are actually re-activated. This + * helps balance scan pressure between file and anonymous pages in + * get_scan_ratio. */ reclaim_stat->recent_rotated[!!file] += pgmoved; - while (!list_empty(&l_inactive)) { - page = lru_to_page(&l_inactive); - prefetchw_prev_lru_page(page, &l_inactive, flags); - VM_BUG_ON(PageLRU(page)); - SetPageLRU(page); - VM_BUG_ON(!PageActive(page)); - ClearPageActive(page); + move_active_pages_to_lru(zone, &l_active, + LRU_ACTIVE + file * LRU_FILE); + move_active_pages_to_lru(zone, &l_inactive, + LRU_BASE + file * LRU_FILE); - list_move(&page->lru, &zone->lru[lru].list); - mem_cgroup_add_lru_list(page, lru); - pgmoved++; - if (!pagevec_add(&pvec, page)) { - __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); - spin_unlock_irq(&zone->lru_lock); - pgdeactivate += pgmoved; - pgmoved = 0; - if (buffer_heads_over_limit) - pagevec_strip(&pvec); - __pagevec_release(&pvec); - spin_lock_irq(&zone->lru_lock); - } - } - __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); - pgdeactivate += pgmoved; - if (buffer_heads_over_limit) { - spin_unlock_irq(&zone->lru_lock); - pagevec_strip(&pvec); - spin_lock_irq(&zone->lru_lock); - } - __count_zone_vm_events(PGREFILL, zone, pgscanned); - __count_vm_events(PGDEACTIVATE, pgdeactivate); spin_unlock_irq(&zone->lru_lock); - if (vm_swap_full()) - pagevec_swap_free(&pvec); - - pagevec_release(&pvec); } static int inactive_anon_is_low_global(struct zone *zone) @@ -1344,12 +1359,48 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc) return low; } +static int inactive_file_is_low_global(struct zone *zone) +{ + unsigned long active, inactive; + + active = zone_page_state(zone, NR_ACTIVE_FILE); + inactive = zone_page_state(zone, NR_INACTIVE_FILE); + + return (active > inactive); +} + +/** + * inactive_file_is_low - check if file pages need to be deactivated + * @zone: zone to check + * @sc: scan control of this context + * + * When the system is doing streaming IO, memory pressure here + * ensures that active file pages get deactivated, until more + * than half of the file pages are on the inactive list. + * + * Once we get to that situation, protect the system's working + * set from being evicted by disabling active file page aging. + * + * This uses a different ratio than the anonymous pages, because + * the page cache uses a use-once replacement algorithm. 
+ */ +static int inactive_file_is_low(struct zone *zone, struct scan_control *sc) +{ + int low; + + if (scanning_global_lru(sc)) + low = inactive_file_is_low_global(zone); + else + low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup); + return low; +} + static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, struct zone *zone, struct scan_control *sc, int priority) { int file = is_file_lru(lru); - if (lru == LRU_ACTIVE_FILE) { + if (lru == LRU_ACTIVE_FILE && inactive_file_is_low(zone, sc)) { shrink_active_list(nr_to_scan, zone, sc, priority, file); return 0; } @@ -1378,13 +1429,6 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, unsigned long ap, fp; struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); - /* If we have no swap space, do not bother scanning anon pages. */ - if (nr_swap_pages <= 0) { - percent[0] = 0; - percent[1] = 100; - return; - } - anon = zone_nr_pages(zone, sc, LRU_ACTIVE_ANON) + zone_nr_pages(zone, sc, LRU_INACTIVE_ANON); file = zone_nr_pages(zone, sc, LRU_ACTIVE_FILE) + @@ -1394,7 +1438,7 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, free = zone_page_state(zone, NR_FREE_PAGES); /* If we have very few page cache pages, force-scan anon pages. */ - if (unlikely(file + free <= zone->pages_high)) { + if (unlikely(file + free <= high_wmark_pages(zone))) { percent[0] = 100; percent[1] = 0; return; @@ -1449,6 +1493,26 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, percent[1] = 100 - percent[0]; } +/* + * Smallish @nr_to_scan's are deposited in @nr_saved_scan, + * until we collected @swap_cluster_max pages to scan. + */ +static unsigned long nr_scan_try_batch(unsigned long nr_to_scan, + unsigned long *nr_saved_scan, + unsigned long swap_cluster_max) +{ + unsigned long nr; + + *nr_saved_scan += nr_to_scan; + nr = *nr_saved_scan; + + if (nr >= swap_cluster_max) + *nr_saved_scan = 0; + else + nr = 0; + + return nr; +} /* * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. @@ -1462,26 +1526,30 @@ static void shrink_zone(int priority, struct zone *zone, enum lru_list l; unsigned long nr_reclaimed = sc->nr_reclaimed; unsigned long swap_cluster_max = sc->swap_cluster_max; + int noswap = 0; - get_scan_ratio(zone, sc, percent); + /* If we have no swap space, do not bother scanning anon pages. */ + if (!sc->may_swap || (nr_swap_pages <= 0)) { + noswap = 1; + percent[0] = 0; + percent[1] = 100; + } else + get_scan_ratio(zone, sc, percent); for_each_evictable_lru(l) { int file = is_file_lru(l); - int scan; + unsigned long scan; - scan = zone_page_state(zone, NR_LRU_BASE + l); - if (priority) { + scan = zone_nr_pages(zone, sc, l); + if (priority || noswap) { scan >>= priority; scan = (scan * percent[file]) / 100; } - if (scanning_global_lru(sc)) { - zone->lru[l].nr_scan += scan; - nr[l] = zone->lru[l].nr_scan; - if (nr[l] >= swap_cluster_max) - zone->lru[l].nr_scan = 0; - else - nr[l] = 0; - } else + if (scanning_global_lru(sc)) + nr[l] = nr_scan_try_batch(scan, + &zone->lru[l].nr_saved_scan, + swap_cluster_max); + else nr[l] = scan; } @@ -1515,7 +1583,7 @@ static void shrink_zone(int priority, struct zone *zone, * Even if we did not try to evict anon pages at all, we want to * rebalance the anon lru active/inactive ratio. 
*/ - if (inactive_anon_is_low(zone, sc)) + if (inactive_anon_is_low(zone, sc) && nr_swap_pages > 0) shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); throttle_vm_writeout(sc->gfp_mask); @@ -1526,11 +1594,13 @@ static void shrink_zone(int priority, struct zone *zone, * try to reclaim pages from zones which will satisfy the caller's allocation * request. * - * We reclaim from a zone even if that zone is over pages_high. Because: + * We reclaim from a zone even if that zone is over high_wmark_pages(zone). + * Because: * a) The caller may be trying to free *extra* pages to satisfy a higher-order * allocation or - * b) The zones may be over pages_high but they must go *over* pages_high to - * satisfy the `incremental min' zone defense algorithm. + * b) The target zone may be at high_wmark_pages(zone) but the lower zones + * must go *over* high_wmark_pages(zone) to satisfy the `incremental min' + * zone defense algorithm. * * If a zone is deemed to be full of pinned pages then just give it a light * scan then give up on it. @@ -1543,7 +1613,8 @@ static void shrink_zones(int priority, struct zonelist *zonelist, struct zone *zone; sc->all_unreclaimable = 1; - for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { + for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, + sc->nodemask) { if (!populated_zone(zone)) continue; /* @@ -1649,13 +1720,13 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, */ if (total_scanned > sc->swap_cluster_max + sc->swap_cluster_max / 2) { - wakeup_pdflush(laptop_mode ? 0 : total_scanned); + wakeup_flusher_threads(laptop_mode ? 0 : total_scanned); sc->may_writepage = 1; } /* Take a nap, wait for some writeback to complete */ if (sc->nr_scanned && priority < DEF_PRIORITY - 2) - congestion_wait(WRITE, HZ/10); + congestion_wait(BLK_RW_ASYNC, HZ/10); } /* top priority shrink_zones still had more to do? don't OOM, then */ if (!sc->all_unreclaimable && scanning_global_lru(sc)) @@ -1688,17 +1759,19 @@ out: } unsigned long try_to_free_pages(struct zonelist *zonelist, int order, - gfp_t gfp_mask) + gfp_t gfp_mask, nodemask_t *nodemask) { struct scan_control sc = { .gfp_mask = gfp_mask, .may_writepage = !laptop_mode, .swap_cluster_max = SWAP_CLUSTER_MAX, + .may_unmap = 1, .may_swap = 1, .swappiness = vm_swappiness, .order = order, .mem_cgroup = NULL, .isolate_pages = isolate_pages_global, + .nodemask = nodemask, }; return do_try_to_free_pages(zonelist, &sc); @@ -1713,18 +1786,17 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, { struct scan_control sc = { .may_writepage = !laptop_mode, - .may_swap = 1, + .may_unmap = 1, + .may_swap = !noswap, .swap_cluster_max = SWAP_CLUSTER_MAX, .swappiness = swappiness, .order = 0, .mem_cgroup = mem_cont, .isolate_pages = mem_cgroup_isolate_pages, + .nodemask = NULL, /* we don't care the placement */ }; struct zonelist *zonelist; - if (noswap) - sc.may_swap = 0; - sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); zonelist = NODE_DATA(numa_node_id())->node_zonelists; @@ -1734,7 +1806,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, /* * For kswapd, balance_pgdat() will work across all this node's zones until - * they are all at pages_high. + * they are all at high_wmark_pages(zone). * * Returns the number of pages which were actually freed. * @@ -1747,11 +1819,11 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, * the zone for when the problem goes away. 
* * kswapd scans the zones in the highmem->normal->dma direction. It skips - * zones which have free_pages > pages_high, but once a zone is found to have - * free_pages <= pages_high, we scan that zone and the lower zones regardless - * of the number of free pages in the lower zones. This interoperates with - * the page allocator fallback scheme to ensure that aging of pages is balanced - * across the zones. + * zones which have free_pages > high_wmark_pages(zone), but once a zone is + * found to have free_pages <= high_wmark_pages(zone), we scan that zone and the + * lower zones regardless of the number of free pages in the lower zones. This + * interoperates with the page allocator fallback scheme to ensure that aging + * of pages is balanced across the zones. */ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) { @@ -1762,6 +1834,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) struct reclaim_state *reclaim_state = current->reclaim_state; struct scan_control sc = { .gfp_mask = GFP_KERNEL, + .may_unmap = 1, .may_swap = 1, .swap_cluster_max = SWAP_CLUSTER_MAX, .swappiness = vm_swappiness, @@ -1771,7 +1844,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) }; /* * temp_priority is used to remember the scanning priority at which - * this zone was successfully refilled to free_pages == pages_high. + * this zone was successfully refilled to + * free_pages == high_wmark_pages(zone). */ int temp_priority[MAX_NR_ZONES]; @@ -1816,8 +1890,8 @@ loop_again: shrink_active_list(SWAP_CLUSTER_MAX, zone, &sc, priority, 0); - if (!zone_watermark_ok(zone, order, zone->pages_high, - 0, 0)) { + if (!zone_watermark_ok(zone, order, + high_wmark_pages(zone), 0, 0)) { end_zone = i; break; } @@ -1851,8 +1925,8 @@ loop_again: priority != DEF_PRIORITY) continue; - if (!zone_watermark_ok(zone, order, zone->pages_high, - end_zone, 0)) + if (!zone_watermark_ok(zone, order, + high_wmark_pages(zone), end_zone, 0)) all_zones_ok = 0; temp_priority[i] = priority; sc.nr_scanned = 0; @@ -1861,8 +1935,8 @@ loop_again: * We put equal pressure on every zone, unless one * zone has way too many pages free already. */ - if (!zone_watermark_ok(zone, order, 8*zone->pages_high, - end_zone, 0)) + if (!zone_watermark_ok(zone, order, + 8*high_wmark_pages(zone), end_zone, 0)) shrink_zone(priority, zone, &sc); reclaim_state->reclaimed_slab = 0; nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, @@ -1891,7 +1965,7 @@ loop_again: * another pass across the zones. */ if (total_scanned && priority < DEF_PRIORITY - 2) - congestion_wait(WRITE, HZ/10); + congestion_wait(BLK_RW_ASYNC, HZ/10); /* * We do this so kswapd doesn't build up large priorities for @@ -1963,7 +2037,9 @@ static int kswapd(void *p) struct reclaim_state reclaim_state = { .reclaimed_slab = 0, }; - node_to_cpumask_ptr(cpumask, pgdat->node_id); + const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); + + lockdep_set_current_reclaim_state(GFP_KERNEL); if (!cpumask_empty(cpumask)) set_cpus_allowed_ptr(tsk, cpumask); @@ -2026,7 +2102,7 @@ void wakeup_kswapd(struct zone *zone, int order) return; pgdat = zone->zone_pgdat; - if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0)) + if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0)) return; if (pgdat->kswapd_max_order < order) pgdat->kswapd_max_order = order; @@ -2045,52 +2121,50 @@ unsigned long global_lru_pages(void) + global_page_state(NR_INACTIVE_FILE); } -#ifdef CONFIG_PM +#ifdef CONFIG_HIBERNATION /* * Helper function for shrink_all_memory(). 
Tries to reclaim 'nr_pages' pages - * from LRU lists system-wide, for given pass and priority, and returns the - * number of reclaimed pages + * from LRU lists system-wide, for given pass and priority. * * For pass > 3 we also try to shrink the LRU lists that contain a few pages */ -static unsigned long shrink_all_zones(unsigned long nr_pages, int prio, +static void shrink_all_zones(unsigned long nr_pages, int prio, int pass, struct scan_control *sc) { struct zone *zone; - unsigned long nr_to_scan, ret = 0; - enum lru_list l; - - for_each_zone(zone) { + unsigned long nr_reclaimed = 0; - if (!populated_zone(zone)) - continue; + for_each_populated_zone(zone) { + enum lru_list l; if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY) continue; for_each_evictable_lru(l) { + enum zone_stat_item ls = NR_LRU_BASE + l; + unsigned long lru_pages = zone_page_state(zone, ls); + /* For pass = 0, we don't shrink the active list */ - if (pass == 0 && - (l == LRU_ACTIVE || l == LRU_ACTIVE_FILE)) + if (pass == 0 && (l == LRU_ACTIVE_ANON || + l == LRU_ACTIVE_FILE)) continue; - zone->lru[l].nr_scan += - (zone_page_state(zone, NR_LRU_BASE + l) - >> prio) + 1; - if (zone->lru[l].nr_scan >= nr_pages || pass > 3) { - zone->lru[l].nr_scan = 0; - nr_to_scan = min(nr_pages, - zone_page_state(zone, - NR_LRU_BASE + l)); - ret += shrink_list(l, nr_to_scan, zone, + zone->lru[l].nr_saved_scan += (lru_pages >> prio) + 1; + if (zone->lru[l].nr_saved_scan >= nr_pages || pass > 3) { + unsigned long nr_to_scan; + + zone->lru[l].nr_saved_scan = 0; + nr_to_scan = min(nr_pages, lru_pages); + nr_reclaimed += shrink_list(l, nr_to_scan, zone, sc, prio); - if (ret >= nr_pages) - return ret; + if (nr_reclaimed >= nr_pages) { + sc->nr_reclaimed += nr_reclaimed; + return; + } } } } - - return ret; + sc->nr_reclaimed += nr_reclaimed; } /* @@ -2104,16 +2178,14 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio, unsigned long shrink_all_memory(unsigned long nr_pages) { unsigned long lru_pages, nr_slab; - unsigned long ret = 0; int pass; struct reclaim_state reclaim_state; struct scan_control sc = { .gfp_mask = GFP_KERNEL, - .may_swap = 0, - .swap_cluster_max = nr_pages, + .may_unmap = 0, .may_writepage = 1, - .swappiness = vm_swappiness, .isolate_pages = isolate_pages_global, + .nr_reclaimed = 0, }; current->reclaim_state = &reclaim_state; @@ -2127,8 +2199,8 @@ unsigned long shrink_all_memory(unsigned long nr_pages) if (!reclaim_state.reclaimed_slab) break; - ret += reclaim_state.reclaimed_slab; - if (ret >= nr_pages) + sc.nr_reclaimed += reclaim_state.reclaimed_slab; + if (sc.nr_reclaimed >= nr_pages) goto out; nr_slab -= reclaim_state.reclaimed_slab; @@ -2146,49 +2218,50 @@ unsigned long shrink_all_memory(unsigned long nr_pages) int prio; /* Force reclaiming mapped pages in the passes #3 and #4 */ - if (pass > 2) { - sc.may_swap = 1; - sc.swappiness = 100; - } + if (pass > 2) + sc.may_unmap = 1; for (prio = DEF_PRIORITY; prio >= 0; prio--) { - unsigned long nr_to_scan = nr_pages - ret; + unsigned long nr_to_scan = nr_pages - sc.nr_reclaimed; sc.nr_scanned = 0; - ret += shrink_all_zones(nr_to_scan, prio, pass, &sc); - if (ret >= nr_pages) + sc.swap_cluster_max = nr_to_scan; + shrink_all_zones(nr_to_scan, prio, pass, &sc); + if (sc.nr_reclaimed >= nr_pages) goto out; reclaim_state.reclaimed_slab = 0; shrink_slab(sc.nr_scanned, sc.gfp_mask, global_lru_pages()); - ret += reclaim_state.reclaimed_slab; - if (ret >= nr_pages) + sc.nr_reclaimed += reclaim_state.reclaimed_slab; + if (sc.nr_reclaimed >= nr_pages) 
goto out; if (sc.nr_scanned && prio < DEF_PRIORITY - 2) - congestion_wait(WRITE, HZ / 10); + congestion_wait(BLK_RW_ASYNC, HZ / 10); } } /* - * If ret = 0, we could not shrink LRUs, but there may be something - * in slab caches + * If sc.nr_reclaimed = 0, we could not shrink LRUs, but there may be + * something in slab caches */ - if (!ret) { + if (!sc.nr_reclaimed) { do { reclaim_state.reclaimed_slab = 0; shrink_slab(nr_pages, sc.gfp_mask, global_lru_pages()); - ret += reclaim_state.reclaimed_slab; - } while (ret < nr_pages && reclaim_state.reclaimed_slab > 0); + sc.nr_reclaimed += reclaim_state.reclaimed_slab; + } while (sc.nr_reclaimed < nr_pages && + reclaim_state.reclaimed_slab > 0); } + out: current->reclaim_state = NULL; - return ret; + return sc.nr_reclaimed; } -#endif +#endif /* CONFIG_HIBERNATION */ /* It's optimal to keep kswapds on the same CPUs as their memory, but not required for correctness. So if the last cpu in a node goes @@ -2202,7 +2275,9 @@ static int __devinit cpu_callback(struct notifier_block *nfb, if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) { for_each_node_state(nid, N_HIGH_MEMORY) { pg_data_t *pgdat = NODE_DATA(nid); - node_to_cpumask_ptr(mask, pgdat->node_id); + const struct cpumask *mask; + + mask = cpumask_of_node(pgdat->node_id); if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids) /* One of our CPUs online: restore mask */ @@ -2280,6 +2355,48 @@ int sysctl_min_unmapped_ratio = 1; */ int sysctl_min_slab_ratio = 5; +static inline unsigned long zone_unmapped_file_pages(struct zone *zone) +{ + unsigned long file_mapped = zone_page_state(zone, NR_FILE_MAPPED); + unsigned long file_lru = zone_page_state(zone, NR_INACTIVE_FILE) + + zone_page_state(zone, NR_ACTIVE_FILE); + + /* + * It's possible for there to be more file mapped pages than + * accounted for by the pages on the file LRU lists because + * tmpfs pages accounted for as ANON can also be FILE_MAPPED + */ + return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0; +} + +/* Work out how many page cache pages we can reclaim in this reclaim_mode */ +static long zone_pagecache_reclaimable(struct zone *zone) +{ + long nr_pagecache_reclaimable; + long delta = 0; + + /* + * If RECLAIM_SWAP is set, then all file pages are considered + * potentially reclaimable. Otherwise, we have to worry about + * pages like swapcache and zone_unmapped_file_pages() provides + * a better estimate + */ + if (zone_reclaim_mode & RECLAIM_SWAP) + nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES); + else + nr_pagecache_reclaimable = zone_unmapped_file_pages(zone); + + /* If we can't clean pages, remove dirty pages from consideration */ + if (!(zone_reclaim_mode & RECLAIM_WRITE)) + delta += zone_page_state(zone, NR_FILE_DIRTY); + + /* Watch for any possible underflows due to delta */ + if (unlikely(delta > nr_pagecache_reclaimable)) + delta = nr_pagecache_reclaimable; + + return nr_pagecache_reclaimable - delta; +} + /* * Try to free up some pages from this zone through reclaim. 
*/ @@ -2292,11 +2409,13 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) int priority; struct scan_control sc = { .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), - .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP), + .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), + .may_swap = 1, .swap_cluster_max = max_t(unsigned long, nr_pages, SWAP_CLUSTER_MAX), .gfp_mask = gfp_mask, .swappiness = vm_swappiness, + .order = order, .isolate_pages = isolate_pages_global, }; unsigned long slab_reclaimable; @@ -2312,9 +2431,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; - if (zone_page_state(zone, NR_FILE_PAGES) - - zone_page_state(zone, NR_FILE_MAPPED) > - zone->min_unmapped_pages) { + if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) { /* * Free memory by calling shrink zone with increasing * priorities until we have enough memory freed. @@ -2372,20 +2489,18 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) * if less than a specified percentage of the zone is used by * unmapped file backed pages. */ - if (zone_page_state(zone, NR_FILE_PAGES) - - zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_pages - && zone_page_state(zone, NR_SLAB_RECLAIMABLE) - <= zone->min_slab_pages) - return 0; + if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages && + zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages) + return ZONE_RECLAIM_FULL; if (zone_is_all_unreclaimable(zone)) - return 0; + return ZONE_RECLAIM_FULL; /* * Do not scan if the allocation should not be delayed. */ if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC)) - return 0; + return ZONE_RECLAIM_NOSCAN; /* * Only run zone reclaim on the local zone or on zones that do not @@ -2395,18 +2510,21 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) */ node_id = zone_to_nid(zone); if (node_state(node_id, N_CPU) && node_id != numa_node_id()) - return 0; + return ZONE_RECLAIM_NOSCAN; if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED)) - return 0; + return ZONE_RECLAIM_NOSCAN; + ret = __zone_reclaim(zone, gfp_mask, order); zone_clear_flag(zone, ZONE_RECLAIM_LOCKED); + if (!ret) + count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED); + return ret; } #endif -#ifdef CONFIG_UNEVICTABLE_LRU /* * page_evictable - test whether a page is evictable * @page: the page to test @@ -2653,4 +2771,3 @@ void scan_unevictable_unregister_node(struct node *node) sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages); } -#endif diff --git a/mm/vmstat.c b/mm/vmstat.c index 91149746bb8..138bed53706 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -27,7 +27,7 @@ static void sum_vm_events(unsigned long *ret, const struct cpumask *cpumask) memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long)); - for_each_cpu_mask_nr(cpu, *cpumask) { + for_each_cpu(cpu, cpumask) { struct vm_event_state *this = &per_cpu(vm_event_states, cpu); for (i = 0; i < NR_VM_EVENT_ITEMS; i++) @@ -135,11 +135,7 @@ static void refresh_zone_stat_thresholds(void) int cpu; int threshold; - for_each_zone(zone) { - - if (!zone->present_pages) - continue; - + for_each_populated_zone(zone) { threshold = calculate_threshold(zone); for_each_online_cpu(cpu) @@ -301,12 +297,9 @@ void refresh_cpu_vm_stats(int cpu) int i; int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; - for_each_zone(zone) { + for_each_populated_zone(zone) { struct per_cpu_pageset *p; - if 
(!populated_zone(zone)) - continue; - p = zone_pcp(zone, cpu); for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) @@ -516,22 +509,11 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m, continue; page = pfn_to_page(pfn); -#ifdef CONFIG_ARCH_FLATMEM_HAS_HOLES - /* - * Ordinarily, memory holes in flatmem still have a valid - * memmap for the PFN range. However, an architecture for - * embedded systems (e.g. ARM) can free up the memmap backing - * holes to save memory on the assumption the memmap is - * never used. The page_zone linkages are then broken even - * though pfn_valid() returns true. Skip the page if the - * linkages are broken. Even if this test passed, the impact - * is that the counters for the movable type are off but - * fragmentation monitoring is likely meaningless on small - * systems. - */ - if (page_zone(page) != zone) + + /* Watch for unexpected holes punched in the memmap */ + if (!memmap_valid_within(pfn, page, zone)) continue; -#endif + mtype = get_pageblock_migratetype(page); if (mtype < MIGRATE_TYPES) @@ -647,10 +629,8 @@ static const char * const vmstat_text[] = { "nr_active_anon", "nr_inactive_file", "nr_active_file", -#ifdef CONFIG_UNEVICTABLE_LRU "nr_unevictable", "nr_mlock", -#endif "nr_anon_pages", "nr_mapped", "nr_file_pages", @@ -693,6 +673,9 @@ static const char * const vmstat_text[] = { TEXTS_FOR_ZONES("pgscan_kswapd") TEXTS_FOR_ZONES("pgscan_direct") +#ifdef CONFIG_NUMA + "zone_reclaim_failed", +#endif "pginodesteal", "slabs_scanned", "kswapd_steal", @@ -705,7 +688,6 @@ static const char * const vmstat_text[] = { "htlb_buddy_alloc_success", "htlb_buddy_alloc_fail", #endif -#ifdef CONFIG_UNEVICTABLE_LRU "unevictable_pgs_culled", "unevictable_pgs_scanned", "unevictable_pgs_rescued", @@ -715,7 +697,6 @@ static const char * const vmstat_text[] = { "unevictable_pgs_stranded", "unevictable_pgs_mlockfreed", #endif -#endif }; static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, @@ -728,18 +709,14 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, "\n min %lu" "\n low %lu" "\n high %lu" - "\n scanned %lu (aa: %lu ia: %lu af: %lu if: %lu)" + "\n scanned %lu" "\n spanned %lu" "\n present %lu", zone_page_state(zone, NR_FREE_PAGES), - zone->pages_min, - zone->pages_low, - zone->pages_high, + min_wmark_pages(zone), + low_wmark_pages(zone), + high_wmark_pages(zone), zone->pages_scanned, - zone->lru[LRU_ACTIVE_ANON].nr_scan, - zone->lru[LRU_INACTIVE_ANON].nr_scan, - zone->lru[LRU_ACTIVE_FILE].nr_scan, - zone->lru[LRU_INACTIVE_FILE].nr_scan, zone->spanned_pages, zone->present_pages); @@ -898,7 +875,7 @@ static void vmstat_update(struct work_struct *w) { refresh_cpu_vm_stats(smp_processor_id()); schedule_delayed_work(&__get_cpu_var(vmstat_work), - sysctl_stat_interval); + round_jiffies_relative(sysctl_stat_interval)); } static void __cpuinit start_cpu_timer(int cpu) @@ -906,7 +883,8 @@ static void __cpuinit start_cpu_timer(int cpu) struct delayed_work *vmstat_work = &per_cpu(vmstat_work, cpu); INIT_DELAYED_WORK_DEFERRABLE(vmstat_work, vmstat_update); - schedule_delayed_work_on(cpu, vmstat_work, HZ + cpu); + schedule_delayed_work_on(cpu, vmstat_work, + __round_jiffies_relative(HZ, cpu)); } /* |
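
Side note on the batching helper introduced above (not part of the patch): the vmscan.c hunk replaces the open-coded per-LRU scan bookkeeping with nr_scan_try_batch(), which deposits small scan requests into zone->lru[l].nr_saved_scan and only releases them once at least swap_cluster_max pages have accumulated. The following is a minimal, self-contained userspace sketch of that logic for illustration only; the request sizes and the stand-in value for SWAP_CLUSTER_MAX are invented for the example.

#include <stdio.h>

/*
 * Userspace model of nr_scan_try_batch() from the vmscan.c hunk above:
 * small scan requests accumulate in *nr_saved_scan and are only handed
 * back once they add up to at least swap_cluster_max, so the LRU lists
 * get scanned in reasonably sized batches rather than tiny increments.
 */
static unsigned long nr_scan_try_batch(unsigned long nr_to_scan,
                                       unsigned long *nr_saved_scan,
                                       unsigned long swap_cluster_max)
{
        unsigned long nr;

        *nr_saved_scan += nr_to_scan;
        nr = *nr_saved_scan;

        if (nr >= swap_cluster_max)
                *nr_saved_scan = 0;     /* batch released, reset the deposit */
        else
                nr = 0;                 /* keep saving, scan nothing yet */

        return nr;
}

int main(void)
{
        unsigned long saved = 0;
        unsigned long cluster = 32;     /* stands in for SWAP_CLUSTER_MAX */
        unsigned long requests[] = { 5, 7, 10, 15, 3 };

        for (unsigned int i = 0; i < sizeof(requests) / sizeof(requests[0]); i++) {
                unsigned long nr = nr_scan_try_batch(requests[i], &saved, cluster);
                printf("request %lu -> scan %lu (saved %lu)\n",
                       requests[i], nr, saved);
        }
        return 0;
}

With this scheme the saved counter only ever holds a remainder smaller than swap_cluster_max, so shrink_zone() ends up working on batches of at least that size whenever it scans at all.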

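A second side note, on the overflow guard added to alloc_vmap_area() in the vmalloc.c hunk (again not part of the patch): the new "if (addr + size - 1 < addr) goto overflow;" test relies on unsigned wraparound to detect a candidate range that would run past the top of the address space. Below is a minimal standalone sketch of the same idea; the helper name range_wraps() and the sample addresses are purely illustrative, and, as in the patch (which adds BUG_ON(!size) in the same function), size is assumed to be non-zero.

#include <stdio.h>
#include <limits.h>

/*
 * Demonstration of the wraparound check from the vmalloc.c hunk above:
 * with unsigned arithmetic, "addr + size - 1 < addr" is true exactly
 * when the range [addr, addr + size) would wrap past the top of the
 * address space, so the allocator can bail out instead of returning a
 * bogus area. Assumes size >= 1, as guaranteed by BUG_ON(!size).
 */
static int range_wraps(unsigned long addr, unsigned long size)
{
        return addr + size - 1 < addr;
}

int main(void)
{
        unsigned long top = ULONG_MAX - 4096 + 1;   /* start of the last 4K page */

        printf("4K at 0x1000      wraps? %d\n", range_wraps(0x1000, 4096));
        printf("4K at last page   wraps? %d\n", range_wraps(top, 4096));
        printf("8K at last page   wraps? %d\n", range_wraps(top, 8192));
        return 0;
}

The first two cases fit and report 0; the third would cross ULONG_MAX and reports 1, which corresponds to the "goto overflow" path that retries after purging lazily freed areas.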