Diffstat (limited to 'mm')
| -rw-r--r-- | mm/Kconfig          |   4 |
| -rw-r--r-- | mm/bootmem.c        |   2 |
| -rw-r--r-- | mm/fadvise.c        |  46 |
| -rw-r--r-- | mm/filemap.c        |  41 |
| -rw-r--r-- | mm/memory.c         |   8 |
| -rw-r--r-- | mm/mempolicy.c      |  32 |
| -rw-r--r-- | mm/mmap.c           |   6 |
| -rw-r--r-- | mm/msync.c          | 139 |
| -rw-r--r-- | mm/page-writeback.c |  64 |
| -rw-r--r-- | mm/page_alloc.c     |  11 |
| -rw-r--r-- | mm/slab.c           | 351 |
| -rw-r--r-- | mm/slob.c           |  10 |
| -rw-r--r-- | mm/util.c           |  47 |
| -rw-r--r-- | mm/vmscan.c         |   2 |
14 files changed, 595 insertions, 168 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index bd80460360d..332f5c29b53 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -138,8 +138,8 @@ config SPLIT_PTLOCK_CPUS # config MIGRATION bool "Page migration" - def_bool y if NUMA || SPARSEMEM || DISCONTIGMEM - depends on SWAP + def_bool y if NUMA + depends on SWAP && NUMA help Allows the migration of the physical location of pages of processes while the virtual addresses are not changed. This is useful for diff --git a/mm/bootmem.c b/mm/bootmem.c index 35c32290f71..b55bd39fc5d 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -152,7 +152,7 @@ static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr, * * NOTE: This function is _not_ reentrant. */ -static void * __init +void * __init __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size, unsigned long align, unsigned long goal, unsigned long limit) { diff --git a/mm/fadvise.c b/mm/fadvise.c index d257c89e770..907c39257ca 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -15,6 +15,7 @@ #include <linux/backing-dev.h> #include <linux/pagevec.h> #include <linux/fadvise.h> +#include <linux/writeback.h> #include <linux/syscalls.h> #include <asm/unistd.h> @@ -22,13 +23,36 @@ /* * POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could * deactivate the pages and clear PG_Referenced. + * + * LINUX_FADV_ASYNC_WRITE: start async writeout of any dirty pages between file + * offsets `offset' and `offset+len' inclusive. Any pages which are currently + * under writeout are skipped, whether or not they are dirty. + * + * LINUX_FADV_WRITE_WAIT: wait upon writeout of any dirty pages between file + * offsets `offset' and `offset+len'. + * + * By combining these two operations the application may do several things: + * + * LINUX_FADV_ASYNC_WRITE: push some or all of the dirty pages at the disk. + * + * LINUX_FADV_WRITE_WAIT, LINUX_FADV_ASYNC_WRITE: push all of the currently + * dirty pages at the disk. + * + * LINUX_FADV_WRITE_WAIT, LINUX_FADV_ASYNC_WRITE, LINUX_FADV_WRITE_WAIT: push + * all of the currently dirty pages at the disk, wait until they have been + * written. + * + * It should be noted that none of these operations write out the file's + * metadata. So unless the application is strictly performing overwrites of + * already-instantiated disk blocks, there are no guarantees here that the data + * will be available after a crash. */ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) { struct file *file = fget(fd); struct address_space *mapping; struct backing_dev_info *bdi; - loff_t endbyte; + loff_t endbyte; /* inclusive */ pgoff_t start_index; pgoff_t end_index; unsigned long nrpages; @@ -56,6 +80,8 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) endbyte = offset + len; if (!len || endbyte < len) endbyte = -1; + else + endbyte--; /* inclusive */ bdi = mapping->backing_dev_info; @@ -78,7 +104,7 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) /* First and last PARTIAL page! */ start_index = offset >> PAGE_CACHE_SHIFT; - end_index = (endbyte-1) >> PAGE_CACHE_SHIFT; + end_index = endbyte >> PAGE_CACHE_SHIFT; /* Careful about overflow on the "+1" */ nrpages = end_index - start_index + 1; @@ -96,11 +122,21 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) filemap_flush(mapping); /* First and last FULL page! 
*/ - start_index = (offset + (PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT; + start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT; end_index = (endbyte >> PAGE_CACHE_SHIFT); - if (end_index > start_index) - invalidate_mapping_pages(mapping, start_index, end_index-1); + if (end_index >= start_index) + invalidate_mapping_pages(mapping, start_index, + end_index); + break; + case LINUX_FADV_ASYNC_WRITE: + ret = __filemap_fdatawrite_range(mapping, offset, endbyte, + WB_SYNC_NONE); + break; + case LINUX_FADV_WRITE_WAIT: + ret = wait_on_page_writeback_range(mapping, + offset >> PAGE_CACHE_SHIFT, + endbyte >> PAGE_CACHE_SHIFT); break; default: ret = -EINVAL; diff --git a/mm/filemap.c b/mm/filemap.c index e8f58f7dd7a..3ef20739e72 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -29,6 +29,7 @@ #include <linux/blkdev.h> #include <linux/security.h> #include <linux/syscalls.h> +#include <linux/cpuset.h> #include "filemap.h" #include "internal.h" @@ -174,7 +175,7 @@ static int sync_page(void *word) * dirty pages that lie within the byte offsets <start, end> * @mapping: address space structure to write * @start: offset in bytes where the range starts - * @end: offset in bytes where the range ends + * @end: offset in bytes where the range ends (inclusive) * @sync_mode: enable synchronous operation * * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as @@ -182,8 +183,8 @@ static int sync_page(void *word) * these two operations is that if a dirty page/buffer is encountered, it must * be waited upon, and not just skipped over. */ -static int __filemap_fdatawrite_range(struct address_space *mapping, - loff_t start, loff_t end, int sync_mode) +int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, + loff_t end, int sync_mode) { int ret; struct writeback_control wbc = { @@ -212,8 +213,8 @@ int filemap_fdatawrite(struct address_space *mapping) } EXPORT_SYMBOL(filemap_fdatawrite); -static int filemap_fdatawrite_range(struct address_space *mapping, - loff_t start, loff_t end) +static int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, + loff_t end) { return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL); } @@ -232,7 +233,7 @@ EXPORT_SYMBOL(filemap_flush); * Wait for writeback to complete against pages indexed by start->end * inclusive */ -static int wait_on_page_writeback_range(struct address_space *mapping, +int wait_on_page_writeback_range(struct address_space *mapping, pgoff_t start, pgoff_t end) { struct pagevec pvec; @@ -367,6 +368,12 @@ int filemap_write_and_wait(struct address_space *mapping) } EXPORT_SYMBOL(filemap_write_and_wait); +/* + * Write out and wait upon file offsets lstart->lend, inclusive. + * + * Note that `lend' is inclusive (describes the last byte to be written) so + * that this function can be used to write to the very end-of-file (end = -1). 
+ */ int filemap_write_and_wait_range(struct address_space *mapping, loff_t lstart, loff_t lend) { @@ -427,6 +434,28 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping, return ret; } +#ifdef CONFIG_NUMA +struct page *page_cache_alloc(struct address_space *x) +{ + if (cpuset_do_page_mem_spread()) { + int n = cpuset_mem_spread_node(); + return alloc_pages_node(n, mapping_gfp_mask(x), 0); + } + return alloc_pages(mapping_gfp_mask(x), 0); +} +EXPORT_SYMBOL(page_cache_alloc); + +struct page *page_cache_alloc_cold(struct address_space *x) +{ + if (cpuset_do_page_mem_spread()) { + int n = cpuset_mem_spread_node(); + return alloc_pages_node(n, mapping_gfp_mask(x)|__GFP_COLD, 0); + } + return alloc_pages(mapping_gfp_mask(x)|__GFP_COLD, 0); +} +EXPORT_SYMBOL(page_cache_alloc_cold); +#endif + /* * In order to wait for pages to become available there must be * waitqueues associated with pages. By using a hash table of diff --git a/mm/memory.c b/mm/memory.c index 80c3fb370f9..e347e106ca3 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -395,12 +395,16 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_ return NULL; } -#ifdef CONFIG_DEBUG_VM + /* + * Add some anal sanity checks for now. Eventually, + * we should just do "return pfn_to_page(pfn)", but + * in the meantime we check that we get a valid pfn, + * and that the resulting page looks ok. + */ if (unlikely(!pfn_valid(pfn))) { print_bad_pte(vma, pte, addr); return NULL; } -#endif /* * NOTE! We still have PageReserved() pages in the page diff --git a/mm/mempolicy.c b/mm/mempolicy.c index e93cc740c22..4f71cfd29c6 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -422,6 +422,37 @@ static int contextualize_policy(int mode, nodemask_t *nodes) return mpol_check_policy(mode, nodes); } + +/* + * Update task->flags PF_MEMPOLICY bit: set iff non-default + * mempolicy. Allows more rapid checking of this (combined perhaps + * with other PF_* flag bits) on memory allocation hot code paths. + * + * If called from outside this file, the task 'p' should -only- be + * a newly forked child not yet visible on the task list, because + * manipulating the task flags of a visible task is not safe. + * + * The above limitation is why this routine has the funny name + * mpol_fix_fork_child_flag(). + * + * It is also safe to call this with a task pointer of current, + * which the static wrapper mpol_set_task_struct_flag() does, + * for use within this file. + */ + +void mpol_fix_fork_child_flag(struct task_struct *p) +{ + if (p->mempolicy) + p->flags |= PF_MEMPOLICY; + else + p->flags &= ~PF_MEMPOLICY; +} + +static void mpol_set_task_struct_flag(void) +{ + mpol_fix_fork_child_flag(current); +} + /* Set the process memory policy */ long do_set_mempolicy(int mode, nodemask_t *nodes) { @@ -434,6 +465,7 @@ long do_set_mempolicy(int mode, nodemask_t *nodes) return PTR_ERR(new); mpol_free(current->mempolicy); current->mempolicy = new; + mpol_set_task_struct_flag(); if (new && new->policy == MPOL_INTERLEAVE) current->il_next = first_node(new->v.nodes); return 0; diff --git a/mm/mmap.c b/mm/mmap.c index 0eb9894db6d..4f5b5709136 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1040,12 +1040,11 @@ munmap_back: * specific mapper. the address has already been validated, but * not unmapped, but the maps are removed from the list. 
*/ - vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); if (!vma) { error = -ENOMEM; goto unacct_error; } - memset(vma, 0, sizeof(*vma)); vma->vm_mm = mm; vma->vm_start = addr; @@ -1896,12 +1895,11 @@ unsigned long do_brk(unsigned long addr, unsigned long len) /* * create a vma struct for an anonymous mapping */ - vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); if (!vma) { vm_unacct_memory(len >> PAGE_SHIFT); return -ENOMEM; } - memset(vma, 0, sizeof(*vma)); vma->vm_mm = mm; vma->vm_start = addr; diff --git a/mm/msync.c b/mm/msync.c index 3563a56e1a5..bc6c9537636 100644 --- a/mm/msync.c +++ b/mm/msync.c @@ -9,20 +9,24 @@ */ #include <linux/slab.h> #include <linux/pagemap.h> +#include <linux/fs.h> #include <linux/mm.h> #include <linux/mman.h> #include <linux/hugetlb.h> +#include <linux/writeback.h> +#include <linux/file.h> #include <linux/syscalls.h> #include <asm/pgtable.h> #include <asm/tlbflush.h> -static void msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd, +static unsigned long msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end) { pte_t *pte; spinlock_t *ptl; int progress = 0; + unsigned long ret = 0; again: pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); @@ -43,58 +47,64 @@ again: if (!page) continue; if (ptep_clear_flush_dirty(vma, addr, pte) || - page_test_and_clear_dirty(page)) - set_page_dirty(page); + page_test_and_clear_dirty(page)) + ret += set_page_dirty(page); progress += 3; } while (pte++, addr += PAGE_SIZE, addr != end); pte_unmap_unlock(pte - 1, ptl); cond_resched(); if (addr != end) goto again; + return ret; } -static inline void msync_pmd_range(struct vm_area_struct *vma, pud_t *pud, - unsigned long addr, unsigned long end) +static inline unsigned long msync_pmd_range(struct vm_area_struct *vma, + pud_t *pud, unsigned long addr, unsigned long end) { pmd_t *pmd; unsigned long next; + unsigned long ret = 0; pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); if (pmd_none_or_clear_bad(pmd)) continue; - msync_pte_range(vma, pmd, addr, next); + ret += msync_pte_range(vma, pmd, addr, next); } while (pmd++, addr = next, addr != end); + return ret; } -static inline void msync_pud_range(struct vm_area_struct *vma, pgd_t *pgd, - unsigned long addr, unsigned long end) +static inline unsigned long msync_pud_range(struct vm_area_struct *vma, + pgd_t *pgd, unsigned long addr, unsigned long end) { pud_t *pud; unsigned long next; + unsigned long ret = 0; pud = pud_offset(pgd, addr); do { next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(pud)) continue; - msync_pmd_range(vma, pud, addr, next); + ret += msync_pmd_range(vma, pud, addr, next); } while (pud++, addr = next, addr != end); + return ret; } -static void msync_page_range(struct vm_area_struct *vma, +static unsigned long msync_page_range(struct vm_area_struct *vma, unsigned long addr, unsigned long end) { pgd_t *pgd; unsigned long next; + unsigned long ret = 0; /* For hugepages we can't go walking the page table normally, * but that's ok, hugetlbfs is memory based, so we don't need * to do anything more on an msync(). 
*/ if (vma->vm_flags & VM_HUGETLB) - return; + return 0; BUG_ON(addr >= end); pgd = pgd_offset(vma->vm_mm, addr); @@ -103,8 +113,9 @@ static void msync_page_range(struct vm_area_struct *vma, next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; - msync_pud_range(vma, pgd, addr, next); + ret += msync_pud_range(vma, pgd, addr, next); } while (pgd++, addr = next, addr != end); + return ret; } /* @@ -115,53 +126,31 @@ static void msync_page_range(struct vm_area_struct *vma, * write out the dirty pages and wait on the writeout and check the result. * Or the application may run fadvise(FADV_DONTNEED) against the fd to start * async writeout immediately. - * So my _not_ starting I/O in MS_ASYNC we provide complete flexibility to + * So by _not_ starting I/O in MS_ASYNC we provide complete flexibility to * applications. */ -static int msync_interval(struct vm_area_struct *vma, - unsigned long addr, unsigned long end, int flags) +static int msync_interval(struct vm_area_struct *vma, unsigned long addr, + unsigned long end, int flags, + unsigned long *nr_pages_dirtied) { - int ret = 0; struct file *file = vma->vm_file; if ((flags & MS_INVALIDATE) && (vma->vm_flags & VM_LOCKED)) return -EBUSY; - if (file && (vma->vm_flags & VM_SHARED)) { - msync_page_range(vma, addr, end); - - if (flags & MS_SYNC) { - struct address_space *mapping = file->f_mapping; - int err; - - ret = filemap_fdatawrite(mapping); - if (file->f_op && file->f_op->fsync) { - /* - * We don't take i_mutex here because mmap_sem - * is already held. - */ - err = file->f_op->fsync(file,file->f_dentry,1); - if (err && !ret) - ret = err; - } - err = filemap_fdatawait(mapping); - if (!ret) - ret = err; - } - } - return ret; + if (file && (vma->vm_flags & VM_SHARED)) + *nr_pages_dirtied = msync_page_range(vma, addr, end); + return 0; } asmlinkage long sys_msync(unsigned long start, size_t len, int flags) { unsigned long end; struct vm_area_struct *vma; - int unmapped_error, error = -EINVAL; - - if (flags & MS_SYNC) - current->flags |= PF_SYNCWRITE; + int unmapped_error = 0; + int error = -EINVAL; + int done = 0; - down_read(¤t->mm->mmap_sem); if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC)) goto out; if (start & ~PAGE_MASK) @@ -180,13 +169,18 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags) * If the interval [start,end) covers some unmapped address ranges, * just ignore them, but return -ENOMEM at the end. */ + down_read(¤t->mm->mmap_sem); + if (flags & MS_SYNC) + current->flags |= PF_SYNCWRITE; vma = find_vma(current->mm, start); - unmapped_error = 0; - for (;;) { - /* Still start < end. */ + if (!vma) { error = -ENOMEM; - if (!vma) - goto out; + goto out_unlock; + } + do { + unsigned long nr_pages_dirtied = 0; + struct file *file; + /* Here start < vma->vm_end. */ if (start < vma->vm_start) { unmapped_error = -ENOMEM; @@ -195,22 +189,47 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags) /* Here vma->vm_start <= start < vma->vm_end. */ if (end <= vma->vm_end) { if (start < end) { - error = msync_interval(vma, start, end, flags); + error = msync_interval(vma, start, end, flags, + &nr_pages_dirtied); if (error) - goto out; + goto out_unlock; } error = unmapped_error; - goto out; + done = 1; + } else { + /* Here vma->vm_start <= start < vma->vm_end < end. */ + error = msync_interval(vma, start, vma->vm_end, flags, + &nr_pages_dirtied); + if (error) + goto out_unlock; } - /* Here vma->vm_start <= start < vma->vm_end < end. 
*/ - error = msync_interval(vma, start, vma->vm_end, flags); - if (error) - goto out; + file = vma->vm_file; start = vma->vm_end; - vma = vma->vm_next; - } -out: - up_read(¤t->mm->mmap_sem); + if ((flags & MS_ASYNC) && file && nr_pages_dirtied) { + get_file(file); + up_read(¤t->mm->mmap_sem); + balance_dirty_pages_ratelimited_nr(file->f_mapping, + nr_pages_dirtied); + fput(file); + down_read(¤t->mm->mmap_sem); + vma = find_vma(current->mm, start); + } else if ((flags & MS_SYNC) && file && + (vma->vm_flags & VM_SHARED)) { + get_file(file); + up_read(¤t->mm->mmap_sem); + error = do_fsync(file, 0); + fput(file); + down_read(¤t->mm->mmap_sem); + if (error) + goto out_unlock; + vma = find_vma(current->mm, start); + } else { + vma = vma->vm_next; + } + } while (vma && !done); +out_unlock: current->flags &= ~PF_SYNCWRITE; + up_read(¤t->mm->mmap_sem); +out: return error; } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 945559fb63d..893d7677579 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -75,12 +75,12 @@ int vm_dirty_ratio = 40; * The interval between `kupdate'-style writebacks, in centiseconds * (hundredths of a second) */ -int dirty_writeback_centisecs = 5 * 100; +int dirty_writeback_interval = 5 * HZ; /* * The longest number of centiseconds for which data is allowed to remain dirty */ -int dirty_expire_centisecs = 30 * 100; +int dirty_expire_interval = 30 * HZ; /* * Flag that makes the machine dump writes/reads and block dirtyings. @@ -88,7 +88,8 @@ int dirty_expire_centisecs = 30 * 100; int block_dump; /* - * Flag that puts the machine in "laptop mode". + * Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies: + * a full sync is triggered after this time elapses without any disk activity. */ int laptop_mode; @@ -255,8 +256,9 @@ static void balance_dirty_pages(struct address_space *mapping) } /** - * balance_dirty_pages_ratelimited - balance dirty memory state + * balance_dirty_pages_ratelimited_nr - balance dirty memory state * @mapping: address_space which was dirtied + * @nr_pages: number of pages which the caller has just dirtied * * Processes which are dirtying memory should call in here once for each page * which was newly dirtied. The function will periodically check the system's @@ -267,10 +269,12 @@ static void balance_dirty_pages(struct address_space *mapping) * limit we decrease the ratelimiting by a lot, to prevent individual processes * from overshooting the limit by (ratelimit_pages) each. */ -void balance_dirty_pages_ratelimited(struct address_space *mapping) +void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, + unsigned long nr_pages_dirtied) { - static DEFINE_PER_CPU(int, ratelimits) = 0; - long ratelimit; + static DEFINE_PER_CPU(unsigned long, ratelimits) = 0; + unsigned long ratelimit; + unsigned long *p; ratelimit = ratelimit_pages; if (dirty_exceeded) @@ -280,15 +284,18 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping) * Check the rate limiting. Also, we do not want to throttle real-time * tasks in balance_dirty_pages(). Period. 
*/ - if (get_cpu_var(ratelimits)++ >= ratelimit) { - __get_cpu_var(ratelimits) = 0; - put_cpu_var(ratelimits); + preempt_disable(); + p = &__get_cpu_var(ratelimits); + *p += nr_pages_dirtied; + if (unlikely(*p >= ratelimit)) { + *p = 0; + preempt_enable(); balance_dirty_pages(mapping); return; } - put_cpu_var(ratelimits); + preempt_enable(); } -EXPORT_SYMBOL(balance_dirty_pages_ratelimited); +EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr); void throttle_vm_writeout(void) { @@ -380,8 +387,8 @@ static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0); * just walks the superblock inode list, writing back any inodes which are * older than a specific point in time. * - * Try to run once per dirty_writeback_centisecs. But if a writeback event - * takes longer than a dirty_writeback_centisecs interval, then leave a + * Try to run once per dirty_writeback_interval. But if a writeback event + * takes longer than a dirty_writeback_interval interval, then leave a * one-second gap. * * older_than_this takes precedence over nr_to_write. So we'll only write back @@ -406,9 +413,9 @@ static void wb_kupdate(unsigned long arg) sync_supers(); get_writeback_state(&wbs); - oldest_jif = jiffies - (dirty_expire_centisecs * HZ) / 100; + oldest_jif = jiffies - dirty_expire_interval; start_jif = jiffies; - next_jif = start_jif + (dirty_writeback_centisecs * HZ) / 100; + next_jif = start_jif + dirty_writeback_interval; nr_to_write = wbs.nr_dirty + wbs.nr_unstable + (inodes_stat.nr_inodes - inodes_stat.nr_unused); while (nr_to_write > 0) { @@ -425,7 +432,7 @@ static void wb_kupdate(unsigned long arg) } if (time_before(next_jif, jiffies + HZ)) next_jif = jiffies + HZ; - if (dirty_writeback_centisecs) + if (dirty_writeback_interval) mod_timer(&wb_timer, next_jif); } @@ -435,11 +442,11 @@ static void wb_kupdate(unsigned long arg) int dirty_writeback_centisecs_handler(ctl_table *table, int write, struct file *file, void __user *buffer, size_t *length, loff_t *ppos) { - proc_dointvec(table, write, file, buffer, length, ppos); - if (dirty_writeback_centisecs) { + proc_dointvec_userhz_jiffies(table, write, file, buffer, length, ppos); + if (dirty_writeback_interval) { mod_timer(&wb_timer, - jiffies + (dirty_writeback_centisecs * HZ) / 100); - } else { + jiffies + dirty_writeback_interval); + } else { del_timer(&wb_timer); } return 0; @@ -468,7 +475,7 @@ static void laptop_timer_fn(unsigned long unused) */ void laptop_io_completion(void) { - mod_timer(&laptop_mode_wb_timer, jiffies + laptop_mode * HZ); + mod_timer(&laptop_mode_wb_timer, jiffies + laptop_mode); } /* @@ -544,7 +551,7 @@ void __init page_writeback_init(void) if (vm_dirty_ratio <= 0) vm_dirty_ratio = 1; } - mod_timer(&wb_timer, jiffies + (dirty_writeback_centisecs * HZ) / 100); + mod_timer(&wb_timer, jiffies + dirty_writeback_interval); set_ratelimit(); register_cpu_notifier(&ratelimit_nb); } @@ -621,8 +628,6 @@ EXPORT_SYMBOL(write_one_page); */ int __set_page_dirty_nobuffers(struct page *page) { - int ret = 0; - if (!TestSetPageDirty(page)) { struct address_space *mapping = page_mapping(page); struct address_space *mapping2; @@ -644,8 +649,9 @@ int __set_page_dirty_nobuffers(struct page *page) I_DIRTY_PAGES); } } + return 1; } - return ret; + return 0; } EXPORT_SYMBOL(__set_page_dirty_nobuffers); @@ -675,8 +681,10 @@ int fastcall set_page_dirty(struct page *page) return (*spd)(page); return __set_page_dirty_buffers(page); } - if (!PageDirty(page)) - SetPageDirty(page); + if (!PageDirty(page)) { + if (!TestSetPageDirty(page)) + return 1; + } return 0; 
} EXPORT_SYMBOL(set_page_dirty); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b7f14a4799a..338a02bb004 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -943,7 +943,8 @@ restart: goto got_pg; do { - wakeup_kswapd(*z, order); + if (cpuset_zone_allowed(*z, gfp_mask)) + wakeup_kswapd(*z, order); } while (*(++z)); /* @@ -2028,8 +2029,9 @@ static __meminit void zone_pcp_init(struct zone *zone) setup_pageset(zone_pcp(zone,cpu), batch); #endif } - printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", - zone->name, zone->present_pages, batch); + if (zone->present_pages) + printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", + zone->name, zone->present_pages, batch); } static __meminit void init_currently_empty_zone(struct zone *zone, @@ -2700,8 +2702,7 @@ void *__init alloc_large_system_hash(const char *tablename, else numentries <<= (PAGE_SHIFT - scale); } - /* rounded up to nearest power of 2 in size */ - numentries = 1UL << (long_log2(numentries) + 1); + numentries = roundup_pow_of_two(numentries); /* limit allocation size to 1/16 total memory by default */ if (max == 0) { diff --git a/mm/slab.c b/mm/slab.c index 1c8f5ee230d..681837499d7 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -94,6 +94,7 @@ #include <linux/interrupt.h> #include <linux/init.h> #include <linux/compiler.h> +#include <linux/cpuset.h> #include <linux/seq_file.h> #include <linux/notifier.h> #include <linux/kallsyms.h> @@ -173,12 +174,12 @@ SLAB_CACHE_DMA | \ SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \ SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ - SLAB_DESTROY_BY_RCU) + SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD) #else # define CREATE_MASK (SLAB_HWCACHE_ALIGN | \ SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \ SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ - SLAB_DESTROY_BY_RCU) + SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD) #endif /* @@ -203,7 +204,8 @@ typedef unsigned int kmem_bufctl_t; #define BUFCTL_END (((kmem_bufctl_t)(~0U))-0) #define BUFCTL_FREE (((kmem_bufctl_t)(~0U))-1) -#define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-2) +#define BUFCTL_ACTIVE (((kmem_bufctl_t)(~0U))-2) +#define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-3) /* Max number of objs-per-slab for caches which use off-slab slabs. * Needed to avoid a possible looping condition in cache_grow(). @@ -896,8 +898,33 @@ static struct array_cache *alloc_arraycache(int node, int entries, return nc; } +/* + * Transfer objects in one arraycache to another. + * Locking must be handled by the caller. + * + * Return the number of entries transferred. + */ +static int transfer_objects(struct array_cache *to, + struct array_cache *from, unsigned int max) +{ + /* Figure out how many entries to transfer */ + int nr = min(min(from->avail, max), to->limit - to->avail); + + if (!nr) + return 0; + + memcpy(to->entry + to->avail, from->entry + from->avail -nr, + sizeof(void *) *nr); + + from->avail -= nr; + to->avail += nr; + to->touched = 1; + return nr; +} + #ifdef CONFIG_NUMA static void *__cache_alloc_node(struct kmem_cache *, gfp_t, int); +static void *alternate_node_alloc(struct kmem_cache *, gfp_t); static struct array_cache **alloc_alien_cache(int node, int limit) { @@ -944,6 +971,13 @@ static void __drain_alien_cache(struct kmem_cache *cachep, if (ac->avail) { spin_lock(&rl3->list_lock); + /* + * Stuff objects into the remote nodes shared array first. + * That way we could avoid the overhead of putting the objects + * into the free lists and getting them back later. 
+ */ + transfer_objects(rl3->shared, ac, ac->limit); + free_block(cachep, ac->entry, ac->avail, node); ac->avail = 0; spin_unlock(&rl3->list_lock); @@ -959,8 +993,8 @@ static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3) if (l3->alien) { struct array_cache *ac = l3->alien[node]; - if (ac && ac->avail) { - spin_lock_irq(&ac->lock); + + if (ac && ac->avail && spin_trylock_irq(&ac->lock)) { __drain_alien_cache(cachep, ac, node); spin_unlock_irq(&ac->lock); } @@ -1987,10 +2021,9 @@ kmem_cache_create (const char *name, size_t size, size_t align, align = ralign; /* Get cache's description obj. */ - cachep = kmem_cache_alloc(&cache_cache, SLAB_KERNEL); + cachep = kmem_cache_zalloc(&cache_cache, SLAB_KERNEL); if (!cachep) goto oops; - memset(cachep, 0, sizeof(struct kmem_cache)); #if DEBUG cachep->obj_size = size; @@ -2397,7 +2430,7 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, /* Verify that the slab belongs to the intended node */ WARN_ON(slabp->nodeid != nodeid); - if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) { + if (slab_bufctl(slabp)[objnr] + 1 <= SLAB_LIMIT + 1) { printk(KERN_ERR "slab: double free detected in cache " "'%s', objp %p\n", cachep->name, objp); BUG(); @@ -2603,6 +2636,9 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, */ cachep->dtor(objp + obj_offset(cachep), cachep, 0); } +#ifdef CONFIG_DEBUG_SLAB_LEAK + slab_bufctl(slabp)[objnr] = BUFCTL_FREE; +#endif if (cachep->flags & SLAB_POISON) { #ifdef CONFIG_DEBUG_PAGEALLOC if ((cachep->buffer_size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { @@ -2675,20 +2711,10 @@ retry: BUG_ON(ac->avail > 0 || !l3); spin_lock(&l3->list_lock); - if (l3->shared) { - struct array_cache *shared_array = l3->shared; - if (shared_array->avail) { - if (batchcount > shared_array->avail) - batchcount = shared_array->avail; - shared_array->avail -= batchcount; - ac->avail = batchcount; - memcpy(ac->entry, - &(shared_array->entry[shared_array->avail]), - sizeof(void *) * batchcount); - shared_array->touched = 1; - goto alloc_done; - } - } + /* See if we can refill from the shared array */ + if (l3->shared && transfer_objects(ac, l3->shared, batchcount)) + goto alloc_done; + while (batchcount > 0) { struct list_head *entry; struct slab *slabp; @@ -2786,6 +2812,16 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, *dbg_redzone1(cachep, objp) = RED_ACTIVE; *dbg_redzone2(cachep, objp) = RED_ACTIVE; } +#ifdef CONFIG_DEBUG_SLAB_LEAK + { + struct slab *slabp; + unsigned objnr; + + slabp = page_get_slab(virt_to_page(objp)); + objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size; + slab_bufctl(slabp)[objnr] = BUFCTL_ACTIVE; + } +#endif objp += obj_offset(cachep); if (cachep->ctor && cachep->flags & SLAB_POISON) { unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR; @@ -2807,11 +2843,10 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) struct array_cache *ac; #ifdef CONFIG_NUMA - if (unlikely(current->mempolicy && !in_interrupt())) { - int nid = slab_node(current->mempolicy); - - if (nid != numa_node_id()) - return __cache_alloc_node(cachep, flags, nid); + if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) { + objp = alternate_node_alloc(cachep, flags); + if (objp != NULL) + return objp; } #endif @@ -2847,6 +2882,28 @@ static __always_inline void *__cache_alloc(struct kmem_cache *cachep, #ifdef CONFIG_NUMA /* + * Try allocating on another node if PF_SPREAD_SLAB|PF_MEMPOLICY. 
+ * + * If we are in_interrupt, then process context, including cpusets and + * mempolicy, may not apply and should not be used for allocation policy. + */ +static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) +{ + int nid_alloc, nid_here; + + if (in_interrupt()) + return NULL; + nid_alloc = nid_here = numa_node_id(); + if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) + nid_alloc = cpuset_mem_spread_node(); + else if (current->mempolicy) + nid_alloc = slab_node(current->mempolicy); + if (nid_alloc != nid_here) + return __cache_alloc_node(cachep, flags, nid_alloc); + return NULL; +} + +/* * A interface to enable slab creation on nodeid */ static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, @@ -3071,6 +3128,23 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) EXPORT_SYMBOL(kmem_cache_alloc); /** + * kmem_cache_alloc - Allocate an object. The memory is set to zero. + * @cache: The cache to allocate from. + * @flags: See kmalloc(). + * + * Allocate an object from this cache and set the allocated memory to zero. + * The flags are only relevant if the cache has no available objects. + */ +void *kmem_cache_zalloc(struct kmem_cache *cache, gfp_t flags) +{ + void *ret = __cache_alloc(cache, flags, __builtin_return_address(0)); + if (ret) + memset(ret, 0, obj_size(cache)); + return ret; +} +EXPORT_SYMBOL(kmem_cache_zalloc); + +/** * kmem_ptr_validate - check if an untrusted pointer might * be a slab entry. * @cachep: the cache we're checking against @@ -3197,22 +3271,23 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, return __cache_alloc(cachep, flags, caller); } -#ifndef CONFIG_DEBUG_SLAB void *__kmalloc(size_t size, gfp_t flags) { +#ifndef CONFIG_DEBUG_SLAB return __do_kmalloc(size, flags, NULL); +#else + return __do_kmalloc(size, flags, __builtin_return_address(0)); +#endif } EXPORT_SYMBOL(__kmalloc); -#else - +#ifdef CONFIG_DEBUG_SLAB void *__kmalloc_track_caller(size_t size, gfp_t flags, void *caller) { return __do_kmalloc(size, flags, caller); } EXPORT_SYMBOL(__kmalloc_track_caller); - #endif #ifdef CONFIG_SMP @@ -3343,63 +3418,86 @@ const char *kmem_cache_name(struct kmem_cache *cachep) EXPORT_SYMBOL_GPL(kmem_cache_name); /* - * This initializes kmem_list3 for all nodes. + * This initializes kmem_list3 or resizes varioius caches for all nodes. 
*/ static int alloc_kmemlist(struct kmem_cache *cachep) { int node; struct kmem_list3 *l3; - int err = 0; + struct array_cache *new_shared; + struct array_cache **new_alien; for_each_online_node(node) { - struct array_cache *nc = NULL, *new; - struct array_cache **new_alien = NULL; -#ifdef CONFIG_NUMA + new_alien = alloc_alien_cache(node, cachep->limit); if (!new_alien) goto fail; -#endif - new = alloc_arraycache(node, cachep->shared*cachep->batchcount, + + new_shared = alloc_arraycache(node, + cachep->shared*cachep->batchcount, 0xbaadf00d); - if (!new) + if (!new_shared) { + free_alien_cache(new_alien); goto fail; + } + l3 = cachep->nodelists[node]; if (l3) { + struct array_cache *shared = l3->shared; + spin_lock_irq(&l3->list_lock); - nc = cachep->nodelists[node]->shared; - if (nc) - free_block(cachep, nc->entry, nc->avail, node); + if (shared) + free_block(cachep, shared->entry, + shared->avail, node); - l3->shared = new; - if (!cachep->nodelists[node]->alien) { + l3->shared = new_shared; + if (!l3->alien) { l3->alien = new_alien; new_alien = NULL; } l3->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount + cachep->num; spin_unlock_irq(&l3->list_lock); - kfree(nc); + kfree(shared); free_alien_cache(new_alien); continue; } l3 = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, node); - if (!l3) + if (!l3) { + free_alien_cache(new_alien); + kfree(new_shared); goto fail; + } kmem_list3_init(l3); l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + ((unsigned long)cachep) % REAPTIMEOUT_LIST3; - l3->shared = new; + l3->shared = new_shared; l3->alien = new_alien; l3->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount + cachep->num; cachep->nodelists[node] = l3; } - return err; + return 0; + fail: - err = -ENOMEM; - return err; + if (!cachep->next.next) { + /* Cache is not active yet. 
Roll back what we did */ + node--; + while (node >= 0) { + if (cachep->nodelists[node]) { + l3 = cachep->nodelists[node]; + + kfree(l3->shared); + free_alien_cache(l3->alien); + kfree(l3); + cachep->nodelists[node] = NULL; + } + node--; + } + } + return -ENOMEM; } struct ccupdate_struct { @@ -3876,6 +3974,159 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer, res = count; return res; } + +#ifdef CONFIG_DEBUG_SLAB_LEAK + +static void *leaks_start(struct seq_file *m, loff_t *pos) +{ + loff_t n = *pos; + struct list_head *p; + + mutex_lock(&cache_chain_mutex); + p = cache_chain.next; + while (n--) { + p = p->next; + if (p == &cache_chain) + return NULL; + } + return list_entry(p, struct kmem_cache, next); +} + +static inline int add_caller(unsigned long *n, unsigned long v) +{ + unsigned long *p; + int l; + if (!v) + return 1; + l = n[1]; + p = n + 2; + while (l) { + int i = l/2; + unsigned long *q = p + 2 * i; + if (*q == v) { + q[1]++; + return 1; + } + if (*q > v) { + l = i; + } else { + p = q + 2; + l -= i + 1; + } + } + if (++n[1] == n[0]) + return 0; + memmove(p + 2, p, n[1] * 2 * sizeof(unsigned long) - ((void *)p - (void *)n)); + p[0] = v; + p[1] = 1; + return 1; +} + +static void handle_slab(unsigned long *n, struct kmem_cache *c, struct slab *s) +{ + void *p; + int i; + if (n[0] == n[1]) + return; + for (i = 0, p = s->s_mem; i < c->num; i++, p += c->buffer_size) { + if (slab_bufctl(s)[i] != BUFCTL_ACTIVE) + continue; + if (!add_caller(n, (unsigned long)*dbg_userword(c, p))) + return; + } +} + +static void show_symbol(struct seq_file *m, unsigned long address) +{ +#ifdef CONFIG_KALLSYMS + char *modname; + const char *name; + unsigned long offset, size; + char namebuf[KSYM_NAME_LEN+1]; + + name = kallsyms_lookup(address, &size, &offset, &modname, namebuf); + + if (name) { + seq_printf(m, "%s+%#lx/%#lx", name, offset, size); + if (modname) + seq_printf(m, " [%s]", modname); + return; + } +#endif + seq_printf(m, "%p", (void *)address); +} + +static int leaks_show(struct seq_file *m, void *p) +{ + struct kmem_cache *cachep = p; + struct list_head *q; + struct slab *slabp; + struct kmem_list3 *l3; + const char *name; + unsigned long *n = m->private; + int node; + int i; + + if (!(cachep->flags & SLAB_STORE_USER)) + return 0; + if (!(cachep->flags & SLAB_RED_ZONE)) + return 0; + + /* OK, we can do it */ + + n[1] = 0; + + for_each_online_node(node) { + l3 = cachep->nodelists[node]; + if (!l3) + continue; + + check_irq_on(); + spin_lock_irq(&l3->list_lock); + + list_for_each(q, &l3->slabs_full) { + slabp = list_entry(q, struct slab, list); + handle_slab(n, cachep, slabp); + } + list_for_each(q, &l3->slabs_partial) { + slabp = list_entry(q, struct slab, list); + handle_slab(n, cachep, slabp); + } + spin_unlock_irq(&l3->list_lock); + } + name = cachep->name; + if (n[0] == n[1]) { + /* Increase the buffer size */ + mutex_unlock(&cache_chain_mutex); + m->private = kzalloc(n[0] * 4 * sizeof(unsigned long), GFP_KERNEL); + if (!m->private) { + /* Too bad, we are really out */ + m->private = n; + mutex_lock(&cache_chain_mutex); + return -ENOMEM; + } + *(unsigned long *)m->private = n[0] * 2; + kfree(n); + mutex_lock(&cache_chain_mutex); + /* Now make sure this entry will be retried */ + m->count = m->size; + return 0; + } + for (i = 0; i < n[1]; i++) { + seq_printf(m, "%s: %lu ", name, n[2*i+3]); + show_symbol(m, n[2*i+2]); + seq_putc(m, '\n'); + } + return 0; +} + +struct seq_operations slabstats_op = { + .start = leaks_start, + .next = s_next, + .stop = s_stop, + .show = 
leaks_show, +}; +#endif #endif /** diff --git a/mm/slob.c b/mm/slob.c index a1f42bdc024..9bcc7e2cabf 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -294,6 +294,16 @@ void *kmem_cache_alloc(struct kmem_cache *c, gfp_t flags) } EXPORT_SYMBOL(kmem_cache_alloc); +void *kmem_cache_zalloc(struct kmem_cache *c, gfp_t flags) +{ + void *ret = kmem_cache_alloc(c, flags); + if (ret) + memset(ret, 0, c->size); + + return ret; +} +EXPORT_SYMBOL(kmem_cache_zalloc); + void kmem_cache_free(struct kmem_cache *c, void *b) { if (c->dtor) diff --git a/mm/util.c b/mm/util.c index 5f4bb59da63..7368479220b 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1,20 +1,22 @@ #include <linux/slab.h> #include <linux/string.h> #include <linux/module.h> +#include <linux/err.h> +#include <asm/uaccess.h> /** - * kzalloc - allocate memory. The memory is set to zero. + * __kzalloc - allocate memory. The memory is set to zero. * @size: how many bytes of memory are required. * @flags: the type of memory to allocate. */ -void *kzalloc(size_t size, gfp_t flags) +void *__kzalloc(size_t size, gfp_t flags) { - void *ret = kmalloc(size, flags); + void *ret = ____kmalloc(size, flags); if (ret) memset(ret, 0, size); return ret; } -EXPORT_SYMBOL(kzalloc); +EXPORT_SYMBOL(__kzalloc); /* * kstrdup - allocate space for and copy an existing string @@ -31,9 +33,44 @@ char *kstrdup(const char *s, gfp_t gfp) return NULL; len = strlen(s) + 1; - buf = kmalloc(len, gfp); + buf = ____kmalloc(len, gfp); if (buf) memcpy(buf, s, len); return buf; } EXPORT_SYMBOL(kstrdup); + +/* + * strndup_user - duplicate an existing string from user space + * + * @s: The string to duplicate + * @n: Maximum number of bytes to copy, including the trailing NUL. + */ +char *strndup_user(const char __user *s, long n) +{ + char *p; + long length; + + length = strnlen_user(s, n); + + if (!length) + return ERR_PTR(-EFAULT); + + if (length > n) + return ERR_PTR(-EINVAL); + + p = kmalloc(length, GFP_KERNEL); + + if (!p) + return ERR_PTR(-ENOMEM); + + if (copy_from_user(p, s, length)) { + kfree(p); + return ERR_PTR(-EFAULT); + } + + p[length - 1] = '\0'; + + return p; +} +EXPORT_SYMBOL(strndup_user); diff --git a/mm/vmscan.c b/mm/vmscan.c index fd572bbdc9f..78865c849f8 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1356,7 +1356,9 @@ static int __init kswapd_init(void) pid = kernel_thread(kswapd, pgdat, CLONE_KERNEL); BUG_ON(pid < 0); + read_lock(&tasklist_lock); pgdat->kswapd = find_task_by_pid(pid); + read_unlock(&tasklist_lock); } total_memory = nr_free_pagecache_pages(); hotcpu_notifier(cpu_callback, 0); |
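
With the msync.c and fadvise.c changes above, MS_ASYNC no longer starts I/O: it only propagates pte dirty bits into the page cache, and the application drives writeout itself through the two new fadvise hints. Below is a minimal userspace sketch of the WRITE_WAIT / ASYNC_WRITE / WRITE_WAIT pattern described in the fadvise.c comment. It assumes glibc's posix_fadvise() passes the advice value straight through to the kernel; the LINUX_FADV_* numbers are an assumption (they must match this tree's include/linux/fadvise.h, which is outside the mm/ diff), and the file name is illustrative.

#define _XOPEN_SOURCE 600	/* for posix_fadvise() */

#include <fcntl.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

/* Assumed values; must match this tree's include/linux/fadvise.h. */
#ifndef LINUX_FADV_ASYNC_WRITE
#define LINUX_FADV_ASYNC_WRITE	32
#endif
#ifndef LINUX_FADV_WRITE_WAIT
#define LINUX_FADV_WRITE_WAIT	33
#endif

int main(void)
{
	const size_t len = 4096;
	int fd = open("data.bin", O_RDWR);	/* illustrative file */
	char *p;

	if (fd < 0)
		return 1;
	p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		return 1;

	memset(p, 0xaa, len);			/* dirty the shared mapping */

	/* MS_ASYNC now only moves pte dirty bits into the page cache. */
	msync(p, len, MS_ASYNC);

	/*
	 * WRITE_WAIT, ASYNC_WRITE, WRITE_WAIT: wait for writeout already in
	 * flight, start writeout of everything dirty, then wait for it.
	 */
	posix_fadvise(fd, 0, len, LINUX_FADV_WRITE_WAIT);
	posix_fadvise(fd, 0, len, LINUX_FADV_ASYNC_WRITE);
	posix_fadvise(fd, 0, len, LINUX_FADV_WRITE_WAIT);

	munmap(p, len);
	close(fd);
	return 0;
}

As the fadvise.c comment stresses, none of this writes the file's metadata, so it only provides durability for applications that strictly overwrite already-instantiated disk blocks.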
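
Both slab and slob above gain kmem_cache_zalloc(), and the mm/mmap.c hunks show the intended conversion: the explicit memset() after kmem_cache_alloc() goes away. A minimal sketch for a hypothetical cache; struct foo, foo_cachep and foo_alloc() are illustrative names, not part of the patch.

#include <linux/list.h>
#include <linux/slab.h>

struct foo {				/* illustrative object */
	struct list_head list;
	int refcount;
};

static struct kmem_cache *foo_cachep;	/* set up elsewhere with kmem_cache_create() */

static struct foo *foo_alloc(gfp_t flags)
{
	/* Replaces kmem_cache_alloc() followed by memset(obj, 0, size). */
	return kmem_cache_zalloc(foo_cachep, flags);
}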
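
The new strndup_user() in mm/util.c returns either the kmalloc()ed copy or an ERR_PTR(), so callers test the result with IS_ERR()/PTR_ERR() rather than checking for NULL. A hedged sketch of a typical caller follows; the mydev names and the 64-byte limit are illustrative, and the prototype is assumed to be declared in linux/string.h (the header hunk is outside mm/).

#include <linux/err.h>
#include <linux/slab.h>
#include <linux/string.h>	/* assumed home of the strndup_user() prototype */

#define MYDEV_NAME_MAX	64	/* illustrative limit, counting the trailing NUL */

struct mydev {			/* illustrative device structure */
	char *name;
};

static long mydev_set_name(struct mydev *dev, const char __user *uname)
{
	char *name;

	name = strndup_user(uname, MYDEV_NAME_MAX);
	if (IS_ERR(name))
		return PTR_ERR(name);	/* -EFAULT, -EINVAL or -ENOMEM */

	kfree(dev->name);
	dev->name = name;		/* dev now owns the kmalloc()ed copy */
	return 0;
}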
