From aca50bd3b4c4bb5528a1878158ba7abce41de534 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 23 Apr 2012 11:14:50 -0700 Subject: mm: fix s390 BUG by __set_page_dirty_no_writeback on swap Mel reports a BUG_ON(slot == NULL) in radix_tree_tag_set() on s390 3.0.13: called from __set_page_dirty_nobuffers() when page_remove_rmap() tries to transfer dirty flag from s390 storage key to struct page and radix_tree. That would be because of reclaim's shrink_page_list() calling add_to_swap() on this page at the same time: first PageSwapCache is set (causing page_mapping(page) to appear as &swapper_space), then page->private set, then tree_lock taken, then page inserted into radix_tree - so there's an interval before taking the lock when the radix_tree slot is empty. We could fix this by moving __add_to_swap_cache()'s spin_lock_irq up before the SetPageSwapCache. But a better fix is simply to do what's five years overdue: Ken Chen introduced __set_page_dirty_no_writeback() (if !PageDirty TestSetPageDirty) for tmpfs to skip all the radix_tree overhead, and swap is just the same - it ignores the radix_tree tag, and does not participate in dirty page accounting, so should be using __set_page_dirty_no_writeback() too. s390 testing now confirms that this does indeed fix the problem. Reported-by: Mel Gorman Signed-off-by: Hugh Dickins Acked-by: Mel Gorman Cc: Andrew Morton Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Rik van Riel Cc: Ken Chen Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- mm/swap_state.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/swap_state.c b/mm/swap_state.c index 9d3dd3763cf..4c5ff7f284d 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -26,7 +26,7 @@ */ static const struct address_space_operations swap_aops = { .writepage = swap_writepage, - .set_page_dirty = __set_page_dirty_nobuffers, + .set_page_dirty = __set_page_dirty_no_writeback, .migratepage = migrate_page, }; -- cgit v1.2.3 From 4e1c2b284461fd8aa8d7b295a1e911fc4390755b Mon Sep 17 00:00:00 2001 From: David Miller Date: Wed, 25 Apr 2012 16:10:50 -0400 Subject: mm: nobootmem: Correct alloc_bootmem semantics. The comments above __alloc_bootmem_node() claim that the code will first try the allocation using 'goal' and if that fails it will try again but with the 'goal' requirement dropped. Unfortunately, this is not what the code does, so fix it to do so. This is important for nobootmem conversions to architectures such as sparc where MAX_DMA_ADDRESS is infinity. On such architectures all of the allocations done by generic spots, such as the sparse-vmemmap implementation, will pass in: __pa(MAX_DMA_ADDRESS) as the goal, and with the limit given as "-1" this will always fail unless we add the appropriate fallback logic here. Signed-off-by: David S. Miller Acked-by: Yinghai Lu Signed-off-by: Linus Torvalds --- mm/nobootmem.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 24f0fc1a56d..e53bb8a256b 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -298,13 +298,19 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, if (WARN_ON_ONCE(slab_is_available())) return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); +again: ptr = __alloc_memory_core_early(pgdat->node_id, size, align, goal, -1ULL); if (ptr) return ptr; - return __alloc_memory_core_early(MAX_NUMNODES, size, align, - goal, -1ULL); + ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, + goal, -1ULL); + if (!ptr && goal) { + goal = 0; + goto again; + } + return ptr; } void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, -- cgit v1.2.3 From ce587e65e8c669eec61df7fb1c515720302e3cc0 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 24 Apr 2012 20:22:33 +0200 Subject: mm: memcg: move pc lookup point to commit_charge() None of the callsites actually need the page_cgroup descriptor themselves, so just pass the page and do the look up in there. We already had two bugs (6568d4a 'mm: memcg: update the correct soft limit tree during migration' and 'memcg: fix Bad page state after replace_page_cache') where the passed page and pc were not referring to the same page frame. Signed-off-by: Johannes Weiner Acked-by: Hugh Dickins Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b868def9bcc..31ab9c3f017 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2476,10 +2476,10 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, struct page *page, unsigned int nr_pages, - struct page_cgroup *pc, enum charge_type ctype, bool lrucare) { + struct page_cgroup *pc = lookup_page_cgroup(page); struct zone *uninitialized_var(zone); bool was_on_lru = false; bool anon; @@ -2716,7 +2716,6 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, { struct mem_cgroup *memcg = NULL; unsigned int nr_pages = 1; - struct page_cgroup *pc; bool oom = true; int ret; @@ -2730,11 +2729,10 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, oom = false; } - pc = lookup_page_cgroup(page); ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom); if (ret == -ENOMEM) return ret; - __mem_cgroup_commit_charge(memcg, page, nr_pages, pc, ctype, false); + __mem_cgroup_commit_charge(memcg, page, nr_pages, ctype, false); return 0; } @@ -2831,16 +2829,13 @@ static void __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg, enum charge_type ctype) { - struct page_cgroup *pc; - if (mem_cgroup_disabled()) return; if (!memcg) return; cgroup_exclude_rmdir(&memcg->css); - pc = lookup_page_cgroup(page); - __mem_cgroup_commit_charge(memcg, page, 1, pc, ctype, true); + __mem_cgroup_commit_charge(memcg, page, 1, ctype, true); /* * Now swap is on-memory. This means this page may be * counted both as mem and swap....double count. @@ -3298,14 +3293,13 @@ int mem_cgroup_prepare_migration(struct page *page, * page. In the case new page is migrated but not remapped, new page's * mapcount will be finally 0 and we call uncharge in end_migration(). */ - pc = lookup_page_cgroup(newpage); if (PageAnon(page)) ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED; else if (page_is_file_cache(page)) ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; else ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; - __mem_cgroup_commit_charge(memcg, newpage, 1, pc, ctype, false); + __mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false); return ret; } @@ -3392,8 +3386,7 @@ void mem_cgroup_replace_page_cache(struct page *oldpage, * the newpage may be on LRU(or pagevec for LRU) already. We lock * LRU while we overwrite pc->mem_cgroup. */ - pc = lookup_page_cgroup(newpage); - __mem_cgroup_commit_charge(memcg, newpage, 1, pc, type, true); + __mem_cgroup_commit_charge(memcg, newpage, 1, type, true); } #ifdef CONFIG_DEBUG_VM -- cgit v1.2.3 From b1c12cbcd0a02527c180a862e8971e249d3b347d Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 25 Apr 2012 16:01:46 -0700 Subject: mm/hugetlb: fix warning in alloc_huge_page/dequeue_huge_page_vma Fix a gcc warning (and bug?) introduced in cc9a6c877 ("cpuset: mm: reduce large amounts of memory barrier related damage v3") Local variable "page" can be uninitialized if the nodemask from vma policy does not intersects with nodemask from cpuset. Even if it doesn't happens it is better to initialize this variable explicitly than to introduce a kernel oops in a weird corner case. mm/hugetlb.c: In function `alloc_huge_page': mm/hugetlb.c:1135:5: warning: `page' may be used uninitialized in this function Signed-off-by: Konstantin Khlebnikov Acked-by: Mel Gorman Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index cd65cb19c94..5a16423a512 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -532,7 +532,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address, int avoid_reserve) { - struct page *page; + struct page *page = NULL; struct mempolicy *mpol; nodemask_t *nodemask; struct zonelist *zonelist; -- cgit v1.2.3 From 904249aa68010c8e223263c922fcbb840a3f42e4 Mon Sep 17 00:00:00 2001 From: Ying Han Date: Wed, 25 Apr 2012 16:01:48 -0700 Subject: mm: fix up the vmscan stat in vmstat The "pgsteal" stat is confusing because it counts both direct reclaim as well as background reclaim. However, we have "kswapd_steal" which also counts background reclaim value. This patch fixes it and also makes it match the existng "pgscan_" stats. Test: pgsteal_kswapd_dma32 447623 pgsteal_kswapd_normal 42272677 pgsteal_kswapd_movable 0 pgsteal_direct_dma32 2801 pgsteal_direct_normal 44353270 pgsteal_direct_movable 0 Signed-off-by: Ying Han Reviewed-by: Rik van Riel Acked-by: Christoph Lameter Cc: Johannes Weiner Cc: Michal Hocko Cc: Mel Gorman Acked-by: KAMEZAWA Hiroyuki Cc: Hillf Danton Cc: Hugh Dickins Cc: Dan Magenheimer Reviewed-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vm_event_item.h | 5 +++-- mm/vmscan.c | 11 ++++++++--- mm/vmstat.c | 4 ++-- 3 files changed, 13 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 03b90cdc192..06f8e385825 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -26,13 +26,14 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, PGFREE, PGACTIVATE, PGDEACTIVATE, PGFAULT, PGMAJFAULT, FOR_ALL_ZONES(PGREFILL), - FOR_ALL_ZONES(PGSTEAL), + FOR_ALL_ZONES(PGSTEAL_KSWAPD), + FOR_ALL_ZONES(PGSTEAL_DIRECT), FOR_ALL_ZONES(PGSCAN_KSWAPD), FOR_ALL_ZONES(PGSCAN_DIRECT), #ifdef CONFIG_NUMA PGSCAN_ZONE_RECLAIM_FAILED, #endif - PGINODESTEAL, SLABS_SCANNED, KSWAPD_STEAL, KSWAPD_INODESTEAL, + PGINODESTEAL, SLABS_SCANNED, KSWAPD_INODESTEAL, KSWAPD_LOW_WMARK_HIT_QUICKLY, KSWAPD_HIGH_WMARK_HIT_QUICKLY, KSWAPD_SKIP_CONGESTION_WAIT, PAGEOUTRUN, ALLOCSTALL, PGROTATED, diff --git a/mm/vmscan.c b/mm/vmscan.c index 1a518684a32..33dc256033b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1568,9 +1568,14 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, reclaim_stat->recent_scanned[0] += nr_anon; reclaim_stat->recent_scanned[1] += nr_file; - if (current_is_kswapd()) - __count_vm_events(KSWAPD_STEAL, nr_reclaimed); - __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed); + if (global_reclaim(sc)) { + if (current_is_kswapd()) + __count_zone_vm_events(PGSTEAL_KSWAPD, zone, + nr_reclaimed); + else + __count_zone_vm_events(PGSTEAL_DIRECT, zone, + nr_reclaimed); + } putback_inactive_pages(mz, &page_list); diff --git a/mm/vmstat.c b/mm/vmstat.c index f600557a765..7db1b9bab49 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -738,7 +738,8 @@ const char * const vmstat_text[] = { "pgmajfault", TEXTS_FOR_ZONES("pgrefill") - TEXTS_FOR_ZONES("pgsteal") + TEXTS_FOR_ZONES("pgsteal_kswapd") + TEXTS_FOR_ZONES("pgsteal_direct") TEXTS_FOR_ZONES("pgscan_kswapd") TEXTS_FOR_ZONES("pgscan_direct") @@ -747,7 +748,6 @@ const char * const vmstat_text[] = { #endif "pginodesteal", "slabs_scanned", - "kswapd_steal", "kswapd_inodesteal", "kswapd_low_wmark_hit_quickly", "kswapd_high_wmark_hit_quickly", -- cgit v1.2.3 From f2a9ef880763d7fbd657a3af646e132a90d70d34 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Wed, 25 Apr 2012 16:01:52 -0700 Subject: mm: fix NULL ptr dereference in migrate_pages Commit 3268c63 ("mm: fix move/migrate_pages() race on task struct") has added an odd construct where 'mm' is checked for being NULL, and if it is, it would get dereferenced anyways by mput()ing it. This would lead to the following NULL ptr deref and BUG() when calling migrate_pages() with a pid that has no mm struct: [25904.193704] BUG: unable to handle kernel NULL pointer dereference at 0000000000000050 [25904.194235] IP: [] mmput+0x27/0xf0 [25904.194235] PGD 773e6067 PUD 77da0067 PMD 0 [25904.194235] Oops: 0002 [#1] PREEMPT SMP [25904.194235] CPU 2 [25904.194235] Pid: 31608, comm: trinity Tainted: G W 3.4.0-rc2-next-20120412-sasha #69 [25904.194235] RIP: 0010:[] [] mmput+0x27/0xf0 [25904.194235] RSP: 0018:ffff880077d49e08 EFLAGS: 00010202 [25904.194235] RAX: 0000000000000286 RBX: 0000000000000000 RCX: 0000000000000000 [25904.194235] RDX: ffff880075ef8000 RSI: 000000000000023d RDI: 0000000000000286 [25904.194235] RBP: ffff880077d49e18 R08: 0000000000000001 R09: 0000000000000001 [25904.194235] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 [25904.194235] R13: 00000000ffffffea R14: ffff880034287740 R15: ffff8800218d3010 [25904.194235] FS: 00007fc8b244c700(0000) GS:ffff880029800000(0000) knlGS:0000000000000000 [25904.194235] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [25904.194235] CR2: 0000000000000050 CR3: 00000000767c6000 CR4: 00000000000406e0 [25904.194235] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [25904.194235] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 [25904.194235] Process trinity (pid: 31608, threadinfo ffff880077d48000, task ffff880075ef8000) [25904.194235] Stack: [25904.194235] ffff8800342876c0 0000000000000000 ffff880077d49f78 ffffffff811b8020 [25904.194235] ffffffff811b7d91 ffff880075ef8000 ffff88002256d200 0000000000000000 [25904.194235] 00000000000003ff 0000000000000000 0000000000000000 0000000000000000 [25904.194235] Call Trace: [25904.194235] [] sys_migrate_pages+0x340/0x3a0 [25904.194235] [] ? sys_migrate_pages+0xb1/0x3a0 [25904.194235] [] system_call_fastpath+0x16/0x1b [25904.194235] Code: c9 c3 66 90 55 31 d2 48 89 e5 be 3d 02 00 00 48 83 ec 10 48 89 1c 24 4c 89 64 24 08 48 89 fb 48 c7 c7 cf 0e e1 82 e8 69 18 03 00 ff 4b 50 0f 94 c0 84 c0 0f 84 aa 00 00 00 48 89 df e8 72 f1 [25904.194235] RIP [] mmput+0x27/0xf0 [25904.194235] RSP [25904.194235] CR2: 0000000000000050 [25904.348999] ---[ end trace a307b3ed40206b4b ]--- Signed-off-by: Sasha Levin Cc: Dave Hansen Cc: Mel Gorman Cc: Johannes Weiner Cc: KOSAKI Motohiro Cc: KAMEZAWA Hiroyuki Cc: Hugh Dickins Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index cfb6c867875..b1956913752 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1361,11 +1361,14 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, mm = get_task_mm(task); put_task_struct(task); - if (mm) - err = do_migrate_pages(mm, old, new, - capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE); - else + + if (!mm) { err = -EINVAL; + goto out; + } + + err = do_migrate_pages(mm, old, new, + capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE); mmput(mm); out: -- cgit v1.2.3 From 6e8b09eaf268bceac0c62e389b4bc0cb83dfb8e5 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Wed, 25 Apr 2012 16:01:53 -0700 Subject: mm: fix NULL ptr dereference in move_pages Commit 3268c63 ("mm: fix move/migrate_pages() race on task struct") has added an odd construct where 'mm' is checked for being NULL, and if it is, it would get dereferenced anyways by mput()ing it. Signed-off-by: Sasha Levin Cc: Dave Hansen Cc: Mel Gorman Cc: Johannes Weiner Cc: KOSAKI Motohiro Cc: KAMEZAWA Hiroyuki Cc: Hugh Dickins Acked-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 51c08a0c6f6..11072383ae1 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1388,14 +1388,14 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages, mm = get_task_mm(task); put_task_struct(task); - if (mm) { - if (nodes) - err = do_pages_move(mm, task_nodes, nr_pages, pages, - nodes, status, flags); - else - err = do_pages_stat(mm, nr_pages, pages, status); - } else - err = -EINVAL; + if (!mm) + return -EINVAL; + + if (nodes) + err = do_pages_move(mm, task_nodes, nr_pages, pages, + nodes, status, flags); + else + err = do_pages_stat(mm, nr_pages, pages, status); mmput(mm); return err; -- cgit v1.2.3