From d88c0922fa0e2c021a028b310a641126c6d4b7dc Mon Sep 17 00:00:00 2001
From: Michel Lespinasse
Date: Tue, 2 Nov 2010 13:05:18 -0700
Subject: Release page reference during page fault retry

This slipped by when unifying the filemap and swap versions of
lock_page_or_retry()...

Signed-off-by: Michel Lespinasse
Acked-by: Rik van Riel
Signed-off-by: Linus Torvalds
---
 mm/filemap.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'mm/filemap.c')

diff --git a/mm/filemap.c b/mm/filemap.c
index 75572b5f2374..61ba5e405791 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1563,8 +1563,10 @@ retry_find:
                 goto no_cached_page;
         }
 
-        if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags))
+        if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
+                page_cache_release(page);
                 return ret | VM_FAULT_RETRY;
+        }
 
         /* Did it get truncated? */
         if (unlikely(page->mapping != mapping)) {
--
cgit v1.2.3


From 8d056cb965b8fb7c53c564abf28b1962d1061cd3 Mon Sep 17 00:00:00 2001
From: Dave Hansen
Date: Thu, 11 Nov 2010 14:05:15 -0800
Subject: mm/vfs: revalidate page->mapping in do_generic_file_read()

70 hours into some stress tests of a 2.6.32-based enterprise kernel, we
ran into a NULL dereference in here:

int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc,
                                        unsigned long from)
{
---->   struct inode *inode = page->mapping->host;

It looks like page->mapping was the culprit.  (xmon trace is below).

After closer examination, I realized that do_generic_file_read() does a
find_get_page(), and eventually locks the page before calling
block_is_partially_uptodate().  However, it doesn't revalidate the
page->mapping after the page is locked.  So, there's a small window
between the find_get_page() and ->is_partially_uptodate() where the page
could get truncated and page->mapping cleared.

We _have_ a reference, so it can't get reclaimed, but it certainly
can be truncated.

I think the correct thing is to check page->mapping after the
trylock_page(), and jump out if it got truncated.  This patch has been
running in the test environment for a month or so now, and we have not
seen this bug pop up again.
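As a minimal sketch of the pattern just described (an editorial illustration, not part of the commit: the helper name is hypothetical, and the real fix is the three-line check added in the diff below), the lock-then-revalidate step looks roughly like this:

/*
 * Sketch only: take the page lock, then recheck page->mapping, which
 * truncation clears.  Mirrors the check added to do_generic_file_read().
 */
static struct page *get_locked_page_revalidated(struct address_space *mapping,
                                                pgoff_t index)
{
        struct page *page = find_get_page(mapping, index);

        if (!page)
                return NULL;
        lock_page(page);
        if (!page->mapping) {   /* truncated while we waited for the lock */
                unlock_page(page);
                page_cache_release(page);
                return NULL;
        }
        return page;
}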
xmon info:

1f:mon> e
cpu 0x1f: Vector: 300 (Data Access) at [c0000002ae36f770]
    pc: c0000000001e7a6c: .block_is_partially_uptodate+0xc/0x100
    lr: c000000000142944: .generic_file_aio_read+0x1e4/0x770
    sp: c0000002ae36f9f0
   msr: 8000000000009032
   dar: 0
 dsisr: 40000000
  current = 0xc000000378f99e30
  paca    = 0xc000000000f66300
    pid   = 21946, comm = bash
1f:mon> r
R00 = 0025c0500000006d   R16 = 0000000000000000
R01 = c0000002ae36f9f0   R17 = c000000362cd3af0
R02 = c000000000e8cd80   R18 = ffffffffffffffff
R03 = c0000000031d0f88   R19 = 0000000000000001
R04 = c0000002ae36fa68   R20 = c0000003bb97b8a0
R05 = 0000000000000000   R21 = c0000002ae36fa68
R06 = 0000000000000000   R22 = 0000000000000000
R07 = 0000000000000001   R23 = c0000002ae36fbb0
R08 = 0000000000000002   R24 = 0000000000000000
R09 = 0000000000000000   R25 = c000000362cd3a80
R10 = 0000000000000000   R26 = 0000000000000002
R11 = c0000000001e7b60   R27 = 0000000000000000
R12 = 0000000042000484   R28 = 0000000000000001
R13 = c000000000f66300   R29 = c0000003bb97b9b8
R14 = 0000000000000001   R30 = c000000000e28a08
R15 = 000000000000ffff   R31 = c0000000031d0f88
pc  = c0000000001e7a6c   .block_is_partially_uptodate+0xc/0x100
lr  = c000000000142944   .generic_file_aio_read+0x1e4/0x770
msr = 8000000000009032   cr  = 22000488
ctr = c0000000001e7a60   xer = 0000000020000000   trap = 300
dar = 0000000000000000   dsisr = 40000000
1f:mon> t
[link register   ] c000000000142944 .generic_file_aio_read+0x1e4/0x770
[c0000002ae36f9f0] c000000000142a14 .generic_file_aio_read+0x2b4/0x770 (unreliable)
[c0000002ae36fb40] c0000000001b03e4 .do_sync_read+0xd4/0x160
[c0000002ae36fce0] c0000000001b153c .vfs_read+0xec/0x1f0
[c0000002ae36fd80] c0000000001b1768 .SyS_read+0x58/0xb0
[c0000002ae36fe30] c00000000000852c syscall_exit+0x0/0x40
--- Exception: c00 (System Call) at 00000080a840bc54
SP (fffca15df30) is in userspace
1f:mon> di c0000000001e7a6c
c0000000001e7a6c  e9290000   ld      r9,0(r9)
c0000000001e7a70  418200c0   beq     c0000000001e7b30   # .block_is_partially_uptodate+0xd0/0x100
c0000000001e7a74  e9440008   ld      r10,8(r4)
c0000000001e7a78  78a80020   clrldi  r8,r5,32
c0000000001e7a7c  3c000001   lis     r0,1
c0000000001e7a80  812900a8   lwz     r9,168(r9)
c0000000001e7a84  39600001   li      r11,1
c0000000001e7a88  7c080050   subf    r0,r8,r0
c0000000001e7a8c  7f805040   cmplw   cr7,r0,r10
c0000000001e7a90  7d6b4830   slw     r11,r11,r9
c0000000001e7a94  796b0020   clrldi  r11,r11,32
c0000000001e7a98  419d00a8   bgt     cr7,c0000000001e7b40   # .block_is_partially_uptodate+0xe0/0x100
c0000000001e7a9c  7fa55840   cmpld   cr7,r5,r11
c0000000001e7aa0  7d004214   add     r8,r0,r8
c0000000001e7aa4  79080020   clrldi  r8,r8,32
c0000000001e7aa8  419c0078   blt     cr7,c0000000001e7b20   # .block_is_partially_uptodate+0xc0/0x100

Signed-off-by: Dave Hansen
Reviewed-by: Minchan Kim
Reviewed-by: Johannes Weiner
Acked-by: Rik van Riel
Cc:
Cc:
Cc: Christoph Hellwig
Cc: Al Viro
Cc: Minchan Kim
Cc:
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/filemap.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'mm/filemap.c')

diff --git a/mm/filemap.c b/mm/filemap.c
index 61ba5e405791..4ee2e998e937 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1029,6 +1029,9 @@ find_page:
                         goto page_not_up_to_date;
                 if (!trylock_page(page))
                         goto page_not_up_to_date;
+                /* Did it get truncated before we got the lock? */
+                if (!page->mapping)
+                        goto page_not_up_to_date_locked;
                 if (!mapping->a_ops->is_partially_uptodate(page,
                                                         desc, offset))
                         goto page_not_up_to_date_locked;
--
cgit v1.2.3


From 27d20fddc8af539464fc3ba499d6a830054c3bd6 Mon Sep 17 00:00:00 2001
From: Nick Piggin
Date: Thu, 11 Nov 2010 14:05:19 -0800
Subject: radix-tree: fix RCU bug

Salman Qazi describes the following radix-tree bug:

In the following case, we can get a deadlock:

0.  The radix tree contains two items, one has the index 0.
1.  The reader (in this case find_get_pages) takes the rcu_read_lock.
2.  The reader acquires slot(s) for item(s) including the index 0 item.
3.  The non-zero index item is deleted, and as a consequence the other
    item is moved to the root of the tree.  The place where it used to
    be is queued for deletion after the readers finish.
3b. The zero item is deleted, removing it from the direct slot; it
    remains in the rcu-delayed indirect node.
4.  The reader looks at the index 0 slot, and finds that the page has
    0 ref count.
5.  The reader looks at it again, hoping that the item will either be
    freed or the ref count will increase.  This never happens, as the
    slot it is looking at will never be updated.  Also, this slot can
    never be reclaimed because the reader is holding rcu_read_lock and
    is in an infinite loop.

The fix is to re-use the "indirect" pointer case, which already requires
a slot lookup retry, as a general "retry the lookup" bit.

Signed-off-by: Nick Piggin
Reported-by: Salman Qazi
Cc:
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/radix-tree.h | 39 +++++++++++++---------
 lib/radix-tree.c           | 83 ++++++++++++++++++++++++++++++++--------------
 mm/filemap.c               | 26 ++++++---------
 3 files changed, 91 insertions(+), 57 deletions(-)

(limited to 'mm/filemap.c')

diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index a39cbed9ee17..ab2baa5c4884 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -34,19 +34,13 @@
  * needed for RCU lookups (because root->height is unreliable). The only
  * time callers need worry about this is when doing a lookup_slot under
  * RCU.
+ *
+ * Indirect pointer in fact is also used to tag the last pointer of a node
+ * when it is shrunk, before we rcu free the node. See shrink code for
+ * details.
  */
 #define RADIX_TREE_INDIRECT_PTR 1
-#define RADIX_TREE_RETRY ((void *)-1UL)
-
-static inline void *radix_tree_ptr_to_indirect(void *ptr)
-{
-        return (void *)((unsigned long)ptr | RADIX_TREE_INDIRECT_PTR);
-}
-static inline void *radix_tree_indirect_to_ptr(void *ptr)
-{
-        return (void *)((unsigned long)ptr & ~RADIX_TREE_INDIRECT_PTR);
-}
 #define radix_tree_indirect_to_ptr(ptr) \
         radix_tree_indirect_to_ptr((void __force *)(ptr))
@@ -140,16 +134,29 @@ do { \
  * removed.
  *
  * For use with radix_tree_lookup_slot().  Caller must hold tree at least read
- * locked across slot lookup and dereference. More likely, will be used with
- * radix_tree_replace_slot(), as well, so caller will hold tree write locked.
+ * locked across slot lookup and dereference. Not required if write lock is
+ * held (ie. items cannot be concurrently inserted).
+ *
+ * radix_tree_deref_retry must be used to confirm validity of the pointer if
+ * only the read lock is held.
  */
 static inline void *radix_tree_deref_slot(void **pslot)
 {
-        void *ret = rcu_dereference(*pslot);
-        if (unlikely(radix_tree_is_indirect_ptr(ret)))
-                ret = RADIX_TREE_RETRY;
-        return ret;
+        return rcu_dereference(*pslot);
 }
+
+/**
+ * radix_tree_deref_retry - check radix_tree_deref_slot
+ * @arg:        pointer returned by radix_tree_deref_slot
+ * Returns:     0 if retry is not required, otherwise retry is required
+ *
+ * radix_tree_deref_retry must be used with radix_tree_deref_slot.
+ */
+static inline int radix_tree_deref_retry(void *arg)
+{
+        return unlikely((unsigned long)arg & RADIX_TREE_INDIRECT_PTR);
+}
+
 /**
  * radix_tree_replace_slot - replace item in a slot
  * @pslot:      pointer to slot, returned by radix_tree_lookup_slot
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 6f412ab4c24f..5086bb962b4d 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -82,6 +82,16 @@ struct radix_tree_preload {
 };
 static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, };
 
+static inline void *ptr_to_indirect(void *ptr)
+{
+        return (void *)((unsigned long)ptr | RADIX_TREE_INDIRECT_PTR);
+}
+
+static inline void *indirect_to_ptr(void *ptr)
+{
+        return (void *)((unsigned long)ptr & ~RADIX_TREE_INDIRECT_PTR);
+}
+
 static inline gfp_t root_gfp_mask(struct radix_tree_root *root)
 {
         return root->gfp_mask & __GFP_BITS_MASK;
@@ -265,7 +275,7 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index)
                         return -ENOMEM;
 
                 /* Increase the height.  */
-                node->slots[0] = radix_tree_indirect_to_ptr(root->rnode);
+                node->slots[0] = indirect_to_ptr(root->rnode);
 
                 /* Propagate the aggregated tag info into the new root */
                 for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
@@ -276,7 +286,7 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index)
                 newheight = root->height+1;
                 node->height = newheight;
                 node->count = 1;
-                node = radix_tree_ptr_to_indirect(node);
+                node = ptr_to_indirect(node);
                 rcu_assign_pointer(root->rnode, node);
                 root->height = newheight;
         } while (height > root->height);
@@ -309,7 +319,7 @@ int radix_tree_insert(struct radix_tree_root *root,
                         return error;
         }
 
-        slot = radix_tree_indirect_to_ptr(root->rnode);
+        slot = indirect_to_ptr(root->rnode);
 
         height = root->height;
         shift = (height-1) * RADIX_TREE_MAP_SHIFT;
@@ -325,8 +335,7 @@ int radix_tree_insert(struct radix_tree_root *root,
                                 rcu_assign_pointer(node->slots[offset], slot);
                                 node->count++;
                         } else
-                                rcu_assign_pointer(root->rnode,
-                                        radix_tree_ptr_to_indirect(slot));
+                                rcu_assign_pointer(root->rnode, ptr_to_indirect(slot));
                 }
 
                 /* Go a level down */
@@ -374,7 +383,7 @@ static void *radix_tree_lookup_element(struct radix_tree_root *root,
                         return NULL;
                 return is_slot ? (void *)&root->rnode : node;
         }
-        node = radix_tree_indirect_to_ptr(node);
+        node = indirect_to_ptr(node);
 
         height = node->height;
         if (index > radix_tree_maxindex(height))
@@ -393,7 +402,7 @@ static void *radix_tree_lookup_element(struct radix_tree_root *root,
                 height--;
         } while (height > 0);
 
-        return is_slot ? (void *)slot:node;
+        return is_slot ? (void *)slot : indirect_to_ptr(node);
 }
 
 /**
@@ -455,7 +464,7 @@ void *radix_tree_tag_set(struct radix_tree_root *root,
         height = root->height;
         BUG_ON(index > radix_tree_maxindex(height));
 
-        slot = radix_tree_indirect_to_ptr(root->rnode);
+        slot = indirect_to_ptr(root->rnode);
         shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
 
         while (height > 0) {
@@ -509,7 +518,7 @@ void *radix_tree_tag_clear(struct radix_tree_root *root,
 
         shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
         pathp->node = NULL;
-        slot = radix_tree_indirect_to_ptr(root->rnode);
+        slot = indirect_to_ptr(root->rnode);
 
         while (height > 0) {
                 int offset;
@@ -579,7 +588,7 @@ int radix_tree_tag_get(struct radix_tree_root *root,
 
         if (!radix_tree_is_indirect_ptr(node))
                 return (index == 0);
-        node = radix_tree_indirect_to_ptr(node);
+        node = indirect_to_ptr(node);
 
         height = node->height;
         if (index > radix_tree_maxindex(height))
@@ -666,7 +675,7 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root,
         }
 
         shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
-        slot = radix_tree_indirect_to_ptr(root->rnode);
+        slot = indirect_to_ptr(root->rnode);
 
         /*
          * we fill the path from (root->height - 2) to 0, leaving the index at
@@ -897,7 +906,7 @@ radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
                 results[0] = node;
                 return 1;
         }
-        node = radix_tree_indirect_to_ptr(node);
+        node = indirect_to_ptr(node);
 
         max_index = radix_tree_maxindex(node->height);
 
@@ -916,7 +925,8 @@ radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
                         slot = *(((void ***)results)[ret + i]);
                         if (!slot)
                                 continue;
-                        results[ret + nr_found] = rcu_dereference_raw(slot);
+                        results[ret + nr_found] =
+                                indirect_to_ptr(rcu_dereference_raw(slot));
                         nr_found++;
                 }
                 ret += nr_found;
@@ -965,7 +975,7 @@ radix_tree_gang_lookup_slot(struct radix_tree_root *root, void ***results,
                 results[0] = (void **)&root->rnode;
                 return 1;
         }
-        node = radix_tree_indirect_to_ptr(node);
+        node = indirect_to_ptr(node);
 
         max_index = radix_tree_maxindex(node->height);
 
@@ -1090,7 +1100,7 @@ radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
                 results[0] = node;
                 return 1;
         }
-        node = radix_tree_indirect_to_ptr(node);
+        node = indirect_to_ptr(node);
 
         max_index = radix_tree_maxindex(node->height);
 
@@ -1109,7 +1119,8 @@ radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
                         slot = *(((void ***)results)[ret + i]);
                         if (!slot)
                                 continue;
-                        results[ret + nr_found] = rcu_dereference_raw(slot);
+                        results[ret + nr_found] =
+                                indirect_to_ptr(rcu_dereference_raw(slot));
                         nr_found++;
                 }
                 ret += nr_found;
@@ -1159,7 +1170,7 @@ radix_tree_gang_lookup_tag_slot(struct radix_tree_root *root, void ***results,
                 results[0] = (void **)&root->rnode;
                 return 1;
         }
-        node = radix_tree_indirect_to_ptr(node);
+        node = indirect_to_ptr(node);
 
         max_index = radix_tree_maxindex(node->height);
 
@@ -1195,7 +1206,7 @@ static inline void radix_tree_shrink(struct radix_tree_root *root)
                 void *newptr;
 
                 BUG_ON(!radix_tree_is_indirect_ptr(to_free));
-                to_free = radix_tree_indirect_to_ptr(to_free);
+                to_free = indirect_to_ptr(to_free);
 
                 /*
                  * The candidate node has more than one child, or its child
@@ -1208,16 +1219,39 @@ static inline void radix_tree_shrink(struct radix_tree_root *root)
 
                 /*
                  * We don't need rcu_assign_pointer(), since we are simply
-                 * moving the node from one part of the tree to another. If
-                 * it was safe to dereference the old pointer to it
+                 * moving the node from one part of the tree to another: if it
+                 * was safe to dereference the old pointer to it
                  * (to_free->slots[0]), it will be safe to dereference the new
-                 * one (root->rnode).
+                 * one (root->rnode) as far as dependent read barriers go.
                  */
                 newptr = to_free->slots[0];
                 if (root->height > 1)
-                        newptr = radix_tree_ptr_to_indirect(newptr);
+                        newptr = ptr_to_indirect(newptr);
                 root->rnode = newptr;
                 root->height--;
+
+                /*
+                 * We have a dilemma here. The node's slot[0] must not be
+                 * NULLed in case there are concurrent lookups expecting to
+                 * find the item. However if this was a bottom-level node,
+                 * then it may be subject to the slot pointer being visible
+                 * to callers dereferencing it. If item corresponding to
+                 * slot[0] is subsequently deleted, these callers would expect
+                 * their slot to become empty sooner or later.
+                 *
+                 * For example, lockless pagecache will look up a slot, deref
+                 * the page pointer, and if the page is 0 refcount it means it
+                 * was concurrently deleted from pagecache so try the deref
+                 * again. Fortunately there is already a requirement for logic
+                 * to retry the entire slot lookup -- the indirect pointer
+                 * problem (replacing direct root node with an indirect pointer
+                 * also results in a stale slot). So tag the slot as indirect
+                 * to force callers to retry.
+                 */
+                if (root->height == 0)
+                        *((unsigned long *)&to_free->slots[0]) |=
+                                                RADIX_TREE_INDIRECT_PTR;
+
                 radix_tree_node_free(to_free);
         }
 }
@@ -1254,7 +1288,7 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
                 root->rnode = NULL;
                 goto out;
         }
-        slot = radix_tree_indirect_to_ptr(slot);
+        slot = indirect_to_ptr(slot);
 
         shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
         pathp->node = NULL;
@@ -1296,8 +1330,7 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
                         radix_tree_node_free(to_free);
 
                 if (pathp->node->count) {
-                        if (pathp->node ==
-                                        radix_tree_indirect_to_ptr(root->rnode))
+                        if (pathp->node == indirect_to_ptr(root->rnode))
                                 radix_tree_shrink(root);
                         goto out;
                 }
diff --git a/mm/filemap.c b/mm/filemap.c
index 4ee2e998e937..ea89840fc65f 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -644,7 +644,9 @@ repeat:
         pagep = radix_tree_lookup_slot(&mapping->page_tree, offset);
         if (pagep) {
                 page = radix_tree_deref_slot(pagep);
-                if (unlikely(!page || page == RADIX_TREE_RETRY))
+                if (unlikely(!page))
+                        goto out;
+                if (radix_tree_deref_retry(page))
                         goto repeat;
 
                 if (!page_cache_get_speculative(page))
@@ -660,6 +662,7 @@ repeat:
                         goto repeat;
                 }
         }
+out:
         rcu_read_unlock();
 
         return page;
@@ -777,12 +780,11 @@ repeat:
                 page = radix_tree_deref_slot((void **)pages[i]);
                 if (unlikely(!page))
                         continue;
-                /*
-                 * this can only trigger if nr_found == 1, making livelock
-                 * a non issue.
-                 */
-                if (unlikely(page == RADIX_TREE_RETRY))
+                if (radix_tree_deref_retry(page)) {
+                        if (ret)
+                                start = pages[ret-1]->index;
                         goto restart;
+                }
 
                 if (!page_cache_get_speculative(page))
                         goto repeat;
@@ -830,11 +832,7 @@ repeat:
                 page = radix_tree_deref_slot((void **)pages[i]);
                 if (unlikely(!page))
                         continue;
-                /*
-                 * this can only trigger if nr_found == 1, making livelock
-                 * a non issue.
-                 */
-                if (unlikely(page == RADIX_TREE_RETRY))
+                if (radix_tree_deref_retry(page))
                         goto restart;
 
                 if (page->mapping == NULL || page->index != index)
@@ -887,11 +885,7 @@ repeat:
                 page = radix_tree_deref_slot((void **)pages[i]);
                 if (unlikely(!page))
                         continue;
-                /*
-                 * this can only trigger if nr_found == 1, making livelock
-                 * a non issue.
-                 */
-                if (unlikely(page == RADIX_TREE_RETRY))
+                if (radix_tree_deref_retry(page))
                         goto restart;
 
                 if (!page_cache_get_speculative(page))
--
cgit v1.2.3
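For reference, the lookup/retry discipline that radix_tree_deref_slot() and radix_tree_deref_retry() establish can be sketched as below. This is a minimal illustration that mirrors find_get_page() as it looks after the change above; the wrapper function name is hypothetical, and the real callers are the mm/filemap.c hunks in the diff.

/*
 * Sketch: an RCU-side lookup must restart from the top whenever
 * radix_tree_deref_retry() reports a stale slot, rather than spinning
 * on the old slot forever (the livelock described in the changelog).
 */
static struct page *lookup_page_rcu(struct address_space *mapping, pgoff_t offset)
{
        struct page *page;
        void **pagep;

        rcu_read_lock();
repeat:
        page = NULL;
        pagep = radix_tree_lookup_slot(&mapping->page_tree, offset);
        if (pagep) {
                page = radix_tree_deref_slot(pagep);
                if (unlikely(!page))
                        goto out;
                if (radix_tree_deref_retry(page))
                        goto repeat;    /* slot was invalidated, redo the lookup */

                if (!page_cache_get_speculative(page))
                        goto repeat;

                /* The page may have been freed and reused; recheck the slot. */
                if (unlikely(page != *pagep)) {
                        page_cache_release(page);
                        goto repeat;
                }
        }
out:
        rcu_read_unlock();
        return page;
}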