From e43699d4b4c5d9ecbcb5998cdcbada060981171f Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 5 May 2015 15:21:27 +0100 Subject: Btrfs: fix crash after inode cache writeback failure If the writeback of an inode cache failed we were unnecessarilly attempting to release again the delalloc metadata that we previously reserved. However attempting to do this a second time triggers an assertion at drop_outstanding_extent() because we have no more outstanding extents for our inode cache's inode. If we were able to start writeback of the cache the reserved metadata space is released at btrfs_finished_ordered_io(), even if an error happens during writeback. So make sure we don't repeat the metadata space release if writeback started for our inode cache. This issue was trivial to reproduce by running the fstest btrfs/088 with "-o inode_cache", which triggered the assertion leading to a BUG() call and requiring a reboot in order to run the remaining fstests. Trace produced by btrfs/088: [255289.385904] BTRFS: assertion failed: BTRFS_I(inode)->outstanding_extents >= num_extents, file: fs/btrfs/extent-tree.c, line: 5276 [255289.388094] ------------[ cut here ]------------ [255289.389184] kernel BUG at fs/btrfs/ctree.h:4057! [255289.390125] invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC (...) [255289.392068] Call Trace: [255289.392068] [] drop_outstanding_extent+0x3d/0x6d [btrfs] [255289.392068] [] btrfs_delalloc_release_metadata+0x54/0xe3 [btrfs] [255289.392068] [] btrfs_write_out_ino_cache+0x95/0xad [btrfs] [255289.392068] [] btrfs_save_ino_cache+0x275/0x2dc [btrfs] [255289.392068] [] commit_fs_roots.isra.12+0xaa/0x137 [btrfs] [255289.392068] [] ? trace_hardirqs_on+0xd/0xf [255289.392068] [] ? btrfs_commit_transaction+0x4b1/0x9c9 [btrfs] [255289.392068] [] ? _raw_spin_unlock+0x32/0x46 [255289.392068] [] btrfs_commit_transaction+0x4c0/0x9c9 [btrfs] (...) Signed-off-by: Filipe Manana Signed-off-by: Chris Mason --- fs/btrfs/free-space-cache.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 5e020d76fd07..9dbe5b548fa6 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -3466,6 +3466,7 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root, struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; int ret; struct btrfs_io_ctl io_ctl; + bool release_metadata = true; if (!btrfs_test_opt(root, INODE_MAP_CACHE)) return 0; @@ -3473,11 +3474,20 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root, memset(&io_ctl, 0, sizeof(io_ctl)); ret = __btrfs_write_out_cache(root, inode, ctl, NULL, &io_ctl, trans, path, 0); - if (!ret) + if (!ret) { + /* + * At this point writepages() didn't error out, so our metadata + * reservation is released when the writeback finishes, at + * inode.c:btrfs_finish_ordered_io(), regardless of it finishing + * with or without an error. + */ + release_metadata = false; ret = btrfs_wait_cache_io(root, trans, NULL, &io_ctl, path, 0); + } if (ret) { - btrfs_delalloc_release_metadata(inode, inode->i_size); + if (release_metadata) + btrfs_delalloc_release_metadata(inode, inode->i_size); #ifdef DEBUG btrfs_err(root->fs_info, "failed to write free ino cache for root %llu", -- cgit v1.2.3 From 28aeeac1dd3080db5108b7b446be69f05c470a90 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 5 May 2015 19:03:10 +0100 Subject: Btrfs: fix panic when starting bg cache writeout after IO error When waiting for the writeback of block group cache we returned immediately if there was an error during writeback without waiting for the ordered extent to complete. This left a short time window where if some other task attempts to start the writeout for the same block group cache it can attempt to add a new ordered extent, starting at the same offset (0) before the previous one is removed from the ordered tree, causing an ordered tree panic (calls BUG()). This normally doesn't happen in other write paths, such as buffered writes or direct IO writes for regular files, since before marking page ranges dirty we lock the ranges and wait for any ordered extents within the range to complete first. Fix this by making btrfs_wait_ordered_range() not return immediately if it gets an error from the writeback, waiting for all ordered extents to complete first. This issue happened often when running the fstest btrfs/088 and it's easy to trigger it by running in a loop until the panic happens: for ((i = 1; i <= 10000; i++)) do ./check btrfs/088 ; done [17156.862573] BTRFS critical (device sdc): panic in ordered_data_tree_panic:70: Inconsistency in ordered tree at offset 0 (errno=-17 Object already exists) [17156.864052] ------------[ cut here ]------------ [17156.864052] kernel BUG at fs/btrfs/ordered-data.c:70! (...) [17156.864052] Call Trace: [17156.864052] [] btrfs_add_ordered_extent+0x12/0x14 [btrfs] [17156.864052] [] run_delalloc_nocow+0x5bf/0x747 [btrfs] [17156.864052] [] run_delalloc_range+0x95/0x353 [btrfs] [17156.864052] [] writepage_delalloc.isra.16+0xb9/0x13f [btrfs] [17156.864052] [] __extent_writepage+0x129/0x1f7 [btrfs] [17156.864052] [] extent_write_cache_pages.isra.15.constprop.28+0x231/0x2f4 [btrfs] [17156.864052] [] ? __module_text_address+0x12/0x59 [17156.864052] [] ? trace_hardirqs_on+0xd/0xf [17156.864052] [] extent_writepages+0x4b/0x5c [btrfs] [17156.864052] [] ? kmem_cache_free+0x9b/0xce [17156.864052] [] ? btrfs_submit_direct+0x3fc/0x3fc [btrfs] [17156.864052] [] ? free_extent_state+0x8c/0xc1 [btrfs] [17156.864052] [] btrfs_writepages+0x28/0x2a [btrfs] [17156.864052] [] do_writepages+0x23/0x2c [17156.864052] [] __filemap_fdatawrite_range+0x5a/0x61 [17156.864052] [] filemap_fdatawrite_range+0x13/0x15 [17156.864052] [] btrfs_fdatawrite_range+0x21/0x48 [btrfs] [17156.864052] [] __btrfs_write_out_cache.isra.14+0x2d9/0x3a7 [btrfs] [17156.864052] [] ? btrfs_write_out_cache+0x41/0xdc [btrfs] [17156.864052] [] btrfs_write_out_cache+0x93/0xdc [btrfs] [17156.864052] [] ? btrfs_start_dirty_block_groups+0x13a/0x2b2 [btrfs] [17156.864052] [] btrfs_start_dirty_block_groups+0x1d9/0x2b2 [btrfs] [17156.864052] [] ? trace_hardirqs_on+0xd/0xf [17156.864052] [] btrfs_commit_transaction+0x130/0x9c9 [btrfs] [17156.864052] [] btrfs_sync_fs+0xe1/0x12d [btrfs] Signed-off-by: Filipe Manana Signed-off-by: Chris Mason --- fs/btrfs/ordered-data.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 157cc54fc634..760c4a5e096b 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -722,6 +722,7 @@ void btrfs_start_ordered_extent(struct inode *inode, int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) { int ret = 0; + int ret_wb = 0; u64 end; u64 orig_end; struct btrfs_ordered_extent *ordered; @@ -741,9 +742,14 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) if (ret) return ret; - ret = filemap_fdatawait_range(inode->i_mapping, start, orig_end); - if (ret) - return ret; + /* + * If we have a writeback error don't return immediately. Wait first + * for any ordered extents that haven't completed yet. This is to make + * sure no one can dirty the same page ranges and call writepages() + * before the ordered extents complete - to avoid failures (-EEXIST) + * when adding the new ordered extents to the ordered tree. + */ + ret_wb = filemap_fdatawait_range(inode->i_mapping, start, orig_end); end = orig_end; while (1) { @@ -767,7 +773,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) break; end--; } - return ret; + return ret_wb ? ret_wb : ret; } /* -- cgit v1.2.3 From ff1f8250a9e47c1032eb5c86ababff461a11f3a0 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 6 May 2015 16:15:09 +0100 Subject: Btrfs: fix race between block group creation and their cache writeout So creating a block group has 2 distinct phases: Phase 1 - creates the btrfs_block_group_cache item and adds it to the rbtree fs_info->block_group_cache_tree and to the corresponding list space_info->block_groups[]; Phase 2 - adds the block group item to the extent tree and corresponding items to the chunk tree. The first phase adds the block_group_cache_item to a list of pending block groups in the transaction handle, and phase 2 happens when btrfs_end_transaction() is called against the transaction handle. It happens that once phase 1 completes, other concurrent tasks that use their own transaction handle, but points to the same running transaction (struct btrfs_trans_handle->transaction), can use this block group for space allocations and therefore mark it dirty. Dirty block groups are tracked in a list belonging to the currently running transaction (struct btrfs_transaction) and not in the transaction handle (btrfs_trans_handle). This is a problem because once a task calls btrfs_commit_transaction(), it calls btrfs_start_dirty_block_groups() which will see all dirty block groups and attempt to start their writeout, including those that are still attached to the transaction handle of some concurrent task that hasn't called btrfs_end_transaction() yet - which means those block groups haven't gone through phase 2 yet and therefore when write_one_cache_group() is called, it won't find the block group items in the extent tree and abort the current transaction with -ENOENT, turning the fs into readonly mode and require a remount. Fix this by ignoring -ENOENT when looking for block group items in the extent tree when we attempt to start the writeout of the block group caches outside the critical section of the transaction commit. We will try again later during the critical section and if there we still don't find the block group item in the extent tree, we then abort the current transaction. This issue happened twice, once while running fstests btrfs/067 and once for btrfs/078, which produced the following trace: [ 3278.703014] WARNING: CPU: 7 PID: 18499 at fs/btrfs/super.c:260 __btrfs_abort_transaction+0x52/0x114 [btrfs]() [ 3278.707329] BTRFS: Transaction aborted (error -2) (...) [ 3278.731555] Call Trace: [ 3278.732396] [] dump_stack+0x4f/0x7b [ 3278.733860] [] ? console_unlock+0x361/0x3ad [ 3278.735312] [] warn_slowpath_common+0xa1/0xbb [ 3278.736874] [] ? __btrfs_abort_transaction+0x52/0x114 [btrfs] [ 3278.738302] [] warn_slowpath_fmt+0x46/0x48 [ 3278.739520] [] __btrfs_abort_transaction+0x52/0x114 [btrfs] [ 3278.741222] [] write_one_cache_group+0xae/0xbf [btrfs] [ 3278.742797] [] btrfs_start_dirty_block_groups+0x170/0x2b2 [btrfs] [ 3278.744492] [] btrfs_commit_transaction+0x130/0x9c9 [btrfs] [ 3278.746084] [] ? trace_hardirqs_on+0xd/0xf [ 3278.747249] [] btrfs_sync_file+0x313/0x387 [btrfs] [ 3278.748744] [] vfs_fsync_range+0x95/0xa4 [ 3278.749958] [] ? ret_from_sys_call+0x1d/0x58 [ 3278.751218] [] vfs_fsync+0x1c/0x1e [ 3278.754197] [] do_fsync+0x34/0x4e [ 3278.755192] [] SyS_fsync+0x10/0x14 [ 3278.756236] [] system_call_fastpath+0x12/0x17 [ 3278.757366] ---[ end trace 9a4d4df4969709aa ]--- Fixes: 1bbc621ef284 ("Btrfs: allow block group cache writeout outside critical section in commit") Signed-off-by: Filipe Manana Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 0ec8e228b89f..7effed6f2fa6 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3180,8 +3180,6 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); fail: btrfs_release_path(path); - if (ret) - btrfs_abort_transaction(trans, root, ret); return ret; } @@ -3487,8 +3485,30 @@ again: ret = 0; } } - if (!ret) + if (!ret) { ret = write_one_cache_group(trans, root, path, cache); + /* + * Our block group might still be attached to the list + * of new block groups in the transaction handle of some + * other task (struct btrfs_trans_handle->new_bgs). This + * means its block group item isn't yet in the extent + * tree. If this happens ignore the error, as we will + * try again later in the critical section of the + * transaction commit. + */ + if (ret == -ENOENT) { + ret = 0; + spin_lock(&cur_trans->dirty_bgs_lock); + if (list_empty(&cache->dirty_list)) { + list_add_tail(&cache->dirty_list, + &cur_trans->dirty_bgs); + btrfs_get_block_group(cache); + } + spin_unlock(&cur_trans->dirty_bgs_lock); + } else if (ret) { + btrfs_abort_transaction(trans, root, ret); + } + } /* if its not on the io list, we need to put the block group */ if (should_put) @@ -3597,8 +3617,11 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, ret = 0; } } - if (!ret) + if (!ret) { ret = write_one_cache_group(trans, root, path, cache); + if (ret) + btrfs_abort_transaction(trans, root, ret); + } /* if its not on the io list, we need to put the block group */ if (should_put) -- cgit v1.2.3 From 062c19e9dd692b8a78e3532f71c290520a2ab437 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 23 Apr 2015 11:28:48 +0100 Subject: Btrfs: fix race when reusing stale extent buffers that leads to BUG_ON There's a race between releasing extent buffers that are flagged as stale and recycling them that makes us it the following BUG_ON at btrfs_release_extent_buffer_page: BUG_ON(extent_buffer_under_io(eb)) The BUG_ON is triggered because the extent buffer has the flag EXTENT_BUFFER_DIRTY set as a consequence of having been reused and made dirty by another concurrent task. Here follows a sequence of steps that leads to the BUG_ON. CPU 0 CPU 1 CPU 2 path->nodes[0] == eb X X->refs == 2 (1 for the tree, 1 for the path) btrfs_header_generation(X) == current trans id flag EXTENT_BUFFER_DIRTY set on X btrfs_release_path(path) unlocks X reads eb X X->refs incremented to 3 locks eb X btrfs_del_items(X) X becomes empty clean_tree_block(X) clear EXTENT_BUFFER_DIRTY from X btrfs_del_leaf(X) unlocks X extent_buffer_get(X) X->refs incremented to 4 btrfs_free_tree_block(X) X's range is not pinned X's range added to free space cache free_extent_buffer_stale(X) lock X->refs_lock set EXTENT_BUFFER_STALE on X release_extent_buffer(X) X->refs decremented to 3 unlocks X->refs_lock btrfs_release_path() unlocks X free_extent_buffer(X) X->refs becomes 2 __btrfs_cow_block(Y) btrfs_alloc_tree_block() btrfs_reserve_extent() find_free_extent() gets offset == X->start btrfs_init_new_buffer(X->start) btrfs_find_create_tree_block(X->start) alloc_extent_buffer(X->start) find_extent_buffer(X->start) finds eb X in radix tree free_extent_buffer(X) lock X->refs_lock test X->refs == 2 test bit EXTENT_BUFFER_STALE is set test !extent_buffer_under_io(eb) increments X->refs to 3 mark_extent_buffer_accessed(X) check_buffer_tree_ref(X) --> does nothing, X->refs >= 2 and EXTENT_BUFFER_TREE_REF is set in X clear EXTENT_BUFFER_STALE from X locks X btrfs_mark_buffer_dirty() set_extent_buffer_dirty(X) check_buffer_tree_ref(X) --> does nothing, X->refs >= 2 and EXTENT_BUFFER_TREE_REF is set sets EXTENT_BUFFER_DIRTY on X test and clear EXTENT_BUFFER_TREE_REF decrements X->refs to 2 release_extent_buffer(X) decrements X->refs to 1 unlock X->refs_lock unlock X free_extent_buffer(X) lock X->refs_lock release_extent_buffer(X) decrements X->refs to 0 btrfs_release_extent_buffer_page(X) BUG_ON(extent_buffer_under_io(X)) --> EXTENT_BUFFER_DIRTY set on X Fix this by making find_extent buffer wait for any ongoing task currently executing free_extent_buffer()/free_extent_buffer_stale() if the extent buffer has the stale flag set. A more clean alternative would be to always increment the extent buffer's reference count while holding its refs_lock spinlock but find_extent_buffer is a performance critical area and that would cause lock contention whenever multiple tasks search for the same extent buffer concurrently. A build server running a SLES 12 kernel (3.12 kernel + over 450 upstream btrfs patches backported from newer kernels) was hitting this often: [1212302.461948] kernel BUG at ../fs/btrfs/extent_io.c:4507! (...) [1212302.470219] CPU: 1 PID: 19259 Comm: bs_sched Not tainted 3.12.36-38-default #1 [1212302.540792] Hardware name: Supermicro PDSM4/PDSM4, BIOS 6.00 04/17/2006 [1212302.540792] task: ffff8800e07e0100 ti: ffff8800d6412000 task.ti: ffff8800d6412000 [1212302.540792] RIP: 0010:[] [] btrfs_release_extent_buffer_page.constprop.51+0x101/0x110 [btrfs] (...) [1212302.630008] Call Trace: [1212302.630008] [] release_extent_buffer+0x3d/0xa0 [btrfs] [1212302.630008] [] btrfs_release_path+0x1d/0xa0 [btrfs] [1212302.630008] [] read_block_for_search.isra.33+0x13e/0x3a0 [btrfs] [1212302.630008] [] btrfs_search_slot+0x3f4/0xa80 [btrfs] [1212302.630008] [] lookup_inline_extent_backref+0xf8/0x630 [btrfs] [1212302.630008] [] __btrfs_free_extent+0x11d/0xc40 [btrfs] [1212302.630008] [] __btrfs_run_delayed_refs+0x394/0x11d0 [btrfs] [1212302.630008] [] btrfs_run_delayed_refs.part.66+0x69/0x280 [btrfs] [1212302.630008] [] __btrfs_end_transaction+0x2ad/0x3d0 [btrfs] [1212302.630008] [] btrfs_evict_inode+0x4a5/0x500 [btrfs] [1212302.630008] [] evict+0xa8/0x190 [1212302.630008] [] do_unlinkat+0x1a0/0x2b0 I was also able to reproduce this on a 3.19 kernel, corresponding to Chris' integration branch from about a month ago, running the following stress test on a qemu/kvm guest (with 4 virtual cpus and 16Gb of ram): while true; do mkfs.btrfs -l 4096 -f -b `expr 20 \* 1024 \* 1024 \* 1024` /dev/sdd mount /dev/sdd /mnt snapshot_cmd="btrfs subvolume snapshot -r /mnt" snapshot_cmd="$snapshot_cmd /mnt/snap_\`date +'%H_%M_%S_%N'\`" fsstress -d /mnt -n 25000 -p 8 -x "$snapshot_cmd" -X 100 umount /mnt done Which usually triggers the BUG_ON within less than 24 hours: [49558.618097] ------------[ cut here ]------------ [49558.619732] kernel BUG at fs/btrfs/extent_io.c:4551! (...) [49558.620031] CPU: 3 PID: 23908 Comm: fsstress Tainted: G W 3.19.0-btrfs-next-7+ #3 [49558.620031] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.7.5-0-ge51488c-20140602_164612-nilsson.home.kraxel.org 04/01/2014 [49558.620031] task: ffff8800319fc0d0 ti: ffff880220da8000 task.ti: ffff880220da8000 [49558.620031] RIP: 0010:[] [] btrfs_release_extent_buffer_page+0x20/0xe9 [btrfs] (...) [49558.620031] Call Trace: [49558.620031] [] release_extent_buffer+0x90/0xd3 [btrfs] [49558.620031] [] ? _raw_spin_lock+0x3b/0x43 [49558.620031] [] ? free_extent_buffer+0x37/0x94 [btrfs] [49558.620031] [] free_extent_buffer+0x90/0x94 [btrfs] [49558.620031] [] btrfs_release_path+0x4a/0x69 [btrfs] [49558.620031] [] __btrfs_free_extent+0x778/0x80c [btrfs] [49558.620031] [] __btrfs_run_delayed_refs+0xad2/0xc62 [btrfs] [49558.728054] [] ? kmemleak_alloc_recursive.constprop.52+0x16/0x18 [49558.728054] [] btrfs_run_delayed_refs+0x6d/0x1ba [btrfs] [49558.728054] [] ? join_transaction.isra.9+0xb9/0x36b [btrfs] [49558.728054] [] btrfs_commit_transaction+0x4c/0x981 [btrfs] [49558.728054] [] btrfs_sync_fs+0xd5/0x10d [btrfs] [49558.728054] [] ? iterate_supers+0x60/0xc4 [49558.728054] [] ? do_sync_work+0x91/0x91 [49558.728054] [] sync_fs_one_sb+0x20/0x22 [49558.728054] [] iterate_supers+0x76/0xc4 [49558.728054] [] sys_sync+0x55/0x83 [49558.728054] [] system_call_fastpath+0x12/0x17 Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/extent_io.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 43af5a61ad25..c32d226bfecc 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4772,6 +4772,25 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, start >> PAGE_CACHE_SHIFT); if (eb && atomic_inc_not_zero(&eb->refs)) { rcu_read_unlock(); + /* + * Lock our eb's refs_lock to avoid races with + * free_extent_buffer. When we get our eb it might be flagged + * with EXTENT_BUFFER_STALE and another task running + * free_extent_buffer might have seen that flag set, + * eb->refs == 2, that the buffer isn't under IO (dirty and + * writeback flags not set) and it's still in the tree (flag + * EXTENT_BUFFER_TREE_REF set), therefore being in the process + * of decrementing the extent buffer's reference count twice. + * So here we could race and increment the eb's reference count, + * clear its stale flag, mark it as dirty and drop our reference + * before the other task finishes executing free_extent_buffer, + * which would later result in an attempt to free an extent + * buffer that is dirty. + */ + if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) { + spin_lock(&eb->refs_lock); + spin_unlock(&eb->refs_lock); + } mark_extent_buffer_accessed(eb, NULL); return eb; } -- cgit v1.2.3