From b4be6aefa73c9a6899ef3ba9c5faaa8a66e333ef Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 18 Feb 2022 14:56:10 -0500 Subject: btrfs: do not start relocation until in progress drops are done We hit a bug with a recovering relocation on mount for one of our file systems in production. I reproduced this locally by injecting errors into snapshot delete with balance running at the same time. This presented as an error while looking up an extent item WARNING: CPU: 5 PID: 1501 at fs/btrfs/extent-tree.c:866 lookup_inline_extent_backref+0x647/0x680 CPU: 5 PID: 1501 Comm: btrfs-balance Not tainted 5.16.0-rc8+ #8 RIP: 0010:lookup_inline_extent_backref+0x647/0x680 RSP: 0018:ffffae0a023ab960 EFLAGS: 00010202 RAX: 0000000000000001 RBX: 0000000000000000 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 000000000000000c RDI: 0000000000000000 RBP: ffff943fd2a39b60 R08: 0000000000000000 R09: 0000000000000001 R10: 0001434088152de0 R11: 0000000000000000 R12: 0000000001d05000 R13: ffff943fd2a39b60 R14: ffff943fdb96f2a0 R15: ffff9442fc923000 FS: 0000000000000000(0000) GS:ffff944e9eb40000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f1157b1fca8 CR3: 000000010f092000 CR4: 0000000000350ee0 Call Trace: insert_inline_extent_backref+0x46/0xd0 __btrfs_inc_extent_ref.isra.0+0x5f/0x200 ? btrfs_merge_delayed_refs+0x164/0x190 __btrfs_run_delayed_refs+0x561/0xfa0 ? btrfs_search_slot+0x7b4/0xb30 ? btrfs_update_root+0x1a9/0x2c0 btrfs_run_delayed_refs+0x73/0x1f0 ? btrfs_update_root+0x1a9/0x2c0 btrfs_commit_transaction+0x50/0xa50 ? btrfs_update_reloc_root+0x122/0x220 prepare_to_merge+0x29f/0x320 relocate_block_group+0x2b8/0x550 btrfs_relocate_block_group+0x1a6/0x350 btrfs_relocate_chunk+0x27/0xe0 btrfs_balance+0x777/0xe60 balance_kthread+0x35/0x50 ? btrfs_balance+0xe60/0xe60 kthread+0x16b/0x190 ? set_kthread_struct+0x40/0x40 ret_from_fork+0x22/0x30 Normally snapshot deletion and relocation are excluded from running at the same time by the fs_info->cleaner_mutex. However if we had a pending balance waiting to get the ->cleaner_mutex, and a snapshot deletion was running, and then the box crashed, we would come up in a state where we have a half deleted snapshot. Again, in the normal case the snapshot deletion needs to complete before relocation can start, but in this case relocation could very well start before the snapshot deletion completes, as we simply add the root to the dead roots list and wait for the next time the cleaner runs to clean up the snapshot. Fix this by setting a bit on the fs_info if we have any DEAD_ROOT's that had a pending drop_progress key. If they do then we know we were in the middle of the drop operation and set a flag on the fs_info. Then balance can wait until this flag is cleared to start up again. If there are DEAD_ROOT's that don't have a drop_progress set then we're safe to start balance right away as we'll be properly protected by the cleaner_mutex. CC: stable@vger.kernel.org # 5.10+ Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/transaction.c | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/transaction.c') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index c3cfdfd8de9b..f17bf3764ce8 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1319,6 +1319,32 @@ again: return 0; } +/* + * If we had a pending drop we need to see if there are any others left in our + * dead roots list, and if not clear our bit and wake any waiters. + */ +void btrfs_maybe_wake_unfinished_drop(struct btrfs_fs_info *fs_info) +{ + /* + * We put the drop in progress roots at the front of the list, so if the + * first entry doesn't have UNFINISHED_DROP set we can wake everybody + * up. + */ + spin_lock(&fs_info->trans_lock); + if (!list_empty(&fs_info->dead_roots)) { + struct btrfs_root *root = list_first_entry(&fs_info->dead_roots, + struct btrfs_root, + root_list); + if (test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state)) { + spin_unlock(&fs_info->trans_lock); + return; + } + } + spin_unlock(&fs_info->trans_lock); + + btrfs_wake_unfinished_drop(fs_info); +} + /* * dead roots are old snapshots that need to be deleted. This allocates * a dirty root struct and adds it into the list of dead roots that need to @@ -1331,7 +1357,12 @@ void btrfs_add_dead_root(struct btrfs_root *root) spin_lock(&fs_info->trans_lock); if (list_empty(&root->root_list)) { btrfs_grab_root(root); - list_add_tail(&root->root_list, &fs_info->dead_roots); + + /* We want to process the partially complete drops first. */ + if (test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state)) + list_add(&root->root_list, &fs_info->dead_roots); + else + list_add_tail(&root->root_list, &fs_info->dead_roots); } spin_unlock(&fs_info->trans_lock); } -- cgit v1.2.3 From 5fd76bf31ccfecc06e2e6b29f8c809e934085b99 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Thu, 17 Feb 2022 15:14:43 -0800 Subject: btrfs: fix relocation crash due to premature return from btrfs_commit_transaction() We are seeing crashes similar to the following trace: [38.969182] WARNING: CPU: 20 PID: 2105 at fs/btrfs/relocation.c:4070 btrfs_relocate_block_group+0x2dc/0x340 [btrfs] [38.973556] CPU: 20 PID: 2105 Comm: btrfs Not tainted 5.17.0-rc4 #54 [38.974580] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu.org 04/01/2014 [38.976539] RIP: 0010:btrfs_relocate_block_group+0x2dc/0x340 [btrfs] [38.980336] RSP: 0000:ffffb0dd42e03c20 EFLAGS: 00010206 [38.981218] RAX: ffff96cfc4ede800 RBX: ffff96cfc3ce0000 RCX: 000000000002ca14 [38.982560] RDX: 0000000000000000 RSI: 4cfd109a0bcb5d7f RDI: ffff96cfc3ce0360 [38.983619] RBP: ffff96cfc309c000 R08: 0000000000000000 R09: 0000000000000000 [38.984678] R10: ffff96cec0000001 R11: ffffe84c80000000 R12: ffff96cfc4ede800 [38.985735] R13: 0000000000000000 R14: 0000000000000000 R15: ffff96cfc3ce0360 [38.987146] FS: 00007f11c15218c0(0000) GS:ffff96d6dfb00000(0000) knlGS:0000000000000000 [38.988662] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [38.989398] CR2: 00007ffc922c8e60 CR3: 00000001147a6001 CR4: 0000000000370ee0 [38.990279] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [38.991219] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [38.992528] Call Trace: [38.992854] [38.993148] btrfs_relocate_chunk+0x27/0xe0 [btrfs] [38.993941] btrfs_balance+0x78e/0xea0 [btrfs] [38.994801] ? vsnprintf+0x33c/0x520 [38.995368] ? __kmalloc_track_caller+0x351/0x440 [38.996198] btrfs_ioctl_balance+0x2b9/0x3a0 [btrfs] [38.997084] btrfs_ioctl+0x11b0/0x2da0 [btrfs] [38.997867] ? mod_objcg_state+0xee/0x340 [38.998552] ? seq_release+0x24/0x30 [38.999184] ? proc_nr_files+0x30/0x30 [38.999654] ? call_rcu+0xc8/0x2f0 [39.000228] ? __x64_sys_ioctl+0x84/0xc0 [39.000872] ? btrfs_ioctl_get_supported_features+0x30/0x30 [btrfs] [39.001973] __x64_sys_ioctl+0x84/0xc0 [39.002566] do_syscall_64+0x3a/0x80 [39.003011] entry_SYSCALL_64_after_hwframe+0x44/0xae [39.003735] RIP: 0033:0x7f11c166959b [39.007324] RSP: 002b:00007fff2543e998 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 [39.008521] RAX: ffffffffffffffda RBX: 00007f11c1521698 RCX: 00007f11c166959b [39.009833] RDX: 00007fff2543ea40 RSI: 00000000c4009420 RDI: 0000000000000003 [39.011270] RBP: 0000000000000003 R08: 0000000000000013 R09: 00007f11c16f94e0 [39.012581] R10: 0000000000000000 R11: 0000000000000246 R12: 00007fff25440df3 [39.014046] R13: 0000000000000000 R14: 00007fff2543ea40 R15: 0000000000000001 [39.015040] [39.015418] ---[ end trace 0000000000000000 ]--- [43.131559] ------------[ cut here ]------------ [43.132234] kernel BUG at fs/btrfs/extent-tree.c:2717! [43.133031] invalid opcode: 0000 [#1] PREEMPT SMP PTI [43.133702] CPU: 1 PID: 1839 Comm: btrfs Tainted: G W 5.17.0-rc4 #54 [43.134863] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu.org 04/01/2014 [43.136426] RIP: 0010:unpin_extent_range+0x37a/0x4f0 [btrfs] [43.139913] RSP: 0000:ffffb0dd4216bc70 EFLAGS: 00010246 [43.140629] RAX: 0000000000000000 RBX: ffff96cfc34490f8 RCX: 0000000000000001 [43.141604] RDX: 0000000080000001 RSI: 0000000051d00000 RDI: 00000000ffffffff [43.142645] RBP: 0000000000000000 R08: 0000000000000000 R09: ffff96cfd07dca50 [43.143669] R10: ffff96cfc46e8a00 R11: fffffffffffec000 R12: 0000000041d00000 [43.144657] R13: ffff96cfc3ce0000 R14: ffffb0dd4216bd08 R15: 0000000000000000 [43.145686] FS: 00007f7657dd68c0(0000) GS:ffff96d6df640000(0000) knlGS:0000000000000000 [43.146808] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [43.147584] CR2: 00007f7fe81bf5b0 CR3: 00000001093ee004 CR4: 0000000000370ee0 [43.148589] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [43.149581] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [43.150559] Call Trace: [43.150904] [43.151253] btrfs_finish_extent_commit+0x88/0x290 [btrfs] [43.152127] btrfs_commit_transaction+0x74f/0xaa0 [btrfs] [43.152932] ? btrfs_attach_transaction_barrier+0x1e/0x50 [btrfs] [43.153786] btrfs_ioctl+0x1edc/0x2da0 [btrfs] [43.154475] ? __check_object_size+0x150/0x170 [43.155170] ? preempt_count_add+0x49/0xa0 [43.155753] ? __x64_sys_ioctl+0x84/0xc0 [43.156437] ? btrfs_ioctl_get_supported_features+0x30/0x30 [btrfs] [43.157456] __x64_sys_ioctl+0x84/0xc0 [43.157980] do_syscall_64+0x3a/0x80 [43.158543] entry_SYSCALL_64_after_hwframe+0x44/0xae [43.159231] RIP: 0033:0x7f7657f1e59b [43.161819] RSP: 002b:00007ffda5cd1658 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 [43.162702] RAX: ffffffffffffffda RBX: 0000000000000001 RCX: 00007f7657f1e59b [43.163526] RDX: 0000000000000000 RSI: 0000000000009408 RDI: 0000000000000003 [43.164358] RBP: 0000000000000003 R08: 0000000000000000 R09: 0000000000000000 [43.165208] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 [43.166029] R13: 00005621b91c3232 R14: 00005621b91ba580 R15: 00007ffda5cd1800 [43.166863] [43.167125] Modules linked in: btrfs blake2b_generic xor pata_acpi ata_piix libata raid6_pq scsi_mod libcrc32c virtio_net virtio_rng net_failover rng_core failover scsi_common [43.169552] ---[ end trace 0000000000000000 ]--- [43.171226] RIP: 0010:unpin_extent_range+0x37a/0x4f0 [btrfs] [43.174767] RSP: 0000:ffffb0dd4216bc70 EFLAGS: 00010246 [43.175600] RAX: 0000000000000000 RBX: ffff96cfc34490f8 RCX: 0000000000000001 [43.176468] RDX: 0000000080000001 RSI: 0000000051d00000 RDI: 00000000ffffffff [43.177357] RBP: 0000000000000000 R08: 0000000000000000 R09: ffff96cfd07dca50 [43.178271] R10: ffff96cfc46e8a00 R11: fffffffffffec000 R12: 0000000041d00000 [43.179178] R13: ffff96cfc3ce0000 R14: ffffb0dd4216bd08 R15: 0000000000000000 [43.180071] FS: 00007f7657dd68c0(0000) GS:ffff96d6df800000(0000) knlGS:0000000000000000 [43.181073] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [43.181808] CR2: 00007fe09905f010 CR3: 00000001093ee004 CR4: 0000000000370ee0 [43.182706] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [43.183591] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 We first hit the WARN_ON(rc->block_group->pinned > 0) in btrfs_relocate_block_group() and then the BUG_ON(!cache) in unpin_extent_range(). This tells us that we are exiting relocation and removing the block group with bytes still pinned for that block group. This is supposed to be impossible: the last thing relocate_block_group() does is commit the transaction to get rid of pinned extents. Commit d0c2f4fa555e ("btrfs: make concurrent fsyncs wait less when waiting for a transaction commit") introduced an optimization so that commits from fsync don't have to wait for the previous commit to unpin extents. This was only intended to affect fsync, but it inadvertently made it possible for any commit to skip waiting for the previous commit to unpin. This is because if a call to btrfs_commit_transaction() finds that another thread is already committing the transaction, it waits for the other thread to complete the commit and then returns. If that other thread was in fsync, then it completes the commit without completing the previous commit. This makes the following sequence of events possible: Thread 1____________________|Thread 2 (fsync)_____________________|Thread 3 (balance)___________________ btrfs_commit_transaction(N) | | btrfs_run_delayed_refs | | pin extents | | ... | | state = UNBLOCKED |btrfs_sync_file | | btrfs_start_transaction(N + 1) |relocate_block_group | | btrfs_join_transaction(N + 1) | btrfs_commit_transaction(N + 1) | ... | trans->state = COMMIT_START | | | btrfs_commit_transaction(N + 1) | | wait_for_commit(N + 1, COMPLETED) | wait_for_commit(N, SUPER_COMMITTED)| state = SUPER_COMMITTED | ... | btrfs_finish_extent_commit| | unpin_extent_range() | trans->state = COMPLETED | | | return | | ... | |Thread 1 isn't done, so pinned > 0 | |and we WARN | | | |btrfs_remove_block_group unpin_extent_range() | | Thread 3 removed the | | block group, so we BUG| | There are other sequences involving SUPER_COMMITTED transactions that can cause a similar outcome. We could fix this by making relocation explicitly wait for unpinning, but there may be other cases that need it. Josef mentioned ENOSPC flushing and the free space cache inode as other potential victims. Rather than playing whack-a-mole, this fix is conservative and makes all commits not in fsync wait for all previous transactions, which is what the optimization intended. Fixes: d0c2f4fa555e ("btrfs: make concurrent fsyncs wait less when waiting for a transaction commit") CC: stable@vger.kernel.org # 5.15+ Reviewed-by: Filipe Manana Signed-off-by: Omar Sandoval Signed-off-by: David Sterba --- fs/btrfs/transaction.c | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/transaction.c') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index f17bf3764ce8..1f1c25db6f6b 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -854,7 +854,37 @@ btrfs_attach_transaction_barrier(struct btrfs_root *root) static noinline void wait_for_commit(struct btrfs_transaction *commit, const enum btrfs_trans_state min_state) { - wait_event(commit->commit_wait, commit->state >= min_state); + struct btrfs_fs_info *fs_info = commit->fs_info; + u64 transid = commit->transid; + bool put = false; + + while (1) { + wait_event(commit->commit_wait, commit->state >= min_state); + if (put) + btrfs_put_transaction(commit); + + if (min_state < TRANS_STATE_COMPLETED) + break; + + /* + * A transaction isn't really completed until all of the + * previous transactions are completed, but with fsync we can + * end up with SUPER_COMMITTED transactions before a COMPLETED + * transaction. Wait for those. + */ + + spin_lock(&fs_info->trans_lock); + commit = list_first_entry_or_null(&fs_info->trans_list, + struct btrfs_transaction, + list); + if (!commit || commit->transid > transid) { + spin_unlock(&fs_info->trans_lock); + break; + } + refcount_inc(&commit->use_count); + put = true; + spin_unlock(&fs_info->trans_lock); + } } int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid) -- cgit v1.2.3