60 files changed, 3278 insertions, 3233 deletions
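The bulk of the patch below converts callers from the old struct btrfs_item based accessors (btrfs_item_size_nr(), btrfs_item_offset_nr(), btrfs_item_end_nr()) to the new slot-based ones (btrfs_item_size(), btrfs_item_offset(), btrfs_item_data_end()), and from the single fs_info->extent_root pointer to a per-bytenr global root looked up with btrfs_extent_root(). As a minimal illustrative sketch (not part of the patch), a hypothetical caller dump_extent_item() combining both new patterns, following the shape of extent_from_logical() in the diff, might look like:

/*
 * Illustrative sketch only: dump_extent_item() is a hypothetical helper,
 * shown to contrast the new root-lookup and slot-based item accessors
 * used throughout this diff.
 */
static int dump_extent_item(struct btrfs_fs_info *fs_info, u64 bytenr)
{
	/*
	 * New: the extent root is looked up per bytenr instead of using the
	 * old global fs_info->extent_root pointer.
	 */
	struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bytenr);
	struct btrfs_path *path;
	struct btrfs_key key;
	u32 item_size;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = bytenr;
	key.type = BTRFS_EXTENT_ITEM_KEY;
	key.offset = (u64)-1;

	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
	if (ret < 0)
		goto out;

	ret = btrfs_previous_extent_item(extent_root, path, 0);
	if (ret) {
		if (ret > 0)
			ret = -ENOENT;
		goto out;
	}

	/*
	 * New: item accessors take the slot number directly; the old
	 * equivalent was btrfs_item_size_nr(leaf, slot).
	 */
	item_size = btrfs_item_size(path->nodes[0], path->slots[0]);
	btrfs_debug(fs_info, "extent item at %llu has size %u", bytenr, item_size);
out:
	btrfs_free_path(path);
	return ret;
}

The same shape shows up repeatedly in the hunks below for backref.c, block-group.c and ctree.c; only the signatures visible in this diff are relied on here.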
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 3dcf9bcc2326..4188ba3fd8c3 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -27,7 +27,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ export.o tree-log.o free-space-cache.o zlib.o lzo.o zstd.o \ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ - reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ + backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \ block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \ subpage.o tree-mod-log.o diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index f735b8798ba1..c9ee579bc5a6 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -950,7 +950,7 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info, leaf = path->nodes[0]; slot = path->slots[0]; - item_size = btrfs_item_size_nr(leaf, slot); + item_size = btrfs_item_size(leaf, slot); BUG_ON(item_size < sizeof(*ei)); ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); @@ -1049,12 +1049,12 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info, * * Returns 0 on success, <0 on error, or BACKREF_FOUND_SHARED. */ -static int add_keyed_refs(struct btrfs_fs_info *fs_info, +static int add_keyed_refs(struct btrfs_root *extent_root, struct btrfs_path *path, u64 bytenr, int info_level, struct preftrees *preftrees, struct share_check *sc) { - struct btrfs_root *extent_root = fs_info->extent_root; + struct btrfs_fs_info *fs_info = extent_root->fs_info; int ret; int slot; struct extent_buffer *leaf; @@ -1170,6 +1170,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, struct ulist *roots, const u64 *extent_item_pos, struct share_check *sc, bool ignore_offset) { + struct btrfs_root *root = btrfs_extent_root(fs_info, bytenr); struct btrfs_key key; struct btrfs_path *path; struct btrfs_delayed_ref_root *delayed_refs = NULL; @@ -1203,28 +1204,26 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, if (time_seq == BTRFS_SEQ_LAST) path->skip_locking = 1; - /* - * grab both a lock on the path and a lock on the delayed ref head. - * We need both to get a consistent picture of how the refs look - * at a specified point in time - */ again: head = NULL; - ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0); + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto out; - BUG_ON(ret == 0); + if (ret == 0) { + /* This shouldn't happen, indicates a bug or fs corruption. */ + ASSERT(ret != 0); + ret = -EUCLEAN; + goto out; + } -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS if (trans && likely(trans->type != __TRANS_DUMMY) && time_seq != BTRFS_SEQ_LAST) { -#else - if (trans && time_seq != BTRFS_SEQ_LAST) { -#endif /* - * look if there are updates for this ref queued and lock the - * head + * We have a specific time_seq we care about and trans which + * means we have the path lock, we need to grab the ref head and + * lock it so we have a consistent view of the refs at the given + * time. 
*/ delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); @@ -1271,7 +1270,7 @@ again: &info_level, &preftrees, sc); if (ret) goto out; - ret = add_keyed_refs(fs_info, path, bytenr, info_level, + ret = add_keyed_refs(root, path, bytenr, info_level, &preftrees, sc); if (ret) goto out; @@ -1360,10 +1359,18 @@ again: goto out; if (!ret && extent_item_pos) { /* - * we've recorded that parent, so we must extend - * its inode list here + * We've recorded that parent, so we must extend + * its inode list here. + * + * However if there was corruption we may not + * have found an eie, return an error in this + * case. */ - BUG_ON(!eie); + ASSERT(eie); + if (!eie) { + ret = -EUCLEAN; + goto out; + } while (eie->next) eie = eie->next; eie->next = ref->inode_list; @@ -1740,6 +1747,7 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, struct btrfs_path *path, struct btrfs_key *found_key, u64 *flags_ret) { + struct btrfs_root *extent_root = btrfs_extent_root(fs_info, logical); int ret; u64 flags; u64 size = 0; @@ -1755,11 +1763,11 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, key.objectid = logical; key.offset = (u64)-1; - ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0); + ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) return ret; - ret = btrfs_previous_extent_item(fs_info->extent_root, path, 0); + ret = btrfs_previous_extent_item(extent_root, path, 0); if (ret) { if (ret > 0) ret = -ENOENT; @@ -1779,7 +1787,7 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, } eb = path->nodes[0]; - item_size = btrfs_item_size_nr(eb, path->slots[0]); + item_size = btrfs_item_size(eb, path->slots[0]); BUG_ON(item_size < sizeof(*ei)); ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); @@ -1962,7 +1970,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, extent_item_objectid); if (!search_commit_root) { - trans = btrfs_attach_transaction(fs_info->extent_root); + trans = btrfs_attach_transaction(fs_info->tree_root); if (IS_ERR(trans)) { if (PTR_ERR(trans) != -ENOENT && PTR_ERR(trans) != -EROFS) @@ -2058,7 +2066,6 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root, u64 parent = 0; int found = 0; struct extent_buffer *eb; - struct btrfs_item *item; struct btrfs_inode_ref *iref; struct btrfs_key found_key; @@ -2084,10 +2091,9 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root, } btrfs_release_path(path); - item = btrfs_item_nr(slot); iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref); - for (cur = 0; cur < btrfs_item_size(eb, item); cur += len) { + for (cur = 0; cur < btrfs_item_size(eb, slot); cur += len) { name_len = btrfs_inode_ref_name_len(eb, iref); /* path must be released before calling iterate()! 
*/ btrfs_debug(fs_root->fs_info, @@ -2143,7 +2149,7 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root, } btrfs_release_path(path); - item_size = btrfs_item_size_nr(eb, slot); + item_size = btrfs_item_size(eb, slot); ptr = btrfs_item_ptr_offset(eb, slot); cur_offset = 0; @@ -2330,6 +2336,7 @@ struct btrfs_backref_iter *btrfs_backref_iter_alloc( int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr) { struct btrfs_fs_info *fs_info = iter->fs_info; + struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bytenr); struct btrfs_path *path = iter->path; struct btrfs_extent_item *ei; struct btrfs_key key; @@ -2340,7 +2347,7 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr) key.offset = (u64)-1; iter->bytenr = bytenr; - ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0); + ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) return ret; if (ret == 0) { @@ -2364,7 +2371,7 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr) iter->item_ptr = (u32)btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); iter->end_ptr = (u32)(iter->item_ptr + - btrfs_item_size_nr(path->nodes[0], path->slots[0])); + btrfs_item_size(path->nodes[0], path->slots[0])); ei = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_extent_item); @@ -2383,7 +2390,7 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr) /* If there is no inline backref, go search for keyed backref */ if (iter->cur_ptr >= iter->end_ptr) { - ret = btrfs_next_item(fs_info->extent_root, path); + ret = btrfs_next_item(extent_root, path); /* No inline nor keyed ref */ if (ret > 0) { @@ -2404,7 +2411,7 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr) iter->cur_ptr = (u32)btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); iter->item_ptr = iter->cur_ptr; - iter->end_ptr = (u32)(iter->item_ptr + btrfs_item_size_nr( + iter->end_ptr = (u32)(iter->item_ptr + btrfs_item_size( path->nodes[0], path->slots[0])); } @@ -2427,6 +2434,7 @@ release: int btrfs_backref_iter_next(struct btrfs_backref_iter *iter) { struct extent_buffer *eb = btrfs_backref_get_eb(iter); + struct btrfs_root *extent_root; struct btrfs_path *path = iter->path; struct btrfs_extent_inline_ref *iref; int ret; @@ -2457,7 +2465,8 @@ int btrfs_backref_iter_next(struct btrfs_backref_iter *iter) } /* We're at keyed items, there is no inline item, go to the next one */ - ret = btrfs_next_item(iter->fs_info->extent_root, iter->path); + extent_root = btrfs_extent_root(iter->fs_info, iter->bytenr); + ret = btrfs_next_item(extent_root, iter->path); if (ret) return ret; @@ -2469,7 +2478,7 @@ int btrfs_backref_iter_next(struct btrfs_backref_iter *iter) iter->item_ptr = (u32)btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); iter->cur_ptr = iter->item_ptr; - iter->end_ptr = iter->item_ptr + (u32)btrfs_item_size_nr(path->nodes[0], + iter->end_ptr = iter->item_ptr + (u32)btrfs_item_size(path->nodes[0], path->slots[0]); return 0; } diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 444e9c89ff3e..1db24e6d6d90 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -514,7 +514,7 @@ static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl) { struct btrfs_block_group *block_group = caching_ctl->block_group; struct btrfs_fs_info *fs_info = block_group->fs_info; - struct btrfs_root *extent_root = fs_info->extent_root; + struct btrfs_root *extent_root; struct btrfs_path 
*path; struct extent_buffer *leaf; struct btrfs_key key; @@ -529,6 +529,7 @@ static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl) return -ENOMEM; last = max_t(u64, block_group->start, BTRFS_SUPER_INFO_OFFSET); + extent_root = btrfs_extent_root(fs_info, last); #ifdef CONFIG_BTRFS_DEBUG /* @@ -841,7 +842,7 @@ static int remove_block_group_item(struct btrfs_trans_handle *trans, struct btrfs_key key; int ret; - root = fs_info->extent_root; + root = btrfs_block_group_root(fs_info); key.objectid = block_group->start; key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; key.offset = block_group->length; @@ -1106,6 +1107,7 @@ out: struct btrfs_trans_handle *btrfs_start_trans_remove_block_group( struct btrfs_fs_info *fs_info, const u64 chunk_offset) { + struct btrfs_root *root = btrfs_block_group_root(fs_info); struct extent_map_tree *em_tree = &fs_info->mapping_tree; struct extent_map *em; struct map_lookup *map; @@ -1139,8 +1141,7 @@ struct btrfs_trans_handle *btrfs_start_trans_remove_block_group( num_items = 3 + map->num_stripes; free_extent_map(em); - return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root, - num_items); + return btrfs_start_transaction_fallback_global_rsv(root, num_items); } /* @@ -1508,7 +1509,6 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) container_of(work, struct btrfs_fs_info, reclaim_bgs_work); struct btrfs_block_group *bg; struct btrfs_space_info *space_info; - LIST_HEAD(again_list); if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) return; @@ -1585,18 +1585,14 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) div64_u64(zone_unusable * 100, bg->length)); trace_btrfs_reclaim_block_group(bg); ret = btrfs_relocate_chunk(fs_info, bg->start); - if (ret && ret != -EAGAIN) + if (ret) btrfs_err(fs_info, "error relocating chunk %llu", bg->start); next: + btrfs_put_block_group(bg); spin_lock(&fs_info->unused_bgs_lock); - if (ret == -EAGAIN && list_empty(&bg->bg_list)) - list_add_tail(&bg->bg_list, &again_list); - else - btrfs_put_block_group(bg); } - list_splice_tail(&again_list, &fs_info->reclaim_bgs); spin_unlock(&fs_info->unused_bgs_lock); mutex_unlock(&fs_info->reclaim_bgs_lock); btrfs_exclop_finish(fs_info); @@ -1678,7 +1674,7 @@ static int find_first_block_group(struct btrfs_fs_info *fs_info, struct btrfs_path *path, struct btrfs_key *key) { - struct btrfs_root *root = fs_info->extent_root; + struct btrfs_root *root = btrfs_block_group_root(fs_info); int ret; struct btrfs_key found_key; struct extent_buffer *leaf; @@ -2165,6 +2161,7 @@ static int fill_dummy_bgs(struct btrfs_fs_info *fs_info) int btrfs_read_block_groups(struct btrfs_fs_info *info) { + struct btrfs_root *root = btrfs_block_group_root(info); struct btrfs_path *path; int ret; struct btrfs_block_group *cache; @@ -2173,7 +2170,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) int need_clear = 0; u64 cache_gen; - if (!info->extent_root) + if (!root) return fill_dummy_bgs(info); key.objectid = 0; @@ -2276,7 +2273,7 @@ static int insert_block_group_item(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_block_group_item bgi; - struct btrfs_root *root; + struct btrfs_root *root = btrfs_block_group_root(fs_info); struct btrfs_key key; spin_lock(&block_group->lock); @@ -2289,7 +2286,6 @@ static int insert_block_group_item(struct btrfs_trans_handle *trans, key.offset = block_group->length; spin_unlock(&block_group->lock); - root = fs_info->extent_root; return btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi)); } @@ 
-2543,12 +2539,13 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache, { struct btrfs_fs_info *fs_info = cache->fs_info; struct btrfs_trans_handle *trans; + struct btrfs_root *root = btrfs_block_group_root(fs_info); u64 alloc_flags; int ret; bool dirty_bg_running; do { - trans = btrfs_join_transaction(fs_info->extent_root); + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) return PTR_ERR(trans); @@ -2653,7 +2650,7 @@ static int update_block_group_item(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = trans->fs_info; int ret; - struct btrfs_root *root = fs_info->extent_root; + struct btrfs_root *root = btrfs_block_group_root(fs_info); unsigned long bi; struct extent_buffer *leaf; struct btrfs_block_group_item bgi; @@ -3790,7 +3787,7 @@ static void reserve_chunk_space(struct btrfs_trans_handle *trans, } if (!ret) { - ret = btrfs_block_rsv_add(fs_info->chunk_root, + ret = btrfs_block_rsv_add(fs_info, &fs_info->chunk_block_rsv, bytes, BTRFS_RESERVE_NO_FLUSH); if (!ret) @@ -3911,9 +3908,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) list_del_init(&block_group->bg_list); btrfs_put_block_group(block_group); } - spin_unlock(&info->unused_bgs_lock); - spin_lock(&info->unused_bgs_lock); while (!list_empty(&info->reclaim_bgs)) { block_group = list_first_entry(&info->reclaim_bgs, struct btrfs_block_group, diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index 04a6226e0388..b3ee49b0b1e8 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -6,6 +6,7 @@ #include "space-info.h" #include "transaction.h" #include "block-group.h" +#include "disk-io.h" /* * HOW DO BLOCK RESERVES WORK @@ -208,7 +209,7 @@ void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info, kfree(rsv); } -int btrfs_block_rsv_add(struct btrfs_root *root, +int btrfs_block_rsv_add(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, u64 num_bytes, enum btrfs_reserve_flush_enum flush) { @@ -217,7 +218,7 @@ int btrfs_block_rsv_add(struct btrfs_root *root, if (num_bytes == 0) return 0; - ret = btrfs_reserve_metadata_bytes(root, block_rsv, num_bytes, flush); + ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, num_bytes, flush); if (!ret) btrfs_block_rsv_add_bytes(block_rsv, num_bytes, true); @@ -241,7 +242,7 @@ int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor) return ret; } -int btrfs_block_rsv_refill(struct btrfs_root *root, +int btrfs_block_rsv_refill(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, u64 min_reserved, enum btrfs_reserve_flush_enum flush) { @@ -262,7 +263,7 @@ int btrfs_block_rsv_refill(struct btrfs_root *root, if (!ret) return 0; - ret = btrfs_reserve_metadata_bytes(root, block_rsv, num_bytes, flush); + ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, num_bytes, flush); if (!ret) { btrfs_block_rsv_add_bytes(block_rsv, num_bytes, false); return 0; @@ -351,23 +352,29 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info) { struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; struct btrfs_space_info *sinfo = block_rsv->space_info; - u64 num_bytes; - unsigned min_items; + struct btrfs_root *root, *tmp; + u64 num_bytes = btrfs_root_used(&fs_info->tree_root->root_item); + unsigned int min_items = 1; /* * The global block rsv is based on the size of the extent tree, the * checksum tree and the root tree. If the fs is empty we want to set * it to a minimal amount for safety. 
+ * + * We also are going to need to modify the minimum of the tree root and + * any global roots we could touch. */ - num_bytes = btrfs_root_used(&fs_info->extent_root->root_item) + - btrfs_root_used(&fs_info->csum_root->root_item) + - btrfs_root_used(&fs_info->tree_root->root_item); - - /* - * We at a minimum are going to modify the csum root, the tree root, and - * the extent root. - */ - min_items = 3; + read_lock(&fs_info->global_root_lock); + rbtree_postorder_for_each_entry_safe(root, tmp, &fs_info->global_root_tree, + rb_node) { + if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID || + root->root_key.objectid == BTRFS_CSUM_TREE_OBJECTID || + root->root_key.objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) { + num_bytes += btrfs_root_used(&root->root_item); + min_items++; + } + } + read_unlock(&fs_info->global_root_lock); /* * But we also want to reserve enough space so we can do the fallback @@ -412,6 +419,30 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info) spin_unlock(&sinfo->lock); } +void btrfs_init_root_block_rsv(struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + + switch (root->root_key.objectid) { + case BTRFS_CSUM_TREE_OBJECTID: + case BTRFS_EXTENT_TREE_OBJECTID: + case BTRFS_FREE_SPACE_TREE_OBJECTID: + root->block_rsv = &fs_info->delayed_refs_rsv; + break; + case BTRFS_ROOT_TREE_OBJECTID: + case BTRFS_DEV_TREE_OBJECTID: + case BTRFS_QUOTA_TREE_OBJECTID: + root->block_rsv = &fs_info->global_block_rsv; + break; + case BTRFS_CHUNK_TREE_OBJECTID: + root->block_rsv = &fs_info->chunk_block_rsv; + break; + default: + root->block_rsv = NULL; + break; + } +} + void btrfs_init_global_block_rsv(struct btrfs_fs_info *fs_info) { struct btrfs_space_info *space_info; @@ -426,22 +457,6 @@ void btrfs_init_global_block_rsv(struct btrfs_fs_info *fs_info) fs_info->delayed_block_rsv.space_info = space_info; fs_info->delayed_refs_rsv.space_info = space_info; - /* - * Our various recovery options can leave us with NULL roots, so check - * here and just bail before we go dereferencing NULLs everywhere. 
- */ - if (!fs_info->extent_root || !fs_info->csum_root || - !fs_info->dev_root || !fs_info->chunk_root || !fs_info->tree_root) - return; - - fs_info->extent_root->block_rsv = &fs_info->delayed_refs_rsv; - fs_info->csum_root->block_rsv = &fs_info->delayed_refs_rsv; - fs_info->dev_root->block_rsv = &fs_info->global_block_rsv; - fs_info->tree_root->block_rsv = &fs_info->global_block_rsv; - if (fs_info->quota_root) - fs_info->quota_root->block_rsv = &fs_info->global_block_rsv; - fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv; - btrfs_update_global_block_rsv(fs_info); } @@ -467,8 +482,9 @@ static struct btrfs_block_rsv *get_block_rsv( struct btrfs_block_rsv *block_rsv = NULL; if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) || - (root == fs_info->csum_root && trans->adding_csums) || - (root == fs_info->uuid_root)) + (root == fs_info->uuid_root) || + (trans->adding_csums && + root->root_key.objectid == BTRFS_CSUM_TREE_OBJECTID)) block_rsv = trans->block_rsv; if (!block_rsv) @@ -523,7 +539,7 @@ again: block_rsv->type, ret); } try_reserve: - ret = btrfs_reserve_metadata_bytes(root, block_rsv, blocksize, + ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, blocksize, BTRFS_RESERVE_NO_FLUSH); if (!ret) return block_rsv; diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h index 0b6ae5302837..3b67ff08d434 100644 --- a/fs/btrfs/block-rsv.h +++ b/fs/btrfs/block-rsv.h @@ -50,6 +50,7 @@ struct btrfs_block_rsv { }; void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type); +void btrfs_init_root_block_rsv(struct btrfs_root *root); struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, unsigned short type); void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info, @@ -57,11 +58,11 @@ void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info, unsigned short type); void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *rsv); -int btrfs_block_rsv_add(struct btrfs_root *root, +int btrfs_block_rsv_add(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, u64 num_bytes, enum btrfs_reserve_flush_enum flush); int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor); -int btrfs_block_rsv_refill(struct btrfs_root *root, +int btrfs_block_rsv_refill(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, u64 min_reserved, enum btrfs_reserve_flush_enum flush); int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index ab2a4a52e0bb..b3e46aabc3d8 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -138,19 +138,11 @@ struct btrfs_inode { /* a local copy of root's last_log_commit */ int last_log_commit; - union { - /* - * Total number of bytes pending delalloc, used by stat to - * calculate the real block usage of the file. This is used - * only for files. - */ - u64 delalloc_bytes; - /* - * The offset of the last dir item key that was logged. - * This is used only for directories. - */ - u64 last_dir_item_offset; - }; + /* + * Total number of bytes pending delalloc, used by stat to calculate the + * real block usage of the file. This is used only for files. 
+ */ + u64 delalloc_bytes; union { /* diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 32da97c3c19d..71e5b2e9a1ba 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -96,10 +96,10 @@ static int compression_compress_pages(int type, struct list_head *ws, } } -static int compression_decompress_bio(int type, struct list_head *ws, - struct compressed_bio *cb) +static int compression_decompress_bio(struct list_head *ws, + struct compressed_bio *cb) { - switch (type) { + switch (cb->compress_type) { case BTRFS_COMPRESS_ZLIB: return zlib_decompress_bio(ws, cb); case BTRFS_COMPRESS_LZO: return lzo_decompress_bio(ws, cb); case BTRFS_COMPRESS_ZSTD: return zstd_decompress_bio(ws, cb); @@ -157,7 +157,8 @@ static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio, struct compressed_bio *cb = bio->bi_private; u8 *cb_sum = cb->sums; - if (!fs_info->csum_root || (inode->flags & BTRFS_INODE_NODATASUM)) + if ((inode->flags & BTRFS_INODE_NODATASUM) || + test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state)) return 0; shash->tfm = fs_info->csum_shash; @@ -1359,7 +1360,7 @@ static int btrfs_decompress_bio(struct compressed_bio *cb) int type = cb->compress_type; workspace = get_workspace(type, 0); - ret = compression_decompress_bio(type, workspace, cb); + ret = compression_decompress_bio(workspace, cb); put_workspace(type, workspace); return ret; diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index f704339c6b86..a7db3f6f1b7b 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -726,21 +726,23 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, } /* - * search for key in the extent_buffer. The items start at offset p, - * and they are item_size apart. + * Search for a key in the given extent_buffer. * - * the slot in the array is returned via slot, and it points to - * the place where you would insert key if it is not found in - * the array. + * The lower boundary for the search is specified by the slot number @low. Use a + * value of 0 to search over the whole extent buffer. * - * Slot may point to total number of items if the key is bigger than - * all of the keys + * The slot in the extent buffer is returned via @slot. If the key exists in the + * extent buffer, then @slot will point to the slot where the key is, otherwise + * it points to the slot where you would insert the key. + * + * Slot may point to the total number of items (i.e. one position beyond the last + * key) if the key is bigger than the last key in the extent buffer. */ -static noinline int generic_bin_search(struct extent_buffer *eb, - unsigned long p, int item_size, +static noinline int generic_bin_search(struct extent_buffer *eb, int low, const struct btrfs_key *key, int *slot) { - int low = 0; + unsigned long p; + int item_size; int high = btrfs_header_nritems(eb); int ret; const int key_size = sizeof(struct btrfs_disk_key); @@ -753,6 +755,14 @@ static noinline int generic_bin_search(struct extent_buffer *eb, return -EINVAL; } + if (btrfs_header_level(eb) == 0) { + p = offsetof(struct btrfs_leaf, items); + item_size = sizeof(struct btrfs_item); + } else { + p = offsetof(struct btrfs_node, ptrs); + item_size = sizeof(struct btrfs_key_ptr); + } + while (low < high) { unsigned long oip; unsigned long offset; @@ -791,20 +801,13 @@ static noinline int generic_bin_search(struct extent_buffer *eb, } /* - * simple bin_search frontend that does the right thing for - * leaves vs nodes + * Simple binary search on an extent buffer. 
Works for both leaves and nodes, and + * always searches over the whole range of keys (slot 0 to slot 'nritems - 1'). */ int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key, int *slot) { - if (btrfs_header_level(eb) == 0) - return generic_bin_search(eb, - offsetof(struct btrfs_leaf, items), - sizeof(struct btrfs_item), key, slot); - else - return generic_bin_search(eb, - offsetof(struct btrfs_node, ptrs), - sizeof(struct btrfs_key_ptr), key, slot); + return generic_bin_search(eb, 0, key, slot); } static void root_add_used(struct btrfs_root *root, u32 size) @@ -1346,33 +1349,34 @@ static noinline void unlock_up(struct btrfs_path *path, int level, { int i; int skip_level = level; - int no_skips = 0; - struct extent_buffer *t; + bool check_skip = true; for (i = level; i < BTRFS_MAX_LEVEL; i++) { if (!path->nodes[i]) break; if (!path->locks[i]) break; - if (!no_skips && path->slots[i] == 0) { - skip_level = i + 1; - continue; - } - if (!no_skips && path->keep_locks) { - u32 nritems; - t = path->nodes[i]; - nritems = btrfs_header_nritems(t); - if (nritems < 1 || path->slots[i] >= nritems - 1) { + + if (check_skip) { + if (path->slots[i] == 0) { skip_level = i + 1; continue; } + + if (path->keep_locks) { + u32 nritems; + + nritems = btrfs_header_nritems(path->nodes[i]); + if (nritems < 1 || path->slots[i] >= nritems - 1) { + skip_level = i + 1; + continue; + } + } } - if (skip_level < i && i >= lowest_unlock) - no_skips = 1; - t = path->nodes[i]; if (i >= lowest_unlock && i > skip_level) { - btrfs_tree_unlock_rw(t, path->locks[i]); + check_skip = false; + btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]); path->locks[i] = 0; if (write_lock_level && i > min_write_lock_level && @@ -1568,35 +1572,13 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root, struct btrfs_path *p, int write_lock_level) { - struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *b; - int root_lock; + int root_lock = 0; int level = 0; - /* We try very hard to do read locks on the root */ - root_lock = BTRFS_READ_LOCK; - if (p->search_commit_root) { - /* - * The commit roots are read only so we always do read locks, - * and we always must hold the commit_root_sem when doing - * searches on them, the only exception is send where we don't - * want to block transaction commits for a long time, so - * we need to clone the commit root in order to avoid races - * with transaction commits that create a snapshot of one of - * the roots used by a send operation. - */ - if (p->need_commit_sem) { - down_read(&fs_info->commit_root_sem); - b = btrfs_clone_extent_buffer(root->commit_root); - up_read(&fs_info->commit_root_sem); - if (!b) - return ERR_PTR(-ENOMEM); - - } else { - b = root->commit_root; - atomic_inc(&b->refs); - } + b = root->commit_root; + atomic_inc(&b->refs); level = btrfs_header_level(b); /* * Ensure that all callers have set skip_locking when @@ -1613,6 +1595,9 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root, goto out; } + /* We try very hard to do read locks on the root */ + root_lock = BTRFS_READ_LOCK; + /* * If the level is set to maximum, we can skip trying to get the read * lock. @@ -1639,6 +1624,17 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root, level = btrfs_header_level(b); out: + /* + * The root may have failed to write out at some point, and thus is no + * longer valid, return an error in this case. 
+ */ + if (!extent_buffer_uptodate(b)) { + if (root_lock) + btrfs_tree_unlock_rw(b, root_lock); + free_extent_buffer(b); + return ERR_PTR(-EIO); + } + p->nodes[level] = b; if (!p->skip_locking) p->locks[level] = root_lock; @@ -1648,6 +1644,191 @@ out: return b; } +/* + * Replace the extent buffer at the lowest level of the path with a cloned + * version. The purpose is to be able to use it safely, after releasing the + * commit root semaphore, even if relocation is happening in parallel, the + * transaction used for relocation is committed and the extent buffer is + * reallocated in the next transaction. + * + * This is used in a context where the caller does not prevent transaction + * commits from happening, either by holding a transaction handle or holding + * some lock, while it's doing searches through a commit root. + * At the moment it's only used for send operations. + */ +static int finish_need_commit_sem_search(struct btrfs_path *path) +{ + const int i = path->lowest_level; + const int slot = path->slots[i]; + struct extent_buffer *lowest = path->nodes[i]; + struct extent_buffer *clone; + + ASSERT(path->need_commit_sem); + + if (!lowest) + return 0; + + lockdep_assert_held_read(&lowest->fs_info->commit_root_sem); + + clone = btrfs_clone_extent_buffer(lowest); + if (!clone) + return -ENOMEM; + + btrfs_release_path(path); + path->nodes[i] = clone; + path->slots[i] = slot; + + return 0; +} + +static inline int search_for_key_slot(struct extent_buffer *eb, + int search_low_slot, + const struct btrfs_key *key, + int prev_cmp, + int *slot) +{ + /* + * If a previous call to btrfs_bin_search() on a parent node returned an + * exact match (prev_cmp == 0), we can safely assume the target key will + * always be at slot 0 on lower levels, since each key pointer + * (struct btrfs_key_ptr) refers to the lowest key accessible from the + * subtree it points to. Thus we can skip searching lower levels. + */ + if (prev_cmp == 0) { + *slot = 0; + return 0; + } + + return generic_bin_search(eb, search_low_slot, key, slot); +} + +static int search_leaf(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const struct btrfs_key *key, + struct btrfs_path *path, + int ins_len, + int prev_cmp) +{ + struct extent_buffer *leaf = path->nodes[0]; + int leaf_free_space = -1; + int search_low_slot = 0; + int ret; + bool do_bin_search = true; + + /* + * If we are doing an insertion, the leaf has enough free space and the + * destination slot for the key is not slot 0, then we can unlock our + * write lock on the parent, and any other upper nodes, before doing the + * binary search on the leaf (with search_for_key_slot()), allowing other + * tasks to lock the parent and any other upper nodes. + */ + if (ins_len > 0) { + /* + * Cache the leaf free space, since we will need it later and it + * will not change until then. + */ + leaf_free_space = btrfs_leaf_free_space(leaf); + + /* + * !path->locks[1] means we have a single node tree, the leaf is + * the root of the tree. + */ + if (path->locks[1] && leaf_free_space >= ins_len) { + struct btrfs_disk_key first_key; + + ASSERT(btrfs_header_nritems(leaf) > 0); + btrfs_item_key(leaf, &first_key, 0); + + /* + * Doing the extra comparison with the first key is cheap, + * taking into account that the first key is very likely + * already in a cache line because it immediately follows + * the extent buffer's header and we have recently accessed + * the header's level field. 
+ */ + ret = comp_keys(&first_key, key); + if (ret < 0) { + /* + * The first key is smaller than the key we want + * to insert, so we are safe to unlock all upper + * nodes and we have to do the binary search. + * + * We do use btrfs_unlock_up_safe() and not + * unlock_up() because the later does not unlock + * nodes with a slot of 0 - we can safely unlock + * any node even if its slot is 0 since in this + * case the key does not end up at slot 0 of the + * leaf and there's no need to split the leaf. + */ + btrfs_unlock_up_safe(path, 1); + search_low_slot = 1; + } else { + /* + * The first key is >= then the key we want to + * insert, so we can skip the binary search as + * the target key will be at slot 0. + * + * We can not unlock upper nodes when the key is + * less than the first key, because we will need + * to update the key at slot 0 of the parent node + * and possibly of other upper nodes too. + * If the key matches the first key, then we can + * unlock all the upper nodes, using + * btrfs_unlock_up_safe() instead of unlock_up() + * as stated above. + */ + if (ret == 0) + btrfs_unlock_up_safe(path, 1); + /* + * ret is already 0 or 1, matching the result of + * a btrfs_bin_search() call, so there is no need + * to adjust it. + */ + do_bin_search = false; + path->slots[0] = 0; + } + } + } + + if (do_bin_search) { + ret = search_for_key_slot(leaf, search_low_slot, key, + prev_cmp, &path->slots[0]); + if (ret < 0) + return ret; + } + + if (ins_len > 0) { + /* + * Item key already exists. In this case, if we are allowed to + * insert the item (for example, in dir_item case, item key + * collision is allowed), it will be merged with the original + * item. Only the item size grows, no new btrfs item will be + * added. If search_for_extension is not set, ins_len already + * accounts the size btrfs_item, deduct it here so leaf space + * check will be correct. + */ + if (ret == 0 && !path->search_for_extension) { + ASSERT(ins_len >= sizeof(struct btrfs_item)); + ins_len -= sizeof(struct btrfs_item); + } + + ASSERT(leaf_free_space >= 0); + + if (leaf_free_space < ins_len) { + int err; + + err = split_leaf(trans, root, key, path, ins_len, + (ret == 0)); + ASSERT(err <= 0); + if (WARN_ON(err > 0)) + err = -EUCLEAN; + if (err) + ret = err; + } + } + + return ret; +} /* * btrfs_search_slot - look for a key in a tree and perform necessary @@ -1684,6 +1865,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, const struct btrfs_key *key, struct btrfs_path *p, int ins_len, int cow) { + struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *b; int slot; int ret; @@ -1725,6 +1907,11 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, min_write_lock_level = write_lock_level; + if (p->need_commit_sem) { + ASSERT(p->search_commit_root); + down_read(&fs_info->commit_root_sem); + } + again: prev_cmp = -1; b = btrfs_search_slot_get_root(root, p, write_lock_level); @@ -1778,10 +1965,6 @@ again: } cow_done: p->nodes[level] = b; - /* - * Leave path with blocking locks to avoid massive - * lock context switch, this is made on purpose. 
- */ /* * we have a lock on b and as long as we aren't changing @@ -1803,62 +1986,22 @@ cow_done: } } - /* - * If btrfs_bin_search returns an exact match (prev_cmp == 0) - * we can safely assume the target key will always be in slot 0 - * on lower levels due to the invariants BTRFS' btree provides, - * namely that a btrfs_key_ptr entry always points to the - * lowest key in the child node, thus we can skip searching - * lower levels - */ - if (prev_cmp == 0) { - slot = 0; - ret = 0; - } else { - ret = btrfs_bin_search(b, key, &slot); - prev_cmp = ret; - if (ret < 0) - goto done; - } - if (level == 0) { - p->slots[level] = slot; - /* - * Item key already exists. In this case, if we are - * allowed to insert the item (for example, in dir_item - * case, item key collision is allowed), it will be - * merged with the original item. Only the item size - * grows, no new btrfs item will be added. If - * search_for_extension is not set, ins_len already - * accounts the size btrfs_item, deduct it here so leaf - * space check will be correct. - */ - if (ret == 0 && ins_len > 0 && !p->search_for_extension) { - ASSERT(ins_len >= sizeof(struct btrfs_item)); - ins_len -= sizeof(struct btrfs_item); - } - if (ins_len > 0 && - btrfs_leaf_free_space(b) < ins_len) { - if (write_lock_level < 1) { - write_lock_level = 1; - btrfs_release_path(p); - goto again; - } + if (ins_len > 0) + ASSERT(write_lock_level >= 1); - err = split_leaf(trans, root, key, - p, ins_len, ret == 0); - - BUG_ON(err > 0); - if (err) { - ret = err; - goto done; - } - } + ret = search_leaf(trans, root, key, p, ins_len, prev_cmp); if (!p->search_for_split) unlock_up(p, level, lowest_unlock, min_write_lock_level, NULL); goto done; } + + ret = search_for_key_slot(b, 0, key, prev_cmp, &slot); + if (ret < 0) + goto done; + prev_cmp = ret; + if (ret && slot > 0) { dec = 1; slot--; @@ -1919,6 +2062,16 @@ cow_done: done: if (ret < 0 && !p->skip_release_on_error) btrfs_release_path(p); + + if (p->need_commit_sem) { + int ret2; + + ret2 = finish_need_commit_sem_search(p); + up_read(&fs_info->commit_root_sem); + if (ret2) + ret = ret2; + } + return ret; } ALLOW_ERROR_INJECTION(btrfs_search_slot, ERRNO); @@ -2616,19 +2769,14 @@ static noinline int split_node(struct btrfs_trans_handle *trans, */ static int leaf_space_used(struct extent_buffer *l, int start, int nr) { - struct btrfs_item *start_item; - struct btrfs_item *end_item; int data_len; int nritems = btrfs_header_nritems(l); int end = min(nritems, start + nr) - 1; if (!nr) return 0; - start_item = btrfs_item_nr(start); - end_item = btrfs_item_nr(end); - data_len = btrfs_item_offset(l, start_item) + - btrfs_item_size(l, start_item); - data_len = data_len - btrfs_item_offset(l, end_item); + data_len = btrfs_item_offset(l, start) + btrfs_item_size(l, start); + data_len = data_len - btrfs_item_offset(l, end); data_len += sizeof(struct btrfs_item) * nr; WARN_ON(data_len < 0); return data_len; @@ -2675,7 +2823,6 @@ static noinline int __push_leaf_right(struct btrfs_path *path, u32 i; int push_space = 0; int push_items = 0; - struct btrfs_item *item; u32 nr; u32 right_nritems; u32 data_end; @@ -2692,8 +2839,6 @@ static noinline int __push_leaf_right(struct btrfs_path *path, slot = path->slots[1]; i = left_nritems - 1; while (i >= nr) { - item = btrfs_item_nr(i); - if (!empty && push_items > 0) { if (path->slots[0] > i) break; @@ -2708,12 +2853,13 @@ static noinline int __push_leaf_right(struct btrfs_path *path, if (path->slots[0] == i) push_space += data_size; - this_item_size = btrfs_item_size(left, 
item); - if (this_item_size + sizeof(*item) + push_space > free_space) + this_item_size = btrfs_item_size(left, i); + if (this_item_size + sizeof(struct btrfs_item) + + push_space > free_space) break; push_items++; - push_space += this_item_size + sizeof(*item); + push_space += this_item_size + sizeof(struct btrfs_item); if (i == 0) break; i--; @@ -2727,7 +2873,7 @@ static noinline int __push_leaf_right(struct btrfs_path *path, /* push left to right */ right_nritems = btrfs_header_nritems(right); - push_space = btrfs_item_end_nr(left, left_nritems - push_items); + push_space = btrfs_item_data_end(left, left_nritems - push_items); push_space -= leaf_data_end(left); /* make room in the right data area */ @@ -2758,9 +2904,8 @@ static noinline int __push_leaf_right(struct btrfs_path *path, btrfs_set_header_nritems(right, right_nritems); push_space = BTRFS_LEAF_DATA_SIZE(fs_info); for (i = 0; i < right_nritems; i++) { - item = btrfs_item_nr(i); - push_space -= btrfs_token_item_size(&token, item); - btrfs_set_token_item_offset(&token, item, push_space); + push_space -= btrfs_token_item_size(&token, i); + btrfs_set_token_item_offset(&token, i, push_space); } left_nritems -= push_items; @@ -2905,7 +3050,6 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, int i; int push_space = 0; int push_items = 0; - struct btrfs_item *item; u32 old_left_nritems; u32 nr; int ret = 0; @@ -2919,8 +3063,6 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, nr = min(right_nritems - 1, max_slot); for (i = 0; i < nr; i++) { - item = btrfs_item_nr(i); - if (!empty && push_items > 0) { if (path->slots[0] < i) break; @@ -2935,12 +3077,13 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, if (path->slots[0] == i) push_space += data_size; - this_item_size = btrfs_item_size(right, item); - if (this_item_size + sizeof(*item) + push_space > free_space) + this_item_size = btrfs_item_size(right, i); + if (this_item_size + sizeof(struct btrfs_item) + push_space > + free_space) break; push_items++; - push_space += this_item_size + sizeof(*item); + push_space += this_item_size + sizeof(struct btrfs_item); } if (push_items == 0) { @@ -2956,25 +3099,23 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, push_items * sizeof(struct btrfs_item)); push_space = BTRFS_LEAF_DATA_SIZE(fs_info) - - btrfs_item_offset_nr(right, push_items - 1); + btrfs_item_offset(right, push_items - 1); copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET + leaf_data_end(left) - push_space, BTRFS_LEAF_DATA_OFFSET + - btrfs_item_offset_nr(right, push_items - 1), + btrfs_item_offset(right, push_items - 1), push_space); old_left_nritems = btrfs_header_nritems(left); BUG_ON(old_left_nritems <= 0); btrfs_init_map_token(&token, left); - old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1); + old_left_item_size = btrfs_item_offset(left, old_left_nritems - 1); for (i = old_left_nritems; i < old_left_nritems + push_items; i++) { u32 ioff; - item = btrfs_item_nr(i); - - ioff = btrfs_token_item_offset(&token, item); - btrfs_set_token_item_offset(&token, item, + ioff = btrfs_token_item_offset(&token, i); + btrfs_set_token_item_offset(&token, i, ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size)); } btrfs_set_header_nritems(left, old_left_nritems + push_items); @@ -2985,7 +3126,7 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, right_nritems); if (push_items < right_nritems) { - push_space = 
btrfs_item_offset_nr(right, push_items - 1) - + push_space = btrfs_item_offset(right, push_items - 1) - leaf_data_end(right); memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) - push_space, @@ -3003,10 +3144,8 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, btrfs_set_header_nritems(right, right_nritems); push_space = BTRFS_LEAF_DATA_SIZE(fs_info); for (i = 0; i < right_nritems; i++) { - item = btrfs_item_nr(i); - - push_space = push_space - btrfs_token_item_size(&token, item); - btrfs_set_token_item_offset(&token, item, push_space); + push_space = push_space - btrfs_token_item_size(&token, i); + btrfs_set_token_item_offset(&token, i, push_space); } btrfs_mark_buffer_dirty(left); @@ -3134,7 +3273,7 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans, nritems = nritems - mid; btrfs_set_header_nritems(right, nritems); - data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(l); + data_copy_size = btrfs_item_data_end(l, mid) - leaf_data_end(l); copy_extent_buffer(right, l, btrfs_item_nr_offset(0), btrfs_item_nr_offset(mid), @@ -3145,15 +3284,14 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans, data_copy_size, BTRFS_LEAF_DATA_OFFSET + leaf_data_end(l), data_copy_size); - rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_end_nr(l, mid); + rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_data_end(l, mid); btrfs_init_map_token(&token, right); for (i = 0; i < nritems; i++) { - struct btrfs_item *item = btrfs_item_nr(i); u32 ioff; - ioff = btrfs_token_item_offset(&token, item); - btrfs_set_token_item_offset(&token, item, ioff + rt_data_off); + ioff = btrfs_token_item_offset(&token, i); + btrfs_set_token_item_offset(&token, i, ioff + rt_data_off); } btrfs_set_header_nritems(l, mid); @@ -3269,7 +3407,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, l = path->nodes[0]; slot = path->slots[0]; - if (extend && data_size + btrfs_item_size_nr(l, slot) + + if (extend && data_size + btrfs_item_size(l, slot) + sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info)) return -EOVERFLOW; @@ -3438,7 +3576,7 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, if (btrfs_leaf_free_space(leaf) >= ins_len) return 0; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); + item_size = btrfs_item_size(leaf, path->slots[0]); if (key.type == BTRFS_EXTENT_DATA_KEY) { fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); @@ -3458,7 +3596,7 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, ret = -EAGAIN; leaf = path->nodes[0]; /* if our item isn't there, return now */ - if (item_size != btrfs_item_size_nr(leaf, path->slots[0])) + if (item_size != btrfs_item_size(leaf, path->slots[0])) goto err; /* the leaf has changed, it now has room. 
return now */ @@ -3489,9 +3627,7 @@ static noinline int split_item(struct btrfs_path *path, unsigned long split_offset) { struct extent_buffer *leaf; - struct btrfs_item *item; - struct btrfs_item *new_item; - int slot; + int orig_slot, slot; char *buf; u32 nritems; u32 item_size; @@ -3501,9 +3637,9 @@ static noinline int split_item(struct btrfs_path *path, leaf = path->nodes[0]; BUG_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item)); - item = btrfs_item_nr(path->slots[0]); - orig_offset = btrfs_item_offset(leaf, item); - item_size = btrfs_item_size(leaf, item); + orig_slot = path->slots[0]; + orig_offset = btrfs_item_offset(leaf, path->slots[0]); + item_size = btrfs_item_size(leaf, path->slots[0]); buf = kmalloc(item_size, GFP_NOFS); if (!buf) @@ -3524,14 +3660,12 @@ static noinline int split_item(struct btrfs_path *path, btrfs_cpu_key_to_disk(&disk_key, new_key); btrfs_set_item_key(leaf, &disk_key, slot); - new_item = btrfs_item_nr(slot); + btrfs_set_item_offset(leaf, slot, orig_offset); + btrfs_set_item_size(leaf, slot, item_size - split_offset); - btrfs_set_item_offset(leaf, new_item, orig_offset); - btrfs_set_item_size(leaf, new_item, item_size - split_offset); - - btrfs_set_item_offset(leaf, item, - orig_offset + item_size - split_offset); - btrfs_set_item_size(leaf, item, split_offset); + btrfs_set_item_offset(leaf, orig_slot, + orig_offset + item_size - split_offset); + btrfs_set_item_size(leaf, orig_slot, split_offset); btrfs_set_header_nritems(leaf, nritems + 1); @@ -3592,7 +3726,6 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end) { int slot; struct extent_buffer *leaf; - struct btrfs_item *item; u32 nritems; unsigned int data_end; unsigned int old_data_start; @@ -3604,14 +3737,14 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end) leaf = path->nodes[0]; slot = path->slots[0]; - old_size = btrfs_item_size_nr(leaf, slot); + old_size = btrfs_item_size(leaf, slot); if (old_size == new_size) return; nritems = btrfs_header_nritems(leaf); data_end = leaf_data_end(leaf); - old_data_start = btrfs_item_offset_nr(leaf, slot); + old_data_start = btrfs_item_offset(leaf, slot); size_diff = old_size - new_size; @@ -3625,10 +3758,9 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end) btrfs_init_map_token(&token, leaf); for (i = slot; i < nritems; i++) { u32 ioff; - item = btrfs_item_nr(i); - ioff = btrfs_token_item_offset(&token, item); - btrfs_set_token_item_offset(&token, item, ioff + size_diff); + ioff = btrfs_token_item_offset(&token, i); + btrfs_set_token_item_offset(&token, i, ioff + size_diff); } /* shift the data */ @@ -3671,8 +3803,7 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end) fixup_low_keys(path, &disk_key, 1); } - item = btrfs_item_nr(slot); - btrfs_set_item_size(leaf, item, new_size); + btrfs_set_item_size(leaf, slot, new_size); btrfs_mark_buffer_dirty(leaf); if (btrfs_leaf_free_space(leaf) < 0) { @@ -3688,7 +3819,6 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size) { int slot; struct extent_buffer *leaf; - struct btrfs_item *item; u32 nritems; unsigned int data_end; unsigned int old_data; @@ -3706,7 +3836,7 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size) BUG(); } slot = path->slots[0]; - old_data = btrfs_item_end_nr(leaf, slot); + old_data = btrfs_item_data_end(leaf, slot); BUG_ON(slot < 0); if (slot >= nritems) { @@ -3723,10 +3853,9 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size) 
btrfs_init_map_token(&token, leaf); for (i = slot; i < nritems; i++) { u32 ioff; - item = btrfs_item_nr(i); - ioff = btrfs_token_item_offset(&token, item); - btrfs_set_token_item_offset(&token, item, ioff - data_size); + ioff = btrfs_token_item_offset(&token, i); + btrfs_set_token_item_offset(&token, i, ioff - data_size); } /* shift the data */ @@ -3735,9 +3864,8 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size) data_end, old_data - data_end); data_end = old_data; - old_size = btrfs_item_size_nr(leaf, slot); - item = btrfs_item_nr(slot); - btrfs_set_item_size(leaf, item, old_size + data_size); + old_size = btrfs_item_size(leaf, slot); + btrfs_set_item_size(leaf, slot, old_size + data_size); btrfs_mark_buffer_dirty(leaf); if (btrfs_leaf_free_space(leaf) < 0) { @@ -3759,7 +3887,6 @@ static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *p const struct btrfs_item_batch *batch) { struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_item *item; int i; u32 nritems; unsigned int data_end; @@ -3796,7 +3923,7 @@ static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *p btrfs_init_map_token(&token, leaf); if (slot != nritems) { - unsigned int old_data = btrfs_item_end_nr(leaf, slot); + unsigned int old_data = btrfs_item_data_end(leaf, slot); if (old_data < data_end) { btrfs_print_leaf(leaf); @@ -3812,10 +3939,9 @@ static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *p for (i = slot; i < nritems; i++) { u32 ioff; - item = btrfs_item_nr(i); - ioff = btrfs_token_item_offset(&token, item); - btrfs_set_token_item_offset(&token, item, - ioff - batch->total_data_size); + ioff = btrfs_token_item_offset(&token, i); + btrfs_set_token_item_offset(&token, i, + ioff - batch->total_data_size); } /* shift the items */ memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + batch->nr), @@ -3834,10 +3960,9 @@ static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *p for (i = 0; i < batch->nr; i++) { btrfs_cpu_key_to_disk(&disk_key, &batch->keys[i]); btrfs_set_item_key(leaf, &disk_key, slot + i); - item = btrfs_item_nr(slot + i); data_end -= batch->data_sizes[i]; - btrfs_set_token_item_offset(&token, item, data_end); - btrfs_set_token_item_size(&token, item, batch->data_sizes[i]); + btrfs_set_token_item_offset(&token, slot + i, data_end); + btrfs_set_token_item_size(&token, slot + i, batch->data_sizes[i]); } btrfs_set_header_nritems(leaf, nritems + batch->nr); @@ -3944,7 +4069,7 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans, u32 item_size; leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); + item_size = btrfs_item_size(leaf, path->slots[0]); ret = setup_leaf_for_split(trans, root, path, item_size + sizeof(struct btrfs_item)); if (ret) @@ -4045,7 +4170,6 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, { struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *leaf; - struct btrfs_item *item; u32 last_off; u32 dsize = 0; int ret = 0; @@ -4054,10 +4178,10 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, u32 nritems; leaf = path->nodes[0]; - last_off = btrfs_item_offset_nr(leaf, slot + nr - 1); + last_off = btrfs_item_offset(leaf, slot + nr - 1); for (i = 0; i < nr; i++) - dsize += btrfs_item_size_nr(leaf, slot + i); + dsize += btrfs_item_size(leaf, slot + i); nritems = btrfs_header_nritems(leaf); @@ -4074,9 +4198,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct 
btrfs_root *root, for (i = slot + nr; i < nritems; i++) { u32 ioff; - item = btrfs_item_nr(i); - ioff = btrfs_token_item_offset(&token, item); - btrfs_set_token_item_offset(&token, item, ioff + dsize); + ioff = btrfs_token_item_offset(&token, i); + btrfs_set_token_item_offset(&token, i, ioff + dsize); } memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot), @@ -4403,7 +4526,9 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, int level; struct extent_buffer *c; struct extent_buffer *next; + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_key key; + bool need_commit_sem = false; u32 nritems; int ret; int i; @@ -4420,14 +4545,20 @@ again: path->keep_locks = 1; - if (time_seq) + if (time_seq) { ret = btrfs_search_old_slot(root, &key, path, time_seq); - else + } else { + if (path->need_commit_sem) { + path->need_commit_sem = 0; + need_commit_sem = true; + down_read(&fs_info->commit_root_sem); + } ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + } path->keep_locks = 0; if (ret < 0) - return ret; + goto done; nritems = btrfs_header_nritems(path->nodes[0]); /* @@ -4550,6 +4681,15 @@ again: ret = 0; done: unlock_up(path, 0, 1, 0, NULL); + if (need_commit_sem) { + int ret2; + + path->need_commit_sem = 1; + ret2 = finish_need_commit_sem_search(path); + up_read(&fs_info->commit_root_sem); + if (ret2) + ret = ret2; + } return ret; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 5fe5eccb3c87..b4a9b1c58d22 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -143,6 +143,8 @@ enum { BTRFS_FS_STATE_DEV_REPLACING, /* The btrfs_fs_info created for self-tests */ BTRFS_FS_STATE_DUMMY_FS_INFO, + + BTRFS_FS_STATE_NO_CSUMS, }; #define BTRFS_BACKREF_REV_MAX 256 @@ -511,11 +513,6 @@ struct btrfs_discard_ctl { atomic64_t discard_bytes_saved; }; -enum btrfs_orphan_cleanup_state { - ORPHAN_CLEANUP_STARTED = 1, - ORPHAN_CLEANUP_DONE = 2, -}; - void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info); /* fs_info */ @@ -553,7 +550,6 @@ struct btrfs_swapfile_pin { bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr); enum { - BTRFS_FS_BARRIER, BTRFS_FS_CLOSING_START, BTRFS_FS_CLOSING_DONE, BTRFS_FS_LOG_RECOVERING, @@ -576,7 +572,6 @@ enum { /* * Indicate that relocation of a chunk has started, it's set per chunk * and is toggled between chunks. - * Set, tested and cleared while holding fs_info::send_reloc_lock. */ BTRFS_FS_RELOC_RUNNING, @@ -601,6 +596,9 @@ enum { /* Indicate whether there are any tree modification log users */ BTRFS_FS_TREE_MOD_LOG_USERS, + /* Indicate that we want the transaction kthread to commit right now. 
*/ + BTRFS_FS_COMMIT_TRANS, + #if BITS_PER_LONG == 32 /* Indicate if we have error/warn message printed on 32bit systems */ BTRFS_FS_32BIT_ERROR, @@ -613,6 +611,7 @@ enum { */ enum btrfs_exclusive_operation { BTRFS_EXCLOP_NONE, + BTRFS_EXCLOP_BALANCE_PAUSED, BTRFS_EXCLOP_BALANCE, BTRFS_EXCLOP_DEV_ADD, BTRFS_EXCLOP_DEV_REMOVE, @@ -624,20 +623,21 @@ enum btrfs_exclusive_operation { struct btrfs_fs_info { u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; unsigned long flags; - struct btrfs_root *extent_root; struct btrfs_root *tree_root; struct btrfs_root *chunk_root; struct btrfs_root *dev_root; struct btrfs_root *fs_root; - struct btrfs_root *csum_root; struct btrfs_root *quota_root; struct btrfs_root *uuid_root; - struct btrfs_root *free_space_root; struct btrfs_root *data_reloc_root; /* the log root tree is a directory of all the other log roots */ struct btrfs_root *log_root_tree; + /* The tree that holds the global roots (csum, extent, etc) */ + rwlock_t global_root_lock; + struct rb_root global_root_tree; + spinlock_t fs_roots_radix_lock; struct radix_tree_root fs_roots_radix; @@ -673,6 +673,12 @@ struct btrfs_fs_info { u64 generation; u64 last_trans_committed; + /* + * Generation of the last transaction used for block group relocation + * since the filesystem was last mounted (or 0 if none happened yet). + * Must be written and read while holding btrfs_fs_info::commit_root_sem. + */ + u64 last_reloc_trans; u64 avg_delayed_ref_runtime; /* @@ -815,7 +821,6 @@ struct btrfs_fs_info { struct btrfs_workqueue *endio_write_workers; struct btrfs_workqueue *endio_freespace_worker; struct btrfs_workqueue *caching_workers; - struct btrfs_workqueue *readahead_workers; /* * fixup workers take dirty pages that didn't properly go through @@ -952,13 +957,6 @@ struct btrfs_fs_info { struct btrfs_delayed_root *delayed_root; - /* readahead tree */ - spinlock_t reada_lock; - struct radix_tree_root reada_tree; - - /* readahead works cnt */ - atomic_t reada_works_cnt; - /* Extent buffer radix tree */ spinlock_t buffer_lock; /* Entries are eb->start / sectorsize */ @@ -1003,13 +1001,6 @@ struct btrfs_fs_info { struct crypto_shash *csum_shash; - spinlock_t send_reloc_lock; - /* - * Number of send operations in progress. - * Updated while holding fs_info::send_reloc_lock. - */ - int send_in_progress; - /* Type of exclusive operation running, protected by super_lock */ enum btrfs_exclusive_operation exclusive_operation; @@ -1110,6 +1101,8 @@ enum { BTRFS_ROOT_HAS_LOG_TREE, /* Qgroup flushing is in progress */ BTRFS_ROOT_QGROUP_FLUSHING, + /* We started the orphan cleanup for this root. */ + BTRFS_ROOT_ORPHAN_CLEANUP, }; /* @@ -1128,6 +1121,8 @@ struct btrfs_qgroup_swapped_blocks { * and for the extent tree extent_root root. 
*/ struct btrfs_root { + struct rb_node rb_node; + struct extent_buffer *node; struct extent_buffer *commit_root; @@ -1178,8 +1173,6 @@ struct btrfs_root { spinlock_t log_extents_lock[2]; struct list_head logged_list[2]; - int orphan_cleanup_state; - spinlock_t inode_lock; /* red-black tree that keeps track of in-memory inodes */ struct rb_root inode_tree; @@ -1960,8 +1953,8 @@ static inline void btrfs_set_node_key(const struct extent_buffer *eb, } /* struct btrfs_item */ -BTRFS_SETGET_FUNCS(item_offset, struct btrfs_item, offset, 32); -BTRFS_SETGET_FUNCS(item_size, struct btrfs_item, size, 32); +BTRFS_SETGET_FUNCS(raw_item_offset, struct btrfs_item, offset, 32); +BTRFS_SETGET_FUNCS(raw_item_size, struct btrfs_item, size, 32); BTRFS_SETGET_STACK_FUNCS(stack_item_offset, struct btrfs_item, offset, 32); BTRFS_SETGET_STACK_FUNCS(stack_item_size, struct btrfs_item, size, 32); @@ -1976,25 +1969,36 @@ static inline struct btrfs_item *btrfs_item_nr(int nr) return (struct btrfs_item *)btrfs_item_nr_offset(nr); } -static inline u32 btrfs_item_end(const struct extent_buffer *eb, - struct btrfs_item *item) -{ - return btrfs_item_offset(eb, item) + btrfs_item_size(eb, item); +#define BTRFS_ITEM_SETGET_FUNCS(member) \ +static inline u32 btrfs_item_##member(const struct extent_buffer *eb, \ + int slot) \ +{ \ + return btrfs_raw_item_##member(eb, btrfs_item_nr(slot)); \ +} \ +static inline void btrfs_set_item_##member(const struct extent_buffer *eb, \ + int slot, u32 val) \ +{ \ + btrfs_set_raw_item_##member(eb, btrfs_item_nr(slot), val); \ +} \ +static inline u32 btrfs_token_item_##member(struct btrfs_map_token *token, \ + int slot) \ +{ \ + struct btrfs_item *item = btrfs_item_nr(slot); \ + return btrfs_token_raw_item_##member(token, item); \ +} \ +static inline void btrfs_set_token_item_##member(struct btrfs_map_token *token, \ + int slot, u32 val) \ +{ \ + struct btrfs_item *item = btrfs_item_nr(slot); \ + btrfs_set_token_raw_item_##member(token, item, val); \ } -static inline u32 btrfs_item_end_nr(const struct extent_buffer *eb, int nr) -{ - return btrfs_item_end(eb, btrfs_item_nr(nr)); -} - -static inline u32 btrfs_item_offset_nr(const struct extent_buffer *eb, int nr) -{ - return btrfs_item_offset(eb, btrfs_item_nr(nr)); -} +BTRFS_ITEM_SETGET_FUNCS(offset) +BTRFS_ITEM_SETGET_FUNCS(size); -static inline u32 btrfs_item_size_nr(const struct extent_buffer *eb, int nr) +static inline u32 btrfs_item_data_end(const struct extent_buffer *eb, int nr) { - return btrfs_item_size(eb, btrfs_item_nr(nr)); + return btrfs_item_offset(eb, nr) + btrfs_item_size(eb, nr); } static inline void btrfs_item_key(const struct extent_buffer *eb, @@ -2463,7 +2467,7 @@ static inline unsigned int leaf_data_end(const struct extent_buffer *leaf) if (nr == 0) return BTRFS_LEAF_DATA_SIZE(leaf->fs_info); - return btrfs_item_offset_nr(leaf, nr - 1); + return btrfs_item_offset(leaf, nr - 1); } /* struct btrfs_file_extent_item */ @@ -2522,9 +2526,9 @@ BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item, */ static inline u32 btrfs_file_extent_inline_item_len( const struct extent_buffer *eb, - struct btrfs_item *e) + int nr) { - return btrfs_item_size(eb, e) - BTRFS_FILE_EXTENT_INLINE_DATA_START; + return btrfs_item_size(eb, nr) - BTRFS_FILE_EXTENT_INLINE_DATA_START; } /* btrfs_qgroup_status_item */ @@ -2616,11 +2620,11 @@ BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right, /* helper function to cast into the data area of the leaf. 
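The BTRFS_ITEM_SETGET_FUNCS() block above switches the item accessors to take a leaf slot instead of a struct btrfs_item pointer, which is why the *_nr() wrappers disappear throughout the rest of the series. An illustrative before/after for callers (not code from the patch):

	/* Old interface: a struct btrfs_item pointer, or the _nr() wrapper. */
	struct btrfs_item *item = btrfs_item_nr(slot);
	u32 old_size = btrfs_item_size(leaf, item);
	u32 old_size_nr = btrfs_item_size_nr(leaf, slot);

	/* New interface: everything is slot based. */
	u32 size = btrfs_item_size(leaf, slot);
	u32 data_end = btrfs_item_data_end(leaf, slot);	/* was btrfs_item_end_nr() */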
*/ #define btrfs_item_ptr(leaf, slot, type) \ ((type *)(BTRFS_LEAF_DATA_OFFSET + \ - btrfs_item_offset_nr(leaf, slot))) + btrfs_item_offset(leaf, slot))) #define btrfs_item_ptr_offset(leaf, slot) \ ((unsigned long)(BTRFS_LEAF_DATA_OFFSET + \ - btrfs_item_offset_nr(leaf, slot))) + btrfs_item_offset(leaf, slot))) static inline u32 btrfs_crc32c(u32 crc, const void *address, unsigned length) { @@ -3119,36 +3123,6 @@ int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 offset); int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset); -/* inode-item.c */ -int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - const char *name, int name_len, - u64 inode_objectid, u64 ref_objectid, u64 index); -int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - const char *name, int name_len, - u64 inode_objectid, u64 ref_objectid, u64 *index); -int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, u64 objectid); -int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_path *path, - struct btrfs_key *location, int mod); - -struct btrfs_inode_extref * -btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - const char *name, int name_len, - u64 inode_objectid, u64 ref_objectid, int ins_len, - int cow); - -struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, - int slot, const char *name, - int name_len); -struct btrfs_inode_extref *btrfs_find_name_in_ext_backref( - struct extent_buffer *leaf, int slot, u64 ref_objectid, - const char *name, int name_len); /* file-item.c */ struct btrfs_dio_private; int btrfs_del_csums(struct btrfs_trans_handle *trans, @@ -3208,10 +3182,6 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry); int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, int front); -int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_inode *inode, u64 new_size, - u32 min_type, u64 *extents_found); int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context); int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, @@ -3310,6 +3280,9 @@ bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info, enum btrfs_exclusive_operation type); void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info); void btrfs_exclop_finish(struct btrfs_fs_info *fs_info); +void btrfs_exclop_balance(struct btrfs_fs_info *fs_info, + enum btrfs_exclusive_operation op); + /* file.c */ int __init btrfs_auto_defrag_init(void); @@ -3826,23 +3799,6 @@ static inline void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info) btrfs_bio_counter_sub(fs_info, 1); } -/* reada.c */ -struct reada_control { - struct btrfs_fs_info *fs_info; /* tree to prefetch */ - struct btrfs_key key_start; - struct btrfs_key key_end; /* exclusive */ - atomic_t elems; - struct kref refcnt; - wait_queue_head_t wait; -}; -struct reada_control *btrfs_reada_add(struct btrfs_root *root, - struct btrfs_key *start, struct btrfs_key *end); -int btrfs_reada_wait(void *handle); -void btrfs_reada_detach(void *handle); -int btree_readahead_hook(struct extent_buffer *eb, int err); -void btrfs_reada_remove_dev(struct btrfs_device *dev); -void btrfs_reada_undo_remove_dev(struct btrfs_device 
*dev); - static inline int is_fstree(u64 rootid) { if (rootid == BTRFS_FS_TREE_OBJECTID || diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index 40c4d6ba3fb9..fb46a28f5065 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -334,7 +334,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true); if (ret) return ret; - ret = btrfs_reserve_metadata_bytes(root, block_rsv, meta_reserve, flush); + ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, meta_reserve, flush); if (ret) { btrfs_qgroup_free_meta_prealloc(root, qgroup_reserve); return ret; diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index e164766dcc38..748bf6b0d860 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -13,6 +13,7 @@ #include "ctree.h" #include "qgroup.h" #include "locking.h" +#include "inode-item.h" #define BTRFS_DELAYED_WRITEBACK 512 #define BTRFS_DELAYED_BACKGROUND 128 @@ -629,7 +630,7 @@ static int btrfs_delayed_inode_reserve_metadata( BTRFS_QGROUP_RSV_META_PREALLOC, true); if (ret < 0) return ret; - ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes, + ret = btrfs_block_rsv_add(fs_info, dst_rsv, num_bytes, BTRFS_RESERVE_NO_FLUSH); /* NO_FLUSH could only fail with -ENOSPC */ ASSERT(ret == 0 || ret == -ENOSPC); diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index cca7e85e32dd..4176df149d04 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -84,6 +84,17 @@ void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr) u64 num_bytes = btrfs_calc_insert_metadata_size(fs_info, nr); u64 released = 0; + /* + * We have to check the mount option here because we could be enabling + * the free space tree for the first time and don't have the compat_ro + * option set yet. + * + * We need extra reservations if we have the free space tree because + * we'll have to modify that tree as well. + */ + if (btrfs_test_opt(fs_info, FREE_SPACE_TREE)) + num_bytes *= 2; + released = btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL); if (released) trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", @@ -108,6 +119,17 @@ void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans) num_bytes = btrfs_calc_insert_metadata_size(fs_info, trans->delayed_ref_updates); + /* + * We have to check the mount option here because we could be enabling + * the free space tree for the first time and don't have the compat_ro + * option set yet. + * + * We need extra reservations if we have the free space tree because + * we'll have to modify that tree as well. 
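As a rough illustration of what this doubling amounts to (assuming the usual btrfs_calc_insert_metadata_size() formula of nodesize * BTRFS_MAX_LEVEL * 2 per item, with BTRFS_MAX_LEVEL == 8):

	/* Illustrative numbers: 16K nodesize, a single delayed ref update. */
	u64 bytes = btrfs_calc_insert_metadata_size(fs_info, 1);	/* 256K */
	if (btrfs_test_opt(fs_info, FREE_SPACE_TREE))
		bytes *= 2;	/* 512K: the free space tree may need updating too */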
+ */ + if (btrfs_test_opt(fs_info, FREE_SPACE_TREE)) + num_bytes *= 2; + spin_lock(&delayed_rsv->lock); delayed_rsv->size += num_bytes; delayed_rsv->full = 0; @@ -191,8 +213,7 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, if (!num_bytes) return 0; - ret = btrfs_reserve_metadata_bytes(fs_info->extent_root, block_rsv, - num_bytes, flush); + ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, num_bytes, flush); if (ret) return ret; btrfs_block_rsv_add_bytes(block_rsv, num_bytes, 0); diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index c85a7d44da79..62b9651ea662 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -128,7 +128,7 @@ no_valid_dev_replace_entry_found: } slot = path->slots[0]; eb = path->nodes[0]; - item_size = btrfs_item_size_nr(eb, slot); + item_size = btrfs_item_size(eb, slot); ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item); if (item_size != sizeof(struct btrfs_dev_replace_item)) { @@ -322,7 +322,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); device->fs_devices = fs_info->fs_devices; - ret = btrfs_get_dev_zone_info(device); + ret = btrfs_get_dev_zone_info(device, false); if (ret) goto error; @@ -381,7 +381,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans) } if (ret == 0 && - btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) { + btrfs_item_size(path->nodes[0], path->slots[0]) < sizeof(*ptr)) { /* * need to delete old one and insert a new one. * Since no attempt is made to recover any old state, if the @@ -906,9 +906,6 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, } btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); - if (!scrub_ret) - btrfs_reada_remove_dev(src_device); - /* * We have to use this loop approach because at this point src_device * has to be available for transaction commit to complete, yet new @@ -917,7 +914,6 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, while (1) { trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { - btrfs_reada_undo_remove_dev(src_device); mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); return PTR_ERR(trans); } @@ -968,7 +964,6 @@ error: up_write(&dev_replace->rwsem); mutex_unlock(&fs_info->chunk_mutex); mutex_unlock(&fs_info->fs_devices->device_list_mutex); - btrfs_reada_undo_remove_dev(src_device); btrfs_rm_dev_replace_blocked(fs_info); if (tgt_device) btrfs_destroy_dev_replace_tgtdev(tgt_device); diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 7721ce0c0604..3b532bab0755 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -27,7 +27,6 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle struct btrfs_fs_info *fs_info = root->fs_info; int ret; char *ptr; - struct btrfs_item *item; struct extent_buffer *leaf; ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size); @@ -41,10 +40,9 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle return ERR_PTR(ret); WARN_ON(ret > 0); leaf = path->nodes[0]; - item = btrfs_item_nr(path->slots[0]); ptr = btrfs_item_ptr(leaf, path->slots[0], char); - BUG_ON(data_size > btrfs_item_size(leaf, item)); - ptr += btrfs_item_size(leaf, item) - data_size; + ASSERT(data_size <= btrfs_item_size(leaf, path->slots[0])); + ptr += btrfs_item_size(leaf, path->slots[0]) - data_size; return (struct btrfs_dir_item *)ptr; } @@ -271,7 +269,7 @@ int btrfs_check_dir_item_collision(struct 
btrfs_root *root, u64 dir, data_size = sizeof(*di) + name_len; leaf = path->nodes[0]; slot = path->slots[0]; - if (data_size + btrfs_item_size_nr(leaf, slot) + + if (data_size + btrfs_item_size(leaf, slot) + sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root->fs_info)) { ret = -EOVERFLOW; } else { @@ -409,7 +407,7 @@ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info, leaf = path->nodes[0]; dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item); - total_len = btrfs_item_size_nr(leaf, path->slots[0]); + total_len = btrfs_item_size(leaf, path->slots[0]); while (cur < total_len) { this_len = sizeof(*dir_item) + btrfs_dir_name_len(leaf, dir_item) + @@ -445,7 +443,7 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, leaf = path->nodes[0]; sub_item_len = sizeof(*di) + btrfs_dir_name_len(leaf, di) + btrfs_dir_data_len(leaf, di); - item_len = btrfs_item_size_nr(leaf, path->slots[0]); + item_len = btrfs_item_size(leaf, path->slots[0]); if (sub_item_len == item_len) { ret = btrfs_del_item(trans, root, path); } else { diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index b3f2e2232326..87a5addbedf6 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -665,9 +665,6 @@ static int validate_subpage_buffer(struct page *page, u64 start, u64 end, if (ret < 0) goto err; - if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) - btree_readahead_hook(eb, ret); - set_extent_buffer_uptodate(eb); free_extent_buffer(eb); @@ -715,10 +712,6 @@ int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio, } ret = validate_extent_buffer(eb); err: - if (reads_done && - test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) - btree_readahead_hook(eb, ret); - if (ret) { /* * our io error hook is going to dec the io pages @@ -1140,11 +1133,16 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, u64 objectid) { bool dummy = test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state); + + memset(&root->root_key, 0, sizeof(root->root_key)); + memset(&root->root_item, 0, sizeof(root->root_item)); + memset(&root->defrag_progress, 0, sizeof(root->defrag_progress)); root->fs_info = fs_info; + root->root_key.objectid = objectid; root->node = NULL; root->commit_root = NULL; root->state = 0; - root->orphan_cleanup_state = 0; + RB_CLEAR_NODE(&root->rb_node); root->last_trans = 0; root->free_objectid = 0; @@ -1152,7 +1150,8 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, root->nr_ordered_extents = 0; root->inode_tree = RB_ROOT; INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC); - root->block_rsv = NULL; + + btrfs_init_root_block_rsv(root); INIT_LIST_HEAD(&root->dirty_list); INIT_LIST_HEAD(&root->root_list); @@ -1190,6 +1189,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, root->log_transid = 0; root->log_transid_committed = -1; root->last_log_commit = 0; + root->anon_dev = 0; if (!dummy) { extent_io_tree_init(fs_info, &root->dirty_log_pages, IO_TREE_ROOT_DIRTY_LOG_PAGES, NULL); @@ -1197,12 +1197,6 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, IO_TREE_LOG_CSUM_RANGE, NULL); } - memset(&root->root_key, 0, sizeof(root->root_key)); - memset(&root->root_item, 0, sizeof(root->root_item)); - memset(&root->defrag_progress, 0, sizeof(root->defrag_progress)); - root->root_key.objectid = objectid; - root->anon_dev = 0; - spin_lock_init(&root->root_item_lock); btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks); #ifdef 
CONFIG_BTRFS_DEBUG @@ -1242,6 +1236,81 @@ struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info) } #endif +static int global_root_cmp(struct rb_node *a_node, const struct rb_node *b_node) +{ + const struct btrfs_root *a = rb_entry(a_node, struct btrfs_root, rb_node); + const struct btrfs_root *b = rb_entry(b_node, struct btrfs_root, rb_node); + + return btrfs_comp_cpu_keys(&a->root_key, &b->root_key); +} + +static int global_root_key_cmp(const void *k, const struct rb_node *node) +{ + const struct btrfs_key *key = k; + const struct btrfs_root *root = rb_entry(node, struct btrfs_root, rb_node); + + return btrfs_comp_cpu_keys(key, &root->root_key); +} + +int btrfs_global_root_insert(struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct rb_node *tmp; + + write_lock(&fs_info->global_root_lock); + tmp = rb_find_add(&root->rb_node, &fs_info->global_root_tree, global_root_cmp); + write_unlock(&fs_info->global_root_lock); + ASSERT(!tmp); + + return tmp ? -EEXIST : 0; +} + +void btrfs_global_root_delete(struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + + write_lock(&fs_info->global_root_lock); + rb_erase(&root->rb_node, &fs_info->global_root_tree); + write_unlock(&fs_info->global_root_lock); +} + +struct btrfs_root *btrfs_global_root(struct btrfs_fs_info *fs_info, + struct btrfs_key *key) +{ + struct rb_node *node; + struct btrfs_root *root = NULL; + + read_lock(&fs_info->global_root_lock); + node = rb_find(key, &fs_info->global_root_tree, global_root_key_cmp); + if (node) + root = container_of(node, struct btrfs_root, rb_node); + read_unlock(&fs_info->global_root_lock); + + return root; +} + +struct btrfs_root *btrfs_csum_root(struct btrfs_fs_info *fs_info, u64 bytenr) +{ + struct btrfs_key key = { + .objectid = BTRFS_CSUM_TREE_OBJECTID, + .type = BTRFS_ROOT_ITEM_KEY, + .offset = 0, + }; + + return btrfs_global_root(fs_info, &key); +} + +struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr) +{ + struct btrfs_key key = { + .objectid = BTRFS_EXTENT_TREE_OBJECTID, + .type = BTRFS_ROOT_ITEM_KEY, + .offset = 0, + }; + + return btrfs_global_root(fs_info, &key); +} + struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, u64 objectid) { @@ -1554,25 +1623,33 @@ static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info, u64 objectid) { + struct btrfs_key key = { + .objectid = objectid, + .type = BTRFS_ROOT_ITEM_KEY, + .offset = 0, + }; + if (objectid == BTRFS_ROOT_TREE_OBJECTID) return btrfs_grab_root(fs_info->tree_root); if (objectid == BTRFS_EXTENT_TREE_OBJECTID) - return btrfs_grab_root(fs_info->extent_root); + return btrfs_grab_root(btrfs_global_root(fs_info, &key)); if (objectid == BTRFS_CHUNK_TREE_OBJECTID) return btrfs_grab_root(fs_info->chunk_root); if (objectid == BTRFS_DEV_TREE_OBJECTID) return btrfs_grab_root(fs_info->dev_root); if (objectid == BTRFS_CSUM_TREE_OBJECTID) - return btrfs_grab_root(fs_info->csum_root); + return btrfs_grab_root(btrfs_global_root(fs_info, &key)); if (objectid == BTRFS_QUOTA_TREE_OBJECTID) return btrfs_grab_root(fs_info->quota_root) ? fs_info->quota_root : ERR_PTR(-ENOENT); if (objectid == BTRFS_UUID_TREE_OBJECTID) return btrfs_grab_root(fs_info->uuid_root) ? fs_info->uuid_root : ERR_PTR(-ENOENT); - if (objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) - return btrfs_grab_root(fs_info->free_space_root) ? 
- fs_info->free_space_root : ERR_PTR(-ENOENT); + if (objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) { + struct btrfs_root *root = btrfs_global_root(fs_info, &key); + + return btrfs_grab_root(root) ? root : ERR_PTR(-ENOENT); + } return NULL; } @@ -1619,6 +1696,18 @@ void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info) #endif } +static void free_global_roots(struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *root; + struct rb_node *node; + + while ((node = rb_first_postorder(&fs_info->global_root_tree)) != NULL) { + root = rb_entry(node, struct btrfs_root, rb_node); + rb_erase(&root->rb_node, &fs_info->global_root_tree); + btrfs_put_root(root); + } +} + void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) { percpu_counter_destroy(&fs_info->dirty_metadata_bytes); @@ -1630,14 +1719,12 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) btrfs_free_ref_cache(fs_info); kfree(fs_info->balance_ctl); kfree(fs_info->delayed_root); - btrfs_put_root(fs_info->extent_root); + free_global_roots(fs_info); btrfs_put_root(fs_info->tree_root); btrfs_put_root(fs_info->chunk_root); btrfs_put_root(fs_info->dev_root); - btrfs_put_root(fs_info->csum_root); btrfs_put_root(fs_info->quota_root); btrfs_put_root(fs_info->uuid_root); - btrfs_put_root(fs_info->free_space_root); btrfs_put_root(fs_info->fs_root); btrfs_put_root(fs_info->data_reloc_root); btrfs_check_leaked_roots(fs_info); @@ -1935,7 +2022,8 @@ static int transaction_kthread(void *arg) } delta = ktime_get_seconds() - cur->start_time; - if (cur->state < TRANS_STATE_COMMIT_START && + if (!test_and_clear_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags) && + cur->state < TRANS_STATE_COMMIT_START && delta < fs_info->commit_interval) { spin_unlock(&fs_info->trans_lock); delay -= msecs_to_jiffies((delta - 1) * 1000); @@ -2007,6 +2095,8 @@ static void backup_super_roots(struct btrfs_fs_info *info) { const int next_backup = info->backup_root_index; struct btrfs_root_backup *root_backup; + struct btrfs_root *extent_root = btrfs_extent_root(info, 0); + struct btrfs_root *csum_root = btrfs_csum_root(info, 0); root_backup = info->super_for_commit->super_roots + next_backup; @@ -2031,11 +2121,11 @@ static void backup_super_roots(struct btrfs_fs_info *info) btrfs_set_backup_chunk_root_level(root_backup, btrfs_header_level(info->chunk_root->node)); - btrfs_set_backup_extent_root(root_backup, info->extent_root->node->start); + btrfs_set_backup_extent_root(root_backup, extent_root->node->start); btrfs_set_backup_extent_root_gen(root_backup, - btrfs_header_generation(info->extent_root->node)); + btrfs_header_generation(extent_root->node)); btrfs_set_backup_extent_root_level(root_backup, - btrfs_header_level(info->extent_root->node)); + btrfs_header_level(extent_root->node)); /* * we might commit during log recovery, which happens before we set @@ -2056,11 +2146,11 @@ static void backup_super_roots(struct btrfs_fs_info *info) btrfs_set_backup_dev_root_level(root_backup, btrfs_header_level(info->dev_root->node)); - btrfs_set_backup_csum_root(root_backup, info->csum_root->node->start); + btrfs_set_backup_csum_root(root_backup, csum_root->node->start); btrfs_set_backup_csum_root_gen(root_backup, - btrfs_header_generation(info->csum_root->node)); + btrfs_header_generation(csum_root->node)); btrfs_set_backup_csum_root_level(root_backup, - btrfs_header_level(info->csum_root->node)); + btrfs_header_level(csum_root->node)); btrfs_set_backup_total_bytes(root_backup, btrfs_super_total_bytes(info->super_copy)); @@ -2135,7 +2225,6 @@ static void btrfs_stop_all_workers(struct 
btrfs_fs_info *fs_info) btrfs_destroy_workqueue(fs_info->endio_freespace_worker); btrfs_destroy_workqueue(fs_info->delayed_workers); btrfs_destroy_workqueue(fs_info->caching_workers); - btrfs_destroy_workqueue(fs_info->readahead_workers); btrfs_destroy_workqueue(fs_info->flush_workers); btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers); if (fs_info->discard_ctl.discard_workers) @@ -2159,21 +2248,29 @@ static void free_root_extent_buffers(struct btrfs_root *root) } } +static void free_global_root_pointers(struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *root, *tmp; + + rbtree_postorder_for_each_entry_safe(root, tmp, + &fs_info->global_root_tree, + rb_node) + free_root_extent_buffers(root); +} + /* helper to cleanup tree roots */ static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root) { free_root_extent_buffers(info->tree_root); + free_global_root_pointers(info); free_root_extent_buffers(info->dev_root); - free_root_extent_buffers(info->extent_root); - free_root_extent_buffers(info->csum_root); free_root_extent_buffers(info->quota_root); free_root_extent_buffers(info->uuid_root); free_root_extent_buffers(info->fs_root); free_root_extent_buffers(info->data_reloc_root); if (free_chunk_root) free_root_extent_buffers(info->chunk_root); - free_root_extent_buffers(info->free_space_root); } void btrfs_put_root(struct btrfs_root *root) @@ -2291,8 +2388,7 @@ static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info) mutex_init(&fs_info->qgroup_rescan_lock); } -static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info, - struct btrfs_fs_devices *fs_devices) +static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info) { u32 max_active = fs_info->thread_pool_size; unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND; @@ -2341,9 +2437,6 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info, fs_info->delayed_workers = btrfs_alloc_workqueue(fs_info, "delayed-meta", flags, max_active, 0); - fs_info->readahead_workers = - btrfs_alloc_workqueue(fs_info, "readahead", flags, - max_active, 2); fs_info->qgroup_rescan_workers = btrfs_alloc_workqueue(fs_info, "qgroup-rescan", flags, 1, 0); fs_info->discard_ctl.discard_workers = @@ -2355,9 +2448,8 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info, fs_info->endio_meta_write_workers && fs_info->endio_write_workers && fs_info->endio_raid56_workers && fs_info->endio_freespace_worker && fs_info->rmw_workers && - fs_info->caching_workers && fs_info->readahead_workers && - fs_info->fixup_workers && fs_info->delayed_workers && - fs_info->qgroup_rescan_workers && + fs_info->caching_workers && fs_info->fixup_workers && + fs_info->delayed_workers && fs_info->qgroup_rescan_workers && fs_info->discard_ctl.discard_workers)) { return -ENOMEM; } @@ -2435,6 +2527,104 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, return 0; } +static int load_global_roots_objectid(struct btrfs_root *tree_root, + struct btrfs_path *path, u64 objectid, + const char *name) +{ + struct btrfs_fs_info *fs_info = tree_root->fs_info; + struct btrfs_root *root; + int ret; + struct btrfs_key key = { + .objectid = objectid, + .type = BTRFS_ROOT_ITEM_KEY, + .offset = 0, + }; + bool found = false; + + /* If we have IGNOREDATACSUMS skip loading these roots. 
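Each global root that does get loaded is inserted into fs_info->global_root_tree keyed by (tree objectid, BTRFS_ROOT_ITEM_KEY, root id). A hypothetical lookup of one specific copy, built only from the helpers added earlier in this file (the helpers added above always use offset 0 today, but the loader below already iterates key.offset):

static struct btrfs_root *lookup_global_root(struct btrfs_fs_info *fs_info,
					     u64 objectid, u64 root_id)
{
	struct btrfs_key key = {
		.objectid = objectid,
		.type = BTRFS_ROOT_ITEM_KEY,
		.offset = root_id,
	};

	return btrfs_global_root(fs_info, &key);
}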
*/ + if (objectid == BTRFS_CSUM_TREE_OBJECTID && + btrfs_test_opt(fs_info, IGNOREDATACSUMS)) { + set_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state); + return 0; + } + + while (1) { + ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0); + if (ret < 0) + break; + + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { + ret = btrfs_next_leaf(tree_root, path); + if (ret) { + if (ret > 0) + ret = 0; + break; + } + } + ret = 0; + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid != objectid) + break; + btrfs_release_path(path); + + found = true; + root = read_tree_root_path(tree_root, path, &key); + if (IS_ERR(root)) { + if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) + ret = PTR_ERR(root); + break; + } + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + ret = btrfs_global_root_insert(root); + if (ret) { + btrfs_put_root(root); + break; + } + key.offset++; + } + btrfs_release_path(path); + + if (!found || ret) { + if (objectid == BTRFS_CSUM_TREE_OBJECTID) + set_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state); + + if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) + ret = ret ? ret : -ENOENT; + else + ret = 0; + btrfs_err(fs_info, "failed to load root %s", name); + } + return ret; +} + +static int load_global_roots(struct btrfs_root *tree_root) +{ + struct btrfs_path *path; + int ret = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = load_global_roots_objectid(tree_root, path, + BTRFS_EXTENT_TREE_OBJECTID, "extent"); + if (ret) + goto out; + ret = load_global_roots_objectid(tree_root, path, + BTRFS_CSUM_TREE_OBJECTID, "csum"); + if (ret) + goto out; + if (!btrfs_fs_compat_ro(tree_root->fs_info, FREE_SPACE_TREE)) + goto out; + ret = load_global_roots_objectid(tree_root, path, + BTRFS_FREE_SPACE_TREE_OBJECTID, + "free space"); +out: + btrfs_free_path(path); + return ret; +} + static int btrfs_read_roots(struct btrfs_fs_info *fs_info) { struct btrfs_root *tree_root = fs_info->tree_root; @@ -2444,7 +2634,11 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info) BUG_ON(!fs_info->tree_root); - location.objectid = BTRFS_EXTENT_TREE_OBJECTID; + ret = load_global_roots(tree_root); + if (ret) + return ret; + + location.objectid = BTRFS_DEV_TREE_OBJECTID; location.type = BTRFS_ROOT_ITEM_KEY; location.offset = 0; @@ -2456,38 +2650,11 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info) } } else { set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); - fs_info->extent_root = root; - } - - location.objectid = BTRFS_DEV_TREE_OBJECTID; - root = btrfs_read_tree_root(tree_root, &location); - if (IS_ERR(root)) { - if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { - ret = PTR_ERR(root); - goto out; - } - } else { - set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); fs_info->dev_root = root; } /* Initialize fs_info for all devices in any case */ btrfs_init_devices_late(fs_info); - /* If IGNOREDATACSUMS is set don't bother reading the csum root. 
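With the dedicated fs_info->csum_root pointer gone, callers can no longer test it for NULL; the replacement signal is the BTRFS_FS_STATE_NO_CSUMS bit set above when the csum root is skipped or fails to load. A minimal consumer-side sketch (hypothetical helper; the real conversion is the btrfs_lookup_bio_sums() hunk in file-item.c later in this diff):

static bool csums_available(struct btrfs_fs_info *fs_info, u64 bytenr)
{
	if (test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state))
		return false;
	return btrfs_csum_root(fs_info, bytenr) != NULL;
}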
*/ - if (!btrfs_test_opt(fs_info, IGNOREDATACSUMS)) { - location.objectid = BTRFS_CSUM_TREE_OBJECTID; - root = btrfs_read_tree_root(tree_root, &location); - if (IS_ERR(root)) { - if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { - ret = PTR_ERR(root); - goto out; - } - } else { - set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); - fs_info->csum_root = root; - } - } - /* * This tree can share blocks with some other fs tree during relocation * and we need a proper setup by btrfs_get_fs_root @@ -2525,20 +2692,6 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info) fs_info->uuid_root = root; } - if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { - location.objectid = BTRFS_FREE_SPACE_TREE_OBJECTID; - root = btrfs_read_tree_root(tree_root, &location); - if (IS_ERR(root)) { - if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { - ret = PTR_ERR(root); - goto out; - } - } else { - set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); - fs_info->free_space_root = root; - } - } - return 0; out: btrfs_warn(fs_info, "failed to read root (objectid=%llu): %d", @@ -2858,6 +3011,7 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) /* All successful */ fs_info->generation = generation; fs_info->last_trans_committed = generation; + fs_info->last_reloc_trans = 0; /* Always begin writing backup roots after the one being used */ if (backup_index < 0) { @@ -2893,6 +3047,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) spin_lock_init(&fs_info->zone_active_bgs_lock); spin_lock_init(&fs_info->relocation_bg_lock); rwlock_init(&fs_info->tree_mod_log_lock); + rwlock_init(&fs_info->global_root_lock); mutex_init(&fs_info->unused_bg_unpin_mutex); mutex_init(&fs_info->reclaim_bgs_lock); mutex_init(&fs_info->reloc_mutex); @@ -2924,9 +3079,9 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) atomic_set(&fs_info->async_delalloc_pages, 0); atomic_set(&fs_info->defrag_running, 0); - atomic_set(&fs_info->reada_works_cnt, 0); atomic_set(&fs_info->nr_delayed_iputs, 0); atomic64_set(&fs_info->tree_mod_seq, 0); + fs_info->global_root_tree = RB_ROOT; fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE; fs_info->metadata_ratio = 0; fs_info->defrag_inodes = RB_ROOT; @@ -2934,9 +3089,6 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) fs_info->tree_mod_log = RB_ROOT; fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */ - /* readahead state */ - INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); - spin_lock_init(&fs_info->reada_lock); btrfs_init_ref_verify(fs_info); fs_info->thread_pool_size = min_t(unsigned long, @@ -2958,7 +3110,6 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) extent_io_tree_init(fs_info, &fs_info->excluded_extents, IO_TREE_FS_EXCLUDED_EXTENTS, NULL); - set_bit(BTRFS_FS_BARRIER, &fs_info->flags); mutex_init(&fs_info->ordered_operations_mutex); mutex_init(&fs_info->tree_log_mutex); @@ -2993,9 +3144,6 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) spin_lock_init(&fs_info->swapfile_pins_lock); fs_info->swapfile_pins = RB_ROOT; - spin_lock_init(&fs_info->send_reloc_lock); - fs_info->send_in_progress = 0; - fs_info->bg_reclaim_threshold = BTRFS_DEFAULT_RECLAIM_THRESH; INIT_WORK(&fs_info->reclaim_bgs_work, btrfs_reclaim_bgs_work); } @@ -3423,7 +3571,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device fs_info->subpage_info = subpage_info; } - ret = btrfs_init_workqueues(fs_info, fs_devices); + ret = btrfs_init_workqueues(fs_info); if (ret) { 
err = ret; goto fail_sb_buffer; @@ -3571,6 +3719,8 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device goto fail_sysfs; } + btrfs_free_zone_cache(fs_info); + if (!sb_rdonly(sb) && fs_info->fs_devices->missing_devices && !btrfs_check_rw_degradable(fs_info, NULL)) { btrfs_warn(fs_info, @@ -4333,6 +4483,48 @@ int btrfs_commit_super(struct btrfs_fs_info *fs_info) return btrfs_commit_transaction(trans); } +static void warn_about_uncommitted_trans(struct btrfs_fs_info *fs_info) +{ + struct btrfs_transaction *trans; + struct btrfs_transaction *tmp; + bool found = false; + + if (list_empty(&fs_info->trans_list)) + return; + + /* + * This function is only called at the very end of close_ctree(), + * thus no other running transaction, no need to take trans_lock. + */ + ASSERT(test_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags)); + list_for_each_entry_safe(trans, tmp, &fs_info->trans_list, list) { + struct extent_state *cached = NULL; + u64 dirty_bytes = 0; + u64 cur = 0; + u64 found_start; + u64 found_end; + + found = true; + while (!find_first_extent_bit(&trans->dirty_pages, cur, + &found_start, &found_end, EXTENT_DIRTY, &cached)) { + dirty_bytes += found_end + 1 - found_start; + cur = found_end + 1; + } + btrfs_warn(fs_info, + "transaction %llu (with %llu dirty metadata bytes) is not committed", + trans->transid, dirty_bytes); + btrfs_cleanup_one_transaction(trans, fs_info); + + if (trans == fs_info->running_transaction) + fs_info->running_transaction = NULL; + list_del_init(&trans->list); + + btrfs_put_transaction(trans); + trace_btrfs_transaction_commit(fs_info); + } + ASSERT(!found); +} + void __cold close_ctree(struct btrfs_fs_info *fs_info) { int ret; @@ -4441,7 +4633,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) btrfs_stop_all_workers(fs_info); /* We shouldn't have any transaction open at this point */ - ASSERT(list_empty(&fs_info->trans_list)); + warn_about_uncommitted_trans(fs_info); clear_bit(BTRFS_FS_OPEN, &fs_info->flags); free_root_pointers(fs_info, true); @@ -4989,7 +5181,7 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) spin_unlock(&fs_info->trans_lock); btrfs_put_transaction(t); - trace_btrfs_transaction_commit(fs_info->tree_root); + trace_btrfs_transaction_commit(fs_info); spin_lock(&fs_info->trans_lock); } spin_unlock(&fs_info->trans_lock); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index a2b5db4ba262..5e8bef4b7563 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -71,6 +71,12 @@ struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *btrfs_get_fs_root_commit_root(struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 objectid); +int btrfs_global_root_insert(struct btrfs_root *root); +void btrfs_global_root_delete(struct btrfs_root *root); +struct btrfs_root *btrfs_global_root(struct btrfs_fs_info *fs_info, + struct btrfs_key *key); +struct btrfs_root *btrfs_csum_root(struct btrfs_fs_info *fs_info, u64 bytenr); +struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr); void btrfs_free_fs_info(struct btrfs_fs_info *fs_info); int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info); @@ -103,6 +109,11 @@ static inline struct btrfs_root *btrfs_grab_root(struct btrfs_root *root) return NULL; } +static inline struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info) +{ + return btrfs_extent_root(fs_info, 0); +} + void btrfs_put_root(struct btrfs_root *root); void btrfs_mark_buffer_dirty(struct extent_buffer *buf); int 
btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 25ef6e3fd306..d89273c4b6b8 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -87,6 +87,7 @@ void btrfs_free_excluded_extents(struct btrfs_block_group *cache) /* simple helper to search for an existing data extent at a given offset */ int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len) { + struct btrfs_root *root = btrfs_extent_root(fs_info, start); int ret; struct btrfs_key key; struct btrfs_path *path; @@ -98,7 +99,7 @@ int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len) key.objectid = start; key.offset = len; key.type = BTRFS_EXTENT_ITEM_KEY; - ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0); + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); btrfs_free_path(path); return ret; } @@ -116,6 +117,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, u64 offset, int metadata, u64 *refs, u64 *flags) { + struct btrfs_root *extent_root; struct btrfs_delayed_ref_head *head; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_path *path; @@ -153,7 +155,8 @@ search_again: else key.type = BTRFS_EXTENT_ITEM_KEY; - ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0); + extent_root = btrfs_extent_root(fs_info, bytenr); + ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) goto out_free; @@ -171,7 +174,7 @@ search_again: if (ret == 0) { leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); + item_size = btrfs_item_size(leaf, path->slots[0]); if (item_size >= sizeof(*ei)) { ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); @@ -443,7 +446,7 @@ static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans, u64 root_objectid, u64 owner, u64 offset) { - struct btrfs_root *root = trans->fs_info->extent_root; + struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr); struct btrfs_key key; struct btrfs_extent_data_ref *ref; struct extent_buffer *leaf; @@ -519,7 +522,7 @@ static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, u64 root_objectid, u64 owner, u64 offset, int refs_to_add) { - struct btrfs_root *root = trans->fs_info->extent_root; + struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr); struct btrfs_key key; struct extent_buffer *leaf; u32 size; @@ -593,6 +596,7 @@ fail: } static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, int refs_to_drop, int *last_ref) { @@ -626,7 +630,7 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, num_refs -= refs_to_drop; if (num_refs == 0) { - ret = btrfs_del_item(trans, trans->fs_info->extent_root, path); + ret = btrfs_del_item(trans, root, path); *last_ref = 1; } else { if (key.type == BTRFS_EXTENT_DATA_REF_KEY) @@ -685,7 +689,7 @@ static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans, u64 bytenr, u64 parent, u64 root_objectid) { - struct btrfs_root *root = trans->fs_info->extent_root; + struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr); struct btrfs_key key; int ret; @@ -709,6 +713,7 @@ static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans, u64 bytenr, u64 parent, u64 root_objectid) { + struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr); struct 
btrfs_key key; int ret; @@ -721,8 +726,7 @@ static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans, key.offset = root_objectid; } - ret = btrfs_insert_empty_item(trans, trans->fs_info->extent_root, - path, &key, 0); + ret = btrfs_insert_empty_item(trans, root, path, &key, 0); btrfs_release_path(path); return ret; } @@ -787,7 +791,7 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, u64 owner, u64 offset, int insert) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_root *root = fs_info->extent_root; + struct btrfs_root *root = btrfs_extent_root(fs_info, bytenr); struct btrfs_key key; struct extent_buffer *leaf; struct btrfs_extent_item *ei; @@ -865,7 +869,7 @@ again: } leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); + item_size = btrfs_item_size(leaf, path->slots[0]); if (unlikely(item_size < sizeof(*ei))) { err = -EINVAL; btrfs_print_v0_err(fs_info); @@ -1007,7 +1011,7 @@ void setup_inline_extent_backref(struct btrfs_fs_info *fs_info, __run_delayed_extent_op(extent_op, leaf, ei); ptr = (unsigned long)ei + item_offset; - end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]); + end = (unsigned long)ei + btrfs_item_size(leaf, path->slots[0]); if (ptr < end - size) memmove_extent_buffer(leaf, ptr + size, ptr, end - size - ptr); @@ -1119,7 +1123,7 @@ void update_inline_extent_backref(struct btrfs_path *path, } else { *last_ref = 1; size = btrfs_extent_inline_ref_size(type); - item_size = btrfs_item_size_nr(leaf, path->slots[0]); + item_size = btrfs_item_size(leaf, path->slots[0]); ptr = (unsigned long)iref; end = (unsigned long)ei + item_size; if (ptr + size < end) @@ -1174,6 +1178,7 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans, } static int remove_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, struct btrfs_extent_inline_ref *iref, int refs_to_drop, int is_data, int *last_ref) @@ -1185,11 +1190,11 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans, update_inline_extent_backref(path, iref, -refs_to_drop, NULL, last_ref); } else if (is_data) { - ret = remove_extent_data_ref(trans, path, refs_to_drop, + ret = remove_extent_data_ref(trans, root, path, refs_to_drop, last_ref); } else { *last_ref = 1; - ret = btrfs_del_item(trans, trans->fs_info->extent_root, path); + ret = btrfs_del_item(trans, root, path); } return ret; } @@ -1572,6 +1577,7 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, struct btrfs_delayed_extent_op *extent_op) { struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_root *root; struct btrfs_key key; struct btrfs_path *path; struct btrfs_extent_item *ei; @@ -1601,8 +1607,9 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, key.offset = head->num_bytes; } + root = btrfs_extent_root(fs_info, key.objectid); again: - ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 1); + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret < 0) { err = ret; goto out; @@ -1634,7 +1641,7 @@ again: } leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); + item_size = btrfs_item_size(leaf, path->slots[0]); if (unlikely(item_size < sizeof(*ei))) { err = -EINVAL; @@ -1844,8 +1851,11 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, if (head->must_insert_reserved) { btrfs_pin_extent(trans, head->bytenr, head->num_bytes, 1); if (head->is_data) { - ret = btrfs_del_csums(trans, fs_info->csum_root, - 
head->bytenr, head->num_bytes); + struct btrfs_root *csum_root; + + csum_root = btrfs_csum_root(fs_info, head->bytenr); + ret = btrfs_del_csums(trans, csum_root, head->bytenr, + head->num_bytes); } } @@ -2285,7 +2295,7 @@ static noinline int check_committed_ref(struct btrfs_root *root, bool strict) { struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_root *extent_root = fs_info->extent_root; + struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bytenr); struct extent_buffer *leaf; struct btrfs_extent_data_ref *ref; struct btrfs_extent_inline_ref *iref; @@ -2316,7 +2326,7 @@ static noinline int check_committed_ref(struct btrfs_root *root, goto out; ret = 1; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); + item_size = btrfs_item_size(leaf, path->slots[0]); ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); /* If extent item has more than 1 inline ref then it's shared */ @@ -2920,7 +2930,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_fs_info *info = trans->fs_info; struct btrfs_key key; struct btrfs_path *path; - struct btrfs_root *extent_root = info->extent_root; + struct btrfs_root *extent_root; struct extent_buffer *leaf; struct btrfs_extent_item *ei; struct btrfs_extent_inline_ref *iref; @@ -2936,6 +2946,9 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, int last_ref = 0; bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA); + extent_root = btrfs_extent_root(info, bytenr); + ASSERT(extent_root); + path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -2996,9 +3009,9 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, goto err_dump; } /* Must be SHARED_* item, remove the backref first */ - ret = remove_extent_backref(trans, path, NULL, - refs_to_drop, - is_data, &last_ref); + ret = remove_extent_backref(trans, extent_root, path, + NULL, refs_to_drop, is_data, + &last_ref); if (ret) { btrfs_abort_transaction(trans, ret); goto out; @@ -3068,7 +3081,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, extent_slot); + item_size = btrfs_item_size(leaf, extent_slot); if (unlikely(item_size < sizeof(*ei))) { ret = -EINVAL; btrfs_print_v0_err(info); @@ -3122,8 +3135,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); } if (found_extent) { - ret = remove_extent_backref(trans, path, iref, - refs_to_drop, is_data, + ret = remove_extent_backref(trans, extent_root, path, + iref, refs_to_drop, is_data, &last_ref); if (ret) { btrfs_abort_transaction(trans, ret); @@ -3179,7 +3192,9 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, btrfs_release_path(path); if (is_data) { - ret = btrfs_del_csums(trans, info->csum_root, bytenr, + struct btrfs_root *csum_root; + csum_root = btrfs_csum_root(info, bytenr); + ret = btrfs_del_csums(trans, csum_root, bytenr, num_bytes); if (ret) { btrfs_abort_transaction(trans, ret); @@ -3790,23 +3805,35 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, spin_unlock(&fs_info->relocation_bg_lock); if (skip) return 1; + /* Check RO and no space case before trying to activate it */ spin_lock(&block_group->lock); if (block_group->ro || block_group->alloc_offset == block_group->zone_capacity) { - spin_unlock(&block_group->lock); - return 1; + ret = 1; + /* + * May need to clear fs_info->{treelog,data_reloc}_bg. + * Return the error after taking the locks. 
+ */ } spin_unlock(&block_group->lock); - if (!btrfs_zone_activate(block_group)) - return 1; + if (!ret && !btrfs_zone_activate(block_group)) { + ret = 1; + /* + * May need to clear fs_info->{treelog,data_reloc}_bg. + * Return the error after taking the locks. + */ + } spin_lock(&space_info->lock); spin_lock(&block_group->lock); spin_lock(&fs_info->treelog_bg_lock); spin_lock(&fs_info->relocation_bg_lock); + if (ret) + goto out; + ASSERT(!ffe_ctl->for_treelog || block_group->start == fs_info->treelog_bg || fs_info->treelog_bg == 0); @@ -3947,6 +3974,28 @@ static void found_extent(struct find_free_extent_ctl *ffe_ctl, } } +static bool can_allocate_chunk(struct btrfs_fs_info *fs_info, + struct find_free_extent_ctl *ffe_ctl) +{ + switch (ffe_ctl->policy) { + case BTRFS_EXTENT_ALLOC_CLUSTERED: + return true; + case BTRFS_EXTENT_ALLOC_ZONED: + /* + * If we have enough free space left in an already + * active block group and we can't activate any other + * zone now, do not allow allocating a new chunk and + * let find_free_extent() retry with a smaller size. + */ + if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size && + !btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->flags)) + return false; + return true; + default: + BUG(); + } +} + static int chunk_allocation_failed(struct find_free_extent_ctl *ffe_ctl) { switch (ffe_ctl->policy) { @@ -3975,7 +4024,7 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, struct find_free_extent_ctl *ffe_ctl, bool full_search) { - struct btrfs_root *root = fs_info->extent_root; + struct btrfs_root *root = fs_info->chunk_root; int ret; if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) && @@ -3987,18 +4036,6 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, return 0; } - if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size && - !btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->index)) { - /* - * If we have enough free space left in an already active block - * group and we can't activate any other zone now, retry the - * active ones with a smaller allocation size. Returning early - * from here will tell btrfs_reserve_extent() to haven the - * size. 
- */ - return -ENOSPC; - } - if (ffe_ctl->loop >= LOOP_CACHING_WAIT && ffe_ctl->have_caching_bg) return 1; @@ -4034,6 +4071,10 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans; int exist = 0; + /*Check if allocation policy allows to create a new chunk */ + if (!can_allocate_chunk(fs_info, ffe_ctl)) + return -ENOSPC; + trans = current->journal_info; if (trans) exist = 1; @@ -4570,6 +4611,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, struct btrfs_key *ins, int ref_mod) { struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_root *extent_root; int ret; struct btrfs_extent_item *extent_item; struct btrfs_extent_inline_ref *iref; @@ -4589,8 +4631,8 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, - ins, size); + extent_root = btrfs_extent_root(fs_info, ins->objectid); + ret = btrfs_insert_empty_item(trans, extent_root, path, ins, size); if (ret) { btrfs_free_path(path); return ret; @@ -4642,6 +4684,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, struct btrfs_delayed_extent_op *extent_op) { struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_root *extent_root; int ret; struct btrfs_extent_item *extent_item; struct btrfs_key extent_key; @@ -4673,8 +4716,9 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, - &extent_key, size); + extent_root = btrfs_extent_root(fs_info, extent_key.objectid); + ret = btrfs_insert_empty_item(trans, extent_root, path, &extent_key, + size); if (ret) { btrfs_free_path(path); return ret; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 9234d96a7fd5..d6d48ecf823c 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2314,8 +2314,8 @@ static int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, ASSERT(!(fs_info->sb->s_flags & SB_RDONLY)); BUG_ON(!mirror_num); - if (btrfs_is_zoned(fs_info)) - return btrfs_repair_one_zone(fs_info, logical); + if (btrfs_repair_one_zone(fs_info, logical)) + return 0; bio = btrfs_bio_alloc(1); bio->bi_iter.bi_size = 0; @@ -3087,9 +3087,6 @@ static void end_bio_extent_readpage(struct bio *bio) set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); eb->read_mirror = mirror; atomic_dec(&eb->io_pages); - if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, - &eb->bflags)) - btree_readahead_hook(eb, -EIO); } readpage_ok: if (likely(uptodate)) { @@ -3187,13 +3184,12 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size) /** * Attempt to add a page to bio * - * @bio: destination bio + * @bio_ctrl: record both the bio, and its bio_flags * @page: page to add to the bio * @disk_bytenr: offset of the new bio or to check whether we are adding * a contiguous page to the previous one - * @pg_offset: starting offset in the page * @size: portion of page that we want to write - * @prev_bio_flags: flags of previous bio to see if we can merge the current one + * @pg_offset: starting offset in the page * @bio_flags: flags of the current bio to see if we can merge them * * Attempt to add a page to bio considering stripe alignment etc. 
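The extent-tree.c conversions in the hunks above all follow one shape: resolve the extent root for the bytenr being operated on through btrfs_extent_root() instead of dereferencing the removed fs_info->extent_root pointer. A condensed, illustrative example of that pattern (hypothetical function, not taken from the patch):

static int lookup_extent_item(struct btrfs_trans_handle *trans,
			      struct btrfs_path *path, u64 bytenr, u64 num_bytes)
{
	struct btrfs_root *extent_root = btrfs_extent_root(trans->fs_info, bytenr);
	struct btrfs_key key = {
		.objectid = bytenr,
		.type = BTRFS_EXTENT_ITEM_KEY,
		.offset = num_bytes,
	};

	return btrfs_search_slot(trans, extent_root, &key, path, 0, 1);
}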
@@ -3283,8 +3279,7 @@ static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl, else bio_ctrl->len_to_stripe_boundary = (u32)geom.len; - if (!btrfs_is_zoned(fs_info) || - bio_op(bio_ctrl->bio) != REQ_OP_ZONE_APPEND) { + if (bio_op(bio_ctrl->bio) != REQ_OP_ZONE_APPEND) { bio_ctrl->len_to_oe_boundary = U32_MAX; return 0; } @@ -3339,7 +3334,7 @@ static int alloc_new_bio(struct btrfs_inode *inode, bio_set_dev(bio, bdev); wbc_init_bio(wbc, bio); } - if (btrfs_is_zoned(fs_info) && bio_op(bio) == REQ_OP_ZONE_APPEND) { + if (bio_op(bio) == REQ_OP_ZONE_APPEND) { struct btrfs_device *device; device = btrfs_zoned_get_device(fs_info, disk_bytenr, @@ -3785,12 +3780,13 @@ static void update_nr_written(struct writeback_control *wbc, * This returns < 0 if there were errors (page still locked) */ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, - struct page *page, struct writeback_control *wbc, - unsigned long *nr_written) + struct page *page, struct writeback_control *wbc) { const u64 page_end = page_offset(page) + PAGE_SIZE - 1; u64 delalloc_start = page_offset(page); u64 delalloc_to_write = 0; + /* How many pages are started by btrfs_run_delalloc_range() */ + unsigned long nr_written = 0; int ret; int page_started = 0; @@ -3806,7 +3802,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, continue; } ret = btrfs_run_delalloc_range(inode, page, delalloc_start, - delalloc_end, &page_started, nr_written, wbc); + delalloc_end, &page_started, &nr_written, wbc); if (ret) { btrfs_page_set_error(inode->root->fs_info, page, page_offset(page), PAGE_SIZE); @@ -3829,16 +3825,13 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, thresh); } - /* did the fill delalloc function already unlock and start - * the IO? - */ + /* Did btrfs_run_dealloc_range() already unlock and start the IO? */ if (page_started) { /* - * we've unlocked the page, so we can't update - * the mapping's writeback index, just update - * nr_to_write. + * We've unlocked the page, so we can't update the mapping's + * writeback index, just update nr_to_write. 
*/ - wbc->nr_to_write -= *nr_written; + wbc->nr_to_write -= nr_written; return 1; } @@ -3910,7 +3903,6 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, struct writeback_control *wbc, struct extent_page_data *epd, loff_t i_size, - unsigned long nr_written, int *nr_ret) { struct btrfs_fs_info *fs_info = inode->root->fs_info; @@ -3929,7 +3921,6 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, if (ret) { /* Fixup worker will requeue */ redirty_page_for_writepage(wbc, page); - update_nr_written(wbc, nr_written); unlock_page(page); return 1; } @@ -3938,7 +3929,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, * we don't want to touch the inode after unlocking the page, * so we update the mapping writeback index now */ - update_nr_written(wbc, nr_written + 1); + update_nr_written(wbc, 1); while (cur <= end) { u64 disk_bytenr; @@ -4076,7 +4067,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, size_t pg_offset; loff_t i_size = i_size_read(inode); unsigned long end_index = i_size >> PAGE_SHIFT; - unsigned long nr_written = 0; trace___extent_writepage(page, inode, wbc); @@ -4105,7 +4095,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, } if (!epd->extent_locked) { - ret = writepage_delalloc(BTRFS_I(inode), page, wbc, &nr_written); + ret = writepage_delalloc(BTRFS_I(inode), page, wbc); if (ret == 1) return 0; if (ret) @@ -4113,7 +4103,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, } ret = __extent_writepage_io(BTRFS_I(inode), page, wbc, epd, i_size, - nr_written, &nr); + &nr); if (ret == 1) return 0; @@ -5189,8 +5179,6 @@ int extent_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct inode *inode = mapping->host; - const bool data_reloc = btrfs_is_data_reloc_root(BTRFS_I(inode)->root); - const bool zoned = btrfs_is_zoned(BTRFS_I(inode)->root->fs_info); int ret = 0; struct extent_page_data epd = { .bio_ctrl = { 0 }, @@ -5202,11 +5190,9 @@ int extent_writepages(struct address_space *mapping, * Allow only a single thread to do the reloc work in zoned mode to * protect the write pointer updates. 
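btrfs_zoned_data_reloc_lock()/btrfs_zoned_data_reloc_unlock() replace the open-coded data_reloc && zoned checks removed just below. Their bodies are not part of this excerpt; a plausible sketch, assuming they simply wrap the same conditional inode lock that the deleted lines took:

static inline void btrfs_zoned_data_reloc_lock(struct btrfs_inode *inode)
{
	struct btrfs_root *root = inode->root;

	if (btrfs_is_data_reloc_root(root) && btrfs_is_zoned(root->fs_info))
		btrfs_inode_lock(&inode->vfs_inode, 0);
}

static inline void btrfs_zoned_data_reloc_unlock(struct btrfs_inode *inode)
{
	struct btrfs_root *root = inode->root;

	if (btrfs_is_data_reloc_root(root) && btrfs_is_zoned(root->fs_info))
		btrfs_inode_unlock(&inode->vfs_inode, 0);
}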
*/ - if (data_reloc && zoned) - btrfs_inode_lock(inode, 0); + btrfs_zoned_data_reloc_lock(BTRFS_I(inode)); ret = extent_write_cache_pages(mapping, wbc, &epd); - if (data_reloc && zoned) - btrfs_inode_unlock(inode, 0); + btrfs_zoned_data_reloc_unlock(BTRFS_I(inode)); ASSERT(ret <= 0); if (ret < 0) { end_write_bio(&epd, ret); diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index d1cbb64a78f3..90c5c38836ab 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -208,7 +208,7 @@ btrfs_lookup_csum(struct btrfs_trans_handle *trans, csum_offset = (bytenr - found_key.offset) >> fs_info->sectorsize_bits; - csums_in_item = btrfs_item_size_nr(leaf, path->slots[0]); + csums_in_item = btrfs_item_size(leaf, path->slots[0]); csums_in_item /= csum_size; if (csum_offset == csums_in_item) { @@ -257,6 +257,7 @@ static int search_csum_tree(struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 disk_bytenr, u64 len, u8 *dst) { + struct btrfs_root *csum_root; struct btrfs_csum_item *item = NULL; struct btrfs_key key; const u32 sectorsize = fs_info->sectorsize; @@ -274,7 +275,7 @@ static int search_csum_tree(struct btrfs_fs_info *fs_info, item = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_csum_item); btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - itemsize = btrfs_item_size_nr(path->nodes[0], path->slots[0]); + itemsize = btrfs_item_size(path->nodes[0], path->slots[0]); csum_start = key.offset; csum_len = (itemsize / csum_size) * sectorsize; @@ -285,13 +286,14 @@ static int search_csum_tree(struct btrfs_fs_info *fs_info, /* Current item doesn't contain the desired range, search again */ btrfs_release_path(path); - item = btrfs_lookup_csum(NULL, fs_info->csum_root, path, disk_bytenr, 0); + csum_root = btrfs_csum_root(fs_info, disk_bytenr); + item = btrfs_lookup_csum(NULL, csum_root, path, disk_bytenr, 0); if (IS_ERR(item)) { ret = PTR_ERR(item); goto out; } btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - itemsize = btrfs_item_size_nr(path->nodes[0], path->slots[0]); + itemsize = btrfs_item_size(path->nodes[0], path->slots[0]); csum_start = key.offset; csum_len = (itemsize / csum_size) * sectorsize; @@ -376,7 +378,8 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst const unsigned int nblocks = orig_len >> fs_info->sectorsize_bits; int count = 0; - if (!fs_info->csum_root || (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) + if ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) || + test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state)) return BLK_STS_OK; /* @@ -534,7 +537,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, key.type == BTRFS_EXTENT_CSUM_KEY) { offset = (start - key.offset) >> fs_info->sectorsize_bits; if (offset * csum_size < - btrfs_item_size_nr(leaf, path->slots[0] - 1)) + btrfs_item_size(leaf, path->slots[0] - 1)) path->slots[0]--; } } @@ -559,7 +562,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, if (key.offset > start) start = key.offset; - size = btrfs_item_size_nr(leaf, path->slots[0]); + size = btrfs_item_size(leaf, path->slots[0]); csum_end = key.offset + (size / csum_size) * fs_info->sectorsize; if (csum_end <= start) { path->slots[0]++; @@ -750,7 +753,7 @@ static noinline void truncate_one_csum(struct btrfs_fs_info *fs_info, u32 blocksize_bits = fs_info->sectorsize_bits; leaf = path->nodes[0]; - csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size; + csum_end = btrfs_item_size(leaf, path->slots[0]) / csum_size; csum_end 
<<= blocksize_bits; csum_end += key->offset; @@ -801,7 +804,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, const u32 csum_size = fs_info->csum_size; u32 blocksize_bits = fs_info->sectorsize_bits; - ASSERT(root == fs_info->csum_root || + ASSERT(root->root_key.objectid == BTRFS_CSUM_TREE_OBJECTID || root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); path = btrfs_alloc_path(); @@ -834,7 +837,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, if (key.offset >= end_byte) break; - csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size; + csum_end = btrfs_item_size(leaf, path->slots[0]) / csum_size; csum_end <<= blocksize_bits; csum_end += key.offset; @@ -1002,7 +1005,7 @@ again: item_end = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); item_end = (struct btrfs_csum_item *)((char *)item_end + - btrfs_item_size_nr(leaf, path->slots[0])); + btrfs_item_size(leaf, path->slots[0])); goto found; } ret = PTR_ERR(item); @@ -1013,7 +1016,7 @@ again: u32 item_size; /* we found one, but it isn't big enough yet */ leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); + item_size = btrfs_item_size(leaf, path->slots[0]); if ((item_size / csum_size) >= MAX_CSUM_ITEMS(fs_info, csum_size)) { /* already at max size, make a new one */ @@ -1070,7 +1073,7 @@ again: } extend_csum: - if (csum_offset == btrfs_item_size_nr(leaf, path->slots[0]) / + if (csum_offset == btrfs_item_size(leaf, path->slots[0]) / csum_size) { int extend_nr; u64 tmp; @@ -1125,7 +1128,7 @@ extend_csum: diff = min(diff, MAX_CSUM_ITEMS(fs_info, csum_size) * csum_size); - diff = diff - btrfs_item_size_nr(leaf, path->slots[0]); + diff = diff - btrfs_item_size(leaf, path->slots[0]); diff = min_t(u32, btrfs_leaf_free_space(leaf), diff); diff /= csum_size; diff *= csum_size; @@ -1162,7 +1165,7 @@ insert: csum: item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); item_end = (struct btrfs_csum_item *)((unsigned char *)item + - btrfs_item_size_nr(leaf, path->slots[0])); + btrfs_item_size(leaf, path->slots[0])); item = (struct btrfs_csum_item *)((unsigned char *)item + csum_offset * csum_size); found: diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index f3fee88c8ee0..01a408db5683 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -23,6 +23,7 @@ #include "block-group.h" #include "discard.h" #include "subpage.h" +#include "inode-item.h" #define BITS_PER_BITMAP (PAGE_SIZE * 8UL) #define MAX_CACHE_BYTES_PER_GIG SZ_64K @@ -37,7 +38,7 @@ struct btrfs_trim_range { static int link_free_space(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info); static void unlink_free_space(struct btrfs_free_space_ctl *ctl, - struct btrfs_free_space *info); + struct btrfs_free_space *info, bool update_stat); static int search_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *bitmap_info, u64 *offset, u64 *bytes, bool for_alloc); @@ -45,7 +46,7 @@ static void free_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *bitmap_info); static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info, u64 offset, - u64 bytes); + u64 bytes, bool update_stats); static struct inode *__lookup_free_space_inode(struct btrfs_root *root, struct btrfs_path *path, @@ -288,9 +289,18 @@ int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info, int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, - struct inode *inode) + struct 
inode *vfs_inode) { - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_truncate_control control = { + .inode = BTRFS_I(vfs_inode), + .new_size = 0, + .ino = btrfs_ino(BTRFS_I(vfs_inode)), + .min_type = BTRFS_EXTENT_DATA_KEY, + .clear_extent_range = true, + }; + struct btrfs_inode *inode = BTRFS_I(vfs_inode); + struct btrfs_root *root = inode->root; + struct extent_state *cached_state = NULL; int ret = 0; bool locked = false; @@ -320,19 +330,26 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, btrfs_free_path(path); } - btrfs_i_size_write(BTRFS_I(inode), 0); - truncate_pagecache(inode, 0); + btrfs_i_size_write(inode, 0); + truncate_pagecache(vfs_inode, 0); + + lock_extent_bits(&inode->io_tree, 0, (u64)-1, &cached_state); + btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); /* * We skip the throttling logic for free space cache inodes, so we don't * need to check for -EAGAIN. */ - ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode), - 0, BTRFS_EXTENT_DATA_KEY, NULL); + ret = btrfs_truncate_inode_items(trans, root, &control); + + inode_sub_bytes(&inode->vfs_inode, control.sub_bytes); + btrfs_inode_safe_disk_i_size_write(inode, control.last_size); + + unlock_extent_cached(&inode->io_tree, 0, (u64)-1, &cached_state); if (ret) goto fail; - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, root, inode); fail: if (locked) @@ -666,7 +683,7 @@ static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl, static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) { - struct btrfs_block_group *block_group = ctl->private; + struct btrfs_block_group *block_group = ctl->block_group; u64 max_bytes; u64 bitmap_bytes; u64 extent_bytes; @@ -872,7 +889,7 @@ static int copy_free_space_cache(struct btrfs_block_group *block_group, while (!ret && (n = rb_first(&ctl->free_space_offset)) != NULL) { info = rb_entry(n, struct btrfs_free_space, offset_index); if (!info->bitmap) { - unlink_free_space(ctl, info); + unlink_free_space(ctl, info, true); ret = btrfs_add_free_space(block_group, info->offset, info->bytes); kmem_cache_free(btrfs_free_space_cachep, info); @@ -886,7 +903,7 @@ static int copy_free_space_cache(struct btrfs_block_group *block_group, bytes); if (ret) break; - bitmap_clear_bits(ctl, info, offset, bytes); + bitmap_clear_bits(ctl, info, offset, bytes, true); offset = info->offset; bytes = ctl->unit; } @@ -1581,6 +1598,50 @@ static int tree_insert_offset(struct rb_root *root, u64 offset, } /* + * This is a little subtle. We *only* have ->max_extent_size set if we actually + * searched through the bitmap and figured out the largest ->max_extent_size, + * otherwise it's 0. In the case that it's 0 we don't want to tell the + * allocator the wrong thing, we want to use the actual real max_extent_size + * we've found already if it's larger, or we want to use ->bytes. + * + * This matters because find_free_space() will skip entries who's ->bytes is + * less than the required bytes. So if we didn't search down this bitmap, we + * may pick some previous entry that has a smaller ->max_extent_size than we + * have. For example, assume we have two entries, one that has + * ->max_extent_size set to 4K and ->bytes set to 1M. A second entry hasn't set + * ->max_extent_size yet, has ->bytes set to 8K and it's contiguous. We will + * call into find_free_space(), and return with max_extent_size == 4K, because + * that first bitmap entry had ->max_extent_size set, but the second one did + * not. 
If instead we returned 8K we'd come in searching for 8K, and find the + * 8K contiguous range. + * + * Consider the other case, we have 2 8K chunks in that second entry and still + * don't have ->max_extent_size set. We'll return 16K, and the next time the + * allocator comes in it'll fully search our second bitmap, and this time it'll + * get an uptodate value of 8K as the maximum chunk size. Then we'll get the + * right allocation the next loop through. + */ +static inline u64 get_max_extent_size(const struct btrfs_free_space *entry) +{ + if (entry->bitmap && entry->max_extent_size) + return entry->max_extent_size; + return entry->bytes; +} + +/* + * We want the largest entry to be leftmost, so this is inverted from what you'd + * normally expect. + */ +static bool entry_less(struct rb_node *node, const struct rb_node *parent) +{ + const struct btrfs_free_space *entry, *exist; + + entry = rb_entry(node, struct btrfs_free_space, bytes_index); + exist = rb_entry(parent, struct btrfs_free_space, bytes_index); + return get_max_extent_size(exist) < get_max_extent_size(entry); +} + +/* * searches the tree for the given offset. * * fuzzy - If this is set, then we are trying to make an allocation, and we just @@ -1592,15 +1653,10 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl, u64 offset, int bitmap_only, int fuzzy) { struct rb_node *n = ctl->free_space_offset.rb_node; - struct btrfs_free_space *entry, *prev = NULL; + struct btrfs_free_space *entry = NULL, *prev = NULL; /* find entry that is closest to the 'offset' */ - while (1) { - if (!n) { - entry = NULL; - break; - } - + while (n) { entry = rb_entry(n, struct btrfs_free_space, offset_index); prev = entry; @@ -1610,6 +1666,8 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl, n = n->rb_right; else break; + + entry = NULL; } if (bitmap_only) { @@ -1686,6 +1744,10 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl, return NULL; while (1) { + n = rb_next(&entry->offset_index); + if (!n) + return NULL; + entry = rb_entry(n, struct btrfs_free_space, offset_index); if (entry->bitmap) { if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset) @@ -1694,33 +1756,25 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl, if (entry->offset + entry->bytes > offset) break; } - - n = rb_next(&entry->offset_index); - if (!n) - return NULL; - entry = rb_entry(n, struct btrfs_free_space, offset_index); } return entry; } -static inline void -__unlink_free_space(struct btrfs_free_space_ctl *ctl, - struct btrfs_free_space *info) +static inline void unlink_free_space(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info, + bool update_stat) { rb_erase(&info->offset_index, &ctl->free_space_offset); + rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes); ctl->free_extents--; if (!info->bitmap && !btrfs_free_space_trimmed(info)) { ctl->discardable_extents[BTRFS_STAT_CURR]--; ctl->discardable_bytes[BTRFS_STAT_CURR] -= info->bytes; } -} -static void unlink_free_space(struct btrfs_free_space_ctl *ctl, - struct btrfs_free_space *info) -{ - __unlink_free_space(ctl, info); - ctl->free_space -= info->bytes; + if (update_stat) + ctl->free_space -= info->bytes; } static int link_free_space(struct btrfs_free_space_ctl *ctl, @@ -1734,6 +1788,8 @@ static int link_free_space(struct btrfs_free_space_ctl *ctl, if (ret) return ret; + rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less); + if (!info->bitmap && !btrfs_free_space_trimmed(info)) { ctl->discardable_extents[BTRFS_STAT_CURR]++; ctl->discardable_bytes[BTRFS_STAT_CURR] 
+= info->bytes; @@ -1744,9 +1800,25 @@ static int link_free_space(struct btrfs_free_space_ctl *ctl, return ret; } -static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, - struct btrfs_free_space *info, - u64 offset, u64 bytes) +static void relink_bitmap_entry(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info) +{ + ASSERT(info->bitmap); + + /* + * If our entry is empty it's because we're on a cluster and we don't + * want to re-link it into our ctl bytes index. + */ + if (RB_EMPTY_NODE(&info->bytes_index)) + return; + + rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes); + rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less); +} + +static inline void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info, + u64 offset, u64 bytes, bool update_stat) { unsigned long start, count, end; int extent_delta = -1; @@ -1762,6 +1834,8 @@ static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, if (info->max_extent_size > ctl->unit) info->max_extent_size = 0; + relink_bitmap_entry(ctl, info); + if (start && test_bit(start - 1, info->bitmap)) extent_delta++; @@ -1773,14 +1847,9 @@ static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta; ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes; } -} -static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, - struct btrfs_free_space *info, u64 offset, - u64 bytes) -{ - __bitmap_clear_bits(ctl, info, offset, bytes); - ctl->free_space -= bytes; + if (update_stat) + ctl->free_space -= bytes; } static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl, @@ -1797,9 +1866,16 @@ static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl, bitmap_set(info->bitmap, start, count); + /* + * We set some bytes, we have no idea what the max extent size is + * anymore. 
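+ * With it reset to 0, get_max_extent_size() falls back to ->bytes for
+ * this bitmap until the next search_bitmap() call recomputes an up to
+ * date value and relinks the entry in the bytes index.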
+ */ + info->max_extent_size = 0; info->bytes += bytes; ctl->free_space += bytes; + relink_bitmap_entry(ctl, info); + if (start && test_bit(start - 1, info->bitmap)) extent_delta--; @@ -1867,20 +1943,14 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl, *bytes = (u64)(max_bits) * ctl->unit; bitmap_info->max_extent_size = *bytes; + relink_bitmap_entry(ctl, bitmap_info); return -1; } -static inline u64 get_max_extent_size(struct btrfs_free_space *entry) -{ - if (entry->bitmap) - return entry->max_extent_size; - return entry->bytes; -} - /* Cache the size of the max extent in bytes */ static struct btrfs_free_space * find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, - unsigned long align, u64 *max_extent_size) + unsigned long align, u64 *max_extent_size, bool use_bytes_index) { struct btrfs_free_space *entry; struct rb_node *node; @@ -1890,16 +1960,38 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, if (!ctl->free_space_offset.rb_node) goto out; +again: + if (use_bytes_index) { + node = rb_first_cached(&ctl->free_space_bytes); + } else { + entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), + 0, 1); + if (!entry) + goto out; + node = &entry->offset_index; + } - entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 0, 1); - if (!entry) - goto out; + for (; node; node = rb_next(node)) { + if (use_bytes_index) + entry = rb_entry(node, struct btrfs_free_space, + bytes_index); + else + entry = rb_entry(node, struct btrfs_free_space, + offset_index); - for (node = &entry->offset_index; node; node = rb_next(node)) { - entry = rb_entry(node, struct btrfs_free_space, offset_index); + /* + * If we are using the bytes index then all subsequent entries + * in this tree are going to be < bytes, so simply set the max + * extent size and exit the loop. + * + * If we're using the offset index then we need to keep going + * through the rest of the tree. + */ if (entry->bytes < *bytes) { *max_extent_size = max(get_max_extent_size(entry), *max_extent_size); + if (use_bytes_index) + break; continue; } @@ -1916,6 +2008,13 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, tmp = entry->offset; } + /* + * We don't break here if we're using the bytes index because we + * may have another entry that has the correct alignment that is + * the right size, so we don't want to miss that possibility. + * At worst this adds another loop through the logic, but if we + * broke here we could prematurely ENOSPC. + */ if (entry->bytes < *bytes + align_off) { *max_extent_size = max(get_max_extent_size(entry), *max_extent_size); @@ -1923,6 +2022,7 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, } if (entry->bitmap) { + struct rb_node *old_next = rb_next(node); u64 size = *bytes; ret = search_bitmap(ctl, entry, &tmp, &size, true); @@ -1935,6 +2035,15 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, max(get_max_extent_size(entry), *max_extent_size); } + + /* + * The bitmap may have gotten re-arranged in the space + * index here because the max_extent_size may have been + * updated. Start from the beginning again if this + * happened. 
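+ * (rb_next(node) differing from the old_next value saved before the
+ * search_bitmap() call above is how that rearrangement is detected.)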
+ */ + if (use_bytes_index && old_next != rb_next(node)) + goto again; continue; } @@ -1973,7 +2082,7 @@ static void free_bitmap(struct btrfs_free_space_ctl *ctl, ctl->discardable_bytes[BTRFS_STAT_CURR] -= bitmap_info->bytes; } - unlink_free_space(ctl, bitmap_info); + unlink_free_space(ctl, bitmap_info, true); kmem_cache_free(btrfs_free_space_bitmap_cachep, bitmap_info->bitmap); kmem_cache_free(btrfs_free_space_cachep, bitmap_info); ctl->total_bitmaps--; @@ -2011,7 +2120,7 @@ again: /* Cannot clear past the end of the bitmap */ search_bytes = min(search_bytes, end - search_start + 1); - bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes); + bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes, true); *offset += search_bytes; *bytes -= search_bytes; @@ -2083,12 +2192,6 @@ static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl, bitmap_set_bits(ctl, info, offset, bytes_to_set); - /* - * We set some bytes, we have no idea what the max extent size is - * anymore. - */ - info->max_extent_size = 0; - return bytes_to_set; } @@ -2096,7 +2199,7 @@ static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl, static bool use_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info) { - struct btrfs_block_group *block_group = ctl->private; + struct btrfs_block_group *block_group = ctl->block_group; struct btrfs_fs_info *fs_info = block_group->fs_info; bool forced = false; @@ -2165,7 +2268,7 @@ static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl, return 0; if (ctl->op == &free_space_op) - block_group = ctl->private; + block_group = ctl->block_group; again: /* * Since we link bitmaps right into the cluster we need to see if we @@ -2310,10 +2413,7 @@ static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl, /* See try_merge_free_space() comment. 
*/ if (right_info && !right_info->bitmap && (!is_trimmed || btrfs_free_space_trimmed(right_info))) { - if (update_stat) - unlink_free_space(ctl, right_info); - else - __unlink_free_space(ctl, right_info); + unlink_free_space(ctl, right_info, update_stat); info->bytes += right_info->bytes; kmem_cache_free(btrfs_free_space_cachep, right_info); merged = true; @@ -2323,10 +2423,7 @@ static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl, if (left_info && !left_info->bitmap && left_info->offset + left_info->bytes == offset && (!is_trimmed || btrfs_free_space_trimmed(left_info))) { - if (update_stat) - unlink_free_space(ctl, left_info); - else - __unlink_free_space(ctl, left_info); + unlink_free_space(ctl, left_info, update_stat); info->offset = left_info->offset; info->bytes += left_info->bytes; kmem_cache_free(btrfs_free_space_cachep, left_info); @@ -2362,10 +2459,7 @@ static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl, if (!btrfs_free_space_trimmed(bitmap)) info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; - if (update_stat) - bitmap_clear_bits(ctl, bitmap, end, bytes); - else - __bitmap_clear_bits(ctl, bitmap, end, bytes); + bitmap_clear_bits(ctl, bitmap, end, bytes, update_stat); if (!bitmap->bytes) free_bitmap(ctl, bitmap); @@ -2419,10 +2513,7 @@ static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl, if (!btrfs_free_space_trimmed(bitmap)) info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; - if (update_stat) - bitmap_clear_bits(ctl, bitmap, info->offset, bytes); - else - __bitmap_clear_bits(ctl, bitmap, info->offset, bytes); + bitmap_clear_bits(ctl, bitmap, info->offset, bytes, update_stat); if (!bitmap->bytes) free_bitmap(ctl, bitmap); @@ -2466,12 +2557,12 @@ static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl, } } -int __btrfs_add_free_space(struct btrfs_fs_info *fs_info, - struct btrfs_free_space_ctl *ctl, +int __btrfs_add_free_space(struct btrfs_block_group *block_group, u64 offset, u64 bytes, enum btrfs_trim_state trim_state) { - struct btrfs_block_group *block_group = ctl->private; + struct btrfs_fs_info *fs_info = block_group->fs_info; + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *info; int ret = 0; u64 filter_bytes = bytes; @@ -2486,6 +2577,7 @@ int __btrfs_add_free_space(struct btrfs_fs_info *fs_info, info->bytes = bytes; info->trim_state = trim_state; RB_CLEAR_NODE(&info->offset_index); + RB_CLEAR_NODE(&info->bytes_index); spin_lock(&ctl->tree_lock); @@ -2602,9 +2694,7 @@ int btrfs_add_free_space(struct btrfs_block_group *block_group, if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC)) trim_state = BTRFS_TRIM_STATE_TRIMMED; - return __btrfs_add_free_space(block_group->fs_info, - block_group->free_space_ctl, - bytenr, size, trim_state); + return __btrfs_add_free_space(block_group, bytenr, size, trim_state); } int btrfs_add_free_space_unused(struct btrfs_block_group *block_group, @@ -2635,9 +2725,7 @@ int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group, btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC)) trim_state = BTRFS_TRIM_STATE_TRIMMED; - return __btrfs_add_free_space(block_group->fs_info, - block_group->free_space_ctl, - bytenr, size, trim_state); + return __btrfs_add_free_space(block_group, bytenr, size, trim_state); } int btrfs_remove_free_space(struct btrfs_block_group *block_group, @@ -2696,7 +2784,7 @@ again: re_search = false; if (!info->bitmap) { - unlink_free_space(ctl, info); + unlink_free_space(ctl, info, true); if (offset == info->offset) { 
u64 to_free = min(bytes, info->bytes); @@ -2732,7 +2820,7 @@ again: } spin_unlock(&ctl->tree_lock); - ret = __btrfs_add_free_space(block_group->fs_info, ctl, + ret = __btrfs_add_free_space(block_group, offset + bytes, old_end - (offset + bytes), info->trim_state); @@ -2797,8 +2885,9 @@ void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group, spin_lock_init(&ctl->tree_lock); ctl->unit = fs_info->sectorsize; ctl->start = block_group->start; - ctl->private = block_group; + ctl->block_group = block_group; ctl->op = &free_space_op; + ctl->free_space_bytes = RB_ROOT_CACHED; INIT_LIST_HEAD(&ctl->trimming_ranges); mutex_init(&ctl->cache_writeout_mutex); @@ -2864,6 +2953,8 @@ static void __btrfs_return_cluster_to_free_space( } tree_insert_offset(&ctl->free_space_offset, entry->offset, &entry->offset_index, bitmap); + rb_add_cached(&entry->bytes_index, &ctl->free_space_bytes, + entry_less); } cluster->root = RB_ROOT; spin_unlock(&cluster->lock); @@ -2879,7 +2970,7 @@ static void __btrfs_remove_free_space_cache_locked( while ((node = rb_last(&ctl->free_space_offset)) != NULL) { info = rb_entry(node, struct btrfs_free_space, offset_index); if (!info->bitmap) { - unlink_free_space(ctl, info); + unlink_free_space(ctl, info, true); kmem_cache_free(btrfs_free_space_cachep, info); } else { free_bitmap(ctl, info); @@ -2893,8 +2984,8 @@ void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl) { spin_lock(&ctl->tree_lock); __btrfs_remove_free_space_cache_locked(ctl); - if (ctl->private) - btrfs_discard_update_discardable(ctl->private); + if (ctl->block_group) + btrfs_discard_update_discardable(ctl->block_group); spin_unlock(&ctl->tree_lock); } @@ -2965,18 +3056,20 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group, u64 align_gap = 0; u64 align_gap_len = 0; enum btrfs_trim_state align_gap_trim_state = BTRFS_TRIM_STATE_UNTRIMMED; + bool use_bytes_index = (offset == block_group->start); ASSERT(!btrfs_is_zoned(block_group->fs_info)); spin_lock(&ctl->tree_lock); entry = find_free_space(ctl, &offset, &bytes_search, - block_group->full_stripe_len, max_extent_size); + block_group->full_stripe_len, max_extent_size, + use_bytes_index); if (!entry) goto out; ret = offset; if (entry->bitmap) { - bitmap_clear_bits(ctl, entry, offset, bytes); + bitmap_clear_bits(ctl, entry, offset, bytes, true); if (!btrfs_free_space_trimmed(entry)) atomic64_add(bytes, &discard_ctl->discard_bytes_saved); @@ -2984,7 +3077,7 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group, if (!entry->bytes) free_bitmap(ctl, entry); } else { - unlink_free_space(ctl, entry); + unlink_free_space(ctl, entry, true); align_gap_len = offset - entry->offset; align_gap = entry->offset; align_gap_trim_state = entry->trim_state; @@ -3006,8 +3099,7 @@ out: spin_unlock(&ctl->tree_lock); if (align_gap_len) - __btrfs_add_free_space(block_group->fs_info, ctl, - align_gap, align_gap_len, + __btrfs_add_free_space(block_group, align_gap, align_gap_len, align_gap_trim_state); return ret; } @@ -3078,7 +3170,7 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group *block_group, } ret = search_start; - __bitmap_clear_bits(ctl, entry, ret, bytes); + bitmap_clear_bits(ctl, entry, ret, bytes, false); return ret; } @@ -3254,6 +3346,17 @@ again: cluster->window_start = start * ctl->unit + entry->offset; rb_erase(&entry->offset_index, &ctl->free_space_offset); + rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes); + + /* + * We need to know if we're currently on the normal space index when we + * 
manipulate the bitmap so that we know we need to remove and re-insert + * it into the space_index tree. Clear the bytes_index node here so the + * bitmap manipulation helpers know not to mess with the space_index + * until this bitmap entry is added back into the normal cache. + */ + RB_CLEAR_NODE(&entry->bytes_index); + ret = tree_insert_offset(&cluster->root, entry->offset, &entry->offset_index, 1); ASSERT(!ret); /* -EEXIST; Logic error */ @@ -3344,6 +3447,7 @@ setup_cluster_no_bitmap(struct btrfs_block_group *block_group, continue; rb_erase(&entry->offset_index, &ctl->free_space_offset); + rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes); ret = tree_insert_offset(&cluster->root, entry->offset, &entry->offset_index, 0); total_size += entry->bytes; @@ -3535,13 +3639,13 @@ static int do_trimming(struct btrfs_block_group *block_group, mutex_lock(&ctl->cache_writeout_mutex); if (reserved_start < start) - __btrfs_add_free_space(fs_info, ctl, reserved_start, + __btrfs_add_free_space(block_group, reserved_start, start - reserved_start, reserved_trim_state); if (start + bytes < reserved_start + reserved_bytes) - __btrfs_add_free_space(fs_info, ctl, end, reserved_end - end, + __btrfs_add_free_space(block_group, end, reserved_end - end, reserved_trim_state); - __btrfs_add_free_space(fs_info, ctl, start, bytes, trim_state); + __btrfs_add_free_space(block_group, start, bytes, trim_state); list_del(&trim_entry->list); mutex_unlock(&ctl->cache_writeout_mutex); @@ -3615,7 +3719,7 @@ static int trim_no_bitmap(struct btrfs_block_group *block_group, mutex_unlock(&ctl->cache_writeout_mutex); goto next; } - unlink_free_space(ctl, entry); + unlink_free_space(ctl, entry, true); /* * Let bytes = BTRFS_MAX_DISCARD_SIZE + X. * If X < BTRFS_ASYNC_DISCARD_MIN_FILTER, we won't trim @@ -3641,7 +3745,7 @@ static int trim_no_bitmap(struct btrfs_block_group *block_group, goto next; } - unlink_free_space(ctl, entry); + unlink_free_space(ctl, entry, true); kmem_cache_free(btrfs_free_space_cachep, entry); } @@ -3828,7 +3932,7 @@ static int trim_bitmaps(struct btrfs_block_group *block_group, bytes > (max_discard_size + minlen)) bytes = max_discard_size; - bitmap_clear_bits(ctl, entry, start, bytes); + bitmap_clear_bits(ctl, entry, start, bytes, true); if (entry->bytes == 0) free_bitmap(ctl, entry); diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index 1f23088d43f9..15591b299895 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -22,6 +22,7 @@ enum btrfs_trim_state { struct btrfs_free_space { struct rb_node offset_index; + struct rb_node bytes_index; u64 offset; u64 bytes; u64 max_extent_size; @@ -45,6 +46,7 @@ static inline bool btrfs_free_space_trimming_bitmap( struct btrfs_free_space_ctl { spinlock_t tree_lock; struct rb_root free_space_offset; + struct rb_root_cached free_space_bytes; u64 free_space; int extents_thresh; int free_extents; @@ -54,7 +56,7 @@ struct btrfs_free_space_ctl { s32 discardable_extents[BTRFS_STAT_NR_ENTRIES]; s64 discardable_bytes[BTRFS_STAT_NR_ENTRIES]; const struct btrfs_free_space_op *op; - void *private; + struct btrfs_block_group *block_group; struct mutex cache_writeout_mutex; struct list_head trimming_ranges; }; @@ -101,10 +103,8 @@ int btrfs_write_out_cache(struct btrfs_trans_handle *trans, void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group, struct btrfs_free_space_ctl *ctl); -int __btrfs_add_free_space(struct btrfs_fs_info *fs_info, - struct btrfs_free_space_ctl *ctl, - u64 bytenr, u64 size, - enum 
btrfs_trim_state trim_state); +int __btrfs_add_free_space(struct btrfs_block_group *block_group, u64 bytenr, + u64 size, enum btrfs_trim_state trim_state); int btrfs_add_free_space(struct btrfs_block_group *block_group, u64 bytenr, u64 size); int btrfs_add_free_space_unused(struct btrfs_block_group *block_group, diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 3abec44c6255..655aad0f9e1c 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -16,6 +16,18 @@ static int __add_block_group_free_space(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, struct btrfs_path *path); +static struct btrfs_root *btrfs_free_space_root( + struct btrfs_block_group *block_group) +{ + struct btrfs_key key = { + .objectid = BTRFS_FREE_SPACE_TREE_OBJECTID, + .type = BTRFS_ROOT_ITEM_KEY, + .offset = 0, + }; + + return btrfs_global_root(block_group->fs_info, &key); +} + void set_free_space_tree_thresholds(struct btrfs_block_group *cache) { u32 bitmap_range; @@ -51,7 +63,7 @@ static int add_new_free_space_info(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, struct btrfs_path *path) { - struct btrfs_root *root = trans->fs_info->free_space_root; + struct btrfs_root *root = btrfs_free_space_root(block_group); struct btrfs_free_space_info *info; struct btrfs_key key; struct extent_buffer *leaf; @@ -85,7 +97,7 @@ struct btrfs_free_space_info *search_free_space_info( struct btrfs_path *path, int cow) { struct btrfs_fs_info *fs_info = block_group->fs_info; - struct btrfs_root *root = fs_info->free_space_root; + struct btrfs_root *root = btrfs_free_space_root(block_group); struct btrfs_key key; int ret; @@ -188,7 +200,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, struct btrfs_path *path) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_root *root = fs_info->free_space_root; + struct btrfs_root *root = btrfs_free_space_root(block_group); struct btrfs_free_space_info *info; struct btrfs_key key, found_key; struct extent_buffer *leaf; @@ -326,7 +338,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, struct btrfs_path *path) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_root *root = fs_info->free_space_root; + struct btrfs_root *root = btrfs_free_space_root(block_group); struct btrfs_free_space_info *info; struct btrfs_key key, found_key; struct extent_buffer *leaf; @@ -586,7 +598,7 @@ static int modify_free_space_bitmap(struct btrfs_trans_handle *trans, struct btrfs_path *path, u64 start, u64 size, int remove) { - struct btrfs_root *root = block_group->fs_info->free_space_root; + struct btrfs_root *root = btrfs_free_space_root(block_group); struct btrfs_key key; u64 end = start + size; u64 cur_start, cur_size; @@ -699,7 +711,7 @@ static int remove_free_space_extent(struct btrfs_trans_handle *trans, struct btrfs_path *path, u64 start, u64 size) { - struct btrfs_root *root = trans->fs_info->free_space_root; + struct btrfs_root *root = btrfs_free_space_root(block_group); struct btrfs_key key; u64 found_start, found_end; u64 end = start + size; @@ -851,7 +863,7 @@ static int add_free_space_extent(struct btrfs_trans_handle *trans, struct btrfs_path *path, u64 start, u64 size) { - struct btrfs_root *root = trans->fs_info->free_space_root; + struct btrfs_root *root = btrfs_free_space_root(block_group); struct btrfs_key key, new_key; u64 found_start, found_end; u64 end = start + size; @@ -1046,7 +1058,7 @@ out: static int populate_free_space_tree(struct 
btrfs_trans_handle *trans, struct btrfs_block_group *block_group) { - struct btrfs_root *extent_root = trans->fs_info->extent_root; + struct btrfs_root *extent_root; struct btrfs_path *path, *path2; struct btrfs_key key; u64 start, end; @@ -1080,6 +1092,7 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans, key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = 0; + extent_root = btrfs_extent_root(trans->fs_info, key.objectid); ret = btrfs_search_slot_for_read(extent_root, &key, path, 1, 0); if (ret < 0) goto out_locked; @@ -1157,7 +1170,11 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info) ret = PTR_ERR(free_space_root); goto abort; } - fs_info->free_space_root = free_space_root; + ret = btrfs_global_root_insert(free_space_root); + if (ret) { + btrfs_put_root(free_space_root); + goto abort; + } node = rb_first(&fs_info->block_group_cache_tree); while (node) { @@ -1232,7 +1249,12 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info) { struct btrfs_trans_handle *trans; struct btrfs_root *tree_root = fs_info->tree_root; - struct btrfs_root *free_space_root = fs_info->free_space_root; + struct btrfs_key key = { + .objectid = BTRFS_FREE_SPACE_TREE_OBJECTID, + .type = BTRFS_ROOT_ITEM_KEY, + .offset = 0, + }; + struct btrfs_root *free_space_root = btrfs_global_root(fs_info, &key); int ret; trans = btrfs_start_transaction(tree_root, 0); @@ -1241,7 +1263,6 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info) btrfs_clear_fs_compat_ro(fs_info, FREE_SPACE_TREE); btrfs_clear_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID); - fs_info->free_space_root = NULL; ret = clear_free_space_tree(trans, free_space_root); if (ret) @@ -1251,6 +1272,7 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info) if (ret) goto abort; + btrfs_global_root_delete(free_space_root); list_del(&free_space_root->dirty_list); btrfs_tree_lock(free_space_root->node); @@ -1319,7 +1341,7 @@ out: int remove_block_group_free_space(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group) { - struct btrfs_root *root = trans->fs_info->free_space_root; + struct btrfs_root *root = btrfs_free_space_root(block_group); struct btrfs_path *path; struct btrfs_key key, found_key; struct extent_buffer *leaf; @@ -1410,7 +1432,7 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl, block_group = caching_ctl->block_group; fs_info = block_group->fs_info; - root = fs_info->free_space_root; + root = btrfs_free_space_root(block_group); end = block_group->start + block_group->length; @@ -1488,7 +1510,7 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl, block_group = caching_ctl->block_group; fs_info = block_group->fs_info; - root = fs_info->free_space_root; + root = btrfs_free_space_root(block_group); end = block_group->start + block_group->length; diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 37f36ffdaf6b..0eeb5ea87894 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -4,6 +4,7 @@ */ #include "ctree.h" +#include "inode-item.h" #include "disk-io.h" #include "transaction.h" #include "print-tree.h" @@ -19,7 +20,7 @@ struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, u32 cur_offset = 0; int len; - item_size = btrfs_item_size_nr(leaf, slot); + item_size = btrfs_item_size(leaf, slot); ptr = btrfs_item_ptr_offset(leaf, slot); while (cur_offset < item_size) { ref = (struct btrfs_inode_ref *)(ptr + cur_offset); @@ -45,7 +46,7 @@ struct btrfs_inode_extref 
*btrfs_find_name_in_ext_backref( u32 cur_offset = 0; int ref_name_len; - item_size = btrfs_item_size_nr(leaf, slot); + item_size = btrfs_item_size(leaf, slot); ptr = btrfs_item_ptr_offset(leaf, slot); /* @@ -139,7 +140,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, } leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); + item_size = btrfs_item_size(leaf, path->slots[0]); if (index) *index = btrfs_inode_extref_index(leaf, extref); @@ -208,7 +209,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, goto out; } leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); + item_size = btrfs_item_size(leaf, path->slots[0]); if (index) *index = btrfs_inode_ref_index(leaf, ref); @@ -256,7 +257,6 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, struct btrfs_path *path; struct btrfs_key key; struct extent_buffer *leaf; - struct btrfs_item *item; key.objectid = inode_objectid; key.type = BTRFS_INODE_EXTREF_KEY; @@ -282,9 +282,8 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, goto out; leaf = path->nodes[0]; - item = btrfs_item_nr(path->slots[0]); ptr = (unsigned long)btrfs_item_ptr(leaf, path->slots[0], char); - ptr += btrfs_item_size(leaf, item) - ins_len; + ptr += btrfs_item_size(leaf, path->slots[0]) - ins_len; extref = (struct btrfs_inode_extref *)ptr; btrfs_set_inode_extref_name_len(path->nodes[0], extref, name_len); @@ -332,7 +331,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, if (ref) goto out; - old_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); + old_size = btrfs_item_size(path->nodes[0], path->slots[0]); btrfs_extend_item(path, ins_len); ref = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_ref); @@ -419,3 +418,332 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root } return ret; } + +static inline void btrfs_trace_truncate(struct btrfs_inode *inode, + struct extent_buffer *leaf, + struct btrfs_file_extent_item *fi, + u64 offset, int extent_type, int slot) +{ + if (!inode) + return; + if (extent_type == BTRFS_FILE_EXTENT_INLINE) + trace_btrfs_truncate_show_fi_inline(inode, leaf, fi, slot, + offset); + else + trace_btrfs_truncate_show_fi_regular(inode, leaf, fi, offset); +} + +/* + * Remove inode items from a given root. + * + * @trans: A transaction handle. + * @root: The root from which to remove items. + * @inode: The inode whose items we want to remove. + * @control: The btrfs_truncate_control to control how and what we + * are truncating. + * + * Remove all keys associated with the inode from the given root that have a key + * with a type greater than or equals to @min_type. When @min_type has a value of + * BTRFS_EXTENT_DATA_KEY, only remove file extent items that have an offset value + * greater than or equals to @new_size. If a file extent item that starts before + * @new_size and ends after it is found, its length is adjusted. + * + * Returns: 0 on success, < 0 on error and NEED_TRUNCATE_BLOCK when @min_type is + * BTRFS_EXTENT_DATA_KEY and the caller must truncate the last block. 
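+ *
+ * A typical caller fills a struct btrfs_truncate_control on the stack and
+ * applies the OUT members itself. A sketch modelled on the free space
+ * cache caller updated elsewhere in this patch, given a struct
+ * btrfs_inode *inode:
+ *
+ *	struct btrfs_truncate_control control = {
+ *		.inode = inode,
+ *		.new_size = 0,
+ *		.ino = btrfs_ino(inode),
+ *		.min_type = BTRFS_EXTENT_DATA_KEY,
+ *		.clear_extent_range = true,
+ *	};
+ *
+ *	ret = btrfs_truncate_inode_items(trans, root, &control);
+ *	inode_sub_bytes(&inode->vfs_inode, control.sub_bytes);
+ *	btrfs_inode_safe_disk_i_size_write(inode, control.last_size);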
+ */ +int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_truncate_control *control) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_file_extent_item *fi; + struct btrfs_key key; + struct btrfs_key found_key; + u64 new_size = control->new_size; + u64 extent_num_bytes = 0; + u64 extent_offset = 0; + u64 item_end = 0; + u32 found_type = (u8)-1; + int del_item; + int pending_del_nr = 0; + int pending_del_slot = 0; + int extent_type = -1; + int ret; + u64 bytes_deleted = 0; + bool be_nice = false; + + ASSERT(control->inode || !control->clear_extent_range); + ASSERT(new_size == 0 || control->min_type == BTRFS_EXTENT_DATA_KEY); + + control->last_size = new_size; + control->sub_bytes = 0; + + /* + * For shareable roots we want to back off from time to time, this turns + * out to be subvolume roots, reloc roots, and data reloc roots. + */ + if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) + be_nice = true; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->reada = READA_BACK; + + key.objectid = control->ino; + key.offset = (u64)-1; + key.type = (u8)-1; + +search_again: + /* + * With a 16K leaf size and 128MiB extents, you can actually queue up a + * huge file in a single leaf. Most of the time that bytes_deleted is + * > 0, it will be huge by the time we get here + */ + if (be_nice && bytes_deleted > SZ_32M && + btrfs_should_end_transaction(trans)) { + ret = -EAGAIN; + goto out; + } + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) + goto out; + + if (ret > 0) { + ret = 0; + /* There are no items in the tree for us to truncate, we're done */ + if (path->slots[0] == 0) + goto out; + path->slots[0]--; + } + + while (1) { + u64 clear_start = 0, clear_len = 0, extent_start = 0; + bool should_throttle = false; + + fi = NULL; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + found_type = found_key.type; + + if (found_key.objectid != control->ino) + break; + + if (found_type < control->min_type) + break; + + item_end = found_key.offset; + if (found_type == BTRFS_EXTENT_DATA_KEY) { + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + extent_type = btrfs_file_extent_type(leaf, fi); + if (extent_type != BTRFS_FILE_EXTENT_INLINE) + item_end += + btrfs_file_extent_num_bytes(leaf, fi); + else if (extent_type == BTRFS_FILE_EXTENT_INLINE) + item_end += btrfs_file_extent_ram_bytes(leaf, fi); + + btrfs_trace_truncate(control->inode, leaf, fi, + found_key.offset, extent_type, + path->slots[0]); + item_end--; + } + if (found_type > control->min_type) { + del_item = 1; + } else { + if (item_end < new_size) + break; + if (found_key.offset >= new_size) + del_item = 1; + else + del_item = 0; + } + + /* FIXME, shrink the extent if the ref count is only 1 */ + if (found_type != BTRFS_EXTENT_DATA_KEY) + goto delete; + + control->extents_found++; + + if (extent_type != BTRFS_FILE_EXTENT_INLINE) { + u64 num_dec; + + clear_start = found_key.offset; + extent_start = btrfs_file_extent_disk_bytenr(leaf, fi); + if (!del_item) { + u64 orig_num_bytes = + btrfs_file_extent_num_bytes(leaf, fi); + extent_num_bytes = ALIGN(new_size - + found_key.offset, + fs_info->sectorsize); + clear_start = ALIGN(new_size, fs_info->sectorsize); + + btrfs_set_file_extent_num_bytes(leaf, fi, + extent_num_bytes); + num_dec = (orig_num_bytes - extent_num_bytes); + if (extent_start != 0) + control->sub_bytes += num_dec; 
+ btrfs_mark_buffer_dirty(leaf); + } else { + extent_num_bytes = + btrfs_file_extent_disk_num_bytes(leaf, fi); + extent_offset = found_key.offset - + btrfs_file_extent_offset(leaf, fi); + + /* FIXME blocksize != 4096 */ + num_dec = btrfs_file_extent_num_bytes(leaf, fi); + if (extent_start != 0) + control->sub_bytes += num_dec; + } + clear_len = num_dec; + } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + /* + * We can't truncate inline items that have had + * special encodings + */ + if (!del_item && + btrfs_file_extent_encryption(leaf, fi) == 0 && + btrfs_file_extent_other_encoding(leaf, fi) == 0 && + btrfs_file_extent_compression(leaf, fi) == 0) { + u32 size = (u32)(new_size - found_key.offset); + + btrfs_set_file_extent_ram_bytes(leaf, fi, size); + size = btrfs_file_extent_calc_inline_size(size); + btrfs_truncate_item(path, size, 1); + } else if (!del_item) { + /* + * We have to bail so the last_size is set to + * just before this extent. + */ + ret = BTRFS_NEED_TRUNCATE_BLOCK; + break; + } else { + /* + * Inline extents are special, we just treat + * them as a full sector worth in the file + * extent tree just for simplicity sake. + */ + clear_len = fs_info->sectorsize; + } + + control->sub_bytes += item_end + 1 - new_size; + } +delete: + /* + * We only want to clear the file extent range if we're + * modifying the actual inode's mapping, which is just the + * normal truncate path. + */ + if (control->clear_extent_range) { + ret = btrfs_inode_clear_file_extent_range(control->inode, + clear_start, clear_len); + if (ret) { + btrfs_abort_transaction(trans, ret); + break; + } + } + + if (del_item) { + ASSERT(!pending_del_nr || + ((path->slots[0] + 1) == pending_del_slot)); + + control->last_size = found_key.offset; + if (!pending_del_nr) { + /* No pending yet, add ourselves */ + pending_del_slot = path->slots[0]; + pending_del_nr = 1; + } else if (pending_del_nr && + path->slots[0] + 1 == pending_del_slot) { + /* Hop on the pending chunk */ + pending_del_nr++; + pending_del_slot = path->slots[0]; + } + } else { + control->last_size = new_size; + break; + } + + if (del_item && extent_start != 0 && !control->skip_ref_updates) { + struct btrfs_ref ref = { 0 }; + + bytes_deleted += extent_num_bytes; + + btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, + extent_start, extent_num_bytes, 0); + btrfs_init_data_ref(&ref, btrfs_header_owner(leaf), + control->ino, extent_offset, + root->root_key.objectid, false); + ret = btrfs_free_extent(trans, &ref); + if (ret) { + btrfs_abort_transaction(trans, ret); + break; + } + if (be_nice) { + if (btrfs_should_throttle_delayed_refs(trans)) + should_throttle = true; + } + } + + if (found_type == BTRFS_INODE_ITEM_KEY) + break; + + if (path->slots[0] == 0 || + path->slots[0] != pending_del_slot || + should_throttle) { + if (pending_del_nr) { + ret = btrfs_del_items(trans, root, path, + pending_del_slot, + pending_del_nr); + if (ret) { + btrfs_abort_transaction(trans, ret); + break; + } + pending_del_nr = 0; + } + btrfs_release_path(path); + + /* + * We can generate a lot of delayed refs, so we need to + * throttle every once and a while and make sure we're + * adding enough space to keep up with the work we are + * generating. Since we hold a transaction here we + * can't flush, and we don't want to FLUSH_LIMIT because + * we could have generated too many delayed refs to + * actually allocate, so just bail if we're short and + * let the normal reservation dance happen higher up. 
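+ * A failed BTRFS_RESERVE_NO_FLUSH refill below is reported as -EAGAIN,
+ * the same value used when a shareable root has deleted enough to want
+ * the transaction ended, so the caller can back off and retry.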
+ */ + if (should_throttle) { + ret = btrfs_delayed_refs_rsv_refill(fs_info, + BTRFS_RESERVE_NO_FLUSH); + if (ret) { + ret = -EAGAIN; + break; + } + } + goto search_again; + } else { + path->slots[0]--; + } + } +out: + if (ret >= 0 && pending_del_nr) { + int err; + + err = btrfs_del_items(trans, root, path, pending_del_slot, + pending_del_nr); + if (err) { + btrfs_abort_transaction(trans, err); + ret = err; + } + } + + ASSERT(control->last_size >= new_size); + if (!ret && control->last_size > new_size) + control->last_size = new_size; + + btrfs_free_path(path); + return ret; +} diff --git a/fs/btrfs/inode-item.h b/fs/btrfs/inode-item.h new file mode 100644 index 000000000000..a8fc16d0147f --- /dev/null +++ b/fs/btrfs/inode-item.h @@ -0,0 +1,96 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_INODE_ITEM_H +#define BTRFS_INODE_ITEM_H + +#include <linux/types.h> + +struct btrfs_trans_handle; +struct btrfs_root; +struct btrfs_path; +struct btrfs_key; +struct btrfs_inode_extref; +struct btrfs_inode; +struct extent_buffer; + +/* + * Return this if we need to call truncate_block for the last bit of the + * truncate. + */ +#define BTRFS_NEED_TRUNCATE_BLOCK 1 + +struct btrfs_truncate_control { + /* + * IN: the inode we're operating on, this can be NULL if + * ->clear_extent_range is false. + */ + struct btrfs_inode *inode; + + /* IN: the size we're truncating to. */ + u64 new_size; + + /* OUT: the number of extents truncated. */ + u64 extents_found; + + /* OUT: the last size we truncated this inode to. */ + u64 last_size; + + /* OUT: the number of bytes to sub from this inode. */ + u64 sub_bytes; + + /* IN: the ino we are truncating. */ + u64 ino; + + /* + * IN: minimum key type to remove. All key types with this type are + * removed only if their offset >= new_size. + */ + u32 min_type; + + /* + * IN: true if we don't want to do extent reference updates for any file + * extents we drop. + */ + bool skip_ref_updates; + + /* + * IN: true if we need to clear the file extent range for the inode as + * we drop the file extent items. 
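+ * Presumably the tree-log callers pass false here, preserving the old
+ * behaviour where btrfs_truncate_inode_items() only cleared the range
+ * when the root being truncated was the inode's own root, never a log
+ * tree.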
+ */ + bool clear_extent_range; +}; + +int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_truncate_control *control); +int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, u64 index); +int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, u64 *index); +int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid); +int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, + struct btrfs_key *location, int mod); + +struct btrfs_inode_extref *btrfs_lookup_inode_extref( + struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, int ins_len, + int cow); + +struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, + int slot, const char *name, + int name_len); +struct btrfs_inode_extref *btrfs_find_name_in_ext_backref( + struct extent_buffer *leaf, int slot, u64 ref_objectid, + const char *name, int name_len); + +#endif diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b8c911a4a320..3b2403b6127f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -54,6 +54,7 @@ #include "space-info.h" #include "zoned.h" #include "subpage.h" +#include "inode-item.h" struct btrfs_iget_args { u64 ino; @@ -61,8 +62,6 @@ struct btrfs_iget_args { }; struct btrfs_dio_data { - u64 reserve; - loff_t length; ssize_t submitted; struct extent_changeset *data_reserved; }; @@ -1532,11 +1531,12 @@ static noinline int run_delalloc_zoned(struct btrfs_inode *inode, static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes) { - int ret; + struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bytenr); struct btrfs_ordered_sum *sums; + int ret; LIST_HEAD(list); - ret = btrfs_lookup_csums_range(fs_info->csum_root, bytenr, + ret = btrfs_lookup_csums_range(csum_root, bytenr, bytenr + num_bytes - 1, &list, 0); if (ret == 0 && list_empty(&list)) return 0; @@ -2518,7 +2518,7 @@ blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio, int async = !atomic_read(&BTRFS_I(inode)->sync_writers); skip_sum = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) || - !fs_info->csum_root; + test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state); if (btrfs_is_free_space_inode(BTRFS_I(inode))) metadata = BTRFS_WQ_ENDIO_FREE_SPACE; @@ -2586,11 +2586,15 @@ static int add_pending_csums(struct btrfs_trans_handle *trans, struct list_head *list) { struct btrfs_ordered_sum *sum; + struct btrfs_root *csum_root = NULL; int ret; list_for_each_entry(sum, list, list) { trans->adding_csums = true; - ret = btrfs_csum_file_blocks(trans, trans->fs_info->csum_root, sum); + if (!csum_root) + csum_root = btrfs_csum_root(trans->fs_info, + sum->bytenr); + ret = btrfs_csum_file_blocks(trans, csum_root, sum); trans->adding_csums = false; if (ret) return ret; @@ -3316,7 +3320,7 @@ unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) return 0; - if (!root->fs_info->csum_root) + if (unlikely(test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state))) return 0; ASSERT(page_offset(page) <= start && @@ -3477,7 +3481,7 @@ int btrfs_orphan_cleanup(struct 
btrfs_root *root) u64 last_objectid = 0; int ret = 0, nr_unlink = 0; - if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED)) + if (test_and_set_bit(BTRFS_ROOT_ORPHAN_CLEANUP, &root->state)) return 0; path = btrfs_alloc_path(); @@ -3635,8 +3639,6 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) /* release the path since we're done with it */ btrfs_release_path(path); - root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE; - if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) { trans = btrfs_join_transaction(root); if (!IS_ERR(trans)) @@ -4615,389 +4617,6 @@ out: } /* - * Return this if we need to call truncate_block for the last bit of the - * truncate. - */ -#define NEED_TRUNCATE_BLOCK 1 - -/* - * Remove inode items from a given root. - * - * @trans: A transaction handle. - * @root: The root from which to remove items. - * @inode: The inode whose items we want to remove. - * @new_size: The new i_size for the inode. This is only applicable when - * @min_type is BTRFS_EXTENT_DATA_KEY, must be 0 otherwise. - * @min_type: The minimum key type to remove. All keys with a type - * greater than this value are removed and all keys with - * this type are removed only if their offset is >= @new_size. - * @extents_found: Output parameter that will contain the number of file - * extent items that were removed or adjusted to the new - * inode i_size. The caller is responsible for initializing - * the counter. Also, it can be NULL if the caller does not - * need this counter. - * - * Remove all keys associated with the inode from the given root that have a key - * with a type greater than or equals to @min_type. When @min_type has a value of - * BTRFS_EXTENT_DATA_KEY, only remove file extent items that have an offset value - * greater than or equals to @new_size. If a file extent item that starts before - * @new_size and ends after it is found, its length is adjusted. - * - * Returns: 0 on success, < 0 on error and NEED_TRUNCATE_BLOCK when @min_type is - * BTRFS_EXTENT_DATA_KEY and the caller must truncate the last block. - */ -int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_inode *inode, - u64 new_size, u32 min_type, - u64 *extents_found) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_path *path; - struct extent_buffer *leaf; - struct btrfs_file_extent_item *fi; - struct btrfs_key key; - struct btrfs_key found_key; - u64 extent_start = 0; - u64 extent_num_bytes = 0; - u64 extent_offset = 0; - u64 item_end = 0; - u64 last_size = new_size; - u32 found_type = (u8)-1; - int found_extent; - int del_item; - int pending_del_nr = 0; - int pending_del_slot = 0; - int extent_type = -1; - int ret; - u64 ino = btrfs_ino(inode); - u64 bytes_deleted = 0; - bool be_nice = false; - bool should_throttle = false; - const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize); - struct extent_state *cached_state = NULL; - - BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY); - - /* - * For non-free space inodes and non-shareable roots, we want to back - * off from time to time. This means all inodes in subvolume roots, - * reloc roots, and data reloc roots. 
- */ - if (!btrfs_is_free_space_inode(inode) && - test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) - be_nice = true; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - path->reada = READA_BACK; - - if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { - lock_extent_bits(&inode->io_tree, lock_start, (u64)-1, - &cached_state); - - /* - * We want to drop from the next block forward in case this - * new size is not block aligned since we will be keeping the - * last block of the extent just the way it is. - */ - btrfs_drop_extent_cache(inode, ALIGN(new_size, - fs_info->sectorsize), - (u64)-1, 0); - } - - /* - * This function is also used to drop the items in the log tree before - * we relog the inode, so if root != BTRFS_I(inode)->root, it means - * it is used to drop the logged items. So we shouldn't kill the delayed - * items. - */ - if (min_type == 0 && root == inode->root) - btrfs_kill_delayed_inode_items(inode); - - key.objectid = ino; - key.offset = (u64)-1; - key.type = (u8)-1; - -search_again: - /* - * with a 16K leaf size and 128MB extents, you can actually queue - * up a huge file in a single leaf. Most of the time that - * bytes_deleted is > 0, it will be huge by the time we get here - */ - if (be_nice && bytes_deleted > SZ_32M && - btrfs_should_end_transaction(trans)) { - ret = -EAGAIN; - goto out; - } - - ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - if (ret < 0) - goto out; - - if (ret > 0) { - ret = 0; - /* there are no items in the tree for us to truncate, we're - * done - */ - if (path->slots[0] == 0) - goto out; - path->slots[0]--; - } - - while (1) { - u64 clear_start = 0, clear_len = 0; - - fi = NULL; - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - found_type = found_key.type; - - if (found_key.objectid != ino) - break; - - if (found_type < min_type) - break; - - item_end = found_key.offset; - if (found_type == BTRFS_EXTENT_DATA_KEY) { - fi = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - extent_type = btrfs_file_extent_type(leaf, fi); - if (extent_type != BTRFS_FILE_EXTENT_INLINE) { - item_end += - btrfs_file_extent_num_bytes(leaf, fi); - - trace_btrfs_truncate_show_fi_regular( - inode, leaf, fi, found_key.offset); - } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { - item_end += btrfs_file_extent_ram_bytes(leaf, - fi); - - trace_btrfs_truncate_show_fi_inline( - inode, leaf, fi, path->slots[0], - found_key.offset); - } - item_end--; - } - if (found_type > min_type) { - del_item = 1; - } else { - if (item_end < new_size) - break; - if (found_key.offset >= new_size) - del_item = 1; - else - del_item = 0; - } - found_extent = 0; - /* FIXME, shrink the extent if the ref count is only 1 */ - if (found_type != BTRFS_EXTENT_DATA_KEY) - goto delete; - - if (extents_found != NULL) - (*extents_found)++; - - if (extent_type != BTRFS_FILE_EXTENT_INLINE) { - u64 num_dec; - - clear_start = found_key.offset; - extent_start = btrfs_file_extent_disk_bytenr(leaf, fi); - if (!del_item) { - u64 orig_num_bytes = - btrfs_file_extent_num_bytes(leaf, fi); - extent_num_bytes = ALIGN(new_size - - found_key.offset, - fs_info->sectorsize); - clear_start = ALIGN(new_size, fs_info->sectorsize); - btrfs_set_file_extent_num_bytes(leaf, fi, - extent_num_bytes); - num_dec = (orig_num_bytes - - extent_num_bytes); - if (test_bit(BTRFS_ROOT_SHAREABLE, - &root->state) && - extent_start != 0) - inode_sub_bytes(&inode->vfs_inode, - num_dec); - btrfs_mark_buffer_dirty(leaf); - } else { - extent_num_bytes = - 
btrfs_file_extent_disk_num_bytes(leaf, - fi); - extent_offset = found_key.offset - - btrfs_file_extent_offset(leaf, fi); - - /* FIXME blocksize != 4096 */ - num_dec = btrfs_file_extent_num_bytes(leaf, fi); - if (extent_start != 0) { - found_extent = 1; - if (test_bit(BTRFS_ROOT_SHAREABLE, - &root->state)) - inode_sub_bytes(&inode->vfs_inode, - num_dec); - } - } - clear_len = num_dec; - } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { - /* - * we can't truncate inline items that have had - * special encodings - */ - if (!del_item && - btrfs_file_extent_encryption(leaf, fi) == 0 && - btrfs_file_extent_other_encoding(leaf, fi) == 0 && - btrfs_file_extent_compression(leaf, fi) == 0) { - u32 size = (u32)(new_size - found_key.offset); - - btrfs_set_file_extent_ram_bytes(leaf, fi, size); - size = btrfs_file_extent_calc_inline_size(size); - btrfs_truncate_item(path, size, 1); - } else if (!del_item) { - /* - * We have to bail so the last_size is set to - * just before this extent. - */ - ret = NEED_TRUNCATE_BLOCK; - break; - } else { - /* - * Inline extents are special, we just treat - * them as a full sector worth in the file - * extent tree just for simplicity sake. - */ - clear_len = fs_info->sectorsize; - } - - if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) - inode_sub_bytes(&inode->vfs_inode, - item_end + 1 - new_size); - } -delete: - /* - * We use btrfs_truncate_inode_items() to clean up log trees for - * multiple fsyncs, and in this case we don't want to clear the - * file extent range because it's just the log. - */ - if (root == inode->root) { - ret = btrfs_inode_clear_file_extent_range(inode, - clear_start, clear_len); - if (ret) { - btrfs_abort_transaction(trans, ret); - break; - } - } - - if (del_item) - last_size = found_key.offset; - else - last_size = new_size; - if (del_item) { - if (!pending_del_nr) { - /* no pending yet, add ourselves */ - pending_del_slot = path->slots[0]; - pending_del_nr = 1; - } else if (pending_del_nr && - path->slots[0] + 1 == pending_del_slot) { - /* hop on the pending chunk */ - pending_del_nr++; - pending_del_slot = path->slots[0]; - } else { - BUG(); - } - } else { - break; - } - should_throttle = false; - - if (found_extent && - root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { - struct btrfs_ref ref = { 0 }; - - bytes_deleted += extent_num_bytes; - - btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, - extent_start, extent_num_bytes, 0); - btrfs_init_data_ref(&ref, btrfs_header_owner(leaf), - ino, extent_offset, - root->root_key.objectid, false); - ret = btrfs_free_extent(trans, &ref); - if (ret) { - btrfs_abort_transaction(trans, ret); - break; - } - if (be_nice) { - if (btrfs_should_throttle_delayed_refs(trans)) - should_throttle = true; - } - } - - if (found_type == BTRFS_INODE_ITEM_KEY) - break; - - if (path->slots[0] == 0 || - path->slots[0] != pending_del_slot || - should_throttle) { - if (pending_del_nr) { - ret = btrfs_del_items(trans, root, path, - pending_del_slot, - pending_del_nr); - if (ret) { - btrfs_abort_transaction(trans, ret); - break; - } - pending_del_nr = 0; - } - btrfs_release_path(path); - - /* - * We can generate a lot of delayed refs, so we need to - * throttle every once and a while and make sure we're - * adding enough space to keep up with the work we are - * generating. 
Since we hold a transaction here we - * can't flush, and we don't want to FLUSH_LIMIT because - * we could have generated too many delayed refs to - * actually allocate, so just bail if we're short and - * let the normal reservation dance happen higher up. - */ - if (should_throttle) { - ret = btrfs_delayed_refs_rsv_refill(fs_info, - BTRFS_RESERVE_NO_FLUSH); - if (ret) { - ret = -EAGAIN; - break; - } - } - goto search_again; - } else { - path->slots[0]--; - } - } -out: - if (ret >= 0 && pending_del_nr) { - int err; - - err = btrfs_del_items(trans, root, path, pending_del_slot, - pending_del_nr); - if (err) { - btrfs_abort_transaction(trans, err); - ret = err; - } - } - if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { - ASSERT(last_size >= new_size); - if (!ret && last_size > new_size) - last_size = new_size; - btrfs_inode_safe_disk_i_size_write(inode, last_size); - unlock_extent_cached(&inode->io_tree, lock_start, (u64)-1, - &cached_state); - } - - btrfs_free_path(path); - return ret; -} - -/* * btrfs_truncate_block - read, zero a chunk and write a block * @inode - inode that we're zeroing * @from - the offset to start zeroing @@ -5525,7 +5144,6 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root, struct btrfs_block_rsv *rsv) { struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; struct btrfs_trans_handle *trans; u64 delayed_refs_extra = btrfs_calc_insert_metadata_size(fs_info, 1); int ret; @@ -5540,18 +5158,16 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root, * above. We reserve our extra bit here because we generate a ton of * delayed refs activity by truncating. * - * If we cannot make our reservation we'll attempt to steal from the - * global reserve, because we really want to be able to free up space. + * BTRFS_RESERVE_FLUSH_EVICT will steal from the global_rsv if it can, + * if we fail to make this reservation we can re-try without the + * delayed_refs_extra so we can make some forward progress. */ - ret = btrfs_block_rsv_refill(root, rsv, rsv->size + delayed_refs_extra, + ret = btrfs_block_rsv_refill(fs_info, rsv, rsv->size + delayed_refs_extra, BTRFS_RESERVE_FLUSH_EVICT); if (ret) { - /* - * Try to steal from the global reserve if there is space for - * it. - */ - if (btrfs_check_space_for_delayed_refs(fs_info) || - btrfs_block_rsv_migrate(global_rsv, rsv, rsv->size, 0)) { + ret = btrfs_block_rsv_refill(fs_info, rsv, rsv->size, + BTRFS_RESERVE_FLUSH_EVICT); + if (ret) { btrfs_warn(fs_info, "could not allocate space for delete; will truncate on mount"); return ERR_PTR(-ENOSPC); @@ -5610,10 +5226,22 @@ void btrfs_evict_inode(struct inode *inode) goto no_delete; } + /* + * This makes sure the inode item in tree is uptodate and the space for + * the inode update is released. + */ ret = btrfs_commit_inode_delayed_inode(BTRFS_I(inode)); if (ret) goto no_delete; + /* + * This drops any pending insert or delete operations we have for this + * inode. We could have a delayed dir index deletion queued up, but + * we're removing the inode completely so that'll be taken care of in + * the truncate. 
+ */ + btrfs_kill_delayed_inode_items(BTRFS_I(inode)); + rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP); if (!rsv) goto no_delete; @@ -5623,14 +5251,20 @@ void btrfs_evict_inode(struct inode *inode) btrfs_i_size_write(BTRFS_I(inode), 0); while (1) { + struct btrfs_truncate_control control = { + .inode = BTRFS_I(inode), + .ino = btrfs_ino(BTRFS_I(inode)), + .new_size = 0, + .min_type = 0, + }; + trans = evict_refill_and_join(root, rsv); if (IS_ERR(trans)) goto free_rsv; trans->block_rsv = rsv; - ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode), - 0, 0, NULL); + ret = btrfs_truncate_inode_items(trans, root, &control); trans->block_rsv = &fs_info->trans_block_rsv; btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); @@ -6998,8 +6632,7 @@ static noinline int uncompress_inline(struct btrfs_path *path, WARN_ON(pg_offset != 0); compress_type = btrfs_file_extent_compression(leaf, item); max_size = btrfs_file_extent_ram_bytes(leaf, item); - inline_size = btrfs_file_extent_inline_item_len(leaf, - btrfs_item_nr(path->slots[0])); + inline_size = btrfs_file_extent_inline_item_len(leaf, path->slots[0]); tmp = kmalloc(inline_size, GFP_NOFS); if (!tmp) return -ENOMEM; @@ -7773,6 +7406,10 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_map *em = *map; + int type; + u64 block_start, orig_start, orig_block_len, ram_bytes; + bool can_nocow = false; + bool space_reserved = false; int ret = 0; /* @@ -7787,9 +7424,6 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && em->block_start != EXTENT_MAP_HOLE)) { - int type; - u64 block_start, orig_start, orig_block_len, ram_bytes; - if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) type = BTRFS_ORDERED_PREALLOC; else @@ -7799,53 +7433,92 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, if (can_nocow_extent(inode, start, &len, &orig_start, &orig_block_len, &ram_bytes, false) == 1 && - btrfs_inc_nocow_writers(fs_info, block_start)) { - struct extent_map *em2; + btrfs_inc_nocow_writers(fs_info, block_start)) + can_nocow = true; + } - em2 = btrfs_create_dio_extent(BTRFS_I(inode), start, len, - orig_start, block_start, - len, orig_block_len, - ram_bytes, type); + if (can_nocow) { + struct extent_map *em2; + + /* We can NOCOW, so only need to reserve metadata space. */ + ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len); + if (ret < 0) { + /* Our caller expects us to free the input extent map. */ + free_extent_map(em); + *map = NULL; btrfs_dec_nocow_writers(fs_info, block_start); - if (type == BTRFS_ORDERED_PREALLOC) { - free_extent_map(em); - *map = em = em2; - } + goto out; + } + space_reserved = true; - if (em2 && IS_ERR(em2)) { - ret = PTR_ERR(em2); - goto out; - } - /* - * For inode marked NODATACOW or extent marked PREALLOC, - * use the existing or preallocated extent, so does not - * need to adjust btrfs_space_info's bytes_may_use. 
- */ - btrfs_free_reserved_data_space_noquota(fs_info, len); - goto skip_cow; + em2 = btrfs_create_dio_extent(BTRFS_I(inode), start, len, + orig_start, block_start, + len, orig_block_len, + ram_bytes, type); + btrfs_dec_nocow_writers(fs_info, block_start); + if (type == BTRFS_ORDERED_PREALLOC) { + free_extent_map(em); + *map = em = em2; } - } - /* this will cow the extent */ - free_extent_map(em); - *map = em = btrfs_new_extent_direct(BTRFS_I(inode), start, len); - if (IS_ERR(em)) { - ret = PTR_ERR(em); - goto out; + if (IS_ERR(em2)) { + ret = PTR_ERR(em2); + goto out; + } + } else { + const u64 prev_len = len; + + /* Our caller expects us to free the input extent map. */ + free_extent_map(em); + *map = NULL; + + /* We have to COW, so need to reserve metadata and data space. */ + ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), + &dio_data->data_reserved, + start, len); + if (ret < 0) + goto out; + space_reserved = true; + + em = btrfs_new_extent_direct(BTRFS_I(inode), start, len); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out; + } + *map = em; + len = min(len, em->len - (start - em->start)); + if (len < prev_len) + btrfs_delalloc_release_space(BTRFS_I(inode), + dio_data->data_reserved, + start + len, prev_len - len, + true); } - len = min(len, em->len - (start - em->start)); + /* + * We have created our ordered extent, so we can now release our reservation + * for an outstanding extent. + */ + btrfs_delalloc_release_extents(BTRFS_I(inode), len); -skip_cow: /* * Need to update the i_size under the extent lock so buffered * readers will get the updated i_size when we unlock. */ if (start + len > i_size_read(inode)) i_size_write(inode, start + len); - - dio_data->reserve -= len; out: + if (ret && space_reserved) { + btrfs_delalloc_release_extents(BTRFS_I(inode), len); + if (can_nocow) { + btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true); + } else { + btrfs_delalloc_release_space(BTRFS_I(inode), + dio_data->data_reserved, + start, len, true); + extent_changeset_free(dio_data->data_reserved); + dio_data->data_reserved = NULL; + } + } return ret; } @@ -7887,18 +7560,6 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, if (!dio_data) return -ENOMEM; - dio_data->length = length; - if (write) { - dio_data->reserve = round_up(length, fs_info->sectorsize); - ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), - &dio_data->data_reserved, - start, dio_data->reserve); - if (ret) { - extent_changeset_free(dio_data->data_reserved); - kfree(dio_data); - return ret; - } - } iomap->private = dio_data; @@ -7991,14 +7652,8 @@ unlock_err: unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state); err: - if (dio_data) { - btrfs_delalloc_release_space(BTRFS_I(inode), - dio_data->data_reserved, start, - dio_data->reserve, true); - btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->reserve); - extent_changeset_free(dio_data->data_reserved); - kfree(dio_data); - } + kfree(dio_data); + return ret; } @@ -8028,14 +7683,8 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, ret = -ENOTBLK; } - if (write) { - if (dio_data->reserve) - btrfs_delalloc_release_space(BTRFS_I(inode), - dio_data->data_reserved, pos, - dio_data->reserve, true); - btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->length); + if (write) extent_changeset_free(dio_data->data_reserved); - } out: kfree(dio_data); iomap->private = NULL; @@ -8884,6 +8533,12 @@ out_noreserve: static int btrfs_truncate(struct inode *inode, bool skip_writeback) { + struct 
btrfs_truncate_control control = { + .inode = BTRFS_I(inode), + .ino = btrfs_ino(BTRFS_I(inode)), + .min_type = BTRFS_EXTENT_DATA_KEY, + .clear_extent_range = true, + }; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_block_rsv *rsv; @@ -8891,7 +8546,6 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) struct btrfs_trans_handle *trans; u64 mask = fs_info->sectorsize - 1; u64 min_size = btrfs_calc_metadata_size(fs_info, 1); - u64 extents_found = 0; if (!skip_writeback) { ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask), @@ -8952,10 +8606,30 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) trans->block_rsv = rsv; while (1) { - ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode), - inode->i_size, - BTRFS_EXTENT_DATA_KEY, - &extents_found); + struct extent_state *cached_state = NULL; + const u64 new_size = inode->i_size; + const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize); + + control.new_size = new_size; + lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1, + &cached_state); + /* + * We want to drop from the next block forward in case this new + * size is not block aligned since we will be keeping the last + * block of the extent just the way it is. + */ + btrfs_drop_extent_cache(BTRFS_I(inode), + ALIGN(new_size, fs_info->sectorsize), + (u64)-1, 0); + + ret = btrfs_truncate_inode_items(trans, root, &control); + + inode_sub_bytes(inode, control.sub_bytes); + btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), control.last_size); + + unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, + (u64)-1, &cached_state); + trans->block_rsv = &fs_info->trans_block_rsv; if (ret != -ENOSPC && ret != -EAGAIN) break; @@ -8983,11 +8657,11 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) /* * We can't call btrfs_truncate_block inside a trans handle as we could - * deadlock with freeze, if we got NEED_TRUNCATE_BLOCK then we know - * we've truncated everything except the last little bit, and can do - * btrfs_truncate_block and then update the disk_i_size. + * deadlock with freeze, if we got BTRFS_NEED_TRUNCATE_BLOCK then we + * know we've truncated everything except the last little bit, and can + * do btrfs_truncate_block and then update the disk_i_size. */ - if (ret == NEED_TRUNCATE_BLOCK) { + if (ret == BTRFS_NEED_TRUNCATE_BLOCK) { btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); @@ -9031,7 +8705,7 @@ out: * between the old i_size and the new i_size, and there were no prealloc * extents beyond i_size to drop. */ - if (extents_found > 0) + if (control.extents_found > 0) set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); return ret; @@ -10595,9 +10269,19 @@ static int btrfs_add_swap_extent(struct swap_info_struct *sis, struct btrfs_swap_info *bsi) { unsigned long nr_pages; + unsigned long max_pages; u64 first_ppage, first_ppage_reported, next_ppage; int ret; + /* + * Our swapfile may have had its size extended after the swap header was + * written. In that case activating the swapfile should not go beyond + * the max size set in the swap header. 
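+ * + * As a hypothetical illustration: if the swap header was written while the + * file covered 1024 pages, sis->max is 1024; max_pages below then caps each + * extent we add at sis->max - bsi->nr_pages pages, and once bsi->nr_pages + * reaches 1024 no further extents are added.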
+ */ + if (bsi->nr_pages >= sis->max) + return 0; + + max_pages = sis->max - bsi->nr_pages; first_ppage = ALIGN(bsi->block_start, PAGE_SIZE) >> PAGE_SHIFT; next_ppage = ALIGN_DOWN(bsi->block_start + bsi->block_len, PAGE_SIZE) >> PAGE_SHIFT; @@ -10605,6 +10289,7 @@ static int btrfs_add_swap_extent(struct swap_info_struct *sis, if (first_ppage >= next_ppage) return 0; nr_pages = next_ppage - first_ppage; + nr_pages = min(nr_pages, max_pages); first_ppage_reported = first_ppage; if (bsi->start == 0) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index edfecfe62b4b..a5bd6926f7ff 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -387,6 +387,7 @@ bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, * * Compatibility: * - the same type is already running + * - when trying to add a device and balance has been paused * - not BTRFS_EXCLOP_NONE - this is intentionally incompatible and the caller * must check the condition first that would allow none -> @type */ @@ -394,7 +395,9 @@ bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info, enum btrfs_exclusive_operation type) { spin_lock(&fs_info->super_lock); - if (fs_info->exclusive_operation == type) + if (fs_info->exclusive_operation == type || + (fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED && + type == BTRFS_EXCLOP_DEV_ADD)) return true; spin_unlock(&fs_info->super_lock); @@ -414,6 +417,29 @@ void btrfs_exclop_finish(struct btrfs_fs_info *fs_info) sysfs_notify(&fs_info->fs_devices->fsid_kobj, NULL, "exclusive_operation"); } +void btrfs_exclop_balance(struct btrfs_fs_info *fs_info, + enum btrfs_exclusive_operation op) +{ + switch (op) { + case BTRFS_EXCLOP_BALANCE_PAUSED: + spin_lock(&fs_info->super_lock); + ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE || + fs_info->exclusive_operation == BTRFS_EXCLOP_DEV_ADD); + fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE_PAUSED; + spin_unlock(&fs_info->super_lock); + break; + case BTRFS_EXCLOP_BALANCE: + spin_lock(&fs_info->super_lock); + ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED); + fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE; + spin_unlock(&fs_info->super_lock); + break; + default: + btrfs_warn(fs_info, + "invalid exclop balance operation %d requested", op); + } +} + static int btrfs_ioctl_getversion(struct file *file, int __user *arg) { struct inode *inode = file_inode(file); @@ -518,7 +544,6 @@ static noinline int create_subvol(struct user_namespace *mnt_userns, struct timespec64 cur_time = current_time(dir); struct inode *inode; int ret; - int err; dev_t anon_dev = 0; u64 objectid; u64 index = 0; @@ -698,9 +723,10 @@ fail: trans->bytes_reserved = 0; btrfs_subvolume_release_metadata(root, &block_rsv); - err = btrfs_commit_transaction(trans); - if (err && !ret) - ret = err; + if (ret) + btrfs_end_transaction(trans); + else + ret = btrfs_commit_transaction(trans); if (!ret) { inode = btrfs_lookup_dentry(dir, dentry); @@ -2084,7 +2110,7 @@ static noinline int copy_to_sk(struct btrfs_path *path, for (i = slot; i < nritems; i++) { item_off = btrfs_item_ptr_offset(leaf, i); - item_len = btrfs_item_size_nr(leaf, i); + item_len = btrfs_item_size(leaf, i); btrfs_item_key_to_cpu(leaf, key, i); if (!key_in_sk(key, sk)) @@ -2538,7 +2564,7 @@ static int btrfs_search_path_in_tree_user(struct user_namespace *mnt_userns, btrfs_item_key_to_cpu(leaf, &key, slot); item_off = btrfs_item_ptr_offset(leaf, slot); - item_len = btrfs_item_size_nr(leaf, slot); + item_len = btrfs_item_size(leaf, slot); /* Check if dirid in ROOT_REF corresponds to 
passed dirid */ rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref); if (args->dirid != btrfs_root_ref_dirid(leaf, rref)) { @@ -2740,7 +2766,7 @@ static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp) item_off = btrfs_item_ptr_offset(leaf, slot) + sizeof(struct btrfs_root_ref); - item_len = btrfs_item_size_nr(leaf, slot) + item_len = btrfs_item_size(leaf, slot) - sizeof(struct btrfs_root_ref); read_extent_buffer(leaf, subvol_info->name, item_off, item_len); @@ -3148,13 +3174,25 @@ out: static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg) { struct btrfs_ioctl_vol_args *vol_args; + bool restore_op = false; int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_ADD)) - return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_ADD)) { + if (!btrfs_exclop_start_try_lock(fs_info, BTRFS_EXCLOP_DEV_ADD)) + return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; + + /* + * We can do the device add because we have a paused balance, + * change the exclusive op type and remember we should bring + * back the paused balance + */ + fs_info->exclusive_operation = BTRFS_EXCLOP_DEV_ADD; + btrfs_exclop_start_unlock(fs_info); + restore_op = true; + } vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) { @@ -3170,7 +3208,10 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg) kfree(vol_args); out: - btrfs_exclop_finish(fs_info); + if (restore_op) + btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED); + else + btrfs_exclop_finish(fs_info); return ret; } @@ -3622,7 +3663,6 @@ static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root, { struct btrfs_trans_handle *trans; u64 transid; - int ret; trans = btrfs_attach_transaction_barrier(root); if (IS_ERR(trans)) { @@ -3634,11 +3674,7 @@ static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root, goto out; } transid = trans->transid; - ret = btrfs_commit_transaction_async(trans); - if (ret) { - btrfs_end_transaction(trans); - return ret; - } + btrfs_commit_transaction_async(trans); out: if (argp) if (copy_to_user(argp, &transid, sizeof(transid))) @@ -4061,6 +4097,7 @@ locked: spin_lock(&fs_info->balance_lock); bctl->flags |= BTRFS_BALANCE_RESUME; spin_unlock(&fs_info->balance_lock); + btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE); goto do_balance; } diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index aae1027bd76a..0775ae9f4419 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -85,7 +85,7 @@ static void print_extent_item(struct extent_buffer *eb, int slot, int type) struct btrfs_disk_key key; unsigned long end; unsigned long ptr; - u32 item_size = btrfs_item_size_nr(eb, slot); + u32 item_size = btrfs_item_size(eb, slot); u64 flags; u64 offset; int ref_index = 0; @@ -200,7 +200,6 @@ void btrfs_print_leaf(struct extent_buffer *l) struct btrfs_fs_info *fs_info; int i; u32 type, nr; - struct btrfs_item *item; struct btrfs_root_item *ri; struct btrfs_dir_item *di; struct btrfs_inode_item *ii; @@ -224,12 +223,11 @@ void btrfs_print_leaf(struct extent_buffer *l) btrfs_leaf_free_space(l), btrfs_header_owner(l)); print_eb_refs_lock(l); for (i = 0 ; i < nr ; i++) { - item = btrfs_item_nr(i); btrfs_item_key_to_cpu(l, &key, i); type = key.type; pr_info("\titem %d key (%llu %u %llu) itemoff %d itemsize %d\n", i, key.objectid, type, key.offset, - btrfs_item_offset(l, item), btrfs_item_size(l, item)); + btrfs_item_offset(l, i), 
btrfs_item_size(l, i)); switch (type) { case BTRFS_INODE_ITEM_KEY: ii = btrfs_item_ptr(l, i, struct btrfs_inode_item); @@ -347,7 +345,7 @@ void btrfs_print_leaf(struct extent_buffer *l) case BTRFS_UUID_KEY_SUBVOL: case BTRFS_UUID_KEY_RECEIVED_SUBVOL: print_uuid_item(l, btrfs_item_ptr_offset(l, i), - btrfs_item_size_nr(l, i)); + btrfs_item_size(l, i)); break; } } diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index b1cb5a8c2999..1a6d2d5b4b33 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -158,7 +158,7 @@ static int iterate_object_props(struct btrfs_root *root, di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); cur = 0; - total_len = btrfs_item_size_nr(leaf, slot); + total_len = btrfs_item_size(leaf, slot); while (cur < total_len) { u32 name_len = btrfs_dir_name_len(leaf, di); @@ -377,8 +377,9 @@ static int inherit_props(struct btrfs_trans_handle *trans, */ if (need_reserve) { num_bytes = btrfs_calc_insert_metadata_size(fs_info, 1); - ret = btrfs_block_rsv_add(root, trans->block_rsv, - num_bytes, BTRFS_RESERVE_NO_FLUSH); + ret = btrfs_block_rsv_add(fs_info, trans->block_rsv, + num_bytes, + BTRFS_RESERVE_NO_FLUSH); if (ret) return ret; } diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 6c037f1252b7..8928275823a1 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -940,6 +940,14 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info) int ret = 0; int slot; + /* + * We need to have subvol_sem write locked, to prevent races between + * concurrent tasks trying to enable quotas, because we will unlock + * and relock qgroup_ioctl_lock before setting fs_info->quota_root + * and before setting BTRFS_FS_QUOTA_ENABLED. + */ + lockdep_assert_held_write(&fs_info->subvol_sem); + mutex_lock(&fs_info->qgroup_ioctl_lock); if (fs_info->quota_root) goto out; @@ -1117,8 +1125,19 @@ out_add_root: goto out_free_path; } + mutex_unlock(&fs_info->qgroup_ioctl_lock); + /* + * Commit the transaction while not holding qgroup_ioctl_lock, to avoid + * a deadlock with tasks concurrently doing other qgroup operations, such as + * adding/removing qgroups or adding/deleting qgroup relations, + * because all qgroup operations first start or join a transaction and then + * lock the qgroup_ioctl_lock mutex. + * We are safe from a concurrent task trying to enable quotas by calling + * this function, since we are serialized by fs_info->subvol_sem. + */ ret = btrfs_commit_transaction(trans); trans = NULL; + mutex_lock(&fs_info->qgroup_ioctl_lock); if (ret) goto out_free_path; @@ -3142,6 +3161,7 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans, struct btrfs_path *path) { struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_root *extent_root; struct btrfs_key found; struct extent_buffer *scratch_leaf = NULL; struct ulist *roots = NULL; @@ -3151,7 +3171,9 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans, int ret; mutex_lock(&fs_info->qgroup_rescan_lock); - ret = btrfs_search_slot_for_read(fs_info->extent_root, + extent_root = btrfs_extent_root(fs_info, + fs_info->qgroup_rescan_progress.objectid); + ret = btrfs_search_slot_for_read(extent_root, &fs_info->qgroup_rescan_progress, path, 1, 0); diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c deleted file mode 100644 index eb96fdc3be25..000000000000 --- a/fs/btrfs/reada.c +++ /dev/null @@ -1,1086 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2011 STRATO. All rights reserved. 
- */ - -#include <linux/sched.h> -#include <linux/pagemap.h> -#include <linux/writeback.h> -#include <linux/blkdev.h> -#include <linux/slab.h> -#include <linux/workqueue.h> -#include "ctree.h" -#include "volumes.h" -#include "disk-io.h" -#include "transaction.h" -#include "dev-replace.h" -#include "block-group.h" - -#undef DEBUG - -/* - * This is the implementation for the generic read ahead framework. - * - * To trigger a readahead, btrfs_reada_add must be called. It will start - * a read ahead for the given range [start, end) on tree root. The returned - * handle can either be used to wait on the readahead to finish - * (btrfs_reada_wait), or to send it to the background (btrfs_reada_detach). - * - * The read ahead works as follows: - * On btrfs_reada_add, the root of the tree is inserted into a radix_tree. - * reada_start_machine will then search for extents to prefetch and trigger - * some reads. When a read finishes for a node, all contained node/leaf - * pointers that lie in the given range will also be enqueued. The reads will - * be triggered in sequential order, thus giving a big win over a naive - * enumeration. It will also make use of multi-device layouts. Each disk - * will have its on read pointer and all disks will by utilized in parallel. - * Also will no two disks read both sides of a mirror simultaneously, as this - * would waste seeking capacity. Instead both disks will read different parts - * of the filesystem. - * Any number of readaheads can be started in parallel. The read order will be - * determined globally, i.e. 2 parallel readaheads will normally finish faster - * than the 2 started one after another. - */ - -#define MAX_IN_FLIGHT 6 - -struct reada_extctl { - struct list_head list; - struct reada_control *rc; - u64 generation; -}; - -struct reada_extent { - u64 logical; - u64 owner_root; - struct btrfs_key top; - struct list_head extctl; - int refcnt; - spinlock_t lock; - struct reada_zone *zones[BTRFS_MAX_MIRRORS]; - int nzones; - int scheduled; - int level; -}; - -struct reada_zone { - u64 start; - u64 end; - u64 elems; - struct list_head list; - spinlock_t lock; - int locked; - struct btrfs_device *device; - struct btrfs_device *devs[BTRFS_MAX_MIRRORS]; /* full list, incl - * self */ - int ndevs; - struct kref refcnt; -}; - -struct reada_machine_work { - struct btrfs_work work; - struct btrfs_fs_info *fs_info; -}; - -static void reada_extent_put(struct btrfs_fs_info *, struct reada_extent *); -static void reada_control_release(struct kref *kref); -static void reada_zone_release(struct kref *kref); -static void reada_start_machine(struct btrfs_fs_info *fs_info); -static void __reada_start_machine(struct btrfs_fs_info *fs_info); - -static int reada_add_block(struct reada_control *rc, u64 logical, - struct btrfs_key *top, u64 owner_root, - u64 generation, int level); - -/* recurses */ -/* in case of err, eb might be NULL */ -static void __readahead_hook(struct btrfs_fs_info *fs_info, - struct reada_extent *re, struct extent_buffer *eb, - int err) -{ - int nritems; - int i; - u64 bytenr; - u64 generation; - struct list_head list; - - spin_lock(&re->lock); - /* - * just take the full list from the extent. afterwards we - * don't need the lock anymore - */ - list_replace_init(&re->extctl, &list); - re->scheduled = 0; - spin_unlock(&re->lock); - - /* - * this is the error case, the extent buffer has not been - * read correctly. We won't access anything from it and - * just cleanup our data structures. 
Effectively this will - * cut the branch below this node from read ahead. - */ - if (err) - goto cleanup; - - /* - * FIXME: currently we just set nritems to 0 if this is a leaf, - * effectively ignoring the content. In a next step we could - * trigger more readahead depending from the content, e.g. - * fetch the checksums for the extents in the leaf. - */ - if (!btrfs_header_level(eb)) - goto cleanup; - - nritems = btrfs_header_nritems(eb); - generation = btrfs_header_generation(eb); - for (i = 0; i < nritems; i++) { - struct reada_extctl *rec; - u64 n_gen; - struct btrfs_key key; - struct btrfs_key next_key; - - btrfs_node_key_to_cpu(eb, &key, i); - if (i + 1 < nritems) - btrfs_node_key_to_cpu(eb, &next_key, i + 1); - else - next_key = re->top; - bytenr = btrfs_node_blockptr(eb, i); - n_gen = btrfs_node_ptr_generation(eb, i); - - list_for_each_entry(rec, &list, list) { - struct reada_control *rc = rec->rc; - - /* - * if the generation doesn't match, just ignore this - * extctl. This will probably cut off a branch from - * prefetch. Alternatively one could start a new (sub-) - * prefetch for this branch, starting again from root. - * FIXME: move the generation check out of this loop - */ -#ifdef DEBUG - if (rec->generation != generation) { - btrfs_debug(fs_info, - "generation mismatch for (%llu,%d,%llu) %llu != %llu", - key.objectid, key.type, key.offset, - rec->generation, generation); - } -#endif - if (rec->generation == generation && - btrfs_comp_cpu_keys(&key, &rc->key_end) < 0 && - btrfs_comp_cpu_keys(&next_key, &rc->key_start) > 0) - reada_add_block(rc, bytenr, &next_key, - btrfs_header_owner(eb), n_gen, - btrfs_header_level(eb) - 1); - } - } - -cleanup: - /* - * free extctl records - */ - while (!list_empty(&list)) { - struct reada_control *rc; - struct reada_extctl *rec; - - rec = list_first_entry(&list, struct reada_extctl, list); - list_del(&rec->list); - rc = rec->rc; - kfree(rec); - - kref_get(&rc->refcnt); - if (atomic_dec_and_test(&rc->elems)) { - kref_put(&rc->refcnt, reada_control_release); - wake_up(&rc->wait); - } - kref_put(&rc->refcnt, reada_control_release); - - reada_extent_put(fs_info, re); /* one ref for each entry */ - } - - return; -} - -int btree_readahead_hook(struct extent_buffer *eb, int err) -{ - struct btrfs_fs_info *fs_info = eb->fs_info; - int ret = 0; - struct reada_extent *re; - - /* find extent */ - spin_lock(&fs_info->reada_lock); - re = radix_tree_lookup(&fs_info->reada_tree, - eb->start >> fs_info->sectorsize_bits); - if (re) - re->refcnt++; - spin_unlock(&fs_info->reada_lock); - if (!re) { - ret = -1; - goto start_machine; - } - - __readahead_hook(fs_info, re, eb, err); - reada_extent_put(fs_info, re); /* our ref */ - -start_machine: - reada_start_machine(fs_info); - return ret; -} - -static struct reada_zone *reada_find_zone(struct btrfs_device *dev, u64 logical, - struct btrfs_io_context *bioc) -{ - struct btrfs_fs_info *fs_info = dev->fs_info; - int ret; - struct reada_zone *zone; - struct btrfs_block_group *cache = NULL; - u64 start; - u64 end; - int i; - - zone = NULL; - spin_lock(&fs_info->reada_lock); - ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone, - logical >> fs_info->sectorsize_bits, 1); - if (ret == 1 && logical >= zone->start && logical <= zone->end) { - kref_get(&zone->refcnt); - spin_unlock(&fs_info->reada_lock); - return zone; - } - - spin_unlock(&fs_info->reada_lock); - - cache = btrfs_lookup_block_group(fs_info, logical); - if (!cache) - return NULL; - - start = cache->start; - end = start + cache->length - 1; - 
btrfs_put_block_group(cache); - - zone = kzalloc(sizeof(*zone), GFP_KERNEL); - if (!zone) - return NULL; - - ret = radix_tree_preload(GFP_KERNEL); - if (ret) { - kfree(zone); - return NULL; - } - - zone->start = start; - zone->end = end; - INIT_LIST_HEAD(&zone->list); - spin_lock_init(&zone->lock); - zone->locked = 0; - kref_init(&zone->refcnt); - zone->elems = 0; - zone->device = dev; /* our device always sits at index 0 */ - for (i = 0; i < bioc->num_stripes; ++i) { - /* bounds have already been checked */ - zone->devs[i] = bioc->stripes[i].dev; - } - zone->ndevs = bioc->num_stripes; - - spin_lock(&fs_info->reada_lock); - ret = radix_tree_insert(&dev->reada_zones, - (unsigned long)(zone->end >> fs_info->sectorsize_bits), - zone); - - if (ret == -EEXIST) { - kfree(zone); - ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone, - logical >> fs_info->sectorsize_bits, 1); - if (ret == 1 && logical >= zone->start && logical <= zone->end) - kref_get(&zone->refcnt); - else - zone = NULL; - } - spin_unlock(&fs_info->reada_lock); - radix_tree_preload_end(); - - return zone; -} - -static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info, - u64 logical, - struct btrfs_key *top, - u64 owner_root, int level) -{ - int ret; - struct reada_extent *re = NULL; - struct reada_extent *re_exist = NULL; - struct btrfs_io_context *bioc = NULL; - struct btrfs_device *dev; - struct btrfs_device *prev_dev; - u64 length; - int real_stripes; - int nzones = 0; - unsigned long index = logical >> fs_info->sectorsize_bits; - int dev_replace_is_ongoing; - int have_zone = 0; - - spin_lock(&fs_info->reada_lock); - re = radix_tree_lookup(&fs_info->reada_tree, index); - if (re) - re->refcnt++; - spin_unlock(&fs_info->reada_lock); - - if (re) - return re; - - re = kzalloc(sizeof(*re), GFP_KERNEL); - if (!re) - return NULL; - - re->logical = logical; - re->top = *top; - INIT_LIST_HEAD(&re->extctl); - spin_lock_init(&re->lock); - re->refcnt = 1; - re->owner_root = owner_root; - re->level = level; - - /* - * map block - */ - length = fs_info->nodesize; - ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical, - &length, &bioc, 0); - if (ret || !bioc || length < fs_info->nodesize) - goto error; - - if (bioc->num_stripes > BTRFS_MAX_MIRRORS) { - btrfs_err(fs_info, - "readahead: more than %d copies not supported", - BTRFS_MAX_MIRRORS); - goto error; - } - - real_stripes = bioc->num_stripes - bioc->num_tgtdevs; - for (nzones = 0; nzones < real_stripes; ++nzones) { - struct reada_zone *zone; - - dev = bioc->stripes[nzones].dev; - - /* cannot read ahead on missing device. 
*/ - if (!dev->bdev) - continue; - - zone = reada_find_zone(dev, logical, bioc); - if (!zone) - continue; - - re->zones[re->nzones++] = zone; - spin_lock(&zone->lock); - if (!zone->elems) - kref_get(&zone->refcnt); - ++zone->elems; - spin_unlock(&zone->lock); - spin_lock(&fs_info->reada_lock); - kref_put(&zone->refcnt, reada_zone_release); - spin_unlock(&fs_info->reada_lock); - } - if (re->nzones == 0) { - /* not a single zone found, error and out */ - goto error; - } - - /* Insert extent in reada tree + all per-device trees, all or nothing */ - down_read(&fs_info->dev_replace.rwsem); - ret = radix_tree_preload(GFP_KERNEL); - if (ret) { - up_read(&fs_info->dev_replace.rwsem); - goto error; - } - - spin_lock(&fs_info->reada_lock); - ret = radix_tree_insert(&fs_info->reada_tree, index, re); - if (ret == -EEXIST) { - re_exist = radix_tree_lookup(&fs_info->reada_tree, index); - re_exist->refcnt++; - spin_unlock(&fs_info->reada_lock); - radix_tree_preload_end(); - up_read(&fs_info->dev_replace.rwsem); - goto error; - } - if (ret) { - spin_unlock(&fs_info->reada_lock); - radix_tree_preload_end(); - up_read(&fs_info->dev_replace.rwsem); - goto error; - } - radix_tree_preload_end(); - prev_dev = NULL; - dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing( - &fs_info->dev_replace); - for (nzones = 0; nzones < re->nzones; ++nzones) { - dev = re->zones[nzones]->device; - - if (dev == prev_dev) { - /* - * in case of DUP, just add the first zone. As both - * are on the same device, there's nothing to gain - * from adding both. - * Also, it wouldn't work, as the tree is per device - * and adding would fail with EEXIST - */ - continue; - } - if (!dev->bdev) - continue; - - if (test_bit(BTRFS_DEV_STATE_NO_READA, &dev->dev_state)) - continue; - - if (dev_replace_is_ongoing && - dev == fs_info->dev_replace.tgtdev) { - /* - * as this device is selected for reading only as - * a last resort, skip it for read ahead. 
- */ - continue; - } - prev_dev = dev; - ret = radix_tree_insert(&dev->reada_extents, index, re); - if (ret) { - while (--nzones >= 0) { - dev = re->zones[nzones]->device; - BUG_ON(dev == NULL); - /* ignore whether the entry was inserted */ - radix_tree_delete(&dev->reada_extents, index); - } - radix_tree_delete(&fs_info->reada_tree, index); - spin_unlock(&fs_info->reada_lock); - up_read(&fs_info->dev_replace.rwsem); - goto error; - } - have_zone = 1; - } - if (!have_zone) - radix_tree_delete(&fs_info->reada_tree, index); - spin_unlock(&fs_info->reada_lock); - up_read(&fs_info->dev_replace.rwsem); - - if (!have_zone) - goto error; - - btrfs_put_bioc(bioc); - return re; - -error: - for (nzones = 0; nzones < re->nzones; ++nzones) { - struct reada_zone *zone; - - zone = re->zones[nzones]; - kref_get(&zone->refcnt); - spin_lock(&zone->lock); - --zone->elems; - if (zone->elems == 0) { - /* - * no fs_info->reada_lock needed, as this can't be - * the last ref - */ - kref_put(&zone->refcnt, reada_zone_release); - } - spin_unlock(&zone->lock); - - spin_lock(&fs_info->reada_lock); - kref_put(&zone->refcnt, reada_zone_release); - spin_unlock(&fs_info->reada_lock); - } - btrfs_put_bioc(bioc); - kfree(re); - return re_exist; -} - -static void reada_extent_put(struct btrfs_fs_info *fs_info, - struct reada_extent *re) -{ - int i; - unsigned long index = re->logical >> fs_info->sectorsize_bits; - - spin_lock(&fs_info->reada_lock); - if (--re->refcnt) { - spin_unlock(&fs_info->reada_lock); - return; - } - - radix_tree_delete(&fs_info->reada_tree, index); - for (i = 0; i < re->nzones; ++i) { - struct reada_zone *zone = re->zones[i]; - - radix_tree_delete(&zone->device->reada_extents, index); - } - - spin_unlock(&fs_info->reada_lock); - - for (i = 0; i < re->nzones; ++i) { - struct reada_zone *zone = re->zones[i]; - - kref_get(&zone->refcnt); - spin_lock(&zone->lock); - --zone->elems; - if (zone->elems == 0) { - /* no fs_info->reada_lock needed, as this can't be - * the last ref */ - kref_put(&zone->refcnt, reada_zone_release); - } - spin_unlock(&zone->lock); - - spin_lock(&fs_info->reada_lock); - kref_put(&zone->refcnt, reada_zone_release); - spin_unlock(&fs_info->reada_lock); - } - - kfree(re); -} - -static void reada_zone_release(struct kref *kref) -{ - struct reada_zone *zone = container_of(kref, struct reada_zone, refcnt); - struct btrfs_fs_info *fs_info = zone->device->fs_info; - - lockdep_assert_held(&fs_info->reada_lock); - - radix_tree_delete(&zone->device->reada_zones, - zone->end >> fs_info->sectorsize_bits); - - kfree(zone); -} - -static void reada_control_release(struct kref *kref) -{ - struct reada_control *rc = container_of(kref, struct reada_control, - refcnt); - - kfree(rc); -} - -static int reada_add_block(struct reada_control *rc, u64 logical, - struct btrfs_key *top, u64 owner_root, - u64 generation, int level) -{ - struct btrfs_fs_info *fs_info = rc->fs_info; - struct reada_extent *re; - struct reada_extctl *rec; - - /* takes one ref */ - re = reada_find_extent(fs_info, logical, top, owner_root, level); - if (!re) - return -1; - - rec = kzalloc(sizeof(*rec), GFP_KERNEL); - if (!rec) { - reada_extent_put(fs_info, re); - return -ENOMEM; - } - - rec->rc = rc; - rec->generation = generation; - atomic_inc(&rc->elems); - - spin_lock(&re->lock); - list_add_tail(&rec->list, &re->extctl); - spin_unlock(&re->lock); - - /* leave the ref on the extent */ - - return 0; -} - -/* - * called with fs_info->reada_lock held - */ -static void reada_peer_zones_set_lock(struct reada_zone *zone, int lock) -{ - 
int i; - unsigned long index = zone->end >> zone->device->fs_info->sectorsize_bits; - - for (i = 0; i < zone->ndevs; ++i) { - struct reada_zone *peer; - peer = radix_tree_lookup(&zone->devs[i]->reada_zones, index); - if (peer && peer->device != zone->device) - peer->locked = lock; - } -} - -/* - * called with fs_info->reada_lock held - */ -static int reada_pick_zone(struct btrfs_device *dev) -{ - struct reada_zone *top_zone = NULL; - struct reada_zone *top_locked_zone = NULL; - u64 top_elems = 0; - u64 top_locked_elems = 0; - unsigned long index = 0; - int ret; - - if (dev->reada_curr_zone) { - reada_peer_zones_set_lock(dev->reada_curr_zone, 0); - kref_put(&dev->reada_curr_zone->refcnt, reada_zone_release); - dev->reada_curr_zone = NULL; - } - /* pick the zone with the most elements */ - while (1) { - struct reada_zone *zone; - - ret = radix_tree_gang_lookup(&dev->reada_zones, - (void **)&zone, index, 1); - if (ret == 0) - break; - index = (zone->end >> dev->fs_info->sectorsize_bits) + 1; - if (zone->locked) { - if (zone->elems > top_locked_elems) { - top_locked_elems = zone->elems; - top_locked_zone = zone; - } - } else { - if (zone->elems > top_elems) { - top_elems = zone->elems; - top_zone = zone; - } - } - } - if (top_zone) - dev->reada_curr_zone = top_zone; - else if (top_locked_zone) - dev->reada_curr_zone = top_locked_zone; - else - return 0; - - dev->reada_next = dev->reada_curr_zone->start; - kref_get(&dev->reada_curr_zone->refcnt); - reada_peer_zones_set_lock(dev->reada_curr_zone, 1); - - return 1; -} - -static int reada_tree_block_flagged(struct btrfs_fs_info *fs_info, u64 bytenr, - u64 owner_root, int level, int mirror_num, - struct extent_buffer **eb) -{ - struct extent_buffer *buf = NULL; - int ret; - - buf = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level); - if (IS_ERR(buf)) - return 0; - - set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags); - - ret = read_extent_buffer_pages(buf, WAIT_PAGE_LOCK, mirror_num); - if (ret) { - free_extent_buffer_stale(buf); - return ret; - } - - if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) { - free_extent_buffer_stale(buf); - return -EIO; - } else if (extent_buffer_uptodate(buf)) { - *eb = buf; - } else { - free_extent_buffer(buf); - } - return 0; -} - -static int reada_start_machine_dev(struct btrfs_device *dev) -{ - struct btrfs_fs_info *fs_info = dev->fs_info; - struct reada_extent *re = NULL; - int mirror_num = 0; - struct extent_buffer *eb = NULL; - u64 logical; - int ret; - int i; - - spin_lock(&fs_info->reada_lock); - if (dev->reada_curr_zone == NULL) { - ret = reada_pick_zone(dev); - if (!ret) { - spin_unlock(&fs_info->reada_lock); - return 0; - } - } - /* - * FIXME currently we issue the reads one extent at a time. 
If we have - * a contiguous block of extents, we could also coagulate them or use - * plugging to speed things up - */ - ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re, - dev->reada_next >> fs_info->sectorsize_bits, 1); - if (ret == 0 || re->logical > dev->reada_curr_zone->end) { - ret = reada_pick_zone(dev); - if (!ret) { - spin_unlock(&fs_info->reada_lock); - return 0; - } - re = NULL; - ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re, - dev->reada_next >> fs_info->sectorsize_bits, 1); - } - if (ret == 0) { - spin_unlock(&fs_info->reada_lock); - return 0; - } - dev->reada_next = re->logical + fs_info->nodesize; - re->refcnt++; - - spin_unlock(&fs_info->reada_lock); - - spin_lock(&re->lock); - if (re->scheduled || list_empty(&re->extctl)) { - spin_unlock(&re->lock); - reada_extent_put(fs_info, re); - return 0; - } - re->scheduled = 1; - spin_unlock(&re->lock); - - /* - * find mirror num - */ - for (i = 0; i < re->nzones; ++i) { - if (re->zones[i]->device == dev) { - mirror_num = i + 1; - break; - } - } - logical = re->logical; - - atomic_inc(&dev->reada_in_flight); - ret = reada_tree_block_flagged(fs_info, logical, re->owner_root, - re->level, mirror_num, &eb); - if (ret) - __readahead_hook(fs_info, re, NULL, ret); - else if (eb) - __readahead_hook(fs_info, re, eb, ret); - - if (eb) - free_extent_buffer(eb); - - atomic_dec(&dev->reada_in_flight); - reada_extent_put(fs_info, re); - - return 1; - -} - -static void reada_start_machine_worker(struct btrfs_work *work) -{ - struct reada_machine_work *rmw; - int old_ioprio; - - rmw = container_of(work, struct reada_machine_work, work); - - old_ioprio = IOPRIO_PRIO_VALUE(task_nice_ioclass(current), - task_nice_ioprio(current)); - set_task_ioprio(current, BTRFS_IOPRIO_READA); - __reada_start_machine(rmw->fs_info); - set_task_ioprio(current, old_ioprio); - - atomic_dec(&rmw->fs_info->reada_works_cnt); - - kfree(rmw); -} - -/* Try to start up to 10k READA requests for a group of devices */ -static int reada_start_for_fsdevs(struct btrfs_fs_devices *fs_devices) -{ - u64 enqueued; - u64 total = 0; - struct btrfs_device *device; - - do { - enqueued = 0; - list_for_each_entry(device, &fs_devices->devices, dev_list) { - if (atomic_read(&device->reada_in_flight) < - MAX_IN_FLIGHT) - enqueued += reada_start_machine_dev(device); - } - total += enqueued; - } while (enqueued && total < 10000); - - return total; -} - -static void __reada_start_machine(struct btrfs_fs_info *fs_info) -{ - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; - int i; - u64 enqueued = 0; - - mutex_lock(&fs_devices->device_list_mutex); - - enqueued += reada_start_for_fsdevs(fs_devices); - list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) - enqueued += reada_start_for_fsdevs(seed_devs); - - mutex_unlock(&fs_devices->device_list_mutex); - if (enqueued == 0) - return; - - /* - * If everything is already in the cache, this is effectively single - * threaded. To a) not hold the caller for too long and b) to utilize - * more cores, we broke the loop above after 10000 iterations and now - * enqueue to workers to finish it. This will distribute the load to - * the cores. 
- */ - for (i = 0; i < 2; ++i) { - reada_start_machine(fs_info); - if (atomic_read(&fs_info->reada_works_cnt) > - BTRFS_MAX_MIRRORS * 2) - break; - } -} - -static void reada_start_machine(struct btrfs_fs_info *fs_info) -{ - struct reada_machine_work *rmw; - - rmw = kzalloc(sizeof(*rmw), GFP_KERNEL); - if (!rmw) { - /* FIXME we cannot handle this properly right now */ - BUG(); - } - btrfs_init_work(&rmw->work, reada_start_machine_worker, NULL, NULL); - rmw->fs_info = fs_info; - - btrfs_queue_work(fs_info->readahead_workers, &rmw->work); - atomic_inc(&fs_info->reada_works_cnt); -} - -#ifdef DEBUG -static void dump_devs(struct btrfs_fs_info *fs_info, int all) -{ - struct btrfs_device *device; - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; - unsigned long index; - int ret; - int i; - int j; - int cnt; - - spin_lock(&fs_info->reada_lock); - list_for_each_entry(device, &fs_devices->devices, dev_list) { - btrfs_debug(fs_info, "dev %lld has %d in flight", device->devid, - atomic_read(&device->reada_in_flight)); - index = 0; - while (1) { - struct reada_zone *zone; - ret = radix_tree_gang_lookup(&device->reada_zones, - (void **)&zone, index, 1); - if (ret == 0) - break; - pr_debug(" zone %llu-%llu elems %llu locked %d devs", - zone->start, zone->end, zone->elems, - zone->locked); - for (j = 0; j < zone->ndevs; ++j) { - pr_cont(" %lld", - zone->devs[j]->devid); - } - if (device->reada_curr_zone == zone) - pr_cont(" curr off %llu", - device->reada_next - zone->start); - pr_cont("\n"); - index = (zone->end >> fs_info->sectorsize_bits) + 1; - } - cnt = 0; - index = 0; - while (all) { - struct reada_extent *re = NULL; - - ret = radix_tree_gang_lookup(&device->reada_extents, - (void **)&re, index, 1); - if (ret == 0) - break; - pr_debug(" re: logical %llu size %u empty %d scheduled %d", - re->logical, fs_info->nodesize, - list_empty(&re->extctl), re->scheduled); - - for (i = 0; i < re->nzones; ++i) { - pr_cont(" zone %llu-%llu devs", - re->zones[i]->start, - re->zones[i]->end); - for (j = 0; j < re->zones[i]->ndevs; ++j) { - pr_cont(" %lld", - re->zones[i]->devs[j]->devid); - } - } - pr_cont("\n"); - index = (re->logical >> fs_info->sectorsize_bits) + 1; - if (++cnt > 15) - break; - } - } - - index = 0; - cnt = 0; - while (all) { - struct reada_extent *re = NULL; - - ret = radix_tree_gang_lookup(&fs_info->reada_tree, (void **)&re, - index, 1); - if (ret == 0) - break; - if (!re->scheduled) { - index = (re->logical >> fs_info->sectorsize_bits) + 1; - continue; - } - pr_debug("re: logical %llu size %u list empty %d scheduled %d", - re->logical, fs_info->nodesize, - list_empty(&re->extctl), re->scheduled); - for (i = 0; i < re->nzones; ++i) { - pr_cont(" zone %llu-%llu devs", - re->zones[i]->start, - re->zones[i]->end); - for (j = 0; j < re->zones[i]->ndevs; ++j) { - pr_cont(" %lld", - re->zones[i]->devs[j]->devid); - } - } - pr_cont("\n"); - index = (re->logical >> fs_info->sectorsize_bits) + 1; - } - spin_unlock(&fs_info->reada_lock); -} -#endif - -/* - * interface - */ -struct reada_control *btrfs_reada_add(struct btrfs_root *root, - struct btrfs_key *key_start, struct btrfs_key *key_end) -{ - struct reada_control *rc; - u64 start; - u64 generation; - int ret; - int level; - struct extent_buffer *node; - static struct btrfs_key max_key = { - .objectid = (u64)-1, - .type = (u8)-1, - .offset = (u64)-1 - }; - - rc = kzalloc(sizeof(*rc), GFP_KERNEL); - if (!rc) - return ERR_PTR(-ENOMEM); - - rc->fs_info = root->fs_info; - rc->key_start = *key_start; - rc->key_end = *key_end; - 
atomic_set(&rc->elems, 0); - init_waitqueue_head(&rc->wait); - kref_init(&rc->refcnt); - kref_get(&rc->refcnt); /* one ref for having elements */ - - node = btrfs_root_node(root); - start = node->start; - generation = btrfs_header_generation(node); - level = btrfs_header_level(node); - free_extent_buffer(node); - - ret = reada_add_block(rc, start, &max_key, root->root_key.objectid, - generation, level); - if (ret) { - kfree(rc); - return ERR_PTR(ret); - } - - reada_start_machine(root->fs_info); - - return rc; -} - -#ifdef DEBUG -int btrfs_reada_wait(void *handle) -{ - struct reada_control *rc = handle; - struct btrfs_fs_info *fs_info = rc->fs_info; - - while (atomic_read(&rc->elems)) { - if (!atomic_read(&fs_info->reada_works_cnt)) - reada_start_machine(fs_info); - wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0, - 5 * HZ); - dump_devs(fs_info, atomic_read(&rc->elems) < 10 ? 1 : 0); - } - - dump_devs(fs_info, atomic_read(&rc->elems) < 10 ? 1 : 0); - - kref_put(&rc->refcnt, reada_control_release); - - return 0; -} -#else -int btrfs_reada_wait(void *handle) -{ - struct reada_control *rc = handle; - struct btrfs_fs_info *fs_info = rc->fs_info; - - while (atomic_read(&rc->elems)) { - if (!atomic_read(&fs_info->reada_works_cnt)) - reada_start_machine(fs_info); - wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0, - (HZ + 9) / 10); - } - - kref_put(&rc->refcnt, reada_control_release); - - return 0; -} -#endif - -void btrfs_reada_detach(void *handle) -{ - struct reada_control *rc = handle; - - kref_put(&rc->refcnt, reada_control_release); -} - -/* - * Before removing a device (device replace or device remove ioctls), call this - * function to wait for all existing readahead requests on the device and to - * make sure no one queues more readahead requests for the device. - * - * Must be called without holding neither the device list mutex nor the device - * replace semaphore, otherwise it will deadlock. - */ -void btrfs_reada_remove_dev(struct btrfs_device *dev) -{ - struct btrfs_fs_info *fs_info = dev->fs_info; - - /* Serialize with readahead extent creation at reada_find_extent(). */ - spin_lock(&fs_info->reada_lock); - set_bit(BTRFS_DEV_STATE_NO_READA, &dev->dev_state); - spin_unlock(&fs_info->reada_lock); - - /* - * There might be readahead requests added to the radix trees which - * were not yet added to the readahead work queue. We need to start - * them and wait for their completion, otherwise we can end up with - * use-after-free problems when dropping the last reference on the - * readahead extents and their zones, as they need to access the - * device structure. - */ - reada_start_machine(fs_info); - btrfs_flush_workqueue(fs_info->readahead_workers); -} - -/* - * If when removing a device (device replace or device remove ioctls) an error - * happens after calling btrfs_reada_remove_dev(), call this to undo what that - * function did. This is safe to call even if btrfs_reada_remove_dev() was not - * called before. 
- */ -void btrfs_reada_undo_remove_dev(struct btrfs_device *dev) -{ - spin_lock(&dev->fs_info->reada_lock); - clear_bit(BTRFS_DEV_STATE_NO_READA, &dev->dev_state); - spin_unlock(&dev->fs_info->reada_lock); -} diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index e2b9f8616501..a248f46cfe72 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -435,7 +435,7 @@ static int process_extent_item(struct btrfs_fs_info *fs_info, struct btrfs_extent_data_ref *dref; struct btrfs_shared_data_ref *sref; struct extent_buffer *leaf = path->nodes[0]; - u32 item_size = btrfs_item_size_nr(leaf, slot); + u32 item_size = btrfs_item_size(leaf, slot); unsigned long end, ptr; u64 offset, flags, count; int type, ret; @@ -972,6 +972,7 @@ void btrfs_free_ref_tree_range(struct btrfs_fs_info *fs_info, u64 start, /* Walk down all roots and build the ref tree, meant to be called at mount */ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info) { + struct btrfs_root *extent_root; struct btrfs_path *path; struct extent_buffer *eb; int tree_block_level = 0; @@ -985,7 +986,8 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info) if (!path) return -ENOMEM; - eb = btrfs_read_lock_root_node(fs_info->extent_root); + extent_root = btrfs_extent_root(fs_info, 0); + eb = btrfs_read_lock_root_node(extent_root); level = btrfs_header_level(eb); path->nodes[level] = eb; path->slots[level] = 0; @@ -998,7 +1000,7 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info) * would have had to added a ref key item which may appear on a * different leaf from the original extent item. */ - ret = walk_down_tree(fs_info->extent_root, path, level, + ret = walk_down_tree(extent_root, path, level, &bytenr, &num_bytes, &tree_block_level); if (ret) break; diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index e0f93b357548..a3930da4eb3f 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -439,7 +439,7 @@ process_slot: break; } next_key_min_offset = key.offset + datal; - size = btrfs_item_size_nr(leaf, slot); + size = btrfs_item_size(leaf, slot); read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot), size); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 33a0ee7ac590..f5465197996d 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -26,6 +26,7 @@ #include "misc.h" #include "subpage.h" #include "zoned.h" +#include "inode-item.h" /* * Relocation overview @@ -1736,7 +1737,8 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, memset(&next_key, 0, sizeof(next_key)); while (1) { - ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved, + ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv, + min_reserved, BTRFS_RESERVE_FLUSH_LIMIT); if (ret) goto out; @@ -1855,7 +1857,7 @@ int prepare_to_merge(struct reloc_control *rc, int err) again: if (!err) { num_bytes = rc->merging_rsv_size; - ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes, + ret = btrfs_block_rsv_add(fs_info, rc->block_rsv, num_bytes, BTRFS_RESERVE_FLUSH_ALL); if (ret) err = ret; @@ -2323,8 +2325,8 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans, * If we get an enospc just kick back -EAGAIN so we know to drop the * transaction and try to refill when we can flush all the things. 
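As an aside, several hunks in relocation.c here (and in root-tree.c further down) switch btrfs_block_rsv_add() and btrfs_block_rsv_refill() from taking a btrfs_root to taking the fs_info directly, since the root argument was only ever used to reach fs_info. A minimal sketch of a call site under the new prototypes; the helper name and the choice of flush mode are illustrative, not part of the patch:

	/* Sketch: top up a relocation reserve using the fs_info-based API. */
	static int refill_reloc_rsv(struct btrfs_fs_info *fs_info,
				    struct reloc_control *rc, u64 bytes)
	{
		return btrfs_block_rsv_refill(fs_info, rc->block_rsv, bytes,
					      BTRFS_RESERVE_FLUSH_LIMIT);
	}
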
*/ - ret = btrfs_block_rsv_refill(root, rc->block_rsv, num_bytes, - BTRFS_RESERVE_FLUSH_LIMIT); + ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv, num_bytes, + BTRFS_RESERVE_FLUSH_LIMIT); if (ret) { tmp = fs_info->nodesize * RELOCATION_RESERVED_NODES; while (tmp <= rc->reserved_bytes) @@ -3149,7 +3151,7 @@ static int add_tree_block(struct reloc_control *rc, u64 owner = 0; eb = path->nodes[0]; - item_size = btrfs_item_size_nr(eb, path->slots[0]); + item_size = btrfs_item_size(eb, path->slots[0]); if (extent_key->type == BTRFS_METADATA_ITEM_KEY || item_size >= sizeof(*ei) + sizeof(*bi)) { @@ -3550,7 +3552,7 @@ int prepare_to_relocate(struct reloc_control *rc) rc->reserved_bytes = 0; rc->block_rsv->size = rc->extent_root->fs_info->nodesize * RELOCATION_RESERVED_NODES; - ret = btrfs_block_rsv_refill(rc->extent_root, + ret = btrfs_block_rsv_refill(rc->extent_root->fs_info, rc->block_rsv, rc->block_rsv->size, BTRFS_RESERVE_FLUSH_ALL); if (ret) @@ -3598,9 +3600,9 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) while (1) { rc->reserved_bytes = 0; - ret = btrfs_block_rsv_refill(rc->extent_root, - rc->block_rsv, rc->block_rsv->size, - BTRFS_RESERVE_FLUSH_ALL); + ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv, + rc->block_rsv->size, + BTRFS_RESERVE_FLUSH_ALL); if (ret) { err = ret; break; @@ -3858,25 +3860,14 @@ out: * 0 success * -EINPROGRESS operation is already in progress, that's probably a bug * -ECANCELED cancellation request was set before the operation started - * -EAGAIN can not start because there are ongoing send operations */ static int reloc_chunk_start(struct btrfs_fs_info *fs_info) { - spin_lock(&fs_info->send_reloc_lock); - if (fs_info->send_in_progress) { - btrfs_warn_rl(fs_info, -"cannot run relocation while send operations are in progress (%d in progress)", - fs_info->send_in_progress); - spin_unlock(&fs_info->send_reloc_lock); - return -EAGAIN; - } if (test_and_set_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) { /* This should not happen */ - spin_unlock(&fs_info->send_reloc_lock); btrfs_err(fs_info, "reloc already running, cannot start"); return -EINPROGRESS; } - spin_unlock(&fs_info->send_reloc_lock); if (atomic_read(&fs_info->reloc_cancel_req) > 0) { btrfs_info(fs_info, "chunk relocation canceled on start"); @@ -3898,9 +3889,7 @@ static void reloc_chunk_end(struct btrfs_fs_info *fs_info) /* Requested after start, clear bit first so any waiters can continue */ if (atomic_read(&fs_info->reloc_cancel_req) > 0) btrfs_info(fs_info, "chunk relocation canceled during operation"); - spin_lock(&fs_info->send_reloc_lock); clear_and_wake_up_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags); - spin_unlock(&fs_info->send_reloc_lock); atomic_set(&fs_info->reloc_cancel_req, 0); } @@ -3963,7 +3952,7 @@ static const char *stage_to_string(int stage) int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) { struct btrfs_block_group *bg; - struct btrfs_root *extent_root = fs_info->extent_root; + struct btrfs_root *extent_root = btrfs_extent_root(fs_info, group_start); struct reloc_control *rc; struct inode *inode; struct btrfs_path *path; @@ -4214,7 +4203,7 @@ int btrfs_recover_relocation(struct btrfs_root *root) goto out_end; } - rc->extent_root = fs_info->extent_root; + rc->extent_root = btrfs_extent_root(fs_info, 0); set_reloc_control(rc); @@ -4305,6 +4294,7 @@ out: int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len) { struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_root *csum_root; struct 
btrfs_ordered_sum *sums; struct btrfs_ordered_extent *ordered; int ret; @@ -4316,7 +4306,8 @@ int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len) BUG_ON(ordered->file_offset != file_pos || ordered->num_bytes != len); disk_bytenr = file_pos + inode->index_cnt; - ret = btrfs_lookup_csums_range(fs_info->csum_root, disk_bytenr, + csum_root = btrfs_csum_root(fs_info, disk_bytenr); + ret = btrfs_lookup_csums_range(csum_root, disk_bytenr, disk_bytenr + len - 1, &list, 0); if (ret) goto out; diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index d20166336557..3d68d2dcd83e 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -25,7 +25,7 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot, u32 len; int need_reset = 0; - len = btrfs_item_size_nr(eb, slot); + len = btrfs_item_size(eb, slot); read_extent_buffer(eb, item, btrfs_item_ptr_offset(eb, slot), min_t(u32, len, sizeof(*item))); if (len < sizeof(*item)) @@ -146,7 +146,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root l = path->nodes[0]; slot = path->slots[0]; ptr = btrfs_item_ptr_offset(l, slot); - old_len = btrfs_item_size_nr(l, slot); + old_len = btrfs_item_size(l, slot); /* * If this is the first time we update the root item which originated @@ -502,7 +502,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, num_bytes = btrfs_calc_insert_metadata_size(fs_info, items); rsv->space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); - ret = btrfs_block_rsv_add(root, rsv, num_bytes, + ret = btrfs_block_rsv_add(fs_info, rsv, num_bytes, BTRFS_RESERVE_FLUSH_ALL); if (ret == -ENOSPC && use_global_rsv) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 8f6ceea33969..2e9a322773f2 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -39,21 +39,20 @@ struct scrub_block; struct scrub_ctx; /* - * the following three values only influence the performance. + * The following three values only influence the performance. + * * The last one configures the number of parallel and outstanding I/O - * operations. The first two values configure an upper limit for the number + * operations. The first one configures an upper limit for the number * of (dynamically allocated) pages that are added to a bio. */ -#define SCRUB_PAGES_PER_RD_BIO 32 /* 128k per bio */ -#define SCRUB_PAGES_PER_WR_BIO 32 /* 128k per bio */ -#define SCRUB_BIOS_PER_SCTX 64 /* 8MB per device in flight */ +#define SCRUB_PAGES_PER_BIO 32 /* 128KiB per bio for x86 */ +#define SCRUB_BIOS_PER_SCTX 64 /* 8MiB per device in flight for x86 */ /* - * the following value times PAGE_SIZE needs to be large enough to match the + * The following value times PAGE_SIZE needs to be large enough to match the * largest node/leaf/sector size that shall be supported. - * Values larger than BTRFS_STRIPE_LEN are not supported. 
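For context, the hunk that follows replaces the hardcoded SCRUB_MAX_PAGES_PER_BLOCK of 16 with a value derived from the largest supported metadata block size. With the in-tree constants (BTRFS_MAX_METADATA_BLOCKSIZE is 64KiB, SZ_4K is 4KiB) the result is still 16 on 4KiB-page systems; a standalone check of that arithmetic, using local stand-ins for the kernel constants:

	#include <stdio.h>

	/* Local stand-ins for the kernel constants used by the new define. */
	#define BTRFS_MAX_METADATA_BLOCKSIZE	(64 * 1024)
	#define SZ_4K				(4 * 1024)

	#define SCRUB_MAX_PAGES_PER_BLOCK (BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K)

	int main(void)
	{
		/* Prints 16, matching the old hardcoded value. */
		printf("%d\n", SCRUB_MAX_PAGES_PER_BLOCK);
		return 0;
	}
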
*/ -#define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */ +#define SCRUB_MAX_PAGES_PER_BLOCK (BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K) struct scrub_recover { refcount_t refs; @@ -88,11 +87,7 @@ struct scrub_bio { blk_status_t status; u64 logical; u64 physical; -#if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO - struct scrub_page *pagev[SCRUB_PAGES_PER_WR_BIO]; -#else - struct scrub_page *pagev[SCRUB_PAGES_PER_RD_BIO]; -#endif + struct scrub_page *pagev[SCRUB_PAGES_PER_BIO]; int page_count; int next_free; struct btrfs_work work; @@ -163,7 +158,7 @@ struct scrub_ctx { struct list_head csum_list; atomic_t cancel_req; int readonly; - int pages_per_rd_bio; + int pages_per_bio; /* State of IO submission throttling affecting the associated device */ ktime_t throttle_deadline; @@ -174,7 +169,6 @@ struct scrub_ctx { struct scrub_bio *wr_curr_bio; struct mutex wr_lock; - int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */ struct btrfs_device *wr_tgtdev; bool flush_all_writes; @@ -578,7 +572,7 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx( goto nomem; refcount_set(&sctx->refs, 1); sctx->is_dev_replace = is_dev_replace; - sctx->pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO; + sctx->pages_per_bio = SCRUB_PAGES_PER_BIO; sctx->curr = -1; sctx->fs_info = fs_info; INIT_LIST_HEAD(&sctx->csum_list); @@ -616,7 +610,6 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx( sctx->wr_curr_bio = NULL; if (is_dev_replace) { WARN_ON(!fs_info->dev_replace.tgtdev); - sctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO; sctx->wr_tgtdev = fs_info->dev_replace.tgtdev; sctx->flush_all_writes = false; } @@ -758,7 +751,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) eb = path->nodes[0]; ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); - item_size = btrfs_item_size_nr(eb, path->slots[0]); + item_size = btrfs_item_size(eb, path->slots[0]); if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { do { @@ -852,8 +845,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) have_csum = sblock_to_check->pagev[0]->have_csum; dev = sblock_to_check->pagev[0]->dev; - if (btrfs_is_zoned(fs_info) && !sctx->is_dev_replace) - return btrfs_repair_one_zone(fs_info, logical); + if (!sctx->is_dev_replace && btrfs_repair_one_zone(fs_info, logical)) + return 0; /* * We must use GFP_NOFS because the scrub task might be waiting for a @@ -1313,7 +1306,7 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock, recover->bioc = bioc; recover->map_length = mapped_length; - BUG_ON(page_index >= SCRUB_MAX_PAGES_PER_BLOCK); + ASSERT(page_index < SCRUB_MAX_PAGES_PER_BLOCK); nmirrors = min(scrub_nr_raid_mirrors(bioc), BTRFS_MAX_MIRRORS); @@ -1675,7 +1668,7 @@ again: sbio->dev = sctx->wr_tgtdev; bio = sbio->bio; if (!bio) { - bio = btrfs_bio_alloc(sctx->pages_per_wr_bio); + bio = btrfs_bio_alloc(sctx->pages_per_bio); sbio->bio = bio; } @@ -1708,7 +1701,7 @@ again: sbio->pagev[sbio->page_count] = spage; scrub_page_get(spage); sbio->page_count++; - if (sbio->page_count == sctx->pages_per_wr_bio) + if (sbio->page_count == sctx->pages_per_bio) scrub_wr_submit(sctx); mutex_unlock(&sctx->wr_lock); @@ -1755,7 +1748,7 @@ static void scrub_wr_bio_end_io_worker(struct btrfs_work *work) struct scrub_ctx *sctx = sbio->sctx; int i; - WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO); + ASSERT(sbio->page_count <= SCRUB_PAGES_PER_BIO); if (sbio->status) { struct btrfs_dev_replace *dev_replace = &sbio->sctx->fs_info->dev_replace; @@ -2101,7 +2094,7 @@ 
again: sbio->dev = spage->dev; bio = sbio->bio; if (!bio) { - bio = btrfs_bio_alloc(sctx->pages_per_rd_bio); + bio = btrfs_bio_alloc(sctx->pages_per_bio); sbio->bio = bio; } @@ -2135,7 +2128,7 @@ again: scrub_block_get(sblock); /* one for the page added to the bio */ atomic_inc(&sblock->outstanding_pages); sbio->page_count++; - if (sbio->page_count == sctx->pages_per_rd_bio) + if (sbio->page_count == sctx->pages_per_bio) scrub_submit(sctx); return 0; @@ -2297,7 +2290,7 @@ leave_nomem: scrub_block_put(sblock); return -ENOMEM; } - BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK); + ASSERT(index < SCRUB_MAX_PAGES_PER_BLOCK); scrub_page_get(spage); sblock->pagev[index] = spage; spage->sblock = sblock; @@ -2369,7 +2362,7 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work) struct scrub_ctx *sctx = sbio->sctx; int i; - BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO); + ASSERT(sbio->page_count <= SCRUB_PAGES_PER_BIO); if (sbio->status) { for (i = 0; i < sbio->page_count; i++) { struct scrub_page *spage = sbio->pagev[i]; @@ -2631,7 +2624,7 @@ leave_nomem: scrub_block_put(sblock); return -ENOMEM; } - BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK); + ASSERT(index < SCRUB_MAX_PAGES_PER_BLOCK); /* For scrub block */ scrub_page_get(spage); sblock->pagev[index] = spage; @@ -2892,15 +2885,15 @@ static void scrub_parity_put(struct scrub_parity *sparity) static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, struct map_lookup *map, struct btrfs_device *sdev, - struct btrfs_path *path, u64 logic_start, u64 logic_end) { struct btrfs_fs_info *fs_info = sctx->fs_info; - struct btrfs_root *root = fs_info->extent_root; - struct btrfs_root *csum_root = fs_info->csum_root; + struct btrfs_root *root = btrfs_extent_root(fs_info, logic_start); + struct btrfs_root *csum_root; struct btrfs_extent_item *extent; struct btrfs_io_context *bioc = NULL; + struct btrfs_path *path; u64 flags; int ret; int slot; @@ -2919,6 +2912,16 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, int extent_mirror_num; int stop_loop = 0; + path = btrfs_alloc_path(); + if (!path) { + spin_lock(&sctx->stat_lock); + sctx->stat.malloc_errors++; + spin_unlock(&sctx->stat_lock); + return -ENOMEM; + } + path->search_commit_root = 1; + path->skip_locking = 1; + ASSERT(map->stripe_len <= U32_MAX); nsectors = map->stripe_len >> fs_info->sectorsize_bits; bitmap_len = scrub_calc_parity_bitmap_len(nsectors); @@ -2928,6 +2931,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; spin_unlock(&sctx->stat_lock); + btrfs_free_path(path); return -ENOMEM; } @@ -3060,6 +3064,7 @@ again: extent_dev = bioc->stripes[0].dev; btrfs_put_bioc(bioc); + csum_root = btrfs_csum_root(fs_info, extent_logical); ret = btrfs_lookup_csums_range(csum_root, extent_logical, extent_logical + extent_len - 1, @@ -3116,7 +3121,7 @@ out: scrub_wr_submit(sctx); mutex_unlock(&sctx->wr_lock); - btrfs_release_path(path); + btrfs_free_path(path); return ret < 0 ? 
ret : 0; } @@ -3161,17 +3166,18 @@ static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical, } static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, + struct btrfs_block_group *bg, struct map_lookup *map, struct btrfs_device *scrub_dev, - int num, u64 base, u64 length, - struct btrfs_block_group *cache) + int stripe_index, u64 dev_extent_len) { - struct btrfs_path *path, *ppath; + struct btrfs_path *path; struct btrfs_fs_info *fs_info = sctx->fs_info; - struct btrfs_root *root = fs_info->extent_root; - struct btrfs_root *csum_root = fs_info->csum_root; + struct btrfs_root *root; + struct btrfs_root *csum_root; struct btrfs_extent_item *extent; struct blk_plug plug; + const u64 chunk_logical = bg->start; u64 flags; int ret; int slot; @@ -3183,10 +3189,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, u64 physical_end; u64 generation; int mirror_num; - struct reada_control *reada1; - struct reada_control *reada2; struct btrfs_key key; - struct btrfs_key key_end; u64 increment = map->stripe_len; u64 offset; u64 extent_logical; @@ -3202,25 +3205,26 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, int extent_mirror_num; int stop_loop = 0; - physical = map->stripes[num].physical; + physical = map->stripes[stripe_index].physical; offset = 0; - nstripes = div64_u64(length, map->stripe_len); + nstripes = div64_u64(dev_extent_len, map->stripe_len); mirror_num = 1; increment = map->stripe_len; if (map->type & BTRFS_BLOCK_GROUP_RAID0) { - offset = map->stripe_len * num; + offset = map->stripe_len * stripe_index; increment = map->stripe_len * map->num_stripes; } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { int factor = map->num_stripes / map->sub_stripes; - offset = map->stripe_len * (num / map->sub_stripes); + offset = map->stripe_len * (stripe_index / map->sub_stripes); increment = map->stripe_len * factor; - mirror_num = num % map->sub_stripes + 1; + mirror_num = stripe_index % map->sub_stripes + 1; } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) { - mirror_num = num % map->num_stripes + 1; + mirror_num = stripe_index % map->num_stripes + 1; } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { - mirror_num = num % map->num_stripes + 1; + mirror_num = stripe_index % map->num_stripes + 1; } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - get_raid56_logic_offset(physical, num, map, &offset, NULL); + get_raid56_logic_offset(physical, stripe_index, map, &offset, + NULL); increment = map->stripe_len * nr_data_stripes(map); } @@ -3228,12 +3232,6 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, if (!path) return -ENOMEM; - ppath = btrfs_alloc_path(); - if (!ppath) { - btrfs_free_path(path); - return -ENOMEM; - } - /* * work on commit root. The related disk blocks are static as * long as COW is applied. This means, it is save to rewrite @@ -3241,20 +3239,14 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, */ path->search_commit_root = 1; path->skip_locking = 1; + path->reada = READA_FORWARD; - ppath->search_commit_root = 1; - ppath->skip_locking = 1; - /* - * trigger the readahead for extent tree csum tree and wait for - * completion. 
During readahead, the scrub is officially paused - * to not hold off transaction commits - */ - logical = base + offset; + logical = chunk_logical + offset; physical_end = physical + nstripes * map->stripe_len; if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - get_raid56_logic_offset(physical_end, num, + get_raid56_logic_offset(physical_end, stripe_index, map, &logic_end, NULL); - logic_end += base; + logic_end += chunk_logical; } else { logic_end = logical + increment * nstripes; } @@ -3262,32 +3254,8 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, atomic_read(&sctx->bios_in_flight) == 0); scrub_blocked_if_needed(fs_info); - /* FIXME it might be better to start readahead at commit root */ - key.objectid = logical; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = (u64)0; - key_end.objectid = logic_end; - key_end.type = BTRFS_METADATA_ITEM_KEY; - key_end.offset = (u64)-1; - reada1 = btrfs_reada_add(root, &key, &key_end); - - if (cache->flags & BTRFS_BLOCK_GROUP_DATA) { - key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; - key.type = BTRFS_EXTENT_CSUM_KEY; - key.offset = logical; - key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID; - key_end.type = BTRFS_EXTENT_CSUM_KEY; - key_end.offset = logic_end; - reada2 = btrfs_reada_add(csum_root, &key, &key_end); - } else { - reada2 = NULL; - } - - if (!IS_ERR(reada1)) - btrfs_reada_wait(reada1); - if (!IS_ERR_OR_NULL(reada2)) - btrfs_reada_wait(reada2); - + root = btrfs_extent_root(fs_info, logical); + csum_root = btrfs_csum_root(fs_info, logical); /* * collect all data csums for the stripe to avoid seeking during @@ -3333,16 +3301,16 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, } if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - ret = get_raid56_logic_offset(physical, num, map, - &logical, + ret = get_raid56_logic_offset(physical, stripe_index, + map, &logical, &stripe_logical); - logical += base; + logical += chunk_logical; if (ret) { /* it is parity strip */ - stripe_logical += base; + stripe_logical += chunk_logical; stripe_end = stripe_logical + increment; ret = scrub_raid56_parity(sctx, map, scrub_dev, - ppath, stripe_logical, + stripe_logical, stripe_end); if (ret) goto out; @@ -3419,13 +3387,13 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, * Continuing would prevent reusing its device extents * for new block groups for a long time. 
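A few lines above, the per-chunk readahead setup (btrfs_reada_add()/btrfs_reada_wait()) is dropped and the extent and csum roots are now resolved per logical address, groundwork for extent tree v2 where several global extent/csum trees coexist. A minimal sketch of that lookup pattern, assuming the btrfs_extent_root()/btrfs_csum_root() helpers introduced elsewhere in this series (the function name and the NULL handling are illustrative):

	/* Sketch: resolve the global roots covering a given logical address. */
	static int resolve_roots_for_logical(struct btrfs_fs_info *fs_info,
					     u64 logical)
	{
		struct btrfs_root *extent_root = btrfs_extent_root(fs_info, logical);
		struct btrfs_root *csum_root = btrfs_csum_root(fs_info, logical);

		if (!extent_root || !csum_root)
			return -ENOENT;

		/* ... search extent_root/csum_root for items covering @logical ... */
		return 0;
	}
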
*/ - spin_lock(&cache->lock); - if (cache->removed) { - spin_unlock(&cache->lock); + spin_lock(&bg->lock); + if (bg->removed) { + spin_unlock(&bg->lock); ret = 0; goto out; } - spin_unlock(&cache->lock); + spin_unlock(&bg->lock); extent = btrfs_item_ptr(l, slot, struct btrfs_extent_item); @@ -3504,16 +3472,16 @@ again: loop: physical += map->stripe_len; ret = get_raid56_logic_offset(physical, - num, map, &logical, - &stripe_logical); - logical += base; + stripe_index, map, + &logical, &stripe_logical); + logical += chunk_logical; if (ret && physical < physical_end) { - stripe_logical += base; + stripe_logical += chunk_logical; stripe_end = stripe_logical + increment; ret = scrub_raid56_parity(sctx, - map, scrub_dev, ppath, + map, scrub_dev, stripe_logical, stripe_end); if (ret) @@ -3543,8 +3511,8 @@ skip: physical += map->stripe_len; spin_lock(&sctx->stat_lock); if (stop_loop) - sctx->stat.last_physical = map->stripes[num].physical + - length; + sctx->stat.last_physical = map->stripes[stripe_index].physical + + dev_extent_len; else sctx->stat.last_physical = physical; spin_unlock(&sctx->stat_lock); @@ -3560,14 +3528,14 @@ out: blk_finish_plug(&plug); btrfs_free_path(path); - btrfs_free_path(ppath); if (sctx->is_dev_replace && ret >= 0) { int ret2; - ret2 = sync_write_pointer_for_zoned(sctx, base + offset, - map->stripes[num].physical, - physical_end); + ret2 = sync_write_pointer_for_zoned(sctx, + chunk_logical + offset, + map->stripes[stripe_index].physical, + physical_end); if (ret2) ret = ret2; } @@ -3576,10 +3544,10 @@ out: } static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, + struct btrfs_block_group *bg, struct btrfs_device *scrub_dev, - u64 chunk_offset, u64 length, u64 dev_offset, - struct btrfs_block_group *cache) + u64 dev_extent_len) { struct btrfs_fs_info *fs_info = sctx->fs_info; struct extent_map_tree *map_tree = &fs_info->mapping_tree; @@ -3589,7 +3557,7 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, int ret = 0; read_lock(&map_tree->lock); - em = lookup_extent_mapping(map_tree, chunk_offset, 1); + em = lookup_extent_mapping(map_tree, bg->start, bg->length); read_unlock(&map_tree->lock); if (!em) { @@ -3597,26 +3565,24 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, * Might have been an unused block group deleted by the cleaner * kthread or relocation. 
*/ - spin_lock(&cache->lock); - if (!cache->removed) + spin_lock(&bg->lock); + if (!bg->removed) ret = -EINVAL; - spin_unlock(&cache->lock); + spin_unlock(&bg->lock); return ret; } - - map = em->map_lookup; - if (em->start != chunk_offset) + if (em->start != bg->start) goto out; - - if (em->len < length) + if (em->len < dev_extent_len) goto out; + map = em->map_lookup; for (i = 0; i < map->num_stripes; ++i) { if (map->stripes[i].dev->bdev == scrub_dev->bdev && map->stripes[i].physical == dev_offset) { - ret = scrub_stripe(sctx, map, scrub_dev, i, - chunk_offset, length, cache); + ret = scrub_stripe(sctx, bg, map, scrub_dev, i, + dev_extent_len); if (ret) goto out; } @@ -3654,7 +3620,6 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, struct btrfs_path *path; struct btrfs_fs_info *fs_info = sctx->fs_info; struct btrfs_root *root = fs_info->dev_root; - u64 length; u64 chunk_offset; int ret = 0; int ro_set; @@ -3678,6 +3643,8 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, key.type = BTRFS_DEV_EXTENT_KEY; while (1) { + u64 dev_extent_len; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) break; @@ -3714,9 +3681,9 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, break; dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); - length = btrfs_dev_extent_length(l, dev_extent); + dev_extent_len = btrfs_dev_extent_length(l, dev_extent); - if (found_key.offset + length <= start) + if (found_key.offset + dev_extent_len <= start) goto skip; chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); @@ -3850,13 +3817,14 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, scrub_pause_off(fs_info); down_write(&dev_replace->rwsem); - dev_replace->cursor_right = found_key.offset + length; + dev_replace->cursor_right = found_key.offset + dev_extent_len; dev_replace->cursor_left = found_key.offset; dev_replace->item_needs_writeback = 1; up_write(&dev_replace->rwsem); - ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length, - found_key.offset, cache); + ASSERT(cache->start == chunk_offset); + ret = scrub_chunk(sctx, cache, scrub_dev, found_key.offset, + dev_extent_len); /* * flush, submit all pending read and write bios, afterwards @@ -3937,7 +3905,7 @@ skip_unfreeze: break; } skip: - key.offset = found_key.offset + length; + key.offset = found_key.offset + dev_extent_len; btrfs_release_path(path); } diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 040324d71118..d8ccb62aa7d2 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -24,6 +24,7 @@ #include "transaction.h" #include "compression.h" #include "xattr.h" +#include "print-tree.h" /* * Maximum number of references an extent can have in order for us to attempt to @@ -98,6 +99,15 @@ struct send_ctx { struct btrfs_key *cmp_key; /* + * Keep track of the generation of the last transaction that was used + * for relocating a block group. This is periodically checked in order + * to detect if a relocation happened since the last check, so that we + * don't operate on stale extent buffers for nodes (level >= 1) or on + * stale disk_bytenr values of file extent items. + */ + u64 last_reloc_trans; + + /* * infos of the currently processed inode. In case of deleted inodes, * these are the values from the deleted inode. 
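To make the pattern concrete: the last_reloc_trans field added to send_ctx above is compared against fs_info->last_reloc_trans at several points later in this patch, always under the commit root semaphore. Reduced to its core, the detection logic looks like the sketch below (the real call sites additionally release their paths and re-search, as the later hunks show; the helper name is illustrative):

	/* Sketch: did a block group relocation commit since the last check? */
	static bool send_reloc_happened(struct btrfs_fs_info *fs_info,
					struct send_ctx *sctx)
	{
		bool reloc = false;

		down_read(&fs_info->commit_root_sem);
		if (fs_info->last_reloc_trans > sctx->last_reloc_trans) {
			sctx->last_reloc_trans = fs_info->last_reloc_trans;
			reloc = true;
		}
		up_read(&fs_info->commit_root_sem);

		return reloc;
	}
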
*/ @@ -898,7 +908,6 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path, iterate_inode_ref_t iterate, void *ctx) { struct extent_buffer *eb = path->nodes[0]; - struct btrfs_item *item; struct btrfs_inode_ref *iref; struct btrfs_inode_extref *extref; struct btrfs_path *tmp_path; @@ -930,12 +939,11 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path, if (found_key->type == BTRFS_INODE_REF_KEY) { ptr = (unsigned long)btrfs_item_ptr(eb, slot, struct btrfs_inode_ref); - item = btrfs_item_nr(slot); - total = btrfs_item_size(eb, item); + total = btrfs_item_size(eb, slot); elem_size = sizeof(*iref); } else { ptr = btrfs_item_ptr_offset(eb, slot); - total = btrfs_item_size_nr(eb, slot); + total = btrfs_item_size(eb, slot); elem_size = sizeof(*extref); } @@ -1004,7 +1012,7 @@ out: typedef int (*iterate_dir_item_t)(int num, struct btrfs_key *di_key, const char *name, int name_len, const char *data, int data_len, - u8 type, void *ctx); + void *ctx); /* * Helper function to iterate the entries in ONE btrfs_dir_item. @@ -1018,7 +1026,6 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, { int ret = 0; struct extent_buffer *eb; - struct btrfs_item *item; struct btrfs_dir_item *di; struct btrfs_key di_key; char *buf = NULL; @@ -1030,7 +1037,6 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, u32 total; int slot; int num; - u8 type; /* * Start with a small buffer (1 page). If later we end up needing more @@ -1047,20 +1053,18 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, eb = path->nodes[0]; slot = path->slots[0]; - item = btrfs_item_nr(slot); di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); cur = 0; len = 0; - total = btrfs_item_size(eb, item); + total = btrfs_item_size(eb, slot); num = 0; while (cur < total) { name_len = btrfs_dir_name_len(eb, di); data_len = btrfs_dir_data_len(eb, di); - type = btrfs_dir_type(eb, di); btrfs_dir_item_key_to_cpu(eb, di, &di_key); - if (type == BTRFS_FT_XATTR) { + if (btrfs_dir_type(eb, di) == BTRFS_FT_XATTR) { if (name_len > XATTR_NAME_MAX) { ret = -ENAMETOOLONG; goto out; @@ -1110,7 +1114,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, cur += len; ret = iterate(num, &di_key, buf, name_len, buf + name_len, - data_len, type, ctx); + data_len, ctx); if (ret < 0) goto out; if (ret) { @@ -1427,6 +1431,26 @@ static int find_extent_clone(struct send_ctx *sctx, if (ret < 0) goto out; + down_read(&fs_info->commit_root_sem); + if (fs_info->last_reloc_trans > sctx->last_reloc_trans) { + /* + * A transaction commit for a transaction in which block group + * relocation was done just happened. + * The disk_bytenr of the file extent item we processed is + * possibly stale, referring to the extent's location before + * relocation. So act as if we haven't found any clone sources + * and fallback to write commands, which will read the correct + * data from the new extent location. Otherwise we will fail + * below because we haven't found our own back reference or we + * could be getting incorrect sources in case the old extent + * was already reallocated after the relocation. + */ + up_read(&fs_info->commit_root_sem); + ret = -ENOENT; + goto out; + } + up_read(&fs_info->commit_root_sem); + if (!backref_ctx.found_itself) { /* found a bug in backref code? 
*/ ret = -EIO; @@ -1692,8 +1716,7 @@ out: */ static int lookup_dir_item_inode(struct btrfs_root *root, u64 dir, const char *name, int name_len, - u64 *found_inode, - u8 *found_type) + u64 *found_inode) { int ret = 0; struct btrfs_dir_item *di; @@ -1716,7 +1739,6 @@ static int lookup_dir_item_inode(struct btrfs_root *root, goto out; } *found_inode = key.objectid; - *found_type = btrfs_dir_type(path->nodes[0], di); out: btrfs_free_path(path); @@ -1839,7 +1861,6 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, int ret = 0; u64 gen; u64 other_inode = 0; - u8 other_type = 0; if (!sctx->parent_root) goto out; @@ -1867,7 +1888,7 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, } ret = lookup_dir_item_inode(sctx->parent_root, dir, name, name_len, - &other_inode, &other_type); + &other_inode); if (ret < 0 && ret != -ENOENT) goto out; if (ret) { @@ -1912,7 +1933,6 @@ static int did_overwrite_ref(struct send_ctx *sctx, int ret = 0; u64 gen; u64 ow_inode; - u8 other_type; if (!sctx->parent_root) goto out; @@ -1936,7 +1956,7 @@ static int did_overwrite_ref(struct send_ctx *sctx, /* check if the ref was overwritten by another ref */ ret = lookup_dir_item_inode(sctx->send_root, dir, name, name_len, - &ow_inode, &other_type); + &ow_inode); if (ret < 0 && ret != -ENOENT) goto out; if (ret) { @@ -3622,7 +3642,7 @@ static int is_ancestor(struct btrfs_root *root, key.type != BTRFS_INODE_EXTREF_KEY) break; - item_size = btrfs_item_size_nr(leaf, slot); + item_size = btrfs_item_size(leaf, slot); while (cur_offset < item_size) { u64 parent; u64 parent_gen; @@ -4651,9 +4671,8 @@ out: } static int __process_new_xattr(int num, struct btrfs_key *di_key, - const char *name, int name_len, - const char *data, int data_len, - u8 type, void *ctx) + const char *name, int name_len, const char *data, + int data_len, void *ctx) { int ret; struct send_ctx *sctx = ctx; @@ -4697,8 +4716,7 @@ out: static int __process_deleted_xattr(int num, struct btrfs_key *di_key, const char *name, int name_len, - const char *data, int data_len, - u8 type, void *ctx) + const char *data, int data_len, void *ctx) { int ret; struct send_ctx *sctx = ctx; @@ -4743,10 +4761,8 @@ struct find_xattr_ctx { int found_data_len; }; -static int __find_xattr(int num, struct btrfs_key *di_key, - const char *name, int name_len, - const char *data, int data_len, - u8 type, void *vctx) +static int __find_xattr(int num, struct btrfs_key *di_key, const char *name, + int name_len, const char *data, int data_len, void *vctx) { struct find_xattr_ctx *ctx = vctx; @@ -4796,7 +4812,7 @@ static int find_xattr(struct btrfs_root *root, static int __process_changed_new_xattr(int num, struct btrfs_key *di_key, const char *name, int name_len, const char *data, int data_len, - u8 type, void *ctx) + void *ctx) { int ret; struct send_ctx *sctx = ctx; @@ -4808,12 +4824,12 @@ static int __process_changed_new_xattr(int num, struct btrfs_key *di_key, &found_data_len); if (ret == -ENOENT) { ret = __process_new_xattr(num, di_key, name, name_len, data, - data_len, type, ctx); + data_len, ctx); } else if (ret >= 0) { if (data_len != found_data_len || memcmp(data, found_data, data_len)) { ret = __process_new_xattr(num, di_key, name, name_len, - data, data_len, type, ctx); + data, data_len, ctx); } else { ret = 0; } @@ -4826,7 +4842,7 @@ static int __process_changed_new_xattr(int num, struct btrfs_key *di_key, static int __process_changed_deleted_xattr(int num, struct btrfs_key *di_key, const char *name, int name_len, const char 
*data, int data_len, - u8 type, void *ctx) + void *ctx) { int ret; struct send_ctx *sctx = ctx; @@ -4835,7 +4851,7 @@ static int __process_changed_deleted_xattr(int num, struct btrfs_key *di_key, name, name_len, NULL, NULL); if (ret == -ENOENT) ret = __process_deleted_xattr(num, di_key, name, name_len, data, - data_len, type, ctx); + data_len, ctx); else if (ret >= 0) ret = 0; @@ -6566,7 +6582,7 @@ static int compare_refs(struct send_ctx *sctx, struct btrfs_path *path, } leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); + item_size = btrfs_item_size(leaf, path->slots[0]); ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); while (cur_offset < item_size) { extref = (struct btrfs_inode_extref *)(ptr + @@ -6597,6 +6613,50 @@ static int changed_cb(struct btrfs_path *left_path, { int ret = 0; + /* + * We can not hold the commit root semaphore here. This is because in + * the case of sending and receiving to the same filesystem, using a + * pipe, could result in a deadlock: + * + * 1) The task running send blocks on the pipe because it's full; + * + * 2) The task running receive, which is the only consumer of the pipe, + * is waiting for a transaction commit (for example due to a space + * reservation when doing a write or triggering a transaction commit + * when creating a subvolume); + * + * 3) The transaction is waiting to write lock the commit root semaphore, + * but can not acquire it since it's being held at 1). + * + * Down this call chain we write to the pipe through kernel_write(). + * The same type of problem can also happen when sending to a file that + * is stored in the same filesystem - when reserving space for a write + * into the file, we can trigger a transaction commit. + * + * Our caller has supplied us with clones of leaves from the send and + * parent roots, so we're safe here from a concurrent relocation and + * further reallocation of metadata extents while we are here. Below we + * also assert that the leaves are clones. + */ + lockdep_assert_not_held(&sctx->send_root->fs_info->commit_root_sem); + + /* + * We always have a send root, so left_path is never NULL. We will not + * have a leaf when we have reached the end of the send root but have + * not yet reached the end of the parent root. + */ + if (left_path->nodes[0]) + ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED, + &left_path->nodes[0]->bflags)); + /* + * When doing a full send we don't have a parent root, so right_path is + * NULL. When doing an incremental send, we may have reached the end of + * the parent root already, so we don't have a leaf at right_path. + */ + if (right_path && right_path->nodes[0]) + ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED, + &right_path->nodes[0]->bflags)); + if (result == BTRFS_COMPARE_TREE_SAME) { if (key->type == BTRFS_INODE_REF_KEY || key->type == BTRFS_INODE_EXTREF_KEY) { @@ -6643,14 +6703,46 @@ out: return ret; } +static int search_key_again(const struct send_ctx *sctx, + struct btrfs_root *root, + struct btrfs_path *path, + const struct btrfs_key *key) +{ + int ret; + + if (!path->need_commit_sem) + lockdep_assert_held_read(&root->fs_info->commit_root_sem); + + /* + * Roots used for send operations are readonly and no one can add, + * update or remove keys from them, so we should be able to find our + * key again. The only exception is deduplication, which can operate on + * readonly roots and add, update or remove keys to/from them - but at + * the moment we don't allow it to run in parallel with send. 
+ */ + ret = btrfs_search_slot(NULL, root, key, path, 0, 0); + ASSERT(ret <= 0); + if (ret > 0) { + btrfs_print_tree(path->nodes[path->lowest_level], false); + btrfs_err(root->fs_info, +"send: key (%llu %u %llu) not found in %s root %llu, lowest_level %d, slot %d", + key->objectid, key->type, key->offset, + (root == sctx->parent_root ? "parent" : "send"), + root->root_key.objectid, path->lowest_level, + path->slots[path->lowest_level]); + return -EUCLEAN; + } + + return ret; +} + static int full_send_tree(struct send_ctx *sctx) { int ret; struct btrfs_root *send_root = sctx->send_root; struct btrfs_key key; + struct btrfs_fs_info *fs_info = send_root->fs_info; struct btrfs_path *path; - struct extent_buffer *eb; - int slot; path = alloc_path_for_send(); if (!path) @@ -6661,6 +6753,10 @@ static int full_send_tree(struct send_ctx *sctx) key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; + down_read(&fs_info->commit_root_sem); + sctx->last_reloc_trans = fs_info->last_reloc_trans; + up_read(&fs_info->commit_root_sem); + ret = btrfs_search_slot_for_read(send_root, &key, path, 1, 0); if (ret < 0) goto out; @@ -6668,15 +6764,35 @@ static int full_send_tree(struct send_ctx *sctx) goto out_finish; while (1) { - eb = path->nodes[0]; - slot = path->slots[0]; - btrfs_item_key_to_cpu(eb, &key, slot); + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); ret = changed_cb(path, NULL, &key, BTRFS_COMPARE_TREE_NEW, sctx); if (ret < 0) goto out; + down_read(&fs_info->commit_root_sem); + if (fs_info->last_reloc_trans > sctx->last_reloc_trans) { + sctx->last_reloc_trans = fs_info->last_reloc_trans; + up_read(&fs_info->commit_root_sem); + /* + * A transaction used for relocating a block group was + * committed or is about to finish its commit. Release + * our path (leaf) and restart the search, so that we + * avoid operating on any file extent items that are + * stale, with a disk_bytenr that reflects a pre + * relocation value. This way we avoid as much as + * possible to fallback to regular writes when checking + * if we can clone file ranges. 
+ */ + btrfs_release_path(path); + ret = search_key_again(sctx, send_root, path, &key); + if (ret < 0) + goto out; + } else { + up_read(&fs_info->commit_root_sem); + } + ret = btrfs_next_item(send_root, path); if (ret < 0) goto out; @@ -6694,6 +6810,20 @@ out: return ret; } +static int replace_node_with_clone(struct btrfs_path *path, int level) +{ + struct extent_buffer *clone; + + clone = btrfs_clone_extent_buffer(path->nodes[level]); + if (!clone) + return -ENOMEM; + + free_extent_buffer(path->nodes[level]); + path->nodes[level] = clone; + + return 0; +} + static int tree_move_down(struct btrfs_path *path, int *level, u64 reada_min_gen) { struct extent_buffer *eb; @@ -6703,6 +6833,8 @@ static int tree_move_down(struct btrfs_path *path, int *level, u64 reada_min_gen u64 reada_max; u64 reada_done = 0; + lockdep_assert_held_read(&parent->fs_info->commit_root_sem); + BUG_ON(*level == 0); eb = btrfs_read_node_slot(parent, slot); if (IS_ERR(eb)) @@ -6726,6 +6858,10 @@ static int tree_move_down(struct btrfs_path *path, int *level, u64 reada_min_gen path->nodes[*level - 1] = eb; path->slots[*level - 1] = 0; (*level)--; + + if (*level == 0) + return replace_node_with_clone(path, 0); + return 0; } @@ -6739,8 +6875,10 @@ static int tree_move_next_or_upnext(struct btrfs_path *path, path->slots[*level]++; while (path->slots[*level] >= nritems) { - if (*level == root_level) + if (*level == root_level) { + path->slots[*level] = nritems - 1; return -1; + } /* move upnext */ path->slots[*level] = 0; @@ -6772,14 +6910,20 @@ static int tree_advance(struct btrfs_path *path, } else { ret = tree_move_down(path, level, reada_min_gen); } - if (ret >= 0) { - if (*level == 0) - btrfs_item_key_to_cpu(path->nodes[*level], key, - path->slots[*level]); - else - btrfs_node_key_to_cpu(path->nodes[*level], key, - path->slots[*level]); - } + + /* + * Even if we have reached the end of a tree, ret is -1, update the key + * anyway, so that in case we need to restart due to a block group + * relocation, we can assert that the last key of the root node still + * exists in the tree. + */ + if (*level == 0) + btrfs_item_key_to_cpu(path->nodes[*level], key, + path->slots[*level]); + else + btrfs_node_key_to_cpu(path->nodes[*level], key, + path->slots[*level]); + return ret; } @@ -6791,8 +6935,8 @@ static int tree_compare_item(struct btrfs_path *left_path, int len1, len2; unsigned long off1, off2; - len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]); - len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]); + len1 = btrfs_item_size(left_path->nodes[0], left_path->slots[0]); + len2 = btrfs_item_size(right_path->nodes[0], right_path->slots[0]); if (len1 != len2) return 1; @@ -6809,6 +6953,97 @@ static int tree_compare_item(struct btrfs_path *left_path, } /* + * A transaction used for relocating a block group was committed or is about to + * finish its commit. Release our paths and restart the search, so that we are + * not using stale extent buffers: + * + * 1) For levels > 0, we are only holding references of extent buffers, without + * any locks on them, which does not prevent them from having been relocated + * and reallocated after the last time we released the commit root semaphore. + * The exception are the root nodes, for which we always have a clone, see + * the comment at btrfs_compare_trees(); + * + * 2) For leaves, level 0, we are holding copies (clones) of extent buffers, so + * we are safe from the concurrent relocation and reallocation. 
However they + * can have file extent items with a pre relocation disk_bytenr value, so we + * restart the start from the current commit roots and clone the new leaves so + * that we get the post relocation disk_bytenr values. Not doing so, could + * make us clone the wrong data in case there are new extents using the old + * disk_bytenr that happen to be shared. + */ +static int restart_after_relocation(struct btrfs_path *left_path, + struct btrfs_path *right_path, + const struct btrfs_key *left_key, + const struct btrfs_key *right_key, + int left_level, + int right_level, + const struct send_ctx *sctx) +{ + int root_level; + int ret; + + lockdep_assert_held_read(&sctx->send_root->fs_info->commit_root_sem); + + btrfs_release_path(left_path); + btrfs_release_path(right_path); + + /* + * Since keys can not be added or removed to/from our roots because they + * are readonly and we do not allow deduplication to run in parallel + * (which can add, remove or change keys), the layout of the trees should + * not change. + */ + left_path->lowest_level = left_level; + ret = search_key_again(sctx, sctx->send_root, left_path, left_key); + if (ret < 0) + return ret; + + right_path->lowest_level = right_level; + ret = search_key_again(sctx, sctx->parent_root, right_path, right_key); + if (ret < 0) + return ret; + + /* + * If the lowest level nodes are leaves, clone them so that they can be + * safely used by changed_cb() while not under the protection of the + * commit root semaphore, even if relocation and reallocation happens in + * parallel. + */ + if (left_level == 0) { + ret = replace_node_with_clone(left_path, 0); + if (ret < 0) + return ret; + } + + if (right_level == 0) { + ret = replace_node_with_clone(right_path, 0); + if (ret < 0) + return ret; + } + + /* + * Now clone the root nodes (unless they happen to be the leaves we have + * already cloned). This is to protect against concurrent snapshotting of + * the send and parent roots (see the comment at btrfs_compare_trees()). + */ + root_level = btrfs_header_level(sctx->send_root->commit_root); + if (root_level > 0) { + ret = replace_node_with_clone(left_path, root_level); + if (ret < 0) + return ret; + } + + root_level = btrfs_header_level(sctx->parent_root->commit_root); + if (root_level > 0) { + ret = replace_node_with_clone(right_path, root_level); + if (ret < 0) + return ret; + } + + return 0; +} + +/* * This function compares two trees and calls the provided callback for * every changed/new/deleted item it finds. * If shared tree blocks are encountered, whole subtrees are skipped, making @@ -6836,10 +7071,10 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, int right_root_level; int left_level; int right_level; - int left_end_reached; - int right_end_reached; - int advance_left; - int advance_right; + int left_end_reached = 0; + int right_end_reached = 0; + int advance_left = 0; + int advance_right = 0; u64 left_blockptr; u64 right_blockptr; u64 left_gen; @@ -6907,12 +7142,18 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, down_read(&fs_info->commit_root_sem); left_level = btrfs_header_level(left_root->commit_root); left_root_level = left_level; + /* + * We clone the root node of the send and parent roots to prevent races + * with snapshot creation of these roots. Snapshot creation COWs the + * root node of a tree, so after the transaction is committed the old + * extent can be reallocated while this send operation is still ongoing. + * So we clone them, under the commit root semaphore, to be race free. 
+ */ left_path->nodes[left_level] = btrfs_clone_extent_buffer(left_root->commit_root); if (!left_path->nodes[left_level]) { - up_read(&fs_info->commit_root_sem); ret = -ENOMEM; - goto out; + goto out_unlock; } right_level = btrfs_header_level(right_root->commit_root); @@ -6920,9 +7161,8 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, right_path->nodes[right_level] = btrfs_clone_extent_buffer(right_root->commit_root); if (!right_path->nodes[right_level]) { - up_read(&fs_info->commit_root_sem); ret = -ENOMEM; - goto out; + goto out_unlock; } /* * Our right root is the parent root, while the left root is the "send" @@ -6932,7 +7172,6 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, * will need to read them at some point. */ reada_min_gen = btrfs_header_generation(right_root->commit_root); - up_read(&fs_info->commit_root_sem); if (left_level == 0) btrfs_item_key_to_cpu(left_path->nodes[left_level], @@ -6947,11 +7186,26 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, btrfs_node_key_to_cpu(right_path->nodes[right_level], &right_key, right_path->slots[right_level]); - left_end_reached = right_end_reached = 0; - advance_left = advance_right = 0; + sctx->last_reloc_trans = fs_info->last_reloc_trans; while (1) { - cond_resched(); + if (need_resched() || + rwsem_is_contended(&fs_info->commit_root_sem)) { + up_read(&fs_info->commit_root_sem); + cond_resched(); + down_read(&fs_info->commit_root_sem); + } + + if (fs_info->last_reloc_trans > sctx->last_reloc_trans) { + ret = restart_after_relocation(left_path, right_path, + &left_key, &right_key, + left_level, right_level, + sctx); + if (ret < 0) + goto out_unlock; + sctx->last_reloc_trans = fs_info->last_reloc_trans; + } + if (advance_left && !left_end_reached) { ret = tree_advance(left_path, &left_level, left_root_level, @@ -6960,7 +7214,7 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, if (ret == -1) left_end_reached = ADVANCE; else if (ret < 0) - goto out; + goto out_unlock; advance_left = 0; } if (advance_right && !right_end_reached) { @@ -6971,54 +7225,55 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, if (ret == -1) right_end_reached = ADVANCE; else if (ret < 0) - goto out; + goto out_unlock; advance_right = 0; } if (left_end_reached && right_end_reached) { ret = 0; - goto out; + goto out_unlock; } else if (left_end_reached) { if (right_level == 0) { + up_read(&fs_info->commit_root_sem); ret = changed_cb(left_path, right_path, &right_key, BTRFS_COMPARE_TREE_DELETED, sctx); if (ret < 0) goto out; + down_read(&fs_info->commit_root_sem); } advance_right = ADVANCE; continue; } else if (right_end_reached) { if (left_level == 0) { + up_read(&fs_info->commit_root_sem); ret = changed_cb(left_path, right_path, &left_key, BTRFS_COMPARE_TREE_NEW, sctx); if (ret < 0) goto out; + down_read(&fs_info->commit_root_sem); } advance_left = ADVANCE; continue; } if (left_level == 0 && right_level == 0) { + up_read(&fs_info->commit_root_sem); cmp = btrfs_comp_cpu_keys(&left_key, &right_key); if (cmp < 0) { ret = changed_cb(left_path, right_path, &left_key, BTRFS_COMPARE_TREE_NEW, sctx); - if (ret < 0) - goto out; advance_left = ADVANCE; } else if (cmp > 0) { ret = changed_cb(left_path, right_path, &right_key, BTRFS_COMPARE_TREE_DELETED, sctx); - if (ret < 0) - goto out; advance_right = ADVANCE; } else { enum btrfs_compare_tree_result result; @@ -7032,11 +7287,13 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, result = BTRFS_COMPARE_TREE_SAME; ret = changed_cb(left_path, 
right_path, &left_key, result, sctx); - if (ret < 0) - goto out; advance_left = ADVANCE; advance_right = ADVANCE; } + + if (ret < 0) + goto out; + down_read(&fs_info->commit_root_sem); } else if (left_level == right_level) { cmp = btrfs_comp_cpu_keys(&left_key, &right_key); if (cmp < 0) { @@ -7076,6 +7333,8 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, } } +out_unlock: + up_read(&fs_info->commit_root_sem); out: btrfs_free_path(left_path); btrfs_free_path(right_path); @@ -7425,21 +7684,7 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) if (ret) goto out; - spin_lock(&fs_info->send_reloc_lock); - if (test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) { - spin_unlock(&fs_info->send_reloc_lock); - btrfs_warn_rl(fs_info, - "cannot run send because a relocation operation is in progress"); - ret = -EAGAIN; - goto out; - } - fs_info->send_in_progress++; - spin_unlock(&fs_info->send_reloc_lock); - ret = send_subvol(sctx); - spin_lock(&fs_info->send_reloc_lock); - fs_info->send_in_progress--; - spin_unlock(&fs_info->send_reloc_lock); if (ret < 0) goto out; diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 48d77f360a24..294242c194d8 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -617,7 +617,7 @@ static void flush_space(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 num_bytes, enum btrfs_flush_state state, bool for_preempt) { - struct btrfs_root *root = fs_info->extent_root; + struct btrfs_root *root = fs_info->tree_root; struct btrfs_trans_handle *trans; int nr; int ret = 0; @@ -844,6 +844,9 @@ static bool steal_from_global_rsv(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; u64 min_bytes; + if (!ticket->steal) + return false; + if (global_rsv->space_info != space_info) return false; @@ -899,8 +902,7 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info, ticket = list_first_entry(&space_info->tickets, struct reserve_ticket, list); - if (!aborted && ticket->steal && - steal_from_global_rsv(fs_info, space_info, ticket)) + if (!aborted && steal_from_global_rsv(fs_info, space_info, ticket)) return true; if (!aborted && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) @@ -1260,18 +1262,23 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, int states_nr) { u64 to_reclaim; - int flush_state; + int flush_state = 0; spin_lock(&space_info->lock); to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info); - if (!to_reclaim) { + /* + * This is the priority reclaim path, so to_reclaim could be >0 still + * because we may have only satisified the priority tickets and still + * left non priority tickets on the list. We would then have + * to_reclaim but ->bytes == 0. + */ + if (ticket->bytes == 0) { spin_unlock(&space_info->lock); return; } - spin_unlock(&space_info->lock); - flush_state = 0; - do { + while (flush_state < states_nr) { + spin_unlock(&space_info->lock); flush_space(fs_info, space_info, to_reclaim, states[flush_state], false); flush_state++; @@ -1280,23 +1287,49 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, spin_unlock(&space_info->lock); return; } - spin_unlock(&space_info->lock); - } while (flush_state < states_nr); + } + + /* Attempt to steal from the global rsv if we can. 
*/ + if (!steal_from_global_rsv(fs_info, space_info, ticket)) { + ticket->error = -ENOSPC; + remove_ticket(space_info, ticket); + } + + /* + * We must run try_granting_tickets here because we could be a large + * ticket in front of a smaller ticket that can now be satisfied with + * the available space. + */ + btrfs_try_granting_tickets(fs_info, space_info); + spin_unlock(&space_info->lock); } static void priority_reclaim_data_space(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, struct reserve_ticket *ticket) { + spin_lock(&space_info->lock); + + /* We could have been granted before we got here. */ + if (ticket->bytes == 0) { + spin_unlock(&space_info->lock); + return; + } + while (!space_info->full) { + spin_unlock(&space_info->lock); flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE, false); spin_lock(&space_info->lock); if (ticket->bytes == 0) { spin_unlock(&space_info->lock); return; } - spin_unlock(&space_info->lock); } + + ticket->error = -ENOSPC; + remove_ticket(space_info, ticket); + btrfs_try_granting_tickets(fs_info, space_info); + spin_unlock(&space_info->lock); } static void wait_reserve_ticket(struct btrfs_fs_info *fs_info, @@ -1378,25 +1411,7 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info, break; } - spin_lock(&space_info->lock); ret = ticket->error; - if (ticket->bytes || ticket->error) { - /* - * We were a priority ticket, so we need to delete ourselves - * from the list. Because we could have other priority tickets - * behind us that require less space, run - * btrfs_try_granting_tickets() to see if their reservations can - * now be made. - */ - if (!list_empty(&ticket->list)) { - remove_ticket(space_info, ticket); - btrfs_try_granting_tickets(fs_info, space_info); - } - - if (!ret) - ret = -ENOSPC; - } - spin_unlock(&space_info->lock); ASSERT(list_empty(&ticket->list)); /* * Check that we can't have an error set if the reservation succeeded, @@ -1438,6 +1453,12 @@ static inline void maybe_clamp_preempt(struct btrfs_fs_info *fs_info, space_info->clamp = min(space_info->clamp + 1, 8); } +static inline bool can_steal(enum btrfs_reserve_flush_enum flush) +{ + return (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL || + flush == BTRFS_RESERVE_FLUSH_EVICT); +} + /** * Try to reserve bytes from the block_rsv's space * @@ -1511,7 +1532,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, ticket.error = 0; space_info->reclaim_size += ticket.bytes; init_waitqueue_head(&ticket.wait); - ticket.steal = (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL); + ticket.steal = can_steal(flush); if (trace_btrfs_reserve_ticket_enabled()) start_ns = ktime_get_ns(); @@ -1567,7 +1588,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, /** * Trye to reserve metadata bytes from the block_rsv's space * - * @root: the root we're allocating for + * @fs_info: the filesystem * @block_rsv: block_rsv we're allocating for * @orig_bytes: number of bytes we want * @flush: whether or not we can flush to make our reservation @@ -1579,22 +1600,14 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, * regain reservations will be made and this will fail if there is not enough * space already. 
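For reference, the hunk that follows completes the same fs_info conversion for btrfs_reserve_metadata_bytes(), whose root argument was only used to reach fs_info plus a now-removed orphan-cleanup fallback to the global reserve. A hypothetical caller under the new prototype (sketch only; the function name and flush mode are illustrative):

	/* Sketch: reserve metadata space against a block reserve's space_info. */
	static int reserve_for_operation(struct btrfs_fs_info *fs_info,
					 struct btrfs_block_rsv *rsv, u64 bytes)
	{
		return btrfs_reserve_metadata_bytes(fs_info, rsv, bytes,
						    BTRFS_RESERVE_FLUSH_ALL);
	}
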
*/ -int btrfs_reserve_metadata_bytes(struct btrfs_root *root, +int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, u64 orig_bytes, enum btrfs_reserve_flush_enum flush) { - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; int ret; ret = __reserve_bytes(fs_info, block_rsv->space_info, orig_bytes, flush); - if (ret == -ENOSPC && - unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) { - if (block_rsv != global_rsv && - !btrfs_block_rsv_use_bytes(global_rsv, orig_bytes)) - ret = 0; - } if (ret == -ENOSPC) { trace_btrfs_space_reservation(fs_info, "space_info:enospc", block_rsv->space_info->flags, diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index cb5056472e79..d841fed73492 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -123,7 +123,7 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info); void btrfs_dump_space_info(struct btrfs_fs_info *fs_info, struct btrfs_space_info *info, u64 bytes, int dump_block_groups); -int btrfs_reserve_metadata_bytes(struct btrfs_root *root, +int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, u64 orig_bytes, enum btrfs_reserve_flush_enum flush); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index a1c54a2c787c..0ec09fe01be6 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1842,7 +1842,6 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, btrfs_workqueue_set_max(fs_info->endio_write_workers, new_pool_size); btrfs_workqueue_set_max(fs_info->endio_freespace_worker, new_pool_size); btrfs_workqueue_set_max(fs_info->delayed_workers, new_pool_size); - btrfs_workqueue_set_max(fs_info->readahead_workers, new_pool_size); btrfs_workqueue_set_max(fs_info->scrub_wr_completion_workers, new_pool_size); } diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index f9eff3b0f77c..beb7f72d50b8 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1537,6 +1537,16 @@ static ssize_t btrfs_devinfo_writeable_show(struct kobject *kobj, } BTRFS_ATTR(devid, writeable, btrfs_devinfo_writeable_show); +static ssize_t btrfs_devinfo_fsid_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_device *device = container_of(kobj, struct btrfs_device, + devid_kobj); + + return sysfs_emit(buf, "%pU\n", device->fs_devices->fsid); +} +BTRFS_ATTR(devid, fsid, btrfs_devinfo_fsid_show); + static ssize_t btrfs_devinfo_error_stats_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { @@ -1572,6 +1582,7 @@ BTRFS_ATTR(devid, error_stats, btrfs_devinfo_error_stats_show); */ static struct attribute *devid_attrs[] = { BTRFS_ATTR_PTR(devid, error_stats), + BTRFS_ATTR_PTR(devid, fsid), BTRFS_ATTR_PTR(devid, in_fs_metadata), BTRFS_ATTR_PTR(devid, missing), BTRFS_ATTR_PTR(devid, replace_target), diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index 3a4099a2bf05..d8e56edd6991 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -204,6 +204,7 @@ void btrfs_free_dummy_root(struct btrfs_root *root) /* Will be freed by btrfs_free_fs_roots */ if (WARN_ON(test_bit(BTRFS_ROOT_IN_RADIX, &root->state))) return; + btrfs_global_root_delete(root); btrfs_put_root(root); } diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c index 2a95f7224e18..51a8b075c259 100644 --- a/fs/btrfs/tests/extent-buffer-tests.c +++ b/fs/btrfs/tests/extent-buffer-tests.c @@ -15,7 +15,6 @@ static 
int test_btrfs_split_item(u32 sectorsize, u32 nodesize) struct btrfs_path *path = NULL; struct btrfs_root *root = NULL; struct extent_buffer *eb; - struct btrfs_item *item; char *value = "mary had a little lamb"; char *split1 = "mary had a little"; char *split2 = " lamb"; @@ -61,7 +60,6 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) key.offset = 0; btrfs_setup_item_for_insert(root, path, &key, value_len); - item = btrfs_item_nr(0); write_extent_buffer(eb, value, btrfs_item_ptr_offset(eb, 0), value_len); @@ -90,8 +88,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) goto out; } - item = btrfs_item_nr(0); - if (btrfs_item_size(eb, item) != strlen(split1)) { + if (btrfs_item_size(eb, 0) != strlen(split1)) { test_err("invalid len in the first split"); ret = -EINVAL; goto out; @@ -115,8 +112,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) goto out; } - item = btrfs_item_nr(1); - if (btrfs_item_size(eb, item) != strlen(split2)) { + if (btrfs_item_size(eb, 1) != strlen(split2)) { test_err("invalid len in the second split"); ret = -EINVAL; goto out; @@ -147,8 +143,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) goto out; } - item = btrfs_item_nr(0); - if (btrfs_item_size(eb, item) != strlen(split3)) { + if (btrfs_item_size(eb, 0) != strlen(split3)) { test_err("invalid len in the first split"); ret = -EINVAL; goto out; @@ -171,8 +166,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) goto out; } - item = btrfs_item_nr(1); - if (btrfs_item_size(eb, item) != strlen(split4)) { + if (btrfs_item_size(eb, 1) != strlen(split4)) { test_err("invalid len in the second split"); ret = -EINVAL; goto out; @@ -195,8 +189,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) goto out; } - item = btrfs_item_nr(2); - if (btrfs_item_size(eb, item) != strlen(split2)) { + if (btrfs_item_size(eb, 2) != strlen(split2)) { test_err("invalid len in the second split"); ret = -EINVAL; goto out; diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index c2e72e7a8ff0..a232b15b8021 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -56,6 +56,54 @@ static noinline int process_page_range(struct inode *inode, u64 start, u64 end, return count; } +#define STATE_FLAG_STR_LEN 256 + +#define PRINT_ONE_FLAG(state, dest, cur, name) \ +({ \ + if (state->state & EXTENT_##name) \ + cur += scnprintf(dest + cur, STATE_FLAG_STR_LEN - cur, \ + "%s" #name, cur == 0 ? 
"" : "|"); \ +}) + +static void extent_flag_to_str(const struct extent_state *state, char *dest) +{ + int cur = 0; + + dest[0] = 0; + PRINT_ONE_FLAG(state, dest, cur, DIRTY); + PRINT_ONE_FLAG(state, dest, cur, UPTODATE); + PRINT_ONE_FLAG(state, dest, cur, LOCKED); + PRINT_ONE_FLAG(state, dest, cur, NEW); + PRINT_ONE_FLAG(state, dest, cur, DELALLOC); + PRINT_ONE_FLAG(state, dest, cur, DEFRAG); + PRINT_ONE_FLAG(state, dest, cur, BOUNDARY); + PRINT_ONE_FLAG(state, dest, cur, NODATASUM); + PRINT_ONE_FLAG(state, dest, cur, CLEAR_META_RESV); + PRINT_ONE_FLAG(state, dest, cur, NEED_WAIT); + PRINT_ONE_FLAG(state, dest, cur, DAMAGED); + PRINT_ONE_FLAG(state, dest, cur, NORESERVE); + PRINT_ONE_FLAG(state, dest, cur, QGROUP_RESERVED); + PRINT_ONE_FLAG(state, dest, cur, CLEAR_DATA_RESV); +} + +static void dump_extent_io_tree(const struct extent_io_tree *tree) +{ + struct rb_node *node; + char flags_str[STATE_FLAG_STR_LEN]; + + node = rb_first(&tree->state); + test_msg("io tree content:"); + while (node) { + struct extent_state *state; + + state = rb_entry(node, struct extent_state, rb_node); + extent_flag_to_str(state, flags_str); + test_msg(" start=%llu len=%llu flags=%s", state->start, + state->end + 1 - state->start, flags_str); + node = rb_next(node); + } +} + static int test_find_delalloc(u32 sectorsize) { struct inode *inode; @@ -258,6 +306,8 @@ static int test_find_delalloc(u32 sectorsize) } ret = 0; out_bits: + if (ret) + dump_extent_io_tree(tmp); clear_extent_bits(tmp, 0, total_dirty - 1, (unsigned)-1); out: if (locked_page) @@ -534,6 +584,8 @@ static int test_find_first_clear_extent_bit(void) ret = 0; out: + if (ret) + dump_extent_io_tree(&tree); clear_extent_bits(&tree, 0, (u64)-1, CHUNK_TRIMMED | CHUNK_ALLOCATED); return ret; diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c index 8f05c1eb833f..5930cdcae5cb 100644 --- a/fs/btrfs/tests/free-space-tests.c +++ b/fs/btrfs/tests/free-space-tests.c @@ -824,6 +824,184 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group *cache, return 0; } +static bool bytes_index_use_bitmap(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info) +{ + return true; +} + +static int test_bytes_index(struct btrfs_block_group *cache, u32 sectorsize) +{ + const struct btrfs_free_space_op test_free_space_ops = { + .use_bitmap = bytes_index_use_bitmap, + }; + const struct btrfs_free_space_op *orig_free_space_ops; + struct btrfs_free_space_ctl *ctl = cache->free_space_ctl; + struct btrfs_free_space *entry; + struct rb_node *node; + u64 offset, max_extent_size, bytes; + int ret, i; + + test_msg("running bytes index tests"); + + /* First just validate that it does everything in order. */ + offset = 0; + for (i = 0; i < 10; i++) { + bytes = (i + 1) * SZ_1M; + ret = test_add_free_space_entry(cache, offset, bytes, 0); + if (ret) { + test_err("couldn't add extent entry %d\n", ret); + return ret; + } + offset += bytes + sectorsize; + } + + for (node = rb_first_cached(&ctl->free_space_bytes), i = 9; node; + node = rb_next(node), i--) { + entry = rb_entry(node, struct btrfs_free_space, bytes_index); + bytes = (i + 1) * SZ_1M; + if (entry->bytes != bytes) { + test_err("invalid bytes index order, found %llu expected %llu", + entry->bytes, bytes); + return -EINVAL; + } + } + + /* Now validate bitmaps do the correct thing. 
*/ + __btrfs_remove_free_space_cache(cache->free_space_ctl); + for (i = 0; i < 2; i++) { + offset = i * BITS_PER_BITMAP * sectorsize; + bytes = (i + 1) * SZ_1M; + ret = test_add_free_space_entry(cache, offset, bytes, 1); + if (ret) { + test_err("couldn't add bitmap entry"); + return ret; + } + } + + for (node = rb_first_cached(&ctl->free_space_bytes), i = 1; node; + node = rb_next(node), i--) { + entry = rb_entry(node, struct btrfs_free_space, bytes_index); + bytes = (i + 1) * SZ_1M; + if (entry->bytes != bytes) { + test_err("invalid bytes index order, found %llu expected %llu", + entry->bytes, bytes); + return -EINVAL; + } + } + + /* Now validate bitmaps with different ->max_extent_size. */ + __btrfs_remove_free_space_cache(cache->free_space_ctl); + orig_free_space_ops = cache->free_space_ctl->op; + cache->free_space_ctl->op = &test_free_space_ops; + + ret = test_add_free_space_entry(cache, 0, sectorsize, 1); + if (ret) { + test_err("couldn't add bitmap entry"); + return ret; + } + + offset = BITS_PER_BITMAP * sectorsize; + ret = test_add_free_space_entry(cache, offset, sectorsize, 1); + if (ret) { + test_err("couldn't add bitmap_entry"); + return ret; + } + + /* + * Now set a bunch of sectorsize extents in the first entry so it's + * ->bytes is large. + */ + for (i = 2; i < 20; i += 2) { + offset = sectorsize * i; + ret = btrfs_add_free_space(cache, offset, sectorsize); + if (ret) { + test_err("error populating sparse bitmap %d", ret); + return ret; + } + } + + /* + * Now set a contiguous extent in the second bitmap so its + * ->max_extent_size is larger than the first bitmaps. + */ + offset = (BITS_PER_BITMAP * sectorsize) + sectorsize; + ret = btrfs_add_free_space(cache, offset, sectorsize); + if (ret) { + test_err("error adding contiguous extent %d", ret); + return ret; + } + + /* + * Since we don't set ->max_extent_size unless we search everything + * should be indexed on bytes. + */ + entry = rb_entry(rb_first_cached(&ctl->free_space_bytes), + struct btrfs_free_space, bytes_index); + if (entry->bytes != (10 * sectorsize)) { + test_err("error, wrong entry in the first slot in bytes_index"); + return -EINVAL; + } + + max_extent_size = 0; + offset = btrfs_find_space_for_alloc(cache, cache->start, sectorsize * 3, + 0, &max_extent_size); + if (offset != 0) { + test_err("found space to alloc even though we don't have enough space"); + return -EINVAL; + } + + if (max_extent_size != (2 * sectorsize)) { + test_err("got the wrong max_extent size %llu expected %llu", + max_extent_size, (unsigned long long)(2 * sectorsize)); + return -EINVAL; + } + + /* + * The search should have re-arranged the bytes index to use the + * ->max_extent_size, validate it's now what we expect it to be. + */ + entry = rb_entry(rb_first_cached(&ctl->free_space_bytes), + struct btrfs_free_space, bytes_index); + if (entry->bytes != (2 * sectorsize)) { + test_err("error, the bytes index wasn't recalculated properly"); + return -EINVAL; + } + + /* Add another sectorsize to re-arrange the tree back to ->bytes. 
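test_bytes_index() exercises the new ->free_space_bytes index: walking it from rb_first_cached() must return entries largest first, for plain extents and for bitmaps alike. A small userspace model of that ordering invariant, using a sorted array in place of the cached rbtree (purely illustrative):

#include <stdio.h>
#include <stdlib.h>

struct free_space { unsigned long long offset, bytes; };

/* Order entries by size, largest first, mirroring the bytes index. */
static int cmp_bytes_desc(const void *a, const void *b)
{
	const struct free_space *fa = a, *fb = b;

	if (fa->bytes == fb->bytes)
		return 0;
	return fa->bytes < fb->bytes ? 1 : -1;
}

int main(void)
{
	struct free_space entries[10];
	int i;

	for (i = 0; i < 10; i++) {
		entries[i].offset = i * 2ULL * 1024 * 1024;
		entries[i].bytes = (i + 1) * 1024ULL * 1024;
	}

	qsort(entries, 10, sizeof(entries[0]), cmp_bytes_desc);

	/* Equivalent of walking rb_first_cached(): sizes must descend. */
	for (i = 0; i < 9; i++) {
		if (entries[i].bytes < entries[i + 1].bytes) {
			fprintf(stderr, "bytes index out of order\n");
			return 1;
		}
	}
	printf("largest entry: %llu bytes\n", entries[0].bytes);
	return 0;
}

The rest of the test goes further and checks that a failed allocation re-keys a bitmap entry by its ->max_extent_size; the flat array above does not try to model that re-indexing.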
*/ + offset = (BITS_PER_BITMAP * sectorsize) - sectorsize; + ret = btrfs_add_free_space(cache, offset, sectorsize); + if (ret) { + test_err("error adding extent to the sparse entry %d", ret); + return ret; + } + + entry = rb_entry(rb_first_cached(&ctl->free_space_bytes), + struct btrfs_free_space, bytes_index); + if (entry->bytes != (11 * sectorsize)) { + test_err("error, wrong entry in the first slot in bytes_index"); + return -EINVAL; + } + + /* + * Now make sure we find our correct entry after searching that will + * result in a re-arranging of the tree. + */ + max_extent_size = 0; + offset = btrfs_find_space_for_alloc(cache, cache->start, sectorsize * 2, + 0, &max_extent_size); + if (offset != (BITS_PER_BITMAP * sectorsize)) { + test_err("error, found %llu instead of %llu for our alloc", + offset, + (unsigned long long)(BITS_PER_BITMAP * sectorsize)); + return -EINVAL; + } + + cache->free_space_ctl->op = orig_free_space_ops; + __btrfs_remove_free_space_cache(cache->free_space_ctl); + return 0; +} + int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize) { struct btrfs_fs_info *fs_info; @@ -858,7 +1036,10 @@ int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize) goto out; } - root->fs_info->extent_root = root; + root->root_key.objectid = BTRFS_EXTENT_TREE_OBJECTID; + root->root_key.type = BTRFS_ROOT_ITEM_KEY; + root->root_key.offset = 0; + btrfs_global_root_insert(root); ret = test_extents(cache); if (ret) @@ -871,6 +1052,9 @@ int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize) goto out; ret = test_steal_space_from_bitmap_to_extent(cache, sectorsize); + if (ret) + goto out; + ret = test_bytes_index(cache, sectorsize); out: btrfs_free_dummy_block_group(cache); btrfs_free_dummy_root(root); diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c index 2c783d2f5228..13734ed43bfc 100644 --- a/fs/btrfs/tests/free-space-tree-tests.c +++ b/fs/btrfs/tests/free-space-tree-tests.c @@ -446,7 +446,10 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize, btrfs_set_super_compat_ro_flags(root->fs_info->super_copy, BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE); - root->fs_info->free_space_root = root; + root->root_key.objectid = BTRFS_FREE_SPACE_TREE_OBJECTID; + root->root_key.type = BTRFS_ROOT_ITEM_KEY; + root->root_key.offset = 0; + btrfs_global_root_insert(root); root->fs_info->tree_root = root; root->node = alloc_test_extent_buffer(root->fs_info, nodesize); diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index 19ba7d5b7d8f..eee1e4459541 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -455,7 +455,10 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize) } /* We are using this root as our extent root */ - root->fs_info->extent_root = root; + root->root_key.objectid = BTRFS_EXTENT_TREE_OBJECTID; + root->root_key.type = BTRFS_ROOT_ITEM_KEY; + root->root_key.offset = 0; + btrfs_global_root_insert(root); /* * Some of the paths we test assume we have a filled out fs_info, so we diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 1c3a1189c0bd..03de89b45f27 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -162,7 +162,17 @@ static noinline void switch_commit_roots(struct btrfs_trans_handle *trans) struct btrfs_root *root, *tmp; struct btrfs_caching_control *caching_ctl, *next; + /* + * At this point no one can be using this transaction to modify any tree + * and no one can start another transaction to modify any tree either. 
+ */ + ASSERT(cur_trans->state == TRANS_STATE_COMMIT_DOING); + down_write(&fs_info->commit_root_sem); + + if (test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) + fs_info->last_reloc_trans = trans->transid; + list_for_each_entry_safe(root, tmp, &cur_trans->switch_commits, dirty_list) { list_del_init(&root->dirty_list); @@ -413,7 +423,6 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans, if ((test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && root->last_trans < trans->transid) || force) { - WARN_ON(root == fs_info->extent_root); WARN_ON(!force && root->commit_root != root->node); /* @@ -628,7 +637,7 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, reloc_reserved = true; } - ret = btrfs_block_rsv_add(root, rsv, num_bytes, flush); + ret = btrfs_block_rsv_add(fs_info, rsv, num_bytes, flush); if (ret) goto reserve_fail; if (delayed_refs_bytes) { @@ -692,7 +701,6 @@ again: h->transid = cur_trans->transid; h->transaction = cur_trans; - h->root = root; refcount_set(&h->use_count, 1); h->fs_info = root->fs_info; @@ -1236,6 +1244,12 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans) struct extent_buffer *eb; int ret; + /* + * At this point no one can be using this transaction to modify any tree + * and no one can start another transaction to modify any tree either. + */ + ASSERT(trans->transaction->state == TRANS_STATE_COMMIT_DOING); + eb = btrfs_lock_root_node(fs_info->tree_root); ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb, BTRFS_NESTING_COW); @@ -1267,9 +1281,8 @@ again: root = list_entry(next, struct btrfs_root, dirty_list); clear_bit(BTRFS_ROOT_DIRTY, &root->state); - if (root != fs_info->extent_root) - list_add_tail(&root->dirty_list, - &trans->transaction->switch_commits); + list_add_tail(&root->dirty_list, + &trans->transaction->switch_commits); ret = update_cowonly_root(trans, root); if (ret) return ret; @@ -1299,9 +1312,6 @@ again: if (!list_empty(&fs_info->dirty_cowonly_roots)) goto again; - list_add_tail(&fs_info->extent_root->dirty_list, - &trans->transaction->switch_commits); - /* Update dev-replace pointer once everything is committed */ fs_info->dev_replace.committed_cursor_left = fs_info->dev_replace.cursor_left_last_write_of_item; @@ -1327,7 +1337,8 @@ void btrfs_add_dead_root(struct btrfs_root *root) } /* - * update all the cowonly tree roots on disk + * Update each subvolume root and its relocation root, if it exists, in the tree + * of tree roots. Also free log roots if they exist. */ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) { @@ -1336,6 +1347,12 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) int i; int ret; + /* + * At this point no one can be using this transaction to modify any tree + * and no one can start another transaction to modify any tree either. + */ + ASSERT(trans->transaction->state == TRANS_STATE_COMMIT_DOING); + spin_lock(&fs_info->fs_roots_radix_lock); while (1) { ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix, @@ -1348,6 +1365,14 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) struct btrfs_root *root = gang[i]; int ret2; + /* + * At this point we can neither have tasks logging inodes + * from a root nor trying to commit a log tree. 
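switch_commit_roots() now runs with commit_root_sem held for writing and records the committing transaction id in fs_info->last_reloc_trans whenever relocation is active. The idea is that a reader such as send can sample that value under the semaphore and later notice that relocation committed underneath it, instead of being mutually excluded from relocation altogether. A rough pthread model of the pattern; the names are stand-ins and the actual send-side check lives elsewhere in this series:

#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static pthread_rwlock_t commit_root_sem = PTHREAD_RWLOCK_INITIALIZER;
static uint64_t last_reloc_trans;	/* bumped while relocation is active */

/* Commit path: only bump the marker when relocation is in flight. */
static void switch_commit_roots(uint64_t transid, bool reloc_running)
{
	pthread_rwlock_wrlock(&commit_root_sem);
	if (reloc_running)
		last_reloc_trans = transid;
	/* ... swap the commit roots here ... */
	pthread_rwlock_unlock(&commit_root_sem);
}

/* Reader path: detect that relocation committed since we last looked. */
static bool reader_state_still_valid(uint64_t seen_reloc_trans)
{
	bool ok;

	pthread_rwlock_rdlock(&commit_root_sem);
	ok = (last_reloc_trans == seen_reloc_trans);
	pthread_rwlock_unlock(&commit_root_sem);
	return ok;
}

int main(void)
{
	uint64_t seen = last_reloc_trans;

	switch_commit_roots(42, true);
	printf("reader still valid: %s\n",
	       reader_state_still_valid(seen) ? "yes" : "no");
	return 0;
}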
+ */ + ASSERT(atomic_read(&root->log_writers) == 0); + ASSERT(atomic_read(&root->log_commit[0]) == 0); + ASSERT(atomic_read(&root->log_commit[1]) == 0); + radix_tree_tag_clear(&fs_info->fs_roots_radix, (unsigned long)root->root_key.objectid, BTRFS_ROOT_TRANS_TAG); @@ -1472,12 +1497,6 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, return ret; } - /* - * We are going to commit transaction, see btrfs_commit_transaction() - * comment for reason locking tree_log_mutex - */ - mutex_lock(&fs_info->tree_log_mutex); - ret = commit_fs_roots(trans); if (ret) goto out; @@ -1513,8 +1532,6 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, "Error while writing out transaction for qgroup"); out: - mutex_unlock(&fs_info->tree_log_mutex); - /* * Force parent root to be updated, as we recorded it before so its * last_trans == cur_transid. @@ -1578,7 +1595,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_reloc_pre_snapshot(pending, &to_reserve); if (to_reserve > 0) { - pending->error = btrfs_block_rsv_add(root, + pending->error = btrfs_block_rsv_add(fs_info, &pending->block_rsv, to_reserve, BTRFS_RESERVE_NO_FLUSH); @@ -1861,50 +1878,14 @@ int btrfs_transaction_blocked(struct btrfs_fs_info *info) return ret; } -/* - * commit transactions asynchronously. once btrfs_commit_transaction_async - * returns, any subsequent transaction will not be allowed to join. - */ -struct btrfs_async_commit { - struct btrfs_trans_handle *newtrans; - struct work_struct work; -}; - -static void do_async_commit(struct work_struct *work) -{ - struct btrfs_async_commit *ac = - container_of(work, struct btrfs_async_commit, work); - - /* - * We've got freeze protection passed with the transaction. - * Tell lockdep about it. - */ - if (ac->newtrans->type & __TRANS_FREEZABLE) - __sb_writers_acquired(ac->newtrans->fs_info->sb, SB_FREEZE_FS); - - current->journal_info = ac->newtrans; - - btrfs_commit_transaction(ac->newtrans); - kfree(ac); -} - -int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans) +void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_async_commit *ac; struct btrfs_transaction *cur_trans; - ac = kmalloc(sizeof(*ac), GFP_NOFS); - if (!ac) - return -ENOMEM; - - INIT_WORK(&ac->work, do_async_commit); - ac->newtrans = btrfs_join_transaction(trans->root); - if (IS_ERR(ac->newtrans)) { - int err = PTR_ERR(ac->newtrans); - kfree(ac); - return err; - } + /* Kick the transaction kthread. */ + set_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags); + wake_up_process(fs_info->transaction_kthread); /* take transaction reference */ cur_trans = trans->transaction; @@ -1913,28 +1894,15 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans) btrfs_end_transaction(trans); /* - * Tell lockdep we've released the freeze rwsem, since the - * async commit thread will be the one to unlock it. 
- */ - if (ac->newtrans->type & __TRANS_FREEZABLE) - __sb_writers_release(fs_info->sb, SB_FREEZE_FS); - - schedule_work(&ac->work); - /* * Wait for the current transaction commit to start and block * subsequent transaction joins */ wait_event(fs_info->transaction_blocked_wait, cur_trans->state >= TRANS_STATE_COMMIT_START || TRANS_ABORTED(cur_trans)); - if (current->journal_info == trans) - current->journal_info = NULL; - btrfs_put_transaction(cur_trans); - return 0; } - static void cleanup_transaction(struct btrfs_trans_handle *trans, int err) { struct btrfs_fs_info *fs_info = trans->fs_info; @@ -1986,7 +1954,7 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, int err) btrfs_put_transaction(cur_trans); btrfs_put_transaction(cur_trans); - trace_btrfs_transaction_commit(trans->root); + trace_btrfs_transaction_commit(fs_info); if (current->journal_info == trans) current->journal_info = NULL; @@ -2200,6 +2168,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) wait_event(cur_trans->writer_wait, atomic_read(&cur_trans->num_writers) == 1); + /* + * We've started the commit, clear the flag in case we were triggered to + * do an async commit but somebody else started before the transaction + * kthread could do the work. + */ + clear_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags); + if (TRANS_ABORTED(cur_trans)) { ret = cur_trans->aborted; goto scrub_continue; @@ -2246,24 +2221,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) WARN_ON(cur_trans != trans->transaction); - /* btrfs_commit_tree_roots is responsible for getting the - * various roots consistent with each other. Every pointer - * in the tree of tree roots has to point to the most up to date - * root for every subvolume and other tree. So, we have to keep - * the tree logging code from jumping in and changing any - * of the trees. - * - * At this point in the commit, there can't be any tree-log - * writers, but a little lower down we drop the trans mutex - * and let new people in. By holding the tree_log_mutex - * from now until after the super is written, we avoid races - * with the tree-log code. - */ - mutex_lock(&fs_info->tree_log_mutex); - ret = commit_fs_roots(trans); if (ret) - goto unlock_tree_log; + goto unlock_reloc; /* * Since the transaction is done, we can apply the pending changes @@ -2282,11 +2242,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) */ ret = btrfs_qgroup_account_extents(trans); if (ret < 0) - goto unlock_tree_log; + goto unlock_reloc; ret = commit_cowonly_roots(trans); if (ret) - goto unlock_tree_log; + goto unlock_reloc; /* * The tasks which save the space cache and inode cache may also @@ -2294,7 +2254,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) */ if (TRANS_ABORTED(cur_trans)) { ret = cur_trans->aborted; - goto unlock_tree_log; + goto unlock_reloc; } cur_trans = fs_info->running_transaction; @@ -2327,6 +2287,16 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) btrfs_trans_release_chunk_metadata(trans); + /* + * Before changing the transaction state to TRANS_STATE_UNBLOCKED and + * setting fs_info->running_transaction to NULL, lock tree_log_mutex to + * make sure that before we commit our superblock, no other task can + * start a new transaction and commit a log tree before we commit our + * superblock. Anyone trying to commit a log tree locks this mutex before + * writing its superblock. 
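With the hunks above, btrfs_commit_transaction_async() no longer allocates a work item that performs the commit itself: it sets BTRFS_FS_COMMIT_TRANS, wakes the transaction kthread and only waits for the commit to start, while whoever actually commits clears the bit again. A toy version of that kick-the-worker handshake using a plain thread and a condition variable (names are illustrative):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t kick = PTHREAD_COND_INITIALIZER;
static bool commit_requested;
static bool stop;

/* Stand-in for the transaction kthread's main loop. */
static void *transaction_kthread(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	while (!stop) {
		while (!commit_requested && !stop)
			pthread_cond_wait(&kick, &lock);
		if (commit_requested) {
			/* Clear the request once the commit is underway. */
			commit_requested = false;
			pthread_mutex_unlock(&lock);
			puts("committing transaction");
			pthread_mutex_lock(&lock);
		}
	}
	pthread_mutex_unlock(&lock);
	return NULL;
}

/* Rough equivalent of set_bit() plus wake_up_process(). */
static void commit_transaction_async(void)
{
	pthread_mutex_lock(&lock);
	commit_requested = true;
	pthread_cond_signal(&kick);
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, transaction_kthread, NULL);
	commit_transaction_async();
	sleep(1);

	pthread_mutex_lock(&lock);
	stop = true;
	pthread_cond_signal(&kick);
	pthread_mutex_unlock(&lock);
	pthread_join(t, NULL);
	return 0;
}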
+ */ + mutex_lock(&fs_info->tree_log_mutex); + spin_lock(&fs_info->trans_lock); cur_trans->state = TRANS_STATE_UNBLOCKED; fs_info->running_transaction = NULL; @@ -2339,10 +2309,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) if (ret) { btrfs_handle_fs_error(fs_info, ret, "Error while writing out transaction"); - /* - * reloc_mutex has been unlocked, tree_log_mutex is still held - * but we can't jump to unlock_tree_log causing double unlock - */ mutex_unlock(&fs_info->tree_log_mutex); goto scrub_continue; } @@ -2393,7 +2359,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) if (trans->type & __TRANS_FREEZABLE) sb_end_intwrite(fs_info->sb); - trace_btrfs_transaction_commit(trans->root); + trace_btrfs_transaction_commit(fs_info); btrfs_scrub_continue(fs_info); @@ -2404,8 +2370,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) return ret; -unlock_tree_log: - mutex_unlock(&fs_info->tree_log_mutex); unlock_reloc: mutex_unlock(&fs_info->reloc_mutex); scrub_continue: diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index ba45065f9451..1852ed9de7fd 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -135,7 +135,6 @@ struct btrfs_trans_handle { bool removing_chunk; bool reloc_reserved; bool in_fsync; - struct btrfs_root *root; struct btrfs_fs_info *fs_info; struct list_head new_bgs; }; @@ -217,7 +216,7 @@ void btrfs_add_dead_root(struct btrfs_root *root); int btrfs_defrag_root(struct btrfs_root *root); int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root); int btrfs_commit_transaction(struct btrfs_trans_handle *trans); -int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans); +void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans); int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans); bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans); void btrfs_throttle(struct btrfs_fs_info *fs_info); diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 7733e8ac0a69..72e1c942197d 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -202,7 +202,7 @@ static int check_extent_data_item(struct extent_buffer *leaf, struct btrfs_fs_info *fs_info = leaf->fs_info; struct btrfs_file_extent_item *fi; u32 sectorsize = fs_info->sectorsize; - u32 item_size = btrfs_item_size_nr(leaf, slot); + u32 item_size = btrfs_item_size(leaf, slot); u64 extent_end; if (unlikely(!IS_ALIGNED(key->offset, sectorsize))) { @@ -354,17 +354,17 @@ static int check_csum_item(struct extent_buffer *leaf, struct btrfs_key *key, key->offset, sectorsize); return -EUCLEAN; } - if (unlikely(!IS_ALIGNED(btrfs_item_size_nr(leaf, slot), csumsize))) { + if (unlikely(!IS_ALIGNED(btrfs_item_size(leaf, slot), csumsize))) { generic_err(leaf, slot, "unaligned item size for csum item, have %u should be aligned to %u", - btrfs_item_size_nr(leaf, slot), csumsize); + btrfs_item_size(leaf, slot), csumsize); return -EUCLEAN; } if (slot > 0 && prev_key->type == BTRFS_EXTENT_CSUM_KEY) { u64 prev_csum_end; u32 prev_item_size; - prev_item_size = btrfs_item_size_nr(leaf, slot - 1); + prev_item_size = btrfs_item_size(leaf, slot - 1); prev_csum_end = (prev_item_size / csumsize) * sectorsize; prev_csum_end += prev_key->offset; if (unlikely(prev_csum_end > key->offset)) { @@ -483,7 +483,7 @@ static int check_dir_item(struct extent_buffer *leaf, { struct btrfs_fs_info *fs_info = leaf->fs_info; struct btrfs_dir_item *di; - u32 item_size = btrfs_item_size_nr(leaf, slot); + u32 item_size = 
btrfs_item_size(leaf, slot); u32 cur = 0; if (unlikely(!check_prev_ino(leaf, key, slot, prev_key))) @@ -640,7 +640,7 @@ static int check_block_group_item(struct extent_buffer *leaf, struct btrfs_key *key, int slot) { struct btrfs_block_group_item bgi; - u32 item_size = btrfs_item_size_nr(leaf, slot); + u32 item_size = btrfs_item_size(leaf, slot); u64 flags; u64 type; @@ -912,10 +912,10 @@ static int check_leaf_chunk_item(struct extent_buffer *leaf, { int num_stripes; - if (unlikely(btrfs_item_size_nr(leaf, slot) < sizeof(struct btrfs_chunk))) { + if (unlikely(btrfs_item_size(leaf, slot) < sizeof(struct btrfs_chunk))) { chunk_err(leaf, chunk, key->offset, "invalid chunk item size: have %u expect [%zu, %u)", - btrfs_item_size_nr(leaf, slot), + btrfs_item_size(leaf, slot), sizeof(struct btrfs_chunk), BTRFS_LEAF_DATA_SIZE(leaf->fs_info)); return -EUCLEAN; @@ -927,10 +927,10 @@ static int check_leaf_chunk_item(struct extent_buffer *leaf, goto out; if (unlikely(btrfs_chunk_item_size(num_stripes) != - btrfs_item_size_nr(leaf, slot))) { + btrfs_item_size(leaf, slot))) { chunk_err(leaf, chunk, key->offset, "invalid chunk item size: have %u expect %lu", - btrfs_item_size_nr(leaf, slot), + btrfs_item_size(leaf, slot), btrfs_chunk_item_size(num_stripes)); return -EUCLEAN; } @@ -1095,12 +1095,12 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key, if (unlikely(ret < 0)) return ret; - if (unlikely(btrfs_item_size_nr(leaf, slot) != sizeof(ri) && - btrfs_item_size_nr(leaf, slot) != + if (unlikely(btrfs_item_size(leaf, slot) != sizeof(ri) && + btrfs_item_size(leaf, slot) != btrfs_legacy_root_item_size())) { generic_err(leaf, slot, "invalid root item size, have %u expect %zu or %u", - btrfs_item_size_nr(leaf, slot), sizeof(ri), + btrfs_item_size(leaf, slot), sizeof(ri), btrfs_legacy_root_item_size()); return -EUCLEAN; } @@ -1111,7 +1111,7 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key, * And since we allow geneartion_v2 as 0, it will still pass the check. 
*/ read_extent_buffer(leaf, &ri, btrfs_item_ptr_offset(leaf, slot), - btrfs_item_size_nr(leaf, slot)); + btrfs_item_size(leaf, slot)); /* Generation related */ if (unlikely(btrfs_root_generation(&ri) > @@ -1208,7 +1208,7 @@ static int check_extent_item(struct extent_buffer *leaf, bool is_tree_block = false; unsigned long ptr; /* Current pointer inside inline refs */ unsigned long end; /* Extent item end */ - const u32 item_size = btrfs_item_size_nr(leaf, slot); + const u32 item_size = btrfs_item_size(leaf, slot); u64 flags; u64 generation; u64 total_refs; /* Total refs in btrfs_extent_item */ @@ -1432,10 +1432,10 @@ static int check_simple_keyed_refs(struct extent_buffer *leaf, if (key->type == BTRFS_SHARED_DATA_REF_KEY) expect_item_size = sizeof(struct btrfs_shared_data_ref); - if (unlikely(btrfs_item_size_nr(leaf, slot) != expect_item_size)) { + if (unlikely(btrfs_item_size(leaf, slot) != expect_item_size)) { generic_err(leaf, slot, "invalid item size, have %u expect %u for key type %u", - btrfs_item_size_nr(leaf, slot), + btrfs_item_size(leaf, slot), expect_item_size, key->type); return -EUCLEAN; } @@ -1460,12 +1460,12 @@ static int check_extent_data_ref(struct extent_buffer *leaf, { struct btrfs_extent_data_ref *dref; unsigned long ptr = btrfs_item_ptr_offset(leaf, slot); - const unsigned long end = ptr + btrfs_item_size_nr(leaf, slot); + const unsigned long end = ptr + btrfs_item_size(leaf, slot); - if (unlikely(btrfs_item_size_nr(leaf, slot) % sizeof(*dref) != 0)) { + if (unlikely(btrfs_item_size(leaf, slot) % sizeof(*dref) != 0)) { generic_err(leaf, slot, "invalid item size, have %u expect aligned to %zu for key type %u", - btrfs_item_size_nr(leaf, slot), + btrfs_item_size(leaf, slot), sizeof(*dref), key->type); return -EUCLEAN; } @@ -1507,16 +1507,16 @@ static int check_inode_ref(struct extent_buffer *leaf, if (unlikely(!check_prev_ino(leaf, key, slot, prev_key))) return -EUCLEAN; /* namelen can't be 0, so item_size == sizeof() is also invalid */ - if (unlikely(btrfs_item_size_nr(leaf, slot) <= sizeof(*iref))) { + if (unlikely(btrfs_item_size(leaf, slot) <= sizeof(*iref))) { inode_ref_err(leaf, slot, "invalid item size, have %u expect (%zu, %u)", - btrfs_item_size_nr(leaf, slot), + btrfs_item_size(leaf, slot), sizeof(*iref), BTRFS_LEAF_DATA_SIZE(leaf->fs_info)); return -EUCLEAN; } ptr = btrfs_item_ptr_offset(leaf, slot); - end = ptr + btrfs_item_size_nr(leaf, slot); + end = ptr + btrfs_item_size(leaf, slot); while (ptr < end) { u16 namelen; @@ -1689,12 +1689,12 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) if (slot == 0) item_end_expected = BTRFS_LEAF_DATA_SIZE(fs_info); else - item_end_expected = btrfs_item_offset_nr(leaf, + item_end_expected = btrfs_item_offset(leaf, slot - 1); - if (unlikely(btrfs_item_end_nr(leaf, slot) != item_end_expected)) { + if (unlikely(btrfs_item_data_end(leaf, slot) != item_end_expected)) { generic_err(leaf, slot, "unexpected item end, have %u expect %u", - btrfs_item_end_nr(leaf, slot), + btrfs_item_data_end(leaf, slot), item_end_expected); return -EUCLEAN; } @@ -1704,11 +1704,11 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) * just in case all the items are consistent to each other, but * all point outside of the leaf. 
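After the rename the leaf check reads naturally: the data of item N must end exactly where the data of item N-1 begins, and the first item's data must end at the leaf data size, because item data is packed from the end of the leaf backwards. A self-contained version of that invariant over a toy item array (fixed leaf size, not the on-disk format):

#include <stdio.h>

#define LEAF_DATA_SIZE 16384u

struct toy_item {
	unsigned int offset;	/* start of the item data within the leaf */
	unsigned int size;	/* length of the item data */
};

/* Item data grows from the end of the leaf towards the item headers. */
static int check_leaf_items(const struct toy_item *items, int nritems)
{
	unsigned int expected_end = LEAF_DATA_SIZE;
	int slot;

	for (slot = 0; slot < nritems; slot++) {
		unsigned int data_end = items[slot].offset + items[slot].size;

		if (data_end != expected_end) {
			fprintf(stderr,
				"slot %d: unexpected item end, have %u expect %u\n",
				slot, data_end, expected_end);
			return -1;
		}
		if (data_end > LEAF_DATA_SIZE) {
			fprintf(stderr, "slot %d: end outside of leaf\n", slot);
			return -1;
		}
		/* The next item's data must end where this one's begins. */
		expected_end = items[slot].offset;
	}
	return 0;
}

int main(void)
{
	const struct toy_item items[] = {
		{ .offset = LEAF_DATA_SIZE - 100, .size = 100 },
		{ .offset = LEAF_DATA_SIZE - 160, .size = 60 },
		{ .offset = LEAF_DATA_SIZE - 200, .size = 40 },
	};

	return check_leaf_items(items, 3) ? 1 : 0;
}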
*/ - if (unlikely(btrfs_item_end_nr(leaf, slot) > + if (unlikely(btrfs_item_data_end(leaf, slot) > BTRFS_LEAF_DATA_SIZE(fs_info))) { generic_err(leaf, slot, "slot end outside of leaf, have %u expect range [0, %u]", - btrfs_item_end_nr(leaf, slot), + btrfs_item_data_end(leaf, slot), BTRFS_LEAF_DATA_SIZE(fs_info)); return -EUCLEAN; } diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c index 7c45d960b53c..b6cf39f4e7e4 100644 --- a/fs/btrfs/tree-defrag.c +++ b/fs/btrfs/tree-defrag.c @@ -27,14 +27,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, int next_key_ret = 0; u64 last_ret = 0; - if (root->fs_info->extent_root == root) { - /* - * there's recursion here right now in the tree locking, - * we can't defrag the extent root without deadlock - */ - goto out; - } - if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) goto out; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 6993dcdba6f1..c1ddbe800897 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -20,6 +20,7 @@ #include "block-group.h" #include "space-info.h" #include "zoned.h" +#include "inode-item.h" /* magic values for the inode_only field in btrfs_log_inode: * @@ -386,7 +387,7 @@ static int do_overwrite_item(struct btrfs_trans_handle *trans, if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) overwrite_root = 1; - item_size = btrfs_item_size_nr(eb, slot); + item_size = btrfs_item_size(eb, slot); src_ptr = btrfs_item_ptr_offset(eb, slot); /* Our caller must have done a search for the key for us. */ @@ -409,7 +410,7 @@ static int do_overwrite_item(struct btrfs_trans_handle *trans, if (ret == 0) { char *src_copy; char *dst_copy; - u32 dst_size = btrfs_item_size_nr(path->nodes[0], + u32 dst_size = btrfs_item_size(path->nodes[0], path->slots[0]); if (dst_size != item_size) goto insert; @@ -503,7 +504,7 @@ insert: /* make sure any existing item is the correct size */ if (ret == -EEXIST || ret == -EOVERFLOW) { u32 found_size; - found_size = btrfs_item_size_nr(path->nodes[0], + found_size = btrfs_item_size(path->nodes[0], path->slots[0]); if (found_size > item_size) btrfs_truncate_item(path, item_size, 1); @@ -872,17 +873,21 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, */ while (!list_empty(&ordered_sums)) { struct btrfs_ordered_sum *sums; + struct btrfs_root *csum_root; + sums = list_entry(ordered_sums.next, struct btrfs_ordered_sum, list); + csum_root = btrfs_csum_root(fs_info, + sums->bytenr); if (!ret) - ret = btrfs_del_csums(trans, - fs_info->csum_root, + ret = btrfs_del_csums(trans, csum_root, sums->bytenr, sums->len); if (!ret) ret = btrfs_csum_file_blocks(trans, - fs_info->csum_root, sums); + csum_root, + sums); list_del(&sums->list); kfree(sums); } @@ -1096,7 +1101,7 @@ again: * otherwise they must be unlinked as a conflict */ ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); - ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]); + ptr_end = ptr + btrfs_item_size(leaf, path->slots[0]); while (ptr < ptr_end) { victim_ref = (struct btrfs_inode_ref *)ptr; victim_name_len = btrfs_inode_ref_name_len(leaf, @@ -1155,7 +1160,7 @@ again: leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); + item_size = btrfs_item_size(leaf, path->slots[0]); base = btrfs_item_ptr_offset(leaf, path->slots[0]); while (cur_offset < item_size) { @@ -1318,7 +1323,7 @@ again: eb = path->nodes[0]; ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]); - ref_end = ref_ptr + btrfs_item_size_nr(eb, path->slots[0]); + ref_end = ref_ptr + btrfs_item_size(eb, 
path->slots[0]); while (ref_ptr < ref_end) { char *name = NULL; int namelen; @@ -1504,7 +1509,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, int ref_struct_size; ref_ptr = btrfs_item_ptr_offset(eb, slot); - ref_end = ref_ptr + btrfs_item_size_nr(eb, slot); + ref_end = ref_ptr + btrfs_item_size(eb, slot); if (key->type == BTRFS_INODE_EXTREF_KEY) { struct btrfs_inode_extref *r; @@ -1678,7 +1683,7 @@ static int count_inode_extrefs(struct btrfs_root *root, break; leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); + item_size = btrfs_item_size(leaf, path->slots[0]); ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); cur_offset = 0; @@ -1732,7 +1737,7 @@ process_slot: key.type != BTRFS_INODE_REF_KEY) break; ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); - ptr_end = ptr + btrfs_item_size_nr(path->nodes[0], + ptr_end = ptr + btrfs_item_size(path->nodes[0], path->slots[0]); while (ptr < ptr_end) { struct btrfs_inode_ref *ref; @@ -1950,6 +1955,34 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans, return ret; } +static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans, + struct btrfs_inode *dir, + struct btrfs_path *path, + struct btrfs_dir_item *dst_di, + const struct btrfs_key *log_key, + u8 log_type, + bool exists) +{ + struct btrfs_key found_key; + + btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key); + /* The existing dentry points to the same inode, don't delete it. */ + if (found_key.objectid == log_key->objectid && + found_key.type == log_key->type && + found_key.offset == log_key->offset && + btrfs_dir_type(path->nodes[0], dst_di) == log_type) + return 1; + + /* + * Don't drop the conflicting directory entry if the inode for the new + * entry doesn't exist. + */ + if (!exists) + return 0; + + return drop_one_dir_item(trans, path, dir, dst_di); +} + /* * take a single entry in a log directory item and replay it into * the subvolume. 
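delete_conflicting_dir_entry() collects the previously open-coded checks into one helper: keep the existing entry when it already points at the same inode with the same type, keep it as well when the inode for the logged entry does not exist yet, and otherwise drop it so the logged name can be replayed in its place. A compact restatement of that decision with simplified stand-in types:

#include <stdbool.h>
#include <stdio.h>

struct dentry_target {
	unsigned long long objectid;	/* inode the name points at */
	unsigned char type;		/* file type recorded in the entry */
};

enum conflict_action { KEEP_EXISTING, KEEP_FOR_NOW, DROP_EXISTING };

/* Mirror of the three outcomes the replay helper can produce. */
static enum conflict_action resolve_conflict(struct dentry_target existing,
					     struct dentry_target logged,
					     bool logged_inode_exists)
{
	/* Same inode and same type: nothing to replay for this name. */
	if (existing.objectid == logged.objectid &&
	    existing.type == logged.type)
		return KEEP_EXISTING;

	/* Don't drop the old entry while the new inode isn't there yet. */
	if (!logged_inode_exists)
		return KEEP_FOR_NOW;

	return DROP_EXISTING;
}

int main(void)
{
	struct dentry_target existing = { .objectid = 257, .type = 1 };
	struct dentry_target logged = { .objectid = 260, .type = 1 };

	switch (resolve_conflict(existing, logged, true)) {
	case KEEP_EXISTING:
		puts("existing entry already matches the log");
		break;
	case KEEP_FOR_NOW:
		puts("target inode missing, leave the old entry alone");
		break;
	case DROP_EXISTING:
		puts("unlink the old entry before replaying the new one");
		break;
	}
	return 0;
}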
@@ -1975,14 +2008,17 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, { char *name; int name_len; - struct btrfs_dir_item *dst_di; - struct btrfs_key found_key; + struct btrfs_dir_item *dir_dst_di; + struct btrfs_dir_item *index_dst_di; + bool dir_dst_matches = false; + bool index_dst_matches = false; struct btrfs_key log_key; + struct btrfs_key search_key; struct inode *dir; u8 log_type; bool exists; int ret; - bool update_size = (key->type == BTRFS_DIR_INDEX_KEY); + bool update_size = true; bool name_added = false; dir = read_one_inode(root, key->objectid); @@ -2008,76 +2044,53 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, exists = (ret == 0); ret = 0; - if (key->type == BTRFS_DIR_ITEM_KEY) { - dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid, - name, name_len, 1); - } else if (key->type == BTRFS_DIR_INDEX_KEY) { - dst_di = btrfs_lookup_dir_index_item(trans, root, path, - key->objectid, - key->offset, name, - name_len, 1); - } else { - /* Corruption */ - ret = -EINVAL; - goto out; - } - - if (IS_ERR(dst_di)) { - ret = PTR_ERR(dst_di); + dir_dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid, + name, name_len, 1); + if (IS_ERR(dir_dst_di)) { + ret = PTR_ERR(dir_dst_di); goto out; - } else if (!dst_di) { - /* we need a sequence number to insert, so we only - * do inserts for the BTRFS_DIR_INDEX_KEY types - */ - if (key->type != BTRFS_DIR_INDEX_KEY) + } else if (dir_dst_di) { + ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path, + dir_dst_di, &log_key, log_type, + exists); + if (ret < 0) goto out; - goto insert; + dir_dst_matches = (ret == 1); } - btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key); - /* the existing item matches the logged item */ - if (found_key.objectid == log_key.objectid && - found_key.type == log_key.type && - found_key.offset == log_key.offset && - btrfs_dir_type(path->nodes[0], dst_di) == log_type) { - update_size = false; + btrfs_release_path(path); + + index_dst_di = btrfs_lookup_dir_index_item(trans, root, path, + key->objectid, key->offset, + name, name_len, 1); + if (IS_ERR(index_dst_di)) { + ret = PTR_ERR(index_dst_di); goto out; + } else if (index_dst_di) { + ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path, + index_dst_di, &log_key, + log_type, exists); + if (ret < 0) + goto out; + index_dst_matches = (ret == 1); } - /* - * don't drop the conflicting directory entry if the inode - * for the new entry doesn't exist - */ - if (!exists) - goto out; + btrfs_release_path(path); - ret = drop_one_dir_item(trans, path, BTRFS_I(dir), dst_di); - if (ret) + if (dir_dst_matches && index_dst_matches) { + ret = 0; + update_size = false; goto out; - - if (key->type == BTRFS_DIR_INDEX_KEY) - goto insert; -out: - btrfs_release_path(path); - if (!ret && update_size) { - btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2); - ret = btrfs_update_inode(trans, root, BTRFS_I(dir)); } - kfree(name); - iput(dir); - if (!ret && name_added) - ret = 1; - return ret; -insert: /* * Check if the inode reference exists in the log for the given name, * inode and parent inode */ - found_key.objectid = log_key.objectid; - found_key.type = BTRFS_INODE_REF_KEY; - found_key.offset = key->objectid; - ret = backref_in_log(root->log_root, &found_key, 0, name, name_len); + search_key.objectid = log_key.objectid; + search_key.type = BTRFS_INODE_REF_KEY; + search_key.offset = key->objectid; + ret = backref_in_log(root->log_root, &search_key, 0, name, name_len); if (ret < 0) { goto 
out; } else if (ret) { @@ -2087,10 +2100,10 @@ insert: goto out; } - found_key.objectid = log_key.objectid; - found_key.type = BTRFS_INODE_EXTREF_KEY; - found_key.offset = key->objectid; - ret = backref_in_log(root->log_root, &found_key, key->objectid, name, + search_key.objectid = log_key.objectid; + search_key.type = BTRFS_INODE_EXTREF_KEY; + search_key.offset = key->objectid; + ret = backref_in_log(root->log_root, &search_key, key->objectid, name, name_len); if (ret < 0) { goto out; @@ -2109,87 +2122,76 @@ insert: name_added = true; update_size = false; ret = 0; - goto out; + +out: + if (!ret && update_size) { + btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2); + ret = btrfs_update_inode(trans, root, BTRFS_I(dir)); + } + kfree(name); + iput(dir); + if (!ret && name_added) + ret = 1; + return ret; } -/* - * find all the names in a directory item and reconcile them into - * the subvolume. Only BTRFS_DIR_ITEM_KEY types will have more than - * one name in a directory item, but the same code gets used for - * both directory index types - */ +/* Replay one dir item from a BTRFS_DIR_INDEX_KEY key. */ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct extent_buffer *eb, int slot, struct btrfs_key *key) { - int ret = 0; - u32 item_size = btrfs_item_size_nr(eb, slot); + int ret; struct btrfs_dir_item *di; - int name_len; - unsigned long ptr; - unsigned long ptr_end; - struct btrfs_path *fixup_path = NULL; - - ptr = btrfs_item_ptr_offset(eb, slot); - ptr_end = ptr + item_size; - while (ptr < ptr_end) { - di = (struct btrfs_dir_item *)ptr; - name_len = btrfs_dir_name_len(eb, di); - ret = replay_one_name(trans, root, path, eb, di, key); - if (ret < 0) - break; - ptr = (unsigned long)(di + 1); - ptr += name_len; - /* - * If this entry refers to a non-directory (directories can not - * have a link count > 1) and it was added in the transaction - * that was not committed, make sure we fixup the link count of - * the inode it the entry points to. Otherwise something like - * the following would result in a directory pointing to an - * inode with a wrong link that does not account for this dir - * entry: - * - * mkdir testdir - * touch testdir/foo - * touch testdir/bar - * sync - * - * ln testdir/bar testdir/bar_link - * ln testdir/foo testdir/foo_link - * xfs_io -c "fsync" testdir/bar - * - * <power failure> - * - * mount fs, log replay happens - * - * File foo would remain with a link count of 1 when it has two - * entries pointing to it in the directory testdir. This would - * make it impossible to ever delete the parent directory has - * it would result in stale dentries that can never be deleted. - */ - if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) { - struct btrfs_key di_key; + /* We only log dir index keys, which only contain a single dir item. 
*/ + ASSERT(key->type == BTRFS_DIR_INDEX_KEY); - if (!fixup_path) { - fixup_path = btrfs_alloc_path(); - if (!fixup_path) { - ret = -ENOMEM; - break; - } - } + di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); + ret = replay_one_name(trans, root, path, eb, di, key); + if (ret < 0) + return ret; - btrfs_dir_item_key_to_cpu(eb, di, &di_key); - ret = link_to_fixup_dir(trans, root, fixup_path, - di_key.objectid); - if (ret) - break; - } - ret = 0; + /* + * If this entry refers to a non-directory (directories can not have a + * link count > 1) and it was added in the transaction that was not + * committed, make sure we fixup the link count of the inode the entry + * points to. Otherwise something like the following would result in a + * directory pointing to an inode with a wrong link that does not account + * for this dir entry: + * + * mkdir testdir + * touch testdir/foo + * touch testdir/bar + * sync + * + * ln testdir/bar testdir/bar_link + * ln testdir/foo testdir/foo_link + * xfs_io -c "fsync" testdir/bar + * + * <power failure> + * + * mount fs, log replay happens + * + * File foo would remain with a link count of 1 when it has two entries + * pointing to it in the directory testdir. This would make it impossible + * to ever delete the parent directory has it would result in stale + * dentries that can never be deleted. + */ + if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) { + struct btrfs_path *fixup_path; + struct btrfs_key di_key; + + fixup_path = btrfs_alloc_path(); + if (!fixup_path) + return -ENOMEM; + + btrfs_dir_item_key_to_cpu(eb, di, &di_key); + ret = link_to_fixup_dir(trans, root, fixup_path, di_key.objectid); + btrfs_free_path(fixup_path); } - btrfs_free_path(fixup_path); + return ret; } @@ -2206,7 +2208,7 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, */ static noinline int find_dir_range(struct btrfs_root *root, struct btrfs_path *path, - u64 dirid, int key_type, + u64 dirid, u64 *start_ret, u64 *end_ret) { struct btrfs_key key; @@ -2219,7 +2221,7 @@ static noinline int find_dir_range(struct btrfs_root *root, return 1; key.objectid = dirid; - key.type = key_type; + key.type = BTRFS_DIR_LOG_INDEX_KEY; key.offset = *start_ret; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); @@ -2233,7 +2235,7 @@ static noinline int find_dir_range(struct btrfs_root *root, if (ret != 0) btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - if (key.type != key_type || key.objectid != dirid) { + if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) { ret = 1; goto next; } @@ -2260,7 +2262,7 @@ next: btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - if (key.type != key_type || key.objectid != dirid) { + if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) { ret = 1; goto out; } @@ -2291,95 +2293,82 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, int ret; struct extent_buffer *eb; int slot; - u32 item_size; struct btrfs_dir_item *di; - struct btrfs_dir_item *log_di; int name_len; - unsigned long ptr; - unsigned long ptr_end; char *name; - struct inode *inode; + struct inode *inode = NULL; struct btrfs_key location; -again: + /* + * Currenly we only log dir index keys. Even if we replay a log created + * by an older kernel that logged both dir index and dir item keys, all + * we need to do is process the dir index keys, we (and our caller) can + * safely ignore dir item keys (key type BTRFS_DIR_ITEM_KEY). 
+ */ + ASSERT(dir_key->type == BTRFS_DIR_INDEX_KEY); + eb = path->nodes[0]; slot = path->slots[0]; - item_size = btrfs_item_size_nr(eb, slot); - ptr = btrfs_item_ptr_offset(eb, slot); - ptr_end = ptr + item_size; - while (ptr < ptr_end) { - di = (struct btrfs_dir_item *)ptr; - name_len = btrfs_dir_name_len(eb, di); - name = kmalloc(name_len, GFP_NOFS); - if (!name) { - ret = -ENOMEM; - goto out; - } - read_extent_buffer(eb, name, (unsigned long)(di + 1), - name_len); - log_di = NULL; - if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) { - log_di = btrfs_lookup_dir_item(trans, log, log_path, - dir_key->objectid, - name, name_len, 0); - } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) { - log_di = btrfs_lookup_dir_index_item(trans, log, - log_path, - dir_key->objectid, - dir_key->offset, - name, name_len, 0); - } - if (!log_di) { - btrfs_dir_item_key_to_cpu(eb, di, &location); - btrfs_release_path(path); - btrfs_release_path(log_path); - inode = read_one_inode(root, location.objectid); - if (!inode) { - kfree(name); - return -EIO; - } + di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); + name_len = btrfs_dir_name_len(eb, di); + name = kmalloc(name_len, GFP_NOFS); + if (!name) { + ret = -ENOMEM; + goto out; + } - ret = link_to_fixup_dir(trans, root, - path, location.objectid); - if (ret) { - kfree(name); - iput(inode); - goto out; - } + read_extent_buffer(eb, name, (unsigned long)(di + 1), name_len); - inc_nlink(inode); - ret = btrfs_unlink_inode(trans, BTRFS_I(dir), - BTRFS_I(inode), name, name_len); - if (!ret) - ret = btrfs_run_delayed_items(trans); - kfree(name); - iput(inode); - if (ret) - goto out; + if (log) { + struct btrfs_dir_item *log_di; - /* there might still be more names under this key - * check and repeat if required - */ - ret = btrfs_search_slot(NULL, root, dir_key, path, - 0, 0); - if (ret == 0) - goto again; + log_di = btrfs_lookup_dir_index_item(trans, log, log_path, + dir_key->objectid, + dir_key->offset, + name, name_len, 0); + if (IS_ERR(log_di)) { + ret = PTR_ERR(log_di); + goto out; + } else if (log_di) { + /* The dentry exists in the log, we have nothing to do. */ ret = 0; goto out; - } else if (IS_ERR(log_di)) { - kfree(name); - return PTR_ERR(log_di); } - btrfs_release_path(log_path); - kfree(name); + } - ptr = (unsigned long)(di + 1); - ptr += name_len; + btrfs_dir_item_key_to_cpu(eb, di, &location); + btrfs_release_path(path); + btrfs_release_path(log_path); + inode = read_one_inode(root, location.objectid); + if (!inode) { + ret = -EIO; + goto out; } - ret = 0; + + ret = link_to_fixup_dir(trans, root, path, location.objectid); + if (ret) + goto out; + + inc_nlink(inode); + ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(inode), name, + name_len); + if (ret) + goto out; + + ret = btrfs_run_delayed_items(trans); + if (ret) + goto out; + + /* + * Unlike dir item keys, dir index keys can only have one name (entry) in + * them, as there are no key collisions since each key has a unique offset + * (an index number), so we're done. 
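The simplification rests on a property of the keys themselves: a BTRFS_DIR_INDEX_KEY uses the unique index number as its offset, so it always carries exactly one name, while a BTRFS_DIR_ITEM_KEY is keyed by a name hash and can pack several colliding names into a single item. A toy illustration of the difference (the hash function and the starting index are made up for the example):

#include <stdio.h>
#include <string.h>

/* Toy name hash with a deliberately tiny range to force a collision. */
static unsigned int name_hash(const char *name)
{
	return (unsigned int)(strlen(name) % 4);
}

int main(void)
{
	const char *names[] = { "foo", "bar", "baz_longer" };
	const char *by_index[8] = { NULL };	/* index key: offset = index nr */
	int buckets[4] = { 0 };			/* item key: offset = name hash */
	unsigned int i;

	for (i = 0; i < 3; i++) {
		by_index[i + 2] = names[i];	/* pretend indexes start at 2 */
		buckets[name_hash(names[i])]++;
	}

	/* Every dir index key maps to exactly one name. */
	for (i = 2; i < 5; i++)
		printf("index %u -> %s\n", i, by_index[i]);

	/* Dir item keys can collide: several names end up behind one key. */
	for (i = 0; i < 4; i++)
		if (buckets[i] > 1)
			printf("hash bucket %u holds %d names\n", i, buckets[i]);
	return 0;
}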
+ */ out: btrfs_release_path(path); btrfs_release_path(log_path); + kfree(name); + iput(inode); return ret; } @@ -2422,7 +2411,7 @@ process_leaf: } di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item); - total_size = btrfs_item_size_nr(path->nodes[0], i); + total_size = btrfs_item_size(path->nodes[0], i); cur = 0; while (cur < total_size) { u16 name_len = btrfs_dir_name_len(path->nodes[0], di); @@ -2499,7 +2488,6 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, { u64 range_start; u64 range_end; - int key_type = BTRFS_DIR_LOG_ITEM_KEY; int ret = 0; struct btrfs_key dir_key; struct btrfs_key found_key; @@ -2507,7 +2495,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, struct inode *dir; dir_key.objectid = dirid; - dir_key.type = BTRFS_DIR_ITEM_KEY; + dir_key.type = BTRFS_DIR_INDEX_KEY; log_path = btrfs_alloc_path(); if (!log_path) return -ENOMEM; @@ -2521,14 +2509,14 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, btrfs_free_path(log_path); return 0; } -again: + range_start = 0; range_end = 0; while (1) { if (del_all) range_end = (u64)-1; else { - ret = find_dir_range(log, path, dirid, key_type, + ret = find_dir_range(log, path, dirid, &range_start, &range_end); if (ret < 0) goto out; @@ -2555,8 +2543,10 @@ again: btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); if (found_key.objectid != dirid || - found_key.type != dir_key.type) - goto next_type; + found_key.type != dir_key.type) { + ret = 0; + goto out; + } if (found_key.offset > range_end) break; @@ -2575,15 +2565,7 @@ again: break; range_start = range_end + 1; } - -next_type: ret = 0; - if (key_type == BTRFS_DIR_LOG_ITEM_KEY) { - key_type = BTRFS_DIR_LOG_INDEX_KEY; - dir_key.type = BTRFS_DIR_INDEX_KEY; - btrfs_release_path(path); - goto again; - } out: btrfs_release_path(path); btrfs_free_path(log_path); @@ -2743,12 +2725,13 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, eb, i, &key); if (ret) break; - } else if (key.type == BTRFS_DIR_ITEM_KEY) { - ret = replay_one_dir_item(wc->trans, root, path, - eb, i, &key); - if (ret) - break; } + /* + * We don't log BTRFS_DIR_ITEM_KEY keys anymore, only the + * BTRFS_DIR_INDEX_KEY items which we use to derive the + * BTRFS_DIR_ITEM_KEY items. If we are replaying a log from an + * older kernel with such keys, ignore them. + */ } btrfs_free_path(path); return ret; @@ -3551,20 +3534,10 @@ void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, goto out_unlock; } - di = btrfs_lookup_dir_item(trans, log, path, dir_ino, - name, name_len, -1); - if (IS_ERR(di)) { - err = PTR_ERR(di); - goto fail; - } - if (di) { - ret = btrfs_delete_one_dir_name(trans, log, path, di); - if (ret) { - err = ret; - goto fail; - } - } - btrfs_release_path(path); + /* + * We only log dir index items of a directory, so we don't need to look + * for dir item keys. 
+ */ di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino, index, name, name_len, -1); if (IS_ERR(di)) { @@ -3628,7 +3601,7 @@ void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, struct btrfs_root *log, struct btrfs_path *path, - int key_type, u64 dirid, + u64 dirid, u64 first_offset, u64 last_offset) { int ret; @@ -3637,10 +3610,7 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, key.objectid = dirid; key.offset = first_offset; - if (key_type == BTRFS_DIR_ITEM_KEY) - key.type = BTRFS_DIR_LOG_ITEM_KEY; - else - key.type = BTRFS_DIR_LOG_INDEX_KEY; + key.type = BTRFS_DIR_LOG_INDEX_KEY; ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); if (ret) return ret; @@ -3675,7 +3645,7 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans, if (count == 1) { btrfs_item_key_to_cpu(src, &key, start_slot); - item_size = btrfs_item_size_nr(src, start_slot); + item_size = btrfs_item_size(src, start_slot); batch.keys = &key; batch.data_sizes = &item_size; batch.total_data_size = item_size; @@ -3698,7 +3668,7 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans, const int slot = start_slot + i; btrfs_item_key_to_cpu(src, &ins_keys[i], slot); - ins_sizes[i] = btrfs_item_size_nr(src, slot); + ins_sizes[i] = btrfs_item_size(src, slot); batch.total_data_size += ins_sizes[i]; } } @@ -3732,7 +3702,6 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_path *path, struct btrfs_path *dst_path, - int key_type, struct btrfs_log_ctx *ctx) { struct btrfs_root *log = inode->root->log_root; @@ -3740,24 +3709,18 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, const int nritems = btrfs_header_nritems(src); const u64 ino = btrfs_ino(inode); const bool inode_logged_before = inode_logged(trans, inode); - u64 last_logged_key_offset; bool last_found = false; int batch_start = 0; int batch_size = 0; int i; - if (key_type == BTRFS_DIR_ITEM_KEY) - last_logged_key_offset = inode->last_dir_item_offset; - else - last_logged_key_offset = inode->last_dir_index_offset; - for (i = path->slots[0]; i < nritems; i++) { struct btrfs_key key; int ret; btrfs_item_key_to_cpu(src, &key, i); - if (key.objectid != ino || key.type != key_type) { + if (key.objectid != ino || key.type != BTRFS_DIR_INDEX_KEY) { last_found = true; break; } @@ -3806,7 +3769,7 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, * we logged is in the log tree, saving time and avoiding adding * contention on the log tree. */ - if (key.offset > last_logged_key_offset) + if (key.offset > inode->last_dir_index_offset) goto add_to_batch; /* * Check if the key was already logged before. 
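process_dir_items_leaf() now copies new dir index items out of a source leaf in batches: consecutive slots that need logging are accumulated and inserted with one batch operation, keys at or below last_dir_index_offset are first checked against the log so already-logged entries are not copied again, and a skipped slot ends the current run. A simplified walk over a sorted list of index offsets shows the batching idea; flush_batch() and already_logged() are stand-ins, not the kernel helpers:

#include <stdbool.h>
#include <stdio.h>

/* Model: pretend these index offsets were copied to the log earlier. */
static bool already_logged(unsigned long long offset)
{
	return offset == 3 || offset == 4 || offset == 9;
}

/* Stand-in for one batched insertion of @count items starting at @start. */
static void flush_batch(const unsigned long long *keys, int start, int count)
{
	if (count)
		printf("batch insert of %d item(s) starting at offset %llu\n",
		       count, keys[start]);
}

int main(void)
{
	/* Dir index offsets found in one source leaf, in key order. */
	const unsigned long long offsets[] = { 3, 4, 5, 6, 9, 10, 11 };
	const int nr = sizeof(offsets) / sizeof(offsets[0]);
	int batch_start = 0, batch_size = 0, i;

	for (i = 0; i < nr; i++) {
		if (already_logged(offsets[i])) {
			/* A skipped slot ends the current contiguous run. */
			flush_batch(offsets, batch_start, batch_size);
			batch_size = 0;
			continue;
		}
		if (!batch_size)
			batch_start = i;
		batch_size++;
	}
	flush_batch(offsets, batch_start, batch_size);
	return 0;
}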
If not we can add @@ -3865,7 +3828,7 @@ add_to_batch: static noinline int log_dir_items(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_path *path, - struct btrfs_path *dst_path, int key_type, + struct btrfs_path *dst_path, struct btrfs_log_ctx *ctx, u64 min_offset, u64 *last_offset_ret) { @@ -3879,7 +3842,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, u64 ino = btrfs_ino(inode); min_key.objectid = ino; - min_key.type = key_type; + min_key.type = BTRFS_DIR_INDEX_KEY; min_key.offset = min_offset; ret = btrfs_search_forward(root, &min_key, path, trans->transid); @@ -3888,9 +3851,10 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, * we didn't find anything from this transaction, see if there * is anything at all */ - if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) { + if (ret != 0 || min_key.objectid != ino || + min_key.type != BTRFS_DIR_INDEX_KEY) { min_key.objectid = ino; - min_key.type = key_type; + min_key.type = BTRFS_DIR_INDEX_KEY; min_key.offset = (u64)-1; btrfs_release_path(path); ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); @@ -3898,7 +3862,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, btrfs_release_path(path); return ret; } - ret = btrfs_previous_item(root, path, ino, key_type); + ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY); /* if ret == 0 there are items for this type, * create a range to tell us the last key of this type. @@ -3909,18 +3873,18 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, struct btrfs_key tmp; btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); - if (key_type == tmp.type) + if (tmp.type == BTRFS_DIR_INDEX_KEY) first_offset = max(min_offset, tmp.offset) + 1; } goto done; } /* go backward to find any previous key */ - ret = btrfs_previous_item(root, path, ino, key_type); + ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY); if (ret == 0) { struct btrfs_key tmp; btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); - if (key_type == tmp.type) { + if (tmp.type == BTRFS_DIR_INDEX_KEY) { first_offset = tmp.offset; ret = overwrite_item(trans, log, dst_path, path->nodes[0], path->slots[0], @@ -3951,8 +3915,7 @@ search: * from our directory */ while (1) { - ret = process_dir_items_leaf(trans, inode, path, dst_path, - key_type, ctx); + ret = process_dir_items_leaf(trans, inode, path, dst_path, ctx); if (ret != 0) { if (ret < 0) err = ret; @@ -3973,7 +3936,7 @@ search: goto done; } btrfs_item_key_to_cpu(path->nodes[0], &min_key, path->slots[0]); - if (min_key.objectid != ino || min_key.type != key_type) { + if (min_key.objectid != ino || min_key.type != BTRFS_DIR_INDEX_KEY) { last_offset = (u64)-1; goto done; } @@ -4004,8 +3967,8 @@ done: * insert the log range keys to indicate where the log * is valid */ - ret = insert_dir_log_key(trans, log, path, key_type, - ino, first_offset, last_offset); + ret = insert_dir_log_key(trans, log, path, ino, first_offset, + last_offset); if (ret) err = ret; } @@ -4033,35 +3996,28 @@ static noinline int log_directory_changes(struct btrfs_trans_handle *trans, u64 min_key; u64 max_key; int ret; - int key_type = BTRFS_DIR_ITEM_KEY; /* * If this is the first time we are being logged in the current * transaction, or we were logged before but the inode was evicted and - * reloaded later, in which case its logged_trans is 0, reset the values - * of the last logged key offsets. 
Note that we don't use the helper + * reloaded later, in which case its logged_trans is 0, reset the value + * of the last logged key offset. Note that we don't use the helper * function inode_logged() here - that is because the function returns * true after an inode eviction, assuming the worst case as it can not * know for sure if the inode was logged before. So we can not skip key * searches in the case the inode was evicted, because it may not have * been logged in this transaction and may have been logged in a past - * transaction, so we need to reset the last dir item and index offsets - * to (u64)-1. + * transaction, so we need to reset the last dir index offset to (u64)-1. */ - if (inode->logged_trans != trans->transid) { - inode->last_dir_item_offset = (u64)-1; + if (inode->logged_trans != trans->transid) inode->last_dir_index_offset = (u64)-1; - } -again: + min_key = 0; max_key = 0; - if (key_type == BTRFS_DIR_ITEM_KEY) - ctx->last_dir_item_offset = inode->last_dir_item_offset; - else - ctx->last_dir_item_offset = inode->last_dir_index_offset; + ctx->last_dir_item_offset = inode->last_dir_index_offset; while (1) { - ret = log_dir_items(trans, inode, path, dst_path, key_type, + ret = log_dir_items(trans, inode, path, dst_path, ctx, min_key, &max_key); if (ret) return ret; @@ -4070,13 +4026,8 @@ again: min_key = max_key + 1; } - if (key_type == BTRFS_DIR_ITEM_KEY) { - inode->last_dir_item_offset = ctx->last_dir_item_offset; - key_type = BTRFS_DIR_INDEX_KEY; - goto again; - } else { - inode->last_dir_index_offset = ctx->last_dir_item_offset; - } + inode->last_dir_index_offset = ctx->last_dir_item_offset; + return 0; } @@ -4147,14 +4098,14 @@ static int truncate_inode_items(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, u64 new_size, u32 min_type) { - int ret; - - do { - ret = btrfs_truncate_inode_items(trans, log_root, inode, - new_size, min_type, NULL); - } while (ret == -EAGAIN); + struct btrfs_truncate_control control = { + .new_size = new_size, + .ino = btrfs_ino(inode), + .min_type = min_type, + .skip_ref_updates = true, + }; - return ret; + return btrfs_truncate_inode_items(trans, log_root, &control); } static void fill_inode_item(struct btrfs_trans_handle *trans, @@ -4350,7 +4301,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, batch.nr = nr; for (i = 0; i < nr; i++) { - ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot); + ins_sizes[i] = btrfs_item_size(src, i + start_slot); batch.total_data_size += ins_sizes[i]; btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot); } @@ -4394,6 +4345,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, found_type = btrfs_file_extent_type(src, extent); if (found_type == BTRFS_FILE_EXTENT_REG) { + struct btrfs_root *csum_root; u64 ds, dl, cs, cl; ds = btrfs_file_extent_disk_bytenr(src, extent); @@ -4412,8 +4364,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, cl = dl; } - ret = btrfs_lookup_csums_range( - fs_info->csum_root, + csum_root = btrfs_csum_root(fs_info, ds); + ret = btrfs_lookup_csums_range(csum_root, ds + cs, ds + cs + cl - 1, &ordered_sums, 0); if (ret) @@ -4465,6 +4417,7 @@ static int log_extent_csums(struct btrfs_trans_handle *trans, struct btrfs_log_ctx *ctx) { struct btrfs_ordered_extent *ordered; + struct btrfs_root *csum_root; u64 csum_offset; u64 csum_len; u64 mod_start = em->mod_start; @@ -4545,7 +4498,8 @@ static int log_extent_csums(struct btrfs_trans_handle *trans, } /* block start is already adjusted for the file extent offset. 
*/ - ret = btrfs_lookup_csums_range(trans->fs_info->csum_root, + csum_root = btrfs_csum_root(trans->fs_info, em->block_start); + ret = btrfs_lookup_csums_range(csum_root, em->block_start + csum_offset, em->block_start + csum_offset + csum_len - 1, &ordered_sums, 0); @@ -5166,7 +5120,7 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb, struct btrfs_path *search_path; char *name = NULL; u32 name_len = 0; - u32 item_size = btrfs_item_size_nr(eb, slot); + u32 item_size = btrfs_item_size(eb, slot); u32 cur_offset = 0; unsigned long ptr = btrfs_item_ptr_offset(eb, slot); @@ -5899,18 +5853,12 @@ struct btrfs_dir_list { * link_to_fixup_dir()); * * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that - * while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and - * BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item + * while logging the inode's items new index items (key type + * BTRFS_DIR_INDEX_KEY) are added to fs/subvol tree and the logged inode item * has a size that doesn't match the sum of the lengths of all the logged - * names. This does not result in a problem because if a dir_item key is - * logged but its matching dir_index key is not logged, at log replay time we - * don't use it to replay the respective name (see replay_one_name()). On the - * other hand if only the dir_index key ends up being logged, the respective - * name is added to the fs/subvol tree with both the dir_item and dir_index - * keys created (see replay_one_name()). - * The directory's inode item with a wrong i_size is not a problem as well, - * since we don't use it at log replay time to set the i_size in the inode - * item of the fs/subvol tree (see overwrite_item()). + * names - this is ok, not a problem, because at log replay time we set the + * directory's i_size to the correct value (see replay_one_name() and + * do_overwrite_item()). */ static int log_new_dir_dentries(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -5956,7 +5904,7 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans, goto next_dir_inode; min_key.objectid = dir_elem->ino; - min_key.type = BTRFS_DIR_ITEM_KEY; + min_key.type = BTRFS_DIR_INDEX_KEY; min_key.offset = 0; again: btrfs_release_path(path); @@ -5981,7 +5929,7 @@ process_leaf: btrfs_item_key_to_cpu(leaf, &min_key, i); if (min_key.objectid != dir_elem->ino || - min_key.type != BTRFS_DIR_ITEM_KEY) + min_key.type != BTRFS_DIR_INDEX_KEY) goto next_dir_inode; di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item); @@ -6093,7 +6041,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY) break; - item_size = btrfs_item_size_nr(leaf, slot); + item_size = btrfs_item_size(leaf, slot); ptr = btrfs_item_ptr_offset(leaf, slot); while (cur_offset < item_size) { struct btrfs_key inode_key; @@ -6795,15 +6743,14 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, * was previously logged, make sure the next log attempt on the directory * is not skipped and logs the inode again. This is because the log may * not currently be authoritative for a range including the old - * BTRFS_DIR_ITEM_KEY and BTRFS_DIR_INDEX_KEY keys, so we want to make - * sure after a log replay we do not end up with both the new and old - * dentries around (in case the inode is a directory we would have a - * directory with two hard links and 2 inode references for different - * parents). 
The next log attempt of old_dir will happen at - * btrfs_log_all_parents(), called through btrfs_log_inode_parent() - * below, because we have previously set inode->last_unlink_trans to the - * current transaction ID, either here or at btrfs_record_unlink_dir() in - * case inode is a directory. + * BTRFS_DIR_INDEX_KEY key, so we want to make sure after a log replay we + * do not end up with both the new and old dentries around (in case the + * inode is a directory we would have a directory with two hard links and + * 2 inode references for different parents). The next log attempt of + * old_dir will happen at btrfs_log_all_parents(), called through + * btrfs_log_inode_parent() below, because we have previously set + * inode->last_unlink_trans to the current transaction ID, either here or + * at btrfs_record_unlink_dir() in case the inode is a directory. */ if (old_dir) old_dir->logged_trans = 0; diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index 74023c8a783f..b458452a1aaf 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -52,7 +52,7 @@ static int btrfs_uuid_tree_lookup(struct btrfs_root *uuid_root, u8 *uuid, eb = path->nodes[0]; slot = path->slots[0]; - item_size = btrfs_item_size_nr(eb, slot); + item_size = btrfs_item_size(eb, slot); offset = btrfs_item_ptr_offset(eb, slot); ret = -ENOENT; @@ -125,7 +125,7 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, eb = path->nodes[0]; slot = path->slots[0]; offset = btrfs_item_ptr_offset(eb, slot); - offset += btrfs_item_size_nr(eb, slot) - sizeof(subid_le); + offset += btrfs_item_size(eb, slot) - sizeof(subid_le); } else { btrfs_warn(fs_info, "insert uuid item failed %d (0x%016llx, 0x%016llx) type %u!", @@ -186,7 +186,7 @@ int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, eb = path->nodes[0]; slot = path->slots[0]; offset = btrfs_item_ptr_offset(eb, slot); - item_size = btrfs_item_size_nr(eb, slot); + item_size = btrfs_item_size(eb, slot); if (!IS_ALIGNED(item_size, sizeof(u64))) { btrfs_warn(fs_info, "uuid item with illegal size %lu!", (unsigned long)item_size); @@ -208,7 +208,7 @@ int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, goto out; } - item_size = btrfs_item_size_nr(eb, slot); + item_size = btrfs_item_size(eb, slot); if (item_size == sizeof(subid)) { ret = btrfs_del_item(trans, uuid_root, path); goto out; @@ -331,7 +331,7 @@ again_search_slot: goto skip; offset = btrfs_item_ptr_offset(leaf, slot); - item_size = btrfs_item_size_nr(leaf, slot); + item_size = btrfs_item_size(leaf, slot); if (!IS_ALIGNED(item_size, sizeof(u64))) { btrfs_warn(fs_info, "uuid item with illegal size %lu!", diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c index 4968535dfff0..90eb5c2830a9 100644 --- a/fs/btrfs/verity.c +++ b/fs/btrfs/verity.c @@ -333,7 +333,7 @@ static int read_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset, if (key.objectid != btrfs_ino(inode) || key.type != key_type) break; - item_end = btrfs_item_size_nr(leaf, path->slots[0]) + key.offset; + item_end = btrfs_item_size(leaf, path->slots[0]) + key.offset; if (copied > 0) { /* diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index fd0ced829edb..b07d382d53a8 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -34,6 +34,10 @@ #include "discard.h" #include "zoned.h" +#define BTRFS_BLOCK_GROUP_STRIPE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \ + BTRFS_BLOCK_GROUP_RAID10 | \ + BTRFS_BLOCK_GROUP_RAID56_MASK) + const struct btrfs_raid_attr 
btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { [BTRFS_RAID_RAID10] = { .sub_stripes = 2, @@ -1162,7 +1166,6 @@ static void btrfs_close_one_device(struct btrfs_device *device) ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)); ASSERT(list_empty(&device->dev_alloc_list)); ASSERT(list_empty(&device->post_commit_list)); - ASSERT(atomic_read(&device->reada_in_flight) == 0); } static void close_fs_devices(struct btrfs_fs_devices *fs_devices) @@ -2146,8 +2149,6 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, } ret = btrfs_shrink_device(device, 0); - if (!ret) - btrfs_reada_remove_dev(device); if (ret) goto error_undo; @@ -2245,7 +2246,6 @@ out: return ret; error_undo: - btrfs_reada_undo_remove_dev(device); if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { mutex_lock(&fs_info->chunk_mutex); list_add(&device->dev_alloc_list, @@ -2431,21 +2431,15 @@ struct btrfs_device *btrfs_find_device_by_devspec( return device; } -/* - * does all the dirty work required for changing file system's UUID. - */ -static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info) +static struct btrfs_fs_devices *btrfs_init_sprout(struct btrfs_fs_info *fs_info) { struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_fs_devices *old_devices; struct btrfs_fs_devices *seed_devices; - struct btrfs_super_block *disk_super = fs_info->super_copy; - struct btrfs_device *device; - u64 super_flags; lockdep_assert_held(&uuid_mutex); if (!fs_devices->seeding) - return -EINVAL; + return ERR_PTR(-EINVAL); /* * Private copy of the seed devices, anchored at @@ -2453,7 +2447,7 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info) */ seed_devices = alloc_fs_devices(NULL, NULL); if (IS_ERR(seed_devices)) - return PTR_ERR(seed_devices); + return seed_devices; /* * It's necessary to retain a copy of the original seed fs_devices in @@ -2464,7 +2458,7 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info) old_devices = clone_fs_devices(fs_devices); if (IS_ERR(old_devices)) { kfree(seed_devices); - return PTR_ERR(old_devices); + return old_devices; } list_add(&old_devices->fs_list, &fs_uuids); @@ -2475,7 +2469,41 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info) INIT_LIST_HEAD(&seed_devices->alloc_list); mutex_init(&seed_devices->device_list_mutex); - mutex_lock(&fs_devices->device_list_mutex); + return seed_devices; +} + +/* + * Splice seed devices into the sprout fs_devices. + * Generate a new fsid for the sprouted read-write filesystem. + */ +static void btrfs_setup_sprout(struct btrfs_fs_info *fs_info, + struct btrfs_fs_devices *seed_devices) +{ + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_super_block *disk_super = fs_info->super_copy; + struct btrfs_device *device; + u64 super_flags; + + /* + * We are updating the fsid, the thread leading to device_list_add() + * could race, so uuid_mutex is needed. + */ + lockdep_assert_held(&uuid_mutex); + + /* + * The threads listed below may traverse dev_list but can do that without + * device_list_mutex: + * - All device ops and balance - as we are in btrfs_exclop_start. + * - Various dev_list readers - are using RCU. + * - btrfs_ioctl_fitrim() - is using RCU. 
+ * + * For-read threads as below are using device_list_mutex: + * - Readonly scrub btrfs_scrub_dev() + * - Readonly scrub btrfs_scrub_progress() + * - btrfs_get_dev_stats() + */ + lockdep_assert_held(&fs_devices->device_list_mutex); + list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices, synchronize_rcu); list_for_each_entry(device, &seed_devices->devices, dev_list) @@ -2491,13 +2519,10 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info) generate_random_uuid(fs_devices->fsid); memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE); memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); - mutex_unlock(&fs_devices->device_list_mutex); super_flags = btrfs_super_flags(disk_super) & ~BTRFS_SUPER_FLAG_SEEDING; btrfs_set_super_flags(disk_super, super_flags); - - return 0; } /* @@ -2588,10 +2613,11 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path struct super_block *sb = fs_info->sb; struct rcu_string *name; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_fs_devices *seed_devices; u64 orig_super_total_bytes; u64 orig_super_num_devices; - int seeding_dev = 0; int ret = 0; + bool seeding_dev = false; bool locked = false; if (sb_rdonly(sb) && !fs_devices->seeding) @@ -2608,7 +2634,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path } if (fs_devices->seeding) { - seeding_dev = 1; + seeding_dev = true; down_write(&sb->s_umount); mutex_lock(&uuid_mutex); locked = true; @@ -2643,7 +2669,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path device->fs_info = fs_info; device->bdev = bdev; - ret = btrfs_get_dev_zone_info(device); + ret = btrfs_get_dev_zone_info(device, false); if (ret) goto error_free_device; @@ -2671,18 +2697,25 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path if (seeding_dev) { btrfs_clear_sb_rdonly(sb); - ret = btrfs_prepare_sprout(fs_info); - if (ret) { + + /* GFP_KERNEL allocation must not be under device_list_mutex */ + seed_devices = btrfs_init_sprout(fs_info); + if (IS_ERR(seed_devices)) { + ret = PTR_ERR(seed_devices); btrfs_abort_transaction(trans, ret); goto error_trans; } + } + + mutex_lock(&fs_devices->device_list_mutex); + if (seeding_dev) { + btrfs_setup_sprout(fs_info, seed_devices); btrfs_assign_next_active_device(fs_info->fs_devices->latest_dev, device); } device->fs_devices = fs_devices; - mutex_lock(&fs_devices->device_list_mutex); mutex_lock(&fs_info->chunk_mutex); list_add_rcu(&device->dev_list, &fs_devices->devices); list_add(&device->dev_alloc_list, &fs_devices->alloc_list); @@ -2744,7 +2777,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path /* * fs_devices now represents the newly sprouted filesystem and - * its fsid has been changed by btrfs_prepare_sprout + * its fsid has been changed by btrfs_sprout_splice(). 
*/ btrfs_sysfs_update_sprout_fsid(fs_devices); } @@ -4357,8 +4390,10 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, ret = __btrfs_balance(fs_info); mutex_lock(&fs_info->balance_mutex); - if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req)) + if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req)) { btrfs_info(fs_info, "balance: paused"); + btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED); + } /* * Balance can be canceled by: * @@ -4434,6 +4469,10 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info) return 0; } + spin_lock(&fs_info->super_lock); + ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED); + fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE; + spin_unlock(&fs_info->super_lock); /* * A ro->rw remount sequence should continue with the paused balance * regardless of who pauses it, system or the user as of now, so set @@ -4502,7 +4541,7 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info) * is in a paused state and must have fs_info::balance_ctl properly * set up. */ - if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED)) btrfs_warn(fs_info, "balance: cannot set exclusive op status, resume manually"); @@ -4643,7 +4682,7 @@ int btrfs_uuid_scan_kthread(void *data) eb = path->nodes[0]; slot = path->slots[0]; - item_size = btrfs_item_size_nr(eb, slot); + item_size = btrfs_item_size(eb, slot); if (item_size < sizeof(root_item)) goto skip; @@ -5504,7 +5543,6 @@ int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans, struct btrfs_block_group *bg) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_root *extent_root = fs_info->extent_root; struct btrfs_root *chunk_root = fs_info->chunk_root; struct btrfs_key key; struct btrfs_chunk *chunk; @@ -5576,7 +5614,7 @@ int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans, } btrfs_set_stack_chunk_length(chunk, bg->length); - btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid); + btrfs_set_stack_chunk_owner(chunk, BTRFS_EXTENT_TREE_OBJECTID); btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len); btrfs_set_stack_chunk_type(chunk, map->type); btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes); @@ -6314,7 +6352,8 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em, stripe_offset = offset - stripe_offset; data_stripes = nr_data_stripes(map); - if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { + /* Only stripe based profiles needs to check against stripe length. 
*/ + if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK) { u64 max_len = stripe_len - stripe_offset; /* @@ -6937,11 +6976,8 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, INIT_LIST_HEAD(&dev->dev_alloc_list); INIT_LIST_HEAD(&dev->post_commit_list); - atomic_set(&dev->reada_in_flight, 0); atomic_set(&dev->dev_stats_ccnt, 0); btrfs_device_data_ordered_init(dev); - INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); - INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); extent_io_tree_init(fs_info, &dev->alloc_state, IO_TREE_DEVICE_ALLOC_STATE, NULL); @@ -7730,7 +7766,7 @@ static int btrfs_device_init_dev_stats(struct btrfs_device *device, } slot = path->slots[0]; eb = path->nodes[0]; - item_size = btrfs_item_size_nr(eb, slot); + item_size = btrfs_item_size(eb, slot); ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item); @@ -7808,7 +7844,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, } if (ret == 0 && - btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) { + btrfs_item_size(path->nodes[0], path->slots[0]) < sizeof(*ptr)) { /* need to delete old one and insert a new one */ ret = btrfs_del_item(trans, dev_root, path); if (ret != 0) { @@ -8298,23 +8334,26 @@ out: return ret; } -int btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical) +bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical) { struct btrfs_block_group *cache; + if (!btrfs_is_zoned(fs_info)) + return false; + /* Do not attempt to repair in degraded state */ if (btrfs_test_opt(fs_info, DEGRADED)) - return 0; + return true; cache = btrfs_lookup_block_group(fs_info, logical); if (!cache) - return 0; + return true; spin_lock(&cache->lock); if (cache->relocating_repair) { spin_unlock(&cache->lock); btrfs_put_block_group(cache); - return 0; + return true; } cache->relocating_repair = 1; spin_unlock(&cache->lock); @@ -8322,5 +8361,5 @@ int btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical) kthread_run(relocating_repair_kthread, cache, "btrfs-relocating-repair"); - return 0; + return true; } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 3b8130680749..005c9e2a491a 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -123,13 +123,6 @@ struct btrfs_device { /* per-device scrub information */ struct scrub_ctx *scrub_ctx; - /* readahead state */ - atomic_t reada_in_flight; - u64 reada_next; - struct reada_zone *reada_curr_zone; - struct radix_tree_root reada_zones; - struct radix_tree_root reada_extents; - /* disk I/O failure stats. 
For detailed description refer to * enum btrfs_dev_stat_values in ioctl.h */ int dev_stats_valid; @@ -637,6 +630,6 @@ enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags int btrfs_bg_type_to_factor(u64 flags); const char *btrfs_bg_type_to_raid_name(u64 flags); int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info); -int btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical); +bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical); #endif diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 2837b4c8424d..99abf41b89b9 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -168,9 +168,8 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, const int slot = path->slots[0]; struct extent_buffer *leaf = path->nodes[0]; const u16 old_data_len = btrfs_dir_data_len(leaf, di); - const u32 item_size = btrfs_item_size_nr(leaf, slot); + const u32 item_size = btrfs_item_size(leaf, slot); const u32 data_size = sizeof(*di) + name_len + size; - struct btrfs_item *item; unsigned long data_ptr; char *ptr; @@ -196,9 +195,8 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, btrfs_extend_item(path, data_size); } - item = btrfs_item_nr(slot); ptr = btrfs_item_ptr(leaf, slot, char); - ptr += btrfs_item_size(leaf, item) - data_size; + ptr += btrfs_item_size(leaf, slot) - data_size; di = (struct btrfs_dir_item *)ptr; btrfs_set_dir_data_len(leaf, di, size); data_ptr = ((unsigned long)(di + 1)) + name_len; @@ -335,7 +333,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) goto next_item; di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); - item_size = btrfs_item_size_nr(leaf, slot); + item_size = btrfs_item_size(leaf, slot); cur = 0; while (cur < item_size) { u16 name_len = btrfs_dir_name_len(leaf, di); diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 678a29469511..f559d517c7c4 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -5,6 +5,7 @@ #include <linux/blkdev.h> #include <linux/sched/mm.h> #include <linux/atomic.h> +#include <linux/vmalloc.h> #include "ctree.h" #include "volumes.h" #include "zoned.h" @@ -213,6 +214,8 @@ static int emulate_report_zones(struct btrfs_device *device, u64 pos, static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, struct blk_zone *zones, unsigned int *nr_zones) { + struct btrfs_zoned_device_info *zinfo = device->zone_info; + u32 zno; int ret; if (!*nr_zones) @@ -224,6 +227,34 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, return 0; } + /* Check cache */ + if (zinfo->zone_cache) { + unsigned int i; + + ASSERT(IS_ALIGNED(pos, zinfo->zone_size)); + zno = pos >> zinfo->zone_size_shift; + /* + * We cannot report zones beyond the zone end. So, it is OK to + * cap *nr_zones to at the end. 
+ */ + *nr_zones = min_t(u32, *nr_zones, zinfo->nr_zones - zno); + + for (i = 0; i < *nr_zones; i++) { + struct blk_zone *zone_info; + + zone_info = &zinfo->zone_cache[zno + i]; + if (!zone_info->len) + break; + } + + if (i == *nr_zones) { + /* Cache hit on all the zones */ + memcpy(zones, zinfo->zone_cache + zno, + sizeof(*zinfo->zone_cache) * *nr_zones); + return 0; + } + } + ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones, copy_zone_info_cb, zones); if (ret < 0) { @@ -237,6 +268,11 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, if (!ret) return -EIO; + /* Populate cache */ + if (zinfo->zone_cache) + memcpy(zinfo->zone_cache + zno, zones, + sizeof(*zinfo->zone_cache) * *nr_zones); + return 0; } @@ -300,7 +336,7 @@ int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info) if (!device->bdev) continue; - ret = btrfs_get_dev_zone_info(device); + ret = btrfs_get_dev_zone_info(device, true); if (ret) break; } @@ -309,7 +345,7 @@ int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info) return ret; } -int btrfs_get_dev_zone_info(struct btrfs_device *device) +int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) { struct btrfs_fs_info *fs_info = device->fs_info; struct btrfs_zoned_device_info *zone_info = NULL; @@ -339,6 +375,8 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device) if (!zone_info) return -ENOMEM; + device->zone_info = zone_info; + if (!bdev_is_zoned(bdev)) { if (!fs_info->zone_size) { ret = calculate_emulated_zone_size(fs_info); @@ -407,6 +445,23 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device) goto out; } + /* + * Enable zone cache only for a zoned device. On a non-zoned device, we + * fill the zone info with emulated CONVENTIONAL zones, so no need to + * use the cache. 
+ */ + if (populate_cache && bdev_is_zoned(device->bdev)) { + zone_info->zone_cache = vzalloc(sizeof(struct blk_zone) * + zone_info->nr_zones); + if (!zone_info->zone_cache) { + btrfs_err_in_rcu(device->fs_info, + "zoned: failed to allocate zone cache for %s", + rcu_str_deref(device->name)); + ret = -ENOMEM; + goto out; + } + } + /* Get zones type */ nactive = 0; while (sector < nr_sectors) { @@ -505,8 +560,6 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device) kfree(zones); - device->zone_info = zone_info; - switch (bdev_zoned_model(bdev)) { case BLK_ZONED_HM: model = "host-managed zoned"; @@ -539,11 +592,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device) out: kfree(zones); out_free_zone_info: - bitmap_free(zone_info->active_zones); - bitmap_free(zone_info->empty_zones); - bitmap_free(zone_info->seq_zones); - kfree(zone_info); - device->zone_info = NULL; + btrfs_destroy_dev_zone_info(device); return ret; } @@ -558,6 +607,7 @@ void btrfs_destroy_dev_zone_info(struct btrfs_device *device) bitmap_free(zone_info->active_zones); bitmap_free(zone_info->seq_zones); bitmap_free(zone_info->empty_zones); + vfree(zone_info->zone_cache); kfree(zone_info); device->zone_info = NULL; } @@ -1104,7 +1154,7 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache, u64 *offset_ret) { struct btrfs_fs_info *fs_info = cache->fs_info; - struct btrfs_root *root = fs_info->extent_root; + struct btrfs_root *root; struct btrfs_path *path; struct btrfs_key key; struct btrfs_key found_key; @@ -1119,6 +1169,7 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache, key.type = 0; key.offset = 0; + root = btrfs_extent_root(fs_info, key.objectid); ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); /* We should not find the exact match */ if (!ret) @@ -1586,29 +1637,19 @@ bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, if (!btrfs_is_zoned(fs_info)) return true; - cache = *cache_ret; + cache = btrfs_lookup_block_group(fs_info, eb->start); + if (!cache) + return true; - if (cache && (eb->start < cache->start || - cache->start + cache->length <= eb->start)) { + if (cache->meta_write_pointer != eb->start) { btrfs_put_block_group(cache); cache = NULL; - *cache_ret = NULL; + ret = false; + } else { + cache->meta_write_pointer = eb->start + eb->len; } - if (!cache) - cache = btrfs_lookup_block_group(fs_info, eb->start); - - if (cache) { - if (cache->meta_write_pointer != eb->start) { - btrfs_put_block_group(cache); - cache = NULL; - ret = false; - } else { - cache->meta_write_pointer = eb->start + eb->len; - } - - *cache_ret = cache; - } + *cache_ret = cache; return ret; } @@ -1884,7 +1925,7 @@ int btrfs_zone_finish(struct btrfs_block_group *block_group) return ret; } -bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, int raid_index) +bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags) { struct btrfs_device *device; bool ret = false; @@ -1893,8 +1934,7 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, int raid_index return true; /* Non-single profiles are not supported yet */ - if (raid_index != BTRFS_RAID_SINGLE) - return false; + ASSERT((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0); /* Check if there is a device with active zones left */ mutex_lock(&fs_devices->device_list_mutex); @@ -1975,3 +2015,21 @@ void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) fs_info->data_reloc_bg = 0; spin_unlock(&fs_info->relocation_bg_lock); } + +void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) +{ + 
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_device *device; + + if (!btrfs_is_zoned(fs_info)) + return; + + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry(device, &fs_devices->devices, dev_list) { + if (device->zone_info) { + vfree(device->zone_info->zone_cache); + device->zone_info->zone_cache = NULL; + } + } + mutex_unlock(&fs_devices->device_list_mutex); +} diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index e53ab7b96437..cbf016a7bb5d 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -8,6 +8,7 @@ #include "volumes.h" #include "disk-io.h" #include "block-group.h" +#include "btrfs_inode.h" /* * Block groups with more than this value (percents) of unusable space will be @@ -28,6 +29,7 @@ struct btrfs_zoned_device_info { unsigned long *seq_zones; unsigned long *empty_zones; unsigned long *active_zones; + struct blk_zone *zone_cache; struct blk_zone sb_zones[2 * BTRFS_SUPER_MIRROR_MAX]; }; @@ -35,7 +37,7 @@ struct btrfs_zoned_device_info { int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone); int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info); -int btrfs_get_dev_zone_info(struct btrfs_device *device); +int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache); void btrfs_destroy_dev_zone_info(struct btrfs_device *device); int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info); int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info); @@ -71,11 +73,11 @@ struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info, u64 logical, u64 length); bool btrfs_zone_activate(struct btrfs_block_group *block_group); int btrfs_zone_finish(struct btrfs_block_group *block_group); -bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, - int raid_index); +bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags); void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length); void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg); +void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info); #else /* CONFIG_BLK_DEV_ZONED */ static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone) @@ -88,7 +90,8 @@ static inline int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_i return 0; } -static inline int btrfs_get_dev_zone_info(struct btrfs_device *device) +static inline int btrfs_get_dev_zone_info(struct btrfs_device *device, + bool populate_cache) { return 0; } @@ -222,7 +225,7 @@ static inline int btrfs_zone_finish(struct btrfs_block_group *block_group) } static inline bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, - int raid_index) + u64 flags) { return true; } @@ -232,6 +235,7 @@ static inline void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, static inline void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) { } +static inline void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) { } #endif static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos) @@ -350,4 +354,20 @@ static inline void btrfs_clear_treelog_bg(struct btrfs_block_group *bg) spin_unlock(&fs_info->treelog_bg_lock); } +static inline void btrfs_zoned_data_reloc_lock(struct btrfs_inode *inode) +{ + struct btrfs_root *root = inode->root; + + if (btrfs_is_data_reloc_root(root) && btrfs_is_zoned(root->fs_info)) + btrfs_inode_lock(&inode->vfs_inode, 0); +} + +static inline void btrfs_zoned_data_reloc_unlock(struct btrfs_inode *inode) +{ + 
struct btrfs_root *root = inode->root; + + if (btrfs_is_data_reloc_root(root) && btrfs_is_zoned(root->fs_info)) + btrfs_inode_unlock(&inode->vfs_inode, 0); +} + #endif diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 8f58fd95efc7..0d729664b4b4 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -182,18 +182,18 @@ FLUSH_STATES TRACE_EVENT(btrfs_transaction_commit, - TP_PROTO(const struct btrfs_root *root), + TP_PROTO(const struct btrfs_fs_info *fs_info), - TP_ARGS(root), + TP_ARGS(fs_info), TP_STRUCT__entry_btrfs( __field( u64, generation ) __field( u64, root_objectid ) ), - TP_fast_assign_btrfs(root->fs_info, - __entry->generation = root->fs_info->generation; - __entry->root_objectid = root->root_key.objectid; + TP_fast_assign_btrfs(fs_info, + __entry->generation = fs_info->generation; + __entry->root_objectid = BTRFS_ROOT_TREE_OBJECTID; ), TP_printk_btrfs("root=%llu(%s) gen=%llu", diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index e1c4c732aaba..5416f1f1a77a 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -146,7 +146,9 @@ /* * dir items are the name -> inode pointers in a directory. There is one - * for every name in a directory. + * for every name in a directory. BTRFS_DIR_LOG_ITEM_KEY is no longer used + * but it's still defined here for documentation purposes and to help avoid + * having its numerical value reused in the future. */ #define BTRFS_DIR_LOG_ITEM_KEY 60 #define BTRFS_DIR_LOG_INDEX_KEY 72 |
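
The zoned-device hunks above add a per-device zone cache that btrfs_get_dev_zones() consults before issuing a fresh report-zones request: the requested range is capped at the device end, served from the cache when every entry is present, and the cache is refilled after a real report. The stand-alone user-space sketch below shows the same check/cap/refill pattern in isolation; it is not kernel code, and helpers such as fake_report_zones() and the fixed NR_ZONES size are invented purely for the example.

/*
 * User-space sketch of the zone cache lookup pattern (invented names,
 * not taken from the patch).  Compile with: cc -std=c99 zone_cache.c
 */
#include <stdio.h>
#include <string.h>

struct zone {                       /* stand-in for struct blk_zone */
	unsigned long long start;
	unsigned long long len;     /* len == 0 means "not cached yet" */
};

#define NR_ZONES 16

static struct zone zone_cache[NR_ZONES];   /* stand-in for zinfo->zone_cache */

/* Pretend to query the device; the kernel calls blkdev_report_zones() here. */
static void fake_report_zones(unsigned int zno, struct zone *zones, unsigned int nr)
{
	for (unsigned int i = 0; i < nr; i++) {
		zones[i].start = (unsigned long long)(zno + i) * 256;
		zones[i].len = 256;
	}
}

static void get_zones(unsigned int zno, struct zone *zones, unsigned int *nr)
{
	unsigned int i;

	/* Never report past the device end, mirroring the *nr_zones cap. */
	if (zno + *nr > NR_ZONES)
		*nr = NR_ZONES - zno;

	/* Cache check: bail out on the first missing entry. */
	for (i = 0; i < *nr; i++)
		if (zone_cache[zno + i].len == 0)
			break;

	if (i == *nr) {                 /* cache hit on all requested zones */
		memcpy(zones, &zone_cache[zno], sizeof(*zones) * *nr);
		return;
	}

	fake_report_zones(zno, zones, *nr);                     /* slow path */
	memcpy(&zone_cache[zno], zones, sizeof(*zones) * *nr);  /* refill    */
}

int main(void)
{
	struct zone buf[4];
	unsigned int nr = 4;

	get_zones(2, buf, &nr);   /* first call: misses, queries the "device" */
	get_zones(2, buf, &nr);   /* second call: served entirely from cache  */
	printf("zone 2 starts at %llu, len %llu\n", buf[0].start, buf[0].len);
	return 0;
}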
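
The seeding changes split the old btrfs_prepare_sprout() into an allocating half (btrfs_init_sprout()) and a lock-holding half (btrfs_setup_sprout()) so that no GFP_KERNEL allocation happens while device_list_mutex is held. The short user-space sketch below mirrors that allocate-then-lock ordering with a pthread mutex; init_sprout() and setup_sprout() are placeholder names for the example, not the kernel functions.

/*
 * User-space sketch of the "allocate before taking the lock" split
 * (invented names).  Compile with: cc sprout_split.c -lpthread
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t device_list_mutex = PTHREAD_MUTEX_INITIALIZER;

struct seed_devices {
	int nr;                 /* placeholder payload */
};

/* Allocation-only half, analogous to btrfs_init_sprout(). */
static struct seed_devices *init_sprout(void)
{
	return calloc(1, sizeof(struct seed_devices));
}

/* Lock-holding half, analogous to btrfs_setup_sprout():
 * only fast, non-sleeping work happens under the mutex. */
static void setup_sprout(struct seed_devices *seed)
{
	seed->nr++;
}

int main(void)
{
	struct seed_devices *seed = init_sprout();   /* allocate first ...     */

	if (!seed)
		return 1;

	pthread_mutex_lock(&device_list_mutex);      /* ... then take the lock */
	setup_sprout(seed);
	pthread_mutex_unlock(&device_list_mutex);

	printf("sprouted %d seed device list(s)\n", seed->nr);
	free(seed);
	return 0;
}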