summaryrefslogtreecommitdiff
path: root/fs/btrfs/ioctl.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2022-05-24 18:52:35 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2022-05-24 18:52:35 -0700
commitbd1b7c1384ec15294ee45bf3add7b7036e146dad (patch)
tree5b8efc004782d52f8697b2831bdcce9c9a884988 /fs/btrfs/ioctl.c
parent3842007b1a33589d57f67eac479b132b77767514 (diff)
parent0a05fafe9def0d9f0fbef3dfc8094925af9e3185 (diff)
Merge tag 'for-5.19-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
Pull btrfs updates from David Sterba: "Features: - subpage: - support for PAGE_SIZE > 4K (previously only 64K) - make it work with raid56 - repair super block num_devices automatically if it does not match the number of device items - defrag can convert inline extents to regular extents, up to now inline files were skipped but the setting of mount option max_inline could affect the decision logic - zoned: - minimal accepted zone size is explicitly set to 4MiB - make zone reclaim less aggressive and don't reclaim if there are enough free zones - add per-profile sysfs tunable of the reclaim threshold - allow automatic block group reclaim for non-zoned filesystems, with sysfs tunables - tree-checker: new check, compare extent buffer owner against owner rootid Performance: - avoid blocking on space reservation when doing nowait direct io writes (+7% throughput for reads and writes) - NOCOW write throughput improvement due to refined locking (+3%) - send: reduce pressure to page cache by dropping extent pages right after they're processed Core: - convert all radix trees to xarray - add iterators for b-tree node items - support printk message index - user bulk page allocation for extent buffers - switch to bio_alloc API, use on-stack bios where convenient, other bio cleanups - use rw lock for block groups to favor concurrent reads - simplify workques, don't allocate high priority threads for all normal queues as we need only one - refactor scrub, process chunks based on their constraints and similarity - allocate direct io structures on stack and pass around only pointers, avoids allocation and reduces potential error handling Fixes: - fix count of reserved transaction items for various inode operations - fix deadlock between concurrent dio writes when low on free data space - fix a few cases when zones need to be finished VFS, iomap: - add helper to check if sb write has started (usable for assertions) - new helper iomap_dio_alloc_bio, export iomap_dio_bio_end_io" * tag 'for-5.19-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (173 commits) btrfs: zoned: introduce a minimal zone size 4M and reject mount btrfs: allow defrag to convert inline extents to regular extents btrfs: add "0x" prefix for unsupported optional features btrfs: do not account twice for inode ref when reserving metadata units btrfs: zoned: fix comparison of alloc_offset vs meta_write_pointer btrfs: send: avoid trashing the page cache btrfs: send: keep the current inode open while processing it btrfs: allocate the btrfs_dio_private as part of the iomap dio bio btrfs: move struct btrfs_dio_private to inode.c btrfs: remove the disk_bytenr in struct btrfs_dio_private btrfs: allocate dio_data on stack iomap: add per-iomap_iter private data iomap: allow the file system to provide a bio_set for direct I/O btrfs: add a btrfs_dio_rw wrapper btrfs: zoned: zone finish unused block group btrfs: zoned: properly finish block group on metadata write btrfs: zoned: finish block group when there are no more allocatable bytes left btrfs: zoned: consolidate zone finish functions btrfs: zoned: introduce btrfs_zoned_bg_is_full btrfs: improve error reporting in lookup_inline_extent_backref ...
Diffstat (limited to 'fs/btrfs/ioctl.c')
-rw-r--r--fs/btrfs/ioctl.c268
1 files changed, 136 insertions, 132 deletions
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index b2c692b2fd8d..43b6f23bbd89 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -540,9 +540,35 @@ int __pure btrfs_is_empty_uuid(u8 *uuid)
return 1;
}
+/*
+ * Calculate the number of transaction items to reserve for creating a subvolume
+ * or snapshot, not including the inode, directory entries, or parent directory.
+ */
+static unsigned int create_subvol_num_items(struct btrfs_qgroup_inherit *inherit)
+{
+ /*
+ * 1 to add root block
+ * 1 to add root item
+ * 1 to add root ref
+ * 1 to add root backref
+ * 1 to add UUID item
+ * 1 to add qgroup info
+ * 1 to add qgroup limit
+ *
+ * Ideally the last two would only be accounted if qgroups are enabled,
+ * but that can change between now and the time we would insert them.
+ */
+ unsigned int num_items = 7;
+
+ if (inherit) {
+ /* 2 to add qgroup relations for each inherited qgroup */
+ num_items += 2 * inherit->num_qgroups;
+ }
+ return num_items;
+}
+
static noinline int create_subvol(struct user_namespace *mnt_userns,
struct inode *dir, struct dentry *dentry,
- const char *name, int namelen,
struct btrfs_qgroup_inherit *inherit)
{
struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
@@ -555,11 +581,15 @@ static noinline int create_subvol(struct user_namespace *mnt_userns,
struct btrfs_root *new_root;
struct btrfs_block_rsv block_rsv;
struct timespec64 cur_time = current_time(dir);
- struct inode *inode;
+ struct btrfs_new_inode_args new_inode_args = {
+ .dir = dir,
+ .dentry = dentry,
+ .subvol = true,
+ };
+ unsigned int trans_num_items;
int ret;
- dev_t anon_dev = 0;
+ dev_t anon_dev;
u64 objectid;
- u64 index = 0;
root_item = kzalloc(sizeof(*root_item), GFP_KERNEL);
if (!root_item)
@@ -567,11 +597,7 @@ static noinline int create_subvol(struct user_namespace *mnt_userns,
ret = btrfs_get_free_objectid(fs_info->tree_root, &objectid);
if (ret)
- goto fail_free;
-
- ret = get_anon_bdev(&anon_dev);
- if (ret < 0)
- goto fail_free;
+ goto out_root_item;
/*
* Don't create subvolume whose level is not zero. Or qgroup will be
@@ -579,36 +605,47 @@ static noinline int create_subvol(struct user_namespace *mnt_userns,
*/
if (btrfs_qgroup_level(objectid)) {
ret = -ENOSPC;
- goto fail_free;
+ goto out_root_item;
}
+ ret = get_anon_bdev(&anon_dev);
+ if (ret < 0)
+ goto out_root_item;
+
+ new_inode_args.inode = btrfs_new_subvol_inode(mnt_userns, dir);
+ if (!new_inode_args.inode) {
+ ret = -ENOMEM;
+ goto out_anon_dev;
+ }
+ ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
+ if (ret)
+ goto out_inode;
+ trans_num_items += create_subvol_num_items(inherit);
+
btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
- /*
- * The same as the snapshot creation, please see the comment
- * of create_snapshot().
- */
- ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 8, false);
+ ret = btrfs_subvolume_reserve_metadata(root, &block_rsv,
+ trans_num_items, false);
if (ret)
- goto fail_free;
+ goto out_new_inode_args;
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
btrfs_subvolume_release_metadata(root, &block_rsv);
- goto fail_free;
+ goto out_new_inode_args;
}
trans->block_rsv = &block_rsv;
trans->bytes_reserved = block_rsv.size;
ret = btrfs_qgroup_inherit(trans, 0, objectid, inherit);
if (ret)
- goto fail;
+ goto out;
leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0,
BTRFS_NESTING_NORMAL);
if (IS_ERR(leaf)) {
ret = PTR_ERR(leaf);
- goto fail;
+ goto out;
}
btrfs_mark_buffer_dirty(leaf);
@@ -663,75 +700,46 @@ static noinline int create_subvol(struct user_namespace *mnt_userns,
btrfs_tree_unlock(leaf);
btrfs_free_tree_block(trans, objectid, leaf, 0, 1);
free_extent_buffer(leaf);
- goto fail;
+ goto out;
}
free_extent_buffer(leaf);
leaf = NULL;
- key.offset = (u64)-1;
new_root = btrfs_get_new_fs_root(fs_info, objectid, anon_dev);
if (IS_ERR(new_root)) {
- free_anon_bdev(anon_dev);
ret = PTR_ERR(new_root);
btrfs_abort_transaction(trans, ret);
- goto fail;
+ goto out;
}
- /* Freeing will be done in btrfs_put_root() of new_root */
+ /* anon_dev is owned by new_root now. */
anon_dev = 0;
+ BTRFS_I(new_inode_args.inode)->root = new_root;
+ /* ... and new_root is owned by new_inode_args.inode now. */
ret = btrfs_record_root_in_trans(trans, new_root);
if (ret) {
- btrfs_put_root(new_root);
- btrfs_abort_transaction(trans, ret);
- goto fail;
- }
-
- ret = btrfs_create_subvol_root(trans, new_root, root, mnt_userns);
- btrfs_put_root(new_root);
- if (ret) {
- /* We potentially lose an unused inode item here */
btrfs_abort_transaction(trans, ret);
- goto fail;
- }
-
- /*
- * insert the directory item
- */
- ret = btrfs_set_inode_index(BTRFS_I(dir), &index);
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- goto fail;
- }
-
- ret = btrfs_insert_dir_item(trans, name, namelen, BTRFS_I(dir), &key,
- BTRFS_FT_DIR, index);
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- goto fail;
+ goto out;
}
- btrfs_i_size_write(BTRFS_I(dir), dir->i_size + namelen * 2);
- ret = btrfs_update_inode(trans, root, BTRFS_I(dir));
+ ret = btrfs_uuid_tree_add(trans, root_item->uuid,
+ BTRFS_UUID_KEY_SUBVOL, objectid);
if (ret) {
btrfs_abort_transaction(trans, ret);
- goto fail;
+ goto out;
}
- ret = btrfs_add_root_ref(trans, objectid, root->root_key.objectid,
- btrfs_ino(BTRFS_I(dir)), index, name, namelen);
+ ret = btrfs_create_new_inode(trans, &new_inode_args);
if (ret) {
btrfs_abort_transaction(trans, ret);
- goto fail;
+ goto out;
}
- ret = btrfs_uuid_tree_add(trans, root_item->uuid,
- BTRFS_UUID_KEY_SUBVOL, objectid);
- if (ret)
- btrfs_abort_transaction(trans, ret);
+ d_instantiate_new(dentry, new_inode_args.inode);
+ new_inode_args.inode = NULL;
-fail:
- kfree(root_item);
+out:
trans->block_rsv = NULL;
trans->bytes_reserved = 0;
btrfs_subvolume_release_metadata(root, &block_rsv);
@@ -740,18 +748,14 @@ fail:
btrfs_end_transaction(trans);
else
ret = btrfs_commit_transaction(trans);
-
- if (!ret) {
- inode = btrfs_lookup_dentry(dir, dentry);
- if (IS_ERR(inode))
- return PTR_ERR(inode);
- d_instantiate(dentry, inode);
- }
- return ret;
-
-fail_free:
+out_new_inode_args:
+ btrfs_new_inode_args_destroy(&new_inode_args);
+out_inode:
+ iput(new_inode_args.inode);
+out_anon_dev:
if (anon_dev)
free_anon_bdev(anon_dev);
+out_root_item:
kfree(root_item);
return ret;
}
@@ -763,6 +767,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
struct inode *inode;
struct btrfs_pending_snapshot *pending_snapshot;
+ unsigned int trans_num_items;
struct btrfs_trans_handle *trans;
int ret;
@@ -800,16 +805,14 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
btrfs_init_block_rsv(&pending_snapshot->block_rsv,
BTRFS_BLOCK_RSV_TEMP);
/*
- * 1 - parent dir inode
- * 2 - dir entries
- * 1 - root item
- * 2 - root ref/backref
- * 1 - root of snapshot
- * 1 - UUID item
+ * 1 to add dir item
+ * 1 to add dir index
+ * 1 to update parent inode item
*/
+ trans_num_items = create_subvol_num_items(inherit) + 3;
ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root,
- &pending_snapshot->block_rsv, 8,
- false);
+ &pending_snapshot->block_rsv,
+ trans_num_items, false);
if (ret)
goto free_pending;
@@ -979,7 +982,7 @@ static noinline int btrfs_mksubvol(const struct path *parent,
if (snap_src)
error = create_snapshot(snap_src, dir, dentry, readonly, inherit);
else
- error = create_subvol(mnt_userns, dir, dentry, name, namelen, inherit);
+ error = create_subvol(mnt_userns, dir, dentry, inherit);
if (!error)
fsnotify_mkdir(dir, dentry);
@@ -1413,8 +1416,19 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
if (!em)
break;
- /* Skip hole/inline/preallocated extents */
- if (em->block_start >= EXTENT_MAP_LAST_BYTE ||
+ /*
+ * If the file extent is an inlined one, we may still want to
+ * defrag it (fallthrough) if it will cause a regular extent.
+ * This is for users who want to convert inline extents to
+ * regular ones through max_inline= mount option.
+ */
+ if (em->block_start == EXTENT_MAP_INLINE &&
+ em->len <= inode->root->fs_info->max_inline)
+ goto next;
+
+ /* Skip hole/delalloc/preallocated extents */
+ if (em->block_start == EXTENT_MAP_HOLE ||
+ em->block_start == EXTENT_MAP_DELALLOC ||
test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
goto next;
@@ -1473,6 +1487,15 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
if (em->len >= get_extent_max_capacity(em))
goto next;
+ /*
+ * Normally there are no more extents after an inline one, thus
+ * @next_mergeable will normally be false and not defragged.
+ * So if an inline extent passed all above checks, just add it
+ * for defrag, and be converted to regular extents.
+ */
+ if (em->block_start == EXTENT_MAP_INLINE)
+ goto add;
+
next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em,
extent_thresh, newer_than, locked);
if (!next_mergeable) {
@@ -2594,7 +2617,7 @@ err:
static noinline int btrfs_ioctl_tree_search(struct inode *inode,
void __user *argp)
{
- struct btrfs_ioctl_search_args __user *uargs;
+ struct btrfs_ioctl_search_args __user *uargs = argp;
struct btrfs_ioctl_search_key sk;
int ret;
size_t buf_size;
@@ -2602,8 +2625,6 @@ static noinline int btrfs_ioctl_tree_search(struct inode *inode,
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- uargs = (struct btrfs_ioctl_search_args __user *)argp;
-
if (copy_from_user(&sk, &uargs->key, sizeof(sk)))
return -EFAULT;
@@ -2626,7 +2647,7 @@ static noinline int btrfs_ioctl_tree_search(struct inode *inode,
static noinline int btrfs_ioctl_tree_search_v2(struct inode *inode,
void __user *argp)
{
- struct btrfs_ioctl_search_args_v2 __user *uarg;
+ struct btrfs_ioctl_search_args_v2 __user *uarg = argp;
struct btrfs_ioctl_search_args_v2 args;
int ret;
size_t buf_size;
@@ -2636,7 +2657,6 @@ static noinline int btrfs_ioctl_tree_search_v2(struct inode *inode,
return -EPERM;
/* copy search header and buffer size */
- uarg = (struct btrfs_ioctl_search_args_v2 __user *)argp;
if (copy_from_user(&args, uarg, sizeof(args)))
return -EFAULT;
@@ -4344,10 +4364,6 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
bool need_unlock; /* for mut. excl. ops lock */
int ret;
- if (!arg)
- btrfs_warn(fs_info,
- "IOC_BALANCE ioctl (v1) is deprecated and will be removed in kernel 5.18");
-
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -4355,6 +4371,13 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
if (ret)
return ret;
+ bargs = memdup_user(arg, sizeof(*bargs));
+ if (IS_ERR(bargs)) {
+ ret = PTR_ERR(bargs);
+ bargs = NULL;
+ goto out;
+ }
+
again:
if (btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
mutex_lock(&fs_info->balance_mutex);
@@ -4402,59 +4425,42 @@ again:
}
locked:
-
- if (arg) {
- bargs = memdup_user(arg, sizeof(*bargs));
- if (IS_ERR(bargs)) {
- ret = PTR_ERR(bargs);
+ if (bargs->flags & BTRFS_BALANCE_RESUME) {
+ if (!fs_info->balance_ctl) {
+ ret = -ENOTCONN;
goto out_unlock;
}
- if (bargs->flags & BTRFS_BALANCE_RESUME) {
- if (!fs_info->balance_ctl) {
- ret = -ENOTCONN;
- goto out_bargs;
- }
+ bctl = fs_info->balance_ctl;
+ spin_lock(&fs_info->balance_lock);
+ bctl->flags |= BTRFS_BALANCE_RESUME;
+ spin_unlock(&fs_info->balance_lock);
+ btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE);
- bctl = fs_info->balance_ctl;
- spin_lock(&fs_info->balance_lock);
- bctl->flags |= BTRFS_BALANCE_RESUME;
- spin_unlock(&fs_info->balance_lock);
- btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE);
+ goto do_balance;
+ }
- goto do_balance;
- }
- } else {
- bargs = NULL;
+ if (bargs->flags & ~(BTRFS_BALANCE_ARGS_MASK | BTRFS_BALANCE_TYPE_MASK)) {
+ ret = -EINVAL;
+ goto out_unlock;
}
if (fs_info->balance_ctl) {
ret = -EINPROGRESS;
- goto out_bargs;
+ goto out_unlock;
}
bctl = kzalloc(sizeof(*bctl), GFP_KERNEL);
if (!bctl) {
ret = -ENOMEM;
- goto out_bargs;
- }
-
- if (arg) {
- memcpy(&bctl->data, &bargs->data, sizeof(bctl->data));
- memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta));
- memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys));
-
- bctl->flags = bargs->flags;
- } else {
- /* balance everything - no filters */
- bctl->flags |= BTRFS_BALANCE_TYPE_MASK;
+ goto out_unlock;
}
- if (bctl->flags & ~(BTRFS_BALANCE_ARGS_MASK | BTRFS_BALANCE_TYPE_MASK)) {
- ret = -EINVAL;
- goto out_bctl;
- }
+ memcpy(&bctl->data, &bargs->data, sizeof(bctl->data));
+ memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta));
+ memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys));
+ bctl->flags = bargs->flags;
do_balance:
/*
* Ownership of bctl and exclusive operation goes to btrfs_balance.
@@ -4467,21 +4473,19 @@ do_balance:
ret = btrfs_balance(fs_info, bctl, bargs);
bctl = NULL;
- if ((ret == 0 || ret == -ECANCELED) && arg) {
+ if (ret == 0 || ret == -ECANCELED) {
if (copy_to_user(arg, bargs, sizeof(*bargs)))
ret = -EFAULT;
}
-out_bctl:
kfree(bctl);
-out_bargs:
- kfree(bargs);
out_unlock:
mutex_unlock(&fs_info->balance_mutex);
if (need_unlock)
btrfs_exclop_finish(fs_info);
out:
mnt_drop_write_file(file);
+ kfree(bargs);
return ret;
}