summaryrefslogtreecommitdiff
path: root/fs/btrfs/inode.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2012-08-29 11:36:22 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2012-08-29 11:36:22 -0700
commit318e15101993c0fdc3f23f24ac61fc7769d27e68 (patch)
tree98e2805502dc83f64c632706aabe06391469df32 /fs/btrfs/inode.c
parenta7ccbcf3307022c48810eebd99aa8dba84f13caf (diff)
parent256dd1bb3750ac5ad49b40887c1691788dc44b33 (diff)
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs
Pull btrfs fixes from Chris Mason: "I've split out the big send/receive update from my last pull request and now have just the fixes in my for-linus branch. The send/recv branch will wander over to linux-next shortly though. The largest patches in this pull are Josef's patches to fix DIO locking problems and his patch to fix a crash during balance. They are both well tested. The rest are smaller fixes that we've had queued. The last rc came out while I was hacking new and exciting ways to recover from a misplaced rm -rf on my dev box, so these missed rc3." * 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs: (25 commits) Btrfs: fix that repair code is spuriously executed for transid failures Btrfs: fix ordered extent leak when failing to start a transaction Btrfs: fix a dio write regression Btrfs: fix deadlock with freeze and sync V2 Btrfs: revert checksum error statistic which can cause a BUG() Btrfs: remove superblock writing after fatal error Btrfs: allow delayed refs to be merged Btrfs: fix enospc problems when deleting a subvol Btrfs: fix wrong mtime and ctime when creating snapshots Btrfs: fix race in run_clustered_refs Btrfs: don't run __tree_mod_log_free_eb on leaves Btrfs: increase the size of the free space cache Btrfs: barrier before waitqueue_active Btrfs: fix deadlock in wait_for_more_refs btrfs: fix second lock in btrfs_delete_delayed_items() Btrfs: don't allocate a seperate csums array for direct reads Btrfs: do not strdup non existent strings Btrfs: do not use missing devices when showing devname Btrfs: fix that error value is changed by mistake Btrfs: lock extents as we map them in DIO ...
Diffstat (limited to 'fs/btrfs/inode.c')
-rw-r--r--fs/btrfs/inode.c326
1 files changed, 164 insertions, 162 deletions
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 6e8f416773d4..ec154f954646 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1008,9 +1008,7 @@ static noinline void async_cow_submit(struct btrfs_work *work)
nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >>
PAGE_CACHE_SHIFT;
- atomic_sub(nr_pages, &root->fs_info->async_delalloc_pages);
-
- if (atomic_read(&root->fs_info->async_delalloc_pages) <
+ if (atomic_sub_return(nr_pages, &root->fs_info->async_delalloc_pages) <
5 * 1024 * 1024 &&
waitqueue_active(&root->fs_info->async_submit_wait))
wake_up(&root->fs_info->async_submit_wait);
@@ -1885,8 +1883,11 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
trans = btrfs_join_transaction_nolock(root);
else
trans = btrfs_join_transaction(root);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ trans = NULL;
+ goto out;
+ }
trans->block_rsv = &root->fs_info->delalloc_block_rsv;
ret = btrfs_update_inode_fallback(trans, root, inode);
if (ret) /* -ENOMEM or corruption */
@@ -3174,7 +3175,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
btrfs_i_size_write(dir, dir->i_size - name_len * 2);
inode_inc_iversion(dir);
dir->i_mtime = dir->i_ctime = CURRENT_TIME;
- ret = btrfs_update_inode(trans, root, dir);
+ ret = btrfs_update_inode_fallback(trans, root, dir);
if (ret)
btrfs_abort_transaction(trans, root, ret);
out:
@@ -5774,18 +5775,112 @@ out:
return ret;
}
+static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
+ struct extent_state **cached_state, int writing)
+{
+ struct btrfs_ordered_extent *ordered;
+ int ret = 0;
+
+ while (1) {
+ lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ 0, cached_state);
+ /*
+ * We're concerned with the entire range that we're going to be
+ * doing DIO to, so we need to make sure theres no ordered
+ * extents in this range.
+ */
+ ordered = btrfs_lookup_ordered_range(inode, lockstart,
+ lockend - lockstart + 1);
+
+ /*
+ * We need to make sure there are no buffered pages in this
+ * range either, we could have raced between the invalidate in
+ * generic_file_direct_write and locking the extent. The
+ * invalidate needs to happen so that reads after a write do not
+ * get stale data.
+ */
+ if (!ordered && (!writing ||
+ !test_range_bit(&BTRFS_I(inode)->io_tree,
+ lockstart, lockend, EXTENT_UPTODATE, 0,
+ *cached_state)))
+ break;
+
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ cached_state, GFP_NOFS);
+
+ if (ordered) {
+ btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_put_ordered_extent(ordered);
+ } else {
+ /* Screw you mmap */
+ ret = filemap_write_and_wait_range(inode->i_mapping,
+ lockstart,
+ lockend);
+ if (ret)
+ break;
+
+ /*
+ * If we found a page that couldn't be invalidated just
+ * fall back to buffered.
+ */
+ ret = invalidate_inode_pages2_range(inode->i_mapping,
+ lockstart >> PAGE_CACHE_SHIFT,
+ lockend >> PAGE_CACHE_SHIFT);
+ if (ret)
+ break;
+ }
+
+ cond_resched();
+ }
+
+ return ret;
+}
+
static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
{
struct extent_map *em;
struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct extent_state *cached_state = NULL;
u64 start = iblock << inode->i_blkbits;
+ u64 lockstart, lockend;
u64 len = bh_result->b_size;
struct btrfs_trans_handle *trans;
+ int unlock_bits = EXTENT_LOCKED;
+ int ret;
+
+ if (create) {
+ ret = btrfs_delalloc_reserve_space(inode, len);
+ if (ret)
+ return ret;
+ unlock_bits |= EXTENT_DELALLOC | EXTENT_DIRTY;
+ } else {
+ len = min_t(u64, len, root->sectorsize);
+ }
+
+ lockstart = start;
+ lockend = start + len - 1;
+
+ /*
+ * If this errors out it's because we couldn't invalidate pagecache for
+ * this range and we need to fallback to buffered.
+ */
+ if (lock_extent_direct(inode, lockstart, lockend, &cached_state, create))
+ return -ENOTBLK;
+
+ if (create) {
+ ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
+ lockend, EXTENT_DELALLOC, NULL,
+ &cached_state, GFP_NOFS);
+ if (ret)
+ goto unlock_err;
+ }
em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
- if (IS_ERR(em))
- return PTR_ERR(em);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ goto unlock_err;
+ }
/*
* Ok for INLINE and COMPRESSED extents we need to fallback on buffered
@@ -5804,17 +5899,16 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
em->block_start == EXTENT_MAP_INLINE) {
free_extent_map(em);
- return -ENOTBLK;
+ ret = -ENOTBLK;
+ goto unlock_err;
}
/* Just a good old fashioned hole, return */
if (!create && (em->block_start == EXTENT_MAP_HOLE ||
test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
free_extent_map(em);
- /* DIO will do one hole at a time, so just unlock a sector */
- unlock_extent(&BTRFS_I(inode)->io_tree, start,
- start + root->sectorsize - 1);
- return 0;
+ ret = 0;
+ goto unlock_err;
}
/*
@@ -5827,8 +5921,9 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
*
*/
if (!create) {
- len = em->len - (start - em->start);
- goto map;
+ len = min(len, em->len - (start - em->start));
+ lockstart = start + len;
+ goto unlock;
}
if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
@@ -5860,7 +5955,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
btrfs_end_transaction(trans, root);
if (ret) {
free_extent_map(em);
- return ret;
+ goto unlock_err;
}
goto unlock;
}
@@ -5873,14 +5968,12 @@ must_cow:
*/
len = bh_result->b_size;
em = btrfs_new_extent_direct(inode, em, start, len);
- if (IS_ERR(em))
- return PTR_ERR(em);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ goto unlock_err;
+ }
len = min(len, em->len - (start - em->start));
unlock:
- clear_extent_bit(&BTRFS_I(inode)->io_tree, start, start + len - 1,
- EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DIRTY, 1,
- 0, NULL, GFP_NOFS);
-map:
bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
inode->i_blkbits;
bh_result->b_size = len;
@@ -5898,9 +5991,44 @@ map:
i_size_write(inode, start + len);
}
+ /*
+ * In the case of write we need to clear and unlock the entire range,
+ * in the case of read we need to unlock only the end area that we
+ * aren't using if there is any left over space.
+ */
+ if (lockstart < lockend) {
+ if (create && len < lockend - lockstart) {
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
+ lockstart + len - 1, unlock_bits, 1, 0,
+ &cached_state, GFP_NOFS);
+ /*
+ * Beside unlock, we also need to cleanup reserved space
+ * for the left range by attaching EXTENT_DO_ACCOUNTING.
+ */
+ clear_extent_bit(&BTRFS_I(inode)->io_tree,
+ lockstart + len, lockend,
+ unlock_bits | EXTENT_DO_ACCOUNTING,
+ 1, 0, NULL, GFP_NOFS);
+ } else {
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
+ lockend, unlock_bits, 1, 0,
+ &cached_state, GFP_NOFS);
+ }
+ } else {
+ free_extent_state(cached_state);
+ }
+
free_extent_map(em);
return 0;
+
+unlock_err:
+ if (create)
+ unlock_bits |= EXTENT_DO_ACCOUNTING;
+
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ unlock_bits, 1, 0, &cached_state, GFP_NOFS);
+ return ret;
}
struct btrfs_dio_private {
@@ -5908,7 +6036,6 @@ struct btrfs_dio_private {
u64 logical_offset;
u64 disk_bytenr;
u64 bytes;
- u32 *csums;
void *private;
/* number of bios pending for this dio */
@@ -5928,7 +6055,6 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
struct inode *inode = dip->inode;
struct btrfs_root *root = BTRFS_I(inode)->root;
u64 start;
- u32 *private = dip->csums;
start = dip->logical_offset;
do {
@@ -5936,8 +6062,12 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
struct page *page = bvec->bv_page;
char *kaddr;
u32 csum = ~(u32)0;
+ u64 private = ~(u32)0;
unsigned long flags;
+ if (get_state_private(&BTRFS_I(inode)->io_tree,
+ start, &private))
+ goto failed;
local_irq_save(flags);
kaddr = kmap_atomic(page);
csum = btrfs_csum_data(root, kaddr + bvec->bv_offset,
@@ -5947,18 +6077,18 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
local_irq_restore(flags);
flush_dcache_page(bvec->bv_page);
- if (csum != *private) {
+ if (csum != private) {
+failed:
printk(KERN_ERR "btrfs csum failed ino %llu off"
" %llu csum %u private %u\n",
(unsigned long long)btrfs_ino(inode),
(unsigned long long)start,
- csum, *private);
+ csum, (unsigned)private);
err = -EIO;
}
}
start += bvec->bv_len;
- private++;
bvec++;
} while (bvec <= bvec_end);
@@ -5966,7 +6096,6 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
dip->logical_offset + dip->bytes - 1);
bio->bi_private = dip->private;
- kfree(dip->csums);
kfree(dip);
/* If we had a csum failure make sure to clear the uptodate flag */
@@ -6072,7 +6201,7 @@ static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev,
static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
int rw, u64 file_offset, int skip_sum,
- u32 *csums, int async_submit)
+ int async_submit)
{
int write = rw & REQ_WRITE;
struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -6105,8 +6234,7 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
if (ret)
goto err;
} else if (!skip_sum) {
- ret = btrfs_lookup_bio_sums_dio(root, inode, bio,
- file_offset, csums);
+ ret = btrfs_lookup_bio_sums_dio(root, inode, bio, file_offset);
if (ret)
goto err;
}
@@ -6132,10 +6260,8 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
u64 submit_len = 0;
u64 map_length;
int nr_pages = 0;
- u32 *csums = dip->csums;
int ret = 0;
int async_submit = 0;
- int write = rw & REQ_WRITE;
map_length = orig_bio->bi_size;
ret = btrfs_map_block(map_tree, READ, start_sector << 9,
@@ -6171,16 +6297,13 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
atomic_inc(&dip->pending_bios);
ret = __btrfs_submit_dio_bio(bio, inode, rw,
file_offset, skip_sum,
- csums, async_submit);
+ async_submit);
if (ret) {
bio_put(bio);
atomic_dec(&dip->pending_bios);
goto out_err;
}
- /* Write's use the ordered csums */
- if (!write && !skip_sum)
- csums = csums + nr_pages;
start_sector += submit_len >> 9;
file_offset += submit_len;
@@ -6210,7 +6333,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
submit:
ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum,
- csums, async_submit);
+ async_submit);
if (!ret)
return 0;
@@ -6246,17 +6369,6 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
ret = -ENOMEM;
goto free_ordered;
}
- dip->csums = NULL;
-
- /* Write's use the ordered csum stuff, so we don't need dip->csums */
- if (!write && !skip_sum) {
- dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS);
- if (!dip->csums) {
- kfree(dip);
- ret = -ENOMEM;
- goto free_ordered;
- }
- }
dip->private = bio->bi_private;
dip->inode = inode;
@@ -6341,132 +6453,22 @@ static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *io
out:
return retval;
}
+
static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
const struct iovec *iov, loff_t offset,
unsigned long nr_segs)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
- struct btrfs_ordered_extent *ordered;
- struct extent_state *cached_state = NULL;
- u64 lockstart, lockend;
- ssize_t ret;
- int writing = rw & WRITE;
- int write_bits = 0;
- size_t count = iov_length(iov, nr_segs);
if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov,
- offset, nr_segs)) {
+ offset, nr_segs))
return 0;
- }
-
- lockstart = offset;
- lockend = offset + count - 1;
-
- if (writing) {
- ret = btrfs_delalloc_reserve_space(inode, count);
- if (ret)
- goto out;
- }
-
- while (1) {
- lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- 0, &cached_state);
- /*
- * We're concerned with the entire range that we're going to be
- * doing DIO to, so we need to make sure theres no ordered
- * extents in this range.
- */
- ordered = btrfs_lookup_ordered_range(inode, lockstart,
- lockend - lockstart + 1);
-
- /*
- * We need to make sure there are no buffered pages in this
- * range either, we could have raced between the invalidate in
- * generic_file_direct_write and locking the extent. The
- * invalidate needs to happen so that reads after a write do not
- * get stale data.
- */
- if (!ordered && (!writing ||
- !test_range_bit(&BTRFS_I(inode)->io_tree,
- lockstart, lockend, EXTENT_UPTODATE, 0,
- cached_state)))
- break;
-
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- &cached_state, GFP_NOFS);
-
- if (ordered) {
- btrfs_start_ordered_extent(inode, ordered, 1);
- btrfs_put_ordered_extent(ordered);
- } else {
- /* Screw you mmap */
- ret = filemap_write_and_wait_range(file->f_mapping,
- lockstart,
- lockend);
- if (ret)
- goto out;
-
- /*
- * If we found a page that couldn't be invalidated just
- * fall back to buffered.
- */
- ret = invalidate_inode_pages2_range(file->f_mapping,
- lockstart >> PAGE_CACHE_SHIFT,
- lockend >> PAGE_CACHE_SHIFT);
- if (ret) {
- if (ret == -EBUSY)
- ret = 0;
- goto out;
- }
- }
-
- cond_resched();
- }
- /*
- * we don't use btrfs_set_extent_delalloc because we don't want
- * the dirty or uptodate bits
- */
- if (writing) {
- write_bits = EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING;
- ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- EXTENT_DELALLOC, NULL, &cached_state,
- GFP_NOFS);
- if (ret) {
- clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
- lockend, EXTENT_LOCKED | write_bits,
- 1, 0, &cached_state, GFP_NOFS);
- goto out;
- }
- }
-
- free_extent_state(cached_state);
- cached_state = NULL;
-
- ret = __blockdev_direct_IO(rw, iocb, inode,
+ return __blockdev_direct_IO(rw, iocb, inode,
BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
btrfs_submit_direct, 0);
-
- if (ret < 0 && ret != -EIOCBQUEUED) {
- clear_extent_bit(&BTRFS_I(inode)->io_tree, offset,
- offset + iov_length(iov, nr_segs) - 1,
- EXTENT_LOCKED | write_bits, 1, 0,
- &cached_state, GFP_NOFS);
- } else if (ret >= 0 && ret < iov_length(iov, nr_segs)) {
- /*
- * We're falling back to buffered, unlock the section we didn't
- * do IO on.
- */
- clear_extent_bit(&BTRFS_I(inode)->io_tree, offset + ret,
- offset + iov_length(iov, nr_segs) - 1,
- EXTENT_LOCKED | write_bits, 1, 0,
- &cached_state, GFP_NOFS);
- }
-out:
- free_extent_state(cached_state);
- return ret;
}
static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,