From 848cce0d4102b5b4b26b0987b43e1919d462afe2 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 21 Feb 2012 17:04:28 +0800
Subject: Btrfs: avoid setting ->d_op twice

Follow those instructions, and you'll trigger a warning in the
beginning of d_set_d_op():

  # mkfs.btrfs /dev/loop3
  # mount /dev/loop3 /mnt
  # btrfs sub create /mnt/sub
  # btrfs sub snap /mnt /mnt/snap
  # touch /mnt/snap/sub
  touch: cannot touch `tmp': Permission denied

__d_alloc() set d_op to sb->s_d_op (btrfs_dentry_operations), and
then simple_lookup() reset it to simple_dentry_operations, which
triggered the warning.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 fs/btrfs/inode.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1be31368e881..a682c267576d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4069,7 +4069,7 @@ static struct inode *new_simple_dir(struct super_block *s,
 	BTRFS_I(inode)->dummy_inode = 1;
 
 	inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
-	inode->i_op = &simple_dir_inode_operations;
+	inode->i_op = &btrfs_dir_ro_inode_operations;
 	inode->i_fop = &simple_dir_operations;
 	inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
@@ -4140,14 +4140,18 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
 static int btrfs_dentry_delete(const struct dentry *dentry)
 {
 	struct btrfs_root *root;
+	struct inode *inode = dentry->d_inode;
 
-	if (!dentry->d_inode && !IS_ROOT(dentry))
-		dentry = dentry->d_parent;
+	if (!inode && !IS_ROOT(dentry))
+		inode = dentry->d_parent->d_inode;
 
-	if (dentry->d_inode) {
-		root = BTRFS_I(dentry->d_inode)->root;
+	if (inode) {
+		root = BTRFS_I(inode)->root;
 		if (btrfs_root_refs(&root->root_item) == 0)
 			return 1;
+
+		if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
+			return 1;
 	}
 	return 0;
 }
-- 
cgit v1.2.3


From 8c9c2bf7a3c4f7e9d158c0be9c49f372fb943ad2 Mon Sep 17 00:00:00 2001
From: Arne Jansen <sensille@gmx.net>
Date: Sat, 25 Feb 2012 09:09:30 +0100
Subject: btrfs: fix race in reada

When inserting into the radix tree returns EEXIST, get the existing
entry without giving up the spinlock in between.
There was a race for both the zones trees and the extent tree.

Signed-off-by: Arne Jansen <sensille@gmx.net>
---
 fs/btrfs/inode.c |  8 +++++++-
 fs/btrfs/reada.c | 35 ++++++++++++++++-------------------
 2 files changed, 23 insertions(+), 20 deletions(-)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a682c267576d..98ee5a51aa29 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4332,7 +4332,13 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
 			}
 no_dentry:
 			/* is this a reference to our own snapshot? If so
-			 * skip it
+			 * skip it.
+			 *
+			 * In contrast to old kernels, we insert the snapshot's
+			 * dir item and dir index after it has been created, so
+			 * we won't find a reference to our own snapshot. We
+			 * still keep the following code for backward
+			 * compatibility.
 			 */
 			if (location.type == BTRFS_ROOT_ITEM_KEY &&
 			    location.objectid == root->root_key.objectid) {
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index dc5d33146fdb..8dec650099c8 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -250,14 +250,12 @@ static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info,
 					  struct btrfs_bio *bbio)
 {
 	int ret;
-	int looped = 0;
 	struct reada_zone *zone;
 	struct btrfs_block_group_cache *cache = NULL;
 	u64 start;
 	u64 end;
 	int i;
 
-again:
 	zone = NULL;
 	spin_lock(&fs_info->reada_lock);
 	ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
@@ -274,9 +272,6 @@ again:
 		spin_unlock(&fs_info->reada_lock);
 	}
 
-	if (looped)
-		return NULL;
-
 	cache = btrfs_lookup_block_group(fs_info, logical);
 	if (!cache)
 		return NULL;
@@ -307,13 +302,15 @@ again:
 	ret = radix_tree_insert(&dev->reada_zones,
 				(unsigned long)(zone->end >> PAGE_CACHE_SHIFT),
 				zone);
-	spin_unlock(&fs_info->reada_lock);
 
-	if (ret) {
+	if (ret == -EEXIST) {
 		kfree(zone);
-		looped = 1;
-		goto again;
+		ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
+					     logical >> PAGE_CACHE_SHIFT, 1);
+		if (ret == 1)
+			kref_get(&zone->refcnt);
 	}
+	spin_unlock(&fs_info->reada_lock);
 
 	return zone;
 }
@@ -323,8 +320,8 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
 					      struct btrfs_key *top, int level)
 {
 	int ret;
-	int looped = 0;
 	struct reada_extent *re = NULL;
+	struct reada_extent *re_exist = NULL;
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
 	struct btrfs_bio *bbio = NULL;
@@ -335,14 +332,13 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
 	int i;
 	unsigned long index = logical >> PAGE_CACHE_SHIFT;
 
-again:
 	spin_lock(&fs_info->reada_lock);
 	re = radix_tree_lookup(&fs_info->reada_tree, index);
 	if (re)
 		kref_get(&re->refcnt);
 	spin_unlock(&fs_info->reada_lock);
 
-	if (re || looped)
+	if (re)
 		return re;
 
 	re = kzalloc(sizeof(*re), GFP_NOFS);
@@ -398,12 +394,15 @@ again:
 	/* insert extent in reada_tree + all per-device trees, all or nothing */
 	spin_lock(&fs_info->reada_lock);
 	ret = radix_tree_insert(&fs_info->reada_tree, index, re);
+	if (ret == -EEXIST) {
+		re_exist = radix_tree_lookup(&fs_info->reada_tree, index);
+		BUG_ON(!re_exist);
+		kref_get(&re_exist->refcnt);
+		spin_unlock(&fs_info->reada_lock);
+		goto error;
+	}
 	if (ret) {
 		spin_unlock(&fs_info->reada_lock);
-		if (ret != -ENOMEM) {
-			/* someone inserted the extent in the meantime */
-			looped = 1;
-		}
 		goto error;
 	}
 	for (i = 0; i < nzones; ++i) {
@@ -450,9 +449,7 @@ error:
 	}
 	kfree(bbio);
 	kfree(re);
-	if (looped)
-		goto again;
-	return NULL;
+	return re_exist;
 }
 
 static void reada_kref_dummy(struct kref *kr)
-- 
cgit v1.2.3


From 5cf1ab56133ad7b712673c071b439d4a555a2d1e Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Mon, 16 Apr 2012 09:42:26 -0400
Subject: Btrfs: always store the mirror we read the eb from

A user reported a panic where we were trying to fix a bad mirror but the
mirror number we were giving was 0, which is invalid.  This is because we
don't do the transid verification until after the read, so as far as the
read code is concerned the read was a success.  So instead store the mirror
we read from so that if there is some failure post read we know which mirror
to try next and which mirror needs to be fixed if we find a good copy of the
block.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/disk-io.c   | 16 ++++++++--------
 fs/btrfs/extent_io.c | 15 ++++++---------
 fs/btrfs/extent_io.h |  4 ++--
 fs/btrfs/inode.c     |  2 +-
 4 files changed, 17 insertions(+), 20 deletions(-)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b9866f27ebd7..d0c969beaad4 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -383,17 +383,16 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
 		if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags))
 			break;
 
-		if (!failed_mirror) {
-			failed = 1;
-			printk(KERN_ERR "failed mirror was %d\n", eb->failed_mirror);
-			failed_mirror = eb->failed_mirror;
-		}
-
 		num_copies = btrfs_num_copies(&root->fs_info->mapping_tree,
 					      eb->start, eb->len);
 		if (num_copies == 1)
 			break;
 
+		if (!failed_mirror) {
+			failed = 1;
+			failed_mirror = eb->read_mirror;
+		}
+
 		mirror_num++;
 		if (mirror_num == failed_mirror)
 			mirror_num++;
@@ -564,7 +563,7 @@ struct extent_buffer *find_eb_for_page(struct extent_io_tree *tree,
 }
 
 static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
-			       struct extent_state *state)
+			       struct extent_state *state, int mirror)
 {
 	struct extent_io_tree *tree;
 	u64 found_start;
@@ -589,6 +588,7 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 	if (!reads_done)
 		goto err;
 
+	eb->read_mirror = mirror;
 	if (test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) {
 		ret = -EIO;
 		goto err;
@@ -652,7 +652,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror)
 
 	eb = (struct extent_buffer *)page->private;
 	set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
-	eb->failed_mirror = failed_mirror;
+	eb->read_mirror = failed_mirror;
 	if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
 		btree_readahead_hook(root, eb, eb->start, -EIO);
 	return -EIO;	/* we fixed nothing */
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 11eeb81fe695..0c23e57077c6 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2304,7 +2304,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 	u64 start;
 	u64 end;
 	int whole_page;
-	int failed_mirror;
+	int mirror;
 	int ret;
 
 	if (err)
@@ -2343,20 +2343,18 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 		}
 		spin_unlock(&tree->lock);
 
+		mirror = (int)(unsigned long)bio->bi_bdev;
 		if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
 			ret = tree->ops->readpage_end_io_hook(page, start, end,
-							      state);
+							      state, mirror);
 			if (ret)
 				uptodate = 0;
 			else
 				clean_io_failure(start, page);
 		}
 
-		if (!uptodate)
-			failed_mirror = (int)(unsigned long)bio->bi_bdev;
-
 		if (!uptodate && tree->ops && tree->ops->readpage_io_failed_hook) {
-			ret = tree->ops->readpage_io_failed_hook(page, failed_mirror);
+			ret = tree->ops->readpage_io_failed_hook(page, mirror);
 			if (!ret && !err &&
 			    test_bit(BIO_UPTODATE, &bio->bi_flags))
 				uptodate = 1;
@@ -2371,8 +2369,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 			 * can't handle the error it will return -EIO and we
 			 * remain responsible for that page.
 			 */
-			ret = bio_readpage_error(bio, page, start, end,
-							failed_mirror, NULL);
+			ret = bio_readpage_error(bio, page, start, end, mirror, NULL);
 			if (ret == 0) {
 				uptodate =
 					test_bit(BIO_UPTODATE, &bio->bi_flags);
@@ -4465,7 +4462,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 	}
 
 	clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
-	eb->failed_mirror = 0;
+	eb->read_mirror = 0;
 	atomic_set(&eb->io_pages, num_reads);
 	for (i = start_i; i < num_pages; i++) {
 		page = extent_buffer_page(eb, i);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index faf10eb57f75..b516c3b8dec6 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -79,7 +79,7 @@ struct extent_io_ops {
 					u64 start, u64 end,
 				       struct extent_state *state);
 	int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end,
-				    struct extent_state *state);
+				    struct extent_state *state, int mirror);
 	int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
 				      struct extent_state *state, int uptodate);
 	void (*set_bit_hook)(struct inode *inode, struct extent_state *state,
@@ -135,7 +135,7 @@ struct extent_buffer {
 	spinlock_t refs_lock;
 	atomic_t refs;
 	atomic_t io_pages;
-	int failed_mirror;
+	int read_mirror;
 	struct list_head leak_list;
 	struct rcu_head rcu_head;
 	pid_t lock_owner;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 98ee5a51aa29..d953f8820464 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1947,7 +1947,7 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
  * extent_io.c will try to find good copies for us.
  */
 static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
-			       struct extent_state *state)
+			       struct extent_state *state, int mirror)
 {
 	size_t offset = start - ((u64)page->index << PAGE_CACHE_SHIFT);
 	struct inode *inode = page->mapping->host;
-- 
cgit v1.2.3


From fede766f28dd766d4e8feb321fdb19edb21ef6fb Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 27 Apr 2012 14:23:22 -0400
Subject: Btrfs: avoid deadlocks from GFP_KERNEL allocations during
 btrfs_real_readdir

Btrfs has an optimization where it will preallocate dentries during
readdir to fill in enough information to open the inode without an extra
lookup.

But, we're calling d_alloc, which is doing GFP_KERNEL allocations, and
that leads to deadlocks because our readdir code has tree locks held.

For now, disable this optimization.  We'll fix the gfp mask in the next
merge window.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 30 +-----------------------------
 1 file changed, 1 insertion(+), 29 deletions(-)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d953f8820464..3ce7805d1117 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4192,7 +4192,6 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
 	struct btrfs_path *path;
 	struct list_head ins_list;
 	struct list_head del_list;
-	struct qstr q;
 	int ret;
 	struct extent_buffer *leaf;
 	int slot;
@@ -4283,7 +4282,6 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
 
 		while (di_cur < di_total) {
 			struct btrfs_key location;
-			struct dentry *tmp;
 
 			if (verify_dir_item(root, leaf, di))
 				break;
@@ -4304,33 +4302,7 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
 			d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)];
 			btrfs_dir_item_key_to_cpu(leaf, di, &location);
 
-			q.name = name_ptr;
-			q.len = name_len;
-			q.hash = full_name_hash(q.name, q.len);
-			tmp = d_lookup(filp->f_dentry, &q);
-			if (!tmp) {
-				struct btrfs_key *newkey;
-
-				newkey = kzalloc(sizeof(struct btrfs_key),
-						 GFP_NOFS);
-				if (!newkey)
-					goto no_dentry;
-				tmp = d_alloc(filp->f_dentry, &q);
-				if (!tmp) {
-					kfree(newkey);
-					dput(tmp);
-					goto no_dentry;
-				}
-				memcpy(newkey, &location,
-				       sizeof(struct btrfs_key));
-				tmp->d_fsdata = newkey;
-				tmp->d_flags |= DCACHE_NEED_LOOKUP;
-				d_rehash(tmp);
-				dput(tmp);
-			} else {
-				dput(tmp);
-			}
-no_dentry:
+
 			/* is this a reference to our own snapshot? If so
 			 * skip it.
 			 *
-- 
cgit v1.2.3