diff options
Diffstat (limited to 'fs/btrfs')
| -rw-r--r-- | fs/btrfs/ctree.c | 234 | ||||
| -rw-r--r-- | fs/btrfs/ctree.h | 4 | ||||
| -rw-r--r-- | fs/btrfs/disk-io.c | 10 | ||||
| -rw-r--r-- | fs/btrfs/extent-tree.c | 5 | ||||
| -rw-r--r-- | fs/btrfs/extent_io.c | 18 | ||||
| -rw-r--r-- | fs/btrfs/extent_io.h | 16 | ||||
| -rw-r--r-- | fs/btrfs/inode.c | 3 | ||||
| -rw-r--r-- | fs/btrfs/locking.c | 208 | ||||
| -rw-r--r-- | fs/btrfs/locking.h | 6 | ||||
| -rw-r--r-- | fs/btrfs/tree-defrag.c | 1 | ||||
| -rw-r--r-- | fs/btrfs/tree-log.c | 4 | 
11 files changed, 470 insertions, 39 deletions
| diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 3b6e35aafc9..3af777357ac 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -54,6 +54,31 @@ struct btrfs_path *btrfs_alloc_path(void)  	return path;  } +/* + * set all locked nodes in the path to blocking locks.  This should + * be done before scheduling + */ +noinline void btrfs_set_path_blocking(struct btrfs_path *p) +{ +	int i; +	for (i = 0; i < BTRFS_MAX_LEVEL; i++) { +		if (p->nodes[i] && p->locks[i]) +			btrfs_set_lock_blocking(p->nodes[i]); +	} +} + +/* + * reset all the locked nodes in the patch to spinning locks. + */ +noinline void btrfs_clear_path_blocking(struct btrfs_path *p) +{ +	int i; +	for (i = 0; i < BTRFS_MAX_LEVEL; i++) { +		if (p->nodes[i] && p->locks[i]) +			btrfs_clear_lock_blocking(p->nodes[i]); +	} +} +  /* this also releases the path */  void btrfs_free_path(struct btrfs_path *p)  { @@ -272,6 +297,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,  	if (IS_ERR(cow))  		return PTR_ERR(cow); +	/* cow is set to blocking by btrfs_init_new_buffer */ +  	copy_extent_buffer(cow, buf, 0, 0, cow->len);  	btrfs_set_header_bytenr(cow, cow->start);  	btrfs_set_header_generation(cow, trans->transid); @@ -397,6 +424,11 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,  	}  	search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1); + +	if (parent) +		btrfs_set_lock_blocking(parent); +	btrfs_set_lock_blocking(buf); +  	ret = __btrfs_cow_block(trans, root, buf, parent,  				 parent_slot, cow_ret, search_start, 0,  				 prealloc_dest); @@ -502,6 +534,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,  	if (parent_nritems == 1)  		return 0; +	btrfs_set_lock_blocking(parent); +  	for (i = start_slot; i < end_slot; i++) {  		int close = 1; @@ -562,6 +596,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,  			search_start = last_block;  		btrfs_tree_lock(cur); +		btrfs_set_lock_blocking(cur);  		err = __btrfs_cow_block(trans, root, cur, parent, i,  					&cur, search_start,  					min(16 * blocksize, @@ -860,6 +895,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,  		return 0;  	mid = path->nodes[level]; +  	WARN_ON(!path->locks[level]);  	WARN_ON(btrfs_header_generation(mid) != trans->transid); @@ -882,6 +918,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,  		/* promote the child to a root */  		child = read_node_slot(root, mid, 0);  		btrfs_tree_lock(child); +		btrfs_set_lock_blocking(child);  		BUG_ON(!child);  		ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0);  		BUG_ON(ret); @@ -898,6 +935,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,  		add_root_to_dirty_list(root);  		btrfs_tree_unlock(child); +  		path->locks[level] = 0;  		path->nodes[level] = NULL;  		clean_tree_block(trans, root, mid); @@ -922,6 +960,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,  	left = read_node_slot(root, parent, pslot - 1);  	if (left) {  		btrfs_tree_lock(left); +		btrfs_set_lock_blocking(left);  		wret = btrfs_cow_block(trans, root, left,  				       parent, pslot - 1, &left, 0);  		if (wret) { @@ -932,6 +971,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,  	right = read_node_slot(root, parent, pslot + 1);  	if (right) {  		btrfs_tree_lock(right); +		btrfs_set_lock_blocking(right);  		wret = btrfs_cow_block(trans, root, right,  				       parent, pslot + 1, &right, 0);  		if (wret) { @@ -1107,6 +1147,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,  		u32 left_nr;  		btrfs_tree_lock(left); +		btrfs_set_lock_blocking(left); +  		left_nr = btrfs_header_nritems(left);  		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {  			wret = 1; @@ -1153,7 +1195,10 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,  	 */  	if (right) {  		u32 right_nr; +  		btrfs_tree_lock(right); +		btrfs_set_lock_blocking(right); +  		right_nr = btrfs_header_nritems(right);  		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {  			wret = 1; @@ -1265,6 +1310,68 @@ static noinline void reada_for_search(struct btrfs_root *root,  }  /* + * returns -EAGAIN if it had to drop the path, or zero if everything was in + * cache + */ +static noinline int reada_for_balance(struct btrfs_root *root, +				      struct btrfs_path *path, int level) +{ +	int slot; +	int nritems; +	struct extent_buffer *parent; +	struct extent_buffer *eb; +	u64 gen; +	u64 block1 = 0; +	u64 block2 = 0; +	int ret = 0; +	int blocksize; + +	parent = path->nodes[level - 1]; +	if (!parent) +		return 0; + +	nritems = btrfs_header_nritems(parent); +	slot = path->slots[level]; +	blocksize = btrfs_level_size(root, level); + +	if (slot > 0) { +		block1 = btrfs_node_blockptr(parent, slot - 1); +		gen = btrfs_node_ptr_generation(parent, slot - 1); +		eb = btrfs_find_tree_block(root, block1, blocksize); +		if (eb && btrfs_buffer_uptodate(eb, gen)) +			block1 = 0; +		free_extent_buffer(eb); +	} +	if (slot < nritems) { +		block2 = btrfs_node_blockptr(parent, slot + 1); +		gen = btrfs_node_ptr_generation(parent, slot + 1); +		eb = btrfs_find_tree_block(root, block2, blocksize); +		if (eb && btrfs_buffer_uptodate(eb, gen)) +			block2 = 0; +		free_extent_buffer(eb); +	} +	if (block1 || block2) { +		ret = -EAGAIN; +		btrfs_release_path(root, path); +		if (block1) +			readahead_tree_block(root, block1, blocksize, 0); +		if (block2) +			readahead_tree_block(root, block2, blocksize, 0); + +		if (block1) { +			eb = read_tree_block(root, block1, blocksize, 0); +			free_extent_buffer(eb); +		} +		if (block1) { +			eb = read_tree_block(root, block2, blocksize, 0); +			free_extent_buffer(eb); +		} +	} +	return ret; +} + + +/*   * when we walk down the tree, it is usually safe to unlock the higher layers   * in the tree.  The exceptions are when our path goes through slot 0, because   * operations on the tree might require changing key pointers higher up in the @@ -1315,6 +1422,32 @@ static noinline void unlock_up(struct btrfs_path *path, int level,  }  /* + * This releases any locks held in the path starting at level and + * going all the way up to the root. + * + * btrfs_search_slot will keep the lock held on higher nodes in a few + * corner cases, such as COW of the block at slot zero in the node.  This + * ignores those rules, and it should only be called when there are no + * more updates to be done higher up in the tree. + */ +noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level) +{ +	int i; + +	if (path->keep_locks || path->lowest_level) +		return; + +	for (i = level; i < BTRFS_MAX_LEVEL; i++) { +		if (!path->nodes[i]) +			break; +		if (!path->locks[i]) +			break; +		btrfs_tree_unlock(path->nodes[i]); +		path->locks[i] = 0; +	} +} + +/*   * look for key in the tree.  path is filled in with nodes along the way   * if key is found, we return zero and you can find the item in the leaf   * level of the path (level 0) @@ -1385,6 +1518,7 @@ again:  			 */  			if (prealloc_block.objectid &&  			    prealloc_block.offset != b->len) { +				btrfs_set_path_blocking(p);  				btrfs_free_reserved_extent(root,  					   prealloc_block.objectid,  					   prealloc_block.offset); @@ -1409,6 +1543,8 @@ again:  				goto again;  			} +			btrfs_set_path_blocking(p); +  			wret = btrfs_cow_block(trans, root, b,  					       p->nodes[level + 1],  					       p->slots[level + 1], @@ -1430,6 +1566,22 @@ cow_done:  		if (!p->skip_locking)  			p->locks[level] = 1; +		btrfs_clear_path_blocking(p); + +		/* +		 * we have a lock on b and as long as we aren't changing +		 * the tree, there is no way to for the items in b to change. +		 * It is safe to drop the lock on our parent before we +		 * go through the expensive btree search on b. +		 * +		 * If cow is true, then we might be changing slot zero, +		 * which may require changing the parent.  So, we can't +		 * drop the lock until after we know which slot we're +		 * operating on. +		 */ +		if (!cow) +			btrfs_unlock_up_safe(p, level + 1); +  		ret = check_block(root, p, level);  		if (ret) {  			ret = -1; @@ -1437,6 +1589,7 @@ cow_done:  		}  		ret = bin_search(b, key, level, &slot); +  		if (level != 0) {  			if (ret && slot > 0)  				slot -= 1; @@ -1444,7 +1597,16 @@ cow_done:  			if ((p->search_for_split || ins_len > 0) &&  			    btrfs_header_nritems(b) >=  			    BTRFS_NODEPTRS_PER_BLOCK(root) - 3) { -				int sret = split_node(trans, root, p, level); +				int sret; + +				sret = reada_for_balance(root, p, level); +				if (sret) +					goto again; + +				btrfs_set_path_blocking(p); +				sret = split_node(trans, root, p, level); +				btrfs_clear_path_blocking(p); +  				BUG_ON(sret > 0);  				if (sret) {  					ret = sret; @@ -1453,8 +1615,16 @@ cow_done:  				b = p->nodes[level];  				slot = p->slots[level];  			} else if (ins_len < 0) { -				int sret = balance_level(trans, root, p, -							 level); +				int sret; + +				sret = reada_for_balance(root, p, level); +				if (sret) +					goto again; + +				btrfs_set_path_blocking(p); +				sret = balance_level(trans, root, p, level); +				btrfs_clear_path_blocking(p); +  				if (sret) {  					ret = sret;  					goto done; @@ -1488,7 +1658,7 @@ cow_done:  				 * of the btree by dropping locks before  				 * we read.  				 */ -				if (level > 1) { +				if (level > 0) {  					btrfs_release_path(NULL, p);  					if (tmp)  						free_extent_buffer(tmp); @@ -1503,6 +1673,7 @@ cow_done:  						free_extent_buffer(tmp);  					goto again;  				} else { +					btrfs_set_path_blocking(p);  					if (tmp)  						free_extent_buffer(tmp);  					if (should_reada) @@ -1512,14 +1683,29 @@ cow_done:  					b = read_node_slot(root, b, slot);  				}  			} -			if (!p->skip_locking) -				btrfs_tree_lock(b); +			if (!p->skip_locking) { +				int lret; + +				btrfs_clear_path_blocking(p); +				lret = btrfs_try_spin_lock(b); + +				if (!lret) { +					btrfs_set_path_blocking(p); +					btrfs_tree_lock(b); +					btrfs_clear_path_blocking(p); +				} +			}  		} else {  			p->slots[level] = slot;  			if (ins_len > 0 &&  			    btrfs_leaf_free_space(root, b) < ins_len) { -				int sret = split_leaf(trans, root, key, +				int sret; + +				btrfs_set_path_blocking(p); +				sret = split_leaf(trans, root, key,  						      p, ins_len, ret == 0); +				btrfs_clear_path_blocking(p); +  				BUG_ON(sret > 0);  				if (sret) {  					ret = sret; @@ -1533,12 +1719,16 @@ cow_done:  	}  	ret = 1;  done: +	/* +	 * we don't really know what they plan on doing with the path +	 * from here on, so for now just mark it as blocking +	 */ +	btrfs_set_path_blocking(p);  	if (prealloc_block.objectid) {  		btrfs_free_reserved_extent(root,  			   prealloc_block.objectid,  			   prealloc_block.offset);  	} -  	return ret;  } @@ -1562,6 +1752,8 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans,  	ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0);  	BUG_ON(ret); +	btrfs_set_lock_blocking(eb); +  	parent = eb;  	while (1) {  		level = btrfs_header_level(parent); @@ -1586,6 +1778,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans,  			eb = read_tree_block(root, bytenr, blocksize,  					     generation);  			btrfs_tree_lock(eb); +			btrfs_set_lock_blocking(eb);  		}  		/* @@ -1610,6 +1803,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans,  				eb = read_tree_block(root, bytenr, blocksize,  						generation);  				btrfs_tree_lock(eb); +				btrfs_set_lock_blocking(eb);  			}  			ret = btrfs_cow_block(trans, root, eb, parent, slot, @@ -2156,6 +2350,8 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root  	right = read_node_slot(root, upper, slot + 1);  	btrfs_tree_lock(right); +	btrfs_set_lock_blocking(right); +  	free_space = btrfs_leaf_free_space(root, right);  	if (free_space < data_size)  		goto out_unlock; @@ -2351,6 +2547,8 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root  	left = read_node_slot(root, path->nodes[1], slot - 1);  	btrfs_tree_lock(left); +	btrfs_set_lock_blocking(left); +  	free_space = btrfs_leaf_free_space(root, left);  	if (free_space < data_size) {  		ret = 1; @@ -2809,6 +3007,12 @@ int btrfs_split_item(struct btrfs_trans_handle *trans,  	path->keep_locks = 0;  	BUG_ON(ret); +	/* +	 * make sure any changes to the path from split_leaf leave it +	 * in a blocking state +	 */ +	btrfs_set_path_blocking(path); +  	leaf = path->nodes[0];  	BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item)); @@ -3338,6 +3542,7 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,  		BUG();  	}  out: +	btrfs_unlock_up_safe(path, 1);  	return ret;  } @@ -3705,12 +3910,14 @@ find_next_key:  		 */  		if (slot >= nritems) {  			path->slots[level] = slot; +			btrfs_set_path_blocking(path);  			sret = btrfs_find_next_key(root, path, min_key, level,  						  cache_only, min_trans);  			if (sret == 0) {  				btrfs_release_path(root, path);  				goto again;  			} else { +				btrfs_clear_path_blocking(path);  				goto out;  			}  		} @@ -3722,16 +3929,20 @@ find_next_key:  			unlock_up(path, level, 1);  			goto out;  		} +		btrfs_set_path_blocking(path);  		cur = read_node_slot(root, cur, slot);  		btrfs_tree_lock(cur); +  		path->locks[level - 1] = 1;  		path->nodes[level - 1] = cur;  		unlock_up(path, level, 1); +		btrfs_clear_path_blocking(path);  	}  out:  	if (ret == 0)  		memcpy(min_key, &found_key, sizeof(found_key)); +	btrfs_set_path_blocking(path);  	return ret;  } @@ -3827,6 +4038,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)  	if (ret < 0)  		return ret; +	btrfs_set_path_blocking(path);  	nritems = btrfs_header_nritems(path->nodes[0]);  	/*  	 * by releasing the path above we dropped all our locks.  A balance @@ -3857,6 +4069,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)  			free_extent_buffer(next);  		} +		/* the path was set to blocking above */  		if (level == 1 && (path->locks[1] || path->skip_locking) &&  		    path->reada)  			reada_for_search(root, path, level, slot, 0); @@ -3865,6 +4078,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)  		if (!path->skip_locking) {  			WARN_ON(!btrfs_tree_locked(c));  			btrfs_tree_lock(next); +			btrfs_set_lock_blocking(next);  		}  		break;  	} @@ -3881,12 +4095,15 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)  			path->locks[level] = 1;  		if (!level)  			break; + +		btrfs_set_path_blocking(path);  		if (level == 1 && path->locks[1] && path->reada)  			reada_for_search(root, path, level, slot, 0);  		next = read_node_slot(root, next, 0);  		if (!path->skip_locking) {  			WARN_ON(!btrfs_tree_locked(path->nodes[level]));  			btrfs_tree_lock(next); +			btrfs_set_lock_blocking(next);  		}  	}  done: @@ -3911,6 +4128,7 @@ int btrfs_previous_item(struct btrfs_root *root,  	while (1) {  		if (path->slots[0] == 0) { +			btrfs_set_path_blocking(path);  			ret = btrfs_prev_leaf(root, path);  			if (ret != 0)  				return ret; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f2b8d26b047..531db112c8b 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1835,6 +1835,10 @@ void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p);  struct btrfs_path *btrfs_alloc_path(void);  void btrfs_free_path(struct btrfs_path *p);  void btrfs_init_path(struct btrfs_path *p); +void btrfs_set_path_blocking(struct btrfs_path *p); +void btrfs_clear_path_blocking(struct btrfs_path *p); +void btrfs_unlock_up_safe(struct btrfs_path *p, int level); +  int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,  		   struct btrfs_path *path, int slot, int nr);  int btrfs_del_leaf(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 549271607c1..5aebddd7119 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -799,7 +799,7 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,  	ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);  	if (ret == 0) -		buf->flags |= EXTENT_UPTODATE; +		set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags);  	else  		WARN_ON(1);  	return buf; @@ -813,6 +813,10 @@ int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,  	if (btrfs_header_generation(buf) ==  	    root->fs_info->running_transaction->transid) {  		WARN_ON(!btrfs_tree_locked(buf)); + +		/* ugh, clear_extent_buffer_dirty can be expensive */ +		btrfs_set_lock_blocking(buf); +  		clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,  					  buf);  	} @@ -2311,6 +2315,8 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)  	u64 transid = btrfs_header_generation(buf);  	struct inode *btree_inode = root->fs_info->btree_inode; +	btrfs_set_lock_blocking(buf); +  	WARN_ON(!btrfs_tree_locked(buf));  	if (transid != root->fs_info->generation) {  		printk(KERN_CRIT "btrfs transid mismatch buffer %llu, " @@ -2353,7 +2359,7 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)  	int ret;  	ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);  	if (ret == 0) -		buf->flags |= EXTENT_UPTODATE; +		set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags);  	return ret;  } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 7a22f2e6ec4..ed1e25d7248 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3407,7 +3407,10 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,  	btrfs_set_header_generation(buf, trans->transid);  	btrfs_tree_lock(buf);  	clean_tree_block(trans, root, buf); + +	btrfs_set_lock_blocking(buf);  	btrfs_set_buffer_uptodate(buf); +  	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {  		set_extent_dirty(&root->dirty_log_pages, buf->start,  			 buf->start + buf->len - 1, GFP_NOFS); @@ -3416,6 +3419,7 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,  			 buf->start + buf->len - 1, GFP_NOFS);  	}  	trans->blocks_used++; +	/* this returns a buffer locked for blocking */  	return buf;  } @@ -3752,6 +3756,7 @@ static noinline int walk_down_subtree(struct btrfs_trans_handle *trans,  		next = read_tree_block(root, bytenr, blocksize, ptr_gen);  		btrfs_tree_lock(next); +		btrfs_set_lock_blocking(next);  		ret = btrfs_lookup_extent_ref(trans, root, bytenr, blocksize,  					      &refs); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 2ea7f052722..dd5df53e045 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2990,7 +2990,9 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,  	eb = kmem_cache_zalloc(extent_buffer_cache, mask);  	eb->start = start;  	eb->len = len; -	mutex_init(&eb->mutex); +	spin_lock_init(&eb->lock); +	init_waitqueue_head(&eb->lock_wq); +  #if LEAK_DEBUG  	spin_lock_irqsave(&leak_lock, flags);  	list_add(&eb->leak_list, &buffers); @@ -3071,8 +3073,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,  		unlock_page(p);  	}  	if (uptodate) -		eb->flags |= EXTENT_UPTODATE; -	eb->flags |= EXTENT_BUFFER_FILLED; +		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);  	spin_lock(&tree->buffer_lock);  	exists = buffer_tree_insert(tree, start, &eb->rb_node); @@ -3226,7 +3227,7 @@ int clear_extent_buffer_uptodate(struct extent_io_tree *tree,  	unsigned long num_pages;  	num_pages = num_extent_pages(eb->start, eb->len); -	eb->flags &= ~EXTENT_UPTODATE; +	clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);  	clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,  			      GFP_NOFS); @@ -3297,7 +3298,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,  	struct page *page;  	int pg_uptodate = 1; -	if (eb->flags & EXTENT_UPTODATE) +	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))  		return 1;  	ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1, @@ -3333,7 +3334,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,  	struct bio *bio = NULL;  	unsigned long bio_flags = 0; -	if (eb->flags & EXTENT_UPTODATE) +	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))  		return 0;  	if (test_range_bit(tree, eb->start, eb->start + eb->len - 1, @@ -3364,7 +3365,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,  	}  	if (all_uptodate) {  		if (start_i == 0) -			eb->flags |= EXTENT_UPTODATE; +			set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);  		goto unlock_exit;  	} @@ -3400,7 +3401,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,  	}  	if (!ret) -		eb->flags |= EXTENT_UPTODATE; +		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);  	return ret;  unlock_exit: @@ -3497,7 +3498,6 @@ int map_extent_buffer(struct extent_buffer *eb, unsigned long start,  		unmap_extent_buffer(eb, eb->map_token, km);  		eb->map_token = NULL;  		save = 1; -		WARN_ON(!mutex_is_locked(&eb->mutex));  	}  	err = map_private_extent_buffer(eb, start, min_len, token, map,  				       map_start, map_len, km); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index e80c6d96b31..1f9df88afbf 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -22,6 +22,10 @@  /* flags for bio submission */  #define EXTENT_BIO_COMPRESSED 1 +/* these are bit numbers for test/set bit */ +#define EXTENT_BUFFER_UPTODATE 0 +#define EXTENT_BUFFER_BLOCKING 1 +  /*   * page->private values.  Every page that is controlled by the extent   * map has page->private set to one. @@ -95,11 +99,19 @@ struct extent_buffer {  	unsigned long map_start;  	unsigned long map_len;  	struct page *first_page; +	unsigned long bflags;  	atomic_t refs; -	int flags;  	struct list_head leak_list;  	struct rb_node rb_node; -	struct mutex mutex; + +	/* the spinlock is used to protect most operations */ +	spinlock_t lock; + +	/* +	 * when we keep the lock held while blocking, waiters go onto +	 * the wq +	 */ +	wait_queue_head_t lock_wq;  };  struct extent_map_tree; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 4a79e1c5ebd..ebd7d6c37df 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -50,6 +50,7 @@  #include "tree-log.h"  #include "ref-cache.h"  #include "compression.h" +#include "locking.h"  struct btrfs_iget_args {  	u64 ino; @@ -2021,6 +2022,7 @@ void btrfs_read_locked_inode(struct inode *inode)  	BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);  	alloc_group_block = btrfs_inode_block_group(leaf, inode_item); +  	BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0,  						alloc_group_block, 0);  	btrfs_free_path(path); @@ -2117,6 +2119,7 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,  		goto failed;  	} +	btrfs_unlock_up_safe(path, 1);  	leaf = path->nodes[0];  	inode_item = btrfs_item_ptr(leaf, path->slots[0],  				  struct btrfs_inode_item); diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 39bae7761db..68fd9ccf180 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -26,45 +26,215 @@  #include "locking.h"  /* - * locks the per buffer mutex in an extent buffer.  This uses adaptive locks - * and the spin is not tuned very extensively.  The spinning does make a big - * difference in almost every workload, but spinning for the right amount of - * time needs some help. - * - * In general, we want to spin as long as the lock holder is doing btree - * searches, and we should give up if they are in more expensive code. + * btrfs_header_level() isn't free, so don't call it when lockdep isn't + * on   */ +#ifdef CONFIG_DEBUG_LOCK_ALLOC +static inline void spin_nested(struct extent_buffer *eb) +{ +	spin_lock_nested(&eb->lock, BTRFS_MAX_LEVEL - btrfs_header_level(eb)); +} +#else +static inline void spin_nested(struct extent_buffer *eb) +{ +	spin_lock(&eb->lock); +} +#endif -int btrfs_tree_lock(struct extent_buffer *eb) +/* + * Setting a lock to blocking will drop the spinlock and set the + * flag that forces other procs who want the lock to wait.  After + * this you can safely schedule with the lock held. + */ +void btrfs_set_lock_blocking(struct extent_buffer *eb)  { -	int i; +	if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) { +		set_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags); +		spin_unlock(&eb->lock); +	} +	/* exit with the spin lock released and the bit set */ +} -	if (mutex_trylock(&eb->mutex)) -		return 0; +/* + * clearing the blocking flag will take the spinlock again. + * After this you can't safely schedule + */ +void btrfs_clear_lock_blocking(struct extent_buffer *eb) +{ +	if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) { +		spin_nested(eb); +		clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags); +		smp_mb__after_clear_bit(); +	} +	/* exit with the spin lock held */ +} + +/* + * unfortunately, many of the places that currently set a lock to blocking + * don't end up blocking for every long, and often they don't block + * at all.  For a dbench 50 run, if we don't spin one the blocking bit + * at all, the context switch rate can jump up to 400,000/sec or more. + * + * So, we're still stuck with this crummy spin on the blocking bit, + * at least until the most common causes of the short blocks + * can be dealt with. + */ +static int btrfs_spin_on_block(struct extent_buffer *eb) +{ +	int i;  	for (i = 0; i < 512; i++) {  		cpu_relax(); -		if (mutex_trylock(&eb->mutex)) +		if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) +			return 1; +		if (need_resched()) +			break; +	} +	return 0; +} + +/* + * This is somewhat different from trylock.  It will take the + * spinlock but if it finds the lock is set to blocking, it will + * return without the lock held. + * + * returns 1 if it was able to take the lock and zero otherwise + * + * After this call, scheduling is not safe without first calling + * btrfs_set_lock_blocking() + */ +int btrfs_try_spin_lock(struct extent_buffer *eb) +{ +	int i; + +	spin_nested(eb); +	if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) +		return 1; +	spin_unlock(&eb->lock); + +	/* spin for a bit on the BLOCKING flag */ +	for (i = 0; i < 2; i++) { +		if (!btrfs_spin_on_block(eb)) +			break; + +		spin_nested(eb); +		if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) +			return 1; +		spin_unlock(&eb->lock); +	} +	return 0; +} + +/* + * the autoremove wake function will return 0 if it tried to wake up + * a process that was already awake, which means that process won't + * count as an exclusive wakeup.  The waitq code will continue waking + * procs until it finds one that was actually sleeping. + * + * For btrfs, this isn't quite what we want.  We want a single proc + * to be notified that the lock is ready for taking.  If that proc + * already happen to be awake, great, it will loop around and try for + * the lock. + * + * So, btrfs_wake_function always returns 1, even when the proc that we + * tried to wake up was already awake. + */ +static int btrfs_wake_function(wait_queue_t *wait, unsigned mode, +			       int sync, void *key) +{ +	autoremove_wake_function(wait, mode, sync, key); +	return 1; +} + +/* + * returns with the extent buffer spinlocked. + * + * This will spin and/or wait as required to take the lock, and then + * return with the spinlock held. + * + * After this call, scheduling is not safe without first calling + * btrfs_set_lock_blocking() + */ +int btrfs_tree_lock(struct extent_buffer *eb) +{ +	DEFINE_WAIT(wait); +	wait.func = btrfs_wake_function; + +	while(1) { +		spin_nested(eb); + +		/* nobody is blocking, exit with the spinlock held */ +		if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))  			return 0; + +		/* +		 * we have the spinlock, but the real owner is blocking. +		 * wait for them +		 */ +		spin_unlock(&eb->lock); + +		/* +		 * spin for a bit, and if the blocking flag goes away, +		 * loop around +		 */ +		if (btrfs_spin_on_block(eb)) +			continue; + +		prepare_to_wait_exclusive(&eb->lock_wq, &wait, +					  TASK_UNINTERRUPTIBLE); + +		if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) +			schedule(); + +		finish_wait(&eb->lock_wq, &wait);  	} -	cpu_relax(); -	mutex_lock_nested(&eb->mutex, BTRFS_MAX_LEVEL - btrfs_header_level(eb));  	return 0;  } +/* + * Very quick trylock, this does not spin or schedule.  It returns + * 1 with the spinlock held if it was able to take the lock, or it + * returns zero if it was unable to take the lock. + * + * After this call, scheduling is not safe without first calling + * btrfs_set_lock_blocking() + */  int btrfs_try_tree_lock(struct extent_buffer *eb)  { -	return mutex_trylock(&eb->mutex); +	if (spin_trylock(&eb->lock)) { +		if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) { +			/* +			 * we've got the spinlock, but the real owner is +			 * blocking.  Drop the spinlock and return failure +			 */ +			spin_unlock(&eb->lock); +			return 0; +		} +		return 1; +	} +	/* someone else has the spinlock giveup */ +	return 0;  }  int btrfs_tree_unlock(struct extent_buffer *eb)  { -	mutex_unlock(&eb->mutex); +	/* +	 * if we were a blocking owner, we don't have the spinlock held +	 * just clear the bit and look for waiters +	 */ +	if (test_and_clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) +		smp_mb__after_clear_bit(); +	else +		spin_unlock(&eb->lock); + +	if (waitqueue_active(&eb->lock_wq)) +		wake_up(&eb->lock_wq);  	return 0;  }  int btrfs_tree_locked(struct extent_buffer *eb)  { -	return mutex_is_locked(&eb->mutex); +	return test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags) || +			spin_is_locked(&eb->lock);  }  /* @@ -75,12 +245,14 @@ int btrfs_path_lock_waiting(struct btrfs_path *path, int level)  {  	int i;  	struct extent_buffer *eb; +  	for (i = level; i <= level + 1 && i < BTRFS_MAX_LEVEL; i++) {  		eb = path->nodes[i];  		if (!eb)  			break;  		smp_mb(); -		if (!list_empty(&eb->mutex.wait_list)) +		if (spin_is_contended(&eb->lock) || +		    waitqueue_active(&eb->lock_wq))  			return 1;  	}  	return 0; diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index bc1faef1251..d92e707f587 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -22,6 +22,12 @@  int btrfs_tree_lock(struct extent_buffer *eb);  int btrfs_tree_unlock(struct extent_buffer *eb);  int btrfs_tree_locked(struct extent_buffer *eb); +  int btrfs_try_tree_lock(struct extent_buffer *eb); +int btrfs_try_spin_lock(struct extent_buffer *eb); +  int btrfs_path_lock_waiting(struct btrfs_path *path, int level); + +void btrfs_set_lock_blocking(struct extent_buffer *eb); +void btrfs_clear_lock_blocking(struct extent_buffer *eb);  #endif diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c index 3e8358c3616..98d25fa4570 100644 --- a/fs/btrfs/tree-defrag.c +++ b/fs/btrfs/tree-defrag.c @@ -74,6 +74,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,  		u32 nritems;  		root_node = btrfs_lock_root_node(root); +		btrfs_set_lock_blocking(root_node);  		nritems = btrfs_header_nritems(root_node);  		root->defrag_max.objectid = 0;  		/* from above we know this is not a leaf */ diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 4f26f3ed0c8..20794290256 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1615,6 +1615,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,  				btrfs_tree_lock(next);  				clean_tree_block(trans, root, next); +				btrfs_set_lock_blocking(next);  				btrfs_wait_tree_block_writeback(next);  				btrfs_tree_unlock(next); @@ -1661,6 +1662,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,  		next = path->nodes[*level];  		btrfs_tree_lock(next);  		clean_tree_block(trans, root, next); +		btrfs_set_lock_blocking(next);  		btrfs_wait_tree_block_writeback(next);  		btrfs_tree_unlock(next); @@ -1718,6 +1720,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,  				btrfs_tree_lock(next);  				clean_tree_block(trans, root, next); +				btrfs_set_lock_blocking(next);  				btrfs_wait_tree_block_writeback(next);  				btrfs_tree_unlock(next); @@ -1790,6 +1793,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,  			btrfs_tree_lock(next);  			clean_tree_block(trans, log, next); +			btrfs_set_lock_blocking(next);  			btrfs_wait_tree_block_writeback(next);  			btrfs_tree_unlock(next); | 
