Diffstat (limited to 'fs'): 55 files changed, 991 insertions, 596 deletions
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index de34bfad9ec..5d505aaa72f 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -178,16 +178,17 @@ static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name, if (value) { acl = posix_acl_from_xattr(value, size); - if (acl == NULL) { - value = NULL; - size = 0; + if (acl) { + ret = posix_acl_valid(acl); + if (ret) + goto out; } else if (IS_ERR(acl)) { return PTR_ERR(acl); } } ret = btrfs_set_acl(NULL, dentry->d_inode, acl, type); - +out: posix_acl_release(acl); return ret; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 3458b572554..8f4b81de3ae 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -718,7 +718,7 @@ struct btrfs_space_info { u64 total_bytes; /* total bytes in the space, this doesn't take mirrors into account */ u64 bytes_used; /* total bytes used, - this does't take mirrors into account */ + this doesn't take mirrors into account */ u64 bytes_pinned; /* total bytes pinned, will be freed when the transaction finishes */ u64 bytes_reserved; /* total bytes the allocator has reserved for @@ -740,8 +740,10 @@ struct btrfs_space_info { */ unsigned long reservation_progress; - int full; /* indicates that we cannot allocate any more + int full:1; /* indicates that we cannot allocate any more chunks for this space */ + int chunk_alloc:1; /* set if we are allocating a chunk */ + int force_alloc; /* set if we need to force a chunk alloc for this space */ @@ -2576,6 +2578,11 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode, int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, struct inode *inode, u64 start, u64 end); int btrfs_release_file(struct inode *inode, struct file *file); +void btrfs_drop_pages(struct page **pages, size_t num_pages); +int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode, + struct page **pages, size_t num_pages, + loff_t pos, size_t write_bytes, + struct extent_state **cached); /* tree-defrag.c */ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 8f1d44ba332..228cf36ece8 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2824,6 +2824,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, spin_lock(&delayed_refs->lock); if (delayed_refs->num_entries == 0) { + spin_unlock(&delayed_refs->lock); printk(KERN_INFO "delayed_refs has NO entry\n"); return ret; } @@ -3057,7 +3058,7 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root) btrfs_destroy_pinned_extent(root, root->fs_info->pinned_extents); - t->use_count = 0; + atomic_set(&t->use_count, 0); list_del_init(&t->list); memset(t, 0, sizeof(*t)); kmem_cache_free(btrfs_transaction_cachep, t); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f619c3cb13b..cd52f7f556e 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -33,6 +33,25 @@ #include "locking.h" #include "free-space-cache.h" +/* control flags for do_chunk_alloc's force field + * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk + * if we really need one. + * + * CHUNK_ALLOC_FORCE means it must try to allocate one + * + * CHUNK_ALLOC_LIMITED means to only try and allocate one + * if we have very few chunks already allocated. 
This is + * used as part of the clustering code to help make sure + * we have a good pool of storage to cluster in, without + * filling the FS with empty chunks + * + */ +enum { + CHUNK_ALLOC_NO_FORCE = 0, + CHUNK_ALLOC_FORCE = 1, + CHUNK_ALLOC_LIMITED = 2, +}; + static int update_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, int alloc); @@ -3019,7 +3038,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, found->bytes_readonly = 0; found->bytes_may_use = 0; found->full = 0; - found->force_alloc = 0; + found->force_alloc = CHUNK_ALLOC_NO_FORCE; + found->chunk_alloc = 0; *space_info = found; list_add_rcu(&found->list, &info->space_info); atomic_set(&found->caching_threads, 0); @@ -3150,7 +3170,7 @@ again: if (!data_sinfo->full && alloc_chunk) { u64 alloc_target; - data_sinfo->force_alloc = 1; + data_sinfo->force_alloc = CHUNK_ALLOC_FORCE; spin_unlock(&data_sinfo->lock); alloc: alloc_target = btrfs_get_alloc_profile(root, 1); @@ -3160,7 +3180,8 @@ alloc: ret = do_chunk_alloc(trans, root->fs_info->extent_root, bytes + 2 * 1024 * 1024, - alloc_target, 0); + alloc_target, + CHUNK_ALLOC_NO_FORCE); btrfs_end_transaction(trans, root); if (ret < 0) { if (ret != -ENOSPC) @@ -3239,31 +3260,56 @@ static void force_metadata_allocation(struct btrfs_fs_info *info) rcu_read_lock(); list_for_each_entry_rcu(found, head, list) { if (found->flags & BTRFS_BLOCK_GROUP_METADATA) - found->force_alloc = 1; + found->force_alloc = CHUNK_ALLOC_FORCE; } rcu_read_unlock(); } static int should_alloc_chunk(struct btrfs_root *root, - struct btrfs_space_info *sinfo, u64 alloc_bytes) + struct btrfs_space_info *sinfo, u64 alloc_bytes, + int force) { u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly; + u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved; u64 thresh; - if (sinfo->bytes_used + sinfo->bytes_reserved + - alloc_bytes + 256 * 1024 * 1024 < num_bytes) + if (force == CHUNK_ALLOC_FORCE) + return 1; + + /* + * in limited mode, we want to have some free space up to + * about 1% of the FS size. + */ + if (force == CHUNK_ALLOC_LIMITED) { + thresh = btrfs_super_total_bytes(&root->fs_info->super_copy); + thresh = max_t(u64, 64 * 1024 * 1024, + div_factor_fine(thresh, 1)); + + if (num_bytes - num_allocated < thresh) + return 1; + } + + /* + * we have two similar checks here, one based on percentage + * and once based on a hard number of 256MB. The idea + * is that if we have a good amount of free + * room, don't allocate a chunk. 
A good mount is + * less than 80% utilized of the chunks we have allocated, + * or more than 256MB free + */ + if (num_allocated + alloc_bytes + 256 * 1024 * 1024 < num_bytes) return 0; - if (sinfo->bytes_used + sinfo->bytes_reserved + - alloc_bytes < div_factor(num_bytes, 8)) + if (num_allocated + alloc_bytes < div_factor(num_bytes, 8)) return 0; thresh = btrfs_super_total_bytes(&root->fs_info->super_copy); + + /* 256MB or 5% of the FS */ thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5)); if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 3)) return 0; - return 1; } @@ -3273,10 +3319,9 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, { struct btrfs_space_info *space_info; struct btrfs_fs_info *fs_info = extent_root->fs_info; + int wait_for_alloc = 0; int ret = 0; - mutex_lock(&fs_info->chunk_mutex); - flags = btrfs_reduce_alloc_profile(extent_root, flags); space_info = __find_space_info(extent_root->fs_info, flags); @@ -3287,21 +3332,40 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, } BUG_ON(!space_info); +again: spin_lock(&space_info->lock); if (space_info->force_alloc) - force = 1; + force = space_info->force_alloc; if (space_info->full) { spin_unlock(&space_info->lock); - goto out; + return 0; } - if (!force && !should_alloc_chunk(extent_root, space_info, - alloc_bytes)) { + if (!should_alloc_chunk(extent_root, space_info, alloc_bytes, force)) { spin_unlock(&space_info->lock); - goto out; + return 0; + } else if (space_info->chunk_alloc) { + wait_for_alloc = 1; + } else { + space_info->chunk_alloc = 1; } + spin_unlock(&space_info->lock); + mutex_lock(&fs_info->chunk_mutex); + + /* + * The chunk_mutex is held throughout the entirety of a chunk + * allocation, so once we've acquired the chunk_mutex we know that the + * other guy is done and we need to recheck and see if we should + * allocate. + */ + if (wait_for_alloc) { + mutex_unlock(&fs_info->chunk_mutex); + wait_for_alloc = 0; + goto again; + } + /* * If we have mixed data/metadata chunks we want to make sure we keep * allocating mixed chunks instead of individual chunks. 
@@ -3327,9 +3391,10 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, space_info->full = 1; else ret = 1; - space_info->force_alloc = 0; + + space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; + space_info->chunk_alloc = 0; spin_unlock(&space_info->lock); -out: mutex_unlock(&extent_root->fs_info->chunk_mutex); return ret; } @@ -5303,11 +5368,13 @@ loop: if (allowed_chunk_alloc) { ret = do_chunk_alloc(trans, root, num_bytes + - 2 * 1024 * 1024, data, 1); + 2 * 1024 * 1024, data, + CHUNK_ALLOC_LIMITED); allowed_chunk_alloc = 0; done_chunk_alloc = 1; - } else if (!done_chunk_alloc) { - space_info->force_alloc = 1; + } else if (!done_chunk_alloc && + space_info->force_alloc == CHUNK_ALLOC_NO_FORCE) { + space_info->force_alloc = CHUNK_ALLOC_LIMITED; } if (loop < LOOP_NO_EMPTY_SIZE) { @@ -5393,7 +5460,8 @@ again: */ if (empty_size || root->ref_cows) ret = do_chunk_alloc(trans, root->fs_info->extent_root, - num_bytes + 2 * 1024 * 1024, data, 0); + num_bytes + 2 * 1024 * 1024, data, + CHUNK_ALLOC_NO_FORCE); WARN_ON(num_bytes < root->sectorsize); ret = find_free_extent(trans, root, num_bytes, empty_size, @@ -5405,7 +5473,7 @@ again: num_bytes = num_bytes & ~(root->sectorsize - 1); num_bytes = max(num_bytes, min_alloc_size); do_chunk_alloc(trans, root->fs_info->extent_root, - num_bytes, data, 1); + num_bytes, data, CHUNK_ALLOC_FORCE); goto again; } if (ret == -ENOSPC && btrfs_test_opt(root, ENOSPC_DEBUG)) { @@ -7991,6 +8059,10 @@ static noinline int relocate_one_extent(struct btrfs_root *extent_root, u64 group_start = group->key.objectid; new_extents = kmalloc(sizeof(*new_extents), GFP_NOFS); + if (!new_extents) { + ret = -ENOMEM; + goto out; + } nr_extents = 1; ret = get_new_locations(reloc_inode, extent_key, @@ -8109,13 +8181,15 @@ int btrfs_set_block_group_ro(struct btrfs_root *root, alloc_flags = update_block_group_flags(root, cache->flags); if (alloc_flags != cache->flags) - do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1); + do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, + CHUNK_ALLOC_FORCE); ret = set_block_group_ro(cache); if (!ret) goto out; alloc_flags = get_alloc_profile(root, cache->space_info->flags); - ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1); + ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, + CHUNK_ALLOC_FORCE); if (ret < 0) goto out; ret = set_block_group_ro(cache); @@ -8128,7 +8202,8 @@ int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 type) { u64 alloc_flags = get_alloc_profile(root, type); - return do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1); + return do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, + CHUNK_ALLOC_FORCE); } /* diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 20ddb28602a..ba41da59e31 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -690,6 +690,15 @@ static void cache_state(struct extent_state *state, } } +static void uncache_state(struct extent_state **cached_ptr) +{ + if (cached_ptr && (*cached_ptr)) { + struct extent_state *state = *cached_ptr; + *cached_ptr = NULL; + free_extent_state(state); + } +} + /* * set some bits on a range in the tree. This may require allocations or * sleeping, so the gfp mask is used to indicate what is allowed. 
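
The cache_state()/uncache_state() pair above lets the read-completion paths hand the extent_state found at lock time straight to unlock_extent_cached(), avoiding a second tree search; uncache_state() covers the paths that bail out before unlocking. Below is a toy userspace model of that reference discipline, under the assumption that every reference taken by cache_state() must be put exactly once; all names shadow the kernel ones for illustration only.

    /*
     * Toy model of the cached extent_state handoff (cc -o cache cache.c).
     * Contract: cache_state() takes a reference owned by *cached_ptr, and
     * exactly one of unlock_extent_cached()/uncache_state() must put it.
     */
    #include <assert.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct extent_state { int refs; };

    static void free_extent_state(struct extent_state *state)
    {
    	if (--state->refs == 0)
    		free(state);
    }

    static void cache_state(struct extent_state *state,
    			struct extent_state **cached_ptr)
    {
    	if (cached_ptr && !*cached_ptr) {
    		state->refs++;		/* reference now owned by *cached_ptr */
    		*cached_ptr = state;
    	}
    }

    static void uncache_state(struct extent_state **cached_ptr)
    {
    	if (cached_ptr && *cached_ptr) {
    		struct extent_state *state = *cached_ptr;
    		*cached_ptr = NULL;
    		free_extent_state(state);	/* drop the cached reference */
    	}
    }

    int main(void)
    {
    	struct extent_state *state = calloc(1, sizeof(*state));
    	struct extent_state *cached = NULL;

    	state->refs = 1;		/* the tree's own reference */
    	cache_state(state, &cached);	/* as set_extent_uptodate() now does */
    	assert(state->refs == 2);

    	/* error path that never reaches unlock_extent_cached() */
    	uncache_state(&cached);
    	assert(state->refs == 1 && cached == NULL);

    	free_extent_state(state);	/* tree side drops the last reference */
    	printf("no leak\n");
    	return 0;
    }
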
@@ -940,10 +949,10 @@ static int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end, } int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, - gfp_t mask) + struct extent_state **cached_state, gfp_t mask) { - return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL, - NULL, mask); + return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, + NULL, cached_state, mask); } static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, @@ -1012,8 +1021,7 @@ int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end, mask); } -int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, - gfp_t mask) +int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) { return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL, mask); @@ -1735,6 +1743,9 @@ static void end_bio_extent_readpage(struct bio *bio, int err) do { struct page *page = bvec->bv_page; + struct extent_state *cached = NULL; + struct extent_state *state; + tree = &BTRFS_I(page->mapping->host)->io_tree; start = ((u64)page->index << PAGE_CACHE_SHIFT) + @@ -1749,9 +1760,20 @@ static void end_bio_extent_readpage(struct bio *bio, int err) if (++bvec <= bvec_end) prefetchw(&bvec->bv_page->flags); + spin_lock(&tree->lock); + state = find_first_extent_bit_state(tree, start, EXTENT_LOCKED); + if (state && state->start == start) { + /* + * take a reference on the state, unlock will drop + * the ref + */ + cache_state(state, &cached); + } + spin_unlock(&tree->lock); + if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) { ret = tree->ops->readpage_end_io_hook(page, start, end, - NULL); + state); if (ret) uptodate = 0; } @@ -1764,15 +1786,16 @@ static void end_bio_extent_readpage(struct bio *bio, int err) test_bit(BIO_UPTODATE, &bio->bi_flags); if (err) uptodate = 0; + uncache_state(&cached); continue; } } if (uptodate) { - set_extent_uptodate(tree, start, end, + set_extent_uptodate(tree, start, end, &cached, GFP_ATOMIC); } - unlock_extent(tree, start, end, GFP_ATOMIC); + unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC); if (whole_page) { if (uptodate) { @@ -1811,6 +1834,7 @@ static void end_bio_extent_preparewrite(struct bio *bio, int err) do { struct page *page = bvec->bv_page; + struct extent_state *cached = NULL; tree = &BTRFS_I(page->mapping->host)->io_tree; start = ((u64)page->index << PAGE_CACHE_SHIFT) + @@ -1821,13 +1845,14 @@ static void end_bio_extent_preparewrite(struct bio *bio, int err) prefetchw(&bvec->bv_page->flags); if (uptodate) { - set_extent_uptodate(tree, start, end, GFP_ATOMIC); + set_extent_uptodate(tree, start, end, &cached, + GFP_ATOMIC); } else { ClearPageUptodate(page); SetPageError(page); } - unlock_extent(tree, start, end, GFP_ATOMIC); + unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC); } while (bvec >= bio->bi_io_vec); @@ -2016,14 +2041,17 @@ static int __extent_read_full_page(struct extent_io_tree *tree, while (cur <= end) { if (cur >= last_byte) { char *userpage; + struct extent_state *cached = NULL; + iosize = PAGE_CACHE_SIZE - page_offset; userpage = kmap_atomic(page, KM_USER0); memset(userpage + page_offset, 0, iosize); flush_dcache_page(page); kunmap_atomic(userpage, KM_USER0); set_extent_uptodate(tree, cur, cur + iosize - 1, - GFP_NOFS); - unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); + &cached, GFP_NOFS); + unlock_extent_cached(tree, cur, cur + iosize - 1, + &cached, GFP_NOFS); break; } em = get_extent(inode, page, page_offset, cur, @@ -2063,14 +2091,17 @@ static int 
__extent_read_full_page(struct extent_io_tree *tree, /* we've found a hole, just zero and go on */ if (block_start == EXTENT_MAP_HOLE) { char *userpage; + struct extent_state *cached = NULL; + userpage = kmap_atomic(page, KM_USER0); memset(userpage + page_offset, 0, iosize); flush_dcache_page(page); kunmap_atomic(userpage, KM_USER0); set_extent_uptodate(tree, cur, cur + iosize - 1, - GFP_NOFS); - unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); + &cached, GFP_NOFS); + unlock_extent_cached(tree, cur, cur + iosize - 1, + &cached, GFP_NOFS); cur = cur + iosize; page_offset += iosize; continue; @@ -2650,7 +2681,7 @@ int extent_readpages(struct extent_io_tree *tree, prefetchw(&page->flags); list_del(&page->lru); if (!add_to_page_cache_lru(page, mapping, - page->index, GFP_KERNEL)) { + page->index, GFP_NOFS)) { __extent_read_full_page(tree, page, get_extent, &bio, 0, &bio_flags); } @@ -2789,9 +2820,12 @@ int extent_prepare_write(struct extent_io_tree *tree, iocount++; block_start = block_start + iosize; } else { - set_extent_uptodate(tree, block_start, cur_end, + struct extent_state *cached = NULL; + + set_extent_uptodate(tree, block_start, cur_end, &cached, GFP_NOFS); - unlock_extent(tree, block_start, cur_end, GFP_NOFS); + unlock_extent_cached(tree, block_start, cur_end, + &cached, GFP_NOFS); block_start = cur_end + 1; } page_offset = block_start & (PAGE_CACHE_SIZE - 1); @@ -3457,7 +3491,7 @@ int set_extent_buffer_uptodate(struct extent_io_tree *tree, num_pages = num_extent_pages(eb->start, eb->len); set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, - GFP_NOFS); + NULL, GFP_NOFS); for (i = 0; i < num_pages; i++) { page = extent_buffer_page(eb, i); if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) || @@ -3885,6 +3919,12 @@ static void move_pages(struct page *dst_page, struct page *src_page, kunmap_atomic(dst_kaddr, KM_USER0); } +static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len) +{ + unsigned long distance = (src > dst) ? 
src - dst : dst - src; + return distance < len; +} + static void copy_pages(struct page *dst_page, struct page *src_page, unsigned long dst_off, unsigned long src_off, unsigned long len) @@ -3892,10 +3932,12 @@ static void copy_pages(struct page *dst_page, struct page *src_page, char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); char *src_kaddr; - if (dst_page != src_page) + if (dst_page != src_page) { src_kaddr = kmap_atomic(src_page, KM_USER1); - else + } else { src_kaddr = dst_kaddr; + BUG_ON(areas_overlap(src_off, dst_off, len)); + } memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); kunmap_atomic(dst_kaddr, KM_USER0); @@ -3970,7 +4012,7 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, "len %lu len %lu\n", dst_offset, len, dst->len); BUG_ON(1); } - if (dst_offset < src_offset) { + if (!areas_overlap(src_offset, dst_offset, len)) { memcpy_extent_buffer(dst, dst_offset, src_offset, len); return; } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index f62c5442835..af2d7179c37 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -208,7 +208,7 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits, int exclusive_bits, u64 *failed_start, struct extent_state **cached_state, gfp_t mask); int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, - gfp_t mask); + struct extent_state **cached_state, gfp_t mask); int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index e621ea54a3f..75899a01dde 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -104,7 +104,7 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, /* * unlocks pages after btrfs_file_write is done with them */ -static noinline void btrfs_drop_pages(struct page **pages, size_t num_pages) +void btrfs_drop_pages(struct page **pages, size_t num_pages) { size_t i; for (i = 0; i < num_pages; i++) { @@ -127,16 +127,13 @@ static noinline void btrfs_drop_pages(struct page **pages, size_t num_pages) * this also makes the decision about creating an inline extent vs * doing real data extents, marking pages dirty and delalloc as required. 
*/ -static noinline int dirty_and_release_pages(struct btrfs_root *root, - struct file *file, - struct page **pages, - size_t num_pages, - loff_t pos, - size_t write_bytes) +int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode, + struct page **pages, size_t num_pages, + loff_t pos, size_t write_bytes, + struct extent_state **cached) { int err = 0; int i; - struct inode *inode = fdentry(file)->d_inode; u64 num_bytes; u64 start_pos; u64 end_of_last_block; @@ -149,7 +146,7 @@ static noinline int dirty_and_release_pages(struct btrfs_root *root, end_of_last_block = start_pos + num_bytes - 1; err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, - NULL); + cached); if (err) return err; @@ -992,9 +989,9 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, } if (copied > 0) { - ret = dirty_and_release_pages(root, file, pages, - dirty_pages, pos, - copied); + ret = btrfs_dirty_pages(root, inode, pages, + dirty_pages, pos, copied, + NULL); if (ret) { btrfs_delalloc_release_space(inode, dirty_pages << PAGE_CACHE_SHIFT); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index f561c953205..63731a1fb0a 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -508,6 +508,7 @@ int btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode; struct rb_node *node; struct list_head *pos, *n; + struct page **pages; struct page *page; struct extent_state *cached_state = NULL; struct btrfs_free_cluster *cluster = NULL; @@ -517,13 +518,13 @@ int btrfs_write_out_cache(struct btrfs_root *root, u64 start, end, len; u64 bytes = 0; u32 *crc, *checksums; - pgoff_t index = 0, last_index = 0; unsigned long first_page_offset; - int num_checksums; + int index = 0, num_pages = 0; int entries = 0; int bitmaps = 0; int ret = 0; bool next_page = false; + bool out_of_space = false; root = root->fs_info->tree_root; @@ -551,24 +552,31 @@ int btrfs_write_out_cache(struct btrfs_root *root, return 0; } - last_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT; + num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; filemap_write_and_wait(inode->i_mapping); btrfs_wait_ordered_range(inode, inode->i_size & ~(root->sectorsize - 1), (u64)-1); /* We need a checksum per page. */ - num_checksums = i_size_read(inode) / PAGE_CACHE_SIZE; - crc = checksums = kzalloc(sizeof(u32) * num_checksums, GFP_NOFS); + crc = checksums = kzalloc(sizeof(u32) * num_pages, GFP_NOFS); if (!crc) { iput(inode); return 0; } + pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS); + if (!pages) { + kfree(crc); + iput(inode); + return 0; + } + /* Since the first page has all of our checksums and our generation we * need to calculate the offset into the page that we can start writing * our entries. */ - first_page_offset = (sizeof(u32) * num_checksums) + sizeof(u64); + first_page_offset = (sizeof(u32) * num_pages) + sizeof(u64); /* Get the cluster for this block_group if it exists */ if (!list_empty(&block_group->cluster_list)) @@ -590,20 +598,18 @@ int btrfs_write_out_cache(struct btrfs_root *root, * after find_get_page at this point. Just putting this here so people * know and don't freak out. 
*/ - while (index <= last_index) { + while (index < num_pages) { page = grab_cache_page(inode->i_mapping, index); if (!page) { - pgoff_t i = 0; + int i; - while (i < index) { - page = find_get_page(inode->i_mapping, i); - unlock_page(page); - page_cache_release(page); - page_cache_release(page); - i++; + for (i = 0; i < num_pages; i++) { + unlock_page(pages[i]); + page_cache_release(pages[i]); } goto out_free; } + pages[index] = page; index++; } @@ -631,7 +637,12 @@ int btrfs_write_out_cache(struct btrfs_root *root, offset = start_offset; } - page = find_get_page(inode->i_mapping, index); + if (index >= num_pages) { + out_of_space = true; + break; + } + + page = pages[index]; addr = kmap(page); entry = addr + start_offset; @@ -708,23 +719,6 @@ int btrfs_write_out_cache(struct btrfs_root *root, bytes += PAGE_CACHE_SIZE; - ClearPageChecked(page); - set_page_extent_mapped(page); - SetPageUptodate(page); - set_page_dirty(page); - - /* - * We need to release our reference we got for grab_cache_page, - * except for the first page which will hold our checksums, we - * do that below. - */ - if (index != 0) { - unlock_page(page); - page_cache_release(page); - } - - page_cache_release(page); - index++; } while (node || next_page); @@ -734,7 +728,11 @@ int btrfs_write_out_cache(struct btrfs_root *root, struct btrfs_free_space *entry = list_entry(pos, struct btrfs_free_space, list); - page = find_get_page(inode->i_mapping, index); + if (index >= num_pages) { + out_of_space = true; + break; + } + page = pages[index]; addr = kmap(page); memcpy(addr, entry->bitmap, PAGE_CACHE_SIZE); @@ -745,64 +743,58 @@ int btrfs_write_out_cache(struct btrfs_root *root, crc++; bytes += PAGE_CACHE_SIZE; - ClearPageChecked(page); - set_page_extent_mapped(page); - SetPageUptodate(page); - set_page_dirty(page); - unlock_page(page); - page_cache_release(page); - page_cache_release(page); list_del_init(&entry->list); index++; } + if (out_of_space) { + btrfs_drop_pages(pages, num_pages); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, + i_size_read(inode) - 1, &cached_state, + GFP_NOFS); + ret = 0; + goto out_free; + } + /* Zero out the rest of the pages just to make sure */ - while (index <= last_index) { + while (index < num_pages) { void *addr; - page = find_get_page(inode->i_mapping, index); - + page = pages[index]; addr = kmap(page); memset(addr, 0, PAGE_CACHE_SIZE); kunmap(page); - ClearPageChecked(page); - set_page_extent_mapped(page); - SetPageUptodate(page); - set_page_dirty(page); - unlock_page(page); - page_cache_release(page); - page_cache_release(page); bytes += PAGE_CACHE_SIZE; index++; } - btrfs_set_extent_delalloc(inode, 0, bytes - 1, &cached_state); - /* Write the checksums and trans id to the first page */ { void *addr; u64 *gen; - page = find_get_page(inode->i_mapping, 0); + page = pages[0]; addr = kmap(page); - memcpy(addr, checksums, sizeof(u32) * num_checksums); - gen = addr + (sizeof(u32) * num_checksums); + memcpy(addr, checksums, sizeof(u32) * num_pages); + gen = addr + (sizeof(u32) * num_pages); *gen = trans->transid; kunmap(page); - ClearPageChecked(page); - set_page_extent_mapped(page); - SetPageUptodate(page); - set_page_dirty(page); - unlock_page(page); - page_cache_release(page); - page_cache_release(page); } - BTRFS_I(inode)->generation = trans->transid; + ret = btrfs_dirty_pages(root, inode, pages, num_pages, 0, + bytes, &cached_state); + btrfs_drop_pages(pages, num_pages); unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, &cached_state, GFP_NOFS); + if (ret) { + ret 
= 0; + goto out_free; + } + + BTRFS_I(inode)->generation = trans->transid; + filemap_write_and_wait(inode->i_mapping); key.objectid = BTRFS_FREE_SPACE_OBJECTID; @@ -853,6 +845,7 @@ out_free: BTRFS_I(inode)->generation = 0; } kfree(checksums); + kfree(pages); btrfs_update_inode(trans, root, inode); iput(inode); return ret; @@ -1775,10 +1768,13 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group) while ((node = rb_last(&block_group->free_space_offset)) != NULL) { info = rb_entry(node, struct btrfs_free_space, offset_index); - unlink_free_space(block_group, info); - if (info->bitmap) - kfree(info->bitmap); - kmem_cache_free(btrfs_free_space_cachep, info); + if (!info->bitmap) { + unlink_free_space(block_group, info); + kmem_cache_free(btrfs_free_space_cachep, info); + } else { + free_bitmap(block_group, info); + } + if (need_resched()) { spin_unlock(&block_group->tree_lock); cond_resched(); @@ -2308,7 +2304,7 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, start = entry->offset; bytes = min(entry->bytes, end - start); unlink_free_space(block_group, entry); - kfree(entry); + kmem_cache_free(btrfs_free_space_cachep, entry); } spin_unlock(&block_group->tree_lock); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5cc64ab9c48..7cd8ab0ef04 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -954,6 +954,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, 1, 0, NULL, GFP_NOFS); while (start < end) { async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS); + BUG_ON(!async_cow); async_cow->inode = inode; async_cow->root = root; async_cow->locked_page = locked_page; @@ -1770,9 +1771,12 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) add_pending_csums(trans, inode, ordered_extent->file_offset, &ordered_extent->list); - btrfs_ordered_update_i_size(inode, 0, ordered_extent); - ret = btrfs_update_inode(trans, root, inode); - BUG_ON(ret); + ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); + if (!ret) { + ret = btrfs_update_inode(trans, root, inode); + BUG_ON(ret); + } + ret = 0; out: if (nolock) { if (trans) @@ -2590,6 +2594,13 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, struct btrfs_inode_item *item, struct inode *inode) { + if (!leaf->map_token) + map_private_extent_buffer(leaf, (unsigned long)item, + sizeof(struct btrfs_inode_item), + &leaf->map_token, &leaf->kaddr, + &leaf->map_start, &leaf->map_len, + KM_USER1); + btrfs_set_inode_uid(leaf, item, inode->i_uid); btrfs_set_inode_gid(leaf, item, inode->i_gid); btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size); @@ -2618,6 +2629,11 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, btrfs_set_inode_rdev(leaf, item, inode->i_rdev); btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags); btrfs_set_inode_block_group(leaf, item, BTRFS_I(inode)->block_group); + + if (leaf->map_token) { + unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); + leaf->map_token = NULL; + } } /* @@ -4207,10 +4223,8 @@ static int btrfs_real_readdir(struct file *filp, void *dirent, struct btrfs_key found_key; struct btrfs_path *path; int ret; - u32 nritems; struct extent_buffer *leaf; int slot; - int advance; unsigned char d_type; int over = 0; u32 di_cur; @@ -4253,27 +4267,19 @@ static int btrfs_real_readdir(struct file *filp, void *dirent, ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto err; - advance = 0; while (1) { leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); 
slot = path->slots[0]; - if (advance || slot >= nritems) { - if (slot >= nritems - 1) { - ret = btrfs_next_leaf(root, path); - if (ret) - break; - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - slot = path->slots[0]; - } else { - slot++; - path->slots[0]++; - } + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto err; + else if (ret > 0) + break; + continue; } - advance = 1; item = btrfs_item_nr(leaf, slot); btrfs_item_key_to_cpu(leaf, &found_key, slot); @@ -4282,7 +4288,7 @@ static int btrfs_real_readdir(struct file *filp, void *dirent, if (btrfs_key_type(&found_key) != key_type) break; if (found_key.offset < filp->f_pos) - continue; + goto next; filp->f_pos = found_key.offset; @@ -4335,6 +4341,8 @@ skip: di_cur += di_len; di = (struct btrfs_dir_item *)((char *)di + di_len); } +next: + path->slots[0]++; } /* Reached end of directory/root. Bump pos past the last item. */ @@ -4527,14 +4535,17 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, BUG_ON(!path); inode = new_inode(root->fs_info->sb); - if (!inode) + if (!inode) { + btrfs_free_path(path); return ERR_PTR(-ENOMEM); + } if (dir) { trace_btrfs_inode_request(dir); ret = btrfs_set_inode_index(dir, index); if (ret) { + btrfs_free_path(path); iput(inode); return ERR_PTR(ret); } @@ -4721,9 +4732,10 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, dentry->d_name.len, dir->i_ino, objectid, BTRFS_I(dir)->block_group, mode, &index); - err = PTR_ERR(inode); - if (IS_ERR(inode)) + if (IS_ERR(inode)) { + err = PTR_ERR(inode); goto out_unlock; + } err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); if (err) { @@ -4782,9 +4794,10 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, dentry->d_name.len, dir->i_ino, objectid, BTRFS_I(dir)->block_group, mode, &index); - err = PTR_ERR(inode); - if (IS_ERR(inode)) + if (IS_ERR(inode)) { + err = PTR_ERR(inode); goto out_unlock; + } err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); if (err) { @@ -4834,9 +4847,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, if (inode->i_nlink == ~0U) return -EMLINK; - btrfs_inc_nlink(inode); - inode->i_ctime = CURRENT_TIME; - err = btrfs_set_inode_index(dir, &index); if (err) goto fail; @@ -4852,6 +4862,9 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, goto fail; } + btrfs_inc_nlink(inode); + inode->i_ctime = CURRENT_TIME; + btrfs_set_trans_block_group(trans, dir); ihold(inode); @@ -4989,6 +5002,8 @@ static noinline int uncompress_inline(struct btrfs_path *path, inline_size = btrfs_file_extent_inline_item_len(leaf, btrfs_item_nr(leaf, path->slots[0])); tmp = kmalloc(inline_size, GFP_NOFS); + if (!tmp) + return -ENOMEM; ptr = btrfs_file_extent_inline_start(item); read_extent_buffer(leaf, tmp, ptr, inline_size); @@ -5221,7 +5236,7 @@ again: btrfs_mark_buffer_dirty(leaf); } set_extent_uptodate(io_tree, em->start, - extent_map_end(em) - 1, GFP_NOFS); + extent_map_end(em) - 1, NULL, GFP_NOFS); goto insert; } else { printk(KERN_ERR "btrfs unknown found_type %d\n", found_type); @@ -5428,17 +5443,30 @@ out: } static struct extent_map *btrfs_new_extent_direct(struct inode *inode, + struct extent_map *em, u64 start, u64 len) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; - struct extent_map *em; struct extent_map_tree 
*em_tree = &BTRFS_I(inode)->extent_tree; struct btrfs_key ins; u64 alloc_hint; int ret; + bool insert = false; - btrfs_drop_extent_cache(inode, start, start + len - 1, 0); + /* + * Ok if the extent map we looked up is a hole and is for the exact + * range we want, there is no reason to allocate a new one, however if + * it is not right then we need to free this one and drop the cache for + * our range. + */ + if (em->block_start != EXTENT_MAP_HOLE || em->start != start || + em->len != len) { + free_extent_map(em); + em = NULL; + insert = true; + btrfs_drop_extent_cache(inode, start, start + len - 1, 0); + } trans = btrfs_join_transaction(root, 0); if (IS_ERR(trans)) @@ -5454,10 +5482,12 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode, goto out; } - em = alloc_extent_map(GFP_NOFS); if (!em) { - em = ERR_PTR(-ENOMEM); - goto out; + em = alloc_extent_map(GFP_NOFS); + if (!em) { + em = ERR_PTR(-ENOMEM); + goto out; + } } em->start = start; @@ -5467,9 +5497,15 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode, em->block_start = ins.objectid; em->block_len = ins.offset; em->bdev = root->fs_info->fs_devices->latest_bdev; + + /* + * We need to do this because if we're using the original em we searched + * for, we could have EXTENT_FLAG_VACANCY set, and we don't want that. + */ + em->flags = 0; set_bit(EXTENT_FLAG_PINNED, &em->flags); - while (1) { + while (insert) { write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em); write_unlock(&em_tree->lock); @@ -5687,8 +5723,7 @@ must_cow: * it above */ len = bh_result->b_size; - free_extent_map(em); - em = btrfs_new_extent_direct(inode, start, len); + em = btrfs_new_extent_direct(inode, em, start, len); if (IS_ERR(em)) return PTR_ERR(em); len = min(len, em->len - (start - em->start)); @@ -5851,8 +5886,10 @@ again: } add_pending_csums(trans, inode, ordered->file_offset, &ordered->list); - btrfs_ordered_update_i_size(inode, 0, ordered); - btrfs_update_inode(trans, root, inode); + ret = btrfs_ordered_update_i_size(inode, 0, ordered); + if (!ret) + btrfs_update_inode(trans, root, inode); + ret = 0; out_unlock: unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset, ordered->file_offset + ordered->len - 1, @@ -5938,7 +5975,7 @@ static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev, static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, int rw, u64 file_offset, int skip_sum, - u32 *csums) + u32 *csums, int async_submit) { int write = rw & REQ_WRITE; struct btrfs_root *root = BTRFS_I(inode)->root; @@ -5949,13 +5986,24 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, if (ret) goto err; - if (write && !skip_sum) { + if (skip_sum) + goto map; + + if (write && async_submit) { ret = btrfs_wq_submit_bio(root->fs_info, inode, rw, bio, 0, 0, file_offset, __btrfs_submit_bio_start_direct_io, __btrfs_submit_bio_done); goto err; + } else if (write) { + /* + * If we aren't doing async submit, calculate the csum of the + * bio now. 
+ */ + ret = btrfs_csum_one_bio(root, inode, bio, file_offset, 1); + if (ret) + goto err; } else if (!skip_sum) { ret = btrfs_lookup_bio_sums_dio(root, inode, bio, file_offset, csums); @@ -5963,7 +6011,8 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, goto err; } - ret = btrfs_map_bio(root, rw, bio, 0, 1); +map: + ret = btrfs_map_bio(root, rw, bio, 0, async_submit); err: bio_put(bio); return ret; @@ -5985,23 +6034,30 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, int nr_pages = 0; u32 *csums = dip->csums; int ret = 0; + int async_submit = 0; int write = rw & REQ_WRITE; - bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS); - if (!bio) - return -ENOMEM; - bio->bi_private = dip; - bio->bi_end_io = btrfs_end_dio_bio; - atomic_inc(&dip->pending_bios); - map_length = orig_bio->bi_size; ret = btrfs_map_block(map_tree, READ, start_sector << 9, &map_length, NULL, 0); if (ret) { - bio_put(bio); + bio_put(orig_bio); return -EIO; } + if (map_length >= orig_bio->bi_size) { + bio = orig_bio; + goto submit; + } + + async_submit = 1; + bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS); + if (!bio) + return -ENOMEM; + bio->bi_private = dip; + bio->bi_end_io = btrfs_end_dio_bio; + atomic_inc(&dip->pending_bios); + while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) { if (unlikely(map_length < submit_len + bvec->bv_len || bio_add_page(bio, bvec->bv_page, bvec->bv_len, @@ -6015,7 +6071,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, atomic_inc(&dip->pending_bios); ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum, - csums); + csums, async_submit); if (ret) { bio_put(bio); atomic_dec(&dip->pending_bios); @@ -6052,8 +6108,9 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, } } +submit: ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum, - csums); + csums, async_submit); if (!ret) return 0; @@ -6148,6 +6205,7 @@ static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *io unsigned long nr_segs) { int seg; + int i; size_t size; unsigned long addr; unsigned blocksize_mask = root->sectorsize - 1; @@ -6162,8 +6220,22 @@ static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *io addr = (unsigned long)iov[seg].iov_base; size = iov[seg].iov_len; end += size; - if ((addr & blocksize_mask) || (size & blocksize_mask)) + if ((addr & blocksize_mask) || (size & blocksize_mask)) goto out; + + /* If this is a write we don't need to check anymore */ + if (rw & WRITE) + continue; + + /* + * Check to make sure we don't have duplicate iov_base's in this + * iovec, if so return EINVAL, otherwise we'll get csum errors + * when reading back. 
+ */ + for (i = seg + 1; i < nr_segs; i++) { + if (iov[seg].iov_base == iov[i].iov_base) + goto out; + } } retval = 0; out: @@ -7206,9 +7278,10 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, dentry->d_name.len, dir->i_ino, objectid, BTRFS_I(dir)->block_group, S_IFLNK|S_IRWXUGO, &index); - err = PTR_ERR(inode); - if (IS_ERR(inode)) + if (IS_ERR(inode)) { + err = PTR_ERR(inode); goto out_unlock; + } err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); if (err) { diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index cfc264fefdb..ffb48d6c543 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2287,7 +2287,7 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg) struct btrfs_ioctl_space_info space; struct btrfs_ioctl_space_info *dest; struct btrfs_ioctl_space_info *dest_orig; - struct btrfs_ioctl_space_info *user_dest; + struct btrfs_ioctl_space_info __user *user_dest; struct btrfs_space_info *info; u64 types[] = {BTRFS_BLOCK_GROUP_DATA, BTRFS_BLOCK_GROUP_SYSTEM, diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 58e7de9cc90..0ac712efcdf 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -159,7 +159,7 @@ enum { Opt_compress_type, Opt_compress_force, Opt_compress_force_type, Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, - Opt_enospc_debug, Opt_err, + Opt_enospc_debug, Opt_subvolrootid, Opt_err, }; static match_table_t tokens = { @@ -189,6 +189,7 @@ static match_table_t tokens = { {Opt_clear_cache, "clear_cache"}, {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"}, {Opt_enospc_debug, "enospc_debug"}, + {Opt_subvolrootid, "subvolrootid=%d"}, {Opt_err, NULL}, }; @@ -232,6 +233,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) break; case Opt_subvol: case Opt_subvolid: + case Opt_subvolrootid: case Opt_device: /* * These are parsed by btrfs_parse_early_options @@ -388,7 +390,7 @@ out: */ static int btrfs_parse_early_options(const char *options, fmode_t flags, void *holder, char **subvol_name, u64 *subvol_objectid, - struct btrfs_fs_devices **fs_devices) + u64 *subvol_rootid, struct btrfs_fs_devices **fs_devices) { substring_t args[MAX_OPT_ARGS]; char *opts, *orig, *p; @@ -429,6 +431,18 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags, *subvol_objectid = intarg; } break; + case Opt_subvolrootid: + intarg = 0; + error = match_int(&args[0], &intarg); + if (!error) { + /* we want the original fs_tree */ + if (!intarg) + *subvol_rootid = + BTRFS_FS_TREE_OBJECTID; + else + *subvol_rootid = intarg; + } + break; case Opt_device: error = btrfs_scan_one_device(match_strdup(&args[0]), flags, holder, fs_devices); @@ -736,6 +750,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, fmode_t mode = FMODE_READ; char *subvol_name = NULL; u64 subvol_objectid = 0; + u64 subvol_rootid = 0; int error = 0; if (!(flags & MS_RDONLY)) @@ -743,7 +758,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, error = btrfs_parse_early_options(data, mode, fs_type, &subvol_name, &subvol_objectid, - &fs_devices); + &subvol_rootid, &fs_devices); if (error) return ERR_PTR(error); @@ -807,15 +822,17 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, s->s_flags |= MS_ACTIVE; } - root = get_default_root(s, subvol_objectid); - if (IS_ERR(root)) { - error = PTR_ERR(root); - deactivate_locked_super(s); - goto error_free_subvol_name; - } /* if they gave us a 
subvolume name bind mount into that */ if (strcmp(subvol_name, ".")) { struct dentry *new_root; + + root = get_default_root(s, subvol_rootid); + if (IS_ERR(root)) { + error = PTR_ERR(root); + deactivate_locked_super(s); + goto error_free_subvol_name; + } + mutex_lock(&root->d_inode->i_mutex); new_root = lookup_one_len(subvol_name, root, strlen(subvol_name)); @@ -836,6 +853,13 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, } dput(root); root = new_root; + } else { + root = get_default_root(s, subvol_objectid); + if (IS_ERR(root)) { + error = PTR_ERR(root); + deactivate_locked_super(s); + goto error_free_subvol_name; + } } kfree(subvol_name); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 5b158da7e0b..c571734d5e5 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -32,10 +32,8 @@ static noinline void put_transaction(struct btrfs_transaction *transaction) { - WARN_ON(transaction->use_count == 0); - transaction->use_count--; - if (transaction->use_count == 0) { - list_del_init(&transaction->list); + WARN_ON(atomic_read(&transaction->use_count) == 0); + if (atomic_dec_and_test(&transaction->use_count)) { memset(transaction, 0, sizeof(*transaction)); kmem_cache_free(btrfs_transaction_cachep, transaction); } @@ -60,14 +58,14 @@ static noinline int join_transaction(struct btrfs_root *root) if (!cur_trans) return -ENOMEM; root->fs_info->generation++; - cur_trans->num_writers = 1; + atomic_set(&cur_trans->num_writers, 1); cur_trans->num_joined = 0; cur_trans->transid = root->fs_info->generation; init_waitqueue_head(&cur_trans->writer_wait); init_waitqueue_head(&cur_trans->commit_wait); cur_trans->in_commit = 0; cur_trans->blocked = 0; - cur_trans->use_count = 1; + atomic_set(&cur_trans->use_count, 1); cur_trans->commit_done = 0; cur_trans->start_time = get_seconds(); @@ -88,7 +86,7 @@ static noinline int join_transaction(struct btrfs_root *root) root->fs_info->running_transaction = cur_trans; spin_unlock(&root->fs_info->new_trans_lock); } else { - cur_trans->num_writers++; + atomic_inc(&cur_trans->num_writers); cur_trans->num_joined++; } @@ -145,7 +143,7 @@ static void wait_current_trans(struct btrfs_root *root) cur_trans = root->fs_info->running_transaction; if (cur_trans && cur_trans->blocked) { DEFINE_WAIT(wait); - cur_trans->use_count++; + atomic_inc(&cur_trans->use_count); while (1) { prepare_to_wait(&root->fs_info->transaction_wait, &wait, TASK_UNINTERRUPTIBLE); @@ -181,6 +179,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, { struct btrfs_trans_handle *h; struct btrfs_transaction *cur_trans; + int retries = 0; int ret; if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) @@ -204,7 +203,7 @@ again: } cur_trans = root->fs_info->running_transaction; - cur_trans->use_count++; + atomic_inc(&cur_trans->use_count); if (type != TRANS_JOIN_NOLOCK) mutex_unlock(&root->fs_info->trans_mutex); @@ -224,10 +223,18 @@ again: if (num_items > 0) { ret = btrfs_trans_reserve_metadata(h, root, num_items); - if (ret == -EAGAIN) { + if (ret == -EAGAIN && !retries) { + retries++; btrfs_commit_transaction(h, root); goto again; + } else if (ret == -EAGAIN) { + /* + * We have already retried and got EAGAIN, so really we + * don't have space, so set ret to -ENOSPC. 
+ */ + ret = -ENOSPC; } + if (ret < 0) { btrfs_end_transaction(h, root); return ERR_PTR(ret); @@ -327,7 +334,7 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid) goto out_unlock; /* nothing committing|committed */ } - cur_trans->use_count++; + atomic_inc(&cur_trans->use_count); mutex_unlock(&root->fs_info->trans_mutex); wait_for_commit(root, cur_trans); @@ -457,18 +464,14 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, wake_up_process(info->transaction_kthread); } - if (lock) - mutex_lock(&info->trans_mutex); WARN_ON(cur_trans != info->running_transaction); - WARN_ON(cur_trans->num_writers < 1); - cur_trans->num_writers--; + WARN_ON(atomic_read(&cur_trans->num_writers) < 1); + atomic_dec(&cur_trans->num_writers); smp_mb(); if (waitqueue_active(&cur_trans->writer_wait)) wake_up(&cur_trans->writer_wait); put_transaction(cur_trans); - if (lock) - mutex_unlock(&info->trans_mutex); if (current->journal_info == trans) current->journal_info = NULL; @@ -1178,7 +1181,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, /* take transaction reference */ mutex_lock(&root->fs_info->trans_mutex); cur_trans = trans->transaction; - cur_trans->use_count++; + atomic_inc(&cur_trans->use_count); mutex_unlock(&root->fs_info->trans_mutex); btrfs_end_transaction(trans, root); @@ -1237,7 +1240,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, mutex_lock(&root->fs_info->trans_mutex); if (cur_trans->in_commit) { - cur_trans->use_count++; + atomic_inc(&cur_trans->use_count); mutex_unlock(&root->fs_info->trans_mutex); btrfs_end_transaction(trans, root); @@ -1259,7 +1262,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, prev_trans = list_entry(cur_trans->list.prev, struct btrfs_transaction, list); if (!prev_trans->commit_done) { - prev_trans->use_count++; + atomic_inc(&prev_trans->use_count); mutex_unlock(&root->fs_info->trans_mutex); wait_for_commit(root, prev_trans); @@ -1300,14 +1303,14 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, TASK_UNINTERRUPTIBLE); smp_mb(); - if (cur_trans->num_writers > 1) + if (atomic_read(&cur_trans->num_writers) > 1) schedule_timeout(MAX_SCHEDULE_TIMEOUT); else if (should_grow) schedule_timeout(1); mutex_lock(&root->fs_info->trans_mutex); finish_wait(&cur_trans->writer_wait, &wait); - } while (cur_trans->num_writers > 1 || + } while (atomic_read(&cur_trans->num_writers) > 1 || (should_grow && cur_trans->num_joined != joined)); ret = create_pending_snapshots(trans, root->fs_info); @@ -1394,6 +1397,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, wake_up(&cur_trans->commit_wait); + list_del_init(&cur_trans->list); put_transaction(cur_trans); put_transaction(cur_trans); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 229a594cacd..e441acc6c58 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -27,11 +27,11 @@ struct btrfs_transaction { * total writers in this transaction, it must be zero before the * transaction can end */ - unsigned long num_writers; + atomic_t num_writers; unsigned long num_joined; int in_commit; - int use_count; + atomic_t use_count; int commit_done; int blocked; struct list_head list; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index c50271ad315..f997ec0c1ba 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2209,8 +2209,10 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, log = root->log_root; path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; + if 
(!path) { + err = -ENOMEM; + goto out_unlock; + } di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino, name, name_len, -1); @@ -2271,6 +2273,7 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, } fail: btrfs_free_path(path); +out_unlock: mutex_unlock(&BTRFS_I(dir)->log_mutex); if (ret == -ENOSPC) { root->fs_info->last_trans_log_full_commit = trans->transid; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 309a57b9fc8..c7367ae5a3e 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -155,6 +155,15 @@ static noinline int run_scheduled_bios(struct btrfs_device *device) unsigned long limit; unsigned long last_waited = 0; int force_reg = 0; + struct blk_plug plug; + + /* + * this function runs all the bios we've collected for + * a particular device. We don't want to wander off to + * another device without first sending all of these down. + * So, setup a plug here and finish it off before we return + */ + blk_start_plug(&plug); bdi = blk_get_backing_dev_info(device->bdev); fs_info = device->dev_root->fs_info; @@ -294,6 +303,7 @@ loop_lock: spin_unlock(&device->io_lock); done: + blk_finish_plug(&plug); return 0; } diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index a5303b871b1..cfd660550de 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -180,11 +180,10 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_dir_item *di; - int ret = 0, slot, advance; + int ret = 0, slot; size_t total_size = 0, size_left = size; unsigned long name_ptr; size_t name_len; - u32 nritems; /* * ok we want all objects associated with this id. @@ -204,34 +203,24 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto err; - advance = 0; + while (1) { leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); slot = path->slots[0]; /* this is where we start walking through the path */ - if (advance || slot >= nritems) { + if (slot >= btrfs_header_nritems(leaf)) { /* * if we've reached the last slot in this leaf we need * to go to the next leaf and reset everything */ - if (slot >= nritems-1) { - ret = btrfs_next_leaf(root, path); - if (ret) - break; - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - slot = path->slots[0]; - } else { - /* - * just walking through the slots on this leaf - */ - slot++; - path->slots[0]++; - } + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto err; + else if (ret > 0) + break; + continue; } - advance = 1; btrfs_item_key_to_cpu(leaf, &found_key, slot); @@ -250,7 +239,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) /* we are just looking for how big our buffer needs to be */ if (!size) - continue; + goto next; if (!buffer || (name_len + 1) > size_left) { ret = -ERANGE; @@ -263,6 +252,8 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) size_left -= name_len + 1; buffer += name_len + 1; +next: + path->slots[0]++; } ret = total_size; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index db9d55b507d..4bc862a80ef 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -807,8 +807,7 @@ static int cifs_parse_mount_options(char *options, const char *devname, struct smb_vol *vol) { - char *value; - char *data; + char *value, *data, *end; unsigned int temp_len, i, j; char separator[2]; short int override_uid = -1; @@ -851,6 +850,7 @@ cifs_parse_mount_options(char *options, const char 
*devname, if (!options) return 1; + end = options + strlen(options); if (strncmp(options, "sep=", 4) == 0) { if (options[4] != 0) { separator[0] = options[4]; @@ -916,6 +916,7 @@ cifs_parse_mount_options(char *options, const char *devname, the only illegal character in a password is null */ if ((value[temp_len] == 0) && + (value + temp_len < end) && (value[temp_len+1] == separator[0])) { /* reinsert comma */ value[temp_len] = separator[0]; diff --git a/fs/dcache.c b/fs/dcache.c index 129a3573099..22a0ef41bad 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -99,12 +99,9 @@ static struct kmem_cache *dentry_cache __read_mostly; static unsigned int d_hash_mask __read_mostly; static unsigned int d_hash_shift __read_mostly; -struct dcache_hash_bucket { - struct hlist_bl_head head; -}; -static struct dcache_hash_bucket *dentry_hashtable __read_mostly; +static struct hlist_bl_head *dentry_hashtable __read_mostly; -static inline struct dcache_hash_bucket *d_hash(struct dentry *parent, +static inline struct hlist_bl_head *d_hash(struct dentry *parent, unsigned long hash) { hash += ((unsigned long) parent ^ GOLDEN_RATIO_PRIME) / L1_CACHE_BYTES; @@ -112,16 +109,6 @@ static inline struct dcache_hash_bucket *d_hash(struct dentry *parent, return dentry_hashtable + (hash & D_HASHMASK); } -static inline void spin_lock_bucket(struct dcache_hash_bucket *b) -{ - bit_spin_lock(0, (unsigned long *)&b->head.first); -} - -static inline void spin_unlock_bucket(struct dcache_hash_bucket *b) -{ - __bit_spin_unlock(0, (unsigned long *)&b->head.first); -} - /* Statistics gathering. */ struct dentry_stat_t dentry_stat = { .age_limit = 45, @@ -167,8 +154,8 @@ static void d_free(struct dentry *dentry) if (dentry->d_op && dentry->d_op->d_release) dentry->d_op->d_release(dentry); - /* if dentry was never inserted into hash, immediate free is OK */ - if (hlist_bl_unhashed(&dentry->d_hash)) + /* if dentry was never visible to RCU, immediate free is OK */ + if (!(dentry->d_flags & DCACHE_RCUACCESS)) __d_free(&dentry->d_u.d_rcu); else call_rcu(&dentry->d_u.d_rcu, __d_free); @@ -330,28 +317,19 @@ static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent) */ void __d_drop(struct dentry *dentry) { - if (!(dentry->d_flags & DCACHE_UNHASHED)) { - if (unlikely(dentry->d_flags & DCACHE_DISCONNECTED)) { - bit_spin_lock(0, - (unsigned long *)&dentry->d_sb->s_anon.first); - dentry->d_flags |= DCACHE_UNHASHED; - hlist_bl_del_init(&dentry->d_hash); - __bit_spin_unlock(0, - (unsigned long *)&dentry->d_sb->s_anon.first); - } else { - struct dcache_hash_bucket *b; + if (!d_unhashed(dentry)) { + struct hlist_bl_head *b; + if (unlikely(dentry->d_flags & DCACHE_DISCONNECTED)) + b = &dentry->d_sb->s_anon; + else b = d_hash(dentry->d_parent, dentry->d_name.hash); - spin_lock_bucket(b); - /* - * We may not actually need to put DCACHE_UNHASHED - * manipulations under the hash lock, but follow - * the principle of least surprise. 
- */ - dentry->d_flags |= DCACHE_UNHASHED; - hlist_bl_del_rcu(&dentry->d_hash); - spin_unlock_bucket(b); - dentry_rcuwalk_barrier(dentry); - } + + hlist_bl_lock(b); + __hlist_bl_del(&dentry->d_hash); + dentry->d_hash.pprev = NULL; + hlist_bl_unlock(b); + + dentry_rcuwalk_barrier(dentry); } } EXPORT_SYMBOL(__d_drop); @@ -1304,7 +1282,7 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name) dname[name->len] = 0; dentry->d_count = 1; - dentry->d_flags = DCACHE_UNHASHED; + dentry->d_flags = 0; spin_lock_init(&dentry->d_lock); seqcount_init(&dentry->d_seq); dentry->d_inode = NULL; @@ -1606,10 +1584,9 @@ struct dentry *d_obtain_alias(struct inode *inode) tmp->d_inode = inode; tmp->d_flags |= DCACHE_DISCONNECTED; list_add(&tmp->d_alias, &inode->i_dentry); - bit_spin_lock(0, (unsigned long *)&tmp->d_sb->s_anon.first); - tmp->d_flags &= ~DCACHE_UNHASHED; + hlist_bl_lock(&tmp->d_sb->s_anon); hlist_bl_add_head(&tmp->d_hash, &tmp->d_sb->s_anon); - __bit_spin_unlock(0, (unsigned long *)&tmp->d_sb->s_anon.first); + hlist_bl_unlock(&tmp->d_sb->s_anon); spin_unlock(&tmp->d_lock); spin_unlock(&inode->i_lock); security_d_instantiate(tmp, inode); @@ -1789,7 +1766,7 @@ struct dentry *__d_lookup_rcu(struct dentry *parent, struct qstr *name, unsigned int len = name->len; unsigned int hash = name->hash; const unsigned char *str = name->name; - struct dcache_hash_bucket *b = d_hash(parent, hash); + struct hlist_bl_head *b = d_hash(parent, hash); struct hlist_bl_node *node; struct dentry *dentry; @@ -1813,7 +1790,7 @@ struct dentry *__d_lookup_rcu(struct dentry *parent, struct qstr *name, * * See Documentation/filesystems/path-lookup.txt for more details. */ - hlist_bl_for_each_entry_rcu(dentry, node, &b->head, d_hash) { + hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) { struct inode *i; const char *tname; int tlen; @@ -1908,7 +1885,7 @@ struct dentry *__d_lookup(struct dentry *parent, struct qstr *name) unsigned int len = name->len; unsigned int hash = name->hash; const unsigned char *str = name->name; - struct dcache_hash_bucket *b = d_hash(parent, hash); + struct hlist_bl_head *b = d_hash(parent, hash); struct hlist_bl_node *node; struct dentry *found = NULL; struct dentry *dentry; @@ -1935,7 +1912,7 @@ struct dentry *__d_lookup(struct dentry *parent, struct qstr *name) */ rcu_read_lock(); - hlist_bl_for_each_entry_rcu(dentry, node, &b->head, d_hash) { + hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) { const char *tname; int tlen; @@ -2086,13 +2063,13 @@ again: } EXPORT_SYMBOL(d_delete); -static void __d_rehash(struct dentry * entry, struct dcache_hash_bucket *b) +static void __d_rehash(struct dentry * entry, struct hlist_bl_head *b) { BUG_ON(!d_unhashed(entry)); - spin_lock_bucket(b); - entry->d_flags &= ~DCACHE_UNHASHED; - hlist_bl_add_head_rcu(&entry->d_hash, &b->head); - spin_unlock_bucket(b); + hlist_bl_lock(b); + entry->d_flags |= DCACHE_RCUACCESS; + hlist_bl_add_head_rcu(&entry->d_hash, b); + hlist_bl_unlock(b); } static void _d_rehash(struct dentry * entry) @@ -3025,7 +3002,7 @@ static void __init dcache_init_early(void) dentry_hashtable = alloc_large_system_hash("Dentry cache", - sizeof(struct dcache_hash_bucket), + sizeof(struct hlist_bl_head), dhash_entries, 13, HASH_EARLY, @@ -3034,7 +3011,7 @@ static void __init dcache_init_early(void) 0); for (loop = 0; loop < (1 << d_hash_shift); loop++) - INIT_HLIST_BL_HEAD(&dentry_hashtable[loop].head); + INIT_HLIST_BL_HEAD(dentry_hashtable + loop); } static void __init dcache_init(void) @@ -3057,7 +3034,7 @@ static void __init 
dcache_init(void) dentry_hashtable = alloc_large_system_hash("Dentry cache", - sizeof(struct dcache_hash_bucket), + sizeof(struct hlist_bl_head), dhash_entries, 13, 0, @@ -3066,7 +3043,7 @@ static void __init dcache_init(void) 0); for (loop = 0; loop < (1 << d_hash_shift); loop++) - INIT_HLIST_BL_HEAD(&dentry_hashtable[loop].head); + INIT_HLIST_BL_HEAD(dentry_hashtable + loop); } /* SLAB cache for __getname() consumers */ diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index d2a70a4561f..b8d5c809102 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -1452,6 +1452,25 @@ static void set_default_header_data(struct ecryptfs_crypt_stat *crypt_stat) crypt_stat->metadata_size = ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE; } +void ecryptfs_i_size_init(const char *page_virt, struct inode *inode) +{ + struct ecryptfs_mount_crypt_stat *mount_crypt_stat; + struct ecryptfs_crypt_stat *crypt_stat; + u64 file_size; + + crypt_stat = &ecryptfs_inode_to_private(inode)->crypt_stat; + mount_crypt_stat = + &ecryptfs_superblock_to_private(inode->i_sb)->mount_crypt_stat; + if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) { + file_size = i_size_read(ecryptfs_inode_to_lower(inode)); + if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) + file_size += crypt_stat->metadata_size; + } else + file_size = get_unaligned_be64(page_virt); + i_size_write(inode, (loff_t)file_size); + crypt_stat->flags |= ECRYPTFS_I_SIZE_INITIALIZED; +} + /** * ecryptfs_read_headers_virt * @page_virt: The virtual address into which to read the headers @@ -1482,6 +1501,8 @@ static int ecryptfs_read_headers_virt(char *page_virt, rc = -EINVAL; goto out; } + if (!(crypt_stat->flags & ECRYPTFS_I_SIZE_INITIALIZED)) + ecryptfs_i_size_init(page_virt, ecryptfs_dentry->d_inode); offset += MAGIC_ECRYPTFS_MARKER_SIZE_BYTES; rc = ecryptfs_process_flags(crypt_stat, (page_virt + offset), &bytes_read); diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index bd3cafd0949..e70282775e2 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -269,6 +269,7 @@ struct ecryptfs_crypt_stat { #define ECRYPTFS_ENCFN_USE_MOUNT_FNEK 0x00000800 #define ECRYPTFS_ENCFN_USE_FEK 0x00001000 #define ECRYPTFS_UNLINK_SIGS 0x00002000 +#define ECRYPTFS_I_SIZE_INITIALIZED 0x00004000 u32 flags; unsigned int file_version; size_t iv_bytes; @@ -295,6 +296,8 @@ struct ecryptfs_crypt_stat { struct ecryptfs_inode_info { struct inode vfs_inode; struct inode *wii_inode; + struct mutex lower_file_mutex; + atomic_t lower_file_count; struct file *lower_file; struct ecryptfs_crypt_stat crypt_stat; }; @@ -626,6 +629,7 @@ struct ecryptfs_open_req { int ecryptfs_interpose(struct dentry *hidden_dentry, struct dentry *this_dentry, struct super_block *sb, u32 flags); +void ecryptfs_i_size_init(const char *page_virt, struct inode *inode); int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry, struct dentry *lower_dentry, struct inode *ecryptfs_dir_inode); @@ -757,7 +761,8 @@ int ecryptfs_privileged_open(struct file **lower_file, struct dentry *lower_dentry, struct vfsmount *lower_mnt, const struct cred *cred); -int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry); +int ecryptfs_get_lower_file(struct dentry *ecryptfs_dentry); +void ecryptfs_put_lower_file(struct inode *inode); int ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, size_t *packet_size, diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index cedc913d11b..566e5472f78 100644 --- a/fs/ecryptfs/file.c +++ 
b/fs/ecryptfs/file.c @@ -191,10 +191,10 @@ static int ecryptfs_open(struct inode *inode, struct file *file) | ECRYPTFS_ENCRYPTED); } mutex_unlock(&crypt_stat->cs_mutex); - rc = ecryptfs_init_persistent_file(ecryptfs_dentry); + rc = ecryptfs_get_lower_file(ecryptfs_dentry); if (rc) { printk(KERN_ERR "%s: Error attempting to initialize " - "the persistent file for the dentry with name " + "the lower file for the dentry with name " "[%s]; rc = [%d]\n", __func__, ecryptfs_dentry->d_name.name, rc); goto out_free; @@ -202,9 +202,9 @@ static int ecryptfs_open(struct inode *inode, struct file *file) if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_ACCMODE) == O_RDONLY && (file->f_flags & O_ACCMODE) != O_RDONLY) { rc = -EPERM; - printk(KERN_WARNING "%s: Lower persistent file is RO; eCryptfs " + printk(KERN_WARNING "%s: Lower file is RO; eCryptfs " "file must hence be opened RO\n", __func__); - goto out_free; + goto out_put; } ecryptfs_set_file_lower( file, ecryptfs_inode_to_private(inode)->lower_file); @@ -232,10 +232,11 @@ static int ecryptfs_open(struct inode *inode, struct file *file) "Plaintext passthrough mode is not " "enabled; returning -EIO\n"); mutex_unlock(&crypt_stat->cs_mutex); - goto out_free; + goto out_put; } rc = 0; - crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED); + crypt_stat->flags &= ~(ECRYPTFS_I_SIZE_INITIALIZED + | ECRYPTFS_ENCRYPTED); mutex_unlock(&crypt_stat->cs_mutex); goto out; } @@ -245,6 +246,8 @@ static int ecryptfs_open(struct inode *inode, struct file *file) "[0x%.16lx] size: [0x%.16llx]\n", inode, inode->i_ino, (unsigned long long)i_size_read(inode)); goto out; +out_put: + ecryptfs_put_lower_file(inode); out_free: kmem_cache_free(ecryptfs_file_info_cache, ecryptfs_file_to_private(file)); @@ -254,17 +257,13 @@ out: static int ecryptfs_flush(struct file *file, fl_owner_t td) { - int rc = 0; - struct file *lower_file = NULL; - - lower_file = ecryptfs_file_to_lower(file); - if (lower_file->f_op && lower_file->f_op->flush) - rc = lower_file->f_op->flush(lower_file, td); - return rc; + return file->f_mode & FMODE_WRITE + ? 
filemap_write_and_wait(file->f_mapping) : 0; } static int ecryptfs_release(struct inode *inode, struct file *file) { + ecryptfs_put_lower_file(inode); kmem_cache_free(ecryptfs_file_info_cache, ecryptfs_file_to_private(file)); return 0; diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index f99051b7ada..4d4cc6a90cd 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -168,19 +168,18 @@ static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry) "context; rc = [%d]\n", rc); goto out; } - rc = ecryptfs_init_persistent_file(ecryptfs_dentry); + rc = ecryptfs_get_lower_file(ecryptfs_dentry); if (rc) { printk(KERN_ERR "%s: Error attempting to initialize " - "the persistent file for the dentry with name " + "the lower file for the dentry with name " "[%s]; rc = [%d]\n", __func__, ecryptfs_dentry->d_name.name, rc); goto out; } rc = ecryptfs_write_metadata(ecryptfs_dentry); - if (rc) { + if (rc) printk(KERN_ERR "Error writing headers; rc = [%d]\n", rc); - goto out; - } + ecryptfs_put_lower_file(ecryptfs_dentry->d_inode); out: return rc; } @@ -226,11 +225,9 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry, struct dentry *lower_dir_dentry; struct vfsmount *lower_mnt; struct inode *lower_inode; - struct ecryptfs_mount_crypt_stat *mount_crypt_stat; struct ecryptfs_crypt_stat *crypt_stat; char *page_virt = NULL; - u64 file_size; - int rc = 0; + int put_lower = 0, rc = 0; lower_dir_dentry = lower_dentry->d_parent; lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt( @@ -277,14 +274,15 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry, rc = -ENOMEM; goto out; } - rc = ecryptfs_init_persistent_file(ecryptfs_dentry); + rc = ecryptfs_get_lower_file(ecryptfs_dentry); if (rc) { printk(KERN_ERR "%s: Error attempting to initialize " - "the persistent file for the dentry with name " + "the lower file for the dentry with name " "[%s]; rc = [%d]\n", __func__, ecryptfs_dentry->d_name.name, rc); goto out_free_kmem; } + put_lower = 1; crypt_stat = &ecryptfs_inode_to_private( ecryptfs_dentry->d_inode)->crypt_stat; /* TODO: lock for crypt_stat comparison */ @@ -302,18 +300,7 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry, } crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR; } - mount_crypt_stat = &ecryptfs_superblock_to_private( - ecryptfs_dentry->d_sb)->mount_crypt_stat; - if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) { - if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) - file_size = (crypt_stat->metadata_size - + i_size_read(lower_dentry->d_inode)); - else - file_size = i_size_read(lower_dentry->d_inode); - } else { - file_size = get_unaligned_be64(page_virt); - } - i_size_write(ecryptfs_dentry->d_inode, (loff_t)file_size); + ecryptfs_i_size_init(page_virt, ecryptfs_dentry->d_inode); out_free_kmem: kmem_cache_free(ecryptfs_header_cache_2, page_virt); goto out; @@ -322,6 +309,8 @@ out_put: mntput(lower_mnt); d_drop(ecryptfs_dentry); out: + if (put_lower) + ecryptfs_put_lower_file(ecryptfs_dentry->d_inode); return rc; } @@ -538,8 +527,6 @@ static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry) dget(lower_dentry); rc = vfs_rmdir(lower_dir_dentry->d_inode, lower_dentry); dput(lower_dentry); - if (!rc) - d_delete(lower_dentry); fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode); dir->i_nlink = lower_dir_dentry->d_inode->i_nlink; unlock_dir(lower_dir_dentry); @@ -610,8 +597,8 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry, fsstack_copy_attr_all(old_dir, 
lower_old_dir_dentry->d_inode); out_lock: unlock_rename(lower_old_dir_dentry, lower_new_dir_dentry); - dput(lower_new_dentry->d_parent); - dput(lower_old_dentry->d_parent); + dput(lower_new_dir_dentry); + dput(lower_old_dir_dentry); dput(lower_new_dentry); dput(lower_old_dentry); return rc; @@ -759,8 +746,11 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia, if (unlikely((ia->ia_size == i_size))) { lower_ia->ia_valid &= ~ATTR_SIZE; - goto out; + return 0; } + rc = ecryptfs_get_lower_file(dentry); + if (rc) + return rc; crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat; /* Switch on growing or shrinking file */ if (ia->ia_size > i_size) { @@ -838,6 +828,7 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia, lower_ia->ia_valid &= ~ATTR_SIZE; } out: + ecryptfs_put_lower_file(inode); return rc; } @@ -913,7 +904,13 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia) mount_crypt_stat = &ecryptfs_superblock_to_private( dentry->d_sb)->mount_crypt_stat; + rc = ecryptfs_get_lower_file(dentry); + if (rc) { + mutex_unlock(&crypt_stat->cs_mutex); + goto out; + } rc = ecryptfs_read_metadata(dentry); + ecryptfs_put_lower_file(inode); if (rc) { if (!(mount_crypt_stat->flags & ECRYPTFS_PLAINTEXT_PASSTHROUGH_ENABLED)) { @@ -927,10 +924,17 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia) goto out; } rc = 0; - crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED); + crypt_stat->flags &= ~(ECRYPTFS_I_SIZE_INITIALIZED + | ECRYPTFS_ENCRYPTED); } } mutex_unlock(&crypt_stat->cs_mutex); + if (S_ISREG(inode->i_mode)) { + rc = filemap_write_and_wait(inode->i_mapping); + if (rc) + goto out; + fsstack_copy_attr_all(inode, lower_inode); + } memcpy(&lower_ia, ia, sizeof(lower_ia)); if (ia->ia_valid & ATTR_FILE) lower_ia.ia_file = ecryptfs_file_to_lower(ia->ia_file); diff --git a/fs/ecryptfs/kthread.c b/fs/ecryptfs/kthread.c index 0851ab6980f..69f994a7d52 100644 --- a/fs/ecryptfs/kthread.c +++ b/fs/ecryptfs/kthread.c @@ -44,7 +44,7 @@ static struct task_struct *ecryptfs_kthread; * @ignored: ignored * * The eCryptfs kernel thread that has the responsibility of getting - * the lower persistent file with RW permissions. + * the lower file with RW permissions. * * Returns zero on success; non-zero otherwise */ @@ -141,8 +141,8 @@ int ecryptfs_privileged_open(struct file **lower_file, int rc = 0; /* Corresponding dput() and mntput() are done when the - * persistent file is fput() when the eCryptfs inode is - * destroyed. */ + * lower file is fput() when all eCryptfs files for the inode are + * released. */ dget(lower_dentry); mntget(lower_mnt); flags |= IS_RDONLY(lower_dentry->d_inode) ? O_RDONLY : O_RDWR; diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index fdb2eb0ad09..89b93389af8 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -96,7 +96,7 @@ void __ecryptfs_printk(const char *fmt, ...) } /** - * ecryptfs_init_persistent_file + * ecryptfs_init_lower_file * @ecryptfs_dentry: Fully initialized eCryptfs dentry object, with * the lower dentry and the lower mount set * @@ -104,42 +104,70 @@ void __ecryptfs_printk(const char *fmt, ...) * inode. All I/O operations to the lower inode occur through that * file. When the first eCryptfs dentry that interposes with the first * lower dentry for that inode is created, this function creates the - * persistent file struct and associates it with the eCryptfs - * inode. When the eCryptfs inode is destroyed, the file is closed. 
+ * lower file struct and associates it with the eCryptfs + * inode. When all eCryptfs files associated with the inode are released, the + * file is closed. * - * The persistent file will be opened with read/write permissions, if + * The lower file will be opened with read/write permissions, if * possible. Otherwise, it is opened read-only. * - * This function does nothing if a lower persistent file is already + * This function does nothing if a lower file is already * associated with the eCryptfs inode. * * Returns zero on success; non-zero otherwise */ -int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry) +static int ecryptfs_init_lower_file(struct dentry *dentry, + struct file **lower_file) { const struct cred *cred = current_cred(); - struct ecryptfs_inode_info *inode_info = - ecryptfs_inode_to_private(ecryptfs_dentry->d_inode); - int rc = 0; + struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); + struct vfsmount *lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry); + int rc; - if (!inode_info->lower_file) { - struct dentry *lower_dentry; - struct vfsmount *lower_mnt = - ecryptfs_dentry_to_lower_mnt(ecryptfs_dentry); + rc = ecryptfs_privileged_open(lower_file, lower_dentry, lower_mnt, + cred); + if (rc) { + printk(KERN_ERR "Error opening lower file " + "for lower_dentry [0x%p] and lower_mnt [0x%p]; " + "rc = [%d]\n", lower_dentry, lower_mnt, rc); + (*lower_file) = NULL; + } + return rc; +} - lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry); - rc = ecryptfs_privileged_open(&inode_info->lower_file, - lower_dentry, lower_mnt, cred); - if (rc) { - printk(KERN_ERR "Error opening lower persistent file " - "for lower_dentry [0x%p] and lower_mnt [0x%p]; " - "rc = [%d]\n", lower_dentry, lower_mnt, rc); - inode_info->lower_file = NULL; - } +int ecryptfs_get_lower_file(struct dentry *dentry) +{ + struct ecryptfs_inode_info *inode_info = + ecryptfs_inode_to_private(dentry->d_inode); + int count, rc = 0; + + mutex_lock(&inode_info->lower_file_mutex); + count = atomic_inc_return(&inode_info->lower_file_count); + if (WARN_ON_ONCE(count < 1)) + rc = -EINVAL; + else if (count == 1) { + rc = ecryptfs_init_lower_file(dentry, + &inode_info->lower_file); + if (rc) + atomic_set(&inode_info->lower_file_count, 0); } + mutex_unlock(&inode_info->lower_file_mutex); return rc; } +void ecryptfs_put_lower_file(struct inode *inode) +{ + struct ecryptfs_inode_info *inode_info; + + inode_info = ecryptfs_inode_to_private(inode); + if (atomic_dec_and_mutex_lock(&inode_info->lower_file_count, + &inode_info->lower_file_mutex)) { + fput(inode_info->lower_file); + inode_info->lower_file = NULL; + mutex_unlock(&inode_info->lower_file_mutex); + } +} + static struct inode *ecryptfs_get_inode(struct inode *lower_inode, struct super_block *sb) { diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c index bacc882e1ae..245b517bf1b 100644 --- a/fs/ecryptfs/super.c +++ b/fs/ecryptfs/super.c @@ -55,6 +55,8 @@ static struct inode *ecryptfs_alloc_inode(struct super_block *sb) if (unlikely(!inode_info)) goto out; ecryptfs_init_crypt_stat(&inode_info->crypt_stat); + mutex_init(&inode_info->lower_file_mutex); + atomic_set(&inode_info->lower_file_count, 0); inode_info->lower_file = NULL; inode = &inode_info->vfs_inode; out: @@ -77,8 +79,7 @@ static void ecryptfs_i_callback(struct rcu_head *head) * * This is used during the final destruction of the inode. All * allocation of memory related to the inode, including allocated - * memory in the crypt_stat struct, will be released here. 
This - * function also fput()'s the persistent file for the lower inode. + * memory in the crypt_stat struct, will be released here. * There should be no chance that this deallocation will be missed. */ static void ecryptfs_destroy_inode(struct inode *inode) @@ -86,16 +87,7 @@ static void ecryptfs_destroy_inode(struct inode *inode) struct ecryptfs_inode_info *inode_info; inode_info = ecryptfs_inode_to_private(inode); - if (inode_info->lower_file) { - struct dentry *lower_dentry = - inode_info->lower_file->f_dentry; - - BUG_ON(!lower_dentry); - if (lower_dentry->d_inode) { - fput(inode_info->lower_file); - inode_info->lower_file = NULL; - } - } + BUG_ON(inode_info->lower_file); ecryptfs_destroy_crypt_stat(&inode_info->crypt_stat); call_rcu(&inode->i_rcu, ecryptfs_i_callback); } diff --git a/fs/file.c b/fs/file.c index 0be344755c0..4c6992d8f3b 100644 --- a/fs/file.c +++ b/fs/file.c @@ -9,6 +9,7 @@ #include <linux/module.h> #include <linux/fs.h> #include <linux/mm.h> +#include <linux/mmzone.h> #include <linux/time.h> #include <linux/sched.h> #include <linux/slab.h> @@ -39,14 +40,17 @@ int sysctl_nr_open_max = 1024 * 1024; /* raised later */ */ static DEFINE_PER_CPU(struct fdtable_defer, fdtable_defer_list); -static inline void *alloc_fdmem(unsigned int size) +static void *alloc_fdmem(unsigned int size) { - void *data; - - data = kmalloc(size, GFP_KERNEL|__GFP_NOWARN); - if (data != NULL) - return data; - + /* + * Very large allocations can stress page reclaim, so fall back to + * vmalloc() if the allocation size will be considered "large" by the VM. + */ + if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { + void *data = kmalloc(size, GFP_KERNEL|__GFP_NOWARN); + if (data != NULL) + return data; + } return vmalloc(size); } diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index c71995b111b..0f5c4f9d5d6 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -884,8 +884,8 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping, } brelse(dibh); - gfs2_trans_end(sdp); failed: + gfs2_trans_end(sdp); if (al) { gfs2_inplace_release(ip); gfs2_quota_unlock(ip); diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 5c356d09c32..f789c5732b7 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -1506,7 +1506,7 @@ struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name) inode = gfs2_inode_lookup(dir->i_sb, be16_to_cpu(dent->de_type), be64_to_cpu(dent->de_inum.no_addr), - be64_to_cpu(dent->de_inum.no_formal_ino)); + be64_to_cpu(dent->de_inum.no_formal_ino), 0); brelse(bh); return inode; } diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index b2682e073ee..e48310885c4 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -617,18 +617,51 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov, return generic_file_aio_write(iocb, iov, nr_segs, pos); } -static void empty_write_end(struct page *page, unsigned from, - unsigned to) +static int empty_write_end(struct page *page, unsigned from, + unsigned to, int mode) { - struct gfs2_inode *ip = GFS2_I(page->mapping->host); + struct inode *inode = page->mapping->host; + struct gfs2_inode *ip = GFS2_I(inode); + struct buffer_head *bh; + unsigned offset, blksize = 1 << inode->i_blkbits; + pgoff_t end_index = i_size_read(inode) >> PAGE_CACHE_SHIFT; zero_user(page, from, to-from); mark_page_accessed(page); - if (!gfs2_is_writeback(ip)) - gfs2_page_add_databufs(ip, page, from, to); + if (page->index < end_index || !(mode & FALLOC_FL_KEEP_SIZE)) { + if (!gfs2_is_writeback(ip)) + gfs2_page_add_databufs(ip, page, from, to); + + 
block_commit_write(page, from, to); + return 0; + } + + offset = 0; + bh = page_buffers(page); + while (offset < to) { + if (offset >= from) { + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + clear_buffer_new(bh); + write_dirty_buffer(bh, WRITE); + } + offset += blksize; + bh = bh->b_this_page; + } - block_commit_write(page, from, to); + offset = 0; + bh = page_buffers(page); + while (offset < to) { + if (offset >= from) { + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) + return -EIO; + } + offset += blksize; + bh = bh->b_this_page; + } + return 0; } static int needs_empty_write(sector_t block, struct inode *inode) @@ -643,7 +676,8 @@ static int needs_empty_write(sector_t block, struct inode *inode) return !buffer_mapped(&bh_map); } -static int write_empty_blocks(struct page *page, unsigned from, unsigned to) +static int write_empty_blocks(struct page *page, unsigned from, unsigned to, + int mode) { struct inode *inode = page->mapping->host; unsigned start, end, next, blksize; @@ -668,7 +702,9 @@ static int write_empty_blocks(struct page *page, unsigned from, unsigned to) gfs2_block_map); if (unlikely(ret)) return ret; - empty_write_end(page, start, end); + ret = empty_write_end(page, start, end, mode); + if (unlikely(ret)) + return ret; end = 0; } start = next; @@ -682,7 +718,9 @@ static int write_empty_blocks(struct page *page, unsigned from, unsigned to) ret = __block_write_begin(page, start, end - start, gfs2_block_map); if (unlikely(ret)) return ret; - empty_write_end(page, start, end); + ret = empty_write_end(page, start, end, mode); + if (unlikely(ret)) + return ret; } return 0; @@ -731,7 +769,7 @@ static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len, if (curr == end) to = end_offset; - error = write_empty_blocks(page, from, to); + error = write_empty_blocks(page, from, to, mode); if (!error && offset + to > inode->i_size && !(mode & FALLOC_FL_KEEP_SIZE)) { i_size_write(inode, offset + to); diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index f07643e21bf..7a4fb630a32 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -93,14 +93,12 @@ static unsigned int gl_hash(const struct gfs2_sbd *sdp, static inline void spin_lock_bucket(unsigned int hash) { - struct hlist_bl_head *bl = &gl_hash_table[hash]; - bit_spin_lock(0, (unsigned long *)bl); + hlist_bl_lock(&gl_hash_table[hash]); } static inline void spin_unlock_bucket(unsigned int hash) { - struct hlist_bl_head *bl = &gl_hash_table[hash]; - __bit_spin_unlock(0, (unsigned long *)bl); + hlist_bl_unlock(&gl_hash_table[hash]); } static void gfs2_glock_dealloc(struct rcu_head *rcu) diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 3754e3cbf02..25eeb2bcee4 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -385,6 +385,10 @@ static int trans_go_demote_ok(const struct gfs2_glock *gl) static void iopen_go_callback(struct gfs2_glock *gl) { struct gfs2_inode *ip = (struct gfs2_inode *)gl->gl_object; + struct gfs2_sbd *sdp = gl->gl_sbd; + + if (sdp->sd_vfs->s_flags & MS_RDONLY) + return; if (gl->gl_demote_state == LM_ST_UNLOCKED && gl->gl_state == LM_ST_SHARED && ip) { diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 97d54a28776..9134dcb8947 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -40,37 +40,61 @@ struct gfs2_inum_range_host { u64 ir_length; }; +struct gfs2_skip_data { + u64 no_addr; + int skipped; + int non_block; +}; + static int iget_test(struct inode *inode, void *opaque) { struct gfs2_inode *ip = GFS2_I(inode); - u64 *no_addr = opaque; + struct gfs2_skip_data *data = opaque; - if 
(ip->i_no_addr == *no_addr) + if (ip->i_no_addr == data->no_addr) { + if (data->non_block && + inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)) { + data->skipped = 1; + return 0; + } return 1; - + } return 0; } static int iget_set(struct inode *inode, void *opaque) { struct gfs2_inode *ip = GFS2_I(inode); - u64 *no_addr = opaque; + struct gfs2_skip_data *data = opaque; - inode->i_ino = (unsigned long)*no_addr; - ip->i_no_addr = *no_addr; + if (data->skipped) + return -ENOENT; + inode->i_ino = (unsigned long)(data->no_addr); + ip->i_no_addr = data->no_addr; return 0; } struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr) { unsigned long hash = (unsigned long)no_addr; - return ilookup5(sb, hash, iget_test, &no_addr); + struct gfs2_skip_data data; + + data.no_addr = no_addr; + data.skipped = 0; + data.non_block = 0; + return ilookup5(sb, hash, iget_test, &data); } -static struct inode *gfs2_iget(struct super_block *sb, u64 no_addr) +static struct inode *gfs2_iget(struct super_block *sb, u64 no_addr, + int non_block) { + struct gfs2_skip_data data; unsigned long hash = (unsigned long)no_addr; - return iget5_locked(sb, hash, iget_test, iget_set, &no_addr); + + data.no_addr = no_addr; + data.skipped = 0; + data.non_block = non_block; + return iget5_locked(sb, hash, iget_test, iget_set, &data); } /** @@ -111,19 +135,20 @@ static void gfs2_set_iop(struct inode *inode) * @sb: The super block * @no_addr: The inode number * @type: The type of the inode + * non_block: Can we block on inodes that are being freed? * * Returns: A VFS inode, or an error */ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, - u64 no_addr, u64 no_formal_ino) + u64 no_addr, u64 no_formal_ino, int non_block) { struct inode *inode; struct gfs2_inode *ip; struct gfs2_glock *io_gl = NULL; int error; - inode = gfs2_iget(sb, no_addr); + inode = gfs2_iget(sb, no_addr, non_block); ip = GFS2_I(inode); if (!inode) @@ -185,11 +210,12 @@ struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr, { struct super_block *sb = sdp->sd_vfs; struct gfs2_holder i_gh; - struct inode *inode; + struct inode *inode = NULL; int error; + /* Must not read in block until block type is verified */ error = gfs2_glock_nq_num(sdp, no_addr, &gfs2_inode_glops, - LM_ST_SHARED, LM_FLAG_ANY, &i_gh); + LM_ST_EXCLUSIVE, GL_SKIP, &i_gh); if (error) return ERR_PTR(error); @@ -197,7 +223,7 @@ struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr, if (error) goto fail; - inode = gfs2_inode_lookup(sb, DT_UNKNOWN, no_addr, 0); + inode = gfs2_inode_lookup(sb, DT_UNKNOWN, no_addr, 0, 1); if (IS_ERR(inode)) goto fail; @@ -843,7 +869,7 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name, goto fail_gunlock2; inode = gfs2_inode_lookup(dir->i_sb, IF2DT(mode), inum.no_addr, - inum.no_formal_ino); + inum.no_formal_ino, 0); if (IS_ERR(inode)) goto fail_gunlock2; diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index 3e00a66e7cb..099ca305e51 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -97,7 +97,8 @@ err: } extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, - u64 no_addr, u64 no_formal_ino); + u64 no_addr, u64 no_formal_ino, + int non_block); extern struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr, u64 *no_formal_ino, unsigned int blktype); diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 42ef24355af..d3c69eb91c7 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -430,7 +430,7 @@ static int gfs2_lookup_root(struct 
super_block *sb, struct dentry **dptr, struct dentry *dentry; struct inode *inode; - inode = gfs2_inode_lookup(sb, DT_DIR, no_addr, 0); + inode = gfs2_inode_lookup(sb, DT_DIR, no_addr, 0, 0); if (IS_ERR(inode)) { fs_err(sdp, "can't read in %s inode: %ld\n", name, PTR_ERR(inode)); return PTR_ERR(inode); diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index cf930cd9664..6fcae8469f6 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -945,7 +945,7 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip /* rgblk_search can return a block < goal, so we need to keep it marching forward. */ no_addr = block + rgd->rd_data0; - goal++; + goal = max(block + 1, goal + 1); if (*last_unlinked != NO_BLOCK && no_addr <= *last_unlinked) continue; if (no_addr == skip) @@ -971,7 +971,7 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip found++; /* Limit reclaim to sensible number of tasks */ - if (found > 2*NR_CPUS) + if (found > NR_CPUS) return; } diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index a4e23d68a39..b9f28e66dad 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1318,15 +1318,17 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt) static void gfs2_evict_inode(struct inode *inode) { - struct gfs2_sbd *sdp = inode->i_sb->s_fs_info; + struct super_block *sb = inode->i_sb; + struct gfs2_sbd *sdp = sb->s_fs_info; struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder gh; int error; - if (inode->i_nlink) + if (inode->i_nlink || (sb->s_flags & MS_RDONLY)) goto out; - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); + /* Must not read inode block until block type has been verified */ + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, &gh); if (unlikely(error)) { gfs2_glock_dq_uninit(&ip->i_iopen_gh); goto out; @@ -1336,6 +1338,12 @@ static void gfs2_evict_inode(struct inode *inode) if (error) goto out_truncate; + if (test_bit(GIF_INVALID, &ip->i_flags)) { + error = gfs2_inode_refresh(ip); + if (error) + goto out_truncate; + } + ip->i_iopen_gh.gh_flags |= GL_NOCACHE; gfs2_glock_dq_wait(&ip->i_iopen_gh); gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh); diff --git a/fs/logfs/super.c b/fs/logfs/super.c index 33435e4b14d..ce03a182c77 100644 --- a/fs/logfs/super.c +++ b/fs/logfs/super.c @@ -480,10 +480,6 @@ static int logfs_read_sb(struct super_block *sb, int read_only) !read_only) return -EIO; - mutex_init(&super->s_dirop_mutex); - mutex_init(&super->s_object_alias_mutex); - INIT_LIST_HEAD(&super->s_freeing_list); - ret = logfs_init_rw(sb); if (ret) return ret; @@ -601,6 +597,10 @@ static struct dentry *logfs_mount(struct file_system_type *type, int flags, if (!super) return ERR_PTR(-ENOMEM); + mutex_init(&super->s_dirop_mutex); + mutex_init(&super->s_object_alias_mutex); + INIT_LIST_HEAD(&super->s_freeing_list); + if (!devname) err = logfs_get_sb_bdev(super, type, devname); else if (strncmp(devname, "mtd", 3)) diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 89fc160fd5b..1f063bacd28 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -119,7 +119,7 @@ Elong: } #ifdef CONFIG_NFS_V4 -static rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *flavors, struct inode *inode) +static rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *flavors) { struct gss_api_mech *mech; struct xdr_netobj oid; @@ -166,7 +166,7 @@ static int nfs_negotiate_security(const struct dentry *parent, } flavors = page_address(page); ret = 
secinfo(parent->d_inode, &dentry->d_name, flavors); - *flavor = nfs_find_best_sec(flavors, dentry->d_inode); + *flavor = nfs_find_best_sec(flavors); put_page(page); } diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index e1c261ddd65..c4a69833dd0 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -47,6 +47,7 @@ enum nfs4_client_state { NFS4CLNT_LAYOUTRECALL, NFS4CLNT_SESSION_RESET, NFS4CLNT_RECALL_SLOT, + NFS4CLNT_LEASE_CONFIRM, }; enum nfs4_session_state { diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 9bf41eab3e4..69c0f3c5ee7 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -46,6 +46,7 @@ #include <linux/nfs4.h> #include <linux/nfs_fs.h> #include <linux/nfs_page.h> +#include <linux/nfs_mount.h> #include <linux/namei.h> #include <linux/mount.h> #include <linux/module.h> @@ -443,8 +444,8 @@ static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res * if (res->sr_status == 1) res->sr_status = NFS_OK; - /* -ERESTARTSYS can result in skipping nfs41_sequence_setup */ - if (!res->sr_slot) + /* don't increment the sequence number if the task wasn't sent */ + if (!RPC_WAS_SENT(task)) goto out; /* Check the SEQUENCE operation status */ @@ -2185,9 +2186,14 @@ static int nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs4_exception exception = { }; int err; do { - err = nfs4_handle_exception(server, - _nfs4_lookup_root(server, fhandle, info), - &exception); + err = _nfs4_lookup_root(server, fhandle, info); + switch (err) { + case 0: + case -NFS4ERR_WRONGSEC: + break; + default: + err = nfs4_handle_exception(server, err, &exception); + } } while (exception.retry); return err; } @@ -2208,25 +2214,47 @@ out: return ret; } -/* - * get the file handle for the "/" directory on the server - */ -static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, +static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *info) { int i, len, status = 0; - rpc_authflavor_t flav_array[NFS_MAX_SECFLAVORS + 2]; + rpc_authflavor_t flav_array[NFS_MAX_SECFLAVORS]; - flav_array[0] = RPC_AUTH_UNIX; - len = gss_mech_list_pseudoflavors(&flav_array[1]); - flav_array[1+len] = RPC_AUTH_NULL; - len += 2; + len = gss_mech_list_pseudoflavors(&flav_array[0]); + flav_array[len] = RPC_AUTH_NULL; + len += 1; for (i = 0; i < len; i++) { status = nfs4_lookup_root_sec(server, fhandle, info, flav_array[i]); - if (status != -EPERM) - break; + if (status == -NFS4ERR_WRONGSEC || status == -EACCES) + continue; + break; } + /* + * -EACCES could mean that the user doesn't have correct permissions + * to access the mount. It could also mean that we tried to mount + * with a gss auth flavor, but rpc.gssd isn't running. Either way, + * existing mount programs don't handle -EACCES very well so it should + * be mapped to -EPERM instead. + */ + if (status == -EACCES) + status = -EPERM; + return status; +} + +/* + * get the file handle for the "/" directory on the server + */ +static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsinfo *info) +{ + int status = nfs4_lookup_root(server, fhandle, info); + if ((status == -NFS4ERR_WRONGSEC) && !(server->flags & NFS_MOUNT_SECFLAVOUR)) + /* + * A status of -NFS4ERR_WRONGSEC will be mapped to -EPERM + * by nfs4_map_errors() as this function exits.
+ */ + status = nfs4_find_root_sec(server, fhandle, info); if (status == 0) status = nfs4_server_capabilities(server, fhandle); if (status == 0) @@ -3723,21 +3751,20 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, sizeof(setclientid.sc_uaddr), "%s.%u.%u", clp->cl_ipaddr, port >> 8, port & 255); - status = rpc_call_sync(clp->cl_rpcclient, &msg, 0); + status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); if (status != -NFS4ERR_CLID_INUSE) break; - if (signalled()) + if (loop != 0) { + ++clp->cl_id_uniquifier; break; - if (loop++ & 1) - ssleep(clp->cl_lease_time / HZ + 1); - else - if (++clp->cl_id_uniquifier == 0) - break; + } + ++loop; + ssleep(clp->cl_lease_time / HZ + 1); } return status; } -static int _nfs4_proc_setclientid_confirm(struct nfs_client *clp, +int nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct nfs4_setclientid_res *arg, struct rpc_cred *cred) { @@ -3752,7 +3779,7 @@ static int _nfs4_proc_setclientid_confirm(struct nfs_client *clp, int status; now = jiffies; - status = rpc_call_sync(clp->cl_rpcclient, &msg, 0); + status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); if (status == 0) { spin_lock(&clp->cl_lock); clp->cl_lease_time = fsinfo.lease_time * HZ; @@ -3762,26 +3789,6 @@ static int _nfs4_proc_setclientid_confirm(struct nfs_client *clp, return status; } -int nfs4_proc_setclientid_confirm(struct nfs_client *clp, - struct nfs4_setclientid_res *arg, - struct rpc_cred *cred) -{ - long timeout = 0; - int err; - do { - err = _nfs4_proc_setclientid_confirm(clp, arg, cred); - switch (err) { - case 0: - return err; - case -NFS4ERR_RESOURCE: - /* The IBM lawyers misread another document! */ - case -NFS4ERR_DELAY: - err = nfs4_delay(clp->cl_rpcclient, &timeout); - } - } while (err == 0); - return err; -} - struct nfs4_delegreturndata { struct nfs4_delegreturnargs args; struct nfs4_delegreturnres res; @@ -4786,7 +4793,7 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred) init_utsname()->domainname, clp->cl_rpcclient->cl_auth->au_flavor); - status = rpc_call_sync(clp->cl_rpcclient, &msg, 0); + status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); if (!status) status = nfs4_check_cl_exchange_flags(clp->cl_exchange_flags); dprintk("<-- %s status= %d\n", __func__, status); @@ -4869,7 +4876,8 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo) .rpc_client = clp->cl_rpcclient, .rpc_message = &msg, .callback_ops = &nfs4_get_lease_time_ops, - .callback_data = &data + .callback_data = &data, + .flags = RPC_TASK_TIMEOUT, }; int status; @@ -5171,7 +5179,7 @@ static int _nfs4_proc_create_session(struct nfs_client *clp) nfs4_init_channel_attrs(&args); args.flags = (SESSION4_PERSIST | SESSION4_BACK_CHAN); - status = rpc_call_sync(session->clp->cl_rpcclient, &msg, 0); + status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); if (!status) /* Verify the session's negotiated channel_attrs values */ @@ -5194,20 +5202,10 @@ int nfs4_proc_create_session(struct nfs_client *clp) int status; unsigned *ptr; struct nfs4_session *session = clp->cl_session; - long timeout = 0; - int err; dprintk("--> %s clp=%p session=%p\n", __func__, clp, session); - do { - status = _nfs4_proc_create_session(clp); - if (status == -NFS4ERR_DELAY) { - err = nfs4_delay(clp->cl_rpcclient, &timeout); - if (err) - status = err; - } - } while (status == -NFS4ERR_DELAY); - + status = _nfs4_proc_create_session(clp); if (status) goto out; @@ -5248,7 +5246,7 @@ int 
nfs4_proc_destroy_session(struct nfs4_session *session) msg.rpc_argp = session; msg.rpc_resp = NULL; msg.rpc_cred = NULL; - status = rpc_call_sync(session->clp->cl_rpcclient, &msg, 0); + status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); if (status) printk(KERN_WARNING diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index a6804f704d9..036f5adc9e1 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -64,10 +64,15 @@ static LIST_HEAD(nfs4_clientid_list); int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred) { - struct nfs4_setclientid_res clid; + struct nfs4_setclientid_res clid = { + .clientid = clp->cl_clientid, + .confirm = clp->cl_confirm, + }; unsigned short port; int status; + if (test_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state)) + goto do_confirm; port = nfs_callback_tcpport; if (clp->cl_addr.ss_family == AF_INET6) port = nfs_callback_tcpport6; @@ -75,10 +80,14 @@ int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred) status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, port, cred, &clid); if (status != 0) goto out; + clp->cl_clientid = clid.clientid; + clp->cl_confirm = clid.confirm; + set_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); +do_confirm: status = nfs4_proc_setclientid_confirm(clp, &clid, cred); if (status != 0) goto out; - clp->cl_clientid = clid.clientid; + clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); nfs4_schedule_state_renewal(clp); out: return status; @@ -230,13 +239,18 @@ int nfs41_init_clientid(struct nfs_client *clp, struct rpc_cred *cred) { int status; + if (test_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state)) + goto do_confirm; nfs4_begin_drain_session(clp); status = nfs4_proc_exchange_id(clp, cred); if (status != 0) goto out; + set_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); +do_confirm: status = nfs4_proc_create_session(clp); if (status != 0) goto out; + clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); nfs41_setup_state_renewal(clp); nfs_mark_client_ready(clp, NFS_CS_READY); out: @@ -1584,20 +1598,23 @@ static int nfs4_recall_slot(struct nfs_client *clp) { return 0; } */ static void nfs4_set_lease_expired(struct nfs_client *clp, int status) { - if (nfs4_has_session(clp)) { - switch (status) { - case -NFS4ERR_DELAY: - case -NFS4ERR_CLID_INUSE: - case -EAGAIN: - break; + switch (status) { + case -NFS4ERR_CLID_INUSE: + case -NFS4ERR_STALE_CLIENTID: + clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); + break; + case -NFS4ERR_DELAY: + case -ETIMEDOUT: + case -EAGAIN: + ssleep(1); + break; - case -EKEYEXPIRED: - nfs4_warn_keyexpired(clp->cl_hostname); - case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery - * in nfs4_exchange_id */ - default: - return; - } + case -EKEYEXPIRED: + nfs4_warn_keyexpired(clp->cl_hostname); + case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery + * in nfs4_exchange_id */ + default: + return; } set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); } @@ -1607,7 +1624,7 @@ static void nfs4_state_manager(struct nfs_client *clp) int status = 0; /* Ensure exclusive access to NFSv4 state */ - for(;;) { + do { if (test_and_clear_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) { /* We're going to have to re-establish a clientid */ status = nfs4_reclaim_lease(clp); @@ -1691,7 +1708,7 @@ static void nfs4_state_manager(struct nfs_client *clp) break; if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0) break; - } + } while (atomic_read(&clp->cl_count) > 1); return; out_error: printk(KERN_WARNING "Error: state manager failed on NFSv4 server %s" diff --git a/fs/nfs/nfs4xdr.c 
b/fs/nfs/nfs4xdr.c index dddfb5795d7..c3ccd2c4683 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1452,26 +1452,25 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req, struct compound_hdr *hdr) { - uint32_t attrs[2] = {0, 0}; + uint32_t attrs[2] = { + FATTR4_WORD0_RDATTR_ERROR, + FATTR4_WORD1_MOUNTED_ON_FILEID, + }; uint32_t dircount = readdir->count >> 1; __be32 *p; if (readdir->plus) { attrs[0] |= FATTR4_WORD0_TYPE|FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE| - FATTR4_WORD0_FSID|FATTR4_WORD0_FILEHANDLE; + FATTR4_WORD0_FSID|FATTR4_WORD0_FILEHANDLE|FATTR4_WORD0_FILEID; attrs[1] |= FATTR4_WORD1_MODE|FATTR4_WORD1_NUMLINKS|FATTR4_WORD1_OWNER| FATTR4_WORD1_OWNER_GROUP|FATTR4_WORD1_RAWDEV| FATTR4_WORD1_SPACE_USED|FATTR4_WORD1_TIME_ACCESS| FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY; dircount >>= 1; } - attrs[0] |= FATTR4_WORD0_RDATTR_ERROR|FATTR4_WORD0_FILEID; - attrs[1] |= FATTR4_WORD1_MOUNTED_ON_FILEID; - /* Switch to mounted_on_fileid if the server supports it */ - if (readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID) - attrs[0] &= ~FATTR4_WORD0_FILEID; - else - attrs[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID; + /* Use mounted_on_fileid only if the server supports it */ + if (!(readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)) + attrs[0] |= FATTR4_WORD0_FILEID; p = reserve_space(xdr, 12+NFS4_VERIFIER_SIZE+20); *p++ = cpu_to_be32(OP_READDIR); @@ -3140,7 +3139,7 @@ static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitma goto out_overflow; xdr_decode_hyper(p, fileid); bitmap[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID; - ret = NFS_ATTR_FATTR_FILEID; + ret = NFS_ATTR_FATTR_MOUNTED_ON_FILEID; } dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid); return ret; @@ -4002,7 +4001,6 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap, { int status; umode_t fmode = 0; - uint64_t fileid; uint32_t type; status = decode_attr_type(xdr, bitmap, &type); @@ -4101,13 +4099,10 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap, goto xdr_error; fattr->valid |= status; - status = decode_attr_mounted_on_fileid(xdr, bitmap, &fileid); + status = decode_attr_mounted_on_fileid(xdr, bitmap, &fattr->mounted_on_fileid); if (status < 0) goto xdr_error; - if (status != 0 && !(fattr->valid & status)) { - fattr->fileid = fileid; - fattr->valid |= status; - } + fattr->valid |= status; xdr_error: dprintk("%s: xdr returned %d\n", __func__, -status); @@ -4838,17 +4833,21 @@ static int decode_secinfo(struct xdr_stream *xdr, struct nfs4_secinfo_res *res) struct nfs4_secinfo_flavor *sec_flavor; int status; __be32 *p; - int i; + int i, num_flavors; status = decode_op_hdr(xdr, OP_SECINFO); + if (status) + goto out; p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) goto out_overflow; - res->flavors->num_flavors = be32_to_cpup(p); - for (i = 0; i < res->flavors->num_flavors; i++) { + res->flavors->num_flavors = 0; + num_flavors = be32_to_cpup(p); + + for (i = 0; i < num_flavors; i++) { sec_flavor = &res->flavors->flavors[i]; - if ((char *)&sec_flavor[1] - (char *)res > PAGE_SIZE) + if ((char *)&sec_flavor[1] - (char *)res->flavors > PAGE_SIZE) break; p = xdr_inline_decode(xdr, 4); @@ -4857,13 +4856,15 @@ static int decode_secinfo(struct xdr_stream *xdr, struct nfs4_secinfo_res *res) sec_flavor->flavor = be32_to_cpup(p); if (sec_flavor->flavor == RPC_AUTH_GSS) { - if (decode_secinfo_gss(xdr, 
sec_flavor)) - break; + status = decode_secinfo_gss(xdr, sec_flavor); + if (status) + goto out; } + res->flavors->num_flavors++; } - return 0; - +out: + return status; out_overflow: print_overflow_msg(__func__, xdr); return -EIO; @@ -6408,7 +6409,9 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh, entry->server, 1) < 0) goto out_overflow; - if (entry->fattr->valid & NFS_ATTR_FATTR_FILEID) + if (entry->fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) + entry->ino = entry->fattr->mounted_on_fileid; + else if (entry->fattr->valid & NFS_ATTR_FATTR_FILEID) entry->ino = entry->fattr->fileid; entry->d_type = DT_UNKNOWN; diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index d9ab97269ce..ff681ab65d3 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1004,6 +1004,7 @@ pnfs_set_layoutcommit(struct nfs_write_data *wdata) { struct nfs_inode *nfsi = NFS_I(wdata->inode); loff_t end_pos = wdata->args.offset + wdata->res.count; + bool mark_as_dirty = false; spin_lock(&nfsi->vfs_inode.i_lock); if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { @@ -1011,13 +1012,18 @@ pnfs_set_layoutcommit(struct nfs_write_data *wdata) get_lseg(wdata->lseg); wdata->lseg->pls_lc_cred = get_rpccred(wdata->args.context->state->owner->so_cred); - mark_inode_dirty_sync(wdata->inode); + mark_as_dirty = true; dprintk("%s: Set layoutcommit for inode %lu ", __func__, wdata->inode->i_ino); } if (end_pos > wdata->lseg->pls_end_pos) wdata->lseg->pls_end_pos = end_pos; spin_unlock(&nfsi->vfs_inode.i_lock); + + /* if pnfs_layoutcommit_inode() runs between inode locks, the next one + * will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */ + if (mark_as_dirty) + mark_inode_dirty_sync(wdata->inode); } EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit); diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 2b8e9a5e366..e288f06d3fa 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1004,6 +1004,7 @@ static int nfs_parse_security_flavors(char *value, return 0; } + mnt->flags |= NFS_MOUNT_SECFLAVOUR; mnt->auth_flavor_len = 1; return 1; } @@ -1976,6 +1977,15 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data) if (error < 0) goto out; + /* + * noac is a special case. It implies -o sync, but that's not + * necessarily reflected in the mtab options. do_remount_sb + * will clear MS_SYNCHRONOUS if -o sync wasn't specified in the + * remount options, so we have to explicitly reset it. + */ + if (data->flags & NFS_MOUNT_NOAC) + *flags |= MS_SYNCHRONOUS; + /* compare new mount options with old ones */ error = nfs_compare_remount_data(nfss, data); out: @@ -2235,8 +2245,7 @@ static struct dentry *nfs_fs_mount(struct file_system_type *fs_type, if (!s->s_root) { /* initial superblock/root creation */ nfs_fill_super(s, data); - nfs_fscache_get_super_cookie( - s, data ? 
data->fscache_uniq : NULL, NULL); + nfs_fscache_get_super_cookie(s, data->fscache_uniq, NULL); } mntroot = nfs_get_root(s, mntfh, dev_name); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index e4cbc11a74a..3bd5d7e80f6 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -680,7 +680,6 @@ static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page, req = nfs_setup_write_request(ctx, page, offset, count); if (IS_ERR(req)) return PTR_ERR(req); - nfs_mark_request_dirty(req); /* Update file length */ nfs_grow_file(page, offset, count); nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes); @@ -1418,8 +1417,7 @@ static void nfs_commit_done(struct rpc_task *task, void *calldata) task->tk_pid, task->tk_status); /* Call the NFS version-specific code */ - if (NFS_PROTO(data->inode)->commit_done(task, data) != 0) - return; + NFS_PROTO(data->inode)->commit_done(task, data); } void nfs_commit_release_pages(struct nfs_write_data *data) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index aa309aa93fe..4cf04e11c66 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -258,6 +258,7 @@ static void nfs4_put_deleg_lease(struct nfs4_file *fp) if (atomic_dec_and_test(&fp->fi_delegees)) { vfs_setlease(fp->fi_deleg_file, F_UNLCK, &fp->fi_lease); fp->fi_lease = NULL; + fput(fp->fi_deleg_file); fp->fi_deleg_file = NULL; } } @@ -402,8 +403,8 @@ static void free_generic_stateid(struct nfs4_stateid *stp) if (stp->st_access_bmap) { oflag = nfs4_access_bmap_to_omode(stp); nfs4_file_put_access(stp->st_file, oflag); - put_nfs4_file(stp->st_file); } + put_nfs4_file(stp->st_file); kmem_cache_free(stateid_slab, stp); } diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 2e1cebde90d..129f3c9f62d 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1363,7 +1363,7 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp, goto out; if (!(iap->ia_valid & ATTR_MODE)) iap->ia_mode = 0; - err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE); + err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC); if (err) goto out; @@ -1385,6 +1385,13 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp, if (IS_ERR(dchild)) goto out_nfserr; + /* If file doesn't exist, check for permissions to create one */ + if (!dchild->d_inode) { + err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE); + if (err) + goto out; + } + err = fh_compose(resfhp, fhp->fh_export, dchild, fhp); if (err) goto out; diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index b68f87a8392..938387a10d5 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -1019,7 +1019,7 @@ struct ocfs2_xattr_entry { __le16 xe_name_offset; /* byte offset from the 1st entry in the local xattr storage(inode, xattr block or xattr bucket). */ - __u8 xe_name_len; /* xattr name len, does't include prefix. */ + __u8 xe_name_len; /* xattr name len, doesn't include prefix. */ __u8 xe_type; /* the low 7 bits indicate the name prefix * type and the highest bit indicates whether * the EA is stored in the local storage. 
*/ diff --git a/fs/proc/base.c b/fs/proc/base.c index dd6628d3ba4..dfa532730e5 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3124,11 +3124,16 @@ static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldi /* for the /proc/ directory itself, after non-process stuff has been done */ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) { - unsigned int nr = filp->f_pos - FIRST_PROCESS_ENTRY; - struct task_struct *reaper = get_proc_task(filp->f_path.dentry->d_inode); + unsigned int nr; + struct task_struct *reaper; struct tgid_iter iter; struct pid_namespace *ns; + if (filp->f_pos >= PID_MAX_LIMIT + TGID_OFFSET) + goto out_no_task; + nr = filp->f_pos - FIRST_PROCESS_ENTRY; + + reaper = get_proc_task(filp->f_path.dentry->d_inode); if (!reaper) goto out_no_task; diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c index 4d0cb124146..40fa780ebea 100644 --- a/fs/ubifs/log.c +++ b/fs/ubifs/log.c @@ -175,26 +175,6 @@ void ubifs_add_bud(struct ubifs_info *c, struct ubifs_bud *bud) } /** - * ubifs_create_buds_lists - create journal head buds lists for remount rw. - * @c: UBIFS file-system description object - */ -void ubifs_create_buds_lists(struct ubifs_info *c) -{ - struct rb_node *p; - - spin_lock(&c->buds_lock); - p = rb_first(&c->buds); - while (p) { - struct ubifs_bud *bud = rb_entry(p, struct ubifs_bud, rb); - struct ubifs_jhead *jhead = &c->jheads[bud->jhead]; - - list_add_tail(&bud->list, &jhead->buds_list); - p = rb_next(p); - } - spin_unlock(&c->buds_lock); -} - -/** * ubifs_add_bud_to_log - add a new bud to the log. * @c: UBIFS file-system description object * @jhead: journal head the bud belongs to diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c index 936f2cbfe6b..3dbad6fbd1e 100644 --- a/fs/ubifs/recovery.c +++ b/fs/ubifs/recovery.c @@ -317,6 +317,32 @@ int ubifs_recover_master_node(struct ubifs_info *c) goto out_free; } memcpy(c->rcvrd_mst_node, c->mst_node, UBIFS_MST_NODE_SZ); + + /* + * We had to recover the master node, which means there was an + * unclean reboot. However, it is possible that the master node + * is clean at this point, i.e., %UBIFS_MST_DIRTY is not set. + * E.g., consider the following chain of events: + * + * 1. UBIFS was cleanly unmounted, so the master node is clean + * 2. UBIFS is being mounted R/W and starts changing the master + * node in the first LEB (%UBIFS_MST_LNUM). A power cut happens, + * so this LEB ends up with some amount of garbage at the + * end. + * 3. UBIFS is being mounted R/O. We reach this place and + * recover the master node from the second LEB + * (%UBIFS_MST_LNUM + 1). But we cannot update the media + * because we are being mounted R/O. We have to defer the + * operation. + * 4. However, this master node (@c->mst_node) is marked as + * clean (since step 1). And if we just return, the + * mount code will be confused and won't recover the master + * node when it is re-mounted R/W later. + * + * Thus, force the recovery by marking the master node as + * dirty.
+ */ + c->mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY); } else { /* Write the recovered master node */ c->max_sqnum = le64_to_cpu(mst->ch.sqnum) - 1; diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c index eed0fcff8d7..d3d6d365bfc 100644 --- a/fs/ubifs/replay.c +++ b/fs/ubifs/replay.c @@ -59,6 +59,7 @@ enum { * @new_size: truncation new size * @free: amount of free space in a bud * @dirty: amount of dirty space in a bud from padding and deletion nodes + * @jhead: journal head number of the bud * * UBIFS journal replay must compare node sequence numbers, which means it must * build a tree of node information to insert into the TNC. @@ -80,6 +81,7 @@ struct replay_entry { struct { int free; int dirty; + int jhead; }; }; }; @@ -159,6 +161,11 @@ static int set_bud_lprops(struct ubifs_info *c, struct replay_entry *r) err = PTR_ERR(lp); goto out; } + + /* Make sure the journal head points to the latest bud */ + err = ubifs_wbuf_seek_nolock(&c->jheads[r->jhead].wbuf, r->lnum, + c->leb_size - r->free, UBI_SHORTTERM); + out: ubifs_release_lprops(c); return err; @@ -627,10 +634,6 @@ static int replay_bud(struct ubifs_info *c, int lnum, int offs, int jhead, ubifs_assert(sleb->endpt - offs >= used); ubifs_assert(sleb->endpt % c->min_io_size == 0); - if (sleb->endpt + c->min_io_size <= c->leb_size && !c->ro_mount) - err = ubifs_wbuf_seek_nolock(&c->jheads[jhead].wbuf, lnum, - sleb->endpt, UBI_SHORTTERM); - *dirty = sleb->endpt - offs - used; *free = c->leb_size - sleb->endpt; @@ -653,12 +656,14 @@ out_dump: * @sqnum: sequence number * @free: amount of free space in bud * @dirty: amount of dirty space from padding and deletion nodes + * @jhead: journal head number for the bud * * This function inserts a reference node to the replay tree and returns zero * in case of success or a negative error code in case of failure. 
*/ static int insert_ref_node(struct ubifs_info *c, int lnum, int offs, - unsigned long long sqnum, int free, int dirty) + unsigned long long sqnum, int free, int dirty, + int jhead) { struct rb_node **p = &c->replay_tree.rb_node, *parent = NULL; struct replay_entry *r; @@ -688,6 +693,7 @@ static int insert_ref_node(struct ubifs_info *c, int lnum, int offs, r->flags = REPLAY_REF; r->free = free; r->dirty = dirty; + r->jhead = jhead; rb_link_node(&r->rb, parent, p); rb_insert_color(&r->rb, &c->replay_tree); @@ -712,7 +718,7 @@ static int replay_buds(struct ubifs_info *c) if (err) return err; err = insert_ref_node(c, b->bud->lnum, b->bud->start, b->sqnum, - free, dirty); + free, dirty, b->bud->jhead); if (err) return err; } diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index c75f6133206..04ad07f4fcc 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -1257,12 +1257,12 @@ static int mount_ubifs(struct ubifs_info *c) goto out_free; } + err = alloc_wbufs(c); + if (err) + goto out_cbuf; + sprintf(c->bgt_name, BGT_NAME_PATTERN, c->vi.ubi_num, c->vi.vol_id); if (!c->ro_mount) { - err = alloc_wbufs(c); - if (err) - goto out_cbuf; - /* Create background thread */ c->bgt = kthread_create(ubifs_bg_thread, c, "%s", c->bgt_name); if (IS_ERR(c->bgt)) { @@ -1631,12 +1631,6 @@ static int ubifs_remount_rw(struct ubifs_info *c) if (err) goto out; - err = alloc_wbufs(c); - if (err) - goto out; - - ubifs_create_buds_lists(c); - /* Create background thread */ c->bgt = kthread_create(ubifs_bg_thread, c, "%s", c->bgt_name); if (IS_ERR(c->bgt)) { @@ -1671,14 +1665,25 @@ static int ubifs_remount_rw(struct ubifs_info *c) if (err) goto out; + dbg_gen("re-mounted read-write"); + c->remounting_rw = 0; + if (c->need_recovery) { c->need_recovery = 0; ubifs_msg("deferred recovery completed"); + } else { + /* + * Do not run the debugging space check if we were doing + * recovery, because when we saved the information we had the + * file-system in a state where the TNC and lprops have been + * modified in memory, but all the I/O operations (including a + * commit) were deferred. So the file-system was in + * "non-committed" state. Now the file-system is in committed + * state, and of course the amount of free space will change + * because, for example, the old index size was imprecise. + */ + err = dbg_check_space_info(c); } - - dbg_gen("re-mounted read-write"); - c->remounting_rw = 0; - err = dbg_check_space_info(c); mutex_unlock(&c->umount_mutex); return err; @@ -1733,7 +1738,6 @@ static void ubifs_remount_ro(struct ubifs_info *c) if (err) ubifs_ro_mode(c, err); - free_wbufs(c); vfree(c->orph_buf); c->orph_buf = NULL; kfree(c->write_reserve_buf); @@ -1761,10 +1765,12 @@ static void ubifs_put_super(struct super_block *sb) * of the media. For example, there will be dirty inodes if we failed * to write them back because of I/O errors.
*/ - ubifs_assert(atomic_long_read(&c->dirty_pg_cnt) == 0); - ubifs_assert(c->budg_idx_growth == 0); - ubifs_assert(c->budg_dd_growth == 0); - ubifs_assert(c->budg_data_growth == 0); + if (!c->ro_error) { + ubifs_assert(atomic_long_read(&c->dirty_pg_cnt) == 0); + ubifs_assert(c->budg_idx_growth == 0); + ubifs_assert(c->budg_dd_growth == 0); + ubifs_assert(c->budg_data_growth == 0); + } /* * The 'c->umount_lock' prevents races between UBIFS memory shrinker diff --git a/fs/xattr.c b/fs/xattr.c index a19acdb81cd..f1ef94974de 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -666,7 +666,7 @@ generic_setxattr(struct dentry *dentry, const char *name, const void *value, siz handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name); if (!handler) return -EOPNOTSUPP; - return handler->set(dentry, name, value, size, 0, handler->flags); + return handler->set(dentry, name, value, size, flags, handler->flags); } /* diff --git a/fs/xfs/linux-2.6/xfs_message.c b/fs/xfs/linux-2.6/xfs_message.c index 3ca79560911..9f76cceb678 100644 --- a/fs/xfs/linux-2.6/xfs_message.c +++ b/fs/xfs/linux-2.6/xfs_message.c @@ -34,8 +34,10 @@ __xfs_printk( const struct xfs_mount *mp, struct va_format *vaf) { - if (mp && mp->m_fsname) + if (mp && mp->m_fsname) { printk("%sXFS (%s): %pV\n", level, mp->m_fsname, vaf); + return; + } printk("%sXFS: %pV\n", level, vaf); }
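
The eCryptfs changes above replace the old per-inode "persistent file" (opened at interpose time and only closed when the inode was destroyed) with a reference-counted lower file: ecryptfs_get_lower_file() opens it on the first get and ecryptfs_put_lower_file() closes it on the last put via atomic_dec_and_mutex_lock(). Below is a minimal userspace C sketch of that open-on-first-get / close-on-last-put pattern, not the kernel code itself; the names (struct lower_file, lower_file_get/put) and the demo path are illustrative assumptions, and a plain mutex-protected counter stands in for the kernel's atomic_t-plus-mutex scheme.

/* Build with: cc -pthread lower_file_demo.c */
#include <fcntl.h>
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

struct lower_file {
	pthread_mutex_t lock;	/* serializes the 0 <-> 1 count transitions */
	int count;		/* number of upper-layer users of the file */
	int fd;			/* the shared "lower" file descriptor */
	const char *path;
};

/* Open the backing file on the 0 -> 1 transition only. */
static int lower_file_get(struct lower_file *lf)
{
	int rc = 0;

	pthread_mutex_lock(&lf->lock);
	if (lf->count++ == 0) {
		lf->fd = open(lf->path, O_RDWR | O_CREAT, 0600);
		if (lf->fd < 0) {
			rc = -1;
			lf->count = 0;	/* roll back so a later get retries */
		}
	}
	pthread_mutex_unlock(&lf->lock);
	return rc;
}

/* Close the backing file on the 1 -> 0 transition only. */
static void lower_file_put(struct lower_file *lf)
{
	pthread_mutex_lock(&lf->lock);
	if (--lf->count == 0) {
		close(lf->fd);
		lf->fd = -1;
	}
	pthread_mutex_unlock(&lf->lock);
}

int main(void)
{
	struct lower_file lf = {
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.count = 0,
		.fd = -1,
		.path = "/tmp/lower-file-demo",
	};

	if (lower_file_get(&lf) == 0) {		/* opens the file */
		if (lower_file_get(&lf) == 0)	/* bumps the count only */
			lower_file_put(&lf);	/* drops the count only */
		lower_file_put(&lf);		/* closes the file */
	}
	printf("final count: %d\n", lf.count);	/* prints 0 */
	return 0;
}

The kernel version additionally guards against a negative count with WARN_ON_ONCE and performs the open through ecryptfs_privileged_open(); in both cases the point of the pattern is that the last release can drop the lower file immediately, instead of pinning it until the inode is destroyed.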