diff options
Diffstat (limited to 'fs')
-rw-r--r-- | fs/Kconfig | 22 | ||||
-rw-r--r-- | fs/Makefile | 1 | ||||
-rw-r--r-- | fs/dax.c | 534 | ||||
-rw-r--r-- | fs/exofs/inode.c | 1 | ||||
-rw-r--r-- | fs/ext2/Kconfig | 11 | ||||
-rw-r--r-- | fs/ext2/Makefile | 1 | ||||
-rw-r--r-- | fs/ext2/ext2.h | 10 | ||||
-rw-r--r-- | fs/ext2/file.c | 44 | ||||
-rw-r--r-- | fs/ext2/inode.c | 38 | ||||
-rw-r--r-- | fs/ext2/namei.c | 13 | ||||
-rw-r--r-- | fs/ext2/super.c | 53 | ||||
-rw-r--r-- | fs/ext2/xip.c | 86 | ||||
-rw-r--r-- | fs/ext2/xip.h | 26 | ||||
-rw-r--r-- | fs/ext4/ext4.h | 6 | ||||
-rw-r--r-- | fs/ext4/file.c | 49 | ||||
-rw-r--r-- | fs/ext4/indirect.c | 18 | ||||
-rw-r--r-- | fs/ext4/inode.c | 89 | ||||
-rw-r--r-- | fs/ext4/namei.c | 10 | ||||
-rw-r--r-- | fs/ext4/super.c | 39 | ||||
-rw-r--r-- | fs/ocfs2/aops.c | 242 | ||||
-rw-r--r-- | fs/ocfs2/file.c | 76 | ||||
-rw-r--r-- | fs/ocfs2/file.h | 9 | ||||
-rw-r--r-- | fs/ocfs2/inode.c | 2 | ||||
-rw-r--r-- | fs/ocfs2/inode.h | 2 | ||||
-rw-r--r-- | fs/ocfs2/journal.c | 110 | ||||
-rw-r--r-- | fs/ocfs2/journal.h | 5 | ||||
-rw-r--r-- | fs/ocfs2/namei.c | 284 | ||||
-rw-r--r-- | fs/ocfs2/namei.h | 8 | ||||
-rw-r--r-- | fs/ocfs2/ocfs2.h | 23 | ||||
-rw-r--r-- | fs/ocfs2/ocfs2_fs.h | 14 | ||||
-rw-r--r-- | fs/ocfs2/super.c | 2 | ||||
-rw-r--r-- | fs/open.c | 5 |
32 files changed, 1537 insertions, 296 deletions
diff --git a/fs/Kconfig b/fs/Kconfig index a6bb530b1ec5..ec35851e5b71 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -13,13 +13,6 @@ if BLOCK source "fs/ext2/Kconfig" source "fs/ext3/Kconfig" source "fs/ext4/Kconfig" - -config FS_XIP -# execute in place - bool - depends on EXT2_FS_XIP - default y - source "fs/jbd/Kconfig" source "fs/jbd2/Kconfig" @@ -40,6 +33,21 @@ source "fs/ocfs2/Kconfig" source "fs/btrfs/Kconfig" source "fs/nilfs2/Kconfig" +config FS_DAX + bool "Direct Access (DAX) support" + depends on MMU + depends on !(ARM || MIPS || SPARC) + help + Direct Access (DAX) can be used on memory-backed block devices. + If the block device supports DAX and the filesystem supports DAX, + then you can avoid using the pagecache to buffer I/Os. Turning + on this option will compile in support for DAX; you will need to + mount the filesystem using the -o dax option. + + If you do not have a block device that is capable of using this, + or if unsure, say N. Saying Y will increase the size of the kernel + by about 5kB. + endif # BLOCK # Posix ACL utility routines diff --git a/fs/Makefile b/fs/Makefile index bedff48e8fdc..0f4635f7c49c 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -28,6 +28,7 @@ obj-$(CONFIG_SIGNALFD) += signalfd.o obj-$(CONFIG_TIMERFD) += timerfd.o obj-$(CONFIG_EVENTFD) += eventfd.o obj-$(CONFIG_AIO) += aio.o +obj-$(CONFIG_FS_DAX) += dax.o obj-$(CONFIG_FILE_LOCKING) += locks.o obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o diff --git a/fs/dax.c b/fs/dax.c new file mode 100644 index 000000000000..ed1619ec6537 --- /dev/null +++ b/fs/dax.c @@ -0,0 +1,534 @@ +/* + * fs/dax.c - Direct Access filesystem code + * Copyright (c) 2013-2014 Intel Corporation + * Author: Matthew Wilcox <matthew.r.wilcox@intel.com> + * Author: Ross Zwisler <ross.zwisler@linux.intel.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#include <linux/atomic.h> +#include <linux/blkdev.h> +#include <linux/buffer_head.h> +#include <linux/fs.h> +#include <linux/genhd.h> +#include <linux/highmem.h> +#include <linux/memcontrol.h> +#include <linux/mm.h> +#include <linux/mutex.h> +#include <linux/sched.h> +#include <linux/uio.h> +#include <linux/vmstat.h> + +int dax_clear_blocks(struct inode *inode, sector_t block, long size) +{ + struct block_device *bdev = inode->i_sb->s_bdev; + sector_t sector = block << (inode->i_blkbits - 9); + + might_sleep(); + do { + void *addr; + unsigned long pfn; + long count; + + count = bdev_direct_access(bdev, sector, &addr, &pfn, size); + if (count < 0) + return count; + BUG_ON(size < count); + while (count > 0) { + unsigned pgsz = PAGE_SIZE - offset_in_page(addr); + if (pgsz > count) + pgsz = count; + if (pgsz < PAGE_SIZE) + memset(addr, 0, pgsz); + else + clear_page(addr); + addr += pgsz; + size -= pgsz; + count -= pgsz; + BUG_ON(pgsz & 511); + sector += pgsz / 512; + cond_resched(); + } + } while (size); + + return 0; +} +EXPORT_SYMBOL_GPL(dax_clear_blocks); + +static long dax_get_addr(struct buffer_head *bh, void **addr, unsigned blkbits) +{ + unsigned long pfn; + sector_t sector = bh->b_blocknr << (blkbits - 9); + return bdev_direct_access(bh->b_bdev, sector, addr, &pfn, bh->b_size); +} + +static void dax_new_buf(void *addr, unsigned size, unsigned first, loff_t pos, + loff_t end) +{ + loff_t final = end - pos + first; /* The final byte of the buffer */ + + if (first > 0) + memset(addr, 0, first); + if (final < size) + memset(addr + final, 0, size - final); +} + +static bool buffer_written(struct buffer_head *bh) +{ + return buffer_mapped(bh) && !buffer_unwritten(bh); +} + +/* + * When ext4 encounters a hole, it returns without modifying the buffer_head + * which means that we can't trust b_size. To cope with this, we set b_state + * to 0 before calling get_block and, if any bit is set, we know we can trust + * b_size. Unfortunate, really, since ext4 knows precisely how long a hole is + * and would save us time calling get_block repeatedly. + */ +static bool buffer_size_valid(struct buffer_head *bh) +{ + return bh->b_state != 0; +} + +static ssize_t dax_io(int rw, struct inode *inode, struct iov_iter *iter, + loff_t start, loff_t end, get_block_t get_block, + struct buffer_head *bh) +{ + ssize_t retval = 0; + loff_t pos = start; + loff_t max = start; + loff_t bh_max = start; + void *addr; + bool hole = false; + + if (rw != WRITE) + end = min(end, i_size_read(inode)); + + while (pos < end) { + unsigned len; + if (pos == max) { + unsigned blkbits = inode->i_blkbits; + sector_t block = pos >> blkbits; + unsigned first = pos - (block << blkbits); + long size; + + if (pos == bh_max) { + bh->b_size = PAGE_ALIGN(end - pos); + bh->b_state = 0; + retval = get_block(inode, block, bh, + rw == WRITE); + if (retval) + break; + if (!buffer_size_valid(bh)) + bh->b_size = 1 << blkbits; + bh_max = pos - first + bh->b_size; + } else { + unsigned done = bh->b_size - + (bh_max - (pos - first)); + bh->b_blocknr += done >> blkbits; + bh->b_size -= done; + } + + hole = (rw != WRITE) && !buffer_written(bh); + if (hole) { + addr = NULL; + size = bh->b_size - first; + } else { + retval = dax_get_addr(bh, &addr, blkbits); + if (retval < 0) + break; + if (buffer_unwritten(bh) || buffer_new(bh)) + dax_new_buf(addr, retval, first, pos, + end); + addr += first; + size = retval - first; + } + max = min(pos + size, end); + } + + if (rw == WRITE) + len = copy_from_iter(addr, max - pos, iter); + else if (!hole) + len = copy_to_iter(addr, max - pos, iter); + else + len = iov_iter_zero(max - pos, iter); + + if (!len) + break; + + pos += len; + addr += len; + } + + return (pos == start) ? retval : pos - start; +} + +/** + * dax_do_io - Perform I/O to a DAX file + * @rw: READ to read or WRITE to write + * @iocb: The control block for this I/O + * @inode: The file which the I/O is directed at + * @iter: The addresses to do I/O from or to + * @pos: The file offset where the I/O starts + * @get_block: The filesystem method used to translate file offsets to blocks + * @end_io: A filesystem callback for I/O completion + * @flags: See below + * + * This function uses the same locking scheme as do_blockdev_direct_IO: + * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the + * caller for writes. For reads, we take and release the i_mutex ourselves. + * If DIO_LOCKING is not set, the filesystem takes care of its own locking. + * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O + * is in progress. + */ +ssize_t dax_do_io(int rw, struct kiocb *iocb, struct inode *inode, + struct iov_iter *iter, loff_t pos, + get_block_t get_block, dio_iodone_t end_io, int flags) +{ + struct buffer_head bh; + ssize_t retval = -EINVAL; + loff_t end = pos + iov_iter_count(iter); + + memset(&bh, 0, sizeof(bh)); + + if ((flags & DIO_LOCKING) && (rw == READ)) { + struct address_space *mapping = inode->i_mapping; + mutex_lock(&inode->i_mutex); + retval = filemap_write_and_wait_range(mapping, pos, end - 1); + if (retval) { + mutex_unlock(&inode->i_mutex); + goto out; + } + } + + /* Protects against truncate */ + atomic_inc(&inode->i_dio_count); + + retval = dax_io(rw, inode, iter, pos, end, get_block, &bh); + + if ((flags & DIO_LOCKING) && (rw == READ)) + mutex_unlock(&inode->i_mutex); + + if ((retval > 0) && end_io) + end_io(iocb, pos, retval, bh.b_private); + + inode_dio_done(inode); + out: + return retval; +} +EXPORT_SYMBOL_GPL(dax_do_io); + +/* + * The user has performed a load from a hole in the file. Allocating + * a new page in the file would cause excessive storage usage for + * workloads with sparse files. We allocate a page cache page instead. + * We'll kick it out of the page cache if it's ever written to, + * otherwise it will simply fall out of the page cache under memory + * pressure without ever having been dirtied. + */ +static int dax_load_hole(struct address_space *mapping, struct page *page, + struct vm_fault *vmf) +{ + unsigned long size; + struct inode *inode = mapping->host; + if (!page) + page = find_or_create_page(mapping, vmf->pgoff, + GFP_KERNEL | __GFP_ZERO); + if (!page) + return VM_FAULT_OOM; + /* Recheck i_size under page lock to avoid truncate race */ + size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (vmf->pgoff >= size) { + unlock_page(page); + page_cache_release(page); + return VM_FAULT_SIGBUS; + } + + vmf->page = page; + return VM_FAULT_LOCKED; +} + +static int copy_user_bh(struct page *to, struct buffer_head *bh, + unsigned blkbits, unsigned long vaddr) +{ + void *vfrom, *vto; + if (dax_get_addr(bh, &vfrom, blkbits) < 0) + return -EIO; + vto = kmap_atomic(to); + copy_user_page(vto, vfrom, vaddr, to); + kunmap_atomic(vto); + return 0; +} + +static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, + struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct address_space *mapping = inode->i_mapping; + sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9); + unsigned long vaddr = (unsigned long)vmf->virtual_address; + void *addr; + unsigned long pfn; + pgoff_t size; + int error; + + i_mmap_lock_read(mapping); + + /* + * Check truncate didn't happen while we were allocating a block. + * If it did, this block may or may not be still allocated to the + * file. We can't tell the filesystem to free it because we can't + * take i_mutex here. In the worst case, the file still has blocks + * allocated past the end of the file. + */ + size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (unlikely(vmf->pgoff >= size)) { + error = -EIO; + goto out; + } + + error = bdev_direct_access(bh->b_bdev, sector, &addr, &pfn, bh->b_size); + if (error < 0) + goto out; + if (error < PAGE_SIZE) { + error = -EIO; + goto out; + } + + if (buffer_unwritten(bh) || buffer_new(bh)) + clear_page(addr); + + error = vm_insert_mixed(vma, vaddr, pfn); + + out: + i_mmap_unlock_read(mapping); + + if (bh->b_end_io) + bh->b_end_io(bh, 1); + + return error; +} + +static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, + get_block_t get_block) +{ + struct file *file = vma->vm_file; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + struct page *page; + struct buffer_head bh; + unsigned long vaddr = (unsigned long)vmf->virtual_address; + unsigned blkbits = inode->i_blkbits; + sector_t block; + pgoff_t size; + int error; + int major = 0; + + size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (vmf->pgoff >= size) + return VM_FAULT_SIGBUS; + + memset(&bh, 0, sizeof(bh)); + block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits); + bh.b_size = PAGE_SIZE; + + repeat: + page = find_get_page(mapping, vmf->pgoff); + if (page) { + if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) { + page_cache_release(page); + return VM_FAULT_RETRY; + } + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + page_cache_release(page); + goto repeat; + } + size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (unlikely(vmf->pgoff >= size)) { + /* + * We have a struct page covering a hole in the file + * from a read fault and we've raced with a truncate + */ + error = -EIO; + goto unlock_page; + } + } + + error = get_block(inode, block, &bh, 0); + if (!error && (bh.b_size < PAGE_SIZE)) + error = -EIO; /* fs corruption? */ + if (error) + goto unlock_page; + + if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) { + if (vmf->flags & FAULT_FLAG_WRITE) { + error = get_block(inode, block, &bh, 1); + count_vm_event(PGMAJFAULT); + mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); + major = VM_FAULT_MAJOR; + if (!error && (bh.b_size < PAGE_SIZE)) + error = -EIO; + if (error) + goto unlock_page; + } else { + return dax_load_hole(mapping, page, vmf); + } + } + + if (vmf->cow_page) { + struct page *new_page = vmf->cow_page; + if (buffer_written(&bh)) + error = copy_user_bh(new_page, &bh, blkbits, vaddr); + else + clear_user_highpage(new_page, vaddr); + if (error) + goto unlock_page; + vmf->page = page; + if (!page) { + i_mmap_lock_read(mapping); + /* Check we didn't race with truncate */ + size = (i_size_read(inode) + PAGE_SIZE - 1) >> + PAGE_SHIFT; + if (vmf->pgoff >= size) { + i_mmap_unlock_read(mapping); + error = -EIO; + goto out; + } + } + return VM_FAULT_LOCKED; + } + + /* Check we didn't race with a read fault installing a new page */ + if (!page && major) + page = find_lock_page(mapping, vmf->pgoff); + + if (page) { + unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT, + PAGE_CACHE_SIZE, 0); + delete_from_page_cache(page); + unlock_page(page); + page_cache_release(page); + } + + error = dax_insert_mapping(inode, &bh, vma, vmf); + + out: + if (error == -ENOMEM) + return VM_FAULT_OOM | major; + /* -EBUSY is fine, somebody else faulted on the same PTE */ + if ((error < 0) && (error != -EBUSY)) + return VM_FAULT_SIGBUS | major; + return VM_FAULT_NOPAGE | major; + + unlock_page: + if (page) { + unlock_page(page); + page_cache_release(page); + } + goto out; +} + +/** + * dax_fault - handle a page fault on a DAX file + * @vma: The virtual memory area where the fault occurred + * @vmf: The description of the fault + * @get_block: The filesystem method used to translate file offsets to blocks + * + * When a page fault occurs, filesystems may call this helper in their + * fault handler for DAX files. + */ +int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, + get_block_t get_block) +{ + int result; + struct super_block *sb = file_inode(vma->vm_file)->i_sb; + + if (vmf->flags & FAULT_FLAG_WRITE) { + sb_start_pagefault(sb); + file_update_time(vma->vm_file); + } + result = do_dax_fault(vma, vmf, get_block); + if (vmf->flags & FAULT_FLAG_WRITE) + sb_end_pagefault(sb); + + return result; +} +EXPORT_SYMBOL_GPL(dax_fault); + +/** + * dax_zero_page_range - zero a range within a page of a DAX file + * @inode: The file being truncated + * @from: The file offset that is being truncated to + * @length: The number of bytes to zero + * @get_block: The filesystem method used to translate file offsets to blocks + * + * This function can be called by a filesystem when it is zeroing part of a + * page in a DAX file. This is intended for hole-punch operations. If + * you are truncating a file, the helper function dax_truncate_page() may be + * more convenient. + * + * We work in terms of PAGE_CACHE_SIZE here for commonality with + * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem + * took care of disposing of the unnecessary blocks. Even if the filesystem + * block size is smaller than PAGE_SIZE, we have to zero the rest of the page + * since the file might be mmapped. + */ +int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length, + get_block_t get_block) +{ + struct buffer_head bh; + pgoff_t index = from >> PAGE_CACHE_SHIFT; + unsigned offset = from & (PAGE_CACHE_SIZE-1); + int err; + + /* Block boundary? Nothing to do */ + if (!length) + return 0; + BUG_ON((offset + length) > PAGE_CACHE_SIZE); + + memset(&bh, 0, sizeof(bh)); + bh.b_size = PAGE_CACHE_SIZE; + err = get_block(inode, index, &bh, 0); + if (err < 0) + return err; + if (buffer_written(&bh)) { + void *addr; + err = dax_get_addr(&bh, &addr, inode->i_blkbits); + if (err < 0) + return err; + memset(addr + offset, 0, length); + } + + return 0; +} +EXPORT_SYMBOL_GPL(dax_zero_page_range); + +/** + * dax_truncate_page - handle a partial page being truncated in a DAX file + * @inode: The file being truncated + * @from: The file offset that is being truncated to + * @get_block: The filesystem method used to translate file offsets to blocks + * + * Similar to block_truncate_page(), this function can be called by a + * filesystem when it is truncating a DAX file to handle the partial page. + * + * We work in terms of PAGE_CACHE_SIZE here for commonality with + * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem + * took care of disposing of the unnecessary blocks. Even if the filesystem + * block size is smaller than PAGE_SIZE, we have to zero the rest of the page + * since the file might be mmapped. + */ +int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block) +{ + unsigned length = PAGE_CACHE_ALIGN(from) - from; + return dax_zero_page_range(inode, from, length, get_block); +} +EXPORT_SYMBOL_GPL(dax_truncate_page); diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 6fc91df99ff8..a198e94813fe 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -985,7 +985,6 @@ const struct address_space_operations exofs_aops = { .direct_IO = exofs_direct_IO, /* With these NULL has special meaning or default is not exported */ - .get_xip_mem = NULL, .migratepage = NULL, .launder_page = NULL, .is_partially_uptodate = NULL, diff --git a/fs/ext2/Kconfig b/fs/ext2/Kconfig index 14a6780fd034..c634874e12d9 100644 --- a/fs/ext2/Kconfig +++ b/fs/ext2/Kconfig @@ -42,14 +42,3 @@ config EXT2_FS_SECURITY If you are not using a security module that requires using extended attributes for file security labels, say N. - -config EXT2_FS_XIP - bool "Ext2 execute in place support" - depends on EXT2_FS && MMU - help - Execute in place can be used on memory-backed block devices. If you - enable this option, you can select to mount block devices which are - capable of this feature without using the page cache. - - If you do not use a block device that is capable of using this, - or if unsure, say N. diff --git a/fs/ext2/Makefile b/fs/ext2/Makefile index f42af45cfd88..445b0e996a12 100644 --- a/fs/ext2/Makefile +++ b/fs/ext2/Makefile @@ -10,4 +10,3 @@ ext2-y := balloc.o dir.o file.o ialloc.o inode.o \ ext2-$(CONFIG_EXT2_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o ext2-$(CONFIG_EXT2_FS_POSIX_ACL) += acl.o ext2-$(CONFIG_EXT2_FS_SECURITY) += xattr_security.o -ext2-$(CONFIG_EXT2_FS_XIP) += xip.o diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index e4279ead4a05..678f9ab08c48 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -380,10 +380,15 @@ struct ext2_inode { #define EXT2_MOUNT_NO_UID32 0x000200 /* Disable 32-bit UIDs */ #define EXT2_MOUNT_XATTR_USER 0x004000 /* Extended user attributes */ #define EXT2_MOUNT_POSIX_ACL 0x008000 /* POSIX Access Control Lists */ -#define EXT2_MOUNT_XIP 0x010000 /* Execute in place */ +#define EXT2_MOUNT_XIP 0x010000 /* Obsolete, use DAX */ #define EXT2_MOUNT_USRQUOTA 0x020000 /* user quota */ #define EXT2_MOUNT_GRPQUOTA 0x040000 /* group quota */ #define EXT2_MOUNT_RESERVATION 0x080000 /* Preallocation */ +#ifdef CONFIG_FS_DAX +#define EXT2_MOUNT_DAX 0x100000 /* Direct Access */ +#else +#define EXT2_MOUNT_DAX 0 +#endif #define clear_opt(o, opt) o &= ~EXT2_MOUNT_##opt @@ -788,11 +793,10 @@ extern int ext2_fsync(struct file *file, loff_t start, loff_t end, int datasync); extern const struct inode_operations ext2_file_inode_operations; extern const struct file_operations ext2_file_operations; -extern const struct file_operations ext2_xip_file_operations; +extern const struct file_operations ext2_dax_file_operations; /* inode.c */ extern const struct address_space_operations ext2_aops; -extern const struct address_space_operations ext2_aops_xip; extern const struct address_space_operations ext2_nobh_aops; /* namei.c */ diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 7c87b22a7228..e31701713516 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -25,6 +25,36 @@ #include "xattr.h" #include "acl.h" +#ifdef CONFIG_FS_DAX +static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + return dax_fault(vma, vmf, ext2_get_block); +} + +static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + return dax_mkwrite(vma, vmf, ext2_get_block); +} + +static const struct vm_operations_struct ext2_dax_vm_ops = { + .fault = ext2_dax_fault, + .page_mkwrite = ext2_dax_mkwrite, +}; + +static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + if (!IS_DAX(file_inode(file))) + return generic_file_mmap(file, vma); + + file_accessed(file); + vma->vm_ops = &ext2_dax_vm_ops; + vma->vm_flags |= VM_MIXEDMAP; + return 0; +} +#else +#define ext2_file_mmap generic_file_mmap +#endif + /* * Called when filp is released. This happens when all file descriptors * for a single struct file are closed. Note that different open() calls @@ -70,7 +100,7 @@ const struct file_operations ext2_file_operations = { #ifdef CONFIG_COMPAT .compat_ioctl = ext2_compat_ioctl, #endif - .mmap = generic_file_mmap, + .mmap = ext2_file_mmap, .open = dquot_file_open, .release = ext2_release_file, .fsync = ext2_fsync, @@ -78,16 +108,18 @@ const struct file_operations ext2_file_operations = { .splice_write = iter_file_splice_write, }; -#ifdef CONFIG_EXT2_FS_XIP -const struct file_operations ext2_xip_file_operations = { +#ifdef CONFIG_FS_DAX +const struct file_operations ext2_dax_file_operations = { .llseek = generic_file_llseek, - .read = xip_file_read, - .write = xip_file_write, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = generic_file_read_iter, + .write_iter = generic_file_write_iter, .unlocked_ioctl = ext2_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ext2_compat_ioctl, #endif - .mmap = xip_file_mmap, + .mmap = ext2_file_mmap, .open = dquot_file_open, .release = ext2_release_file, .fsync = ext2_fsync, diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 36d35c36311d..6434bc000125 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -34,7 +34,6 @@ #include <linux/aio.h> #include "ext2.h" #include "acl.h" -#include "xip.h" #include "xattr.h" static int __ext2_write_inode(struct inode *inode, int do_sync); @@ -731,12 +730,14 @@ static int ext2_get_blocks(struct inode *inode, goto cleanup; } - if (ext2_use_xip(inode->i_sb)) { + if (IS_DAX(inode)) { /* - * we need to clear the block + * block must be initialised before we put it in the tree + * so that it's not found by another thread before it's + * initialised */ - err = ext2_clear_xip_target (inode, - le32_to_cpu(chain[depth-1].key)); + err = dax_clear_blocks(inode, le32_to_cpu(chain[depth-1].key), + 1 << inode->i_blkbits); if (err) { mutex_unlock(&ei->truncate_mutex); goto cleanup; @@ -859,7 +860,12 @@ ext2_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, size_t count = iov_iter_count(iter); ssize_t ret; - ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, ext2_get_block); + if (IS_DAX(inode)) + ret = dax_do_io(rw, iocb, inode, iter, offset, ext2_get_block, + NULL, DIO_LOCKING); + else + ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, + ext2_get_block); if (ret < 0 && (rw & WRITE)) ext2_write_failed(mapping, offset + count); return ret; @@ -885,11 +891,6 @@ const struct address_space_operations ext2_aops = { .error_remove_page = generic_error_remove_page, }; -const struct address_space_operations ext2_aops_xip = { - .bmap = ext2_bmap, - .get_xip_mem = ext2_get_xip_mem, -}; - const struct address_space_operations ext2_nobh_aops = { .readpage = ext2_readpage, .readpages = ext2_readpages, @@ -1201,8 +1202,8 @@ static int ext2_setsize(struct inode *inode, loff_t newsize) inode_dio_wait(inode); - if (mapping_is_xip(inode->i_mapping)) - error = xip_truncate_page(inode->i_mapping, newsize); + if (IS_DAX(inode)) + error = dax_truncate_page(inode, newsize, ext2_get_block); else if (test_opt(inode->i_sb, NOBH)) error = nobh_truncate_page(inode->i_mapping, newsize, ext2_get_block); @@ -1273,7 +1274,8 @@ void ext2_set_inode_flags(struct inode *inode) { unsigned int flags = EXT2_I(inode)->i_flags; - inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); + inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | + S_DIRSYNC | S_DAX); if (flags & EXT2_SYNC_FL) inode->i_flags |= S_SYNC; if (flags & EXT2_APPEND_FL) @@ -1284,6 +1286,8 @@ void ext2_set_inode_flags(struct inode *inode) inode->i_flags |= S_NOATIME; if (flags & EXT2_DIRSYNC_FL) inode->i_flags |= S_DIRSYNC; + if (test_opt(inode->i_sb, DAX)) + inode->i_flags |= S_DAX; } /* Propagate flags from i_flags to EXT2_I(inode)->i_flags */ @@ -1384,9 +1388,9 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino) if (S_ISREG(inode->i_mode)) { inode->i_op = &ext2_file_inode_operations; - if (ext2_use_xip(inode->i_sb)) { - inode->i_mapping->a_ops = &ext2_aops_xip; - inode->i_fop = &ext2_xip_file_operations; + if (test_opt(inode->i_sb, DAX)) { + inode->i_mapping->a_ops = &ext2_aops; + inode->i_fop = &ext2_dax_file_operations; } else if (test_opt(inode->i_sb, NOBH)) { inode->i_mapping->a_ops = &ext2_nobh_aops; inode->i_fop = &ext2_file_operations; diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index c268d0af1db9..148f6e3789ea 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -35,7 +35,6 @@ #include "ext2.h" #include "xattr.h" #include "acl.h" -#include "xip.h" static inline int ext2_add_nondir(struct dentry *dentry, struct inode *inode) { @@ -105,9 +104,9 @@ static int ext2_create (struct inode * dir, struct dentry * dentry, umode_t mode return PTR_ERR(inode); inode->i_op = &ext2_file_inode_operations; - if (ext2_use_xip(inode->i_sb)) { - inode->i_mapping->a_ops = &ext2_aops_xip; - inode->i_fop = &ext2_xip_file_operations; + if (test_opt(inode->i_sb, DAX)) { + inode->i_mapping->a_ops = &ext2_aops; + inode->i_fop = &ext2_dax_file_operations; } else if (test_opt(inode->i_sb, NOBH)) { inode->i_mapping->a_ops = &ext2_nobh_aops; inode->i_fop = &ext2_file_operations; @@ -126,9 +125,9 @@ static int ext2_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) return PTR_ERR(inode); inode->i_op = &ext2_file_inode_operations; - if (ext2_use_xip(inode->i_sb)) { - inode->i_mapping->a_ops = &ext2_aops_xip; - inode->i_fop = &ext2_xip_file_operations; + if (test_opt(inode->i_sb, DAX)) { + inode->i_mapping->a_ops = &ext2_aops; + inode->i_fop = &ext2_dax_file_operations; } else if (test_opt(inode->i_sb, NOBH)) { inode->i_mapping->a_ops = &ext2_nobh_aops; inode->i_fop = &ext2_file_operations; diff --git a/fs/ext2/super.c b/fs/ext2/super.c index ae55fddc26a9..d0e746e96511 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -35,7 +35,6 @@ #include "ext2.h" #include "xattr.h" #include "acl.h" -#include "xip.h" static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es, int wait); @@ -292,9 +291,11 @@ static int ext2_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",grpquota"); #endif -#if defined(CONFIG_EXT2_FS_XIP) +#ifdef CONFIG_FS_DAX if (sbi->s_mount_opt & EXT2_MOUNT_XIP) seq_puts(seq, ",xip"); + if (sbi->s_mount_opt & EXT2_MOUNT_DAX) + seq_puts(seq, ",dax"); #endif if (!test_opt(sb, RESERVATION)) @@ -403,7 +404,7 @@ enum { Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro, Opt_nouid32, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov, Opt_nobh, Opt_user_xattr, Opt_nouser_xattr, - Opt_acl, Opt_noacl, Opt_xip, Opt_ignore, Opt_err, Opt_quota, + Opt_acl, Opt_noacl, Opt_xip, Opt_dax, Opt_ignore, Opt_err, Opt_quota, Opt_usrquota, Opt_grpquota, Opt_reservation, Opt_noreservation }; @@ -432,6 +433,7 @@ static const match_table_t tokens = { {Opt_acl, "acl"}, {Opt_noacl, "noacl"}, {Opt_xip, "xip"}, + {Opt_dax, "dax"}, {Opt_grpquota, "grpquota"}, {Opt_ignore, "noquota"}, {Opt_quota, "quota"}, @@ -559,10 +561,14 @@ static int parse_options(char *options, struct super_block *sb) break; #endif case Opt_xip: -#ifdef CONFIG_EXT2_FS_XIP - set_opt (sbi->s_mount_opt, XIP); + ext2_msg(sb, KERN_INFO, "use dax instead of xip"); + set_opt(sbi->s_mount_opt, XIP); + /* Fall through */ + case Opt_dax: +#ifdef CONFIG_FS_DAX + set_opt(sbi->s_mount_opt, DAX); #else - ext2_msg(sb, KERN_INFO, "xip option not supported"); + ext2_msg(sb, KERN_INFO, "dax option not supported"); #endif break; @@ -877,9 +883,6 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) ((EXT2_SB(sb)->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); - ext2_xip_verify_sb(sb); /* see if bdev supports xip, unset - EXT2_MOUNT_XIP if not */ - if (le32_to_cpu(es->s_rev_level) == EXT2_GOOD_OLD_REV && (EXT2_HAS_COMPAT_FEATURE(sb, ~0U) || EXT2_HAS_RO_COMPAT_FEATURE(sb, ~0U) || @@ -909,11 +912,17 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size); - if (ext2_use_xip(sb) && blocksize != PAGE_SIZE) { - if (!silent) + if (sbi->s_mount_opt & EXT2_MOUNT_DAX) { + if (blocksize != PAGE_SIZE) { ext2_msg(sb, KERN_ERR, - "error: unsupported blocksize for xip"); - goto failed_mount; + "error: unsupported blocksize for dax"); + goto failed_mount; + } + if (!sb->s_bdev->bd_disk->fops->direct_access) { + ext2_msg(sb, KERN_ERR, + "error: device does not support dax"); + goto failed_mount; + } } /* If the blocksize doesn't match, re-read the thing.. */ @@ -1259,7 +1268,6 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) { struct ext2_sb_info * sbi = EXT2_SB(sb); struct ext2_super_block * es; - unsigned long old_mount_opt = sbi->s_mount_opt; struct ext2_mount_options old_opts; unsigned long old_sb_flags; int err; @@ -1284,22 +1292,11 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | ((sbi->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); - ext2_xip_verify_sb(sb); /* see if bdev supports xip, unset - EXT2_MOUNT_XIP if not */ - - if ((ext2_use_xip(sb)) && (sb->s_blocksize != PAGE_SIZE)) { - ext2_msg(sb, KERN_WARNING, - "warning: unsupported blocksize for xip"); - err = -EINVAL; - goto restore_opts; - } - es = sbi->s_es; - if ((sbi->s_mount_opt ^ old_mount_opt) & EXT2_MOUNT_XIP) { + if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT2_MOUNT_DAX) { ext2_msg(sb, KERN_WARNING, "warning: refusing change of " - "xip flag with busy inodes while remounting"); - sbi->s_mount_opt &= ~EXT2_MOUNT_XIP; - sbi->s_mount_opt |= old_mount_opt & EXT2_MOUNT_XIP; + "dax flag with busy inodes while remounting"); + sbi->s_mount_opt ^= EXT2_MOUNT_DAX; } if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { spin_unlock(&sbi->s_lock); diff --git a/fs/ext2/xip.c b/fs/ext2/xip.c deleted file mode 100644 index bbc5fec6ff7f..000000000000 --- a/fs/ext2/xip.c +++ /dev/null @@ -1,86 +0,0 @@ -/* - * linux/fs/ext2/xip.c - * - * Copyright (C) 2005 IBM Corporation - * Author: Carsten Otte (cotte@de.ibm.com) - */ - -#include <linux/mm.h> -#include <linux/fs.h> -#include <linux/genhd.h> -#include <linux/buffer_head.h> -#include <linux/blkdev.h> -#include "ext2.h" -#include "xip.h" - -static inline long __inode_direct_access(struct inode *inode, sector_t block, - void **kaddr, unsigned long *pfn, long size) -{ - struct block_device *bdev = inode->i_sb->s_bdev; - sector_t sector = block * (PAGE_SIZE / 512); - return bdev_direct_access(bdev, sector, kaddr, pfn, size); -} - -static inline int -__ext2_get_block(struct inode *inode, pgoff_t pgoff, int create, - sector_t *result) -{ - struct buffer_head tmp; - int rc; - - memset(&tmp, 0, sizeof(struct buffer_head)); - tmp.b_size = 1 << inode->i_blkbits; - rc = ext2_get_block(inode, pgoff, &tmp, create); - *result = tmp.b_blocknr; - - /* did we get a sparse block (hole in the file)? */ - if (!tmp.b_blocknr && !rc) { - BUG_ON(create); - rc = -ENODATA; - } - - return rc; -} - -int -ext2_clear_xip_target(struct inode *inode, sector_t block) -{ - void *kaddr; - unsigned long pfn; - long size; - - size = __inode_direct_access(inode, block, &kaddr, &pfn, PAGE_SIZE); - if (size < 0) - return size; - clear_page(kaddr); - return 0; -} - -void ext2_xip_verify_sb(struct super_block *sb) -{ - struct ext2_sb_info *sbi = EXT2_SB(sb); - - if ((sbi->s_mount_opt & EXT2_MOUNT_XIP) && - !sb->s_bdev->bd_disk->fops->direct_access) { - sbi->s_mount_opt &= (~EXT2_MOUNT_XIP); - ext2_msg(sb, KERN_WARNING, - "warning: ignoring xip option - " - "not supported by bdev"); - } -} - -int ext2_get_xip_mem(struct address_space *mapping, pgoff_t pgoff, int create, - void **kmem, unsigned long *pfn) -{ - long rc; - sector_t block; - - /* first, retrieve the sector number */ - rc = __ext2_get_block(mapping->host, pgoff, create, &block); - if (rc) - return rc; - - /* retrieve address of the target data */ - rc = __inode_direct_access(mapping->host, block, kmem, pfn, PAGE_SIZE); - return (rc < 0) ? rc : 0; -} diff --git a/fs/ext2/xip.h b/fs/ext2/xip.h deleted file mode 100644 index 18b34d2f31b3..000000000000 --- a/fs/ext2/xip.h +++ /dev/null @@ -1,26 +0,0 @@ -/* - * linux/fs/ext2/xip.h - * - * Copyright (C) 2005 IBM Corporation - * Author: Carsten Otte (cotte@de.ibm.com) - */ - -#ifdef CONFIG_EXT2_FS_XIP -extern void ext2_xip_verify_sb (struct super_block *); -extern int ext2_clear_xip_target (struct inode *, sector_t); - -static inline int ext2_use_xip (struct super_block *sb) -{ - struct ext2_sb_info *sbi = EXT2_SB(sb); - return (sbi->s_mount_opt & EXT2_MOUNT_XIP); -} -int ext2_get_xip_mem(struct address_space *, pgoff_t, int, - void **, unsigned long *); -#define mapping_is_xip(map) unlikely(map->a_ops->get_xip_mem) -#else -#define mapping_is_xip(map) 0 -#define ext2_xip_verify_sb(sb) do { } while (0) -#define ext2_use_xip(sb) 0 -#define ext2_clear_xip_target(inode, chain) 0 -#define ext2_get_xip_mem NULL -#endif diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index a75fba67bb1f..982d934fd9ac 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -965,6 +965,11 @@ struct ext4_inode_info { #define EXT4_MOUNT_ERRORS_MASK 0x00070 #define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */ #define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/ +#ifdef CONFIG_FS_DAX +#define EXT4_MOUNT_DAX 0x00200 /* Direct Access */ +#else +#define EXT4_MOUNT_DAX 0 +#endif #define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */ #define EXT4_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */ #define EXT4_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */ @@ -2578,6 +2583,7 @@ extern const struct file_operations ext4_dir_operations; /* file.c */ extern const struct inode_operations ext4_file_inode_operations; extern const struct file_operations ext4_file_operations; +extern const struct file_operations ext4_dax_file_operations; extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin); /* inline.c */ diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 7cb592386121..33a09da16c9c 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -95,7 +95,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) struct inode *inode = file_inode(iocb->ki_filp); struct mutex *aio_mutex = NULL; struct blk_plug plug; - int o_direct = file->f_flags & O_DIRECT; + int o_direct = io_is_direct(file); int overwrite = 0; size_t length = iov_iter_count(from); ssize_t ret; @@ -191,6 +191,26 @@ errout: return ret; } +#ifdef CONFIG_FS_DAX +static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + return dax_fault(vma, vmf, ext4_get_block); + /* Is this the right get_block? */ +} + +static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + return dax_mkwrite(vma, vmf, ext4_get_block); +} + +static const struct vm_operations_struct ext4_dax_vm_ops = { + .fault = ext4_dax_fault, + .page_mkwrite = ext4_dax_mkwrite, +}; +#else +#define ext4_dax_vm_ops ext4_file_vm_ops +#endif + static const struct vm_operations_struct ext4_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, @@ -200,7 +220,12 @@ static const struct vm_operations_struct ext4_file_vm_ops = { static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) { file_accessed(file); - vma->vm_ops = &ext4_file_vm_ops; + if (IS_DAX(file_inode(file))) { + vma->vm_ops = &ext4_dax_vm_ops; + vma->vm_flags |= VM_MIXEDMAP; + } else { + vma->vm_ops = &ext4_file_vm_ops; + } return 0; } @@ -599,6 +624,26 @@ const struct file_operations ext4_file_operations = { .fallocate = ext4_fallocate, }; +#ifdef CONFIG_FS_DAX +const struct file_operations ext4_dax_file_operations = { + .llseek = ext4_llseek, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = generic_file_read_iter, + .write_iter = ext4_file_write_iter, + .unlocked_ioctl = ext4_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ext4_compat_ioctl, +#endif + .mmap = ext4_file_mmap, + .open = ext4_file_open, + .release = ext4_release_file, + .fsync = ext4_sync_file, + /* Splice not yet supported with DAX */ + .fallocate = ext4_fallocate, +}; +#endif + const struct inode_operations ext4_file_inode_operations = { .setattr = ext4_setattr, .getattr = ext4_getattr, diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 36b369697a13..6b9878a24182 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -689,14 +689,22 @@ retry: inode_dio_done(inode); goto locked; } - ret = __blockdev_direct_IO(rw, iocb, inode, - inode->i_sb->s_bdev, iter, offset, - ext4_get_block, NULL, NULL, 0); + if (IS_DAX(inode)) + ret = dax_do_io(rw, iocb, inode, iter, offset, + ext4_get_block, NULL, 0); + else + ret = __blockdev_direct_IO(rw, iocb, inode, + inode->i_sb->s_bdev, iter, offset, + ext4_get_block, NULL, NULL, 0); inode_dio_done(inode); } else { locked: - ret = blockdev_direct_IO(rw, iocb, inode, iter, - offset, ext4_get_block); + if (IS_DAX(inode)) + ret = dax_do_io(rw, iocb, inode, iter, offset, + ext4_get_block, NULL, DIO_LOCKING); + else + ret = blockdev_direct_IO(rw, iocb, inode, iter, + offset, ext4_get_block); if (unlikely((rw & WRITE) && ret < 0)) { loff_t isize = i_size_read(inode); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5653fa42930b..28555f191b62 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -657,6 +657,18 @@ has_zeroout: return retval; } +static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate) +{ + struct inode *inode = bh->b_assoc_map->host; + /* XXX: breaks on 32-bit > 16GB. Is that even supported? */ + loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits; + int err; + if (!uptodate) + return; + WARN_ON(!buffer_unwritten(bh)); + err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size); +} + /* Maximum number of blocks we map for direct IO at once. */ #define DIO_MAX_BLOCKS 4096 @@ -694,6 +706,11 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock, map_bh(bh, inode->i_sb, map.m_pblk); bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; + if (IS_DAX(inode) && buffer_unwritten(bh) && !io_end) { + bh->b_assoc_map = inode->i_mapping; + bh->b_private = (void *)(unsigned long)iblock; + bh->b_end_io = ext4_end_io_unwritten; + } if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN) set_buffer_defer_completion(bh); bh->b_size = inode->i_sb->s_blocksize * map.m_len; @@ -3010,13 +3027,14 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, get_block_func = ext4_get_block_write; dio_flags = DIO_LOCKING; } - ret = __blockdev_direct_IO(rw, iocb, inode, - inode->i_sb->s_bdev, iter, - offset, - get_block_func, - ext4_end_io_dio, - NULL, - dio_flags); + if (IS_DAX(inode)) + ret = dax_do_io(rw, iocb, inode, iter, offset, get_block_func, + ext4_end_io_dio, dio_flags); + else + ret = __blockdev_direct_IO(rw, iocb, inode, + inode->i_sb->s_bdev, iter, offset, + get_block_func, + ext4_end_io_dio, NULL, dio_flags); /* * Put our reference to io_end. This can free the io_end structure e.g. @@ -3180,19 +3198,12 @@ void ext4_set_aops(struct inode *inode) inode->i_mapping->a_ops = &ext4_aops; } -/* - * ext4_block_zero_page_range() zeros out a mapping of length 'length' - * starting from file offset 'from'. The range to be zero'd must - * be contained with in one block. If the specified range exceeds - * the end of the block it will be shortened to end of the block - * that cooresponds to 'from' - */ -static int ext4_block_zero_page_range(handle_t *handle, +static int __ext4_block_zero_page_range(handle_t *handle, struct address_space *mapping, loff_t from, loff_t length) { ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; unsigned offset = from & (PAGE_CACHE_SIZE-1); - unsigned blocksize, max, pos; + unsigned blocksize, pos; ext4_lblk_t iblock; struct inode *inode = mapping->host; struct buffer_head *bh; @@ -3205,14 +3216,6 @@ static int ext4_block_zero_page_range(handle_t *handle, return -ENOMEM; blocksize = inode->i_sb->s_blocksize; - max = blocksize - (offset & (blocksize - 1)); - - /* - * correct length if it does not fall between - * 'from' and the end of the block - */ - if (length > max || length < 0) - length = max; iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); @@ -3278,6 +3281,33 @@ unlock: } /* + * ext4_block_zero_page_range() zeros out a mapping of length 'length' + * starting from file offset 'from'. The range to be zero'd must + * be contained with in one block. If the specified range exceeds + * the end of the block it will be shortened to end of the block + * that cooresponds to 'from' + */ +static int ext4_block_zero_page_range(handle_t *handle, + struct address_space *mapping, loff_t from, loff_t length) +{ + struct inode *inode = mapping->host; + unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned blocksize = inode->i_sb->s_blocksize; + unsigned max = blocksize - (offset & (blocksize - 1)); + + /* + * correct length if it does not fall between + * 'from' and the end of the block + */ + if (length > max || length < 0) + length = max; + + if (IS_DAX(inode)) + return dax_zero_page_range(inode, from, length, ext4_get_block); + return __ext4_block_zero_page_range(handle, mapping, from, length); +} + +/* * ext4_block_truncate_page() zeroes out a mapping from file offset `from' * up to the end of the block which corresponds to `from'. * This required during truncate. We need to physically zero the tail end @@ -3798,8 +3828,10 @@ void ext4_set_inode_flags(struct inode *inode) new_fl |= S_NOATIME; if (flags & EXT4_DIRSYNC_FL) new_fl |= S_DIRSYNC; + if (test_opt(inode->i_sb, DAX)) + new_fl |= S_DAX; inode_set_flags(inode, new_fl, - S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); + S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX); } /* Propagate flags from i_flags to EXT4_I(inode)->i_flags */ @@ -4052,7 +4084,10 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) if (S_ISREG(inode->i_mode)) { inode->i_op = &ext4_file_inode_operations; - inode->i_fop = &ext4_file_operations; + if (test_opt(inode->i_sb, DAX)) + inode->i_fop = &ext4_dax_file_operations; + else + inode->i_fop = &ext4_file_operations; ext4_set_aops(inode); } else if (S_ISDIR(inode->i_mode)) { inode->i_op = &ext4_dir_inode_operations; @@ -4534,7 +4569,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) * Truncate pagecache after we've waited for commit * in data=journal mode to make pages freeable. */ - truncate_pagecache(inode, inode->i_size); + truncate_pagecache(inode, inode->i_size); } /* * We want to call ext4_truncate() even if attr->ia_size == diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 2291923dae4e..28fe71a2904c 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2235,7 +2235,10 @@ retry: err = PTR_ERR(inode); if (!IS_ERR(inode)) { inode->i_op = &ext4_file_inode_operations; - inode->i_fop = &ext4_file_operations; + if (test_opt(inode->i_sb, DAX)) + inode->i_fop = &ext4_dax_file_operations; + else + inode->i_fop = &ext4_file_operations; ext4_set_aops(inode); err = ext4_add_nondir(handle, dentry, inode); if (!err && IS_DIRSYNC(dir)) @@ -2299,7 +2302,10 @@ retry: err = PTR_ERR(inode); if (!IS_ERR(inode)) { inode->i_op = &ext4_file_inode_operations; - inode->i_fop = &ext4_file_operations; + if (test_opt(inode->i_sb, DAX)) + inode->i_fop = &ext4_dax_file_operations; + else + inode->i_fop = &ext4_file_operations; ext4_set_aops(inode); d_tmpfile(dentry, inode); err = ext4_orphan_add(handle, inode); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 64c39c7c594f..10e8c6b7ca08 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1124,7 +1124,7 @@ enum { Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err, - Opt_usrquota, Opt_grpquota, Opt_i_version, + Opt_usrquota, Opt_grpquota, Opt_i_version, Opt_dax, Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit, Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity, Opt_inode_readahead_blks, Opt_journal_ioprio, @@ -1187,6 +1187,7 @@ static const match_table_t tokens = { {Opt_barrier, "barrier"}, {Opt_nobarrier, "nobarrier"}, {Opt_i_version, "i_version"}, + {Opt_dax, "dax"}, {Opt_stripe, "stripe=%u"}, {Opt_delalloc, "delalloc"}, {Opt_nodelalloc, "nodelalloc"}, @@ -1371,6 +1372,7 @@ static const struct mount_opts { {Opt_min_batch_time, 0, MOPT_GTE0}, {Opt_inode_readahead_blks, 0, MOPT_GTE0}, {Opt_init_itable, 0, MOPT_GTE0}, + {Opt_dax, EXT4_MOUNT_DAX, MOPT_SET}, {Opt_stripe, 0, MOPT_GTE0}, {Opt_resuid, 0, MOPT_GTE0}, {Opt_resgid, 0, MOPT_GTE0}, @@ -1607,6 +1609,11 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, } sbi->s_jquota_fmt = m->mount_opt; #endif +#ifndef CONFIG_FS_DAX + } else if (token == Opt_dax) { + ext4_msg(sb, KERN_INFO, "dax option not supported"); + return -1; +#endif } else { if (!args->from) arg = 1; @@ -3589,6 +3596,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) "both data=journal and dioread_nolock"); goto failed_mount; } + if (test_opt(sb, DAX)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "both data=journal and dax"); + goto failed_mount; + } if (test_opt(sb, DELALLOC)) clear_opt(sb, DELALLOC); } @@ -3652,6 +3664,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount; } + if (sbi->s_mount_opt & EXT4_MOUNT_DAX) { + if (blocksize != PAGE_SIZE) { + ext4_msg(sb, KERN_ERR, + "error: unsupported blocksize for dax"); + goto failed_mount; + } + if (!sb->s_bdev->bd_disk->fops->direct_access) { + ext4_msg(sb, KERN_ERR, + "error: device does not support dax"); + goto failed_mount; + } + } + if (sb->s_blocksize != blocksize) { /* Validate the filesystem blocksize */ if (!sb_set_blocksize(sb, blocksize)) { @@ -4869,6 +4894,18 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) err = -EINVAL; goto restore_opts; } + if (test_opt(sb, DAX)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "both data=journal and dax"); + err = -EINVAL; + goto restore_opts; + } + } + + if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_DAX) { + ext4_msg(sb, KERN_WARNING, "warning: refusing change of " + "dax flag with busy inodes while remounting"); + sbi->s_mount_opt ^= EXT4_MOUNT_DAX; } if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 46d93e941f3d..44db1808cdb5 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -28,6 +28,7 @@ #include <linux/pipe_fs_i.h> #include <linux/mpage.h> #include <linux/quotaops.h> +#include <linux/blkdev.h> #include <cluster/masklog.h> @@ -47,6 +48,9 @@ #include "ocfs2_trace.h" #include "buffer_head_io.h" +#include "dir.h" +#include "namei.h" +#include "sysfile.h" static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) @@ -506,18 +510,21 @@ bail: * * called like this: dio->get_blocks(dio->inode, fs_startblk, * fs_count, map_bh, dio->rw == WRITE); - * - * Note that we never bother to allocate blocks here, and thus ignore the - * create argument. */ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { int ret; + u32 cpos = 0; + int alloc_locked = 0; u64 p_blkno, inode_blocks, contig_blocks; unsigned int ext_flags; unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits; + unsigned long len = bh_result->b_size; + unsigned int clusters_to_alloc = 0; + + cpos = ocfs2_blocks_to_clusters(inode->i_sb, iblock); /* This function won't even be called if the request isn't all * nicely aligned and of the right size, so there's no need @@ -539,6 +546,40 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, /* We should already CoW the refcounted extent in case of create. */ BUG_ON(create && (ext_flags & OCFS2_EXT_REFCOUNTED)); + /* allocate blocks if no p_blkno is found, and create == 1 */ + if (!p_blkno && create) { + ret = ocfs2_inode_lock(inode, NULL, 1); + if (ret < 0) { + mlog_errno(ret); + goto bail; + } + + alloc_locked = 1; + + /* fill hole, allocate blocks can't be larger than the size + * of the hole */ + clusters_to_alloc = ocfs2_clusters_for_bytes(inode->i_sb, len); + if (clusters_to_alloc > contig_blocks) + clusters_to_alloc = contig_blocks; + + /* allocate extent and insert them into the extent tree */ + ret = ocfs2_extend_allocation(inode, cpos, + clusters_to_alloc, 0); + if (ret < 0) { + mlog_errno(ret); + goto bail; + } + + ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, + &contig_blocks, &ext_flags); + if (ret < 0) { + mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n", + (unsigned long long)iblock); + ret = -EIO; + goto bail; + } + } + /* * get_more_blocks() expects us to describe a hole by clearing * the mapped bit on bh_result(). @@ -556,6 +597,8 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, contig_blocks = max_blocks; bh_result->b_size = contig_blocks << blocksize_bits; bail: + if (alloc_locked) + ocfs2_inode_unlock(inode, 1); return ret; } @@ -597,6 +640,184 @@ static int ocfs2_releasepage(struct page *page, gfp_t wait) return try_to_free_buffers(page); } +static int ocfs2_is_overwrite(struct ocfs2_super *osb, + struct inode *inode, loff_t offset) +{ + int ret = 0; + u32 v_cpos = 0; + u32 p_cpos = 0; + unsigned int num_clusters = 0; + unsigned int ext_flags = 0; + + v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset); + ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, + &num_clusters, &ext_flags); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) + return 1; + + return 0; +} + +static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, + struct iov_iter *iter, + loff_t offset) +{ + ssize_t ret = 0; + ssize_t written = 0; + bool orphaned = false; + int is_overwrite = 0; + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file)->i_mapping->host; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct buffer_head *di_bh = NULL; + size_t count = iter->count; + journal_t *journal = osb->journal->j_journal; + u32 zero_len; + int cluster_align; + loff_t final_size = offset + count; + int append_write = offset >= i_size_read(inode) ? 1 : 0; + unsigned int num_clusters = 0; + unsigned int ext_flags = 0; + + { + u64 o = offset; + + zero_len = do_div(o, 1 << osb->s_clustersize_bits); + cluster_align = !zero_len; + } + + /* + * when final_size > inode->i_size, inode->i_size will be + * updated after direct write, so add the inode to orphan + * dir first. + */ + if (final_size > i_size_read(inode)) { + ret = ocfs2_add_inode_to_orphan(osb, inode); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + orphaned = true; + } + + if (append_write) { + ret = ocfs2_inode_lock(inode, &di_bh, 1); + if (ret < 0) { + mlog_errno(ret); + goto clean_orphan; + } + + if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) + ret = ocfs2_zero_extend(inode, di_bh, offset); + else + ret = ocfs2_extend_no_holes(inode, di_bh, offset, + offset); + if (ret < 0) { + mlog_errno(ret); + ocfs2_inode_unlock(inode, 1); + brelse(di_bh); + goto clean_orphan; + } + + is_overwrite = ocfs2_is_overwrite(osb, inode, offset); + if (is_overwrite < 0) { + mlog_errno(is_overwrite); + ocfs2_inode_unlock(inode, 1); + brelse(di_bh); + goto clean_orphan; + } + + ocfs2_inode_unlock(inode, 1); + brelse(di_bh); + di_bh = NULL; + } + + written = __blockdev_direct_IO(WRITE, iocb, inode, inode->i_sb->s_bdev, + iter, offset, + ocfs2_direct_IO_get_blocks, + ocfs2_dio_end_io, NULL, 0); + if (unlikely(written < 0)) { + loff_t i_size = i_size_read(inode); + + if (offset + count > i_size) { + ret = ocfs2_inode_lock(inode, &di_bh, 1); + if (ret < 0) { + mlog_errno(ret); + goto clean_orphan; + } + + if (i_size == i_size_read(inode)) { + ret = ocfs2_truncate_file(inode, di_bh, + i_size); + if (ret < 0) { + if (ret != -ENOSPC) + mlog_errno(ret); + + ocfs2_inode_unlock(inode, 1); + brelse(di_bh); + goto clean_orphan; + } + } + + ocfs2_inode_unlock(inode, 1); + brelse(di_bh); + + ret = jbd2_journal_force_commit(journal); + if (ret < 0) + mlog_errno(ret); + } + } else if (written < 0 && append_write && !is_overwrite && + !cluster_align) { + u32 p_cpos = 0; + u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset); + + ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, + &num_clusters, &ext_flags); + if (ret < 0) { + mlog_errno(ret); + goto clean_orphan; + } + + BUG_ON(!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN)); + + ret = blkdev_issue_zeroout(osb->sb->s_bdev, + p_cpos << (osb->s_clustersize_bits - 9), + zero_len >> 9, GFP_KERNEL, false); + if (ret < 0) + mlog_errno(ret); + } + +clean_orphan: + if (orphaned) { + int tmp_ret; + int update_isize = written > 0 ? 1 : 0; + loff_t end = update_isize ? offset + written : 0; + + tmp_ret = ocfs2_del_inode_from_orphan(osb, inode, + update_isize, end); + if (tmp_ret < 0) { + ret = tmp_ret; + goto out; + } + + tmp_ret = jbd2_journal_force_commit(journal); + if (tmp_ret < 0) { + ret = tmp_ret; + mlog_errno(tmp_ret); + } + } + +out: + if (ret >= 0) + ret = written; + return ret; +} + static ssize_t ocfs2_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, @@ -604,6 +825,9 @@ static ssize_t ocfs2_direct_IO(int rw, { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file)->i_mapping->host; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + int full_coherency = !(osb->s_mount_opt & + OCFS2_MOUNT_COHERENCY_BUFFERED); /* * Fallback to buffered I/O if we see an inode without @@ -612,14 +836,20 @@ static ssize_t ocfs2_direct_IO(int rw, if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) return 0; - /* Fallback to buffered I/O if we are appending. */ - if (i_size_read(inode) <= offset) + /* Fallback to buffered I/O if we are appending and + * concurrent O_DIRECT writes are allowed. + */ + if (i_size_read(inode) <= offset && !full_coherency) return 0; - return __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, + if (rw == READ) + return __blockdev_direct_IO(rw, iocb, inode, + inode->i_sb->s_bdev, iter, offset, ocfs2_direct_IO_get_blocks, ocfs2_dio_end_io, NULL, 0); + else + return ocfs2_direct_IO_write(iocb, iter, offset); } static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb, diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index e0f04d55fd05..46e0d4e857c7 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -295,7 +295,7 @@ out: return ret; } -static int ocfs2_set_inode_size(handle_t *handle, +int ocfs2_set_inode_size(handle_t *handle, struct inode *inode, struct buffer_head *fe_bh, u64 new_i_size) @@ -441,7 +441,7 @@ out: return status; } -static int ocfs2_truncate_file(struct inode *inode, +int ocfs2_truncate_file(struct inode *inode, struct buffer_head *di_bh, u64 new_i_size) { @@ -709,6 +709,13 @@ leave: return status; } +int ocfs2_extend_allocation(struct inode *inode, u32 logical_start, + u32 clusters_to_add, int mark_unwritten) +{ + return __ocfs2_extend_allocation(inode, logical_start, + clusters_to_add, mark_unwritten); +} + /* * While a write will already be ordering the data, a truncate will not. * Thus, we need to explicitly order the zeroed pages. @@ -2109,6 +2116,9 @@ static int ocfs2_prepare_inode_for_write(struct file *file, struct dentry *dentry = file->f_path.dentry; struct inode *inode = dentry->d_inode; loff_t saved_pos = 0, end; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + int full_coherency = !(osb->s_mount_opt & + OCFS2_MOUNT_COHERENCY_BUFFERED); /* * We start with a read level meta lock and only jump to an ex @@ -2197,7 +2207,16 @@ static int ocfs2_prepare_inode_for_write(struct file *file, * one node could wind up truncating another * nodes writes. */ - if (end > i_size_read(inode)) { + if (end > i_size_read(inode) && !full_coherency) { + *direct_io = 0; + break; + } + + /* + * Fallback to old way if the feature bit is not set. + */ + if (end > i_size_read(inode) && + !ocfs2_supports_append_dio(osb)) { *direct_io = 0; break; } @@ -2210,7 +2229,13 @@ static int ocfs2_prepare_inode_for_write(struct file *file, */ ret = ocfs2_check_range_for_holes(inode, saved_pos, count); if (ret == 1) { - *direct_io = 0; + /* + * Fallback to old way if the feature bit is not set. + * Otherwise try dio first and then complete the rest + * request through buffer io. + */ + if (!ocfs2_supports_append_dio(osb)) + *direct_io = 0; ret = 0; } else if (ret < 0) mlog_errno(ret); @@ -2243,6 +2268,7 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, u32 old_clusters; struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); + struct address_space *mapping = file->f_mapping; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); int full_coherency = !(osb->s_mount_opt & OCFS2_MOUNT_COHERENCY_BUFFERED); @@ -2357,11 +2383,51 @@ relock: iov_iter_truncate(from, count); if (direct_io) { + loff_t endbyte; + ssize_t written_buffered; written = generic_file_direct_write(iocb, from, *ppos); - if (written < 0) { + if (written < 0 || written == count) { ret = written; goto out_dio; } + + /* + * for completing the rest of the request. + */ + *ppos += written; + count -= written; + written_buffered = generic_perform_write(file, from, *ppos); + /* + * If generic_file_buffered_write() returned a synchronous error + * then we want to return the number of bytes which were + * direct-written, or the error code if that was zero. Note + * that this differs from normal direct-io semantics, which + * will return -EFOO even if some bytes were written. + */ + if (written_buffered < 0) { + ret = written_buffered; + goto out_dio; + } + + iocb->ki_pos = *ppos + written_buffered; + /* We need to ensure that the page cache pages are written to + * disk and invalidated to preserve the expected O_DIRECT + * semantics. + */ + endbyte = *ppos + written_buffered - 1; + ret = filemap_write_and_wait_range(file->f_mapping, *ppos, + endbyte); + if (ret == 0) { + written += written_buffered; + invalidate_mapping_pages(mapping, + *ppos >> PAGE_CACHE_SHIFT, + endbyte >> PAGE_CACHE_SHIFT); + } else { + /* + * We don't know how much we wrote, so just return + * the number of bytes which were direct-written + */ + } } else { current->backing_dev_info = inode_to_bdi(inode); written = generic_perform_write(file, from, *ppos); diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h index 97bf761c9e7c..e8c62f22215c 100644 --- a/fs/ocfs2/file.h +++ b/fs/ocfs2/file.h @@ -51,13 +51,22 @@ int ocfs2_add_inode_data(struct ocfs2_super *osb, struct ocfs2_alloc_context *data_ac, struct ocfs2_alloc_context *meta_ac, enum ocfs2_alloc_restarted *reason_ret); +int ocfs2_set_inode_size(handle_t *handle, + struct inode *inode, + struct buffer_head *fe_bh, + u64 new_i_size); int ocfs2_simple_size_update(struct inode *inode, struct buffer_head *di_bh, u64 new_i_size); +int ocfs2_truncate_file(struct inode *inode, + struct buffer_head *di_bh, + u64 new_i_size); int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh, u64 new_i_size, u64 zero_to); int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh, loff_t zero_to); +int ocfs2_extend_allocation(struct inode *inode, u32 logical_start, + u32 clusters_to_add, int mark_unwritten); int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index c8b25de9efbb..3025c0da6b8a 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -648,7 +648,7 @@ static int ocfs2_remove_inode(struct inode *inode, if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) { status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode, - orphan_dir_bh); + orphan_dir_bh, false); if (status < 0) { mlog_errno(status); goto bail_commit; diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index ca3431ee7f24..5e86b247c821 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -81,6 +81,8 @@ struct ocfs2_inode_info tid_t i_sync_tid; tid_t i_datasync_tid; + wait_queue_head_t append_dio_wq; + struct dquot *i_dquot[MAXQUOTAS]; }; diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index d10860fde165..ff531928269e 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -50,6 +50,8 @@ #include "sysfile.h" #include "uptodate.h" #include "quota.h" +#include "file.h" +#include "namei.h" #include "buffer_head_io.h" #include "ocfs2_trace.h" @@ -69,13 +71,15 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb, static int ocfs2_trylock_journal(struct ocfs2_super *osb, int slot_num); static int ocfs2_recover_orphans(struct ocfs2_super *osb, - int slot); + int slot, + enum ocfs2_orphan_reco_type orphan_reco_type); static int ocfs2_commit_thread(void *arg); static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal, int slot_num, struct ocfs2_dinode *la_dinode, struct ocfs2_dinode *tl_dinode, - struct ocfs2_quota_recovery *qrec); + struct ocfs2_quota_recovery *qrec, + enum ocfs2_orphan_reco_type orphan_reco_type); static inline int ocfs2_wait_on_mount(struct ocfs2_super *osb) { @@ -149,7 +153,8 @@ int ocfs2_compute_replay_slots(struct ocfs2_super *osb) return 0; } -void ocfs2_queue_replay_slots(struct ocfs2_super *osb) +void ocfs2_queue_replay_slots(struct ocfs2_super *osb, + enum ocfs2_orphan_reco_type orphan_reco_type) { struct ocfs2_replay_map *replay_map = osb->replay_map; int i; @@ -163,7 +168,8 @@ void ocfs2_queue_replay_slots(struct ocfs2_super *osb) for (i = 0; i < replay_map->rm_slots; i++) if (replay_map->rm_replay_slots[i]) ocfs2_queue_recovery_completion(osb->journal, i, NULL, - NULL, NULL); + NULL, NULL, + orphan_reco_type); replay_map->rm_state = REPLAY_DONE; } @@ -1174,6 +1180,7 @@ struct ocfs2_la_recovery_item { struct ocfs2_dinode *lri_la_dinode; struct ocfs2_dinode *lri_tl_dinode; struct ocfs2_quota_recovery *lri_qrec; + enum ocfs2_orphan_reco_type lri_orphan_reco_type; }; /* Does the second half of the recovery process. By this point, the @@ -1195,6 +1202,7 @@ void ocfs2_complete_recovery(struct work_struct *work) struct ocfs2_dinode *la_dinode, *tl_dinode; struct ocfs2_la_recovery_item *item, *n; struct ocfs2_quota_recovery *qrec; + enum ocfs2_orphan_reco_type orphan_reco_type; LIST_HEAD(tmp_la_list); trace_ocfs2_complete_recovery( @@ -1212,6 +1220,7 @@ void ocfs2_complete_recovery(struct work_struct *work) la_dinode = item->lri_la_dinode; tl_dinode = item->lri_tl_dinode; qrec = item->lri_qrec; + orphan_reco_type = item->lri_orphan_reco_type; trace_ocfs2_complete_recovery_slot(item->lri_slot, la_dinode ? le64_to_cpu(la_dinode->i_blkno) : 0, @@ -1236,7 +1245,8 @@ void ocfs2_complete_recovery(struct work_struct *work) kfree(tl_dinode); } - ret = ocfs2_recover_orphans(osb, item->lri_slot); + ret = ocfs2_recover_orphans(osb, item->lri_slot, + orphan_reco_type); if (ret < 0) mlog_errno(ret); @@ -1261,7 +1271,8 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal, int slot_num, struct ocfs2_dinode *la_dinode, struct ocfs2_dinode *tl_dinode, - struct ocfs2_quota_recovery *qrec) + struct ocfs2_quota_recovery *qrec, + enum ocfs2_orphan_reco_type orphan_reco_type) { struct ocfs2_la_recovery_item *item; @@ -1285,6 +1296,7 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal, item->lri_slot = slot_num; item->lri_tl_dinode = tl_dinode; item->lri_qrec = qrec; + item->lri_orphan_reco_type = orphan_reco_type; spin_lock(&journal->j_lock); list_add_tail(&item->lri_list, &journal->j_la_cleanups); @@ -1304,7 +1316,8 @@ void ocfs2_complete_mount_recovery(struct ocfs2_super *osb) /* No need to queue up our truncate_log as regular cleanup will catch * that */ ocfs2_queue_recovery_completion(journal, osb->slot_num, - osb->local_alloc_copy, NULL, NULL); + osb->local_alloc_copy, NULL, NULL, + ORPHAN_NEED_TRUNCATE); ocfs2_schedule_truncate_log_flush(osb, 0); osb->local_alloc_copy = NULL; @@ -1312,7 +1325,7 @@ void ocfs2_complete_mount_recovery(struct ocfs2_super *osb) /* queue to recover orphan slots for all offline slots */ ocfs2_replay_map_set_state(osb, REPLAY_NEEDED); - ocfs2_queue_replay_slots(osb); + ocfs2_queue_replay_slots(osb, ORPHAN_NEED_TRUNCATE); ocfs2_free_replay_slots(osb); } @@ -1323,7 +1336,8 @@ void ocfs2_complete_quota_recovery(struct ocfs2_super *osb) osb->slot_num, NULL, NULL, - osb->quota_rec); + osb->quota_rec, + ORPHAN_NEED_TRUNCATE); osb->quota_rec = NULL; } } @@ -1360,7 +1374,7 @@ restart: /* queue recovery for our own slot */ ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL, - NULL, NULL); + NULL, NULL, ORPHAN_NO_NEED_TRUNCATE); spin_lock(&osb->osb_lock); while (rm->rm_used) { @@ -1419,13 +1433,14 @@ skip_recovery: continue; } ocfs2_queue_recovery_completion(osb->journal, rm_quota[i], - NULL, NULL, qrec); + NULL, NULL, qrec, + ORPHAN_NEED_TRUNCATE); } ocfs2_super_unlock(osb, 1); /* queue recovery for offline slots */ - ocfs2_queue_replay_slots(osb); + ocfs2_queue_replay_slots(osb, ORPHAN_NEED_TRUNCATE); bail: mutex_lock(&osb->recovery_lock); @@ -1711,7 +1726,7 @@ static int ocfs2_recover_node(struct ocfs2_super *osb, /* This will kfree the memory pointed to by la_copy and tl_copy */ ocfs2_queue_recovery_completion(osb->journal, slot_num, la_copy, - tl_copy, NULL); + tl_copy, NULL, ORPHAN_NEED_TRUNCATE); status = 0; done: @@ -1901,7 +1916,7 @@ void ocfs2_queue_orphan_scan(struct ocfs2_super *osb) for (i = 0; i < osb->max_slots; i++) ocfs2_queue_recovery_completion(osb->journal, i, NULL, NULL, - NULL); + NULL, ORPHAN_NO_NEED_TRUNCATE); /* * We queued a recovery on orphan slots, increment the sequence * number and update LVB so other node will skip the scan for a while @@ -2000,6 +2015,13 @@ static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name, if (IS_ERR(iter)) return 0; + /* Skip inodes which are already added to recover list, since dio may + * happen concurrently with unlink/rename */ + if (OCFS2_I(iter)->ip_next_orphan) { + iput(iter); + return 0; + } + trace_ocfs2_orphan_filldir((unsigned long long)OCFS2_I(iter)->ip_blkno); /* No locking is required for the next_orphan queue as there * is only ever a single process doing orphan recovery. */ @@ -2108,7 +2130,8 @@ static void ocfs2_clear_recovering_orphan_dir(struct ocfs2_super *osb, * advertising our state to ocfs2_delete_inode(). */ static int ocfs2_recover_orphans(struct ocfs2_super *osb, - int slot) + int slot, + enum ocfs2_orphan_reco_type orphan_reco_type) { int ret = 0; struct inode *inode = NULL; @@ -2132,13 +2155,60 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, (unsigned long long)oi->ip_blkno); iter = oi->ip_next_orphan; + oi->ip_next_orphan = NULL; + + /* + * We need to take and drop the inode lock to + * force read inode from disk. + */ + ret = ocfs2_inode_lock(inode, NULL, 0); + if (ret) { + mlog_errno(ret); + goto next; + } + ocfs2_inode_unlock(inode, 0); + + if (inode->i_nlink == 0) { + spin_lock(&oi->ip_lock); + /* Set the proper information to get us going into + * ocfs2_delete_inode. */ + oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED; + spin_unlock(&oi->ip_lock); + } else if (orphan_reco_type == ORPHAN_NEED_TRUNCATE) { + struct buffer_head *di_bh = NULL; + + ret = ocfs2_rw_lock(inode, 1); + if (ret) { + mlog_errno(ret); + goto next; + } + + ret = ocfs2_inode_lock(inode, &di_bh, 1); + if (ret < 0) { + ocfs2_rw_unlock(inode, 1); + mlog_errno(ret); + goto next; + } + + ret = ocfs2_truncate_file(inode, di_bh, + i_size_read(inode)); + ocfs2_inode_unlock(inode, 1); + ocfs2_rw_unlock(inode, 1); + brelse(di_bh); + if (ret < 0) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto next; + } + + ret = ocfs2_del_inode_from_orphan(osb, inode, 0, 0); + if (ret) + mlog_errno(ret); - spin_lock(&oi->ip_lock); - /* Set the proper information to get us going into - * ocfs2_delete_inode. */ - oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED; - spin_unlock(&oi->ip_lock); + wake_up(&OCFS2_I(inode)->append_dio_wq); + } /* else if ORPHAN_NO_NEED_TRUNCATE, do nothing */ +next: iput(inode); inode = iter; diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 7f8cde94abfe..f4cd3c3e9fb7 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -472,6 +472,11 @@ static inline int ocfs2_unlink_credits(struct super_block *sb) * orphan dir index leaf */ #define OCFS2_DELETE_INODE_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 4) +/* dinode + orphan dir dinode + extent tree leaf block + orphan dir entry + + * orphan dir index root + orphan dir index leaf */ +#define OCFS2_INODE_ADD_TO_ORPHAN_CREDITS (2 * OCFS2_INODE_UPDATE_CREDITS + 4) +#define OCFS2_INODE_DEL_FROM_ORPHAN_CREDITS OCFS2_INODE_ADD_TO_ORPHAN_CREDITS + /* dinode update, old dir dinode update, new dir dinode update, old * dir dir entry, new dir dir entry, dir entry update for renaming * directory + target unlink + 3 x dir index leaves */ diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 914c121ec890..b5c3a5ea3ee6 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -79,7 +79,8 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, struct inode **ret_orphan_dir, u64 blkno, char *name, - struct ocfs2_dir_lookup_result *lookup); + struct ocfs2_dir_lookup_result *lookup, + bool dio); static int ocfs2_orphan_add(struct ocfs2_super *osb, handle_t *handle, @@ -87,7 +88,8 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb, struct buffer_head *fe_bh, char *name, struct ocfs2_dir_lookup_result *lookup, - struct inode *orphan_dir_inode); + struct inode *orphan_dir_inode, + bool dio); static int ocfs2_create_symlink_data(struct ocfs2_super *osb, handle_t *handle, @@ -104,6 +106,8 @@ static int ocfs2_double_lock(struct ocfs2_super *osb, static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2); /* An orphan dir name is an 8 byte value, printed as a hex string */ #define OCFS2_ORPHAN_NAMELEN ((int)(2 * sizeof(u64))) +#define OCFS2_DIO_ORPHAN_PREFIX "dio-" +#define OCFS2_DIO_ORPHAN_PREFIX_LEN 4 static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) @@ -952,7 +956,8 @@ static int ocfs2_unlink(struct inode *dir, if (ocfs2_inode_is_unlinkable(inode)) { status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, OCFS2_I(inode)->ip_blkno, - orphan_name, &orphan_insert); + orphan_name, &orphan_insert, + false); if (status < 0) { mlog_errno(status); goto leave; @@ -1004,7 +1009,7 @@ static int ocfs2_unlink(struct inode *dir, if (is_unlinkable) { status = ocfs2_orphan_add(osb, handle, inode, fe_bh, - orphan_name, &orphan_insert, orphan_dir); + orphan_name, &orphan_insert, orphan_dir, false); if (status < 0) mlog_errno(status); } @@ -1440,7 +1445,8 @@ static int ocfs2_rename(struct inode *old_dir, if (S_ISDIR(new_inode->i_mode) || (new_inode->i_nlink == 1)) { status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, OCFS2_I(new_inode)->ip_blkno, - orphan_name, &orphan_insert); + orphan_name, &orphan_insert, + false); if (status < 0) { mlog_errno(status); goto bail; @@ -1507,7 +1513,7 @@ static int ocfs2_rename(struct inode *old_dir, if (should_add_orphan) { status = ocfs2_orphan_add(osb, handle, new_inode, newfe_bh, orphan_name, - &orphan_insert, orphan_dir); + &orphan_insert, orphan_dir, false); if (status < 0) { mlog_errno(status); goto bail; @@ -2088,12 +2094,28 @@ static int __ocfs2_prepare_orphan_dir(struct inode *orphan_dir_inode, struct buffer_head *orphan_dir_bh, u64 blkno, char *name, - struct ocfs2_dir_lookup_result *lookup) + struct ocfs2_dir_lookup_result *lookup, + bool dio) { int ret; struct ocfs2_super *osb = OCFS2_SB(orphan_dir_inode->i_sb); + int namelen = dio ? + (OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN) : + OCFS2_ORPHAN_NAMELEN; + + if (dio) { + ret = snprintf(name, OCFS2_DIO_ORPHAN_PREFIX_LEN + 1, "%s", + OCFS2_DIO_ORPHAN_PREFIX); + if (ret != OCFS2_DIO_ORPHAN_PREFIX_LEN) { + ret = -EINVAL; + mlog_errno(ret); + return ret; + } - ret = ocfs2_blkno_stringify(blkno, name); + ret = ocfs2_blkno_stringify(blkno, + name + OCFS2_DIO_ORPHAN_PREFIX_LEN); + } else + ret = ocfs2_blkno_stringify(blkno, name); if (ret < 0) { mlog_errno(ret); return ret; @@ -2101,7 +2123,7 @@ static int __ocfs2_prepare_orphan_dir(struct inode *orphan_dir_inode, ret = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode, orphan_dir_bh, name, - OCFS2_ORPHAN_NAMELEN, lookup); + namelen, lookup); if (ret < 0) { mlog_errno(ret); return ret; @@ -2128,7 +2150,8 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, struct inode **ret_orphan_dir, u64 blkno, char *name, - struct ocfs2_dir_lookup_result *lookup) + struct ocfs2_dir_lookup_result *lookup, + bool dio) { struct inode *orphan_dir_inode = NULL; struct buffer_head *orphan_dir_bh = NULL; @@ -2142,7 +2165,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, } ret = __ocfs2_prepare_orphan_dir(orphan_dir_inode, orphan_dir_bh, - blkno, name, lookup); + blkno, name, lookup, dio); if (ret < 0) { mlog_errno(ret); goto out; @@ -2170,12 +2193,16 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb, struct buffer_head *fe_bh, char *name, struct ocfs2_dir_lookup_result *lookup, - struct inode *orphan_dir_inode) + struct inode *orphan_dir_inode, + bool dio) { struct buffer_head *orphan_dir_bh = NULL; int status = 0; struct ocfs2_dinode *orphan_fe; struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data; + int namelen = dio ? + (OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN) : + OCFS2_ORPHAN_NAMELEN; trace_ocfs2_orphan_add_begin( (unsigned long long)OCFS2_I(inode)->ip_blkno); @@ -2219,7 +2246,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb, ocfs2_journal_dirty(handle, orphan_dir_bh); status = __ocfs2_add_entry(handle, orphan_dir_inode, name, - OCFS2_ORPHAN_NAMELEN, inode, + namelen, inode, OCFS2_I(inode)->ip_blkno, orphan_dir_bh, lookup); if (status < 0) { @@ -2227,13 +2254,21 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb, goto rollback; } - fe->i_flags |= cpu_to_le32(OCFS2_ORPHANED_FL); - OCFS2_I(inode)->ip_flags &= ~OCFS2_INODE_SKIP_ORPHAN_DIR; + if (dio) { + /* Update flag OCFS2_DIO_ORPHANED_FL and record the orphan + * slot. + */ + fe->i_flags |= cpu_to_le32(OCFS2_DIO_ORPHANED_FL); + fe->i_dio_orphaned_slot = cpu_to_le16(osb->slot_num); + } else { + fe->i_flags |= cpu_to_le32(OCFS2_ORPHANED_FL); + OCFS2_I(inode)->ip_flags &= ~OCFS2_INODE_SKIP_ORPHAN_DIR; - /* Record which orphan dir our inode now resides - * in. delete_inode will use this to determine which orphan - * dir to lock. */ - fe->i_orphaned_slot = cpu_to_le16(osb->slot_num); + /* Record which orphan dir our inode now resides + * in. delete_inode will use this to determine which orphan + * dir to lock. */ + fe->i_orphaned_slot = cpu_to_le16(osb->slot_num); + } ocfs2_journal_dirty(handle, fe_bh); @@ -2258,14 +2293,28 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, handle_t *handle, struct inode *orphan_dir_inode, struct inode *inode, - struct buffer_head *orphan_dir_bh) + struct buffer_head *orphan_dir_bh, + bool dio) { - char name[OCFS2_ORPHAN_NAMELEN + 1]; + const int namelen = OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN; + char name[namelen + 1]; struct ocfs2_dinode *orphan_fe; int status = 0; struct ocfs2_dir_lookup_result lookup = { NULL, }; - status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name); + if (dio) { + status = snprintf(name, OCFS2_DIO_ORPHAN_PREFIX_LEN + 1, "%s", + OCFS2_DIO_ORPHAN_PREFIX); + if (status != OCFS2_DIO_ORPHAN_PREFIX_LEN) { + status = -EINVAL; + mlog_errno(status); + return status; + } + + status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, + name + OCFS2_DIO_ORPHAN_PREFIX_LEN); + } else + status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name); if (status < 0) { mlog_errno(status); goto leave; @@ -2273,10 +2322,10 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, trace_ocfs2_orphan_del( (unsigned long long)OCFS2_I(orphan_dir_inode)->ip_blkno, - name, OCFS2_ORPHAN_NAMELEN); + name, namelen); /* find it's spot in the orphan directory */ - status = ocfs2_find_entry(name, OCFS2_ORPHAN_NAMELEN, orphan_dir_inode, + status = ocfs2_find_entry(name, namelen, orphan_dir_inode, &lookup); if (status) { mlog_errno(status); @@ -2376,7 +2425,8 @@ static int ocfs2_prep_new_orphaned_file(struct inode *dir, } ret = __ocfs2_prepare_orphan_dir(orphan_dir, orphan_dir_bh, - di_blkno, orphan_name, orphan_insert); + di_blkno, orphan_name, orphan_insert, + false); if (ret < 0) { mlog_errno(ret); goto out; @@ -2482,7 +2532,7 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, di = (struct ocfs2_dinode *)new_di_bh->b_data; status = ocfs2_orphan_add(osb, handle, inode, new_di_bh, orphan_name, - &orphan_insert, orphan_dir); + &orphan_insert, orphan_dir, false); if (status < 0) { mlog_errno(status); goto leave; @@ -2527,6 +2577,186 @@ leave: return status; } +static int ocfs2_dio_orphan_recovered(struct inode *inode) +{ + int ret; + struct buffer_head *di_bh = NULL; + struct ocfs2_dinode *di = NULL; + + ret = ocfs2_inode_lock(inode, &di_bh, 1); + if (ret < 0) { + mlog_errno(ret); + return 0; + } + + di = (struct ocfs2_dinode *) di_bh->b_data; + ret = !(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL)); + ocfs2_inode_unlock(inode, 1); + brelse(di_bh); + + return ret; +} + +#define OCFS2_DIO_ORPHANED_FL_CHECK_INTERVAL 10000 +int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb, + struct inode *inode) +{ + char orphan_name[OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN + 1]; + struct inode *orphan_dir_inode = NULL; + struct ocfs2_dir_lookup_result orphan_insert = { NULL, }; + struct buffer_head *di_bh = NULL; + int status = 0; + handle_t *handle = NULL; + struct ocfs2_dinode *di = NULL; + +restart: + status = ocfs2_inode_lock(inode, &di_bh, 1); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + di = (struct ocfs2_dinode *) di_bh->b_data; + /* + * Another append dio crashed? + * If so, wait for recovery first. + */ + if (unlikely(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) { + ocfs2_inode_unlock(inode, 1); + brelse(di_bh); + wait_event_interruptible_timeout(OCFS2_I(inode)->append_dio_wq, + ocfs2_dio_orphan_recovered(inode), + msecs_to_jiffies(OCFS2_DIO_ORPHANED_FL_CHECK_INTERVAL)); + goto restart; + } + + status = ocfs2_prepare_orphan_dir(osb, &orphan_dir_inode, + OCFS2_I(inode)->ip_blkno, + orphan_name, + &orphan_insert, + true); + if (status < 0) { + mlog_errno(status); + goto bail_unlock_inode; + } + + handle = ocfs2_start_trans(osb, + OCFS2_INODE_ADD_TO_ORPHAN_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + goto bail_unlock_orphan; + } + + status = ocfs2_orphan_add(osb, handle, inode, di_bh, orphan_name, + &orphan_insert, orphan_dir_inode, true); + if (status) + mlog_errno(status); + + ocfs2_commit_trans(osb, handle); + +bail_unlock_orphan: + ocfs2_inode_unlock(orphan_dir_inode, 1); + mutex_unlock(&orphan_dir_inode->i_mutex); + iput(orphan_dir_inode); + + ocfs2_free_dir_lookup_result(&orphan_insert); + +bail_unlock_inode: + ocfs2_inode_unlock(inode, 1); + brelse(di_bh); + +bail: + return status; +} + +int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb, + struct inode *inode, int update_isize, + loff_t end) +{ + struct inode *orphan_dir_inode = NULL; + struct buffer_head *orphan_dir_bh = NULL; + struct buffer_head *di_bh = NULL; + struct ocfs2_dinode *di = NULL; + handle_t *handle = NULL; + int status = 0; + + status = ocfs2_inode_lock(inode, &di_bh, 1); + if (status < 0) { + mlog_errno(status); + goto bail; + } + di = (struct ocfs2_dinode *) di_bh->b_data; + + orphan_dir_inode = ocfs2_get_system_file_inode(osb, + ORPHAN_DIR_SYSTEM_INODE, + le16_to_cpu(di->i_dio_orphaned_slot)); + if (!orphan_dir_inode) { + status = -ENOENT; + mlog_errno(status); + goto bail_unlock_inode; + } + + mutex_lock(&orphan_dir_inode->i_mutex); + status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); + if (status < 0) { + mutex_unlock(&orphan_dir_inode->i_mutex); + iput(orphan_dir_inode); + mlog_errno(status); + goto bail_unlock_inode; + } + + handle = ocfs2_start_trans(osb, + OCFS2_INODE_DEL_FROM_ORPHAN_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + goto bail_unlock_orphan; + } + + BUG_ON(!(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))); + + status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, + inode, orphan_dir_bh, true); + if (status < 0) { + mlog_errno(status); + goto bail_commit; + } + + status = ocfs2_journal_access_di(handle, + INODE_CACHE(inode), + di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail_commit; + } + + di->i_flags &= ~cpu_to_le32(OCFS2_DIO_ORPHANED_FL); + di->i_dio_orphaned_slot = 0; + + if (update_isize) { + status = ocfs2_set_inode_size(handle, inode, di_bh, end); + if (status) + mlog_errno(status); + } else + ocfs2_journal_dirty(handle, di_bh); + +bail_commit: + ocfs2_commit_trans(osb, handle); + +bail_unlock_orphan: + ocfs2_inode_unlock(orphan_dir_inode, 1); + mutex_unlock(&orphan_dir_inode->i_mutex); + brelse(orphan_dir_bh); + iput(orphan_dir_inode); + +bail_unlock_inode: + ocfs2_inode_unlock(inode, 1); + brelse(di_bh); + +bail: + return status; +} + int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, struct inode *inode, struct dentry *dentry) @@ -2615,7 +2845,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, } status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode, - orphan_dir_bh); + orphan_dir_bh, false); if (status < 0) { mlog_errno(status); goto out_commit; diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h index e5d059d4f115..5ddecce172fa 100644 --- a/fs/ocfs2/namei.h +++ b/fs/ocfs2/namei.h @@ -34,10 +34,16 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, handle_t *handle, struct inode *orphan_dir_inode, struct inode *inode, - struct buffer_head *orphan_dir_bh); + struct buffer_head *orphan_dir_bh, + bool dio); int ocfs2_create_inode_in_orphan(struct inode *dir, int mode, struct inode **new_inode); +int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb, + struct inode *inode); +int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb, + struct inode *inode, int update_isize, + loff_t end); int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, struct inode *new_inode, struct dentry *new_dentry); diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index fdbcbfed529e..8490c64d34fe 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -209,6 +209,11 @@ struct ocfs2_lock_res { #endif }; +enum ocfs2_orphan_reco_type { + ORPHAN_NO_NEED_TRUNCATE = 0, + ORPHAN_NEED_TRUNCATE, +}; + enum ocfs2_orphan_scan_state { ORPHAN_SCAN_ACTIVE, ORPHAN_SCAN_INACTIVE @@ -495,6 +500,14 @@ static inline int ocfs2_writes_unwritten_extents(struct ocfs2_super *osb) return 0; } +static inline int ocfs2_supports_append_dio(struct ocfs2_super *osb) +{ + if (osb->s_feature_ro_compat & OCFS2_FEATURE_RO_COMPAT_APPEND_DIO) + return 1; + return 0; +} + + static inline int ocfs2_supports_inline_data(struct ocfs2_super *osb) { if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_INLINE_DATA) @@ -726,6 +739,16 @@ static inline unsigned int ocfs2_clusters_for_bytes(struct super_block *sb, return clusters; } +static inline unsigned int ocfs2_bytes_to_clusters(struct super_block *sb, + u64 bytes) +{ + int cl_bits = OCFS2_SB(sb)->s_clustersize_bits; + unsigned int clusters; + + clusters = (unsigned int)(bytes >> cl_bits); + return clusters; +} + static inline u64 ocfs2_blocks_for_bytes(struct super_block *sb, u64 bytes) { diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index 938387a10d5d..20e37a3ed26f 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -105,7 +105,8 @@ | OCFS2_FEATURE_INCOMPAT_CLUSTERINFO) #define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \ | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \ - | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA) + | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA \ + | OCFS2_FEATURE_RO_COMPAT_APPEND_DIO) /* * Heartbeat-only devices are missing journals and other files. The @@ -199,6 +200,11 @@ #define OCFS2_FEATURE_RO_COMPAT_USRQUOTA 0x0002 #define OCFS2_FEATURE_RO_COMPAT_GRPQUOTA 0x0004 +/* + * Append Direct IO support + */ +#define OCFS2_FEATURE_RO_COMPAT_APPEND_DIO 0x0008 + /* The byte offset of the first backup block will be 1G. * The following will be 4G, 16G, 64G, 256G and 1T. */ @@ -229,6 +235,8 @@ #define OCFS2_CHAIN_FL (0x00000400) /* Chain allocator */ #define OCFS2_DEALLOC_FL (0x00000800) /* Truncate log */ #define OCFS2_QUOTA_FL (0x00001000) /* Quota file */ +#define OCFS2_DIO_ORPHANED_FL (0X00002000) /* On the orphan list especially + * for dio */ /* * Flags on ocfs2_dinode.i_dyn_features @@ -729,7 +737,9 @@ struct ocfs2_dinode { inode belongs to. Only valid if allocated from a discontiguous block group */ -/*A0*/ __le64 i_reserved2[3]; +/*A0*/ __le16 i_dio_orphaned_slot; /* only used for append dio write */ + __le16 i_reserved1[3]; + __le64 i_reserved2[2]; /*B8*/ union { __le64 i_pad1; /* Generic way to refer to this 64bit union */ diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 87a1f7679d9b..26675185b886 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1746,6 +1746,8 @@ static void ocfs2_inode_init_once(void *data) ocfs2_lock_res_init_once(&oi->ip_inode_lockres); ocfs2_lock_res_init_once(&oi->ip_open_lockres); + init_waitqueue_head(&oi->append_dio_wq); + ocfs2_metadata_cache_init(INODE_CACHE(&oi->vfs_inode), &ocfs2_inode_caching_ops); diff --git a/fs/open.c b/fs/open.c index 813be037b412..a293c2020676 100644 --- a/fs/open.c +++ b/fs/open.c @@ -667,11 +667,8 @@ int open_check_o_direct(struct file *f) { /* NB: we're sure to have correct a_ops only after f_op->open */ if (f->f_flags & O_DIRECT) { - if (!f->f_mapping->a_ops || - ((!f->f_mapping->a_ops->direct_IO) && - (!f->f_mapping->a_ops->get_xip_mem))) { + if (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO) return -EINVAL; - } } return 0; } |