From 22fe404218156328a27e66349b1175cd0baa4990 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 18 Sep 2009 13:05:44 -0700 Subject: vfs: split generic_forget_inode() so that hugetlbfs does not have to copy it Hugetlbfs needs to do special things instead of truncate_inode_pages(). Currently, it copied generic_forget_inode() except for truncate_inode_pages() call which is asking for trouble (the code there isn't trivial). So create a separate function generic_detach_inode() which does all the list magic done in generic_forget_inode() and call it from hugetlbfs_forget_inode(). Signed-off-by: Jan Kara Cc: Al Viro Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- include/linux/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 51803528b09..955e34615cb 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2156,6 +2156,7 @@ extern ino_t iunique(struct super_block *, ino_t); extern int inode_needs_sync(struct inode *inode); extern void generic_delete_inode(struct inode *inode); extern void generic_drop_inode(struct inode *inode); +extern int generic_detach_inode(struct inode *inode); extern struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), -- cgit v1.2.3 From 42cb56ae2ab67390da34906b27bedc3f2ff1393b Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 18 Sep 2009 13:05:53 -0700 Subject: vfs: change sb->s_maxbytes to a loff_t sb->s_maxbytes is supposed to indicate the maximum size of a file that can exist on the filesystem. It's declared as an unsigned long long. Even if a filesystem has no inherent limit that prevents it from using every bit in that unsigned long long, it's still problematic to set it to anything larger than MAX_LFS_FILESIZE. There are places in the kernel that cast s_maxbytes to a signed value. If it's set too large then this cast makes it a negative number and generally breaks the comparison. Change s_maxbytes to be loff_t instead. That should help eliminate the temptation to set it too large by making it a signed value. Also, add a warning for couple of releases to help catch filesystems that set s_maxbytes too large. Eventually we can either convert this to a BUG() or just remove it and in the hope that no one will get it wrong now that it's a signed value. Signed-off-by: Jeff Layton Cc: Johannes Weiner Cc: Christoph Hellwig Cc: Al Viro Cc: Robert Love Cc: Mandeep Singh Baines Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- fs/super.c | 10 ++++++++++ include/linux/fs.h | 2 +- 2 files changed, 11 insertions(+), 1 deletion(-) (limited to 'include/linux/fs.h') diff --git a/fs/super.c b/fs/super.c index 0e7207b9815..4906e2d8f40 100644 --- a/fs/super.c +++ b/fs/super.c @@ -892,6 +892,16 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void if (error) goto out_sb; + /* + * filesystems should never set s_maxbytes larger than MAX_LFS_FILESIZE + * but s_maxbytes was an unsigned long long for many releases. Throw + * this warning for a little while to try and catch filesystems that + * violate this rule. This warning should be either removed or + * converted to a BUG() in 2.6.34. + */ + WARN((mnt->mnt_sb->s_maxbytes < 0), "%s set sb->s_maxbytes to " + "negative value (%lld)\n", type->name, mnt->mnt_sb->s_maxbytes); + mnt->mnt_mountpoint = mnt->mnt_root; mnt->mnt_parent = mnt; up_write(&mnt->mnt_sb->s_umount); diff --git a/include/linux/fs.h b/include/linux/fs.h index 955e34615cb..cbb7724c11d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1315,7 +1315,7 @@ struct super_block { unsigned long s_blocksize; unsigned char s_blocksize_bits; unsigned char s_dirt; - unsigned long long s_maxbytes; /* Max file size */ + loff_t s_maxbytes; /* Max file size */ struct file_system_type *s_type; const struct super_operations *s_op; const struct dquot_operations *dq_op; -- cgit v1.2.3 From 4fadd7bb20a1e7c774ed88dc703d8fbcd00ff917 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 3 Aug 2009 23:28:06 +0200 Subject: freeze_bdev: kill bd_mount_sem Now that we have the freeze count there is not much reason for bd_mount_sem anymore. The actual freeze/thaw operations are serialized using the bd_fsfreeze_mutex, and the only other place we take bd_mount_sem is get_sb_bdev which tries to prevent mounting a filesystem while the block device is frozen. Instead of add a check for bd_fsfreeze_count and return -EBUSY if a filesystem is frozen. While that is a change in user visible behaviour a failing mount is much better for this case rather than having the mount process stuck uninterruptible for a long time. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/block_dev.c | 8 +------- fs/super.c | 9 +++++++-- include/linux/fs.h | 1 - 3 files changed, 8 insertions(+), 10 deletions(-) (limited to 'include/linux/fs.h') diff --git a/fs/block_dev.c b/fs/block_dev.c index 5d1ed50bd46..22506eb4a58 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -216,8 +216,6 @@ EXPORT_SYMBOL(fsync_bdev); * freeze_bdev -- lock a filesystem and force it into a consistent state * @bdev: blockdevice to lock * - * This takes the block device bd_mount_sem to make sure no new mounts - * happen on bdev until thaw_bdev() is called. * If a superblock is found on this device, we take the s_umount semaphore * on it to make sure nobody unmounts until the snapshot creation is done. * The reference counter (bd_fsfreeze_count) guarantees that only the last @@ -240,7 +238,6 @@ struct super_block *freeze_bdev(struct block_device *bdev) } bdev->bd_fsfreeze_count++; - down(&bdev->bd_mount_sem); sb = get_super(bdev); if (sb && !(sb->s_flags & MS_RDONLY)) { sb->s_frozen = SB_FREEZE_WRITE; @@ -260,7 +257,6 @@ struct super_block *freeze_bdev(struct block_device *bdev) "VFS:Filesystem freeze failed\n"); sb->s_frozen = SB_UNFROZEN; drop_super(sb); - up(&bdev->bd_mount_sem); bdev->bd_fsfreeze_count--; mutex_unlock(&bdev->bd_fsfreeze_mutex); return ERR_PTR(error); @@ -271,7 +267,7 @@ struct super_block *freeze_bdev(struct block_device *bdev) sync_blockdev(bdev); mutex_unlock(&bdev->bd_fsfreeze_mutex); - return sb; /* thaw_bdev releases s->s_umount and bd_mount_sem */ + return sb; /* thaw_bdev releases s->s_umount */ } EXPORT_SYMBOL(freeze_bdev); @@ -321,7 +317,6 @@ int thaw_bdev(struct block_device *bdev, struct super_block *sb) drop_super(sb); } - up(&bdev->bd_mount_sem); mutex_unlock(&bdev->bd_fsfreeze_mutex); return 0; } @@ -430,7 +425,6 @@ static void init_once(void *foo) memset(bdev, 0, sizeof(*bdev)); mutex_init(&bdev->bd_mutex); - sema_init(&bdev->bd_mount_sem, 1); INIT_LIST_HEAD(&bdev->bd_inodes); INIT_LIST_HEAD(&bdev->bd_list); #ifdef CONFIG_SYSFS diff --git a/fs/super.c b/fs/super.c index 4906e2d8f40..1cb26a3e3df 100644 --- a/fs/super.c +++ b/fs/super.c @@ -743,9 +743,14 @@ int get_sb_bdev(struct file_system_type *fs_type, * will protect the lockfs code from trying to start a snapshot * while we are mounting */ - down(&bdev->bd_mount_sem); + mutex_lock(&bdev->bd_fsfreeze_mutex); + if (bdev->bd_fsfreeze_count > 0) { + mutex_unlock(&bdev->bd_fsfreeze_mutex); + error = -EBUSY; + goto error_bdev; + } s = sget(fs_type, test_bdev_super, set_bdev_super, bdev); - up(&bdev->bd_mount_sem); + mutex_unlock(&bdev->bd_fsfreeze_mutex); if (IS_ERR(s)) goto error_s; diff --git a/include/linux/fs.h b/include/linux/fs.h index cbb7724c11d..72dfbd42397 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -640,7 +640,6 @@ struct block_device { struct super_block * bd_super; int bd_openers; struct mutex bd_mutex; /* open/close mutex */ - struct semaphore bd_mount_sem; struct list_head bd_inodes; void * bd_holder; int bd_holders; -- cgit v1.2.3 From 4504230a71566785a05d3e6b53fa1ee071b864eb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 3 Aug 2009 23:28:35 +0200 Subject: freeze_bdev: grab active reference to frozen superblocks Currently we held s_umount while a filesystem is frozen, despite that we might return to userspace and unlock it from a different process. Instead grab an active reference to keep the file system busy and add an explicit check for frozen filesystems in remount and reject the remount instead of blocking on s_umount. Add a new get_active_super helper to super.c for use by freeze_bdev that grabs an active reference to a superblock from a given block device. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/block_dev.c | 132 +++++++++++++++++++++++++++++------------------------ fs/super.c | 48 ++++++++++++++++++- include/linux/fs.h | 1 + 3 files changed, 120 insertions(+), 61 deletions(-) (limited to 'include/linux/fs.h') diff --git a/fs/block_dev.c b/fs/block_dev.c index 22506eb4a58..9cf4b926f8e 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -230,43 +230,54 @@ struct super_block *freeze_bdev(struct block_device *bdev) int error = 0; mutex_lock(&bdev->bd_fsfreeze_mutex); - if (bdev->bd_fsfreeze_count > 0) { - bdev->bd_fsfreeze_count++; + if (++bdev->bd_fsfreeze_count > 1) { + /* + * We don't even need to grab a reference - the first call + * to freeze_bdev grab an active reference and only the last + * thaw_bdev drops it. + */ sb = get_super(bdev); + drop_super(sb); mutex_unlock(&bdev->bd_fsfreeze_mutex); return sb; } - bdev->bd_fsfreeze_count++; - - sb = get_super(bdev); - if (sb && !(sb->s_flags & MS_RDONLY)) { - sb->s_frozen = SB_FREEZE_WRITE; - smp_wmb(); - - sync_filesystem(sb); - - sb->s_frozen = SB_FREEZE_TRANS; - smp_wmb(); - - sync_blockdev(sb->s_bdev); - - if (sb->s_op->freeze_fs) { - error = sb->s_op->freeze_fs(sb); - if (error) { - printk(KERN_ERR - "VFS:Filesystem freeze failed\n"); - sb->s_frozen = SB_UNFROZEN; - drop_super(sb); - bdev->bd_fsfreeze_count--; - mutex_unlock(&bdev->bd_fsfreeze_mutex); - return ERR_PTR(error); - } + + sb = get_active_super(bdev); + if (!sb) + goto out; + if (sb->s_flags & MS_RDONLY) { + deactivate_locked_super(sb); + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return sb; + } + + sb->s_frozen = SB_FREEZE_WRITE; + smp_wmb(); + + sync_filesystem(sb); + + sb->s_frozen = SB_FREEZE_TRANS; + smp_wmb(); + + sync_blockdev(sb->s_bdev); + + if (sb->s_op->freeze_fs) { + error = sb->s_op->freeze_fs(sb); + if (error) { + printk(KERN_ERR + "VFS:Filesystem freeze failed\n"); + sb->s_frozen = SB_UNFROZEN; + deactivate_locked_super(sb); + bdev->bd_fsfreeze_count--; + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return ERR_PTR(error); } } + up_write(&sb->s_umount); + out: sync_blockdev(bdev); mutex_unlock(&bdev->bd_fsfreeze_mutex); - return sb; /* thaw_bdev releases s->s_umount */ } EXPORT_SYMBOL(freeze_bdev); @@ -280,43 +291,44 @@ EXPORT_SYMBOL(freeze_bdev); */ int thaw_bdev(struct block_device *bdev, struct super_block *sb) { - int error = 0; + int error = -EINVAL; mutex_lock(&bdev->bd_fsfreeze_mutex); - if (!bdev->bd_fsfreeze_count) { - mutex_unlock(&bdev->bd_fsfreeze_mutex); - return -EINVAL; - } - - bdev->bd_fsfreeze_count--; - if (bdev->bd_fsfreeze_count > 0) { - if (sb) - drop_super(sb); - mutex_unlock(&bdev->bd_fsfreeze_mutex); - return 0; - } - - if (sb) { - BUG_ON(sb->s_bdev != bdev); - if (!(sb->s_flags & MS_RDONLY)) { - if (sb->s_op->unfreeze_fs) { - error = sb->s_op->unfreeze_fs(sb); - if (error) { - printk(KERN_ERR - "VFS:Filesystem thaw failed\n"); - sb->s_frozen = SB_FREEZE_TRANS; - bdev->bd_fsfreeze_count++; - mutex_unlock(&bdev->bd_fsfreeze_mutex); - return error; - } - } - sb->s_frozen = SB_UNFROZEN; - smp_wmb(); - wake_up(&sb->s_wait_unfrozen); + if (!bdev->bd_fsfreeze_count) + goto out_unlock; + + error = 0; + if (--bdev->bd_fsfreeze_count > 0) + goto out_unlock; + + if (!sb) + goto out_unlock; + + BUG_ON(sb->s_bdev != bdev); + down_write(&sb->s_umount); + if (sb->s_flags & MS_RDONLY) + goto out_deactivate; + + if (sb->s_op->unfreeze_fs) { + error = sb->s_op->unfreeze_fs(sb); + if (error) { + printk(KERN_ERR + "VFS:Filesystem thaw failed\n"); + sb->s_frozen = SB_FREEZE_TRANS; + bdev->bd_fsfreeze_count++; + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return error; } - drop_super(sb); } + sb->s_frozen = SB_UNFROZEN; + smp_wmb(); + wake_up(&sb->s_wait_unfrozen); + +out_deactivate: + if (sb) + deactivate_locked_super(sb); +out_unlock: mutex_unlock(&bdev->bd_fsfreeze_mutex); return 0; } diff --git a/fs/super.c b/fs/super.c index 1cb26a3e3df..19eb70b374b 100644 --- a/fs/super.c +++ b/fs/super.c @@ -465,6 +465,48 @@ rescan: } EXPORT_SYMBOL(get_super); + +/** + * get_active_super - get an active reference to the superblock of a device + * @bdev: device to get the superblock for + * + * Scans the superblock list and finds the superblock of the file system + * mounted on the device given. Returns the superblock with an active + * reference and s_umount held exclusively or %NULL if none was found. + */ +struct super_block *get_active_super(struct block_device *bdev) +{ + struct super_block *sb; + + if (!bdev) + return NULL; + + spin_lock(&sb_lock); + list_for_each_entry(sb, &super_blocks, s_list) { + if (sb->s_bdev != bdev) + continue; + + sb->s_count++; + spin_unlock(&sb_lock); + down_write(&sb->s_umount); + if (sb->s_root) { + spin_lock(&sb_lock); + if (sb->s_count > S_BIAS) { + atomic_inc(&sb->s_active); + sb->s_count--; + spin_unlock(&sb_lock); + return sb; + } + spin_unlock(&sb_lock); + } + up_write(&sb->s_umount); + put_super(sb); + yield(); + spin_lock(&sb_lock); + } + spin_unlock(&sb_lock); + return NULL; +} struct super_block * user_get_super(dev_t dev) { @@ -527,11 +569,15 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) { int retval; int remount_rw; - + + if (sb->s_frozen != SB_UNFROZEN) + return -EBUSY; + #ifdef CONFIG_BLOCK if (!(flags & MS_RDONLY) && bdev_read_only(sb->s_bdev)) return -EACCES; #endif + if (flags & MS_RDONLY) acct_auto_close(sb); shrink_dcache_sb(sb); diff --git a/include/linux/fs.h b/include/linux/fs.h index 72dfbd42397..502d96ef345 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2334,6 +2334,7 @@ extern void get_filesystem(struct file_system_type *fs); extern void put_filesystem(struct file_system_type *fs); extern struct file_system_type *get_fs_type(const char *name); extern struct super_block *get_super(struct block_device *); +extern struct super_block *get_active_super(struct block_device *bdev); extern struct super_block *user_get_super(dev_t); extern void drop_super(struct super_block *sb); -- cgit v1.2.3 From 25d9e2d15286281ec834b829a4aaf8969011f1cd Mon Sep 17 00:00:00 2001 From: "npiggin@suse.de" Date: Fri, 21 Aug 2009 02:35:05 +1000 Subject: truncate: new helpers Introduce new truncate helpers truncate_pagecache and inode_newsize_ok. vmtruncate is also consolidated from mm/memory.c and mm/nommu.c and into mm/truncate.c. Reviewed-by: Christoph Hellwig Signed-off-by: Nick Piggin Signed-off-by: Al Viro --- Documentation/vm/locking | 2 +- fs/attr.c | 46 ++++++++++++++++++++++++++++++++-- include/linux/fs.h | 3 ++- include/linux/mm.h | 5 ++-- mm/filemap.c | 2 +- mm/memory.c | 62 +++------------------------------------------- mm/mremap.c | 4 +-- mm/nommu.c | 40 ------------------------------ mm/truncate.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++++ 9 files changed, 120 insertions(+), 108 deletions(-) (limited to 'include/linux/fs.h') diff --git a/Documentation/vm/locking b/Documentation/vm/locking index f366fa95617..25fadb44876 100644 --- a/Documentation/vm/locking +++ b/Documentation/vm/locking @@ -80,7 +80,7 @@ Note: PTL can also be used to guarantee that no new clones using the mm start up ... this is a loose form of stability on mm_users. For example, it is used in copy_mm to protect against a racing tlb_gather_mmu single address space optimization, so that the zap_page_range (from -vmtruncate) does not lose sending ipi's to cloned threads that might +truncate) does not lose sending ipi's to cloned threads that might be spawned underneath it and go to user mode to drag in pte's into tlbs. swap_lock diff --git a/fs/attr.c b/fs/attr.c index 9fe1b1bd30a..96d394bdadd 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -18,7 +18,7 @@ /* Taken over from the old code... */ /* POSIX UID/GID verification for setting inode attributes. */ -int inode_change_ok(struct inode *inode, struct iattr *attr) +int inode_change_ok(const struct inode *inode, struct iattr *attr) { int retval = -EPERM; unsigned int ia_valid = attr->ia_valid; @@ -60,9 +60,51 @@ fine: error: return retval; } - EXPORT_SYMBOL(inode_change_ok); +/** + * inode_newsize_ok - may this inode be truncated to a given size + * @inode: the inode to be truncated + * @offset: the new size to assign to the inode + * @Returns: 0 on success, -ve errno on failure + * + * inode_newsize_ok will check filesystem limits and ulimits to check that the + * new inode size is within limits. inode_newsize_ok will also send SIGXFSZ + * when necessary. Caller must not proceed with inode size change if failure is + * returned. @inode must be a file (not directory), with appropriate + * permissions to allow truncate (inode_newsize_ok does NOT check these + * conditions). + * + * inode_newsize_ok must be called with i_mutex held. + */ +int inode_newsize_ok(const struct inode *inode, loff_t offset) +{ + if (inode->i_size < offset) { + unsigned long limit; + + limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; + if (limit != RLIM_INFINITY && offset > limit) + goto out_sig; + if (offset > inode->i_sb->s_maxbytes) + goto out_big; + } else { + /* + * truncation of in-use swapfiles is disallowed - it would + * cause subsequent swapout to scribble on the now-freed + * blocks. + */ + if (IS_SWAPFILE(inode)) + return -ETXTBSY; + } + + return 0; +out_sig: + send_sig(SIGXFSZ, current, 0); +out_big: + return -EFBIG; +} +EXPORT_SYMBOL(inode_newsize_ok); + int inode_setattr(struct inode * inode, struct iattr * attr) { unsigned int ia_valid = attr->ia_valid; diff --git a/include/linux/fs.h b/include/linux/fs.h index 502d96ef345..2b08b5ce09b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2382,7 +2382,8 @@ extern int buffer_migrate_page(struct address_space *, #define buffer_migrate_page NULL #endif -extern int inode_change_ok(struct inode *, struct iattr *); +extern int inode_change_ok(const struct inode *, struct iattr *); +extern int inode_newsize_ok(const struct inode *, loff_t offset); extern int __must_check inode_setattr(struct inode *, struct iattr *); extern void file_update_time(struct file *file); diff --git a/include/linux/mm.h b/include/linux/mm.h index b6eae5e3144..8347e938fb2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -791,8 +791,9 @@ static inline void unmap_shared_mapping_range(struct address_space *mapping, unmap_mapping_range(mapping, holebegin, holelen, 0); } -extern int vmtruncate(struct inode * inode, loff_t offset); -extern int vmtruncate_range(struct inode * inode, loff_t offset, loff_t end); +extern void truncate_pagecache(struct inode *inode, loff_t old, loff_t new); +extern int vmtruncate(struct inode *inode, loff_t offset); +extern int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end); #ifdef CONFIG_MMU extern int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, diff --git a/mm/filemap.c b/mm/filemap.c index bcc7372aebb..33349adb227 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -58,7 +58,7 @@ /* * Lock ordering: * - * ->i_mmap_lock (vmtruncate) + * ->i_mmap_lock (truncate_pagecache) * ->private_lock (__free_pte->__set_page_dirty_buffers) * ->swap_lock (exclusive_swap_page, others) * ->mapping->tree_lock diff --git a/mm/memory.c b/mm/memory.c index b1443ac07c0..ebcd3decac8 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -297,7 +297,8 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr = vma->vm_start; /* - * Hide vma from rmap and vmtruncate before freeing pgtables + * Hide vma from rmap and truncate_pagecache before freeing + * pgtables */ anon_vma_unlink(vma); unlink_file_vma(vma); @@ -2407,7 +2408,7 @@ restart: * @mapping: the address space containing mmaps to be unmapped. * @holebegin: byte in first page to unmap, relative to the start of * the underlying file. This will be rounded down to a PAGE_SIZE - * boundary. Note that this is different from vmtruncate(), which + * boundary. Note that this is different from truncate_pagecache(), which * must keep the partial page. In contrast, we must get rid of * partial pages. * @holelen: size of prospective hole in bytes. This will be rounded @@ -2458,63 +2459,6 @@ void unmap_mapping_range(struct address_space *mapping, } EXPORT_SYMBOL(unmap_mapping_range); -/** - * vmtruncate - unmap mappings "freed" by truncate() syscall - * @inode: inode of the file used - * @offset: file offset to start truncating - * - * NOTE! We have to be ready to update the memory sharing - * between the file and the memory map for a potential last - * incomplete page. Ugly, but necessary. - */ -int vmtruncate(struct inode * inode, loff_t offset) -{ - if (inode->i_size < offset) { - unsigned long limit; - - limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; - if (limit != RLIM_INFINITY && offset > limit) - goto out_sig; - if (offset > inode->i_sb->s_maxbytes) - goto out_big; - i_size_write(inode, offset); - } else { - struct address_space *mapping = inode->i_mapping; - - /* - * truncation of in-use swapfiles is disallowed - it would - * cause subsequent swapout to scribble on the now-freed - * blocks. - */ - if (IS_SWAPFILE(inode)) - return -ETXTBSY; - i_size_write(inode, offset); - - /* - * unmap_mapping_range is called twice, first simply for - * efficiency so that truncate_inode_pages does fewer - * single-page unmaps. However after this first call, and - * before truncate_inode_pages finishes, it is possible for - * private pages to be COWed, which remain after - * truncate_inode_pages finishes, hence the second - * unmap_mapping_range call must be made for correctness. - */ - unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); - truncate_inode_pages(mapping, offset); - unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); - } - - if (inode->i_op->truncate) - inode->i_op->truncate(inode); - return 0; - -out_sig: - send_sig(SIGXFSZ, current, 0); -out_big: - return -EFBIG; -} -EXPORT_SYMBOL(vmtruncate); - int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) { struct address_space *mapping = inode->i_mapping; diff --git a/mm/mremap.c b/mm/mremap.c index 20a07dba6be..97bff254771 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -86,8 +86,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, if (vma->vm_file) { /* * Subtle point from Rajesh Venkatasubramanian: before - * moving file-based ptes, we must lock vmtruncate out, - * since it might clean the dst vma before the src vma, + * moving file-based ptes, we must lock truncate_pagecache + * out, since it might clean the dst vma before the src vma, * and we propagate stale pages into the dst afterward. */ mapping = vma->vm_file->f_mapping; diff --git a/mm/nommu.c b/mm/nommu.c index 8d484241d03..56a446f0597 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -82,46 +82,6 @@ DECLARE_RWSEM(nommu_region_sem); struct vm_operations_struct generic_file_vm_ops = { }; -/* - * Handle all mappings that got truncated by a "truncate()" - * system call. - * - * NOTE! We have to be ready to update the memory sharing - * between the file and the memory map for a potential last - * incomplete page. Ugly, but necessary. - */ -int vmtruncate(struct inode *inode, loff_t offset) -{ - struct address_space *mapping = inode->i_mapping; - unsigned long limit; - - if (inode->i_size < offset) - goto do_expand; - i_size_write(inode, offset); - - truncate_inode_pages(mapping, offset); - goto out_truncate; - -do_expand: - limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; - if (limit != RLIM_INFINITY && offset > limit) - goto out_sig; - if (offset > inode->i_sb->s_maxbytes) - goto out; - i_size_write(inode, offset); - -out_truncate: - if (inode->i_op->truncate) - inode->i_op->truncate(inode); - return 0; -out_sig: - send_sig(SIGXFSZ, current, 0); -out: - return -EFBIG; -} - -EXPORT_SYMBOL(vmtruncate); - /* * Return the total memory allocated for this pointer, not * just what the caller asked for. diff --git a/mm/truncate.c b/mm/truncate.c index ccc3ecf7cb9..5900afca0fa 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -465,3 +465,67 @@ int invalidate_inode_pages2(struct address_space *mapping) return invalidate_inode_pages2_range(mapping, 0, -1); } EXPORT_SYMBOL_GPL(invalidate_inode_pages2); + +/** + * truncate_pagecache - unmap and remove pagecache that has been truncated + * @inode: inode + * @old: old file offset + * @new: new file offset + * + * inode's new i_size must already be written before truncate_pagecache + * is called. + * + * This function should typically be called before the filesystem + * releases resources associated with the freed range (eg. deallocates + * blocks). This way, pagecache will always stay logically coherent + * with on-disk format, and the filesystem would not have to deal with + * situations such as writepage being called for a page that has already + * had its underlying blocks deallocated. + */ +void truncate_pagecache(struct inode *inode, loff_t old, loff_t new) +{ + if (new < old) { + struct address_space *mapping = inode->i_mapping; + + /* + * unmap_mapping_range is called twice, first simply for + * efficiency so that truncate_inode_pages does fewer + * single-page unmaps. However after this first call, and + * before truncate_inode_pages finishes, it is possible for + * private pages to be COWed, which remain after + * truncate_inode_pages finishes, hence the second + * unmap_mapping_range call must be made for correctness. + */ + unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); + truncate_inode_pages(mapping, new); + unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); + } +} +EXPORT_SYMBOL(truncate_pagecache); + +/** + * vmtruncate - unmap mappings "freed" by truncate() syscall + * @inode: inode of the file used + * @offset: file offset to start truncating + * + * NOTE! We have to be ready to update the memory sharing + * between the file and the memory map for a potential last + * incomplete page. Ugly, but necessary. + */ +int vmtruncate(struct inode *inode, loff_t offset) +{ + loff_t oldsize; + int error; + + error = inode_newsize_ok(inode, offset); + if (error) + return error; + oldsize = inode->i_size; + i_size_write(inode, offset); + truncate_pagecache(inode, oldsize, offset); + if (inode->i_op->truncate) + inode->i_op->truncate(inode); + + return error; +} +EXPORT_SYMBOL(vmtruncate); -- cgit v1.2.3