summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/v9fs.c16
-rw-r--r--fs/9p/vfs_super.c5
-rw-r--r--fs/Kconfig5
-rw-r--r--fs/Makefile1
-rw-r--r--fs/adfs/super.c3
-rw-r--r--fs/affs/super.c7
-rw-r--r--fs/afs/file.c2
-rw-r--r--fs/afs/fsclient.c8
-rw-r--r--fs/afs/internal.h2
-rw-r--r--fs/afs/mntpt.c4
-rw-r--r--fs/afs/rxrpc.c3
-rw-r--r--fs/afs/super.c7
-rw-r--r--fs/aio.c158
-rw-r--r--fs/anon_inodes.c109
-rw-r--r--fs/attr.c2
-rw-r--r--fs/autofs4/autofs_i.h1
-rw-r--r--fs/autofs4/dev-ioctl.c3
-rw-r--r--fs/autofs4/expire.c2
-rw-r--r--fs/autofs4/init.c6
-rw-r--r--fs/autofs4/inode.c12
-rw-r--r--fs/autofs4/waitq.c24
-rw-r--r--fs/bad_inode.c2
-rw-r--r--fs/befs/linuxvfs.c3
-rw-r--r--fs/bfs/inode.c3
-rw-r--r--fs/binfmt_aout.c19
-rw-r--r--fs/binfmt_elf.c62
-rw-r--r--fs/binfmt_elf_fdpic.c7
-rw-r--r--fs/binfmt_em86.c3
-rw-r--r--fs/binfmt_flat.c7
-rw-r--r--fs/binfmt_misc.c10
-rw-r--r--fs/binfmt_script.c3
-rw-r--r--fs/binfmt_som.c4
-rw-r--r--fs/bio-integrity.c10
-rw-r--r--fs/bio.c12
-rw-r--r--fs/block_dev.c24
-rw-r--r--fs/btrfs/Kconfig19
-rw-r--r--fs/btrfs/Makefile3
-rw-r--r--fs/btrfs/async-thread.c15
-rw-r--r--fs/btrfs/async-thread.h4
-rw-r--r--fs/btrfs/backref.c1159
-rw-r--r--fs/btrfs/backref.h10
-rw-r--r--fs/btrfs/btrfs_inode.h3
-rw-r--r--fs/btrfs/check-integrity.c3068
-rw-r--r--fs/btrfs/check-integrity.h36
-rw-r--r--fs/btrfs/compression.c52
-rw-r--r--fs/btrfs/compression.h2
-rw-r--r--fs/btrfs/ctree.c426
-rw-r--r--fs/btrfs/ctree.h370
-rw-r--r--fs/btrfs/delayed-inode.c78
-rw-r--r--fs/btrfs/delayed-ref.c174
-rw-r--r--fs/btrfs/delayed-ref.h104
-rw-r--r--fs/btrfs/dir-item.c10
-rw-r--r--fs/btrfs/disk-io.c790
-rw-r--r--fs/btrfs/disk-io.h16
-rw-r--r--fs/btrfs/export.c4
-rw-r--r--fs/btrfs/extent-tree.c1106
-rw-r--r--fs/btrfs/extent_io.c1164
-rw-r--r--fs/btrfs/extent_io.h65
-rw-r--r--fs/btrfs/extent_map.h4
-rw-r--r--fs/btrfs/file-item.c61
-rw-r--r--fs/btrfs/file.c92
-rw-r--r--fs/btrfs/free-space-cache.c440
-rw-r--r--fs/btrfs/inode-item.c6
-rw-r--r--fs/btrfs/inode-map.c23
-rw-r--r--fs/btrfs/inode.c595
-rw-r--r--fs/btrfs/ioctl.c507
-rw-r--r--fs/btrfs/ioctl.h54
-rw-r--r--fs/btrfs/locking.c59
-rw-r--r--fs/btrfs/locking.h4
-rw-r--r--fs/btrfs/lzo.c4
-rw-r--r--fs/btrfs/ordered-data.c60
-rw-r--r--fs/btrfs/ordered-data.h24
-rw-r--r--fs/btrfs/orphan.c2
-rw-r--r--fs/btrfs/reada.c12
-rw-r--r--fs/btrfs/relocation.c148
-rw-r--r--fs/btrfs/root-tree.c25
-rw-r--r--fs/btrfs/scrub.c1423
-rw-r--r--fs/btrfs/struct-funcs.c53
-rw-r--r--fs/btrfs/super.c388
-rw-r--r--fs/btrfs/transaction.c237
-rw-r--r--fs/btrfs/transaction.h3
-rw-r--r--fs/btrfs/tree-log.c104
-rw-r--r--fs/btrfs/tree-log.h2
-rw-r--r--fs/btrfs/ulist.c220
-rw-r--r--fs/btrfs/ulist.h68
-rw-r--r--fs/btrfs/volumes.c1202
-rw-r--r--fs/btrfs/volumes.h58
-rw-r--r--fs/btrfs/xattr.c2
-rw-r--r--fs/btrfs/zlib.c4
-rw-r--r--fs/buffer.c17
-rw-r--r--fs/cachefiles/namei.c3
-rw-r--r--fs/ceph/caps.c4
-rw-r--r--fs/ceph/dir.c80
-rw-r--r--fs/ceph/export.c6
-rw-r--r--fs/ceph/inode.c14
-rw-r--r--fs/ceph/mds_client.c21
-rw-r--r--fs/ceph/mds_client.h7
-rw-r--r--fs/ceph/snap.c2
-rw-r--r--fs/ceph/super.c38
-rw-r--r--fs/ceph/super.h5
-rw-r--r--fs/ceph/xattr.c224
-rw-r--r--fs/cifs/Kconfig7
-rw-r--r--fs/cifs/README6
-rw-r--r--fs/cifs/cifs_debug.c82
-rw-r--r--fs/cifs/cifs_debug.h4
-rw-r--r--fs/cifs/cifs_spnego.c10
-rw-r--r--fs/cifs/cifs_unicode.c41
-rw-r--r--fs/cifs/cifs_unicode.h20
-rw-r--r--fs/cifs/cifsacl.c3
-rw-r--r--fs/cifs/cifsencrypt.c21
-rw-r--r--fs/cifs/cifsfs.c45
-rw-r--r--fs/cifs/cifsfs.h2
-rw-r--r--fs/cifs/cifsglob.h88
-rw-r--r--fs/cifs/cifsproto.h29
-rw-r--r--fs/cifs/cifssmb.c330
-rw-r--r--fs/cifs/connect.c1809
-rw-r--r--fs/cifs/dir.c26
-rw-r--r--fs/cifs/file.c355
-rw-r--r--fs/cifs/inode.c28
-rw-r--r--fs/cifs/misc.c119
-rw-r--r--fs/cifs/netmisc.c6
-rw-r--r--fs/cifs/readdir.c9
-rw-r--r--fs/cifs/sess.c45
-rw-r--r--fs/cifs/smbencrypt.c2
-rw-r--r--fs/cifs/transport.c305
-rw-r--r--fs/cifs/xattr.c6
-rw-r--r--fs/coda/inode.c7
-rw-r--r--fs/coda/psdev.c1
-rw-r--r--fs/coda/upcall.c1
-rw-r--r--fs/compat.c83
-rw-r--r--fs/compat_ioctl.c4
-rw-r--r--fs/configfs/configfs_internal.h7
-rw-r--r--fs/configfs/dir.c72
-rw-r--r--fs/configfs/inode.c62
-rw-r--r--fs/configfs/mount.c16
-rw-r--r--fs/configfs/symlink.c12
-rw-r--r--fs/cramfs/inode.c12
-rw-r--r--fs/dcache.c112
-rw-r--r--fs/dcookies.c2
-rw-r--r--fs/debugfs/file.c18
-rw-r--r--fs/debugfs/inode.c149
-rw-r--r--fs/devpts/inode.c88
-rw-r--r--fs/direct-io.c61
-rw-r--r--fs/dlm/debug_fs.c9
-rw-r--r--fs/dlm/dir.c17
-rw-r--r--fs/dlm/lock.c8
-rw-r--r--fs/dlm/lock.h3
-rw-r--r--fs/dlm/lowcomms.c24
-rw-r--r--fs/ecryptfs/crypto.c122
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h11
-rw-r--r--fs/ecryptfs/file.c9
-rw-r--r--fs/ecryptfs/inode.c50
-rw-r--r--fs/ecryptfs/keystore.c14
-rw-r--r--fs/ecryptfs/main.c19
-rw-r--r--fs/ecryptfs/miscdev.c140
-rw-r--r--fs/ecryptfs/mmap.c12
-rw-r--r--fs/ecryptfs/read_write.c100
-rw-r--r--fs/ecryptfs/super.c15
-rw-r--r--fs/efs/super.c3
-rw-r--r--fs/eventfd.c2
-rw-r--r--fs/eventpoll.c310
-rw-r--r--fs/exec.c93
-rw-r--r--fs/exofs/dir.c4
-rw-r--r--fs/exofs/namei.c13
-rw-r--r--fs/exofs/super.c11
-rw-r--r--fs/ext2/dir.c4
-rw-r--r--fs/ext2/ext2.h631
-rw-r--r--fs/ext2/ioctl.c22
-rw-r--r--fs/ext2/namei.c13
-rw-r--r--fs/ext2/super.c4
-rw-r--r--fs/ext2/xattr_security.c5
-rw-r--r--fs/ext2/xattr_trusted.c5
-rw-r--r--fs/ext2/xip.c2
-rw-r--r--fs/ext3/acl.c8
-rw-r--r--fs/ext3/balloc.c94
-rw-r--r--fs/ext3/bitmap.c4
-rw-r--r--fs/ext3/dir.c7
-rw-r--r--fs/ext3/ext3.h1322
-rw-r--r--fs/ext3/ext3_jbd.c2
-rw-r--r--fs/ext3/file.c6
-rw-r--r--fs/ext3/fsync.c8
-rw-r--r--fs/ext3/hash.c4
-rw-r--r--fs/ext3/ialloc.c13
-rw-r--r--fs/ext3/inode.c21
-rw-r--r--fs/ext3/ioctl.c7
-rw-r--r--fs/ext3/namei.c14
-rw-r--r--fs/ext3/resize.c5
-rw-r--r--fs/ext3/super.c21
-rw-r--r--fs/ext3/symlink.c4
-rw-r--r--fs/ext3/xattr.c7
-rw-r--r--fs/ext3/xattr_security.c6
-rw-r--r--fs/ext3/xattr_trusted.c6
-rw-r--r--fs/ext3/xattr_user.c5
-rw-r--r--fs/ext4/balloc.c63
-rw-r--r--fs/ext4/dir.c227
-rw-r--r--fs/ext4/ext4.h40
-rw-r--r--fs/ext4/ext4_extents.h4
-rw-r--r--fs/ext4/ext4_jbd2.h128
-rw-r--r--fs/ext4/extents.c330
-rw-r--r--fs/ext4/fsync.c2
-rw-r--r--fs/ext4/hash.c4
-rw-r--r--fs/ext4/ialloc.c260
-rw-r--r--fs/ext4/inode.c95
-rw-r--r--fs/ext4/mballoc.c342
-rw-r--r--fs/ext4/mballoc.h20
-rw-r--r--fs/ext4/migrate.c2
-rw-r--r--fs/ext4/mmp.c4
-rw-r--r--fs/ext4/namei.c2
-rw-r--r--fs/ext4/page-io.c11
-rw-r--r--fs/ext4/resize.c37
-rw-r--r--fs/ext4/super.c1083
-rw-r--r--fs/ext4/xattr.c25
-rw-r--r--fs/fat/inode.c8
-rw-r--r--fs/fat/namei_vfat.c83
-rw-r--r--fs/fcntl.c18
-rw-r--r--fs/file.c54
-rw-r--r--fs/file_table.c3
-rw-r--r--fs/freevxfs/vxfs_super.c3
-rw-r--r--fs/fs-writeback.c42
-rw-r--r--fs/fs_struct.c31
-rw-r--r--fs/fuse/dev.c61
-rw-r--r--fs/fuse/dir.c58
-rw-r--r--fs/fuse/file.c62
-rw-r--r--fs/fuse/fuse_i.h10
-rw-r--r--fs/fuse/inode.c9
-rw-r--r--fs/gfs2/Kconfig7
-rw-r--r--fs/gfs2/aops.c16
-rw-r--r--fs/gfs2/bmap.c10
-rw-r--r--fs/gfs2/dir.c4
-rw-r--r--fs/gfs2/file.c16
-rw-r--r--fs/gfs2/glock.c226
-rw-r--r--fs/gfs2/glock.h7
-rw-r--r--fs/gfs2/incore.h110
-rw-r--r--fs/gfs2/inode.c26
-rw-r--r--fs/gfs2/lock_dlm.c1116
-rw-r--r--fs/gfs2/log.c244
-rw-r--r--fs/gfs2/log.h5
-rw-r--r--fs/gfs2/lops.c103
-rw-r--r--fs/gfs2/main.c28
-rw-r--r--fs/gfs2/ops_fstype.c48
-rw-r--r--fs/gfs2/quota.c6
-rw-r--r--fs/gfs2/recovery.c11
-rw-r--r--fs/gfs2/rgrp.c206
-rw-r--r--fs/gfs2/rgrp.h10
-rw-r--r--fs/gfs2/super.c3
-rw-r--r--fs/gfs2/sys.c33
-rw-r--r--fs/gfs2/sys.h2
-rw-r--r--fs/gfs2/trace_gfs2.h60
-rw-r--r--fs/gfs2/util.c1
-rw-r--r--fs/gfs2/util.h3
-rw-r--r--fs/gfs2/xattr.c16
-rw-r--r--fs/hfs/super.c6
-rw-r--r--fs/hfsplus/hfsplus_fs.h5
-rw-r--r--fs/hfsplus/hfsplus_raw.h2
-rw-r--r--fs/hfsplus/inode.c2
-rw-r--r--fs/hfsplus/ioctl.c34
-rw-r--r--fs/hfsplus/super.c17
-rw-r--r--fs/hostfs/hostfs.h3
-rw-r--r--fs/hostfs/hostfs_kern.c9
-rw-r--r--fs/hostfs/hostfs_user.c4
-rw-r--r--fs/hpfs/super.c6
-rw-r--r--fs/hppfs/hppfs.c9
-rw-r--r--fs/hugetlbfs/inode.c153
-rw-r--r--fs/inode.c43
-rw-r--r--fs/ioctl.c2
-rw-r--r--fs/ioprio.c24
-rw-r--r--fs/isofs/inode.c3
-rw-r--r--fs/jbd/checkpoint.c27
-rw-r--r--fs/jbd/journal.c14
-rw-r--r--fs/jbd/recovery.c4
-rw-r--r--fs/jbd/transaction.c4
-rw-r--r--fs/jbd2/checkpoint.c140
-rw-r--r--fs/jbd2/commit.c52
-rw-r--r--fs/jbd2/journal.c376
-rw-r--r--fs/jbd2/recovery.c5
-rw-r--r--fs/jbd2/revoke.c12
-rw-r--r--fs/jbd2/transaction.c52
-rw-r--r--fs/jffs2/acl.c2
-rw-r--r--fs/jffs2/background.c29
-rw-r--r--fs/jffs2/build.c6
-rw-r--r--fs/jffs2/compr.c32
-rw-r--r--fs/jffs2/compr_lzo.c1
-rw-r--r--fs/jffs2/compr_rubin.c2
-rw-r--r--fs/jffs2/compr_zlib.c45
-rw-r--r--fs/jffs2/debug.c22
-rw-r--r--fs/jffs2/debug.h50
-rw-r--r--fs/jffs2/dir.c41
-rw-r--r--fs/jffs2/erase.c74
-rw-r--r--fs/jffs2/file.c33
-rw-r--r--fs/jffs2/fs.c73
-rw-r--r--fs/jffs2/gc.c322
-rw-r--r--fs/jffs2/malloc.c2
-rw-r--r--fs/jffs2/nodelist.c30
-rw-r--r--fs/jffs2/nodemgmt.c214
-rw-r--r--fs/jffs2/os-linux.h4
-rw-r--r--fs/jffs2/read.c70
-rw-r--r--fs/jffs2/readinode.c2
-rw-r--r--fs/jffs2/scan.c229
-rw-r--r--fs/jffs2/security.c4
-rw-r--r--fs/jffs2/summary.c16
-rw-r--r--fs/jffs2/super.c30
-rw-r--r--fs/jffs2/symlink.c7
-rw-r--r--fs/jffs2/wbuf.c148
-rw-r--r--fs/jffs2/write.c113
-rw-r--r--fs/jffs2/xattr.c2
-rw-r--r--fs/jfs/namei.c13
-rw-r--r--fs/jfs/super.c12
-rw-r--r--fs/libfs.c19
-rw-r--r--fs/lockd/clnt4xdr.c2
-rw-r--r--fs/lockd/clntlock.c3
-rw-r--r--fs/lockd/clntxdr.c8
-rw-r--r--fs/lockd/host.c42
-rw-r--r--fs/lockd/mon.c23
-rw-r--r--fs/lockd/netns.h12
-rw-r--r--fs/lockd/svc.c119
-rw-r--r--fs/lockd/svclock.c59
-rw-r--r--fs/locks.c3
-rw-r--r--fs/logfs/dev_mtd.c6
-rw-r--r--fs/logfs/dir.c23
-rw-r--r--fs/logfs/file.c2
-rw-r--r--fs/logfs/gc.c2
-rw-r--r--fs/logfs/inode.c4
-rw-r--r--fs/logfs/journal.c1
-rw-r--r--fs/logfs/logfs.h5
-rw-r--r--fs/logfs/readwrite.c89
-rw-r--r--fs/logfs/segment.c55
-rw-r--r--fs/logfs/super.c15
-rw-r--r--fs/minix/dir.c4
-rw-r--r--fs/minix/inode.c38
-rw-r--r--fs/minix/minix.h1
-rw-r--r--fs/minix/namei.c14
-rw-r--r--fs/mpage.c6
-rw-r--r--fs/namei.c410
-rw-r--r--fs/ncpfs/file.c1
-rw-r--r--fs/ncpfs/inode.c7
-rw-r--r--fs/ncpfs/mmap.c1
-rw-r--r--fs/nfs/Kconfig29
-rw-r--r--fs/nfs/blocklayout/blocklayout.c363
-rw-r--r--fs/nfs/blocklayout/blocklayout.h23
-rw-r--r--fs/nfs/blocklayout/blocklayoutdev.c46
-rw-r--r--fs/nfs/blocklayout/blocklayoutdm.c33
-rw-r--r--fs/nfs/blocklayout/extents.c178
-rw-r--r--fs/nfs/cache_lib.c61
-rw-r--r--fs/nfs/cache_lib.h10
-rw-r--r--fs/nfs/callback.c19
-rw-r--r--fs/nfs/callback.h5
-rw-r--r--fs/nfs/callback_proc.c99
-rw-r--r--fs/nfs/callback_xdr.c25
-rw-r--r--fs/nfs/client.c250
-rw-r--r--fs/nfs/delegation.c68
-rw-r--r--fs/nfs/delegation.h4
-rw-r--r--fs/nfs/dir.c35
-rw-r--r--fs/nfs/direct.c7
-rw-r--r--fs/nfs/dns_resolve.c130
-rw-r--r--fs/nfs/dns_resolve.h14
-rw-r--r--fs/nfs/file.c3
-rw-r--r--fs/nfs/fscache.c2
-rw-r--r--fs/nfs/getroot.c7
-rw-r--r--fs/nfs/idmap.c734
-rw-r--r--fs/nfs/inode.c122
-rw-r--r--fs/nfs/internal.h17
-rw-r--r--fs/nfs/mount_clnt.c16
-rw-r--r--fs/nfs/namespace.c5
-rw-r--r--fs/nfs/netns.h27
-rw-r--r--fs/nfs/nfs2xdr.c2
-rw-r--r--fs/nfs/nfs3acl.c2
-rw-r--r--fs/nfs/nfs3proc.c24
-rw-r--r--fs/nfs/nfs3xdr.c4
-rw-r--r--fs/nfs/nfs4_fs.h58
-rw-r--r--fs/nfs/nfs4filelayout.c271
-rw-r--r--fs/nfs/nfs4filelayout.h7
-rw-r--r--fs/nfs/nfs4filelayoutdev.c92
-rw-r--r--fs/nfs/nfs4namespace.c10
-rw-r--r--fs/nfs/nfs4proc.c726
-rw-r--r--fs/nfs/nfs4state.c353
-rw-r--r--fs/nfs/nfs4xdr.c702
-rw-r--r--fs/nfs/nfsroot.c2
-rw-r--r--fs/nfs/objlayout/objio_osd.c54
-rw-r--r--fs/nfs/objlayout/objlayout.c142
-rw-r--r--fs/nfs/objlayout/objlayout.h2
-rw-r--r--fs/nfs/pagelist.c92
-rw-r--r--fs/nfs/pnfs.c46
-rw-r--r--fs/nfs/pnfs.h98
-rw-r--r--fs/nfs/pnfs_dev.c4
-rw-r--r--fs/nfs/proc.c24
-rw-r--r--fs/nfs/read.c15
-rw-r--r--fs/nfs/super.c168
-rw-r--r--fs/nfs/sysctl.c2
-rw-r--r--fs/nfs/unlink.c45
-rw-r--r--fs/nfs/write.c217
-rw-r--r--fs/nfsd/Kconfig10
-rw-r--r--fs/nfsd/Makefile1
-rw-r--r--fs/nfsd/current_stateid.h28
-rw-r--r--fs/nfsd/export.c12
-rw-r--r--fs/nfsd/fault_inject.c91
-rw-r--r--fs/nfsd/fault_inject.h28
-rw-r--r--fs/nfsd/netns.h34
-rw-r--r--fs/nfsd/nfs4callback.c27
-rw-r--r--fs/nfsd/nfs4idmap.c64
-rw-r--r--fs/nfsd/nfs4proc.c125
-rw-r--r--fs/nfsd/nfs4recover.c663
-rw-r--r--fs/nfsd/nfs4state.c693
-rw-r--r--fs/nfsd/nfs4xdr.c135
-rw-r--r--fs/nfsd/nfsctl.c36
-rw-r--r--fs/nfsd/nfsd.h27
-rw-r--r--fs/nfsd/nfssvc.c48
-rw-r--r--fs/nfsd/state.h48
-rw-r--r--fs/nfsd/stats.c5
-rw-r--r--fs/nfsd/vfs.c61
-rw-r--r--fs/nfsd/vfs.h2
-rw-r--r--fs/nfsd/xdr4.h34
-rw-r--r--fs/nilfs2/cpfile.c94
-rw-r--r--fs/nilfs2/dat.c38
-rw-r--r--fs/nilfs2/dir.c4
-rw-r--r--fs/nilfs2/ifile.c4
-rw-r--r--fs/nilfs2/ioctl.c2
-rw-r--r--fs/nilfs2/mdt.c4
-rw-r--r--fs/nilfs2/namei.c11
-rw-r--r--fs/nilfs2/page.c8
-rw-r--r--fs/nilfs2/recovery.c4
-rw-r--r--fs/nilfs2/segbuf.c4
-rw-r--r--fs/nilfs2/sufile.c68
-rw-r--r--fs/nilfs2/super.c4
-rw-r--r--fs/nilfs2/the_nilfs.c7
-rw-r--r--fs/notify/mark.c8
-rw-r--r--fs/notify/notification.c3
-rw-r--r--fs/ntfs/aops.c20
-rw-r--r--fs/ntfs/attrib.c26
-rw-r--r--fs/ntfs/file.c16
-rw-r--r--fs/ntfs/layout.h4
-rw-r--r--fs/ntfs/mft.c6
-rw-r--r--fs/ntfs/super.c23
-rw-r--r--fs/ocfs2/aops.c16
-rw-r--r--fs/ocfs2/dlmfs/dlmfs.c14
-rw-r--r--fs/ocfs2/ioctl.c2
-rw-r--r--fs/ocfs2/namei.c2
-rw-r--r--fs/ocfs2/super.c51
-rw-r--r--fs/omfs/inode.c6
-rw-r--r--fs/open.c4
-rw-r--r--fs/openpromfs/inode.c3
-rw-r--r--fs/pipe.c11
-rw-r--r--fs/posix_acl.c2
-rw-r--r--fs/proc/array.c118
-rw-r--r--fs/proc/base.c253
-rw-r--r--fs/proc/inode.c17
-rw-r--r--fs/proc/internal.h12
-rw-r--r--fs/proc/kcore.c8
-rw-r--r--fs/proc/namespaces.c8
-rw-r--r--fs/proc/page.c2
-rw-r--r--fs/proc/proc_sysctl.c1276
-rw-r--r--fs/proc/root.c9
-rw-r--r--fs/proc/stat.c98
-rw-r--r--fs/proc/task_mmu.c360
-rw-r--r--fs/proc/task_nommu.c69
-rw-r--r--fs/proc/vmcore.c23
-rw-r--r--fs/pstore/inode.c57
-rw-r--r--fs/pstore/platform.c30
-rw-r--r--fs/qnx4/inode.c150
-rw-r--r--fs/qnx4/namei.c9
-rw-r--r--fs/qnx4/qnx4.h2
-rw-r--r--fs/qnx6/Kconfig26
-rw-r--r--fs/qnx6/Makefile7
-rw-r--r--fs/qnx6/README8
-rw-r--r--fs/qnx6/dir.c291
-rw-r--r--fs/qnx6/inode.c698
-rw-r--r--fs/qnx6/namei.c42
-rw-r--r--fs/qnx6/qnx6.h135
-rw-r--r--fs/qnx6/super_mmi.c150
-rw-r--r--fs/quota/dquot.c198
-rw-r--r--fs/quota/quota.c27
-rw-r--r--fs/ramfs/inode.c30
-rw-r--r--fs/read_write.c2
-rw-r--r--fs/readdir.c2
-rw-r--r--fs/reiserfs/acl.h76
-rw-r--r--fs/reiserfs/bitmap.c4
-rw-r--r--fs/reiserfs/dir.c2
-rw-r--r--fs/reiserfs/do_balan.c2
-rw-r--r--fs/reiserfs/file.c6
-rw-r--r--fs/reiserfs/fix_node.c2
-rw-r--r--fs/reiserfs/hashes.c2
-rw-r--r--fs/reiserfs/ibalance.c2
-rw-r--r--fs/reiserfs/inode.c6
-rw-r--r--fs/reiserfs/ioctl.c2
-rw-r--r--fs/reiserfs/item_ops.c2
-rw-r--r--fs/reiserfs/journal.c3
-rw-r--r--fs/reiserfs/lbalance.c4
-rw-r--r--fs/reiserfs/lock.c2
-rw-r--r--fs/reiserfs/namei.c6
-rw-r--r--fs/reiserfs/objectid.c3
-rw-r--r--fs/reiserfs/prints.c4
-rw-r--r--fs/reiserfs/procfs.c3
-rw-r--r--fs/reiserfs/reiserfs.h2923
-rw-r--r--fs/reiserfs/resize.c3
-rw-r--r--fs/reiserfs/stree.c6
-rw-r--r--fs/reiserfs/super.c12
-rw-r--r--fs/reiserfs/tail_conversion.c6
-rw-r--r--fs/reiserfs/xattr.c6
-rw-r--r--fs/reiserfs/xattr.h122
-rw-r--r--fs/reiserfs/xattr_acl.c6
-rw-r--r--fs/reiserfs/xattr_security.c4
-rw-r--r--fs/reiserfs/xattr_trusted.c4
-rw-r--r--fs/reiserfs/xattr_user.c4
-rw-r--r--fs/romfs/storage.c2
-rw-r--r--fs/romfs/super.c6
-rw-r--r--fs/select.c46
-rw-r--r--fs/seq_file.c114
-rw-r--r--fs/signalfd.c15
-rw-r--r--fs/splice.c14
-rw-r--r--fs/squashfs/block.c3
-rw-r--r--fs/squashfs/cache.c30
-rw-r--r--fs/squashfs/dir.c7
-rw-r--r--fs/squashfs/file.c8
-rw-r--r--fs/squashfs/inode.c4
-rw-r--r--fs/squashfs/namei.c5
-rw-r--r--fs/squashfs/squashfs_fs.h19
-rw-r--r--fs/squashfs/squashfs_fs_sb.h1
-rw-r--r--fs/squashfs/super.c10
-rw-r--r--fs/squashfs/symlink.c4
-rw-r--r--fs/stack.c2
-rw-r--r--fs/stat.c4
-rw-r--r--fs/statfs.c2
-rw-r--r--fs/super.c29
-rw-r--r--fs/sync.c2
-rw-r--r--fs/sysfs/dir.c227
-rw-r--r--fs/sysfs/file.c6
-rw-r--r--fs/sysfs/group.c6
-rw-r--r--fs/sysfs/inode.c16
-rw-r--r--fs/sysfs/mount.c5
-rw-r--r--fs/sysfs/sysfs.h17
-rw-r--r--fs/sysv/namei.c12
-rw-r--r--fs/sysv/super.c27
-rw-r--r--fs/sysv/sysv.h1
-rw-r--r--fs/ubifs/debug.c494
-rw-r--r--fs/ubifs/debug.h74
-rw-r--r--fs/ubifs/dir.c18
-rw-r--r--fs/ubifs/file.c4
-rw-r--r--fs/ubifs/journal.c7
-rw-r--r--fs/ubifs/recovery.c3
-rw-r--r--fs/ubifs/replay.c8
-rw-r--r--fs/ubifs/sb.c19
-rw-r--r--fs/ubifs/super.c6
-rw-r--r--fs/ubifs/tnc.c55
-rw-r--r--fs/ubifs/tnc_misc.c10
-rw-r--r--fs/ubifs/ubifs.h11
-rw-r--r--fs/udf/balloc.c84
-rw-r--r--fs/udf/file.c6
-rw-r--r--fs/udf/ialloc.c1
-rw-r--r--fs/udf/inode.c20
-rw-r--r--fs/udf/namei.c13
-rw-r--r--fs/udf/super.c11
-rw-r--r--fs/udf/udf_i.h1
-rw-r--r--fs/ufs/inode.c1
-rw-r--r--fs/ufs/namei.c14
-rw-r--r--fs/ufs/super.c8
-rw-r--r--fs/xattr.c42
-rw-r--r--fs/xattr_acl.c2
-rw-r--r--fs/xfs/Makefile3
-rw-r--r--fs/xfs/kmem.h6
-rw-r--r--fs/xfs/xfs_alloc.c36
-rw-r--r--fs/xfs/xfs_alloc.h12
-rw-r--r--fs/xfs/xfs_aops.c210
-rw-r--r--fs/xfs/xfs_aops.h4
-rw-r--r--fs/xfs/xfs_attr.c20
-rw-r--r--fs/xfs/xfs_attr_leaf.c49
-rw-r--r--fs/xfs/xfs_bmap.c138
-rw-r--r--fs/xfs/xfs_buf.c17
-rw-r--r--fs/xfs/xfs_buf.h1
-rw-r--r--fs/xfs/xfs_da_btree.c32
-rw-r--r--fs/xfs/xfs_dfrag.c67
-rw-r--r--fs/xfs/xfs_dir2_block.c1
-rw-r--r--fs/xfs/xfs_discard.c61
-rw-r--r--fs/xfs/xfs_dquot.c539
-rw-r--r--fs/xfs/xfs_dquot.h49
-rw-r--r--fs/xfs/xfs_file.c268
-rw-r--r--fs/xfs/xfs_fs_subr.c2
-rw-r--r--fs/xfs/xfs_iget.c71
-rw-r--r--fs/xfs/xfs_inode.c289
-rw-r--r--fs/xfs/xfs_inode.h141
-rw-r--r--fs/xfs/xfs_inode_item.c305
-rw-r--r--fs/xfs/xfs_inode_item.h16
-rw-r--r--fs/xfs/xfs_ioctl.c28
-rw-r--r--fs/xfs/xfs_ioctl32.c2
-rw-r--r--fs/xfs/xfs_iomap.c65
-rw-r--r--fs/xfs/xfs_iops.c117
-rw-r--r--fs/xfs/xfs_itable.c24
-rw-r--r--fs/xfs/xfs_log.c615
-rw-r--r--fs/xfs/xfs_log.h16
-rw-r--r--fs/xfs/xfs_log_priv.h28
-rw-r--r--fs/xfs/xfs_log_recover.c47
-rw-r--r--fs/xfs/xfs_mount.c8
-rw-r--r--fs/xfs/xfs_mount.h5
-rw-r--r--fs/xfs/xfs_qm.c863
-rw-r--r--fs/xfs/xfs_qm.h63
-rw-r--r--fs/xfs/xfs_qm_bhv.c42
-rw-r--r--fs/xfs/xfs_qm_stats.c105
-rw-r--r--fs/xfs/xfs_qm_stats.h53
-rw-r--r--fs/xfs/xfs_qm_syscalls.c142
-rw-r--r--fs/xfs/xfs_quota.h2
-rw-r--r--fs/xfs/xfs_quota_priv.h11
-rw-r--r--fs/xfs/xfs_rename.c11
-rw-r--r--fs/xfs/xfs_rtalloc.c9
-rw-r--r--fs/xfs/xfs_sb.h1
-rw-r--r--fs/xfs/xfs_stats.c99
-rw-r--r--fs/xfs/xfs_stats.h10
-rw-r--r--fs/xfs/xfs_super.c212
-rw-r--r--fs/xfs/xfs_super.h8
-rw-r--r--fs/xfs/xfs_sync.c55
-rw-r--r--fs/xfs/xfs_sync.h2
-rw-r--r--fs/xfs/xfs_trace.h140
-rw-r--r--fs/xfs/xfs_trans.c35
-rw-r--r--fs/xfs/xfs_trans_ail.c83
-rw-r--r--fs/xfs/xfs_trans_buf.c25
-rw-r--r--fs/xfs/xfs_trans_dquot.c19
-rw-r--r--fs/xfs/xfs_trans_inode.c8
-rw-r--r--fs/xfs/xfs_trans_priv.h3
-rw-r--r--fs/xfs/xfs_utils.c2
-rw-r--r--fs/xfs/xfs_vnode.h1
-rw-r--r--fs/xfs/xfs_vnodeops.c63
-rw-r--r--fs/xfs/xfs_vnodeops.h3
619 files changed, 37560 insertions, 16761 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 1964f98e74b..b85efa77394 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -594,21 +594,21 @@ static int __init init_v9fs(void)
int err;
pr_info("Installing v9fs 9p2000 file system support\n");
/* TODO: Setup list of registered trasnport modules */
- err = register_filesystem(&v9fs_fs_type);
- if (err < 0) {
- pr_err("Failed to register filesystem\n");
- return err;
- }
err = v9fs_cache_register();
if (err < 0) {
pr_err("Failed to register v9fs for caching\n");
- goto out_fs_unreg;
+ return err;
}
err = v9fs_sysfs_init();
if (err < 0) {
pr_err("Failed to register with sysfs\n");
+ goto out_cache;
+ }
+ err = register_filesystem(&v9fs_fs_type);
+ if (err < 0) {
+ pr_err("Failed to register filesystem\n");
goto out_sysfs_cleanup;
}
@@ -617,8 +617,8 @@ static int __init init_v9fs(void)
out_sysfs_cleanup:
v9fs_sysfs_cleanup();
-out_fs_unreg:
- unregister_filesystem(&v9fs_fs_type);
+out_cache:
+ v9fs_cache_unregister();
return err;
}
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 7b0cd87b07c..8c92a9ba833 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -155,9 +155,8 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
goto release_sb;
}
- root = d_alloc_root(inode);
+ root = d_make_root(inode);
if (!root) {
- iput(inode);
retval = -ENOMEM;
goto release_sb;
}
@@ -260,7 +259,7 @@ static int v9fs_statfs(struct dentry *dentry, struct kstatfs *buf)
if (v9fs_proto_dotl(v9ses)) {
res = p9_client_statfs(fid, &rs);
if (res == 0) {
- buf->f_type = V9FS_MAGIC;
+ buf->f_type = rs.type;
buf->f_bsize = rs.bsize;
buf->f_blocks = rs.blocks;
buf->f_bfree = rs.bfree;
diff --git a/fs/Kconfig b/fs/Kconfig
index d621f02a3f9..f95ae3a027f 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -4,6 +4,10 @@
menu "File systems"
+# Use unaligned word dcache accesses
+config DCACHE_WORD_ACCESS
+ bool
+
if BLOCK
source "fs/ext2/Kconfig"
@@ -210,6 +214,7 @@ source "fs/minix/Kconfig"
source "fs/omfs/Kconfig"
source "fs/hpfs/Kconfig"
source "fs/qnx4/Kconfig"
+source "fs/qnx6/Kconfig"
source "fs/romfs/Kconfig"
source "fs/pstore/Kconfig"
source "fs/sysv/Kconfig"
diff --git a/fs/Makefile b/fs/Makefile
index 93804d4d66e..2fb97793467 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -102,6 +102,7 @@ obj-$(CONFIG_UBIFS_FS) += ubifs/
obj-$(CONFIG_AFFS_FS) += affs/
obj-$(CONFIG_ROMFS_FS) += romfs/
obj-$(CONFIG_QNX4FS_FS) += qnx4/
+obj-$(CONFIG_QNX6FS_FS) += qnx6/
obj-$(CONFIG_AUTOFS4_FS) += autofs4/
obj-$(CONFIG_ADFS_FS) += adfs/
obj-$(CONFIG_FUSE_FS) += fuse/
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 8e3b36ace30..06fdcc9382c 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -483,10 +483,9 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
sb->s_d_op = &adfs_dentry_operations;
root = adfs_iget(sb, &root_obj);
- sb->s_root = d_alloc_root(root);
+ sb->s_root = d_make_root(root);
if (!sb->s_root) {
int i;
- iput(root);
for (i = 0; i < asb->s_map_size; i++)
brelse(asb->s_map[i].dm_bh);
kfree(asb->s_map);
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 8ba73fed796..0782653a05a 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -473,7 +473,7 @@ got_root:
root_inode = affs_iget(sb, root_block);
if (IS_ERR(root_inode)) {
ret = PTR_ERR(root_inode);
- goto out_error_noinode;
+ goto out_error;
}
if (AFFS_SB(sb)->s_flags & SF_INTL)
@@ -481,7 +481,7 @@ got_root:
else
sb->s_d_op = &affs_dentry_operations;
- sb->s_root = d_alloc_root(root_inode);
+ sb->s_root = d_make_root(root_inode);
if (!sb->s_root) {
printk(KERN_ERR "AFFS: Get root inode failed\n");
goto out_error;
@@ -494,9 +494,6 @@ got_root:
* Begin the cascaded cleanup ...
*/
out_error:
- if (root_inode)
- iput(root_inode);
-out_error_noinode:
kfree(sbi->s_bitmap);
affs_brelse(root_bh);
kfree(sbi->s_prefix);
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 14d89fa58fe..8f6e9234d56 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -251,7 +251,7 @@ static int afs_readpages(struct file *file, struct address_space *mapping,
ASSERT(key != NULL);
vnode = AFS_FS_I(mapping->host);
- if (vnode->flags & AFS_VNODE_DELETED) {
+ if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
_leave(" = -ESTALE");
return -ESTALE;
}
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index 2f213d109c2..b960ff05ea0 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -365,10 +365,10 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call,
_debug("extract data");
if (call->count > 0) {
page = call->reply3;
- buffer = kmap_atomic(page, KM_USER0);
+ buffer = kmap_atomic(page);
ret = afs_extract_data(call, skb, last, buffer,
call->count);
- kunmap_atomic(buffer, KM_USER0);
+ kunmap_atomic(buffer);
switch (ret) {
case 0: break;
case -EAGAIN: return 0;
@@ -411,9 +411,9 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call,
if (call->count < PAGE_SIZE) {
_debug("clear");
page = call->reply3;
- buffer = kmap_atomic(page, KM_USER0);
+ buffer = kmap_atomic(page);
memset(buffer + call->count, 0, PAGE_SIZE - call->count);
- kunmap_atomic(buffer, KM_USER0);
+ kunmap_atomic(buffer);
}
_leave(" = 0 [done]");
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index d2b0888126d..a306bb6d88d 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -109,7 +109,7 @@ struct afs_call {
unsigned reply_size; /* current size of reply */
unsigned first_offset; /* offset into mapping[first] */
unsigned last_to; /* amount of mapping[last] */
- unsigned short offset; /* offset into received data store */
+ unsigned offset; /* offset into received data store */
unsigned char unmarshall; /* unmarshalling phase */
bool incoming; /* T if incoming call */
bool send_pages; /* T if data from mapping should be sent */
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 8f4ce2658b7..298cf8919ec 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -200,9 +200,9 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
if (PageError(page))
goto error;
- buf = kmap_atomic(page, KM_USER0);
+ buf = kmap_atomic(page);
memcpy(devname, buf, size);
- kunmap_atomic(buf, KM_USER0);
+ kunmap_atomic(buf);
page_cache_release(page);
page = NULL;
}
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index e45a323aebb..8ad8c2a0703 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -314,6 +314,7 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp,
struct msghdr msg;
struct kvec iov[1];
int ret;
+ struct sk_buff *skb;
_enter("%x,{%d},", addr->s_addr, ntohs(call->port));
@@ -380,6 +381,8 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp,
error_do_abort:
rxrpc_kernel_abort_call(rxcall, RX_USER_ABORT);
+ while ((skb = skb_dequeue(&call->rx_queue)))
+ afs_free_skb(skb);
rxrpc_kernel_end_call(rxcall);
call->rxcall = NULL;
error_kill_call:
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 983ec59fc80..f02b31e7e64 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -301,7 +301,6 @@ static int afs_fill_super(struct super_block *sb,
{
struct afs_super_info *as = sb->s_fs_info;
struct afs_fid fid;
- struct dentry *root = NULL;
struct inode *inode = NULL;
int ret;
@@ -327,18 +326,16 @@ static int afs_fill_super(struct super_block *sb,
set_bit(AFS_VNODE_AUTOCELL, &AFS_FS_I(inode)->flags);
ret = -ENOMEM;
- root = d_alloc_root(inode);
- if (!root)
+ sb->s_root = d_make_root(inode);
+ if (!sb->s_root)
goto error;
sb->s_d_op = &afs_fs_dentry_operations;
- sb->s_root = root;
_leave(" = 0");
return 0;
error:
- iput(inode);
_leave(" = %d", ret);
return ret;
}
diff --git a/fs/aio.c b/fs/aio.c
index 78c514cfd21..da887604dfc 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -13,7 +13,7 @@
#include <linux/errno.h>
#include <linux/time.h>
#include <linux/aio_abi.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/syscalls.h>
#include <linux/backing-dev.h>
#include <linux/uio.h>
@@ -160,7 +160,7 @@ static int aio_setup_ring(struct kioctx *ctx)
info->nr = nr_events; /* trusted copy */
- ring = kmap_atomic(info->ring_pages[0], KM_USER0);
+ ring = kmap_atomic(info->ring_pages[0]);
ring->nr = nr_events; /* user copy */
ring->id = ctx->user_id;
ring->head = ring->tail = 0;
@@ -168,47 +168,38 @@ static int aio_setup_ring(struct kioctx *ctx)
ring->compat_features = AIO_RING_COMPAT_FEATURES;
ring->incompat_features = AIO_RING_INCOMPAT_FEATURES;
ring->header_length = sizeof(struct aio_ring);
- kunmap_atomic(ring, KM_USER0);
+ kunmap_atomic(ring);
return 0;
}
/* aio_ring_event: returns a pointer to the event at the given index from
- * kmap_atomic(, km). Release the pointer with put_aio_ring_event();
+ * kmap_atomic(). Release the pointer with put_aio_ring_event();
*/
#define AIO_EVENTS_PER_PAGE (PAGE_SIZE / sizeof(struct io_event))
#define AIO_EVENTS_FIRST_PAGE ((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event))
#define AIO_EVENTS_OFFSET (AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE)
-#define aio_ring_event(info, nr, km) ({ \
+#define aio_ring_event(info, nr) ({ \
unsigned pos = (nr) + AIO_EVENTS_OFFSET; \
struct io_event *__event; \
__event = kmap_atomic( \
- (info)->ring_pages[pos / AIO_EVENTS_PER_PAGE], km); \
+ (info)->ring_pages[pos / AIO_EVENTS_PER_PAGE]); \
__event += pos % AIO_EVENTS_PER_PAGE; \
__event; \
})
-#define put_aio_ring_event(event, km) do { \
+#define put_aio_ring_event(event) do { \
struct io_event *__event = (event); \
(void)__event; \
- kunmap_atomic((void *)((unsigned long)__event & PAGE_MASK), km); \
+ kunmap_atomic((void *)((unsigned long)__event & PAGE_MASK)); \
} while(0)
static void ctx_rcu_free(struct rcu_head *head)
{
struct kioctx *ctx = container_of(head, struct kioctx, rcu_head);
- unsigned nr_events = ctx->max_reqs;
-
kmem_cache_free(kioctx_cachep, ctx);
-
- if (nr_events) {
- spin_lock(&aio_nr_lock);
- BUG_ON(aio_nr - nr_events > aio_nr);
- aio_nr -= nr_events;
- spin_unlock(&aio_nr_lock);
- }
}
/* __put_ioctx
@@ -217,23 +208,23 @@ static void ctx_rcu_free(struct rcu_head *head)
*/
static void __put_ioctx(struct kioctx *ctx)
{
+ unsigned nr_events = ctx->max_reqs;
BUG_ON(ctx->reqs_active);
- cancel_delayed_work(&ctx->wq);
- cancel_work_sync(&ctx->wq.work);
+ cancel_delayed_work_sync(&ctx->wq);
aio_free_ring(ctx);
mmdrop(ctx->mm);
ctx->mm = NULL;
+ if (nr_events) {
+ spin_lock(&aio_nr_lock);
+ BUG_ON(aio_nr - nr_events > aio_nr);
+ aio_nr -= nr_events;
+ spin_unlock(&aio_nr_lock);
+ }
pr_debug("__put_ioctx: freeing %p\n", ctx);
call_rcu(&ctx->rcu_head, ctx_rcu_free);
}
-static inline void get_ioctx(struct kioctx *kioctx)
-{
- BUG_ON(atomic_read(&kioctx->users) <= 0);
- atomic_inc(&kioctx->users);
-}
-
static inline int try_get_ioctx(struct kioctx *kioctx)
{
return atomic_inc_not_zero(&kioctx->users);
@@ -253,7 +244,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
{
struct mm_struct *mm;
struct kioctx *ctx;
- int did_sync = 0;
+ int err = -ENOMEM;
/* Prevent overflows */
if ((nr_events > (0x10000000U / sizeof(struct io_event))) ||
@@ -262,7 +253,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
return ERR_PTR(-EINVAL);
}
- if ((unsigned long)nr_events > aio_max_nr)
+ if (!nr_events || (unsigned long)nr_events > aio_max_nr)
return ERR_PTR(-EAGAIN);
ctx = kmem_cache_zalloc(kioctx_cachep, GFP_KERNEL);
@@ -273,7 +264,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
mm = ctx->mm = current->mm;
atomic_inc(&mm->mm_count);
- atomic_set(&ctx->users, 1);
+ atomic_set(&ctx->users, 2);
spin_lock_init(&ctx->ctx_lock);
spin_lock_init(&ctx->ring_info.ring_lock);
init_waitqueue_head(&ctx->wait);
@@ -286,25 +277,14 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
goto out_freectx;
/* limit the number of system wide aios */
- do {
- spin_lock_bh(&aio_nr_lock);
- if (aio_nr + nr_events > aio_max_nr ||
- aio_nr + nr_events < aio_nr)
- ctx->max_reqs = 0;
- else
- aio_nr += ctx->max_reqs;
- spin_unlock_bh(&aio_nr_lock);
- if (ctx->max_reqs || did_sync)
- break;
-
- /* wait for rcu callbacks to have completed before giving up */
- synchronize_rcu();
- did_sync = 1;
- ctx->max_reqs = nr_events;
- } while (1);
-
- if (ctx->max_reqs == 0)
+ spin_lock(&aio_nr_lock);
+ if (aio_nr + nr_events > aio_max_nr ||
+ aio_nr + nr_events < aio_nr) {
+ spin_unlock(&aio_nr_lock);
goto out_cleanup;
+ }
+ aio_nr += ctx->max_reqs;
+ spin_unlock(&aio_nr_lock);
/* now link into global list. */
spin_lock(&mm->ioctx_lock);
@@ -316,27 +296,27 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
return ctx;
out_cleanup:
- __put_ioctx(ctx);
- return ERR_PTR(-EAGAIN);
-
+ err = -EAGAIN;
+ aio_free_ring(ctx);
out_freectx:
mmdrop(mm);
kmem_cache_free(kioctx_cachep, ctx);
- ctx = ERR_PTR(-ENOMEM);
-
- dprintk("aio: error allocating ioctx %p\n", ctx);
- return ctx;
+ dprintk("aio: error allocating ioctx %d\n", err);
+ return ERR_PTR(err);
}
-/* aio_cancel_all
+/* kill_ctx
* Cancels all outstanding aio requests on an aio context. Used
* when the processes owning a context have all exited to encourage
* the rapid destruction of the kioctx.
*/
-static void aio_cancel_all(struct kioctx *ctx)
+static void kill_ctx(struct kioctx *ctx)
{
int (*cancel)(struct kiocb *, struct io_event *);
+ struct task_struct *tsk = current;
+ DECLARE_WAITQUEUE(wait, tsk);
struct io_event res;
+
spin_lock_irq(&ctx->ctx_lock);
ctx->dead = 1;
while (!list_empty(&ctx->active_reqs)) {
@@ -352,15 +332,7 @@ static void aio_cancel_all(struct kioctx *ctx)
spin_lock_irq(&ctx->ctx_lock);
}
}
- spin_unlock_irq(&ctx->ctx_lock);
-}
-
-static void wait_for_all_aios(struct kioctx *ctx)
-{
- struct task_struct *tsk = current;
- DECLARE_WAITQUEUE(wait, tsk);
- spin_lock_irq(&ctx->ctx_lock);
if (!ctx->reqs_active)
goto out;
@@ -410,13 +382,7 @@ void exit_aio(struct mm_struct *mm)
ctx = hlist_entry(mm->ioctx_list.first, struct kioctx, list);
hlist_del_rcu(&ctx->list);
- aio_cancel_all(ctx);
-
- wait_for_all_aios(ctx);
- /*
- * Ensure we don't leave the ctx on the aio_wq
- */
- cancel_work_sync(&ctx->wq.work);
+ kill_ctx(ctx);
if (1 != atomic_read(&ctx->users))
printk(KERN_DEBUG
@@ -476,14 +442,23 @@ static void kiocb_batch_init(struct kiocb_batch *batch, long total)
batch->count = total;
}
-static void kiocb_batch_free(struct kiocb_batch *batch)
+static void kiocb_batch_free(struct kioctx *ctx, struct kiocb_batch *batch)
{
struct kiocb *req, *n;
+ if (list_empty(&batch->head))
+ return;
+
+ spin_lock_irq(&ctx->ctx_lock);
list_for_each_entry_safe(req, n, &batch->head, ki_batch) {
list_del(&req->ki_batch);
+ list_del(&req->ki_list);
kmem_cache_free(kiocb_cachep, req);
+ ctx->reqs_active--;
}
+ if (unlikely(!ctx->reqs_active && ctx->dead))
+ wake_up_all(&ctx->wait);
+ spin_unlock_irq(&ctx->ctx_lock);
}
/*
@@ -600,11 +575,16 @@ static void aio_fput_routine(struct work_struct *data)
fput(req->ki_filp);
/* Link the iocb into the context's free list */
+ rcu_read_lock();
spin_lock_irq(&ctx->ctx_lock);
really_put_req(ctx, req);
+ /*
+ * at that point ctx might've been killed, but actual
+ * freeing is RCU'd
+ */
spin_unlock_irq(&ctx->ctx_lock);
+ rcu_read_unlock();
- put_ioctx(ctx);
spin_lock_irq(&fput_lock);
}
spin_unlock_irq(&fput_lock);
@@ -635,7 +615,6 @@ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
* this function will be executed w/out any aio kthread wakeup.
*/
if (unlikely(!fput_atomic(req->ki_filp))) {
- get_ioctx(ctx);
spin_lock(&fput_lock);
list_add(&req->ki_list, &fput_head);
spin_unlock(&fput_lock);
@@ -913,7 +892,7 @@ static void aio_kick_handler(struct work_struct *work)
unuse_mm(mm);
set_fs(oldfs);
/*
- * we're in a worker thread already, don't use queue_delayed_work,
+ * we're in a worker thread already; no point using non-zero delay
*/
if (requeue)
queue_delayed_work(aio_wq, &ctx->wq, 0);
@@ -1012,10 +991,10 @@ int aio_complete(struct kiocb *iocb, long res, long res2)
if (kiocbIsCancelled(iocb))
goto put_rq;
- ring = kmap_atomic(info->ring_pages[0], KM_IRQ1);
+ ring = kmap_atomic(info->ring_pages[0]);
tail = info->tail;
- event = aio_ring_event(info, tail, KM_IRQ0);
+ event = aio_ring_event(info, tail);
if (++tail >= info->nr)
tail = 0;
@@ -1036,8 +1015,8 @@ int aio_complete(struct kiocb *iocb, long res, long res2)
info->tail = tail;
ring->tail = tail;
- put_aio_ring_event(event, KM_IRQ0);
- kunmap_atomic(ring, KM_IRQ1);
+ put_aio_ring_event(event);
+ kunmap_atomic(ring);
pr_debug("added to ring %p at [%lu]\n", iocb, tail);
@@ -1082,7 +1061,7 @@ static int aio_read_evt(struct kioctx *ioctx, struct io_event *ent)
unsigned long head;
int ret = 0;
- ring = kmap_atomic(info->ring_pages[0], KM_USER0);
+ ring = kmap_atomic(info->ring_pages[0]);
dprintk("in aio_read_evt h%lu t%lu m%lu\n",
(unsigned long)ring->head, (unsigned long)ring->tail,
(unsigned long)ring->nr);
@@ -1094,18 +1073,18 @@ static int aio_read_evt(struct kioctx *ioctx, struct io_event *ent)
head = ring->head % info->nr;
if (head != ring->tail) {
- struct io_event *evp = aio_ring_event(info, head, KM_USER1);
+ struct io_event *evp = aio_ring_event(info, head);
*ent = *evp;
head = (head + 1) % info->nr;
smp_mb(); /* finish reading the event before updatng the head */
ring->head = head;
ret = 1;
- put_aio_ring_event(evp, KM_USER1);
+ put_aio_ring_event(evp);
}
spin_unlock(&info->ring_lock);
out:
- kunmap_atomic(ring, KM_USER0);
+ kunmap_atomic(ring);
dprintk("leaving aio_read_evt: %d h%lu t%lu\n", ret,
(unsigned long)ring->head, (unsigned long)ring->tail);
return ret;
@@ -1283,8 +1262,7 @@ static void io_destroy(struct kioctx *ioctx)
if (likely(!was_dead))
put_ioctx(ioctx); /* twice for the list */
- aio_cancel_all(ioctx);
- wait_for_all_aios(ioctx);
+ kill_ctx(ioctx);
/*
* Wake up any waiters. The setting of ctx->dead must be seen
@@ -1292,7 +1270,6 @@ static void io_destroy(struct kioctx *ioctx)
* locking done by the above calls to ensure this consistency.
*/
wake_up_all(&ioctx->wait);
- put_ioctx(ioctx); /* once for the lookup */
}
/* sys_io_setup:
@@ -1329,11 +1306,9 @@ SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp)
ret = PTR_ERR(ioctx);
if (!IS_ERR(ioctx)) {
ret = put_user(ioctx->user_id, ctxp);
- if (!ret)
- return 0;
-
- get_ioctx(ioctx); /* io_destroy() expects us to hold a ref */
- io_destroy(ioctx);
+ if (ret)
+ io_destroy(ioctx);
+ put_ioctx(ioctx);
}
out:
@@ -1351,6 +1326,7 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)
struct kioctx *ioctx = lookup_ioctx(ctx);
if (likely(NULL != ioctx)) {
io_destroy(ioctx);
+ put_ioctx(ioctx);
return 0;
}
pr_debug("EINVAL: io_destroy: invalid context id\n");
@@ -1742,7 +1718,7 @@ long do_io_submit(aio_context_t ctx_id, long nr,
}
blk_finish_plug(&plug);
- kiocb_batch_free(&batch);
+ kiocb_batch_free(ctx, &batch);
put_ioctx(ctx);
return i ? i : ret;
}
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index f11e43ed907..28d39fb84ae 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -39,19 +39,6 @@ static const struct dentry_operations anon_inodefs_dentry_operations = {
.d_dname = anon_inodefs_dname,
};
-static struct dentry *anon_inodefs_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
-{
- return mount_pseudo(fs_type, "anon_inode:", NULL,
- &anon_inodefs_dentry_operations, ANON_INODE_FS_MAGIC);
-}
-
-static struct file_system_type anon_inode_fs_type = {
- .name = "anon_inodefs",
- .mount = anon_inodefs_mount,
- .kill_sb = kill_anon_super,
-};
-
/*
* nop .set_page_dirty method so that people can use .page_mkwrite on
* anon inodes.
@@ -65,6 +52,62 @@ static const struct address_space_operations anon_aops = {
.set_page_dirty = anon_set_page_dirty,
};
+/*
+ * A single inode exists for all anon_inode files. Contrary to pipes,
+ * anon_inode inodes have no associated per-instance data, so we need
+ * only allocate one of them.
+ */
+static struct inode *anon_inode_mkinode(struct super_block *s)
+{
+ struct inode *inode = new_inode_pseudo(s);
+
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+
+ inode->i_ino = get_next_ino();
+ inode->i_fop = &anon_inode_fops;
+
+ inode->i_mapping->a_ops = &anon_aops;
+
+ /*
+ * Mark the inode dirty from the very beginning,
+ * that way it will never be moved to the dirty
+ * list because mark_inode_dirty() will think
+ * that it already _is_ on the dirty list.
+ */
+ inode->i_state = I_DIRTY;
+ inode->i_mode = S_IRUSR | S_IWUSR;
+ inode->i_uid = current_fsuid();
+ inode->i_gid = current_fsgid();
+ inode->i_flags |= S_PRIVATE;
+ inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ return inode;
+}
+
+static struct dentry *anon_inodefs_mount(struct file_system_type *fs_type,
+ int flags, const char *dev_name, void *data)
+{
+ struct dentry *root;
+ root = mount_pseudo(fs_type, "anon_inode:", NULL,
+ &anon_inodefs_dentry_operations, ANON_INODE_FS_MAGIC);
+ if (!IS_ERR(root)) {
+ struct super_block *s = root->d_sb;
+ anon_inode_inode = anon_inode_mkinode(s);
+ if (IS_ERR(anon_inode_inode)) {
+ dput(root);
+ deactivate_locked_super(s);
+ root = ERR_CAST(anon_inode_inode);
+ }
+ }
+ return root;
+}
+
+static struct file_system_type anon_inode_fs_type = {
+ .name = "anon_inodefs",
+ .mount = anon_inodefs_mount,
+ .kill_sb = kill_anon_super,
+};
+
/**
* anon_inode_getfile - creates a new file instance by hooking it up to an
* anonymous inode, and a dentry that describe the "class"
@@ -180,38 +223,6 @@ err_put_unused_fd:
}
EXPORT_SYMBOL_GPL(anon_inode_getfd);
-/*
- * A single inode exists for all anon_inode files. Contrary to pipes,
- * anon_inode inodes have no associated per-instance data, so we need
- * only allocate one of them.
- */
-static struct inode *anon_inode_mkinode(void)
-{
- struct inode *inode = new_inode_pseudo(anon_inode_mnt->mnt_sb);
-
- if (!inode)
- return ERR_PTR(-ENOMEM);
-
- inode->i_ino = get_next_ino();
- inode->i_fop = &anon_inode_fops;
-
- inode->i_mapping->a_ops = &anon_aops;
-
- /*
- * Mark the inode dirty from the very beginning,
- * that way it will never be moved to the dirty
- * list because mark_inode_dirty() will think
- * that it already _is_ on the dirty list.
- */
- inode->i_state = I_DIRTY;
- inode->i_mode = S_IRUSR | S_IWUSR;
- inode->i_uid = current_fsuid();
- inode->i_gid = current_fsgid();
- inode->i_flags |= S_PRIVATE;
- inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
- return inode;
-}
-
static int __init anon_inode_init(void)
{
int error;
@@ -224,16 +235,8 @@ static int __init anon_inode_init(void)
error = PTR_ERR(anon_inode_mnt);
goto err_unregister_filesystem;
}
- anon_inode_inode = anon_inode_mkinode();
- if (IS_ERR(anon_inode_inode)) {
- error = PTR_ERR(anon_inode_inode);
- goto err_mntput;
- }
-
return 0;
-err_mntput:
- kern_unmount(anon_inode_mnt);
err_unregister_filesystem:
unregister_filesystem(&anon_inode_fs_type);
err_exit:
diff --git a/fs/attr.c b/fs/attr.c
index 95053ad8abc..73f69a6ce9e 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -5,7 +5,7 @@
* changes by Thomas Schoebel-Theuer
*/
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/time.h>
#include <linux/mm.h>
#include <linux/string.h>
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index d8d8e7ba6a1..eb1cc92cd67 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -110,6 +110,7 @@ struct autofs_sb_info {
int sub_version;
int min_proto;
int max_proto;
+ int compat_daemon;
unsigned long exp_timeout;
unsigned int type;
int reghost_enabled;
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 76741d8d778..9dacb858670 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -230,7 +230,7 @@ static void autofs_dev_ioctl_fd_install(unsigned int fd, struct file *file)
fdt = files_fdtable(files);
BUG_ON(fdt->fd[fd] != NULL);
rcu_assign_pointer(fdt->fd[fd], file);
- FD_SET(fd, fdt->close_on_exec);
+ __set_close_on_exec(fd, fdt);
spin_unlock(&files->file_lock);
}
@@ -385,6 +385,7 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp,
sbi->pipefd = pipefd;
sbi->pipe = pipe;
sbi->catatonic = 0;
+ sbi->compat_daemon = is_compat_task();
}
out:
mutex_unlock(&sbi->wq_mutex);
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 450f529a4ea..1feb68ecef9 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -124,6 +124,7 @@ start:
/* Negative dentry - try next */
if (!simple_positive(q)) {
spin_unlock(&p->d_lock);
+ lock_set_subclass(&q->d_lock.dep_map, 0, _RET_IP_);
p = q;
goto again;
}
@@ -186,6 +187,7 @@ again:
/* Negative dentry - try next */
if (!simple_positive(ret)) {
spin_unlock(&p->d_lock);
+ lock_set_subclass(&ret->d_lock.dep_map, 0, _RET_IP_);
p = ret;
goto again;
}
diff --git a/fs/autofs4/init.c b/fs/autofs4/init.c
index c038727b405..cddc74b9cdb 100644
--- a/fs/autofs4/init.c
+++ b/fs/autofs4/init.c
@@ -31,11 +31,11 @@ static int __init init_autofs4_fs(void)
{
int err;
+ autofs_dev_ioctl_init();
+
err = register_filesystem(&autofs_fs_type);
if (err)
- return err;
-
- autofs_dev_ioctl_init();
+ autofs_dev_ioctl_exit();
return err;
}
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index e16980b00b8..d8dc002e9cc 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -19,6 +19,7 @@
#include <linux/parser.h>
#include <linux/bitops.h>
#include <linux/magic.h>
+#include <linux/compat.h>
#include "autofs_i.h"
#include <linux/module.h>
@@ -224,6 +225,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
set_autofs_type_indirect(&sbi->type);
sbi->min_proto = 0;
sbi->max_proto = 0;
+ sbi->compat_daemon = is_compat_task();
mutex_init(&sbi->wq_mutex);
mutex_init(&sbi->pipe_mutex);
spin_lock_init(&sbi->fs_lock);
@@ -245,12 +247,9 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
if (!ino)
goto fail_free;
root_inode = autofs4_get_inode(s, S_IFDIR | 0755);
- if (!root_inode)
- goto fail_ino;
-
- root = d_alloc_root(root_inode);
+ root = d_make_root(root_inode);
if (!root)
- goto fail_iput;
+ goto fail_ino;
pipe = NULL;
root->d_fsdata = ino;
@@ -315,9 +314,6 @@ fail_fput:
fail_dput:
dput(root);
goto fail_free;
-fail_iput:
- printk("autofs: get root dentry failed\n");
- iput(root_inode);
fail_ino:
kfree(ino);
fail_free:
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 9ef5b291440..9c098db4334 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -76,7 +76,7 @@ static int autofs4_write(struct autofs_sb_info *sbi,
data += wr;
bytes -= wr;
}
- mutex_lock(&sbi->pipe_mutex);
+ mutex_unlock(&sbi->pipe_mutex);
set_fs(fs);
@@ -91,7 +91,24 @@ static int autofs4_write(struct autofs_sb_info *sbi,
return (bytes > 0);
}
-
+
+/*
+ * The autofs_v5 packet was misdesigned.
+ *
+ * The packets are identical on x86-32 and x86-64, but have different
+ * alignment. Which means that 'sizeof()' will give different results.
+ * Fix it up for the case of running 32-bit user mode on a 64-bit kernel.
+ */
+static noinline size_t autofs_v5_packet_size(struct autofs_sb_info *sbi)
+{
+ size_t pktsz = sizeof(struct autofs_v5_packet);
+#if defined(CONFIG_X86_64) && defined(CONFIG_COMPAT)
+ if (sbi->compat_daemon > 0)
+ pktsz -= 4;
+#endif
+ return pktsz;
+}
+
static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
struct autofs_wait_queue *wq,
int type)
@@ -155,8 +172,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
{
struct autofs_v5_packet *packet = &pkt.v5_pkt.v5_packet;
- pktsz = sizeof(*packet);
-
+ pktsz = autofs_v5_packet_size(sbi);
packet->wait_queue_token = wq->wait_queue_token;
packet->len = wq->name.len;
memcpy(packet->name, wq->name.name, wq->name.len);
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index 22e9a78872f..37268c5bb98 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -9,7 +9,7 @@
*/
#include <linux/fs.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/stat.h>
#include <linux/time.h>
#include <linux/namei.h>
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 6e6d536767f..e18da23d42b 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -852,9 +852,8 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
ret = PTR_ERR(root);
goto unacquire_priv_sbp;
}
- sb->s_root = d_alloc_root(root);
+ sb->s_root = d_make_root(root);
if (!sb->s_root) {
- iput(root);
befs_error(sb, "get root inode failed");
goto unacquire_priv_sbp;
}
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index b0391bc402b..e23dc7c8b88 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -367,9 +367,8 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
ret = PTR_ERR(inode);
goto out2;
}
- s->s_root = d_alloc_root(inode);
+ s->s_root = d_make_root(inode);
if (!s->s_root) {
- iput(inode);
ret = -ENOMEM;
goto out2;
}
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index a6395bdb26a..2eb12f13593 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -26,7 +26,6 @@
#include <linux/coredump.h>
#include <linux/slab.h>
-#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/cacheflush.h>
#include <asm/a.out-core.h>
@@ -259,8 +258,14 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
current->mm->free_area_cache = current->mm->mmap_base;
current->mm->cached_hole_size = 0;
+ retval = setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT);
+ if (retval < 0) {
+ /* Someone check-me: is this error path enough? */
+ send_sig(SIGKILL, current, 0);
+ return retval;
+ }
+
install_exec_creds(bprm);
- current->flags &= ~PF_FORKNOEXEC;
if (N_MAGIC(ex) == OMAGIC) {
unsigned long text_addr, map_size;
@@ -352,13 +357,6 @@ beyond_if:
return retval;
}
- retval = setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT);
- if (retval < 0) {
- /* Someone check-me: is this error path enough? */
- send_sig(SIGKILL, current, 0);
- return retval;
- }
-
current->mm->start_stack =
(unsigned long) create_aout_tables((char __user *) bprm->p, bprm);
#ifdef __alpha__
@@ -454,7 +452,8 @@ out:
static int __init init_aout_binfmt(void)
{
- return register_binfmt(&aout_format);
+ register_binfmt(&aout_format);
+ return 0;
}
static void __exit exit_aout_binfmt(void)
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index bcb884e2d61..48ffb3dc610 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -35,6 +35,7 @@
#include <asm/uaccess.h>
#include <asm/param.h>
#include <asm/page.h>
+#include <asm/exec.h>
static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs);
static int load_elf_library(struct file *);
@@ -712,7 +713,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
goto out_free_dentry;
/* OK, This is the point of no return */
- current->flags &= ~PF_FORKNOEXEC;
current->mm->def_flags = def_flags;
/* Do this immediately, since STACK_TOP as used in setup_arg_pages
@@ -934,7 +934,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
#endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */
install_exec_creds(bprm);
- current->flags &= ~PF_FORKNOEXEC;
retval = create_elf_tables(bprm, &loc->elf_ex,
load_addr, interp_load_addr);
if (retval < 0) {
@@ -1095,6 +1094,29 @@ out:
*/
/*
+ * The purpose of always_dump_vma() is to make sure that special kernel mappings
+ * that are useful for post-mortem analysis are included in every core dump.
+ * In that way we ensure that the core dump is fully interpretable later
+ * without matching up the same kernel and hardware config to see what PC values
+ * meant. These special mappings include - vDSO, vsyscall, and other
+ * architecture specific mappings
+ */
+static bool always_dump_vma(struct vm_area_struct *vma)
+{
+ /* Any vsyscall mappings? */
+ if (vma == get_gate_vma(vma->vm_mm))
+ return true;
+ /*
+ * arch_vma_name() returns non-NULL for special architecture mappings,
+ * such as vDSO sections.
+ */
+ if (arch_vma_name(vma))
+ return true;
+
+ return false;
+}
+
+/*
* Decide what to dump of a segment, part, all or none.
*/
static unsigned long vma_dump_size(struct vm_area_struct *vma,
@@ -1102,10 +1124,13 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma,
{
#define FILTER(type) (mm_flags & (1UL << MMF_DUMP_##type))
- /* The vma can be set up to tell us the answer directly. */
- if (vma->vm_flags & VM_ALWAYSDUMP)
+ /* always dump the vdso and vsyscall sections */
+ if (always_dump_vma(vma))
goto whole;
+ if (vma->vm_flags & VM_NODUMP)
+ return 0;
+
/* Hugetlb memory check */
if (vma->vm_flags & VM_HUGETLB) {
if ((vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_SHARED))
@@ -1390,6 +1415,22 @@ static void do_thread_regset_writeback(struct task_struct *task,
regset->writeback(task, regset, 1);
}
+#ifndef PR_REG_SIZE
+#define PR_REG_SIZE(S) sizeof(S)
+#endif
+
+#ifndef PRSTATUS_SIZE
+#define PRSTATUS_SIZE(S) sizeof(S)
+#endif
+
+#ifndef PR_REG_PTR
+#define PR_REG_PTR(S) (&((S)->pr_reg))
+#endif
+
+#ifndef SET_PR_FPVALID
+#define SET_PR_FPVALID(S, V) ((S)->pr_fpvalid = (V))
+#endif
+
static int fill_thread_core_info(struct elf_thread_core_info *t,
const struct user_regset_view *view,
long signr, size_t *total)
@@ -1404,11 +1445,11 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
*/
fill_prstatus(&t->prstatus, t->task, signr);
(void) view->regsets[0].get(t->task, &view->regsets[0],
- 0, sizeof(t->prstatus.pr_reg),
- &t->prstatus.pr_reg, NULL);
+ 0, PR_REG_SIZE(t->prstatus.pr_reg),
+ PR_REG_PTR(&t->prstatus), NULL);
fill_note(&t->notes[0], "CORE", NT_PRSTATUS,
- sizeof(t->prstatus), &t->prstatus);
+ PRSTATUS_SIZE(t->prstatus), &t->prstatus);
*total += notesize(&t->notes[0]);
do_thread_regset_writeback(t->task, &view->regsets[0]);
@@ -1421,7 +1462,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
for (i = 1; i < view->n; ++i) {
const struct user_regset *regset = &view->regsets[i];
do_thread_regset_writeback(t->task, regset);
- if (regset->core_note_type &&
+ if (regset->core_note_type && regset->get &&
(!regset->active || regset->active(t->task, regset))) {
int ret;
size_t size = regset->n * regset->size;
@@ -1438,7 +1479,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
regset->core_note_type,
size, data);
else {
- t->prstatus.pr_fpvalid = 1;
+ SET_PR_FPVALID(&t->prstatus, 1);
fill_note(&t->notes[i], "CORE",
NT_PRFPREG, size, data);
}
@@ -2077,7 +2118,8 @@ out:
static int __init init_elf_binfmt(void)
{
- return register_binfmt(&elf_format);
+ register_binfmt(&elf_format);
+ return 0;
}
static void __exit exit_elf_binfmt(void)
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 30745f459fa..9bd5612a822 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -39,6 +39,7 @@
#include <asm/uaccess.h>
#include <asm/param.h>
#include <asm/pgalloc.h>
+#include <asm/exec.h>
typedef char *elf_caddr_t;
@@ -91,7 +92,8 @@ static struct linux_binfmt elf_fdpic_format = {
static int __init init_elf_fdpic_binfmt(void)
{
- return register_binfmt(&elf_fdpic_format);
+ register_binfmt(&elf_fdpic_format);
+ return 0;
}
static void __exit exit_elf_fdpic_binfmt(void)
@@ -334,8 +336,6 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm,
current->mm->context.exec_fdpic_loadmap = 0;
current->mm->context.interp_fdpic_loadmap = 0;
- current->flags &= ~PF_FORKNOEXEC;
-
#ifdef CONFIG_MMU
elf_fdpic_arch_lay_out_mm(&exec_params,
&interp_params,
@@ -413,7 +413,6 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm,
#endif
install_exec_creds(bprm);
- current->flags &= ~PF_FORKNOEXEC;
if (create_elf_fdpic_tables(bprm, current->mm,
&exec_params, &interp_params) < 0)
goto error_kill;
diff --git a/fs/binfmt_em86.c b/fs/binfmt_em86.c
index b8e8b0acf9b..2790c7e1912 100644
--- a/fs/binfmt_em86.c
+++ b/fs/binfmt_em86.c
@@ -100,7 +100,8 @@ static struct linux_binfmt em86_format = {
static int __init init_em86_binfmt(void)
{
- return register_binfmt(&em86_format);
+ register_binfmt(&em86_format);
+ return 0;
}
static void __exit exit_em86_binfmt(void)
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 1bffbe0ed77..024d20ee3ca 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -15,7 +15,7 @@
* JAN/99 -- coded full program relocation (gerg@snapgear.com)
*/
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/mm.h>
@@ -37,7 +37,6 @@
#include <linux/syscalls.h>
#include <asm/byteorder.h>
-#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/unaligned.h>
#include <asm/cacheflush.h>
@@ -902,7 +901,6 @@ static int load_flat_binary(struct linux_binprm * bprm, struct pt_regs * regs)
libinfo.lib_list[j].start_data:UNLOADED_LIB;
install_exec_creds(bprm);
- current->flags &= ~PF_FORKNOEXEC;
set_binfmt(&flat_format);
@@ -950,7 +948,8 @@ static int load_flat_binary(struct linux_binprm * bprm, struct pt_regs * regs)
static int __init init_flat_binfmt(void)
{
- return register_binfmt(&flat_format);
+ register_binfmt(&flat_format);
+ return 0;
}
/****************************************************************************/
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index a9198dfd5f8..613aa061823 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -19,6 +19,7 @@
#include <linux/module.h>
#include <linux/init.h>
#include <linux/sched.h>
+#include <linux/magic.h>
#include <linux/binfmts.h>
#include <linux/slab.h>
#include <linux/ctype.h>
@@ -699,7 +700,7 @@ static int bm_fill_super(struct super_block * sb, void * data, int silent)
[3] = {"register", &bm_register_operations, S_IWUSR},
/* last one */ {""}
};
- int err = simple_fill_super(sb, 0x42494e4d, bm_files);
+ int err = simple_fill_super(sb, BINFMTFS_MAGIC, bm_files);
if (!err)
sb->s_op = &s_ops;
return err;
@@ -726,11 +727,8 @@ static struct file_system_type bm_fs_type = {
static int __init init_misc_binfmt(void)
{
int err = register_filesystem(&bm_fs_type);
- if (!err) {
- err = insert_binfmt(&misc_format);
- if (err)
- unregister_filesystem(&bm_fs_type);
- }
+ if (!err)
+ insert_binfmt(&misc_format);
return err;
}
diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c
index 396a9884591..d3b8c1f6315 100644
--- a/fs/binfmt_script.c
+++ b/fs/binfmt_script.c
@@ -105,7 +105,8 @@ static struct linux_binfmt script_format = {
static int __init init_script_binfmt(void)
{
- return register_binfmt(&script_format);
+ register_binfmt(&script_format);
+ return 0;
}
static void __exit exit_script_binfmt(void)
diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c
index cc8560f6c9b..e4fc746629a 100644
--- a/fs/binfmt_som.c
+++ b/fs/binfmt_som.c
@@ -225,7 +225,6 @@ load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs)
goto out_free;
/* OK, This is the point of no return */
- current->flags &= ~PF_FORKNOEXEC;
current->personality = PER_HPUX;
setup_new_exec(bprm);
@@ -289,7 +288,8 @@ static int load_som_library(struct file *f)
static int __init init_som_binfmt(void)
{
- return register_binfmt(&som_format);
+ register_binfmt(&som_format);
+ return 0;
}
static void __exit exit_som_binfmt(void)
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index c2183f3917c..e85c04b9f61 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -357,7 +357,7 @@ static void bio_integrity_generate(struct bio *bio)
bix.sector_size = bi->sector_size;
bio_for_each_segment(bv, bio, i) {
- void *kaddr = kmap_atomic(bv->bv_page, KM_USER0);
+ void *kaddr = kmap_atomic(bv->bv_page);
bix.data_buf = kaddr + bv->bv_offset;
bix.data_size = bv->bv_len;
bix.prot_buf = prot_buf;
@@ -371,7 +371,7 @@ static void bio_integrity_generate(struct bio *bio)
total += sectors * bi->tuple_size;
BUG_ON(total > bio->bi_integrity->bip_size);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
}
}
@@ -498,7 +498,7 @@ static int bio_integrity_verify(struct bio *bio)
bix.sector_size = bi->sector_size;
bio_for_each_segment(bv, bio, i) {
- void *kaddr = kmap_atomic(bv->bv_page, KM_USER0);
+ void *kaddr = kmap_atomic(bv->bv_page);
bix.data_buf = kaddr + bv->bv_offset;
bix.data_size = bv->bv_len;
bix.prot_buf = prot_buf;
@@ -507,7 +507,7 @@ static int bio_integrity_verify(struct bio *bio)
ret = bi->verify_fn(&bix);
if (ret) {
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
return ret;
}
@@ -517,7 +517,7 @@ static int bio_integrity_verify(struct bio *bio)
total += sectors * bi->tuple_size;
BUG_ON(total > bio->bi_integrity->bip_size);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
}
return ret;
diff --git a/fs/bio.c b/fs/bio.c
index b1fe82cf88c..e453924036e 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -22,7 +22,7 @@
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/kernel.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/mempool.h>
#include <linux/workqueue.h>
#include <scsi/sg.h> /* for struct sg_iovec */
@@ -505,13 +505,9 @@ EXPORT_SYMBOL(bio_clone);
int bio_get_nr_vecs(struct block_device *bdev)
{
struct request_queue *q = bdev_get_queue(bdev);
- int nr_pages;
-
- nr_pages = ((queue_max_sectors(q) << 9) + PAGE_SIZE - 1) >> PAGE_SHIFT;
- if (nr_pages > queue_max_segments(q))
- nr_pages = queue_max_segments(q);
-
- return nr_pages;
+ return min_t(unsigned,
+ queue_max_segments(q),
+ queue_max_sectors(q) / (PAGE_SIZE >> 9) + 1);
}
EXPORT_SYMBOL(bio_get_nr_vecs);
diff --git a/fs/block_dev.c b/fs/block_dev.c
index afe74dda632..e08f6a20a5b 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -16,6 +16,7 @@
#include <linux/blkdev.h>
#include <linux/module.h>
#include <linux/blkpg.h>
+#include <linux/magic.h>
#include <linux/buffer_head.h>
#include <linux/swap.h>
#include <linux/pagevec.h>
@@ -109,7 +110,7 @@ void invalidate_bdev(struct block_device *bdev)
/* 99% of the time, we don't need to flush the cleancache on the bdev.
* But, for the strange corners, lets be cautious
*/
- cleancache_flush_inode(mapping);
+ cleancache_invalidate_inode(mapping);
}
EXPORT_SYMBOL(invalidate_bdev);
@@ -506,7 +507,7 @@ static const struct super_operations bdev_sops = {
static struct dentry *bd_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
{
- return mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, 0x62646576);
+ return mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, BDEVFS_MAGIC);
}
static struct file_system_type bd_type = {
@@ -1139,6 +1140,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
mutex_lock_nested(&bdev->bd_mutex, for_part);
if (!bdev->bd_openers) {
bdev->bd_disk = disk;
+ bdev->bd_queue = disk->queue;
bdev->bd_contains = bdev;
if (!partno) {
struct backing_dev_info *bdi;
@@ -1159,6 +1161,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
disk_put_part(bdev->bd_part);
bdev->bd_part = NULL;
bdev->bd_disk = NULL;
+ bdev->bd_queue = NULL;
mutex_unlock(&bdev->bd_mutex);
disk_unblock_events(disk);
put_disk(disk);
@@ -1181,8 +1184,12 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
* The latter is necessary to prevent ghost
* partitions on a removed medium.
*/
- if (bdev->bd_invalidated && (!ret || ret == -ENOMEDIUM))
- rescan_partitions(disk, bdev);
+ if (bdev->bd_invalidated) {
+ if (!ret)
+ rescan_partitions(disk, bdev);
+ else if (ret == -ENOMEDIUM)
+ invalidate_partitions(disk, bdev);
+ }
if (ret)
goto out_clear;
} else {
@@ -1212,8 +1219,12 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
if (bdev->bd_disk->fops->open)
ret = bdev->bd_disk->fops->open(bdev, mode);
/* the same as first opener case, read comment there */
- if (bdev->bd_invalidated && (!ret || ret == -ENOMEDIUM))
- rescan_partitions(bdev->bd_disk, bdev);
+ if (bdev->bd_invalidated) {
+ if (!ret)
+ rescan_partitions(bdev->bd_disk, bdev);
+ else if (ret == -ENOMEDIUM)
+ invalidate_partitions(bdev->bd_disk, bdev);
+ }
if (ret)
goto out_unlock_bdev;
}
@@ -1232,6 +1243,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
disk_put_part(bdev->bd_part);
bdev->bd_disk = NULL;
bdev->bd_part = NULL;
+ bdev->bd_queue = NULL;
bdev_inode_switch_bdi(bdev->bd_inode, &default_backing_dev_info);
if (bdev != bdev->bd_contains)
__blkdev_put(bdev->bd_contains, mode, 1);
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index ecb9fd3be14..d33f01c08b6 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -31,3 +31,22 @@ config BTRFS_FS_POSIX_ACL
Linux website <http://acl.bestbits.at/>.
If you don't know what Access Control Lists are, say N
+
+config BTRFS_FS_CHECK_INTEGRITY
+ bool "Btrfs with integrity check tool compiled in (DANGEROUS)"
+ depends on BTRFS_FS
+ help
+ Adds code that examines all block write requests (including
+ writes of the super block). The goal is to verify that the
+ state of the filesystem on disk is always consistent, i.e.,
+ after a power-loss or kernel panic event the filesystem is
+ in a consistent state.
+
+ If the integrity check tool is included and activated in
+ the mount options, plenty of kernel memory is used, and
+ plenty of additional CPU cycles are spent. Enabling this
+ functionality is not intended for normal use.
+
+ In most cases, unless you are a btrfs developer who needs
+ to verify the integrity of (super)-block write requests
+ during the run of a regression test, say N
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index c0ddfd29c5e..0c4fa2befae 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -8,6 +8,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
export.o tree-log.o free-space-cache.o zlib.o lzo.o \
compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
- reada.o backref.o
+ reada.o backref.o ulist.o
btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
+btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 0cc20b35c1c..42704149b72 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -171,11 +171,11 @@ out:
spin_unlock_irqrestore(&workers->lock, flags);
}
-static noinline int run_ordered_completions(struct btrfs_workers *workers,
+static noinline void run_ordered_completions(struct btrfs_workers *workers,
struct btrfs_work *work)
{
if (!workers->ordered)
- return 0;
+ return;
set_bit(WORK_DONE_BIT, &work->flags);
@@ -213,7 +213,6 @@ static noinline int run_ordered_completions(struct btrfs_workers *workers,
}
spin_unlock(&workers->order_lock);
- return 0;
}
static void put_worker(struct btrfs_worker_thread *worker)
@@ -399,7 +398,7 @@ again:
/*
* this will wait for all the worker threads to shutdown
*/
-int btrfs_stop_workers(struct btrfs_workers *workers)
+void btrfs_stop_workers(struct btrfs_workers *workers)
{
struct list_head *cur;
struct btrfs_worker_thread *worker;
@@ -427,7 +426,6 @@ int btrfs_stop_workers(struct btrfs_workers *workers)
put_worker(worker);
}
spin_unlock_irq(&workers->lock);
- return 0;
}
/*
@@ -615,14 +613,14 @@ found:
* it was taken from. It is intended for use with long running work functions
* that make some progress and want to give the cpu up for others.
*/
-int btrfs_requeue_work(struct btrfs_work *work)
+void btrfs_requeue_work(struct btrfs_work *work)
{
struct btrfs_worker_thread *worker = work->worker;
unsigned long flags;
int wake = 0;
if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
- goto out;
+ return;
spin_lock_irqsave(&worker->lock, flags);
if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags))
@@ -649,9 +647,6 @@ int btrfs_requeue_work(struct btrfs_work *work)
if (wake)
wake_up_process(worker->task);
spin_unlock_irqrestore(&worker->lock, flags);
-out:
-
- return 0;
}
void btrfs_set_work_high_prio(struct btrfs_work *work)
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index f34cc31fa3c..063698b90ce 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -111,9 +111,9 @@ struct btrfs_workers {
void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work);
int btrfs_start_workers(struct btrfs_workers *workers);
-int btrfs_stop_workers(struct btrfs_workers *workers);
+void btrfs_stop_workers(struct btrfs_workers *workers);
void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
struct btrfs_workers *async_starter);
-int btrfs_requeue_work(struct btrfs_work *work);
+void btrfs_requeue_work(struct btrfs_work *work);
void btrfs_set_work_high_prio(struct btrfs_work *work);
#endif
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 22c64fff1bd..f4e90748940 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -19,18 +19,803 @@
#include "ctree.h"
#include "disk-io.h"
#include "backref.h"
+#include "ulist.h"
+#include "transaction.h"
+#include "delayed-ref.h"
-struct __data_ref {
+/*
+ * this structure records all encountered refs on the way up to the root
+ */
+struct __prelim_ref {
struct list_head list;
- u64 inum;
- u64 root;
- u64 extent_data_item_offset;
+ u64 root_id;
+ struct btrfs_key key;
+ int level;
+ int count;
+ u64 parent;
+ u64 wanted_disk_byte;
};
-struct __shared_ref {
- struct list_head list;
+static int __add_prelim_ref(struct list_head *head, u64 root_id,
+ struct btrfs_key *key, int level, u64 parent,
+ u64 wanted_disk_byte, int count)
+{
+ struct __prelim_ref *ref;
+
+ /* in case we're adding delayed refs, we're holding the refs spinlock */
+ ref = kmalloc(sizeof(*ref), GFP_ATOMIC);
+ if (!ref)
+ return -ENOMEM;
+
+ ref->root_id = root_id;
+ if (key)
+ ref->key = *key;
+ else
+ memset(&ref->key, 0, sizeof(ref->key));
+
+ ref->level = level;
+ ref->count = count;
+ ref->parent = parent;
+ ref->wanted_disk_byte = wanted_disk_byte;
+ list_add_tail(&ref->list, head);
+
+ return 0;
+}
+
+static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
+ struct ulist *parents,
+ struct extent_buffer *eb, int level,
+ u64 wanted_objectid, u64 wanted_disk_byte)
+{
+ int ret;
+ int slot;
+ struct btrfs_file_extent_item *fi;
+ struct btrfs_key key;
u64 disk_byte;
-};
+
+add_parent:
+ ret = ulist_add(parents, eb->start, 0, GFP_NOFS);
+ if (ret < 0)
+ return ret;
+
+ if (level != 0)
+ return 0;
+
+ /*
+ * if the current leaf is full with EXTENT_DATA items, we must
+ * check the next one if that holds a reference as well.
+ * ref->count cannot be used to skip this check.
+ * repeat this until we don't find any additional EXTENT_DATA items.
+ */
+ while (1) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ return ret;
+ if (ret)
+ return 0;
+
+ eb = path->nodes[0];
+ for (slot = 0; slot < btrfs_header_nritems(eb); ++slot) {
+ btrfs_item_key_to_cpu(eb, &key, slot);
+ if (key.objectid != wanted_objectid ||
+ key.type != BTRFS_EXTENT_DATA_KEY)
+ return 0;
+ fi = btrfs_item_ptr(eb, slot,
+ struct btrfs_file_extent_item);
+ disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
+ if (disk_byte == wanted_disk_byte)
+ goto add_parent;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * resolve an indirect backref in the form (root_id, key, level)
+ * to a logical address
+ */
+static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
+ int search_commit_root,
+ struct __prelim_ref *ref,
+ struct ulist *parents)
+{
+ struct btrfs_path *path;
+ struct btrfs_root *root;
+ struct btrfs_key root_key;
+ struct btrfs_key key = {0};
+ struct extent_buffer *eb;
+ int ret = 0;
+ int root_level;
+ int level = ref->level;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ path->search_commit_root = !!search_commit_root;
+
+ root_key.objectid = ref->root_id;
+ root_key.type = BTRFS_ROOT_ITEM_KEY;
+ root_key.offset = (u64)-1;
+ root = btrfs_read_fs_root_no_name(fs_info, &root_key);
+ if (IS_ERR(root)) {
+ ret = PTR_ERR(root);
+ goto out;
+ }
+
+ rcu_read_lock();
+ root_level = btrfs_header_level(root->node);
+ rcu_read_unlock();
+
+ if (root_level + 1 == level)
+ goto out;
+
+ path->lowest_level = level;
+ ret = btrfs_search_slot(NULL, root, &ref->key, path, 0, 0);
+ pr_debug("search slot in root %llu (level %d, ref count %d) returned "
+ "%d for key (%llu %u %llu)\n",
+ (unsigned long long)ref->root_id, level, ref->count, ret,
+ (unsigned long long)ref->key.objectid, ref->key.type,
+ (unsigned long long)ref->key.offset);
+ if (ret < 0)
+ goto out;
+
+ eb = path->nodes[level];
+ if (!eb) {
+ WARN_ON(1);
+ ret = 1;
+ goto out;
+ }
+
+ if (level == 0) {
+ if (ret == 1 && path->slots[0] >= btrfs_header_nritems(eb)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret)
+ goto out;
+ eb = path->nodes[0];
+ }
+
+ btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
+ }
+
+ /* the last two parameters will only be used for level == 0 */
+ ret = add_all_parents(root, path, parents, eb, level, key.objectid,
+ ref->wanted_disk_byte);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * resolve all indirect backrefs from the list
+ */
+static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
+ int search_commit_root,
+ struct list_head *head)
+{
+ int err;
+ int ret = 0;
+ struct __prelim_ref *ref;
+ struct __prelim_ref *ref_safe;
+ struct __prelim_ref *new_ref;
+ struct ulist *parents;
+ struct ulist_node *node;
+
+ parents = ulist_alloc(GFP_NOFS);
+ if (!parents)
+ return -ENOMEM;
+
+ /*
+ * _safe allows us to insert directly after the current item without
+ * iterating over the newly inserted items.
+ * we're also allowed to re-assign ref during iteration.
+ */
+ list_for_each_entry_safe(ref, ref_safe, head, list) {
+ if (ref->parent) /* already direct */
+ continue;
+ if (ref->count == 0)
+ continue;
+ err = __resolve_indirect_ref(fs_info, search_commit_root,
+ ref, parents);
+ if (err) {
+ if (ret == 0)
+ ret = err;
+ continue;
+ }
+
+ /* we put the first parent into the ref at hand */
+ node = ulist_next(parents, NULL);
+ ref->parent = node ? node->val : 0;
+
+ /* additional parents require new refs being added here */
+ while ((node = ulist_next(parents, node))) {
+ new_ref = kmalloc(sizeof(*new_ref), GFP_NOFS);
+ if (!new_ref) {
+ ret = -ENOMEM;
+ break;
+ }
+ memcpy(new_ref, ref, sizeof(*ref));
+ new_ref->parent = node->val;
+ list_add(&new_ref->list, &ref->list);
+ }
+ ulist_reinit(parents);
+ }
+
+ ulist_free(parents);
+ return ret;
+}
+
+/*
+ * merge two lists of backrefs and adjust counts accordingly
+ *
+ * mode = 1: merge identical keys, if key is set
+ * mode = 2: merge identical parents
+ */
+static int __merge_refs(struct list_head *head, int mode)
+{
+ struct list_head *pos1;
+
+ list_for_each(pos1, head) {
+ struct list_head *n2;
+ struct list_head *pos2;
+ struct __prelim_ref *ref1;
+
+ ref1 = list_entry(pos1, struct __prelim_ref, list);
+
+ if (mode == 1 && ref1->key.type == 0)
+ continue;
+ for (pos2 = pos1->next, n2 = pos2->next; pos2 != head;
+ pos2 = n2, n2 = pos2->next) {
+ struct __prelim_ref *ref2;
+
+ ref2 = list_entry(pos2, struct __prelim_ref, list);
+
+ if (mode == 1) {
+ if (memcmp(&ref1->key, &ref2->key,
+ sizeof(ref1->key)) ||
+ ref1->level != ref2->level ||
+ ref1->root_id != ref2->root_id)
+ continue;
+ ref1->count += ref2->count;
+ } else {
+ if (ref1->parent != ref2->parent)
+ continue;
+ ref1->count += ref2->count;
+ }
+ list_del(&ref2->list);
+ kfree(ref2);
+ }
+
+ }
+ return 0;
+}
+
+/*
+ * add all currently queued delayed refs from this head whose seq nr is
+ * smaller or equal that seq to the list
+ */
+static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
+ struct btrfs_key *info_key,
+ struct list_head *prefs)
+{
+ struct btrfs_delayed_extent_op *extent_op = head->extent_op;
+ struct rb_node *n = &head->node.rb_node;
+ int sgn;
+ int ret = 0;
+
+ if (extent_op && extent_op->update_key)
+ btrfs_disk_key_to_cpu(info_key, &extent_op->key);
+
+ while ((n = rb_prev(n))) {
+ struct btrfs_delayed_ref_node *node;
+ node = rb_entry(n, struct btrfs_delayed_ref_node,
+ rb_node);
+ if (node->bytenr != head->node.bytenr)
+ break;
+ WARN_ON(node->is_head);
+
+ if (node->seq > seq)
+ continue;
+
+ switch (node->action) {
+ case BTRFS_ADD_DELAYED_EXTENT:
+ case BTRFS_UPDATE_DELAYED_HEAD:
+ WARN_ON(1);
+ continue;
+ case BTRFS_ADD_DELAYED_REF:
+ sgn = 1;
+ break;
+ case BTRFS_DROP_DELAYED_REF:
+ sgn = -1;
+ break;
+ default:
+ BUG_ON(1);
+ }
+ switch (node->type) {
+ case BTRFS_TREE_BLOCK_REF_KEY: {
+ struct btrfs_delayed_tree_ref *ref;
+
+ ref = btrfs_delayed_node_to_tree_ref(node);
+ ret = __add_prelim_ref(prefs, ref->root, info_key,
+ ref->level + 1, 0, node->bytenr,
+ node->ref_mod * sgn);
+ break;
+ }
+ case BTRFS_SHARED_BLOCK_REF_KEY: {
+ struct btrfs_delayed_tree_ref *ref;
+
+ ref = btrfs_delayed_node_to_tree_ref(node);
+ ret = __add_prelim_ref(prefs, ref->root, info_key,
+ ref->level + 1, ref->parent,
+ node->bytenr,
+ node->ref_mod * sgn);
+ break;
+ }
+ case BTRFS_EXTENT_DATA_REF_KEY: {
+ struct btrfs_delayed_data_ref *ref;
+ struct btrfs_key key;
+
+ ref = btrfs_delayed_node_to_data_ref(node);
+
+ key.objectid = ref->objectid;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = ref->offset;
+ ret = __add_prelim_ref(prefs, ref->root, &key, 0, 0,
+ node->bytenr,
+ node->ref_mod * sgn);
+ break;
+ }
+ case BTRFS_SHARED_DATA_REF_KEY: {
+ struct btrfs_delayed_data_ref *ref;
+ struct btrfs_key key;
+
+ ref = btrfs_delayed_node_to_data_ref(node);
+
+ key.objectid = ref->objectid;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = ref->offset;
+ ret = __add_prelim_ref(prefs, ref->root, &key, 0,
+ ref->parent, node->bytenr,
+ node->ref_mod * sgn);
+ break;
+ }
+ default:
+ WARN_ON(1);
+ }
+ BUG_ON(ret);
+ }
+
+ return 0;
+}
+
+/*
+ * add all inline backrefs for bytenr to the list
+ */
+static int __add_inline_refs(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path, u64 bytenr,
+ struct btrfs_key *info_key, int *info_level,
+ struct list_head *prefs)
+{
+ int ret = 0;
+ int slot;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ unsigned long ptr;
+ unsigned long end;
+ struct btrfs_extent_item *ei;
+ u64 flags;
+ u64 item_size;
+
+ /*
+ * enumerate all inline refs
+ */
+ leaf = path->nodes[0];
+ slot = path->slots[0] - 1;
+
+ item_size = btrfs_item_size_nr(leaf, slot);
+ BUG_ON(item_size < sizeof(*ei));
+
+ ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
+ flags = btrfs_extent_flags(leaf, ei);
+
+ ptr = (unsigned long)(ei + 1);
+ end = (unsigned long)ei + item_size;
+
+ if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+ struct btrfs_tree_block_info *info;
+ struct btrfs_disk_key disk_key;
+
+ info = (struct btrfs_tree_block_info *)ptr;
+ *info_level = btrfs_tree_block_level(leaf, info);
+ btrfs_tree_block_key(leaf, info, &disk_key);
+ btrfs_disk_key_to_cpu(info_key, &disk_key);
+ ptr += sizeof(struct btrfs_tree_block_info);
+ BUG_ON(ptr > end);
+ } else {
+ BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA));
+ }
+
+ while (ptr < end) {
+ struct btrfs_extent_inline_ref *iref;
+ u64 offset;
+ int type;
+
+ iref = (struct btrfs_extent_inline_ref *)ptr;
+ type = btrfs_extent_inline_ref_type(leaf, iref);
+ offset = btrfs_extent_inline_ref_offset(leaf, iref);
+
+ switch (type) {
+ case BTRFS_SHARED_BLOCK_REF_KEY:
+ ret = __add_prelim_ref(prefs, 0, info_key,
+ *info_level + 1, offset,
+ bytenr, 1);
+ break;
+ case BTRFS_SHARED_DATA_REF_KEY: {
+ struct btrfs_shared_data_ref *sdref;
+ int count;
+
+ sdref = (struct btrfs_shared_data_ref *)(iref + 1);
+ count = btrfs_shared_data_ref_count(leaf, sdref);
+ ret = __add_prelim_ref(prefs, 0, NULL, 0, offset,
+ bytenr, count);
+ break;
+ }
+ case BTRFS_TREE_BLOCK_REF_KEY:
+ ret = __add_prelim_ref(prefs, offset, info_key,
+ *info_level + 1, 0, bytenr, 1);
+ break;
+ case BTRFS_EXTENT_DATA_REF_KEY: {
+ struct btrfs_extent_data_ref *dref;
+ int count;
+ u64 root;
+
+ dref = (struct btrfs_extent_data_ref *)(&iref->offset);
+ count = btrfs_extent_data_ref_count(leaf, dref);
+ key.objectid = btrfs_extent_data_ref_objectid(leaf,
+ dref);
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = btrfs_extent_data_ref_offset(leaf, dref);
+ root = btrfs_extent_data_ref_root(leaf, dref);
+ ret = __add_prelim_ref(prefs, root, &key, 0, 0, bytenr,
+ count);
+ break;
+ }
+ default:
+ WARN_ON(1);
+ }
+ BUG_ON(ret);
+ ptr += btrfs_extent_inline_ref_size(type);
+ }
+
+ return 0;
+}
+
+/*
+ * add all non-inline backrefs for bytenr to the list
+ */
+static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path, u64 bytenr,
+ struct btrfs_key *info_key, int info_level,
+ struct list_head *prefs)
+{
+ struct btrfs_root *extent_root = fs_info->extent_root;
+ int ret;
+ int slot;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+
+ while (1) {
+ ret = btrfs_next_item(extent_root, path);
+ if (ret < 0)
+ break;
+ if (ret) {
+ ret = 0;
+ break;
+ }
+
+ slot = path->slots[0];
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+
+ if (key.objectid != bytenr)
+ break;
+ if (key.type < BTRFS_TREE_BLOCK_REF_KEY)
+ continue;
+ if (key.type > BTRFS_SHARED_DATA_REF_KEY)
+ break;
+
+ switch (key.type) {
+ case BTRFS_SHARED_BLOCK_REF_KEY:
+ ret = __add_prelim_ref(prefs, 0, info_key,
+ info_level + 1, key.offset,
+ bytenr, 1);
+ break;
+ case BTRFS_SHARED_DATA_REF_KEY: {
+ struct btrfs_shared_data_ref *sdref;
+ int count;
+
+ sdref = btrfs_item_ptr(leaf, slot,
+ struct btrfs_shared_data_ref);
+ count = btrfs_shared_data_ref_count(leaf, sdref);
+ ret = __add_prelim_ref(prefs, 0, NULL, 0, key.offset,
+ bytenr, count);
+ break;
+ }
+ case BTRFS_TREE_BLOCK_REF_KEY:
+ ret = __add_prelim_ref(prefs, key.offset, info_key,
+ info_level + 1, 0, bytenr, 1);
+ break;
+ case BTRFS_EXTENT_DATA_REF_KEY: {
+ struct btrfs_extent_data_ref *dref;
+ int count;
+ u64 root;
+
+ dref = btrfs_item_ptr(leaf, slot,
+ struct btrfs_extent_data_ref);
+ count = btrfs_extent_data_ref_count(leaf, dref);
+ key.objectid = btrfs_extent_data_ref_objectid(leaf,
+ dref);
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = btrfs_extent_data_ref_offset(leaf, dref);
+ root = btrfs_extent_data_ref_root(leaf, dref);
+ ret = __add_prelim_ref(prefs, root, &key, 0, 0,
+ bytenr, count);
+ break;
+ }
+ default:
+ WARN_ON(1);
+ }
+ BUG_ON(ret);
+ }
+
+ return ret;
+}
+
+/*
+ * this adds all existing backrefs (inline backrefs, backrefs and delayed
+ * refs) for the given bytenr to the refs list, merges duplicates and resolves
+ * indirect refs to their parent bytenr.
+ * When roots are found, they're added to the roots list
+ *
+ * FIXME some caching might speed things up
+ */
+static int find_parent_nodes(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 bytenr,
+ u64 seq, struct ulist *refs, struct ulist *roots)
+{
+ struct btrfs_key key;
+ struct btrfs_path *path;
+ struct btrfs_key info_key = { 0 };
+ struct btrfs_delayed_ref_root *delayed_refs = NULL;
+ struct btrfs_delayed_ref_head *head;
+ int info_level = 0;
+ int ret;
+ int search_commit_root = (trans == BTRFS_BACKREF_SEARCH_COMMIT_ROOT);
+ struct list_head prefs_delayed;
+ struct list_head prefs;
+ struct __prelim_ref *ref;
+
+ INIT_LIST_HEAD(&prefs);
+ INIT_LIST_HEAD(&prefs_delayed);
+
+ key.objectid = bytenr;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = (u64)-1;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ path->search_commit_root = !!search_commit_root;
+
+ /*
+ * grab both a lock on the path and a lock on the delayed ref head.
+ * We need both to get a consistent picture of how the refs look
+ * at a specified point in time
+ */
+again:
+ head = NULL;
+
+ ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+ BUG_ON(ret == 0);
+
+ if (trans != BTRFS_BACKREF_SEARCH_COMMIT_ROOT) {
+ /*
+ * look if there are updates for this ref queued and lock the
+ * head
+ */
+ delayed_refs = &trans->transaction->delayed_refs;
+ spin_lock(&delayed_refs->lock);
+ head = btrfs_find_delayed_ref_head(trans, bytenr);
+ if (head) {
+ if (!mutex_trylock(&head->mutex)) {
+ atomic_inc(&head->node.refs);
+ spin_unlock(&delayed_refs->lock);
+
+ btrfs_release_path(path);
+
+ /*
+ * Mutex was contended, block until it's
+ * released and try again
+ */
+ mutex_lock(&head->mutex);
+ mutex_unlock(&head->mutex);
+ btrfs_put_delayed_ref(&head->node);
+ goto again;
+ }
+ ret = __add_delayed_refs(head, seq, &info_key,
+ &prefs_delayed);
+ if (ret) {
+ spin_unlock(&delayed_refs->lock);
+ goto out;
+ }
+ }
+ spin_unlock(&delayed_refs->lock);
+ }
+
+ if (path->slots[0]) {
+ struct extent_buffer *leaf;
+ int slot;
+
+ leaf = path->nodes[0];
+ slot = path->slots[0] - 1;
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (key.objectid == bytenr &&
+ key.type == BTRFS_EXTENT_ITEM_KEY) {
+ ret = __add_inline_refs(fs_info, path, bytenr,
+ &info_key, &info_level, &prefs);
+ if (ret)
+ goto out;
+ ret = __add_keyed_refs(fs_info, path, bytenr, &info_key,
+ info_level, &prefs);
+ if (ret)
+ goto out;
+ }
+ }
+ btrfs_release_path(path);
+
+ /*
+ * when adding the delayed refs above, the info_key might not have
+ * been known yet. Go over the list and replace the missing keys
+ */
+ list_for_each_entry(ref, &prefs_delayed, list) {
+ if ((ref->key.offset | ref->key.type | ref->key.objectid) == 0)
+ memcpy(&ref->key, &info_key, sizeof(ref->key));
+ }
+ list_splice_init(&prefs_delayed, &prefs);
+
+ ret = __merge_refs(&prefs, 1);
+ if (ret)
+ goto out;
+
+ ret = __resolve_indirect_refs(fs_info, search_commit_root, &prefs);
+ if (ret)
+ goto out;
+
+ ret = __merge_refs(&prefs, 2);
+ if (ret)
+ goto out;
+
+ while (!list_empty(&prefs)) {
+ ref = list_first_entry(&prefs, struct __prelim_ref, list);
+ list_del(&ref->list);
+ if (ref->count < 0)
+ WARN_ON(1);
+ if (ref->count && ref->root_id && ref->parent == 0) {
+ /* no parent == root of tree */
+ ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS);
+ BUG_ON(ret < 0);
+ }
+ if (ref->count && ref->parent) {
+ ret = ulist_add(refs, ref->parent, 0, GFP_NOFS);
+ BUG_ON(ret < 0);
+ }
+ kfree(ref);
+ }
+
+out:
+ if (head)
+ mutex_unlock(&head->mutex);
+ btrfs_free_path(path);
+ while (!list_empty(&prefs)) {
+ ref = list_first_entry(&prefs, struct __prelim_ref, list);
+ list_del(&ref->list);
+ kfree(ref);
+ }
+ while (!list_empty(&prefs_delayed)) {
+ ref = list_first_entry(&prefs_delayed, struct __prelim_ref,
+ list);
+ list_del(&ref->list);
+ kfree(ref);
+ }
+
+ return ret;
+}
+
+/*
+ * Finds all leafs with a reference to the specified combination of bytenr and
+ * offset. key_list_head will point to a list of corresponding keys (caller must
+ * free each list element). The leafs will be stored in the leafs ulist, which
+ * must be freed with ulist_free.
+ *
+ * returns 0 on success, <0 on error
+ */
+static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 bytenr,
+ u64 num_bytes, u64 seq, struct ulist **leafs)
+{
+ struct ulist *tmp;
+ int ret;
+
+ tmp = ulist_alloc(GFP_NOFS);
+ if (!tmp)
+ return -ENOMEM;
+ *leafs = ulist_alloc(GFP_NOFS);
+ if (!*leafs) {
+ ulist_free(tmp);
+ return -ENOMEM;
+ }
+
+ ret = find_parent_nodes(trans, fs_info, bytenr, seq, *leafs, tmp);
+ ulist_free(tmp);
+
+ if (ret < 0 && ret != -ENOENT) {
+ ulist_free(*leafs);
+ return ret;
+ }
+
+ return 0;
+}
+
+/*
+ * walk all backrefs for a given extent to find all roots that reference this
+ * extent. Walking a backref means finding all extents that reference this
+ * extent and in turn walk the backrefs of those, too. Naturally this is a
+ * recursive process, but here it is implemented in an iterative fashion: We
+ * find all referencing extents for the extent in question and put them on a
+ * list. In turn, we find all referencing extents for those, further appending
+ * to the list. The way we iterate the list allows adding more elements after
+ * the current while iterating. The process stops when we reach the end of the
+ * list. Found roots are added to the roots list.
+ *
+ * returns 0 on success, < 0 on error.
+ */
+int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 bytenr,
+ u64 num_bytes, u64 seq, struct ulist **roots)
+{
+ struct ulist *tmp;
+ struct ulist_node *node = NULL;
+ int ret;
+
+ tmp = ulist_alloc(GFP_NOFS);
+ if (!tmp)
+ return -ENOMEM;
+ *roots = ulist_alloc(GFP_NOFS);
+ if (!*roots) {
+ ulist_free(tmp);
+ return -ENOMEM;
+ }
+
+ while (1) {
+ ret = find_parent_nodes(trans, fs_info, bytenr, seq,
+ tmp, *roots);
+ if (ret < 0 && ret != -ENOENT) {
+ ulist_free(tmp);
+ ulist_free(*roots);
+ return ret;
+ }
+ node = ulist_next(tmp, node);
+ if (!node)
+ break;
+ bytenr = node->val;
+ }
+
+ ulist_free(tmp);
+ return 0;
+}
+
static int __inode_info(u64 inum, u64 ioff, u8 key_type,
struct btrfs_root *fs_root, struct btrfs_path *path,
@@ -121,6 +906,8 @@ static char *iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
if (eb != eb_in)
free_extent_buffer(eb);
ret = inode_ref_info(parent, 0, fs_root, path, &found_key);
+ if (ret > 0)
+ ret = -ENOENT;
if (ret)
break;
next_inum = found_key.offset;
@@ -181,8 +968,11 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]);
if (found_key->type != BTRFS_EXTENT_ITEM_KEY ||
found_key->objectid > logical ||
- found_key->objectid + found_key->offset <= logical)
+ found_key->objectid + found_key->offset <= logical) {
+ pr_debug("logical %llu is not within any extent\n",
+ (unsigned long long)logical);
return -ENOENT;
+ }
eb = path->nodes[0];
item_size = btrfs_item_size_nr(eb, path->slots[0]);
@@ -191,6 +981,13 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
flags = btrfs_extent_flags(eb, ei);
+ pr_debug("logical %llu is at position %llu within the extent (%llu "
+ "EXTENT_ITEM %llu) flags %#llx size %u\n",
+ (unsigned long long)logical,
+ (unsigned long long)(logical - found_key->objectid),
+ (unsigned long long)found_key->objectid,
+ (unsigned long long)found_key->offset,
+ (unsigned long long)flags, item_size);
if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
return BTRFS_EXTENT_FLAG_TREE_BLOCK;
if (flags & BTRFS_EXTENT_FLAG_DATA)
@@ -287,128 +1084,10 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
return 0;
}
-static int __data_list_add(struct list_head *head, u64 inum,
- u64 extent_data_item_offset, u64 root)
-{
- struct __data_ref *ref;
-
- ref = kmalloc(sizeof(*ref), GFP_NOFS);
- if (!ref)
- return -ENOMEM;
-
- ref->inum = inum;
- ref->extent_data_item_offset = extent_data_item_offset;
- ref->root = root;
- list_add_tail(&ref->list, head);
-
- return 0;
-}
-
-static int __data_list_add_eb(struct list_head *head, struct extent_buffer *eb,
- struct btrfs_extent_data_ref *dref)
-{
- return __data_list_add(head, btrfs_extent_data_ref_objectid(eb, dref),
- btrfs_extent_data_ref_offset(eb, dref),
- btrfs_extent_data_ref_root(eb, dref));
-}
-
-static int __shared_list_add(struct list_head *head, u64 disk_byte)
-{
- struct __shared_ref *ref;
-
- ref = kmalloc(sizeof(*ref), GFP_NOFS);
- if (!ref)
- return -ENOMEM;
-
- ref->disk_byte = disk_byte;
- list_add_tail(&ref->list, head);
-
- return 0;
-}
-
-static int __iter_shared_inline_ref_inodes(struct btrfs_fs_info *fs_info,
- u64 logical, u64 inum,
- u64 extent_data_item_offset,
- u64 extent_offset,
- struct btrfs_path *path,
- struct list_head *data_refs,
- iterate_extent_inodes_t *iterate,
- void *ctx)
-{
- u64 ref_root;
- u32 item_size;
- struct btrfs_key key;
- struct extent_buffer *eb;
- struct btrfs_extent_item *ei;
- struct btrfs_extent_inline_ref *eiref;
- struct __data_ref *ref;
- int ret;
- int type;
- int last;
- unsigned long ptr = 0;
-
- WARN_ON(!list_empty(data_refs));
- ret = extent_from_logical(fs_info, logical, path, &key);
- if (ret & BTRFS_EXTENT_FLAG_DATA)
- ret = -EIO;
- if (ret < 0)
- goto out;
-
- eb = path->nodes[0];
- ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
- item_size = btrfs_item_size_nr(eb, path->slots[0]);
-
- ret = 0;
- ref_root = 0;
- /*
- * as done in iterate_extent_inodes, we first build a list of refs to
- * iterate, then free the path and then iterate them to avoid deadlocks.
- */
- do {
- last = __get_extent_inline_ref(&ptr, eb, ei, item_size,
- &eiref, &type);
- if (last < 0) {
- ret = last;
- goto out;
- }
- if (type == BTRFS_TREE_BLOCK_REF_KEY ||
- type == BTRFS_SHARED_BLOCK_REF_KEY) {
- ref_root = btrfs_extent_inline_ref_offset(eb, eiref);
- ret = __data_list_add(data_refs, inum,
- extent_data_item_offset,
- ref_root);
- }
- } while (!ret && !last);
-
- btrfs_release_path(path);
-
- if (ref_root == 0) {
- printk(KERN_ERR "btrfs: failed to find tree block ref "
- "for shared data backref %llu\n", logical);
- WARN_ON(1);
- ret = -EIO;
- }
-
-out:
- while (!list_empty(data_refs)) {
- ref = list_first_entry(data_refs, struct __data_ref, list);
- list_del(&ref->list);
- if (!ret)
- ret = iterate(ref->inum, extent_offset +
- ref->extent_data_item_offset,
- ref->root, ctx);
- kfree(ref);
- }
-
- return ret;
-}
-
-static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info,
- u64 logical, u64 orig_extent_item_objectid,
- u64 extent_offset, struct btrfs_path *path,
- struct list_head *data_refs,
- iterate_extent_inodes_t *iterate,
- void *ctx)
+static int iterate_leaf_refs(struct btrfs_fs_info *fs_info, u64 logical,
+ u64 orig_extent_item_objectid,
+ u64 extent_item_pos, u64 root,
+ iterate_extent_inodes_t *iterate, void *ctx)
{
u64 disk_byte;
struct btrfs_key key;
@@ -416,8 +1095,10 @@ static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info,
struct extent_buffer *eb;
int slot;
int nritems;
- int ret;
- int found = 0;
+ int ret = 0;
+ int extent_type;
+ u64 data_offset;
+ u64 data_len;
eb = read_tree_block(fs_info->tree_root, logical,
fs_info->tree_root->leafsize, 0);
@@ -435,147 +1116,103 @@ static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info,
if (key.type != BTRFS_EXTENT_DATA_KEY)
continue;
fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
- if (!fi) {
- free_extent_buffer(eb);
- return -EIO;
- }
+ extent_type = btrfs_file_extent_type(eb, fi);
+ if (extent_type == BTRFS_FILE_EXTENT_INLINE)
+ continue;
+ /* don't skip BTRFS_FILE_EXTENT_PREALLOC, we can handle that */
disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
- if (disk_byte != orig_extent_item_objectid) {
- if (found)
- break;
- else
- continue;
- }
- ++found;
- ret = __iter_shared_inline_ref_inodes(fs_info, logical,
- key.objectid,
- key.offset,
- extent_offset, path,
- data_refs,
- iterate, ctx);
- if (ret)
- break;
- }
+ if (disk_byte != orig_extent_item_objectid)
+ continue;
- if (!found) {
- printk(KERN_ERR "btrfs: failed to follow shared data backref "
- "to parent %llu\n", logical);
- WARN_ON(1);
- ret = -EIO;
+ data_offset = btrfs_file_extent_offset(eb, fi);
+ data_len = btrfs_file_extent_num_bytes(eb, fi);
+
+ if (extent_item_pos < data_offset ||
+ extent_item_pos >= data_offset + data_len)
+ continue;
+
+ pr_debug("ref for %llu resolved, key (%llu EXTEND_DATA %llu), "
+ "root %llu\n", orig_extent_item_objectid,
+ key.objectid, key.offset, root);
+ ret = iterate(key.objectid,
+ key.offset + (extent_item_pos - data_offset),
+ root, ctx);
+ if (ret) {
+ pr_debug("stopping iteration because ret=%d\n", ret);
+ break;
+ }
}
free_extent_buffer(eb);
+
return ret;
}
/*
* calls iterate() for every inode that references the extent identified by
- * the given parameters. will use the path given as a parameter and return it
- * released.
+ * the given parameters.
* when the iterator function returns a non-zero value, iteration stops.
*/
int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
- struct btrfs_path *path,
- u64 extent_item_objectid,
- u64 extent_offset,
+ u64 extent_item_objectid, u64 extent_item_pos,
+ int search_commit_root,
iterate_extent_inodes_t *iterate, void *ctx)
{
- unsigned long ptr = 0;
- int last;
int ret;
- int type;
- u64 logical;
- u32 item_size;
- struct btrfs_extent_inline_ref *eiref;
- struct btrfs_extent_data_ref *dref;
- struct extent_buffer *eb;
- struct btrfs_extent_item *ei;
- struct btrfs_key key;
struct list_head data_refs = LIST_HEAD_INIT(data_refs);
struct list_head shared_refs = LIST_HEAD_INIT(shared_refs);
- struct __data_ref *ref_d;
- struct __shared_ref *ref_s;
+ struct btrfs_trans_handle *trans;
+ struct ulist *refs = NULL;
+ struct ulist *roots = NULL;
+ struct ulist_node *ref_node = NULL;
+ struct ulist_node *root_node = NULL;
+ struct seq_list seq_elem;
+ struct btrfs_delayed_ref_root *delayed_refs = NULL;
+
+ pr_debug("resolving all inodes for extent %llu\n",
+ extent_item_objectid);
+
+ if (search_commit_root) {
+ trans = BTRFS_BACKREF_SEARCH_COMMIT_ROOT;
+ } else {
+ trans = btrfs_join_transaction(fs_info->extent_root);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ delayed_refs = &trans->transaction->delayed_refs;
+ spin_lock(&delayed_refs->lock);
+ btrfs_get_delayed_seq(delayed_refs, &seq_elem);
+ spin_unlock(&delayed_refs->lock);
+ }
- eb = path->nodes[0];
- ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
- item_size = btrfs_item_size_nr(eb, path->slots[0]);
+ ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid,
+ extent_item_pos, seq_elem.seq,
+ &refs);
- /* first we iterate the inline refs, ... */
- do {
- last = __get_extent_inline_ref(&ptr, eb, ei, item_size,
- &eiref, &type);
- if (last == -ENOENT) {
- ret = 0;
- break;
- }
- if (last < 0) {
- ret = last;
- break;
- }
-
- if (type == BTRFS_EXTENT_DATA_REF_KEY) {
- dref = (struct btrfs_extent_data_ref *)(&eiref->offset);
- ret = __data_list_add_eb(&data_refs, eb, dref);
- } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
- logical = btrfs_extent_inline_ref_offset(eb, eiref);
- ret = __shared_list_add(&shared_refs, logical);
- }
- } while (!ret && !last);
+ if (ret)
+ goto out;
- /* ... then we proceed to in-tree references and ... */
- while (!ret) {
- ++path->slots[0];
- if (path->slots[0] > btrfs_header_nritems(eb)) {
- ret = btrfs_next_leaf(fs_info->extent_root, path);
- if (ret) {
- if (ret == 1)
- ret = 0; /* we're done */
- break;
- }
- eb = path->nodes[0];
- }
- btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
- if (key.objectid != extent_item_objectid)
+ while (!ret && (ref_node = ulist_next(refs, ref_node))) {
+ ret = btrfs_find_all_roots(trans, fs_info, ref_node->val, -1,
+ seq_elem.seq, &roots);
+ if (ret)
break;
- if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
- dref = btrfs_item_ptr(eb, path->slots[0],
- struct btrfs_extent_data_ref);
- ret = __data_list_add_eb(&data_refs, eb, dref);
- } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
- ret = __shared_list_add(&shared_refs, key.offset);
+ while (!ret && (root_node = ulist_next(roots, root_node))) {
+ pr_debug("root %llu references leaf %llu\n",
+ root_node->val, ref_node->val);
+ ret = iterate_leaf_refs(fs_info, ref_node->val,
+ extent_item_objectid,
+ extent_item_pos, root_node->val,
+ iterate, ctx);
}
}
- btrfs_release_path(path);
-
- /*
- * ... only at the very end we can process the refs we found. this is
- * because the iterator function we call is allowed to make tree lookups
- * and we have to avoid deadlocks. additionally, we need more tree
- * lookups ourselves for shared data refs.
- */
- while (!list_empty(&data_refs)) {
- ref_d = list_first_entry(&data_refs, struct __data_ref, list);
- list_del(&ref_d->list);
- if (!ret)
- ret = iterate(ref_d->inum, extent_offset +
- ref_d->extent_data_item_offset,
- ref_d->root, ctx);
- kfree(ref_d);
- }
-
- while (!list_empty(&shared_refs)) {
- ref_s = list_first_entry(&shared_refs, struct __shared_ref,
- list);
- list_del(&ref_s->list);
- if (!ret)
- ret = __iter_shared_inline_ref(fs_info,
- ref_s->disk_byte,
- extent_item_objectid,
- extent_offset, path,
- &data_refs,
- iterate, ctx);
- kfree(ref_s);
+ ulist_free(refs);
+ ulist_free(roots);
+out:
+ if (!search_commit_root) {
+ btrfs_put_delayed_seq(delayed_refs, &seq_elem);
+ btrfs_end_transaction(trans, fs_info->extent_root);
}
return ret;
@@ -586,19 +1223,22 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
iterate_extent_inodes_t *iterate, void *ctx)
{
int ret;
- u64 offset;
+ u64 extent_item_pos;
struct btrfs_key found_key;
+ int search_commit_root = path->search_commit_root;
ret = extent_from_logical(fs_info, logical, path,
&found_key);
+ btrfs_release_path(path);
if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
ret = -EINVAL;
if (ret < 0)
return ret;
- offset = logical - found_key.objectid;
- ret = iterate_extent_inodes(fs_info, path, found_key.objectid,
- offset, iterate, ctx);
+ extent_item_pos = logical - found_key.objectid;
+ ret = iterate_extent_inodes(fs_info, found_key.objectid,
+ extent_item_pos, search_commit_root,
+ iterate, ctx);
return ret;
}
@@ -643,6 +1283,10 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
for (cur = 0; cur < btrfs_item_size(eb, item); cur += len) {
name_len = btrfs_inode_ref_name_len(eb, iref);
/* path must be released before calling iterate()! */
+ pr_debug("following ref at offset %u for inode %llu in "
+ "tree %llu\n", cur,
+ (unsigned long long)found_key.objectid,
+ (unsigned long long)fs_root->objectid);
ret = iterate(parent, iref, eb, ctx);
if (ret) {
free_extent_buffer(eb);
@@ -683,10 +1327,14 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref,
return PTR_ERR(fspath);
if (fspath > fspath_min) {
+ pr_debug("path resolved: %s\n", fspath);
ipath->fspath->val[i] = (u64)(unsigned long)fspath;
++ipath->fspath->elem_cnt;
ipath->fspath->bytes_left = fspath - fspath_min;
} else {
+ pr_debug("missed path, not enough space. missing bytes: %lu, "
+ "constructed so far: %s\n",
+ (unsigned long)(fspath_min - fspath), fspath_min);
++ipath->fspath->elem_missed;
ipath->fspath->bytes_missing += fspath_min - fspath;
ipath->fspath->bytes_left = 0;
@@ -711,12 +1359,6 @@ int paths_from_inode(u64 inum, struct inode_fs_paths *ipath)
inode_to_path, ipath);
}
-/*
- * allocates space to return multiple file system paths for an inode.
- * total_bytes to allocate are passed, note that space usable for actual path
- * information will be total_bytes - sizeof(struct inode_fs_paths).
- * the returned pointer must be freed with free_ipath() in the end.
- */
struct btrfs_data_container *init_data_container(u32 total_bytes)
{
struct btrfs_data_container *data;
@@ -772,5 +1414,6 @@ struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
void free_ipath(struct inode_fs_paths *ipath)
{
+ kfree(ipath->fspath);
kfree(ipath);
}
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 92618837cb8..57ea2e959e4 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -20,6 +20,9 @@
#define __BTRFS_BACKREF__
#include "ioctl.h"
+#include "ulist.h"
+
+#define BTRFS_BACKREF_SEARCH_COMMIT_ROOT ((struct btrfs_trans_handle *)0)
struct inode_fs_paths {
struct btrfs_path *btrfs_path;
@@ -43,9 +46,8 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
u64 *out_root, u8 *out_level);
int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
- struct btrfs_path *path,
u64 extent_item_objectid,
- u64 extent_offset,
+ u64 extent_offset, int search_commit_root,
iterate_extent_inodes_t *iterate, void *ctx);
int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
@@ -54,6 +56,10 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
+int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 bytenr,
+ u64 num_bytes, u64 seq, struct ulist **roots);
+
struct btrfs_data_container *init_data_container(u32 total_bytes);
struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
struct btrfs_path *path);
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 634608d2a6d..9b9b15fd520 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -51,6 +51,9 @@ struct btrfs_inode {
/* held while logging the inode in tree-log.c */
struct mutex log_mutex;
+ /* held while doing delalloc reservations */
+ struct mutex delalloc_mutex;
+
/* used to order data wrt metadata */
struct btrfs_ordered_inode_tree ordered_tree;
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
new file mode 100644
index 00000000000..c053e90f200
--- /dev/null
+++ b/fs/btrfs/check-integrity.c
@@ -0,0 +1,3068 @@
+/*
+ * Copyright (C) STRATO AG 2011. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+/*
+ * This module can be used to catch cases when the btrfs kernel
+ * code executes write requests to the disk that bring the file
+ * system in an inconsistent state. In such a state, a power-loss
+ * or kernel panic event would cause that the data on disk is
+ * lost or at least damaged.
+ *
+ * Code is added that examines all block write requests during
+ * runtime (including writes of the super block). Three rules
+ * are verified and an error is printed on violation of the
+ * rules:
+ * 1. It is not allowed to write a disk block which is
+ * currently referenced by the super block (either directly
+ * or indirectly).
+ * 2. When a super block is written, it is verified that all
+ * referenced (directly or indirectly) blocks fulfill the
+ * following requirements:
+ * 2a. All referenced blocks have either been present when
+ * the file system was mounted, (i.e., they have been
+ * referenced by the super block) or they have been
+ * written since then and the write completion callback
+ * was called and a FLUSH request to the device where
+ * these blocks are located was received and completed.
+ * 2b. All referenced blocks need to have a generation
+ * number which is equal to the parent's number.
+ *
+ * One issue that was found using this module was that the log
+ * tree on disk became temporarily corrupted because disk blocks
+ * that had been in use for the log tree had been freed and
+ * reused too early, while being referenced by the written super
+ * block.
+ *
+ * The search term in the kernel log that can be used to filter
+ * on the existence of detected integrity issues is
+ * "btrfs: attempt".
+ *
+ * The integrity check is enabled via mount options. These
+ * mount options are only supported if the integrity check
+ * tool is compiled by defining BTRFS_FS_CHECK_INTEGRITY.
+ *
+ * Example #1, apply integrity checks to all metadata:
+ * mount /dev/sdb1 /mnt -o check_int
+ *
+ * Example #2, apply integrity checks to all metadata and
+ * to data extents:
+ * mount /dev/sdb1 /mnt -o check_int_data
+ *
+ * Example #3, apply integrity checks to all metadata and dump
+ * the tree that the super block references to kernel messages
+ * each time after a super block was written:
+ * mount /dev/sdb1 /mnt -o check_int,check_int_print_mask=263
+ *
+ * If the integrity check tool is included and activated in
+ * the mount options, plenty of kernel memory is used, and
+ * plenty of additional CPU cycles are spent. Enabling this
+ * functionality is not intended for normal use. In most
+ * cases, unless you are a btrfs developer who needs to verify
+ * the integrity of (super)-block write requests, do not
+ * enable the config option BTRFS_FS_CHECK_INTEGRITY to
+ * include and compile the integrity check tool.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/buffer_head.h>
+#include <linux/mutex.h>
+#include <linux/crc32c.h>
+#include <linux/genhd.h>
+#include <linux/blkdev.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "extent_io.h"
+#include "volumes.h"
+#include "print-tree.h"
+#include "locking.h"
+#include "check-integrity.h"
+
+#define BTRFSIC_BLOCK_HASHTABLE_SIZE 0x10000
+#define BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE 0x10000
+#define BTRFSIC_DEV2STATE_HASHTABLE_SIZE 0x100
+#define BTRFSIC_BLOCK_MAGIC_NUMBER 0x14491051
+#define BTRFSIC_BLOCK_LINK_MAGIC_NUMBER 0x11070807
+#define BTRFSIC_DEV2STATE_MAGIC_NUMBER 0x20111530
+#define BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER 20111300
+#define BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL (200 - 6) /* in characters,
+ * excluding " [...]" */
+#define BTRFSIC_BLOCK_SIZE PAGE_SIZE
+
+#define BTRFSIC_GENERATION_UNKNOWN ((u64)-1)
+
+/*
+ * The definition of the bitmask fields for the print_mask.
+ * They are specified with the mount option check_integrity_print_mask.
+ */
+#define BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE 0x00000001
+#define BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION 0x00000002
+#define BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE 0x00000004
+#define BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE 0x00000008
+#define BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH 0x00000010
+#define BTRFSIC_PRINT_MASK_END_IO_BIO_BH 0x00000020
+#define BTRFSIC_PRINT_MASK_VERBOSE 0x00000040
+#define BTRFSIC_PRINT_MASK_VERY_VERBOSE 0x00000080
+#define BTRFSIC_PRINT_MASK_INITIAL_TREE 0x00000100
+#define BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES 0x00000200
+#define BTRFSIC_PRINT_MASK_INITIAL_DATABASE 0x00000400
+#define BTRFSIC_PRINT_MASK_NUM_COPIES 0x00000800
+#define BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS 0x00001000
+
+struct btrfsic_dev_state;
+struct btrfsic_state;
+
+struct btrfsic_block {
+ u32 magic_num; /* only used for debug purposes */
+ unsigned int is_metadata:1; /* if it is meta-data, not data-data */
+ unsigned int is_superblock:1; /* if it is one of the superblocks */
+ unsigned int is_iodone:1; /* if is done by lower subsystem */
+ unsigned int iodone_w_error:1; /* error was indicated to endio */
+ unsigned int never_written:1; /* block was added because it was
+ * referenced, not because it was
+ * written */
+ unsigned int mirror_num:2; /* large enough to hold
+ * BTRFS_SUPER_MIRROR_MAX */
+ struct btrfsic_dev_state *dev_state;
+ u64 dev_bytenr; /* key, physical byte num on disk */
+ u64 logical_bytenr; /* logical byte num on disk */
+ u64 generation;
+ struct btrfs_disk_key disk_key; /* extra info to print in case of
+ * issues, will not always be correct */
+ struct list_head collision_resolving_node; /* list node */
+ struct list_head all_blocks_node; /* list node */
+
+ /* the following two lists contain block_link items */
+ struct list_head ref_to_list; /* list */
+ struct list_head ref_from_list; /* list */
+ struct btrfsic_block *next_in_same_bio;
+ void *orig_bio_bh_private;
+ union {
+ bio_end_io_t *bio;
+ bh_end_io_t *bh;
+ } orig_bio_bh_end_io;
+ int submit_bio_bh_rw;
+ u64 flush_gen; /* only valid if !never_written */
+};
+
+/*
+ * Elements of this type are allocated dynamically and required because
+ * each block object can refer to and can be ref from multiple blocks.
+ * The key to lookup them in the hashtable is the dev_bytenr of
+ * the block ref to plus the one from the block refered from.
+ * The fact that they are searchable via a hashtable and that a
+ * ref_cnt is maintained is not required for the btrfs integrity
+ * check algorithm itself, it is only used to make the output more
+ * beautiful in case that an error is detected (an error is defined
+ * as a write operation to a block while that block is still referenced).
+ */
+struct btrfsic_block_link {
+ u32 magic_num; /* only used for debug purposes */
+ u32 ref_cnt;
+ struct list_head node_ref_to; /* list node */
+ struct list_head node_ref_from; /* list node */
+ struct list_head collision_resolving_node; /* list node */
+ struct btrfsic_block *block_ref_to;
+ struct btrfsic_block *block_ref_from;
+ u64 parent_generation;
+};
+
+struct btrfsic_dev_state {
+ u32 magic_num; /* only used for debug purposes */
+ struct block_device *bdev;
+ struct btrfsic_state *state;
+ struct list_head collision_resolving_node; /* list node */
+ struct btrfsic_block dummy_block_for_bio_bh_flush;
+ u64 last_flush_gen;
+ char name[BDEVNAME_SIZE];
+};
+
+struct btrfsic_block_hashtable {
+ struct list_head table[BTRFSIC_BLOCK_HASHTABLE_SIZE];
+};
+
+struct btrfsic_block_link_hashtable {
+ struct list_head table[BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE];
+};
+
+struct btrfsic_dev_state_hashtable {
+ struct list_head table[BTRFSIC_DEV2STATE_HASHTABLE_SIZE];
+};
+
+struct btrfsic_block_data_ctx {
+ u64 start; /* virtual bytenr */
+ u64 dev_bytenr; /* physical bytenr on device */
+ u32 len;
+ struct btrfsic_dev_state *dev;
+ char *data;
+ struct buffer_head *bh; /* do not use if set to NULL */
+};
+
+/* This structure is used to implement recursion without occupying
+ * any stack space, refer to btrfsic_process_metablock() */
+struct btrfsic_stack_frame {
+ u32 magic;
+ u32 nr;
+ int error;
+ int i;
+ int limit_nesting;
+ int num_copies;
+ int mirror_num;
+ struct btrfsic_block *block;
+ struct btrfsic_block_data_ctx *block_ctx;
+ struct btrfsic_block *next_block;
+ struct btrfsic_block_data_ctx next_block_ctx;
+ struct btrfs_header *hdr;
+ struct btrfsic_stack_frame *prev;
+};
+
+/* Some state per mounted filesystem */
+struct btrfsic_state {
+ u32 print_mask;
+ int include_extent_data;
+ int csum_size;
+ struct list_head all_blocks_list;
+ struct btrfsic_block_hashtable block_hashtable;
+ struct btrfsic_block_link_hashtable block_link_hashtable;
+ struct btrfs_root *root;
+ u64 max_superblock_generation;
+ struct btrfsic_block *latest_superblock;
+};
+
+static void btrfsic_block_init(struct btrfsic_block *b);
+static struct btrfsic_block *btrfsic_block_alloc(void);
+static void btrfsic_block_free(struct btrfsic_block *b);
+static void btrfsic_block_link_init(struct btrfsic_block_link *n);
+static struct btrfsic_block_link *btrfsic_block_link_alloc(void);
+static void btrfsic_block_link_free(struct btrfsic_block_link *n);
+static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds);
+static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void);
+static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds);
+static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h);
+static void btrfsic_block_hashtable_add(struct btrfsic_block *b,
+ struct btrfsic_block_hashtable *h);
+static void btrfsic_block_hashtable_remove(struct btrfsic_block *b);
+static struct btrfsic_block *btrfsic_block_hashtable_lookup(
+ struct block_device *bdev,
+ u64 dev_bytenr,
+ struct btrfsic_block_hashtable *h);
+static void btrfsic_block_link_hashtable_init(
+ struct btrfsic_block_link_hashtable *h);
+static void btrfsic_block_link_hashtable_add(
+ struct btrfsic_block_link *l,
+ struct btrfsic_block_link_hashtable *h);
+static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l);
+static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup(
+ struct block_device *bdev_ref_to,
+ u64 dev_bytenr_ref_to,
+ struct block_device *bdev_ref_from,
+ u64 dev_bytenr_ref_from,
+ struct btrfsic_block_link_hashtable *h);
+static void btrfsic_dev_state_hashtable_init(
+ struct btrfsic_dev_state_hashtable *h);
+static void btrfsic_dev_state_hashtable_add(
+ struct btrfsic_dev_state *ds,
+ struct btrfsic_dev_state_hashtable *h);
+static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds);
+static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(
+ struct block_device *bdev,
+ struct btrfsic_dev_state_hashtable *h);
+static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void);
+static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf);
+static int btrfsic_process_superblock(struct btrfsic_state *state,
+ struct btrfs_fs_devices *fs_devices);
+static int btrfsic_process_metablock(struct btrfsic_state *state,
+ struct btrfsic_block *block,
+ struct btrfsic_block_data_ctx *block_ctx,
+ struct btrfs_header *hdr,
+ int limit_nesting, int force_iodone_flag);
+static int btrfsic_create_link_to_next_block(
+ struct btrfsic_state *state,
+ struct btrfsic_block *block,
+ struct btrfsic_block_data_ctx
+ *block_ctx, u64 next_bytenr,
+ int limit_nesting,
+ struct btrfsic_block_data_ctx *next_block_ctx,
+ struct btrfsic_block **next_blockp,
+ int force_iodone_flag,
+ int *num_copiesp, int *mirror_nump,
+ struct btrfs_disk_key *disk_key,
+ u64 parent_generation);
+static int btrfsic_handle_extent_data(struct btrfsic_state *state,
+ struct btrfsic_block *block,
+ struct btrfsic_block_data_ctx *block_ctx,
+ u32 item_offset, int force_iodone_flag);
+static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
+ struct btrfsic_block_data_ctx *block_ctx_out,
+ int mirror_num);
+static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr,
+ u32 len, struct block_device *bdev,
+ struct btrfsic_block_data_ctx *block_ctx_out);
+static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx);
+static int btrfsic_read_block(struct btrfsic_state *state,
+ struct btrfsic_block_data_ctx *block_ctx);
+static void btrfsic_dump_database(struct btrfsic_state *state);
+static int btrfsic_test_for_metadata(struct btrfsic_state *state,
+ const u8 *data, unsigned int size);
+static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
+ u64 dev_bytenr, u8 *mapped_data,
+ unsigned int len, struct bio *bio,
+ int *bio_is_patched,
+ struct buffer_head *bh,
+ int submit_bio_bh_rw);
+static int btrfsic_process_written_superblock(
+ struct btrfsic_state *state,
+ struct btrfsic_block *const block,
+ struct btrfs_super_block *const super_hdr);
+static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status);
+static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate);
+static int btrfsic_is_block_ref_by_superblock(const struct btrfsic_state *state,
+ const struct btrfsic_block *block,
+ int recursion_level);
+static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
+ struct btrfsic_block *const block,
+ int recursion_level);
+static void btrfsic_print_add_link(const struct btrfsic_state *state,
+ const struct btrfsic_block_link *l);
+static void btrfsic_print_rem_link(const struct btrfsic_state *state,
+ const struct btrfsic_block_link *l);
+static char btrfsic_get_block_type(const struct btrfsic_state *state,
+ const struct btrfsic_block *block);
+static void btrfsic_dump_tree(const struct btrfsic_state *state);
+static void btrfsic_dump_tree_sub(const struct btrfsic_state *state,
+ const struct btrfsic_block *block,
+ int indent_level);
+static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add(
+ struct btrfsic_state *state,
+ struct btrfsic_block_data_ctx *next_block_ctx,
+ struct btrfsic_block *next_block,
+ struct btrfsic_block *from_block,
+ u64 parent_generation);
+static struct btrfsic_block *btrfsic_block_lookup_or_add(
+ struct btrfsic_state *state,
+ struct btrfsic_block_data_ctx *block_ctx,
+ const char *additional_string,
+ int is_metadata,
+ int is_iodone,
+ int never_written,
+ int mirror_num,
+ int *was_created);
+static int btrfsic_process_superblock_dev_mirror(
+ struct btrfsic_state *state,
+ struct btrfsic_dev_state *dev_state,
+ struct btrfs_device *device,
+ int superblock_mirror_num,
+ struct btrfsic_dev_state **selected_dev_state,
+ struct btrfs_super_block *selected_super);
+static struct btrfsic_dev_state *btrfsic_dev_state_lookup(
+ struct block_device *bdev);
+static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
+ u64 bytenr,
+ struct btrfsic_dev_state *dev_state,
+ u64 dev_bytenr, char *data);
+
+static struct mutex btrfsic_mutex;
+static int btrfsic_is_initialized;
+static struct btrfsic_dev_state_hashtable btrfsic_dev_state_hashtable;
+
+
+static void btrfsic_block_init(struct btrfsic_block *b)
+{
+ b->magic_num = BTRFSIC_BLOCK_MAGIC_NUMBER;
+ b->dev_state = NULL;
+ b->dev_bytenr = 0;
+ b->logical_bytenr = 0;
+ b->generation = BTRFSIC_GENERATION_UNKNOWN;
+ b->disk_key.objectid = 0;
+ b->disk_key.type = 0;
+ b->disk_key.offset = 0;
+ b->is_metadata = 0;
+ b->is_superblock = 0;
+ b->is_iodone = 0;
+ b->iodone_w_error = 0;
+ b->never_written = 0;
+ b->mirror_num = 0;
+ b->next_in_same_bio = NULL;
+ b->orig_bio_bh_private = NULL;
+ b->orig_bio_bh_end_io.bio = NULL;
+ INIT_LIST_HEAD(&b->collision_resolving_node);
+ INIT_LIST_HEAD(&b->all_blocks_node);
+ INIT_LIST_HEAD(&b->ref_to_list);
+ INIT_LIST_HEAD(&b->ref_from_list);
+ b->submit_bio_bh_rw = 0;
+ b->flush_gen = 0;
+}
+
+static struct btrfsic_block *btrfsic_block_alloc(void)
+{
+ struct btrfsic_block *b;
+
+ b = kzalloc(sizeof(*b), GFP_NOFS);
+ if (NULL != b)
+ btrfsic_block_init(b);
+
+ return b;
+}
+
+static void btrfsic_block_free(struct btrfsic_block *b)
+{
+ BUG_ON(!(NULL == b || BTRFSIC_BLOCK_MAGIC_NUMBER == b->magic_num));
+ kfree(b);
+}
+
+static void btrfsic_block_link_init(struct btrfsic_block_link *l)
+{
+ l->magic_num = BTRFSIC_BLOCK_LINK_MAGIC_NUMBER;
+ l->ref_cnt = 1;
+ INIT_LIST_HEAD(&l->node_ref_to);
+ INIT_LIST_HEAD(&l->node_ref_from);
+ INIT_LIST_HEAD(&l->collision_resolving_node);
+ l->block_ref_to = NULL;
+ l->block_ref_from = NULL;
+}
+
+static struct btrfsic_block_link *btrfsic_block_link_alloc(void)
+{
+ struct btrfsic_block_link *l;
+
+ l = kzalloc(sizeof(*l), GFP_NOFS);
+ if (NULL != l)
+ btrfsic_block_link_init(l);
+
+ return l;
+}
+
+static void btrfsic_block_link_free(struct btrfsic_block_link *l)
+{
+ BUG_ON(!(NULL == l || BTRFSIC_BLOCK_LINK_MAGIC_NUMBER == l->magic_num));
+ kfree(l);
+}
+
+static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds)
+{
+ ds->magic_num = BTRFSIC_DEV2STATE_MAGIC_NUMBER;
+ ds->bdev = NULL;
+ ds->state = NULL;
+ ds->name[0] = '\0';
+ INIT_LIST_HEAD(&ds->collision_resolving_node);
+ ds->last_flush_gen = 0;
+ btrfsic_block_init(&ds->dummy_block_for_bio_bh_flush);
+ ds->dummy_block_for_bio_bh_flush.is_iodone = 1;
+ ds->dummy_block_for_bio_bh_flush.dev_state = ds;
+}
+
+static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void)
+{
+ struct btrfsic_dev_state *ds;
+
+ ds = kzalloc(sizeof(*ds), GFP_NOFS);
+ if (NULL != ds)
+ btrfsic_dev_state_init(ds);
+
+ return ds;
+}
+
+static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds)
+{
+ BUG_ON(!(NULL == ds ||
+ BTRFSIC_DEV2STATE_MAGIC_NUMBER == ds->magic_num));
+ kfree(ds);
+}
+
+static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h)
+{
+ int i;
+
+ for (i = 0; i < BTRFSIC_BLOCK_HASHTABLE_SIZE; i++)
+ INIT_LIST_HEAD(h->table + i);
+}
+
+static void btrfsic_block_hashtable_add(struct btrfsic_block *b,
+ struct btrfsic_block_hashtable *h)
+{
+ const unsigned int hashval =
+ (((unsigned int)(b->dev_bytenr >> 16)) ^
+ ((unsigned int)((uintptr_t)b->dev_state->bdev))) &
+ (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1);
+
+ list_add(&b->collision_resolving_node, h->table + hashval);
+}
+
+static void btrfsic_block_hashtable_remove(struct btrfsic_block *b)
+{
+ list_del(&b->collision_resolving_node);
+}
+
+static struct btrfsic_block *btrfsic_block_hashtable_lookup(
+ struct block_device *bdev,
+ u64 dev_bytenr,
+ struct btrfsic_block_hashtable *h)
+{
+ const unsigned int hashval =
+ (((unsigned int)(dev_bytenr >> 16)) ^
+ ((unsigned int)((uintptr_t)bdev))) &
+ (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1);
+ struct list_head *elem;
+
+ list_for_each(elem, h->table + hashval) {
+ struct btrfsic_block *const b =
+ list_entry(elem, struct btrfsic_block,
+ collision_resolving_node);
+
+ if (b->dev_state->bdev == bdev && b->dev_bytenr == dev_bytenr)
+ return b;
+ }
+
+ return NULL;
+}
+
+static void btrfsic_block_link_hashtable_init(
+ struct btrfsic_block_link_hashtable *h)
+{
+ int i;
+
+ for (i = 0; i < BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE; i++)
+ INIT_LIST_HEAD(h->table + i);
+}
+
+static void btrfsic_block_link_hashtable_add(
+ struct btrfsic_block_link *l,
+ struct btrfsic_block_link_hashtable *h)
+{
+ const unsigned int hashval =
+ (((unsigned int)(l->block_ref_to->dev_bytenr >> 16)) ^
+ ((unsigned int)(l->block_ref_from->dev_bytenr >> 16)) ^
+ ((unsigned int)((uintptr_t)l->block_ref_to->dev_state->bdev)) ^
+ ((unsigned int)((uintptr_t)l->block_ref_from->dev_state->bdev)))
+ & (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1);
+
+ BUG_ON(NULL == l->block_ref_to);
+ BUG_ON(NULL == l->block_ref_from);
+ list_add(&l->collision_resolving_node, h->table + hashval);
+}
+
+static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l)
+{
+ list_del(&l->collision_resolving_node);
+}
+
+static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup(
+ struct block_device *bdev_ref_to,
+ u64 dev_bytenr_ref_to,
+ struct block_device *bdev_ref_from,
+ u64 dev_bytenr_ref_from,
+ struct btrfsic_block_link_hashtable *h)
+{
+ const unsigned int hashval =
+ (((unsigned int)(dev_bytenr_ref_to >> 16)) ^
+ ((unsigned int)(dev_bytenr_ref_from >> 16)) ^
+ ((unsigned int)((uintptr_t)bdev_ref_to)) ^
+ ((unsigned int)((uintptr_t)bdev_ref_from))) &
+ (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1);
+ struct list_head *elem;
+
+ list_for_each(elem, h->table + hashval) {
+ struct btrfsic_block_link *const l =
+ list_entry(elem, struct btrfsic_block_link,
+ collision_resolving_node);
+
+ BUG_ON(NULL == l->block_ref_to);
+ BUG_ON(NULL == l->block_ref_from);
+ if (l->block_ref_to->dev_state->bdev == bdev_ref_to &&
+ l->block_ref_to->dev_bytenr == dev_bytenr_ref_to &&
+ l->block_ref_from->dev_state->bdev == bdev_ref_from &&
+ l->block_ref_from->dev_bytenr == dev_bytenr_ref_from)
+ return l;
+ }
+
+ return NULL;
+}
+
+static void btrfsic_dev_state_hashtable_init(
+ struct btrfsic_dev_state_hashtable *h)
+{
+ int i;
+
+ for (i = 0; i < BTRFSIC_DEV2STATE_HASHTABLE_SIZE; i++)
+ INIT_LIST_HEAD(h->table + i);
+}
+
+static void btrfsic_dev_state_hashtable_add(
+ struct btrfsic_dev_state *ds,
+ struct btrfsic_dev_state_hashtable *h)
+{
+ const unsigned int hashval =
+ (((unsigned int)((uintptr_t)ds->bdev)) &
+ (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1));
+
+ list_add(&ds->collision_resolving_node, h->table + hashval);
+}
+
+static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds)
+{
+ list_del(&ds->collision_resolving_node);
+}
+
+static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(
+ struct block_device *bdev,
+ struct btrfsic_dev_state_hashtable *h)
+{
+ const unsigned int hashval =
+ (((unsigned int)((uintptr_t)bdev)) &
+ (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1));
+ struct list_head *elem;
+
+ list_for_each(elem, h->table + hashval) {
+ struct btrfsic_dev_state *const ds =
+ list_entry(elem, struct btrfsic_dev_state,
+ collision_resolving_node);
+
+ if (ds->bdev == bdev)
+ return ds;
+ }
+
+ return NULL;
+}
+
+static int btrfsic_process_superblock(struct btrfsic_state *state,
+ struct btrfs_fs_devices *fs_devices)
+{
+ int ret = 0;
+ struct btrfs_super_block *selected_super;
+ struct list_head *dev_head = &fs_devices->devices;
+ struct btrfs_device *device;
+ struct btrfsic_dev_state *selected_dev_state = NULL;
+ int pass;
+
+ BUG_ON(NULL == state);
+ selected_super = kmalloc(sizeof(*selected_super), GFP_NOFS);
+ if (NULL == selected_super) {
+ printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
+ return -1;
+ }
+
+ list_for_each_entry(device, dev_head, dev_list) {
+ int i;
+ struct btrfsic_dev_state *dev_state;
+
+ if (!device->bdev || !device->name)
+ continue;
+
+ dev_state = btrfsic_dev_state_lookup(device->bdev);
+ BUG_ON(NULL == dev_state);
+ for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
+ ret = btrfsic_process_superblock_dev_mirror(
+ state, dev_state, device, i,
+ &selected_dev_state, selected_super);
+ if (0 != ret && 0 == i) {
+ kfree(selected_super);
+ return ret;
+ }
+ }
+ }
+
+ if (NULL == state->latest_superblock) {
+ printk(KERN_INFO "btrfsic: no superblock found!\n");
+ kfree(selected_super);
+ return -1;
+ }
+
+ state->csum_size = btrfs_super_csum_size(selected_super);
+
+ for (pass = 0; pass < 3; pass++) {
+ int num_copies;
+ int mirror_num;
+ u64 next_bytenr;
+
+ switch (pass) {
+ case 0:
+ next_bytenr = btrfs_super_root(selected_super);
+ if (state->print_mask &
+ BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
+ printk(KERN_INFO "root@%llu\n",
+ (unsigned long long)next_bytenr);
+ break;
+ case 1:
+ next_bytenr = btrfs_super_chunk_root(selected_super);
+ if (state->print_mask &
+ BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
+ printk(KERN_INFO "chunk@%llu\n",
+ (unsigned long long)next_bytenr);
+ break;
+ case 2:
+ next_bytenr = btrfs_super_log_root(selected_super);
+ if (0 == next_bytenr)
+ continue;
+ if (state->print_mask &
+ BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
+ printk(KERN_INFO "log@%llu\n",
+ (unsigned long long)next_bytenr);
+ break;
+ }
+
+ num_copies =
+ btrfs_num_copies(&state->root->fs_info->mapping_tree,
+ next_bytenr, PAGE_SIZE);
+ if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
+ printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
+ (unsigned long long)next_bytenr, num_copies);
+
+ for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
+ struct btrfsic_block *next_block;
+ struct btrfsic_block_data_ctx tmp_next_block_ctx;
+ struct btrfsic_block_link *l;
+ struct btrfs_header *hdr;
+
+ ret = btrfsic_map_block(state, next_bytenr, PAGE_SIZE,
+ &tmp_next_block_ctx,
+ mirror_num);
+ if (ret) {
+ printk(KERN_INFO "btrfsic:"
+ " btrfsic_map_block(root @%llu,"
+ " mirror %d) failed!\n",
+ (unsigned long long)next_bytenr,
+ mirror_num);
+ kfree(selected_super);
+ return -1;
+ }
+
+ next_block = btrfsic_block_hashtable_lookup(
+ tmp_next_block_ctx.dev->bdev,
+ tmp_next_block_ctx.dev_bytenr,
+ &state->block_hashtable);
+ BUG_ON(NULL == next_block);
+
+ l = btrfsic_block_link_hashtable_lookup(
+ tmp_next_block_ctx.dev->bdev,
+ tmp_next_block_ctx.dev_bytenr,
+ state->latest_superblock->dev_state->
+ bdev,
+ state->latest_superblock->dev_bytenr,
+ &state->block_link_hashtable);
+ BUG_ON(NULL == l);
+
+ ret = btrfsic_read_block(state, &tmp_next_block_ctx);
+ if (ret < (int)BTRFSIC_BLOCK_SIZE) {
+ printk(KERN_INFO
+ "btrfsic: read @logical %llu failed!\n",
+ (unsigned long long)
+ tmp_next_block_ctx.start);
+ btrfsic_release_block_ctx(&tmp_next_block_ctx);
+ kfree(selected_super);
+ return -1;
+ }
+
+ hdr = (struct btrfs_header *)tmp_next_block_ctx.data;
+ ret = btrfsic_process_metablock(state,
+ next_block,
+ &tmp_next_block_ctx,
+ hdr,
+ BTRFS_MAX_LEVEL + 3, 1);
+ btrfsic_release_block_ctx(&tmp_next_block_ctx);
+ }
+ }
+
+ kfree(selected_super);
+ return ret;
+}
+
+static int btrfsic_process_superblock_dev_mirror(
+ struct btrfsic_state *state,
+ struct btrfsic_dev_state *dev_state,
+ struct btrfs_device *device,
+ int superblock_mirror_num,
+ struct btrfsic_dev_state **selected_dev_state,
+ struct btrfs_super_block *selected_super)
+{
+ struct btrfs_super_block *super_tmp;
+ u64 dev_bytenr;
+ struct buffer_head *bh;
+ struct btrfsic_block *superblock_tmp;
+ int pass;
+ struct block_device *const superblock_bdev = device->bdev;
+
+ /* super block bytenr is always the unmapped device bytenr */
+ dev_bytenr = btrfs_sb_offset(superblock_mirror_num);
+ bh = __bread(superblock_bdev, dev_bytenr / 4096, 4096);
+ if (NULL == bh)
+ return -1;
+ super_tmp = (struct btrfs_super_block *)
+ (bh->b_data + (dev_bytenr & 4095));
+
+ if (btrfs_super_bytenr(super_tmp) != dev_bytenr ||
+ strncmp((char *)(&(super_tmp->magic)), BTRFS_MAGIC,
+ sizeof(super_tmp->magic)) ||
+ memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE)) {
+ brelse(bh);
+ return 0;
+ }
+
+ superblock_tmp =
+ btrfsic_block_hashtable_lookup(superblock_bdev,
+ dev_bytenr,
+ &state->block_hashtable);
+ if (NULL == superblock_tmp) {
+ superblock_tmp = btrfsic_block_alloc();
+ if (NULL == superblock_tmp) {
+ printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
+ brelse(bh);
+ return -1;
+ }
+ /* for superblock, only the dev_bytenr makes sense */
+ superblock_tmp->dev_bytenr = dev_bytenr;
+ superblock_tmp->dev_state = dev_state;
+ superblock_tmp->logical_bytenr = dev_bytenr;
+ superblock_tmp->generation = btrfs_super_generation(super_tmp);
+ superblock_tmp->is_metadata = 1;
+ superblock_tmp->is_superblock = 1;
+ superblock_tmp->is_iodone = 1;
+ superblock_tmp->never_written = 0;
+ superblock_tmp->mirror_num = 1 + superblock_mirror_num;
+ if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
+ printk(KERN_INFO "New initial S-block (bdev %p, %s)"
+ " @%llu (%s/%llu/%d)\n",
+ superblock_bdev, device->name,
+ (unsigned long long)dev_bytenr,
+ dev_state->name,
+ (unsigned long long)dev_bytenr,
+ superblock_mirror_num);
+ list_add(&superblock_tmp->all_blocks_node,
+ &state->all_blocks_list);
+ btrfsic_block_hashtable_add(superblock_tmp,
+ &state->block_hashtable);
+ }
+
+ /* select the one with the highest generation field */
+ if (btrfs_super_generation(super_tmp) >
+ state->max_superblock_generation ||
+ 0 == state->max_superblock_generation) {
+ memcpy(selected_super, super_tmp, sizeof(*selected_super));
+ *selected_dev_state = dev_state;
+ state->max_superblock_generation =
+ btrfs_super_generation(super_tmp);
+ state->latest_superblock = superblock_tmp;
+ }
+
+ for (pass = 0; pass < 3; pass++) {
+ u64 next_bytenr;
+ int num_copies;
+ int mirror_num;
+ const char *additional_string = NULL;
+ struct btrfs_disk_key tmp_disk_key;
+
+ tmp_disk_key.type = BTRFS_ROOT_ITEM_KEY;
+ tmp_disk_key.offset = 0;
+ switch (pass) {
+ case 0:
+ tmp_disk_key.objectid =
+ cpu_to_le64(BTRFS_ROOT_TREE_OBJECTID);
+ additional_string = "initial root ";
+ next_bytenr = btrfs_super_root(super_tmp);
+ break;
+ case 1:
+ tmp_disk_key.objectid =
+ cpu_to_le64(BTRFS_CHUNK_TREE_OBJECTID);
+ additional_string = "initial chunk ";
+ next_bytenr = btrfs_super_chunk_root(super_tmp);
+ break;
+ case 2:
+ tmp_disk_key.objectid =
+ cpu_to_le64(BTRFS_TREE_LOG_OBJECTID);
+ additional_string = "initial log ";
+ next_bytenr = btrfs_super_log_root(super_tmp);
+ if (0 == next_bytenr)
+ continue;
+ break;
+ }
+
+ num_copies =
+ btrfs_num_copies(&state->root->fs_info->mapping_tree,
+ next_bytenr, PAGE_SIZE);
+ if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
+ printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
+ (unsigned long long)next_bytenr, num_copies);
+ for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
+ struct btrfsic_block *next_block;
+ struct btrfsic_block_data_ctx tmp_next_block_ctx;
+ struct btrfsic_block_link *l;
+
+ if (btrfsic_map_block(state, next_bytenr, PAGE_SIZE,
+ &tmp_next_block_ctx,
+ mirror_num)) {
+ printk(KERN_INFO "btrfsic: btrfsic_map_block("
+ "bytenr @%llu, mirror %d) failed!\n",
+ (unsigned long long)next_bytenr,
+ mirror_num);
+ brelse(bh);
+ return -1;
+ }
+
+ next_block = btrfsic_block_lookup_or_add(
+ state, &tmp_next_block_ctx,
+ additional_string, 1, 1, 0,
+ mirror_num, NULL);
+ if (NULL == next_block) {
+ btrfsic_release_block_ctx(&tmp_next_block_ctx);
+ brelse(bh);
+ return -1;
+ }
+
+ next_block->disk_key = tmp_disk_key;
+ next_block->generation = BTRFSIC_GENERATION_UNKNOWN;
+ l = btrfsic_block_link_lookup_or_add(
+ state, &tmp_next_block_ctx,
+ next_block, superblock_tmp,
+ BTRFSIC_GENERATION_UNKNOWN);
+ btrfsic_release_block_ctx(&tmp_next_block_ctx);
+ if (NULL == l) {
+ brelse(bh);
+ return -1;
+ }
+ }
+ }
+ if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES)
+ btrfsic_dump_tree_sub(state, superblock_tmp, 0);
+
+ brelse(bh);
+ return 0;
+}
+
+static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void)
+{
+ struct btrfsic_stack_frame *sf;
+
+ sf = kzalloc(sizeof(*sf), GFP_NOFS);
+ if (NULL == sf)
+ printk(KERN_INFO "btrfsic: alloc memory failed!\n");
+ else
+ sf->magic = BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER;
+ return sf;
+}
+
+static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf)
+{
+ BUG_ON(!(NULL == sf ||
+ BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER == sf->magic));
+ kfree(sf);
+}
+
+static int btrfsic_process_metablock(
+ struct btrfsic_state *state,
+ struct btrfsic_block *const first_block,
+ struct btrfsic_block_data_ctx *const first_block_ctx,
+ struct btrfs_header *const first_hdr,
+ int first_limit_nesting, int force_iodone_flag)
+{
+ struct btrfsic_stack_frame initial_stack_frame = { 0 };
+ struct btrfsic_stack_frame *sf;
+ struct btrfsic_stack_frame *next_stack;
+
+ sf = &initial_stack_frame;
+ sf->error = 0;
+ sf->i = -1;
+ sf->limit_nesting = first_limit_nesting;
+ sf->block = first_block;
+ sf->block_ctx = first_block_ctx;
+ sf->next_block = NULL;
+ sf->hdr = first_hdr;
+ sf->prev = NULL;
+
+continue_with_new_stack_frame:
+ sf->block->generation = le64_to_cpu(sf->hdr->generation);
+ if (0 == sf->hdr->level) {
+ struct btrfs_leaf *const leafhdr =
+ (struct btrfs_leaf *)sf->hdr;
+
+ if (-1 == sf->i) {
+ sf->nr = le32_to_cpu(leafhdr->header.nritems);
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO
+ "leaf %llu items %d generation %llu"
+ " owner %llu\n",
+ (unsigned long long)
+ sf->block_ctx->start,
+ sf->nr,
+ (unsigned long long)
+ le64_to_cpu(leafhdr->header.generation),
+ (unsigned long long)
+ le64_to_cpu(leafhdr->header.owner));
+ }
+
+continue_with_current_leaf_stack_frame:
+ if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) {
+ sf->i++;
+ sf->num_copies = 0;
+ }
+
+ if (sf->i < sf->nr) {
+ struct btrfs_item *disk_item = leafhdr->items + sf->i;
+ struct btrfs_disk_key *disk_key = &disk_item->key;
+ u8 type;
+ const u32 item_offset = le32_to_cpu(disk_item->offset);
+
+ type = disk_key->type;
+
+ if (BTRFS_ROOT_ITEM_KEY == type) {
+ const struct btrfs_root_item *const root_item =
+ (struct btrfs_root_item *)
+ (sf->block_ctx->data +
+ offsetof(struct btrfs_leaf, items) +
+ item_offset);
+ const u64 next_bytenr =
+ le64_to_cpu(root_item->bytenr);
+
+ sf->error =
+ btrfsic_create_link_to_next_block(
+ state,
+ sf->block,
+ sf->block_ctx,
+ next_bytenr,
+ sf->limit_nesting,
+ &sf->next_block_ctx,
+ &sf->next_block,
+ force_iodone_flag,
+ &sf->num_copies,
+ &sf->mirror_num,
+ disk_key,
+ le64_to_cpu(root_item->
+ generation));
+ if (sf->error)
+ goto one_stack_frame_backwards;
+
+ if (NULL != sf->next_block) {
+ struct btrfs_header *const next_hdr =
+ (struct btrfs_header *)
+ sf->next_block_ctx.data;
+
+ next_stack =
+ btrfsic_stack_frame_alloc();
+ if (NULL == next_stack) {
+ btrfsic_release_block_ctx(
+ &sf->
+ next_block_ctx);
+ goto one_stack_frame_backwards;
+ }
+
+ next_stack->i = -1;
+ next_stack->block = sf->next_block;
+ next_stack->block_ctx =
+ &sf->next_block_ctx;
+ next_stack->next_block = NULL;
+ next_stack->hdr = next_hdr;
+ next_stack->limit_nesting =
+ sf->limit_nesting - 1;
+ next_stack->prev = sf;
+ sf = next_stack;
+ goto continue_with_new_stack_frame;
+ }
+ } else if (BTRFS_EXTENT_DATA_KEY == type &&
+ state->include_extent_data) {
+ sf->error = btrfsic_handle_extent_data(
+ state,
+ sf->block,
+ sf->block_ctx,
+ item_offset,
+ force_iodone_flag);
+ if (sf->error)
+ goto one_stack_frame_backwards;
+ }
+
+ goto continue_with_current_leaf_stack_frame;
+ }
+ } else {
+ struct btrfs_node *const nodehdr = (struct btrfs_node *)sf->hdr;
+
+ if (-1 == sf->i) {
+ sf->nr = le32_to_cpu(nodehdr->header.nritems);
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO "node %llu level %d items %d"
+ " generation %llu owner %llu\n",
+ (unsigned long long)
+ sf->block_ctx->start,
+ nodehdr->header.level, sf->nr,
+ (unsigned long long)
+ le64_to_cpu(nodehdr->header.generation),
+ (unsigned long long)
+ le64_to_cpu(nodehdr->header.owner));
+ }
+
+continue_with_current_node_stack_frame:
+ if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) {
+ sf->i++;
+ sf->num_copies = 0;
+ }
+
+ if (sf->i < sf->nr) {
+ struct btrfs_key_ptr *disk_key_ptr =
+ nodehdr->ptrs + sf->i;
+ const u64 next_bytenr =
+ le64_to_cpu(disk_key_ptr->blockptr);
+
+ sf->error = btrfsic_create_link_to_next_block(
+ state,
+ sf->block,
+ sf->block_ctx,
+ next_bytenr,
+ sf->limit_nesting,
+ &sf->next_block_ctx,
+ &sf->next_block,
+ force_iodone_flag,
+ &sf->num_copies,
+ &sf->mirror_num,
+ &disk_key_ptr->key,
+ le64_to_cpu(disk_key_ptr->generation));
+ if (sf->error)
+ goto one_stack_frame_backwards;
+
+ if (NULL != sf->next_block) {
+ struct btrfs_header *const next_hdr =
+ (struct btrfs_header *)
+ sf->next_block_ctx.data;
+
+ next_stack = btrfsic_stack_frame_alloc();
+ if (NULL == next_stack)
+ goto one_stack_frame_backwards;
+
+ next_stack->i = -1;
+ next_stack->block = sf->next_block;
+ next_stack->block_ctx = &sf->next_block_ctx;
+ next_stack->next_block = NULL;
+ next_stack->hdr = next_hdr;
+ next_stack->limit_nesting =
+ sf->limit_nesting - 1;
+ next_stack->prev = sf;
+ sf = next_stack;
+ goto continue_with_new_stack_frame;
+ }
+
+ goto continue_with_current_node_stack_frame;
+ }
+ }
+
+one_stack_frame_backwards:
+ if (NULL != sf->prev) {
+ struct btrfsic_stack_frame *const prev = sf->prev;
+
+ /* the one for the initial block is freed in the caller */
+ btrfsic_release_block_ctx(sf->block_ctx);
+
+ if (sf->error) {
+ prev->error = sf->error;
+ btrfsic_stack_frame_free(sf);
+ sf = prev;
+ goto one_stack_frame_backwards;
+ }
+
+ btrfsic_stack_frame_free(sf);
+ sf = prev;
+ goto continue_with_new_stack_frame;
+ } else {
+ BUG_ON(&initial_stack_frame != sf);
+ }
+
+ return sf->error;
+}
+
+static int btrfsic_create_link_to_next_block(
+ struct btrfsic_state *state,
+ struct btrfsic_block *block,
+ struct btrfsic_block_data_ctx *block_ctx,
+ u64 next_bytenr,
+ int limit_nesting,
+ struct btrfsic_block_data_ctx *next_block_ctx,
+ struct btrfsic_block **next_blockp,
+ int force_iodone_flag,
+ int *num_copiesp, int *mirror_nump,
+ struct btrfs_disk_key *disk_key,
+ u64 parent_generation)
+{
+ struct btrfsic_block *next_block = NULL;
+ int ret;
+ struct btrfsic_block_link *l;
+ int did_alloc_block_link;
+ int block_was_created;
+
+ *next_blockp = NULL;
+ if (0 == *num_copiesp) {
+ *num_copiesp =
+ btrfs_num_copies(&state->root->fs_info->mapping_tree,
+ next_bytenr, PAGE_SIZE);
+ if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
+ printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
+ (unsigned long long)next_bytenr, *num_copiesp);
+ *mirror_nump = 1;
+ }
+
+ if (*mirror_nump > *num_copiesp)
+ return 0;
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO
+ "btrfsic_create_link_to_next_block(mirror_num=%d)\n",
+ *mirror_nump);
+ ret = btrfsic_map_block(state, next_bytenr,
+ BTRFSIC_BLOCK_SIZE,
+ next_block_ctx, *mirror_nump);
+ if (ret) {
+ printk(KERN_INFO
+ "btrfsic: btrfsic_map_block(@%llu, mirror=%d) failed!\n",
+ (unsigned long long)next_bytenr, *mirror_nump);
+ btrfsic_release_block_ctx(next_block_ctx);
+ *next_blockp = NULL;
+ return -1;
+ }
+
+ next_block = btrfsic_block_lookup_or_add(state,
+ next_block_ctx, "referenced ",
+ 1, force_iodone_flag,
+ !force_iodone_flag,
+ *mirror_nump,
+ &block_was_created);
+ if (NULL == next_block) {
+ btrfsic_release_block_ctx(next_block_ctx);
+ *next_blockp = NULL;
+ return -1;
+ }
+ if (block_was_created) {
+ l = NULL;
+ next_block->generation = BTRFSIC_GENERATION_UNKNOWN;
+ } else {
+ if (next_block->logical_bytenr != next_bytenr &&
+ !(!next_block->is_metadata &&
+ 0 == next_block->logical_bytenr)) {
+ printk(KERN_INFO
+ "Referenced block @%llu (%s/%llu/%d)"
+ " found in hash table, %c,"
+ " bytenr mismatch (!= stored %llu).\n",
+ (unsigned long long)next_bytenr,
+ next_block_ctx->dev->name,
+ (unsigned long long)next_block_ctx->dev_bytenr,
+ *mirror_nump,
+ btrfsic_get_block_type(state, next_block),
+ (unsigned long long)next_block->logical_bytenr);
+ } else if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO
+ "Referenced block @%llu (%s/%llu/%d)"
+ " found in hash table, %c.\n",
+ (unsigned long long)next_bytenr,
+ next_block_ctx->dev->name,
+ (unsigned long long)next_block_ctx->dev_bytenr,
+ *mirror_nump,
+ btrfsic_get_block_type(state, next_block));
+ next_block->logical_bytenr = next_bytenr;
+
+ next_block->mirror_num = *mirror_nump;
+ l = btrfsic_block_link_hashtable_lookup(
+ next_block_ctx->dev->bdev,
+ next_block_ctx->dev_bytenr,
+ block_ctx->dev->bdev,
+ block_ctx->dev_bytenr,
+ &state->block_link_hashtable);
+ }
+
+ next_block->disk_key = *disk_key;
+ if (NULL == l) {
+ l = btrfsic_block_link_alloc();
+ if (NULL == l) {
+ printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
+ btrfsic_release_block_ctx(next_block_ctx);
+ *next_blockp = NULL;
+ return -1;
+ }
+
+ did_alloc_block_link = 1;
+ l->block_ref_to = next_block;
+ l->block_ref_from = block;
+ l->ref_cnt = 1;
+ l->parent_generation = parent_generation;
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ btrfsic_print_add_link(state, l);
+
+ list_add(&l->node_ref_to, &block->ref_to_list);
+ list_add(&l->node_ref_from, &next_block->ref_from_list);
+
+ btrfsic_block_link_hashtable_add(l,
+ &state->block_link_hashtable);
+ } else {
+ did_alloc_block_link = 0;
+ if (0 == limit_nesting) {
+ l->ref_cnt++;
+ l->parent_generation = parent_generation;
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ btrfsic_print_add_link(state, l);
+ }
+ }
+
+ if (limit_nesting > 0 && did_alloc_block_link) {
+ ret = btrfsic_read_block(state, next_block_ctx);
+ if (ret < (int)BTRFSIC_BLOCK_SIZE) {
+ printk(KERN_INFO
+ "btrfsic: read block @logical %llu failed!\n",
+ (unsigned long long)next_bytenr);
+ btrfsic_release_block_ctx(next_block_ctx);
+ *next_blockp = NULL;
+ return -1;
+ }
+
+ *next_blockp = next_block;
+ } else {
+ *next_blockp = NULL;
+ }
+ (*mirror_nump)++;
+
+ return 0;
+}
+
+static int btrfsic_handle_extent_data(
+ struct btrfsic_state *state,
+ struct btrfsic_block *block,
+ struct btrfsic_block_data_ctx *block_ctx,
+ u32 item_offset, int force_iodone_flag)
+{
+ int ret;
+ struct btrfs_file_extent_item *file_extent_item =
+ (struct btrfs_file_extent_item *)(block_ctx->data +
+ offsetof(struct btrfs_leaf,
+ items) + item_offset);
+ u64 next_bytenr =
+ le64_to_cpu(file_extent_item->disk_bytenr) +
+ le64_to_cpu(file_extent_item->offset);
+ u64 num_bytes = le64_to_cpu(file_extent_item->num_bytes);
+ u64 generation = le64_to_cpu(file_extent_item->generation);
+ struct btrfsic_block_link *l;
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
+ printk(KERN_INFO "extent_data: type %u, disk_bytenr = %llu,"
+ " offset = %llu, num_bytes = %llu\n",
+ file_extent_item->type,
+ (unsigned long long)
+ le64_to_cpu(file_extent_item->disk_bytenr),
+ (unsigned long long)
+ le64_to_cpu(file_extent_item->offset),
+ (unsigned long long)
+ le64_to_cpu(file_extent_item->num_bytes));
+ if (BTRFS_FILE_EXTENT_REG != file_extent_item->type ||
+ ((u64)0) == le64_to_cpu(file_extent_item->disk_bytenr))
+ return 0;
+ while (num_bytes > 0) {
+ u32 chunk_len;
+ int num_copies;
+ int mirror_num;
+
+ if (num_bytes > BTRFSIC_BLOCK_SIZE)
+ chunk_len = BTRFSIC_BLOCK_SIZE;
+ else
+ chunk_len = num_bytes;
+
+ num_copies =
+ btrfs_num_copies(&state->root->fs_info->mapping_tree,
+ next_bytenr, PAGE_SIZE);
+ if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
+ printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
+ (unsigned long long)next_bytenr, num_copies);
+ for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
+ struct btrfsic_block_data_ctx next_block_ctx;
+ struct btrfsic_block *next_block;
+ int block_was_created;
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO "btrfsic_handle_extent_data("
+ "mirror_num=%d)\n", mirror_num);
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
+ printk(KERN_INFO
+ "\tdisk_bytenr = %llu, num_bytes %u\n",
+ (unsigned long long)next_bytenr,
+ chunk_len);
+ ret = btrfsic_map_block(state, next_bytenr,
+ chunk_len, &next_block_ctx,
+ mirror_num);
+ if (ret) {
+ printk(KERN_INFO
+ "btrfsic: btrfsic_map_block(@%llu,"
+ " mirror=%d) failed!\n",
+ (unsigned long long)next_bytenr,
+ mirror_num);
+ return -1;
+ }
+
+ next_block = btrfsic_block_lookup_or_add(
+ state,
+ &next_block_ctx,
+ "referenced ",
+ 0,
+ force_iodone_flag,
+ !force_iodone_flag,
+ mirror_num,
+ &block_was_created);
+ if (NULL == next_block) {
+ printk(KERN_INFO
+ "btrfsic: error, kmalloc failed!\n");
+ btrfsic_release_block_ctx(&next_block_ctx);
+ return -1;
+ }
+ if (!block_was_created) {
+ if (next_block->logical_bytenr != next_bytenr &&
+ !(!next_block->is_metadata &&
+ 0 == next_block->logical_bytenr)) {
+ printk(KERN_INFO
+ "Referenced block"
+ " @%llu (%s/%llu/%d)"
+ " found in hash table, D,"
+ " bytenr mismatch"
+ " (!= stored %llu).\n",
+ (unsigned long long)next_bytenr,
+ next_block_ctx.dev->name,
+ (unsigned long long)
+ next_block_ctx.dev_bytenr,
+ mirror_num,
+ (unsigned long long)
+ next_block->logical_bytenr);
+ }
+ next_block->logical_bytenr = next_bytenr;
+ next_block->mirror_num = mirror_num;
+ }
+
+ l = btrfsic_block_link_lookup_or_add(state,
+ &next_block_ctx,
+ next_block, block,
+ generation);
+ btrfsic_release_block_ctx(&next_block_ctx);
+ if (NULL == l)
+ return -1;
+ }
+
+ next_bytenr += chunk_len;
+ num_bytes -= chunk_len;
+ }
+
+ return 0;
+}
+
+static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
+ struct btrfsic_block_data_ctx *block_ctx_out,
+ int mirror_num)
+{
+ int ret;
+ u64 length;
+ struct btrfs_bio *multi = NULL;
+ struct btrfs_device *device;
+
+ length = len;
+ ret = btrfs_map_block(&state->root->fs_info->mapping_tree, READ,
+ bytenr, &length, &multi, mirror_num);
+
+ device = multi->stripes[0].dev;
+ block_ctx_out->dev = btrfsic_dev_state_lookup(device->bdev);
+ block_ctx_out->dev_bytenr = multi->stripes[0].physical;
+ block_ctx_out->start = bytenr;
+ block_ctx_out->len = len;
+ block_ctx_out->data = NULL;
+ block_ctx_out->bh = NULL;
+
+ if (0 == ret)
+ kfree(multi);
+ if (NULL == block_ctx_out->dev) {
+ ret = -ENXIO;
+ printk(KERN_INFO "btrfsic: error, cannot lookup dev (#1)!\n");
+ }
+
+ return ret;
+}
+
+static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr,
+ u32 len, struct block_device *bdev,
+ struct btrfsic_block_data_ctx *block_ctx_out)
+{
+ block_ctx_out->dev = btrfsic_dev_state_lookup(bdev);
+ block_ctx_out->dev_bytenr = bytenr;
+ block_ctx_out->start = bytenr;
+ block_ctx_out->len = len;
+ block_ctx_out->data = NULL;
+ block_ctx_out->bh = NULL;
+ if (NULL != block_ctx_out->dev) {
+ return 0;
+ } else {
+ printk(KERN_INFO "btrfsic: error, cannot lookup dev (#2)!\n");
+ return -ENXIO;
+ }
+}
+
+static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx)
+{
+ if (NULL != block_ctx->bh) {
+ brelse(block_ctx->bh);
+ block_ctx->bh = NULL;
+ }
+}
+
+static int btrfsic_read_block(struct btrfsic_state *state,
+ struct btrfsic_block_data_ctx *block_ctx)
+{
+ block_ctx->bh = NULL;
+ if (block_ctx->dev_bytenr & 4095) {
+ printk(KERN_INFO
+ "btrfsic: read_block() with unaligned bytenr %llu\n",
+ (unsigned long long)block_ctx->dev_bytenr);
+ return -1;
+ }
+ if (block_ctx->len > 4096) {
+ printk(KERN_INFO
+ "btrfsic: read_block() with too huge size %d\n",
+ block_ctx->len);
+ return -1;
+ }
+
+ block_ctx->bh = __bread(block_ctx->dev->bdev,
+ block_ctx->dev_bytenr >> 12, 4096);
+ if (NULL == block_ctx->bh)
+ return -1;
+ block_ctx->data = block_ctx->bh->b_data;
+
+ return block_ctx->len;
+}
+
+static void btrfsic_dump_database(struct btrfsic_state *state)
+{
+ struct list_head *elem_all;
+
+ BUG_ON(NULL == state);
+
+ printk(KERN_INFO "all_blocks_list:\n");
+ list_for_each(elem_all, &state->all_blocks_list) {
+ const struct btrfsic_block *const b_all =
+ list_entry(elem_all, struct btrfsic_block,
+ all_blocks_node);
+ struct list_head *elem_ref_to;
+ struct list_head *elem_ref_from;
+
+ printk(KERN_INFO "%c-block @%llu (%s/%llu/%d)\n",
+ btrfsic_get_block_type(state, b_all),
+ (unsigned long long)b_all->logical_bytenr,
+ b_all->dev_state->name,
+ (unsigned long long)b_all->dev_bytenr,
+ b_all->mirror_num);
+
+ list_for_each(elem_ref_to, &b_all->ref_to_list) {
+ const struct btrfsic_block_link *const l =
+ list_entry(elem_ref_to,
+ struct btrfsic_block_link,
+ node_ref_to);
+
+ printk(KERN_INFO " %c @%llu (%s/%llu/%d)"
+ " refers %u* to"
+ " %c @%llu (%s/%llu/%d)\n",
+ btrfsic_get_block_type(state, b_all),
+ (unsigned long long)b_all->logical_bytenr,
+ b_all->dev_state->name,
+ (unsigned long long)b_all->dev_bytenr,
+ b_all->mirror_num,
+ l->ref_cnt,
+ btrfsic_get_block_type(state, l->block_ref_to),
+ (unsigned long long)
+ l->block_ref_to->logical_bytenr,
+ l->block_ref_to->dev_state->name,
+ (unsigned long long)l->block_ref_to->dev_bytenr,
+ l->block_ref_to->mirror_num);
+ }
+
+ list_for_each(elem_ref_from, &b_all->ref_from_list) {
+ const struct btrfsic_block_link *const l =
+ list_entry(elem_ref_from,
+ struct btrfsic_block_link,
+ node_ref_from);
+
+ printk(KERN_INFO " %c @%llu (%s/%llu/%d)"
+ " is ref %u* from"
+ " %c @%llu (%s/%llu/%d)\n",
+ btrfsic_get_block_type(state, b_all),
+ (unsigned long long)b_all->logical_bytenr,
+ b_all->dev_state->name,
+ (unsigned long long)b_all->dev_bytenr,
+ b_all->mirror_num,
+ l->ref_cnt,
+ btrfsic_get_block_type(state, l->block_ref_from),
+ (unsigned long long)
+ l->block_ref_from->logical_bytenr,
+ l->block_ref_from->dev_state->name,
+ (unsigned long long)
+ l->block_ref_from->dev_bytenr,
+ l->block_ref_from->mirror_num);
+ }
+
+ printk(KERN_INFO "\n");
+ }
+}
+
+/*
+ * Test whether the disk block contains a tree block (leaf or node)
+ * (note that this test fails for the super block)
+ */
+static int btrfsic_test_for_metadata(struct btrfsic_state *state,
+ const u8 *data, unsigned int size)
+{
+ struct btrfs_header *h;
+ u8 csum[BTRFS_CSUM_SIZE];
+ u32 crc = ~(u32)0;
+ int fail = 0;
+ int crc_fail = 0;
+
+ h = (struct btrfs_header *)data;
+
+ if (memcmp(h->fsid, state->root->fs_info->fsid, BTRFS_UUID_SIZE))
+ fail++;
+
+ crc = crc32c(crc, data + BTRFS_CSUM_SIZE, PAGE_SIZE - BTRFS_CSUM_SIZE);
+ btrfs_csum_final(crc, csum);
+ if (memcmp(csum, h->csum, state->csum_size))
+ crc_fail++;
+
+ return fail || crc_fail;
+}
+
+static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
+ u64 dev_bytenr,
+ u8 *mapped_data, unsigned int len,
+ struct bio *bio,
+ int *bio_is_patched,
+ struct buffer_head *bh,
+ int submit_bio_bh_rw)
+{
+ int is_metadata;
+ struct btrfsic_block *block;
+ struct btrfsic_block_data_ctx block_ctx;
+ int ret;
+ struct btrfsic_state *state = dev_state->state;
+ struct block_device *bdev = dev_state->bdev;
+
+ WARN_ON(len > PAGE_SIZE);
+ is_metadata = (0 == btrfsic_test_for_metadata(state, mapped_data, len));
+ if (NULL != bio_is_patched)
+ *bio_is_patched = 0;
+
+ block = btrfsic_block_hashtable_lookup(bdev, dev_bytenr,
+ &state->block_hashtable);
+ if (NULL != block) {
+ u64 bytenr = 0;
+ struct list_head *elem_ref_to;
+ struct list_head *tmp_ref_to;
+
+ if (block->is_superblock) {
+ bytenr = le64_to_cpu(((struct btrfs_super_block *)
+ mapped_data)->bytenr);
+ is_metadata = 1;
+ if (state->print_mask &
+ BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE) {
+ printk(KERN_INFO
+ "[before new superblock is written]:\n");
+ btrfsic_dump_tree_sub(state, block, 0);
+ }
+ }
+ if (is_metadata) {
+ if (!block->is_superblock) {
+ bytenr = le64_to_cpu(((struct btrfs_header *)
+ mapped_data)->bytenr);
+ btrfsic_cmp_log_and_dev_bytenr(state, bytenr,
+ dev_state,
+ dev_bytenr,
+ mapped_data);
+ }
+ if (block->logical_bytenr != bytenr) {
+ printk(KERN_INFO
+ "Written block @%llu (%s/%llu/%d)"
+ " found in hash table, %c,"
+ " bytenr mismatch"
+ " (!= stored %llu).\n",
+ (unsigned long long)bytenr,
+ dev_state->name,
+ (unsigned long long)dev_bytenr,
+ block->mirror_num,
+ btrfsic_get_block_type(state, block),
+ (unsigned long long)
+ block->logical_bytenr);
+ block->logical_bytenr = bytenr;
+ } else if (state->print_mask &
+ BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO
+ "Written block @%llu (%s/%llu/%d)"
+ " found in hash table, %c.\n",
+ (unsigned long long)bytenr,
+ dev_state->name,
+ (unsigned long long)dev_bytenr,
+ block->mirror_num,
+ btrfsic_get_block_type(state, block));
+ } else {
+ bytenr = block->logical_bytenr;
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO
+ "Written block @%llu (%s/%llu/%d)"
+ " found in hash table, %c.\n",
+ (unsigned long long)bytenr,
+ dev_state->name,
+ (unsigned long long)dev_bytenr,
+ block->mirror_num,
+ btrfsic_get_block_type(state, block));
+ }
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO
+ "ref_to_list: %cE, ref_from_list: %cE\n",
+ list_empty(&block->ref_to_list) ? ' ' : '!',
+ list_empty(&block->ref_from_list) ? ' ' : '!');
+ if (btrfsic_is_block_ref_by_superblock(state, block, 0)) {
+ printk(KERN_INFO "btrfs: attempt to overwrite %c-block"
+ " @%llu (%s/%llu/%d), old(gen=%llu,"
+ " objectid=%llu, type=%d, offset=%llu),"
+ " new(gen=%llu),"
+ " which is referenced by most recent superblock"
+ " (superblockgen=%llu)!\n",
+ btrfsic_get_block_type(state, block),
+ (unsigned long long)bytenr,
+ dev_state->name,
+ (unsigned long long)dev_bytenr,
+ block->mirror_num,
+ (unsigned long long)block->generation,
+ (unsigned long long)
+ le64_to_cpu(block->disk_key.objectid),
+ block->disk_key.type,
+ (unsigned long long)
+ le64_to_cpu(block->disk_key.offset),
+ (unsigned long long)
+ le64_to_cpu(((struct btrfs_header *)
+ mapped_data)->generation),
+ (unsigned long long)
+ state->max_superblock_generation);
+ btrfsic_dump_tree(state);
+ }
+
+ if (!block->is_iodone && !block->never_written) {
+ printk(KERN_INFO "btrfs: attempt to overwrite %c-block"
+ " @%llu (%s/%llu/%d), oldgen=%llu, newgen=%llu,"
+ " which is not yet iodone!\n",
+ btrfsic_get_block_type(state, block),
+ (unsigned long long)bytenr,
+ dev_state->name,
+ (unsigned long long)dev_bytenr,
+ block->mirror_num,
+ (unsigned long long)block->generation,
+ (unsigned long long)
+ le64_to_cpu(((struct btrfs_header *)
+ mapped_data)->generation));
+ /* it would not be safe to go on */
+ btrfsic_dump_tree(state);
+ return;
+ }
+
+ /*
+ * Clear all references of this block. Do not free
+ * the block itself even if is not referenced anymore
+ * because it still carries valueable information
+ * like whether it was ever written and IO completed.
+ */
+ list_for_each_safe(elem_ref_to, tmp_ref_to,
+ &block->ref_to_list) {
+ struct btrfsic_block_link *const l =
+ list_entry(elem_ref_to,
+ struct btrfsic_block_link,
+ node_ref_to);
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ btrfsic_print_rem_link(state, l);
+ l->ref_cnt--;
+ if (0 == l->ref_cnt) {
+ list_del(&l->node_ref_to);
+ list_del(&l->node_ref_from);
+ btrfsic_block_link_hashtable_remove(l);
+ btrfsic_block_link_free(l);
+ }
+ }
+
+ if (block->is_superblock)
+ ret = btrfsic_map_superblock(state, bytenr, len,
+ bdev, &block_ctx);
+ else
+ ret = btrfsic_map_block(state, bytenr, len,
+ &block_ctx, 0);
+ if (ret) {
+ printk(KERN_INFO
+ "btrfsic: btrfsic_map_block(root @%llu)"
+ " failed!\n", (unsigned long long)bytenr);
+ return;
+ }
+ block_ctx.data = mapped_data;
+ /* the following is required in case of writes to mirrors,
+ * use the same that was used for the lookup */
+ block_ctx.dev = dev_state;
+ block_ctx.dev_bytenr = dev_bytenr;
+
+ if (is_metadata || state->include_extent_data) {
+ block->never_written = 0;
+ block->iodone_w_error = 0;
+ if (NULL != bio) {
+ block->is_iodone = 0;
+ BUG_ON(NULL == bio_is_patched);
+ if (!*bio_is_patched) {
+ block->orig_bio_bh_private =
+ bio->bi_private;
+ block->orig_bio_bh_end_io.bio =
+ bio->bi_end_io;
+ block->next_in_same_bio = NULL;
+ bio->bi_private = block;
+ bio->bi_end_io = btrfsic_bio_end_io;
+ *bio_is_patched = 1;
+ } else {
+ struct btrfsic_block *chained_block =
+ (struct btrfsic_block *)
+ bio->bi_private;
+
+ BUG_ON(NULL == chained_block);
+ block->orig_bio_bh_private =
+ chained_block->orig_bio_bh_private;
+ block->orig_bio_bh_end_io.bio =
+ chained_block->orig_bio_bh_end_io.
+ bio;
+ block->next_in_same_bio = chained_block;
+ bio->bi_private = block;
+ }
+ } else if (NULL != bh) {
+ block->is_iodone = 0;
+ block->orig_bio_bh_private = bh->b_private;
+ block->orig_bio_bh_end_io.bh = bh->b_end_io;
+ block->next_in_same_bio = NULL;
+ bh->b_private = block;
+ bh->b_end_io = btrfsic_bh_end_io;
+ } else {
+ block->is_iodone = 1;
+ block->orig_bio_bh_private = NULL;
+ block->orig_bio_bh_end_io.bio = NULL;
+ block->next_in_same_bio = NULL;
+ }
+ }
+
+ block->flush_gen = dev_state->last_flush_gen + 1;
+ block->submit_bio_bh_rw = submit_bio_bh_rw;
+ if (is_metadata) {
+ block->logical_bytenr = bytenr;
+ block->is_metadata = 1;
+ if (block->is_superblock) {
+ ret = btrfsic_process_written_superblock(
+ state,
+ block,
+ (struct btrfs_super_block *)
+ mapped_data);
+ if (state->print_mask &
+ BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE) {
+ printk(KERN_INFO
+ "[after new superblock is written]:\n");
+ btrfsic_dump_tree_sub(state, block, 0);
+ }
+ } else {
+ block->mirror_num = 0; /* unknown */
+ ret = btrfsic_process_metablock(
+ state,
+ block,
+ &block_ctx,
+ (struct btrfs_header *)
+ block_ctx.data,
+ 0, 0);
+ }
+ if (ret)
+ printk(KERN_INFO
+ "btrfsic: btrfsic_process_metablock"
+ "(root @%llu) failed!\n",
+ (unsigned long long)dev_bytenr);
+ } else {
+ block->is_metadata = 0;
+ block->mirror_num = 0; /* unknown */
+ block->generation = BTRFSIC_GENERATION_UNKNOWN;
+ if (!state->include_extent_data
+ && list_empty(&block->ref_from_list)) {
+ /*
+ * disk block is overwritten with extent
+ * data (not meta data) and we are configured
+ * to not include extent data: take the
+ * chance and free the block's memory
+ */
+ btrfsic_block_hashtable_remove(block);
+ list_del(&block->all_blocks_node);
+ btrfsic_block_free(block);
+ }
+ }
+ btrfsic_release_block_ctx(&block_ctx);
+ } else {
+ /* block has not been found in hash table */
+ u64 bytenr;
+
+ if (!is_metadata) {
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO "Written block (%s/%llu/?)"
+ " !found in hash table, D.\n",
+ dev_state->name,
+ (unsigned long long)dev_bytenr);
+ if (!state->include_extent_data)
+ return; /* ignore that written D block */
+
+ /* this is getting ugly for the
+ * include_extent_data case... */
+ bytenr = 0; /* unknown */
+ block_ctx.start = bytenr;
+ block_ctx.len = len;
+ block_ctx.bh = NULL;
+ } else {
+ bytenr = le64_to_cpu(((struct btrfs_header *)
+ mapped_data)->bytenr);
+ btrfsic_cmp_log_and_dev_bytenr(state, bytenr, dev_state,
+ dev_bytenr,
+ mapped_data);
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO
+ "Written block @%llu (%s/%llu/?)"
+ " !found in hash table, M.\n",
+ (unsigned long long)bytenr,
+ dev_state->name,
+ (unsigned long long)dev_bytenr);
+
+ ret = btrfsic_map_block(state, bytenr, len, &block_ctx,
+ 0);
+ if (ret) {
+ printk(KERN_INFO
+ "btrfsic: btrfsic_map_block(root @%llu)"
+ " failed!\n",
+ (unsigned long long)dev_bytenr);
+ return;
+ }
+ }
+ block_ctx.data = mapped_data;
+ /* the following is required in case of writes to mirrors,
+ * use the same that was used for the lookup */
+ block_ctx.dev = dev_state;
+ block_ctx.dev_bytenr = dev_bytenr;
+
+ block = btrfsic_block_alloc();
+ if (NULL == block) {
+ printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
+ btrfsic_release_block_ctx(&block_ctx);
+ return;
+ }
+ block->dev_state = dev_state;
+ block->dev_bytenr = dev_bytenr;
+ block->logical_bytenr = bytenr;
+ block->is_metadata = is_metadata;
+ block->never_written = 0;
+ block->iodone_w_error = 0;
+ block->mirror_num = 0; /* unknown */
+ block->flush_gen = dev_state->last_flush_gen + 1;
+ block->submit_bio_bh_rw = submit_bio_bh_rw;
+ if (NULL != bio) {
+ block->is_iodone = 0;
+ BUG_ON(NULL == bio_is_patched);
+ if (!*bio_is_patched) {
+ block->orig_bio_bh_private = bio->bi_private;
+ block->orig_bio_bh_end_io.bio = bio->bi_end_io;
+ block->next_in_same_bio = NULL;
+ bio->bi_private = block;
+ bio->bi_end_io = btrfsic_bio_end_io;
+ *bio_is_patched = 1;
+ } else {
+ struct btrfsic_block *chained_block =
+ (struct btrfsic_block *)
+ bio->bi_private;
+
+ BUG_ON(NULL == chained_block);
+ block->orig_bio_bh_private =
+ chained_block->orig_bio_bh_private;
+ block->orig_bio_bh_end_io.bio =
+ chained_block->orig_bio_bh_end_io.bio;
+ block->next_in_same_bio = chained_block;
+ bio->bi_private = block;
+ }
+ } else if (NULL != bh) {
+ block->is_iodone = 0;
+ block->orig_bio_bh_private = bh->b_private;
+ block->orig_bio_bh_end_io.bh = bh->b_end_io;
+ block->next_in_same_bio = NULL;
+ bh->b_private = block;
+ bh->b_end_io = btrfsic_bh_end_io;
+ } else {
+ block->is_iodone = 1;
+ block->orig_bio_bh_private = NULL;
+ block->orig_bio_bh_end_io.bio = NULL;
+ block->next_in_same_bio = NULL;
+ }
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO
+ "New written %c-block @%llu (%s/%llu/%d)\n",
+ is_metadata ? 'M' : 'D',
+ (unsigned long long)block->logical_bytenr,
+ block->dev_state->name,
+ (unsigned long long)block->dev_bytenr,
+ block->mirror_num);
+ list_add(&block->all_blocks_node, &state->all_blocks_list);
+ btrfsic_block_hashtable_add(block, &state->block_hashtable);
+
+ if (is_metadata) {
+ ret = btrfsic_process_metablock(state, block,
+ &block_ctx,
+ (struct btrfs_header *)
+ block_ctx.data, 0, 0);
+ if (ret)
+ printk(KERN_INFO
+ "btrfsic: process_metablock(root @%llu)"
+ " failed!\n",
+ (unsigned long long)dev_bytenr);
+ }
+ btrfsic_release_block_ctx(&block_ctx);
+ }
+}
+
+static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status)
+{
+ struct btrfsic_block *block = (struct btrfsic_block *)bp->bi_private;
+ int iodone_w_error;
+
+ /* mutex is not held! This is not save if IO is not yet completed
+ * on umount */
+ iodone_w_error = 0;
+ if (bio_error_status)
+ iodone_w_error = 1;
+
+ BUG_ON(NULL == block);
+ bp->bi_private = block->orig_bio_bh_private;
+ bp->bi_end_io = block->orig_bio_bh_end_io.bio;
+
+ do {
+ struct btrfsic_block *next_block;
+ struct btrfsic_dev_state *const dev_state = block->dev_state;
+
+ if ((dev_state->state->print_mask &
+ BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
+ printk(KERN_INFO
+ "bio_end_io(err=%d) for %c @%llu (%s/%llu/%d)\n",
+ bio_error_status,
+ btrfsic_get_block_type(dev_state->state, block),
+ (unsigned long long)block->logical_bytenr,
+ dev_state->name,
+ (unsigned long long)block->dev_bytenr,
+ block->mirror_num);
+ next_block = block->next_in_same_bio;
+ block->iodone_w_error = iodone_w_error;
+ if (block->submit_bio_bh_rw & REQ_FLUSH) {
+ dev_state->last_flush_gen++;
+ if ((dev_state->state->print_mask &
+ BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
+ printk(KERN_INFO
+ "bio_end_io() new %s flush_gen=%llu\n",
+ dev_state->name,
+ (unsigned long long)
+ dev_state->last_flush_gen);
+ }
+ if (block->submit_bio_bh_rw & REQ_FUA)
+ block->flush_gen = 0; /* FUA completed means block is
+ * on disk */
+ block->is_iodone = 1; /* for FLUSH, this releases the block */
+ block = next_block;
+ } while (NULL != block);
+
+ bp->bi_end_io(bp, bio_error_status);
+}
+
+static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate)
+{
+ struct btrfsic_block *block = (struct btrfsic_block *)bh->b_private;
+ int iodone_w_error = !uptodate;
+ struct btrfsic_dev_state *dev_state;
+
+ BUG_ON(NULL == block);
+ dev_state = block->dev_state;
+ if ((dev_state->state->print_mask & BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
+ printk(KERN_INFO
+ "bh_end_io(error=%d) for %c @%llu (%s/%llu/%d)\n",
+ iodone_w_error,
+ btrfsic_get_block_type(dev_state->state, block),
+ (unsigned long long)block->logical_bytenr,
+ block->dev_state->name,
+ (unsigned long long)block->dev_bytenr,
+ block->mirror_num);
+
+ block->iodone_w_error = iodone_w_error;
+ if (block->submit_bio_bh_rw & REQ_FLUSH) {
+ dev_state->last_flush_gen++;
+ if ((dev_state->state->print_mask &
+ BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
+ printk(KERN_INFO
+ "bh_end_io() new %s flush_gen=%llu\n",
+ dev_state->name,
+ (unsigned long long)dev_state->last_flush_gen);
+ }
+ if (block->submit_bio_bh_rw & REQ_FUA)
+ block->flush_gen = 0; /* FUA completed means block is on disk */
+
+ bh->b_private = block->orig_bio_bh_private;
+ bh->b_end_io = block->orig_bio_bh_end_io.bh;
+ block->is_iodone = 1; /* for FLUSH, this releases the block */
+ bh->b_end_io(bh, uptodate);
+}
+
+static int btrfsic_process_written_superblock(
+ struct btrfsic_state *state,
+ struct btrfsic_block *const superblock,
+ struct btrfs_super_block *const super_hdr)
+{
+ int pass;
+
+ superblock->generation = btrfs_super_generation(super_hdr);
+ if (!(superblock->generation > state->max_superblock_generation ||
+ 0 == state->max_superblock_generation)) {
+ if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
+ printk(KERN_INFO
+ "btrfsic: superblock @%llu (%s/%llu/%d)"
+ " with old gen %llu <= %llu\n",
+ (unsigned long long)superblock->logical_bytenr,
+ superblock->dev_state->name,
+ (unsigned long long)superblock->dev_bytenr,
+ superblock->mirror_num,
+ (unsigned long long)
+ btrfs_super_generation(super_hdr),
+ (unsigned long long)
+ state->max_superblock_generation);
+ } else {
+ if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
+ printk(KERN_INFO
+ "btrfsic: got new superblock @%llu (%s/%llu/%d)"
+ " with new gen %llu > %llu\n",
+ (unsigned long long)superblock->logical_bytenr,
+ superblock->dev_state->name,
+ (unsigned long long)superblock->dev_bytenr,
+ superblock->mirror_num,
+ (unsigned long long)
+ btrfs_super_generation(super_hdr),
+ (unsigned long long)
+ state->max_superblock_generation);
+
+ state->max_superblock_generation =
+ btrfs_super_generation(super_hdr);
+ state->latest_superblock = superblock;
+ }
+
+ for (pass = 0; pass < 3; pass++) {
+ int ret;
+ u64 next_bytenr;
+ struct btrfsic_block *next_block;
+ struct btrfsic_block_data_ctx tmp_next_block_ctx;
+ struct btrfsic_block_link *l;
+ int num_copies;
+ int mirror_num;
+ const char *additional_string = NULL;
+ struct btrfs_disk_key tmp_disk_key;
+
+ tmp_disk_key.type = BTRFS_ROOT_ITEM_KEY;
+ tmp_disk_key.offset = 0;
+
+ switch (pass) {
+ case 0:
+ tmp_disk_key.objectid =
+ cpu_to_le64(BTRFS_ROOT_TREE_OBJECTID);
+ additional_string = "root ";
+ next_bytenr = btrfs_super_root(super_hdr);
+ if (state->print_mask &
+ BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
+ printk(KERN_INFO "root@%llu\n",
+ (unsigned long long)next_bytenr);
+ break;
+ case 1:
+ tmp_disk_key.objectid =
+ cpu_to_le64(BTRFS_CHUNK_TREE_OBJECTID);
+ additional_string = "chunk ";
+ next_bytenr = btrfs_super_chunk_root(super_hdr);
+ if (state->print_mask &
+ BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
+ printk(KERN_INFO "chunk@%llu\n",
+ (unsigned long long)next_bytenr);
+ break;
+ case 2:
+ tmp_disk_key.objectid =
+ cpu_to_le64(BTRFS_TREE_LOG_OBJECTID);
+ additional_string = "log ";
+ next_bytenr = btrfs_super_log_root(super_hdr);
+ if (0 == next_bytenr)
+ continue;
+ if (state->print_mask &
+ BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
+ printk(KERN_INFO "log@%llu\n",
+ (unsigned long long)next_bytenr);
+ break;
+ }
+
+ num_copies =
+ btrfs_num_copies(&state->root->fs_info->mapping_tree,
+ next_bytenr, PAGE_SIZE);
+ if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
+ printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
+ (unsigned long long)next_bytenr, num_copies);
+ for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
+ int was_created;
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO
+ "btrfsic_process_written_superblock("
+ "mirror_num=%d)\n", mirror_num);
+ ret = btrfsic_map_block(state, next_bytenr, PAGE_SIZE,
+ &tmp_next_block_ctx,
+ mirror_num);
+ if (ret) {
+ printk(KERN_INFO
+ "btrfsic: btrfsic_map_block(@%llu,"
+ " mirror=%d) failed!\n",
+ (unsigned long long)next_bytenr,
+ mirror_num);
+ return -1;
+ }
+
+ next_block = btrfsic_block_lookup_or_add(
+ state,
+ &tmp_next_block_ctx,
+ additional_string,
+ 1, 0, 1,
+ mirror_num,
+ &was_created);
+ if (NULL == next_block) {
+ printk(KERN_INFO
+ "btrfsic: error, kmalloc failed!\n");
+ btrfsic_release_block_ctx(&tmp_next_block_ctx);
+ return -1;
+ }
+
+ next_block->disk_key = tmp_disk_key;
+ if (was_created)
+ next_block->generation =
+ BTRFSIC_GENERATION_UNKNOWN;
+ l = btrfsic_block_link_lookup_or_add(
+ state,
+ &tmp_next_block_ctx,
+ next_block,
+ superblock,
+ BTRFSIC_GENERATION_UNKNOWN);
+ btrfsic_release_block_ctx(&tmp_next_block_ctx);
+ if (NULL == l)
+ return -1;
+ }
+ }
+
+ if (-1 == btrfsic_check_all_ref_blocks(state, superblock, 0)) {
+ WARN_ON(1);
+ btrfsic_dump_tree(state);
+ }
+
+ return 0;
+}
+
+static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
+ struct btrfsic_block *const block,
+ int recursion_level)
+{
+ struct list_head *elem_ref_to;
+ int ret = 0;
+
+ if (recursion_level >= 3 + BTRFS_MAX_LEVEL) {
+ /*
+ * Note that this situation can happen and does not
+ * indicate an error in regular cases. It happens
+ * when disk blocks are freed and later reused.
+ * The check-integrity module is not aware of any
+ * block free operations, it just recognizes block
+ * write operations. Therefore it keeps the linkage
+ * information for a block until a block is
+ * rewritten. This can temporarily cause incorrect
+ * and even circular linkage informations. This
+ * causes no harm unless such blocks are referenced
+ * by the most recent super block.
+ */
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO
+ "btrfsic: abort cyclic linkage (case 1).\n");
+
+ return ret;
+ }
+
+ /*
+ * This algorithm is recursive because the amount of used stack
+ * space is very small and the max recursion depth is limited.
+ */
+ list_for_each(elem_ref_to, &block->ref_to_list) {
+ const struct btrfsic_block_link *const l =
+ list_entry(elem_ref_to, struct btrfsic_block_link,
+ node_ref_to);
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO
+ "rl=%d, %c @%llu (%s/%llu/%d)"
+ " %u* refers to %c @%llu (%s/%llu/%d)\n",
+ recursion_level,
+ btrfsic_get_block_type(state, block),
+ (unsigned long long)block->logical_bytenr,
+ block->dev_state->name,
+ (unsigned long long)block->dev_bytenr,
+ block->mirror_num,
+ l->ref_cnt,
+ btrfsic_get_block_type(state, l->block_ref_to),
+ (unsigned long long)
+ l->block_ref_to->logical_bytenr,
+ l->block_ref_to->dev_state->name,
+ (unsigned long long)l->block_ref_to->dev_bytenr,
+ l->block_ref_to->mirror_num);
+ if (l->block_ref_to->never_written) {
+ printk(KERN_INFO "btrfs: attempt to write superblock"
+ " which references block %c @%llu (%s/%llu/%d)"
+ " which is never written!\n",
+ btrfsic_get_block_type(state, l->block_ref_to),
+ (unsigned long long)
+ l->block_ref_to->logical_bytenr,
+ l->block_ref_to->dev_state->name,
+ (unsigned long long)l->block_ref_to->dev_bytenr,
+ l->block_ref_to->mirror_num);
+ ret = -1;
+ } else if (!l->block_ref_to->is_iodone) {
+ printk(KERN_INFO "btrfs: attempt to write superblock"
+ " which references block %c @%llu (%s/%llu/%d)"
+ " which is not yet iodone!\n",
+ btrfsic_get_block_type(state, l->block_ref_to),
+ (unsigned long long)
+ l->block_ref_to->logical_bytenr,
+ l->block_ref_to->dev_state->name,
+ (unsigned long long)l->block_ref_to->dev_bytenr,
+ l->block_ref_to->mirror_num);
+ ret = -1;
+ } else if (l->parent_generation !=
+ l->block_ref_to->generation &&
+ BTRFSIC_GENERATION_UNKNOWN !=
+ l->parent_generation &&
+ BTRFSIC_GENERATION_UNKNOWN !=
+ l->block_ref_to->generation) {
+ printk(KERN_INFO "btrfs: attempt to write superblock"
+ " which references block %c @%llu (%s/%llu/%d)"
+ " with generation %llu !="
+ " parent generation %llu!\n",
+ btrfsic_get_block_type(state, l->block_ref_to),
+ (unsigned long long)
+ l->block_ref_to->logical_bytenr,
+ l->block_ref_to->dev_state->name,
+ (unsigned long long)l->block_ref_to->dev_bytenr,
+ l->block_ref_to->mirror_num,
+ (unsigned long long)l->block_ref_to->generation,
+ (unsigned long long)l->parent_generation);
+ ret = -1;
+ } else if (l->block_ref_to->flush_gen >
+ l->block_ref_to->dev_state->last_flush_gen) {
+ printk(KERN_INFO "btrfs: attempt to write superblock"
+ " which references block %c @%llu (%s/%llu/%d)"
+ " which is not flushed out of disk's write cache"
+ " (block flush_gen=%llu,"
+ " dev->flush_gen=%llu)!\n",
+ btrfsic_get_block_type(state, l->block_ref_to),
+ (unsigned long long)
+ l->block_ref_to->logical_bytenr,
+ l->block_ref_to->dev_state->name,
+ (unsigned long long)l->block_ref_to->dev_bytenr,
+ l->block_ref_to->mirror_num,
+ (unsigned long long)block->flush_gen,
+ (unsigned long long)
+ l->block_ref_to->dev_state->last_flush_gen);
+ ret = -1;
+ } else if (-1 == btrfsic_check_all_ref_blocks(state,
+ l->block_ref_to,
+ recursion_level +
+ 1)) {
+ ret = -1;
+ }
+ }
+
+ return ret;
+}
+
+static int btrfsic_is_block_ref_by_superblock(
+ const struct btrfsic_state *state,
+ const struct btrfsic_block *block,
+ int recursion_level)
+{
+ struct list_head *elem_ref_from;
+
+ if (recursion_level >= 3 + BTRFS_MAX_LEVEL) {
+ /* refer to comment at "abort cyclic linkage (case 1)" */
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO
+ "btrfsic: abort cyclic linkage (case 2).\n");
+
+ return 0;
+ }
+
+ /*
+ * This algorithm is recursive because the amount of used stack space
+ * is very small and the max recursion depth is limited.
+ */
+ list_for_each(elem_ref_from, &block->ref_from_list) {
+ const struct btrfsic_block_link *const l =
+ list_entry(elem_ref_from, struct btrfsic_block_link,
+ node_ref_from);
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO
+ "rl=%d, %c @%llu (%s/%llu/%d)"
+ " is ref %u* from %c @%llu (%s/%llu/%d)\n",
+ recursion_level,
+ btrfsic_get_block_type(state, block),
+ (unsigned long long)block->logical_bytenr,
+ block->dev_state->name,
+ (unsigned long long)block->dev_bytenr,
+ block->mirror_num,
+ l->ref_cnt,
+ btrfsic_get_block_type(state, l->block_ref_from),
+ (unsigned long long)
+ l->block_ref_from->logical_bytenr,
+ l->block_ref_from->dev_state->name,
+ (unsigned long long)
+ l->block_ref_from->dev_bytenr,
+ l->block_ref_from->mirror_num);
+ if (l->block_ref_from->is_superblock &&
+ state->latest_superblock->dev_bytenr ==
+ l->block_ref_from->dev_bytenr &&
+ state->latest_superblock->dev_state->bdev ==
+ l->block_ref_from->dev_state->bdev)
+ return 1;
+ else if (btrfsic_is_block_ref_by_superblock(state,
+ l->block_ref_from,
+ recursion_level +
+ 1))
+ return 1;
+ }
+
+ return 0;
+}
+
+static void btrfsic_print_add_link(const struct btrfsic_state *state,
+ const struct btrfsic_block_link *l)
+{
+ printk(KERN_INFO
+ "Add %u* link from %c @%llu (%s/%llu/%d)"
+ " to %c @%llu (%s/%llu/%d).\n",
+ l->ref_cnt,
+ btrfsic_get_block_type(state, l->block_ref_from),
+ (unsigned long long)l->block_ref_from->logical_bytenr,
+ l->block_ref_from->dev_state->name,
+ (unsigned long long)l->block_ref_from->dev_bytenr,
+ l->block_ref_from->mirror_num,
+ btrfsic_get_block_type(state, l->block_ref_to),
+ (unsigned long long)l->block_ref_to->logical_bytenr,
+ l->block_ref_to->dev_state->name,
+ (unsigned long long)l->block_ref_to->dev_bytenr,
+ l->block_ref_to->mirror_num);
+}
+
+static void btrfsic_print_rem_link(const struct btrfsic_state *state,
+ const struct btrfsic_block_link *l)
+{
+ printk(KERN_INFO
+ "Rem %u* link from %c @%llu (%s/%llu/%d)"
+ " to %c @%llu (%s/%llu/%d).\n",
+ l->ref_cnt,
+ btrfsic_get_block_type(state, l->block_ref_from),
+ (unsigned long long)l->block_ref_from->logical_bytenr,
+ l->block_ref_from->dev_state->name,
+ (unsigned long long)l->block_ref_from->dev_bytenr,
+ l->block_ref_from->mirror_num,
+ btrfsic_get_block_type(state, l->block_ref_to),
+ (unsigned long long)l->block_ref_to->logical_bytenr,
+ l->block_ref_to->dev_state->name,
+ (unsigned long long)l->block_ref_to->dev_bytenr,
+ l->block_ref_to->mirror_num);
+}
+
+static char btrfsic_get_block_type(const struct btrfsic_state *state,
+ const struct btrfsic_block *block)
+{
+ if (block->is_superblock &&
+ state->latest_superblock->dev_bytenr == block->dev_bytenr &&
+ state->latest_superblock->dev_state->bdev == block->dev_state->bdev)
+ return 'S';
+ else if (block->is_superblock)
+ return 's';
+ else if (block->is_metadata)
+ return 'M';
+ else
+ return 'D';
+}
+
+static void btrfsic_dump_tree(const struct btrfsic_state *state)
+{
+ btrfsic_dump_tree_sub(state, state->latest_superblock, 0);
+}
+
+static void btrfsic_dump_tree_sub(const struct btrfsic_state *state,
+ const struct btrfsic_block *block,
+ int indent_level)
+{
+ struct list_head *elem_ref_to;
+ int indent_add;
+ static char buf[80];
+ int cursor_position;
+
+ /*
+ * Should better fill an on-stack buffer with a complete line and
+ * dump it at once when it is time to print a newline character.
+ */
+
+ /*
+ * This algorithm is recursive because the amount of used stack space
+ * is very small and the max recursion depth is limited.
+ */
+ indent_add = sprintf(buf, "%c-%llu(%s/%llu/%d)",
+ btrfsic_get_block_type(state, block),
+ (unsigned long long)block->logical_bytenr,
+ block->dev_state->name,
+ (unsigned long long)block->dev_bytenr,
+ block->mirror_num);
+ if (indent_level + indent_add > BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) {
+ printk("[...]\n");
+ return;
+ }
+ printk(buf);
+ indent_level += indent_add;
+ if (list_empty(&block->ref_to_list)) {
+ printk("\n");
+ return;
+ }
+ if (block->mirror_num > 1 &&
+ !(state->print_mask & BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS)) {
+ printk(" [...]\n");
+ return;
+ }
+
+ cursor_position = indent_level;
+ list_for_each(elem_ref_to, &block->ref_to_list) {
+ const struct btrfsic_block_link *const l =
+ list_entry(elem_ref_to, struct btrfsic_block_link,
+ node_ref_to);
+
+ while (cursor_position < indent_level) {
+ printk(" ");
+ cursor_position++;
+ }
+ if (l->ref_cnt > 1)
+ indent_add = sprintf(buf, " %d*--> ", l->ref_cnt);
+ else
+ indent_add = sprintf(buf, " --> ");
+ if (indent_level + indent_add >
+ BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) {
+ printk("[...]\n");
+ cursor_position = 0;
+ continue;
+ }
+
+ printk(buf);
+
+ btrfsic_dump_tree_sub(state, l->block_ref_to,
+ indent_level + indent_add);
+ cursor_position = 0;
+ }
+}
+
+static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add(
+ struct btrfsic_state *state,
+ struct btrfsic_block_data_ctx *next_block_ctx,
+ struct btrfsic_block *next_block,
+ struct btrfsic_block *from_block,
+ u64 parent_generation)
+{
+ struct btrfsic_block_link *l;
+
+ l = btrfsic_block_link_hashtable_lookup(next_block_ctx->dev->bdev,
+ next_block_ctx->dev_bytenr,
+ from_block->dev_state->bdev,
+ from_block->dev_bytenr,
+ &state->block_link_hashtable);
+ if (NULL == l) {
+ l = btrfsic_block_link_alloc();
+ if (NULL == l) {
+ printk(KERN_INFO
+ "btrfsic: error, kmalloc" " failed!\n");
+ return NULL;
+ }
+
+ l->block_ref_to = next_block;
+ l->block_ref_from = from_block;
+ l->ref_cnt = 1;
+ l->parent_generation = parent_generation;
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ btrfsic_print_add_link(state, l);
+
+ list_add(&l->node_ref_to, &from_block->ref_to_list);
+ list_add(&l->node_ref_from, &next_block->ref_from_list);
+
+ btrfsic_block_link_hashtable_add(l,
+ &state->block_link_hashtable);
+ } else {
+ l->ref_cnt++;
+ l->parent_generation = parent_generation;
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ btrfsic_print_add_link(state, l);
+ }
+
+ return l;
+}
+
+static struct btrfsic_block *btrfsic_block_lookup_or_add(
+ struct btrfsic_state *state,
+ struct btrfsic_block_data_ctx *block_ctx,
+ const char *additional_string,
+ int is_metadata,
+ int is_iodone,
+ int never_written,
+ int mirror_num,
+ int *was_created)
+{
+ struct btrfsic_block *block;
+
+ block = btrfsic_block_hashtable_lookup(block_ctx->dev->bdev,
+ block_ctx->dev_bytenr,
+ &state->block_hashtable);
+ if (NULL == block) {
+ struct btrfsic_dev_state *dev_state;
+
+ block = btrfsic_block_alloc();
+ if (NULL == block) {
+ printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
+ return NULL;
+ }
+ dev_state = btrfsic_dev_state_lookup(block_ctx->dev->bdev);
+ if (NULL == dev_state) {
+ printk(KERN_INFO
+ "btrfsic: error, lookup dev_state failed!\n");
+ btrfsic_block_free(block);
+ return NULL;
+ }
+ block->dev_state = dev_state;
+ block->dev_bytenr = block_ctx->dev_bytenr;
+ block->logical_bytenr = block_ctx->start;
+ block->is_metadata = is_metadata;
+ block->is_iodone = is_iodone;
+ block->never_written = never_written;
+ block->mirror_num = mirror_num;
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO
+ "New %s%c-block @%llu (%s/%llu/%d)\n",
+ additional_string,
+ btrfsic_get_block_type(state, block),
+ (unsigned long long)block->logical_bytenr,
+ dev_state->name,
+ (unsigned long long)block->dev_bytenr,
+ mirror_num);
+ list_add(&block->all_blocks_node, &state->all_blocks_list);
+ btrfsic_block_hashtable_add(block, &state->block_hashtable);
+ if (NULL != was_created)
+ *was_created = 1;
+ } else {
+ if (NULL != was_created)
+ *was_created = 0;
+ }
+
+ return block;
+}
+
+static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
+ u64 bytenr,
+ struct btrfsic_dev_state *dev_state,
+ u64 dev_bytenr, char *data)
+{
+ int num_copies;
+ int mirror_num;
+ int ret;
+ struct btrfsic_block_data_ctx block_ctx;
+ int match = 0;
+
+ num_copies = btrfs_num_copies(&state->root->fs_info->mapping_tree,
+ bytenr, PAGE_SIZE);
+
+ for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
+ ret = btrfsic_map_block(state, bytenr, PAGE_SIZE,
+ &block_ctx, mirror_num);
+ if (ret) {
+ printk(KERN_INFO "btrfsic:"
+ " btrfsic_map_block(logical @%llu,"
+ " mirror %d) failed!\n",
+ (unsigned long long)bytenr, mirror_num);
+ continue;
+ }
+
+ if (dev_state->bdev == block_ctx.dev->bdev &&
+ dev_bytenr == block_ctx.dev_bytenr) {
+ match++;
+ btrfsic_release_block_ctx(&block_ctx);
+ break;
+ }
+ btrfsic_release_block_ctx(&block_ctx);
+ }
+
+ if (!match) {
+ printk(KERN_INFO "btrfs: attempt to write M-block which contains logical bytenr that doesn't map to dev+physical bytenr of submit_bio,"
+ " buffer->log_bytenr=%llu, submit_bio(bdev=%s,"
+ " phys_bytenr=%llu)!\n",
+ (unsigned long long)bytenr, dev_state->name,
+ (unsigned long long)dev_bytenr);
+ for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
+ ret = btrfsic_map_block(state, bytenr, PAGE_SIZE,
+ &block_ctx, mirror_num);
+ if (ret)
+ continue;
+
+ printk(KERN_INFO "Read logical bytenr @%llu maps to"
+ " (%s/%llu/%d)\n",
+ (unsigned long long)bytenr,
+ block_ctx.dev->name,
+ (unsigned long long)block_ctx.dev_bytenr,
+ mirror_num);
+ }
+ WARN_ON(1);
+ }
+}
+
+static struct btrfsic_dev_state *btrfsic_dev_state_lookup(
+ struct block_device *bdev)
+{
+ struct btrfsic_dev_state *ds;
+
+ ds = btrfsic_dev_state_hashtable_lookup(bdev,
+ &btrfsic_dev_state_hashtable);
+ return ds;
+}
+
+int btrfsic_submit_bh(int rw, struct buffer_head *bh)
+{
+ struct btrfsic_dev_state *dev_state;
+
+ if (!btrfsic_is_initialized)
+ return submit_bh(rw, bh);
+
+ mutex_lock(&btrfsic_mutex);
+ /* since btrfsic_submit_bh() might also be called before
+ * btrfsic_mount(), this might return NULL */
+ dev_state = btrfsic_dev_state_lookup(bh->b_bdev);
+
+ /* Only called to write the superblock (incl. FLUSH/FUA) */
+ if (NULL != dev_state &&
+ (rw & WRITE) && bh->b_size > 0) {
+ u64 dev_bytenr;
+
+ dev_bytenr = 4096 * bh->b_blocknr;
+ if (dev_state->state->print_mask &
+ BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
+ printk(KERN_INFO
+ "submit_bh(rw=0x%x, blocknr=%lu (bytenr %llu),"
+ " size=%lu, data=%p, bdev=%p)\n",
+ rw, (unsigned long)bh->b_blocknr,
+ (unsigned long long)dev_bytenr,
+ (unsigned long)bh->b_size, bh->b_data,
+ bh->b_bdev);
+ btrfsic_process_written_block(dev_state, dev_bytenr,
+ bh->b_data, bh->b_size, NULL,
+ NULL, bh, rw);
+ } else if (NULL != dev_state && (rw & REQ_FLUSH)) {
+ if (dev_state->state->print_mask &
+ BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
+ printk(KERN_INFO
+ "submit_bh(rw=0x%x) FLUSH, bdev=%p)\n",
+ rw, bh->b_bdev);
+ if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
+ if ((dev_state->state->print_mask &
+ (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
+ BTRFSIC_PRINT_MASK_VERBOSE)))
+ printk(KERN_INFO
+ "btrfsic_submit_bh(%s) with FLUSH"
+ " but dummy block already in use"
+ " (ignored)!\n",
+ dev_state->name);
+ } else {
+ struct btrfsic_block *const block =
+ &dev_state->dummy_block_for_bio_bh_flush;
+
+ block->is_iodone = 0;
+ block->never_written = 0;
+ block->iodone_w_error = 0;
+ block->flush_gen = dev_state->last_flush_gen + 1;
+ block->submit_bio_bh_rw = rw;
+ block->orig_bio_bh_private = bh->b_private;
+ block->orig_bio_bh_end_io.bh = bh->b_end_io;
+ block->next_in_same_bio = NULL;
+ bh->b_private = block;
+ bh->b_end_io = btrfsic_bh_end_io;
+ }
+ }
+ mutex_unlock(&btrfsic_mutex);
+ return submit_bh(rw, bh);
+}
+
+void btrfsic_submit_bio(int rw, struct bio *bio)
+{
+ struct btrfsic_dev_state *dev_state;
+
+ if (!btrfsic_is_initialized) {
+ submit_bio(rw, bio);
+ return;
+ }
+
+ mutex_lock(&btrfsic_mutex);
+ /* since btrfsic_submit_bio() is also called before
+ * btrfsic_mount(), this might return NULL */
+ dev_state = btrfsic_dev_state_lookup(bio->bi_bdev);
+ if (NULL != dev_state &&
+ (rw & WRITE) && NULL != bio->bi_io_vec) {
+ unsigned int i;
+ u64 dev_bytenr;
+ int bio_is_patched;
+
+ dev_bytenr = 512 * bio->bi_sector;
+ bio_is_patched = 0;
+ if (dev_state->state->print_mask &
+ BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
+ printk(KERN_INFO
+ "submit_bio(rw=0x%x, bi_vcnt=%u,"
+ " bi_sector=%lu (bytenr %llu), bi_bdev=%p)\n",
+ rw, bio->bi_vcnt, (unsigned long)bio->bi_sector,
+ (unsigned long long)dev_bytenr,
+ bio->bi_bdev);
+
+ for (i = 0; i < bio->bi_vcnt; i++) {
+ u8 *mapped_data;
+
+ mapped_data = kmap(bio->bi_io_vec[i].bv_page);
+ if ((BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
+ BTRFSIC_PRINT_MASK_VERBOSE) ==
+ (dev_state->state->print_mask &
+ (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
+ BTRFSIC_PRINT_MASK_VERBOSE)))
+ printk(KERN_INFO
+ "#%u: page=%p, mapped=%p, len=%u,"
+ " offset=%u\n",
+ i, bio->bi_io_vec[i].bv_page,
+ mapped_data,
+ bio->bi_io_vec[i].bv_len,
+ bio->bi_io_vec[i].bv_offset);
+ btrfsic_process_written_block(dev_state, dev_bytenr,
+ mapped_data,
+ bio->bi_io_vec[i].bv_len,
+ bio, &bio_is_patched,
+ NULL, rw);
+ kunmap(bio->bi_io_vec[i].bv_page);
+ dev_bytenr += bio->bi_io_vec[i].bv_len;
+ }
+ } else if (NULL != dev_state && (rw & REQ_FLUSH)) {
+ if (dev_state->state->print_mask &
+ BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
+ printk(KERN_INFO
+ "submit_bio(rw=0x%x) FLUSH, bdev=%p)\n",
+ rw, bio->bi_bdev);
+ if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
+ if ((dev_state->state->print_mask &
+ (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
+ BTRFSIC_PRINT_MASK_VERBOSE)))
+ printk(KERN_INFO
+ "btrfsic_submit_bio(%s) with FLUSH"
+ " but dummy block already in use"
+ " (ignored)!\n",
+ dev_state->name);
+ } else {
+ struct btrfsic_block *const block =
+ &dev_state->dummy_block_for_bio_bh_flush;
+
+ block->is_iodone = 0;
+ block->never_written = 0;
+ block->iodone_w_error = 0;
+ block->flush_gen = dev_state->last_flush_gen + 1;
+ block->submit_bio_bh_rw = rw;
+ block->orig_bio_bh_private = bio->bi_private;
+ block->orig_bio_bh_end_io.bio = bio->bi_end_io;
+ block->next_in_same_bio = NULL;
+ bio->bi_private = block;
+ bio->bi_end_io = btrfsic_bio_end_io;
+ }
+ }
+ mutex_unlock(&btrfsic_mutex);
+
+ submit_bio(rw, bio);
+}
+
+int btrfsic_mount(struct btrfs_root *root,
+ struct btrfs_fs_devices *fs_devices,
+ int including_extent_data, u32 print_mask)
+{
+ int ret;
+ struct btrfsic_state *state;
+ struct list_head *dev_head = &fs_devices->devices;
+ struct btrfs_device *device;
+
+ state = kzalloc(sizeof(*state), GFP_NOFS);
+ if (NULL == state) {
+ printk(KERN_INFO "btrfs check-integrity: kmalloc() failed!\n");
+ return -1;
+ }
+
+ if (!btrfsic_is_initialized) {
+ mutex_init(&btrfsic_mutex);
+ btrfsic_dev_state_hashtable_init(&btrfsic_dev_state_hashtable);
+ btrfsic_is_initialized = 1;
+ }
+ mutex_lock(&btrfsic_mutex);
+ state->root = root;
+ state->print_mask = print_mask;
+ state->include_extent_data = including_extent_data;
+ state->csum_size = 0;
+ INIT_LIST_HEAD(&state->all_blocks_list);
+ btrfsic_block_hashtable_init(&state->block_hashtable);
+ btrfsic_block_link_hashtable_init(&state->block_link_hashtable);
+ state->max_superblock_generation = 0;
+ state->latest_superblock = NULL;
+
+ list_for_each_entry(device, dev_head, dev_list) {
+ struct btrfsic_dev_state *ds;
+ char *p;
+
+ if (!device->bdev || !device->name)
+ continue;
+
+ ds = btrfsic_dev_state_alloc();
+ if (NULL == ds) {
+ printk(KERN_INFO
+ "btrfs check-integrity: kmalloc() failed!\n");
+ mutex_unlock(&btrfsic_mutex);
+ return -1;
+ }
+ ds->bdev = device->bdev;
+ ds->state = state;
+ bdevname(ds->bdev, ds->name);
+ ds->name[BDEVNAME_SIZE - 1] = '\0';
+ for (p = ds->name; *p != '\0'; p++);
+ while (p > ds->name && *p != '/')
+ p--;
+ if (*p == '/')
+ p++;
+ strlcpy(ds->name, p, sizeof(ds->name));
+ btrfsic_dev_state_hashtable_add(ds,
+ &btrfsic_dev_state_hashtable);
+ }
+
+ ret = btrfsic_process_superblock(state, fs_devices);
+ if (0 != ret) {
+ mutex_unlock(&btrfsic_mutex);
+ btrfsic_unmount(root, fs_devices);
+ return ret;
+ }
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_DATABASE)
+ btrfsic_dump_database(state);
+ if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_TREE)
+ btrfsic_dump_tree(state);
+
+ mutex_unlock(&btrfsic_mutex);
+ return 0;
+}
+
+void btrfsic_unmount(struct btrfs_root *root,
+ struct btrfs_fs_devices *fs_devices)
+{
+ struct list_head *elem_all;
+ struct list_head *tmp_all;
+ struct btrfsic_state *state;
+ struct list_head *dev_head = &fs_devices->devices;
+ struct btrfs_device *device;
+
+ if (!btrfsic_is_initialized)
+ return;
+
+ mutex_lock(&btrfsic_mutex);
+
+ state = NULL;
+ list_for_each_entry(device, dev_head, dev_list) {
+ struct btrfsic_dev_state *ds;
+
+ if (!device->bdev || !device->name)
+ continue;
+
+ ds = btrfsic_dev_state_hashtable_lookup(
+ device->bdev,
+ &btrfsic_dev_state_hashtable);
+ if (NULL != ds) {
+ state = ds->state;
+ btrfsic_dev_state_hashtable_remove(ds);
+ btrfsic_dev_state_free(ds);
+ }
+ }
+
+ if (NULL == state) {
+ printk(KERN_INFO
+ "btrfsic: error, cannot find state information"
+ " on umount!\n");
+ mutex_unlock(&btrfsic_mutex);
+ return;
+ }
+
+ /*
+ * Don't care about keeping the lists' state up to date,
+ * just free all memory that was allocated dynamically.
+ * Free the blocks and the block_links.
+ */
+ list_for_each_safe(elem_all, tmp_all, &state->all_blocks_list) {
+ struct btrfsic_block *const b_all =
+ list_entry(elem_all, struct btrfsic_block,
+ all_blocks_node);
+ struct list_head *elem_ref_to;
+ struct list_head *tmp_ref_to;
+
+ list_for_each_safe(elem_ref_to, tmp_ref_to,
+ &b_all->ref_to_list) {
+ struct btrfsic_block_link *const l =
+ list_entry(elem_ref_to,
+ struct btrfsic_block_link,
+ node_ref_to);
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ btrfsic_print_rem_link(state, l);
+
+ l->ref_cnt--;
+ if (0 == l->ref_cnt)
+ btrfsic_block_link_free(l);
+ }
+
+ if (b_all->is_iodone)
+ btrfsic_block_free(b_all);
+ else
+ printk(KERN_INFO "btrfs: attempt to free %c-block"
+ " @%llu (%s/%llu/%d) on umount which is"
+ " not yet iodone!\n",
+ btrfsic_get_block_type(state, b_all),
+ (unsigned long long)b_all->logical_bytenr,
+ b_all->dev_state->name,
+ (unsigned long long)b_all->dev_bytenr,
+ b_all->mirror_num);
+ }
+
+ mutex_unlock(&btrfsic_mutex);
+
+ kfree(state);
+}
diff --git a/fs/btrfs/check-integrity.h b/fs/btrfs/check-integrity.h
new file mode 100644
index 00000000000..8b59175cc50
--- /dev/null
+++ b/fs/btrfs/check-integrity.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright (C) STRATO AG 2011. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#if !defined(__BTRFS_CHECK_INTEGRITY__)
+#define __BTRFS_CHECK_INTEGRITY__
+
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+int btrfsic_submit_bh(int rw, struct buffer_head *bh);
+void btrfsic_submit_bio(int rw, struct bio *bio);
+#else
+#define btrfsic_submit_bh submit_bh
+#define btrfsic_submit_bio submit_bio
+#endif
+
+int btrfsic_mount(struct btrfs_root *root,
+ struct btrfs_fs_devices *fs_devices,
+ int including_extent_data, u32 print_mask);
+void btrfsic_unmount(struct btrfs_root *root,
+ struct btrfs_fs_devices *fs_devices);
+
+#endif
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 14f1c5a0b2d..d286b40a567 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -120,10 +120,10 @@ static int check_compressed_csum(struct inode *inode,
page = cb->compressed_pages[i];
csum = ~(u32)0;
- kaddr = kmap_atomic(page, KM_USER0);
+ kaddr = kmap_atomic(page);
csum = btrfs_csum_data(root, kaddr, csum, PAGE_CACHE_SIZE);
btrfs_csum_final(csum, (char *)&csum);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
if (csum != *cb_sum) {
printk(KERN_INFO "btrfs csum failed ino %llu "
@@ -226,8 +226,8 @@ out:
* Clear the writeback bits on all of the file
* pages for a compressed write
*/
-static noinline int end_compressed_writeback(struct inode *inode, u64 start,
- unsigned long ram_size)
+static noinline void end_compressed_writeback(struct inode *inode, u64 start,
+ unsigned long ram_size)
{
unsigned long index = start >> PAGE_CACHE_SHIFT;
unsigned long end_index = (start + ram_size - 1) >> PAGE_CACHE_SHIFT;
@@ -253,7 +253,6 @@ static noinline int end_compressed_writeback(struct inode *inode, u64 start,
index += ret;
}
/* the inode may be gone now */
- return 0;
}
/*
@@ -392,16 +391,16 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
*/
atomic_inc(&cb->pending_bios);
ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
if (!skip_sum) {
ret = btrfs_csum_one_bio(root, inode, bio,
start, 1);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
}
ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
bio_put(bio);
@@ -421,15 +420,15 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
bio_get(bio);
ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
if (!skip_sum) {
ret = btrfs_csum_one_bio(root, inode, bio, start, 1);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
}
ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
bio_put(bio);
return 0;
@@ -497,7 +496,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
* sure they map to this compressed extent on disk.
*/
set_page_extent_mapped(page);
- lock_extent(tree, last_offset, end, GFP_NOFS);
+ lock_extent(tree, last_offset, end);
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, last_offset,
PAGE_CACHE_SIZE);
@@ -507,7 +506,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
(last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) ||
(em->block_start >> 9) != cb->orig_bio->bi_sector) {
free_extent_map(em);
- unlock_extent(tree, last_offset, end, GFP_NOFS);
+ unlock_extent(tree, last_offset, end);
unlock_page(page);
page_cache_release(page);
break;
@@ -521,10 +520,10 @@ static noinline int add_ra_bio_pages(struct inode *inode,
if (zero_offset) {
int zeros;
zeros = PAGE_CACHE_SIZE - zero_offset;
- userpage = kmap_atomic(page, KM_USER0);
+ userpage = kmap_atomic(page);
memset(userpage + zero_offset, 0, zeros);
flush_dcache_page(page);
- kunmap_atomic(userpage, KM_USER0);
+ kunmap_atomic(userpage);
}
}
@@ -535,7 +534,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
nr_pages++;
page_cache_release(page);
} else {
- unlock_extent(tree, last_offset, end, GFP_NOFS);
+ unlock_extent(tree, last_offset, end);
unlock_page(page);
page_cache_release(page);
break;
@@ -588,6 +587,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
page_offset(bio->bi_io_vec->bv_page),
PAGE_CACHE_SIZE);
read_unlock(&em_tree->lock);
+ if (!em)
+ return -EIO;
compressed_len = em->block_len;
cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
@@ -660,7 +661,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
bio_get(comp_bio);
ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
/*
* inc the count before we submit the bio so
@@ -673,14 +674,14 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
ret = btrfs_lookup_bio_sums(root, inode,
comp_bio, sums);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
}
sums += (comp_bio->bi_size + root->sectorsize - 1) /
root->sectorsize;
ret = btrfs_map_bio(root, READ, comp_bio,
mirror_num, 0);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
bio_put(comp_bio);
@@ -696,15 +697,15 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
bio_get(comp_bio);
ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
ret = btrfs_lookup_bio_sums(root, inode, comp_bio, sums);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
}
ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
bio_put(comp_bio);
return 0;
@@ -732,7 +733,7 @@ struct btrfs_compress_op *btrfs_compress_op[] = {
&btrfs_lzo_compress,
};
-int __init btrfs_init_compress(void)
+void __init btrfs_init_compress(void)
{
int i;
@@ -742,7 +743,6 @@ int __init btrfs_init_compress(void)
atomic_set(&comp_alloc_workspace[i], 0);
init_waitqueue_head(&comp_workspace_wait[i]);
}
- return 0;
}
/*
@@ -991,9 +991,9 @@ int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
bytes = min(PAGE_CACHE_SIZE - *pg_offset,
PAGE_CACHE_SIZE - buf_offset);
bytes = min(bytes, working_bytes);
- kaddr = kmap_atomic(page_out, KM_USER0);
+ kaddr = kmap_atomic(page_out);
memcpy(kaddr + *pg_offset, buf + buf_offset, bytes);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
flush_dcache_page(page_out);
*pg_offset += bytes;
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index a12059f4f0f..9afb0a62ae8 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -19,7 +19,7 @@
#ifndef __BTRFS_COMPRESSION_
#define __BTRFS_COMPRESSION_
-int btrfs_init_compress(void);
+void btrfs_init_compress(void);
void btrfs_exit_compress(void);
int btrfs_compress_pages(int type, struct address_space *mapping,
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index dede441bdee..e801f226d7e 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -36,7 +36,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *dst_buf,
struct extent_buffer *src_buf);
-static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct btrfs_path *path, int level, int slot);
struct btrfs_path *btrfs_alloc_path(void)
@@ -156,10 +156,23 @@ struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
struct extent_buffer *eb;
- rcu_read_lock();
- eb = rcu_dereference(root->node);
- extent_buffer_get(eb);
- rcu_read_unlock();
+ while (1) {
+ rcu_read_lock();
+ eb = rcu_dereference(root->node);
+
+ /*
+ * RCU really hurts here, we could free up the root node because
+ * it was cow'ed but we may not get the new root node yet so do
+ * the inc_not_zero dance and if it doesn't work then
+ * synchronize_rcu and try again.
+ */
+ if (atomic_inc_not_zero(&eb->refs)) {
+ rcu_read_unlock();
+ break;
+ }
+ rcu_read_unlock();
+ synchronize_rcu();
+ }
return eb;
}
@@ -240,7 +253,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
cow = btrfs_alloc_free_block(trans, root, buf->len, 0,
new_root_objectid, &disk_key, level,
- buf->start, 0);
+ buf->start, 0, 1);
if (IS_ERR(cow))
return PTR_ERR(cow);
@@ -261,9 +274,9 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
WARN_ON(btrfs_header_generation(buf) > trans->transid);
if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
- ret = btrfs_inc_ref(trans, root, cow, 1);
+ ret = btrfs_inc_ref(trans, root, cow, 1, 1);
else
- ret = btrfs_inc_ref(trans, root, cow, 0);
+ ret = btrfs_inc_ref(trans, root, cow, 0, 1);
if (ret)
return ret;
@@ -331,8 +344,13 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
if (btrfs_block_can_be_shared(root, buf)) {
ret = btrfs_lookup_extent_info(trans, root, buf->start,
buf->len, &refs, &flags);
- BUG_ON(ret);
- BUG_ON(refs == 0);
+ if (ret)
+ return ret;
+ if (refs == 0) {
+ ret = -EROFS;
+ btrfs_std_error(root->fs_info, ret);
+ return ret;
+ }
} else {
refs = 1;
if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
@@ -350,43 +368,44 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
if ((owner == root->root_key.objectid ||
root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
- ret = btrfs_inc_ref(trans, root, buf, 1);
- BUG_ON(ret);
+ ret = btrfs_inc_ref(trans, root, buf, 1, 1);
+ BUG_ON(ret); /* -ENOMEM */
if (root->root_key.objectid ==
BTRFS_TREE_RELOC_OBJECTID) {
- ret = btrfs_dec_ref(trans, root, buf, 0);
- BUG_ON(ret);
- ret = btrfs_inc_ref(trans, root, cow, 1);
- BUG_ON(ret);
+ ret = btrfs_dec_ref(trans, root, buf, 0, 1);
+ BUG_ON(ret); /* -ENOMEM */
+ ret = btrfs_inc_ref(trans, root, cow, 1, 1);
+ BUG_ON(ret); /* -ENOMEM */
}
new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
} else {
if (root->root_key.objectid ==
BTRFS_TREE_RELOC_OBJECTID)
- ret = btrfs_inc_ref(trans, root, cow, 1);
+ ret = btrfs_inc_ref(trans, root, cow, 1, 1);
else
- ret = btrfs_inc_ref(trans, root, cow, 0);
- BUG_ON(ret);
+ ret = btrfs_inc_ref(trans, root, cow, 0, 1);
+ BUG_ON(ret); /* -ENOMEM */
}
if (new_flags != 0) {
ret = btrfs_set_disk_extent_flags(trans, root,
buf->start,
buf->len,
new_flags, 0);
- BUG_ON(ret);
+ if (ret)
+ return ret;
}
} else {
if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
if (root->root_key.objectid ==
BTRFS_TREE_RELOC_OBJECTID)
- ret = btrfs_inc_ref(trans, root, cow, 1);
+ ret = btrfs_inc_ref(trans, root, cow, 1, 1);
else
- ret = btrfs_inc_ref(trans, root, cow, 0);
- BUG_ON(ret);
- ret = btrfs_dec_ref(trans, root, buf, 1);
- BUG_ON(ret);
+ ret = btrfs_inc_ref(trans, root, cow, 0, 1);
+ BUG_ON(ret); /* -ENOMEM */
+ ret = btrfs_dec_ref(trans, root, buf, 1, 1);
+ BUG_ON(ret); /* -ENOMEM */
}
clean_tree_block(trans, root, buf);
*last_ref = 1;
@@ -415,7 +434,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
{
struct btrfs_disk_key disk_key;
struct extent_buffer *cow;
- int level;
+ int level, ret;
int last_ref = 0;
int unlock_orig = 0;
u64 parent_start;
@@ -446,7 +465,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start,
root->root_key.objectid, &disk_key,
- level, search_start, empty_size);
+ level, search_start, empty_size, 1);
if (IS_ERR(cow))
return PTR_ERR(cow);
@@ -467,7 +486,11 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
(unsigned long)btrfs_header_fsid(cow),
BTRFS_FSID_SIZE);
- update_ref_for_cow(trans, root, buf, cow, &last_ref);
+ ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ return ret;
+ }
if (root->ref_cows)
btrfs_reloc_cow_block(trans, root, buf, cow);
@@ -484,7 +507,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
rcu_assign_pointer(root->node, cow);
btrfs_free_tree_block(trans, root, buf, parent_start,
- last_ref);
+ last_ref, 1);
free_extent_buffer(buf);
add_root_to_dirty_list(root);
} else {
@@ -500,11 +523,11 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
trans->transid);
btrfs_mark_buffer_dirty(parent);
btrfs_free_tree_block(trans, root, buf, parent_start,
- last_ref);
+ last_ref, 1);
}
if (unlock_orig)
btrfs_tree_unlock(buf);
- free_extent_buffer(buf);
+ free_extent_buffer_stale(buf);
btrfs_mark_buffer_dirty(cow);
*cow_ret = cow;
return 0;
@@ -934,7 +957,12 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
/* promote the child to a root */
child = read_node_slot(root, mid, 0);
- BUG_ON(!child);
+ if (!child) {
+ ret = -EROFS;
+ btrfs_std_error(root->fs_info, ret);
+ goto enospc;
+ }
+
btrfs_tree_lock(child);
btrfs_set_lock_blocking(child);
ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
@@ -957,9 +985,9 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
free_extent_buffer(mid);
root_sub_used(root, mid->len);
- btrfs_free_tree_block(trans, root, mid, 0, 1);
+ btrfs_free_tree_block(trans, root, mid, 0, 1, 0);
/* once for the root ptr */
- free_extent_buffer(mid);
+ free_extent_buffer_stale(mid);
return 0;
}
if (btrfs_header_nritems(mid) >
@@ -1010,13 +1038,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
if (btrfs_header_nritems(right) == 0) {
clean_tree_block(trans, root, right);
btrfs_tree_unlock(right);
- wret = del_ptr(trans, root, path, level + 1, pslot +
- 1);
- if (wret)
- ret = wret;
+ del_ptr(trans, root, path, level + 1, pslot + 1);
root_sub_used(root, right->len);
- btrfs_free_tree_block(trans, root, right, 0, 1);
- free_extent_buffer(right);
+ btrfs_free_tree_block(trans, root, right, 0, 1, 0);
+ free_extent_buffer_stale(right);
right = NULL;
} else {
struct btrfs_disk_key right_key;
@@ -1035,7 +1060,11 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
* otherwise we would have pulled some pointers from the
* right
*/
- BUG_ON(!left);
+ if (!left) {
+ ret = -EROFS;
+ btrfs_std_error(root->fs_info, ret);
+ goto enospc;
+ }
wret = balance_node_right(trans, root, mid, left);
if (wret < 0) {
ret = wret;
@@ -1051,12 +1080,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
if (btrfs_header_nritems(mid) == 0) {
clean_tree_block(trans, root, mid);
btrfs_tree_unlock(mid);
- wret = del_ptr(trans, root, path, level + 1, pslot);
- if (wret)
- ret = wret;
+ del_ptr(trans, root, path, level + 1, pslot);
root_sub_used(root, mid->len);
- btrfs_free_tree_block(trans, root, mid, 0, 1);
- free_extent_buffer(mid);
+ btrfs_free_tree_block(trans, root, mid, 0, 1, 0);
+ free_extent_buffer_stale(mid);
mid = NULL;
} else {
/* update the parent key to reflect our changes */
@@ -1382,7 +1409,8 @@ static noinline int reada_for_balance(struct btrfs_root *root,
* if lowest_unlock is 1, level 0 won't be unlocked
*/
static noinline void unlock_up(struct btrfs_path *path, int level,
- int lowest_unlock)
+ int lowest_unlock, int min_write_lock_level,
+ int *write_lock_level)
{
int i;
int skip_level = level;
@@ -1414,6 +1442,11 @@ static noinline void unlock_up(struct btrfs_path *path, int level,
if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
btrfs_tree_unlock_rw(t, path->locks[i]);
path->locks[i] = 0;
+ if (write_lock_level &&
+ i > min_write_lock_level &&
+ i <= *write_lock_level) {
+ *write_lock_level = i - 1;
+ }
}
}
}
@@ -1637,6 +1670,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
/* everything at write_lock_level or lower must be write locked */
int write_lock_level = 0;
u8 lowest_level = 0;
+ int min_write_lock_level;
lowest_level = p->lowest_level;
WARN_ON(lowest_level && ins_len > 0);
@@ -1664,6 +1698,8 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
if (cow && (p->keep_locks || p->lowest_level))
write_lock_level = BTRFS_MAX_LEVEL;
+ min_write_lock_level = write_lock_level;
+
again:
/*
* we try very hard to do read locks on the root
@@ -1795,7 +1831,8 @@ cow_done:
goto again;
}
- unlock_up(p, level, lowest_unlock);
+ unlock_up(p, level, lowest_unlock,
+ min_write_lock_level, &write_lock_level);
if (level == lowest_level) {
if (dec)
@@ -1857,7 +1894,8 @@ cow_done:
}
}
if (!p->search_for_split)
- unlock_up(p, level, lowest_unlock);
+ unlock_up(p, level, lowest_unlock,
+ min_write_lock_level, &write_lock_level);
goto done;
}
}
@@ -1881,15 +1919,12 @@ done:
* fixing up pointers when a given leaf/node is not in slot 0 of the
* higher levels
*
- * If this fails to write a tree block, it returns -1, but continues
- * fixing up the blocks in ram so the tree is consistent.
*/
-static int fixup_low_keys(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_path *path,
- struct btrfs_disk_key *key, int level)
+static void fixup_low_keys(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *path,
+ struct btrfs_disk_key *key, int level)
{
int i;
- int ret = 0;
struct extent_buffer *t;
for (i = level; i < BTRFS_MAX_LEVEL; i++) {
@@ -1902,7 +1937,6 @@ static int fixup_low_keys(struct btrfs_trans_handle *trans,
if (tslot != 0)
break;
}
- return ret;
}
/*
@@ -1911,9 +1945,9 @@ static int fixup_low_keys(struct btrfs_trans_handle *trans,
* This function isn't completely safe. It's the caller's responsibility
* that the new key won't break the order
*/
-int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_path *path,
- struct btrfs_key *new_key)
+void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *path,
+ struct btrfs_key *new_key)
{
struct btrfs_disk_key disk_key;
struct extent_buffer *eb;
@@ -1923,13 +1957,11 @@ int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
slot = path->slots[0];
if (slot > 0) {
btrfs_item_key(eb, &disk_key, slot - 1);
- if (comp_keys(&disk_key, new_key) >= 0)
- return -1;
+ BUG_ON(comp_keys(&disk_key, new_key) >= 0);
}
if (slot < btrfs_header_nritems(eb) - 1) {
btrfs_item_key(eb, &disk_key, slot + 1);
- if (comp_keys(&disk_key, new_key) <= 0)
- return -1;
+ BUG_ON(comp_keys(&disk_key, new_key) <= 0);
}
btrfs_cpu_key_to_disk(&disk_key, new_key);
@@ -1937,7 +1969,6 @@ int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(eb);
if (slot == 0)
fixup_low_keys(trans, root, path, &disk_key, 1);
- return 0;
}
/*
@@ -2089,7 +2120,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
root->root_key.objectid, &lower_key,
- level, root->node->start, 0);
+ level, root->node->start, 0, 0);
if (IS_ERR(c))
return PTR_ERR(c);
@@ -2140,12 +2171,11 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
*
* slot and level indicate where you want the key to go, and
* blocknr is the block the key points to.
- *
- * returns zero on success and < 0 on any error
*/
-static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
- *root, struct btrfs_path *path, struct btrfs_disk_key
- *key, u64 bytenr, int slot, int level)
+static void insert_ptr(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *path,
+ struct btrfs_disk_key *key, u64 bytenr,
+ int slot, int level)
{
struct extent_buffer *lower;
int nritems;
@@ -2155,8 +2185,7 @@ static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
lower = path->nodes[level];
nritems = btrfs_header_nritems(lower);
BUG_ON(slot > nritems);
- if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root))
- BUG();
+ BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(root));
if (slot != nritems) {
memmove_extent_buffer(lower,
btrfs_node_key_ptr_offset(slot + 1),
@@ -2169,7 +2198,6 @@ static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
btrfs_set_node_ptr_generation(lower, slot, trans->transid);
btrfs_set_header_nritems(lower, nritems + 1);
btrfs_mark_buffer_dirty(lower);
- return 0;
}
/*
@@ -2190,7 +2218,6 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
struct btrfs_disk_key disk_key;
int mid;
int ret;
- int wret;
u32 c_nritems;
c = path->nodes[level];
@@ -2216,7 +2243,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
split = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
root->root_key.objectid,
- &disk_key, level, c->start, 0);
+ &disk_key, level, c->start, 0, 0);
if (IS_ERR(split))
return PTR_ERR(split);
@@ -2247,11 +2274,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(c);
btrfs_mark_buffer_dirty(split);
- wret = insert_ptr(trans, root, path, &disk_key, split->start,
- path->slots[level + 1] + 1,
- level + 1);
- if (wret)
- ret = wret;
+ insert_ptr(trans, root, path, &disk_key, split->start,
+ path->slots[level + 1] + 1, level + 1);
if (path->slots[level] >= mid) {
path->slots[level] -= mid;
@@ -2320,6 +2344,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
{
struct extent_buffer *left = path->nodes[0];
struct extent_buffer *upper = path->nodes[1];
+ struct btrfs_map_token token;
struct btrfs_disk_key disk_key;
int slot;
u32 i;
@@ -2331,6 +2356,8 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
u32 data_end;
u32 this_item_size;
+ btrfs_init_map_token(&token);
+
if (empty)
nr = 0;
else
@@ -2408,8 +2435,8 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
push_space = BTRFS_LEAF_DATA_SIZE(root);
for (i = 0; i < right_nritems; i++) {
item = btrfs_item_nr(right, i);
- push_space -= btrfs_item_size(right, item);
- btrfs_set_item_offset(right, item, push_space);
+ push_space -= btrfs_token_item_size(right, item, &token);
+ btrfs_set_token_item_offset(right, item, push_space, &token);
}
left_nritems -= push_items;
@@ -2537,9 +2564,11 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
u32 old_left_nritems;
u32 nr;
int ret = 0;
- int wret;
u32 this_item_size;
u32 old_left_item_size;
+ struct btrfs_map_token token;
+
+ btrfs_init_map_token(&token);
if (empty)
nr = min(right_nritems, max_slot);
@@ -2600,9 +2629,10 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
item = btrfs_item_nr(left, i);
- ioff = btrfs_item_offset(left, item);
- btrfs_set_item_offset(left, item,
- ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size));
+ ioff = btrfs_token_item_offset(left, item, &token);
+ btrfs_set_token_item_offset(left, item,
+ ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size),
+ &token);
}
btrfs_set_header_nritems(left, old_left_nritems + push_items);
@@ -2632,8 +2662,9 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
for (i = 0; i < right_nritems; i++) {
item = btrfs_item_nr(right, i);
- push_space = push_space - btrfs_item_size(right, item);
- btrfs_set_item_offset(right, item, push_space);
+ push_space = push_space - btrfs_token_item_size(right,
+ item, &token);
+ btrfs_set_token_item_offset(right, item, push_space, &token);
}
btrfs_mark_buffer_dirty(left);
@@ -2643,9 +2674,7 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
clean_tree_block(trans, root, right);
btrfs_item_key(right, &disk_key, 0);
- wret = fixup_low_keys(trans, root, path, &disk_key, 1);
- if (wret)
- ret = wret;
+ fixup_low_keys(trans, root, path, &disk_key, 1);
/* then fixup the leaf pointer in the path */
if (path->slots[0] < push_items) {
@@ -2716,7 +2745,8 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
path->nodes[1], slot - 1, &left);
if (ret) {
/* we hit -ENOSPC, but it isn't fatal here */
- ret = 1;
+ if (ret == -ENOSPC)
+ ret = 1;
goto out;
}
@@ -2738,22 +2768,21 @@ out:
/*
* split the path's leaf in two, making sure there is at least data_size
* available for the resulting leaf level of the path.
- *
- * returns 0 if all went well and < 0 on failure.
*/
-static noinline int copy_for_split(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct extent_buffer *l,
- struct extent_buffer *right,
- int slot, int mid, int nritems)
+static noinline void copy_for_split(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct extent_buffer *l,
+ struct extent_buffer *right,
+ int slot, int mid, int nritems)
{
int data_copy_size;
int rt_data_off;
int i;
- int ret = 0;
- int wret;
struct btrfs_disk_key disk_key;
+ struct btrfs_map_token token;
+
+ btrfs_init_map_token(&token);
nritems = nritems - mid;
btrfs_set_header_nritems(right, nritems);
@@ -2775,17 +2804,15 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans,
struct btrfs_item *item = btrfs_item_nr(right, i);
u32 ioff;
- ioff = btrfs_item_offset(right, item);
- btrfs_set_item_offset(right, item, ioff + rt_data_off);
+ ioff = btrfs_token_item_offset(right, item, &token);
+ btrfs_set_token_item_offset(right, item,
+ ioff + rt_data_off, &token);
}
btrfs_set_header_nritems(l, mid);
- ret = 0;
btrfs_item_key(right, &disk_key, 0);
- wret = insert_ptr(trans, root, path, &disk_key, right->start,
- path->slots[1] + 1, 1);
- if (wret)
- ret = wret;
+ insert_ptr(trans, root, path, &disk_key, right->start,
+ path->slots[1] + 1, 1);
btrfs_mark_buffer_dirty(right);
btrfs_mark_buffer_dirty(l);
@@ -2803,8 +2830,6 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans,
}
BUG_ON(path->slots[0] < 0);
-
- return ret;
}
/*
@@ -2970,7 +2995,7 @@ again:
right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
root->root_key.objectid,
- &disk_key, 0, l->start, 0);
+ &disk_key, 0, l->start, 0, 0);
if (IS_ERR(right))
return PTR_ERR(right);
@@ -2993,12 +3018,8 @@ again:
if (split == 0) {
if (mid <= slot) {
btrfs_set_header_nritems(right, 0);
- wret = insert_ptr(trans, root, path,
- &disk_key, right->start,
- path->slots[1] + 1, 1);
- if (wret)
- ret = wret;
-
+ insert_ptr(trans, root, path, &disk_key, right->start,
+ path->slots[1] + 1, 1);
btrfs_tree_unlock(path->nodes[0]);
free_extent_buffer(path->nodes[0]);
path->nodes[0] = right;
@@ -3006,29 +3027,21 @@ again:
path->slots[1] += 1;
} else {
btrfs_set_header_nritems(right, 0);
- wret = insert_ptr(trans, root, path,
- &disk_key,
- right->start,
+ insert_ptr(trans, root, path, &disk_key, right->start,
path->slots[1], 1);
- if (wret)
- ret = wret;
btrfs_tree_unlock(path->nodes[0]);
free_extent_buffer(path->nodes[0]);
path->nodes[0] = right;
path->slots[0] = 0;
- if (path->slots[1] == 0) {
- wret = fixup_low_keys(trans, root,
- path, &disk_key, 1);
- if (wret)
- ret = wret;
- }
+ if (path->slots[1] == 0)
+ fixup_low_keys(trans, root, path,
+ &disk_key, 1);
}
btrfs_mark_buffer_dirty(right);
return ret;
}
- ret = copy_for_split(trans, root, path, l, right, slot, mid, nritems);
- BUG_ON(ret);
+ copy_for_split(trans, root, path, l, right, slot, mid, nritems);
if (split == 2) {
BUG_ON(num_doubles != 0);
@@ -3036,7 +3049,7 @@ again:
goto again;
}
- return ret;
+ return 0;
push_for_double:
push_for_double_split(trans, root, path, data_size);
@@ -3238,11 +3251,9 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
return ret;
path->slots[0]++;
- ret = setup_items_for_insert(trans, root, path, new_key, &item_size,
- item_size, item_size +
- sizeof(struct btrfs_item), 1);
- BUG_ON(ret);
-
+ setup_items_for_insert(trans, root, path, new_key, &item_size,
+ item_size, item_size +
+ sizeof(struct btrfs_item), 1);
leaf = path->nodes[0];
memcpy_extent_buffer(leaf,
btrfs_item_ptr_offset(leaf, path->slots[0]),
@@ -3257,10 +3268,10 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
* off the end of the item or if we shift the item to chop bytes off
* the front.
*/
-int btrfs_truncate_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- u32 new_size, int from_end)
+void btrfs_truncate_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ u32 new_size, int from_end)
{
int slot;
struct extent_buffer *leaf;
@@ -3271,13 +3282,16 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans,
unsigned int old_size;
unsigned int size_diff;
int i;
+ struct btrfs_map_token token;
+
+ btrfs_init_map_token(&token);
leaf = path->nodes[0];
slot = path->slots[0];
old_size = btrfs_item_size_nr(leaf, slot);
if (old_size == new_size)
- return 0;
+ return;
nritems = btrfs_header_nritems(leaf);
data_end = leaf_data_end(root, leaf);
@@ -3297,8 +3311,9 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans,
u32 ioff;
item = btrfs_item_nr(leaf, i);
- ioff = btrfs_item_offset(leaf, item);
- btrfs_set_item_offset(leaf, item, ioff + size_diff);
+ ioff = btrfs_token_item_offset(leaf, item, &token);
+ btrfs_set_token_item_offset(leaf, item,
+ ioff + size_diff, &token);
}
/* shift the data */
@@ -3350,15 +3365,14 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans,
btrfs_print_leaf(root, leaf);
BUG();
}
- return 0;
}
/*
* make the item pointed to by the path bigger, data_size is the new size.
*/
-int btrfs_extend_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_path *path,
- u32 data_size)
+void btrfs_extend_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *path,
+ u32 data_size)
{
int slot;
struct extent_buffer *leaf;
@@ -3368,6 +3382,9 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans,
unsigned int old_data;
unsigned int old_size;
int i;
+ struct btrfs_map_token token;
+
+ btrfs_init_map_token(&token);
leaf = path->nodes[0];
@@ -3397,8 +3414,9 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans,
u32 ioff;
item = btrfs_item_nr(leaf, i);
- ioff = btrfs_item_offset(leaf, item);
- btrfs_set_item_offset(leaf, item, ioff - data_size);
+ ioff = btrfs_token_item_offset(leaf, item, &token);
+ btrfs_set_token_item_offset(leaf, item,
+ ioff - data_size, &token);
}
/* shift the data */
@@ -3416,7 +3434,6 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans,
btrfs_print_leaf(root, leaf);
BUG();
}
- return 0;
}
/*
@@ -3441,6 +3458,9 @@ int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
unsigned int data_end;
struct btrfs_disk_key disk_key;
struct btrfs_key found_key;
+ struct btrfs_map_token token;
+
+ btrfs_init_map_token(&token);
for (i = 0; i < nr; i++) {
if (total_size + data_size[i] + sizeof(struct btrfs_item) >
@@ -3506,8 +3526,9 @@ int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
u32 ioff;
item = btrfs_item_nr(leaf, i);
- ioff = btrfs_item_offset(leaf, item);
- btrfs_set_item_offset(leaf, item, ioff - total_data);
+ ioff = btrfs_token_item_offset(leaf, item, &token);
+ btrfs_set_token_item_offset(leaf, item,
+ ioff - total_data, &token);
}
/* shift the items */
memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
@@ -3534,9 +3555,10 @@ int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
btrfs_set_item_key(leaf, &disk_key, slot + i);
item = btrfs_item_nr(leaf, slot + i);
- btrfs_set_item_offset(leaf, item, data_end - data_size[i]);
+ btrfs_set_token_item_offset(leaf, item,
+ data_end - data_size[i], &token);
data_end -= data_size[i];
- btrfs_set_item_size(leaf, item, data_size[i]);
+ btrfs_set_token_item_size(leaf, item, data_size[i], &token);
}
btrfs_set_header_nritems(leaf, nritems + nr);
btrfs_mark_buffer_dirty(leaf);
@@ -3544,7 +3566,7 @@ int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
ret = 0;
if (slot == 0) {
btrfs_cpu_key_to_disk(&disk_key, cpu_key);
- ret = fixup_low_keys(trans, root, path, &disk_key, 1);
+ fixup_low_keys(trans, root, path, &disk_key, 1);
}
if (btrfs_leaf_free_space(root, leaf) < 0) {
@@ -3562,19 +3584,21 @@ out:
* to save stack depth by doing the bulk of the work in a function
* that doesn't call btrfs_search_slot
*/
-int setup_items_for_insert(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_path *path,
- struct btrfs_key *cpu_key, u32 *data_size,
- u32 total_data, u32 total_size, int nr)
+void setup_items_for_insert(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *path,
+ struct btrfs_key *cpu_key, u32 *data_size,
+ u32 total_data, u32 total_size, int nr)
{
struct btrfs_item *item;
int i;
u32 nritems;
unsigned int data_end;
struct btrfs_disk_key disk_key;
- int ret;
struct extent_buffer *leaf;
int slot;
+ struct btrfs_map_token token;
+
+ btrfs_init_map_token(&token);
leaf = path->nodes[0];
slot = path->slots[0];
@@ -3606,8 +3630,9 @@ int setup_items_for_insert(struct btrfs_trans_handle *trans,
u32 ioff;
item = btrfs_item_nr(leaf, i);
- ioff = btrfs_item_offset(leaf, item);
- btrfs_set_item_offset(leaf, item, ioff - total_data);
+ ioff = btrfs_token_item_offset(leaf, item, &token);
+ btrfs_set_token_item_offset(leaf, item,
+ ioff - total_data, &token);
}
/* shift the items */
memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
@@ -3626,17 +3651,17 @@ int setup_items_for_insert(struct btrfs_trans_handle *trans,
btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
btrfs_set_item_key(leaf, &disk_key, slot + i);
item = btrfs_item_nr(leaf, slot + i);
- btrfs_set_item_offset(leaf, item, data_end - data_size[i]);
+ btrfs_set_token_item_offset(leaf, item,
+ data_end - data_size[i], &token);
data_end -= data_size[i];
- btrfs_set_item_size(leaf, item, data_size[i]);
+ btrfs_set_token_item_size(leaf, item, data_size[i], &token);
}
btrfs_set_header_nritems(leaf, nritems + nr);
- ret = 0;
if (slot == 0) {
btrfs_cpu_key_to_disk(&disk_key, cpu_key);
- ret = fixup_low_keys(trans, root, path, &disk_key, 1);
+ fixup_low_keys(trans, root, path, &disk_key, 1);
}
btrfs_unlock_up_safe(path, 1);
btrfs_mark_buffer_dirty(leaf);
@@ -3645,7 +3670,6 @@ int setup_items_for_insert(struct btrfs_trans_handle *trans,
btrfs_print_leaf(root, leaf);
BUG();
}
- return ret;
}
/*
@@ -3672,16 +3696,14 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
if (ret == 0)
return -EEXIST;
if (ret < 0)
- goto out;
+ return ret;
slot = path->slots[0];
BUG_ON(slot < 0);
- ret = setup_items_for_insert(trans, root, path, cpu_key, data_size,
+ setup_items_for_insert(trans, root, path, cpu_key, data_size,
total_data, total_size, nr);
-
-out:
- return ret;
+ return 0;
}
/*
@@ -3717,13 +3739,11 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
* the tree should have been previously balanced so the deletion does not
* empty a node.
*/
-static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct btrfs_path *path, int level, int slot)
+static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ struct btrfs_path *path, int level, int slot)
{
struct extent_buffer *parent = path->nodes[level];
u32 nritems;
- int ret = 0;
- int wret;
nritems = btrfs_header_nritems(parent);
if (slot != nritems - 1) {
@@ -3743,12 +3763,9 @@ static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct btrfs_disk_key disk_key;
btrfs_node_key(parent, &disk_key, 0);
- wret = fixup_low_keys(trans, root, path, &disk_key, level + 1);
- if (wret)
- ret = wret;
+ fixup_low_keys(trans, root, path, &disk_key, level + 1);
}
btrfs_mark_buffer_dirty(parent);
- return ret;
}
/*
@@ -3761,17 +3778,13 @@ static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
* The path must have already been setup for deleting the leaf, including
* all the proper balancing. path->nodes[1] must be locked.
*/
-static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct extent_buffer *leaf)
+static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct extent_buffer *leaf)
{
- int ret;
-
WARN_ON(btrfs_header_generation(leaf) != trans->transid);
- ret = del_ptr(trans, root, path, 1, path->slots[1]);
- if (ret)
- return ret;
+ del_ptr(trans, root, path, 1, path->slots[1]);
/*
* btrfs_free_extent is expensive, we want to make sure we
@@ -3781,8 +3794,9 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
root_sub_used(root, leaf->len);
- btrfs_free_tree_block(trans, root, leaf, 0, 1);
- return 0;
+ extent_buffer_get(leaf);
+ btrfs_free_tree_block(trans, root, leaf, 0, 1, 0);
+ free_extent_buffer_stale(leaf);
}
/*
* delete the item at the leaf level in path. If that empties
@@ -3799,6 +3813,9 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
int wret;
int i;
u32 nritems;
+ struct btrfs_map_token token;
+
+ btrfs_init_map_token(&token);
leaf = path->nodes[0];
last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
@@ -3820,8 +3837,9 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
u32 ioff;
item = btrfs_item_nr(leaf, i);
- ioff = btrfs_item_offset(leaf, item);
- btrfs_set_item_offset(leaf, item, ioff + dsize);
+ ioff = btrfs_token_item_offset(leaf, item, &token);
+ btrfs_set_token_item_offset(leaf, item,
+ ioff + dsize, &token);
}
memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
@@ -3839,8 +3857,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
} else {
btrfs_set_path_blocking(path);
clean_tree_block(trans, root, leaf);
- ret = btrfs_del_leaf(trans, root, path, leaf);
- BUG_ON(ret);
+ btrfs_del_leaf(trans, root, path, leaf);
}
} else {
int used = leaf_space_used(leaf, 0, nritems);
@@ -3848,10 +3865,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct btrfs_disk_key disk_key;
btrfs_item_key(leaf, &disk_key, 0);
- wret = fixup_low_keys(trans, root, path,
- &disk_key, 1);
- if (wret)
- ret = wret;
+ fixup_low_keys(trans, root, path, &disk_key, 1);
}
/* delete the leaf if it is mostly empty */
@@ -3879,9 +3893,9 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
if (btrfs_header_nritems(leaf) == 0) {
path->slots[1] = slot;
- ret = btrfs_del_leaf(trans, root, path, leaf);
- BUG_ON(ret);
+ btrfs_del_leaf(trans, root, path, leaf);
free_extent_buffer(leaf);
+ ret = 0;
} else {
/* if we're still in the path, make sure
* we're dirty. Otherwise, one of the
@@ -4059,18 +4073,18 @@ find_next_key:
path->slots[level] = slot;
if (level == path->lowest_level) {
ret = 0;
- unlock_up(path, level, 1);
+ unlock_up(path, level, 1, 0, NULL);
goto out;
}
btrfs_set_path_blocking(path);
cur = read_node_slot(root, cur, slot);
- BUG_ON(!cur);
+ BUG_ON(!cur); /* -ENOMEM */
btrfs_tree_read_lock(cur);
path->locks[level - 1] = BTRFS_READ_LOCK;
path->nodes[level - 1] = cur;
- unlock_up(path, level, 1);
+ unlock_up(path, level, 1, 0, NULL);
btrfs_clear_path_blocking(path, NULL, 0);
}
out:
@@ -4306,7 +4320,7 @@ again:
}
ret = 0;
done:
- unlock_up(path, 0, 1);
+ unlock_up(path, 0, 1, 0, NULL);
path->leave_spinning = old_spinning;
if (!old_spinning)
btrfs_set_path_blocking(path);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 67385033323..5b8ef8eb352 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -48,6 +48,8 @@ struct btrfs_ordered_sum;
#define BTRFS_MAGIC "_BHRfS_M"
+#define BTRFS_MAX_MIRRORS 2
+
#define BTRFS_MAX_LEVEL 8
#define BTRFS_COMPAT_EXTENT_TREE_V0
@@ -86,6 +88,9 @@ struct btrfs_ordered_sum;
/* holds checksums of all the data extents */
#define BTRFS_CSUM_TREE_OBJECTID 7ULL
+/* for storing balance parameters in the root tree */
+#define BTRFS_BALANCE_OBJECTID -4ULL
+
/* orhpan objectid for tracking unlinked/truncated files */
#define BTRFS_ORPHAN_OBJECTID -5ULL
@@ -135,6 +140,12 @@ struct btrfs_ordered_sum;
#define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2
/*
+ * the max metadata block size. This limit is somewhat artificial,
+ * but the memmove costs go through the roof for larger blocks.
+ */
+#define BTRFS_MAX_METADATA_BLOCKSIZE 65536
+
+/*
* we can actually store much bigger names, but lets not confuse the rest
* of linux
*/
@@ -458,6 +469,19 @@ struct btrfs_super_block {
#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (1ULL << 1)
#define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS (1ULL << 2)
#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO (1ULL << 3)
+/*
+ * some patches floated around with a second compression method
+ * lets save that incompat here for when they do get in
+ * Note we don't actually support it, we're just reserving the
+ * number
+ */
+#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZOv2 (1ULL << 4)
+
+/*
+ * older kernels tried to do bigger metadata blocks, but the
+ * code was pretty buggy. Lets not let them try anymore.
+ */
+#define BTRFS_FEATURE_INCOMPAT_BIG_METADATA (1ULL << 5)
#define BTRFS_FEATURE_COMPAT_SUPP 0ULL
#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL
@@ -465,6 +489,7 @@ struct btrfs_super_block {
(BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \
BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \
BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \
+ BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \
BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO)
/*
@@ -692,6 +717,54 @@ struct btrfs_root_ref {
__le16 name_len;
} __attribute__ ((__packed__));
+struct btrfs_disk_balance_args {
+ /*
+ * profiles to operate on, single is denoted by
+ * BTRFS_AVAIL_ALLOC_BIT_SINGLE
+ */
+ __le64 profiles;
+
+ /* usage filter */
+ __le64 usage;
+
+ /* devid filter */
+ __le64 devid;
+
+ /* devid subset filter [pstart..pend) */
+ __le64 pstart;
+ __le64 pend;
+
+ /* btrfs virtual address space subset filter [vstart..vend) */
+ __le64 vstart;
+ __le64 vend;
+
+ /*
+ * profile to convert to, single is denoted by
+ * BTRFS_AVAIL_ALLOC_BIT_SINGLE
+ */
+ __le64 target;
+
+ /* BTRFS_BALANCE_ARGS_* */
+ __le64 flags;
+
+ __le64 unused[8];
+} __attribute__ ((__packed__));
+
+/*
+ * store balance parameters to disk so that balance can be properly
+ * resumed after crash or unmount
+ */
+struct btrfs_balance_item {
+ /* BTRFS_BALANCE_* */
+ __le64 flags;
+
+ struct btrfs_disk_balance_args data;
+ struct btrfs_disk_balance_args meta;
+ struct btrfs_disk_balance_args sys;
+
+ __le64 unused[4];
+} __attribute__ ((__packed__));
+
#define BTRFS_FILE_EXTENT_INLINE 0
#define BTRFS_FILE_EXTENT_REG 1
#define BTRFS_FILE_EXTENT_PREALLOC 2
@@ -751,14 +824,47 @@ struct btrfs_csum_item {
} __attribute__ ((__packed__));
/* different types of block groups (and chunks) */
-#define BTRFS_BLOCK_GROUP_DATA (1 << 0)
-#define BTRFS_BLOCK_GROUP_SYSTEM (1 << 1)
-#define BTRFS_BLOCK_GROUP_METADATA (1 << 2)
-#define BTRFS_BLOCK_GROUP_RAID0 (1 << 3)
-#define BTRFS_BLOCK_GROUP_RAID1 (1 << 4)
-#define BTRFS_BLOCK_GROUP_DUP (1 << 5)
-#define BTRFS_BLOCK_GROUP_RAID10 (1 << 6)
-#define BTRFS_NR_RAID_TYPES 5
+#define BTRFS_BLOCK_GROUP_DATA (1ULL << 0)
+#define BTRFS_BLOCK_GROUP_SYSTEM (1ULL << 1)
+#define BTRFS_BLOCK_GROUP_METADATA (1ULL << 2)
+#define BTRFS_BLOCK_GROUP_RAID0 (1ULL << 3)
+#define BTRFS_BLOCK_GROUP_RAID1 (1ULL << 4)
+#define BTRFS_BLOCK_GROUP_DUP (1ULL << 5)
+#define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6)
+#define BTRFS_BLOCK_GROUP_RESERVED BTRFS_AVAIL_ALLOC_BIT_SINGLE
+#define BTRFS_NR_RAID_TYPES 5
+
+#define BTRFS_BLOCK_GROUP_TYPE_MASK (BTRFS_BLOCK_GROUP_DATA | \
+ BTRFS_BLOCK_GROUP_SYSTEM | \
+ BTRFS_BLOCK_GROUP_METADATA)
+
+#define BTRFS_BLOCK_GROUP_PROFILE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \
+ BTRFS_BLOCK_GROUP_RAID1 | \
+ BTRFS_BLOCK_GROUP_DUP | \
+ BTRFS_BLOCK_GROUP_RAID10)
+/*
+ * We need a bit for restriper to be able to tell when chunks of type
+ * SINGLE are available. This "extended" profile format is used in
+ * fs_info->avail_*_alloc_bits (in-memory) and balance item fields
+ * (on-disk). The corresponding on-disk bit in chunk.type is reserved
+ * to avoid remappings between two formats in future.
+ */
+#define BTRFS_AVAIL_ALLOC_BIT_SINGLE (1ULL << 48)
+
+#define BTRFS_EXTENDED_PROFILE_MASK (BTRFS_BLOCK_GROUP_PROFILE_MASK | \
+ BTRFS_AVAIL_ALLOC_BIT_SINGLE)
+
+static inline u64 chunk_to_extended(u64 flags)
+{
+ if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0)
+ flags |= BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+
+ return flags;
+}
+static inline u64 extended_to_chunk(u64 flags)
+{
+ return flags & ~BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+}
struct btrfs_block_group_item {
__le64 used;
@@ -817,7 +923,7 @@ struct btrfs_block_rsv {
u64 reserved;
struct btrfs_space_info *space_info;
spinlock_t lock;
- unsigned int full:1;
+ unsigned int full;
};
/*
@@ -916,6 +1022,7 @@ struct btrfs_block_group_cache {
struct reloc_control;
struct btrfs_device;
struct btrfs_fs_devices;
+struct btrfs_balance_control;
struct btrfs_delayed_root;
struct btrfs_fs_info {
u8 fsid[BTRFS_FSID_SIZE];
@@ -971,7 +1078,7 @@ struct btrfs_fs_info {
* is required instead of the faster short fsync log commits
*/
u64 last_trans_log_full_commit;
- unsigned long mount_opt:20;
+ unsigned long mount_opt:21;
unsigned long compress_type:4;
u64 max_inline;
u64 alloc_start;
@@ -1132,12 +1239,23 @@ struct btrfs_fs_info {
spinlock_t ref_cache_lock;
u64 total_ref_cache_size;
+ /*
+ * these three are in extended format (availability of single
+ * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other
+ * types are denoted by corresponding BTRFS_BLOCK_GROUP_* bits)
+ */
u64 avail_data_alloc_bits;
u64 avail_metadata_alloc_bits;
u64 avail_system_alloc_bits;
- u64 data_alloc_profile;
- u64 metadata_alloc_profile;
- u64 system_alloc_profile;
+
+ /* restriper state */
+ spinlock_t balance_lock;
+ struct mutex balance_mutex;
+ atomic_t balance_running;
+ atomic_t balance_pause_req;
+ atomic_t balance_cancel_req;
+ struct btrfs_balance_control *balance_ctl;
+ wait_queue_head_t balance_wait_q;
unsigned data_chunk_allocations;
unsigned metadata_ratio;
@@ -1155,6 +1273,10 @@ struct btrfs_fs_info {
int scrub_workers_refcnt;
struct btrfs_workers scrub_workers;
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+ u32 check_integrity_print_mask;
+#endif
+
/* filesystem state */
u64 fs_state;
@@ -1383,6 +1505,8 @@ struct btrfs_ioctl_defrag_range_args {
#define BTRFS_DEV_ITEM_KEY 216
#define BTRFS_CHUNK_ITEM_KEY 228
+#define BTRFS_BALANCE_ITEM_KEY 248
+
/*
* string items are for debugging. They just store a short string of
* data in the FS
@@ -1413,6 +1537,10 @@ struct btrfs_ioctl_defrag_range_args {
#define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16)
#define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17)
#define BTRFS_MOUNT_RECOVERY (1 << 18)
+#define BTRFS_MOUNT_SKIP_BALANCE (1 << 19)
+#define BTRFS_MOUNT_CHECK_INTEGRITY (1 << 20)
+#define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21)
+#define BTRFS_MOUNT_PANIC_ON_FATAL_ERROR (1 << 22)
#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
@@ -1436,6 +1564,17 @@ struct btrfs_ioctl_defrag_range_args {
#define BTRFS_INODE_ROOT_ITEM_INIT (1 << 31)
+struct btrfs_map_token {
+ struct extent_buffer *eb;
+ char *kaddr;
+ unsigned long offset;
+};
+
+static inline void btrfs_init_map_token (struct btrfs_map_token *token)
+{
+ memset(token, 0, sizeof(*token));
+}
+
/* some macros to generate set/get funcs for the struct fields. This
* assumes there is a lefoo_to_cpu for every type, so lets make a simple
* one for u8:
@@ -1459,20 +1598,22 @@ struct btrfs_ioctl_defrag_range_args {
#ifndef BTRFS_SETGET_FUNCS
#define BTRFS_SETGET_FUNCS(name, type, member, bits) \
u##bits btrfs_##name(struct extent_buffer *eb, type *s); \
+u##bits btrfs_token_##name(struct extent_buffer *eb, type *s, struct btrfs_map_token *token); \
+void btrfs_set_token_##name(struct extent_buffer *eb, type *s, u##bits val, struct btrfs_map_token *token);\
void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val);
#endif
#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \
static inline u##bits btrfs_##name(struct extent_buffer *eb) \
{ \
- type *p = page_address(eb->first_page); \
+ type *p = page_address(eb->pages[0]); \
u##bits res = le##bits##_to_cpu(p->member); \
return res; \
} \
static inline void btrfs_set_##name(struct extent_buffer *eb, \
u##bits val) \
{ \
- type *p = page_address(eb->first_page); \
+ type *p = page_address(eb->pages[0]); \
p->member = cpu_to_le##bits(val); \
}
@@ -2077,8 +2218,86 @@ BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup,
BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup,
num_devices, 64);
-/* struct btrfs_super_block */
+/* struct btrfs_balance_item */
+BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64);
+
+static inline void btrfs_balance_data(struct extent_buffer *eb,
+ struct btrfs_balance_item *bi,
+ struct btrfs_disk_balance_args *ba)
+{
+ read_eb_member(eb, bi, struct btrfs_balance_item, data, ba);
+}
+
+static inline void btrfs_set_balance_data(struct extent_buffer *eb,
+ struct btrfs_balance_item *bi,
+ struct btrfs_disk_balance_args *ba)
+{
+ write_eb_member(eb, bi, struct btrfs_balance_item, data, ba);
+}
+
+static inline void btrfs_balance_meta(struct extent_buffer *eb,
+ struct btrfs_balance_item *bi,
+ struct btrfs_disk_balance_args *ba)
+{
+ read_eb_member(eb, bi, struct btrfs_balance_item, meta, ba);
+}
+
+static inline void btrfs_set_balance_meta(struct extent_buffer *eb,
+ struct btrfs_balance_item *bi,
+ struct btrfs_disk_balance_args *ba)
+{
+ write_eb_member(eb, bi, struct btrfs_balance_item, meta, ba);
+}
+
+static inline void btrfs_balance_sys(struct extent_buffer *eb,
+ struct btrfs_balance_item *bi,
+ struct btrfs_disk_balance_args *ba)
+{
+ read_eb_member(eb, bi, struct btrfs_balance_item, sys, ba);
+}
+
+static inline void btrfs_set_balance_sys(struct extent_buffer *eb,
+ struct btrfs_balance_item *bi,
+ struct btrfs_disk_balance_args *ba)
+{
+ write_eb_member(eb, bi, struct btrfs_balance_item, sys, ba);
+}
+
+static inline void
+btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu,
+ struct btrfs_disk_balance_args *disk)
+{
+ memset(cpu, 0, sizeof(*cpu));
+
+ cpu->profiles = le64_to_cpu(disk->profiles);
+ cpu->usage = le64_to_cpu(disk->usage);
+ cpu->devid = le64_to_cpu(disk->devid);
+ cpu->pstart = le64_to_cpu(disk->pstart);
+ cpu->pend = le64_to_cpu(disk->pend);
+ cpu->vstart = le64_to_cpu(disk->vstart);
+ cpu->vend = le64_to_cpu(disk->vend);
+ cpu->target = le64_to_cpu(disk->target);
+ cpu->flags = le64_to_cpu(disk->flags);
+}
+
+static inline void
+btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk,
+ struct btrfs_balance_args *cpu)
+{
+ memset(disk, 0, sizeof(*disk));
+
+ disk->profiles = cpu_to_le64(cpu->profiles);
+ disk->usage = cpu_to_le64(cpu->usage);
+ disk->devid = cpu_to_le64(cpu->devid);
+ disk->pstart = cpu_to_le64(cpu->pstart);
+ disk->pend = cpu_to_le64(cpu->pend);
+ disk->vstart = cpu_to_le64(cpu->vstart);
+ disk->vend = cpu_to_le64(cpu->vend);
+ disk->target = cpu_to_le64(cpu->target);
+ disk->flags = cpu_to_le64(cpu->flags);
+}
+/* struct btrfs_super_block */
BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64);
BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block,
@@ -2196,7 +2415,7 @@ static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb,
return btrfs_item_size(eb, e) - offset;
}
-static inline struct btrfs_root *btrfs_sb(struct super_block *sb)
+static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
{
return sb->s_fs_info;
}
@@ -2277,11 +2496,11 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u32 blocksize,
u64 parent, u64 root_objectid,
struct btrfs_disk_key *key, int level,
- u64 hint, u64 empty_size);
+ u64 hint, u64 empty_size, int for_cow);
void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf,
- u64 parent, int last_ref);
+ u64 parent, int last_ref, int for_cow);
struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u32 blocksize,
@@ -2298,32 +2517,31 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 num_bytes, u64 min_alloc_size,
u64 empty_size, u64 hint_byte,
- u64 search_end, struct btrfs_key *ins,
- u64 data);
+ struct btrfs_key *ins, u64 data);
int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct extent_buffer *buf, int full_backref);
+ struct extent_buffer *buf, int full_backref, int for_cow);
int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct extent_buffer *buf, int full_backref);
+ struct extent_buffer *buf, int full_backref, int for_cow);
int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 flags,
int is_data);
int btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- u64 bytenr, u64 num_bytes, u64 parent,
- u64 root_objectid, u64 owner, u64 offset);
+ u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
+ u64 owner, u64 offset, int for_cow);
int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
u64 start, u64 len);
-int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
- struct btrfs_root *root);
+void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent,
- u64 root_objectid, u64 owner, u64 offset);
+ u64 root_objectid, u64 owner, u64 offset, int for_cow);
int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
@@ -2380,8 +2598,8 @@ void btrfs_block_rsv_release(struct btrfs_root *root,
u64 num_bytes);
int btrfs_set_block_group_ro(struct btrfs_root *root,
struct btrfs_block_group_cache *cache);
-int btrfs_set_block_group_rw(struct btrfs_root *root,
- struct btrfs_block_group_cache *cache);
+void btrfs_set_block_group_rw(struct btrfs_root *root,
+ struct btrfs_block_group_cache *cache);
void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
int btrfs_error_unpin_extent_range(struct btrfs_root *root,
@@ -2400,9 +2618,9 @@ int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2);
int btrfs_previous_item(struct btrfs_root *root,
struct btrfs_path *path, u64 min_objectid,
int type);
-int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_path *path,
- struct btrfs_key *new_key);
+void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *path,
+ struct btrfs_key *new_key);
struct extent_buffer *btrfs_root_node(struct btrfs_root *root);
struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
@@ -2422,12 +2640,13 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
struct extent_buffer **cow_ret, u64 new_root_objectid);
int btrfs_block_can_be_shared(struct btrfs_root *root,
struct extent_buffer *buf);
-int btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_root
- *root, struct btrfs_path *path, u32 data_size);
-int btrfs_truncate_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- u32 new_size, int from_end);
+void btrfs_extend_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *path,
+ u32 data_size);
+void btrfs_truncate_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ u32 new_size, int from_end);
int btrfs_split_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
@@ -2461,10 +2680,10 @@ static inline int btrfs_del_item(struct btrfs_trans_handle *trans,
return btrfs_del_items(trans, root, path, path->slots[0], 1);
}
-int setup_items_for_insert(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_path *path,
- struct btrfs_key *cpu_key, u32 *data_size,
- u32 total_data, u32 total_size, int nr);
+void setup_items_for_insert(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *path,
+ struct btrfs_key *cpu_key, u32 *data_size,
+ u32 total_data, u32 total_size, int nr);
int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
*root, struct btrfs_key *key, void *data, u32 data_size);
int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
@@ -2482,10 +2701,18 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
}
int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
+static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p)
+{
+ ++p->slots[0];
+ if (p->slots[0] >= btrfs_header_nritems(p->nodes[0]))
+ return btrfs_next_leaf(root, p);
+ return 0;
+}
int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);
-void btrfs_drop_snapshot(struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv, int update_ref);
+int __must_check btrfs_drop_snapshot(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv,
+ int update_ref, int for_reloc);
int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *node,
@@ -2500,6 +2727,7 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
}
static inline void free_fs_info(struct btrfs_fs_info *fs_info)
{
+ kfree(fs_info->balance_ctl);
kfree(fs_info->delayed_root);
kfree(fs_info->extent_root);
kfree(fs_info->tree_root);
@@ -2528,9 +2756,10 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root
*root, struct btrfs_key *key, struct btrfs_root_item
*item);
-int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
- *root, struct btrfs_key *key, struct btrfs_root_item
- *item);
+int __must_check btrfs_update_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_key *key,
+ struct btrfs_root_item *item);
int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct
btrfs_root_item *item, struct btrfs_key *key);
int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid);
@@ -2714,7 +2943,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root);
void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size);
-int btrfs_invalidate_inodes(struct btrfs_root *root);
+void btrfs_invalidate_inodes(struct btrfs_root *root);
void btrfs_add_delayed_iput(struct inode *inode);
void btrfs_run_delayed_iputs(struct btrfs_root *root);
int btrfs_prealloc_file_range(struct inode *inode, int mode,
@@ -2766,13 +2995,41 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
/* super.c */
int btrfs_parse_options(struct btrfs_root *root, char *options);
int btrfs_sync_fs(struct super_block *sb, int wait);
+void btrfs_printk(struct btrfs_fs_info *fs_info, const char *fmt, ...);
void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
- unsigned int line, int errno);
+ unsigned int line, int errno, const char *fmt, ...);
+
+void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, const char *function,
+ unsigned int line, int errno);
+
+#define btrfs_abort_transaction(trans, root, errno) \
+do { \
+ __btrfs_abort_transaction(trans, root, __func__, \
+ __LINE__, errno); \
+} while (0)
#define btrfs_std_error(fs_info, errno) \
do { \
if ((errno)) \
- __btrfs_std_error((fs_info), __func__, __LINE__, (errno));\
+ __btrfs_std_error((fs_info), __func__, \
+ __LINE__, (errno), NULL); \
+} while (0)
+
+#define btrfs_error(fs_info, errno, fmt, args...) \
+do { \
+ __btrfs_std_error((fs_info), __func__, __LINE__, \
+ (errno), fmt, ##args); \
+} while (0)
+
+void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
+ unsigned int line, int errno, const char *fmt, ...);
+
+#define btrfs_panic(fs_info, errno, fmt, args...) \
+do { \
+ struct btrfs_fs_info *_i = (fs_info); \
+ __btrfs_panic(_i, __func__, __LINE__, errno, fmt, ##args); \
+ BUG_ON(!(_i->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR)); \
} while (0)
/* acl.c */
@@ -2808,16 +3065,17 @@ void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans,
struct btrfs_pending_snapshot *pending,
u64 *bytes_to_reserve);
-void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
+int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
struct btrfs_pending_snapshot *pending);
/* scrub.c */
int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
struct btrfs_scrub_progress *progress, int readonly);
-int btrfs_scrub_pause(struct btrfs_root *root);
-int btrfs_scrub_pause_super(struct btrfs_root *root);
-int btrfs_scrub_continue(struct btrfs_root *root);
-int btrfs_scrub_continue_super(struct btrfs_root *root);
+void btrfs_scrub_pause(struct btrfs_root *root);
+void btrfs_scrub_pause_super(struct btrfs_root *root);
+void btrfs_scrub_continue(struct btrfs_root *root);
+void btrfs_scrub_continue_super(struct btrfs_root *root);
+int __btrfs_scrub_cancel(struct btrfs_fs_info *info);
int btrfs_scrub_cancel(struct btrfs_root *root);
int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev);
int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid);
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 9c1eccc2c50..03e3748d84d 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -115,6 +115,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(struct inode *inode)
return NULL;
}
+/* Will return either the node or PTR_ERR(-ENOMEM) */
static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node(
struct inode *inode)
{
@@ -595,8 +596,12 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
num_bytes = btrfs_calc_trans_metadata_size(root, 1);
ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
- if (!ret)
+ if (!ret) {
+ trace_btrfs_space_reservation(root->fs_info, "delayed_item",
+ item->key.objectid,
+ num_bytes, 1);
item->bytes_reserved = num_bytes;
+ }
return ret;
}
@@ -610,6 +615,9 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
return;
rsv = &root->fs_info->delayed_block_rsv;
+ trace_btrfs_space_reservation(root->fs_info, "delayed_item",
+ item->key.objectid, item->bytes_reserved,
+ 0);
btrfs_block_rsv_release(root, rsv,
item->bytes_reserved);
}
@@ -624,7 +632,7 @@ static int btrfs_delayed_inode_reserve_metadata(
struct btrfs_block_rsv *dst_rsv;
u64 num_bytes;
int ret;
- int release = false;
+ bool release = false;
src_rsv = trans->block_rsv;
dst_rsv = &root->fs_info->delayed_block_rsv;
@@ -651,8 +659,13 @@ static int btrfs_delayed_inode_reserve_metadata(
*/
if (ret == -EAGAIN)
ret = -ENOSPC;
- if (!ret)
+ if (!ret) {
node->bytes_reserved = num_bytes;
+ trace_btrfs_space_reservation(root->fs_info,
+ "delayed_inode",
+ btrfs_ino(inode),
+ num_bytes, 1);
+ }
return ret;
} else if (src_rsv == &root->fs_info->delalloc_block_rsv) {
spin_lock(&BTRFS_I(inode)->lock);
@@ -707,11 +720,17 @@ out:
* reservation here. I think it may be time for a documentation page on
* how block rsvs. work.
*/
- if (!ret)
+ if (!ret) {
+ trace_btrfs_space_reservation(root->fs_info, "delayed_inode",
+ btrfs_ino(inode), num_bytes, 1);
node->bytes_reserved = num_bytes;
+ }
- if (release)
+ if (release) {
+ trace_btrfs_space_reservation(root->fs_info, "delalloc",
+ btrfs_ino(inode), num_bytes, 0);
btrfs_block_rsv_release(root, src_rsv, num_bytes);
+ }
return ret;
}
@@ -725,6 +744,8 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_root *root,
return;
rsv = &root->fs_info->delayed_block_rsv;
+ trace_btrfs_space_reservation(root->fs_info, "delayed_inode",
+ node->inode_id, node->bytes_reserved, 0);
btrfs_block_rsv_release(root, rsv,
node->bytes_reserved);
node->bytes_reserved = 0;
@@ -816,10 +837,8 @@ static int btrfs_batch_insert_items(struct btrfs_trans_handle *trans,
btrfs_clear_path_blocking(path, NULL, 0);
/* insert the keys of the items */
- ret = setup_items_for_insert(trans, root, path, keys, data_size,
- total_data_size, total_size, nitems);
- if (ret)
- goto error;
+ setup_items_for_insert(trans, root, path, keys, data_size,
+ total_data_size, total_size, nitems);
/* insert the dir index items */
slot = path->slots[0];
@@ -1088,16 +1107,25 @@ static int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
return 0;
}
-/* Called when committing the transaction. */
+/*
+ * Called when committing the transaction.
+ * Returns 0 on success.
+ * Returns < 0 on error and returns with an aborted transaction with any
+ * outstanding delayed items cleaned up.
+ */
int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
+ struct btrfs_root *curr_root = root;
struct btrfs_delayed_root *delayed_root;
struct btrfs_delayed_node *curr_node, *prev_node;
struct btrfs_path *path;
struct btrfs_block_rsv *block_rsv;
int ret = 0;
+ if (trans->aborted)
+ return -EIO;
+
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -1110,17 +1138,18 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
curr_node = btrfs_first_delayed_node(delayed_root);
while (curr_node) {
- root = curr_node->root;
- ret = btrfs_insert_delayed_items(trans, path, root,
+ curr_root = curr_node->root;
+ ret = btrfs_insert_delayed_items(trans, path, curr_root,
curr_node);
if (!ret)
- ret = btrfs_delete_delayed_items(trans, path, root,
- curr_node);
+ ret = btrfs_delete_delayed_items(trans, path,
+ curr_root, curr_node);
if (!ret)
- ret = btrfs_update_delayed_inode(trans, root, path,
- curr_node);
+ ret = btrfs_update_delayed_inode(trans, curr_root,
+ path, curr_node);
if (ret) {
btrfs_release_delayed_node(curr_node);
+ btrfs_abort_transaction(trans, root, ret);
break;
}
@@ -1131,6 +1160,7 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
btrfs_free_path(path);
trans->block_rsv = block_rsv;
+
return ret;
}
@@ -1351,6 +1381,7 @@ void btrfs_balance_delayed_items(struct btrfs_root *root)
btrfs_wq_run_delayed_node(delayed_root, root, 0);
}
+/* Will return 0 or -ENOMEM */
int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
struct btrfs_root *root, const char *name,
int name_len, struct inode *dir,
@@ -1372,13 +1403,6 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
goto release_node;
}
- ret = btrfs_delayed_item_reserve_metadata(trans, root, delayed_item);
- /*
- * we have reserved enough space when we start a new transaction,
- * so reserving metadata failure is impossible
- */
- BUG_ON(ret);
-
delayed_item->key.objectid = btrfs_ino(dir);
btrfs_set_key_type(&delayed_item->key, BTRFS_DIR_INDEX_KEY);
delayed_item->key.offset = index;
@@ -1391,6 +1415,14 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
dir_item->type = type;
memcpy((char *)(dir_item + 1), name, name_len);
+ ret = btrfs_delayed_item_reserve_metadata(trans, root, delayed_item);
+ /*
+ * we have reserved enough space when we start a new transaction,
+ * so reserving metadata failure is impossible
+ */
+ BUG_ON(ret);
+
+
mutex_lock(&delayed_node->mutex);
ret = __btrfs_add_delayed_insertion_item(delayed_node, delayed_item);
if (unlikely(ret)) {
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 125cf76fcd0..69f22e3ab3b 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -101,6 +101,11 @@ static int comp_entry(struct btrfs_delayed_ref_node *ref2,
return -1;
if (ref1->type > ref2->type)
return 1;
+ /* merging of sequenced refs is not allowed */
+ if (ref1->seq < ref2->seq)
+ return -1;
+ if (ref1->seq > ref2->seq)
+ return 1;
if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY ||
ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) {
return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2),
@@ -150,16 +155,22 @@ static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root,
/*
* find an head entry based on bytenr. This returns the delayed ref
- * head if it was able to find one, or NULL if nothing was in that spot
+ * head if it was able to find one, or NULL if nothing was in that spot.
+ * If return_bigger is given, the next bigger entry is returned if no exact
+ * match is found.
*/
static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root,
u64 bytenr,
- struct btrfs_delayed_ref_node **last)
+ struct btrfs_delayed_ref_node **last,
+ int return_bigger)
{
- struct rb_node *n = root->rb_node;
+ struct rb_node *n;
struct btrfs_delayed_ref_node *entry;
- int cmp;
+ int cmp = 0;
+again:
+ n = root->rb_node;
+ entry = NULL;
while (n) {
entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
WARN_ON(!entry->in_tree);
@@ -182,6 +193,19 @@ static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root,
else
return entry;
}
+ if (entry && return_bigger) {
+ if (cmp > 0) {
+ n = rb_next(&entry->rb_node);
+ if (!n)
+ n = rb_first(root);
+ entry = rb_entry(n, struct btrfs_delayed_ref_node,
+ rb_node);
+ bytenr = entry->bytenr;
+ return_bigger = 0;
+ goto again;
+ }
+ return entry;
+ }
return NULL;
}
@@ -209,6 +233,24 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
return 0;
}
+int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
+ u64 seq)
+{
+ struct seq_list *elem;
+
+ assert_spin_locked(&delayed_refs->lock);
+ if (list_empty(&delayed_refs->seq_head))
+ return 0;
+
+ elem = list_first_entry(&delayed_refs->seq_head, struct seq_list, list);
+ if (seq >= elem->seq) {
+ pr_debug("holding back delayed_ref %llu, lowest is %llu (%p)\n",
+ seq, elem->seq, delayed_refs);
+ return 1;
+ }
+ return 0;
+}
+
int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
struct list_head *cluster, u64 start)
{
@@ -223,20 +265,8 @@ int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
node = rb_first(&delayed_refs->root);
} else {
ref = NULL;
- find_ref_head(&delayed_refs->root, start, &ref);
+ find_ref_head(&delayed_refs->root, start + 1, &ref, 1);
if (ref) {
- struct btrfs_delayed_ref_node *tmp;
-
- node = rb_prev(&ref->rb_node);
- while (node) {
- tmp = rb_entry(node,
- struct btrfs_delayed_ref_node,
- rb_node);
- if (tmp->bytenr < start)
- break;
- ref = tmp;
- node = rb_prev(&ref->rb_node);
- }
node = &ref->rb_node;
} else
node = rb_first(&delayed_refs->root);
@@ -390,7 +420,8 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
* this does all the dirty work in terms of maintaining the correct
* overall modification count.
*/
-static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans,
+static noinline void add_delayed_ref_head(struct btrfs_fs_info *fs_info,
+ struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_node *ref,
u64 bytenr, u64 num_bytes,
int action, int is_data)
@@ -437,6 +468,7 @@ static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans,
ref->action = 0;
ref->is_head = 1;
ref->in_tree = 1;
+ ref->seq = 0;
head_ref = btrfs_delayed_node_to_head(ref);
head_ref->must_insert_reserved = must_insert_reserved;
@@ -455,27 +487,29 @@ static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans,
* we've updated the existing ref, free the newly
* allocated ref
*/
- kfree(ref);
+ kfree(head_ref);
} else {
delayed_refs->num_heads++;
delayed_refs->num_heads_ready++;
delayed_refs->num_entries++;
trans->delayed_ref_updates++;
}
- return 0;
}
/*
* helper to insert a delayed tree ref into the rbtree.
*/
-static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans,
+static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
+ struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_node *ref,
u64 bytenr, u64 num_bytes, u64 parent,
- u64 ref_root, int level, int action)
+ u64 ref_root, int level, int action,
+ int for_cow)
{
struct btrfs_delayed_ref_node *existing;
struct btrfs_delayed_tree_ref *full_ref;
struct btrfs_delayed_ref_root *delayed_refs;
+ u64 seq = 0;
if (action == BTRFS_ADD_DELAYED_EXTENT)
action = BTRFS_ADD_DELAYED_REF;
@@ -491,14 +525,17 @@ static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans,
ref->is_head = 0;
ref->in_tree = 1;
+ if (need_ref_seq(for_cow, ref_root))
+ seq = inc_delayed_seq(delayed_refs);
+ ref->seq = seq;
+
full_ref = btrfs_delayed_node_to_tree_ref(ref);
- if (parent) {
- full_ref->parent = parent;
+ full_ref->parent = parent;
+ full_ref->root = ref_root;
+ if (parent)
ref->type = BTRFS_SHARED_BLOCK_REF_KEY;
- } else {
- full_ref->root = ref_root;
+ else
ref->type = BTRFS_TREE_BLOCK_REF_KEY;
- }
full_ref->level = level;
trace_btrfs_delayed_tree_ref(ref, full_ref, action);
@@ -511,26 +548,27 @@ static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans,
* we've updated the existing ref, free the newly
* allocated ref
*/
- kfree(ref);
+ kfree(full_ref);
} else {
delayed_refs->num_entries++;
trans->delayed_ref_updates++;
}
- return 0;
}
/*
* helper to insert a delayed data ref into the rbtree.
*/
-static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans,
+static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info,
+ struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_node *ref,
u64 bytenr, u64 num_bytes, u64 parent,
u64 ref_root, u64 owner, u64 offset,
- int action)
+ int action, int for_cow)
{
struct btrfs_delayed_ref_node *existing;
struct btrfs_delayed_data_ref *full_ref;
struct btrfs_delayed_ref_root *delayed_refs;
+ u64 seq = 0;
if (action == BTRFS_ADD_DELAYED_EXTENT)
action = BTRFS_ADD_DELAYED_REF;
@@ -546,14 +584,18 @@ static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans,
ref->is_head = 0;
ref->in_tree = 1;
+ if (need_ref_seq(for_cow, ref_root))
+ seq = inc_delayed_seq(delayed_refs);
+ ref->seq = seq;
+
full_ref = btrfs_delayed_node_to_data_ref(ref);
- if (parent) {
- full_ref->parent = parent;
+ full_ref->parent = parent;
+ full_ref->root = ref_root;
+ if (parent)
ref->type = BTRFS_SHARED_DATA_REF_KEY;
- } else {
- full_ref->root = ref_root;
+ else
ref->type = BTRFS_EXTENT_DATA_REF_KEY;
- }
+
full_ref->objectid = owner;
full_ref->offset = offset;
@@ -567,12 +609,11 @@ static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans,
* we've updated the existing ref, free the newly
* allocated ref
*/
- kfree(ref);
+ kfree(full_ref);
} else {
delayed_refs->num_entries++;
trans->delayed_ref_updates++;
}
- return 0;
}
/*
@@ -580,15 +621,16 @@ static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans,
* to make sure the delayed ref is eventually processed before this
* transaction commits.
*/
-int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
+int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
+ struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes, u64 parent,
u64 ref_root, int level, int action,
- struct btrfs_delayed_extent_op *extent_op)
+ struct btrfs_delayed_extent_op *extent_op,
+ int for_cow)
{
struct btrfs_delayed_tree_ref *ref;
struct btrfs_delayed_ref_head *head_ref;
struct btrfs_delayed_ref_root *delayed_refs;
- int ret;
BUG_ON(extent_op && extent_op->is_data);
ref = kmalloc(sizeof(*ref), GFP_NOFS);
@@ -610,13 +652,15 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
* insert both the head node and the new ref without dropping
* the spin lock
*/
- ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes,
- action, 0);
- BUG_ON(ret);
-
- ret = add_delayed_tree_ref(trans, &ref->node, bytenr, num_bytes,
- parent, ref_root, level, action);
- BUG_ON(ret);
+ add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr,
+ num_bytes, action, 0);
+
+ add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr,
+ num_bytes, parent, ref_root, level, action,
+ for_cow);
+ if (!need_ref_seq(for_cow, ref_root) &&
+ waitqueue_active(&delayed_refs->seq_wait))
+ wake_up(&delayed_refs->seq_wait);
spin_unlock(&delayed_refs->lock);
return 0;
}
@@ -624,16 +668,17 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
/*
* add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref.
*/
-int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
+int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
+ struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes,
u64 parent, u64 ref_root,
u64 owner, u64 offset, int action,
- struct btrfs_delayed_extent_op *extent_op)
+ struct btrfs_delayed_extent_op *extent_op,
+ int for_cow)
{
struct btrfs_delayed_data_ref *ref;
struct btrfs_delayed_ref_head *head_ref;
struct btrfs_delayed_ref_root *delayed_refs;
- int ret;
BUG_ON(extent_op && !extent_op->is_data);
ref = kmalloc(sizeof(*ref), GFP_NOFS);
@@ -655,24 +700,26 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
* insert both the head node and the new ref without dropping
* the spin lock
*/
- ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes,
- action, 1);
- BUG_ON(ret);
-
- ret = add_delayed_data_ref(trans, &ref->node, bytenr, num_bytes,
- parent, ref_root, owner, offset, action);
- BUG_ON(ret);
+ add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr,
+ num_bytes, action, 1);
+
+ add_delayed_data_ref(fs_info, trans, &ref->node, bytenr,
+ num_bytes, parent, ref_root, owner, offset,
+ action, for_cow);
+ if (!need_ref_seq(for_cow, ref_root) &&
+ waitqueue_active(&delayed_refs->seq_wait))
+ wake_up(&delayed_refs->seq_wait);
spin_unlock(&delayed_refs->lock);
return 0;
}
-int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
+int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
+ struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes,
struct btrfs_delayed_extent_op *extent_op)
{
struct btrfs_delayed_ref_head *head_ref;
struct btrfs_delayed_ref_root *delayed_refs;
- int ret;
head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
if (!head_ref)
@@ -683,11 +730,12 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
delayed_refs = &trans->transaction->delayed_refs;
spin_lock(&delayed_refs->lock);
- ret = add_delayed_ref_head(trans, &head_ref->node, bytenr,
+ add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr,
num_bytes, BTRFS_UPDATE_DELAYED_HEAD,
extent_op->is_data);
- BUG_ON(ret);
+ if (waitqueue_active(&delayed_refs->seq_wait))
+ wake_up(&delayed_refs->seq_wait);
spin_unlock(&delayed_refs->lock);
return 0;
}
@@ -704,7 +752,7 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
struct btrfs_delayed_ref_root *delayed_refs;
delayed_refs = &trans->transaction->delayed_refs;
- ref = find_ref_head(&delayed_refs->root, bytenr, NULL);
+ ref = find_ref_head(&delayed_refs->root, bytenr, NULL, 0);
if (ref)
return btrfs_delayed_node_to_head(ref);
return NULL;
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index e287e3b0eab..d8f244d9492 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -33,6 +33,9 @@ struct btrfs_delayed_ref_node {
/* the size of the extent */
u64 num_bytes;
+ /* seq number to keep track of insertion order */
+ u64 seq;
+
/* ref count on this data structure */
atomic_t refs;
@@ -98,19 +101,15 @@ struct btrfs_delayed_ref_head {
struct btrfs_delayed_tree_ref {
struct btrfs_delayed_ref_node node;
- union {
- u64 root;
- u64 parent;
- };
+ u64 root;
+ u64 parent;
int level;
};
struct btrfs_delayed_data_ref {
struct btrfs_delayed_ref_node node;
- union {
- u64 root;
- u64 parent;
- };
+ u64 root;
+ u64 parent;
u64 objectid;
u64 offset;
};
@@ -140,6 +139,26 @@ struct btrfs_delayed_ref_root {
int flushing;
u64 run_delayed_start;
+
+ /*
+ * seq number of delayed refs. We need to know if a backref was being
+ * added before the currently processed ref or afterwards.
+ */
+ u64 seq;
+
+ /*
+ * seq_list holds a list of all seq numbers that are currently being
+ * added to the list. While walking backrefs (btrfs_find_all_roots,
+ * qgroups), which might take some time, no newer ref must be processed,
+ * as it might influence the outcome of the walk.
+ */
+ struct list_head seq_head;
+
+ /*
+ * when the only refs we have in the list must not be processed, we want
+ * to wait for more refs to show up or for the end of backref walking.
+ */
+ wait_queue_head_t seq_wait;
};
static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
@@ -151,16 +170,21 @@ static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
}
}
-int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
+int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
+ struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes, u64 parent,
u64 ref_root, int level, int action,
- struct btrfs_delayed_extent_op *extent_op);
-int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_extent_op *extent_op,
+ int for_cow);
+int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
+ struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes,
u64 parent, u64 ref_root,
u64 owner, u64 offset, int action,
- struct btrfs_delayed_extent_op *extent_op);
-int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_extent_op *extent_op,
+ int for_cow);
+int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
+ struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes,
struct btrfs_delayed_extent_op *extent_op);
@@ -170,6 +194,60 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *head);
int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
struct list_head *cluster, u64 search_start);
+
+struct seq_list {
+ struct list_head list;
+ u64 seq;
+};
+
+static inline u64 inc_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs)
+{
+ assert_spin_locked(&delayed_refs->lock);
+ ++delayed_refs->seq;
+ return delayed_refs->seq;
+}
+
+static inline void
+btrfs_get_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
+ struct seq_list *elem)
+{
+ assert_spin_locked(&delayed_refs->lock);
+ elem->seq = delayed_refs->seq;
+ list_add_tail(&elem->list, &delayed_refs->seq_head);
+}
+
+static inline void
+btrfs_put_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
+ struct seq_list *elem)
+{
+ spin_lock(&delayed_refs->lock);
+ list_del(&elem->list);
+ wake_up(&delayed_refs->seq_wait);
+ spin_unlock(&delayed_refs->lock);
+}
+
+int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
+ u64 seq);
+
+/*
+ * delayed refs with a ref_seq > 0 must be held back during backref walking.
+ * this only applies to items in one of the fs-trees. for_cow items never need
+ * to be held back, so they won't get a ref_seq number.
+ */
+static inline int need_ref_seq(int for_cow, u64 rootid)
+{
+ if (for_cow)
+ return 0;
+
+ if (rootid == BTRFS_FS_TREE_OBJECTID)
+ return 1;
+
+ if ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID)
+ return 1;
+
+ return 0;
+}
+
/*
* a node might live in a head or a regular ref, this lets you
* test for the proper type to use.
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 31d84e78129..c1a074d0696 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -49,9 +49,8 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
di = btrfs_match_dir_item_name(root, path, name, name_len);
if (di)
return ERR_PTR(-EEXIST);
- ret = btrfs_extend_item(trans, root, path, data_size);
- }
- if (ret < 0)
+ btrfs_extend_item(trans, root, path, data_size);
+ } else if (ret < 0)
return ERR_PTR(ret);
WARN_ON(ret > 0);
leaf = path->nodes[0];
@@ -116,6 +115,7 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
* 'location' is the key to stuff into the directory item, 'type' is the
* type of the inode we're pointing to, and 'index' is the sequence number
* to use for the second index (if one is created).
+ * Will return 0 or -ENOMEM
*/
int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
*root, const char *name, int name_len,
@@ -383,8 +383,8 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
start = btrfs_item_ptr_offset(leaf, path->slots[0]);
memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
item_len - (ptr + sub_item_len - start));
- ret = btrfs_truncate_item(trans, root, path,
- item_len - sub_item_len, 1);
+ btrfs_truncate_item(trans, root, path,
+ item_len - sub_item_len, 1);
}
return ret;
}
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index f99a099a774..20196f41120 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -43,24 +43,24 @@
#include "tree-log.h"
#include "free-space-cache.h"
#include "inode-map.h"
+#include "check-integrity.h"
static struct extent_io_ops btree_extent_io_ops;
static void end_workqueue_fn(struct btrfs_work *work);
static void free_fs_root(struct btrfs_root *root);
-static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
+static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
int read_only);
-static int btrfs_destroy_ordered_operations(struct btrfs_root *root);
-static int btrfs_destroy_ordered_extents(struct btrfs_root *root);
+static void btrfs_destroy_ordered_operations(struct btrfs_root *root);
+static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
struct btrfs_root *root);
-static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t);
-static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root);
+static void btrfs_destroy_pending_snapshots(struct btrfs_transaction *t);
+static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root);
static int btrfs_destroy_marked_extents(struct btrfs_root *root,
struct extent_io_tree *dirty_pages,
int mark);
static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
struct extent_io_tree *pinned_extents);
-static int btrfs_cleanup_transaction(struct btrfs_root *root);
/*
* end_io_wq structs are used to do processing in task context when an IO is
@@ -98,6 +98,7 @@ struct async_submit_bio {
*/
u64 bio_offset;
struct btrfs_work work;
+ int error;
};
/*
@@ -331,8 +332,8 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
return 0;
lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
- 0, &cached_state, GFP_NOFS);
- if (extent_buffer_uptodate(io_tree, eb, cached_state) &&
+ 0, &cached_state);
+ if (extent_buffer_uptodate(eb) &&
btrfs_header_generation(eb) == parent_transid) {
ret = 0;
goto out;
@@ -343,7 +344,7 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
(unsigned long long)parent_transid,
(unsigned long long)btrfs_header_generation(eb));
ret = 1;
- clear_extent_buffer_uptodate(io_tree, eb, &cached_state);
+ clear_extent_buffer_uptodate(eb);
out:
unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1,
&cached_state, GFP_NOFS);
@@ -359,9 +360,11 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
u64 start, u64 parent_transid)
{
struct extent_io_tree *io_tree;
+ int failed = 0;
int ret;
int num_copies = 0;
int mirror_num = 0;
+ int failed_mirror = 0;
clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
@@ -369,9 +372,8 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
ret = read_extent_buffer_pages(io_tree, eb, start,
WAIT_COMPLETE,
btree_get_extent, mirror_num);
- if (!ret &&
- !verify_parent_transid(io_tree, eb, parent_transid))
- return ret;
+ if (!ret && !verify_parent_transid(io_tree, eb, parent_transid))
+ break;
/*
* This buffer's crc is fine, but its contents are corrupted, so
@@ -379,18 +381,31 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
* any less wrong.
*/
if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags))
- return ret;
+ break;
+
+ if (!failed_mirror) {
+ failed = 1;
+ printk(KERN_ERR "failed mirror was %d\n", eb->failed_mirror);
+ failed_mirror = eb->failed_mirror;
+ }
num_copies = btrfs_num_copies(&root->fs_info->mapping_tree,
eb->start, eb->len);
if (num_copies == 1)
- return ret;
+ break;
mirror_num++;
+ if (mirror_num == failed_mirror)
+ mirror_num++;
+
if (mirror_num > num_copies)
- return ret;
+ break;
}
- return -EIO;
+
+ if (failed && !ret)
+ repair_eb_io_failure(root, eb, failed_mirror);
+
+ return ret;
}
/*
@@ -403,50 +418,27 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
struct extent_io_tree *tree;
u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
u64 found_start;
- unsigned long len;
struct extent_buffer *eb;
- int ret;
tree = &BTRFS_I(page->mapping->host)->io_tree;
- if (page->private == EXTENT_PAGE_PRIVATE) {
- WARN_ON(1);
- goto out;
- }
- if (!page->private) {
- WARN_ON(1);
- goto out;
- }
- len = page->private >> 2;
- WARN_ON(len == 0);
-
- eb = alloc_extent_buffer(tree, start, len, page);
- if (eb == NULL) {
- WARN_ON(1);
- goto out;
- }
- ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE,
- btrfs_header_generation(eb));
- BUG_ON(ret);
- WARN_ON(!btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN));
-
+ eb = (struct extent_buffer *)page->private;
+ if (page != eb->pages[0])
+ return 0;
found_start = btrfs_header_bytenr(eb);
if (found_start != start) {
WARN_ON(1);
- goto err;
+ return 0;
}
- if (eb->first_page != page) {
+ if (eb->pages[0] != page) {
WARN_ON(1);
- goto err;
+ return 0;
}
if (!PageUptodate(page)) {
WARN_ON(1);
- goto err;
+ return 0;
}
csum_tree_block(root, eb, 0);
-err:
- free_extent_buffer(eb);
-out:
return 0;
}
@@ -536,34 +528,74 @@ static noinline int check_leaf(struct btrfs_root *root,
return 0;
}
+struct extent_buffer *find_eb_for_page(struct extent_io_tree *tree,
+ struct page *page, int max_walk)
+{
+ struct extent_buffer *eb;
+ u64 start = page_offset(page);
+ u64 target = start;
+ u64 min_start;
+
+ if (start < max_walk)
+ min_start = 0;
+ else
+ min_start = start - max_walk;
+
+ while (start >= min_start) {
+ eb = find_extent_buffer(tree, start, 0);
+ if (eb) {
+ /*
+ * we found an extent buffer and it contains our page
+ * horray!
+ */
+ if (eb->start <= target &&
+ eb->start + eb->len > target)
+ return eb;
+
+ /* we found an extent buffer that wasn't for us */
+ free_extent_buffer(eb);
+ return NULL;
+ }
+ if (start == 0)
+ break;
+ start -= PAGE_CACHE_SIZE;
+ }
+ return NULL;
+}
+
static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
struct extent_state *state)
{
struct extent_io_tree *tree;
u64 found_start;
int found_level;
- unsigned long len;
struct extent_buffer *eb;
struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
int ret = 0;
+ int reads_done;
- tree = &BTRFS_I(page->mapping->host)->io_tree;
- if (page->private == EXTENT_PAGE_PRIVATE)
- goto out;
if (!page->private)
goto out;
- len = page->private >> 2;
- WARN_ON(len == 0);
+ tree = &BTRFS_I(page->mapping->host)->io_tree;
+ eb = (struct extent_buffer *)page->private;
+
+ /* the pending IO might have been the only thing that kept this buffer
+ * in memory. Make sure we have a ref for all this other checks
+ */
+ extent_buffer_get(eb);
+
+ reads_done = atomic_dec_and_test(&eb->io_pages);
+ if (!reads_done)
+ goto err;
- eb = alloc_extent_buffer(tree, start, len, page);
- if (eb == NULL) {
+ if (test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) {
ret = -EIO;
- goto out;
+ goto err;
}
found_start = btrfs_header_bytenr(eb);
- if (found_start != start) {
+ if (found_start != eb->start) {
printk_ratelimited(KERN_INFO "btrfs bad tree block start "
"%llu %llu\n",
(unsigned long long)found_start,
@@ -571,13 +603,6 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
ret = -EIO;
goto err;
}
- if (eb->first_page != page) {
- printk(KERN_INFO "btrfs bad first page %lu %lu\n",
- eb->first_page->index, page->index);
- WARN_ON(1);
- ret = -EIO;
- goto err;
- }
if (check_tree_block_fsid(root, eb)) {
printk_ratelimited(KERN_INFO "btrfs bad fsid on block %llu\n",
(unsigned long long)eb->start);
@@ -605,48 +630,31 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
ret = -EIO;
}
- end = min_t(u64, eb->len, PAGE_CACHE_SIZE);
- end = eb->start + end - 1;
+ if (!ret)
+ set_extent_buffer_uptodate(eb);
err:
if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) {
clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags);
btree_readahead_hook(root, eb, eb->start, ret);
}
+ if (ret)
+ clear_extent_buffer_uptodate(eb);
free_extent_buffer(eb);
out:
return ret;
}
-static int btree_io_failed_hook(struct bio *failed_bio,
- struct page *page, u64 start, u64 end,
- int mirror_num, struct extent_state *state)
+static int btree_io_failed_hook(struct page *page, int failed_mirror)
{
- struct extent_io_tree *tree;
- unsigned long len;
struct extent_buffer *eb;
struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
- tree = &BTRFS_I(page->mapping->host)->io_tree;
- if (page->private == EXTENT_PAGE_PRIVATE)
- goto out;
- if (!page->private)
- goto out;
-
- len = page->private >> 2;
- WARN_ON(len == 0);
-
- eb = alloc_extent_buffer(tree, start, len, page);
- if (eb == NULL)
- goto out;
-
- if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) {
- clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags);
+ eb = (struct extent_buffer *)page->private;
+ set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
+ eb->failed_mirror = failed_mirror;
+ if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
btree_readahead_hook(root, eb, eb->start, -EIO);
- }
- free_extent_buffer(eb);
-
-out:
return -EIO; /* we fixed nothing */
}
@@ -718,11 +726,14 @@ unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info)
static void run_one_async_start(struct btrfs_work *work)
{
struct async_submit_bio *async;
+ int ret;
async = container_of(work, struct async_submit_bio, work);
- async->submit_bio_start(async->inode, async->rw, async->bio,
- async->mirror_num, async->bio_flags,
- async->bio_offset);
+ ret = async->submit_bio_start(async->inode, async->rw, async->bio,
+ async->mirror_num, async->bio_flags,
+ async->bio_offset);
+ if (ret)
+ async->error = ret;
}
static void run_one_async_done(struct btrfs_work *work)
@@ -743,6 +754,12 @@ static void run_one_async_done(struct btrfs_work *work)
waitqueue_active(&fs_info->async_submit_wait))
wake_up(&fs_info->async_submit_wait);
+ /* If an error occured we just want to clean up the bio and move on */
+ if (async->error) {
+ bio_endio(async->bio, async->error);
+ return;
+ }
+
async->submit_bio_done(async->inode, async->rw, async->bio,
async->mirror_num, async->bio_flags,
async->bio_offset);
@@ -784,6 +801,8 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
async->bio_flags = bio_flags;
async->bio_offset = bio_offset;
+ async->error = 0;
+
atomic_inc(&fs_info->nr_async_submits);
if (rw & REQ_SYNC)
@@ -805,15 +824,18 @@ static int btree_csum_one_bio(struct bio *bio)
struct bio_vec *bvec = bio->bi_io_vec;
int bio_index = 0;
struct btrfs_root *root;
+ int ret = 0;
WARN_ON(bio->bi_vcnt <= 0);
while (bio_index < bio->bi_vcnt) {
root = BTRFS_I(bvec->bv_page->mapping->host)->root;
- csum_dirty_buffer(root, bvec->bv_page);
+ ret = csum_dirty_buffer(root, bvec->bv_page);
+ if (ret)
+ break;
bio_index++;
bvec++;
}
- return 0;
+ return ret;
}
static int __btree_submit_bio_start(struct inode *inode, int rw,
@@ -825,8 +847,7 @@ static int __btree_submit_bio_start(struct inode *inode, int rw,
* when we're called for a write, we're already in the async
* submission context. Just jump into btrfs_map_bio
*/
- btree_csum_one_bio(bio);
- return 0;
+ return btree_csum_one_bio(bio);
}
static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
@@ -846,15 +867,16 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
{
int ret;
- ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info,
- bio, 1);
- BUG_ON(ret);
-
if (!(rw & REQ_WRITE)) {
+
/*
* called for a read, do the setup so that checksum validation
* can happen in the async kernel threads
*/
+ ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info,
+ bio, 1);
+ if (ret)
+ return ret;
return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
mirror_num, 0);
}
@@ -872,7 +894,8 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
#ifdef CONFIG_MIGRATION
static int btree_migratepage(struct address_space *mapping,
- struct page *newpage, struct page *page)
+ struct page *newpage, struct page *page,
+ enum migrate_mode mode)
{
/*
* we can't safely write a btree page from here,
@@ -887,38 +910,10 @@ static int btree_migratepage(struct address_space *mapping,
if (page_has_private(page) &&
!try_to_release_page(page, GFP_KERNEL))
return -EAGAIN;
- return migrate_page(mapping, newpage, page);
+ return migrate_page(mapping, newpage, page, mode);
}
#endif
-static int btree_writepage(struct page *page, struct writeback_control *wbc)
-{
- struct extent_io_tree *tree;
- struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
- struct extent_buffer *eb;
- int was_dirty;
-
- tree = &BTRFS_I(page->mapping->host)->io_tree;
- if (!(current->flags & PF_MEMALLOC)) {
- return extent_write_full_page(tree, page,
- btree_get_extent, wbc);
- }
-
- redirty_page_for_writepage(wbc, page);
- eb = btrfs_find_tree_block(root, page_offset(page), PAGE_CACHE_SIZE);
- WARN_ON(!eb);
-
- was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
- if (!was_dirty) {
- spin_lock(&root->fs_info->delalloc_lock);
- root->fs_info->dirty_metadata_bytes += PAGE_CACHE_SIZE;
- spin_unlock(&root->fs_info->delalloc_lock);
- }
- free_extent_buffer(eb);
-
- unlock_page(page);
- return 0;
-}
static int btree_writepages(struct address_space *mapping,
struct writeback_control *wbc)
@@ -938,7 +933,7 @@ static int btree_writepages(struct address_space *mapping,
if (num_dirty < thresh)
return 0;
}
- return extent_writepages(tree, mapping, btree_get_extent, wbc);
+ return btree_write_cache_pages(mapping, wbc);
}
static int btree_readpage(struct file *file, struct page *page)
@@ -950,28 +945,16 @@ static int btree_readpage(struct file *file, struct page *page)
static int btree_releasepage(struct page *page, gfp_t gfp_flags)
{
- struct extent_io_tree *tree;
- struct extent_map_tree *map;
- int ret;
-
if (PageWriteback(page) || PageDirty(page))
return 0;
+ /*
+ * We need to mask out eg. __GFP_HIGHMEM and __GFP_DMA32 as we're doing
+ * slab allocation from alloc_extent_state down the callchain where
+ * it'd hit a BUG_ON as those flags are not allowed.
+ */
+ gfp_flags &= ~GFP_SLAB_BUG_MASK;
- tree = &BTRFS_I(page->mapping->host)->io_tree;
- map = &BTRFS_I(page->mapping->host)->extent_tree;
-
- ret = try_release_extent_state(map, tree, page, gfp_flags);
- if (!ret)
- return 0;
-
- ret = try_release_extent_buffer(tree, page);
- if (ret == 1) {
- ClearPagePrivate(page);
- set_page_private(page, 0);
- page_cache_release(page);
- }
-
- return ret;
+ return try_release_extent_buffer(page, gfp_flags);
}
static void btree_invalidatepage(struct page *page, unsigned long offset)
@@ -989,15 +972,28 @@ static void btree_invalidatepage(struct page *page, unsigned long offset)
}
}
+static int btree_set_page_dirty(struct page *page)
+{
+ struct extent_buffer *eb;
+
+ BUG_ON(!PagePrivate(page));
+ eb = (struct extent_buffer *)page->private;
+ BUG_ON(!eb);
+ BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
+ BUG_ON(!atomic_read(&eb->refs));
+ btrfs_assert_tree_locked(eb);
+ return __set_page_dirty_nobuffers(page);
+}
+
static const struct address_space_operations btree_aops = {
.readpage = btree_readpage,
- .writepage = btree_writepage,
.writepages = btree_writepages,
.releasepage = btree_releasepage,
.invalidatepage = btree_invalidatepage,
#ifdef CONFIG_MIGRATION
.migratepage = btree_migratepage,
#endif
+ .set_page_dirty = btree_set_page_dirty,
};
int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
@@ -1040,7 +1036,7 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) {
free_extent_buffer(buf);
return -EIO;
- } else if (extent_buffer_uptodate(io_tree, buf, NULL)) {
+ } else if (extent_buffer_uptodate(buf)) {
*eb = buf;
} else {
free_extent_buffer(buf);
@@ -1065,20 +1061,20 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
struct extent_buffer *eb;
eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
- bytenr, blocksize, NULL);
+ bytenr, blocksize);
return eb;
}
int btrfs_write_tree_block(struct extent_buffer *buf)
{
- return filemap_fdatawrite_range(buf->first_page->mapping, buf->start,
+ return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start,
buf->start + buf->len - 1);
}
int btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
{
- return filemap_fdatawait_range(buf->first_page->mapping,
+ return filemap_fdatawait_range(buf->pages[0]->mapping,
buf->start, buf->start + buf->len - 1);
}
@@ -1093,17 +1089,13 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
return NULL;
ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
-
- if (ret == 0)
- set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags);
return buf;
}
-int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct extent_buffer *buf)
+void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ struct extent_buffer *buf)
{
- struct inode *btree_inode = root->fs_info->btree_inode;
if (btrfs_header_generation(buf) ==
root->fs_info->running_transaction->transid) {
btrfs_assert_tree_locked(buf);
@@ -1112,23 +1104,27 @@ int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
spin_lock(&root->fs_info->delalloc_lock);
if (root->fs_info->dirty_metadata_bytes >= buf->len)
root->fs_info->dirty_metadata_bytes -= buf->len;
- else
- WARN_ON(1);
+ else {
+ spin_unlock(&root->fs_info->delalloc_lock);
+ btrfs_panic(root->fs_info, -EOVERFLOW,
+ "Can't clear %lu bytes from "
+ " dirty_mdatadata_bytes (%lu)",
+ buf->len,
+ root->fs_info->dirty_metadata_bytes);
+ }
spin_unlock(&root->fs_info->delalloc_lock);
}
/* ugh, clear_extent_buffer_dirty needs to lock the page */
btrfs_set_lock_blocking(buf);
- clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
- buf);
+ clear_extent_buffer_dirty(buf);
}
- return 0;
}
-static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
- u32 stripesize, struct btrfs_root *root,
- struct btrfs_fs_info *fs_info,
- u64 objectid)
+static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
+ u32 stripesize, struct btrfs_root *root,
+ struct btrfs_fs_info *fs_info,
+ u64 objectid)
{
root->node = NULL;
root->commit_root = NULL;
@@ -1142,7 +1138,6 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
root->orphan_item_inserted = 0;
root->orphan_cleanup_state = 0;
- root->fs_info = fs_info;
root->objectid = objectid;
root->last_trans = 0;
root->highest_objectid = 0;
@@ -1181,13 +1176,12 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
root->defrag_running = 0;
root->root_key.objectid = objectid;
root->anon_dev = 0;
- return 0;
}
-static int find_and_setup_root(struct btrfs_root *tree_root,
- struct btrfs_fs_info *fs_info,
- u64 objectid,
- struct btrfs_root *root)
+static int __must_check find_and_setup_root(struct btrfs_root *tree_root,
+ struct btrfs_fs_info *fs_info,
+ u64 objectid,
+ struct btrfs_root *root)
{
int ret;
u32 blocksize;
@@ -1200,7 +1194,8 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
&root->root_item, &root->root_key);
if (ret > 0)
return -ENOENT;
- BUG_ON(ret);
+ else if (ret < 0)
+ return ret;
generation = btrfs_root_generation(&root->root_item);
blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
@@ -1216,6 +1211,14 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
return 0;
}
+static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_root *root = kzalloc(sizeof(*root), GFP_NOFS);
+ if (root)
+ root->fs_info = fs_info;
+ return root;
+}
+
static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info)
{
@@ -1223,7 +1226,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
struct btrfs_root *tree_root = fs_info->tree_root;
struct extent_buffer *leaf;
- root = kzalloc(sizeof(*root), GFP_NOFS);
+ root = btrfs_alloc_root(fs_info);
if (!root)
return ERR_PTR(-ENOMEM);
@@ -1243,7 +1246,8 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
root->ref_cows = 0;
leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
- BTRFS_TREE_LOG_OBJECTID, NULL, 0, 0, 0);
+ BTRFS_TREE_LOG_OBJECTID, NULL,
+ 0, 0, 0, 0);
if (IS_ERR(leaf)) {
kfree(root);
return ERR_CAST(leaf);
@@ -1317,7 +1321,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
u32 blocksize;
int ret = 0;
- root = kzalloc(sizeof(*root), GFP_NOFS);
+ root = btrfs_alloc_root(fs_info);
if (!root)
return ERR_PTR(-ENOMEM);
if (location->offset == (u64)-1) {
@@ -1360,7 +1364,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
blocksize, generation);
root->commit_root = btrfs_root_node(root);
- BUG_ON(!root->node);
+ BUG_ON(!root->node); /* -ENOMEM */
out:
if (location->objectid != BTRFS_TREE_LOG_OBJECTID) {
root->ref_cows = 1;
@@ -1496,41 +1500,6 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
return 0;
}
-static int bio_ready_for_csum(struct bio *bio)
-{
- u64 length = 0;
- u64 buf_len = 0;
- u64 start = 0;
- struct page *page;
- struct extent_io_tree *io_tree = NULL;
- struct bio_vec *bvec;
- int i;
- int ret;
-
- bio_for_each_segment(bvec, bio, i) {
- page = bvec->bv_page;
- if (page->private == EXTENT_PAGE_PRIVATE) {
- length += bvec->bv_len;
- continue;
- }
- if (!page->private) {
- length += bvec->bv_len;
- continue;
- }
- length = bvec->bv_len;
- buf_len = page->private >> 2;
- start = page_offset(page) + bvec->bv_offset;
- io_tree = &BTRFS_I(page->mapping->host)->io_tree;
- }
- /* are we fully contained in this bio? */
- if (buf_len <= length)
- return 1;
-
- ret = extent_range_uptodate(io_tree, start + length,
- start + buf_len - 1);
- return ret;
-}
-
/*
* called by the kthread helper functions to finally call the bio end_io
* functions. This is where read checksum verification actually happens
@@ -1546,17 +1515,6 @@ static void end_workqueue_fn(struct btrfs_work *work)
bio = end_io_wq->bio;
fs_info = end_io_wq->info;
- /* metadata bio reads are special because the whole tree block must
- * be checksummed at once. This makes sure the entire block is in
- * ram and up to date before trying to verify things. For
- * blocksize <= pagesize, it is basically a noop
- */
- if (!(bio->bi_rw & REQ_WRITE) && end_io_wq->metadata &&
- !bio_ready_for_csum(bio)) {
- btrfs_queue_worker(&fs_info->endio_meta_workers,
- &end_io_wq->work);
- return;
- }
error = end_io_wq->error;
bio->bi_private = end_io_wq->private;
bio->bi_end_io = end_io_wq->end_io;
@@ -1597,9 +1555,10 @@ static int transaction_kthread(void *arg)
u64 transid;
unsigned long now;
unsigned long delay;
- int ret;
+ bool cannot_commit;
do {
+ cannot_commit = false;
delay = HZ * 30;
vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
mutex_lock(&root->fs_info->transaction_kthread_mutex);
@@ -1621,11 +1580,14 @@ static int transaction_kthread(void *arg)
transid = cur->transid;
spin_unlock(&root->fs_info->trans_lock);
+ /* If the file system is aborted, this will always fail. */
trans = btrfs_join_transaction(root);
- BUG_ON(IS_ERR(trans));
+ if (IS_ERR(trans)) {
+ cannot_commit = true;
+ goto sleep;
+ }
if (transid == trans->transid) {
- ret = btrfs_commit_transaction(trans, root);
- BUG_ON(ret);
+ btrfs_commit_transaction(trans, root);
} else {
btrfs_end_transaction(trans, root);
}
@@ -1636,7 +1598,8 @@ sleep:
if (!try_to_freeze()) {
set_current_state(TASK_INTERRUPTIBLE);
if (!kthread_should_stop() &&
- !btrfs_transaction_blocked(root->fs_info))
+ (!btrfs_transaction_blocked(root->fs_info) ||
+ cannot_commit))
schedule_timeout(delay);
__set_current_state(TASK_RUNNING);
}
@@ -1873,9 +1836,9 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
}
-struct btrfs_root *open_ctree(struct super_block *sb,
- struct btrfs_fs_devices *fs_devices,
- char *options)
+int open_ctree(struct super_block *sb,
+ struct btrfs_fs_devices *fs_devices,
+ char *options)
{
u32 sectorsize;
u32 nodesize;
@@ -1887,8 +1850,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
struct btrfs_key location;
struct buffer_head *bh;
struct btrfs_super_block *disk_super;
- struct btrfs_root *tree_root = btrfs_sb(sb);
- struct btrfs_fs_info *fs_info = tree_root->fs_info;
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+ struct btrfs_root *tree_root;
struct btrfs_root *extent_root;
struct btrfs_root *csum_root;
struct btrfs_root *chunk_root;
@@ -1899,16 +1862,14 @@ struct btrfs_root *open_ctree(struct super_block *sb,
int num_backups_tried = 0;
int backup_index = 0;
- extent_root = fs_info->extent_root =
- kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
- csum_root = fs_info->csum_root =
- kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
- chunk_root = fs_info->chunk_root =
- kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
- dev_root = fs_info->dev_root =
- kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
+ tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info);
+ extent_root = fs_info->extent_root = btrfs_alloc_root(fs_info);
+ csum_root = fs_info->csum_root = btrfs_alloc_root(fs_info);
+ chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info);
+ dev_root = fs_info->dev_root = btrfs_alloc_root(fs_info);
- if (!extent_root || !csum_root || !chunk_root || !dev_root) {
+ if (!tree_root || !extent_root || !csum_root ||
+ !chunk_root || !dev_root) {
err = -ENOMEM;
goto fail;
}
@@ -1997,6 +1958,17 @@ struct btrfs_root *open_ctree(struct super_block *sb,
init_waitqueue_head(&fs_info->scrub_pause_wait);
init_rwsem(&fs_info->scrub_super_lock);
fs_info->scrub_workers_refcnt = 0;
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+ fs_info->check_integrity_print_mask = 0;
+#endif
+
+ spin_lock_init(&fs_info->balance_lock);
+ mutex_init(&fs_info->balance_mutex);
+ atomic_set(&fs_info->balance_running, 0);
+ atomic_set(&fs_info->balance_pause_req, 0);
+ atomic_set(&fs_info->balance_cancel_req, 0);
+ fs_info->balance_ctl = NULL;
+ init_waitqueue_head(&fs_info->balance_wait_q);
sb->s_blocksize = 4096;
sb->s_blocksize_bits = blksize_bits(4096);
@@ -2016,6 +1988,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node);
extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
fs_info->btree_inode->i_mapping);
+ BTRFS_I(fs_info->btree_inode)->io_tree.track_uptodate = 0;
extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree);
BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops;
@@ -2058,6 +2031,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
__setup_root(4096, 4096, 4096, 4096, tree_root,
fs_info, BTRFS_ROOT_TREE_OBJECTID);
+ invalidate_bdev(fs_devices->latest_bdev);
bh = btrfs_read_dev_super(fs_devices->latest_bdev);
if (!bh) {
err = -EINVAL;
@@ -2078,7 +2052,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
/* check FS state, whether FS is broken. */
fs_info->fs_state |= btrfs_super_flags(disk_super);
- btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
+ ret = btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
+ if (ret) {
+ printk(KERN_ERR "btrfs: superblock contains fatal errors\n");
+ err = ret;
+ goto fail_alloc;
+ }
/*
* run through our array of backup supers and setup
@@ -2109,10 +2088,55 @@ struct btrfs_root *open_ctree(struct super_block *sb,
goto fail_alloc;
}
+ if (btrfs_super_leafsize(disk_super) !=
+ btrfs_super_nodesize(disk_super)) {
+ printk(KERN_ERR "BTRFS: couldn't mount because metadata "
+ "blocksizes don't match. node %d leaf %d\n",
+ btrfs_super_nodesize(disk_super),
+ btrfs_super_leafsize(disk_super));
+ err = -EINVAL;
+ goto fail_alloc;
+ }
+ if (btrfs_super_leafsize(disk_super) > BTRFS_MAX_METADATA_BLOCKSIZE) {
+ printk(KERN_ERR "BTRFS: couldn't mount because metadata "
+ "blocksize (%d) was too large\n",
+ btrfs_super_leafsize(disk_super));
+ err = -EINVAL;
+ goto fail_alloc;
+ }
+
features = btrfs_super_incompat_flags(disk_super);
features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
if (tree_root->fs_info->compress_type & BTRFS_COMPRESS_LZO)
features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
+
+ /*
+ * flag our filesystem as having big metadata blocks if
+ * they are bigger than the page size
+ */
+ if (btrfs_super_leafsize(disk_super) > PAGE_CACHE_SIZE) {
+ if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA))
+ printk(KERN_INFO "btrfs flagging fs with big metadata feature\n");
+ features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
+ }
+
+ nodesize = btrfs_super_nodesize(disk_super);
+ leafsize = btrfs_super_leafsize(disk_super);
+ sectorsize = btrfs_super_sectorsize(disk_super);
+ stripesize = btrfs_super_stripesize(disk_super);
+
+ /*
+ * mixed block groups end up with duplicate but slightly offset
+ * extent buffers for the same range. It leads to corruptions
+ */
+ if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
+ (sectorsize != leafsize)) {
+ printk(KERN_WARNING "btrfs: unequal leaf/node/sector sizes "
+ "are not allowed for mixed block groups on %s\n",
+ sb->s_id);
+ goto fail_alloc;
+ }
+
btrfs_set_super_incompat_flags(disk_super, features);
features = btrfs_super_compat_ro_flags(disk_super) &
@@ -2216,10 +2240,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
4 * 1024 * 1024 / PAGE_CACHE_SIZE);
- nodesize = btrfs_super_nodesize(disk_super);
- leafsize = btrfs_super_leafsize(disk_super);
- sectorsize = btrfs_super_sectorsize(disk_super);
- stripesize = btrfs_super_stripesize(disk_super);
tree_root->nodesize = nodesize;
tree_root->leafsize = leafsize;
tree_root->sectorsize = sectorsize;
@@ -2234,6 +2254,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
goto fail_sb_buffer;
}
+ if (sectorsize < PAGE_SIZE) {
+ printk(KERN_WARNING "btrfs: Incompatible sector size "
+ "found on %s\n", sb->s_id);
+ goto fail_sb_buffer;
+ }
+
mutex_lock(&fs_info->chunk_mutex);
ret = btrfs_read_sys_array(tree_root);
mutex_unlock(&fs_info->chunk_mutex);
@@ -2253,7 +2279,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
chunk_root->node = read_tree_block(chunk_root,
btrfs_super_chunk_root(disk_super),
blocksize, generation);
- BUG_ON(!chunk_root->node);
+ BUG_ON(!chunk_root->node); /* -ENOMEM */
if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n",
sb->s_id);
@@ -2266,9 +2292,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
(unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node),
BTRFS_UUID_SIZE);
- mutex_lock(&fs_info->chunk_mutex);
ret = btrfs_read_chunk_tree(chunk_root);
- mutex_unlock(&fs_info->chunk_mutex);
if (ret) {
printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n",
sb->s_id);
@@ -2277,6 +2301,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
btrfs_close_extra_devices(fs_devices);
+ if (!fs_devices->latest_bdev) {
+ printk(KERN_CRIT "btrfs: failed to read devices on %s\n",
+ sb->s_id);
+ goto fail_tree_roots;
+ }
+
retry_root_backup:
blocksize = btrfs_level_size(tree_root,
btrfs_super_root_level(disk_super));
@@ -2317,9 +2347,6 @@ retry_root_backup:
fs_info->generation = generation;
fs_info->last_trans_committed = generation;
- fs_info->data_alloc_profile = (u64)-1;
- fs_info->metadata_alloc_profile = (u64)-1;
- fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
ret = btrfs_init_space_info(fs_info);
if (ret) {
@@ -2352,6 +2379,19 @@ retry_root_backup:
btrfs_set_opt(fs_info->mount_opt, SSD);
}
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+ if (btrfs_test_opt(tree_root, CHECK_INTEGRITY)) {
+ ret = btrfsic_mount(tree_root, fs_devices,
+ btrfs_test_opt(tree_root,
+ CHECK_INTEGRITY_INCLUDING_EXTENT_DATA) ?
+ 1 : 0,
+ fs_info->check_integrity_print_mask);
+ if (ret)
+ printk(KERN_WARNING "btrfs: failed to initialize"
+ " integrity check module %s\n", sb->s_id);
+ }
+#endif
+
/* do not make disk changes in broken FS */
if (btrfs_super_log_root(disk_super) != 0 &&
!(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) {
@@ -2367,7 +2407,7 @@ retry_root_backup:
btrfs_level_size(tree_root,
btrfs_super_log_root_level(disk_super));
- log_tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
+ log_tree_root = btrfs_alloc_root(fs_info);
if (!log_tree_root) {
err = -ENOMEM;
goto fail_trans_kthread;
@@ -2379,21 +2419,31 @@ retry_root_backup:
log_tree_root->node = read_tree_block(tree_root, bytenr,
blocksize,
generation + 1);
+ /* returns with log_tree_root freed on success */
ret = btrfs_recover_log_trees(log_tree_root);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_error(tree_root->fs_info, ret,
+ "Failed to recover log tree");
+ free_extent_buffer(log_tree_root->node);
+ kfree(log_tree_root);
+ goto fail_trans_kthread;
+ }
if (sb->s_flags & MS_RDONLY) {
- ret = btrfs_commit_super(tree_root);
- BUG_ON(ret);
+ ret = btrfs_commit_super(tree_root);
+ if (ret)
+ goto fail_trans_kthread;
}
}
ret = btrfs_find_orphan_roots(tree_root);
- BUG_ON(ret);
+ if (ret)
+ goto fail_trans_kthread;
if (!(sb->s_flags & MS_RDONLY)) {
ret = btrfs_cleanup_fs_roots(fs_info);
- BUG_ON(ret);
+ if (ret) {
+ }
ret = btrfs_recover_relocation(tree_root);
if (ret < 0) {
@@ -2422,13 +2472,17 @@ retry_root_backup:
if (!err)
err = btrfs_orphan_cleanup(fs_info->tree_root);
up_read(&fs_info->cleanup_work_sem);
+
+ if (!err)
+ err = btrfs_recover_balance(fs_info->tree_root);
+
if (err) {
close_ctree(tree_root);
- return ERR_PTR(err);
+ return err;
}
}
- return tree_root;
+ return 0;
fail_trans_kthread:
kthread_stop(fs_info->transaction_kthread);
@@ -2474,8 +2528,7 @@ fail_srcu:
cleanup_srcu_struct(&fs_info->subvol_srcu);
fail:
btrfs_close_devices(fs_info->fs_devices);
- free_fs_info(fs_info);
- return ERR_PTR(err);
+ return err;
recovery_tree_root:
if (!btrfs_test_opt(tree_root, RECOVERY))
@@ -2630,7 +2683,7 @@ static int write_dev_supers(struct btrfs_device *device,
* we fua the first super. The others we allow
* to go down lazy.
*/
- ret = submit_bh(WRITE_FUA, bh);
+ ret = btrfsic_submit_bh(WRITE_FUA, bh);
if (ret)
errors++;
}
@@ -2707,7 +2760,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
device->flush_bio = bio;
bio_get(bio);
- submit_bio(WRITE_FLUSH, bio);
+ btrfsic_submit_bio(WRITE_FLUSH, bio);
return 0;
}
@@ -2810,6 +2863,8 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
if (total_errors > max_errors) {
printk(KERN_ERR "btrfs: %d errors while writing supers\n",
total_errors);
+
+ /* This shouldn't happen. FUA is masked off if unsupported */
BUG();
}
@@ -2826,9 +2881,9 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
}
mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
if (total_errors > max_errors) {
- printk(KERN_ERR "btrfs: %d errors while writing supers\n",
- total_errors);
- BUG();
+ btrfs_error(root->fs_info, -EIO,
+ "%d errors while writing supers", total_errors);
+ return -EIO;
}
return 0;
}
@@ -2842,7 +2897,20 @@ int write_ctree_super(struct btrfs_trans_handle *trans,
return ret;
}
-int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
+/* Kill all outstanding I/O */
+void btrfs_abort_devices(struct btrfs_root *root)
+{
+ struct list_head *head;
+ struct btrfs_device *dev;
+ mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+ head = &root->fs_info->fs_devices->devices;
+ list_for_each_entry_rcu(dev, head, dev_list) {
+ blk_abort_queue(dev->bdev->bd_disk->queue);
+ }
+ mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+}
+
+void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
{
spin_lock(&fs_info->fs_roots_radix_lock);
radix_tree_delete(&fs_info->fs_roots_radix,
@@ -2855,7 +2923,6 @@ int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
__btrfs_remove_free_space_cache(root->free_ino_pinned);
__btrfs_remove_free_space_cache(root->free_ino_ctl);
free_fs_root(root);
- return 0;
}
static void free_fs_root(struct btrfs_root *root)
@@ -2872,7 +2939,7 @@ static void free_fs_root(struct btrfs_root *root)
kfree(root);
}
-static int del_fs_roots(struct btrfs_fs_info *fs_info)
+static void del_fs_roots(struct btrfs_fs_info *fs_info)
{
int ret;
struct btrfs_root *gang[8];
@@ -2901,7 +2968,6 @@ static int del_fs_roots(struct btrfs_fs_info *fs_info)
for (i = 0; i < ret; i++)
btrfs_free_fs_root(fs_info, gang[i]);
}
- return 0;
}
int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
@@ -2950,14 +3016,21 @@ int btrfs_commit_super(struct btrfs_root *root)
if (IS_ERR(trans))
return PTR_ERR(trans);
ret = btrfs_commit_transaction(trans, root);
- BUG_ON(ret);
+ if (ret)
+ return ret;
/* run commit again to drop the original snapshot */
trans = btrfs_join_transaction(root);
if (IS_ERR(trans))
return PTR_ERR(trans);
- btrfs_commit_transaction(trans, root);
+ ret = btrfs_commit_transaction(trans, root);
+ if (ret)
+ return ret;
ret = btrfs_write_and_wait_transaction(NULL, root);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_error(root->fs_info, ret,
+ "Failed to sync btree inode to disk.");
+ return ret;
+ }
ret = write_ctree_super(NULL, root, 0);
return ret;
@@ -2971,6 +3044,9 @@ int close_ctree(struct btrfs_root *root)
fs_info->closing = 1;
smp_mb();
+ /* pause restriper - we want to resume on mount */
+ btrfs_pause_balance(root->fs_info);
+
btrfs_scrub_cancel(root);
/* wait for any defraggers to finish */
@@ -2978,7 +3054,7 @@ int close_ctree(struct btrfs_root *root)
(atomic_read(&fs_info->defrag_running) == 0));
/* clear out the rbtree of defraggable inodes */
- btrfs_run_defrag_inodes(root->fs_info);
+ btrfs_run_defrag_inodes(fs_info);
/*
* Here come 2 situations when btrfs is broken to flip readonly:
@@ -3007,8 +3083,8 @@ int close_ctree(struct btrfs_root *root)
btrfs_put_block_group_cache(fs_info);
- kthread_stop(root->fs_info->transaction_kthread);
- kthread_stop(root->fs_info->cleaner_kthread);
+ kthread_stop(fs_info->transaction_kthread);
+ kthread_stop(fs_info->cleaner_kthread);
fs_info->closing = 2;
smp_mb();
@@ -3026,14 +3102,14 @@ int close_ctree(struct btrfs_root *root)
free_extent_buffer(fs_info->extent_root->commit_root);
free_extent_buffer(fs_info->tree_root->node);
free_extent_buffer(fs_info->tree_root->commit_root);
- free_extent_buffer(root->fs_info->chunk_root->node);
- free_extent_buffer(root->fs_info->chunk_root->commit_root);
- free_extent_buffer(root->fs_info->dev_root->node);
- free_extent_buffer(root->fs_info->dev_root->commit_root);
- free_extent_buffer(root->fs_info->csum_root->node);
- free_extent_buffer(root->fs_info->csum_root->commit_root);
+ free_extent_buffer(fs_info->chunk_root->node);
+ free_extent_buffer(fs_info->chunk_root->commit_root);
+ free_extent_buffer(fs_info->dev_root->node);
+ free_extent_buffer(fs_info->dev_root->commit_root);
+ free_extent_buffer(fs_info->csum_root->node);
+ free_extent_buffer(fs_info->csum_root->commit_root);
- btrfs_free_block_groups(root->fs_info);
+ btrfs_free_block_groups(fs_info);
del_fs_roots(fs_info);
@@ -3053,24 +3129,26 @@ int close_ctree(struct btrfs_root *root)
btrfs_stop_workers(&fs_info->caching_workers);
btrfs_stop_workers(&fs_info->readahead_workers);
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+ if (btrfs_test_opt(root, CHECK_INTEGRITY))
+ btrfsic_unmount(root, fs_info->fs_devices);
+#endif
+
btrfs_close_devices(fs_info->fs_devices);
btrfs_mapping_tree_free(&fs_info->mapping_tree);
bdi_destroy(&fs_info->bdi);
cleanup_srcu_struct(&fs_info->subvol_srcu);
- free_fs_info(fs_info);
-
return 0;
}
int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid)
{
int ret;
- struct inode *btree_inode = buf->first_page->mapping->host;
+ struct inode *btree_inode = buf->pages[0]->mapping->host;
- ret = extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf,
- NULL);
+ ret = extent_buffer_uptodate(buf);
if (!ret)
return ret;
@@ -3081,16 +3159,13 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid)
int btrfs_set_buffer_uptodate(struct extent_buffer *buf)
{
- struct inode *btree_inode = buf->first_page->mapping->host;
- return set_extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree,
- buf);
+ return set_extent_buffer_uptodate(buf);
}
void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
{
- struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
+ struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root;
u64 transid = btrfs_header_generation(buf);
- struct inode *btree_inode = root->fs_info->btree_inode;
int was_dirty;
btrfs_assert_tree_locked(buf);
@@ -3102,8 +3177,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
(unsigned long long)root->fs_info->generation);
WARN_ON(1);
}
- was_dirty = set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
- buf);
+ was_dirty = set_extent_buffer_dirty(buf);
if (!was_dirty) {
spin_lock(&root->fs_info->delalloc_lock);
root->fs_info->dirty_metadata_bytes += buf->len;
@@ -3157,12 +3231,8 @@ void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
{
- struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
- int ret;
- ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
- if (ret == 0)
- set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags);
- return ret;
+ struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root;
+ return btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
}
static int btree_lock_page_hook(struct page *page, void *data,
@@ -3170,17 +3240,21 @@ static int btree_lock_page_hook(struct page *page, void *data,
{
struct inode *inode = page->mapping->host;
struct btrfs_root *root = BTRFS_I(inode)->root;
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct extent_buffer *eb;
- unsigned long len;
- u64 bytenr = page_offset(page);
- if (page->private == EXTENT_PAGE_PRIVATE)
+ /*
+ * We culled this eb but the page is still hanging out on the mapping,
+ * carry on.
+ */
+ if (!PagePrivate(page))
goto out;
- len = page->private >> 2;
- eb = find_extent_buffer(io_tree, bytenr, len);
- if (!eb)
+ eb = (struct extent_buffer *)page->private;
+ if (!eb) {
+ WARN_ON(1);
+ goto out;
+ }
+ if (page != eb->pages[0])
goto out;
if (!btrfs_try_tree_write_lock(eb)) {
@@ -3199,7 +3273,6 @@ static int btree_lock_page_hook(struct page *page, void *data,
}
btrfs_tree_unlock(eb);
- free_extent_buffer(eb);
out:
if (!trylock_page(page)) {
flush_fn(data);
@@ -3208,15 +3281,23 @@ out:
return 0;
}
-static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
+static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
int read_only)
{
+ if (btrfs_super_csum_type(fs_info->super_copy) >= ARRAY_SIZE(btrfs_csum_sizes)) {
+ printk(KERN_ERR "btrfs: unsupported checksum algorithm\n");
+ return -EINVAL;
+ }
+
if (read_only)
- return;
+ return 0;
- if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
+ if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
printk(KERN_WARNING "warning: mount fs with errors, "
"running btrfsck is recommended\n");
+ }
+
+ return 0;
}
int btrfs_error_commit_super(struct btrfs_root *root)
@@ -3238,7 +3319,7 @@ int btrfs_error_commit_super(struct btrfs_root *root)
return ret;
}
-static int btrfs_destroy_ordered_operations(struct btrfs_root *root)
+static void btrfs_destroy_ordered_operations(struct btrfs_root *root)
{
struct btrfs_inode *btrfs_inode;
struct list_head splice;
@@ -3260,11 +3341,9 @@ static int btrfs_destroy_ordered_operations(struct btrfs_root *root)
spin_unlock(&root->fs_info->ordered_extent_lock);
mutex_unlock(&root->fs_info->ordered_operations_mutex);
-
- return 0;
}
-static int btrfs_destroy_ordered_extents(struct btrfs_root *root)
+static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
{
struct list_head splice;
struct btrfs_ordered_extent *ordered;
@@ -3296,12 +3375,10 @@ static int btrfs_destroy_ordered_extents(struct btrfs_root *root)
}
spin_unlock(&root->fs_info->ordered_extent_lock);
-
- return 0;
}
-static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
- struct btrfs_root *root)
+int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
+ struct btrfs_root *root)
{
struct rb_node *node;
struct btrfs_delayed_ref_root *delayed_refs;
@@ -3310,6 +3387,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
delayed_refs = &trans->delayed_refs;
+again:
spin_lock(&delayed_refs->lock);
if (delayed_refs->num_entries == 0) {
spin_unlock(&delayed_refs->lock);
@@ -3331,6 +3409,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
struct btrfs_delayed_ref_head *head;
head = btrfs_delayed_node_to_head(ref);
+ spin_unlock(&delayed_refs->lock);
mutex_lock(&head->mutex);
kfree(head->extent_op);
delayed_refs->num_heads--;
@@ -3338,8 +3417,9 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
delayed_refs->num_heads_ready--;
list_del_init(&head->cluster);
mutex_unlock(&head->mutex);
+ btrfs_put_delayed_ref(ref);
+ goto again;
}
-
spin_unlock(&delayed_refs->lock);
btrfs_put_delayed_ref(ref);
@@ -3352,7 +3432,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
return ret;
}
-static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t)
+static void btrfs_destroy_pending_snapshots(struct btrfs_transaction *t)
{
struct btrfs_pending_snapshot *snapshot;
struct list_head splice;
@@ -3370,11 +3450,9 @@ static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t)
kfree(snapshot);
}
-
- return 0;
}
-static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
+static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
{
struct btrfs_inode *btrfs_inode;
struct list_head splice;
@@ -3394,8 +3472,6 @@ static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
}
spin_unlock(&root->fs_info->delalloc_lock);
-
- return 0;
}
static int btrfs_destroy_marked_extents(struct btrfs_root *root,
@@ -3486,13 +3562,43 @@ static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
return 0;
}
-static int btrfs_cleanup_transaction(struct btrfs_root *root)
+void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
+ struct btrfs_root *root)
+{
+ btrfs_destroy_delayed_refs(cur_trans, root);
+ btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv,
+ cur_trans->dirty_pages.dirty_bytes);
+
+ /* FIXME: cleanup wait for commit */
+ cur_trans->in_commit = 1;
+ cur_trans->blocked = 1;
+ if (waitqueue_active(&root->fs_info->transaction_blocked_wait))
+ wake_up(&root->fs_info->transaction_blocked_wait);
+
+ cur_trans->blocked = 0;
+ if (waitqueue_active(&root->fs_info->transaction_wait))
+ wake_up(&root->fs_info->transaction_wait);
+
+ cur_trans->commit_done = 1;
+ if (waitqueue_active(&cur_trans->commit_wait))
+ wake_up(&cur_trans->commit_wait);
+
+ btrfs_destroy_pending_snapshots(cur_trans);
+
+ btrfs_destroy_marked_extents(root, &cur_trans->dirty_pages,
+ EXTENT_DIRTY);
+
+ /*
+ memset(cur_trans, 0, sizeof(*cur_trans));
+ kmem_cache_free(btrfs_transaction_cachep, cur_trans);
+ */
+}
+
+int btrfs_cleanup_transaction(struct btrfs_root *root)
{
struct btrfs_transaction *t;
LIST_HEAD(list);
- WARN_ON(1);
-
mutex_lock(&root->fs_info->transaction_kthread_mutex);
spin_lock(&root->fs_info->trans_lock);
@@ -3557,6 +3663,17 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
return 0;
}
+static int btree_writepage_io_failed_hook(struct bio *bio, struct page *page,
+ u64 start, u64 end,
+ struct extent_state *state)
+{
+ struct super_block *sb = page->mapping->host->i_sb;
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+ btrfs_error(fs_info, -EIO,
+ "Error occured while writing out btree at %llu", start);
+ return -EIO;
+}
+
static struct extent_io_ops btree_extent_io_ops = {
.write_cache_pages_lock_hook = btree_lock_page_hook,
.readpage_end_io_hook = btree_readpage_end_io_hook,
@@ -3564,4 +3681,5 @@ static struct extent_io_ops btree_extent_io_ops = {
.submit_bio_hook = btree_submit_bio_hook,
/* note we're sharing with inode.c for the merge bio hook */
.merge_bio_hook = btrfs_merge_bio_hook,
+ .writepage_io_failed_hook = btree_writepage_io_failed_hook,
};
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index c99d0a8f13f..a7ace1a2dd1 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -44,11 +44,11 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
int mirror_num, struct extent_buffer **eb);
struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
u64 bytenr, u32 blocksize);
-int clean_tree_block(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct extent_buffer *buf);
-struct btrfs_root *open_ctree(struct super_block *sb,
- struct btrfs_fs_devices *fs_devices,
- char *options);
+void clean_tree_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct extent_buffer *buf);
+int open_ctree(struct super_block *sb,
+ struct btrfs_fs_devices *fs_devices,
+ char *options);
int close_ctree(struct btrfs_root *root);
int write_ctree_super(struct btrfs_trans_handle *trans,
struct btrfs_root *root, int max_mirrors);
@@ -64,7 +64,7 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info);
void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr);
void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr);
-int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root);
+void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root);
void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid);
int btrfs_set_buffer_uptodate(struct extent_buffer *buf);
@@ -85,6 +85,10 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
+int btrfs_cleanup_transaction(struct btrfs_root *root);
+void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans,
+ struct btrfs_root *root);
+void btrfs_abort_devices(struct btrfs_root *root);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
void btrfs_init_lockdep(void);
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 1b8dc33778f..e887ee62b6d 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -67,7 +67,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
u64 root_objectid, u32 generation,
int check_generation)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(sb)->fs_info;
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
struct btrfs_root *root;
struct inode *inode;
struct btrfs_key key;
@@ -193,7 +193,7 @@ static struct dentry *btrfs_get_parent(struct dentry *child)
if (ret < 0)
goto fail;
- BUG_ON(ret == 0);
+ BUG_ON(ret == 0); /* Key with offset of -1 found */
if (path->slots[0] == 0) {
ret = -ENOENT;
goto fail;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index f5fbe576d2b..a84420491c1 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -34,23 +34,24 @@
#include "locking.h"
#include "free-space-cache.h"
-/* control flags for do_chunk_alloc's force field
+/*
+ * control flags for do_chunk_alloc's force field
* CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
* if we really need one.
*
- * CHUNK_ALLOC_FORCE means it must try to allocate one
- *
* CHUNK_ALLOC_LIMITED means to only try and allocate one
* if we have very few chunks already allocated. This is
* used as part of the clustering code to help make sure
* we have a good pool of storage to cluster in, without
* filling the FS with empty chunks
*
+ * CHUNK_ALLOC_FORCE means it must try to allocate one
+ *
*/
enum {
CHUNK_ALLOC_NO_FORCE = 0,
- CHUNK_ALLOC_FORCE = 1,
- CHUNK_ALLOC_LIMITED = 2,
+ CHUNK_ALLOC_LIMITED = 1,
+ CHUNK_ALLOC_FORCE = 2,
};
/*
@@ -244,7 +245,7 @@ static int exclude_super_stripes(struct btrfs_root *root,
cache->bytes_super += stripe_len;
ret = add_excluded_extent(root, cache->key.objectid,
stripe_len);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
}
for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
@@ -252,13 +253,13 @@ static int exclude_super_stripes(struct btrfs_root *root,
ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
cache->key.objectid, bytenr,
0, &logical, &nr, &stripe_len);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
while (nr--) {
cache->bytes_super += stripe_len;
ret = add_excluded_extent(root, logical[nr],
stripe_len);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
}
kfree(logical);
@@ -320,7 +321,7 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
total_added += size;
ret = btrfs_add_free_space(block_group, start,
size);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM or logic error */
start = extent_end + 1;
} else {
break;
@@ -331,7 +332,7 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
size = end - start;
total_added += size;
ret = btrfs_add_free_space(block_group, start, size);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM or logic error */
}
return total_added;
@@ -473,7 +474,8 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
int ret = 0;
caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
- BUG_ON(!caching_ctl);
+ if (!caching_ctl)
+ return -ENOMEM;
INIT_LIST_HEAD(&caching_ctl->list);
mutex_init(&caching_ctl->mutex);
@@ -618,8 +620,7 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
struct list_head *head = &info->space_info;
struct btrfs_space_info *found;
- flags &= BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_SYSTEM |
- BTRFS_BLOCK_GROUP_METADATA;
+ flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
rcu_read_lock();
list_for_each_entry_rcu(found, head, list) {
@@ -982,7 +983,7 @@ static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
ret = btrfs_next_leaf(root, path);
if (ret < 0)
return ret;
- BUG_ON(ret > 0);
+ BUG_ON(ret > 0); /* Corruption */
leaf = path->nodes[0];
}
btrfs_item_key_to_cpu(leaf, &found_key,
@@ -1008,9 +1009,9 @@ static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
new_size + extra_size, 1);
if (ret < 0)
return ret;
- BUG_ON(ret);
+ BUG_ON(ret); /* Corruption */
- ret = btrfs_extend_item(trans, root, path, new_size);
+ btrfs_extend_item(trans, root, path, new_size);
leaf = path->nodes[0];
item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
@@ -1478,7 +1479,11 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
err = ret;
goto out;
}
- BUG_ON(ret);
+ if (ret && !insert) {
+ err = -ENOENT;
+ goto out;
+ }
+ BUG_ON(ret); /* Corruption */
leaf = path->nodes[0];
item_size = btrfs_item_size_nr(leaf, path->slots[0]);
@@ -1592,13 +1597,13 @@ out:
* helper to add new inline back ref
*/
static noinline_for_stack
-int setup_inline_extent_backref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct btrfs_extent_inline_ref *iref,
- u64 parent, u64 root_objectid,
- u64 owner, u64 offset, int refs_to_add,
- struct btrfs_delayed_extent_op *extent_op)
+void setup_inline_extent_backref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_extent_inline_ref *iref,
+ u64 parent, u64 root_objectid,
+ u64 owner, u64 offset, int refs_to_add,
+ struct btrfs_delayed_extent_op *extent_op)
{
struct extent_buffer *leaf;
struct btrfs_extent_item *ei;
@@ -1608,7 +1613,6 @@ int setup_inline_extent_backref(struct btrfs_trans_handle *trans,
u64 refs;
int size;
int type;
- int ret;
leaf = path->nodes[0];
ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
@@ -1617,7 +1621,7 @@ int setup_inline_extent_backref(struct btrfs_trans_handle *trans,
type = extent_ref_type(parent, owner);
size = btrfs_extent_inline_ref_size(type);
- ret = btrfs_extend_item(trans, root, path, size);
+ btrfs_extend_item(trans, root, path, size);
ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
refs = btrfs_extent_refs(leaf, ei);
@@ -1652,7 +1656,6 @@ int setup_inline_extent_backref(struct btrfs_trans_handle *trans,
btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
}
btrfs_mark_buffer_dirty(leaf);
- return 0;
}
static int lookup_extent_backref(struct btrfs_trans_handle *trans,
@@ -1687,12 +1690,12 @@ static int lookup_extent_backref(struct btrfs_trans_handle *trans,
* helper to update/remove inline back ref
*/
static noinline_for_stack
-int update_inline_extent_backref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct btrfs_extent_inline_ref *iref,
- int refs_to_mod,
- struct btrfs_delayed_extent_op *extent_op)
+void update_inline_extent_backref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_extent_inline_ref *iref,
+ int refs_to_mod,
+ struct btrfs_delayed_extent_op *extent_op)
{
struct extent_buffer *leaf;
struct btrfs_extent_item *ei;
@@ -1703,7 +1706,6 @@ int update_inline_extent_backref(struct btrfs_trans_handle *trans,
u32 item_size;
int size;
int type;
- int ret;
u64 refs;
leaf = path->nodes[0];
@@ -1745,10 +1747,9 @@ int update_inline_extent_backref(struct btrfs_trans_handle *trans,
memmove_extent_buffer(leaf, ptr, ptr + size,
end - ptr - size);
item_size -= size;
- ret = btrfs_truncate_item(trans, root, path, item_size, 1);
+ btrfs_truncate_item(trans, root, path, item_size, 1);
}
btrfs_mark_buffer_dirty(leaf);
- return 0;
}
static noinline_for_stack
@@ -1768,13 +1769,13 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
root_objectid, owner, offset, 1);
if (ret == 0) {
BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
- ret = update_inline_extent_backref(trans, root, path, iref,
- refs_to_add, extent_op);
+ update_inline_extent_backref(trans, root, path, iref,
+ refs_to_add, extent_op);
} else if (ret == -ENOENT) {
- ret = setup_inline_extent_backref(trans, root, path, iref,
- parent, root_objectid,
- owner, offset, refs_to_add,
- extent_op);
+ setup_inline_extent_backref(trans, root, path, iref, parent,
+ root_objectid, owner, offset,
+ refs_to_add, extent_op);
+ ret = 0;
}
return ret;
}
@@ -1804,12 +1805,12 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
struct btrfs_extent_inline_ref *iref,
int refs_to_drop, int is_data)
{
- int ret;
+ int ret = 0;
BUG_ON(!is_data && refs_to_drop != 1);
if (iref) {
- ret = update_inline_extent_backref(trans, root, path, iref,
- -refs_to_drop, NULL);
+ update_inline_extent_backref(trans, root, path, iref,
+ -refs_to_drop, NULL);
} else if (is_data) {
ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
} else {
@@ -1835,6 +1836,7 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
/* Tell the block device(s) that the sectors can be discarded */
ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD,
bytenr, &num_bytes, &bbio, 0);
+ /* Error condition is -ENOMEM */
if (!ret) {
struct btrfs_bio_stripe *stripe = bbio->stripes;
int i;
@@ -1850,7 +1852,7 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
if (!ret)
discarded_bytes += stripe->length;
else if (ret != -EOPNOTSUPP)
- break;
+ break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */
/*
* Just in case we get back EOPNOTSUPP for some reason,
@@ -1869,23 +1871,28 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
return ret;
}
+/* Can return -ENOMEM */
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent,
- u64 root_objectid, u64 owner, u64 offset)
+ u64 root_objectid, u64 owner, u64 offset, int for_cow)
{
int ret;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+
BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
root_objectid == BTRFS_TREE_LOG_OBJECTID);
if (owner < BTRFS_FIRST_FREE_OBJECTID) {
- ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
+ ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
+ num_bytes,
parent, root_objectid, (int)owner,
- BTRFS_ADD_DELAYED_REF, NULL);
+ BTRFS_ADD_DELAYED_REF, NULL, for_cow);
} else {
- ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
+ ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
+ num_bytes,
parent, root_objectid, owner, offset,
- BTRFS_ADD_DELAYED_REF, NULL);
+ BTRFS_ADD_DELAYED_REF, NULL, for_cow);
}
return ret;
}
@@ -1940,7 +1947,8 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
ret = insert_extent_backref(trans, root->fs_info->extent_root,
path, bytenr, parent, root_objectid,
owner, offset, refs_to_add);
- BUG_ON(ret);
+ if (ret)
+ btrfs_abort_transaction(trans, root, ret);
out:
btrfs_free_path(path);
return err;
@@ -2027,6 +2035,9 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
int ret;
int err = 0;
+ if (trans->aborted)
+ return 0;
+
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -2124,7 +2135,11 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_extent_op *extent_op,
int insert_reserved)
{
- int ret;
+ int ret = 0;
+
+ if (trans->aborted)
+ return 0;
+
if (btrfs_delayed_ref_is_head(node)) {
struct btrfs_delayed_ref_head *head;
/*
@@ -2142,11 +2157,10 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
ret = btrfs_del_csums(trans, root,
node->bytenr,
node->num_bytes);
- BUG_ON(ret);
}
}
mutex_unlock(&head->mutex);
- return 0;
+ return ret;
}
if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
@@ -2193,6 +2207,10 @@ again:
return NULL;
}
+/*
+ * Returns 0 on success or if called with an already aborted transaction.
+ * Returns -ENOMEM or -EIO on failure and will abort the transaction.
+ */
static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct list_head *cluster)
@@ -2233,6 +2251,28 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
}
/*
+ * locked_ref is the head node, so we have to go one
+ * node back for any delayed ref updates
+ */
+ ref = select_delayed_ref(locked_ref);
+
+ if (ref && ref->seq &&
+ btrfs_check_delayed_seq(delayed_refs, ref->seq)) {
+ /*
+ * there are still refs with lower seq numbers in the
+ * process of being added. Don't run this ref yet.
+ */
+ list_del_init(&locked_ref->cluster);
+ mutex_unlock(&locked_ref->mutex);
+ locked_ref = NULL;
+ delayed_refs->num_heads_ready++;
+ spin_unlock(&delayed_refs->lock);
+ cond_resched();
+ spin_lock(&delayed_refs->lock);
+ continue;
+ }
+
+ /*
* record the must insert reserved flag before we
* drop the spin lock.
*/
@@ -2242,11 +2282,6 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
extent_op = locked_ref->extent_op;
locked_ref->extent_op = NULL;
- /*
- * locked_ref is the head node, so we have to go one
- * node back for any delayed ref updates
- */
- ref = select_delayed_ref(locked_ref);
if (!ref) {
/* All delayed refs have been processed, Go ahead
* and send the head node to run_one_delayed_ref,
@@ -2264,12 +2299,14 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
ret = run_delayed_extent_op(trans, root,
ref, extent_op);
- BUG_ON(ret);
kfree(extent_op);
- cond_resched();
- spin_lock(&delayed_refs->lock);
- continue;
+ if (ret) {
+ printk(KERN_DEBUG "btrfs: run_delayed_extent_op returned %d\n", ret);
+ return ret;
+ }
+
+ goto next;
}
list_del_init(&locked_ref->cluster);
@@ -2279,29 +2316,63 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
ref->in_tree = 0;
rb_erase(&ref->rb_node, &delayed_refs->root);
delayed_refs->num_entries--;
-
+ /*
+ * we modified num_entries, but as we're currently running
+ * delayed refs, skip
+ * wake_up(&delayed_refs->seq_wait);
+ * here.
+ */
spin_unlock(&delayed_refs->lock);
ret = run_one_delayed_ref(trans, root, ref, extent_op,
must_insert_reserved);
- BUG_ON(ret);
btrfs_put_delayed_ref(ref);
kfree(extent_op);
count++;
+ if (ret) {
+ printk(KERN_DEBUG "btrfs: run_one_delayed_ref returned %d\n", ret);
+ return ret;
+ }
+
+next:
+ do_chunk_alloc(trans, root->fs_info->extent_root,
+ 2 * 1024 * 1024,
+ btrfs_get_alloc_profile(root, 0),
+ CHUNK_ALLOC_NO_FORCE);
cond_resched();
spin_lock(&delayed_refs->lock);
}
return count;
}
+
+static void wait_for_more_refs(struct btrfs_delayed_ref_root *delayed_refs,
+ unsigned long num_refs)
+{
+ struct list_head *first_seq = delayed_refs->seq_head.next;
+
+ spin_unlock(&delayed_refs->lock);
+ pr_debug("waiting for more refs (num %ld, first %p)\n",
+ num_refs, first_seq);
+ wait_event(delayed_refs->seq_wait,
+ num_refs != delayed_refs->num_entries ||
+ delayed_refs->seq_head.next != first_seq);
+ pr_debug("done waiting for more refs (num %ld, first %p)\n",
+ delayed_refs->num_entries, delayed_refs->seq_head.next);
+ spin_lock(&delayed_refs->lock);
+}
+
/*
* this starts processing the delayed reference count updates and
* extent insertions we have queued up so far. count can be
* 0, which means to process everything in the tree at the start
* of the run (but not newly added entries), or it can be some target
* number you'd like to process.
+ *
+ * Returns 0 on success or if called with an aborted transaction
+ * Returns <0 on error and aborts the transaction
*/
int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_root *root, unsigned long count)
@@ -2311,15 +2382,27 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_node *ref;
struct list_head cluster;
int ret;
+ u64 delayed_start;
int run_all = count == (unsigned long)-1;
int run_most = 0;
+ unsigned long num_refs = 0;
+ int consider_waiting;
+
+ /* We'll clean this up in btrfs_cleanup_transaction */
+ if (trans->aborted)
+ return 0;
if (root == root->fs_info->extent_root)
root = root->fs_info->tree_root;
+ do_chunk_alloc(trans, root->fs_info->extent_root,
+ 2 * 1024 * 1024, btrfs_get_alloc_profile(root, 0),
+ CHUNK_ALLOC_NO_FORCE);
+
delayed_refs = &trans->transaction->delayed_refs;
INIT_LIST_HEAD(&cluster);
again:
+ consider_waiting = 0;
spin_lock(&delayed_refs->lock);
if (count == 0) {
count = delayed_refs->num_entries * 2;
@@ -2336,18 +2419,51 @@ again:
* of refs to process starting at the first one we are able to
* lock
*/
+ delayed_start = delayed_refs->run_delayed_start;
ret = btrfs_find_ref_cluster(trans, &cluster,
delayed_refs->run_delayed_start);
if (ret)
break;
+ if (delayed_start >= delayed_refs->run_delayed_start) {
+ if (consider_waiting == 0) {
+ /*
+ * btrfs_find_ref_cluster looped. let's do one
+ * more cycle. if we don't run any delayed ref
+ * during that cycle (because we can't because
+ * all of them are blocked) and if the number of
+ * refs doesn't change, we avoid busy waiting.
+ */
+ consider_waiting = 1;
+ num_refs = delayed_refs->num_entries;
+ } else {
+ wait_for_more_refs(delayed_refs, num_refs);
+ /*
+ * after waiting, things have changed. we
+ * dropped the lock and someone else might have
+ * run some refs, built new clusters and so on.
+ * therefore, we restart staleness detection.
+ */
+ consider_waiting = 0;
+ }
+ }
+
ret = run_clustered_refs(trans, root, &cluster);
- BUG_ON(ret < 0);
+ if (ret < 0) {
+ spin_unlock(&delayed_refs->lock);
+ btrfs_abort_transaction(trans, root, ret);
+ return ret;
+ }
count -= min_t(unsigned long, ret, count);
if (count == 0)
break;
+
+ if (ret || delayed_refs->run_delayed_start == 0) {
+ /* refs were run, let's reset staleness detection */
+ consider_waiting = 0;
+ }
}
if (run_all) {
@@ -2405,7 +2521,8 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
extent_op->update_key = 0;
extent_op->is_data = is_data ? 1 : 0;
- ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op);
+ ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr,
+ num_bytes, extent_op);
if (ret)
kfree(extent_op);
return ret;
@@ -2501,7 +2618,7 @@ static noinline int check_committed_ref(struct btrfs_trans_handle *trans,
ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
if (ret < 0)
goto out;
- BUG_ON(ret == 0);
+ BUG_ON(ret == 0); /* Corruption */
ret = -ENOENT;
if (path->slots[0] == 0)
@@ -2590,7 +2707,7 @@ out:
static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf,
- int full_backref, int inc)
+ int full_backref, int inc, int for_cow)
{
u64 bytenr;
u64 num_bytes;
@@ -2603,7 +2720,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
int level;
int ret = 0;
int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
- u64, u64, u64, u64, u64, u64);
+ u64, u64, u64, u64, u64, u64, int);
ref_root = btrfs_header_owner(buf);
nritems = btrfs_header_nritems(buf);
@@ -2640,34 +2757,34 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
key.offset -= btrfs_file_extent_offset(buf, fi);
ret = process_func(trans, root, bytenr, num_bytes,
parent, ref_root, key.objectid,
- key.offset);
+ key.offset, for_cow);
if (ret)
goto fail;
} else {
bytenr = btrfs_node_blockptr(buf, i);
num_bytes = btrfs_level_size(root, level - 1);
ret = process_func(trans, root, bytenr, num_bytes,
- parent, ref_root, level - 1, 0);
+ parent, ref_root, level - 1, 0,
+ for_cow);
if (ret)
goto fail;
}
}
return 0;
fail:
- BUG();
return ret;
}
int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct extent_buffer *buf, int full_backref)
+ struct extent_buffer *buf, int full_backref, int for_cow)
{
- return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
+ return __btrfs_mod_ref(trans, root, buf, full_backref, 1, for_cow);
}
int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct extent_buffer *buf, int full_backref)
+ struct extent_buffer *buf, int full_backref, int for_cow)
{
- return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
+ return __btrfs_mod_ref(trans, root, buf, full_backref, 0, for_cow);
}
static int write_one_cache_group(struct btrfs_trans_handle *trans,
@@ -2683,7 +2800,7 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans,
ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
if (ret < 0)
goto fail;
- BUG_ON(ret);
+ BUG_ON(ret); /* Corruption */
leaf = path->nodes[0];
bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
@@ -2691,8 +2808,10 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
fail:
- if (ret)
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
return ret;
+ }
return 0;
}
@@ -2865,7 +2984,8 @@ again:
if (last == 0) {
err = btrfs_run_delayed_refs(trans, root,
(unsigned long)-1);
- BUG_ON(err);
+ if (err) /* File system offline */
+ goto out;
}
cache = btrfs_lookup_first_block_group(root->fs_info, last);
@@ -2892,7 +3012,9 @@ again:
last = cache->key.objectid + cache->key.offset;
err = write_one_cache_group(trans, root, path, cache);
- BUG_ON(err);
+ if (err) /* File system offline */
+ goto out;
+
btrfs_put_block_group(cache);
}
@@ -2905,7 +3027,8 @@ again:
if (last == 0) {
err = btrfs_run_delayed_refs(trans, root,
(unsigned long)-1);
- BUG_ON(err);
+ if (err) /* File system offline */
+ goto out;
}
cache = btrfs_lookup_first_block_group(root->fs_info, last);
@@ -2930,20 +3053,21 @@ again:
continue;
}
- btrfs_write_out_cache(root, trans, cache, path);
+ err = btrfs_write_out_cache(root, trans, cache, path);
/*
* If we didn't have an error then the cache state is still
* NEED_WRITE, so we can set it to WRITTEN.
*/
- if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
+ if (!err && cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
cache->disk_cache_state = BTRFS_DC_WRITTEN;
last = cache->key.objectid + cache->key.offset;
btrfs_put_block_group(cache);
}
+out:
btrfs_free_path(path);
- return 0;
+ return err;
}
int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
@@ -2993,9 +3117,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
INIT_LIST_HEAD(&found->block_groups[i]);
init_rwsem(&found->groups_sem);
spin_lock_init(&found->lock);
- found->flags = flags & (BTRFS_BLOCK_GROUP_DATA |
- BTRFS_BLOCK_GROUP_SYSTEM |
- BTRFS_BLOCK_GROUP_METADATA);
+ found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
found->total_bytes = total_bytes;
found->disk_total = total_bytes * factor;
found->bytes_used = bytes_used;
@@ -3016,20 +3138,53 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
{
- u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 |
- BTRFS_BLOCK_GROUP_RAID1 |
- BTRFS_BLOCK_GROUP_RAID10 |
- BTRFS_BLOCK_GROUP_DUP);
- if (extra_flags) {
- if (flags & BTRFS_BLOCK_GROUP_DATA)
- fs_info->avail_data_alloc_bits |= extra_flags;
- if (flags & BTRFS_BLOCK_GROUP_METADATA)
- fs_info->avail_metadata_alloc_bits |= extra_flags;
- if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
- fs_info->avail_system_alloc_bits |= extra_flags;
+ u64 extra_flags = chunk_to_extended(flags) &
+ BTRFS_EXTENDED_PROFILE_MASK;
+
+ if (flags & BTRFS_BLOCK_GROUP_DATA)
+ fs_info->avail_data_alloc_bits |= extra_flags;
+ if (flags & BTRFS_BLOCK_GROUP_METADATA)
+ fs_info->avail_metadata_alloc_bits |= extra_flags;
+ if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+ fs_info->avail_system_alloc_bits |= extra_flags;
+}
+
+/*
+ * returns target flags in extended format or 0 if restripe for this
+ * chunk_type is not in progress
+ */
+static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
+{
+ struct btrfs_balance_control *bctl = fs_info->balance_ctl;
+ u64 target = 0;
+
+ BUG_ON(!mutex_is_locked(&fs_info->volume_mutex) &&
+ !spin_is_locked(&fs_info->balance_lock));
+
+ if (!bctl)
+ return 0;
+
+ if (flags & BTRFS_BLOCK_GROUP_DATA &&
+ bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+ target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
+ } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
+ bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+ target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
+ } else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
+ bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+ target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
}
+
+ return target;
}
+/*
+ * @flags: available profiles in extended format (see ctree.h)
+ *
+ * Returns reduced profile in chunk format. If profile changing is in
+ * progress (either running or paused) picks the target profile (if it's
+ * already available), otherwise falls back to plain reducing.
+ */
u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
{
/*
@@ -3039,6 +3194,22 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
*/
u64 num_devices = root->fs_info->fs_devices->rw_devices +
root->fs_info->fs_devices->missing_devices;
+ u64 target;
+
+ /*
+ * see if restripe for this chunk_type is in progress, if so
+ * try to reduce to the target profile
+ */
+ spin_lock(&root->fs_info->balance_lock);
+ target = get_restripe_target(root->fs_info, flags);
+ if (target) {
+ /* pick target profile only if it's already available */
+ if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
+ spin_unlock(&root->fs_info->balance_lock);
+ return extended_to_chunk(target);
+ }
+ }
+ spin_unlock(&root->fs_info->balance_lock);
if (num_devices == 1)
flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
@@ -3059,22 +3230,22 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
((flags & BTRFS_BLOCK_GROUP_RAID1) |
(flags & BTRFS_BLOCK_GROUP_RAID10) |
- (flags & BTRFS_BLOCK_GROUP_DUP)))
+ (flags & BTRFS_BLOCK_GROUP_DUP))) {
flags &= ~BTRFS_BLOCK_GROUP_RAID0;
- return flags;
+ }
+
+ return extended_to_chunk(flags);
}
static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
{
if (flags & BTRFS_BLOCK_GROUP_DATA)
- flags |= root->fs_info->avail_data_alloc_bits &
- root->fs_info->data_alloc_profile;
+ flags |= root->fs_info->avail_data_alloc_bits;
else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
- flags |= root->fs_info->avail_system_alloc_bits &
- root->fs_info->system_alloc_profile;
+ flags |= root->fs_info->avail_system_alloc_bits;
else if (flags & BTRFS_BLOCK_GROUP_METADATA)
- flags |= root->fs_info->avail_metadata_alloc_bits &
- root->fs_info->metadata_alloc_profile;
+ flags |= root->fs_info->avail_metadata_alloc_bits;
+
return btrfs_reduce_alloc_profile(root, flags);
}
@@ -3191,6 +3362,8 @@ commit_trans:
return -ENOSPC;
}
data_sinfo->bytes_may_use += bytes;
+ trace_btrfs_space_reservation(root->fs_info, "space_info",
+ data_sinfo->flags, bytes, 1);
spin_unlock(&data_sinfo->lock);
return 0;
@@ -3210,6 +3383,8 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
data_sinfo = BTRFS_I(inode)->space_info;
spin_lock(&data_sinfo->lock);
data_sinfo->bytes_may_use -= bytes;
+ trace_btrfs_space_reservation(root->fs_info, "space_info",
+ data_sinfo->flags, bytes, 0);
spin_unlock(&data_sinfo->lock);
}
@@ -3257,29 +3432,61 @@ static int should_alloc_chunk(struct btrfs_root *root,
if (num_bytes - num_allocated < thresh)
return 1;
}
+ thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
- /*
- * we have two similar checks here, one based on percentage
- * and once based on a hard number of 256MB. The idea
- * is that if we have a good amount of free
- * room, don't allocate a chunk. A good mount is
- * less than 80% utilized of the chunks we have allocated,
- * or more than 256MB free
- */
- if (num_allocated + alloc_bytes + 256 * 1024 * 1024 < num_bytes)
- return 0;
+ /* 256MB or 2% of the FS */
+ thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 2));
+ /* system chunks need a much small threshold */
+ if (sinfo->flags & BTRFS_BLOCK_GROUP_SYSTEM)
+ thresh = 32 * 1024 * 1024;
- if (num_allocated + alloc_bytes < div_factor(num_bytes, 8))
+ if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 8))
return 0;
+ return 1;
+}
- thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
+static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type)
+{
+ u64 num_dev;
- /* 256MB or 5% of the FS */
- thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5));
+ if (type & BTRFS_BLOCK_GROUP_RAID10 ||
+ type & BTRFS_BLOCK_GROUP_RAID0)
+ num_dev = root->fs_info->fs_devices->rw_devices;
+ else if (type & BTRFS_BLOCK_GROUP_RAID1)
+ num_dev = 2;
+ else
+ num_dev = 1; /* DUP or single */
- if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 3))
- return 0;
- return 1;
+ /* metadata for updaing devices and chunk tree */
+ return btrfs_calc_trans_metadata_size(root, num_dev + 1);
+}
+
+static void check_system_chunk(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 type)
+{
+ struct btrfs_space_info *info;
+ u64 left;
+ u64 thresh;
+
+ info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
+ spin_lock(&info->lock);
+ left = info->total_bytes - info->bytes_used - info->bytes_pinned -
+ info->bytes_reserved - info->bytes_readonly;
+ spin_unlock(&info->lock);
+
+ thresh = get_system_chunk_thresh(root, type);
+ if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) {
+ printk(KERN_INFO "left=%llu, need=%llu, flags=%llu\n",
+ left, thresh, type);
+ dump_space_info(info, 0, 0);
+ }
+
+ if (left < thresh) {
+ u64 flags;
+
+ flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0);
+ btrfs_alloc_chunk(trans, root, flags);
+ }
}
static int do_chunk_alloc(struct btrfs_trans_handle *trans,
@@ -3291,19 +3498,17 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
int wait_for_alloc = 0;
int ret = 0;
- flags = btrfs_reduce_alloc_profile(extent_root, flags);
-
space_info = __find_space_info(extent_root->fs_info, flags);
if (!space_info) {
ret = update_space_info(extent_root->fs_info, flags,
0, 0, &space_info);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
}
- BUG_ON(!space_info);
+ BUG_ON(!space_info); /* Logic error */
again:
spin_lock(&space_info->lock);
- if (space_info->force_alloc)
+ if (force < space_info->force_alloc)
force = space_info->force_alloc;
if (space_info->full) {
spin_unlock(&space_info->lock);
@@ -3354,6 +3559,12 @@ again:
force_metadata_allocation(fs_info);
}
+ /*
+ * Check if we have enough space in SYSTEM chunk because we may need
+ * to update devices.
+ */
+ check_system_chunk(trans, extent_root, flags);
+
ret = btrfs_alloc_chunk(trans, extent_root, flags);
if (ret < 0 && ret != -ENOSPC)
goto out;
@@ -3499,12 +3710,15 @@ static int may_commit_transaction(struct btrfs_root *root,
if (space_info != delayed_rsv->space_info)
return -ENOSPC;
+ spin_lock(&space_info->lock);
spin_lock(&delayed_rsv->lock);
- if (delayed_rsv->size < bytes) {
+ if (space_info->bytes_pinned + delayed_rsv->size < bytes) {
spin_unlock(&delayed_rsv->lock);
+ spin_unlock(&space_info->lock);
return -ENOSPC;
}
spin_unlock(&delayed_rsv->lock);
+ spin_unlock(&space_info->lock);
commit:
trans = btrfs_join_transaction(root);
@@ -3561,8 +3775,10 @@ again:
ret = wait_event_interruptible(space_info->wait,
!space_info->flush);
/* Must have been interrupted, return */
- if (ret)
+ if (ret) {
+ printk(KERN_DEBUG "btrfs: %s returning -EINTR\n", __func__);
return -EINTR;
+ }
spin_lock(&space_info->lock);
}
@@ -3582,6 +3798,8 @@ again:
if (used <= space_info->total_bytes) {
if (used + orig_bytes <= space_info->total_bytes) {
space_info->bytes_may_use += orig_bytes;
+ trace_btrfs_space_reservation(root->fs_info,
+ "space_info", space_info->flags, orig_bytes, 1);
ret = 0;
} else {
/*
@@ -3649,6 +3867,8 @@ again:
if (used + num_bytes < space_info->total_bytes + avail) {
space_info->bytes_may_use += orig_bytes;
+ trace_btrfs_space_reservation(root->fs_info,
+ "space_info", space_info->flags, orig_bytes, 1);
ret = 0;
} else {
wait_ordered = true;
@@ -3711,8 +3931,9 @@ out:
return ret;
}
-static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
+static struct btrfs_block_rsv *get_block_rsv(
+ const struct btrfs_trans_handle *trans,
+ const struct btrfs_root *root)
{
struct btrfs_block_rsv *block_rsv = NULL;
@@ -3755,7 +3976,8 @@ static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
spin_unlock(&block_rsv->lock);
}
-static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
+static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *block_rsv,
struct btrfs_block_rsv *dest, u64 num_bytes)
{
struct btrfs_space_info *space_info = block_rsv->space_info;
@@ -3791,6 +4013,8 @@ static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
if (num_bytes) {
spin_lock(&space_info->lock);
space_info->bytes_may_use -= num_bytes;
+ trace_btrfs_space_reservation(fs_info, "space_info",
+ space_info->flags, num_bytes, 0);
space_info->reservation_progress++;
spin_unlock(&space_info->lock);
}
@@ -3947,7 +4171,8 @@ void btrfs_block_rsv_release(struct btrfs_root *root,
if (global_rsv->full || global_rsv == block_rsv ||
block_rsv->space_info != global_rsv->space_info)
global_rsv = NULL;
- block_rsv_release_bytes(block_rsv, global_rsv, num_bytes);
+ block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv,
+ num_bytes);
}
/*
@@ -3980,7 +4205,7 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
num_bytes += div64_u64(data_used + meta_used, 50);
if (num_bytes * 3 > meta_used)
- num_bytes = div64_u64(meta_used, 3);
+ num_bytes = div64_u64(meta_used, 3) * 2;
return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10);
}
@@ -4006,11 +4231,15 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
num_bytes = sinfo->total_bytes - num_bytes;
block_rsv->reserved += num_bytes;
sinfo->bytes_may_use += num_bytes;
+ trace_btrfs_space_reservation(fs_info, "space_info",
+ sinfo->flags, num_bytes, 1);
}
if (block_rsv->reserved >= block_rsv->size) {
num_bytes = block_rsv->reserved - block_rsv->size;
sinfo->bytes_may_use -= num_bytes;
+ trace_btrfs_space_reservation(fs_info, "space_info",
+ sinfo->flags, num_bytes, 0);
sinfo->reservation_progress++;
block_rsv->reserved = block_rsv->size;
block_rsv->full = 1;
@@ -4045,7 +4274,8 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
{
- block_rsv_release_bytes(&fs_info->global_block_rsv, NULL, (u64)-1);
+ block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
+ (u64)-1);
WARN_ON(fs_info->delalloc_block_rsv.size > 0);
WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
WARN_ON(fs_info->trans_block_rsv.size > 0);
@@ -4062,10 +4292,13 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
if (!trans->bytes_reserved)
return;
+ trace_btrfs_space_reservation(root->fs_info, "transaction",
+ trans->transid, trans->bytes_reserved, 0);
btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
trans->bytes_reserved = 0;
}
+/* Can only return 0 or -ENOSPC */
int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
struct inode *inode)
{
@@ -4079,6 +4312,8 @@ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
* when we are truly done with the orphan item.
*/
u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
+ trace_btrfs_space_reservation(root->fs_info, "orphan",
+ btrfs_ino(inode), num_bytes, 1);
return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
}
@@ -4086,6 +4321,8 @@ void btrfs_orphan_release_metadata(struct inode *inode)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
+ trace_btrfs_space_reservation(root->fs_info, "orphan",
+ btrfs_ino(inode), num_bytes, 0);
btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
}
@@ -4213,12 +4450,11 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
/* Need to be holding the i_mutex here if we aren't free space cache */
if (btrfs_is_free_space_inode(root, inode))
flush = 0;
- else
- WARN_ON(!mutex_is_locked(&inode->i_mutex));
if (flush && btrfs_transaction_in_commit(root->fs_info))
schedule_timeout(1);
+ mutex_lock(&BTRFS_I(inode)->delalloc_mutex);
num_bytes = ALIGN(num_bytes, root->sectorsize);
spin_lock(&BTRFS_I(inode)->lock);
@@ -4266,8 +4502,14 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
if (dropped)
to_free += btrfs_calc_trans_metadata_size(root, dropped);
- if (to_free)
+ if (to_free) {
btrfs_block_rsv_release(root, block_rsv, to_free);
+ trace_btrfs_space_reservation(root->fs_info,
+ "delalloc",
+ btrfs_ino(inode),
+ to_free, 0);
+ }
+ mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
return ret;
}
@@ -4278,7 +4520,11 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
}
BTRFS_I(inode)->reserved_extents += nr_extents;
spin_unlock(&BTRFS_I(inode)->lock);
+ mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
+ if (to_reserve)
+ trace_btrfs_space_reservation(root->fs_info,"delalloc",
+ btrfs_ino(inode), to_reserve, 1);
block_rsv_add_bytes(block_rsv, to_reserve, 1);
return 0;
@@ -4308,6 +4554,8 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
if (dropped > 0)
to_free += btrfs_calc_trans_metadata_size(root, dropped);
+ trace_btrfs_space_reservation(root->fs_info, "delalloc",
+ btrfs_ino(inode), to_free, 0);
btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
to_free);
}
@@ -4387,7 +4635,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
while (total) {
cache = btrfs_lookup_block_group(info, bytenr);
if (!cache)
- return -1;
+ return -ENOENT;
if (cache->flags & (BTRFS_BLOCK_GROUP_DUP |
BTRFS_BLOCK_GROUP_RAID1 |
BTRFS_BLOCK_GROUP_RAID10))
@@ -4490,7 +4738,7 @@ int btrfs_pin_extent(struct btrfs_root *root,
struct btrfs_block_group_cache *cache;
cache = btrfs_lookup_block_group(root->fs_info, bytenr);
- BUG_ON(!cache);
+ BUG_ON(!cache); /* Logic error */
pin_down_extent(root, cache, bytenr, num_bytes, reserved);
@@ -4508,7 +4756,7 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
struct btrfs_block_group_cache *cache;
cache = btrfs_lookup_block_group(root->fs_info, bytenr);
- BUG_ON(!cache);
+ BUG_ON(!cache); /* Logic error */
/*
* pull in the free space cache (if any) so that our pin
@@ -4553,6 +4801,7 @@ static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
{
struct btrfs_space_info *space_info = cache->space_info;
int ret = 0;
+
spin_lock(&space_info->lock);
spin_lock(&cache->lock);
if (reserve != RESERVE_FREE) {
@@ -4562,7 +4811,9 @@ static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
cache->reserved += num_bytes;
space_info->bytes_reserved += num_bytes;
if (reserve == RESERVE_ALLOC) {
- BUG_ON(space_info->bytes_may_use < num_bytes);
+ trace_btrfs_space_reservation(cache->fs_info,
+ "space_info", space_info->flags,
+ num_bytes, 0);
space_info->bytes_may_use -= num_bytes;
}
}
@@ -4578,7 +4829,7 @@ static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
return ret;
}
-int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
+void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -4608,7 +4859,6 @@ int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
up_write(&fs_info->extent_commit_sem);
update_global_block_rsv(fs_info);
- return 0;
}
static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
@@ -4623,7 +4873,7 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
if (cache)
btrfs_put_block_group(cache);
cache = btrfs_lookup_block_group(fs_info, start);
- BUG_ON(!cache);
+ BUG_ON(!cache); /* Logic error */
}
len = cache->key.objectid + cache->key.offset - start;
@@ -4660,6 +4910,9 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
u64 end;
int ret;
+ if (trans->aborted)
+ return 0;
+
if (fs_info->pinned_extents == &fs_info->freed_extents[0])
unpin = &fs_info->freed_extents[1];
else
@@ -4745,7 +4998,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
ret = remove_extent_backref(trans, extent_root, path,
NULL, refs_to_drop,
is_data);
- BUG_ON(ret);
+ if (ret)
+ goto abort;
btrfs_release_path(path);
path->leave_spinning = 1;
@@ -4763,10 +5017,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
btrfs_print_leaf(extent_root,
path->nodes[0]);
}
- BUG_ON(ret);
+ if (ret < 0)
+ goto abort;
extent_slot = path->slots[0];
}
- } else {
+ } else if (ret == -ENOENT) {
btrfs_print_leaf(extent_root, path->nodes[0]);
WARN_ON(1);
printk(KERN_ERR "btrfs unable to find ref byte nr %llu "
@@ -4776,6 +5031,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
(unsigned long long)root_objectid,
(unsigned long long)owner_objectid,
(unsigned long long)owner_offset);
+ } else {
+ goto abort;
}
leaf = path->nodes[0];
@@ -4785,7 +5042,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
BUG_ON(found_extent || extent_slot != path->slots[0]);
ret = convert_extent_item_v0(trans, extent_root, path,
owner_objectid, 0);
- BUG_ON(ret < 0);
+ if (ret < 0)
+ goto abort;
btrfs_release_path(path);
path->leave_spinning = 1;
@@ -4802,7 +5060,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
(unsigned long long)bytenr);
btrfs_print_leaf(extent_root, path->nodes[0]);
}
- BUG_ON(ret);
+ if (ret < 0)
+ goto abort;
extent_slot = path->slots[0];
leaf = path->nodes[0];
item_size = btrfs_item_size_nr(leaf, extent_slot);
@@ -4839,7 +5098,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
ret = remove_extent_backref(trans, extent_root, path,
iref, refs_to_drop,
is_data);
- BUG_ON(ret);
+ if (ret)
+ goto abort;
}
} else {
if (found_extent) {
@@ -4856,23 +5116,27 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
num_to_del);
- BUG_ON(ret);
+ if (ret)
+ goto abort;
btrfs_release_path(path);
if (is_data) {
ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
- BUG_ON(ret);
- } else {
- invalidate_mapping_pages(info->btree_inode->i_mapping,
- bytenr >> PAGE_CACHE_SHIFT,
- (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT);
+ if (ret)
+ goto abort;
}
ret = update_block_group(trans, root, bytenr, num_bytes, 0);
- BUG_ON(ret);
+ if (ret)
+ goto abort;
}
+out:
btrfs_free_path(path);
return ret;
+
+abort:
+ btrfs_abort_transaction(trans, extent_root, ret);
+ goto out;
}
/*
@@ -4928,6 +5192,8 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
rb_erase(&head->node.rb_node, &delayed_refs->root);
delayed_refs->num_entries--;
+ if (waitqueue_active(&delayed_refs->seq_wait))
+ wake_up(&delayed_refs->seq_wait);
/*
* we don't take a ref on the node because we're removing it from the
@@ -4955,17 +5221,18 @@ out:
void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf,
- u64 parent, int last_ref)
+ u64 parent, int last_ref, int for_cow)
{
struct btrfs_block_group_cache *cache = NULL;
int ret;
if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
- ret = btrfs_add_delayed_tree_ref(trans, buf->start, buf->len,
- parent, root->root_key.objectid,
- btrfs_header_level(buf),
- BTRFS_DROP_DELAYED_REF, NULL);
- BUG_ON(ret);
+ ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
+ buf->start, buf->len,
+ parent, root->root_key.objectid,
+ btrfs_header_level(buf),
+ BTRFS_DROP_DELAYED_REF, NULL, for_cow);
+ BUG_ON(ret); /* -ENOMEM */
}
if (!last_ref)
@@ -4999,12 +5266,13 @@ out:
btrfs_put_block_group(cache);
}
-int btrfs_free_extent(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- u64 bytenr, u64 num_bytes, u64 parent,
- u64 root_objectid, u64 owner, u64 offset)
+/* Can return -ENOMEM */
+int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
+ u64 owner, u64 offset, int for_cow)
{
int ret;
+ struct btrfs_fs_info *fs_info = root->fs_info;
/*
* tree log blocks never actually go into the extent allocation
@@ -5016,15 +5284,16 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
btrfs_pin_extent(root, bytenr, num_bytes, 1);
ret = 0;
} else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
- ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
+ ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
+ num_bytes,
parent, root_objectid, (int)owner,
- BTRFS_DROP_DELAYED_REF, NULL);
- BUG_ON(ret);
+ BTRFS_DROP_DELAYED_REF, NULL, for_cow);
} else {
- ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
- parent, root_objectid, owner,
- offset, BTRFS_DROP_DELAYED_REF, NULL);
- BUG_ON(ret);
+ ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
+ num_bytes,
+ parent, root_objectid, owner,
+ offset, BTRFS_DROP_DELAYED_REF,
+ NULL, for_cow);
}
return ret;
}
@@ -5081,28 +5350,34 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
return 0;
}
-static int get_block_group_index(struct btrfs_block_group_cache *cache)
+static int __get_block_group_index(u64 flags)
{
int index;
- if (cache->flags & BTRFS_BLOCK_GROUP_RAID10)
+
+ if (flags & BTRFS_BLOCK_GROUP_RAID10)
index = 0;
- else if (cache->flags & BTRFS_BLOCK_GROUP_RAID1)
+ else if (flags & BTRFS_BLOCK_GROUP_RAID1)
index = 1;
- else if (cache->flags & BTRFS_BLOCK_GROUP_DUP)
+ else if (flags & BTRFS_BLOCK_GROUP_DUP)
index = 2;
- else if (cache->flags & BTRFS_BLOCK_GROUP_RAID0)
+ else if (flags & BTRFS_BLOCK_GROUP_RAID0)
index = 3;
else
index = 4;
+
return index;
}
+static int get_block_group_index(struct btrfs_block_group_cache *cache)
+{
+ return __get_block_group_index(cache->flags);
+}
+
enum btrfs_loop_type {
- LOOP_FIND_IDEAL = 0,
- LOOP_CACHING_NOWAIT = 1,
- LOOP_CACHING_WAIT = 2,
- LOOP_ALLOC_CHUNK = 3,
- LOOP_NO_EMPTY_SIZE = 4,
+ LOOP_CACHING_NOWAIT = 0,
+ LOOP_CACHING_WAIT = 1,
+ LOOP_ALLOC_CHUNK = 2,
+ LOOP_NO_EMPTY_SIZE = 3,
};
/*
@@ -5116,7 +5391,6 @@ enum btrfs_loop_type {
static noinline int find_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *orig_root,
u64 num_bytes, u64 empty_size,
- u64 search_start, u64 search_end,
u64 hint_byte, struct btrfs_key *ins,
u64 data)
{
@@ -5125,6 +5399,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_free_cluster *last_ptr = NULL;
struct btrfs_block_group_cache *block_group = NULL;
struct btrfs_block_group_cache *used_block_group;
+ u64 search_start = 0;
int empty_cluster = 2 * 1024 * 1024;
int allowed_chunk_alloc = 0;
int done_chunk_alloc = 0;
@@ -5138,14 +5413,14 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
bool failed_alloc = false;
bool use_cluster = true;
bool have_caching_bg = false;
- u64 ideal_cache_percent = 0;
- u64 ideal_cache_offset = 0;
WARN_ON(num_bytes < root->sectorsize);
btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
ins->objectid = 0;
ins->offset = 0;
+ trace_find_free_extent(orig_root, num_bytes, empty_size, data);
+
space_info = __find_space_info(root->fs_info, data);
if (!space_info) {
printk(KERN_ERR "No space info for %llu\n", data);
@@ -5187,7 +5462,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
empty_cluster = 0;
if (search_start == hint_byte) {
-ideal_cache:
block_group = btrfs_lookup_block_group(root->fs_info,
search_start);
used_block_group = block_group;
@@ -5199,8 +5473,7 @@ ideal_cache:
* picked out then we don't care that the block group is cached.
*/
if (block_group && block_group_bits(block_group, data) &&
- (block_group->cached != BTRFS_CACHE_NO ||
- search_start == ideal_cache_offset)) {
+ block_group->cached != BTRFS_CACHE_NO) {
down_read(&space_info->groups_sem);
if (list_empty(&block_group->list) ||
block_group->ro) {
@@ -5254,56 +5527,16 @@ search:
have_block_group:
cached = block_group_cache_done(block_group);
if (unlikely(!cached)) {
- u64 free_percent;
-
found_uncached_bg = true;
ret = cache_block_group(block_group, trans,
- orig_root, 1);
- if (block_group->cached == BTRFS_CACHE_FINISHED)
- goto alloc;
-
- free_percent = btrfs_block_group_used(&block_group->item);
- free_percent *= 100;
- free_percent = div64_u64(free_percent,
- block_group->key.offset);
- free_percent = 100 - free_percent;
- if (free_percent > ideal_cache_percent &&
- likely(!block_group->ro)) {
- ideal_cache_offset = block_group->key.objectid;
- ideal_cache_percent = free_percent;
- }
-
- /*
- * The caching workers are limited to 2 threads, so we
- * can queue as much work as we care to.
- */
- if (loop > LOOP_FIND_IDEAL) {
- ret = cache_block_group(block_group, trans,
- orig_root, 0);
- BUG_ON(ret);
- }
-
- /*
- * If loop is set for cached only, try the next block
- * group.
- */
- if (loop == LOOP_FIND_IDEAL)
- goto loop;
+ orig_root, 0);
+ BUG_ON(ret < 0);
+ ret = 0;
}
-alloc:
if (unlikely(block_group->ro))
goto loop;
- spin_lock(&block_group->free_space_ctl->tree_lock);
- if (cached &&
- block_group->free_space_ctl->free_space <
- num_bytes + empty_cluster + empty_size) {
- spin_unlock(&block_group->free_space_ctl->tree_lock);
- goto loop;
- }
- spin_unlock(&block_group->free_space_ctl->tree_lock);
-
/*
* Ok we want to try and use the cluster allocator, so
* lets look there
@@ -5331,6 +5564,8 @@ alloc:
if (offset) {
/* we have a block, we're done */
spin_unlock(&last_ptr->refill_lock);
+ trace_btrfs_reserve_extent_cluster(root,
+ block_group, search_start, num_bytes);
goto checks;
}
@@ -5349,8 +5584,15 @@ refill_cluster:
* plenty of times and not have found
* anything, so we are likely way too
* fragmented for the clustering stuff to find
- * anything. */
- if (loop >= LOOP_NO_EMPTY_SIZE) {
+ * anything.
+ *
+ * However, if the cluster is taken from the
+ * current block group, release the cluster
+ * first, so that we stand a better chance of
+ * succeeding in the unclustered
+ * allocation. */
+ if (loop >= LOOP_NO_EMPTY_SIZE &&
+ last_ptr->block_group != block_group) {
spin_unlock(&last_ptr->refill_lock);
goto unclustered_alloc;
}
@@ -5361,6 +5603,11 @@ refill_cluster:
*/
btrfs_return_cluster_to_free_space(NULL, last_ptr);
+ if (loop >= LOOP_NO_EMPTY_SIZE) {
+ spin_unlock(&last_ptr->refill_lock);
+ goto unclustered_alloc;
+ }
+
/* allocate a cluster in this block group */
ret = btrfs_find_space_cluster(trans, root,
block_group, last_ptr,
@@ -5377,6 +5624,9 @@ refill_cluster:
if (offset) {
/* we found one, proceed */
spin_unlock(&last_ptr->refill_lock);
+ trace_btrfs_reserve_extent_cluster(root,
+ block_group, search_start,
+ num_bytes);
goto checks;
}
} else if (!cached && loop > LOOP_CACHING_NOWAIT
@@ -5401,6 +5651,15 @@ refill_cluster:
}
unclustered_alloc:
+ spin_lock(&block_group->free_space_ctl->tree_lock);
+ if (cached &&
+ block_group->free_space_ctl->free_space <
+ num_bytes + empty_cluster + empty_size) {
+ spin_unlock(&block_group->free_space_ctl->tree_lock);
+ goto loop;
+ }
+ spin_unlock(&block_group->free_space_ctl->tree_lock);
+
offset = btrfs_find_space_for_alloc(block_group, search_start,
num_bytes, empty_size);
/*
@@ -5425,11 +5684,6 @@ unclustered_alloc:
}
checks:
search_start = stripe_align(root, offset);
- /* move on to the next group */
- if (search_start + num_bytes >= search_end) {
- btrfs_add_free_space(used_block_group, offset, num_bytes);
- goto loop;
- }
/* move on to the next group */
if (search_start + num_bytes >
@@ -5438,9 +5692,6 @@ checks:
goto loop;
}
- ins->objectid = search_start;
- ins->offset = num_bytes;
-
if (offset < search_start)
btrfs_add_free_space(used_block_group, offset,
search_start - offset);
@@ -5457,6 +5708,8 @@ checks:
ins->objectid = search_start;
ins->offset = num_bytes;
+ trace_btrfs_reserve_extent(orig_root, block_group,
+ search_start, num_bytes);
if (offset < search_start)
btrfs_add_free_space(used_block_group, offset,
search_start - offset);
@@ -5481,9 +5734,7 @@ loop:
if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
goto search;
- /* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait for
- * for them to make caching progress. Also
- * determine the best possible bg to cache
+ /*
* LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
* caching kthreads as we move along
* LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
@@ -5493,50 +5744,17 @@ loop:
*/
if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {
index = 0;
- if (loop == LOOP_FIND_IDEAL && found_uncached_bg) {
- found_uncached_bg = false;
- loop++;
- if (!ideal_cache_percent)
- goto search;
-
- /*
- * 1 of the following 2 things have happened so far
- *
- * 1) We found an ideal block group for caching that
- * is mostly full and will cache quickly, so we might
- * as well wait for it.
- *
- * 2) We searched for cached only and we didn't find
- * anything, and we didn't start any caching kthreads
- * either, so chances are we will loop through and
- * start a couple caching kthreads, and then come back
- * around and just wait for them. This will be slower
- * because we will have 2 caching kthreads reading at
- * the same time when we could have just started one
- * and waited for it to get far enough to give us an
- * allocation, so go ahead and go to the wait caching
- * loop.
- */
- loop = LOOP_CACHING_WAIT;
- search_start = ideal_cache_offset;
- ideal_cache_percent = 0;
- goto ideal_cache;
- } else if (loop == LOOP_FIND_IDEAL) {
- /*
- * Didn't find a uncached bg, wait on anything we find
- * next.
- */
- loop = LOOP_CACHING_WAIT;
- goto search;
- }
-
loop++;
-
if (loop == LOOP_ALLOC_CHUNK) {
if (allowed_chunk_alloc) {
ret = do_chunk_alloc(trans, root, num_bytes +
2 * 1024 * 1024, data,
CHUNK_ALLOC_LIMITED);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans,
+ root, ret);
+ goto out;
+ }
allowed_chunk_alloc = 0;
if (ret == 1)
done_chunk_alloc = 1;
@@ -5565,6 +5783,7 @@ loop:
} else if (ins->objectid) {
ret = 0;
}
+out:
return ret;
}
@@ -5618,11 +5837,10 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 num_bytes, u64 min_alloc_size,
u64 empty_size, u64 hint_byte,
- u64 search_end, struct btrfs_key *ins,
- u64 data)
+ struct btrfs_key *ins, u64 data)
{
+ bool final_tried = false;
int ret;
- u64 search_start = 0;
data = btrfs_get_alloc_profile(root, data);
again:
@@ -5630,32 +5848,44 @@ again:
* the only place that sets empty_size is btrfs_realloc_node, which
* is not called recursively on allocations
*/
- if (empty_size || root->ref_cows)
+ if (empty_size || root->ref_cows) {
ret = do_chunk_alloc(trans, root->fs_info->extent_root,
num_bytes + 2 * 1024 * 1024, data,
CHUNK_ALLOC_NO_FORCE);
+ if (ret < 0 && ret != -ENOSPC) {
+ btrfs_abort_transaction(trans, root, ret);
+ return ret;
+ }
+ }
WARN_ON(num_bytes < root->sectorsize);
ret = find_free_extent(trans, root, num_bytes, empty_size,
- search_start, search_end, hint_byte,
- ins, data);
+ hint_byte, ins, data);
- if (ret == -ENOSPC && num_bytes > min_alloc_size) {
- num_bytes = num_bytes >> 1;
- num_bytes = num_bytes & ~(root->sectorsize - 1);
- num_bytes = max(num_bytes, min_alloc_size);
- do_chunk_alloc(trans, root->fs_info->extent_root,
- num_bytes, data, CHUNK_ALLOC_FORCE);
- goto again;
- }
- if (ret == -ENOSPC && btrfs_test_opt(root, ENOSPC_DEBUG)) {
- struct btrfs_space_info *sinfo;
-
- sinfo = __find_space_info(root->fs_info, data);
- printk(KERN_ERR "btrfs allocation failed flags %llu, "
- "wanted %llu\n", (unsigned long long)data,
- (unsigned long long)num_bytes);
- dump_space_info(sinfo, num_bytes, 1);
+ if (ret == -ENOSPC) {
+ if (!final_tried) {
+ num_bytes = num_bytes >> 1;
+ num_bytes = num_bytes & ~(root->sectorsize - 1);
+ num_bytes = max(num_bytes, min_alloc_size);
+ ret = do_chunk_alloc(trans, root->fs_info->extent_root,
+ num_bytes, data, CHUNK_ALLOC_FORCE);
+ if (ret < 0 && ret != -ENOSPC) {
+ btrfs_abort_transaction(trans, root, ret);
+ return ret;
+ }
+ if (num_bytes == min_alloc_size)
+ final_tried = true;
+ goto again;
+ } else if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
+ struct btrfs_space_info *sinfo;
+
+ sinfo = __find_space_info(root->fs_info, data);
+ printk(KERN_ERR "btrfs allocation failed flags %llu, "
+ "wanted %llu\n", (unsigned long long)data,
+ (unsigned long long)num_bytes);
+ if (sinfo)
+ dump_space_info(sinfo, num_bytes, 1);
+ }
}
trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset);
@@ -5733,7 +5963,10 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
path->leave_spinning = 1;
ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
ins, size);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_free_path(path);
+ return ret;
+ }
leaf = path->nodes[0];
extent_item = btrfs_item_ptr(leaf, path->slots[0],
@@ -5763,7 +5996,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
btrfs_free_path(path);
ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
- if (ret) {
+ if (ret) { /* -ENOENT, logic error */
printk(KERN_ERR "btrfs update block group failed for %llu "
"%llu\n", (unsigned long long)ins->objectid,
(unsigned long long)ins->offset);
@@ -5794,7 +6027,10 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
path->leave_spinning = 1;
ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
ins, size);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_free_path(path);
+ return ret;
+ }
leaf = path->nodes[0];
extent_item = btrfs_item_ptr(leaf, path->slots[0],
@@ -5824,7 +6060,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
btrfs_free_path(path);
ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
- if (ret) {
+ if (ret) { /* -ENOENT, logic error */
printk(KERN_ERR "btrfs update block group failed for %llu "
"%llu\n", (unsigned long long)ins->objectid,
(unsigned long long)ins->offset);
@@ -5842,9 +6078,10 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
- ret = btrfs_add_delayed_data_ref(trans, ins->objectid, ins->offset,
- 0, root_objectid, owner, offset,
- BTRFS_ADD_DELAYED_EXTENT, NULL);
+ ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid,
+ ins->offset, 0,
+ root_objectid, owner, offset,
+ BTRFS_ADD_DELAYED_EXTENT, NULL, 0);
return ret;
}
@@ -5871,28 +6108,28 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
if (!caching_ctl) {
BUG_ON(!block_group_cache_done(block_group));
ret = btrfs_remove_free_space(block_group, start, num_bytes);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
} else {
mutex_lock(&caching_ctl->mutex);
if (start >= caching_ctl->progress) {
ret = add_excluded_extent(root, start, num_bytes);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
} else if (start + num_bytes <= caching_ctl->progress) {
ret = btrfs_remove_free_space(block_group,
start, num_bytes);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
} else {
num_bytes = caching_ctl->progress - start;
ret = btrfs_remove_free_space(block_group,
start, num_bytes);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
start = caching_ctl->progress;
num_bytes = ins->objectid + ins->offset -
caching_ctl->progress;
ret = add_excluded_extent(root, start, num_bytes);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
}
mutex_unlock(&caching_ctl->mutex);
@@ -5901,7 +6138,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
ret = btrfs_update_reserved_bytes(block_group, ins->offset,
RESERVE_ALLOC_NO_ACCOUNT);
- BUG_ON(ret);
+ BUG_ON(ret); /* logic error */
btrfs_put_block_group(block_group);
ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
0, owner, offset, ins, 1);
@@ -5922,6 +6159,7 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
btrfs_tree_lock(buf);
clean_tree_block(trans, root, buf);
+ clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
btrfs_set_lock_blocking(buf);
btrfs_set_buffer_uptodate(buf);
@@ -5997,10 +6235,11 @@ use_block_rsv(struct btrfs_trans_handle *trans,
return ERR_PTR(-ENOSPC);
}
-static void unuse_block_rsv(struct btrfs_block_rsv *block_rsv, u32 blocksize)
+static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *block_rsv, u32 blocksize)
{
block_rsv_add_bytes(block_rsv, blocksize, 0);
- block_rsv_release_bytes(block_rsv, NULL, 0);
+ block_rsv_release_bytes(fs_info, block_rsv, NULL, 0);
}
/*
@@ -6014,7 +6253,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u32 blocksize,
u64 parent, u64 root_objectid,
struct btrfs_disk_key *key, int level,
- u64 hint, u64 empty_size)
+ u64 hint, u64 empty_size, int for_cow)
{
struct btrfs_key ins;
struct btrfs_block_rsv *block_rsv;
@@ -6028,15 +6267,15 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
return ERR_CAST(block_rsv);
ret = btrfs_reserve_extent(trans, root, blocksize, blocksize,
- empty_size, hint, (u64)-1, &ins, 0);
+ empty_size, hint, &ins, 0);
if (ret) {
- unuse_block_rsv(block_rsv, blocksize);
+ unuse_block_rsv(root->fs_info, block_rsv, blocksize);
return ERR_PTR(ret);
}
buf = btrfs_init_new_buffer(trans, root, ins.objectid,
blocksize, level);
- BUG_ON(IS_ERR(buf));
+ BUG_ON(IS_ERR(buf)); /* -ENOMEM */
if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
if (parent == 0)
@@ -6048,7 +6287,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
struct btrfs_delayed_extent_op *extent_op;
extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
- BUG_ON(!extent_op);
+ BUG_ON(!extent_op); /* -ENOMEM */
if (key)
memcpy(&extent_op->key, key, sizeof(extent_op->key));
else
@@ -6058,11 +6297,12 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
extent_op->update_flags = 1;
extent_op->is_data = 0;
- ret = btrfs_add_delayed_tree_ref(trans, ins.objectid,
+ ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
+ ins.objectid,
ins.offset, parent, root_objectid,
level, BTRFS_ADD_DELAYED_EXTENT,
- extent_op);
- BUG_ON(ret);
+ extent_op, for_cow);
+ BUG_ON(ret); /* -ENOMEM */
}
return buf;
}
@@ -6078,6 +6318,7 @@ struct walk_control {
int keep_locks;
int reada_slot;
int reada_count;
+ int for_reloc;
};
#define DROP_REFERENCE 1
@@ -6131,7 +6372,9 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
/* We don't lock the tree block, it's OK to be racy here */
ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
&refs, &flags);
- BUG_ON(ret);
+ /* We don't care about errors in readahead. */
+ if (ret < 0)
+ continue;
BUG_ON(refs == 0);
if (wc->stage == DROP_REFERENCE) {
@@ -6198,7 +6441,9 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
eb->start, eb->len,
&wc->refs[level],
&wc->flags[level]);
- BUG_ON(ret);
+ BUG_ON(ret == -ENOMEM);
+ if (ret)
+ return ret;
BUG_ON(wc->refs[level] == 0);
}
@@ -6216,13 +6461,13 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
/* wc->stage == UPDATE_BACKREF */
if (!(wc->flags[level] & flag)) {
BUG_ON(!path->locks[level]);
- ret = btrfs_inc_ref(trans, root, eb, 1);
- BUG_ON(ret);
- ret = btrfs_dec_ref(trans, root, eb, 0);
- BUG_ON(ret);
+ ret = btrfs_inc_ref(trans, root, eb, 1, wc->for_reloc);
+ BUG_ON(ret); /* -ENOMEM */
+ ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc);
+ BUG_ON(ret); /* -ENOMEM */
ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
eb->len, flag, 0);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
wc->flags[level] |= flag;
}
@@ -6294,7 +6539,11 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
&wc->refs[level - 1],
&wc->flags[level - 1]);
- BUG_ON(ret);
+ if (ret < 0) {
+ btrfs_tree_unlock(next);
+ return ret;
+ }
+
BUG_ON(wc->refs[level - 1] == 0);
*lookup_info = 0;
@@ -6362,8 +6611,8 @@ skip:
}
ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
- root->root_key.objectid, level - 1, 0);
- BUG_ON(ret);
+ root->root_key.objectid, level - 1, 0, 0);
+ BUG_ON(ret); /* -ENOMEM */
}
btrfs_tree_unlock(next);
free_extent_buffer(next);
@@ -6421,7 +6670,10 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
eb->start, eb->len,
&wc->refs[level],
&wc->flags[level]);
- BUG_ON(ret);
+ if (ret < 0) {
+ btrfs_tree_unlock_rw(eb, path->locks[level]);
+ return ret;
+ }
BUG_ON(wc->refs[level] == 0);
if (wc->refs[level] == 1) {
btrfs_tree_unlock_rw(eb, path->locks[level]);
@@ -6436,10 +6688,12 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
if (wc->refs[level] == 1) {
if (level == 0) {
if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
- ret = btrfs_dec_ref(trans, root, eb, 1);
+ ret = btrfs_dec_ref(trans, root, eb, 1,
+ wc->for_reloc);
else
- ret = btrfs_dec_ref(trans, root, eb, 0);
- BUG_ON(ret);
+ ret = btrfs_dec_ref(trans, root, eb, 0,
+ wc->for_reloc);
+ BUG_ON(ret); /* -ENOMEM */
}
/* make block locked assertion in clean_tree_block happy */
if (!path->locks[level] &&
@@ -6465,7 +6719,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
btrfs_header_owner(path->nodes[level + 1]));
}
- btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
+ btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1, 0);
out:
wc->refs[level] = 0;
wc->flags[level] = 0;
@@ -6548,8 +6802,9 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
* also make sure backrefs for the shared block and all lower level
* blocks are properly updated.
*/
-void btrfs_drop_snapshot(struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv, int update_ref)
+int btrfs_drop_snapshot(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv, int update_ref,
+ int for_reloc)
{
struct btrfs_path *path;
struct btrfs_trans_handle *trans;
@@ -6575,7 +6830,10 @@ void btrfs_drop_snapshot(struct btrfs_root *root,
}
trans = btrfs_start_transaction(tree_root, 0);
- BUG_ON(IS_ERR(trans));
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ goto out_free;
+ }
if (block_rsv)
trans->block_rsv = block_rsv;
@@ -6600,7 +6858,7 @@ void btrfs_drop_snapshot(struct btrfs_root *root,
path->lowest_level = 0;
if (ret < 0) {
err = ret;
- goto out_free;
+ goto out_end_trans;
}
WARN_ON(ret > 0);
@@ -6620,7 +6878,10 @@ void btrfs_drop_snapshot(struct btrfs_root *root,
path->nodes[level]->len,
&wc->refs[level],
&wc->flags[level]);
- BUG_ON(ret);
+ if (ret < 0) {
+ err = ret;
+ goto out_end_trans;
+ }
BUG_ON(wc->refs[level] == 0);
if (level == root_item->drop_level)
@@ -6637,6 +6898,7 @@ void btrfs_drop_snapshot(struct btrfs_root *root,
wc->stage = DROP_REFERENCE;
wc->update_ref = update_ref;
wc->keep_locks = 0;
+ wc->for_reloc = for_reloc;
wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
while (1) {
@@ -6670,26 +6932,40 @@ void btrfs_drop_snapshot(struct btrfs_root *root,
ret = btrfs_update_root(trans, tree_root,
&root->root_key,
root_item);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, tree_root, ret);
+ err = ret;
+ goto out_end_trans;
+ }
btrfs_end_transaction_throttle(trans, tree_root);
trans = btrfs_start_transaction(tree_root, 0);
- BUG_ON(IS_ERR(trans));
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ goto out_free;
+ }
if (block_rsv)
trans->block_rsv = block_rsv;
}
}
btrfs_release_path(path);
- BUG_ON(err);
+ if (err)
+ goto out_end_trans;
ret = btrfs_del_root(trans, tree_root, &root->root_key);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, tree_root, ret);
+ goto out_end_trans;
+ }
if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
ret = btrfs_find_last_root(tree_root, root->root_key.objectid,
NULL, NULL);
- BUG_ON(ret < 0);
- if (ret > 0) {
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, tree_root, ret);
+ err = ret;
+ goto out_end_trans;
+ } else if (ret > 0) {
/* if we fail to delete the orphan item this time
* around, it'll get picked up the next time.
*
@@ -6707,20 +6983,22 @@ void btrfs_drop_snapshot(struct btrfs_root *root,
free_extent_buffer(root->commit_root);
kfree(root);
}
-out_free:
+out_end_trans:
btrfs_end_transaction_throttle(trans, tree_root);
+out_free:
kfree(wc);
btrfs_free_path(path);
out:
if (err)
btrfs_std_error(root->fs_info, err);
- return;
+ return err;
}
/*
* drop subtree rooted at tree block 'node'.
*
* NOTE: this function will unlock and release tree block 'node'
+ * only used by relocation code
*/
int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -6765,6 +7043,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
wc->stage = DROP_REFERENCE;
wc->update_ref = 0;
wc->keep_locks = 1;
+ wc->for_reloc = 1;
wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
while (1) {
@@ -6789,8 +7068,15 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
{
u64 num_devices;
- u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
- BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
+ u64 stripped;
+
+ /*
+ * if restripe for this chunk_type is on pick target profile and
+ * return, otherwise do the usual balance
+ */
+ stripped = get_restripe_target(root->fs_info, flags);
+ if (stripped)
+ return extended_to_chunk(stripped);
/*
* we add in the count of missing devices because we want
@@ -6800,6 +7086,9 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
num_devices = root->fs_info->fs_devices->rw_devices +
root->fs_info->fs_devices->missing_devices;
+ stripped = BTRFS_BLOCK_GROUP_RAID0 |
+ BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
+
if (num_devices == 1) {
stripped |= BTRFS_BLOCK_GROUP_DUP;
stripped = flags & ~stripped;
@@ -6812,7 +7101,6 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
BTRFS_BLOCK_GROUP_RAID10))
return stripped | BTRFS_BLOCK_GROUP_DUP;
- return flags;
} else {
/* they already had raid on here, just return */
if (flags & stripped)
@@ -6825,9 +7113,9 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
if (flags & BTRFS_BLOCK_GROUP_DUP)
return stripped | BTRFS_BLOCK_GROUP_RAID1;
- /* turn single device chunks into raid0 */
- return stripped | BTRFS_BLOCK_GROUP_RAID0;
+ /* this is drive concat, leave it alone */
}
+
return flags;
}
@@ -6886,12 +7174,16 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
BUG_ON(cache->ro);
trans = btrfs_join_transaction(root);
- BUG_ON(IS_ERR(trans));
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
alloc_flags = update_block_group_flags(root, cache->flags);
- if (alloc_flags != cache->flags)
- do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
- CHUNK_ALLOC_FORCE);
+ if (alloc_flags != cache->flags) {
+ ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
+ CHUNK_ALLOC_FORCE);
+ if (ret < 0)
+ goto out;
+ }
ret = set_block_group_ro(cache, 0);
if (!ret)
@@ -6971,7 +7263,7 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
return free_bytes;
}
-int btrfs_set_block_group_rw(struct btrfs_root *root,
+void btrfs_set_block_group_rw(struct btrfs_root *root,
struct btrfs_block_group_cache *cache)
{
struct btrfs_space_info *sinfo = cache->space_info;
@@ -6987,7 +7279,6 @@ int btrfs_set_block_group_rw(struct btrfs_root *root,
cache->ro = 0;
spin_unlock(&cache->lock);
spin_unlock(&sinfo->lock);
- return 0;
}
/*
@@ -7005,6 +7296,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
u64 min_free;
u64 dev_min = 1;
u64 dev_nr = 0;
+ u64 target;
int index;
int full = 0;
int ret = 0;
@@ -7045,13 +7337,11 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
/*
* ok we don't have enough space, but maybe we have free space on our
* devices to allocate new chunks for relocation, so loop through our
- * alloc devices and guess if we have enough space. However, if we
- * were marked as full, then we know there aren't enough chunks, and we
- * can just return.
+ * alloc devices and guess if we have enough space. if this block
+ * group is going to be restriped, run checks against the target
+ * profile instead of the current one.
*/
ret = -1;
- if (full)
- goto out;
/*
* index:
@@ -7061,7 +7351,20 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
* 3: raid0
* 4: single
*/
- index = get_block_group_index(block_group);
+ target = get_restripe_target(root->fs_info, block_group->flags);
+ if (target) {
+ index = __get_block_group_index(extended_to_chunk(target));
+ } else {
+ /*
+ * this is just a balance, so if we were marked as full
+ * we know there is no space for a new chunk
+ */
+ if (full)
+ goto out;
+
+ index = get_block_group_index(block_group);
+ }
+
if (index == 0) {
dev_min = 4;
/* Divide by 2 */
@@ -7085,7 +7388,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
* space to fit our block group in.
*/
if (device->total_bytes > device->bytes_used + min_free) {
- ret = find_free_dev_extent(NULL, device, min_free,
+ ret = find_free_dev_extent(device, min_free,
&dev_offset, NULL);
if (!ret)
dev_nr++;
@@ -7355,7 +7658,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
ret = update_space_info(info, cache->flags, found_key.offset,
btrfs_block_group_used(&cache->item),
&space_info);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
cache->space_info = space_info;
spin_lock(&cache->space_info->lock);
cache->space_info->bytes_readonly += cache->bytes_super;
@@ -7364,7 +7667,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
__link_block_group(space_info, cache);
ret = btrfs_add_block_group_cache(root->fs_info, cache);
- BUG_ON(ret);
+ BUG_ON(ret); /* Logic error */
set_avail_alloc_bits(root->fs_info, cache->flags);
if (btrfs_chunk_readonly(root, cache->key.objectid))
@@ -7446,7 +7749,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
&cache->space_info);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
+ update_global_block_rsv(root->fs_info);
spin_lock(&cache->space_info->lock);
cache->space_info->bytes_readonly += cache->bytes_super;
@@ -7455,17 +7759,33 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
__link_block_group(cache->space_info, cache);
ret = btrfs_add_block_group_cache(root->fs_info, cache);
- BUG_ON(ret);
+ BUG_ON(ret); /* Logic error */
ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item,
sizeof(cache->item));
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, extent_root, ret);
+ return ret;
+ }
set_avail_alloc_bits(extent_root->fs_info, type);
return 0;
}
+static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
+{
+ u64 extra_flags = chunk_to_extended(flags) &
+ BTRFS_EXTENDED_PROFILE_MASK;
+
+ if (flags & BTRFS_BLOCK_GROUP_DATA)
+ fs_info->avail_data_alloc_bits &= ~extra_flags;
+ if (flags & BTRFS_BLOCK_GROUP_METADATA)
+ fs_info->avail_metadata_alloc_bits &= ~extra_flags;
+ if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+ fs_info->avail_system_alloc_bits &= ~extra_flags;
+}
+
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 group_start)
{
@@ -7476,6 +7796,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
struct btrfs_key key;
struct inode *inode;
int ret;
+ int index;
int factor;
root = root->fs_info->extent_root;
@@ -7491,6 +7812,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
free_excluded_extents(root, block_group);
memcpy(&key, &block_group->key, sizeof(key));
+ index = get_block_group_index(block_group);
if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
BTRFS_BLOCK_GROUP_RAID1 |
BTRFS_BLOCK_GROUP_RAID10))
@@ -7522,7 +7844,10 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
inode = lookup_free_space_inode(tree_root, block_group, path);
if (!IS_ERR(inode)) {
ret = btrfs_orphan_add(trans, inode);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_add_delayed_iput(inode);
+ goto out;
+ }
clear_nlink(inode);
/* One for the block groups ref */
spin_lock(&block_group->lock);
@@ -7565,6 +7890,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
* are still on the list after taking the semaphore
*/
list_del_init(&block_group->list);
+ if (list_empty(&block_group->space_info->block_groups[index]))
+ clear_avail_alloc_bits(root->fs_info, block_group->flags);
up_write(&block_group->space_info->groups_sem);
if (block_group->cached == BTRFS_CACHE_STARTED)
@@ -7654,9 +7981,16 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
u64 start;
u64 end;
u64 trimmed = 0;
+ u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
int ret = 0;
- cache = btrfs_lookup_block_group(fs_info, range->start);
+ /*
+ * try to trim all FS space, our block group may start from non-zero.
+ */
+ if (range->len == total_bytes)
+ cache = btrfs_lookup_first_block_group(fs_info, range->start);
+ else
+ cache = btrfs_lookup_block_group(fs_info, range->start);
while (cache) {
if (cache->key.objectid >= (range->start + range->len)) {
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 49f3c9dc09f..8d904dd7ea9 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -18,6 +18,8 @@
#include "ctree.h"
#include "btrfs_inode.h"
#include "volumes.h"
+#include "check-integrity.h"
+#include "locking.h"
static struct kmem_cache *extent_state_cache;
static struct kmem_cache *extent_buffer_cache;
@@ -52,6 +54,13 @@ struct extent_page_data {
unsigned int sync_io:1;
};
+static noinline void flush_write_bio(void *data);
+static inline struct btrfs_fs_info *
+tree_fs_info(struct extent_io_tree *tree)
+{
+ return btrfs_sb(tree->mapping->host->i_sb);
+}
+
int __init extent_io_init(void)
{
extent_state_cache = kmem_cache_create("extent_state",
@@ -135,6 +144,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask)
#endif
atomic_set(&state->refs, 1);
init_waitqueue_head(&state->wq);
+ trace_alloc_extent_state(state, mask, _RET_IP_);
return state;
}
@@ -152,6 +162,7 @@ void free_extent_state(struct extent_state *state)
list_del(&state->leak_list);
spin_unlock_irqrestore(&leak_lock, flags);
#endif
+ trace_free_extent_state(state, _RET_IP_);
kmem_cache_free(extent_state_cache, state);
}
}
@@ -438,6 +449,13 @@ alloc_extent_state_atomic(struct extent_state *prealloc)
return prealloc;
}
+void extent_io_tree_panic(struct extent_io_tree *tree, int err)
+{
+ btrfs_panic(tree_fs_info(tree), err, "Locking error: "
+ "Extent tree was modified by another "
+ "thread while locked.");
+}
+
/*
* clear some bits on a range in the tree. This may require splitting
* or inserting elements in the tree, so the gfp mask is used to
@@ -448,8 +466,7 @@ alloc_extent_state_atomic(struct extent_state *prealloc)
*
* the range [start, end] is inclusive.
*
- * This takes the tree lock, and returns < 0 on error, > 0 if any of the
- * bits were already set, or zero if none of the bits were already set.
+ * This takes the tree lock, and returns 0 on success and < 0 on error.
*/
int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
int bits, int wake, int delete,
@@ -463,7 +480,6 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
struct rb_node *node;
u64 last_end;
int err;
- int set = 0;
int clear = 0;
if (delete)
@@ -512,6 +528,15 @@ hit_next:
WARN_ON(state->end < start);
last_end = state->end;
+ if (state->end < end && !need_resched())
+ next_node = rb_next(&state->rb_node);
+ else
+ next_node = NULL;
+
+ /* the state doesn't have the wanted bits, go ahead */
+ if (!(state->state & bits))
+ goto next;
+
/*
* | ---- desired range ---- |
* | state | or
@@ -532,12 +557,14 @@ hit_next:
prealloc = alloc_extent_state_atomic(prealloc);
BUG_ON(!prealloc);
err = split_state(tree, state, prealloc, start);
- BUG_ON(err == -EEXIST);
+ if (err)
+ extent_io_tree_panic(tree, err);
+
prealloc = NULL;
if (err)
goto out;
if (state->end <= end) {
- set |= clear_state_bit(tree, state, &bits, wake);
+ clear_state_bit(tree, state, &bits, wake);
if (last_end == (u64)-1)
goto out;
start = last_end + 1;
@@ -554,30 +581,27 @@ hit_next:
prealloc = alloc_extent_state_atomic(prealloc);
BUG_ON(!prealloc);
err = split_state(tree, state, prealloc, end + 1);
- BUG_ON(err == -EEXIST);
+ if (err)
+ extent_io_tree_panic(tree, err);
+
if (wake)
wake_up(&state->wq);
- set |= clear_state_bit(tree, prealloc, &bits, wake);
+ clear_state_bit(tree, prealloc, &bits, wake);
prealloc = NULL;
goto out;
}
- if (state->end < end && prealloc && !need_resched())
- next_node = rb_next(&state->rb_node);
- else
- next_node = NULL;
-
- set |= clear_state_bit(tree, state, &bits, wake);
+ clear_state_bit(tree, state, &bits, wake);
+next:
if (last_end == (u64)-1)
goto out;
start = last_end + 1;
if (start <= end && next_node) {
state = rb_entry(next_node, struct extent_state,
rb_node);
- if (state->start == start)
- goto hit_next;
+ goto hit_next;
}
goto search_again;
@@ -586,7 +610,7 @@ out:
if (prealloc)
free_extent_state(prealloc);
- return set;
+ return 0;
search_again:
if (start > end)
@@ -597,8 +621,8 @@ search_again:
goto again;
}
-static int wait_on_state(struct extent_io_tree *tree,
- struct extent_state *state)
+static void wait_on_state(struct extent_io_tree *tree,
+ struct extent_state *state)
__releases(tree->lock)
__acquires(tree->lock)
{
@@ -608,7 +632,6 @@ static int wait_on_state(struct extent_io_tree *tree,
schedule();
spin_lock(&tree->lock);
finish_wait(&state->wq, &wait);
- return 0;
}
/*
@@ -616,7 +639,7 @@ static int wait_on_state(struct extent_io_tree *tree,
* The range [start, end] is inclusive.
* The tree lock is taken by this function
*/
-int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
+void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
{
struct extent_state *state;
struct rb_node *node;
@@ -653,7 +676,6 @@ again:
}
out:
spin_unlock(&tree->lock);
- return 0;
}
static void set_state_bits(struct extent_io_tree *tree,
@@ -701,9 +723,10 @@ static void uncache_state(struct extent_state **cached_ptr)
* [start, end] is inclusive This takes the tree lock.
*/
-int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- int bits, int exclusive_bits, u64 *failed_start,
- struct extent_state **cached_state, gfp_t mask)
+static int __must_check
+__set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ int bits, int exclusive_bits, u64 *failed_start,
+ struct extent_state **cached_state, gfp_t mask)
{
struct extent_state *state;
struct extent_state *prealloc = NULL;
@@ -737,8 +760,10 @@ again:
prealloc = alloc_extent_state_atomic(prealloc);
BUG_ON(!prealloc);
err = insert_state(tree, prealloc, start, end, &bits);
+ if (err)
+ extent_io_tree_panic(tree, err);
+
prealloc = NULL;
- BUG_ON(err == -EEXIST);
goto out;
}
state = rb_entry(node, struct extent_state, rb_node);
@@ -804,7 +829,9 @@ hit_next:
prealloc = alloc_extent_state_atomic(prealloc);
BUG_ON(!prealloc);
err = split_state(tree, state, prealloc, start);
- BUG_ON(err == -EEXIST);
+ if (err)
+ extent_io_tree_panic(tree, err);
+
prealloc = NULL;
if (err)
goto out;
@@ -841,12 +868,9 @@ hit_next:
*/
err = insert_state(tree, prealloc, start, this_end,
&bits);
- BUG_ON(err == -EEXIST);
- if (err) {
- free_extent_state(prealloc);
- prealloc = NULL;
- goto out;
- }
+ if (err)
+ extent_io_tree_panic(tree, err);
+
cache_state(prealloc, cached_state);
prealloc = NULL;
start = this_end + 1;
@@ -868,7 +892,8 @@ hit_next:
prealloc = alloc_extent_state_atomic(prealloc);
BUG_ON(!prealloc);
err = split_state(tree, state, prealloc, end + 1);
- BUG_ON(err == -EEXIST);
+ if (err)
+ extent_io_tree_panic(tree, err);
set_state_bits(tree, prealloc, &bits);
cache_state(prealloc, cached_state);
@@ -895,6 +920,15 @@ search_again:
goto again;
}
+int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits,
+ u64 *failed_start, struct extent_state **cached_state,
+ gfp_t mask)
+{
+ return __set_extent_bit(tree, start, end, bits, 0, failed_start,
+ cached_state, mask);
+}
+
+
/**
* convert_extent - convert all bits in a given range from one bit to another
* @tree: the io tree to search
@@ -941,7 +975,8 @@ again:
}
err = insert_state(tree, prealloc, start, end, &bits);
prealloc = NULL;
- BUG_ON(err == -EEXIST);
+ if (err)
+ extent_io_tree_panic(tree, err);
goto out;
}
state = rb_entry(node, struct extent_state, rb_node);
@@ -960,8 +995,6 @@ hit_next:
set_state_bits(tree, state, &bits);
clear_state_bit(tree, state, &clear_bits, 0);
-
- merge_state(tree, state);
if (last_end == (u64)-1)
goto out;
@@ -999,14 +1032,14 @@ hit_next:
goto out;
}
err = split_state(tree, state, prealloc, start);
- BUG_ON(err == -EEXIST);
+ if (err)
+ extent_io_tree_panic(tree, err);
prealloc = NULL;
if (err)
goto out;
if (state->end <= end) {
set_state_bits(tree, state, &bits);
clear_state_bit(tree, state, &clear_bits, 0);
- merge_state(tree, state);
if (last_end == (u64)-1)
goto out;
start = last_end + 1;
@@ -1039,12 +1072,8 @@ hit_next:
*/
err = insert_state(tree, prealloc, start, this_end,
&bits);
- BUG_ON(err == -EEXIST);
- if (err) {
- free_extent_state(prealloc);
- prealloc = NULL;
- goto out;
- }
+ if (err)
+ extent_io_tree_panic(tree, err);
prealloc = NULL;
start = this_end + 1;
goto search_again;
@@ -1063,12 +1092,11 @@ hit_next:
}
err = split_state(tree, state, prealloc, end + 1);
- BUG_ON(err == -EEXIST);
+ if (err)
+ extent_io_tree_panic(tree, err);
set_state_bits(tree, prealloc, &bits);
clear_state_bit(tree, prealloc, &clear_bits, 0);
-
- merge_state(tree, prealloc);
prealloc = NULL;
goto out;
}
@@ -1095,14 +1123,14 @@ search_again:
int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask)
{
- return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
+ return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL,
NULL, mask);
}
int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
int bits, gfp_t mask)
{
- return set_extent_bit(tree, start, end, bits, 0, NULL,
+ return set_extent_bit(tree, start, end, bits, NULL,
NULL, mask);
}
@@ -1117,7 +1145,7 @@ int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
{
return set_extent_bit(tree, start, end,
EXTENT_DELALLOC | EXTENT_UPTODATE,
- 0, NULL, cached_state, mask);
+ NULL, cached_state, mask);
}
int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
@@ -1131,7 +1159,7 @@ int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask)
{
- return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
+ return set_extent_bit(tree, start, end, EXTENT_NEW, NULL,
NULL, mask);
}
@@ -1139,7 +1167,7 @@ int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached_state, gfp_t mask)
{
return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0,
- NULL, cached_state, mask);
+ cached_state, mask);
}
static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
@@ -1155,42 +1183,40 @@ static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
* us if waiting is desired.
*/
int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- int bits, struct extent_state **cached_state, gfp_t mask)
+ int bits, struct extent_state **cached_state)
{
int err;
u64 failed_start;
while (1) {
- err = set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
- EXTENT_LOCKED, &failed_start,
- cached_state, mask);
- if (err == -EEXIST && (mask & __GFP_WAIT)) {
+ err = __set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
+ EXTENT_LOCKED, &failed_start,
+ cached_state, GFP_NOFS);
+ if (err == -EEXIST) {
wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
start = failed_start;
- } else {
+ } else
break;
- }
WARN_ON(start > end);
}
return err;
}
-int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
+int lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
{
- return lock_extent_bits(tree, start, end, 0, NULL, mask);
+ return lock_extent_bits(tree, start, end, 0, NULL);
}
-int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
- gfp_t mask)
+int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
{
int err;
u64 failed_start;
- err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
- &failed_start, NULL, mask);
+ err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
+ &failed_start, NULL, GFP_NOFS);
if (err == -EEXIST) {
if (failed_start > start)
clear_extent_bit(tree, start, failed_start - 1,
- EXTENT_LOCKED, 1, 0, NULL, mask);
+ EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS);
return 0;
}
return 1;
@@ -1203,10 +1229,10 @@ int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
mask);
}
-int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
+int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end)
{
return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
- mask);
+ GFP_NOFS);
}
/*
@@ -1220,7 +1246,7 @@ static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
while (index <= end_index) {
page = find_get_page(tree->mapping, index);
- BUG_ON(!page);
+ BUG_ON(!page); /* Pages should be in the extent_io_tree */
set_page_writeback(page);
page_cache_release(page);
index++;
@@ -1343,9 +1369,9 @@ out:
return found;
}
-static noinline int __unlock_for_delalloc(struct inode *inode,
- struct page *locked_page,
- u64 start, u64 end)
+static noinline void __unlock_for_delalloc(struct inode *inode,
+ struct page *locked_page,
+ u64 start, u64 end)
{
int ret;
struct page *pages[16];
@@ -1355,7 +1381,7 @@ static noinline int __unlock_for_delalloc(struct inode *inode,
int i;
if (index == locked_page->index && end_index == index)
- return 0;
+ return;
while (nr_pages > 0) {
ret = find_get_pages_contig(inode->i_mapping, index,
@@ -1370,7 +1396,6 @@ static noinline int __unlock_for_delalloc(struct inode *inode,
index += ret;
cond_resched();
}
- return 0;
}
static noinline int lock_delalloc_pages(struct inode *inode,
@@ -1500,11 +1525,10 @@ again:
goto out_failed;
}
}
- BUG_ON(ret);
+ BUG_ON(ret); /* Only valid values are 0 and -EAGAIN */
/* step three, lock the state bits for the whole range */
- lock_extent_bits(tree, delalloc_start, delalloc_end,
- 0, &cached_state, GFP_NOFS);
+ lock_extent_bits(tree, delalloc_start, delalloc_end, 0, &cached_state);
/* then test to make sure it is all still delalloc */
ret = test_range_bit(tree, delalloc_start, delalloc_end,
@@ -1761,39 +1785,34 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
* helper function to set a given page up to date if all the
* extents in the tree for that page are up to date
*/
-static int check_page_uptodate(struct extent_io_tree *tree,
- struct page *page)
+static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
{
u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
u64 end = start + PAGE_CACHE_SIZE - 1;
if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
SetPageUptodate(page);
- return 0;
}
/*
* helper function to unlock a page if all the extents in the tree
* for that page are unlocked
*/
-static int check_page_locked(struct extent_io_tree *tree,
- struct page *page)
+static void check_page_locked(struct extent_io_tree *tree, struct page *page)
{
u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
u64 end = start + PAGE_CACHE_SIZE - 1;
if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL))
unlock_page(page);
- return 0;
}
/*
* helper function to end page writeback if all the extents
* in the tree for that page are done with writeback
*/
-static int check_page_writeback(struct extent_io_tree *tree,
- struct page *page)
+static void check_page_writeback(struct extent_io_tree *tree,
+ struct page *page)
{
end_page_writeback(page);
- return 0;
}
/*
@@ -1895,7 +1914,7 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
}
bio->bi_bdev = dev->bdev;
bio_add_page(bio, page, length, start-page_offset(page));
- submit_bio(WRITE_SYNC, bio);
+ btrfsic_submit_bio(WRITE_SYNC, bio);
wait_for_completion(&compl);
if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
@@ -1912,6 +1931,26 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
return 0;
}
+int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
+ int mirror_num)
+{
+ struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
+ u64 start = eb->start;
+ unsigned long i, num_pages = num_extent_pages(eb->start, eb->len);
+ int ret;
+
+ for (i = 0; i < num_pages; i++) {
+ struct page *p = extent_buffer_page(eb, i);
+ ret = repair_io_failure(map_tree, start, PAGE_CACHE_SIZE,
+ start, p, mirror_num);
+ if (ret)
+ break;
+ start += PAGE_CACHE_SIZE;
+ }
+
+ return ret;
+}
+
/*
* each time an IO finishes, we do a fast check in the IO failure tree
* to see if we need to process or clean up an io_failure_record
@@ -2153,13 +2192,46 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page,
"this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode,
failrec->this_mirror, num_copies, failrec->in_validation);
- tree->ops->submit_bio_hook(inode, read_mode, bio, failrec->this_mirror,
- failrec->bio_flags, 0);
- return 0;
+ ret = tree->ops->submit_bio_hook(inode, read_mode, bio,
+ failrec->this_mirror,
+ failrec->bio_flags, 0);
+ return ret;
}
/* lots and lots of room for performance fixes in the end_bio funcs */
+int end_extent_writepage(struct page *page, int err, u64 start, u64 end)
+{
+ int uptodate = (err == 0);
+ struct extent_io_tree *tree;
+ int ret;
+
+ tree = &BTRFS_I(page->mapping->host)->io_tree;
+
+ if (tree->ops && tree->ops->writepage_end_io_hook) {
+ ret = tree->ops->writepage_end_io_hook(page, start,
+ end, NULL, uptodate);
+ if (ret)
+ uptodate = 0;
+ }
+
+ if (!uptodate && tree->ops &&
+ tree->ops->writepage_io_failed_hook) {
+ ret = tree->ops->writepage_io_failed_hook(NULL, page,
+ start, end, NULL);
+ /* Writeback already completed */
+ if (ret == 0)
+ return 1;
+ }
+
+ if (!uptodate) {
+ clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS);
+ ClearPageUptodate(page);
+ SetPageError(page);
+ }
+ return 0;
+}
+
/*
* after a writepage IO is done, we need to:
* clear the uptodate bits on error
@@ -2171,13 +2243,11 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page,
*/
static void end_bio_extent_writepage(struct bio *bio, int err)
{
- int uptodate = err == 0;
struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
struct extent_io_tree *tree;
u64 start;
u64 end;
int whole_page;
- int ret;
do {
struct page *page = bvec->bv_page;
@@ -2194,28 +2264,9 @@ static void end_bio_extent_writepage(struct bio *bio, int err)
if (--bvec >= bio->bi_io_vec)
prefetchw(&bvec->bv_page->flags);
- if (tree->ops && tree->ops->writepage_end_io_hook) {
- ret = tree->ops->writepage_end_io_hook(page, start,
- end, NULL, uptodate);
- if (ret)
- uptodate = 0;
- }
-
- if (!uptodate && tree->ops &&
- tree->ops->writepage_io_failed_hook) {
- ret = tree->ops->writepage_io_failed_hook(bio, page,
- start, end, NULL);
- if (ret == 0) {
- uptodate = (err == 0);
- continue;
- }
- }
- if (!uptodate) {
- clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS);
- ClearPageUptodate(page);
- SetPageError(page);
- }
+ if (end_extent_writepage(page, err, start, end))
+ continue;
if (whole_page)
end_page_writeback(page);
@@ -2246,6 +2297,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
u64 start;
u64 end;
int whole_page;
+ int failed_mirror;
int ret;
if (err)
@@ -2292,9 +2344,16 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
else
clean_io_failure(start, page);
}
- if (!uptodate) {
- int failed_mirror;
+
+ if (!uptodate)
failed_mirror = (int)(unsigned long)bio->bi_bdev;
+
+ if (!uptodate && tree->ops && tree->ops->readpage_io_failed_hook) {
+ ret = tree->ops->readpage_io_failed_hook(page, failed_mirror);
+ if (!ret && !err &&
+ test_bit(BIO_UPTODATE, &bio->bi_flags))
+ uptodate = 1;
+ } else if (!uptodate) {
/*
* The generic bio_readpage_error handles errors the
* following way: If possible, new read requests are
@@ -2308,7 +2367,6 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
ret = bio_readpage_error(bio, page, start, end,
failed_mirror, NULL);
if (ret == 0) {
-error_handled:
uptodate =
test_bit(BIO_UPTODATE, &bio->bi_flags);
if (err)
@@ -2316,16 +2374,9 @@ error_handled:
uncache_state(&cached);
continue;
}
- if (tree->ops && tree->ops->readpage_io_failed_hook) {
- ret = tree->ops->readpage_io_failed_hook(
- bio, page, start, end,
- failed_mirror, state);
- if (ret == 0)
- goto error_handled;
- }
}
- if (uptodate) {
+ if (uptodate && tree->track_uptodate) {
set_extent_uptodate(tree, start, end, &cached,
GFP_ATOMIC);
}
@@ -2374,8 +2425,12 @@ btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
return bio;
}
-static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
- unsigned long bio_flags)
+/*
+ * Since writes are async, they will only return -ENOMEM.
+ * Reads can return the full range of I/O error conditions.
+ */
+static int __must_check submit_one_bio(int rw, struct bio *bio,
+ int mirror_num, unsigned long bio_flags)
{
int ret = 0;
struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
@@ -2393,7 +2448,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
mirror_num, bio_flags, start);
else
- submit_bio(rw, bio);
+ btrfsic_submit_bio(rw, bio);
if (bio_flagged(bio, BIO_EOPNOTSUPP))
ret = -EOPNOTSUPP;
@@ -2401,6 +2456,19 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
return ret;
}
+static int merge_bio(struct extent_io_tree *tree, struct page *page,
+ unsigned long offset, size_t size, struct bio *bio,
+ unsigned long bio_flags)
+{
+ int ret = 0;
+ if (tree->ops && tree->ops->merge_bio_hook)
+ ret = tree->ops->merge_bio_hook(page, offset, size, bio,
+ bio_flags);
+ BUG_ON(ret < 0);
+ return ret;
+
+}
+
static int submit_extent_page(int rw, struct extent_io_tree *tree,
struct page *page, sector_t sector,
size_t size, unsigned long offset,
@@ -2429,12 +2497,12 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
sector;
if (prev_bio_flags != bio_flags || !contig ||
- (tree->ops && tree->ops->merge_bio_hook &&
- tree->ops->merge_bio_hook(page, offset, page_size, bio,
- bio_flags)) ||
+ merge_bio(tree, page, offset, page_size, bio, bio_flags) ||
bio_add_page(bio, page, page_size, offset) < page_size) {
ret = submit_one_bio(rw, bio, mirror_num,
prev_bio_flags);
+ if (ret < 0)
+ return ret;
bio = NULL;
} else {
return 0;
@@ -2461,25 +2529,31 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
return ret;
}
-void set_page_extent_mapped(struct page *page)
+void attach_extent_buffer_page(struct extent_buffer *eb, struct page *page)
{
if (!PagePrivate(page)) {
SetPagePrivate(page);
page_cache_get(page);
- set_page_private(page, EXTENT_PAGE_PRIVATE);
+ set_page_private(page, (unsigned long)eb);
+ } else {
+ WARN_ON(page->private != (unsigned long)eb);
}
}
-static void set_page_extent_head(struct page *page, unsigned long len)
+void set_page_extent_mapped(struct page *page)
{
- WARN_ON(!PagePrivate(page));
- set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2);
+ if (!PagePrivate(page)) {
+ SetPagePrivate(page);
+ page_cache_get(page);
+ set_page_private(page, EXTENT_PAGE_PRIVATE);
+ }
}
/*
* basic readpage implementation. Locked extent state structs are inserted
* into the tree that are removed when the IO is done (by the end_io
* handlers)
+ * XXX JDM: This needs looking at to ensure proper page locking
*/
static int __extent_read_full_page(struct extent_io_tree *tree,
struct page *page,
@@ -2519,11 +2593,11 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
end = page_end;
while (1) {
- lock_extent(tree, start, end, GFP_NOFS);
+ lock_extent(tree, start, end);
ordered = btrfs_lookup_ordered_extent(inode, start);
if (!ordered)
break;
- unlock_extent(tree, start, end, GFP_NOFS);
+ unlock_extent(tree, start, end);
btrfs_start_ordered_extent(inode, ordered, 1);
btrfs_put_ordered_extent(ordered);
}
@@ -2534,10 +2608,10 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
if (zero_offset) {
iosize = PAGE_CACHE_SIZE - zero_offset;
- userpage = kmap_atomic(page, KM_USER0);
+ userpage = kmap_atomic(page);
memset(userpage + zero_offset, 0, iosize);
flush_dcache_page(page);
- kunmap_atomic(userpage, KM_USER0);
+ kunmap_atomic(userpage);
}
}
while (cur <= end) {
@@ -2546,10 +2620,10 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
struct extent_state *cached = NULL;
iosize = PAGE_CACHE_SIZE - pg_offset;
- userpage = kmap_atomic(page, KM_USER0);
+ userpage = kmap_atomic(page);
memset(userpage + pg_offset, 0, iosize);
flush_dcache_page(page);
- kunmap_atomic(userpage, KM_USER0);
+ kunmap_atomic(userpage);
set_extent_uptodate(tree, cur, cur + iosize - 1,
&cached, GFP_NOFS);
unlock_extent_cached(tree, cur, cur + iosize - 1,
@@ -2560,7 +2634,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
end - cur + 1, 0);
if (IS_ERR_OR_NULL(em)) {
SetPageError(page);
- unlock_extent(tree, cur, end, GFP_NOFS);
+ unlock_extent(tree, cur, end);
break;
}
extent_offset = cur - em->start;
@@ -2595,10 +2669,10 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
char *userpage;
struct extent_state *cached = NULL;
- userpage = kmap_atomic(page, KM_USER0);
+ userpage = kmap_atomic(page);
memset(userpage + pg_offset, 0, iosize);
flush_dcache_page(page);
- kunmap_atomic(userpage, KM_USER0);
+ kunmap_atomic(userpage);
set_extent_uptodate(tree, cur, cur + iosize - 1,
&cached, GFP_NOFS);
@@ -2612,7 +2686,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
if (test_range_bit(tree, cur, cur_end,
EXTENT_UPTODATE, 1, NULL)) {
check_page_uptodate(tree, page);
- unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
+ unlock_extent(tree, cur, cur + iosize - 1);
cur = cur + iosize;
pg_offset += iosize;
continue;
@@ -2622,7 +2696,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
*/
if (block_start == EXTENT_MAP_INLINE) {
SetPageError(page);
- unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
+ unlock_extent(tree, cur, cur + iosize - 1);
cur = cur + iosize;
pg_offset += iosize;
continue;
@@ -2642,6 +2716,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
end_bio_extent_readpage, mirror_num,
*bio_flags,
this_bio_flag);
+ BUG_ON(ret == -ENOMEM);
nr++;
*bio_flags = this_bio_flag;
}
@@ -2744,10 +2819,10 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
if (page->index == end_index) {
char *userpage;
- userpage = kmap_atomic(page, KM_USER0);
+ userpage = kmap_atomic(page);
memset(userpage + pg_offset, 0,
PAGE_CACHE_SIZE - pg_offset);
- kunmap_atomic(userpage, KM_USER0);
+ kunmap_atomic(userpage);
flush_dcache_page(page);
}
pg_offset = 0;
@@ -2778,9 +2853,16 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
delalloc_start = delalloc_end + 1;
continue;
}
- tree->ops->fill_delalloc(inode, page, delalloc_start,
- delalloc_end, &page_started,
- &nr_written);
+ ret = tree->ops->fill_delalloc(inode, page,
+ delalloc_start,
+ delalloc_end,
+ &page_started,
+ &nr_written);
+ /* File system has been set read-only */
+ if (ret) {
+ SetPageError(page);
+ goto done;
+ }
/*
* delalloc_end is already one less than the total
* length, so we don't subtract one from
@@ -2817,8 +2899,12 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
if (tree->ops && tree->ops->writepage_start_hook) {
ret = tree->ops->writepage_start_hook(page, start,
page_end);
- if (ret == -EAGAIN) {
- redirty_page_for_writepage(wbc, page);
+ if (ret) {
+ /* Fixup worker will requeue */
+ if (ret == -EBUSY)
+ wbc->pages_skipped++;
+ else
+ redirty_page_for_writepage(wbc, page);
update_nr_written(page, wbc, nr_written);
unlock_page(page);
ret = 0;
@@ -2949,6 +3035,275 @@ done_unlocked:
return 0;
}
+static int eb_wait(void *word)
+{
+ io_schedule();
+ return 0;
+}
+
+static void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
+{
+ wait_on_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK, eb_wait,
+ TASK_UNINTERRUPTIBLE);
+}
+
+static int lock_extent_buffer_for_io(struct extent_buffer *eb,
+ struct btrfs_fs_info *fs_info,
+ struct extent_page_data *epd)
+{
+ unsigned long i, num_pages;
+ int flush = 0;
+ int ret = 0;
+
+ if (!btrfs_try_tree_write_lock(eb)) {
+ flush = 1;
+ flush_write_bio(epd);
+ btrfs_tree_lock(eb);
+ }
+
+ if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) {
+ btrfs_tree_unlock(eb);
+ if (!epd->sync_io)
+ return 0;
+ if (!flush) {
+ flush_write_bio(epd);
+ flush = 1;
+ }
+ while (1) {
+ wait_on_extent_buffer_writeback(eb);
+ btrfs_tree_lock(eb);
+ if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags))
+ break;
+ btrfs_tree_unlock(eb);
+ }
+ }
+
+ if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
+ set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
+ btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
+ spin_lock(&fs_info->delalloc_lock);
+ if (fs_info->dirty_metadata_bytes >= eb->len)
+ fs_info->dirty_metadata_bytes -= eb->len;
+ else
+ WARN_ON(1);
+ spin_unlock(&fs_info->delalloc_lock);
+ ret = 1;
+ }
+
+ btrfs_tree_unlock(eb);
+
+ if (!ret)
+ return ret;
+
+ num_pages = num_extent_pages(eb->start, eb->len);
+ for (i = 0; i < num_pages; i++) {
+ struct page *p = extent_buffer_page(eb, i);
+
+ if (!trylock_page(p)) {
+ if (!flush) {
+ flush_write_bio(epd);
+ flush = 1;
+ }
+ lock_page(p);
+ }
+ }
+
+ return ret;
+}
+
+static void end_extent_buffer_writeback(struct extent_buffer *eb)
+{
+ clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
+ smp_mb__after_clear_bit();
+ wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
+}
+
+static void end_bio_extent_buffer_writepage(struct bio *bio, int err)
+{
+ int uptodate = err == 0;
+ struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+ struct extent_buffer *eb;
+ int done;
+
+ do {
+ struct page *page = bvec->bv_page;
+
+ bvec--;
+ eb = (struct extent_buffer *)page->private;
+ BUG_ON(!eb);
+ done = atomic_dec_and_test(&eb->io_pages);
+
+ if (!uptodate || test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) {
+ set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
+ ClearPageUptodate(page);
+ SetPageError(page);
+ }
+
+ end_page_writeback(page);
+
+ if (!done)
+ continue;
+
+ end_extent_buffer_writeback(eb);
+ } while (bvec >= bio->bi_io_vec);
+
+ bio_put(bio);
+
+}
+
+static int write_one_eb(struct extent_buffer *eb,
+ struct btrfs_fs_info *fs_info,
+ struct writeback_control *wbc,
+ struct extent_page_data *epd)
+{
+ struct block_device *bdev = fs_info->fs_devices->latest_bdev;
+ u64 offset = eb->start;
+ unsigned long i, num_pages;
+ int rw = (epd->sync_io ? WRITE_SYNC : WRITE);
+ int ret;
+
+ clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
+ num_pages = num_extent_pages(eb->start, eb->len);
+ atomic_set(&eb->io_pages, num_pages);
+ for (i = 0; i < num_pages; i++) {
+ struct page *p = extent_buffer_page(eb, i);
+
+ clear_page_dirty_for_io(p);
+ set_page_writeback(p);
+ ret = submit_extent_page(rw, eb->tree, p, offset >> 9,
+ PAGE_CACHE_SIZE, 0, bdev, &epd->bio,
+ -1, end_bio_extent_buffer_writepage,
+ 0, 0, 0);
+ if (ret) {
+ set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
+ SetPageError(p);
+ if (atomic_sub_and_test(num_pages - i, &eb->io_pages))
+ end_extent_buffer_writeback(eb);
+ ret = -EIO;
+ break;
+ }
+ offset += PAGE_CACHE_SIZE;
+ update_nr_written(p, wbc, 1);
+ unlock_page(p);
+ }
+
+ if (unlikely(ret)) {
+ for (; i < num_pages; i++) {
+ struct page *p = extent_buffer_page(eb, i);
+ unlock_page(p);
+ }
+ }
+
+ return ret;
+}
+
+int btree_write_cache_pages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree;
+ struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info;
+ struct extent_buffer *eb, *prev_eb = NULL;
+ struct extent_page_data epd = {
+ .bio = NULL,
+ .tree = tree,
+ .extent_locked = 0,
+ .sync_io = wbc->sync_mode == WB_SYNC_ALL,
+ };
+ int ret = 0;
+ int done = 0;
+ int nr_to_write_done = 0;
+ struct pagevec pvec;
+ int nr_pages;
+ pgoff_t index;
+ pgoff_t end; /* Inclusive */
+ int scanned = 0;
+ int tag;
+
+ pagevec_init(&pvec, 0);
+ if (wbc->range_cyclic) {
+ index = mapping->writeback_index; /* Start from prev offset */
+ end = -1;
+ } else {
+ index = wbc->range_start >> PAGE_CACHE_SHIFT;
+ end = wbc->range_end >> PAGE_CACHE_SHIFT;
+ scanned = 1;
+ }
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ tag = PAGECACHE_TAG_TOWRITE;
+ else
+ tag = PAGECACHE_TAG_DIRTY;
+retry:
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ tag_pages_for_writeback(mapping, index, end);
+ while (!done && !nr_to_write_done && (index <= end) &&
+ (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
+ unsigned i;
+
+ scanned = 1;
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+
+ if (!PagePrivate(page))
+ continue;
+
+ if (!wbc->range_cyclic && page->index > end) {
+ done = 1;
+ break;
+ }
+
+ eb = (struct extent_buffer *)page->private;
+ if (!eb) {
+ WARN_ON(1);
+ continue;
+ }
+
+ if (eb == prev_eb)
+ continue;
+
+ if (!atomic_inc_not_zero(&eb->refs)) {
+ WARN_ON(1);
+ continue;
+ }
+
+ prev_eb = eb;
+ ret = lock_extent_buffer_for_io(eb, fs_info, &epd);
+ if (!ret) {
+ free_extent_buffer(eb);
+ continue;
+ }
+
+ ret = write_one_eb(eb, fs_info, wbc, &epd);
+ if (ret) {
+ done = 1;
+ free_extent_buffer(eb);
+ break;
+ }
+ free_extent_buffer(eb);
+
+ /*
+ * the filesystem may choose to bump up nr_to_write.
+ * We have to make sure to honor the new nr_to_write
+ * at any time
+ */
+ nr_to_write_done = wbc->nr_to_write <= 0;
+ }
+ pagevec_release(&pvec);
+ cond_resched();
+ }
+ if (!scanned && !done) {
+ /*
+ * We hit the last page and there is more work to be done: wrap
+ * back to the start of the file
+ */
+ scanned = 1;
+ index = 0;
+ goto retry;
+ }
+ flush_write_bio(&epd);
+ return ret;
+}
+
/**
* write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
* @mapping: address space structure to write
@@ -3080,10 +3435,14 @@ retry:
static void flush_epd_write_bio(struct extent_page_data *epd)
{
if (epd->bio) {
+ int rw = WRITE;
+ int ret;
+
if (epd->sync_io)
- submit_one_bio(WRITE_SYNC, epd->bio, 0, 0);
- else
- submit_one_bio(WRITE, epd->bio, 0, 0);
+ rw = WRITE_SYNC;
+
+ ret = submit_one_bio(rw, epd->bio, 0, 0);
+ BUG_ON(ret < 0); /* -ENOMEM */
epd->bio = NULL;
}
}
@@ -3200,7 +3559,7 @@ int extent_readpages(struct extent_io_tree *tree,
}
BUG_ON(!list_empty(pages));
if (bio)
- submit_one_bio(READ, bio, 0, bio_flags);
+ return submit_one_bio(READ, bio, 0, bio_flags);
return 0;
}
@@ -3221,7 +3580,7 @@ int extent_invalidatepage(struct extent_io_tree *tree,
if (start > end)
return 0;
- lock_extent_bits(tree, start, end, 0, &cached_state, GFP_NOFS);
+ lock_extent_bits(tree, start, end, 0, &cached_state);
wait_on_page_writeback(page);
clear_extent_bit(tree, start, end,
EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
@@ -3288,7 +3647,7 @@ int try_release_extent_mapping(struct extent_map_tree *map,
len = end - start + 1;
write_lock(&map->lock);
em = lookup_extent_mapping(map, start, len);
- if (IS_ERR_OR_NULL(em)) {
+ if (!em) {
write_unlock(&map->lock);
break;
}
@@ -3435,7 +3794,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
}
lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
- &cached_state, GFP_NOFS);
+ &cached_state);
em = get_extent_skip_holes(inode, start, last_for_get_extent,
get_extent);
@@ -3529,26 +3888,7 @@ out:
inline struct page *extent_buffer_page(struct extent_buffer *eb,
unsigned long i)
{
- struct page *p;
- struct address_space *mapping;
-
- if (i == 0)
- return eb->first_page;
- i += eb->start >> PAGE_CACHE_SHIFT;
- mapping = eb->first_page->mapping;
- if (!mapping)
- return NULL;
-
- /*
- * extent_buffer_page is only called after pinning the page
- * by increasing the reference count. So we know the page must
- * be in the radix tree.
- */
- rcu_read_lock();
- p = radix_tree_lookup(&mapping->page_tree, i);
- rcu_read_unlock();
-
- return p;
+ return eb->pages[i];
}
inline unsigned long num_extent_pages(u64 start, u64 len)
@@ -3557,6 +3897,19 @@ inline unsigned long num_extent_pages(u64 start, u64 len)
(start >> PAGE_CACHE_SHIFT);
}
+static void __free_extent_buffer(struct extent_buffer *eb)
+{
+#if LEAK_DEBUG
+ unsigned long flags;
+ spin_lock_irqsave(&leak_lock, flags);
+ list_del(&eb->leak_list);
+ spin_unlock_irqrestore(&leak_lock, flags);
+#endif
+ if (eb->pages && eb->pages != eb->inline_pages)
+ kfree(eb->pages);
+ kmem_cache_free(extent_buffer_cache, eb);
+}
+
static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
u64 start,
unsigned long len,
@@ -3572,6 +3925,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
return NULL;
eb->start = start;
eb->len = len;
+ eb->tree = tree;
rwlock_init(&eb->lock);
atomic_set(&eb->write_locks, 0);
atomic_set(&eb->read_locks, 0);
@@ -3579,6 +3933,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
atomic_set(&eb->blocking_writers, 0);
atomic_set(&eb->spinning_readers, 0);
atomic_set(&eb->spinning_writers, 0);
+ eb->lock_nested = 0;
init_waitqueue_head(&eb->write_lock_wq);
init_waitqueue_head(&eb->read_lock_wq);
@@ -3587,20 +3942,32 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
list_add(&eb->leak_list, &buffers);
spin_unlock_irqrestore(&leak_lock, flags);
#endif
+ spin_lock_init(&eb->refs_lock);
atomic_set(&eb->refs, 1);
+ atomic_set(&eb->io_pages, 0);
+
+ if (len > MAX_INLINE_EXTENT_BUFFER_SIZE) {
+ struct page **pages;
+ int num_pages = (len + PAGE_CACHE_SIZE - 1) >>
+ PAGE_CACHE_SHIFT;
+ pages = kzalloc(num_pages, mask);
+ if (!pages) {
+ __free_extent_buffer(eb);
+ return NULL;
+ }
+ eb->pages = pages;
+ } else {
+ eb->pages = eb->inline_pages;
+ }
return eb;
}
-static void __free_extent_buffer(struct extent_buffer *eb)
+static int extent_buffer_under_io(struct extent_buffer *eb)
{
-#if LEAK_DEBUG
- unsigned long flags;
- spin_lock_irqsave(&leak_lock, flags);
- list_del(&eb->leak_list);
- spin_unlock_irqrestore(&leak_lock, flags);
-#endif
- kmem_cache_free(extent_buffer_cache, eb);
+ return (atomic_read(&eb->io_pages) ||
+ test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) ||
+ test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
}
/*
@@ -3612,8 +3979,7 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
unsigned long index;
struct page *page;
- if (!eb->first_page)
- return;
+ BUG_ON(extent_buffer_under_io(eb));
index = num_extent_pages(eb->start, eb->len);
if (start_idx >= index)
@@ -3622,8 +3988,34 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
do {
index--;
page = extent_buffer_page(eb, index);
- if (page)
+ if (page) {
+ spin_lock(&page->mapping->private_lock);
+ /*
+ * We do this since we'll remove the pages after we've
+ * removed the eb from the radix tree, so we could race
+ * and have this page now attached to the new eb. So
+ * only clear page_private if it's still connected to
+ * this eb.
+ */
+ if (PagePrivate(page) &&
+ page->private == (unsigned long)eb) {
+ BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
+ BUG_ON(PageDirty(page));
+ BUG_ON(PageWriteback(page));
+ /*
+ * We need to make sure we haven't be attached
+ * to a new eb.
+ */
+ ClearPagePrivate(page);
+ set_page_private(page, 0);
+ /* One for the page private */
+ page_cache_release(page);
+ }
+ spin_unlock(&page->mapping->private_lock);
+
+ /* One for when we alloced the page */
page_cache_release(page);
+ }
} while (index != start_idx);
}
@@ -3636,9 +4028,50 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
__free_extent_buffer(eb);
}
+static void check_buffer_tree_ref(struct extent_buffer *eb)
+{
+ /* the ref bit is tricky. We have to make sure it is set
+ * if we have the buffer dirty. Otherwise the
+ * code to free a buffer can end up dropping a dirty
+ * page
+ *
+ * Once the ref bit is set, it won't go away while the
+ * buffer is dirty or in writeback, and it also won't
+ * go away while we have the reference count on the
+ * eb bumped.
+ *
+ * We can't just set the ref bit without bumping the
+ * ref on the eb because free_extent_buffer might
+ * see the ref bit and try to clear it. If this happens
+ * free_extent_buffer might end up dropping our original
+ * ref by mistake and freeing the page before we are able
+ * to add one more ref.
+ *
+ * So bump the ref count first, then set the bit. If someone
+ * beat us to it, drop the ref we added.
+ */
+ if (!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
+ atomic_inc(&eb->refs);
+ if (test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
+ atomic_dec(&eb->refs);
+ }
+}
+
+static void mark_extent_buffer_accessed(struct extent_buffer *eb)
+{
+ unsigned long num_pages, i;
+
+ check_buffer_tree_ref(eb);
+
+ num_pages = num_extent_pages(eb->start, eb->len);
+ for (i = 0; i < num_pages; i++) {
+ struct page *p = extent_buffer_page(eb, i);
+ mark_page_accessed(p);
+ }
+}
+
struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
- u64 start, unsigned long len,
- struct page *page0)
+ u64 start, unsigned long len)
{
unsigned long num_pages = num_extent_pages(start, len);
unsigned long i;
@@ -3654,7 +4087,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
if (eb && atomic_inc_not_zero(&eb->refs)) {
rcu_read_unlock();
- mark_page_accessed(eb->first_page);
+ mark_extent_buffer_accessed(eb);
return eb;
}
rcu_read_unlock();
@@ -3663,32 +4096,43 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
if (!eb)
return NULL;
- if (page0) {
- eb->first_page = page0;
- i = 1;
- index++;
- page_cache_get(page0);
- mark_page_accessed(page0);
- set_page_extent_mapped(page0);
- set_page_extent_head(page0, len);
- uptodate = PageUptodate(page0);
- } else {
- i = 0;
- }
- for (; i < num_pages; i++, index++) {
+ for (i = 0; i < num_pages; i++, index++) {
p = find_or_create_page(mapping, index, GFP_NOFS);
if (!p) {
WARN_ON(1);
goto free_eb;
}
- set_page_extent_mapped(p);
- mark_page_accessed(p);
- if (i == 0) {
- eb->first_page = p;
- set_page_extent_head(p, len);
- } else {
- set_page_private(p, EXTENT_PAGE_PRIVATE);
+
+ spin_lock(&mapping->private_lock);
+ if (PagePrivate(p)) {
+ /*
+ * We could have already allocated an eb for this page
+ * and attached one so lets see if we can get a ref on
+ * the existing eb, and if we can we know it's good and
+ * we can just return that one, else we know we can just
+ * overwrite page->private.
+ */
+ exists = (struct extent_buffer *)p->private;
+ if (atomic_inc_not_zero(&exists->refs)) {
+ spin_unlock(&mapping->private_lock);
+ unlock_page(p);
+ mark_extent_buffer_accessed(exists);
+ goto free_eb;
+ }
+
+ /*
+ * Do this so attach doesn't complain and we need to
+ * drop the ref the old guy had.
+ */
+ ClearPagePrivate(p);
+ WARN_ON(PageDirty(p));
+ page_cache_release(p);
}
+ attach_extent_buffer_page(eb, p);
+ spin_unlock(&mapping->private_lock);
+ WARN_ON(PageDirty(p));
+ mark_page_accessed(p);
+ eb->pages[i] = p;
if (!PageUptodate(p))
uptodate = 0;
@@ -3696,12 +4140,10 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
* see below about how we avoid a nasty race with release page
* and why we unlock later
*/
- if (i != 0)
- unlock_page(p);
}
if (uptodate)
set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
-
+again:
ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
if (ret)
goto free_eb;
@@ -3711,14 +4153,21 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
if (ret == -EEXIST) {
exists = radix_tree_lookup(&tree->buffer,
start >> PAGE_CACHE_SHIFT);
- /* add one reference for the caller */
- atomic_inc(&exists->refs);
+ if (!atomic_inc_not_zero(&exists->refs)) {
+ spin_unlock(&tree->buffer_lock);
+ radix_tree_preload_end();
+ exists = NULL;
+ goto again;
+ }
spin_unlock(&tree->buffer_lock);
radix_tree_preload_end();
+ mark_extent_buffer_accessed(exists);
goto free_eb;
}
/* add one reference for the tree */
- atomic_inc(&eb->refs);
+ spin_lock(&eb->refs_lock);
+ check_buffer_tree_ref(eb);
+ spin_unlock(&eb->refs_lock);
spin_unlock(&tree->buffer_lock);
radix_tree_preload_end();
@@ -3731,15 +4180,20 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
* after the extent buffer is in the radix tree so
* it doesn't get lost
*/
- set_page_extent_mapped(eb->first_page);
- set_page_extent_head(eb->first_page, eb->len);
- if (!page0)
- unlock_page(eb->first_page);
+ SetPageChecked(eb->pages[0]);
+ for (i = 1; i < num_pages; i++) {
+ p = extent_buffer_page(eb, i);
+ ClearPageChecked(p);
+ unlock_page(p);
+ }
+ unlock_page(eb->pages[0]);
return eb;
free_eb:
- if (eb->first_page && !page0)
- unlock_page(eb->first_page);
+ for (i = 0; i < num_pages; i++) {
+ if (eb->pages[i])
+ unlock_page(eb->pages[i]);
+ }
if (!atomic_dec_and_test(&eb->refs))
return exists;
@@ -3756,7 +4210,7 @@ struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
if (eb && atomic_inc_not_zero(&eb->refs)) {
rcu_read_unlock();
- mark_page_accessed(eb->first_page);
+ mark_extent_buffer_accessed(eb);
return eb;
}
rcu_read_unlock();
@@ -3764,19 +4218,71 @@ struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
return NULL;
}
+static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
+{
+ struct extent_buffer *eb =
+ container_of(head, struct extent_buffer, rcu_head);
+
+ __free_extent_buffer(eb);
+}
+
+/* Expects to have eb->eb_lock already held */
+static void release_extent_buffer(struct extent_buffer *eb, gfp_t mask)
+{
+ WARN_ON(atomic_read(&eb->refs) == 0);
+ if (atomic_dec_and_test(&eb->refs)) {
+ struct extent_io_tree *tree = eb->tree;
+
+ spin_unlock(&eb->refs_lock);
+
+ spin_lock(&tree->buffer_lock);
+ radix_tree_delete(&tree->buffer,
+ eb->start >> PAGE_CACHE_SHIFT);
+ spin_unlock(&tree->buffer_lock);
+
+ /* Should be safe to release our pages at this point */
+ btrfs_release_extent_buffer_page(eb, 0);
+
+ call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
+ return;
+ }
+ spin_unlock(&eb->refs_lock);
+}
+
void free_extent_buffer(struct extent_buffer *eb)
{
if (!eb)
return;
- if (!atomic_dec_and_test(&eb->refs))
+ spin_lock(&eb->refs_lock);
+ if (atomic_read(&eb->refs) == 2 &&
+ test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&
+ !extent_buffer_under_io(eb) &&
+ test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
+ atomic_dec(&eb->refs);
+
+ /*
+ * I know this is terrible, but it's temporary until we stop tracking
+ * the uptodate bits and such for the extent buffers.
+ */
+ release_extent_buffer(eb, GFP_ATOMIC);
+}
+
+void free_extent_buffer_stale(struct extent_buffer *eb)
+{
+ if (!eb)
return;
- WARN_ON(1);
+ spin_lock(&eb->refs_lock);
+ set_bit(EXTENT_BUFFER_STALE, &eb->bflags);
+
+ if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) &&
+ test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
+ atomic_dec(&eb->refs);
+ release_extent_buffer(eb, GFP_NOFS);
}
-int clear_extent_buffer_dirty(struct extent_io_tree *tree,
- struct extent_buffer *eb)
+void clear_extent_buffer_dirty(struct extent_buffer *eb)
{
unsigned long i;
unsigned long num_pages;
@@ -3792,10 +4298,6 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
lock_page(page);
WARN_ON(!PagePrivate(page));
- set_page_extent_mapped(page);
- if (i == 0)
- set_page_extent_head(page, eb->len);
-
clear_page_dirty_for_io(page);
spin_lock_irq(&page->mapping->tree_lock);
if (!PageDirty(page)) {
@@ -3807,24 +4309,29 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
ClearPageError(page);
unlock_page(page);
}
- return 0;
+ WARN_ON(atomic_read(&eb->refs) == 0);
}
-int set_extent_buffer_dirty(struct extent_io_tree *tree,
- struct extent_buffer *eb)
+int set_extent_buffer_dirty(struct extent_buffer *eb)
{
unsigned long i;
unsigned long num_pages;
int was_dirty = 0;
+ check_buffer_tree_ref(eb);
+
was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
+
num_pages = num_extent_pages(eb->start, eb->len);
+ WARN_ON(atomic_read(&eb->refs) == 0);
+ WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
+
for (i = 0; i < num_pages; i++)
- __set_page_dirty_nobuffers(extent_buffer_page(eb, i));
+ set_page_dirty(extent_buffer_page(eb, i));
return was_dirty;
}
-static int __eb_straddles_pages(u64 start, u64 len)
+static int range_straddles_pages(u64 start, u64 len)
{
if (len < PAGE_CACHE_SIZE)
return 1;
@@ -3835,26 +4342,14 @@ static int __eb_straddles_pages(u64 start, u64 len)
return 0;
}
-static int eb_straddles_pages(struct extent_buffer *eb)
-{
- return __eb_straddles_pages(eb->start, eb->len);
-}
-
-int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
- struct extent_buffer *eb,
- struct extent_state **cached_state)
+int clear_extent_buffer_uptodate(struct extent_buffer *eb)
{
unsigned long i;
struct page *page;
unsigned long num_pages;
- num_pages = num_extent_pages(eb->start, eb->len);
clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
-
- if (eb_straddles_pages(eb)) {
- clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
- cached_state, GFP_NOFS);
- }
+ num_pages = num_extent_pages(eb->start, eb->len);
for (i = 0; i < num_pages; i++) {
page = extent_buffer_page(eb, i);
if (page)
@@ -3863,27 +4358,16 @@ int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
return 0;
}
-int set_extent_buffer_uptodate(struct extent_io_tree *tree,
- struct extent_buffer *eb)
+int set_extent_buffer_uptodate(struct extent_buffer *eb)
{
unsigned long i;
struct page *page;
unsigned long num_pages;
+ set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
num_pages = num_extent_pages(eb->start, eb->len);
-
- if (eb_straddles_pages(eb)) {
- set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
- NULL, GFP_NOFS);
- }
for (i = 0; i < num_pages; i++) {
page = extent_buffer_page(eb, i);
- if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
- ((i == num_pages - 1) &&
- ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
- check_page_uptodate(tree, page);
- continue;
- }
SetPageUptodate(page);
}
return 0;
@@ -3898,7 +4382,7 @@ int extent_range_uptodate(struct extent_io_tree *tree,
int uptodate;
unsigned long index;
- if (__eb_straddles_pages(start, end - start + 1)) {
+ if (range_straddles_pages(start, end - start + 1)) {
ret = test_range_bit(tree, start, end,
EXTENT_UPTODATE, 1, NULL);
if (ret)
@@ -3907,6 +4391,8 @@ int extent_range_uptodate(struct extent_io_tree *tree,
while (start <= end) {
index = start >> PAGE_CACHE_SHIFT;
page = find_get_page(tree->mapping, index);
+ if (!page)
+ return 1;
uptodate = PageUptodate(page);
page_cache_release(page);
if (!uptodate) {
@@ -3918,35 +4404,9 @@ int extent_range_uptodate(struct extent_io_tree *tree,
return pg_uptodate;
}
-int extent_buffer_uptodate(struct extent_io_tree *tree,
- struct extent_buffer *eb,
- struct extent_state *cached_state)
+int extent_buffer_uptodate(struct extent_buffer *eb)
{
- int ret = 0;
- unsigned long num_pages;
- unsigned long i;
- struct page *page;
- int pg_uptodate = 1;
-
- if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
- return 1;
-
- if (eb_straddles_pages(eb)) {
- ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
- EXTENT_UPTODATE, 1, cached_state);
- if (ret)
- return ret;
- }
-
- num_pages = num_extent_pages(eb->start, eb->len);
- for (i = 0; i < num_pages; i++) {
- page = extent_buffer_page(eb, i);
- if (!PageUptodate(page)) {
- pg_uptodate = 0;
- break;
- }
- }
- return pg_uptodate;
+ return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
}
int read_extent_buffer_pages(struct extent_io_tree *tree,
@@ -3960,21 +4420,14 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
int ret = 0;
int locked_pages = 0;
int all_uptodate = 1;
- int inc_all_pages = 0;
unsigned long num_pages;
+ unsigned long num_reads = 0;
struct bio *bio = NULL;
unsigned long bio_flags = 0;
if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
return 0;
- if (eb_straddles_pages(eb)) {
- if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
- EXTENT_UPTODATE, 1, NULL)) {
- return 0;
- }
- }
-
if (start) {
WARN_ON(start < eb->start);
start_i = (start >> PAGE_CACHE_SHIFT) -
@@ -3993,8 +4446,10 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
lock_page(page);
}
locked_pages++;
- if (!PageUptodate(page))
+ if (!PageUptodate(page)) {
+ num_reads++;
all_uptodate = 0;
+ }
}
if (all_uptodate) {
if (start_i == 0)
@@ -4002,20 +4457,12 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
goto unlock_exit;
}
+ clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
+ eb->failed_mirror = 0;
+ atomic_set(&eb->io_pages, num_reads);
for (i = start_i; i < num_pages; i++) {
page = extent_buffer_page(eb, i);
-
- WARN_ON(!PagePrivate(page));
-
- set_page_extent_mapped(page);
- if (i == 0)
- set_page_extent_head(page, eb->len);
-
- if (inc_all_pages)
- page_cache_get(page);
if (!PageUptodate(page)) {
- if (start_i == 0)
- inc_all_pages = 1;
ClearPageError(page);
err = __extent_read_full_page(tree, page,
get_extent, &bio,
@@ -4027,8 +4474,11 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
}
}
- if (bio)
- submit_one_bio(READ, bio, mirror_num, bio_flags);
+ if (bio) {
+ err = submit_one_bio(READ, bio, mirror_num, bio_flags);
+ if (err)
+ return err;
+ }
if (ret || wait != WAIT_COMPLETE)
return ret;
@@ -4040,8 +4490,6 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
ret = -EIO;
}
- if (!ret)
- set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
return ret;
unlock_exit:
@@ -4283,15 +4731,20 @@ static void copy_pages(struct page *dst_page, struct page *src_page,
{
char *dst_kaddr = page_address(dst_page);
char *src_kaddr;
+ int must_memmove = 0;
if (dst_page != src_page) {
src_kaddr = page_address(src_page);
} else {
src_kaddr = dst_kaddr;
- BUG_ON(areas_overlap(src_off, dst_off, len));
+ if (areas_overlap(src_off, dst_off, len))
+ must_memmove = 1;
}
- memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
+ if (must_memmove)
+ memmove(dst_kaddr + dst_off, src_kaddr + src_off, len);
+ else
+ memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
}
void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
@@ -4361,7 +4814,7 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
"len %lu len %lu\n", dst_offset, len, dst->len);
BUG_ON(1);
}
- if (!areas_overlap(src_offset, dst_offset, len)) {
+ if (dst_offset < src_offset) {
memcpy_extent_buffer(dst, dst_offset, src_offset, len);
return;
}
@@ -4387,47 +4840,48 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
}
}
-static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
-{
- struct extent_buffer *eb =
- container_of(head, struct extent_buffer, rcu_head);
-
- btrfs_release_extent_buffer(eb);
-}
-
-int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
+int try_release_extent_buffer(struct page *page, gfp_t mask)
{
- u64 start = page_offset(page);
struct extent_buffer *eb;
- int ret = 1;
- spin_lock(&tree->buffer_lock);
- eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
- if (!eb) {
- spin_unlock(&tree->buffer_lock);
- return ret;
+ /*
+ * We need to make sure noboody is attaching this page to an eb right
+ * now.
+ */
+ spin_lock(&page->mapping->private_lock);
+ if (!PagePrivate(page)) {
+ spin_unlock(&page->mapping->private_lock);
+ return 1;
}
- if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
- ret = 0;
- goto out;
- }
+ eb = (struct extent_buffer *)page->private;
+ BUG_ON(!eb);
/*
- * set @eb->refs to 0 if it is already 1, and then release the @eb.
- * Or go back.
+ * This is a little awful but should be ok, we need to make sure that
+ * the eb doesn't disappear out from under us while we're looking at
+ * this page.
*/
- if (atomic_cmpxchg(&eb->refs, 1, 0) != 1) {
- ret = 0;
- goto out;
+ spin_lock(&eb->refs_lock);
+ if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
+ spin_unlock(&eb->refs_lock);
+ spin_unlock(&page->mapping->private_lock);
+ return 0;
}
+ spin_unlock(&page->mapping->private_lock);
- radix_tree_delete(&tree->buffer, start >> PAGE_CACHE_SHIFT);
-out:
- spin_unlock(&tree->buffer_lock);
+ if ((mask & GFP_NOFS) == GFP_NOFS)
+ mask = GFP_NOFS;
- /* at this point we can safely release the extent buffer */
- if (atomic_read(&eb->refs) == 0)
- call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
- return ret;
+ /*
+ * If tree ref isn't set then we know the ref on this eb is a real ref,
+ * so just return, this page will likely be freed soon anyway.
+ */
+ if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
+ spin_unlock(&eb->refs_lock);
+ return 0;
+ }
+ release_extent_buffer(eb, mask);
+
+ return 1;
}
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 7604c300132..faf10eb57f7 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -35,6 +35,10 @@
#define EXTENT_BUFFER_DIRTY 2
#define EXTENT_BUFFER_CORRUPT 3
#define EXTENT_BUFFER_READAHEAD 4 /* this got triggered by readahead */
+#define EXTENT_BUFFER_TREE_REF 5
+#define EXTENT_BUFFER_STALE 6
+#define EXTENT_BUFFER_WRITEBACK 7
+#define EXTENT_BUFFER_IOERR 8
/* these are flags for extent_clear_unlock_delalloc */
#define EXTENT_CLEAR_UNLOCK_PAGE 0x1
@@ -54,6 +58,7 @@
#define EXTENT_PAGE_PRIVATE_FIRST_PAGE 3
struct extent_state;
+struct btrfs_root;
typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw,
struct bio *bio, int mirror_num,
@@ -69,9 +74,7 @@ struct extent_io_ops {
size_t size, struct bio *bio,
unsigned long bio_flags);
int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
- int (*readpage_io_failed_hook)(struct bio *bio, struct page *page,
- u64 start, u64 end, int failed_mirror,
- struct extent_state *state);
+ int (*readpage_io_failed_hook)(struct page *page, int failed_mirror);
int (*writepage_io_failed_hook)(struct bio *bio, struct page *page,
u64 start, u64 end,
struct extent_state *state);
@@ -97,6 +100,7 @@ struct extent_io_tree {
struct radix_tree_root buffer;
struct address_space *mapping;
u64 dirty_bytes;
+ int track_uptodate;
spinlock_t lock;
spinlock_t buffer_lock;
struct extent_io_ops *ops;
@@ -119,16 +123,22 @@ struct extent_state {
struct list_head leak_list;
};
+#define INLINE_EXTENT_BUFFER_PAGES 16
+#define MAX_INLINE_EXTENT_BUFFER_SIZE (INLINE_EXTENT_BUFFER_PAGES * PAGE_CACHE_SIZE)
struct extent_buffer {
u64 start;
unsigned long len;
unsigned long map_start;
unsigned long map_len;
- struct page *first_page;
unsigned long bflags;
+ struct extent_io_tree *tree;
+ spinlock_t refs_lock;
+ atomic_t refs;
+ atomic_t io_pages;
+ int failed_mirror;
struct list_head leak_list;
struct rcu_head rcu_head;
- atomic_t refs;
+ pid_t lock_owner;
/* count of read lock holders on the extent buffer */
atomic_t write_locks;
@@ -137,6 +147,7 @@ struct extent_buffer {
atomic_t blocking_readers;
atomic_t spinning_readers;
atomic_t spinning_writers;
+ int lock_nested;
/* protects write locks */
rwlock_t lock;
@@ -150,6 +161,9 @@ struct extent_buffer {
* to unlock
*/
wait_queue_head_t read_lock_wq;
+ wait_queue_head_t lock_wq;
+ struct page *inline_pages[INLINE_EXTENT_BUFFER_PAGES];
+ struct page **pages;
};
static inline void extent_set_compress_type(unsigned long *bio_flags,
@@ -176,18 +190,17 @@ void extent_io_tree_init(struct extent_io_tree *tree,
int try_release_extent_mapping(struct extent_map_tree *map,
struct extent_io_tree *tree, struct page *page,
gfp_t mask);
-int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page);
+int try_release_extent_buffer(struct page *page, gfp_t mask);
int try_release_extent_state(struct extent_map_tree *map,
struct extent_io_tree *tree, struct page *page,
gfp_t mask);
-int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
+int lock_extent(struct extent_io_tree *tree, u64 start, u64 end);
int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- int bits, struct extent_state **cached, gfp_t mask);
-int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
+ int bits, struct extent_state **cached);
+int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end);
int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached, gfp_t mask);
-int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
- gfp_t mask);
+int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end);
int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
get_extent_t *get_extent, int mirror_num);
int __init extent_io_init(void);
@@ -208,7 +221,7 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
int bits, gfp_t mask);
int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- int bits, int exclusive_bits, u64 *failed_start,
+ int bits, u64 *failed_start,
struct extent_state **cached_state, gfp_t mask);
int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached_state, gfp_t mask);
@@ -238,6 +251,8 @@ int extent_writepages(struct extent_io_tree *tree,
struct address_space *mapping,
get_extent_t *get_extent,
struct writeback_control *wbc);
+int btree_write_cache_pages(struct address_space *mapping,
+ struct writeback_control *wbc);
int extent_readpages(struct extent_io_tree *tree,
struct address_space *mapping,
struct list_head *pages, unsigned nr_pages,
@@ -249,11 +264,11 @@ int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private);
void set_page_extent_mapped(struct page *page);
struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
- u64 start, unsigned long len,
- struct page *page0);
+ u64 start, unsigned long len);
struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
u64 start, unsigned long len);
void free_extent_buffer(struct extent_buffer *eb);
+void free_extent_buffer_stale(struct extent_buffer *eb);
#define WAIT_NONE 0
#define WAIT_COMPLETE 1
#define WAIT_PAGE_LOCK 2
@@ -285,19 +300,12 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
unsigned long src_offset, unsigned long len);
void memset_extent_buffer(struct extent_buffer *eb, char c,
unsigned long start, unsigned long len);
-int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits);
-int clear_extent_buffer_dirty(struct extent_io_tree *tree,
- struct extent_buffer *eb);
-int set_extent_buffer_dirty(struct extent_io_tree *tree,
- struct extent_buffer *eb);
-int set_extent_buffer_uptodate(struct extent_io_tree *tree,
- struct extent_buffer *eb);
-int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
- struct extent_buffer *eb,
- struct extent_state **cached_state);
-int extent_buffer_uptodate(struct extent_io_tree *tree,
- struct extent_buffer *eb,
- struct extent_state *cached_state);
+void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits);
+void clear_extent_buffer_dirty(struct extent_buffer *eb);
+int set_extent_buffer_dirty(struct extent_buffer *eb);
+int set_extent_buffer_uptodate(struct extent_buffer *eb);
+int clear_extent_buffer_uptodate(struct extent_buffer *eb);
+int extent_buffer_uptodate(struct extent_buffer *eb);
int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset,
unsigned long min_len, char **map,
unsigned long *map_start,
@@ -317,4 +325,7 @@ struct btrfs_mapping_tree;
int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
u64 length, u64 logical, struct page *page,
int mirror_num);
+int end_extent_writepage(struct page *page, int err, u64 start, u64 end);
+int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
+ int mirror_num);
#endif
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 33a7890b1f4..1195f09761f 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -26,8 +26,8 @@ struct extent_map {
unsigned long flags;
struct block_device *bdev;
atomic_t refs;
- unsigned int in_tree:1;
- unsigned int compress_type:4;
+ unsigned int in_tree;
+ unsigned int compress_type;
};
struct extent_map_tree {
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index c7fb3a4247d..5d158d32023 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -25,10 +25,12 @@
#include "transaction.h"
#include "print-tree.h"
-#define MAX_CSUM_ITEMS(r, size) ((((BTRFS_LEAF_DATA_SIZE(r) - \
+#define __MAX_CSUM_ITEMS(r, size) ((((BTRFS_LEAF_DATA_SIZE(r) - \
sizeof(struct btrfs_item) * 2) / \
size) - 1))
+#define MAX_CSUM_ITEMS(r, size) (min(__MAX_CSUM_ITEMS(r, size), PAGE_CACHE_SIZE))
+
#define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \
sizeof(struct btrfs_ordered_sum)) / \
sizeof(struct btrfs_sector_sum) * \
@@ -59,7 +61,7 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
sizeof(*item));
if (ret < 0)
goto out;
- BUG_ON(ret);
+ BUG_ON(ret); /* Can't happen */
leaf = path->nodes[0];
item = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
@@ -284,6 +286,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
struct btrfs_ordered_sum *sums;
struct btrfs_sector_sum *sector_sum;
struct btrfs_csum_item *item;
+ LIST_HEAD(tmplist);
unsigned long offset;
int ret;
size_t size;
@@ -358,7 +361,10 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
MAX_ORDERED_SUM_BYTES(root));
sums = kzalloc(btrfs_ordered_sum_size(root, size),
GFP_NOFS);
- BUG_ON(!sums);
+ if (!sums) {
+ ret = -ENOMEM;
+ goto fail;
+ }
sector_sum = sums->sums;
sums->bytenr = start;
@@ -380,12 +386,19 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
offset += csum_size;
sector_sum++;
}
- list_add_tail(&sums->list, list);
+ list_add_tail(&sums->list, &tmplist);
}
path->slots[0]++;
}
ret = 0;
fail:
+ while (ret < 0 && !list_empty(&tmplist)) {
+ sums = list_entry(&tmplist, struct btrfs_ordered_sum, list);
+ list_del(&sums->list);
+ kfree(sums);
+ }
+ list_splice_tail(&tmplist, list);
+
btrfs_free_path(path);
return ret;
}
@@ -420,7 +433,7 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
offset = page_offset(bvec->bv_page) + bvec->bv_offset;
ordered = btrfs_lookup_ordered_extent(inode, offset);
- BUG_ON(!ordered);
+ BUG_ON(!ordered); /* Logic error */
sums->bytenr = ordered->start;
while (bio_index < bio->bi_vcnt) {
@@ -439,21 +452,21 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left),
GFP_NOFS);
- BUG_ON(!sums);
+ BUG_ON(!sums); /* -ENOMEM */
sector_sum = sums->sums;
sums->len = bytes_left;
ordered = btrfs_lookup_ordered_extent(inode, offset);
- BUG_ON(!ordered);
+ BUG_ON(!ordered); /* Logic error */
sums->bytenr = ordered->start;
}
- data = kmap_atomic(bvec->bv_page, KM_USER0);
+ data = kmap_atomic(bvec->bv_page);
sector_sum->sum = ~(u32)0;
sector_sum->sum = btrfs_csum_data(root,
data + bvec->bv_offset,
sector_sum->sum,
bvec->bv_len);
- kunmap_atomic(data, KM_USER0);
+ kunmap_atomic(data);
btrfs_csum_final(sector_sum->sum,
(char *)&sector_sum->sum);
sector_sum->bytenr = disk_bytenr;
@@ -483,18 +496,17 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
* This calls btrfs_truncate_item with the correct args based on the
* overlap, and fixes up the key as required.
*/
-static noinline int truncate_one_csum(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct btrfs_key *key,
- u64 bytenr, u64 len)
+static noinline void truncate_one_csum(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_key *key,
+ u64 bytenr, u64 len)
{
struct extent_buffer *leaf;
u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
u64 csum_end;
u64 end_byte = bytenr + len;
u32 blocksize_bits = root->fs_info->sb->s_blocksize_bits;
- int ret;
leaf = path->nodes[0];
csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size;
@@ -510,7 +522,7 @@ static noinline int truncate_one_csum(struct btrfs_trans_handle *trans,
*/
u32 new_size = (bytenr - key->offset) >> blocksize_bits;
new_size *= csum_size;
- ret = btrfs_truncate_item(trans, root, path, new_size, 1);
+ btrfs_truncate_item(trans, root, path, new_size, 1);
} else if (key->offset >= bytenr && csum_end > end_byte &&
end_byte > key->offset) {
/*
@@ -522,15 +534,13 @@ static noinline int truncate_one_csum(struct btrfs_trans_handle *trans,
u32 new_size = (csum_end - end_byte) >> blocksize_bits;
new_size *= csum_size;
- ret = btrfs_truncate_item(trans, root, path, new_size, 0);
+ btrfs_truncate_item(trans, root, path, new_size, 0);
key->offset = end_byte;
- ret = btrfs_set_item_key_safe(trans, root, path, key);
- BUG_ON(ret);
+ btrfs_set_item_key_safe(trans, root, path, key);
} else {
BUG();
}
- return 0;
}
/*
@@ -635,13 +645,14 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
* item changed size or key
*/
ret = btrfs_split_item(trans, root, path, &key, offset);
- BUG_ON(ret && ret != -EAGAIN);
+ if (ret && ret != -EAGAIN) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
key.offset = end_byte - 1;
} else {
- ret = truncate_one_csum(trans, root, path,
- &key, bytenr, len);
- BUG_ON(ret);
+ truncate_one_csum(trans, root, path, &key, bytenr, len);
if (key.offset < bytenr)
break;
}
@@ -772,7 +783,7 @@ again:
if (diff != csum_size)
goto insert;
- ret = btrfs_extend_item(trans, root, path, diff);
+ btrfs_extend_item(trans, root, path, diff);
goto csum;
}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 034d9850322..d83260d7498 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -452,7 +452,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
split = alloc_extent_map();
if (!split2)
split2 = alloc_extent_map();
- BUG_ON(!split || !split2);
+ BUG_ON(!split || !split2); /* -ENOMEM */
write_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, start, len);
@@ -494,7 +494,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
split->flags = flags;
split->compress_type = em->compress_type;
ret = add_extent_mapping(em_tree, split);
- BUG_ON(ret);
+ BUG_ON(ret); /* Logic error */
free_extent_map(split);
split = split2;
split2 = NULL;
@@ -520,7 +520,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
}
ret = add_extent_mapping(em_tree, split);
- BUG_ON(ret);
+ BUG_ON(ret); /* Logic error */
free_extent_map(split);
split = NULL;
}
@@ -678,8 +678,8 @@ next_slot:
disk_bytenr, num_bytes, 0,
root->root_key.objectid,
new_key.objectid,
- start - extent_offset);
- BUG_ON(ret);
+ start - extent_offset, 0);
+ BUG_ON(ret); /* -ENOMEM */
*hint_byte = disk_bytenr;
}
key.offset = start;
@@ -753,8 +753,8 @@ next_slot:
disk_bytenr, num_bytes, 0,
root->root_key.objectid,
key.objectid, key.offset -
- extent_offset);
- BUG_ON(ret);
+ extent_offset, 0);
+ BUG_ON(ret); /* -ENOMEM */
inode_sub_bytes(inode,
extent_end - key.offset);
*hint_byte = disk_bytenr;
@@ -770,7 +770,10 @@ next_slot:
ret = btrfs_del_items(trans, root, path, del_slot,
del_nr);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
del_nr = 0;
del_slot = 0;
@@ -782,11 +785,13 @@ next_slot:
BUG_ON(1);
}
- if (del_nr > 0) {
+ if (!ret && del_nr > 0) {
ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
- BUG_ON(ret);
+ if (ret)
+ btrfs_abort_transaction(trans, root, ret);
}
+out:
btrfs_free_path(path);
return ret;
}
@@ -944,7 +949,10 @@ again:
btrfs_release_path(path);
goto again;
}
- BUG_ON(ret < 0);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
leaf = path->nodes[0];
fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
@@ -962,8 +970,8 @@ again:
ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
root->root_key.objectid,
- ino, orig_offset);
- BUG_ON(ret);
+ ino, orig_offset, 0);
+ BUG_ON(ret); /* -ENOMEM */
if (split == start) {
key.offset = start;
@@ -989,8 +997,8 @@ again:
del_nr++;
ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
0, root->root_key.objectid,
- ino, orig_offset);
- BUG_ON(ret);
+ ino, orig_offset, 0);
+ BUG_ON(ret); /* -ENOMEM */
}
other_start = 0;
other_end = start;
@@ -1006,8 +1014,8 @@ again:
del_nr++;
ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
0, root->root_key.objectid,
- ino, orig_offset);
- BUG_ON(ret);
+ ino, orig_offset, 0);
+ BUG_ON(ret); /* -ENOMEM */
}
if (del_nr == 0) {
fi = btrfs_item_ptr(leaf, path->slots[0],
@@ -1025,7 +1033,10 @@ again:
btrfs_mark_buffer_dirty(leaf);
ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
- BUG_ON(ret);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
}
out:
btrfs_free_path(path);
@@ -1105,8 +1116,7 @@ again:
if (start_pos < inode->i_size) {
struct btrfs_ordered_extent *ordered;
lock_extent_bits(&BTRFS_I(inode)->io_tree,
- start_pos, last_pos - 1, 0, &cached_state,
- GFP_NOFS);
+ start_pos, last_pos - 1, 0, &cached_state);
ordered = btrfs_lookup_first_ordered_extent(inode,
last_pos - 1);
if (ordered &&
@@ -1274,7 +1284,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
dirty_pages);
if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
btrfs_btree_balance_dirty(root, 1);
- btrfs_throttle(root);
pos += copied;
num_written += copied;
@@ -1606,6 +1615,14 @@ static long btrfs_fallocate(struct file *file, int mode,
return -EOPNOTSUPP;
/*
+ * Make sure we have enough space before we do the
+ * allocation.
+ */
+ ret = btrfs_check_data_free_space(inode, len);
+ if (ret)
+ return ret;
+
+ /*
* wait for ordered IO before we have any locks. We'll loop again
* below with the locks held.
*/
@@ -1631,7 +1648,7 @@ static long btrfs_fallocate(struct file *file, int mode,
* transaction
*/
lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
- locked_end, 0, &cached_state, GFP_NOFS);
+ locked_end, 0, &cached_state);
ordered = btrfs_lookup_first_ordered_extent(inode,
alloc_end - 1);
if (ordered &&
@@ -1660,7 +1677,13 @@ static long btrfs_fallocate(struct file *file, int mode,
em = btrfs_get_extent(inode, NULL, 0, cur_offset,
alloc_end - cur_offset, 0);
- BUG_ON(IS_ERR_OR_NULL(em));
+ if (IS_ERR_OR_NULL(em)) {
+ if (!em)
+ ret = -ENOMEM;
+ else
+ ret = PTR_ERR(em);
+ break;
+ }
last_byte = min(extent_map_end(em), alloc_end);
actual_end = min_t(u64, extent_map_end(em), offset + len);
last_byte = (last_byte + mask) & ~mask;
@@ -1668,27 +1691,12 @@ static long btrfs_fallocate(struct file *file, int mode,
if (em->block_start == EXTENT_MAP_HOLE ||
(cur_offset >= inode->i_size &&
!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
-
- /*
- * Make sure we have enough space before we do the
- * allocation.
- */
- ret = btrfs_check_data_free_space(inode, last_byte -
- cur_offset);
- if (ret) {
- free_extent_map(em);
- break;
- }
-
ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
last_byte - cur_offset,
1 << inode->i_blkbits,
offset + len,
&alloc_hint);
- /* Let go of our reservation. */
- btrfs_free_reserved_data_space(inode, last_byte -
- cur_offset);
if (ret < 0) {
free_extent_map(em);
break;
@@ -1716,6 +1724,8 @@ static long btrfs_fallocate(struct file *file, int mode,
&cached_state, GFP_NOFS);
out:
mutex_unlock(&inode->i_mutex);
+ /* Let go of our reservation. */
+ btrfs_free_reserved_data_space(inode, len);
return ret;
}
@@ -1743,7 +1753,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int origin)
return -ENXIO;
lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 0,
- &cached_state, GFP_NOFS);
+ &cached_state);
/*
* Delalloc is such a pain. If we have a hole and we have pending
@@ -1762,7 +1772,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int origin)
start - root->sectorsize,
root->sectorsize, 0);
if (IS_ERR(em)) {
- ret = -ENXIO;
+ ret = PTR_ERR(em);
goto out;
}
last_end = em->start + em->len;
@@ -1774,7 +1784,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int origin)
while (1) {
em = btrfs_get_extent_fiemap(inode, NULL, 0, start, len, 0);
if (IS_ERR(em)) {
- ret = -ENXIO;
+ ret = PTR_ERR(em);
break;
}
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 9a897bf7953..e88330d3df5 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -230,11 +230,13 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
if (ret) {
trans->block_rsv = rsv;
- WARN_ON(1);
+ btrfs_abort_transaction(trans, root, ret);
return ret;
}
ret = btrfs_update_inode(trans, root, inode);
+ if (ret)
+ btrfs_abort_transaction(trans, root, ret);
trans->block_rsv = rsv;
return ret;
@@ -319,9 +321,11 @@ static void io_ctl_drop_pages(struct io_ctl *io_ctl)
io_ctl_unmap_page(io_ctl);
for (i = 0; i < io_ctl->num_pages; i++) {
- ClearPageChecked(io_ctl->pages[i]);
- unlock_page(io_ctl->pages[i]);
- page_cache_release(io_ctl->pages[i]);
+ if (io_ctl->pages[i]) {
+ ClearPageChecked(io_ctl->pages[i]);
+ unlock_page(io_ctl->pages[i]);
+ page_cache_release(io_ctl->pages[i]);
+ }
}
}
@@ -635,7 +639,10 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
if (!num_entries)
return 0;
- io_ctl_init(&io_ctl, inode, root);
+ ret = io_ctl_init(&io_ctl, inode, root);
+ if (ret)
+ return ret;
+
ret = readahead_cache(inode);
if (ret)
goto out;
@@ -772,6 +779,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
spin_lock(&block_group->lock);
if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
spin_unlock(&block_group->lock);
+ btrfs_free_path(path);
goto out;
}
spin_unlock(&block_group->lock);
@@ -838,7 +846,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
struct io_ctl io_ctl;
struct list_head bitmap_list;
struct btrfs_key key;
- u64 start, end, len;
+ u64 start, extent_start, extent_end, len;
int entries = 0;
int bitmaps = 0;
int ret;
@@ -849,7 +857,9 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
if (!i_size_read(inode))
return -1;
- io_ctl_init(&io_ctl, inode, root);
+ ret = io_ctl_init(&io_ctl, inode, root);
+ if (ret)
+ return -1;
/* Get the cluster for this block_group if it exists */
if (block_group && !list_empty(&block_group->cluster_list))
@@ -857,24 +867,11 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
struct btrfs_free_cluster,
block_group_list);
- /*
- * We shouldn't have switched the pinned extents yet so this is the
- * right one
- */
- unpin = root->fs_info->pinned_extents;
-
/* Lock all pages first so we can lock the extent safely. */
io_ctl_prepare_pages(&io_ctl, inode, 0);
lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
- 0, &cached_state, GFP_NOFS);
-
- /*
- * When searching for pinned extents, we need to start at our start
- * offset.
- */
- if (block_group)
- start = block_group->key.objectid;
+ 0, &cached_state);
node = rb_first(&ctl->free_space_offset);
if (!node && cluster) {
@@ -918,9 +915,20 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
* We want to add any pinned extents to our free space cache
* so we don't leak the space
*/
+
+ /*
+ * We shouldn't have switched the pinned extents yet so this is the
+ * right one
+ */
+ unpin = root->fs_info->pinned_extents;
+
+ if (block_group)
+ start = block_group->key.objectid;
+
while (block_group && (start < block_group->key.objectid +
block_group->key.offset)) {
- ret = find_first_extent_bit(unpin, start, &start, &end,
+ ret = find_first_extent_bit(unpin, start,
+ &extent_start, &extent_end,
EXTENT_DIRTY);
if (ret) {
ret = 0;
@@ -928,20 +936,21 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
}
/* This pinned extent is out of our range */
- if (start >= block_group->key.objectid +
+ if (extent_start >= block_group->key.objectid +
block_group->key.offset)
break;
- len = block_group->key.objectid +
- block_group->key.offset - start;
- len = min(len, end + 1 - start);
+ extent_start = max(extent_start, start);
+ extent_end = min(block_group->key.objectid +
+ block_group->key.offset, extent_end + 1);
+ len = extent_end - extent_start;
entries++;
- ret = io_ctl_add_entry(&io_ctl, start, len, NULL);
+ ret = io_ctl_add_entry(&io_ctl, extent_start, len, NULL);
if (ret)
goto out_nospc;
- start = end + 1;
+ start = extent_end;
}
/* Write out the bitmaps */
@@ -1061,7 +1070,7 @@ int btrfs_write_out_cache(struct btrfs_root *root,
spin_unlock(&block_group->lock);
ret = 0;
#ifdef DEBUG
- printk(KERN_ERR "btrfs: failed to write free space cace "
+ printk(KERN_ERR "btrfs: failed to write free space cache "
"for block group %llu\n", block_group->key.objectid);
#endif
}
@@ -1941,14 +1950,14 @@ again:
*/
ret = btrfs_add_free_space(block_group, old_start,
offset - old_start);
- WARN_ON(ret);
+ WARN_ON(ret); /* -ENOMEM */
goto out;
}
ret = remove_from_bitmap(ctl, info, &offset, &bytes);
if (ret == -EAGAIN)
goto again;
- BUG_ON(ret);
+ BUG_ON(ret); /* logic error */
out_lock:
spin_unlock(&ctl->tree_lock);
out:
@@ -2236,7 +2245,7 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
if (entry->bitmap) {
ret = btrfs_alloc_from_bitmap(block_group,
cluster, entry, bytes,
- min_start);
+ cluster->window_start);
if (ret == 0) {
node = rb_next(&entry->offset_index);
if (!node)
@@ -2245,6 +2254,7 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
offset_index);
continue;
}
+ cluster->window_start += bytes;
} else {
ret = entry->offset;
@@ -2283,23 +2293,23 @@ out:
static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
struct btrfs_free_space *entry,
struct btrfs_free_cluster *cluster,
- u64 offset, u64 bytes, u64 min_bytes)
+ u64 offset, u64 bytes,
+ u64 cont1_bytes, u64 min_bytes)
{
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
unsigned long next_zero;
unsigned long i;
- unsigned long search_bits;
- unsigned long total_bits;
+ unsigned long want_bits;
+ unsigned long min_bits;
unsigned long found_bits;
unsigned long start = 0;
unsigned long total_found = 0;
int ret;
- bool found = false;
i = offset_to_bit(entry->offset, block_group->sectorsize,
max_t(u64, offset, entry->offset));
- search_bits = bytes_to_bits(bytes, block_group->sectorsize);
- total_bits = bytes_to_bits(min_bytes, block_group->sectorsize);
+ want_bits = bytes_to_bits(bytes, block_group->sectorsize);
+ min_bits = bytes_to_bits(min_bytes, block_group->sectorsize);
again:
found_bits = 0;
@@ -2308,7 +2318,7 @@ again:
i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) {
next_zero = find_next_zero_bit(entry->bitmap,
BITS_PER_BITMAP, i);
- if (next_zero - i >= search_bits) {
+ if (next_zero - i >= min_bits) {
found_bits = next_zero - i;
break;
}
@@ -2318,10 +2328,9 @@ again:
if (!found_bits)
return -ENOSPC;
- if (!found) {
+ if (!total_found) {
start = i;
cluster->max_size = 0;
- found = true;
}
total_found += found_bits;
@@ -2329,13 +2338,8 @@ again:
if (cluster->max_size < found_bits * block_group->sectorsize)
cluster->max_size = found_bits * block_group->sectorsize;
- if (total_found < total_bits) {
- i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, next_zero);
- if (i - start > total_bits * 2) {
- total_found = 0;
- cluster->max_size = 0;
- found = false;
- }
+ if (total_found < want_bits || cluster->max_size < cont1_bytes) {
+ i = next_zero + 1;
goto again;
}
@@ -2344,30 +2348,33 @@ again:
rb_erase(&entry->offset_index, &ctl->free_space_offset);
ret = tree_insert_offset(&cluster->root, entry->offset,
&entry->offset_index, 1);
- BUG_ON(ret);
+ BUG_ON(ret); /* -EEXIST; Logic error */
+ trace_btrfs_setup_cluster(block_group, cluster,
+ total_found * block_group->sectorsize, 1);
return 0;
}
/*
* This searches the block group for just extents to fill the cluster with.
+ * Try to find a cluster with at least bytes total bytes, at least one
+ * extent of cont1_bytes, and other clusters of at least min_bytes.
*/
static noinline int
setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
struct btrfs_free_cluster *cluster,
struct list_head *bitmaps, u64 offset, u64 bytes,
- u64 min_bytes)
+ u64 cont1_bytes, u64 min_bytes)
{
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
struct btrfs_free_space *first = NULL;
struct btrfs_free_space *entry = NULL;
- struct btrfs_free_space *prev = NULL;
struct btrfs_free_space *last;
struct rb_node *node;
u64 window_start;
u64 window_free;
u64 max_extent;
- u64 max_gap = 128 * 1024;
+ u64 total_size = 0;
entry = tree_search_offset(ctl, offset, 0, 1);
if (!entry)
@@ -2377,8 +2384,8 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
* We don't want bitmaps, so just move along until we find a normal
* extent entry.
*/
- while (entry->bitmap) {
- if (list_empty(&entry->list))
+ while (entry->bitmap || entry->bytes < min_bytes) {
+ if (entry->bitmap && list_empty(&entry->list))
list_add_tail(&entry->list, bitmaps);
node = rb_next(&entry->offset_index);
if (!node)
@@ -2391,12 +2398,9 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
max_extent = entry->bytes;
first = entry;
last = entry;
- prev = entry;
- while (window_free <= min_bytes) {
- node = rb_next(&entry->offset_index);
- if (!node)
- return -ENOSPC;
+ for (node = rb_next(&entry->offset_index); node;
+ node = rb_next(&entry->offset_index)) {
entry = rb_entry(node, struct btrfs_free_space, offset_index);
if (entry->bitmap) {
@@ -2405,26 +2409,18 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
continue;
}
- /*
- * we haven't filled the empty size and the window is
- * very large. reset and try again
- */
- if (entry->offset - (prev->offset + prev->bytes) > max_gap ||
- entry->offset - window_start > (min_bytes * 2)) {
- first = entry;
- window_start = entry->offset;
- window_free = entry->bytes;
- last = entry;
+ if (entry->bytes < min_bytes)
+ continue;
+
+ last = entry;
+ window_free += entry->bytes;
+ if (entry->bytes > max_extent)
max_extent = entry->bytes;
- } else {
- last = entry;
- window_free += entry->bytes;
- if (entry->bytes > max_extent)
- max_extent = entry->bytes;
- }
- prev = entry;
}
+ if (window_free < bytes || max_extent < cont1_bytes)
+ return -ENOSPC;
+
cluster->window_start = first->offset;
node = &first->offset_index;
@@ -2438,17 +2434,18 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
entry = rb_entry(node, struct btrfs_free_space, offset_index);
node = rb_next(&entry->offset_index);
- if (entry->bitmap)
+ if (entry->bitmap || entry->bytes < min_bytes)
continue;
rb_erase(&entry->offset_index, &ctl->free_space_offset);
ret = tree_insert_offset(&cluster->root, entry->offset,
&entry->offset_index, 0);
- BUG_ON(ret);
+ total_size += entry->bytes;
+ BUG_ON(ret); /* -EEXIST; Logic error */
} while (node && entry != last);
cluster->max_size = max_extent;
-
+ trace_btrfs_setup_cluster(block_group, cluster, total_size, 0);
return 0;
}
@@ -2460,7 +2457,7 @@ static noinline int
setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
struct btrfs_free_cluster *cluster,
struct list_head *bitmaps, u64 offset, u64 bytes,
- u64 min_bytes)
+ u64 cont1_bytes, u64 min_bytes)
{
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
struct btrfs_free_space *entry;
@@ -2482,10 +2479,10 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
}
list_for_each_entry(entry, bitmaps, list) {
- if (entry->bytes < min_bytes)
+ if (entry->bytes < bytes)
continue;
ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
- bytes, min_bytes);
+ bytes, cont1_bytes, min_bytes);
if (!ret)
return 0;
}
@@ -2499,7 +2496,7 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
/*
* here we try to find a cluster of blocks in a block group. The goal
- * is to find at least bytes free and up to empty_size + bytes free.
+ * is to find at least bytes+empty_size.
* We might not find them all in one contiguous area.
*
* returns zero and sets up cluster if things worked out, otherwise
@@ -2515,23 +2512,24 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
struct btrfs_free_space *entry, *tmp;
LIST_HEAD(bitmaps);
u64 min_bytes;
+ u64 cont1_bytes;
int ret;
- /* for metadata, allow allocates with more holes */
+ /*
+ * Choose the minimum extent size we'll require for this
+ * cluster. For SSD_SPREAD, don't allow any fragmentation.
+ * For metadata, allow allocates with smaller extents. For
+ * data, keep it dense.
+ */
if (btrfs_test_opt(root, SSD_SPREAD)) {
- min_bytes = bytes + empty_size;
+ cont1_bytes = min_bytes = bytes + empty_size;
} else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
- /*
- * we want to do larger allocations when we are
- * flushing out the delayed refs, it helps prevent
- * making more work as we go along.
- */
- if (trans->transaction->delayed_refs.flushing)
- min_bytes = max(bytes, (bytes + empty_size) >> 1);
- else
- min_bytes = max(bytes, (bytes + empty_size) >> 4);
- } else
- min_bytes = max(bytes, (bytes + empty_size) >> 2);
+ cont1_bytes = bytes;
+ min_bytes = block_group->sectorsize;
+ } else {
+ cont1_bytes = max(bytes, (bytes + empty_size) >> 2);
+ min_bytes = block_group->sectorsize;
+ }
spin_lock(&ctl->tree_lock);
@@ -2539,7 +2537,7 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
* If we know we don't have enough space to make a cluster don't even
* bother doing all the work to try and find one.
*/
- if (ctl->free_space < min_bytes) {
+ if (ctl->free_space < bytes) {
spin_unlock(&ctl->tree_lock);
return -ENOSPC;
}
@@ -2552,11 +2550,17 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
goto out;
}
+ trace_btrfs_find_cluster(block_group, offset, bytes, empty_size,
+ min_bytes);
+
+ INIT_LIST_HEAD(&bitmaps);
ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
- bytes, min_bytes);
+ bytes + empty_size,
+ cont1_bytes, min_bytes);
if (ret)
ret = setup_cluster_bitmap(block_group, cluster, &bitmaps,
- offset, bytes, min_bytes);
+ offset, bytes + empty_size,
+ cont1_bytes, min_bytes);
/* Clear our temporary list */
list_for_each_entry_safe(entry, tmp, &bitmaps, list)
@@ -2567,6 +2571,8 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
list_add_tail(&cluster->block_group_list,
&block_group->cluster_list);
cluster->block_group = block_group;
+ } else {
+ trace_btrfs_failed_cluster_setup(block_group);
}
out:
spin_unlock(&cluster->lock);
@@ -2588,17 +2594,57 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
cluster->block_group = NULL;
}
-int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
- u64 *trimmed, u64 start, u64 end, u64 minlen)
+static int do_trimming(struct btrfs_block_group_cache *block_group,
+ u64 *total_trimmed, u64 start, u64 bytes,
+ u64 reserved_start, u64 reserved_bytes)
{
- struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
- struct btrfs_free_space *entry = NULL;
+ struct btrfs_space_info *space_info = block_group->space_info;
struct btrfs_fs_info *fs_info = block_group->fs_info;
- u64 bytes = 0;
- u64 actually_trimmed;
- int ret = 0;
+ int ret;
+ int update = 0;
+ u64 trimmed = 0;
- *trimmed = 0;
+ spin_lock(&space_info->lock);
+ spin_lock(&block_group->lock);
+ if (!block_group->ro) {
+ block_group->reserved += reserved_bytes;
+ space_info->bytes_reserved += reserved_bytes;
+ update = 1;
+ }
+ spin_unlock(&block_group->lock);
+ spin_unlock(&space_info->lock);
+
+ ret = btrfs_error_discard_extent(fs_info->extent_root,
+ start, bytes, &trimmed);
+ if (!ret)
+ *total_trimmed += trimmed;
+
+ btrfs_add_free_space(block_group, reserved_start, reserved_bytes);
+
+ if (update) {
+ spin_lock(&space_info->lock);
+ spin_lock(&block_group->lock);
+ if (block_group->ro)
+ space_info->bytes_readonly += reserved_bytes;
+ block_group->reserved -= reserved_bytes;
+ space_info->bytes_reserved -= reserved_bytes;
+ spin_unlock(&space_info->lock);
+ spin_unlock(&block_group->lock);
+ }
+
+ return ret;
+}
+
+static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
+ u64 *total_trimmed, u64 start, u64 end, u64 minlen)
+{
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ struct btrfs_free_space *entry;
+ struct rb_node *node;
+ int ret = 0;
+ u64 extent_start;
+ u64 extent_bytes;
+ u64 bytes;
while (start < end) {
spin_lock(&ctl->tree_lock);
@@ -2609,81 +2655,118 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
}
entry = tree_search_offset(ctl, start, 0, 1);
- if (!entry)
- entry = tree_search_offset(ctl,
- offset_to_bitmap(ctl, start),
- 1, 1);
-
- if (!entry || entry->offset >= end) {
+ if (!entry) {
spin_unlock(&ctl->tree_lock);
break;
}
- if (entry->bitmap) {
- ret = search_bitmap(ctl, entry, &start, &bytes);
- if (!ret) {
- if (start >= end) {
- spin_unlock(&ctl->tree_lock);
- break;
- }
- bytes = min(bytes, end - start);
- bitmap_clear_bits(ctl, entry, start, bytes);
- if (entry->bytes == 0)
- free_bitmap(ctl, entry);
- } else {
- start = entry->offset + BITS_PER_BITMAP *
- block_group->sectorsize;
+ /* skip bitmaps */
+ while (entry->bitmap) {
+ node = rb_next(&entry->offset_index);
+ if (!node) {
spin_unlock(&ctl->tree_lock);
- ret = 0;
- continue;
+ goto out;
}
- } else {
- start = entry->offset;
- bytes = min(entry->bytes, end - start);
- unlink_free_space(ctl, entry);
- kmem_cache_free(btrfs_free_space_cachep, entry);
+ entry = rb_entry(node, struct btrfs_free_space,
+ offset_index);
+ }
+
+ if (entry->offset >= end) {
+ spin_unlock(&ctl->tree_lock);
+ break;
}
+ extent_start = entry->offset;
+ extent_bytes = entry->bytes;
+ start = max(start, extent_start);
+ bytes = min(extent_start + extent_bytes, end) - start;
+ if (bytes < minlen) {
+ spin_unlock(&ctl->tree_lock);
+ goto next;
+ }
+
+ unlink_free_space(ctl, entry);
+ kmem_cache_free(btrfs_free_space_cachep, entry);
+
spin_unlock(&ctl->tree_lock);
- if (bytes >= minlen) {
- struct btrfs_space_info *space_info;
- int update = 0;
-
- space_info = block_group->space_info;
- spin_lock(&space_info->lock);
- spin_lock(&block_group->lock);
- if (!block_group->ro) {
- block_group->reserved += bytes;
- space_info->bytes_reserved += bytes;
- update = 1;
- }
- spin_unlock(&block_group->lock);
- spin_unlock(&space_info->lock);
-
- ret = btrfs_error_discard_extent(fs_info->extent_root,
- start,
- bytes,
- &actually_trimmed);
-
- btrfs_add_free_space(block_group, start, bytes);
- if (update) {
- spin_lock(&space_info->lock);
- spin_lock(&block_group->lock);
- if (block_group->ro)
- space_info->bytes_readonly += bytes;
- block_group->reserved -= bytes;
- space_info->bytes_reserved -= bytes;
- spin_unlock(&space_info->lock);
- spin_unlock(&block_group->lock);
- }
+ ret = do_trimming(block_group, total_trimmed, start, bytes,
+ extent_start, extent_bytes);
+ if (ret)
+ break;
+next:
+ start += bytes;
- if (ret)
- break;
- *trimmed += actually_trimmed;
+ if (fatal_signal_pending(current)) {
+ ret = -ERESTARTSYS;
+ break;
+ }
+
+ cond_resched();
+ }
+out:
+ return ret;
+}
+
+static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
+ u64 *total_trimmed, u64 start, u64 end, u64 minlen)
+{
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ struct btrfs_free_space *entry;
+ int ret = 0;
+ int ret2;
+ u64 bytes;
+ u64 offset = offset_to_bitmap(ctl, start);
+
+ while (offset < end) {
+ bool next_bitmap = false;
+
+ spin_lock(&ctl->tree_lock);
+
+ if (ctl->free_space < minlen) {
+ spin_unlock(&ctl->tree_lock);
+ break;
+ }
+
+ entry = tree_search_offset(ctl, offset, 1, 0);
+ if (!entry) {
+ spin_unlock(&ctl->tree_lock);
+ next_bitmap = true;
+ goto next;
+ }
+
+ bytes = minlen;
+ ret2 = search_bitmap(ctl, entry, &start, &bytes);
+ if (ret2 || start >= end) {
+ spin_unlock(&ctl->tree_lock);
+ next_bitmap = true;
+ goto next;
+ }
+
+ bytes = min(bytes, end - start);
+ if (bytes < minlen) {
+ spin_unlock(&ctl->tree_lock);
+ goto next;
+ }
+
+ bitmap_clear_bits(ctl, entry, start, bytes);
+ if (entry->bytes == 0)
+ free_bitmap(ctl, entry);
+
+ spin_unlock(&ctl->tree_lock);
+
+ ret = do_trimming(block_group, total_trimmed, start, bytes,
+ start, bytes);
+ if (ret)
+ break;
+next:
+ if (next_bitmap) {
+ offset += BITS_PER_BITMAP * ctl->unit;
+ } else {
+ start += bytes;
+ if (start >= offset + BITS_PER_BITMAP * ctl->unit)
+ offset += BITS_PER_BITMAP * ctl->unit;
}
- start += bytes;
- bytes = 0;
if (fatal_signal_pending(current)) {
ret = -ERESTARTSYS;
@@ -2696,6 +2779,22 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
return ret;
}
+int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
+ u64 *trimmed, u64 start, u64 end, u64 minlen)
+{
+ int ret;
+
+ *trimmed = 0;
+
+ ret = trim_no_bitmap(block_group, trimmed, start, end, minlen);
+ if (ret)
+ return ret;
+
+ ret = trim_bitmaps(block_group, trimmed, start, end, minlen);
+
+ return ret;
+}
+
/*
* Find the left-most item in the cache tree, and then return the
* smallest inode number in the item.
@@ -2733,6 +2832,7 @@ u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root)
int ret;
ret = search_bitmap(ctl, entry, &offset, &count);
+ /* Logic error; Should be empty if it can't find anything */
BUG_ON(ret);
ino = offset;
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index baa74f3db69..a13cf1a96c7 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -19,6 +19,7 @@
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
+#include "print-tree.h"
static int find_name_in_backref(struct btrfs_path *path, const char *name,
int name_len, struct btrfs_inode_ref **ref_ret)
@@ -128,13 +129,14 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
item_start = btrfs_item_ptr_offset(leaf, path->slots[0]);
memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
item_size - (ptr + sub_item_len - item_start));
- ret = btrfs_truncate_item(trans, root, path,
+ btrfs_truncate_item(trans, root, path,
item_size - sub_item_len, 1);
out:
btrfs_free_path(path);
return ret;
}
+/* Will return 0, -ENOMEM, -EMLINK, or -EEXIST or anything from the CoW path */
int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
const char *name, int name_len,
@@ -165,7 +167,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
goto out;
old_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
- ret = btrfs_extend_item(trans, root, path, ins_len);
+ btrfs_extend_item(trans, root, path, ins_len);
ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_inode_ref);
ref = (struct btrfs_inode_ref *)((unsigned long)ref + old_size);
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index f8962a957d6..b1a1c929ba8 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -178,7 +178,7 @@ static void start_caching(struct btrfs_root *root)
tsk = kthread_run(caching_kthread, root, "btrfs-ino-cache-%llu\n",
root->root_key.objectid);
- BUG_ON(IS_ERR(tsk));
+ BUG_ON(IS_ERR(tsk)); /* -ENOMEM */
}
int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid)
@@ -271,7 +271,7 @@ void btrfs_unpin_free_ino(struct btrfs_root *root)
break;
info = rb_entry(n, struct btrfs_free_space, offset_index);
- BUG_ON(info->bitmap);
+ BUG_ON(info->bitmap); /* Logic error */
if (info->offset > root->cache_progress)
goto free;
@@ -438,15 +438,17 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
trans->bytes_reserved);
if (ret)
goto out;
+ trace_btrfs_space_reservation(root->fs_info, "ino_cache",
+ trans->transid, trans->bytes_reserved, 1);
again:
inode = lookup_free_ino_inode(root, path);
- if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
+ if (IS_ERR(inode) && (PTR_ERR(inode) != -ENOENT || retry)) {
ret = PTR_ERR(inode);
goto out_release;
}
if (IS_ERR(inode)) {
- BUG_ON(retry);
+ BUG_ON(retry); /* Logic error */
retry = true;
ret = create_free_ino_inode(root, trans, path);
@@ -457,12 +459,17 @@ again:
BTRFS_I(inode)->generation = 0;
ret = btrfs_update_inode(trans, root, inode);
- WARN_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out_put;
+ }
if (i_size_read(inode) > 0) {
ret = btrfs_truncate_free_space_cache(root, trans, path, inode);
- if (ret)
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
goto out_put;
+ }
}
spin_lock(&root->cache_lock);
@@ -498,6 +505,8 @@ again:
out_put:
iput(inode);
out_release:
+ trace_btrfs_space_reservation(root->fs_info, "ino_cache",
+ trans->transid, trans->bytes_reserved, 0);
btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
out:
trans->block_rsv = rsv;
@@ -526,7 +535,7 @@ static int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid)
ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
if (ret < 0)
goto error;
- BUG_ON(ret == 0);
+ BUG_ON(ret == 0); /* Corruption */
if (path->slots[0] > 0) {
slot = path->slots[0] - 1;
l = path->nodes[0];
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 81b235a61f8..115bc05e42b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -150,7 +150,6 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
inode_add_bytes(inode, size);
ret = btrfs_insert_empty_item(trans, root, path, &key,
datasize);
- BUG_ON(ret);
if (ret) {
err = ret;
goto fail;
@@ -173,9 +172,9 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
cur_size = min_t(unsigned long, compressed_size,
PAGE_CACHE_SIZE);
- kaddr = kmap_atomic(cpage, KM_USER0);
+ kaddr = kmap_atomic(cpage);
write_extent_buffer(leaf, kaddr, ptr, cur_size);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
i++;
ptr += cur_size;
@@ -187,10 +186,10 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
page = find_get_page(inode->i_mapping,
start >> PAGE_CACHE_SHIFT);
btrfs_set_file_extent_compression(leaf, ei, 0);
- kaddr = kmap_atomic(page, KM_USER0);
+ kaddr = kmap_atomic(page);
offset = start & (PAGE_CACHE_SIZE - 1);
write_extent_buffer(leaf, kaddr + offset, ptr, size);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
page_cache_release(page);
}
btrfs_mark_buffer_dirty(leaf);
@@ -206,9 +205,9 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
* could end up racing with unlink.
*/
BTRFS_I(inode)->disk_i_size = inode->i_size;
- btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, root, inode);
- return 0;
+ return ret;
fail:
btrfs_free_path(path);
return err;
@@ -250,14 +249,18 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
ret = btrfs_drop_extents(trans, inode, start, aligned_end,
&hint_byte, 1);
- BUG_ON(ret);
+ if (ret)
+ return ret;
if (isize > actual_end)
inline_len = min_t(u64, isize, actual_end);
ret = insert_inline_extent(trans, root, inode, start,
inline_len, compressed_size,
compress_type, compressed_pages);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ return ret;
+ }
btrfs_delalloc_release_metadata(inode, end + 1 - start);
btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
return 0;
@@ -293,7 +296,7 @@ static noinline int add_async_extent(struct async_cow *cow,
struct async_extent *async_extent;
async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
- BUG_ON(!async_extent);
+ BUG_ON(!async_extent); /* -ENOMEM */
async_extent->start = start;
async_extent->ram_size = ram_size;
async_extent->compressed_size = compressed_size;
@@ -344,8 +347,9 @@ static noinline int compress_file_range(struct inode *inode,
int will_compress;
int compress_type = root->fs_info->compress_type;
- /* if this is a small write inside eof, kick off a defragbot */
- if (end <= BTRFS_I(inode)->disk_i_size && (end - start + 1) < 16 * 1024)
+ /* if this is a small write inside eof, kick off a defrag */
+ if ((end - start + 1) < 16 * 1024 &&
+ (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
btrfs_add_inode_defrag(NULL, inode);
actual_end = min_t(u64, isize, end + 1);
@@ -422,10 +426,10 @@ again:
* sending it down to disk
*/
if (offset) {
- kaddr = kmap_atomic(page, KM_USER0);
+ kaddr = kmap_atomic(page);
memset(kaddr + offset, 0,
PAGE_CACHE_SIZE - offset);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
}
will_compress = 1;
}
@@ -433,7 +437,11 @@ again:
cont:
if (start == 0) {
trans = btrfs_join_transaction(root);
- BUG_ON(IS_ERR(trans));
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ trans = NULL;
+ goto cleanup_and_out;
+ }
trans->block_rsv = &root->fs_info->delalloc_block_rsv;
/* lets try to make an inline extent */
@@ -450,11 +458,11 @@ cont:
total_compressed,
compress_type, pages);
}
- if (ret == 0) {
+ if (ret <= 0) {
/*
- * inline extent creation worked, we don't need
- * to create any more async work items. Unlock
- * and free up our temp pages.
+ * inline extent creation worked or returned error,
+ * we don't need to create any more async work items.
+ * Unlock and free up our temp pages.
*/
extent_clear_unlock_delalloc(inode,
&BTRFS_I(inode)->io_tree,
@@ -547,7 +555,7 @@ cleanup_and_bail_uncompressed:
}
out:
- return 0;
+ return ret;
free_pages_out:
for (i = 0; i < nr_pages_ret; i++) {
@@ -557,6 +565,20 @@ free_pages_out:
kfree(pages);
goto out;
+
+cleanup_and_out:
+ extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
+ start, end, NULL,
+ EXTENT_CLEAR_UNLOCK_PAGE |
+ EXTENT_CLEAR_DIRTY |
+ EXTENT_CLEAR_DELALLOC |
+ EXTENT_SET_WRITEBACK |
+ EXTENT_END_WRITEBACK);
+ if (!trans || IS_ERR(trans))
+ btrfs_error(root->fs_info, ret, "Failed to join transaction");
+ else
+ btrfs_abort_transaction(trans, root, ret);
+ goto free_pages_out;
}
/*
@@ -597,7 +619,7 @@ retry:
lock_extent(io_tree, async_extent->start,
async_extent->start +
- async_extent->ram_size - 1, GFP_NOFS);
+ async_extent->ram_size - 1);
/* allocate blocks */
ret = cow_file_range(inode, async_cow->locked_page,
@@ -606,6 +628,8 @@ retry:
async_extent->ram_size - 1,
&page_started, &nr_written, 0);
+ /* JDM XXX */
+
/*
* if page_started, cow_file_range inserted an
* inline extent and took care of all the unlocking
@@ -625,18 +649,21 @@ retry:
}
lock_extent(io_tree, async_extent->start,
- async_extent->start + async_extent->ram_size - 1,
- GFP_NOFS);
+ async_extent->start + async_extent->ram_size - 1);
trans = btrfs_join_transaction(root);
- BUG_ON(IS_ERR(trans));
- trans->block_rsv = &root->fs_info->delalloc_block_rsv;
- ret = btrfs_reserve_extent(trans, root,
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ } else {
+ trans->block_rsv = &root->fs_info->delalloc_block_rsv;
+ ret = btrfs_reserve_extent(trans, root,
async_extent->compressed_size,
async_extent->compressed_size,
- 0, alloc_hint,
- (u64)-1, &ins, 1);
- btrfs_end_transaction(trans, root);
+ 0, alloc_hint, &ins, 1);
+ if (ret)
+ btrfs_abort_transaction(trans, root, ret);
+ btrfs_end_transaction(trans, root);
+ }
if (ret) {
int i;
@@ -649,8 +676,10 @@ retry:
async_extent->pages = NULL;
unlock_extent(io_tree, async_extent->start,
async_extent->start +
- async_extent->ram_size - 1, GFP_NOFS);
- goto retry;
+ async_extent->ram_size - 1);
+ if (ret == -ENOSPC)
+ goto retry;
+ goto out_free; /* JDM: Requeue? */
}
/*
@@ -662,7 +691,7 @@ retry:
async_extent->ram_size - 1, 0);
em = alloc_extent_map();
- BUG_ON(!em);
+ BUG_ON(!em); /* -ENOMEM */
em->start = async_extent->start;
em->len = async_extent->ram_size;
em->orig_start = em->start;
@@ -694,7 +723,7 @@ retry:
ins.offset,
BTRFS_ORDERED_COMPRESSED,
async_extent->compress_type);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
/*
* clear dirty, set writeback and unlock the pages.
@@ -716,13 +745,17 @@ retry:
ins.offset, async_extent->pages,
async_extent->nr_pages);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
alloc_hint = ins.objectid + ins.offset;
kfree(async_extent);
cond_resched();
}
-
- return 0;
+ ret = 0;
+out:
+ return ret;
+out_free:
+ kfree(async_extent);
+ goto out;
}
static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
@@ -791,7 +824,18 @@ static noinline int cow_file_range(struct inode *inode,
BUG_ON(btrfs_is_free_space_inode(root, inode));
trans = btrfs_join_transaction(root);
- BUG_ON(IS_ERR(trans));
+ if (IS_ERR(trans)) {
+ extent_clear_unlock_delalloc(inode,
+ &BTRFS_I(inode)->io_tree,
+ start, end, NULL,
+ EXTENT_CLEAR_UNLOCK_PAGE |
+ EXTENT_CLEAR_UNLOCK |
+ EXTENT_CLEAR_DELALLOC |
+ EXTENT_CLEAR_DIRTY |
+ EXTENT_SET_WRITEBACK |
+ EXTENT_END_WRITEBACK);
+ return PTR_ERR(trans);
+ }
trans->block_rsv = &root->fs_info->delalloc_block_rsv;
num_bytes = (end - start + blocksize) & ~(blocksize - 1);
@@ -800,7 +844,8 @@ static noinline int cow_file_range(struct inode *inode,
ret = 0;
/* if this is a small write inside eof, kick off defrag */
- if (end <= BTRFS_I(inode)->disk_i_size && num_bytes < 64 * 1024)
+ if (num_bytes < 64 * 1024 &&
+ (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
btrfs_add_inode_defrag(trans, inode);
if (start == 0) {
@@ -821,8 +866,10 @@ static noinline int cow_file_range(struct inode *inode,
*nr_written = *nr_written +
(end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
*page_started = 1;
- ret = 0;
goto out;
+ } else if (ret < 0) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out_unlock;
}
}
@@ -838,11 +885,14 @@ static noinline int cow_file_range(struct inode *inode,
cur_alloc_size = disk_num_bytes;
ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
root->sectorsize, 0, alloc_hint,
- (u64)-1, &ins, 1);
- BUG_ON(ret);
+ &ins, 1);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out_unlock;
+ }
em = alloc_extent_map();
- BUG_ON(!em);
+ BUG_ON(!em); /* -ENOMEM */
em->start = start;
em->orig_start = em->start;
ram_size = ins.offset;
@@ -868,13 +918,16 @@ static noinline int cow_file_range(struct inode *inode,
cur_alloc_size = ins.offset;
ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
ram_size, cur_alloc_size, 0);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
if (root->root_key.objectid ==
BTRFS_DATA_RELOC_TREE_OBJECTID) {
ret = btrfs_reloc_clone_csums(inode, start,
cur_alloc_size);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out_unlock;
+ }
}
if (disk_num_bytes < cur_alloc_size)
@@ -899,11 +952,23 @@ static noinline int cow_file_range(struct inode *inode,
alloc_hint = ins.objectid + ins.offset;
start += cur_alloc_size;
}
-out:
ret = 0;
+out:
btrfs_end_transaction(trans, root);
return ret;
+out_unlock:
+ extent_clear_unlock_delalloc(inode,
+ &BTRFS_I(inode)->io_tree,
+ start, end, NULL,
+ EXTENT_CLEAR_UNLOCK_PAGE |
+ EXTENT_CLEAR_UNLOCK |
+ EXTENT_CLEAR_DELALLOC |
+ EXTENT_CLEAR_DIRTY |
+ EXTENT_SET_WRITEBACK |
+ EXTENT_END_WRITEBACK);
+
+ goto out;
}
/*
@@ -969,7 +1034,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
1, 0, NULL, GFP_NOFS);
while (start < end) {
async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
- BUG_ON(!async_cow);
+ BUG_ON(!async_cow); /* -ENOMEM */
async_cow->inode = inode;
async_cow->root = root;
async_cow->locked_page = locked_page;
@@ -1060,7 +1125,7 @@ static noinline int run_delalloc_nocow(struct inode *inode,
u64 disk_bytenr;
u64 num_bytes;
int extent_type;
- int ret;
+ int ret, err;
int type;
int nocow;
int check_prev = 1;
@@ -1078,7 +1143,11 @@ static noinline int run_delalloc_nocow(struct inode *inode,
else
trans = btrfs_join_transaction(root);
- BUG_ON(IS_ERR(trans));
+ if (IS_ERR(trans)) {
+ btrfs_free_path(path);
+ return PTR_ERR(trans);
+ }
+
trans->block_rsv = &root->fs_info->delalloc_block_rsv;
cow_start = (u64)-1;
@@ -1086,7 +1155,10 @@ static noinline int run_delalloc_nocow(struct inode *inode,
while (1) {
ret = btrfs_lookup_file_extent(trans, root, path, ino,
cur_offset, 0);
- BUG_ON(ret < 0);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto error;
+ }
if (ret > 0 && path->slots[0] > 0 && check_prev) {
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &found_key,
@@ -1100,8 +1172,10 @@ next_slot:
leaf = path->nodes[0];
if (path->slots[0] >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(root, path);
- if (ret < 0)
- BUG_ON(1);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto error;
+ }
if (ret > 0)
break;
leaf = path->nodes[0];
@@ -1189,7 +1263,10 @@ out_check:
ret = cow_file_range(inode, locked_page, cow_start,
found_key.offset - 1, page_started,
nr_written, 1);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto error;
+ }
cow_start = (u64)-1;
}
@@ -1198,7 +1275,7 @@ out_check:
struct extent_map_tree *em_tree;
em_tree = &BTRFS_I(inode)->extent_tree;
em = alloc_extent_map();
- BUG_ON(!em);
+ BUG_ON(!em); /* -ENOMEM */
em->start = cur_offset;
em->orig_start = em->start;
em->len = num_bytes;
@@ -1224,13 +1301,16 @@ out_check:
ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
num_bytes, num_bytes, type);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
if (root->root_key.objectid ==
BTRFS_DATA_RELOC_TREE_OBJECTID) {
ret = btrfs_reloc_clone_csums(inode, cur_offset,
num_bytes);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto error;
+ }
}
extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
@@ -1249,18 +1329,23 @@ out_check:
if (cow_start != (u64)-1) {
ret = cow_file_range(inode, locked_page, cow_start, end,
page_started, nr_written, 1);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto error;
+ }
}
+error:
if (nolock) {
- ret = btrfs_end_transaction_nolock(trans, root);
- BUG_ON(ret);
+ err = btrfs_end_transaction_nolock(trans, root);
} else {
- ret = btrfs_end_transaction(trans, root);
- BUG_ON(ret);
+ err = btrfs_end_transaction(trans, root);
}
+ if (!ret)
+ ret = err;
+
btrfs_free_path(path);
- return 0;
+ return ret;
}
/*
@@ -1425,10 +1510,11 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
map_length = length;
ret = btrfs_map_block(map_tree, READ, logical,
&map_length, NULL, 0);
-
+ /* Will always return 0 or 1 with map_multi == NULL */
+ BUG_ON(ret < 0);
if (map_length < length + size)
return 1;
- return ret;
+ return 0;
}
/*
@@ -1448,7 +1534,7 @@ static int __btrfs_submit_bio_start(struct inode *inode, int rw,
int ret = 0;
ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
return 0;
}
@@ -1479,14 +1565,16 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
struct btrfs_root *root = BTRFS_I(inode)->root;
int ret = 0;
int skip_sum;
+ int metadata = 0;
skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
if (btrfs_is_free_space_inode(root, inode))
- ret = btrfs_bio_wq_end_io(root->fs_info, bio, 2);
- else
- ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
- BUG_ON(ret);
+ metadata = 2;
+
+ ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata);
+ if (ret)
+ return ret;
if (!(rw & REQ_WRITE)) {
if (bio_flags & EXTENT_BIO_COMPRESSED) {
@@ -1555,6 +1643,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
struct inode *inode;
u64 page_start;
u64 page_end;
+ int ret;
fixup = container_of(work, struct btrfs_writepage_fixup, work);
page = fixup->page;
@@ -1570,7 +1659,7 @@ again:
page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;
lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 0,
- &cached_state, GFP_NOFS);
+ &cached_state);
/* already ordered? We're done */
if (PagePrivate2(page))
@@ -1582,12 +1671,21 @@ again:
page_end, &cached_state, GFP_NOFS);
unlock_page(page);
btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_put_ordered_extent(ordered);
goto again;
}
- BUG();
+ ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
+ if (ret) {
+ mapping_set_error(page->mapping, ret);
+ end_extent_writepage(page, ret, page_start, page_end);
+ ClearPageChecked(page);
+ goto out;
+ }
+
btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state);
ClearPageChecked(page);
+ set_page_dirty(page);
out:
unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
&cached_state, GFP_NOFS);
@@ -1630,7 +1728,7 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
fixup->work.func = btrfs_writepage_fixup_worker;
fixup->page = page;
btrfs_queue_worker(&root->fs_info->fixup_workers, &fixup->work);
- return -EAGAIN;
+ return -EBUSY;
}
static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
@@ -1665,13 +1763,15 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
*/
ret = btrfs_drop_extents(trans, inode, file_pos, file_pos + num_bytes,
&hint, 0);
- BUG_ON(ret);
+ if (ret)
+ goto out;
ins.objectid = btrfs_ino(inode);
ins.offset = file_pos;
ins.type = BTRFS_EXTENT_DATA_KEY;
ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*fi));
- BUG_ON(ret);
+ if (ret)
+ goto out;
leaf = path->nodes[0];
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
@@ -1699,10 +1799,10 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
ret = btrfs_alloc_reserved_file_extent(trans, root,
root->root_key.objectid,
btrfs_ino(inode), file_pos, &ins);
- BUG_ON(ret);
+out:
btrfs_free_path(path);
- return 0;
+ return ret;
}
/*
@@ -1730,35 +1830,41 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
end - start + 1);
if (!ret)
return 0;
- BUG_ON(!ordered_extent);
+ BUG_ON(!ordered_extent); /* Logic error */
nolock = btrfs_is_free_space_inode(root, inode);
if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
- BUG_ON(!list_empty(&ordered_extent->list));
+ BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
if (!ret) {
if (nolock)
trans = btrfs_join_transaction_nolock(root);
else
trans = btrfs_join_transaction(root);
- BUG_ON(IS_ERR(trans));
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
trans->block_rsv = &root->fs_info->delalloc_block_rsv;
ret = btrfs_update_inode_fallback(trans, root, inode);
- BUG_ON(ret);
+ if (ret) /* -ENOMEM or corruption */
+ btrfs_abort_transaction(trans, root, ret);
}
goto out;
}
lock_extent_bits(io_tree, ordered_extent->file_offset,
ordered_extent->file_offset + ordered_extent->len - 1,
- 0, &cached_state, GFP_NOFS);
+ 0, &cached_state);
if (nolock)
trans = btrfs_join_transaction_nolock(root);
else
trans = btrfs_join_transaction(root);
- BUG_ON(IS_ERR(trans));
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ trans = NULL;
+ goto out_unlock;
+ }
trans->block_rsv = &root->fs_info->delalloc_block_rsv;
if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
@@ -1769,7 +1875,6 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
ordered_extent->file_offset,
ordered_extent->file_offset +
ordered_extent->len);
- BUG_ON(ret);
} else {
BUG_ON(root == root->fs_info->tree_root);
ret = insert_reserved_file_extent(trans, inode,
@@ -1783,11 +1888,14 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
ordered_extent->file_offset,
ordered_extent->len);
- BUG_ON(ret);
}
unlock_extent_cached(io_tree, ordered_extent->file_offset,
ordered_extent->file_offset +
ordered_extent->len - 1, &cached_state, GFP_NOFS);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
add_pending_csums(trans, inode, ordered_extent->file_offset,
&ordered_extent->list);
@@ -1795,7 +1903,10 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
ret = btrfs_update_inode_fallback(trans, root, inode);
- BUG_ON(ret);
+ if (ret) { /* -ENOMEM or corruption */
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
}
ret = 0;
out:
@@ -1814,6 +1925,11 @@ out:
btrfs_put_ordered_extent(ordered_extent);
return 0;
+out_unlock:
+ unlock_extent_cached(io_tree, ordered_extent->file_offset,
+ ordered_extent->file_offset +
+ ordered_extent->len - 1, &cached_state, GFP_NOFS);
+ goto out;
}
static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
@@ -1863,7 +1979,7 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
} else {
ret = get_state_private(io_tree, start, &private);
}
- kaddr = kmap_atomic(page, KM_USER0);
+ kaddr = kmap_atomic(page);
if (ret)
goto zeroit;
@@ -1872,7 +1988,7 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
if (csum != private)
goto zeroit;
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
good:
return 0;
@@ -1884,7 +2000,7 @@ zeroit:
(unsigned long long)private);
memset(kaddr + offset, 1, end - start + 1);
flush_dcache_page(page);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
if (private == 0)
return 0;
return -EIO;
@@ -1895,6 +2011,8 @@ struct delayed_iput {
struct inode *inode;
};
+/* JDM: If this is fs-wide, why can't we add a pointer to
+ * btrfs_inode instead and avoid the allocation? */
void btrfs_add_delayed_iput(struct inode *inode)
{
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
@@ -1951,12 +2069,28 @@ enum btrfs_orphan_cleanup_state {
void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
+ struct btrfs_block_rsv *block_rsv;
int ret;
if (!list_empty(&root->orphan_list) ||
root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
return;
+ spin_lock(&root->orphan_lock);
+ if (!list_empty(&root->orphan_list)) {
+ spin_unlock(&root->orphan_lock);
+ return;
+ }
+
+ if (root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) {
+ spin_unlock(&root->orphan_lock);
+ return;
+ }
+
+ block_rsv = root->orphan_block_rsv;
+ root->orphan_block_rsv = NULL;
+ spin_unlock(&root->orphan_lock);
+
if (root->orphan_item_inserted &&
btrfs_root_refs(&root->root_item) > 0) {
ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root,
@@ -1965,10 +2099,9 @@ void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
root->orphan_item_inserted = 0;
}
- if (root->orphan_block_rsv) {
- WARN_ON(root->orphan_block_rsv->size > 0);
- btrfs_free_block_rsv(root, root->orphan_block_rsv);
- root->orphan_block_rsv = NULL;
+ if (block_rsv) {
+ WARN_ON(block_rsv->size > 0);
+ btrfs_free_block_rsv(root, block_rsv);
}
}
@@ -2026,20 +2159,27 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
/* grab metadata reservation from transaction handle */
if (reserve) {
ret = btrfs_orphan_reserve_metadata(trans, inode);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOSPC in reservation; Logic error? JDM */
}
/* insert an orphan item to track this unlinked/truncated file */
if (insert >= 1) {
ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
- BUG_ON(ret && ret != -EEXIST);
+ if (ret && ret != -EEXIST) {
+ btrfs_abort_transaction(trans, root, ret);
+ return ret;
+ }
+ ret = 0;
}
/* insert an orphan item to track subvolume contains orphan files */
if (insert >= 2) {
ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
root->root_key.objectid);
- BUG_ON(ret);
+ if (ret && ret != -EEXIST) {
+ btrfs_abort_transaction(trans, root, ret);
+ return ret;
+ }
}
return 0;
}
@@ -2069,7 +2209,7 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
if (trans && delete_item) {
ret = btrfs_del_orphan_item(trans, root, btrfs_ino(inode));
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM or corruption (JDM: Recheck) */
}
if (release_rsv)
@@ -2203,7 +2343,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
}
ret = btrfs_del_orphan_item(trans, root,
found_key.objectid);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM or corruption (JDM: Recheck) */
btrfs_end_transaction(trans, root);
continue;
}
@@ -2224,14 +2364,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
continue;
}
nr_truncate++;
- /*
- * Need to hold the imutex for reservation purposes, not
- * a huge deal here but I have a WARN_ON in
- * btrfs_delalloc_reserve_space to catch offenders.
- */
- mutex_lock(&inode->i_mutex);
ret = btrfs_truncate(inode);
- mutex_unlock(&inode->i_mutex);
} else {
nr_unlink++;
}
@@ -2592,16 +2725,22 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
printk(KERN_INFO "btrfs failed to delete reference to %.*s, "
"inode %llu parent %llu\n", name_len, name,
(unsigned long long)ino, (unsigned long long)dir_ino);
+ btrfs_abort_transaction(trans, root, ret);
goto err;
}
ret = btrfs_delete_delayed_dir_index(trans, root, dir, index);
- if (ret)
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
goto err;
+ }
ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len,
inode, dir_ino);
- BUG_ON(ret != 0 && ret != -ENOENT);
+ if (ret != 0 && ret != -ENOENT) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto err;
+ }
ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
dir, index);
@@ -2759,7 +2898,7 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
err = ret;
goto out;
}
- BUG_ON(ret == 0);
+ BUG_ON(ret == 0); /* Corruption */
if (check_path_shared(root, path))
goto out;
btrfs_release_path(path);
@@ -2792,7 +2931,7 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
err = PTR_ERR(ref);
goto out;
}
- BUG_ON(!ref);
+ BUG_ON(!ref); /* Logic error */
if (check_path_shared(root, path))
goto out;
index = btrfs_inode_ref_index(path->nodes[0], ref);
@@ -2845,7 +2984,7 @@ static void __unlink_end_trans(struct btrfs_trans_handle *trans,
BUG_ON(!root->fs_info->enospc_unlink);
root->fs_info->enospc_unlink = 0;
}
- btrfs_end_transaction_throttle(trans, root);
+ btrfs_end_transaction(trans, root);
}
static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
@@ -2899,23 +3038,42 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
name, name_len, -1);
- BUG_ON(IS_ERR_OR_NULL(di));
+ if (IS_ERR_OR_NULL(di)) {
+ if (!di)
+ ret = -ENOENT;
+ else
+ ret = PTR_ERR(di);
+ goto out;
+ }
leaf = path->nodes[0];
btrfs_dir_item_key_to_cpu(leaf, di, &key);
WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
ret = btrfs_delete_one_dir_name(trans, root, path, di);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
btrfs_release_path(path);
ret = btrfs_del_root_ref(trans, root->fs_info->tree_root,
objectid, root->root_key.objectid,
dir_ino, &index, name, name_len);
if (ret < 0) {
- BUG_ON(ret != -ENOENT);
+ if (ret != -ENOENT) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
di = btrfs_search_dir_index_item(root, path, dir_ino,
name, name_len);
- BUG_ON(IS_ERR_OR_NULL(di));
+ if (IS_ERR_OR_NULL(di)) {
+ if (!di)
+ ret = -ENOENT;
+ else
+ ret = PTR_ERR(di);
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
@@ -2925,15 +3083,19 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
ret = btrfs_delete_delayed_dir_index(trans, root, dir, index);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
btrfs_i_size_write(dir, dir->i_size - name_len * 2);
dir->i_mtime = dir->i_ctime = CURRENT_TIME;
ret = btrfs_update_inode(trans, root, dir);
- BUG_ON(ret);
-
+ if (ret)
+ btrfs_abort_transaction(trans, root, ret);
+out:
btrfs_free_path(path);
- return 0;
+ return ret;
}
static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
@@ -3009,7 +3171,6 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
int pending_del_nr = 0;
int pending_del_slot = 0;
int extent_type = -1;
- int encoding;
int ret;
int err = 0;
u64 ino = btrfs_ino(inode);
@@ -3059,7 +3220,6 @@ search_again:
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
found_type = btrfs_key_type(&found_key);
- encoding = 0;
if (found_key.objectid != ino)
break;
@@ -3072,10 +3232,6 @@ search_again:
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
extent_type = btrfs_file_extent_type(leaf, fi);
- encoding = btrfs_file_extent_compression(leaf, fi);
- encoding |= btrfs_file_extent_encryption(leaf, fi);
- encoding |= btrfs_file_extent_other_encoding(leaf, fi);
-
if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
item_end +=
btrfs_file_extent_num_bytes(leaf, fi);
@@ -3103,7 +3259,7 @@ search_again:
if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
u64 num_dec;
extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
- if (!del_item && !encoding) {
+ if (!del_item) {
u64 orig_num_bytes =
btrfs_file_extent_num_bytes(leaf, fi);
extent_num_bytes = new_size -
@@ -3149,8 +3305,8 @@ search_again:
}
size =
btrfs_file_extent_calc_inline_size(size);
- ret = btrfs_truncate_item(trans, root, path,
- size, 1);
+ btrfs_truncate_item(trans, root, path,
+ size, 1);
} else if (root->ref_cows) {
inode_sub_bytes(inode, item_end + 1 -
found_key.offset);
@@ -3179,7 +3335,7 @@ delete:
ret = btrfs_free_extent(trans, root, extent_start,
extent_num_bytes, 0,
btrfs_header_owner(leaf),
- ino, extent_offset);
+ ino, extent_offset, 0);
BUG_ON(ret);
}
@@ -3198,7 +3354,11 @@ delete:
ret = btrfs_del_items(trans, root, path,
pending_del_slot,
pending_del_nr);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans,
+ root, ret);
+ goto error;
+ }
pending_del_nr = 0;
}
btrfs_release_path(path);
@@ -3211,8 +3371,10 @@ out:
if (pending_del_nr) {
ret = btrfs_del_items(trans, root, path, pending_del_slot,
pending_del_nr);
- BUG_ON(ret);
+ if (ret)
+ btrfs_abort_transaction(trans, root, ret);
}
+error:
btrfs_free_path(path);
return err;
}
@@ -3270,8 +3432,7 @@ again:
}
wait_on_page_writeback(page);
- lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state,
- GFP_NOFS);
+ lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state);
set_page_extent_mapped(page);
ordered = btrfs_lookup_ordered_extent(inode, page_start);
@@ -3347,7 +3508,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
btrfs_wait_ordered_range(inode, hole_start,
block_end - hole_start);
lock_extent_bits(io_tree, hole_start, block_end - 1, 0,
- &cached_state, GFP_NOFS);
+ &cached_state);
ordered = btrfs_lookup_ordered_extent(inode, hole_start);
if (!ordered)
break;
@@ -3360,7 +3521,10 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
while (1) {
em = btrfs_get_extent(inode, NULL, 0, cur_offset,
block_end - cur_offset, 0);
- BUG_ON(IS_ERR_OR_NULL(em));
+ if (IS_ERR(em)) {
+ err = PTR_ERR(em);
+ break;
+ }
last_byte = min(extent_map_end(em), block_end);
last_byte = (last_byte + mask) & ~mask;
if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
@@ -3377,7 +3541,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
cur_offset + hole_size,
&hint_byte, 1);
if (err) {
- btrfs_update_inode(trans, root, inode);
+ btrfs_abort_transaction(trans, root, err);
btrfs_end_transaction(trans, root);
break;
}
@@ -3387,7 +3551,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
0, hole_size, 0, hole_size,
0, 0, 0);
if (err) {
- btrfs_update_inode(trans, root, inode);
+ btrfs_abort_transaction(trans, root, err);
btrfs_end_transaction(trans, root);
break;
}
@@ -3434,7 +3598,7 @@ static int btrfs_setsize(struct inode *inode, loff_t newsize)
i_size_write(inode, newsize);
btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
ret = btrfs_update_inode(trans, root, inode);
- btrfs_end_transaction_throttle(trans, root);
+ btrfs_end_transaction(trans, root);
} else {
/*
@@ -3767,7 +3931,7 @@ static void inode_tree_del(struct inode *inode)
}
}
-int btrfs_invalidate_inodes(struct btrfs_root *root)
+void btrfs_invalidate_inodes(struct btrfs_root *root)
{
struct rb_node *node;
struct rb_node *prev;
@@ -3827,7 +3991,6 @@ again:
node = rb_next(node);
}
spin_unlock(&root->inode_lock);
- return 0;
}
static int btrfs_init_locked_inode(struct inode *inode, void *p)
@@ -4569,16 +4732,42 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
parent_ino, index);
}
- if (ret == 0) {
- ret = btrfs_insert_dir_item(trans, root, name, name_len,
- parent_inode, &key,
- btrfs_inode_type(inode), index);
- BUG_ON(ret);
+ /* Nothing to clean up yet */
+ if (ret)
+ return ret;
+
+ ret = btrfs_insert_dir_item(trans, root, name, name_len,
+ parent_inode, &key,
+ btrfs_inode_type(inode), index);
+ if (ret == -EEXIST)
+ goto fail_dir_item;
+ else if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ return ret;
+ }
+
+ btrfs_i_size_write(parent_inode, parent_inode->i_size +
+ name_len * 2);
+ parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
+ ret = btrfs_update_inode(trans, root, parent_inode);
+ if (ret)
+ btrfs_abort_transaction(trans, root, ret);
+ return ret;
+
+fail_dir_item:
+ if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
+ u64 local_index;
+ int err;
+ err = btrfs_del_root_ref(trans, root->fs_info->tree_root,
+ key.objectid, root->root_key.objectid,
+ parent_ino, &local_index, name, name_len);
+
+ } else if (add_backref) {
+ u64 local_index;
+ int err;
- btrfs_i_size_write(parent_inode, parent_inode->i_size +
- name_len * 2);
- parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
- ret = btrfs_update_inode(trans, root, parent_inode);
+ err = btrfs_del_inode_ref(trans, root, name, name_len,
+ ino, parent_ino, &local_index);
}
return ret;
}
@@ -4655,7 +4844,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
}
out_unlock:
nr = trans->blocks_used;
- btrfs_end_transaction_throttle(trans, root);
+ btrfs_end_transaction(trans, root);
btrfs_btree_balance_dirty(root, nr);
if (drop_inode) {
inode_dec_link_count(inode);
@@ -4723,7 +4912,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
}
out_unlock:
nr = trans->blocks_used;
- btrfs_end_transaction_throttle(trans, root);
+ btrfs_end_transaction(trans, root);
if (drop_inode) {
inode_dec_link_count(inode);
iput(inode);
@@ -4776,13 +4965,14 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
} else {
struct dentry *parent = dentry->d_parent;
err = btrfs_update_inode(trans, root, inode);
- BUG_ON(err);
+ if (err)
+ goto fail;
d_instantiate(dentry, inode);
btrfs_log_new_name(trans, inode, NULL, parent);
}
nr = trans->blocks_used;
- btrfs_end_transaction_throttle(trans, root);
+ btrfs_end_transaction(trans, root);
fail:
if (drop_inode) {
inode_dec_link_count(inode);
@@ -4848,7 +5038,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
out_fail:
nr = trans->blocks_used;
- btrfs_end_transaction_throttle(trans, root);
+ btrfs_end_transaction(trans, root);
if (drop_on_err)
iput(inode);
btrfs_btree_balance_dirty(root, nr);
@@ -4907,12 +5097,12 @@ static noinline int uncompress_inline(struct btrfs_path *path,
ret = btrfs_decompress(compress_type, tmp, page,
extent_offset, inline_size, max_size);
if (ret) {
- char *kaddr = kmap_atomic(page, KM_USER0);
+ char *kaddr = kmap_atomic(page);
unsigned long copy_size = min_t(u64,
PAGE_CACHE_SIZE - pg_offset,
max_size - extent_offset);
memset(kaddr + pg_offset, 0, copy_size);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
}
kfree(tmp);
return 0;
@@ -5107,7 +5297,7 @@ again:
ret = uncompress_inline(path, inode, page,
pg_offset,
extent_offset, item);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
} else {
map = kmap(page);
read_extent_buffer(leaf, map + pg_offset, ptr,
@@ -5121,7 +5311,7 @@ again:
}
flush_dcache_page(page);
} else if (create && PageUptodate(page)) {
- WARN_ON(1);
+ BUG();
if (!trans) {
kunmap(page);
free_extent_map(em);
@@ -5222,6 +5412,7 @@ out:
free_extent_map(em);
return ERR_PTR(err);
}
+ BUG_ON(!em); /* Error is always set */
return em;
}
@@ -5384,7 +5575,7 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
alloc_hint = get_extent_allocation_hint(inode, start, len);
ret = btrfs_reserve_extent(trans, root, len, root->sectorsize, 0,
- alloc_hint, (u64)-1, &ins, 1);
+ alloc_hint, &ins, 1);
if (ret) {
em = ERR_PTR(ret);
goto out;
@@ -5572,7 +5763,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
free_extent_map(em);
/* DIO will do one hole at a time, so just unlock a sector */
unlock_extent(&BTRFS_I(inode)->io_tree, start,
- start + root->sectorsize - 1, GFP_NOFS);
+ start + root->sectorsize - 1);
return 0;
}
@@ -5689,11 +5880,11 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
unsigned long flags;
local_irq_save(flags);
- kaddr = kmap_atomic(page, KM_IRQ0);
+ kaddr = kmap_atomic(page);
csum = btrfs_csum_data(root, kaddr + bvec->bv_offset,
csum, bvec->bv_len);
btrfs_csum_final(csum, (char *)&csum);
- kunmap_atomic(kaddr, KM_IRQ0);
+ kunmap_atomic(kaddr);
local_irq_restore(flags);
flush_dcache_page(bvec->bv_page);
@@ -5713,7 +5904,7 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
} while (bvec <= bvec_end);
unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
- dip->logical_offset + dip->bytes - 1, GFP_NOFS);
+ dip->logical_offset + dip->bytes - 1);
bio->bi_private = dip->private;
kfree(dip->csums);
@@ -5764,7 +5955,7 @@ again:
lock_extent_bits(&BTRFS_I(inode)->io_tree, ordered->file_offset,
ordered->file_offset + ordered->len - 1, 0,
- &cached_state, GFP_NOFS);
+ &cached_state);
if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) {
ret = btrfs_mark_extent_written(trans, inode,
@@ -5838,7 +6029,7 @@ static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw,
int ret;
struct btrfs_root *root = BTRFS_I(inode)->root;
ret = btrfs_csum_one_bio(root, inode, bio, offset, 1);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
return 0;
}
@@ -6179,7 +6370,7 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
while (1) {
lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- 0, &cached_state, GFP_NOFS);
+ 0, &cached_state);
/*
* We're concerned with the entire range that we're going to be
* doing DIO to, so we need to make sure theres no ordered
@@ -6203,7 +6394,7 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
if (writing) {
write_bits = EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING;
ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- EXTENT_DELALLOC, 0, NULL, &cached_state,
+ EXTENT_DELALLOC, NULL, &cached_state,
GFP_NOFS);
if (ret) {
clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
@@ -6333,8 +6524,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
btrfs_releasepage(page, GFP_NOFS);
return;
}
- lock_extent_bits(tree, page_start, page_end, 0, &cached_state,
- GFP_NOFS);
+ lock_extent_bits(tree, page_start, page_end, 0, &cached_state);
ordered = btrfs_lookup_ordered_extent(page->mapping->host,
page_offset(page));
if (ordered) {
@@ -6356,8 +6546,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
}
btrfs_put_ordered_extent(ordered);
cached_state = NULL;
- lock_extent_bits(tree, page_start, page_end, 0, &cached_state,
- GFP_NOFS);
+ lock_extent_bits(tree, page_start, page_end, 0, &cached_state);
}
clear_extent_bit(tree, page_start, page_end,
EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
@@ -6399,21 +6588,23 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
unsigned long zero_start;
loff_t size;
int ret;
+ int reserved = 0;
u64 page_start;
u64 page_end;
- /* Need this to keep space reservations serialized */
- mutex_lock(&inode->i_mutex);
ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
- mutex_unlock(&inode->i_mutex);
- if (!ret)
+ if (!ret) {
ret = btrfs_update_time(vma->vm_file);
+ reserved = 1;
+ }
if (ret) {
if (ret == -ENOMEM)
ret = VM_FAULT_OOM;
else /* -ENOSPC, -EIO, etc */
ret = VM_FAULT_SIGBUS;
- goto out;
+ if (reserved)
+ goto out;
+ goto out_noreserve;
}
ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
@@ -6430,8 +6621,7 @@ again:
}
wait_on_page_writeback(page);
- lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state,
- GFP_NOFS);
+ lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state);
set_page_extent_mapped(page);
/*
@@ -6494,8 +6684,9 @@ out_unlock:
if (!ret)
return VM_FAULT_LOCKED;
unlock_page(page);
- btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
out:
+ btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
+out_noreserve:
return ret;
}
@@ -6668,7 +6859,7 @@ end_trans:
err = ret;
nr = trans->blocks_used;
- ret = btrfs_end_transaction_throttle(trans, root);
+ ret = btrfs_end_transaction(trans, root);
btrfs_btree_balance_dirty(root, nr);
}
@@ -6691,8 +6882,10 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
int err;
u64 index = 0;
- inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid,
- new_dirid, S_IFDIR | 0700, &index);
+ inode = btrfs_new_inode(trans, new_root, NULL, "..", 2,
+ new_dirid, new_dirid,
+ S_IFDIR | (~current_umask() & S_IRWXUGO),
+ &index);
if (IS_ERR(inode))
return PTR_ERR(inode);
inode->i_op = &btrfs_dir_inode_operations;
@@ -6702,10 +6895,9 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
btrfs_i_size_write(inode, 0);
err = btrfs_update_inode(trans, new_root, inode);
- BUG_ON(err);
iput(inode);
- return 0;
+ return err;
}
struct inode *btrfs_alloc_inode(struct super_block *sb)
@@ -6748,7 +6940,10 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
extent_map_tree_init(&ei->extent_tree);
extent_io_tree_init(&ei->io_tree, &inode->i_data);
extent_io_tree_init(&ei->io_failure_tree, &inode->i_data);
+ ei->io_tree.track_uptodate = 1;
+ ei->io_failure_tree.track_uptodate = 1;
mutex_init(&ei->log_mutex);
+ mutex_init(&ei->delalloc_mutex);
btrfs_ordered_inode_tree_init(&ei->ordered_tree);
INIT_LIST_HEAD(&ei->i_orphan);
INIT_LIST_HEAD(&ei->delalloc_inodes);
@@ -7036,7 +7231,10 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (!ret)
ret = btrfs_update_inode(trans, root, old_inode);
}
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out_fail;
+ }
if (new_inode) {
new_inode->i_ctime = CURRENT_TIME;
@@ -7054,11 +7252,14 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
new_dentry->d_name.name,
new_dentry->d_name.len);
}
- BUG_ON(ret);
- if (new_inode->i_nlink == 0) {
+ if (!ret && new_inode->i_nlink == 0) {
ret = btrfs_orphan_add(trans, new_dentry->d_inode);
BUG_ON(ret);
}
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out_fail;
+ }
}
fixup_inode_flags(new_dir, old_inode);
@@ -7066,7 +7267,10 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
ret = btrfs_add_link(trans, new_dir, old_inode,
new_dentry->d_name.name,
new_dentry->d_name.len, 0, index);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out_fail;
+ }
if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
struct dentry *parent = new_dentry->d_parent;
@@ -7074,7 +7278,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
btrfs_end_log_trans(root);
}
out_fail:
- btrfs_end_transaction_throttle(trans, root);
+ btrfs_end_transaction(trans, root);
out_notrans:
if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
up_read(&root->fs_info->subvol_sem);
@@ -7246,7 +7450,7 @@ out_unlock:
if (!err)
d_instantiate(dentry, inode);
nr = trans->blocks_used;
- btrfs_end_transaction_throttle(trans, root);
+ btrfs_end_transaction(trans, root);
if (drop_inode) {
inode_dec_link_count(inode);
iput(inode);
@@ -7279,7 +7483,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
}
ret = btrfs_reserve_extent(trans, root, num_bytes, min_size,
- 0, *alloc_hint, (u64)-1, &ins, 1);
+ 0, *alloc_hint, &ins, 1);
if (ret) {
if (own_trans)
btrfs_end_transaction(trans, root);
@@ -7291,7 +7495,12 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
ins.offset, ins.offset,
ins.offset, 0, 0, 0,
BTRFS_FILE_EXTENT_PREALLOC);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ if (own_trans)
+ btrfs_end_transaction(trans, root);
+ break;
+ }
btrfs_drop_extent_cache(inode, cur_offset,
cur_offset + ins.offset -1, 0);
@@ -7313,7 +7522,13 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
}
ret = btrfs_update_inode(trans, root, inode);
- BUG_ON(ret);
+
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ if (own_trans)
+ btrfs_end_transaction(trans, root);
+ break;
+ }
if (own_trans)
btrfs_end_transaction(trans, root);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 5441ff1480f..18cc23d164a 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -176,6 +176,8 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
struct btrfs_trans_handle *trans;
unsigned int flags, oldflags;
int ret;
+ u64 ip_oldflags;
+ unsigned int i_oldflags;
if (btrfs_root_readonly(root))
return -EROFS;
@@ -192,6 +194,9 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
mutex_lock(&inode->i_mutex);
+ ip_oldflags = ip->flags;
+ i_oldflags = inode->i_flags;
+
flags = btrfs_mask_flags(inode->i_mode, flags);
oldflags = btrfs_flags_to_ioctl(ip->flags);
if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
@@ -249,19 +254,24 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS);
}
- trans = btrfs_join_transaction(root);
- BUG_ON(IS_ERR(trans));
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out_drop;
+ }
btrfs_update_iflags(inode);
inode->i_ctime = CURRENT_TIME;
ret = btrfs_update_inode(trans, root, inode);
- BUG_ON(ret);
btrfs_end_transaction(trans, root);
+ out_drop:
+ if (ret) {
+ ip->flags = ip_oldflags;
+ inode->i_flags = i_oldflags;
+ }
mnt_drop_write_file(file);
-
- ret = 0;
out_unlock:
mutex_unlock(&inode->i_mutex);
return ret;
@@ -276,14 +286,13 @@ static int btrfs_ioctl_getversion(struct file *file, int __user *arg)
static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
{
- struct btrfs_root *root = fdentry(file)->d_sb->s_fs_info;
- struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_fs_info *fs_info = btrfs_sb(fdentry(file)->d_sb);
struct btrfs_device *device;
struct request_queue *q;
struct fstrim_range range;
u64 minlen = ULLONG_MAX;
u64 num_devices = 0;
- u64 total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy);
+ u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
int ret;
if (!capable(CAP_SYS_ADMIN))
@@ -312,7 +321,7 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
range.len = min(range.len, total_bytes - range.start);
range.minlen = max(range.minlen, minlen);
- ret = btrfs_trim_fs(root, &range);
+ ret = btrfs_trim_fs(fs_info->tree_root, &range);
if (ret < 0)
return ret;
@@ -358,7 +367,7 @@ static noinline int create_subvol(struct btrfs_root *root,
return PTR_ERR(trans);
leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
- 0, objectid, NULL, 0, 0, 0);
+ 0, objectid, NULL, 0, 0, 0, 0);
if (IS_ERR(leaf)) {
ret = PTR_ERR(leaf);
goto fail;
@@ -416,22 +425,37 @@ static noinline int create_subvol(struct btrfs_root *root,
key.offset = (u64)-1;
new_root = btrfs_read_fs_root_no_name(root->fs_info, &key);
- BUG_ON(IS_ERR(new_root));
+ if (IS_ERR(new_root)) {
+ btrfs_abort_transaction(trans, root, PTR_ERR(new_root));
+ ret = PTR_ERR(new_root);
+ goto fail;
+ }
btrfs_record_root_in_trans(trans, new_root);
ret = btrfs_create_subvol_root(trans, new_root, new_dirid);
+ if (ret) {
+ /* We potentially lose an unused inode item here */
+ btrfs_abort_transaction(trans, root, ret);
+ goto fail;
+ }
+
/*
* insert the directory item
*/
ret = btrfs_set_inode_index(dir, &index);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto fail;
+ }
ret = btrfs_insert_dir_item(trans, root,
name, namelen, dir, &key,
BTRFS_FT_DIR, index);
- if (ret)
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
goto fail;
+ }
btrfs_i_size_write(dir, dir->i_size + namelen * 2);
ret = btrfs_update_inode(trans, root, dir);
@@ -760,6 +784,31 @@ none:
return -ENOENT;
}
+/*
+ * Validaty check of prev em and next em:
+ * 1) no prev/next em
+ * 2) prev/next em is an hole/inline extent
+ */
+static int check_adjacent_extents(struct inode *inode, struct extent_map *em)
+{
+ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ struct extent_map *prev = NULL, *next = NULL;
+ int ret = 0;
+
+ read_lock(&em_tree->lock);
+ prev = lookup_extent_mapping(em_tree, em->start - 1, (u64)-1);
+ next = lookup_extent_mapping(em_tree, em->start + em->len, (u64)-1);
+ read_unlock(&em_tree->lock);
+
+ if ((!prev || prev->block_start >= EXTENT_MAP_LAST_BYTE) &&
+ (!next || next->block_start >= EXTENT_MAP_LAST_BYTE))
+ ret = 1;
+ free_extent_map(prev);
+ free_extent_map(next);
+
+ return ret;
+}
+
static int should_defrag_range(struct inode *inode, u64 start, u64 len,
int thresh, u64 *last_len, u64 *skip,
u64 *defrag_end)
@@ -788,17 +837,25 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len,
if (!em) {
/* get the big lock and read metadata off disk */
- lock_extent(io_tree, start, start + len - 1, GFP_NOFS);
+ lock_extent(io_tree, start, start + len - 1);
em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
- unlock_extent(io_tree, start, start + len - 1, GFP_NOFS);
+ unlock_extent(io_tree, start, start + len - 1);
if (IS_ERR(em))
return 0;
}
/* this will cover holes, and inline extents */
- if (em->block_start >= EXTENT_MAP_LAST_BYTE)
+ if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+ ret = 0;
+ goto out;
+ }
+
+ /* If we have nothing to merge with us, just skip. */
+ if (check_adjacent_extents(inode, em)) {
ret = 0;
+ goto out;
+ }
/*
* we hit a real extent, if it is big don't bother defragging it again
@@ -806,6 +863,7 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len,
if ((*last_len == 0 || *last_len >= thresh) && em->len >= thresh)
ret = 0;
+out:
/*
* last_len ends up being a counter of how many bytes we've defragged.
* every time we choose not to defrag an extent, we reset *last_len
@@ -847,35 +905,62 @@ static int cluster_pages_for_defrag(struct inode *inode,
u64 isize = i_size_read(inode);
u64 page_start;
u64 page_end;
+ u64 page_cnt;
int ret;
int i;
int i_done;
struct btrfs_ordered_extent *ordered;
struct extent_state *cached_state = NULL;
+ struct extent_io_tree *tree;
gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
- if (isize == 0)
- return 0;
file_end = (isize - 1) >> PAGE_CACHE_SHIFT;
+ if (!isize || start_index > file_end)
+ return 0;
+
+ page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1);
- mutex_lock(&inode->i_mutex);
ret = btrfs_delalloc_reserve_space(inode,
- num_pages << PAGE_CACHE_SHIFT);
- mutex_unlock(&inode->i_mutex);
+ page_cnt << PAGE_CACHE_SHIFT);
if (ret)
return ret;
-again:
- ret = 0;
i_done = 0;
+ tree = &BTRFS_I(inode)->io_tree;
/* step one, lock all the pages */
- for (i = 0; i < num_pages; i++) {
+ for (i = 0; i < page_cnt; i++) {
struct page *page;
+again:
page = find_or_create_page(inode->i_mapping,
- start_index + i, mask);
+ start_index + i, mask);
if (!page)
break;
+ page_start = page_offset(page);
+ page_end = page_start + PAGE_CACHE_SIZE - 1;
+ while (1) {
+ lock_extent(tree, page_start, page_end);
+ ordered = btrfs_lookup_ordered_extent(inode,
+ page_start);
+ unlock_extent(tree, page_start, page_end);
+ if (!ordered)
+ break;
+
+ unlock_page(page);
+ btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_put_ordered_extent(ordered);
+ lock_page(page);
+ /*
+ * we unlocked the page above, so we need check if
+ * it was released or not.
+ */
+ if (page->mapping != inode->i_mapping) {
+ unlock_page(page);
+ page_cache_release(page);
+ goto again;
+ }
+ }
+
if (!PageUptodate(page)) {
btrfs_readpage(NULL, page);
lock_page(page);
@@ -886,15 +971,13 @@ again:
break;
}
}
- isize = i_size_read(inode);
- file_end = (isize - 1) >> PAGE_CACHE_SHIFT;
- if (!isize || page->index > file_end ||
- page->mapping != inode->i_mapping) {
- /* whoops, we blew past eof, skip this page */
+
+ if (page->mapping != inode->i_mapping) {
unlock_page(page);
page_cache_release(page);
- break;
+ goto again;
}
+
pages[i] = page;
i_done++;
}
@@ -915,38 +998,18 @@ again:
page_end = page_offset(pages[i_done - 1]) + PAGE_CACHE_SIZE;
lock_extent_bits(&BTRFS_I(inode)->io_tree,
- page_start, page_end - 1, 0, &cached_state,
- GFP_NOFS);
- ordered = btrfs_lookup_first_ordered_extent(inode, page_end - 1);
- if (ordered &&
- ordered->file_offset + ordered->len > page_start &&
- ordered->file_offset < page_end) {
- btrfs_put_ordered_extent(ordered);
- unlock_extent_cached(&BTRFS_I(inode)->io_tree,
- page_start, page_end - 1,
- &cached_state, GFP_NOFS);
- for (i = 0; i < i_done; i++) {
- unlock_page(pages[i]);
- page_cache_release(pages[i]);
- }
- btrfs_wait_ordered_range(inode, page_start,
- page_end - page_start);
- goto again;
- }
- if (ordered)
- btrfs_put_ordered_extent(ordered);
-
+ page_start, page_end - 1, 0, &cached_state);
clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start,
page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
EXTENT_DO_ACCOUNTING, 0, 0, &cached_state,
GFP_NOFS);
- if (i_done != num_pages) {
+ if (i_done != page_cnt) {
spin_lock(&BTRFS_I(inode)->lock);
BTRFS_I(inode)->outstanding_extents++;
spin_unlock(&BTRFS_I(inode)->lock);
btrfs_delalloc_release_space(inode,
- (num_pages - i_done) << PAGE_CACHE_SHIFT);
+ (page_cnt - i_done) << PAGE_CACHE_SHIFT);
}
@@ -971,7 +1034,7 @@ out:
unlock_page(pages[i]);
page_cache_release(pages[i]);
}
- btrfs_delalloc_release_space(inode, num_pages << PAGE_CACHE_SHIFT);
+ btrfs_delalloc_release_space(inode, page_cnt << PAGE_CACHE_SHIFT);
return ret;
}
@@ -1058,7 +1121,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
i = range->start >> PAGE_CACHE_SHIFT;
}
if (!max_to_defrag)
- max_to_defrag = last_index;
+ max_to_defrag = last_index + 1;
/*
* make writeback starts from i, so the defrag range can be
@@ -1077,12 +1140,9 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
if (!(inode->i_sb->s_flags & MS_ACTIVE))
break;
- if (!newer_than &&
- !should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT,
- PAGE_CACHE_SIZE,
- extent_thresh,
- &last_len, &skip,
- &defrag_end)) {
+ if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT,
+ PAGE_CACHE_SIZE, extent_thresh,
+ &last_len, &skip, &defrag_end)) {
unsigned long next;
/*
* the should_defrag function tells us how much to skip
@@ -1111,17 +1171,24 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
ra_index += max_cluster;
}
+ mutex_lock(&inode->i_mutex);
ret = cluster_pages_for_defrag(inode, pages, i, cluster);
- if (ret < 0)
+ if (ret < 0) {
+ mutex_unlock(&inode->i_mutex);
goto out_ra;
+ }
defrag_count += ret;
balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret);
+ mutex_unlock(&inode->i_mutex);
if (newer_than) {
if (newer_off == (u64)-1)
break;
+ if (ret > 0)
+ i += ret;
+
newer_off = max(newer_off + 1,
(u64)i << PAGE_CACHE_SHIFT);
@@ -1203,13 +1270,21 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
+ mutex_lock(&root->fs_info->volume_mutex);
+ if (root->fs_info->balance_ctl) {
+ printk(KERN_INFO "btrfs: balance in progress\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
vol_args = memdup_user(arg, sizeof(*vol_args));
- if (IS_ERR(vol_args))
- return PTR_ERR(vol_args);
+ if (IS_ERR(vol_args)) {
+ ret = PTR_ERR(vol_args);
+ goto out;
+ }
vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
- mutex_lock(&root->fs_info->volume_mutex);
sizestr = vol_args->name;
devstr = strchr(sizestr, ':');
if (devstr) {
@@ -1226,7 +1301,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
printk(KERN_INFO "btrfs: resizer unable to find device %llu\n",
(unsigned long long)devid);
ret = -EINVAL;
- goto out_unlock;
+ goto out_free;
}
if (!strcmp(sizestr, "max"))
new_size = device->bdev->bd_inode->i_size;
@@ -1241,7 +1316,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
new_size = memparse(sizestr, NULL);
if (new_size == 0) {
ret = -EINVAL;
- goto out_unlock;
+ goto out_free;
}
}
@@ -1250,7 +1325,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
if (mod < 0) {
if (new_size > old_size) {
ret = -EINVAL;
- goto out_unlock;
+ goto out_free;
}
new_size = old_size - new_size;
} else if (mod > 0) {
@@ -1259,11 +1334,11 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
if (new_size < 256 * 1024 * 1024) {
ret = -EINVAL;
- goto out_unlock;
+ goto out_free;
}
if (new_size > device->bdev->bd_inode->i_size) {
ret = -EFBIG;
- goto out_unlock;
+ goto out_free;
}
do_div(new_size, root->sectorsize);
@@ -1276,7 +1351,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
- goto out_unlock;
+ goto out_free;
}
ret = btrfs_grow_device(trans, device, new_size);
btrfs_commit_transaction(trans, root);
@@ -1284,9 +1359,10 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
ret = btrfs_shrink_device(device, new_size);
}
-out_unlock:
- mutex_unlock(&root->fs_info->volume_mutex);
+out_free:
kfree(vol_args);
+out:
+ mutex_unlock(&root->fs_info->volume_mutex);
return ret;
}
@@ -1311,6 +1387,12 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
goto out;
}
+ if (name[0] == '.' &&
+ (namelen == 1 || (name[1] == '.' && namelen == 2))) {
+ ret = -EEXIST;
+ goto out;
+ }
+
if (subvol) {
ret = btrfs_mksubvol(&file->f_path, name, namelen,
NULL, transid, readonly);
@@ -1939,7 +2021,11 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
dest->root_key.objectid,
dentry->d_name.name,
dentry->d_name.len);
- BUG_ON(ret);
+ if (ret) {
+ err = ret;
+ btrfs_abort_transaction(trans, root, ret);
+ goto out_end_trans;
+ }
btrfs_record_root_in_trans(trans, dest);
@@ -1952,11 +2038,16 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
ret = btrfs_insert_orphan_item(trans,
root->fs_info->tree_root,
dest->root_key.objectid);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ err = ret;
+ goto out_end_trans;
+ }
}
-
+out_end_trans:
ret = btrfs_end_transaction(trans, root);
- BUG_ON(ret);
+ if (ret && !err)
+ err = ret;
inode->i_flags |= S_DEAD;
out_up_write:
up_write(&root->fs_info->subvol_sem);
@@ -2052,14 +2143,25 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
+ mutex_lock(&root->fs_info->volume_mutex);
+ if (root->fs_info->balance_ctl) {
+ printk(KERN_INFO "btrfs: balance in progress\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
vol_args = memdup_user(arg, sizeof(*vol_args));
- if (IS_ERR(vol_args))
- return PTR_ERR(vol_args);
+ if (IS_ERR(vol_args)) {
+ ret = PTR_ERR(vol_args);
+ goto out;
+ }
vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
ret = btrfs_init_new_device(root, vol_args->name);
kfree(vol_args);
+out:
+ mutex_unlock(&root->fs_info->volume_mutex);
return ret;
}
@@ -2074,14 +2176,25 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
if (root->fs_info->sb->s_flags & MS_RDONLY)
return -EROFS;
+ mutex_lock(&root->fs_info->volume_mutex);
+ if (root->fs_info->balance_ctl) {
+ printk(KERN_INFO "btrfs: balance in progress\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
vol_args = memdup_user(arg, sizeof(*vol_args));
- if (IS_ERR(vol_args))
- return PTR_ERR(vol_args);
+ if (IS_ERR(vol_args)) {
+ ret = PTR_ERR(vol_args);
+ goto out;
+ }
vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
ret = btrfs_rm_device(root, vol_args->name);
kfree(vol_args);
+out:
+ mutex_unlock(&root->fs_info->volume_mutex);
return ret;
}
@@ -2277,13 +2390,13 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
another, and lock file content */
while (1) {
struct btrfs_ordered_extent *ordered;
- lock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
+ lock_extent(&BTRFS_I(src)->io_tree, off, off+len);
ordered = btrfs_lookup_first_ordered_extent(src, off+len);
if (!ordered &&
!test_range_bit(&BTRFS_I(src)->io_tree, off, off+len,
EXTENT_DELALLOC, 0, NULL))
break;
- unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
+ unlock_extent(&BTRFS_I(src)->io_tree, off, off+len);
if (ordered)
btrfs_put_ordered_extent(ordered);
btrfs_wait_ordered_range(src, off, len);
@@ -2398,11 +2511,21 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
new_key.offset,
new_key.offset + datal,
&hint_byte, 1);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, root,
+ ret);
+ btrfs_end_transaction(trans, root);
+ goto out;
+ }
ret = btrfs_insert_empty_item(trans, root, path,
&new_key, size);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, root,
+ ret);
+ btrfs_end_transaction(trans, root);
+ goto out;
+ }
leaf = path->nodes[0];
slot = path->slots[0];
@@ -2427,8 +2550,17 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
disko, diskl, 0,
root->root_key.objectid,
btrfs_ino(inode),
- new_key.offset - datao);
- BUG_ON(ret);
+ new_key.offset - datao,
+ 0);
+ if (ret) {
+ btrfs_abort_transaction(trans,
+ root,
+ ret);
+ btrfs_end_transaction(trans,
+ root);
+ goto out;
+
+ }
}
} else if (type == BTRFS_FILE_EXTENT_INLINE) {
u64 skip = 0;
@@ -2453,11 +2585,21 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
new_key.offset,
new_key.offset + datal,
&hint_byte, 1);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, root,
+ ret);
+ btrfs_end_transaction(trans, root);
+ goto out;
+ }
ret = btrfs_insert_empty_item(trans, root, path,
&new_key, size);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, root,
+ ret);
+ btrfs_end_transaction(trans, root);
+ goto out;
+ }
if (skip) {
u32 start =
@@ -2491,8 +2633,12 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
btrfs_i_size_write(inode, endoff);
ret = btrfs_update_inode(trans, root, inode);
- BUG_ON(ret);
- btrfs_end_transaction(trans, root);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ btrfs_end_transaction(trans, root);
+ goto out;
+ }
+ ret = btrfs_end_transaction(trans, root);
}
next:
btrfs_release_path(path);
@@ -2501,7 +2647,7 @@ next:
ret = 0;
out:
btrfs_release_path(path);
- unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
+ unlock_extent(&BTRFS_I(src)->io_tree, off, off+len);
out_unlock:
mutex_unlock(&src->i_mutex);
mutex_unlock(&inode->i_mutex);
@@ -2977,7 +3123,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
{
int ret = 0;
int size;
- u64 extent_offset;
+ u64 extent_item_pos;
struct btrfs_ioctl_logical_ino_args *loi;
struct btrfs_data_container *inodes = NULL;
struct btrfs_path *path = NULL;
@@ -3008,15 +3154,17 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
}
ret = extent_from_logical(root->fs_info, loi->logical, path, &key);
+ btrfs_release_path(path);
if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
ret = -ENOENT;
if (ret < 0)
goto out;
- extent_offset = loi->logical - key.objectid;
- ret = iterate_extent_inodes(root->fs_info, path, key.objectid,
- extent_offset, build_ino_list, inodes);
+ extent_item_pos = loi->logical - key.objectid;
+ ret = iterate_extent_inodes(root->fs_info, key.objectid,
+ extent_item_pos, 0, build_ino_list,
+ inodes);
if (ret < 0)
goto out;
@@ -3034,6 +3182,163 @@ out:
return ret;
}
+void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
+ struct btrfs_ioctl_balance_args *bargs)
+{
+ struct btrfs_balance_control *bctl = fs_info->balance_ctl;
+
+ bargs->flags = bctl->flags;
+
+ if (atomic_read(&fs_info->balance_running))
+ bargs->state |= BTRFS_BALANCE_STATE_RUNNING;
+ if (atomic_read(&fs_info->balance_pause_req))
+ bargs->state |= BTRFS_BALANCE_STATE_PAUSE_REQ;
+ if (atomic_read(&fs_info->balance_cancel_req))
+ bargs->state |= BTRFS_BALANCE_STATE_CANCEL_REQ;
+
+ memcpy(&bargs->data, &bctl->data, sizeof(bargs->data));
+ memcpy(&bargs->meta, &bctl->meta, sizeof(bargs->meta));
+ memcpy(&bargs->sys, &bctl->sys, sizeof(bargs->sys));
+
+ if (lock) {
+ spin_lock(&fs_info->balance_lock);
+ memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat));
+ spin_unlock(&fs_info->balance_lock);
+ } else {
+ memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat));
+ }
+}
+
+static long btrfs_ioctl_balance(struct btrfs_root *root, void __user *arg)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_ioctl_balance_args *bargs;
+ struct btrfs_balance_control *bctl;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (fs_info->sb->s_flags & MS_RDONLY)
+ return -EROFS;
+
+ mutex_lock(&fs_info->volume_mutex);
+ mutex_lock(&fs_info->balance_mutex);
+
+ if (arg) {
+ bargs = memdup_user(arg, sizeof(*bargs));
+ if (IS_ERR(bargs)) {
+ ret = PTR_ERR(bargs);
+ goto out;
+ }
+
+ if (bargs->flags & BTRFS_BALANCE_RESUME) {
+ if (!fs_info->balance_ctl) {
+ ret = -ENOTCONN;
+ goto out_bargs;
+ }
+
+ bctl = fs_info->balance_ctl;
+ spin_lock(&fs_info->balance_lock);
+ bctl->flags |= BTRFS_BALANCE_RESUME;
+ spin_unlock(&fs_info->balance_lock);
+
+ goto do_balance;
+ }
+ } else {
+ bargs = NULL;
+ }
+
+ if (fs_info->balance_ctl) {
+ ret = -EINPROGRESS;
+ goto out_bargs;
+ }
+
+ bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
+ if (!bctl) {
+ ret = -ENOMEM;
+ goto out_bargs;
+ }
+
+ bctl->fs_info = fs_info;
+ if (arg) {
+ memcpy(&bctl->data, &bargs->data, sizeof(bctl->data));
+ memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta));
+ memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys));
+
+ bctl->flags = bargs->flags;
+ } else {
+ /* balance everything - no filters */
+ bctl->flags |= BTRFS_BALANCE_TYPE_MASK;
+ }
+
+do_balance:
+ ret = btrfs_balance(bctl, bargs);
+ /*
+ * bctl is freed in __cancel_balance or in free_fs_info if
+ * restriper was paused all the way until unmount
+ */
+ if (arg) {
+ if (copy_to_user(arg, bargs, sizeof(*bargs)))
+ ret = -EFAULT;
+ }
+
+out_bargs:
+ kfree(bargs);
+out:
+ mutex_unlock(&fs_info->balance_mutex);
+ mutex_unlock(&fs_info->volume_mutex);
+ return ret;
+}
+
+static long btrfs_ioctl_balance_ctl(struct btrfs_root *root, int cmd)
+{
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ switch (cmd) {
+ case BTRFS_BALANCE_CTL_PAUSE:
+ return btrfs_pause_balance(root->fs_info);
+ case BTRFS_BALANCE_CTL_CANCEL:
+ return btrfs_cancel_balance(root->fs_info);
+ }
+
+ return -EINVAL;
+}
+
+static long btrfs_ioctl_balance_progress(struct btrfs_root *root,
+ void __user *arg)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_ioctl_balance_args *bargs;
+ int ret = 0;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ mutex_lock(&fs_info->balance_mutex);
+ if (!fs_info->balance_ctl) {
+ ret = -ENOTCONN;
+ goto out;
+ }
+
+ bargs = kzalloc(sizeof(*bargs), GFP_NOFS);
+ if (!bargs) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ update_ioctl_balance_args(fs_info, 1, bargs);
+
+ if (copy_to_user(arg, bargs, sizeof(*bargs)))
+ ret = -EFAULT;
+
+ kfree(bargs);
+out:
+ mutex_unlock(&fs_info->balance_mutex);
+ return ret;
+}
+
long btrfs_ioctl(struct file *file, unsigned int
cmd, unsigned long arg)
{
@@ -3078,7 +3383,7 @@ long btrfs_ioctl(struct file *file, unsigned int
case BTRFS_IOC_DEV_INFO:
return btrfs_ioctl_dev_info(root, argp);
case BTRFS_IOC_BALANCE:
- return btrfs_balance(root->fs_info->dev_root);
+ return btrfs_ioctl_balance(root, NULL);
case BTRFS_IOC_CLONE:
return btrfs_ioctl_clone(file, arg, 0, 0, 0);
case BTRFS_IOC_CLONE_RANGE:
@@ -3110,6 +3415,12 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_scrub_cancel(root, argp);
case BTRFS_IOC_SCRUB_PROGRESS:
return btrfs_ioctl_scrub_progress(root, argp);
+ case BTRFS_IOC_BALANCE_V2:
+ return btrfs_ioctl_balance(root, argp);
+ case BTRFS_IOC_BALANCE_CTL:
+ return btrfs_ioctl_balance_ctl(root, arg);
+ case BTRFS_IOC_BALANCE_PROGRESS:
+ return btrfs_ioctl_balance_progress(root, argp);
}
return -ENOTTY;
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 252ae9915de..4f69028a68c 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -109,6 +109,55 @@ struct btrfs_ioctl_fs_info_args {
__u64 reserved[124]; /* pad to 1k */
};
+/* balance control ioctl modes */
+#define BTRFS_BALANCE_CTL_PAUSE 1
+#define BTRFS_BALANCE_CTL_CANCEL 2
+
+/*
+ * this is packed, because it should be exactly the same as its disk
+ * byte order counterpart (struct btrfs_disk_balance_args)
+ */
+struct btrfs_balance_args {
+ __u64 profiles;
+ __u64 usage;
+ __u64 devid;
+ __u64 pstart;
+ __u64 pend;
+ __u64 vstart;
+ __u64 vend;
+
+ __u64 target;
+
+ __u64 flags;
+
+ __u64 unused[8];
+} __attribute__ ((__packed__));
+
+/* report balance progress to userspace */
+struct btrfs_balance_progress {
+ __u64 expected; /* estimated # of chunks that will be
+ * relocated to fulfill the request */
+ __u64 considered; /* # of chunks we have considered so far */
+ __u64 completed; /* # of chunks relocated so far */
+};
+
+#define BTRFS_BALANCE_STATE_RUNNING (1ULL << 0)
+#define BTRFS_BALANCE_STATE_PAUSE_REQ (1ULL << 1)
+#define BTRFS_BALANCE_STATE_CANCEL_REQ (1ULL << 2)
+
+struct btrfs_ioctl_balance_args {
+ __u64 flags; /* in/out */
+ __u64 state; /* out */
+
+ struct btrfs_balance_args data; /* in/out */
+ struct btrfs_balance_args meta; /* in/out */
+ struct btrfs_balance_args sys; /* in/out */
+
+ struct btrfs_balance_progress stat; /* out */
+
+ __u64 unused[72]; /* pad to 1k */
+};
+
#define BTRFS_INO_LOOKUP_PATH_MAX 4080
struct btrfs_ioctl_ino_lookup_args {
__u64 treeid;
@@ -272,6 +321,11 @@ struct btrfs_ioctl_logical_ino_args {
struct btrfs_ioctl_dev_info_args)
#define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \
struct btrfs_ioctl_fs_info_args)
+#define BTRFS_IOC_BALANCE_V2 _IOWR(BTRFS_IOCTL_MAGIC, 32, \
+ struct btrfs_ioctl_balance_args)
+#define BTRFS_IOC_BALANCE_CTL _IOW(BTRFS_IOCTL_MAGIC, 33, int)
+#define BTRFS_IOC_BALANCE_PROGRESS _IOR(BTRFS_IOCTL_MAGIC, 34, \
+ struct btrfs_ioctl_balance_args)
#define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \
struct btrfs_ioctl_ino_path_args)
#define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index d77b67c4b27..272f911203f 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -33,6 +33,14 @@ void btrfs_assert_tree_read_locked(struct extent_buffer *eb);
*/
void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw)
{
+ if (eb->lock_nested) {
+ read_lock(&eb->lock);
+ if (eb->lock_nested && current->pid == eb->lock_owner) {
+ read_unlock(&eb->lock);
+ return;
+ }
+ read_unlock(&eb->lock);
+ }
if (rw == BTRFS_WRITE_LOCK) {
if (atomic_read(&eb->blocking_writers) == 0) {
WARN_ON(atomic_read(&eb->spinning_writers) != 1);
@@ -57,6 +65,14 @@ void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw)
*/
void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
{
+ if (eb->lock_nested) {
+ read_lock(&eb->lock);
+ if (&eb->lock_nested && current->pid == eb->lock_owner) {
+ read_unlock(&eb->lock);
+ return;
+ }
+ read_unlock(&eb->lock);
+ }
if (rw == BTRFS_WRITE_LOCK_BLOCKING) {
BUG_ON(atomic_read(&eb->blocking_writers) != 1);
write_lock(&eb->lock);
@@ -81,12 +97,25 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
void btrfs_tree_read_lock(struct extent_buffer *eb)
{
again:
+ read_lock(&eb->lock);
+ if (atomic_read(&eb->blocking_writers) &&
+ current->pid == eb->lock_owner) {
+ /*
+ * This extent is already write-locked by our thread. We allow
+ * an additional read lock to be added because it's for the same
+ * thread. btrfs_find_all_roots() depends on this as it may be
+ * called on a partly (write-)locked tree.
+ */
+ BUG_ON(eb->lock_nested);
+ eb->lock_nested = 1;
+ read_unlock(&eb->lock);
+ return;
+ }
+ read_unlock(&eb->lock);
wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0);
read_lock(&eb->lock);
if (atomic_read(&eb->blocking_writers)) {
read_unlock(&eb->lock);
- wait_event(eb->write_lock_wq,
- atomic_read(&eb->blocking_writers) == 0);
goto again;
}
atomic_inc(&eb->read_locks);
@@ -129,6 +158,7 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb)
}
atomic_inc(&eb->write_locks);
atomic_inc(&eb->spinning_writers);
+ eb->lock_owner = current->pid;
return 1;
}
@@ -137,6 +167,15 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb)
*/
void btrfs_tree_read_unlock(struct extent_buffer *eb)
{
+ if (eb->lock_nested) {
+ read_lock(&eb->lock);
+ if (eb->lock_nested && current->pid == eb->lock_owner) {
+ eb->lock_nested = 0;
+ read_unlock(&eb->lock);
+ return;
+ }
+ read_unlock(&eb->lock);
+ }
btrfs_assert_tree_read_locked(eb);
WARN_ON(atomic_read(&eb->spinning_readers) == 0);
atomic_dec(&eb->spinning_readers);
@@ -149,6 +188,15 @@ void btrfs_tree_read_unlock(struct extent_buffer *eb)
*/
void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
{
+ if (eb->lock_nested) {
+ read_lock(&eb->lock);
+ if (eb->lock_nested && current->pid == eb->lock_owner) {
+ eb->lock_nested = 0;
+ read_unlock(&eb->lock);
+ return;
+ }
+ read_unlock(&eb->lock);
+ }
btrfs_assert_tree_read_locked(eb);
WARN_ON(atomic_read(&eb->blocking_readers) == 0);
if (atomic_dec_and_test(&eb->blocking_readers))
@@ -160,7 +208,7 @@ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
* take a spinning write lock. This will wait for both
* blocking readers or writers
*/
-int btrfs_tree_lock(struct extent_buffer *eb)
+void btrfs_tree_lock(struct extent_buffer *eb)
{
again:
wait_event(eb->read_lock_wq, atomic_read(&eb->blocking_readers) == 0);
@@ -181,13 +229,13 @@ again:
WARN_ON(atomic_read(&eb->spinning_writers));
atomic_inc(&eb->spinning_writers);
atomic_inc(&eb->write_locks);
- return 0;
+ eb->lock_owner = current->pid;
}
/*
* drop a spinning or a blocking write lock.
*/
-int btrfs_tree_unlock(struct extent_buffer *eb)
+void btrfs_tree_unlock(struct extent_buffer *eb)
{
int blockers = atomic_read(&eb->blocking_writers);
@@ -206,7 +254,6 @@ int btrfs_tree_unlock(struct extent_buffer *eb)
atomic_dec(&eb->spinning_writers);
write_unlock(&eb->lock);
}
- return 0;
}
void btrfs_assert_tree_locked(struct extent_buffer *eb)
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index 17247ddb81a..ca52681e5f4 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -24,8 +24,8 @@
#define BTRFS_WRITE_LOCK_BLOCKING 3
#define BTRFS_READ_LOCK_BLOCKING 4
-int btrfs_tree_lock(struct extent_buffer *eb);
-int btrfs_tree_unlock(struct extent_buffer *eb);
+void btrfs_tree_lock(struct extent_buffer *eb);
+void btrfs_tree_unlock(struct extent_buffer *eb);
int btrfs_try_spin_lock(struct extent_buffer *eb);
void btrfs_tree_read_lock(struct extent_buffer *eb);
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index a178f5ebea7..743b86fa4fc 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -411,9 +411,9 @@ static int lzo_decompress(struct list_head *ws, unsigned char *data_in,
bytes = min_t(unsigned long, destlen, out_len - start_byte);
- kaddr = kmap_atomic(dest_page, KM_USER0);
+ kaddr = kmap_atomic(dest_page);
memcpy(kaddr, workspace->buf + start_byte, bytes);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
out:
return ret;
}
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index a1c94042530..bbf6d0d9aeb 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -59,6 +59,14 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
return NULL;
}
+static void ordered_data_tree_panic(struct inode *inode, int errno,
+ u64 offset)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ btrfs_panic(fs_info, errno, "Inconsistency in ordered tree at offset "
+ "%llu\n", (unsigned long long)offset);
+}
+
/*
* look for a given offset in the tree, and if it can't be found return the
* first lesser offset
@@ -207,7 +215,8 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
spin_lock(&tree->lock);
node = tree_insert(&tree->tree, file_offset,
&entry->rb_node);
- BUG_ON(node);
+ if (node)
+ ordered_data_tree_panic(inode, -EEXIST, file_offset);
spin_unlock(&tree->lock);
spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
@@ -215,7 +224,6 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
&BTRFS_I(inode)->root->fs_info->ordered_extents);
spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
- BUG_ON(node);
return 0;
}
@@ -249,9 +257,9 @@ int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
* when an ordered extent is finished. If the list covers more than one
* ordered extent, it is split across multiples.
*/
-int btrfs_add_ordered_sum(struct inode *inode,
- struct btrfs_ordered_extent *entry,
- struct btrfs_ordered_sum *sum)
+void btrfs_add_ordered_sum(struct inode *inode,
+ struct btrfs_ordered_extent *entry,
+ struct btrfs_ordered_sum *sum)
{
struct btrfs_ordered_inode_tree *tree;
@@ -259,7 +267,6 @@ int btrfs_add_ordered_sum(struct inode *inode,
spin_lock(&tree->lock);
list_add_tail(&sum->list, &entry->list);
spin_unlock(&tree->lock);
- return 0;
}
/*
@@ -384,7 +391,7 @@ out:
* used to drop a reference on an ordered extent. This will free
* the extent if the last reference is dropped
*/
-int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
+void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
{
struct list_head *cur;
struct btrfs_ordered_sum *sum;
@@ -400,7 +407,6 @@ int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
}
kfree(entry);
}
- return 0;
}
/*
@@ -408,8 +414,8 @@ int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
* and you must wake_up entry->wait. You must hold the tree lock
* while you call this function.
*/
-static int __btrfs_remove_ordered_extent(struct inode *inode,
- struct btrfs_ordered_extent *entry)
+static void __btrfs_remove_ordered_extent(struct inode *inode,
+ struct btrfs_ordered_extent *entry)
{
struct btrfs_ordered_inode_tree *tree;
struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -436,35 +442,30 @@ static int __btrfs_remove_ordered_extent(struct inode *inode,
list_del_init(&BTRFS_I(inode)->ordered_operations);
}
spin_unlock(&root->fs_info->ordered_extent_lock);
-
- return 0;
}
/*
* remove an ordered extent from the tree. No references are dropped
* but any waiters are woken.
*/
-int btrfs_remove_ordered_extent(struct inode *inode,
- struct btrfs_ordered_extent *entry)
+void btrfs_remove_ordered_extent(struct inode *inode,
+ struct btrfs_ordered_extent *entry)
{
struct btrfs_ordered_inode_tree *tree;
- int ret;
tree = &BTRFS_I(inode)->ordered_tree;
spin_lock(&tree->lock);
- ret = __btrfs_remove_ordered_extent(inode, entry);
+ __btrfs_remove_ordered_extent(inode, entry);
spin_unlock(&tree->lock);
wake_up(&entry->wait);
-
- return ret;
}
/*
* wait for all the ordered extents in a root. This is done when balancing
* space between drives.
*/
-int btrfs_wait_ordered_extents(struct btrfs_root *root,
- int nocow_only, int delay_iput)
+void btrfs_wait_ordered_extents(struct btrfs_root *root,
+ int nocow_only, int delay_iput)
{
struct list_head splice;
struct list_head *cur;
@@ -512,7 +513,6 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root,
spin_lock(&root->fs_info->ordered_extent_lock);
}
spin_unlock(&root->fs_info->ordered_extent_lock);
- return 0;
}
/*
@@ -525,7 +525,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root,
* extra check to make sure the ordered operation list really is empty
* before we return
*/
-int btrfs_run_ordered_operations(struct btrfs_root *root, int wait)
+void btrfs_run_ordered_operations(struct btrfs_root *root, int wait)
{
struct btrfs_inode *btrfs_inode;
struct inode *inode;
@@ -573,8 +573,6 @@ again:
spin_unlock(&root->fs_info->ordered_extent_lock);
mutex_unlock(&root->fs_info->ordered_operations_mutex);
-
- return 0;
}
/*
@@ -609,7 +607,7 @@ void btrfs_start_ordered_extent(struct inode *inode,
/*
* Used to wait on ordered extents across a large range of bytes.
*/
-int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
+void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
{
u64 end;
u64 orig_end;
@@ -664,7 +662,6 @@ again:
schedule_timeout(1);
goto again;
}
- return 0;
}
/*
@@ -948,9 +945,8 @@ out:
* If trans is not null, we'll do a friendly check for a transaction that
* is already flushing things and force the IO down ourselves.
*/
-int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct inode *inode)
+void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode)
{
u64 last_mod;
@@ -961,7 +957,7 @@ int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
* commit, we can safely return without doing anything
*/
if (last_mod < root->fs_info->last_trans_committed)
- return 0;
+ return;
/*
* the transaction is already committing. Just start the IO and
@@ -969,7 +965,7 @@ int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
*/
if (trans && root->fs_info->running_transaction->blocked) {
btrfs_wait_ordered_range(inode, 0, (u64)-1);
- return 0;
+ return;
}
spin_lock(&root->fs_info->ordered_extent_lock);
@@ -978,6 +974,4 @@ int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
&root->fs_info->ordered_operations);
}
spin_unlock(&root->fs_info->ordered_extent_lock);
-
- return 0;
}
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index ff1f69aa188..c355ad4dc1a 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -138,8 +138,8 @@ btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t)
t->last = NULL;
}
-int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry);
-int btrfs_remove_ordered_extent(struct inode *inode,
+void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry);
+void btrfs_remove_ordered_extent(struct inode *inode,
struct btrfs_ordered_extent *entry);
int btrfs_dec_test_ordered_pending(struct inode *inode,
struct btrfs_ordered_extent **cached,
@@ -154,14 +154,14 @@ int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
u64 start, u64 len, u64 disk_len,
int type, int compress_type);
-int btrfs_add_ordered_sum(struct inode *inode,
- struct btrfs_ordered_extent *entry,
- struct btrfs_ordered_sum *sum);
+void btrfs_add_ordered_sum(struct inode *inode,
+ struct btrfs_ordered_extent *entry,
+ struct btrfs_ordered_sum *sum);
struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
u64 file_offset);
void btrfs_start_ordered_extent(struct inode *inode,
struct btrfs_ordered_extent *entry, int wait);
-int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
+void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
struct btrfs_ordered_extent *
btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
@@ -170,10 +170,10 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
struct btrfs_ordered_extent *ordered);
int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
-int btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
-int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct inode *inode);
-int btrfs_wait_ordered_extents(struct btrfs_root *root,
- int nocow_only, int delay_iput);
+void btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
+void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *inode);
+void btrfs_wait_ordered_extents(struct btrfs_root *root,
+ int nocow_only, int delay_iput);
#endif
diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c
index f8be250963a..24cad1695af 100644
--- a/fs/btrfs/orphan.c
+++ b/fs/btrfs/orphan.c
@@ -58,7 +58,7 @@ int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret < 0)
goto out;
- if (ret) {
+ if (ret) { /* JDM: Really? */
ret = -ENOENT;
goto out;
}
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index 2373b39a132..dc5d33146fd 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -54,7 +54,6 @@
* than the 2 started one after another.
*/
-#define MAX_MIRRORS 2
#define MAX_IN_FLIGHT 6
struct reada_extctl {
@@ -71,7 +70,7 @@ struct reada_extent {
struct list_head extctl;
struct kref refcnt;
spinlock_t lock;
- struct reada_zone *zones[MAX_MIRRORS];
+ struct reada_zone *zones[BTRFS_MAX_MIRRORS];
int nzones;
struct btrfs_device *scheduled_for;
};
@@ -84,7 +83,8 @@ struct reada_zone {
spinlock_t lock;
int locked;
struct btrfs_device *device;
- struct btrfs_device *devs[MAX_MIRRORS]; /* full list, incl self */
+ struct btrfs_device *devs[BTRFS_MAX_MIRRORS]; /* full list, incl
+ * self */
int ndevs;
struct kref refcnt;
};
@@ -305,7 +305,7 @@ again:
spin_lock(&fs_info->reada_lock);
ret = radix_tree_insert(&dev->reada_zones,
- (unsigned long)zone->end >> PAGE_CACHE_SHIFT,
+ (unsigned long)(zone->end >> PAGE_CACHE_SHIFT),
zone);
spin_unlock(&fs_info->reada_lock);
@@ -365,9 +365,9 @@ again:
if (ret || !bbio || length < blocksize)
goto error;
- if (bbio->num_stripes > MAX_MIRRORS) {
+ if (bbio->num_stripes > BTRFS_MAX_MIRRORS) {
printk(KERN_ERR "btrfs readahead: more than %d copies not "
- "supported", MAX_MIRRORS);
+ "supported", BTRFS_MAX_MIRRORS);
goto error;
}
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index cfb55434a46..017281dbb2a 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -326,6 +326,19 @@ static struct rb_node *tree_search(struct rb_root *root, u64 bytenr)
return NULL;
}
+void backref_tree_panic(struct rb_node *rb_node, int errno,
+ u64 bytenr)
+{
+
+ struct btrfs_fs_info *fs_info = NULL;
+ struct backref_node *bnode = rb_entry(rb_node, struct backref_node,
+ rb_node);
+ if (bnode->root)
+ fs_info = bnode->root->fs_info;
+ btrfs_panic(fs_info, errno, "Inconsistency in backref cache "
+ "found at offset %llu\n", (unsigned long long)bytenr);
+}
+
/*
* walk up backref nodes until reach node presents tree root
*/
@@ -452,7 +465,8 @@ static void update_backref_node(struct backref_cache *cache,
rb_erase(&node->rb_node, &cache->rb_root);
node->bytenr = bytenr;
rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node);
- BUG_ON(rb_node);
+ if (rb_node)
+ backref_tree_panic(rb_node, -EEXIST, bytenr);
}
/*
@@ -999,7 +1013,8 @@ next:
if (!cowonly) {
rb_node = tree_insert(&cache->rb_root, node->bytenr,
&node->rb_node);
- BUG_ON(rb_node);
+ if (rb_node)
+ backref_tree_panic(rb_node, -EEXIST, node->bytenr);
list_add_tail(&node->lower, &cache->leaves);
}
@@ -1034,7 +1049,9 @@ next:
if (!cowonly) {
rb_node = tree_insert(&cache->rb_root, upper->bytenr,
&upper->rb_node);
- BUG_ON(rb_node);
+ if (rb_node)
+ backref_tree_panic(rb_node, -EEXIST,
+ upper->bytenr);
}
list_add_tail(&edge->list[UPPER], &upper->lower);
@@ -1180,7 +1197,8 @@ static int clone_backref_node(struct btrfs_trans_handle *trans,
rb_node = tree_insert(&cache->rb_root, new_node->bytenr,
&new_node->rb_node);
- BUG_ON(rb_node);
+ if (rb_node)
+ backref_tree_panic(rb_node, -EEXIST, new_node->bytenr);
if (!new_node->lowest) {
list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) {
@@ -1203,14 +1221,15 @@ fail:
/*
* helper to add 'address of tree root -> reloc tree' mapping
*/
-static int __add_reloc_root(struct btrfs_root *root)
+static int __must_check __add_reloc_root(struct btrfs_root *root)
{
struct rb_node *rb_node;
struct mapping_node *node;
struct reloc_control *rc = root->fs_info->reloc_ctl;
node = kmalloc(sizeof(*node), GFP_NOFS);
- BUG_ON(!node);
+ if (!node)
+ return -ENOMEM;
node->bytenr = root->node->start;
node->data = root;
@@ -1219,7 +1238,12 @@ static int __add_reloc_root(struct btrfs_root *root)
rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
node->bytenr, &node->rb_node);
spin_unlock(&rc->reloc_root_tree.lock);
- BUG_ON(rb_node);
+ if (rb_node) {
+ kfree(node);
+ btrfs_panic(root->fs_info, -EEXIST, "Duplicate root found "
+ "for start=%llu while inserting into relocation "
+ "tree\n");
+ }
list_add_tail(&root->root_list, &rc->reloc_roots);
return 0;
@@ -1252,7 +1276,8 @@ static int __update_reloc_root(struct btrfs_root *root, int del)
rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
node->bytenr, &node->rb_node);
spin_unlock(&rc->reloc_root_tree.lock);
- BUG_ON(rb_node);
+ if (rb_node)
+ backref_tree_panic(rb_node, -EEXIST, node->bytenr);
} else {
list_del_init(&root->root_list);
kfree(node);
@@ -1334,6 +1359,7 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
struct btrfs_root *reloc_root;
struct reloc_control *rc = root->fs_info->reloc_ctl;
int clear_rsv = 0;
+ int ret;
if (root->reloc_root) {
reloc_root = root->reloc_root;
@@ -1353,7 +1379,8 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
if (clear_rsv)
trans->block_rsv = NULL;
- __add_reloc_root(reloc_root);
+ ret = __add_reloc_root(reloc_root);
+ BUG_ON(ret < 0);
root->reloc_root = reloc_root;
return 0;
}
@@ -1577,15 +1604,14 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
WARN_ON(!IS_ALIGNED(end, root->sectorsize));
end--;
ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
- key.offset, end,
- GFP_NOFS);
+ key.offset, end);
if (!ret)
continue;
btrfs_drop_extent_cache(inode, key.offset, end,
1);
unlock_extent(&BTRFS_I(inode)->io_tree,
- key.offset, end, GFP_NOFS);
+ key.offset, end);
}
}
@@ -1604,12 +1630,12 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
ret = btrfs_inc_extent_ref(trans, root, new_bytenr,
num_bytes, parent,
btrfs_header_owner(leaf),
- key.objectid, key.offset);
+ key.objectid, key.offset, 1);
BUG_ON(ret);
ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
parent, btrfs_header_owner(leaf),
- key.objectid, key.offset);
+ key.objectid, key.offset, 1);
BUG_ON(ret);
}
if (dirty)
@@ -1778,21 +1804,23 @@ again:
ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize,
path->nodes[level]->start,
- src->root_key.objectid, level - 1, 0);
+ src->root_key.objectid, level - 1, 0,
+ 1);
BUG_ON(ret);
ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize,
0, dest->root_key.objectid, level - 1,
- 0);
+ 0, 1);
BUG_ON(ret);
ret = btrfs_free_extent(trans, src, new_bytenr, blocksize,
path->nodes[level]->start,
- src->root_key.objectid, level - 1, 0);
+ src->root_key.objectid, level - 1, 0,
+ 1);
BUG_ON(ret);
ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize,
0, dest->root_key.objectid, level - 1,
- 0);
+ 0, 1);
BUG_ON(ret);
btrfs_unlock_up_safe(path, 0);
@@ -1954,9 +1982,9 @@ static int invalidate_extent_cache(struct btrfs_root *root,
}
/* the lock_extent waits for readpage to complete */
- lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
+ lock_extent(&BTRFS_I(inode)->io_tree, start, end);
btrfs_drop_extent_cache(inode, start, end, 1);
- unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
+ unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
}
return 0;
}
@@ -2244,7 +2272,8 @@ again:
} else {
list_del_init(&reloc_root->root_list);
}
- btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0);
+ ret = btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1);
+ BUG_ON(ret < 0);
}
if (found) {
@@ -2558,7 +2587,7 @@ static int do_relocation(struct btrfs_trans_handle *trans,
node->eb->start, blocksize,
upper->eb->start,
btrfs_header_owner(upper->eb),
- node->level, 0);
+ node->level, 0, 1);
BUG_ON(ret);
ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
@@ -2860,12 +2889,12 @@ int prealloc_file_extent_cluster(struct inode *inode,
else
end = cluster->end - offset;
- lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
+ lock_extent(&BTRFS_I(inode)->io_tree, start, end);
num_bytes = end + 1 - start;
ret = btrfs_prealloc_file_range(inode, 0, start,
num_bytes, num_bytes,
end + 1, &alloc_hint);
- unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
+ unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
if (ret)
break;
nr++;
@@ -2897,7 +2926,7 @@ int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
em->bdev = root->fs_info->fs_devices->latest_bdev;
set_bit(EXTENT_FLAG_PINNED, &em->flags);
- lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
+ lock_extent(&BTRFS_I(inode)->io_tree, start, end);
while (1) {
write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em);
@@ -2908,7 +2937,7 @@ int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
}
btrfs_drop_extent_cache(inode, start, end, 0);
}
- unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
+ unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
return ret;
}
@@ -2947,9 +2976,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
while (index <= last_index) {
- mutex_lock(&inode->i_mutex);
ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE);
- mutex_unlock(&inode->i_mutex);
if (ret)
goto out;
@@ -2990,8 +3017,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
page_start = (u64)page->index << PAGE_CACHE_SHIFT;
page_end = page_start + PAGE_CACHE_SIZE - 1;
- lock_extent(&BTRFS_I(inode)->io_tree,
- page_start, page_end, GFP_NOFS);
+ lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end);
set_page_extent_mapped(page);
@@ -3007,7 +3033,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
set_page_dirty(page);
unlock_extent(&BTRFS_I(inode)->io_tree,
- page_start, page_end, GFP_NOFS);
+ page_start, page_end);
unlock_page(page);
page_cache_release(page);
@@ -3154,7 +3180,8 @@ static int add_tree_block(struct reloc_control *rc,
block->key_ready = 0;
rb_node = tree_insert(blocks, block->bytenr, &block->rb_node);
- BUG_ON(rb_node);
+ if (rb_node)
+ backref_tree_panic(rb_node, -EEXIST, block->bytenr);
return 0;
}
@@ -3426,7 +3453,9 @@ static int find_data_references(struct reloc_control *rc,
block->key_ready = 1;
rb_node = tree_insert(blocks, block->bytenr,
&block->rb_node);
- BUG_ON(rb_node);
+ if (rb_node)
+ backref_tree_panic(rb_node, -EEXIST,
+ block->bytenr);
}
if (counted)
added = 1;
@@ -4073,10 +4102,11 @@ out:
static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
{
struct btrfs_trans_handle *trans;
- int ret;
+ int ret, err;
trans = btrfs_start_transaction(root->fs_info->tree_root, 0);
- BUG_ON(IS_ERR(trans));
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
memset(&root->root_item.drop_progress, 0,
sizeof(root->root_item.drop_progress));
@@ -4084,11 +4114,11 @@ static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
btrfs_set_root_refs(&root->root_item, 0);
ret = btrfs_update_root(trans, root->fs_info->tree_root,
&root->root_key, &root->root_item);
- BUG_ON(ret);
- ret = btrfs_end_transaction(trans, root->fs_info->tree_root);
- BUG_ON(ret);
- return 0;
+ err = btrfs_end_transaction(trans, root->fs_info->tree_root);
+ if (err)
+ return err;
+ return ret;
}
/*
@@ -4156,7 +4186,11 @@ int btrfs_recover_relocation(struct btrfs_root *root)
err = ret;
goto out;
}
- mark_garbage_root(reloc_root);
+ ret = mark_garbage_root(reloc_root);
+ if (ret < 0) {
+ err = ret;
+ goto out;
+ }
}
}
@@ -4202,13 +4236,19 @@ int btrfs_recover_relocation(struct btrfs_root *root)
fs_root = read_fs_root(root->fs_info,
reloc_root->root_key.offset);
- BUG_ON(IS_ERR(fs_root));
+ if (IS_ERR(fs_root)) {
+ err = PTR_ERR(fs_root);
+ goto out_free;
+ }
- __add_reloc_root(reloc_root);
+ err = __add_reloc_root(reloc_root);
+ BUG_ON(err < 0); /* -ENOMEM or logic error */
fs_root->reloc_root = reloc_root;
}
- btrfs_commit_transaction(trans, rc->extent_root);
+ err = btrfs_commit_transaction(trans, rc->extent_root);
+ if (err)
+ goto out_free;
merge_reloc_roots(rc);
@@ -4218,7 +4258,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
if (IS_ERR(trans))
err = PTR_ERR(trans);
else
- btrfs_commit_transaction(trans, rc->extent_root);
+ err = btrfs_commit_transaction(trans, rc->extent_root);
out_free:
kfree(rc);
out:
@@ -4267,6 +4307,8 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr,
disk_bytenr + len - 1, &list, 0);
+ if (ret)
+ goto out;
while (!list_empty(&list)) {
sums = list_entry(list.next, struct btrfs_ordered_sum, list);
@@ -4284,6 +4326,7 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
btrfs_add_ordered_sum(inode, ordered, sums);
}
+out:
btrfs_put_ordered_extent(ordered);
return ret;
}
@@ -4380,7 +4423,7 @@ void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans,
* called after snapshot is created. migrate block reservation
* and create reloc root for the newly created snapshot
*/
-void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
+int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
struct btrfs_pending_snapshot *pending)
{
struct btrfs_root *root = pending->root;
@@ -4390,7 +4433,7 @@ void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
int ret;
if (!root->reloc_root)
- return;
+ return 0;
rc = root->fs_info->reloc_ctl;
rc->merging_rsv_size += rc->nodes_relocated;
@@ -4399,18 +4442,21 @@ void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
ret = btrfs_block_rsv_migrate(&pending->block_rsv,
rc->block_rsv,
rc->nodes_relocated);
- BUG_ON(ret);
+ if (ret)
+ return ret;
}
new_root = pending->snap;
reloc_root = create_reloc_root(trans, root->reloc_root,
new_root->root_key.objectid);
+ if (IS_ERR(reloc_root))
+ return PTR_ERR(reloc_root);
- __add_reloc_root(reloc_root);
+ ret = __add_reloc_root(reloc_root);
+ BUG_ON(ret < 0);
new_root->reloc_root = reloc_root;
- if (rc->create_reloc_tree) {
+ if (rc->create_reloc_tree)
ret = clone_backref_node(trans, rc, root, reloc_root);
- BUG_ON(ret);
- }
+ return ret;
}
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index f4099904565..24fb8ce4e07 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -93,10 +93,14 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
unsigned long ptr;
path = btrfs_alloc_path();
- BUG_ON(!path);
+ if (!path)
+ return -ENOMEM;
+
ret = btrfs_search_slot(trans, root, key, path, 0, 1);
- if (ret < 0)
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, root, ret);
goto out;
+ }
if (ret != 0) {
btrfs_print_leaf(root, path->nodes[0]);
@@ -116,13 +120,10 @@ out:
return ret;
}
-int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root
- *root, struct btrfs_key *key, struct btrfs_root_item
- *item)
+int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ struct btrfs_key *key, struct btrfs_root_item *item)
{
- int ret;
- ret = btrfs_insert_item(trans, root, key, item, sizeof(*item));
- return ret;
+ return btrfs_insert_item(trans, root, key, item, sizeof(*item));
}
/*
@@ -384,6 +385,8 @@ int btrfs_find_root_ref(struct btrfs_root *tree_root,
*
* For a back ref the root_id is the id of the subvol or snapshot and
* ref_id is the id of the tree referencing it.
+ *
+ * Will return 0, -ENOMEM, or anything from the CoW path
*/
int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *tree_root,
@@ -407,7 +410,11 @@ int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
again:
ret = btrfs_insert_empty_item(trans, tree_root, path, &key,
sizeof(*ref) + name_len);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, tree_root, ret);
+ btrfs_free_path(path);
+ return ret;
+ }
leaf = path->nodes[0];
ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index ddf2c90d3fc..90acc82046c 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -25,6 +25,7 @@
#include "transaction.h"
#include "backref.h"
#include "extent_io.h"
+#include "check-integrity.h"
/*
* This is only the first step towards a full-features scrub. It reads all
@@ -35,37 +36,30 @@
* Future enhancements:
* - In case an unrepairable extent is encountered, track which files are
* affected and report them
- * - In case of a read error on files with nodatasum, map the file and read
- * the extent to trigger a writeback of the good copy
* - track and record media errors, throw out bad devices
* - add a mode to also read unallocated space
*/
-struct scrub_bio;
-struct scrub_page;
+struct scrub_block;
struct scrub_dev;
-static void scrub_bio_end_io(struct bio *bio, int err);
-static void scrub_checksum(struct btrfs_work *work);
-static int scrub_checksum_data(struct scrub_dev *sdev,
- struct scrub_page *spag, void *buffer);
-static int scrub_checksum_tree_block(struct scrub_dev *sdev,
- struct scrub_page *spag, u64 logical,
- void *buffer);
-static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer);
-static int scrub_fixup_check(struct scrub_bio *sbio, int ix);
-static void scrub_fixup_end_io(struct bio *bio, int err);
-static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
- struct page *page);
-static void scrub_fixup(struct scrub_bio *sbio, int ix);
#define SCRUB_PAGES_PER_BIO 16 /* 64k per bio */
#define SCRUB_BIOS_PER_DEV 16 /* 1 MB per device in flight */
+#define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */
struct scrub_page {
+ struct scrub_block *sblock;
+ struct page *page;
+ struct block_device *bdev;
u64 flags; /* extent flags */
u64 generation;
- int mirror_num;
- int have_csum;
+ u64 logical;
+ u64 physical;
+ struct {
+ unsigned int mirror_num:8;
+ unsigned int have_csum:1;
+ unsigned int io_error:1;
+ };
u8 csum[BTRFS_CSUM_SIZE];
};
@@ -76,12 +70,25 @@ struct scrub_bio {
int err;
u64 logical;
u64 physical;
- struct scrub_page spag[SCRUB_PAGES_PER_BIO];
- u64 count;
+ struct scrub_page *pagev[SCRUB_PAGES_PER_BIO];
+ int page_count;
int next_free;
struct btrfs_work work;
};
+struct scrub_block {
+ struct scrub_page pagev[SCRUB_MAX_PAGES_PER_BLOCK];
+ int page_count;
+ atomic_t outstanding_pages;
+ atomic_t ref_count; /* free mem on transition to zero */
+ struct scrub_dev *sdev;
+ struct {
+ unsigned int header_error:1;
+ unsigned int checksum_error:1;
+ unsigned int no_io_error_seen:1;
+ };
+};
+
struct scrub_dev {
struct scrub_bio *bios[SCRUB_BIOS_PER_DEV];
struct btrfs_device *dev;
@@ -95,6 +102,10 @@ struct scrub_dev {
struct list_head csum_list;
atomic_t cancel_req;
int readonly;
+ int pages_per_bio; /* <= SCRUB_PAGES_PER_BIO */
+ u32 sectorsize;
+ u32 nodesize;
+ u32 leafsize;
/*
* statistics
*/
@@ -123,6 +134,43 @@ struct scrub_warning {
int scratch_bufsize;
};
+
+static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
+static int scrub_setup_recheck_block(struct scrub_dev *sdev,
+ struct btrfs_mapping_tree *map_tree,
+ u64 length, u64 logical,
+ struct scrub_block *sblock);
+static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
+ struct scrub_block *sblock, int is_metadata,
+ int have_csum, u8 *csum, u64 generation,
+ u16 csum_size);
+static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
+ struct scrub_block *sblock,
+ int is_metadata, int have_csum,
+ const u8 *csum, u64 generation,
+ u16 csum_size);
+static void scrub_complete_bio_end_io(struct bio *bio, int err);
+static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
+ struct scrub_block *sblock_good,
+ int force_write);
+static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
+ struct scrub_block *sblock_good,
+ int page_num, int force_write);
+static int scrub_checksum_data(struct scrub_block *sblock);
+static int scrub_checksum_tree_block(struct scrub_block *sblock);
+static int scrub_checksum_super(struct scrub_block *sblock);
+static void scrub_block_get(struct scrub_block *sblock);
+static void scrub_block_put(struct scrub_block *sblock);
+static int scrub_add_page_to_bio(struct scrub_dev *sdev,
+ struct scrub_page *spage);
+static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len,
+ u64 physical, u64 flags, u64 gen, int mirror_num,
+ u8 *csum, int force);
+static void scrub_bio_end_io(struct bio *bio, int err);
+static void scrub_bio_end_io_worker(struct btrfs_work *work);
+static void scrub_block_complete(struct scrub_block *sblock);
+
+
static void scrub_free_csums(struct scrub_dev *sdev)
{
while (!list_empty(&sdev->csum_list)) {
@@ -134,23 +182,6 @@ static void scrub_free_csums(struct scrub_dev *sdev)
}
}
-static void scrub_free_bio(struct bio *bio)
-{
- int i;
- struct page *last_page = NULL;
-
- if (!bio)
- return;
-
- for (i = 0; i < bio->bi_vcnt; ++i) {
- if (bio->bi_io_vec[i].bv_page == last_page)
- continue;
- last_page = bio->bi_io_vec[i].bv_page;
- __free_page(last_page);
- }
- bio_put(bio);
-}
-
static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev)
{
int i;
@@ -158,13 +189,23 @@ static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev)
if (!sdev)
return;
+ /* this can happen when scrub is cancelled */
+ if (sdev->curr != -1) {
+ struct scrub_bio *sbio = sdev->bios[sdev->curr];
+
+ for (i = 0; i < sbio->page_count; i++) {
+ BUG_ON(!sbio->pagev[i]);
+ BUG_ON(!sbio->pagev[i]->page);
+ scrub_block_put(sbio->pagev[i]->sblock);
+ }
+ bio_put(sbio->bio);
+ }
+
for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
struct scrub_bio *sbio = sdev->bios[i];
if (!sbio)
break;
-
- scrub_free_bio(sbio->bio);
kfree(sbio);
}
@@ -178,11 +219,16 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
struct scrub_dev *sdev;
int i;
struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
+ int pages_per_bio;
+ pages_per_bio = min_t(int, SCRUB_PAGES_PER_BIO,
+ bio_get_nr_vecs(dev->bdev));
sdev = kzalloc(sizeof(*sdev), GFP_NOFS);
if (!sdev)
goto nomem;
sdev->dev = dev;
+ sdev->pages_per_bio = pages_per_bio;
+ sdev->curr = -1;
for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
struct scrub_bio *sbio;
@@ -193,8 +239,8 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
sbio->index = i;
sbio->sdev = sdev;
- sbio->count = 0;
- sbio->work.func = scrub_checksum;
+ sbio->page_count = 0;
+ sbio->work.func = scrub_bio_end_io_worker;
if (i != SCRUB_BIOS_PER_DEV-1)
sdev->bios[i]->next_free = i + 1;
@@ -202,7 +248,9 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
sdev->bios[i]->next_free = -1;
}
sdev->first_free = 0;
- sdev->curr = -1;
+ sdev->nodesize = dev->dev_root->nodesize;
+ sdev->leafsize = dev->dev_root->leafsize;
+ sdev->sectorsize = dev->dev_root->sectorsize;
atomic_set(&sdev->in_flight, 0);
atomic_set(&sdev->fixup_cnt, 0);
atomic_set(&sdev->cancel_req, 0);
@@ -293,10 +341,9 @@ err:
return 0;
}
-static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio,
- int ix)
+static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
{
- struct btrfs_device *dev = sbio->sdev->dev;
+ struct btrfs_device *dev = sblock->sdev->dev;
struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
struct btrfs_path *path;
struct btrfs_key found_key;
@@ -309,14 +356,15 @@ static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio,
u8 ref_level;
unsigned long ptr = 0;
const int bufsize = 4096;
- u64 extent_offset;
+ u64 extent_item_pos;
path = btrfs_alloc_path();
swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS);
swarn.msg_buf = kmalloc(bufsize, GFP_NOFS);
- swarn.sector = (sbio->physical + ix * PAGE_SIZE) >> 9;
- swarn.logical = sbio->logical + ix * PAGE_SIZE;
+ BUG_ON(sblock->page_count < 1);
+ swarn.sector = (sblock->pagev[0].physical) >> 9;
+ swarn.logical = sblock->pagev[0].logical;
swarn.errstr = errstr;
swarn.dev = dev;
swarn.msg_bufsize = bufsize;
@@ -329,18 +377,20 @@ static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio,
if (ret < 0)
goto out;
- extent_offset = swarn.logical - found_key.objectid;
+ extent_item_pos = swarn.logical - found_key.objectid;
swarn.extent_item_size = found_key.offset;
eb = path->nodes[0];
ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
item_size = btrfs_item_size_nr(eb, path->slots[0]);
+ btrfs_release_path(path);
if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
do {
ret = tree_backref_for_extent(&ptr, eb, ei, item_size,
&ref_root, &ref_level);
- printk(KERN_WARNING "%s at logical %llu on dev %s, "
+ printk(KERN_WARNING
+ "btrfs: %s at logical %llu on dev %s, "
"sector %llu: metadata %s (level %d) in tree "
"%llu\n", errstr, swarn.logical, dev->name,
(unsigned long long)swarn.sector,
@@ -350,8 +400,8 @@ static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio,
} while (ret != 1);
} else {
swarn.path = path;
- iterate_extent_inodes(fs_info, path, found_key.objectid,
- extent_offset,
+ iterate_extent_inodes(fs_info, found_key.objectid,
+ extent_item_pos, 1,
scrub_print_warning_inode, &swarn);
}
@@ -529,9 +579,9 @@ out:
spin_lock(&sdev->stat_lock);
++sdev->stat.uncorrectable_errors;
spin_unlock(&sdev->stat_lock);
- printk_ratelimited(KERN_ERR "btrfs: unable to fixup "
- "(nodatasum) error at logical %llu\n",
- fixup->logical);
+ printk_ratelimited(KERN_ERR
+ "btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n",
+ (unsigned long long)fixup->logical, sdev->dev->name);
}
btrfs_free_path(path);
@@ -548,91 +598,168 @@ out:
}
/*
- * scrub_recheck_error gets called when either verification of the page
- * failed or the bio failed to read, e.g. with EIO. In the latter case,
- * recheck_error gets called for every page in the bio, even though only
- * one may be bad
+ * scrub_handle_errored_block gets called when either verification of the
+ * pages failed or the bio failed to read, e.g. with EIO. In the latter
+ * case, this function handles all pages in the bio, even though only one
+ * may be bad.
+ * The goal of this function is to repair the errored block by using the
+ * contents of one of the mirrors.
*/
-static int scrub_recheck_error(struct scrub_bio *sbio, int ix)
+static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
{
- struct scrub_dev *sdev = sbio->sdev;
- u64 sector = (sbio->physical + ix * PAGE_SIZE) >> 9;
+ struct scrub_dev *sdev = sblock_to_check->sdev;
+ struct btrfs_fs_info *fs_info;
+ u64 length;
+ u64 logical;
+ u64 generation;
+ unsigned int failed_mirror_index;
+ unsigned int is_metadata;
+ unsigned int have_csum;
+ u8 *csum;
+ struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
+ struct scrub_block *sblock_bad;
+ int ret;
+ int mirror_index;
+ int page_num;
+ int success;
static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
- DEFAULT_RATELIMIT_BURST);
+ DEFAULT_RATELIMIT_BURST);
+
+ BUG_ON(sblock_to_check->page_count < 1);
+ fs_info = sdev->dev->dev_root->fs_info;
+ length = sblock_to_check->page_count * PAGE_SIZE;
+ logical = sblock_to_check->pagev[0].logical;
+ generation = sblock_to_check->pagev[0].generation;
+ BUG_ON(sblock_to_check->pagev[0].mirror_num < 1);
+ failed_mirror_index = sblock_to_check->pagev[0].mirror_num - 1;
+ is_metadata = !(sblock_to_check->pagev[0].flags &
+ BTRFS_EXTENT_FLAG_DATA);
+ have_csum = sblock_to_check->pagev[0].have_csum;
+ csum = sblock_to_check->pagev[0].csum;
- if (sbio->err) {
- if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, sector,
- sbio->bio->bi_io_vec[ix].bv_page) == 0) {
- if (scrub_fixup_check(sbio, ix) == 0)
- return 0;
- }
- if (__ratelimit(&_rs))
- scrub_print_warning("i/o error", sbio, ix);
- } else {
- if (__ratelimit(&_rs))
- scrub_print_warning("checksum error", sbio, ix);
+ /*
+ * read all mirrors one after the other. This includes to
+ * re-read the extent or metadata block that failed (that was
+ * the cause that this fixup code is called) another time,
+ * page by page this time in order to know which pages
+ * caused I/O errors and which ones are good (for all mirrors).
+ * It is the goal to handle the situation when more than one
+ * mirror contains I/O errors, but the errors do not
+ * overlap, i.e. the data can be repaired by selecting the
+ * pages from those mirrors without I/O error on the
+ * particular pages. One example (with blocks >= 2 * PAGE_SIZE)
+ * would be that mirror #1 has an I/O error on the first page,
+ * the second page is good, and mirror #2 has an I/O error on
+ * the second page, but the first page is good.
+ * Then the first page of the first mirror can be repaired by
+ * taking the first page of the second mirror, and the
+ * second page of the second mirror can be repaired by
+ * copying the contents of the 2nd page of the 1st mirror.
+ * One more note: if the pages of one mirror contain I/O
+ * errors, the checksum cannot be verified. In order to get
+ * the best data for repairing, the first attempt is to find
+ * a mirror without I/O errors and with a validated checksum.
+ * Only if this is not possible, the pages are picked from
+ * mirrors with I/O errors without considering the checksum.
+ * If the latter is the case, at the end, the checksum of the
+ * repaired area is verified in order to correctly maintain
+ * the statistics.
+ */
+
+ sblocks_for_recheck = kzalloc(BTRFS_MAX_MIRRORS *
+ sizeof(*sblocks_for_recheck),
+ GFP_NOFS);
+ if (!sblocks_for_recheck) {
+ spin_lock(&sdev->stat_lock);
+ sdev->stat.malloc_errors++;
+ sdev->stat.read_errors++;
+ sdev->stat.uncorrectable_errors++;
+ spin_unlock(&sdev->stat_lock);
+ goto out;
}
- spin_lock(&sdev->stat_lock);
- ++sdev->stat.read_errors;
- spin_unlock(&sdev->stat_lock);
+ /* setup the context, map the logical blocks and alloc the pages */
+ ret = scrub_setup_recheck_block(sdev, &fs_info->mapping_tree, length,
+ logical, sblocks_for_recheck);
+ if (ret) {
+ spin_lock(&sdev->stat_lock);
+ sdev->stat.read_errors++;
+ sdev->stat.uncorrectable_errors++;
+ spin_unlock(&sdev->stat_lock);
+ goto out;
+ }
+ BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
+ sblock_bad = sblocks_for_recheck + failed_mirror_index;
- scrub_fixup(sbio, ix);
- return 1;
-}
+ /* build and submit the bios for the failed mirror, check checksums */
+ ret = scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
+ csum, generation, sdev->csum_size);
+ if (ret) {
+ spin_lock(&sdev->stat_lock);
+ sdev->stat.read_errors++;
+ sdev->stat.uncorrectable_errors++;
+ spin_unlock(&sdev->stat_lock);
+ goto out;
+ }
-static int scrub_fixup_check(struct scrub_bio *sbio, int ix)
-{
- int ret = 1;
- struct page *page;
- void *buffer;
- u64 flags = sbio->spag[ix].flags;
+ if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
+ sblock_bad->no_io_error_seen) {
+ /*
+ * the error disappeared after reading page by page, or
+ * the area was part of a huge bio and other parts of the
+ * bio caused I/O errors, or the block layer merged several
+ * read requests into one and the error is caused by a
+ * different bio (usually one of the two latter cases is
+ * the cause)
+ */
+ spin_lock(&sdev->stat_lock);
+ sdev->stat.unverified_errors++;
+ spin_unlock(&sdev->stat_lock);
- page = sbio->bio->bi_io_vec[ix].bv_page;
- buffer = kmap_atomic(page, KM_USER0);
- if (flags & BTRFS_EXTENT_FLAG_DATA) {
- ret = scrub_checksum_data(sbio->sdev,
- sbio->spag + ix, buffer);
- } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
- ret = scrub_checksum_tree_block(sbio->sdev,
- sbio->spag + ix,
- sbio->logical + ix * PAGE_SIZE,
- buffer);
- } else {
- WARN_ON(1);
+ goto out;
}
- kunmap_atomic(buffer, KM_USER0);
- return ret;
-}
+ if (!sblock_bad->no_io_error_seen) {
+ spin_lock(&sdev->stat_lock);
+ sdev->stat.read_errors++;
+ spin_unlock(&sdev->stat_lock);
+ if (__ratelimit(&_rs))
+ scrub_print_warning("i/o error", sblock_to_check);
+ } else if (sblock_bad->checksum_error) {
+ spin_lock(&sdev->stat_lock);
+ sdev->stat.csum_errors++;
+ spin_unlock(&sdev->stat_lock);
+ if (__ratelimit(&_rs))
+ scrub_print_warning("checksum error", sblock_to_check);
+ } else if (sblock_bad->header_error) {
+ spin_lock(&sdev->stat_lock);
+ sdev->stat.verify_errors++;
+ spin_unlock(&sdev->stat_lock);
+ if (__ratelimit(&_rs))
+ scrub_print_warning("checksum/header error",
+ sblock_to_check);
+ }
-static void scrub_fixup_end_io(struct bio *bio, int err)
-{
- complete((struct completion *)bio->bi_private);
-}
+ if (sdev->readonly)
+ goto did_not_correct_error;
+
+ if (!is_metadata && !have_csum) {
+ struct scrub_fixup_nodatasum *fixup_nodatasum;
-static void scrub_fixup(struct scrub_bio *sbio, int ix)
-{
- struct scrub_dev *sdev = sbio->sdev;
- struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
- struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
- struct btrfs_bio *bbio = NULL;
- struct scrub_fixup_nodatasum *fixup;
- u64 logical = sbio->logical + ix * PAGE_SIZE;
- u64 length;
- int i;
- int ret;
- DECLARE_COMPLETION_ONSTACK(complete);
-
- if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) &&
- (sbio->spag[ix].have_csum == 0)) {
- fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
- if (!fixup)
- goto uncorrectable;
- fixup->sdev = sdev;
- fixup->logical = logical;
- fixup->root = fs_info->extent_root;
- fixup->mirror_num = sbio->spag[ix].mirror_num;
+ /*
+ * !is_metadata and !have_csum, this means that the data
+ * might not be COW'ed, that it might be modified
+ * concurrently. The general strategy to work on the
+ * commit root does not help in the case when COW is not
+ * used.
+ */
+ fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS);
+ if (!fixup_nodatasum)
+ goto did_not_correct_error;
+ fixup_nodatasum->sdev = sdev;
+ fixup_nodatasum->logical = logical;
+ fixup_nodatasum->root = fs_info->extent_root;
+ fixup_nodatasum->mirror_num = failed_mirror_index + 1;
/*
* increment scrubs_running to prevent cancel requests from
* completing as long as a fixup worker is running. we must also
@@ -647,235 +774,528 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
atomic_inc(&fs_info->scrubs_paused);
mutex_unlock(&fs_info->scrub_lock);
atomic_inc(&sdev->fixup_cnt);
- fixup->work.func = scrub_fixup_nodatasum;
- btrfs_queue_worker(&fs_info->scrub_workers, &fixup->work);
- return;
+ fixup_nodatasum->work.func = scrub_fixup_nodatasum;
+ btrfs_queue_worker(&fs_info->scrub_workers,
+ &fixup_nodatasum->work);
+ goto out;
}
- length = PAGE_SIZE;
- ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length,
- &bbio, 0);
- if (ret || !bbio || length < PAGE_SIZE) {
- printk(KERN_ERR
- "scrub_fixup: btrfs_map_block failed us for %llu\n",
- (unsigned long long)logical);
- WARN_ON(1);
- kfree(bbio);
- return;
+ /*
+ * now build and submit the bios for the other mirrors, check
+ * checksums
+ */
+ for (mirror_index = 0;
+ mirror_index < BTRFS_MAX_MIRRORS &&
+ sblocks_for_recheck[mirror_index].page_count > 0;
+ mirror_index++) {
+ if (mirror_index == failed_mirror_index)
+ continue;
+
+ /* build and submit the bios, check checksums */
+ ret = scrub_recheck_block(fs_info,
+ sblocks_for_recheck + mirror_index,
+ is_metadata, have_csum, csum,
+ generation, sdev->csum_size);
+ if (ret)
+ goto did_not_correct_error;
}
- if (bbio->num_stripes == 1)
- /* there aren't any replicas */
- goto uncorrectable;
+ /*
+ * first try to pick the mirror which is completely without I/O
+ * errors and also does not have a checksum error.
+ * If one is found, and if a checksum is present, the full block
+ * that is known to contain an error is rewritten. Afterwards
+ * the block is known to be corrected.
+ * If a mirror is found which is completely correct, and no
+ * checksum is present, only those pages are rewritten that had
+ * an I/O error in the block to be repaired, since it cannot be
+ * determined, which copy of the other pages is better (and it
+ * could happen otherwise that a correct page would be
+ * overwritten by a bad one).
+ */
+ for (mirror_index = 0;
+ mirror_index < BTRFS_MAX_MIRRORS &&
+ sblocks_for_recheck[mirror_index].page_count > 0;
+ mirror_index++) {
+ struct scrub_block *sblock_other = sblocks_for_recheck +
+ mirror_index;
+
+ if (!sblock_other->header_error &&
+ !sblock_other->checksum_error &&
+ sblock_other->no_io_error_seen) {
+ int force_write = is_metadata || have_csum;
+
+ ret = scrub_repair_block_from_good_copy(sblock_bad,
+ sblock_other,
+ force_write);
+ if (0 == ret)
+ goto corrected_error;
+ }
+ }
/*
- * first find a good copy
+ * in case of I/O errors in the area that is supposed to be
+ * repaired, continue by picking good copies of those pages.
+ * Select the good pages from mirrors to rewrite bad pages from
+ * the area to fix. Afterwards verify the checksum of the block
+ * that is supposed to be repaired. This verification step is
+ * only done for the purpose of statistic counting and for the
+ * final scrub report, whether errors remain.
+ * A perfect algorithm could make use of the checksum and try
+ * all possible combinations of pages from the different mirrors
+ * until the checksum verification succeeds. For example, when
+ * the 2nd page of mirror #1 faces I/O errors, and the 2nd page
+ * of mirror #2 is readable but the final checksum test fails,
+ * then the 2nd page of mirror #3 could be tried, whether now
+ * the final checksum succeedes. But this would be a rare
+ * exception and is therefore not implemented. At least it is
+ * avoided that the good copy is overwritten.
+ * A more useful improvement would be to pick the sectors
+ * without I/O error based on sector sizes (512 bytes on legacy
+ * disks) instead of on PAGE_SIZE. Then maybe 512 byte of one
+ * mirror could be repaired by taking 512 byte of a different
+ * mirror, even if other 512 byte sectors in the same PAGE_SIZE
+ * area are unreadable.
*/
- for (i = 0; i < bbio->num_stripes; ++i) {
- if (i + 1 == sbio->spag[ix].mirror_num)
- continue;
- if (scrub_fixup_io(READ, bbio->stripes[i].dev->bdev,
- bbio->stripes[i].physical >> 9,
- sbio->bio->bi_io_vec[ix].bv_page)) {
- /* I/O-error, this is not a good copy */
+ /* can only fix I/O errors from here on */
+ if (sblock_bad->no_io_error_seen)
+ goto did_not_correct_error;
+
+ success = 1;
+ for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
+ struct scrub_page *page_bad = sblock_bad->pagev + page_num;
+
+ if (!page_bad->io_error)
continue;
+
+ for (mirror_index = 0;
+ mirror_index < BTRFS_MAX_MIRRORS &&
+ sblocks_for_recheck[mirror_index].page_count > 0;
+ mirror_index++) {
+ struct scrub_block *sblock_other = sblocks_for_recheck +
+ mirror_index;
+ struct scrub_page *page_other = sblock_other->pagev +
+ page_num;
+
+ if (!page_other->io_error) {
+ ret = scrub_repair_page_from_good_copy(
+ sblock_bad, sblock_other, page_num, 0);
+ if (0 == ret) {
+ page_bad->io_error = 0;
+ break; /* succeeded for this page */
+ }
+ }
}
- if (scrub_fixup_check(sbio, ix) == 0)
- break;
+ if (page_bad->io_error) {
+ /* did not find a mirror to copy the page from */
+ success = 0;
+ }
}
- if (i == bbio->num_stripes)
- goto uncorrectable;
- if (!sdev->readonly) {
- /*
- * bi_io_vec[ix].bv_page now contains good data, write it back
- */
- if (scrub_fixup_io(WRITE, sdev->dev->bdev,
- (sbio->physical + ix * PAGE_SIZE) >> 9,
- sbio->bio->bi_io_vec[ix].bv_page)) {
- /* I/O-error, writeback failed, give up */
- goto uncorrectable;
+ if (success) {
+ if (is_metadata || have_csum) {
+ /*
+ * need to verify the checksum now that all
+ * sectors on disk are repaired (the write
+ * request for data to be repaired is on its way).
+ * Just be lazy and use scrub_recheck_block()
+ * which re-reads the data before the checksum
+ * is verified, but most likely the data comes out
+ * of the page cache.
+ */
+ ret = scrub_recheck_block(fs_info, sblock_bad,
+ is_metadata, have_csum, csum,
+ generation, sdev->csum_size);
+ if (!ret && !sblock_bad->header_error &&
+ !sblock_bad->checksum_error &&
+ sblock_bad->no_io_error_seen)
+ goto corrected_error;
+ else
+ goto did_not_correct_error;
+ } else {
+corrected_error:
+ spin_lock(&sdev->stat_lock);
+ sdev->stat.corrected_errors++;
+ spin_unlock(&sdev->stat_lock);
+ printk_ratelimited(KERN_ERR
+ "btrfs: fixed up error at logical %llu on dev %s\n",
+ (unsigned long long)logical, sdev->dev->name);
}
+ } else {
+did_not_correct_error:
+ spin_lock(&sdev->stat_lock);
+ sdev->stat.uncorrectable_errors++;
+ spin_unlock(&sdev->stat_lock);
+ printk_ratelimited(KERN_ERR
+ "btrfs: unable to fixup (regular) error at logical %llu on dev %s\n",
+ (unsigned long long)logical, sdev->dev->name);
}
- kfree(bbio);
- spin_lock(&sdev->stat_lock);
- ++sdev->stat.corrected_errors;
- spin_unlock(&sdev->stat_lock);
+out:
+ if (sblocks_for_recheck) {
+ for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
+ mirror_index++) {
+ struct scrub_block *sblock = sblocks_for_recheck +
+ mirror_index;
+ int page_index;
+
+ for (page_index = 0; page_index < SCRUB_PAGES_PER_BIO;
+ page_index++)
+ if (sblock->pagev[page_index].page)
+ __free_page(
+ sblock->pagev[page_index].page);
+ }
+ kfree(sblocks_for_recheck);
+ }
- printk_ratelimited(KERN_ERR "btrfs: fixed up error at logical %llu\n",
- (unsigned long long)logical);
- return;
+ return 0;
+}
-uncorrectable:
- kfree(bbio);
- spin_lock(&sdev->stat_lock);
- ++sdev->stat.uncorrectable_errors;
- spin_unlock(&sdev->stat_lock);
+static int scrub_setup_recheck_block(struct scrub_dev *sdev,
+ struct btrfs_mapping_tree *map_tree,
+ u64 length, u64 logical,
+ struct scrub_block *sblocks_for_recheck)
+{
+ int page_index;
+ int mirror_index;
+ int ret;
+
+ /*
+ * note: the three members sdev, ref_count and outstanding_pages
+ * are not used (and not set) in the blocks that are used for
+ * the recheck procedure
+ */
+
+ page_index = 0;
+ while (length > 0) {
+ u64 sublen = min_t(u64, length, PAGE_SIZE);
+ u64 mapped_length = sublen;
+ struct btrfs_bio *bbio = NULL;
+
+ /*
+ * with a length of PAGE_SIZE, each returned stripe
+ * represents one mirror
+ */
+ ret = btrfs_map_block(map_tree, WRITE, logical, &mapped_length,
+ &bbio, 0);
+ if (ret || !bbio || mapped_length < sublen) {
+ kfree(bbio);
+ return -EIO;
+ }
- printk_ratelimited(KERN_ERR "btrfs: unable to fixup (regular) error at "
- "logical %llu\n", (unsigned long long)logical);
+ BUG_ON(page_index >= SCRUB_PAGES_PER_BIO);
+ for (mirror_index = 0; mirror_index < (int)bbio->num_stripes;
+ mirror_index++) {
+ struct scrub_block *sblock;
+ struct scrub_page *page;
+
+ if (mirror_index >= BTRFS_MAX_MIRRORS)
+ continue;
+
+ sblock = sblocks_for_recheck + mirror_index;
+ page = sblock->pagev + page_index;
+ page->logical = logical;
+ page->physical = bbio->stripes[mirror_index].physical;
+ page->bdev = bbio->stripes[mirror_index].dev->bdev;
+ page->mirror_num = mirror_index + 1;
+ page->page = alloc_page(GFP_NOFS);
+ if (!page->page) {
+ spin_lock(&sdev->stat_lock);
+ sdev->stat.malloc_errors++;
+ spin_unlock(&sdev->stat_lock);
+ return -ENOMEM;
+ }
+ sblock->page_count++;
+ }
+ kfree(bbio);
+ length -= sublen;
+ logical += sublen;
+ page_index++;
+ }
+
+ return 0;
}
-static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
- struct page *page)
+/*
+ * this function will check the on disk data for checksum errors, header
+ * errors and read I/O errors. If any I/O errors happen, the exact pages
+ * which are errored are marked as being bad. The goal is to enable scrub
+ * to take those pages that are not errored from all the mirrors so that
+ * the pages that are errored in the just handled mirror can be repaired.
+ */
+static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
+ struct scrub_block *sblock, int is_metadata,
+ int have_csum, u8 *csum, u64 generation,
+ u16 csum_size)
{
- struct bio *bio = NULL;
- int ret;
- DECLARE_COMPLETION_ONSTACK(complete);
+ int page_num;
- bio = bio_alloc(GFP_NOFS, 1);
- bio->bi_bdev = bdev;
- bio->bi_sector = sector;
- bio_add_page(bio, page, PAGE_SIZE, 0);
- bio->bi_end_io = scrub_fixup_end_io;
- bio->bi_private = &complete;
- submit_bio(rw, bio);
+ sblock->no_io_error_seen = 1;
+ sblock->header_error = 0;
+ sblock->checksum_error = 0;
- /* this will also unplug the queue */
- wait_for_completion(&complete);
+ for (page_num = 0; page_num < sblock->page_count; page_num++) {
+ struct bio *bio;
+ int ret;
+ struct scrub_page *page = sblock->pagev + page_num;
+ DECLARE_COMPLETION_ONSTACK(complete);
+
+ BUG_ON(!page->page);
+ bio = bio_alloc(GFP_NOFS, 1);
+ bio->bi_bdev = page->bdev;
+ bio->bi_sector = page->physical >> 9;
+ bio->bi_end_io = scrub_complete_bio_end_io;
+ bio->bi_private = &complete;
+
+ ret = bio_add_page(bio, page->page, PAGE_SIZE, 0);
+ if (PAGE_SIZE != ret) {
+ bio_put(bio);
+ return -EIO;
+ }
+ btrfsic_submit_bio(READ, bio);
- ret = !test_bit(BIO_UPTODATE, &bio->bi_flags);
- bio_put(bio);
- return ret;
+ /* this will also unplug the queue */
+ wait_for_completion(&complete);
+
+ page->io_error = !test_bit(BIO_UPTODATE, &bio->bi_flags);
+ if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
+ sblock->no_io_error_seen = 0;
+ bio_put(bio);
+ }
+
+ if (sblock->no_io_error_seen)
+ scrub_recheck_block_checksum(fs_info, sblock, is_metadata,
+ have_csum, csum, generation,
+ csum_size);
+
+ return 0;
}
-static void scrub_bio_end_io(struct bio *bio, int err)
+static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
+ struct scrub_block *sblock,
+ int is_metadata, int have_csum,
+ const u8 *csum, u64 generation,
+ u16 csum_size)
{
- struct scrub_bio *sbio = bio->bi_private;
- struct scrub_dev *sdev = sbio->sdev;
- struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
+ int page_num;
+ u8 calculated_csum[BTRFS_CSUM_SIZE];
+ u32 crc = ~(u32)0;
+ struct btrfs_root *root = fs_info->extent_root;
+ void *mapped_buffer;
+
+ BUG_ON(!sblock->pagev[0].page);
+ if (is_metadata) {
+ struct btrfs_header *h;
+
+ mapped_buffer = kmap_atomic(sblock->pagev[0].page);
+ h = (struct btrfs_header *)mapped_buffer;
+
+ if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr) ||
+ generation != le64_to_cpu(h->generation) ||
+ memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) ||
+ memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
+ BTRFS_UUID_SIZE))
+ sblock->header_error = 1;
+ csum = h->csum;
+ } else {
+ if (!have_csum)
+ return;
- sbio->err = err;
- sbio->bio = bio;
+ mapped_buffer = kmap_atomic(sblock->pagev[0].page);
+ }
- btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work);
+ for (page_num = 0;;) {
+ if (page_num == 0 && is_metadata)
+ crc = btrfs_csum_data(root,
+ ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE,
+ crc, PAGE_SIZE - BTRFS_CSUM_SIZE);
+ else
+ crc = btrfs_csum_data(root, mapped_buffer, crc,
+ PAGE_SIZE);
+
+ kunmap_atomic(mapped_buffer);
+ page_num++;
+ if (page_num >= sblock->page_count)
+ break;
+ BUG_ON(!sblock->pagev[page_num].page);
+
+ mapped_buffer = kmap_atomic(sblock->pagev[page_num].page);
+ }
+
+ btrfs_csum_final(crc, calculated_csum);
+ if (memcmp(calculated_csum, csum, csum_size))
+ sblock->checksum_error = 1;
}
-static void scrub_checksum(struct btrfs_work *work)
+static void scrub_complete_bio_end_io(struct bio *bio, int err)
{
- struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
- struct scrub_dev *sdev = sbio->sdev;
- struct page *page;
- void *buffer;
- int i;
- u64 flags;
- u64 logical;
- int ret;
+ complete((struct completion *)bio->bi_private);
+}
- if (sbio->err) {
- ret = 0;
- for (i = 0; i < sbio->count; ++i)
- ret |= scrub_recheck_error(sbio, i);
- if (!ret) {
- spin_lock(&sdev->stat_lock);
- ++sdev->stat.unverified_errors;
- spin_unlock(&sdev->stat_lock);
- }
+static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
+ struct scrub_block *sblock_good,
+ int force_write)
+{
+ int page_num;
+ int ret = 0;
- sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1);
- sbio->bio->bi_flags |= 1 << BIO_UPTODATE;
- sbio->bio->bi_phys_segments = 0;
- sbio->bio->bi_idx = 0;
+ for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
+ int ret_sub;
- for (i = 0; i < sbio->count; i++) {
- struct bio_vec *bi;
- bi = &sbio->bio->bi_io_vec[i];
- bi->bv_offset = 0;
- bi->bv_len = PAGE_SIZE;
- }
- goto out;
+ ret_sub = scrub_repair_page_from_good_copy(sblock_bad,
+ sblock_good,
+ page_num,
+ force_write);
+ if (ret_sub)
+ ret = ret_sub;
}
- for (i = 0; i < sbio->count; ++i) {
- page = sbio->bio->bi_io_vec[i].bv_page;
- buffer = kmap_atomic(page, KM_USER0);
- flags = sbio->spag[i].flags;
- logical = sbio->logical + i * PAGE_SIZE;
- ret = 0;
- if (flags & BTRFS_EXTENT_FLAG_DATA) {
- ret = scrub_checksum_data(sdev, sbio->spag + i, buffer);
- } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
- ret = scrub_checksum_tree_block(sdev, sbio->spag + i,
- logical, buffer);
- } else if (flags & BTRFS_EXTENT_FLAG_SUPER) {
- BUG_ON(i);
- (void)scrub_checksum_super(sbio, buffer);
- } else {
- WARN_ON(1);
- }
- kunmap_atomic(buffer, KM_USER0);
- if (ret) {
- ret = scrub_recheck_error(sbio, i);
- if (!ret) {
- spin_lock(&sdev->stat_lock);
- ++sdev->stat.unverified_errors;
- spin_unlock(&sdev->stat_lock);
- }
+
+ return ret;
+}
+
+static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
+ struct scrub_block *sblock_good,
+ int page_num, int force_write)
+{
+ struct scrub_page *page_bad = sblock_bad->pagev + page_num;
+ struct scrub_page *page_good = sblock_good->pagev + page_num;
+
+ BUG_ON(sblock_bad->pagev[page_num].page == NULL);
+ BUG_ON(sblock_good->pagev[page_num].page == NULL);
+ if (force_write || sblock_bad->header_error ||
+ sblock_bad->checksum_error || page_bad->io_error) {
+ struct bio *bio;
+ int ret;
+ DECLARE_COMPLETION_ONSTACK(complete);
+
+ bio = bio_alloc(GFP_NOFS, 1);
+ bio->bi_bdev = page_bad->bdev;
+ bio->bi_sector = page_bad->physical >> 9;
+ bio->bi_end_io = scrub_complete_bio_end_io;
+ bio->bi_private = &complete;
+
+ ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0);
+ if (PAGE_SIZE != ret) {
+ bio_put(bio);
+ return -EIO;
}
+ btrfsic_submit_bio(WRITE, bio);
+
+ /* this will also unplug the queue */
+ wait_for_completion(&complete);
+ bio_put(bio);
}
-out:
- scrub_free_bio(sbio->bio);
- sbio->bio = NULL;
- spin_lock(&sdev->list_lock);
- sbio->next_free = sdev->first_free;
- sdev->first_free = sbio->index;
- spin_unlock(&sdev->list_lock);
- atomic_dec(&sdev->in_flight);
- wake_up(&sdev->list_wait);
+ return 0;
}
-static int scrub_checksum_data(struct scrub_dev *sdev,
- struct scrub_page *spag, void *buffer)
+static void scrub_checksum(struct scrub_block *sblock)
{
+ u64 flags;
+ int ret;
+
+ BUG_ON(sblock->page_count < 1);
+ flags = sblock->pagev[0].flags;
+ ret = 0;
+ if (flags & BTRFS_EXTENT_FLAG_DATA)
+ ret = scrub_checksum_data(sblock);
+ else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
+ ret = scrub_checksum_tree_block(sblock);
+ else if (flags & BTRFS_EXTENT_FLAG_SUPER)
+ (void)scrub_checksum_super(sblock);
+ else
+ WARN_ON(1);
+ if (ret)
+ scrub_handle_errored_block(sblock);
+}
+
+static int scrub_checksum_data(struct scrub_block *sblock)
+{
+ struct scrub_dev *sdev = sblock->sdev;
u8 csum[BTRFS_CSUM_SIZE];
+ u8 *on_disk_csum;
+ struct page *page;
+ void *buffer;
u32 crc = ~(u32)0;
int fail = 0;
struct btrfs_root *root = sdev->dev->dev_root;
+ u64 len;
+ int index;
- if (!spag->have_csum)
+ BUG_ON(sblock->page_count < 1);
+ if (!sblock->pagev[0].have_csum)
return 0;
- crc = btrfs_csum_data(root, buffer, crc, PAGE_SIZE);
+ on_disk_csum = sblock->pagev[0].csum;
+ page = sblock->pagev[0].page;
+ buffer = kmap_atomic(page);
+
+ len = sdev->sectorsize;
+ index = 0;
+ for (;;) {
+ u64 l = min_t(u64, len, PAGE_SIZE);
+
+ crc = btrfs_csum_data(root, buffer, crc, l);
+ kunmap_atomic(buffer);
+ len -= l;
+ if (len == 0)
+ break;
+ index++;
+ BUG_ON(index >= sblock->page_count);
+ BUG_ON(!sblock->pagev[index].page);
+ page = sblock->pagev[index].page;
+ buffer = kmap_atomic(page);
+ }
+
btrfs_csum_final(crc, csum);
- if (memcmp(csum, spag->csum, sdev->csum_size))
+ if (memcmp(csum, on_disk_csum, sdev->csum_size))
fail = 1;
- spin_lock(&sdev->stat_lock);
- ++sdev->stat.data_extents_scrubbed;
- sdev->stat.data_bytes_scrubbed += PAGE_SIZE;
- if (fail)
+ if (fail) {
+ spin_lock(&sdev->stat_lock);
++sdev->stat.csum_errors;
- spin_unlock(&sdev->stat_lock);
+ spin_unlock(&sdev->stat_lock);
+ }
return fail;
}
-static int scrub_checksum_tree_block(struct scrub_dev *sdev,
- struct scrub_page *spag, u64 logical,
- void *buffer)
+static int scrub_checksum_tree_block(struct scrub_block *sblock)
{
+ struct scrub_dev *sdev = sblock->sdev;
struct btrfs_header *h;
struct btrfs_root *root = sdev->dev->dev_root;
struct btrfs_fs_info *fs_info = root->fs_info;
- u8 csum[BTRFS_CSUM_SIZE];
+ u8 calculated_csum[BTRFS_CSUM_SIZE];
+ u8 on_disk_csum[BTRFS_CSUM_SIZE];
+ struct page *page;
+ void *mapped_buffer;
+ u64 mapped_size;
+ void *p;
u32 crc = ~(u32)0;
int fail = 0;
int crc_fail = 0;
+ u64 len;
+ int index;
+
+ BUG_ON(sblock->page_count < 1);
+ page = sblock->pagev[0].page;
+ mapped_buffer = kmap_atomic(page);
+ h = (struct btrfs_header *)mapped_buffer;
+ memcpy(on_disk_csum, h->csum, sdev->csum_size);
/*
* we don't use the getter functions here, as we
* a) don't have an extent buffer and
* b) the page is already kmapped
*/
- h = (struct btrfs_header *)buffer;
- if (logical != le64_to_cpu(h->bytenr))
+ if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr))
++fail;
- if (spag->generation != le64_to_cpu(h->generation))
+ if (sblock->pagev[0].generation != le64_to_cpu(h->generation))
++fail;
if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
@@ -885,51 +1305,99 @@ static int scrub_checksum_tree_block(struct scrub_dev *sdev,
BTRFS_UUID_SIZE))
++fail;
- crc = btrfs_csum_data(root, buffer + BTRFS_CSUM_SIZE, crc,
- PAGE_SIZE - BTRFS_CSUM_SIZE);
- btrfs_csum_final(crc, csum);
- if (memcmp(csum, h->csum, sdev->csum_size))
+ BUG_ON(sdev->nodesize != sdev->leafsize);
+ len = sdev->nodesize - BTRFS_CSUM_SIZE;
+ mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
+ p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
+ index = 0;
+ for (;;) {
+ u64 l = min_t(u64, len, mapped_size);
+
+ crc = btrfs_csum_data(root, p, crc, l);
+ kunmap_atomic(mapped_buffer);
+ len -= l;
+ if (len == 0)
+ break;
+ index++;
+ BUG_ON(index >= sblock->page_count);
+ BUG_ON(!sblock->pagev[index].page);
+ page = sblock->pagev[index].page;
+ mapped_buffer = kmap_atomic(page);
+ mapped_size = PAGE_SIZE;
+ p = mapped_buffer;
+ }
+
+ btrfs_csum_final(crc, calculated_csum);
+ if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size))
++crc_fail;
- spin_lock(&sdev->stat_lock);
- ++sdev->stat.tree_extents_scrubbed;
- sdev->stat.tree_bytes_scrubbed += PAGE_SIZE;
- if (crc_fail)
- ++sdev->stat.csum_errors;
- if (fail)
- ++sdev->stat.verify_errors;
- spin_unlock(&sdev->stat_lock);
+ if (crc_fail || fail) {
+ spin_lock(&sdev->stat_lock);
+ if (crc_fail)
+ ++sdev->stat.csum_errors;
+ if (fail)
+ ++sdev->stat.verify_errors;
+ spin_unlock(&sdev->stat_lock);
+ }
return fail || crc_fail;
}
-static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer)
+static int scrub_checksum_super(struct scrub_block *sblock)
{
struct btrfs_super_block *s;
- u64 logical;
- struct scrub_dev *sdev = sbio->sdev;
+ struct scrub_dev *sdev = sblock->sdev;
struct btrfs_root *root = sdev->dev->dev_root;
struct btrfs_fs_info *fs_info = root->fs_info;
- u8 csum[BTRFS_CSUM_SIZE];
+ u8 calculated_csum[BTRFS_CSUM_SIZE];
+ u8 on_disk_csum[BTRFS_CSUM_SIZE];
+ struct page *page;
+ void *mapped_buffer;
+ u64 mapped_size;
+ void *p;
u32 crc = ~(u32)0;
int fail = 0;
+ u64 len;
+ int index;
- s = (struct btrfs_super_block *)buffer;
- logical = sbio->logical;
+ BUG_ON(sblock->page_count < 1);
+ page = sblock->pagev[0].page;
+ mapped_buffer = kmap_atomic(page);
+ s = (struct btrfs_super_block *)mapped_buffer;
+ memcpy(on_disk_csum, s->csum, sdev->csum_size);
- if (logical != le64_to_cpu(s->bytenr))
+ if (sblock->pagev[0].logical != le64_to_cpu(s->bytenr))
++fail;
- if (sbio->spag[0].generation != le64_to_cpu(s->generation))
+ if (sblock->pagev[0].generation != le64_to_cpu(s->generation))
++fail;
if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
++fail;
- crc = btrfs_csum_data(root, buffer + BTRFS_CSUM_SIZE, crc,
- PAGE_SIZE - BTRFS_CSUM_SIZE);
- btrfs_csum_final(crc, csum);
- if (memcmp(csum, s->csum, sbio->sdev->csum_size))
+ len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;
+ mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
+ p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
+ index = 0;
+ for (;;) {
+ u64 l = min_t(u64, len, mapped_size);
+
+ crc = btrfs_csum_data(root, p, crc, l);
+ kunmap_atomic(mapped_buffer);
+ len -= l;
+ if (len == 0)
+ break;
+ index++;
+ BUG_ON(index >= sblock->page_count);
+ BUG_ON(!sblock->pagev[index].page);
+ page = sblock->pagev[index].page;
+ mapped_buffer = kmap_atomic(page);
+ mapped_size = PAGE_SIZE;
+ p = mapped_buffer;
+ }
+
+ btrfs_csum_final(crc, calculated_csum);
+ if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size))
++fail;
if (fail) {
@@ -946,29 +1414,42 @@ static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer)
return fail;
}
-static int scrub_submit(struct scrub_dev *sdev)
+static void scrub_block_get(struct scrub_block *sblock)
+{
+ atomic_inc(&sblock->ref_count);
+}
+
+static void scrub_block_put(struct scrub_block *sblock)
+{
+ if (atomic_dec_and_test(&sblock->ref_count)) {
+ int i;
+
+ for (i = 0; i < sblock->page_count; i++)
+ if (sblock->pagev[i].page)
+ __free_page(sblock->pagev[i].page);
+ kfree(sblock);
+ }
+}
+
+static void scrub_submit(struct scrub_dev *sdev)
{
struct scrub_bio *sbio;
if (sdev->curr == -1)
- return 0;
+ return;
sbio = sdev->bios[sdev->curr];
- sbio->err = 0;
sdev->curr = -1;
atomic_inc(&sdev->in_flight);
- submit_bio(READ, sbio->bio);
-
- return 0;
+ btrfsic_submit_bio(READ, sbio->bio);
}
-static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len,
- u64 physical, u64 flags, u64 gen, int mirror_num,
- u8 *csum, int force)
+static int scrub_add_page_to_bio(struct scrub_dev *sdev,
+ struct scrub_page *spage)
{
+ struct scrub_block *sblock = spage->sblock;
struct scrub_bio *sbio;
- struct page *page;
int ret;
again:
@@ -981,7 +1462,7 @@ again:
if (sdev->curr != -1) {
sdev->first_free = sdev->bios[sdev->curr]->next_free;
sdev->bios[sdev->curr]->next_free = -1;
- sdev->bios[sdev->curr]->count = 0;
+ sdev->bios[sdev->curr]->page_count = 0;
spin_unlock(&sdev->list_lock);
} else {
spin_unlock(&sdev->list_lock);
@@ -989,62 +1470,200 @@ again:
}
}
sbio = sdev->bios[sdev->curr];
- if (sbio->count == 0) {
+ if (sbio->page_count == 0) {
struct bio *bio;
- sbio->physical = physical;
- sbio->logical = logical;
- bio = bio_alloc(GFP_NOFS, SCRUB_PAGES_PER_BIO);
- if (!bio)
- return -ENOMEM;
+ sbio->physical = spage->physical;
+ sbio->logical = spage->logical;
+ bio = sbio->bio;
+ if (!bio) {
+ bio = bio_alloc(GFP_NOFS, sdev->pages_per_bio);
+ if (!bio)
+ return -ENOMEM;
+ sbio->bio = bio;
+ }
bio->bi_private = sbio;
bio->bi_end_io = scrub_bio_end_io;
bio->bi_bdev = sdev->dev->bdev;
- bio->bi_sector = sbio->physical >> 9;
+ bio->bi_sector = spage->physical >> 9;
sbio->err = 0;
- sbio->bio = bio;
- } else if (sbio->physical + sbio->count * PAGE_SIZE != physical ||
- sbio->logical + sbio->count * PAGE_SIZE != logical) {
- ret = scrub_submit(sdev);
- if (ret)
- return ret;
+ } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
+ spage->physical ||
+ sbio->logical + sbio->page_count * PAGE_SIZE !=
+ spage->logical) {
+ scrub_submit(sdev);
goto again;
}
- sbio->spag[sbio->count].flags = flags;
- sbio->spag[sbio->count].generation = gen;
- sbio->spag[sbio->count].have_csum = 0;
- sbio->spag[sbio->count].mirror_num = mirror_num;
-
- page = alloc_page(GFP_NOFS);
- if (!page)
- return -ENOMEM;
- ret = bio_add_page(sbio->bio, page, PAGE_SIZE, 0);
- if (!ret) {
- __free_page(page);
- ret = scrub_submit(sdev);
- if (ret)
- return ret;
+ sbio->pagev[sbio->page_count] = spage;
+ ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
+ if (ret != PAGE_SIZE) {
+ if (sbio->page_count < 1) {
+ bio_put(sbio->bio);
+ sbio->bio = NULL;
+ return -EIO;
+ }
+ scrub_submit(sdev);
goto again;
}
- if (csum) {
- sbio->spag[sbio->count].have_csum = 1;
- memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size);
+ scrub_block_get(sblock); /* one for the added page */
+ atomic_inc(&sblock->outstanding_pages);
+ sbio->page_count++;
+ if (sbio->page_count == sdev->pages_per_bio)
+ scrub_submit(sdev);
+
+ return 0;
+}
+
+static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len,
+ u64 physical, u64 flags, u64 gen, int mirror_num,
+ u8 *csum, int force)
+{
+ struct scrub_block *sblock;
+ int index;
+
+ sblock = kzalloc(sizeof(*sblock), GFP_NOFS);
+ if (!sblock) {
+ spin_lock(&sdev->stat_lock);
+ sdev->stat.malloc_errors++;
+ spin_unlock(&sdev->stat_lock);
+ return -ENOMEM;
+ }
+
+ /* one ref inside this function, plus one for each page later on */
+ atomic_set(&sblock->ref_count, 1);
+ sblock->sdev = sdev;
+ sblock->no_io_error_seen = 1;
+
+ for (index = 0; len > 0; index++) {
+ struct scrub_page *spage = sblock->pagev + index;
+ u64 l = min_t(u64, len, PAGE_SIZE);
+
+ BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
+ spage->page = alloc_page(GFP_NOFS);
+ if (!spage->page) {
+ spin_lock(&sdev->stat_lock);
+ sdev->stat.malloc_errors++;
+ spin_unlock(&sdev->stat_lock);
+ while (index > 0) {
+ index--;
+ __free_page(sblock->pagev[index].page);
+ }
+ kfree(sblock);
+ return -ENOMEM;
+ }
+ spage->sblock = sblock;
+ spage->bdev = sdev->dev->bdev;
+ spage->flags = flags;
+ spage->generation = gen;
+ spage->logical = logical;
+ spage->physical = physical;
+ spage->mirror_num = mirror_num;
+ if (csum) {
+ spage->have_csum = 1;
+ memcpy(spage->csum, csum, sdev->csum_size);
+ } else {
+ spage->have_csum = 0;
+ }
+ sblock->page_count++;
+ len -= l;
+ logical += l;
+ physical += l;
}
- ++sbio->count;
- if (sbio->count == SCRUB_PAGES_PER_BIO || force) {
+
+ BUG_ON(sblock->page_count == 0);
+ for (index = 0; index < sblock->page_count; index++) {
+ struct scrub_page *spage = sblock->pagev + index;
int ret;
- ret = scrub_submit(sdev);
- if (ret)
+ ret = scrub_add_page_to_bio(sdev, spage);
+ if (ret) {
+ scrub_block_put(sblock);
return ret;
+ }
}
+ if (force)
+ scrub_submit(sdev);
+
+ /* last one frees, either here or in bio completion for last page */
+ scrub_block_put(sblock);
return 0;
}
+static void scrub_bio_end_io(struct bio *bio, int err)
+{
+ struct scrub_bio *sbio = bio->bi_private;
+ struct scrub_dev *sdev = sbio->sdev;
+ struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
+
+ sbio->err = err;
+ sbio->bio = bio;
+
+ btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work);
+}
+
+static void scrub_bio_end_io_worker(struct btrfs_work *work)
+{
+ struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
+ struct scrub_dev *sdev = sbio->sdev;
+ int i;
+
+ BUG_ON(sbio->page_count > SCRUB_PAGES_PER_BIO);
+ if (sbio->err) {
+ for (i = 0; i < sbio->page_count; i++) {
+ struct scrub_page *spage = sbio->pagev[i];
+
+ spage->io_error = 1;
+ spage->sblock->no_io_error_seen = 0;
+ }
+ }
+
+ /* now complete the scrub_block items that have all pages completed */
+ for (i = 0; i < sbio->page_count; i++) {
+ struct scrub_page *spage = sbio->pagev[i];
+ struct scrub_block *sblock = spage->sblock;
+
+ if (atomic_dec_and_test(&sblock->outstanding_pages))
+ scrub_block_complete(sblock);
+ scrub_block_put(sblock);
+ }
+
+ if (sbio->err) {
+ /* what is this good for??? */
+ sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1);
+ sbio->bio->bi_flags |= 1 << BIO_UPTODATE;
+ sbio->bio->bi_phys_segments = 0;
+ sbio->bio->bi_idx = 0;
+
+ for (i = 0; i < sbio->page_count; i++) {
+ struct bio_vec *bi;
+ bi = &sbio->bio->bi_io_vec[i];
+ bi->bv_offset = 0;
+ bi->bv_len = PAGE_SIZE;
+ }
+ }
+
+ bio_put(sbio->bio);
+ sbio->bio = NULL;
+ spin_lock(&sdev->list_lock);
+ sbio->next_free = sdev->first_free;
+ sdev->first_free = sbio->index;
+ spin_unlock(&sdev->list_lock);
+ atomic_dec(&sdev->in_flight);
+ wake_up(&sdev->list_wait);
+}
+
+static void scrub_block_complete(struct scrub_block *sblock)
+{
+ if (!sblock->no_io_error_seen)
+ scrub_handle_errored_block(sblock);
+ else
+ scrub_checksum(sblock);
+}
+
static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
u8 *csum)
{
@@ -1052,7 +1671,6 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
int ret = 0;
unsigned long i;
unsigned long num_sectors;
- u32 sectorsize = sdev->dev->dev_root->sectorsize;
while (!list_empty(&sdev->csum_list)) {
sum = list_first_entry(&sdev->csum_list,
@@ -1070,7 +1688,7 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
if (!sum)
return 0;
- num_sectors = sum->len / sectorsize;
+ num_sectors = sum->len / sdev->sectorsize;
for (i = 0; i < num_sectors; ++i) {
if (sum->sums[i].bytenr == logical) {
memcpy(csum, &sum->sums[i].sum, sdev->csum_size);
@@ -1091,9 +1709,28 @@ static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len,
{
int ret;
u8 csum[BTRFS_CSUM_SIZE];
+ u32 blocksize;
+
+ if (flags & BTRFS_EXTENT_FLAG_DATA) {
+ blocksize = sdev->sectorsize;
+ spin_lock(&sdev->stat_lock);
+ sdev->stat.data_extents_scrubbed++;
+ sdev->stat.data_bytes_scrubbed += len;
+ spin_unlock(&sdev->stat_lock);
+ } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+ BUG_ON(sdev->nodesize != sdev->leafsize);
+ blocksize = sdev->nodesize;
+ spin_lock(&sdev->stat_lock);
+ sdev->stat.tree_extents_scrubbed++;
+ sdev->stat.tree_bytes_scrubbed += len;
+ spin_unlock(&sdev->stat_lock);
+ } else {
+ blocksize = sdev->sectorsize;
+ BUG_ON(1);
+ }
while (len) {
- u64 l = min_t(u64, len, PAGE_SIZE);
+ u64 l = min_t(u64, len, blocksize);
int have_csum = 0;
if (flags & BTRFS_EXTENT_FLAG_DATA) {
@@ -1102,8 +1739,8 @@ static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len,
if (have_csum == 0)
++sdev->stat.no_csum;
}
- ret = scrub_page(sdev, logical, l, physical, flags, gen,
- mirror_num, have_csum ? csum : NULL, 0);
+ ret = scrub_pages(sdev, logical, l, physical, flags, gen,
+ mirror_num, have_csum ? csum : NULL, 0);
if (ret)
return ret;
len -= l;
@@ -1168,6 +1805,11 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
if (!path)
return -ENOMEM;
+ /*
+ * work on commit root. The related disk blocks are static as
+ * long as COW is applied. This means, it is save to rewrite
+ * them to repair disk errors without any race conditions
+ */
path->search_commit_root = 1;
path->skip_locking = 1;
@@ -1365,7 +2007,8 @@ out:
}
static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev,
- u64 chunk_tree, u64 chunk_objectid, u64 chunk_offset, u64 length)
+ u64 chunk_tree, u64 chunk_objectid, u64 chunk_offset, u64 length,
+ u64 dev_offset)
{
struct btrfs_mapping_tree *map_tree =
&sdev->dev->dev_root->fs_info->mapping_tree;
@@ -1389,7 +2032,8 @@ static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev,
goto out;
for (i = 0; i < map->num_stripes; ++i) {
- if (map->stripes[i].dev == sdev->dev) {
+ if (map->stripes[i].dev == sdev->dev &&
+ map->stripes[i].physical == dev_offset) {
ret = scrub_stripe(sdev, map, i, chunk_offset, length);
if (ret)
goto out;
@@ -1485,7 +2129,7 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
break;
}
ret = scrub_chunk(sdev, chunk_tree, chunk_objectid,
- chunk_offset, length);
+ chunk_offset, length, found_key.offset);
btrfs_put_block_group(cache);
if (ret)
break;
@@ -1512,15 +2156,18 @@ static noinline_for_stack int scrub_supers(struct scrub_dev *sdev)
struct btrfs_device *device = sdev->dev;
struct btrfs_root *root = device->dev_root;
+ if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
+ return -EIO;
+
gen = root->fs_info->last_trans_committed;
for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
bytenr = btrfs_sb_offset(i);
- if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes)
+ if (bytenr + BTRFS_SUPER_INFO_SIZE > device->total_bytes)
break;
- ret = scrub_page(sdev, bytenr, PAGE_SIZE, bytenr,
- BTRFS_EXTENT_FLAG_SUPER, gen, i, NULL, 1);
+ ret = scrub_pages(sdev, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
+ BTRFS_EXTENT_FLAG_SUPER, gen, i, NULL, 1);
if (ret)
return ret;
}
@@ -1579,10 +2226,30 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
/*
* check some assumptions
*/
- if (root->sectorsize != PAGE_SIZE ||
- root->sectorsize != root->leafsize ||
- root->sectorsize != root->nodesize) {
- printk(KERN_ERR "btrfs_scrub: size assumptions fail\n");
+ if (root->nodesize != root->leafsize) {
+ printk(KERN_ERR
+ "btrfs_scrub: size assumption nodesize == leafsize (%d == %d) fails\n",
+ root->nodesize, root->leafsize);
+ return -EINVAL;
+ }
+
+ if (root->nodesize > BTRFS_STRIPE_LEN) {
+ /*
+ * in this case scrub is unable to calculate the checksum
+ * the way scrub is implemented. Do not handle this
+ * situation at all because it won't ever happen.
+ */
+ printk(KERN_ERR
+ "btrfs_scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails\n",
+ root->nodesize, BTRFS_STRIPE_LEN);
+ return -EINVAL;
+ }
+
+ if (root->sectorsize != PAGE_SIZE) {
+ /* not supported for data w/o checksums */
+ printk(KERN_ERR
+ "btrfs_scrub: size assumption sectorsize != PAGE_SIZE (%d != %lld) fails\n",
+ root->sectorsize, (unsigned long long)PAGE_SIZE);
return -EINVAL;
}
@@ -1652,7 +2319,7 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
return ret;
}
-int btrfs_scrub_pause(struct btrfs_root *root)
+void btrfs_scrub_pause(struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -1667,34 +2334,28 @@ int btrfs_scrub_pause(struct btrfs_root *root)
mutex_lock(&fs_info->scrub_lock);
}
mutex_unlock(&fs_info->scrub_lock);
-
- return 0;
}
-int btrfs_scrub_continue(struct btrfs_root *root)
+void btrfs_scrub_continue(struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
atomic_dec(&fs_info->scrub_pause_req);
wake_up(&fs_info->scrub_pause_wait);
- return 0;
}
-int btrfs_scrub_pause_super(struct btrfs_root *root)
+void btrfs_scrub_pause_super(struct btrfs_root *root)
{
down_write(&root->fs_info->scrub_super_lock);
- return 0;
}
-int btrfs_scrub_continue_super(struct btrfs_root *root)
+void btrfs_scrub_continue_super(struct btrfs_root *root)
{
up_write(&root->fs_info->scrub_super_lock);
- return 0;
}
-int btrfs_scrub_cancel(struct btrfs_root *root)
+int __btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
mutex_lock(&fs_info->scrub_lock);
if (!atomic_read(&fs_info->scrubs_running)) {
@@ -1715,6 +2376,11 @@ int btrfs_scrub_cancel(struct btrfs_root *root)
return 0;
}
+int btrfs_scrub_cancel(struct btrfs_root *root)
+{
+ return __btrfs_scrub_cancel(root->fs_info);
+}
+
int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev)
{
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -1737,6 +2403,7 @@ int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev)
return 0;
}
+
int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid)
{
struct btrfs_fs_info *fs_info = root->fs_info;
diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c
index bc1f6ad1844..c6ffa581241 100644
--- a/fs/btrfs/struct-funcs.c
+++ b/fs/btrfs/struct-funcs.c
@@ -44,8 +44,9 @@
#define BTRFS_SETGET_FUNCS(name, type, member, bits) \
u##bits btrfs_##name(struct extent_buffer *eb, type *s); \
void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val); \
-u##bits btrfs_##name(struct extent_buffer *eb, \
- type *s) \
+void btrfs_set_token_##name(struct extent_buffer *eb, type *s, u##bits val, struct btrfs_map_token *token); \
+u##bits btrfs_token_##name(struct extent_buffer *eb, \
+ type *s, struct btrfs_map_token *token) \
{ \
unsigned long part_offset = (unsigned long)s; \
unsigned long offset = part_offset + offsetof(type, member); \
@@ -54,9 +55,18 @@ u##bits btrfs_##name(struct extent_buffer *eb, \
char *kaddr; \
unsigned long map_start; \
unsigned long map_len; \
+ unsigned long mem_len = sizeof(((type *)0)->member); \
u##bits res; \
+ if (token && token->kaddr && token->offset <= offset && \
+ token->eb == eb && \
+ (token->offset + PAGE_CACHE_SIZE >= offset + mem_len)) { \
+ kaddr = token->kaddr; \
+ p = (type *)(kaddr + part_offset - token->offset); \
+ res = le##bits##_to_cpu(p->member); \
+ return res; \
+ } \
err = map_private_extent_buffer(eb, offset, \
- sizeof(((type *)0)->member), \
+ mem_len, \
&kaddr, &map_start, &map_len); \
if (err) { \
__le##bits leres; \
@@ -65,10 +75,15 @@ u##bits btrfs_##name(struct extent_buffer *eb, \
} \
p = (type *)(kaddr + part_offset - map_start); \
res = le##bits##_to_cpu(p->member); \
+ if (token) { \
+ token->kaddr = kaddr; \
+ token->offset = map_start; \
+ token->eb = eb; \
+ } \
return res; \
} \
-void btrfs_set_##name(struct extent_buffer *eb, \
- type *s, u##bits val) \
+void btrfs_set_token_##name(struct extent_buffer *eb, \
+ type *s, u##bits val, struct btrfs_map_token *token) \
{ \
unsigned long part_offset = (unsigned long)s; \
unsigned long offset = part_offset + offsetof(type, member); \
@@ -77,8 +92,17 @@ void btrfs_set_##name(struct extent_buffer *eb, \
char *kaddr; \
unsigned long map_start; \
unsigned long map_len; \
+ unsigned long mem_len = sizeof(((type *)0)->member); \
+ if (token && token->kaddr && token->offset <= offset && \
+ token->eb == eb && \
+ (token->offset + PAGE_CACHE_SIZE >= offset + mem_len)) { \
+ kaddr = token->kaddr; \
+ p = (type *)(kaddr + part_offset - token->offset); \
+ p->member = cpu_to_le##bits(val); \
+ return; \
+ } \
err = map_private_extent_buffer(eb, offset, \
- sizeof(((type *)0)->member), \
+ mem_len, \
&kaddr, &map_start, &map_len); \
if (err) { \
__le##bits val2; \
@@ -88,7 +112,22 @@ void btrfs_set_##name(struct extent_buffer *eb, \
} \
p = (type *)(kaddr + part_offset - map_start); \
p->member = cpu_to_le##bits(val); \
-}
+ if (token) { \
+ token->kaddr = kaddr; \
+ token->offset = map_start; \
+ token->eb = eb; \
+ } \
+} \
+void btrfs_set_##name(struct extent_buffer *eb, \
+ type *s, u##bits val) \
+{ \
+ btrfs_set_token_##name(eb, s, val, NULL); \
+} \
+u##bits btrfs_##name(struct extent_buffer *eb, \
+ type *s) \
+{ \
+ return btrfs_token_##name(eb, s, NULL); \
+} \
#include "ctree.h"
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index ae488aa1966..8d5d380f7bd 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -76,6 +76,9 @@ static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno,
case -EROFS:
errstr = "Readonly filesystem";
break;
+ case -EEXIST:
+ errstr = "Object already exists";
+ break;
default:
if (nbuf) {
if (snprintf(nbuf, 16, "error %d", -errno) >= 0)
@@ -116,6 +119,8 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
sb->s_flags |= MS_RDONLY;
printk(KERN_INFO "btrfs is forced readonly\n");
+ __btrfs_scrub_cancel(fs_info);
+// WARN_ON(1);
}
}
@@ -124,36 +129,143 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
* invokes the approciate error response.
*/
void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
- unsigned int line, int errno)
+ unsigned int line, int errno, const char *fmt, ...)
{
struct super_block *sb = fs_info->sb;
char nbuf[16];
const char *errstr;
+ va_list args;
+ va_start(args, fmt);
/*
* Special case: if the error is EROFS, and we're already
* under MS_RDONLY, then it is safe here.
*/
if (errno == -EROFS && (sb->s_flags & MS_RDONLY))
- return;
+ return;
- errstr = btrfs_decode_error(fs_info, errno, nbuf);
- printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s\n",
- sb->s_id, function, line, errstr);
- save_error_info(fs_info);
+ errstr = btrfs_decode_error(fs_info, errno, nbuf);
+ if (fmt) {
+ struct va_format vaf = {
+ .fmt = fmt,
+ .va = &args,
+ };
+
+ printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s (%pV)\n",
+ sb->s_id, function, line, errstr, &vaf);
+ } else {
+ printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s\n",
+ sb->s_id, function, line, errstr);
+ }
- btrfs_handle_error(fs_info);
+ /* Don't go through full error handling during mount */
+ if (sb->s_flags & MS_BORN) {
+ save_error_info(fs_info);
+ btrfs_handle_error(fs_info);
+ }
+ va_end(args);
}
-static void btrfs_put_super(struct super_block *sb)
+const char *logtypes[] = {
+ "emergency",
+ "alert",
+ "critical",
+ "error",
+ "warning",
+ "notice",
+ "info",
+ "debug",
+};
+
+void btrfs_printk(struct btrfs_fs_info *fs_info, const char *fmt, ...)
{
- struct btrfs_root *root = btrfs_sb(sb);
- int ret;
+ struct super_block *sb = fs_info->sb;
+ char lvl[4];
+ struct va_format vaf;
+ va_list args;
+ const char *type = logtypes[4];
+
+ va_start(args, fmt);
+
+ if (fmt[0] == '<' && isdigit(fmt[1]) && fmt[2] == '>') {
+ strncpy(lvl, fmt, 3);
+ fmt += 3;
+ type = logtypes[fmt[1] - '0'];
+ } else
+ *lvl = '\0';
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ printk("%sBTRFS %s (device %s): %pV", lvl, type, sb->s_id, &vaf);
+}
+
+/*
+ * We only mark the transaction aborted and then set the file system read-only.
+ * This will prevent new transactions from starting or trying to join this
+ * one.
+ *
+ * This means that error recovery at the call site is limited to freeing
+ * any local memory allocations and passing the error code up without
+ * further cleanup. The transaction should complete as it normally would
+ * in the call path but will return -EIO.
+ *
+ * We'll complete the cleanup in btrfs_end_transaction and
+ * btrfs_commit_transaction.
+ */
+void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, const char *function,
+ unsigned int line, int errno)
+{
+ WARN_ONCE(1, KERN_DEBUG "btrfs: Transaction aborted");
+ trans->aborted = errno;
+ /* Nothing used. The other threads that have joined this
+ * transaction may be able to continue. */
+ if (!trans->blocks_used) {
+ btrfs_printk(root->fs_info, "Aborting unused transaction.\n");
+ return;
+ }
+ trans->transaction->aborted = errno;
+ __btrfs_std_error(root->fs_info, function, line, errno, NULL);
+}
+/*
+ * __btrfs_panic decodes unexpected, fatal errors from the caller,
+ * issues an alert, and either panics or BUGs, depending on mount options.
+ */
+void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
+ unsigned int line, int errno, const char *fmt, ...)
+{
+ char nbuf[16];
+ char *s_id = "<unknown>";
+ const char *errstr;
+ struct va_format vaf = { .fmt = fmt };
+ va_list args;
- ret = close_ctree(root);
- sb->s_fs_info = NULL;
+ if (fs_info)
+ s_id = fs_info->sb->s_id;
- (void)ret; /* FIXME: need to fix VFS to return error? */
+ va_start(args, fmt);
+ vaf.va = &args;
+
+ errstr = btrfs_decode_error(fs_info, errno, nbuf);
+ if (fs_info->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR)
+ panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (%s)\n",
+ s_id, function, line, &vaf, errstr);
+
+ printk(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (%s)\n",
+ s_id, function, line, &vaf, errstr);
+ va_end(args);
+ /* Caller calls BUG() */
+}
+
+static void btrfs_put_super(struct super_block *sb)
+{
+ (void)close_ctree(btrfs_sb(sb)->tree_root);
+ /* FIXME: need to fix VFS to return error? */
+ /* AV: return it _where_? ->put_super() can be triggered by any number
+ * of async events, up to and including delivery of SIGKILL to the
+ * last process that kept it busy. Or segfault in the aforementioned
+ * process... Whom would you report that to?
+ */
}
enum {
@@ -163,8 +275,11 @@ enum {
Opt_compress_type, Opt_compress_force, Opt_compress_force_type,
Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
- Opt_enospc_debug, Opt_subvolrootid, Opt_defrag,
- Opt_inode_cache, Opt_no_space_cache, Opt_recovery, Opt_err,
+ Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_inode_cache,
+ Opt_no_space_cache, Opt_recovery, Opt_skip_balance,
+ Opt_check_integrity, Opt_check_integrity_including_extent_data,
+ Opt_check_integrity_print_mask, Opt_fatal_errors,
+ Opt_err,
};
static match_table_t tokens = {
@@ -199,12 +314,18 @@ static match_table_t tokens = {
{Opt_inode_cache, "inode_cache"},
{Opt_no_space_cache, "nospace_cache"},
{Opt_recovery, "recovery"},
+ {Opt_skip_balance, "skip_balance"},
+ {Opt_check_integrity, "check_int"},
+ {Opt_check_integrity_including_extent_data, "check_int_data"},
+ {Opt_check_integrity_print_mask, "check_int_print_mask=%d"},
+ {Opt_fatal_errors, "fatal_errors=%s"},
{Opt_err, NULL},
};
/*
* Regular mount options parser. Everything that is needed only when
* reading in a new superblock is parsed here.
+ * XXX JDM: This needs to be cleaned up for remount.
*/
int btrfs_parse_options(struct btrfs_root *root, char *options)
{
@@ -397,6 +518,52 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
printk(KERN_INFO "btrfs: enabling auto recovery");
btrfs_set_opt(info->mount_opt, RECOVERY);
break;
+ case Opt_skip_balance:
+ btrfs_set_opt(info->mount_opt, SKIP_BALANCE);
+ break;
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+ case Opt_check_integrity_including_extent_data:
+ printk(KERN_INFO "btrfs: enabling check integrity"
+ " including extent data\n");
+ btrfs_set_opt(info->mount_opt,
+ CHECK_INTEGRITY_INCLUDING_EXTENT_DATA);
+ btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY);
+ break;
+ case Opt_check_integrity:
+ printk(KERN_INFO "btrfs: enabling check integrity\n");
+ btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY);
+ break;
+ case Opt_check_integrity_print_mask:
+ intarg = 0;
+ match_int(&args[0], &intarg);
+ if (intarg) {
+ info->check_integrity_print_mask = intarg;
+ printk(KERN_INFO "btrfs:"
+ " check_integrity_print_mask 0x%x\n",
+ info->check_integrity_print_mask);
+ }
+ break;
+#else
+ case Opt_check_integrity_including_extent_data:
+ case Opt_check_integrity:
+ case Opt_check_integrity_print_mask:
+ printk(KERN_ERR "btrfs: support for check_integrity*"
+ " not compiled in!\n");
+ ret = -EINVAL;
+ goto out;
+#endif
+ case Opt_fatal_errors:
+ if (strcmp(args[0].from, "panic") == 0)
+ btrfs_set_opt(info->mount_opt,
+ PANIC_ON_FATAL_ERROR);
+ else if (strcmp(args[0].from, "bug") == 0)
+ btrfs_clear_opt(info->mount_opt,
+ PANIC_ON_FATAL_ERROR);
+ else {
+ ret = -EINVAL;
+ goto out;
+ }
+ break;
case Opt_err:
printk(KERN_INFO "btrfs: unrecognized mount option "
"'%s'\n", p);
@@ -500,7 +667,8 @@ out:
static struct dentry *get_default_root(struct super_block *sb,
u64 subvol_objectid)
{
- struct btrfs_root *root = sb->s_fs_info;
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+ struct btrfs_root *root = fs_info->tree_root;
struct btrfs_root *new_root;
struct btrfs_dir_item *di;
struct btrfs_path *path;
@@ -530,7 +698,7 @@ static struct dentry *get_default_root(struct super_block *sb,
* will mount by default if we haven't been given a specific subvolume
* to mount.
*/
- dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
+ dir_id = btrfs_super_root_dir(fs_info->super_copy);
di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0);
if (IS_ERR(di)) {
btrfs_free_path(path);
@@ -544,7 +712,7 @@ static struct dentry *get_default_root(struct super_block *sb,
*/
btrfs_free_path(path);
dir_id = BTRFS_FIRST_FREE_OBJECTID;
- new_root = root->fs_info->fs_root;
+ new_root = fs_info->fs_root;
goto setup_root;
}
@@ -552,7 +720,7 @@ static struct dentry *get_default_root(struct super_block *sb,
btrfs_free_path(path);
find_root:
- new_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
+ new_root = btrfs_read_fs_root_no_name(fs_info, &location);
if (IS_ERR(new_root))
return ERR_CAST(new_root);
@@ -587,8 +755,7 @@ static int btrfs_fill_super(struct super_block *sb,
void *data, int silent)
{
struct inode *inode;
- struct dentry *root_dentry;
- struct btrfs_root *tree_root;
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
struct btrfs_key key;
int err;
@@ -603,51 +770,48 @@ static int btrfs_fill_super(struct super_block *sb,
sb->s_flags |= MS_POSIXACL;
#endif
- tree_root = open_ctree(sb, fs_devices, (char *)data);
-
- if (IS_ERR(tree_root)) {
+ err = open_ctree(sb, fs_devices, (char *)data);
+ if (err) {
printk("btrfs: open_ctree failed\n");
- return PTR_ERR(tree_root);
+ return err;
}
- sb->s_fs_info = tree_root;
key.objectid = BTRFS_FIRST_FREE_OBJECTID;
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
- inode = btrfs_iget(sb, &key, tree_root->fs_info->fs_root, NULL);
+ inode = btrfs_iget(sb, &key, fs_info->fs_root, NULL);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
goto fail_close;
}
- root_dentry = d_alloc_root(inode);
- if (!root_dentry) {
- iput(inode);
+ sb->s_root = d_make_root(inode);
+ if (!sb->s_root) {
err = -ENOMEM;
goto fail_close;
}
- sb->s_root = root_dentry;
-
save_mount_options(sb, data);
cleancache_init_fs(sb);
+ sb->s_flags |= MS_ACTIVE;
return 0;
fail_close:
- close_ctree(tree_root);
+ close_ctree(fs_info->tree_root);
return err;
}
int btrfs_sync_fs(struct super_block *sb, int wait)
{
struct btrfs_trans_handle *trans;
- struct btrfs_root *root = btrfs_sb(sb);
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+ struct btrfs_root *root = fs_info->tree_root;
int ret;
trace_btrfs_sync_fs(wait);
if (!wait) {
- filemap_flush(root->fs_info->btree_inode->i_mapping);
+ filemap_flush(fs_info->btree_inode->i_mapping);
return 0;
}
@@ -663,8 +827,8 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
{
- struct btrfs_root *root = btrfs_sb(dentry->d_sb);
- struct btrfs_fs_info *info = root->fs_info;
+ struct btrfs_fs_info *info = btrfs_sb(dentry->d_sb);
+ struct btrfs_root *root = info->tree_root;
char *compress_type;
if (btrfs_test_opt(root, DEGRADED))
@@ -722,28 +886,27 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
seq_puts(seq, ",autodefrag");
if (btrfs_test_opt(root, INODE_MAP_CACHE))
seq_puts(seq, ",inode_cache");
+ if (btrfs_test_opt(root, SKIP_BALANCE))
+ seq_puts(seq, ",skip_balance");
+ if (btrfs_test_opt(root, PANIC_ON_FATAL_ERROR))
+ seq_puts(seq, ",fatal_errors=panic");
return 0;
}
static int btrfs_test_super(struct super_block *s, void *data)
{
- struct btrfs_root *test_root = data;
- struct btrfs_root *root = btrfs_sb(s);
+ struct btrfs_fs_info *p = data;
+ struct btrfs_fs_info *fs_info = btrfs_sb(s);
- /*
- * If this super block is going away, return false as it
- * can't match as an existing super block.
- */
- if (!atomic_read(&s->s_active))
- return 0;
- return root->fs_info->fs_devices == test_root->fs_info->fs_devices;
+ return fs_info->fs_devices == p->fs_devices;
}
static int btrfs_set_super(struct super_block *s, void *data)
{
- s->s_fs_info = data;
-
- return set_anon_super(s, data);
+ int err = set_anon_super(s, data);
+ if (!err)
+ s->s_fs_info = data;
+ return err;
}
/*
@@ -903,12 +1066,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
if (!fs_info)
return ERR_PTR(-ENOMEM);
- fs_info->tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
- if (!fs_info->tree_root) {
- error = -ENOMEM;
- goto error_fs_info;
- }
- fs_info->tree_root->fs_info = fs_info;
fs_info->fs_devices = fs_devices;
fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
@@ -928,43 +1085,30 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
}
bdev = fs_devices->latest_bdev;
- s = sget(fs_type, btrfs_test_super, btrfs_set_super,
- fs_info->tree_root);
+ s = sget(fs_type, btrfs_test_super, btrfs_set_super, fs_info);
if (IS_ERR(s)) {
error = PTR_ERR(s);
goto error_close_devices;
}
if (s->s_root) {
- if ((flags ^ s->s_flags) & MS_RDONLY) {
- deactivate_locked_super(s);
- error = -EBUSY;
- goto error_close_devices;
- }
-
btrfs_close_devices(fs_devices);
free_fs_info(fs_info);
+ if ((flags ^ s->s_flags) & MS_RDONLY)
+ error = -EBUSY;
} else {
char b[BDEVNAME_SIZE];
s->s_flags = flags | MS_NOSEC;
strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
- btrfs_sb(s)->fs_info->bdev_holder = fs_type;
+ btrfs_sb(s)->bdev_holder = fs_type;
error = btrfs_fill_super(s, fs_devices, data,
flags & MS_SILENT ? 1 : 0);
- if (error) {
- deactivate_locked_super(s);
- return ERR_PTR(error);
- }
-
- s->s_flags |= MS_ACTIVE;
}
- root = get_default_root(s, subvol_objectid);
- if (IS_ERR(root)) {
+ root = !error ? get_default_root(s, subvol_objectid) : ERR_PTR(error);
+ if (IS_ERR(root))
deactivate_locked_super(s);
- return root;
- }
return root;
@@ -977,12 +1121,22 @@ error_fs_info:
static int btrfs_remount(struct super_block *sb, int *flags, char *data)
{
- struct btrfs_root *root = btrfs_sb(sb);
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+ struct btrfs_root *root = fs_info->tree_root;
+ unsigned old_flags = sb->s_flags;
+ unsigned long old_opts = fs_info->mount_opt;
+ unsigned long old_compress_type = fs_info->compress_type;
+ u64 old_max_inline = fs_info->max_inline;
+ u64 old_alloc_start = fs_info->alloc_start;
+ int old_thread_pool_size = fs_info->thread_pool_size;
+ unsigned int old_metadata_ratio = fs_info->metadata_ratio;
int ret;
ret = btrfs_parse_options(root, data);
- if (ret)
- return -EINVAL;
+ if (ret) {
+ ret = -EINVAL;
+ goto restore;
+ }
if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
return 0;
@@ -990,26 +1144,44 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
if (*flags & MS_RDONLY) {
sb->s_flags |= MS_RDONLY;
- ret = btrfs_commit_super(root);
- WARN_ON(ret);
+ ret = btrfs_commit_super(root);
+ if (ret)
+ goto restore;
} else {
- if (root->fs_info->fs_devices->rw_devices == 0)
- return -EACCES;
+ if (fs_info->fs_devices->rw_devices == 0)
+ ret = -EACCES;
+ goto restore;
- if (btrfs_super_log_root(root->fs_info->super_copy) != 0)
- return -EINVAL;
+ if (btrfs_super_log_root(fs_info->super_copy) != 0)
+ ret = -EINVAL;
+ goto restore;
- ret = btrfs_cleanup_fs_roots(root->fs_info);
- WARN_ON(ret);
+ ret = btrfs_cleanup_fs_roots(fs_info);
+ if (ret)
+ goto restore;
/* recover relocation */
ret = btrfs_recover_relocation(root);
- WARN_ON(ret);
+ if (ret)
+ goto restore;
sb->s_flags &= ~MS_RDONLY;
}
return 0;
+
+restore:
+ /* We've hit an error - don't reset MS_RDONLY */
+ if (sb->s_flags & MS_RDONLY)
+ old_flags |= MS_RDONLY;
+ sb->s_flags = old_flags;
+ fs_info->mount_opt = old_opts;
+ fs_info->compress_type = old_compress_type;
+ fs_info->max_inline = old_max_inline;
+ fs_info->alloc_start = old_alloc_start;
+ fs_info->thread_pool_size = old_thread_pool_size;
+ fs_info->metadata_ratio = old_metadata_ratio;
+ return ret;
}
/* Used to sort the devices by max_avail(descending sort) */
@@ -1168,18 +1340,18 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
- struct btrfs_root *root = btrfs_sb(dentry->d_sb);
- struct btrfs_super_block *disk_super = root->fs_info->super_copy;
- struct list_head *head = &root->fs_info->space_info;
+ struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb);
+ struct btrfs_super_block *disk_super = fs_info->super_copy;
+ struct list_head *head = &fs_info->space_info;
struct btrfs_space_info *found;
u64 total_used = 0;
u64 total_free_data = 0;
int bits = dentry->d_sb->s_blocksize_bits;
- __be32 *fsid = (__be32 *)root->fs_info->fsid;
+ __be32 *fsid = (__be32 *)fs_info->fsid;
int ret;
/* holding chunk_muext to avoid allocating new chunks */
- mutex_lock(&root->fs_info->chunk_mutex);
+ mutex_lock(&fs_info->chunk_mutex);
rcu_read_lock();
list_for_each_entry_rcu(found, head, list) {
if (found->flags & BTRFS_BLOCK_GROUP_DATA) {
@@ -1198,14 +1370,14 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_bsize = dentry->d_sb->s_blocksize;
buf->f_type = BTRFS_SUPER_MAGIC;
buf->f_bavail = total_free_data;
- ret = btrfs_calc_avail_data_space(root, &total_free_data);
+ ret = btrfs_calc_avail_data_space(fs_info->tree_root, &total_free_data);
if (ret) {
- mutex_unlock(&root->fs_info->chunk_mutex);
+ mutex_unlock(&fs_info->chunk_mutex);
return ret;
}
buf->f_bavail += total_free_data;
buf->f_bavail = buf->f_bavail >> bits;
- mutex_unlock(&root->fs_info->chunk_mutex);
+ mutex_unlock(&fs_info->chunk_mutex);
/* We treat it as constant endianness (it doesn't matter _which_)
because we want the fsid to come out the same whether mounted
@@ -1219,11 +1391,18 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
return 0;
}
+static void btrfs_kill_super(struct super_block *sb)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+ kill_anon_super(sb);
+ free_fs_info(fs_info);
+}
+
static struct file_system_type btrfs_fs_type = {
.owner = THIS_MODULE,
.name = "btrfs",
.mount = btrfs_mount,
- .kill_sb = kill_anon_super,
+ .kill_sb = btrfs_kill_super,
.fs_flags = FS_REQUIRES_DEV,
};
@@ -1257,17 +1436,17 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
static int btrfs_freeze(struct super_block *sb)
{
- struct btrfs_root *root = btrfs_sb(sb);
- mutex_lock(&root->fs_info->transaction_kthread_mutex);
- mutex_lock(&root->fs_info->cleaner_mutex);
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+ mutex_lock(&fs_info->transaction_kthread_mutex);
+ mutex_lock(&fs_info->cleaner_mutex);
return 0;
}
static int btrfs_unfreeze(struct super_block *sb)
{
- struct btrfs_root *root = btrfs_sb(sb);
- mutex_unlock(&root->fs_info->cleaner_mutex);
- mutex_unlock(&root->fs_info->transaction_kthread_mutex);
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+ mutex_unlock(&fs_info->cleaner_mutex);
+ mutex_unlock(&fs_info->transaction_kthread_mutex);
return 0;
}
@@ -1332,9 +1511,7 @@ static int __init init_btrfs_fs(void)
if (err)
return err;
- err = btrfs_init_compress();
- if (err)
- goto free_sysfs;
+ btrfs_init_compress();
err = btrfs_init_cachep();
if (err)
@@ -1360,6 +1537,8 @@ static int __init init_btrfs_fs(void)
if (err)
goto unregister_ioctl;
+ btrfs_init_lockdep();
+
printk(KERN_INFO "%s loaded\n", BTRFS_BUILD_VERSION);
return 0;
@@ -1375,7 +1554,6 @@ free_cachep:
btrfs_destroy_cachep();
free_compress:
btrfs_exit_compress();
-free_sysfs:
btrfs_exit_sysfs();
return err;
}
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 81376d94cd3..8da29e8e4de 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -31,11 +31,13 @@
#define BTRFS_ROOT_TRANS_TAG 0
-static noinline void put_transaction(struct btrfs_transaction *transaction)
+void put_transaction(struct btrfs_transaction *transaction)
{
WARN_ON(atomic_read(&transaction->use_count) == 0);
if (atomic_dec_and_test(&transaction->use_count)) {
BUG_ON(!list_empty(&transaction->list));
+ WARN_ON(transaction->delayed_refs.root.rb_node);
+ WARN_ON(!list_empty(&transaction->delayed_refs.seq_head));
memset(transaction, 0, sizeof(*transaction));
kmem_cache_free(btrfs_transaction_cachep, transaction);
}
@@ -56,6 +58,12 @@ static noinline int join_transaction(struct btrfs_root *root, int nofail)
spin_lock(&root->fs_info->trans_lock);
loop:
+ /* The file system has been taken offline. No new transactions. */
+ if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+ spin_unlock(&root->fs_info->trans_lock);
+ return -EROFS;
+ }
+
if (root->fs_info->trans_no_join) {
if (!nofail) {
spin_unlock(&root->fs_info->trans_lock);
@@ -65,6 +73,8 @@ loop:
cur_trans = root->fs_info->running_transaction;
if (cur_trans) {
+ if (cur_trans->aborted)
+ return cur_trans->aborted;
atomic_inc(&cur_trans->use_count);
atomic_inc(&cur_trans->num_writers);
cur_trans->num_joined++;
@@ -108,8 +118,11 @@ loop:
cur_trans->delayed_refs.num_heads = 0;
cur_trans->delayed_refs.flushing = 0;
cur_trans->delayed_refs.run_delayed_start = 0;
+ cur_trans->delayed_refs.seq = 1;
+ init_waitqueue_head(&cur_trans->delayed_refs.seq_wait);
spin_lock_init(&cur_trans->commit_lock);
spin_lock_init(&cur_trans->delayed_refs.lock);
+ INIT_LIST_HEAD(&cur_trans->delayed_refs.seq_head);
INIT_LIST_HEAD(&cur_trans->pending_snapshots);
list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
@@ -118,6 +131,7 @@ loop:
root->fs_info->generation++;
cur_trans->transid = root->fs_info->generation;
root->fs_info->running_transaction = cur_trans;
+ cur_trans->aborted = 0;
spin_unlock(&root->fs_info->trans_lock);
return 0;
@@ -313,6 +327,7 @@ again:
h->use_count = 1;
h->block_rsv = NULL;
h->orig_rsv = NULL;
+ h->aborted = 0;
smp_mb();
if (cur_trans->blocked && may_wait_transaction(root, type)) {
@@ -321,6 +336,8 @@ again:
}
if (num_bytes) {
+ trace_btrfs_space_reservation(root->fs_info, "transaction",
+ h->transid, num_bytes, 1);
h->block_rsv = &root->fs_info->trans_block_rsv;
h->bytes_reserved = num_bytes;
}
@@ -432,6 +449,7 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
struct btrfs_transaction *cur_trans = trans->transaction;
struct btrfs_block_rsv *rsv = trans->block_rsv;
int updates;
+ int err;
smp_mb();
if (cur_trans->blocked || cur_trans->delayed_refs.flushing)
@@ -445,8 +463,11 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
updates = trans->delayed_ref_updates;
trans->delayed_ref_updates = 0;
- if (updates)
- btrfs_run_delayed_refs(trans, root, updates);
+ if (updates) {
+ err = btrfs_run_delayed_refs(trans, root, updates);
+ if (err) /* Error code will also eval true */
+ return err;
+ }
trans->block_rsv = rsv;
@@ -467,19 +488,12 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
btrfs_trans_release_metadata(trans, root);
trans->block_rsv = NULL;
- while (count < 4) {
+ while (count < 2) {
unsigned long cur = trans->delayed_ref_updates;
trans->delayed_ref_updates = 0;
if (cur &&
trans->transaction->delayed_refs.num_heads_ready > 64) {
trans->delayed_ref_updates = 0;
-
- /*
- * do a full flush if the transaction is trying
- * to close
- */
- if (trans->transaction->delayed_refs.flushing)
- cur = 0;
btrfs_run_delayed_refs(trans, root, cur);
} else {
break;
@@ -524,6 +538,11 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
if (throttle)
btrfs_run_delayed_iputs(root);
+ if (trans->aborted ||
+ root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+ return -EIO;
+ }
+
return 0;
}
@@ -689,11 +708,13 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
ret = btrfs_update_root(trans, tree_root,
&root->root_key,
&root->root_item);
- BUG_ON(ret);
+ if (ret)
+ return ret;
old_root_used = btrfs_root_used(&root->root_item);
ret = btrfs_write_dirty_block_groups(trans, root);
- BUG_ON(ret);
+ if (ret)
+ return ret;
}
if (root != root->fs_info->extent_root)
@@ -704,6 +725,10 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
/*
* update all the cowonly tree roots on disk
+ *
+ * The error handling in this function may not be obvious. Any of the
+ * failures will cause the file system to go offline. We still need
+ * to clean up the delayed refs.
*/
static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
@@ -714,22 +739,30 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
int ret;
ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
- BUG_ON(ret);
+ if (ret)
+ return ret;
eb = btrfs_lock_root_node(fs_info->tree_root);
- btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb);
+ ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL,
+ 0, &eb);
btrfs_tree_unlock(eb);
free_extent_buffer(eb);
+ if (ret)
+ return ret;
+
ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
- BUG_ON(ret);
+ if (ret)
+ return ret;
while (!list_empty(&fs_info->dirty_cowonly_roots)) {
next = fs_info->dirty_cowonly_roots.next;
list_del_init(next);
root = list_entry(next, struct btrfs_root, dirty_list);
- update_cowonly_root(trans, root);
+ ret = update_cowonly_root(trans, root);
+ if (ret)
+ return ret;
}
down_write(&fs_info->extent_commit_sem);
@@ -873,7 +906,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
if (!new_root_item) {
- pending->error = -ENOMEM;
+ ret = pending->error = -ENOMEM;
goto fail;
}
@@ -910,17 +943,24 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
* insert the directory item
*/
ret = btrfs_set_inode_index(parent_inode, &index);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
ret = btrfs_insert_dir_item(trans, parent_root,
dentry->d_name.name, dentry->d_name.len,
parent_inode, &key,
BTRFS_FT_DIR, index);
- BUG_ON(ret);
+ if (ret == -EEXIST) {
+ pending->error = -EEXIST;
+ dput(parent);
+ goto fail;
+ } else if (ret) {
+ goto abort_trans_dput;
+ }
btrfs_i_size_write(parent_inode, parent_inode->i_size +
dentry->d_name.len * 2);
ret = btrfs_update_inode(trans, parent_root, parent_inode);
- BUG_ON(ret);
+ if (ret)
+ goto abort_trans_dput;
/*
* pull in the delayed directory update
@@ -929,7 +969,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
* snapshot
*/
ret = btrfs_run_delayed_items(trans, root);
- BUG_ON(ret);
+ if (ret) { /* Transaction aborted */
+ dput(parent);
+ goto fail;
+ }
record_root_in_trans(trans, root);
btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
@@ -944,12 +987,21 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
btrfs_set_root_flags(new_root_item, root_flags);
old = btrfs_lock_root_node(root);
- btrfs_cow_block(trans, root, old, NULL, 0, &old);
+ ret = btrfs_cow_block(trans, root, old, NULL, 0, &old);
+ if (ret) {
+ btrfs_tree_unlock(old);
+ free_extent_buffer(old);
+ goto abort_trans_dput;
+ }
+
btrfs_set_lock_blocking(old);
- btrfs_copy_root(trans, root, old, &tmp, objectid);
+ ret = btrfs_copy_root(trans, root, old, &tmp, objectid);
+ /* clean up in any case */
btrfs_tree_unlock(old);
free_extent_buffer(old);
+ if (ret)
+ goto abort_trans_dput;
/* see comments in should_cow_block() */
root->force_cow = 1;
@@ -961,7 +1013,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
ret = btrfs_insert_root(trans, tree_root, &key, new_root_item);
btrfs_tree_unlock(tmp);
free_extent_buffer(tmp);
- BUG_ON(ret);
+ if (ret)
+ goto abort_trans_dput;
/*
* insert root back/forward references
@@ -970,19 +1023,32 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
parent_root->root_key.objectid,
btrfs_ino(parent_inode), index,
dentry->d_name.name, dentry->d_name.len);
- BUG_ON(ret);
dput(parent);
+ if (ret)
+ goto fail;
key.offset = (u64)-1;
pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key);
- BUG_ON(IS_ERR(pending->snap));
+ if (IS_ERR(pending->snap)) {
+ ret = PTR_ERR(pending->snap);
+ goto abort_trans;
+ }
- btrfs_reloc_post_snapshot(trans, pending);
+ ret = btrfs_reloc_post_snapshot(trans, pending);
+ if (ret)
+ goto abort_trans;
+ ret = 0;
fail:
kfree(new_root_item);
trans->block_rsv = rsv;
btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1);
- return 0;
+ return ret;
+
+abort_trans_dput:
+ dput(parent);
+abort_trans:
+ btrfs_abort_transaction(trans, root, ret);
+ goto fail;
}
/*
@@ -993,12 +1059,9 @@ static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
{
struct btrfs_pending_snapshot *pending;
struct list_head *head = &trans->transaction->pending_snapshots;
- int ret;
- list_for_each_entry(pending, head, list) {
- ret = create_pending_snapshot(trans, fs_info, pending);
- BUG_ON(ret);
- }
+ list_for_each_entry(pending, head, list)
+ create_pending_snapshot(trans, fs_info, pending);
return 0;
}
@@ -1122,6 +1185,33 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
return 0;
}
+
+static void cleanup_transaction(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_transaction *cur_trans = trans->transaction;
+
+ WARN_ON(trans->use_count > 1);
+
+ spin_lock(&root->fs_info->trans_lock);
+ list_del_init(&cur_trans->list);
+ spin_unlock(&root->fs_info->trans_lock);
+
+ btrfs_cleanup_one_transaction(trans->transaction, root);
+
+ put_transaction(cur_trans);
+ put_transaction(cur_trans);
+
+ trace_btrfs_transaction_commit(root);
+
+ btrfs_scrub_continue(root);
+
+ if (current->journal_info == trans)
+ current->journal_info = NULL;
+
+ kmem_cache_free(btrfs_trans_handle_cachep, trans);
+}
+
/*
* btrfs_transaction state sequence:
* in_commit = 0, blocked = 0 (initial)
@@ -1133,10 +1223,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
unsigned long joined = 0;
- struct btrfs_transaction *cur_trans;
+ struct btrfs_transaction *cur_trans = trans->transaction;
struct btrfs_transaction *prev_trans = NULL;
DEFINE_WAIT(wait);
- int ret;
+ int ret = -EIO;
int should_grow = 0;
unsigned long now = get_seconds();
int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);
@@ -1146,13 +1236,18 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
btrfs_trans_release_metadata(trans, root);
trans->block_rsv = NULL;
+ if (cur_trans->aborted)
+ goto cleanup_transaction;
+
/* make a pass through all the delayed refs we have so far
* any runnings procs may add more while we are here
*/
ret = btrfs_run_delayed_refs(trans, root, 0);
- BUG_ON(ret);
+ if (ret)
+ goto cleanup_transaction;
cur_trans = trans->transaction;
+
/*
* set the flushing flag so procs in this transaction have to
* start sending their work down.
@@ -1160,19 +1255,20 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
cur_trans->delayed_refs.flushing = 1;
ret = btrfs_run_delayed_refs(trans, root, 0);
- BUG_ON(ret);
+ if (ret)
+ goto cleanup_transaction;
spin_lock(&cur_trans->commit_lock);
if (cur_trans->in_commit) {
spin_unlock(&cur_trans->commit_lock);
atomic_inc(&cur_trans->use_count);
- btrfs_end_transaction(trans, root);
+ ret = btrfs_end_transaction(trans, root);
wait_for_commit(root, cur_trans);
put_transaction(cur_trans);
- return 0;
+ return ret;
}
trans->transaction->in_commit = 1;
@@ -1212,12 +1308,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
if (flush_on_commit || snap_pending) {
btrfs_start_delalloc_inodes(root, 1);
- ret = btrfs_wait_ordered_extents(root, 0, 1);
- BUG_ON(ret);
+ btrfs_wait_ordered_extents(root, 0, 1);
}
ret = btrfs_run_delayed_items(trans, root);
- BUG_ON(ret);
+ if (ret)
+ goto cleanup_transaction;
/*
* rename don't use btrfs_join_transaction, so, once we
@@ -1259,13 +1355,22 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
mutex_lock(&root->fs_info->reloc_mutex);
ret = btrfs_run_delayed_items(trans, root);
- BUG_ON(ret);
+ if (ret) {
+ mutex_unlock(&root->fs_info->reloc_mutex);
+ goto cleanup_transaction;
+ }
ret = create_pending_snapshots(trans, root->fs_info);
- BUG_ON(ret);
+ if (ret) {
+ mutex_unlock(&root->fs_info->reloc_mutex);
+ goto cleanup_transaction;
+ }
ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
- BUG_ON(ret);
+ if (ret) {
+ mutex_unlock(&root->fs_info->reloc_mutex);
+ goto cleanup_transaction;
+ }
/*
* make sure none of the code above managed to slip in a
@@ -1292,7 +1397,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
mutex_lock(&root->fs_info->tree_log_mutex);
ret = commit_fs_roots(trans, root);
- BUG_ON(ret);
+ if (ret) {
+ mutex_unlock(&root->fs_info->tree_log_mutex);
+ goto cleanup_transaction;
+ }
/* commit_fs_roots gets rid of all the tree log roots, it is now
* safe to free the root of tree log roots
@@ -1300,7 +1408,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
btrfs_free_log_root_tree(trans, root->fs_info);
ret = commit_cowonly_roots(trans, root);
- BUG_ON(ret);
+ if (ret) {
+ mutex_unlock(&root->fs_info->tree_log_mutex);
+ goto cleanup_transaction;
+ }
btrfs_prepare_extent_commit(trans, root);
@@ -1334,8 +1445,18 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
wake_up(&root->fs_info->transaction_wait);
ret = btrfs_write_and_wait_transaction(trans, root);
- BUG_ON(ret);
- write_ctree_super(trans, root, 0);
+ if (ret) {
+ btrfs_error(root->fs_info, ret,
+ "Error while writing out transaction.");
+ mutex_unlock(&root->fs_info->tree_log_mutex);
+ goto cleanup_transaction;
+ }
+
+ ret = write_ctree_super(trans, root, 0);
+ if (ret) {
+ mutex_unlock(&root->fs_info->tree_log_mutex);
+ goto cleanup_transaction;
+ }
/*
* the super is written, we can safely allow the tree-loggers
@@ -1371,6 +1492,15 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
btrfs_run_delayed_iputs(root);
return ret;
+
+cleanup_transaction:
+ btrfs_printk(root->fs_info, "Skipping commit of aborted transaction.\n");
+// WARN_ON(1);
+ if (current->journal_info == trans)
+ current->journal_info = NULL;
+ cleanup_transaction(trans, root);
+
+ return ret;
}
/*
@@ -1386,6 +1516,8 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root)
spin_unlock(&fs_info->trans_lock);
while (!list_empty(&list)) {
+ int ret;
+
root = list_entry(list.next, struct btrfs_root, root_list);
list_del(&root->root_list);
@@ -1393,9 +1525,10 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root)
if (btrfs_header_backref_rev(root->node) <
BTRFS_MIXED_BACKREF_REV)
- btrfs_drop_snapshot(root, NULL, 0);
+ ret = btrfs_drop_snapshot(root, NULL, 0, 0);
else
- btrfs_drop_snapshot(root, NULL, 1);
+ ret =btrfs_drop_snapshot(root, NULL, 1, 0);
+ BUG_ON(ret < 0);
}
return 0;
}
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 02564e6230a..fe27379e368 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -43,6 +43,7 @@ struct btrfs_transaction {
wait_queue_head_t commit_wait;
struct list_head pending_snapshots;
struct btrfs_delayed_ref_root delayed_refs;
+ int aborted;
};
struct btrfs_trans_handle {
@@ -55,6 +56,7 @@ struct btrfs_trans_handle {
struct btrfs_transaction *transaction;
struct btrfs_block_rsv *block_rsv;
struct btrfs_block_rsv *orig_rsv;
+ int aborted;
};
struct btrfs_pending_snapshot {
@@ -114,4 +116,5 @@ int btrfs_wait_marked_extents(struct btrfs_root *root,
struct extent_io_tree *dirty_pages, int mark);
int btrfs_transaction_blocked(struct btrfs_fs_info *info);
int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
+void put_transaction(struct btrfs_transaction *transaction);
#endif
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 3568374d419..d017283ae6f 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -212,14 +212,13 @@ int btrfs_pin_log_trans(struct btrfs_root *root)
* indicate we're done making changes to the log tree
* and wake up anyone waiting to do a sync
*/
-int btrfs_end_log_trans(struct btrfs_root *root)
+void btrfs_end_log_trans(struct btrfs_root *root)
{
if (atomic_dec_and_test(&root->log_writers)) {
smp_mb();
if (waitqueue_active(&root->log_writer_wait))
wake_up(&root->log_writer_wait);
}
- return 0;
}
@@ -378,12 +377,11 @@ insert:
u32 found_size;
found_size = btrfs_item_size_nr(path->nodes[0],
path->slots[0]);
- if (found_size > item_size) {
+ if (found_size > item_size)
btrfs_truncate_item(trans, root, path, item_size, 1);
- } else if (found_size < item_size) {
- ret = btrfs_extend_item(trans, root, path,
- item_size - found_size);
- }
+ else if (found_size < item_size)
+ btrfs_extend_item(trans, root, path,
+ item_size - found_size);
} else if (ret) {
return ret;
}
@@ -589,7 +587,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
ret = btrfs_inc_extent_ref(trans, root,
ins.objectid, ins.offset,
0, root->root_key.objectid,
- key->objectid, offset);
+ key->objectid, offset, 0);
BUG_ON(ret);
} else {
/*
@@ -1763,7 +1761,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
BTRFS_TREE_LOG_OBJECTID);
ret = btrfs_free_and_pin_reserved_extent(root,
bytenr, blocksize);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM or logic errors */
}
free_extent_buffer(next);
continue;
@@ -1871,20 +1869,26 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
wret = walk_down_log_tree(trans, log, path, &level, wc);
if (wret > 0)
break;
- if (wret < 0)
+ if (wret < 0) {
ret = wret;
+ goto out;
+ }
wret = walk_up_log_tree(trans, log, path, &level, wc);
if (wret > 0)
break;
- if (wret < 0)
+ if (wret < 0) {
ret = wret;
+ goto out;
+ }
}
/* was the root node processed? if not, catch it here */
if (path->nodes[orig_level]) {
- wc->process_func(log, path->nodes[orig_level], wc,
+ ret = wc->process_func(log, path->nodes[orig_level], wc,
btrfs_header_generation(path->nodes[orig_level]));
+ if (ret)
+ goto out;
if (wc->free) {
struct extent_buffer *next;
@@ -1900,10 +1904,11 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
BTRFS_TREE_LOG_OBJECTID);
ret = btrfs_free_and_pin_reserved_extent(log, next->start,
next->len);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM or logic errors */
}
}
+out:
for (i = 0; i <= orig_level; i++) {
if (path->nodes[i]) {
free_extent_buffer(path->nodes[i]);
@@ -1957,16 +1962,18 @@ static int wait_log_commit(struct btrfs_trans_handle *trans,
finish_wait(&root->log_commit_wait[index], &wait);
mutex_lock(&root->log_mutex);
- } while (root->log_transid < transid + 2 &&
+ } while (root->fs_info->last_trans_log_full_commit !=
+ trans->transid && root->log_transid < transid + 2 &&
atomic_read(&root->log_commit[index]));
return 0;
}
-static int wait_for_writer(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
+static void wait_for_writer(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
{
DEFINE_WAIT(wait);
- while (atomic_read(&root->log_writers)) {
+ while (root->fs_info->last_trans_log_full_commit !=
+ trans->transid && atomic_read(&root->log_writers)) {
prepare_to_wait(&root->log_writer_wait,
&wait, TASK_UNINTERRUPTIBLE);
mutex_unlock(&root->log_mutex);
@@ -1976,7 +1983,6 @@ static int wait_for_writer(struct btrfs_trans_handle *trans,
mutex_lock(&root->log_mutex);
finish_wait(&root->log_writer_wait, &wait);
}
- return 0;
}
/*
@@ -2044,7 +2050,11 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
* wait for them until later.
*/
ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ mutex_unlock(&root->log_mutex);
+ goto out;
+ }
btrfs_set_root_node(&log->root_item, log->node);
@@ -2075,7 +2085,11 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
}
if (ret) {
- BUG_ON(ret != -ENOSPC);
+ if (ret != -ENOSPC) {
+ btrfs_abort_transaction(trans, root, ret);
+ mutex_unlock(&log_root_tree->log_mutex);
+ goto out;
+ }
root->fs_info->last_trans_log_full_commit = trans->transid;
btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
mutex_unlock(&log_root_tree->log_mutex);
@@ -2115,7 +2129,11 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
ret = btrfs_write_and_wait_marked_extents(log_root_tree,
&log_root_tree->dirty_log_pages,
EXTENT_DIRTY | EXTENT_NEW);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ mutex_unlock(&log_root_tree->log_mutex);
+ goto out_wake_log_root;
+ }
btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
btrfs_set_super_log_root(root->fs_info->super_for_commit,
@@ -2324,7 +2342,9 @@ out_unlock:
if (ret == -ENOSPC) {
root->fs_info->last_trans_log_full_commit = trans->transid;
ret = 0;
- }
+ } else if (ret < 0)
+ btrfs_abort_transaction(trans, root, ret);
+
btrfs_end_log_trans(root);
return err;
@@ -2355,7 +2375,8 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
if (ret == -ENOSPC) {
root->fs_info->last_trans_log_full_commit = trans->transid;
ret = 0;
- }
+ } else if (ret < 0 && ret != -ENOENT)
+ btrfs_abort_transaction(trans, root, ret);
btrfs_end_log_trans(root);
return ret;
@@ -3167,13 +3188,20 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
fs_info->log_root_recovering = 1;
trans = btrfs_start_transaction(fs_info->tree_root, 0);
- BUG_ON(IS_ERR(trans));
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto error;
+ }
wc.trans = trans;
wc.pin = 1;
ret = walk_log_tree(trans, log_root_tree, &wc);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_error(fs_info, ret, "Failed to pin buffers while "
+ "recovering log root tree.");
+ goto error;
+ }
again:
key.objectid = BTRFS_TREE_LOG_OBJECTID;
@@ -3182,8 +3210,12 @@ again:
while (1) {
ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
- if (ret < 0)
- break;
+
+ if (ret < 0) {
+ btrfs_error(fs_info, ret,
+ "Couldn't find tree log root.");
+ goto error;
+ }
if (ret > 0) {
if (path->slots[0] == 0)
break;
@@ -3197,14 +3229,24 @@ again:
log = btrfs_read_fs_root_no_radix(log_root_tree,
&found_key);
- BUG_ON(IS_ERR(log));
+ if (IS_ERR(log)) {
+ ret = PTR_ERR(log);
+ btrfs_error(fs_info, ret,
+ "Couldn't read tree log root.");
+ goto error;
+ }
tmp_key.objectid = found_key.offset;
tmp_key.type = BTRFS_ROOT_ITEM_KEY;
tmp_key.offset = (u64)-1;
wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
- BUG_ON(IS_ERR_OR_NULL(wc.replay_dest));
+ if (IS_ERR(wc.replay_dest)) {
+ ret = PTR_ERR(wc.replay_dest);
+ btrfs_error(fs_info, ret, "Couldn't read target root "
+ "for tree log recovery.");
+ goto error;
+ }
wc.replay_dest->log_root = log;
btrfs_record_root_in_trans(trans, wc.replay_dest);
@@ -3252,6 +3294,10 @@ again:
kfree(log_root_tree);
return 0;
+
+error:
+ btrfs_free_path(path);
+ return ret;
}
/*
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 2270ac58d74..862ac813f6b 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -38,7 +38,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
const char *name, int name_len,
struct inode *inode, u64 dirid);
-int btrfs_end_log_trans(struct btrfs_root *root);
+void btrfs_end_log_trans(struct btrfs_root *root);
int btrfs_pin_log_trans(struct btrfs_root *root);
int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct inode *inode,
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c
new file mode 100644
index 00000000000..12f5147bd2b
--- /dev/null
+++ b/fs/btrfs/ulist.c
@@ -0,0 +1,220 @@
+/*
+ * Copyright (C) 2011 STRATO AG
+ * written by Arne Jansen <sensille@gmx.net>
+ * Distributed under the GNU GPL license version 2.
+ */
+
+#include <linux/slab.h>
+#include <linux/module.h>
+#include "ulist.h"
+
+/*
+ * ulist is a generic data structure to hold a collection of unique u64
+ * values. The only operations it supports is adding to the list and
+ * enumerating it.
+ * It is possible to store an auxiliary value along with the key.
+ *
+ * The implementation is preliminary and can probably be sped up
+ * significantly. A first step would be to store the values in an rbtree
+ * as soon as ULIST_SIZE is exceeded.
+ *
+ * A sample usage for ulists is the enumeration of directed graphs without
+ * visiting a node twice. The pseudo-code could look like this:
+ *
+ * ulist = ulist_alloc();
+ * ulist_add(ulist, root);
+ * elem = NULL;
+ *
+ * while ((elem = ulist_next(ulist, elem)) {
+ * for (all child nodes n in elem)
+ * ulist_add(ulist, n);
+ * do something useful with the node;
+ * }
+ * ulist_free(ulist);
+ *
+ * This assumes the graph nodes are adressable by u64. This stems from the
+ * usage for tree enumeration in btrfs, where the logical addresses are
+ * 64 bit.
+ *
+ * It is also useful for tree enumeration which could be done elegantly
+ * recursively, but is not possible due to kernel stack limitations. The
+ * loop would be similar to the above.
+ */
+
+/**
+ * ulist_init - freshly initialize a ulist
+ * @ulist: the ulist to initialize
+ *
+ * Note: don't use this function to init an already used ulist, use
+ * ulist_reinit instead.
+ */
+void ulist_init(struct ulist *ulist)
+{
+ ulist->nnodes = 0;
+ ulist->nodes = ulist->int_nodes;
+ ulist->nodes_alloced = ULIST_SIZE;
+}
+EXPORT_SYMBOL(ulist_init);
+
+/**
+ * ulist_fini - free up additionally allocated memory for the ulist
+ * @ulist: the ulist from which to free the additional memory
+ *
+ * This is useful in cases where the base 'struct ulist' has been statically
+ * allocated.
+ */
+void ulist_fini(struct ulist *ulist)
+{
+ /*
+ * The first ULIST_SIZE elements are stored inline in struct ulist.
+ * Only if more elements are alocated they need to be freed.
+ */
+ if (ulist->nodes_alloced > ULIST_SIZE)
+ kfree(ulist->nodes);
+ ulist->nodes_alloced = 0; /* in case ulist_fini is called twice */
+}
+EXPORT_SYMBOL(ulist_fini);
+
+/**
+ * ulist_reinit - prepare a ulist for reuse
+ * @ulist: ulist to be reused
+ *
+ * Free up all additional memory allocated for the list elements and reinit
+ * the ulist.
+ */
+void ulist_reinit(struct ulist *ulist)
+{
+ ulist_fini(ulist);
+ ulist_init(ulist);
+}
+EXPORT_SYMBOL(ulist_reinit);
+
+/**
+ * ulist_alloc - dynamically allocate a ulist
+ * @gfp_mask: allocation flags to for base allocation
+ *
+ * The allocated ulist will be returned in an initialized state.
+ */
+struct ulist *ulist_alloc(unsigned long gfp_mask)
+{
+ struct ulist *ulist = kmalloc(sizeof(*ulist), gfp_mask);
+
+ if (!ulist)
+ return NULL;
+
+ ulist_init(ulist);
+
+ return ulist;
+}
+EXPORT_SYMBOL(ulist_alloc);
+
+/**
+ * ulist_free - free dynamically allocated ulist
+ * @ulist: ulist to free
+ *
+ * It is not necessary to call ulist_fini before.
+ */
+void ulist_free(struct ulist *ulist)
+{
+ if (!ulist)
+ return;
+ ulist_fini(ulist);
+ kfree(ulist);
+}
+EXPORT_SYMBOL(ulist_free);
+
+/**
+ * ulist_add - add an element to the ulist
+ * @ulist: ulist to add the element to
+ * @val: value to add to ulist
+ * @aux: auxiliary value to store along with val
+ * @gfp_mask: flags to use for allocation
+ *
+ * Note: locking must be provided by the caller. In case of rwlocks write
+ * locking is needed
+ *
+ * Add an element to a ulist. The @val will only be added if it doesn't
+ * already exist. If it is added, the auxiliary value @aux is stored along with
+ * it. In case @val already exists in the ulist, @aux is ignored, even if
+ * it differs from the already stored value.
+ *
+ * ulist_add returns 0 if @val already exists in ulist and 1 if @val has been
+ * inserted.
+ * In case of allocation failure -ENOMEM is returned and the ulist stays
+ * unaltered.
+ */
+int ulist_add(struct ulist *ulist, u64 val, unsigned long aux,
+ unsigned long gfp_mask)
+{
+ int i;
+
+ for (i = 0; i < ulist->nnodes; ++i) {
+ if (ulist->nodes[i].val == val)
+ return 0;
+ }
+
+ if (ulist->nnodes >= ulist->nodes_alloced) {
+ u64 new_alloced = ulist->nodes_alloced + 128;
+ struct ulist_node *new_nodes;
+ void *old = NULL;
+
+ /*
+ * if nodes_alloced == ULIST_SIZE no memory has been allocated
+ * yet, so pass NULL to krealloc
+ */
+ if (ulist->nodes_alloced > ULIST_SIZE)
+ old = ulist->nodes;
+
+ new_nodes = krealloc(old, sizeof(*new_nodes) * new_alloced,
+ gfp_mask);
+ if (!new_nodes)
+ return -ENOMEM;
+
+ if (!old)
+ memcpy(new_nodes, ulist->int_nodes,
+ sizeof(ulist->int_nodes));
+
+ ulist->nodes = new_nodes;
+ ulist->nodes_alloced = new_alloced;
+ }
+ ulist->nodes[ulist->nnodes].val = val;
+ ulist->nodes[ulist->nnodes].aux = aux;
+ ++ulist->nnodes;
+
+ return 1;
+}
+EXPORT_SYMBOL(ulist_add);
+
+/**
+ * ulist_next - iterate ulist
+ * @ulist: ulist to iterate
+ * @prev: previously returned element or %NULL to start iteration
+ *
+ * Note: locking must be provided by the caller. In case of rwlocks only read
+ * locking is needed
+ *
+ * This function is used to iterate an ulist. The iteration is started with
+ * @prev = %NULL. It returns the next element from the ulist or %NULL when the
+ * end is reached. No guarantee is made with respect to the order in which
+ * the elements are returned. They might neither be returned in order of
+ * addition nor in ascending order.
+ * It is allowed to call ulist_add during an enumeration. Newly added items
+ * are guaranteed to show up in the running enumeration.
+ */
+struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_node *prev)
+{
+ int next;
+
+ if (ulist->nnodes == 0)
+ return NULL;
+
+ if (!prev)
+ return &ulist->nodes[0];
+
+ next = (prev - ulist->nodes) + 1;
+ if (next < 0 || next >= ulist->nnodes)
+ return NULL;
+
+ return &ulist->nodes[next];
+}
+EXPORT_SYMBOL(ulist_next);
diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h
new file mode 100644
index 00000000000..2e25dec58ec
--- /dev/null
+++ b/fs/btrfs/ulist.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (C) 2011 STRATO AG
+ * written by Arne Jansen <sensille@gmx.net>
+ * Distributed under the GNU GPL license version 2.
+ *
+ */
+
+#ifndef __ULIST__
+#define __ULIST__
+
+/*
+ * ulist is a generic data structure to hold a collection of unique u64
+ * values. The only operations it supports is adding to the list and
+ * enumerating it.
+ * It is possible to store an auxiliary value along with the key.
+ *
+ * The implementation is preliminary and can probably be sped up
+ * significantly. A first step would be to store the values in an rbtree
+ * as soon as ULIST_SIZE is exceeded.
+ */
+
+/*
+ * number of elements statically allocated inside struct ulist
+ */
+#define ULIST_SIZE 16
+
+/*
+ * element of the list
+ */
+struct ulist_node {
+ u64 val; /* value to store */
+ unsigned long aux; /* auxiliary value saved along with the val */
+};
+
+struct ulist {
+ /*
+ * number of elements stored in list
+ */
+ unsigned long nnodes;
+
+ /*
+ * number of nodes we already have room for
+ */
+ unsigned long nodes_alloced;
+
+ /*
+ * pointer to the array storing the elements. The first ULIST_SIZE
+ * elements are stored inline. In this case the it points to int_nodes.
+ * After exceeding ULIST_SIZE, dynamic memory is allocated.
+ */
+ struct ulist_node *nodes;
+
+ /*
+ * inline storage space for the first ULIST_SIZE entries
+ */
+ struct ulist_node int_nodes[ULIST_SIZE];
+};
+
+void ulist_init(struct ulist *ulist);
+void ulist_fini(struct ulist *ulist);
+void ulist_reinit(struct ulist *ulist);
+struct ulist *ulist_alloc(unsigned long gfp_mask);
+void ulist_free(struct ulist *ulist);
+int ulist_add(struct ulist *ulist, u64 val, unsigned long aux,
+ unsigned long gfp_mask);
+struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_node *prev);
+
+#endif
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f4b839fd3c9..a872b48be0a 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -23,6 +23,7 @@
#include <linux/random.h>
#include <linux/iocontext.h>
#include <linux/capability.h>
+#include <linux/kthread.h>
#include <asm/div64.h>
#include "compat.h"
#include "ctree.h"
@@ -32,6 +33,7 @@
#include "print-tree.h"
#include "volumes.h"
#include "async-thread.h"
+#include "check-integrity.h"
static int init_first_rw_device(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -65,7 +67,7 @@ static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
kfree(fs_devices);
}
-int btrfs_cleanup_fs_uuids(void)
+void btrfs_cleanup_fs_uuids(void)
{
struct btrfs_fs_devices *fs_devices;
@@ -75,7 +77,6 @@ int btrfs_cleanup_fs_uuids(void)
list_del(&fs_devices->list);
free_fs_devices(fs_devices);
}
- return 0;
}
static noinline struct btrfs_device *__find_device(struct list_head *head,
@@ -128,7 +129,7 @@ static void requeue_list(struct btrfs_pending_bios *pending_bios,
* the list if the block device is congested. This way, multiple devices
* can make progress from a single worker thread.
*/
-static noinline int run_scheduled_bios(struct btrfs_device *device)
+static noinline void run_scheduled_bios(struct btrfs_device *device)
{
struct bio *pending;
struct backing_dev_info *bdi;
@@ -246,7 +247,7 @@ loop_lock:
sync_pending = 0;
}
- submit_bio(cur->bi_rw, cur);
+ btrfsic_submit_bio(cur->bi_rw, cur);
num_run++;
batch_run++;
if (need_resched())
@@ -314,7 +315,6 @@ loop_lock:
done:
blk_finish_plug(&plug);
- return 0;
}
static void pending_bios_fn(struct btrfs_work *work)
@@ -453,16 +453,27 @@ error:
return ERR_PTR(-ENOMEM);
}
-int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
+void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
{
struct btrfs_device *device, *next;
+ struct block_device *latest_bdev = NULL;
+ u64 latest_devid = 0;
+ u64 latest_transid = 0;
+
mutex_lock(&uuid_mutex);
again:
/* This is the initialized path, it is safe to release the devices. */
list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
- if (device->in_fs_metadata)
+ if (device->in_fs_metadata) {
+ if (!latest_transid ||
+ device->generation > latest_transid) {
+ latest_devid = device->devid;
+ latest_transid = device->generation;
+ latest_bdev = device->bdev;
+ }
continue;
+ }
if (device->bdev) {
blkdev_put(device->bdev, device->mode);
@@ -485,8 +496,11 @@ again:
goto again;
}
+ fs_devices->latest_bdev = latest_bdev;
+ fs_devices->latest_devid = latest_devid;
+ fs_devices->latest_trans = latest_transid;
+
mutex_unlock(&uuid_mutex);
- return 0;
}
static void __free_device(struct work_struct *work)
@@ -535,10 +549,10 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
fs_devices->num_can_discard--;
new_device = kmalloc(sizeof(*new_device), GFP_NOFS);
- BUG_ON(!new_device);
+ BUG_ON(!new_device); /* -ENOMEM */
memcpy(new_device, device, sizeof(*new_device));
new_device->name = kstrdup(device->name, GFP_NOFS);
- BUG_ON(device->name && !new_device->name);
+ BUG_ON(device->name && !new_device->name); /* -ENOMEM */
new_device->bdev = NULL;
new_device->writeable = 0;
new_device->in_fs_metadata = 0;
@@ -608,6 +622,8 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
printk(KERN_INFO "open %s failed\n", device->name);
goto error;
}
+ filemap_write_and_wait(bdev->bd_inode->i_mapping);
+ invalidate_bdev(bdev);
set_blocksize(bdev, 4096);
bh = btrfs_read_dev_super(bdev);
@@ -706,8 +722,6 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
u64 devid;
u64 transid;
- mutex_lock(&uuid_mutex);
-
flags |= FMODE_EXCL;
bdev = blkdev_get_by_path(path, flags, holder);
@@ -716,6 +730,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
goto error;
}
+ mutex_lock(&uuid_mutex);
ret = set_blocksize(bdev, 4096);
if (ret)
goto error_close;
@@ -737,9 +752,9 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
brelse(bh);
error_close:
+ mutex_unlock(&uuid_mutex);
blkdev_put(bdev, flags);
error:
- mutex_unlock(&uuid_mutex);
return ret;
}
@@ -829,7 +844,6 @@ out:
/*
* find_free_dev_extent - find free space in the specified device
- * @trans: transaction handler
* @device: the device which we search the free space in
* @num_bytes: the size of the free space that we need
* @start: store the start of the free space.
@@ -848,8 +862,7 @@ out:
* But if we don't find suitable free space, it is used to store the size of
* the max free space.
*/
-int find_free_dev_extent(struct btrfs_trans_handle *trans,
- struct btrfs_device *device, u64 num_bytes,
+int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
u64 *start, u64 *len)
{
struct btrfs_key key;
@@ -893,7 +906,7 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans,
key.offset = search_start;
key.type = BTRFS_DEV_EXTENT_KEY;
- ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto out;
if (ret > 0) {
@@ -1025,8 +1038,10 @@ again:
leaf = path->nodes[0];
extent = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_dev_extent);
+ } else {
+ btrfs_error(root->fs_info, ret, "Slot search failed");
+ goto out;
}
- BUG_ON(ret);
if (device->bytes_used > 0) {
u64 len = btrfs_dev_extent_length(leaf, extent);
@@ -1036,7 +1051,10 @@ again:
spin_unlock(&root->fs_info->free_chunk_lock);
}
ret = btrfs_del_item(trans, root, path);
-
+ if (ret) {
+ btrfs_error(root->fs_info, ret,
+ "Failed to remove dev extent item");
+ }
out:
btrfs_free_path(path);
return ret;
@@ -1064,7 +1082,8 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
key.type = BTRFS_DEV_EXTENT_KEY;
ret = btrfs_insert_empty_item(trans, root, path, &key,
sizeof(*extent));
- BUG_ON(ret);
+ if (ret)
+ goto out;
leaf = path->nodes[0];
extent = btrfs_item_ptr(leaf, path->slots[0],
@@ -1079,6 +1098,7 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
btrfs_set_dev_extent_length(leaf, extent, num_bytes);
btrfs_mark_buffer_dirty(leaf);
+out:
btrfs_free_path(path);
return ret;
}
@@ -1104,7 +1124,7 @@ static noinline int find_next_chunk(struct btrfs_root *root,
if (ret < 0)
goto error;
- BUG_ON(ret == 0);
+ BUG_ON(ret == 0); /* Corruption */
ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY);
if (ret) {
@@ -1148,7 +1168,7 @@ static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid)
if (ret < 0)
goto error;
- BUG_ON(ret == 0);
+ BUG_ON(ret == 0); /* Corruption */
ret = btrfs_previous_item(root, path, BTRFS_DEV_ITEMS_OBJECTID,
BTRFS_DEV_ITEM_KEY);
@@ -1282,7 +1302,6 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
bool clear_super = false;
mutex_lock(&uuid_mutex);
- mutex_lock(&root->fs_info->volume_mutex);
all_avail = root->fs_info->avail_data_alloc_bits |
root->fs_info->avail_system_alloc_bits |
@@ -1337,6 +1356,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
}
set_blocksize(bdev, 4096);
+ invalidate_bdev(bdev);
bh = btrfs_read_dev_super(bdev);
if (!bh) {
ret = -EINVAL;
@@ -1452,7 +1472,6 @@ error_close:
if (bdev)
blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
out:
- mutex_unlock(&root->fs_info->volume_mutex);
mutex_unlock(&uuid_mutex);
return ret;
error_undo:
@@ -1469,8 +1488,7 @@ error_undo:
/*
* does all the dirty work required for changing file system's UUID.
*/
-static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
+static int btrfs_prepare_sprout(struct btrfs_root *root)
{
struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
struct btrfs_fs_devices *old_devices;
@@ -1585,7 +1603,7 @@ next_slot:
(unsigned long)btrfs_device_fsid(dev_item),
BTRFS_UUID_SIZE);
device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
- BUG_ON(!device);
+ BUG_ON(!device); /* Logic error */
if (device->fs_devices->seeding) {
btrfs_set_device_generation(leaf, dev_item,
@@ -1629,7 +1647,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
}
filemap_write_and_wait(bdev->bd_inode->i_mapping);
- mutex_lock(&root->fs_info->volume_mutex);
devices = &root->fs_info->fs_devices->devices;
/*
@@ -1695,8 +1712,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
if (seeding_dev) {
sb->s_flags &= ~MS_RDONLY;
- ret = btrfs_prepare_sprout(trans, root);
- BUG_ON(ret);
+ ret = btrfs_prepare_sprout(root);
+ BUG_ON(ret); /* -ENOMEM */
}
device->fs_devices = root->fs_info->fs_devices;
@@ -1734,11 +1751,15 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
if (seeding_dev) {
ret = init_first_rw_device(trans, root, device);
- BUG_ON(ret);
+ if (ret)
+ goto error_trans;
ret = btrfs_finish_sprout(trans, root);
- BUG_ON(ret);
+ if (ret)
+ goto error_trans;
} else {
ret = btrfs_add_device(trans, root, device);
+ if (ret)
+ goto error_trans;
}
/*
@@ -1748,25 +1769,38 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
btrfs_clear_space_info_full(root->fs_info);
unlock_chunks(root);
- btrfs_commit_transaction(trans, root);
+ ret = btrfs_commit_transaction(trans, root);
if (seeding_dev) {
mutex_unlock(&uuid_mutex);
up_write(&sb->s_umount);
+ if (ret) /* transaction commit */
+ return ret;
+
ret = btrfs_relocate_sys_chunks(root);
- BUG_ON(ret);
+ if (ret < 0)
+ btrfs_error(root->fs_info, ret,
+ "Failed to relocate sys chunks after "
+ "device initialization. This can be fixed "
+ "using the \"btrfs balance\" command.");
}
-out:
- mutex_unlock(&root->fs_info->volume_mutex);
+
return ret;
+
+error_trans:
+ unlock_chunks(root);
+ btrfs_abort_transaction(trans, root, ret);
+ btrfs_end_transaction(trans, root);
+ kfree(device->name);
+ kfree(device);
error:
blkdev_put(bdev, FMODE_EXCL);
if (seeding_dev) {
mutex_unlock(&uuid_mutex);
up_write(&sb->s_umount);
}
- goto out;
+ return ret;
}
static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
@@ -1867,10 +1901,20 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
key.type = BTRFS_CHUNK_ITEM_KEY;
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
- BUG_ON(ret);
+ if (ret < 0)
+ goto out;
+ else if (ret > 0) { /* Logic error or corruption */
+ btrfs_error(root->fs_info, -ENOENT,
+ "Failed lookup while freeing chunk.");
+ ret = -ENOENT;
+ goto out;
+ }
ret = btrfs_del_item(trans, root, path);
-
+ if (ret < 0)
+ btrfs_error(root->fs_info, ret,
+ "Failed to delete chunk item.");
+out:
btrfs_free_path(path);
return ret;
}
@@ -1959,7 +2003,7 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
em = lookup_extent_mapping(em_tree, chunk_offset, 1);
read_unlock(&em_tree->lock);
- BUG_ON(em->start > chunk_offset ||
+ BUG_ON(!em || em->start > chunk_offset ||
em->start + em->len < chunk_offset);
map = (struct map_lookup *)em->bdev;
@@ -2032,7 +2076,7 @@ again:
ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
if (ret < 0)
goto error;
- BUG_ON(ret == 0);
+ BUG_ON(ret == 0); /* Corruption */
ret = btrfs_previous_item(chunk_root, path, key.objectid,
key.type);
@@ -2077,6 +2121,358 @@ error:
return ret;
}
+static int insert_balance_item(struct btrfs_root *root,
+ struct btrfs_balance_control *bctl)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_balance_item *item;
+ struct btrfs_disk_balance_args disk_bargs;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ int ret, err;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ btrfs_free_path(path);
+ return PTR_ERR(trans);
+ }
+
+ key.objectid = BTRFS_BALANCE_OBJECTID;
+ key.type = BTRFS_BALANCE_ITEM_KEY;
+ key.offset = 0;
+
+ ret = btrfs_insert_empty_item(trans, root, path, &key,
+ sizeof(*item));
+ if (ret)
+ goto out;
+
+ leaf = path->nodes[0];
+ item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
+
+ memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
+
+ btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
+ btrfs_set_balance_data(leaf, item, &disk_bargs);
+ btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
+ btrfs_set_balance_meta(leaf, item, &disk_bargs);
+ btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
+ btrfs_set_balance_sys(leaf, item, &disk_bargs);
+
+ btrfs_set_balance_flags(leaf, item, bctl->flags);
+
+ btrfs_mark_buffer_dirty(leaf);
+out:
+ btrfs_free_path(path);
+ err = btrfs_commit_transaction(trans, root);
+ if (err && !ret)
+ ret = err;
+ return ret;
+}
+
+static int del_balance_item(struct btrfs_root *root)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ int ret, err;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ btrfs_free_path(path);
+ return PTR_ERR(trans);
+ }
+
+ key.objectid = BTRFS_BALANCE_OBJECTID;
+ key.type = BTRFS_BALANCE_ITEM_KEY;
+ key.offset = 0;
+
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret < 0)
+ goto out;
+ if (ret > 0) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ ret = btrfs_del_item(trans, root, path);
+out:
+ btrfs_free_path(path);
+ err = btrfs_commit_transaction(trans, root);
+ if (err && !ret)
+ ret = err;
+ return ret;
+}
+
+/*
+ * This is a heuristic used to reduce the number of chunks balanced on
+ * resume after balance was interrupted.
+ */
+static void update_balance_args(struct btrfs_balance_control *bctl)
+{
+ /*
+ * Turn on soft mode for chunk types that were being converted.
+ */
+ if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
+ bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
+ if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
+ bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
+ if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
+ bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
+
+ /*
+ * Turn on usage filter if is not already used. The idea is
+ * that chunks that we have already balanced should be
+ * reasonably full. Don't do it for chunks that are being
+ * converted - that will keep us from relocating unconverted
+ * (albeit full) chunks.
+ */
+ if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
+ !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
+ bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
+ bctl->data.usage = 90;
+ }
+ if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
+ !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
+ bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
+ bctl->sys.usage = 90;
+ }
+ if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
+ !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
+ bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
+ bctl->meta.usage = 90;
+ }
+}
+
+/*
+ * Should be called with both balance and volume mutexes held to
+ * serialize other volume operations (add_dev/rm_dev/resize) with
+ * restriper. Same goes for unset_balance_control.
+ */
+static void set_balance_control(struct btrfs_balance_control *bctl)
+{
+ struct btrfs_fs_info *fs_info = bctl->fs_info;
+
+ BUG_ON(fs_info->balance_ctl);
+
+ spin_lock(&fs_info->balance_lock);
+ fs_info->balance_ctl = bctl;
+ spin_unlock(&fs_info->balance_lock);
+}
+
+static void unset_balance_control(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_balance_control *bctl = fs_info->balance_ctl;
+
+ BUG_ON(!fs_info->balance_ctl);
+
+ spin_lock(&fs_info->balance_lock);
+ fs_info->balance_ctl = NULL;
+ spin_unlock(&fs_info->balance_lock);
+
+ kfree(bctl);
+}
+
+/*
+ * Balance filters. Return 1 if chunk should be filtered out
+ * (should not be balanced).
+ */
+static int chunk_profiles_filter(u64 chunk_type,
+ struct btrfs_balance_args *bargs)
+{
+ chunk_type = chunk_to_extended(chunk_type) &
+ BTRFS_EXTENDED_PROFILE_MASK;
+
+ if (bargs->profiles & chunk_type)
+ return 0;
+
+ return 1;
+}
+
+static u64 div_factor_fine(u64 num, int factor)
+{
+ if (factor <= 0)
+ return 0;
+ if (factor >= 100)
+ return num;
+
+ num *= factor;
+ do_div(num, 100);
+ return num;
+}
+
+static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
+ struct btrfs_balance_args *bargs)
+{
+ struct btrfs_block_group_cache *cache;
+ u64 chunk_used, user_thresh;
+ int ret = 1;
+
+ cache = btrfs_lookup_block_group(fs_info, chunk_offset);
+ chunk_used = btrfs_block_group_used(&cache->item);
+
+ user_thresh = div_factor_fine(cache->key.offset, bargs->usage);
+ if (chunk_used < user_thresh)
+ ret = 0;
+
+ btrfs_put_block_group(cache);
+ return ret;
+}
+
+static int chunk_devid_filter(struct extent_buffer *leaf,
+ struct btrfs_chunk *chunk,
+ struct btrfs_balance_args *bargs)
+{
+ struct btrfs_stripe *stripe;
+ int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
+ int i;
+
+ for (i = 0; i < num_stripes; i++) {
+ stripe = btrfs_stripe_nr(chunk, i);
+ if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
+ return 0;
+ }
+
+ return 1;
+}
+
+/* [pstart, pend) */
+static int chunk_drange_filter(struct extent_buffer *leaf,
+ struct btrfs_chunk *chunk,
+ u64 chunk_offset,
+ struct btrfs_balance_args *bargs)
+{
+ struct btrfs_stripe *stripe;
+ int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
+ u64 stripe_offset;
+ u64 stripe_length;
+ int factor;
+ int i;
+
+ if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
+ return 0;
+
+ if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP |
+ BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10))
+ factor = 2;
+ else
+ factor = 1;
+ factor = num_stripes / factor;
+
+ for (i = 0; i < num_stripes; i++) {
+ stripe = btrfs_stripe_nr(chunk, i);
+ if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
+ continue;
+
+ stripe_offset = btrfs_stripe_offset(leaf, stripe);
+ stripe_length = btrfs_chunk_length(leaf, chunk);
+ do_div(stripe_length, factor);
+
+ if (stripe_offset < bargs->pend &&
+ stripe_offset + stripe_length > bargs->pstart)
+ return 0;
+ }
+
+ return 1;
+}
+
+/* [vstart, vend) */
+static int chunk_vrange_filter(struct extent_buffer *leaf,
+ struct btrfs_chunk *chunk,
+ u64 chunk_offset,
+ struct btrfs_balance_args *bargs)
+{
+ if (chunk_offset < bargs->vend &&
+ chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
+ /* at least part of the chunk is inside this vrange */
+ return 0;
+
+ return 1;
+}
+
+static int chunk_soft_convert_filter(u64 chunk_type,
+ struct btrfs_balance_args *bargs)
+{
+ if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
+ return 0;
+
+ chunk_type = chunk_to_extended(chunk_type) &
+ BTRFS_EXTENDED_PROFILE_MASK;
+
+ if (bargs->target == chunk_type)
+ return 1;
+
+ return 0;
+}
+
+static int should_balance_chunk(struct btrfs_root *root,
+ struct extent_buffer *leaf,
+ struct btrfs_chunk *chunk, u64 chunk_offset)
+{
+ struct btrfs_balance_control *bctl = root->fs_info->balance_ctl;
+ struct btrfs_balance_args *bargs = NULL;
+ u64 chunk_type = btrfs_chunk_type(leaf, chunk);
+
+ /* type filter */
+ if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
+ (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
+ return 0;
+ }
+
+ if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
+ bargs = &bctl->data;
+ else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
+ bargs = &bctl->sys;
+ else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
+ bargs = &bctl->meta;
+
+ /* profiles filter */
+ if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
+ chunk_profiles_filter(chunk_type, bargs)) {
+ return 0;
+ }
+
+ /* usage filter */
+ if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
+ chunk_usage_filter(bctl->fs_info, chunk_offset, bargs)) {
+ return 0;
+ }
+
+ /* devid filter */
+ if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
+ chunk_devid_filter(leaf, chunk, bargs)) {
+ return 0;
+ }
+
+ /* drange filter, makes sense only with devid filter */
+ if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
+ chunk_drange_filter(leaf, chunk, chunk_offset, bargs)) {
+ return 0;
+ }
+
+ /* vrange filter */
+ if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
+ chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
+ return 0;
+ }
+
+ /* soft profile changing mode */
+ if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
+ chunk_soft_convert_filter(chunk_type, bargs)) {
+ return 0;
+ }
+
+ return 1;
+}
+
static u64 div_factor(u64 num, int factor)
{
if (factor == 10)
@@ -2086,29 +2482,28 @@ static u64 div_factor(u64 num, int factor)
return num;
}
-int btrfs_balance(struct btrfs_root *dev_root)
+static int __btrfs_balance(struct btrfs_fs_info *fs_info)
{
- int ret;
- struct list_head *devices = &dev_root->fs_info->fs_devices->devices;
+ struct btrfs_balance_control *bctl = fs_info->balance_ctl;
+ struct btrfs_root *chunk_root = fs_info->chunk_root;
+ struct btrfs_root *dev_root = fs_info->dev_root;
+ struct list_head *devices;
struct btrfs_device *device;
u64 old_size;
u64 size_to_free;
+ struct btrfs_chunk *chunk;
struct btrfs_path *path;
struct btrfs_key key;
- struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root;
- struct btrfs_trans_handle *trans;
struct btrfs_key found_key;
-
- if (dev_root->fs_info->sb->s_flags & MS_RDONLY)
- return -EROFS;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- mutex_lock(&dev_root->fs_info->volume_mutex);
- dev_root = dev_root->fs_info->dev_root;
+ struct btrfs_trans_handle *trans;
+ struct extent_buffer *leaf;
+ int slot;
+ int ret;
+ int enospc_errors = 0;
+ bool counting = true;
/* step one make some room on all the devices */
+ devices = &fs_info->fs_devices->devices;
list_for_each_entry(device, devices, dev_list) {
old_size = device->total_bytes;
size_to_free = div_factor(old_size, 1);
@@ -2137,11 +2532,23 @@ int btrfs_balance(struct btrfs_root *dev_root)
ret = -ENOMEM;
goto error;
}
+
+ /* zero out stat counters */
+ spin_lock(&fs_info->balance_lock);
+ memset(&bctl->stat, 0, sizeof(bctl->stat));
+ spin_unlock(&fs_info->balance_lock);
+again:
key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
key.offset = (u64)-1;
key.type = BTRFS_CHUNK_ITEM_KEY;
while (1) {
+ if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
+ atomic_read(&fs_info->balance_cancel_req)) {
+ ret = -ECANCELED;
+ goto error;
+ }
+
ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
if (ret < 0)
goto error;
@@ -2151,15 +2558,19 @@ int btrfs_balance(struct btrfs_root *dev_root)
* failed
*/
if (ret == 0)
- break;
+ BUG(); /* FIXME break ? */
ret = btrfs_previous_item(chunk_root, path, 0,
BTRFS_CHUNK_ITEM_KEY);
- if (ret)
+ if (ret) {
+ ret = 0;
break;
+ }
+
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ btrfs_item_key_to_cpu(leaf, &found_key, slot);
- btrfs_item_key_to_cpu(path->nodes[0], &found_key,
- path->slots[0]);
if (found_key.objectid != key.objectid)
break;
@@ -2167,22 +2578,399 @@ int btrfs_balance(struct btrfs_root *dev_root)
if (found_key.offset == 0)
break;
+ chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
+
+ if (!counting) {
+ spin_lock(&fs_info->balance_lock);
+ bctl->stat.considered++;
+ spin_unlock(&fs_info->balance_lock);
+ }
+
+ ret = should_balance_chunk(chunk_root, leaf, chunk,
+ found_key.offset);
btrfs_release_path(path);
+ if (!ret)
+ goto loop;
+
+ if (counting) {
+ spin_lock(&fs_info->balance_lock);
+ bctl->stat.expected++;
+ spin_unlock(&fs_info->balance_lock);
+ goto loop;
+ }
+
ret = btrfs_relocate_chunk(chunk_root,
chunk_root->root_key.objectid,
found_key.objectid,
found_key.offset);
if (ret && ret != -ENOSPC)
goto error;
+ if (ret == -ENOSPC) {
+ enospc_errors++;
+ } else {
+ spin_lock(&fs_info->balance_lock);
+ bctl->stat.completed++;
+ spin_unlock(&fs_info->balance_lock);
+ }
+loop:
key.offset = found_key.offset - 1;
}
- ret = 0;
+
+ if (counting) {
+ btrfs_release_path(path);
+ counting = false;
+ goto again;
+ }
error:
btrfs_free_path(path);
- mutex_unlock(&dev_root->fs_info->volume_mutex);
+ if (enospc_errors) {
+ printk(KERN_INFO "btrfs: %d enospc errors during balance\n",
+ enospc_errors);
+ if (!ret)
+ ret = -ENOSPC;
+ }
+
return ret;
}
+/**
+ * alloc_profile_is_valid - see if a given profile is valid and reduced
+ * @flags: profile to validate
+ * @extended: if true @flags is treated as an extended profile
+ */
+static int alloc_profile_is_valid(u64 flags, int extended)
+{
+ u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK :
+ BTRFS_BLOCK_GROUP_PROFILE_MASK);
+
+ flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;
+
+ /* 1) check that all other bits are zeroed */
+ if (flags & ~mask)
+ return 0;
+
+ /* 2) see if profile is reduced */
+ if (flags == 0)
+ return !extended; /* "0" is valid for usual profiles */
+
+ /* true if exactly one bit set */
+ return (flags & (flags - 1)) == 0;
+}
+
+static inline int balance_need_close(struct btrfs_fs_info *fs_info)
+{
+ /* cancel requested || normal exit path */
+ return atomic_read(&fs_info->balance_cancel_req) ||
+ (atomic_read(&fs_info->balance_pause_req) == 0 &&
+ atomic_read(&fs_info->balance_cancel_req) == 0);
+}
+
+static void __cancel_balance(struct btrfs_fs_info *fs_info)
+{
+ int ret;
+
+ unset_balance_control(fs_info);
+ ret = del_balance_item(fs_info->tree_root);
+ BUG_ON(ret);
+}
+
+void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
+ struct btrfs_ioctl_balance_args *bargs);
+
+/*
+ * Should be called with both balance and volume mutexes held
+ */
+int btrfs_balance(struct btrfs_balance_control *bctl,
+ struct btrfs_ioctl_balance_args *bargs)
+{
+ struct btrfs_fs_info *fs_info = bctl->fs_info;
+ u64 allowed;
+ int mixed = 0;
+ int ret;
+
+ if (btrfs_fs_closing(fs_info) ||
+ atomic_read(&fs_info->balance_pause_req) ||
+ atomic_read(&fs_info->balance_cancel_req)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ allowed = btrfs_super_incompat_flags(fs_info->super_copy);
+ if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
+ mixed = 1;
+
+ /*
+ * In case of mixed groups both data and meta should be picked,
+ * and identical options should be given for both of them.
+ */
+ allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA;
+ if (mixed && (bctl->flags & allowed)) {
+ if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
+ !(bctl->flags & BTRFS_BALANCE_METADATA) ||
+ memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
+ printk(KERN_ERR "btrfs: with mixed groups data and "
+ "metadata balance options must be the same\n");
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+
+ allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+ if (fs_info->fs_devices->num_devices == 1)
+ allowed |= BTRFS_BLOCK_GROUP_DUP;
+ else if (fs_info->fs_devices->num_devices < 4)
+ allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
+ else
+ allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID10);
+
+ if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+ (!alloc_profile_is_valid(bctl->data.target, 1) ||
+ (bctl->data.target & ~allowed))) {
+ printk(KERN_ERR "btrfs: unable to start balance with target "
+ "data profile %llu\n",
+ (unsigned long long)bctl->data.target);
+ ret = -EINVAL;
+ goto out;
+ }
+ if ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+ (!alloc_profile_is_valid(bctl->meta.target, 1) ||
+ (bctl->meta.target & ~allowed))) {
+ printk(KERN_ERR "btrfs: unable to start balance with target "
+ "metadata profile %llu\n",
+ (unsigned long long)bctl->meta.target);
+ ret = -EINVAL;
+ goto out;
+ }
+ if ((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+ (!alloc_profile_is_valid(bctl->sys.target, 1) ||
+ (bctl->sys.target & ~allowed))) {
+ printk(KERN_ERR "btrfs: unable to start balance with target "
+ "system profile %llu\n",
+ (unsigned long long)bctl->sys.target);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* allow dup'ed data chunks only in mixed mode */
+ if (!mixed && (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+ (bctl->data.target & BTRFS_BLOCK_GROUP_DUP)) {
+ printk(KERN_ERR "btrfs: dup for data is not allowed\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* allow to reduce meta or sys integrity only if force set */
+ allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID10;
+ if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+ (fs_info->avail_system_alloc_bits & allowed) &&
+ !(bctl->sys.target & allowed)) ||
+ ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+ (fs_info->avail_metadata_alloc_bits & allowed) &&
+ !(bctl->meta.target & allowed))) {
+ if (bctl->flags & BTRFS_BALANCE_FORCE) {
+ printk(KERN_INFO "btrfs: force reducing metadata "
+ "integrity\n");
+ } else {
+ printk(KERN_ERR "btrfs: balance will reduce metadata "
+ "integrity, use force if you want this\n");
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+
+ ret = insert_balance_item(fs_info->tree_root, bctl);
+ if (ret && ret != -EEXIST)
+ goto out;
+
+ if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
+ BUG_ON(ret == -EEXIST);
+ set_balance_control(bctl);
+ } else {
+ BUG_ON(ret != -EEXIST);
+ spin_lock(&fs_info->balance_lock);
+ update_balance_args(bctl);
+ spin_unlock(&fs_info->balance_lock);
+ }
+
+ atomic_inc(&fs_info->balance_running);
+ mutex_unlock(&fs_info->balance_mutex);
+
+ ret = __btrfs_balance(fs_info);
+
+ mutex_lock(&fs_info->balance_mutex);
+ atomic_dec(&fs_info->balance_running);
+
+ if (bargs) {
+ memset(bargs, 0, sizeof(*bargs));
+ update_ioctl_balance_args(fs_info, 0, bargs);
+ }
+
+ if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
+ balance_need_close(fs_info)) {
+ __cancel_balance(fs_info);
+ }
+
+ wake_up(&fs_info->balance_wait_q);
+
+ return ret;
+out:
+ if (bctl->flags & BTRFS_BALANCE_RESUME)
+ __cancel_balance(fs_info);
+ else
+ kfree(bctl);
+ return ret;
+}
+
+static int balance_kthread(void *data)
+{
+ struct btrfs_balance_control *bctl =
+ (struct btrfs_balance_control *)data;
+ struct btrfs_fs_info *fs_info = bctl->fs_info;
+ int ret = 0;
+
+ mutex_lock(&fs_info->volume_mutex);
+ mutex_lock(&fs_info->balance_mutex);
+
+ set_balance_control(bctl);
+
+ if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) {
+ printk(KERN_INFO "btrfs: force skipping balance\n");
+ } else {
+ printk(KERN_INFO "btrfs: continuing balance\n");
+ ret = btrfs_balance(bctl, NULL);
+ }
+
+ mutex_unlock(&fs_info->balance_mutex);
+ mutex_unlock(&fs_info->volume_mutex);
+ return ret;
+}
+
+int btrfs_recover_balance(struct btrfs_root *tree_root)
+{
+ struct task_struct *tsk;
+ struct btrfs_balance_control *bctl;
+ struct btrfs_balance_item *item;
+ struct btrfs_disk_balance_args disk_bargs;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
+ if (!bctl) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ key.objectid = BTRFS_BALANCE_OBJECTID;
+ key.type = BTRFS_BALANCE_ITEM_KEY;
+ key.offset = 0;
+
+ ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out_bctl;
+ if (ret > 0) { /* ret = -ENOENT; */
+ ret = 0;
+ goto out_bctl;
+ }
+
+ leaf = path->nodes[0];
+ item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
+
+ bctl->fs_info = tree_root->fs_info;
+ bctl->flags = btrfs_balance_flags(leaf, item) | BTRFS_BALANCE_RESUME;
+
+ btrfs_balance_data(leaf, item, &disk_bargs);
+ btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
+ btrfs_balance_meta(leaf, item, &disk_bargs);
+ btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
+ btrfs_balance_sys(leaf, item, &disk_bargs);
+ btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
+
+ tsk = kthread_run(balance_kthread, bctl, "btrfs-balance");
+ if (IS_ERR(tsk))
+ ret = PTR_ERR(tsk);
+ else
+ goto out;
+
+out_bctl:
+ kfree(bctl);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
+{
+ int ret = 0;
+
+ mutex_lock(&fs_info->balance_mutex);
+ if (!fs_info->balance_ctl) {
+ mutex_unlock(&fs_info->balance_mutex);
+ return -ENOTCONN;
+ }
+
+ if (atomic_read(&fs_info->balance_running)) {
+ atomic_inc(&fs_info->balance_pause_req);
+ mutex_unlock(&fs_info->balance_mutex);
+
+ wait_event(fs_info->balance_wait_q,
+ atomic_read(&fs_info->balance_running) == 0);
+
+ mutex_lock(&fs_info->balance_mutex);
+ /* we are good with balance_ctl ripped off from under us */
+ BUG_ON(atomic_read(&fs_info->balance_running));
+ atomic_dec(&fs_info->balance_pause_req);
+ } else {
+ ret = -ENOTCONN;
+ }
+
+ mutex_unlock(&fs_info->balance_mutex);
+ return ret;
+}
+
+int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
+{
+ mutex_lock(&fs_info->balance_mutex);
+ if (!fs_info->balance_ctl) {
+ mutex_unlock(&fs_info->balance_mutex);
+ return -ENOTCONN;
+ }
+
+ atomic_inc(&fs_info->balance_cancel_req);
+ /*
+ * if we are running just wait and return, balance item is
+ * deleted in btrfs_balance in this case
+ */
+ if (atomic_read(&fs_info->balance_running)) {
+ mutex_unlock(&fs_info->balance_mutex);
+ wait_event(fs_info->balance_wait_q,
+ atomic_read(&fs_info->balance_running) == 0);
+ mutex_lock(&fs_info->balance_mutex);
+ } else {
+ /* __cancel_balance needs volume_mutex */
+ mutex_unlock(&fs_info->balance_mutex);
+ mutex_lock(&fs_info->volume_mutex);
+ mutex_lock(&fs_info->balance_mutex);
+
+ if (fs_info->balance_ctl)
+ __cancel_balance(fs_info);
+
+ mutex_unlock(&fs_info->volume_mutex);
+ }
+
+ BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running));
+ atomic_dec(&fs_info->balance_cancel_req);
+ mutex_unlock(&fs_info->balance_mutex);
+ return 0;
+}
+
/*
* shrinking a device means finding all of the device extents past
* the new size, and then following the back refs to the chunks.
@@ -2234,7 +3022,7 @@ again:
key.offset = (u64)-1;
key.type = BTRFS_DEV_EXTENT_KEY;
- while (1) {
+ do {
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto done;
@@ -2276,8 +3064,7 @@ again:
goto done;
if (ret == -ENOSPC)
failed++;
- key.offset -= 1;
- }
+ } while (key.offset-- > 0);
if (failed && !retried) {
failed = 0;
@@ -2323,8 +3110,7 @@ done:
return ret;
}
-static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
+static int btrfs_add_system_chunk(struct btrfs_root *root,
struct btrfs_key *key,
struct btrfs_chunk *chunk, int item_size)
{
@@ -2396,11 +3182,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
int i;
int j;
- if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
- (type & BTRFS_BLOCK_GROUP_DUP)) {
- WARN_ON(1);
- type &= ~BTRFS_BLOCK_GROUP_DUP;
- }
+ BUG_ON(!alloc_profile_is_valid(type, 0));
if (list_empty(&fs_devices->alloc_list))
return -ENOSPC;
@@ -2441,10 +3223,14 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
max_stripe_size = 1024 * 1024 * 1024;
max_chunk_size = 10 * max_stripe_size;
} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
- max_stripe_size = 256 * 1024 * 1024;
+ /* for larger filesystems, use larger metadata chunks */
+ if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024)
+ max_stripe_size = 1024 * 1024 * 1024;
+ else
+ max_stripe_size = 256 * 1024 * 1024;
max_chunk_size = max_stripe_size;
} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
- max_stripe_size = 8 * 1024 * 1024;
+ max_stripe_size = 32 * 1024 * 1024;
max_chunk_size = 2 * max_stripe_size;
} else {
printk(KERN_ERR "btrfs: invalid chunk type 0x%llx requested\n",
@@ -2496,7 +3282,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
if (total_avail == 0)
continue;
- ret = find_free_dev_extent(trans, device,
+ ret = find_free_dev_extent(device,
max_stripe_size * dev_stripes,
&dev_offset, &max_avail);
if (ret && ret != -ENOSPC)
@@ -2592,13 +3378,15 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em);
write_unlock(&em_tree->lock);
- BUG_ON(ret);
free_extent_map(em);
+ if (ret)
+ goto error;
ret = btrfs_make_block_group(trans, extent_root, 0, type,
BTRFS_FIRST_CHUNK_TREE_OBJECTID,
start, num_bytes);
- BUG_ON(ret);
+ if (ret)
+ goto error;
for (i = 0; i < map->num_stripes; ++i) {
struct btrfs_device *device;
@@ -2611,7 +3399,10 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
info->chunk_root->root_key.objectid,
BTRFS_FIRST_CHUNK_TREE_OBJECTID,
start, dev_offset, stripe_size);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, extent_root, ret);
+ goto error;
+ }
}
kfree(devices_info);
@@ -2647,7 +3438,8 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
device = map->stripes[index].dev;
device->bytes_used += stripe_size;
ret = btrfs_update_device(trans, device);
- BUG_ON(ret);
+ if (ret)
+ goto out_free;
index++;
}
@@ -2684,16 +3476,19 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
key.offset = chunk_offset;
ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
- BUG_ON(ret);
- if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
- ret = btrfs_add_system_chunk(trans, chunk_root, &key, chunk,
+ if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
+ /*
+ * TODO: Cleanup of inserted chunk root in case of
+ * failure.
+ */
+ ret = btrfs_add_system_chunk(chunk_root, &key, chunk,
item_size);
- BUG_ON(ret);
}
+out_free:
kfree(chunk);
- return 0;
+ return ret;
}
/*
@@ -2725,7 +3520,8 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
chunk_size, stripe_size);
- BUG_ON(ret);
+ if (ret)
+ return ret;
return 0;
}
@@ -2752,28 +3548,29 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
return ret;
alloc_profile = BTRFS_BLOCK_GROUP_METADATA |
- (fs_info->metadata_alloc_profile &
- fs_info->avail_metadata_alloc_bits);
+ fs_info->avail_metadata_alloc_bits;
alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
&stripe_size, chunk_offset, alloc_profile);
- BUG_ON(ret);
+ if (ret)
+ return ret;
sys_chunk_offset = chunk_offset + chunk_size;
alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM |
- (fs_info->system_alloc_profile &
- fs_info->avail_system_alloc_bits);
+ fs_info->avail_system_alloc_bits;
alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
&sys_chunk_size, &sys_stripe_size,
sys_chunk_offset, alloc_profile);
- BUG_ON(ret);
+ if (ret)
+ goto abort;
ret = btrfs_add_device(trans, fs_info->chunk_root, device);
- BUG_ON(ret);
+ if (ret)
+ goto abort;
/*
* Modifying chunk tree needs allocating new blocks from both
@@ -2783,13 +3580,20 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
*/
ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
chunk_size, stripe_size);
- BUG_ON(ret);
+ if (ret)
+ goto abort;
ret = __finish_chunk_alloc(trans, extent_root, sys_map,
sys_chunk_offset, sys_chunk_size,
sys_stripe_size);
- BUG_ON(ret);
+ if (ret)
+ goto abort;
+
return 0;
+
+abort:
+ btrfs_abort_transaction(trans, root, ret);
+ return ret;
}
int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
@@ -2901,26 +3705,13 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
u64 stripe_nr;
u64 stripe_nr_orig;
u64 stripe_nr_end;
- int stripes_allocated = 8;
- int stripes_required = 1;
int stripe_index;
int i;
+ int ret = 0;
int num_stripes;
int max_errors = 0;
struct btrfs_bio *bbio = NULL;
- if (bbio_ret && !(rw & (REQ_WRITE | REQ_DISCARD)))
- stripes_allocated = 1;
-again:
- if (bbio_ret) {
- bbio = kzalloc(btrfs_bio_size(stripes_allocated),
- GFP_NOFS);
- if (!bbio)
- return -ENOMEM;
-
- atomic_set(&bbio->error, 0);
- }
-
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, logical, *length);
read_unlock(&em_tree->lock);
@@ -2939,32 +3730,6 @@ again:
if (mirror_num > map->num_stripes)
mirror_num = 0;
- /* if our btrfs_bio struct is too small, back off and try again */
- if (rw & REQ_WRITE) {
- if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
- BTRFS_BLOCK_GROUP_DUP)) {
- stripes_required = map->num_stripes;
- max_errors = 1;
- } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
- stripes_required = map->sub_stripes;
- max_errors = 1;
- }
- }
- if (rw & REQ_DISCARD) {
- if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
- BTRFS_BLOCK_GROUP_RAID1 |
- BTRFS_BLOCK_GROUP_DUP |
- BTRFS_BLOCK_GROUP_RAID10)) {
- stripes_required = map->num_stripes;
- }
- }
- if (bbio_ret && (rw & (REQ_WRITE | REQ_DISCARD)) &&
- stripes_allocated < stripes_required) {
- stripes_allocated = map->num_stripes;
- free_extent_map(em);
- kfree(bbio);
- goto again;
- }
stripe_nr = offset;
/*
* stripe_nr counts the total number of stripes we have to stride
@@ -2980,10 +3745,7 @@ again:
if (rw & REQ_DISCARD)
*length = min_t(u64, em->len - offset, *length);
- else if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
- BTRFS_BLOCK_GROUP_RAID1 |
- BTRFS_BLOCK_GROUP_RAID10 |
- BTRFS_BLOCK_GROUP_DUP)) {
+ else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
/* we limit the length of each bio to what fits in a stripe */
*length = min_t(u64, em->len - offset,
map->stripe_len - stripe_offset);
@@ -3059,81 +3821,55 @@ again:
}
BUG_ON(stripe_index >= map->num_stripes);
+ bbio = kzalloc(btrfs_bio_size(num_stripes), GFP_NOFS);
+ if (!bbio) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ atomic_set(&bbio->error, 0);
+
if (rw & REQ_DISCARD) {
+ int factor = 0;
+ int sub_stripes = 0;
+ u64 stripes_per_dev = 0;
+ u32 remaining_stripes = 0;
+
+ if (map->type &
+ (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
+ if (map->type & BTRFS_BLOCK_GROUP_RAID0)
+ sub_stripes = 1;
+ else
+ sub_stripes = map->sub_stripes;
+
+ factor = map->num_stripes / sub_stripes;
+ stripes_per_dev = div_u64_rem(stripe_nr_end -
+ stripe_nr_orig,
+ factor,
+ &remaining_stripes);
+ }
+
for (i = 0; i < num_stripes; i++) {
bbio->stripes[i].physical =
map->stripes[stripe_index].physical +
stripe_offset + stripe_nr * map->stripe_len;
bbio->stripes[i].dev = map->stripes[stripe_index].dev;
- if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
- u64 stripes;
- u32 last_stripe = 0;
- int j;
-
- div_u64_rem(stripe_nr_end - 1,
- map->num_stripes,
- &last_stripe);
-
- for (j = 0; j < map->num_stripes; j++) {
- u32 test;
-
- div_u64_rem(stripe_nr_end - 1 - j,
- map->num_stripes, &test);
- if (test == stripe_index)
- break;
- }
- stripes = stripe_nr_end - 1 - j;
- do_div(stripes, map->num_stripes);
- bbio->stripes[i].length = map->stripe_len *
- (stripes - stripe_nr + 1);
-
- if (i == 0) {
- bbio->stripes[i].length -=
- stripe_offset;
- stripe_offset = 0;
- }
- if (stripe_index == last_stripe)
- bbio->stripes[i].length -=
- stripe_end_offset;
- } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
- u64 stripes;
- int j;
- int factor = map->num_stripes /
- map->sub_stripes;
- u32 last_stripe = 0;
-
- div_u64_rem(stripe_nr_end - 1,
- factor, &last_stripe);
- last_stripe *= map->sub_stripes;
-
- for (j = 0; j < factor; j++) {
- u32 test;
-
- div_u64_rem(stripe_nr_end - 1 - j,
- factor, &test);
-
- if (test ==
- stripe_index / map->sub_stripes)
- break;
- }
- stripes = stripe_nr_end - 1 - j;
- do_div(stripes, factor);
- bbio->stripes[i].length = map->stripe_len *
- (stripes - stripe_nr + 1);
-
- if (i < map->sub_stripes) {
+ if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
+ BTRFS_BLOCK_GROUP_RAID10)) {
+ bbio->stripes[i].length = stripes_per_dev *
+ map->stripe_len;
+ if (i / sub_stripes < remaining_stripes)
+ bbio->stripes[i].length +=
+ map->stripe_len;
+ if (i < sub_stripes)
bbio->stripes[i].length -=
stripe_offset;
- if (i == map->sub_stripes - 1)
- stripe_offset = 0;
- }
- if (stripe_index >= last_stripe &&
- stripe_index <= (last_stripe +
- map->sub_stripes - 1)) {
+ if ((i / sub_stripes + 1) %
+ sub_stripes == remaining_stripes)
bbio->stripes[i].length -=
stripe_end_offset;
- }
+ if (i == sub_stripes - 1)
+ stripe_offset = 0;
} else
bbio->stripes[i].length = *length;
@@ -3155,15 +3891,22 @@ again:
stripe_index++;
}
}
- if (bbio_ret) {
- *bbio_ret = bbio;
- bbio->num_stripes = num_stripes;
- bbio->max_errors = max_errors;
- bbio->mirror_num = mirror_num;
+
+ if (rw & REQ_WRITE) {
+ if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID10 |
+ BTRFS_BLOCK_GROUP_DUP)) {
+ max_errors = 1;
+ }
}
+
+ *bbio_ret = bbio;
+ bbio->num_stripes = num_stripes;
+ bbio->max_errors = max_errors;
+ bbio->mirror_num = mirror_num;
out:
free_extent_map(em);
- return 0;
+ return ret;
}
int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
@@ -3201,7 +3944,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
do_div(length, map->num_stripes);
buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
- BUG_ON(!buf);
+ BUG_ON(!buf); /* -ENOMEM */
for (i = 0; i < map->num_stripes; i++) {
if (devid && map->stripes[i].dev->devid != devid)
@@ -3294,7 +4037,7 @@ struct async_sched {
* This will add one bio to the pending list for a device and make sure
* the work struct is scheduled.
*/
-static noinline int schedule_bio(struct btrfs_root *root,
+static noinline void schedule_bio(struct btrfs_root *root,
struct btrfs_device *device,
int rw, struct bio *bio)
{
@@ -3304,9 +4047,9 @@ static noinline int schedule_bio(struct btrfs_root *root,
/* don't bother with additional async steps for reads, right now */
if (!(rw & REQ_WRITE)) {
bio_get(bio);
- submit_bio(rw, bio);
+ btrfsic_submit_bio(rw, bio);
bio_put(bio);
- return 0;
+ return;
}
/*
@@ -3340,7 +4083,6 @@ static noinline int schedule_bio(struct btrfs_root *root,
if (should_queue)
btrfs_queue_worker(&root->fs_info->submit_workers,
&device->work);
- return 0;
}
int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
@@ -3363,7 +4105,8 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
ret = btrfs_map_block(map_tree, rw, logical, &map_length, &bbio,
mirror_num);
- BUG_ON(ret);
+ if (ret) /* -ENOMEM */
+ return ret;
total_devs = bbio->num_stripes;
if (map_length < length) {
@@ -3382,7 +4125,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
while (dev_nr < total_devs) {
if (dev_nr < total_devs - 1) {
bio = bio_clone(first_bio, GFP_NOFS);
- BUG_ON(!bio);
+ BUG_ON(!bio); /* -ENOMEM */
} else {
bio = first_bio;
}
@@ -3399,7 +4142,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
if (async_submit)
schedule_bio(root, dev, rw, bio);
else
- submit_bio(rw, bio);
+ btrfsic_submit_bio(rw, bio);
} else {
bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
bio->bi_sector = logical >> 9;
@@ -3536,13 +4279,13 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
write_lock(&map_tree->map_tree.lock);
ret = add_extent_mapping(&map_tree->map_tree, em);
write_unlock(&map_tree->map_tree.lock);
- BUG_ON(ret);
+ BUG_ON(ret); /* Tree corruption */
free_extent_map(em);
return 0;
}
-static int fill_device_from_item(struct extent_buffer *leaf,
+static void fill_device_from_item(struct extent_buffer *leaf,
struct btrfs_dev_item *dev_item,
struct btrfs_device *device)
{
@@ -3559,8 +4302,6 @@ static int fill_device_from_item(struct extent_buffer *leaf,
ptr = (unsigned long)btrfs_device_uuid(dev_item);
read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
-
- return 0;
}
static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
@@ -3568,7 +4309,7 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
struct btrfs_fs_devices *fs_devices;
int ret;
- mutex_lock(&uuid_mutex);
+ BUG_ON(!mutex_is_locked(&uuid_mutex));
fs_devices = root->fs_info->fs_devices->seed;
while (fs_devices) {
@@ -3606,7 +4347,6 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
fs_devices->seed = root->fs_info->fs_devices->seed;
root->fs_info->fs_devices->seed = fs_devices;
out:
- mutex_unlock(&uuid_mutex);
return ret;
}
@@ -3699,6 +4439,20 @@ int btrfs_read_sys_array(struct btrfs_root *root)
return -ENOMEM;
btrfs_set_buffer_uptodate(sb);
btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0);
+ /*
+ * The sb extent buffer is artifical and just used to read the system array.
+ * btrfs_set_buffer_uptodate() call does not properly mark all it's
+ * pages up-to-date when the page is larger: extent does not cover the
+ * whole page and consequently check_page_uptodate does not find all
+ * the page's extents up-to-date (the hole beyond sb),
+ * write_extent_buffer then triggers a WARN_ON.
+ *
+ * Regular short extents go through mark_extent_buffer_dirty/writeback cycle,
+ * but sb spans only this function. Add an explicit SetPageUptodate call
+ * to silence the warning eg. on PowerPC 64.
+ */
+ if (PAGE_CACHE_SIZE > BTRFS_SUPER_INFO_SIZE)
+ SetPageUptodate(sb->pages[0]);
write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
array_size = btrfs_super_sys_array_size(super_copy);
@@ -3749,6 +4503,9 @@ int btrfs_read_chunk_tree(struct btrfs_root *root)
if (!path)
return -ENOMEM;
+ mutex_lock(&uuid_mutex);
+ lock_chunks(root);
+
/* first we search for all of the device items, and then we
* read in all of the chunk items. This way we can create chunk
* mappings that reference all of the devices that are afound
@@ -3799,6 +4556,9 @@ again:
}
ret = 0;
error:
+ unlock_chunks(root);
+ mutex_unlock(&uuid_mutex);
+
btrfs_free_path(path);
return ret;
}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 78f2d4d4f37..bb6b03f97aa 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -186,6 +186,51 @@ struct map_lookup {
#define map_lookup_size(n) (sizeof(struct map_lookup) + \
(sizeof(struct btrfs_bio_stripe) * (n)))
+/*
+ * Restriper's general type filter
+ */
+#define BTRFS_BALANCE_DATA (1ULL << 0)
+#define BTRFS_BALANCE_SYSTEM (1ULL << 1)
+#define BTRFS_BALANCE_METADATA (1ULL << 2)
+
+#define BTRFS_BALANCE_TYPE_MASK (BTRFS_BALANCE_DATA | \
+ BTRFS_BALANCE_SYSTEM | \
+ BTRFS_BALANCE_METADATA)
+
+#define BTRFS_BALANCE_FORCE (1ULL << 3)
+#define BTRFS_BALANCE_RESUME (1ULL << 4)
+
+/*
+ * Balance filters
+ */
+#define BTRFS_BALANCE_ARGS_PROFILES (1ULL << 0)
+#define BTRFS_BALANCE_ARGS_USAGE (1ULL << 1)
+#define BTRFS_BALANCE_ARGS_DEVID (1ULL << 2)
+#define BTRFS_BALANCE_ARGS_DRANGE (1ULL << 3)
+#define BTRFS_BALANCE_ARGS_VRANGE (1ULL << 4)
+
+/*
+ * Profile changing flags. When SOFT is set we won't relocate chunk if
+ * it already has the target profile (even though it may be
+ * half-filled).
+ */
+#define BTRFS_BALANCE_ARGS_CONVERT (1ULL << 8)
+#define BTRFS_BALANCE_ARGS_SOFT (1ULL << 9)
+
+struct btrfs_balance_args;
+struct btrfs_balance_progress;
+struct btrfs_balance_control {
+ struct btrfs_fs_info *fs_info;
+
+ struct btrfs_balance_args data;
+ struct btrfs_balance_args meta;
+ struct btrfs_balance_args sys;
+
+ u64 flags;
+
+ struct btrfs_balance_progress stat;
+};
+
int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
u64 end, u64 *length);
@@ -215,12 +260,12 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
struct btrfs_fs_devices **fs_devices_ret);
int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
-int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices);
+void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices);
int btrfs_add_device(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_device *device);
int btrfs_rm_device(struct btrfs_root *root, char *device_path);
-int btrfs_cleanup_fs_uuids(void);
+void btrfs_cleanup_fs_uuids(void);
int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len);
int btrfs_grow_device(struct btrfs_trans_handle *trans,
struct btrfs_device *device, u64 new_size);
@@ -228,9 +273,12 @@ struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
u8 *uuid, u8 *fsid);
int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
int btrfs_init_new_device(struct btrfs_root *root, char *path);
-int btrfs_balance(struct btrfs_root *dev_root);
+int btrfs_balance(struct btrfs_balance_control *bctl,
+ struct btrfs_ioctl_balance_args *bargs);
+int btrfs_recover_balance(struct btrfs_root *tree_root);
+int btrfs_pause_balance(struct btrfs_fs_info *fs_info);
+int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
-int find_free_dev_extent(struct btrfs_trans_handle *trans,
- struct btrfs_device *device, u64 num_bytes,
+int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
u64 *start, u64 *max_avail);
#endif
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 3848b04e310..e7a5659087e 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -200,7 +200,7 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
ret = btrfs_update_inode(trans, root, inode);
BUG_ON(ret);
out:
- btrfs_end_transaction_throttle(trans, root);
+ btrfs_end_transaction(trans, root);
return ret;
}
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index faccd47c6c4..92c20654cc5 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -370,9 +370,9 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
PAGE_CACHE_SIZE - buf_offset);
bytes = min(bytes, bytes_left);
- kaddr = kmap_atomic(dest_page, KM_USER0);
+ kaddr = kmap_atomic(dest_page);
memcpy(kaddr + pg_offset, workspace->buf + buf_offset, bytes);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
pg_offset += bytes;
bytes_left -= bytes;
diff --git a/fs/buffer.c b/fs/buffer.c
index 1a30db77af3..36d66653b93 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -29,7 +29,7 @@
#include <linux/file.h>
#include <linux/quotaops.h>
#include <linux/highmem.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/writeback.h>
#include <linux/hash.h>
#include <linux/suspend.h>
@@ -1384,10 +1384,23 @@ static void invalidate_bh_lru(void *arg)
}
put_cpu_var(bh_lrus);
}
+
+static bool has_bh_in_lru(int cpu, void *dummy)
+{
+ struct bh_lru *b = per_cpu_ptr(&bh_lrus, cpu);
+ int i;
+ for (i = 0; i < BH_LRU_SIZE; i++) {
+ if (b->bhs[i])
+ return 1;
+ }
+
+ return 0;
+}
+
void invalidate_bh_lrus(void)
{
- on_each_cpu(invalidate_bh_lru, NULL, 1);
+ on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1, GFP_KERNEL);
}
EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index a0358c2189c..7f0771d3894 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -646,7 +646,8 @@ lookup_again:
* (this is used to keep track of culling, and atimes are only
* updated by read, write and readdir but not lookup or
* open) */
- touch_atime(cache->mnt, next);
+ path.dentry = next;
+ touch_atime(&path);
}
/* open a file interface onto a data file */
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index b60fc8bfb3e..620daad201d 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -641,10 +641,10 @@ static int __cap_is_valid(struct ceph_cap *cap)
unsigned long ttl;
u32 gen;
- spin_lock(&cap->session->s_cap_lock);
+ spin_lock(&cap->session->s_gen_ttl_lock);
gen = cap->session->s_cap_gen;
ttl = cap->session->s_cap_ttl;
- spin_unlock(&cap->session->s_cap_lock);
+ spin_unlock(&cap->session->s_gen_ttl_lock);
if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) {
dout("__cap_is_valid %p cap %p issued %s "
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 74fd74719dc..3e8094be460 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -973,12 +973,12 @@ static int dentry_lease_is_valid(struct dentry *dentry)
spin_lock(&dentry->d_lock);
di = ceph_dentry(dentry);
- if (di && di->lease_session) {
+ if (di->lease_session) {
s = di->lease_session;
- spin_lock(&s->s_cap_lock);
+ spin_lock(&s->s_gen_ttl_lock);
gen = s->s_cap_gen;
ttl = s->s_cap_ttl;
- spin_unlock(&s->s_cap_lock);
+ spin_unlock(&s->s_gen_ttl_lock);
if (di->lease_gen == gen &&
time_before(jiffies, dentry->d_time) &&
@@ -1072,13 +1072,11 @@ static void ceph_d_release(struct dentry *dentry)
struct ceph_dentry_info *di = ceph_dentry(dentry);
dout("d_release %p\n", dentry);
- if (di) {
- ceph_dentry_lru_del(dentry);
- if (di->lease_session)
- ceph_put_mds_session(di->lease_session);
- kmem_cache_free(ceph_dentry_cachep, di);
- dentry->d_fsdata = NULL;
- }
+ ceph_dentry_lru_del(dentry);
+ if (di->lease_session)
+ ceph_put_mds_session(di->lease_session);
+ kmem_cache_free(ceph_dentry_cachep, di);
+ dentry->d_fsdata = NULL;
}
static int ceph_snapdir_d_revalidate(struct dentry *dentry,
@@ -1096,17 +1094,36 @@ static int ceph_snapdir_d_revalidate(struct dentry *dentry,
*/
void ceph_dir_set_complete(struct inode *inode)
{
- /* not yet implemented */
+ struct dentry *dentry = d_find_any_alias(inode);
+
+ if (dentry && ceph_dentry(dentry) &&
+ ceph_test_mount_opt(ceph_sb_to_client(dentry->d_sb), DCACHE)) {
+ dout(" marking %p (%p) complete\n", inode, dentry);
+ set_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags);
+ }
+ dput(dentry);
}
void ceph_dir_clear_complete(struct inode *inode)
{
- /* not yet implemented */
+ struct dentry *dentry = d_find_any_alias(inode);
+
+ if (dentry && ceph_dentry(dentry)) {
+ dout(" marking %p (%p) complete\n", inode, dentry);
+ set_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags);
+ }
+ dput(dentry);
}
bool ceph_dir_test_complete(struct inode *inode)
{
- /* not yet implemented */
+ struct dentry *dentry = d_find_any_alias(inode);
+
+ if (dentry && ceph_dentry(dentry)) {
+ dout(" marking %p (%p) NOT complete\n", inode, dentry);
+ clear_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags);
+ }
+ dput(dentry);
return false;
}
@@ -1220,6 +1237,7 @@ static int ceph_dir_fsync(struct file *file, loff_t start, loff_t end,
do {
ceph_mdsc_get_request(req);
spin_unlock(&ci->i_unsafe_lock);
+
dout("dir_fsync %p wait on tid %llu (until %llu)\n",
inode, req->r_tid, last_tid);
if (req->r_timeout) {
@@ -1232,9 +1250,9 @@ static int ceph_dir_fsync(struct file *file, loff_t start, loff_t end,
} else {
wait_for_completion(&req->r_safe_completion);
}
- spin_lock(&ci->i_unsafe_lock);
ceph_mdsc_put_request(req);
+ spin_lock(&ci->i_unsafe_lock);
if (ret || list_empty(head))
break;
req = list_entry(head->next,
@@ -1259,13 +1277,11 @@ void ceph_dentry_lru_add(struct dentry *dn)
dout("dentry_lru_add %p %p '%.*s'\n", di, dn,
dn->d_name.len, dn->d_name.name);
- if (di) {
- mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
- spin_lock(&mdsc->dentry_lru_lock);
- list_add_tail(&di->lru, &mdsc->dentry_lru);
- mdsc->num_dentry++;
- spin_unlock(&mdsc->dentry_lru_lock);
- }
+ mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
+ spin_lock(&mdsc->dentry_lru_lock);
+ list_add_tail(&di->lru, &mdsc->dentry_lru);
+ mdsc->num_dentry++;
+ spin_unlock(&mdsc->dentry_lru_lock);
}
void ceph_dentry_lru_touch(struct dentry *dn)
@@ -1275,12 +1291,10 @@ void ceph_dentry_lru_touch(struct dentry *dn)
dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn,
dn->d_name.len, dn->d_name.name, di->offset);
- if (di) {
- mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
- spin_lock(&mdsc->dentry_lru_lock);
- list_move_tail(&di->lru, &mdsc->dentry_lru);
- spin_unlock(&mdsc->dentry_lru_lock);
- }
+ mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
+ spin_lock(&mdsc->dentry_lru_lock);
+ list_move_tail(&di->lru, &mdsc->dentry_lru);
+ spin_unlock(&mdsc->dentry_lru_lock);
}
void ceph_dentry_lru_del(struct dentry *dn)
@@ -1290,13 +1304,11 @@ void ceph_dentry_lru_del(struct dentry *dn)
dout("dentry_lru_del %p %p '%.*s'\n", di, dn,
dn->d_name.len, dn->d_name.name);
- if (di) {
- mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
- spin_lock(&mdsc->dentry_lru_lock);
- list_del_init(&di->lru);
- mdsc->num_dentry--;
- spin_unlock(&mdsc->dentry_lru_lock);
- }
+ mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
+ spin_lock(&mdsc->dentry_lru_lock);
+ list_del_init(&di->lru);
+ mdsc->num_dentry--;
+ spin_unlock(&mdsc->dentry_lru_lock);
}
/*
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 9fbcdecaacc..fbb2a643ef1 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -56,9 +56,7 @@ static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
return -EINVAL;
spin_lock(&dentry->d_lock);
- parent = dget(dentry->d_parent);
- spin_unlock(&dentry->d_lock);
-
+ parent = dentry->d_parent;
if (*max_len >= connected_handle_length) {
dout("encode_fh %p connectable\n", dentry);
cfh->ino = ceph_ino(dentry->d_inode);
@@ -81,7 +79,7 @@ static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
*max_len = handle_length;
type = 255;
}
- dput(parent);
+ spin_unlock(&dentry->d_lock);
return type;
}
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 25283e7a37f..9fff9f3b17e 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -677,18 +677,19 @@ static int fill_inode(struct inode *inode,
case S_IFLNK:
inode->i_op = &ceph_symlink_iops;
if (!ci->i_symlink) {
- int symlen = iinfo->symlink_len;
+ u32 symlen = iinfo->symlink_len;
char *sym;
- BUG_ON(symlen != inode->i_size);
spin_unlock(&ci->i_ceph_lock);
+ err = -EINVAL;
+ if (WARN_ON(symlen != inode->i_size))
+ goto out;
+
err = -ENOMEM;
- sym = kmalloc(symlen+1, GFP_NOFS);
+ sym = kstrndup(iinfo->symlink, symlen, GFP_NOFS);
if (!sym)
goto out;
- memcpy(sym, iinfo->symlink, symlen);
- sym[symlen] = 0;
spin_lock(&ci->i_ceph_lock);
if (!ci->i_symlink)
@@ -850,11 +851,12 @@ static void ceph_set_dentry_offset(struct dentry *dn)
{
struct dentry *dir = dn->d_parent;
struct inode *inode = dir->d_inode;
- struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_inode_info *ci;
struct ceph_dentry_info *di;
BUG_ON(!inode);
+ ci = ceph_inode(inode);
di = ceph_dentry(dn);
spin_lock(&ci->i_ceph_lock);
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 6203d805eb4..89971e137aa 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -262,6 +262,7 @@ static int parse_reply_info(struct ceph_msg *msg,
/* trace */
ceph_decode_32_safe(&p, end, len, bad);
if (len > 0) {
+ ceph_decode_need(&p, end, len, bad);
err = parse_reply_info_trace(&p, p+len, info, features);
if (err < 0)
goto out_bad;
@@ -270,6 +271,7 @@ static int parse_reply_info(struct ceph_msg *msg,
/* extra */
ceph_decode_32_safe(&p, end, len, bad);
if (len > 0) {
+ ceph_decode_need(&p, end, len, bad);
err = parse_reply_info_extra(&p, p+len, info, features);
if (err < 0)
goto out_bad;
@@ -398,9 +400,11 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS;
s->s_con.peer_name.num = cpu_to_le64(mds);
- spin_lock_init(&s->s_cap_lock);
+ spin_lock_init(&s->s_gen_ttl_lock);
s->s_cap_gen = 0;
- s->s_cap_ttl = 0;
+ s->s_cap_ttl = jiffies - 1;
+
+ spin_lock_init(&s->s_cap_lock);
s->s_renew_requested = 0;
s->s_renew_seq = 0;
INIT_LIST_HEAD(&s->s_caps);
@@ -1079,8 +1083,7 @@ static void renewed_caps(struct ceph_mds_client *mdsc,
int wake = 0;
spin_lock(&session->s_cap_lock);
- was_stale = is_renew && (session->s_cap_ttl == 0 ||
- time_after_eq(jiffies, session->s_cap_ttl));
+ was_stale = is_renew && time_after_eq(jiffies, session->s_cap_ttl);
session->s_cap_ttl = session->s_renew_requested +
mdsc->mdsmap->m_session_timeout*HZ;
@@ -2326,10 +2329,10 @@ static void handle_session(struct ceph_mds_session *session,
case CEPH_SESSION_STALE:
pr_info("mds%d caps went stale, renewing\n",
session->s_mds);
- spin_lock(&session->s_cap_lock);
+ spin_lock(&session->s_gen_ttl_lock);
session->s_cap_gen++;
- session->s_cap_ttl = 0;
- spin_unlock(&session->s_cap_lock);
+ session->s_cap_ttl = jiffies - 1;
+ spin_unlock(&session->s_gen_ttl_lock);
send_renew_caps(mdsc, session);
break;
@@ -2772,7 +2775,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
di = ceph_dentry(dentry);
switch (h->action) {
case CEPH_MDS_LEASE_REVOKE:
- if (di && di->lease_session == session) {
+ if (di->lease_session == session) {
if (ceph_seq_cmp(di->lease_seq, seq) > 0)
h->seq = cpu_to_le32(di->lease_seq);
__ceph_mdsc_drop_dentry_lease(dentry);
@@ -2781,7 +2784,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
break;
case CEPH_MDS_LEASE_RENEW:
- if (di && di->lease_session == session &&
+ if (di->lease_session == session &&
di->lease_gen == session->s_cap_gen &&
di->lease_renew_from &&
di->lease_renew_after == 0) {
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index a50ca0e3947..8c7c04ebb59 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -117,10 +117,13 @@ struct ceph_mds_session {
void *s_authorizer_buf, *s_authorizer_reply_buf;
size_t s_authorizer_buf_len, s_authorizer_reply_buf_len;
- /* protected by s_cap_lock */
- spinlock_t s_cap_lock;
+ /* protected by s_gen_ttl_lock */
+ spinlock_t s_gen_ttl_lock;
u32 s_cap_gen; /* inc each time we get mds stale msg */
unsigned long s_cap_ttl; /* when session caps expire */
+
+ /* protected by s_cap_lock */
+ spinlock_t s_cap_lock;
struct list_head s_caps; /* all caps issued by this session */
int s_nr_caps, s_trim_caps;
int s_num_cap_releases;
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index a559c80f127..f04c0961f99 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -331,7 +331,7 @@ static int build_snap_context(struct ceph_snap_realm *realm)
/* alloc new snap context */
err = -ENOMEM;
- if (num > ULONG_MAX / sizeof(u64) - sizeof(*snapc))
+ if (num > (ULONG_MAX - sizeof(*snapc)) / sizeof(u64))
goto fail;
snapc = kzalloc(sizeof(*snapc) + num*sizeof(u64), GFP_NOFS);
if (!snapc)
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 48f61a12af6..1e67dd7305a 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -130,8 +130,12 @@ enum {
Opt_nodirstat,
Opt_rbytes,
Opt_norbytes,
+ Opt_asyncreaddir,
Opt_noasyncreaddir,
+ Opt_dcache,
+ Opt_nodcache,
Opt_ino32,
+ Opt_noino32,
};
static match_table_t fsopt_tokens = {
@@ -151,8 +155,12 @@ static match_table_t fsopt_tokens = {
{Opt_nodirstat, "nodirstat"},
{Opt_rbytes, "rbytes"},
{Opt_norbytes, "norbytes"},
+ {Opt_asyncreaddir, "asyncreaddir"},
{Opt_noasyncreaddir, "noasyncreaddir"},
+ {Opt_dcache, "dcache"},
+ {Opt_nodcache, "nodcache"},
{Opt_ino32, "ino32"},
+ {Opt_noino32, "noino32"},
{-1, NULL}
};
@@ -228,12 +236,24 @@ static int parse_fsopt_token(char *c, void *private)
case Opt_norbytes:
fsopt->flags &= ~CEPH_MOUNT_OPT_RBYTES;
break;
+ case Opt_asyncreaddir:
+ fsopt->flags &= ~CEPH_MOUNT_OPT_NOASYNCREADDIR;
+ break;
case Opt_noasyncreaddir:
fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR;
break;
+ case Opt_dcache:
+ fsopt->flags |= CEPH_MOUNT_OPT_DCACHE;
+ break;
+ case Opt_nodcache:
+ fsopt->flags &= ~CEPH_MOUNT_OPT_DCACHE;
+ break;
case Opt_ino32:
fsopt->flags |= CEPH_MOUNT_OPT_INO32;
break;
+ case Opt_noino32:
+ fsopt->flags &= ~CEPH_MOUNT_OPT_INO32;
+ break;
default:
BUG_ON(token);
}
@@ -324,10 +344,12 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
*path += 2;
dout("server path '%s'\n", *path);
- err = ceph_parse_options(popt, options, dev_name, dev_name_end,
+ *popt = ceph_parse_options(options, dev_name, dev_name_end,
parse_fsopt_token, (void *)fsopt);
- if (err)
+ if (IS_ERR(*popt)) {
+ err = PTR_ERR(*popt);
goto out;
+ }
/* success */
*pfsopt = fsopt;
@@ -377,6 +399,10 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
seq_puts(m, ",norbytes");
if (fsopt->flags & CEPH_MOUNT_OPT_NOASYNCREADDIR)
seq_puts(m, ",noasyncreaddir");
+ if (fsopt->flags & CEPH_MOUNT_OPT_DCACHE)
+ seq_puts(m, ",dcache");
+ else
+ seq_puts(m, ",nodcache");
if (fsopt->wsize)
seq_printf(m, ",wsize=%d", fsopt->wsize);
@@ -641,16 +667,15 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
dout("open_root_inode success\n");
if (ceph_ino(inode) == CEPH_INO_ROOT &&
fsc->sb->s_root == NULL) {
- root = d_alloc_root(inode);
+ root = d_make_root(inode);
if (!root) {
- iput(inode);
root = ERR_PTR(-ENOMEM);
goto out;
}
- ceph_init_dentry(root);
} else {
root = d_obtain_alias(inode);
}
+ ceph_init_dentry(root);
dout("open_root_inode success, root dentry is %p\n", root);
} else {
root = ERR_PTR(err);
@@ -913,6 +938,7 @@ static int __init init_ceph(void)
if (ret)
goto out;
+ ceph_xattr_init();
ret = register_filesystem(&ceph_fs_type);
if (ret)
goto out_icache;
@@ -922,6 +948,7 @@ static int __init init_ceph(void)
return 0;
out_icache:
+ ceph_xattr_exit();
destroy_caches();
out:
return ret;
@@ -931,6 +958,7 @@ static void __exit exit_ceph(void)
{
dout("exit_ceph\n");
unregister_filesystem(&ceph_fs_type);
+ ceph_xattr_exit();
destroy_caches();
}
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index cb3652b3727..fc35036d258 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -28,6 +28,7 @@
#define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */
#define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */
#define CEPH_MOUNT_OPT_INO32 (1<<8) /* 32 bit inos */
+#define CEPH_MOUNT_OPT_DCACHE (1<<9) /* use dcache for readdir etc */
#define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES)
@@ -366,7 +367,7 @@ static inline u32 ceph_ino_to_ino32(__u64 vino)
u32 ino = vino & 0xffffffff;
ino ^= vino >> 32;
if (!ino)
- ino = 1;
+ ino = 2;
return ino;
}
@@ -732,6 +733,8 @@ extern ssize_t ceph_listxattr(struct dentry *, char *, size_t);
extern int ceph_removexattr(struct dentry *, const char *);
extern void __ceph_build_xattrs_blob(struct ceph_inode_info *ci);
extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci);
+extern void __init ceph_xattr_init(void);
+extern void ceph_xattr_exit(void);
/* caps.c */
extern const char *ceph_cap_string(int c);
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index a5e36e4488a..35b86331d8a 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -8,9 +8,12 @@
#include <linux/xattr.h>
#include <linux/slab.h>
+#define XATTR_CEPH_PREFIX "ceph."
+#define XATTR_CEPH_PREFIX_LEN (sizeof (XATTR_CEPH_PREFIX) - 1)
+
static bool ceph_is_valid_xattr(const char *name)
{
- return !strncmp(name, "ceph.", 5) ||
+ return !strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) ||
!strncmp(name, XATTR_SECURITY_PREFIX,
XATTR_SECURITY_PREFIX_LEN) ||
!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
@@ -21,79 +24,91 @@ static bool ceph_is_valid_xattr(const char *name)
* These define virtual xattrs exposing the recursive directory
* statistics and layout metadata.
*/
-struct ceph_vxattr_cb {
- bool readonly;
+struct ceph_vxattr {
char *name;
+ size_t name_size; /* strlen(name) + 1 (for '\0') */
size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val,
size_t size);
+ bool readonly;
};
/* directories */
-static size_t ceph_vxattrcb_entries(struct ceph_inode_info *ci, char *val,
+static size_t ceph_vxattrcb_dir_entries(struct ceph_inode_info *ci, char *val,
size_t size)
{
return snprintf(val, size, "%lld", ci->i_files + ci->i_subdirs);
}
-static size_t ceph_vxattrcb_files(struct ceph_inode_info *ci, char *val,
+static size_t ceph_vxattrcb_dir_files(struct ceph_inode_info *ci, char *val,
size_t size)
{
return snprintf(val, size, "%lld", ci->i_files);
}
-static size_t ceph_vxattrcb_subdirs(struct ceph_inode_info *ci, char *val,
+static size_t ceph_vxattrcb_dir_subdirs(struct ceph_inode_info *ci, char *val,
size_t size)
{
return snprintf(val, size, "%lld", ci->i_subdirs);
}
-static size_t ceph_vxattrcb_rentries(struct ceph_inode_info *ci, char *val,
+static size_t ceph_vxattrcb_dir_rentries(struct ceph_inode_info *ci, char *val,
size_t size)
{
return snprintf(val, size, "%lld", ci->i_rfiles + ci->i_rsubdirs);
}
-static size_t ceph_vxattrcb_rfiles(struct ceph_inode_info *ci, char *val,
+static size_t ceph_vxattrcb_dir_rfiles(struct ceph_inode_info *ci, char *val,
size_t size)
{
return snprintf(val, size, "%lld", ci->i_rfiles);
}
-static size_t ceph_vxattrcb_rsubdirs(struct ceph_inode_info *ci, char *val,
+static size_t ceph_vxattrcb_dir_rsubdirs(struct ceph_inode_info *ci, char *val,
size_t size)
{
return snprintf(val, size, "%lld", ci->i_rsubdirs);
}
-static size_t ceph_vxattrcb_rbytes(struct ceph_inode_info *ci, char *val,
+static size_t ceph_vxattrcb_dir_rbytes(struct ceph_inode_info *ci, char *val,
size_t size)
{
return snprintf(val, size, "%lld", ci->i_rbytes);
}
-static size_t ceph_vxattrcb_rctime(struct ceph_inode_info *ci, char *val,
+static size_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val,
size_t size)
{
- return snprintf(val, size, "%ld.%ld", (long)ci->i_rctime.tv_sec,
+ return snprintf(val, size, "%ld.09%ld", (long)ci->i_rctime.tv_sec,
(long)ci->i_rctime.tv_nsec);
}
-static struct ceph_vxattr_cb ceph_dir_vxattrs[] = {
- { true, "ceph.dir.entries", ceph_vxattrcb_entries},
- { true, "ceph.dir.files", ceph_vxattrcb_files},
- { true, "ceph.dir.subdirs", ceph_vxattrcb_subdirs},
- { true, "ceph.dir.rentries", ceph_vxattrcb_rentries},
- { true, "ceph.dir.rfiles", ceph_vxattrcb_rfiles},
- { true, "ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs},
- { true, "ceph.dir.rbytes", ceph_vxattrcb_rbytes},
- { true, "ceph.dir.rctime", ceph_vxattrcb_rctime},
- { true, NULL, NULL }
+#define CEPH_XATTR_NAME(_type, _name) XATTR_CEPH_PREFIX #_type "." #_name
+
+#define XATTR_NAME_CEPH(_type, _name) \
+ { \
+ .name = CEPH_XATTR_NAME(_type, _name), \
+ .name_size = sizeof (CEPH_XATTR_NAME(_type, _name)), \
+ .getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \
+ .readonly = true, \
+ }
+
+static struct ceph_vxattr ceph_dir_vxattrs[] = {
+ XATTR_NAME_CEPH(dir, entries),
+ XATTR_NAME_CEPH(dir, files),
+ XATTR_NAME_CEPH(dir, subdirs),
+ XATTR_NAME_CEPH(dir, rentries),
+ XATTR_NAME_CEPH(dir, rfiles),
+ XATTR_NAME_CEPH(dir, rsubdirs),
+ XATTR_NAME_CEPH(dir, rbytes),
+ XATTR_NAME_CEPH(dir, rctime),
+ { 0 } /* Required table terminator */
};
+static size_t ceph_dir_vxattrs_name_size; /* total size of all names */
/* files */
-static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
+static size_t ceph_vxattrcb_file_layout(struct ceph_inode_info *ci, char *val,
size_t size)
{
int ret;
@@ -103,19 +118,32 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
(unsigned long long)ceph_file_layout_su(ci->i_layout),
(unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
(unsigned long long)ceph_file_layout_object_size(ci->i_layout));
- if (ceph_file_layout_pg_preferred(ci->i_layout))
- ret += snprintf(val + ret, size, "preferred_osd=%lld\n",
+
+ if (ceph_file_layout_pg_preferred(ci->i_layout) >= 0) {
+ val += ret;
+ size -= ret;
+ ret += snprintf(val, size, "preferred_osd=%lld\n",
(unsigned long long)ceph_file_layout_pg_preferred(
ci->i_layout));
+ }
+
return ret;
}
-static struct ceph_vxattr_cb ceph_file_vxattrs[] = {
- { true, "ceph.layout", ceph_vxattrcb_layout},
- { NULL, NULL }
+static struct ceph_vxattr ceph_file_vxattrs[] = {
+ XATTR_NAME_CEPH(file, layout),
+ /* The following extended attribute name is deprecated */
+ {
+ .name = XATTR_CEPH_PREFIX "layout",
+ .name_size = sizeof (XATTR_CEPH_PREFIX "layout"),
+ .getxattr_cb = ceph_vxattrcb_file_layout,
+ .readonly = true,
+ },
+ { 0 } /* Required table terminator */
};
+static size_t ceph_file_vxattrs_name_size; /* total size of all names */
-static struct ceph_vxattr_cb *ceph_inode_vxattrs(struct inode *inode)
+static struct ceph_vxattr *ceph_inode_vxattrs(struct inode *inode)
{
if (S_ISDIR(inode->i_mode))
return ceph_dir_vxattrs;
@@ -124,14 +152,59 @@ static struct ceph_vxattr_cb *ceph_inode_vxattrs(struct inode *inode)
return NULL;
}
-static struct ceph_vxattr_cb *ceph_match_vxattr(struct ceph_vxattr_cb *vxattr,
+static size_t ceph_vxattrs_name_size(struct ceph_vxattr *vxattrs)
+{
+ if (vxattrs == ceph_dir_vxattrs)
+ return ceph_dir_vxattrs_name_size;
+ if (vxattrs == ceph_file_vxattrs)
+ return ceph_file_vxattrs_name_size;
+ BUG();
+
+ return 0;
+}
+
+/*
+ * Compute the aggregate size (including terminating '\0') of all
+ * virtual extended attribute names in the given vxattr table.
+ */
+static size_t __init vxattrs_name_size(struct ceph_vxattr *vxattrs)
+{
+ struct ceph_vxattr *vxattr;
+ size_t size = 0;
+
+ for (vxattr = vxattrs; vxattr->name; vxattr++)
+ size += vxattr->name_size;
+
+ return size;
+}
+
+/* Routines called at initialization and exit time */
+
+void __init ceph_xattr_init(void)
+{
+ ceph_dir_vxattrs_name_size = vxattrs_name_size(ceph_dir_vxattrs);
+ ceph_file_vxattrs_name_size = vxattrs_name_size(ceph_file_vxattrs);
+}
+
+void ceph_xattr_exit(void)
+{
+ ceph_dir_vxattrs_name_size = 0;
+ ceph_file_vxattrs_name_size = 0;
+}
+
+static struct ceph_vxattr *ceph_match_vxattr(struct inode *inode,
const char *name)
{
- do {
- if (strcmp(vxattr->name, name) == 0)
- return vxattr;
- vxattr++;
- } while (vxattr->name);
+ struct ceph_vxattr *vxattr = ceph_inode_vxattrs(inode);
+
+ if (vxattr) {
+ while (vxattr->name) {
+ if (!strcmp(vxattr->name, name))
+ return vxattr;
+ vxattr++;
+ }
+ }
+
return NULL;
}
@@ -500,17 +573,15 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
{
struct inode *inode = dentry->d_inode;
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
int err;
struct ceph_inode_xattr *xattr;
- struct ceph_vxattr_cb *vxattr = NULL;
+ struct ceph_vxattr *vxattr = NULL;
if (!ceph_is_valid_xattr(name))
return -ENODATA;
/* let's see if a virtual xattr was requested */
- if (vxattrs)
- vxattr = ceph_match_vxattr(vxattrs, name);
+ vxattr = ceph_match_vxattr(inode, name);
spin_lock(&ci->i_ceph_lock);
dout("getxattr %p ver=%lld index_ver=%lld\n", inode,
@@ -566,7 +637,7 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
{
struct inode *inode = dentry->d_inode;
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
+ struct ceph_vxattr *vxattrs = ceph_inode_vxattrs(inode);
u32 vir_namelen = 0;
u32 namelen;
int err;
@@ -594,11 +665,12 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
goto out;
list_xattr:
- vir_namelen = 0;
- /* include virtual dir xattrs */
- if (vxattrs)
- for (i = 0; vxattrs[i].name; i++)
- vir_namelen += strlen(vxattrs[i].name) + 1;
+ /*
+ * Start with virtual dir xattr names (if any) (including
+ * terminating '\0' characters for each).
+ */
+ vir_namelen = ceph_vxattrs_name_size(vxattrs);
+
/* adding 1 byte per each variable due to the null termination */
namelen = vir_namelen + ci->i_xattrs.names_size + ci->i_xattrs.count;
err = -ERANGE;
@@ -696,17 +768,17 @@ int ceph_setxattr(struct dentry *dentry, const char *name,
const void *value, size_t size, int flags)
{
struct inode *inode = dentry->d_inode;
+ struct ceph_vxattr *vxattr;
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
+ int issued;
int err;
+ int dirty;
int name_len = strlen(name);
int val_len = size;
char *newname = NULL;
char *newval = NULL;
struct ceph_inode_xattr *xattr = NULL;
- int issued;
int required_blob_size;
- int dirty;
if (ceph_snap(inode) != CEPH_NOSNAP)
return -EROFS;
@@ -714,12 +786,9 @@ int ceph_setxattr(struct dentry *dentry, const char *name,
if (!ceph_is_valid_xattr(name))
return -EOPNOTSUPP;
- if (vxattrs) {
- struct ceph_vxattr_cb *vxattr =
- ceph_match_vxattr(vxattrs, name);
- if (vxattr && vxattr->readonly)
- return -EOPNOTSUPP;
- }
+ vxattr = ceph_match_vxattr(inode, name);
+ if (vxattr && vxattr->readonly)
+ return -EOPNOTSUPP;
/* preallocate memory for xattr name, value, index node */
err = -ENOMEM;
@@ -728,11 +797,9 @@ int ceph_setxattr(struct dentry *dentry, const char *name,
goto out;
if (val_len) {
- newval = kmalloc(val_len + 1, GFP_NOFS);
+ newval = kmemdup(value, val_len, GFP_NOFS);
if (!newval)
goto out;
- memcpy(newval, value, val_len);
- newval[val_len] = '\0';
}
xattr = kmalloc(sizeof(struct ceph_inode_xattr), GFP_NOFS);
@@ -742,6 +809,7 @@ int ceph_setxattr(struct dentry *dentry, const char *name,
spin_lock(&ci->i_ceph_lock);
retry:
issued = __ceph_caps_issued(ci, NULL);
+ dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued));
if (!(issued & CEPH_CAP_XATTR_EXCL))
goto do_sync;
__build_xattrs(inode);
@@ -750,7 +818,7 @@ retry:
if (!ci->i_xattrs.prealloc_blob ||
required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) {
- struct ceph_buffer *blob = NULL;
+ struct ceph_buffer *blob;
spin_unlock(&ci->i_ceph_lock);
dout(" preaallocating new blob size=%d\n", required_blob_size);
@@ -764,12 +832,13 @@ retry:
goto retry;
}
- dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued));
err = __set_xattr(ci, newname, name_len, newval,
val_len, 1, 1, 1, &xattr);
+
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
ci->i_xattrs.dirty = true;
inode->i_ctime = CURRENT_TIME;
+
spin_unlock(&ci->i_ceph_lock);
if (dirty)
__mark_inode_dirty(inode, dirty);
@@ -814,10 +883,11 @@ static int ceph_send_removexattr(struct dentry *dentry, const char *name)
int ceph_removexattr(struct dentry *dentry, const char *name)
{
struct inode *inode = dentry->d_inode;
+ struct ceph_vxattr *vxattr;
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
int issued;
int err;
+ int required_blob_size;
int dirty;
if (ceph_snap(inode) != CEPH_NOSNAP)
@@ -826,26 +896,43 @@ int ceph_removexattr(struct dentry *dentry, const char *name)
if (!ceph_is_valid_xattr(name))
return -EOPNOTSUPP;
- if (vxattrs) {
- struct ceph_vxattr_cb *vxattr =
- ceph_match_vxattr(vxattrs, name);
- if (vxattr && vxattr->readonly)
- return -EOPNOTSUPP;
- }
+ vxattr = ceph_match_vxattr(inode, name);
+ if (vxattr && vxattr->readonly)
+ return -EOPNOTSUPP;
+ err = -ENOMEM;
spin_lock(&ci->i_ceph_lock);
- __build_xattrs(inode);
+retry:
issued = __ceph_caps_issued(ci, NULL);
dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued));
if (!(issued & CEPH_CAP_XATTR_EXCL))
goto do_sync;
+ __build_xattrs(inode);
+
+ required_blob_size = __get_required_blob_size(ci, 0, 0);
+
+ if (!ci->i_xattrs.prealloc_blob ||
+ required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) {
+ struct ceph_buffer *blob;
+
+ spin_unlock(&ci->i_ceph_lock);
+ dout(" preaallocating new blob size=%d\n", required_blob_size);
+ blob = ceph_buffer_new(required_blob_size, GFP_NOFS);
+ if (!blob)
+ goto out;
+ spin_lock(&ci->i_ceph_lock);
+ if (ci->i_xattrs.prealloc_blob)
+ ceph_buffer_put(ci->i_xattrs.prealloc_blob);
+ ci->i_xattrs.prealloc_blob = blob;
+ goto retry;
+ }
err = __remove_xattr_by_name(ceph_inode(inode), name);
+
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
ci->i_xattrs.dirty = true;
inode->i_ctime = CURRENT_TIME;
-
spin_unlock(&ci->i_ceph_lock);
if (dirty)
__mark_inode_dirty(inode, dirty);
@@ -853,6 +940,7 @@ int ceph_removexattr(struct dentry *dentry, const char *name)
do_sync:
spin_unlock(&ci->i_ceph_lock);
err = ceph_send_removexattr(dentry, name);
+out:
return err;
}
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index f66cc162515..2b243af70aa 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -139,8 +139,7 @@ config CIFS_DFS_UPCALL
points. If unsure, say N.
config CIFS_FSCACHE
- bool "Provide CIFS client caching support (EXPERIMENTAL)"
- depends on EXPERIMENTAL
+ bool "Provide CIFS client caching support"
depends on CIFS=m && FSCACHE || CIFS=y && FSCACHE=y
help
Makes CIFS FS-Cache capable. Say Y here if you want your CIFS data
@@ -148,8 +147,8 @@ config CIFS_FSCACHE
manager. If unsure, say N.
config CIFS_ACL
- bool "Provide CIFS ACL support (EXPERIMENTAL)"
- depends on EXPERIMENTAL && CIFS_XATTR && KEYS
+ bool "Provide CIFS ACL support"
+ depends on CIFS_XATTR && KEYS
help
Allows to fetch CIFS/NTFS ACL from the server. The DACL blob
is handed over to the application/caller.
diff --git a/fs/cifs/README b/fs/cifs/README
index 895da1dc155..b7d782bab79 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -753,10 +753,6 @@ module loading or during the runtime by using the interface
i.e. echo "value" > /sys/module/cifs/parameters/<param>
-1. echo_retries - The number of echo attempts before giving up and
- reconnecting to the server. The default is 5. The value 0
- means never reconnect.
-
-2. enable_oplocks - Enable or disable oplocks. Oplocks are enabled by default.
+1. enable_oplocks - Enable or disable oplocks. Oplocks are enabled by default.
[Y/y/1]. To disable use any of [N/n/0].
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 84e8c072470..27046462941 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -58,15 +58,16 @@ cifs_dump_mem(char *label, void *data, int length)
}
#ifdef CONFIG_CIFS_DEBUG2
-void cifs_dump_detail(struct smb_hdr *smb)
+void cifs_dump_detail(void *buf)
{
+ struct smb_hdr *smb = (struct smb_hdr *)buf;
+
cERROR(1, "Cmd: %d Err: 0x%x Flags: 0x%x Flgs2: 0x%x Mid: %d Pid: %d",
smb->Command, smb->Status.CifsError,
smb->Flags, smb->Flags2, smb->Mid, smb->Pid);
cERROR(1, "smb buf %p len %d", smb, smbCalcSize(smb));
}
-
void cifs_dump_mids(struct TCP_Server_Info *server)
{
struct list_head *tmp;
@@ -79,15 +80,15 @@ void cifs_dump_mids(struct TCP_Server_Info *server)
spin_lock(&GlobalMid_Lock);
list_for_each(tmp, &server->pending_mid_q) {
mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
- cERROR(1, "State: %d Cmd: %d Pid: %d Cbdata: %p Mid %d",
- mid_entry->midState,
- (int)mid_entry->command,
+ cERROR(1, "State: %d Cmd: %d Pid: %d Cbdata: %p Mid %llu",
+ mid_entry->mid_state,
+ le16_to_cpu(mid_entry->command),
mid_entry->pid,
mid_entry->callback_data,
mid_entry->mid);
#ifdef CONFIG_CIFS_STATS2
cERROR(1, "IsLarge: %d buf: %p time rcv: %ld now: %ld",
- mid_entry->largeBuf,
+ mid_entry->large_buf,
mid_entry->resp_buf,
mid_entry->when_received,
jiffies);
@@ -171,8 +172,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
seq_printf(m, "TCP status: %d\n\tLocal Users To "
"Server: %d SecMode: 0x%x Req On Wire: %d",
server->tcpStatus, server->srv_count,
- server->sec_mode,
- atomic_read(&server->inFlight));
+ server->sec_mode, in_flight(server));
#ifdef CONFIG_CIFS_STATS2
seq_printf(m, " In Send: %d In MaxReq Wait: %d",
@@ -218,12 +218,12 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
mid_entry = list_entry(tmp3, struct mid_q_entry,
qhead);
seq_printf(m, "\tState: %d com: %d pid:"
- " %d cbdata: %p mid %d\n",
- mid_entry->midState,
- (int)mid_entry->command,
- mid_entry->pid,
- mid_entry->callback_data,
- mid_entry->mid);
+ " %d cbdata: %p mid %llu\n",
+ mid_entry->mid_state,
+ le16_to_cpu(mid_entry->command),
+ mid_entry->pid,
+ mid_entry->callback_data,
+ mid_entry->mid);
}
spin_unlock(&GlobalMid_Lock);
}
@@ -418,7 +418,6 @@ static const struct file_operations cifs_stats_proc_fops = {
static struct proc_dir_entry *proc_fs_cifs;
static const struct file_operations cifsFYI_proc_fops;
-static const struct file_operations cifs_oplock_proc_fops;
static const struct file_operations cifs_lookup_cache_proc_fops;
static const struct file_operations traceSMB_proc_fops;
static const struct file_operations cifs_multiuser_mount_proc_fops;
@@ -439,7 +438,6 @@ cifs_proc_init(void)
#endif /* STATS */
proc_create("cifsFYI", 0, proc_fs_cifs, &cifsFYI_proc_fops);
proc_create("traceSMB", 0, proc_fs_cifs, &traceSMB_proc_fops);
- proc_create("OplockEnabled", 0, proc_fs_cifs, &cifs_oplock_proc_fops);
proc_create("LinuxExtensionsEnabled", 0, proc_fs_cifs,
&cifs_linux_ext_proc_fops);
proc_create("MultiuserMount", 0, proc_fs_cifs,
@@ -463,7 +461,6 @@ cifs_proc_clean(void)
remove_proc_entry("Stats", proc_fs_cifs);
#endif
remove_proc_entry("MultiuserMount", proc_fs_cifs);
- remove_proc_entry("OplockEnabled", proc_fs_cifs);
remove_proc_entry("SecurityFlags", proc_fs_cifs);
remove_proc_entry("LinuxExtensionsEnabled", proc_fs_cifs);
remove_proc_entry("LookupCacheEnabled", proc_fs_cifs);
@@ -509,46 +506,6 @@ static const struct file_operations cifsFYI_proc_fops = {
.write = cifsFYI_proc_write,
};
-static int cifs_oplock_proc_show(struct seq_file *m, void *v)
-{
- seq_printf(m, "%d\n", enable_oplocks);
- return 0;
-}
-
-static int cifs_oplock_proc_open(struct inode *inode, struct file *file)
-{
- return single_open(file, cifs_oplock_proc_show, NULL);
-}
-
-static ssize_t cifs_oplock_proc_write(struct file *file,
- const char __user *buffer, size_t count, loff_t *ppos)
-{
- char c;
- int rc;
-
- printk(KERN_WARNING "CIFS: The /proc/fs/cifs/OplockEnabled interface "
- "will be removed in kernel version 3.4. Please migrate to "
- "using the 'enable_oplocks' module parameter in cifs.ko.\n");
- rc = get_user(c, buffer);
- if (rc)
- return rc;
- if (c == '0' || c == 'n' || c == 'N')
- enable_oplocks = false;
- else if (c == '1' || c == 'y' || c == 'Y')
- enable_oplocks = true;
-
- return count;
-}
-
-static const struct file_operations cifs_oplock_proc_fops = {
- .owner = THIS_MODULE,
- .open = cifs_oplock_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
- .write = cifs_oplock_proc_write,
-};
-
static int cifs_linux_ext_proc_show(struct seq_file *m, void *v)
{
seq_printf(m, "%d\n", linuxExtEnabled);
@@ -676,14 +633,23 @@ static ssize_t cifs_multiuser_mount_proc_write(struct file *file,
{
char c;
int rc;
+ static bool warned;
rc = get_user(c, buffer);
if (rc)
return rc;
if (c == '0' || c == 'n' || c == 'N')
multiuser_mount = 0;
- else if (c == '1' || c == 'y' || c == 'Y')
+ else if (c == '1' || c == 'y' || c == 'Y') {
multiuser_mount = 1;
+ if (!warned) {
+ warned = true;
+ printk(KERN_WARNING "CIFS VFS: The legacy multiuser "
+ "mount code is scheduled to be deprecated in "
+ "3.5. Please switch to using the multiuser "
+ "mount option.");
+ }
+ }
return count;
}
diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h
index 8942b28cf80..566e0ae8dc2 100644
--- a/fs/cifs/cifs_debug.h
+++ b/fs/cifs/cifs_debug.h
@@ -26,13 +26,13 @@
void cifs_dump_mem(char *label, void *data, int length);
#ifdef CONFIG_CIFS_DEBUG2
#define DBG2 2
-void cifs_dump_detail(struct smb_hdr *);
+void cifs_dump_detail(void *);
void cifs_dump_mids(struct TCP_Server_Info *);
#else
#define DBG2 0
#endif
extern int traceSMB; /* flag which enables the function below */
-void dump_smb(struct smb_hdr *, int);
+void dump_smb(void *, int);
#define CIFS_INFO 0x01
#define CIFS_RC 0x02
#define CIFS_TIMER 0x04
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 2272fd5fe5b..e622863b292 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -113,9 +113,11 @@ cifs_get_spnego_key(struct cifs_ses *sesInfo)
MAX_MECH_STR_LEN +
UID_KEY_LEN + (sizeof(uid_t) * 2) +
CREDUID_KEY_LEN + (sizeof(uid_t) * 2) +
- USER_KEY_LEN + strlen(sesInfo->user_name) +
PID_KEY_LEN + (sizeof(pid_t) * 2) + 1;
+ if (sesInfo->user_name)
+ desc_len += USER_KEY_LEN + strlen(sesInfo->user_name);
+
spnego_key = ERR_PTR(-ENOMEM);
description = kzalloc(desc_len, GFP_KERNEL);
if (description == NULL)
@@ -152,8 +154,10 @@ cifs_get_spnego_key(struct cifs_ses *sesInfo)
dp = description + strlen(description);
sprintf(dp, ";creduid=0x%x", sesInfo->cred_uid);
- dp = description + strlen(description);
- sprintf(dp, ";user=%s", sesInfo->user_name);
+ if (sesInfo->user_name) {
+ dp = description + strlen(description);
+ sprintf(dp, ";user=%s", sesInfo->user_name);
+ }
dp = description + strlen(description);
sprintf(dp, ";pid=0x%x", current->pid);
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index 1b2e180b018..fbb9da95184 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -27,17 +27,17 @@
#include "cifs_debug.h"
/*
- * cifs_ucs2_bytes - how long will a string be after conversion?
- * @ucs - pointer to input string
+ * cifs_utf16_bytes - how long will a string be after conversion?
+ * @utf16 - pointer to input string
* @maxbytes - don't go past this many bytes of input string
* @codepage - destination codepage
*
- * Walk a ucs2le string and return the number of bytes that the string will
+ * Walk a utf16le string and return the number of bytes that the string will
* be after being converted to the given charset, not including any null
* termination required. Don't walk past maxbytes in the source buffer.
*/
int
-cifs_ucs2_bytes(const __le16 *from, int maxbytes,
+cifs_utf16_bytes(const __le16 *from, int maxbytes,
const struct nls_table *codepage)
{
int i;
@@ -122,7 +122,7 @@ cp_convert:
}
/*
- * cifs_from_ucs2 - convert utf16le string to local charset
+ * cifs_from_utf16 - convert utf16le string to local charset
* @to - destination buffer
* @from - source buffer
* @tolen - destination buffer size (in bytes)
@@ -130,7 +130,7 @@ cp_convert:
* @codepage - codepage to which characters should be converted
* @mapchar - should characters be remapped according to the mapchars option?
*
- * Convert a little-endian ucs2le string (as sent by the server) to a string
+ * Convert a little-endian utf16le string (as sent by the server) to a string
* in the provided codepage. The tolen and fromlen parameters are to ensure
* that the code doesn't walk off of the end of the buffer (which is always
* a danger if the alignment of the source buffer is off). The destination
@@ -139,12 +139,12 @@ cp_convert:
* null terminator).
*
* Note that some windows versions actually send multiword UTF-16 characters
- * instead of straight UCS-2. The linux nls routines however aren't able to
+ * instead of straight UTF16-2. The linux nls routines however aren't able to
* deal with those characters properly. In the event that we get some of
* those characters, they won't be translated properly.
*/
int
-cifs_from_ucs2(char *to, const __le16 *from, int tolen, int fromlen,
+cifs_from_utf16(char *to, const __le16 *from, int tolen, int fromlen,
const struct nls_table *codepage, bool mapchar)
{
int i, charlen, safelen;
@@ -190,13 +190,13 @@ cifs_from_ucs2(char *to, const __le16 *from, int tolen, int fromlen,
}
/*
- * NAME: cifs_strtoUCS()
+ * NAME: cifs_strtoUTF16()
*
* FUNCTION: Convert character string to unicode string
*
*/
int
-cifs_strtoUCS(__le16 *to, const char *from, int len,
+cifs_strtoUTF16(__le16 *to, const char *from, int len,
const struct nls_table *codepage)
{
int charlen;
@@ -206,7 +206,7 @@ cifs_strtoUCS(__le16 *to, const char *from, int len,
for (i = 0; len && *from; i++, from += charlen, len -= charlen) {
charlen = codepage->char2uni(from, len, &wchar_to);
if (charlen < 1) {
- cERROR(1, "strtoUCS: char2uni of 0x%x returned %d",
+ cERROR(1, "strtoUTF16: char2uni of 0x%x returned %d",
*from, charlen);
/* A question mark */
wchar_to = 0x003f;
@@ -220,7 +220,8 @@ cifs_strtoUCS(__le16 *to, const char *from, int len,
}
/*
- * cifs_strndup_from_ucs - copy a string from wire format to the local codepage
+ * cifs_strndup_from_utf16 - copy a string from wire format to the local
+ * codepage
* @src - source string
* @maxlen - don't walk past this many bytes in the source string
* @is_unicode - is this a unicode string?
@@ -231,19 +232,19 @@ cifs_strtoUCS(__le16 *to, const char *from, int len,
* error.
*/
char *
-cifs_strndup_from_ucs(const char *src, const int maxlen, const bool is_unicode,
- const struct nls_table *codepage)
+cifs_strndup_from_utf16(const char *src, const int maxlen,
+ const bool is_unicode, const struct nls_table *codepage)
{
int len;
char *dst;
if (is_unicode) {
- len = cifs_ucs2_bytes((__le16 *) src, maxlen, codepage);
+ len = cifs_utf16_bytes((__le16 *) src, maxlen, codepage);
len += nls_nullsize(codepage);
dst = kmalloc(len, GFP_KERNEL);
if (!dst)
return NULL;
- cifs_from_ucs2(dst, (__le16 *) src, len, maxlen, codepage,
+ cifs_from_utf16(dst, (__le16 *) src, len, maxlen, codepage,
false);
} else {
len = strnlen(src, maxlen);
@@ -264,7 +265,7 @@ cifs_strndup_from_ucs(const char *src, const int maxlen, const bool is_unicode,
* names are little endian 16 bit Unicode on the wire
*/
int
-cifsConvertToUCS(__le16 *target, const char *source, int srclen,
+cifsConvertToUTF16(__le16 *target, const char *source, int srclen,
const struct nls_table *cp, int mapChars)
{
int i, j, charlen;
@@ -273,7 +274,7 @@ cifsConvertToUCS(__le16 *target, const char *source, int srclen,
wchar_t tmp;
if (!mapChars)
- return cifs_strtoUCS(target, source, PATH_MAX, cp);
+ return cifs_strtoUTF16(target, source, PATH_MAX, cp);
for (i = 0, j = 0; i < srclen; j++) {
src_char = source[i];
@@ -281,7 +282,7 @@ cifsConvertToUCS(__le16 *target, const char *source, int srclen,
switch (src_char) {
case 0:
put_unaligned(0, &target[j]);
- goto ctoUCS_out;
+ goto ctoUTF16_out;
case ':':
dst_char = cpu_to_le16(UNI_COLON);
break;
@@ -326,7 +327,7 @@ cifsConvertToUCS(__le16 *target, const char *source, int srclen,
put_unaligned(dst_char, &target[j]);
}
-ctoUCS_out:
+ctoUTF16_out:
return i;
}
diff --git a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h
index 6d02fd56056..a513a546700 100644
--- a/fs/cifs/cifs_unicode.h
+++ b/fs/cifs/cifs_unicode.h
@@ -74,16 +74,16 @@ extern const struct UniCaseRange CifsUniLowerRange[];
#endif /* UNIUPR_NOLOWER */
#ifdef __KERNEL__
-int cifs_from_ucs2(char *to, const __le16 *from, int tolen, int fromlen,
- const struct nls_table *codepage, bool mapchar);
-int cifs_ucs2_bytes(const __le16 *from, int maxbytes,
- const struct nls_table *codepage);
-int cifs_strtoUCS(__le16 *, const char *, int, const struct nls_table *);
-char *cifs_strndup_from_ucs(const char *src, const int maxlen,
- const bool is_unicode,
- const struct nls_table *codepage);
-extern int cifsConvertToUCS(__le16 *target, const char *source, int maxlen,
- const struct nls_table *cp, int mapChars);
+int cifs_from_utf16(char *to, const __le16 *from, int tolen, int fromlen,
+ const struct nls_table *codepage, bool mapchar);
+int cifs_utf16_bytes(const __le16 *from, int maxbytes,
+ const struct nls_table *codepage);
+int cifs_strtoUTF16(__le16 *, const char *, int, const struct nls_table *);
+char *cifs_strndup_from_utf16(const char *src, const int maxlen,
+ const bool is_unicode,
+ const struct nls_table *codepage);
+extern int cifsConvertToUTF16(__le16 *target, const char *source, int maxlen,
+ const struct nls_table *cp, int mapChars);
#endif
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 72ddf23ef6f..3cc1b251ca0 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -556,6 +556,7 @@ init_cifs_idmap(void)
/* instruct request_key() to use this special keyring as a cache for
* the results it looks up */
+ set_bit(KEY_FLAG_ROOT_CAN_CLEAR, &keyring->flags);
cred->thread_keyring = keyring;
cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
root_cred = cred;
@@ -909,6 +910,8 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
umode_t group_mask = S_IRWXG;
umode_t other_mask = S_IRWXU | S_IRWXG | S_IRWXO;
+ if (num_aces > ULONG_MAX / sizeof(struct cifs_ace *))
+ return;
ppace = kmalloc(num_aces * sizeof(struct cifs_ace *),
GFP_KERNEL);
if (!ppace) {
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 5d9b9acc5fc..63c460e503b 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -327,7 +327,7 @@ build_avpair_blob(struct cifs_ses *ses, const struct nls_table *nls_cp)
attrptr->type = cpu_to_le16(NTLMSSP_AV_NB_DOMAIN_NAME);
attrptr->length = cpu_to_le16(2 * dlen);
blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name);
- cifs_strtoUCS((__le16 *)blobptr, ses->domainName, dlen, nls_cp);
+ cifs_strtoUTF16((__le16 *)blobptr, ses->domainName, dlen, nls_cp);
return 0;
}
@@ -376,7 +376,7 @@ find_domain_name(struct cifs_ses *ses, const struct nls_table *nls_cp)
kmalloc(attrsize + 1, GFP_KERNEL);
if (!ses->domainName)
return -ENOMEM;
- cifs_from_ucs2(ses->domainName,
+ cifs_from_utf16(ses->domainName,
(__le16 *)blobptr, attrsize, attrsize,
nls_cp, false);
break;
@@ -420,15 +420,20 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
}
/* convert ses->user_name to unicode and uppercase */
- len = strlen(ses->user_name);
+ len = ses->user_name ? strlen(ses->user_name) : 0;
user = kmalloc(2 + (len * 2), GFP_KERNEL);
if (user == NULL) {
cERROR(1, "calc_ntlmv2_hash: user mem alloc failure\n");
rc = -ENOMEM;
return rc;
}
- len = cifs_strtoUCS((__le16 *)user, ses->user_name, len, nls_cp);
- UniStrupr(user);
+
+ if (len) {
+ len = cifs_strtoUTF16((__le16 *)user, ses->user_name, len, nls_cp);
+ UniStrupr(user);
+ } else {
+ memset(user, '\0', 2);
+ }
rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
(char *)user, 2 * len);
@@ -448,8 +453,8 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
rc = -ENOMEM;
return rc;
}
- len = cifs_strtoUCS((__le16 *)domain, ses->domainName, len,
- nls_cp);
+ len = cifs_strtoUTF16((__le16 *)domain, ses->domainName, len,
+ nls_cp);
rc =
crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
(char *)domain, 2 * len);
@@ -468,7 +473,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
rc = -ENOMEM;
return rc;
}
- len = cifs_strtoUCS((__le16 *)server, ses->serverName, len,
+ len = cifs_strtoUTF16((__le16 *)server, ses->serverName, len,
nls_cp);
rc =
crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index b1fd382d195..d3421282244 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -76,12 +76,7 @@ MODULE_PARM_DESC(cifs_min_small, "Small network buffers in pool. Default: 30 "
unsigned int cifs_max_pending = CIFS_MAX_REQ;
module_param(cifs_max_pending, int, 0444);
MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server. "
- "Default: 50 Range: 2 to 256");
-unsigned short echo_retries = 5;
-module_param(echo_retries, ushort, 0644);
-MODULE_PARM_DESC(echo_retries, "Number of echo attempts before giving up and "
- "reconnecting server. Default: 5. 0 means "
- "never reconnect.");
+ "Default: 32767 Range: 2 to 32767.");
module_param(enable_oplocks, bool, 0644);
MODULE_PARM_DESC(enable_oplocks, "Enable or disable oplocks (bool). Default:"
"y/Y/1");
@@ -90,6 +85,8 @@ extern mempool_t *cifs_sm_req_poolp;
extern mempool_t *cifs_req_poolp;
extern mempool_t *cifs_mid_poolp;
+struct workqueue_struct *cifsiod_wq;
+
static int
cifs_read_super(struct super_block *sb)
{
@@ -119,12 +116,10 @@ cifs_read_super(struct super_block *sb)
if (IS_ERR(inode)) {
rc = PTR_ERR(inode);
- inode = NULL;
goto out_no_root;
}
- sb->s_root = d_alloc_root(inode);
-
+ sb->s_root = d_make_root(inode);
if (!sb->s_root) {
rc = -ENOMEM;
goto out_no_root;
@@ -147,9 +142,6 @@ cifs_read_super(struct super_block *sb)
out_no_root:
cERROR(1, "cifs_read_super: get root inode failed");
- if (inode)
- iput(inode);
-
return rc;
}
@@ -1116,14 +1108,20 @@ init_cifs(void)
if (cifs_max_pending < 2) {
cifs_max_pending = 2;
cFYI(1, "cifs_max_pending set to min of 2");
- } else if (cifs_max_pending > 256) {
- cifs_max_pending = 256;
- cFYI(1, "cifs_max_pending set to max of 256");
+ } else if (cifs_max_pending > CIFS_MAX_REQ) {
+ cifs_max_pending = CIFS_MAX_REQ;
+ cFYI(1, "cifs_max_pending set to max of %u", CIFS_MAX_REQ);
+ }
+
+ cifsiod_wq = alloc_workqueue("cifsiod", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
+ if (!cifsiod_wq) {
+ rc = -ENOMEM;
+ goto out_clean_proc;
}
rc = cifs_fscache_register();
if (rc)
- goto out_clean_proc;
+ goto out_destroy_wq;
rc = cifs_init_inodecache();
if (rc)
@@ -1171,6 +1169,8 @@ out_destroy_inodecache:
cifs_destroy_inodecache();
out_unreg_fscache:
cifs_fscache_unregister();
+out_destroy_wq:
+ destroy_workqueue(cifsiod_wq);
out_clean_proc:
cifs_proc_clean();
return rc;
@@ -1180,11 +1180,8 @@ static void __exit
exit_cifs(void)
{
cFYI(DBG2, "exit_cifs");
- cifs_proc_clean();
- cifs_fscache_unregister();
-#ifdef CONFIG_CIFS_DFS_UPCALL
+ unregister_filesystem(&cifs_fs_type);
cifs_dfs_release_automount_timer();
-#endif
#ifdef CONFIG_CIFS_ACL
cifs_destroy_idmaptrees();
exit_cifs_idmap();
@@ -1192,10 +1189,12 @@ exit_cifs(void)
#ifdef CONFIG_CIFS_UPCALL
unregister_key_type(&cifs_spnego_key_type);
#endif
- unregister_filesystem(&cifs_fs_type);
- cifs_destroy_inodecache();
- cifs_destroy_mids();
cifs_destroy_request_bufs();
+ cifs_destroy_mids();
+ cifs_destroy_inodecache();
+ cifs_fscache_unregister();
+ destroy_workqueue(cifsiod_wq);
+ cifs_proc_clean();
}
MODULE_AUTHOR("Steve French <sfrench@us.ibm.com>");
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index fe5ecf1b422..d1389bb33ce 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -125,5 +125,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
extern const struct export_operations cifs_export_ops;
#endif /* CONFIG_CIFS_NFSD_EXPORT */
-#define CIFS_VERSION "1.76"
+#define CIFS_VERSION "1.77"
#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index ba53c1c6c6c..4ff6313f0a9 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -55,14 +55,9 @@
/*
* MAX_REQ is the maximum number of requests that WE will send
- * on one socket concurrently. It also matches the most common
- * value of max multiplex returned by servers. We may
- * eventually want to use the negotiated value (in case
- * future servers can handle more) when we are more confident that
- * we will not have problems oveloading the socket with pending
- * write data.
+ * on one socket concurrently.
*/
-#define CIFS_MAX_REQ 50
+#define CIFS_MAX_REQ 32767
#define RFC1001_NAME_LEN 15
#define RFC1001_NAME_LEN_WITH_NULL (RFC1001_NAME_LEN + 1)
@@ -235,6 +230,12 @@ struct cifs_mnt_data {
int flags;
};
+static inline unsigned int
+get_rfc1002_length(void *buf)
+{
+ return be32_to_cpu(*((__be32 *)buf));
+}
+
struct TCP_Server_Info {
struct list_head tcp_ses_list;
struct list_head smb_ses_list;
@@ -255,7 +256,9 @@ struct TCP_Server_Info {
bool noblocksnd; /* use blocking sendmsg */
bool noautotune; /* do not autotune send buf sizes */
bool tcp_nodelay;
- atomic_t inFlight; /* number of requests on the wire to server */
+ int credits; /* send no more requests at once */
+ unsigned int in_flight; /* number of requests on the wire to server */
+ spinlock_t req_lock; /* protect the two values above */
struct mutex srv_mutex;
struct task_struct *tsk;
char server_GUID[16];
@@ -263,6 +266,7 @@ struct TCP_Server_Info {
bool session_estab; /* mark when very first sess is established */
u16 dialect; /* dialect index that server chose */
enum securityEnum secType;
+ bool oplocks:1; /* enable oplocks */
unsigned int maxReq; /* Clients should submit no more */
/* than maxReq distinct unanswered SMBs to the server when using */
/* multiplexed reads or writes */
@@ -278,7 +282,7 @@ struct TCP_Server_Info {
vcnumbers */
int capabilities; /* allow selective disabling of caps by smb sess */
int timeAdj; /* Adjust for difference in server time zone in sec */
- __u16 CurrentMid; /* multiplex id - rotating counter */
+ __u64 CurrentMid; /* multiplex id - rotating counter */
char cryptkey[CIFS_CRYPTO_KEY_SIZE]; /* used by ntlm, ntlmv2 etc */
/* 16th byte of RFC1001 workstation name is always null */
char workstation_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL];
@@ -307,6 +311,48 @@ struct TCP_Server_Info {
#endif
};
+static inline unsigned int
+in_flight(struct TCP_Server_Info *server)
+{
+ unsigned int num;
+ spin_lock(&server->req_lock);
+ num = server->in_flight;
+ spin_unlock(&server->req_lock);
+ return num;
+}
+
+static inline int*
+get_credits_field(struct TCP_Server_Info *server)
+{
+ /*
+ * This will change to switch statement when we reserve slots for echos
+ * and oplock breaks.
+ */
+ return &server->credits;
+}
+
+static inline bool
+has_credits(struct TCP_Server_Info *server, int *credits)
+{
+ int num;
+ spin_lock(&server->req_lock);
+ num = *credits;
+ spin_unlock(&server->req_lock);
+ return num > 0;
+}
+
+static inline size_t
+header_size(void)
+{
+ return sizeof(struct smb_hdr);
+}
+
+static inline size_t
+max_header_size(void)
+{
+ return MAX_CIFS_HDR_SIZE;
+}
+
/*
* Macros to allow the TCP_Server_Info->net field and related code to drop out
* when CONFIG_NET_NS isn't set.
@@ -555,9 +601,11 @@ struct cifs_io_parms {
* Take a reference on the file private data. Must be called with
* cifs_file_list_lock held.
*/
-static inline void cifsFileInfo_get(struct cifsFileInfo *cifs_file)
+static inline
+struct cifsFileInfo *cifsFileInfo_get(struct cifsFileInfo *cifs_file)
{
++cifs_file->count;
+ return cifs_file;
}
void cifsFileInfo_put(struct cifsFileInfo *cifs_file);
@@ -578,7 +626,7 @@ struct cifsInodeInfo {
bool delete_pending; /* DELETE_ON_CLOSE is set */
bool invalid_mapping; /* pagecache is invalid */
unsigned long time; /* jiffies of last update of inode */
- u64 server_eof; /* current file size on server */
+ u64 server_eof; /* current file size on server -- protected by i_lock */
u64 uniqueid; /* server inode number */
u64 createtime; /* creation time on server */
#ifdef CONFIG_CIFS_FSCACHE
@@ -685,8 +733,8 @@ typedef void (mid_callback_t)(struct mid_q_entry *mid);
/* one of these for every pending CIFS request to the server */
struct mid_q_entry {
struct list_head qhead; /* mids waiting on reply from this server */
- __u16 mid; /* multiplex id */
- __u16 pid; /* process id */
+ __u64 mid; /* multiplex id */
+ __u32 pid; /* process id */
__u32 sequence_number; /* for CIFS signing */
unsigned long when_alloc; /* when mid was created */
#ifdef CONFIG_CIFS_STATS2
@@ -696,10 +744,10 @@ struct mid_q_entry {
mid_receive_t *receive; /* call receive callback */
mid_callback_t *callback; /* call completion callback */
void *callback_data; /* general purpose pointer for callback */
- struct smb_hdr *resp_buf; /* pointer to received SMB header */
- int midState; /* wish this were enum but can not pass to wait_event */
- __u8 command; /* smb command code */
- bool largeBuf:1; /* if valid response, is pointer to large buf */
+ void *resp_buf; /* pointer to received SMB header */
+ int mid_state; /* wish this were enum but can not pass to wait_event */
+ __le16 command; /* smb command code */
+ bool large_buf:1; /* if valid response, is pointer to large buf */
bool multiRsp:1; /* multiple trans2 responses for one request */
bool multiEnd:1; /* both received */
};
@@ -879,6 +927,8 @@ require use of the stronger protocol */
#define CIFSSEC_MASK 0xB70B7 /* current flags supported if weak */
#endif /* UPCALL */
#else /* do not allow weak pw hash */
+#define CIFSSEC_MUST_LANMAN 0
+#define CIFSSEC_MUST_PLNTXT 0
#ifdef CONFIG_CIFS_UPCALL
#define CIFSSEC_MASK 0x8F08F /* flags supported if no weak allowed */
#else
@@ -1008,9 +1058,6 @@ GLOBAL_EXTERN unsigned int cifs_min_rcv; /* min size of big ntwrk buf pool */
GLOBAL_EXTERN unsigned int cifs_min_small; /* min size of small buf pool */
GLOBAL_EXTERN unsigned int cifs_max_pending; /* MAX requests at once to server*/
-/* reconnect after this many failed echo attempts */
-GLOBAL_EXTERN unsigned short echo_retries;
-
#ifdef CONFIG_CIFS_ACL
GLOBAL_EXTERN struct rb_root uidtree;
GLOBAL_EXTERN struct rb_root gidtree;
@@ -1025,5 +1072,6 @@ GLOBAL_EXTERN spinlock_t gidsidlock;
void cifs_oplock_break(struct work_struct *work);
extern const struct slow_work_ops cifs_oplock_break_ops;
+extern struct workqueue_struct *cifsiod_wq;
#endif /* _CIFS_GLOB_H */
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 6f4e243e0f6..96192c1e380 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -77,7 +77,7 @@ extern int SendReceive(const unsigned int /* xid */ , struct cifs_ses *,
struct smb_hdr * /* out */ ,
int * /* bytes returned */ , const int long_op);
extern int SendReceiveNoRsp(const unsigned int xid, struct cifs_ses *ses,
- struct smb_hdr *in_buf, int flags);
+ char *in_buf, int flags);
extern int cifs_check_receive(struct mid_q_entry *mid,
struct TCP_Server_Info *server, bool log_error);
extern int SendReceive2(const unsigned int /* xid */ , struct cifs_ses *,
@@ -88,9 +88,11 @@ extern int SendReceiveBlockingLock(const unsigned int xid,
struct smb_hdr *in_buf ,
struct smb_hdr *out_buf,
int *bytes_returned);
-extern int checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length);
-extern bool is_valid_oplock_break(struct smb_hdr *smb,
- struct TCP_Server_Info *);
+extern void cifs_add_credits(struct TCP_Server_Info *server,
+ const unsigned int add);
+extern void cifs_set_credits(struct TCP_Server_Info *server, const int val);
+extern int checkSMB(char *buf, unsigned int length);
+extern bool is_valid_oplock_break(char *, struct TCP_Server_Info *);
extern bool backup_cred(struct cifs_sb_info *);
extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof);
extern void cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
@@ -104,7 +106,7 @@ extern int cifs_convert_address(struct sockaddr *dst, const char *src, int len);
extern int cifs_set_port(struct sockaddr *addr, const unsigned short int port);
extern int cifs_fill_sockaddr(struct sockaddr *dst, const char *src, int len,
const unsigned short int port);
-extern int map_smb_to_linux_error(struct smb_hdr *smb, bool logErr);
+extern int map_smb_to_linux_error(char *buf, bool logErr);
extern void header_assemble(struct smb_hdr *, char /* command */ ,
const struct cifs_tcon *, int /* length of
fixed section (word count) in two byte units */);
@@ -113,7 +115,7 @@ extern int small_smb_init_no_tc(const int smb_cmd, const int wct,
void **request_buf);
extern int CIFS_SessSetup(unsigned int xid, struct cifs_ses *ses,
const struct nls_table *nls_cp);
-extern __u16 GetNextMid(struct TCP_Server_Info *server);
+extern __u64 GetNextMid(struct TCP_Server_Info *server);
extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601);
extern u64 cifs_UnixTimeToNT(struct timespec);
extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time,
@@ -168,7 +170,13 @@ extern struct smb_vol *cifs_get_volume_info(char *mount_data,
const char *devname);
extern int cifs_mount(struct cifs_sb_info *, struct smb_vol *);
extern void cifs_umount(struct cifs_sb_info *);
+
+#if IS_ENABLED(CONFIG_CIFS_DFS_UPCALL)
extern void cifs_dfs_release_automount_timer(void);
+#else /* ! IS_ENABLED(CONFIG_CIFS_DFS_UPCALL) */
+#define cifs_dfs_release_automount_timer() do { } while (0)
+#endif /* ! IS_ENABLED(CONFIG_CIFS_DFS_UPCALL) */
+
void cifs_proc_init(void);
void cifs_proc_clean(void);
@@ -475,18 +483,25 @@ int cifs_async_readv(struct cifs_readdata *rdata);
/* asynchronous write support */
struct cifs_writedata {
struct kref refcount;
+ struct list_head list;
+ struct completion done;
enum writeback_sync_modes sync_mode;
struct work_struct work;
struct cifsFileInfo *cfile;
__u64 offset;
+ pid_t pid;
unsigned int bytes;
int result;
+ void (*marshal_iov) (struct kvec *iov,
+ struct cifs_writedata *wdata);
unsigned int nr_pages;
struct page *pages[1];
};
int cifs_async_writev(struct cifs_writedata *wdata);
-struct cifs_writedata *cifs_writedata_alloc(unsigned int nr_pages);
+void cifs_writev_complete(struct work_struct *work);
+struct cifs_writedata *cifs_writedata_alloc(unsigned int nr_pages,
+ work_func_t complete);
void cifs_writedata_release(struct kref *refcount);
#endif /* _CIFSPROTO_H */
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 6600aa2d2ef..f52c5ab78f9 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -458,7 +458,10 @@ CIFSSMBNegotiate(unsigned int xid, struct cifs_ses *ses)
goto neg_err_exit;
}
server->sec_mode = (__u8)le16_to_cpu(rsp->SecurityMode);
- server->maxReq = le16_to_cpu(rsp->MaxMpxCount);
+ server->maxReq = min_t(unsigned int,
+ le16_to_cpu(rsp->MaxMpxCount),
+ cifs_max_pending);
+ cifs_set_credits(server, server->maxReq);
server->maxBuf = le16_to_cpu(rsp->MaxBufSize);
server->max_vcs = le16_to_cpu(rsp->MaxNumberVcs);
/* even though we do not use raw we might as well set this
@@ -564,7 +567,9 @@ CIFSSMBNegotiate(unsigned int xid, struct cifs_ses *ses)
/* one byte, so no need to convert this or EncryptionKeyLen from
little endian */
- server->maxReq = le16_to_cpu(pSMBr->MaxMpxCount);
+ server->maxReq = min_t(unsigned int, le16_to_cpu(pSMBr->MaxMpxCount),
+ cifs_max_pending);
+ cifs_set_credits(server, server->maxReq);
/* probably no need to store and check maxvcs */
server->maxBuf = le32_to_cpu(pSMBr->MaxBufferSize);
server->max_rw = le32_to_cpu(pSMBr->MaxRawSize);
@@ -691,7 +696,7 @@ CIFSSMBTDis(const int xid, struct cifs_tcon *tcon)
if (rc)
return rc;
- rc = SendReceiveNoRsp(xid, tcon->ses, smb_buffer, 0);
+ rc = SendReceiveNoRsp(xid, tcon->ses, (char *)smb_buffer, 0);
if (rc)
cFYI(1, "Tree disconnect failed %d", rc);
@@ -716,8 +721,7 @@ cifs_echo_callback(struct mid_q_entry *mid)
struct TCP_Server_Info *server = mid->callback_data;
DeleteMidQEntry(mid);
- atomic_dec(&server->inFlight);
- wake_up(&server->request_q);
+ cifs_add_credits(server, 1);
}
int
@@ -788,7 +792,7 @@ CIFSSMBLogoff(const int xid, struct cifs_ses *ses)
pSMB->hdr.Uid = ses->Suid;
pSMB->AndXCommand = 0xFF;
- rc = SendReceiveNoRsp(xid, ses, (struct smb_hdr *) pSMB, 0);
+ rc = SendReceiveNoRsp(xid, ses, (char *) pSMB, 0);
session_already_dead:
mutex_unlock(&ses->session_mutex);
@@ -821,8 +825,8 @@ PsxDelete:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- cifsConvertToUCS((__le16 *) pSMB->FileName, fileName,
- PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) pSMB->FileName, fileName,
+ PATH_MAX, nls_codepage, remap);
name_len++; /* trailing null */
name_len *= 2;
} else { /* BB add path length overrun check */
@@ -893,8 +897,8 @@ DelFileRetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- cifsConvertToUCS((__le16 *) pSMB->fileName, fileName,
- PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) pSMB->fileName, fileName,
+ PATH_MAX, nls_codepage, remap);
name_len++; /* trailing null */
name_len *= 2;
} else { /* BB improve check for buffer overruns BB */
@@ -938,8 +942,8 @@ RmDirRetry:
return rc;
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
- name_len = cifsConvertToUCS((__le16 *) pSMB->DirName, dirName,
- PATH_MAX, nls_codepage, remap);
+ name_len = cifsConvertToUTF16((__le16 *) pSMB->DirName, dirName,
+ PATH_MAX, nls_codepage, remap);
name_len++; /* trailing null */
name_len *= 2;
} else { /* BB improve check for buffer overruns BB */
@@ -981,8 +985,8 @@ MkDirRetry:
return rc;
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
- name_len = cifsConvertToUCS((__le16 *) pSMB->DirName, name,
- PATH_MAX, nls_codepage, remap);
+ name_len = cifsConvertToUTF16((__le16 *) pSMB->DirName, name,
+ PATH_MAX, nls_codepage, remap);
name_len++; /* trailing null */
name_len *= 2;
} else { /* BB improve check for buffer overruns BB */
@@ -1030,8 +1034,8 @@ PsxCreat:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- cifsConvertToUCS((__le16 *) pSMB->FileName, name,
- PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) pSMB->FileName, name,
+ PATH_MAX, nls_codepage, remap);
name_len++; /* trailing null */
name_len *= 2;
} else { /* BB improve the check for buffer overruns BB */
@@ -1197,8 +1201,8 @@ OldOpenRetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
count = 1; /* account for one byte pad to word boundary */
name_len =
- cifsConvertToUCS((__le16 *) (pSMB->fileName + 1),
- fileName, PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) (pSMB->fileName + 1),
+ fileName, PATH_MAX, nls_codepage, remap);
name_len++; /* trailing null */
name_len *= 2;
} else { /* BB improve check for buffer overruns BB */
@@ -1304,8 +1308,8 @@ openRetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
count = 1; /* account for one byte pad to word boundary */
name_len =
- cifsConvertToUCS((__le16 *) (pSMB->fileName + 1),
- fileName, PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) (pSMB->fileName + 1),
+ fileName, PATH_MAX, nls_codepage, remap);
name_len++; /* trailing null */
name_len *= 2;
pSMB->NameLength = cpu_to_le16(name_len);
@@ -1410,8 +1414,7 @@ cifs_readdata_free(struct cifs_readdata *rdata)
static int
cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid)
{
- READ_RSP *rsp = (READ_RSP *)server->smallbuf;
- unsigned int rfclen = be32_to_cpu(rsp->hdr.smb_buf_length);
+ unsigned int rfclen = get_rfc1002_length(server->smallbuf);
int remaining = rfclen + 4 - server->total_read;
struct cifs_readdata *rdata = mid->callback_data;
@@ -1420,7 +1423,7 @@ cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid)
length = cifs_read_from_socket(server, server->bigbuf,
min_t(unsigned int, remaining,
- CIFSMaxBufSize + MAX_CIFS_HDR_SIZE));
+ CIFSMaxBufSize + max_header_size()));
if (length < 0)
return length;
server->total_read += length;
@@ -1431,19 +1434,40 @@ cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid)
return 0;
}
+static inline size_t
+read_rsp_size(void)
+{
+ return sizeof(READ_RSP);
+}
+
+static inline unsigned int
+read_data_offset(char *buf)
+{
+ READ_RSP *rsp = (READ_RSP *)buf;
+ return le16_to_cpu(rsp->DataOffset);
+}
+
+static inline unsigned int
+read_data_length(char *buf)
+{
+ READ_RSP *rsp = (READ_RSP *)buf;
+ return (le16_to_cpu(rsp->DataLengthHigh) << 16) +
+ le16_to_cpu(rsp->DataLength);
+}
+
static int
cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
{
int length, len;
unsigned int data_offset, remaining, data_len;
struct cifs_readdata *rdata = mid->callback_data;
- READ_RSP *rsp = (READ_RSP *)server->smallbuf;
- unsigned int rfclen = be32_to_cpu(rsp->hdr.smb_buf_length) + 4;
+ char *buf = server->smallbuf;
+ unsigned int buflen = get_rfc1002_length(buf) + 4;
u64 eof;
pgoff_t eof_index;
struct page *page, *tpage;
- cFYI(1, "%s: mid=%u offset=%llu bytes=%u", __func__,
+ cFYI(1, "%s: mid=%llu offset=%llu bytes=%u", __func__,
mid->mid, rdata->offset, rdata->bytes);
/*
@@ -1451,10 +1475,9 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
* can if there's not enough data. At this point, we've read down to
* the Mid.
*/
- len = min_t(unsigned int, rfclen, sizeof(*rsp)) -
- sizeof(struct smb_hdr) + 1;
+ len = min_t(unsigned int, buflen, read_rsp_size()) - header_size() + 1;
- rdata->iov[0].iov_base = server->smallbuf + sizeof(struct smb_hdr) - 1;
+ rdata->iov[0].iov_base = buf + header_size() - 1;
rdata->iov[0].iov_len = len;
length = cifs_readv_from_socket(server, rdata->iov, 1, len);
@@ -1463,7 +1486,7 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
server->total_read += length;
/* Was the SMB read successful? */
- rdata->result = map_smb_to_linux_error(&rsp->hdr, false);
+ rdata->result = map_smb_to_linux_error(buf, false);
if (rdata->result != 0) {
cFYI(1, "%s: server returned error %d", __func__,
rdata->result);
@@ -1471,14 +1494,14 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
}
/* Is there enough to get to the rest of the READ_RSP header? */
- if (server->total_read < sizeof(READ_RSP)) {
+ if (server->total_read < read_rsp_size()) {
cFYI(1, "%s: server returned short header. got=%u expected=%zu",
- __func__, server->total_read, sizeof(READ_RSP));
+ __func__, server->total_read, read_rsp_size());
rdata->result = -EIO;
return cifs_readv_discard(server, mid);
}
- data_offset = le16_to_cpu(rsp->DataOffset) + 4;
+ data_offset = read_data_offset(buf) + 4;
if (data_offset < server->total_read) {
/*
* win2k8 sometimes sends an offset of 0 when the read
@@ -1502,7 +1525,7 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
len = data_offset - server->total_read;
if (len > 0) {
/* read any junk before data into the rest of smallbuf */
- rdata->iov[0].iov_base = server->smallbuf + server->total_read;
+ rdata->iov[0].iov_base = buf + server->total_read;
rdata->iov[0].iov_len = len;
length = cifs_readv_from_socket(server, rdata->iov, 1, len);
if (length < 0)
@@ -1511,15 +1534,14 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
}
/* set up first iov for signature check */
- rdata->iov[0].iov_base = server->smallbuf;
+ rdata->iov[0].iov_base = buf;
rdata->iov[0].iov_len = server->total_read;
cFYI(1, "0: iov_base=%p iov_len=%zu",
rdata->iov[0].iov_base, rdata->iov[0].iov_len);
/* how much data is in the response? */
- data_len = le16_to_cpu(rsp->DataLengthHigh) << 16;
- data_len += le16_to_cpu(rsp->DataLength);
- if (data_offset + data_len > rfclen) {
+ data_len = read_data_length(buf);
+ if (data_offset + data_len > buflen) {
/* data_len is corrupt -- discard frame */
rdata->result = -EIO;
return cifs_readv_discard(server, mid);
@@ -1598,11 +1620,11 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
rdata->bytes = length;
- cFYI(1, "total_read=%u rfclen=%u remaining=%u", server->total_read,
- rfclen, remaining);
+ cFYI(1, "total_read=%u buflen=%u remaining=%u", server->total_read,
+ buflen, remaining);
/* discard anything left over */
- if (server->total_read < rfclen)
+ if (server->total_read < buflen)
return cifs_readv_discard(server, mid);
dequeue_mid(mid, false);
@@ -1643,10 +1665,10 @@ cifs_readv_callback(struct mid_q_entry *mid)
struct cifs_tcon *tcon = tlink_tcon(rdata->cfile->tlink);
struct TCP_Server_Info *server = tcon->ses->server;
- cFYI(1, "%s: mid=%u state=%d result=%d bytes=%u", __func__,
- mid->mid, mid->midState, rdata->result, rdata->bytes);
+ cFYI(1, "%s: mid=%llu state=%d result=%d bytes=%u", __func__,
+ mid->mid, mid->mid_state, rdata->result, rdata->bytes);
- switch (mid->midState) {
+ switch (mid->mid_state) {
case MID_RESPONSE_RECEIVED:
/* result already set, check signature */
if (server->sec_mode &
@@ -1667,10 +1689,9 @@ cifs_readv_callback(struct mid_q_entry *mid)
rdata->result = -EIO;
}
- queue_work(system_nrt_wq, &rdata->work);
+ queue_work(cifsiod_wq, &rdata->work);
DeleteMidQEntry(mid);
- atomic_dec(&server->inFlight);
- wake_up(&server->request_q);
+ cifs_add_credits(server, 1);
}
/* cifs_async_readv - send an async write, and set up mid to handle result */
@@ -2014,7 +2035,7 @@ cifs_writev_requeue(struct cifs_writedata *wdata)
kref_put(&wdata->refcount, cifs_writedata_release);
}
-static void
+void
cifs_writev_complete(struct work_struct *work)
{
struct cifs_writedata *wdata = container_of(work,
@@ -2023,7 +2044,9 @@ cifs_writev_complete(struct work_struct *work)
int i = 0;
if (wdata->result == 0) {
+ spin_lock(&inode->i_lock);
cifs_update_eof(CIFS_I(inode), wdata->offset, wdata->bytes);
+ spin_unlock(&inode->i_lock);
cifs_stats_bytes_written(tlink_tcon(wdata->cfile->tlink),
wdata->bytes);
} else if (wdata->sync_mode == WB_SYNC_ALL && wdata->result == -EAGAIN)
@@ -2044,7 +2067,7 @@ cifs_writev_complete(struct work_struct *work)
}
struct cifs_writedata *
-cifs_writedata_alloc(unsigned int nr_pages)
+cifs_writedata_alloc(unsigned int nr_pages, work_func_t complete)
{
struct cifs_writedata *wdata;
@@ -2058,14 +2081,16 @@ cifs_writedata_alloc(unsigned int nr_pages)
wdata = kzalloc(sizeof(*wdata) +
sizeof(struct page *) * (nr_pages - 1), GFP_NOFS);
if (wdata != NULL) {
- INIT_WORK(&wdata->work, cifs_writev_complete);
kref_init(&wdata->refcount);
+ INIT_LIST_HEAD(&wdata->list);
+ init_completion(&wdata->done);
+ INIT_WORK(&wdata->work, complete);
}
return wdata;
}
/*
- * Check the midState and signature on received buffer (if any), and queue the
+ * Check the mid_state and signature on received buffer (if any), and queue the
* workqueue completion task.
*/
static void
@@ -2076,7 +2101,7 @@ cifs_writev_callback(struct mid_q_entry *mid)
unsigned int written;
WRITE_RSP *smb = (WRITE_RSP *)mid->resp_buf;
- switch (mid->midState) {
+ switch (mid->mid_state) {
case MID_RESPONSE_RECEIVED:
wdata->result = cifs_check_receive(mid, tcon->ses->server, 0);
if (wdata->result != 0)
@@ -2108,10 +2133,9 @@ cifs_writev_callback(struct mid_q_entry *mid)
break;
}
- queue_work(system_nrt_wq, &wdata->work);
+ queue_work(cifsiod_wq, &wdata->work);
DeleteMidQEntry(mid);
- atomic_dec(&tcon->ses->server->inFlight);
- wake_up(&tcon->ses->server->request_q);
+ cifs_add_credits(tcon->ses->server, 1);
}
/* cifs_async_writev - send an async write, and set up mid to handle result */
@@ -2122,7 +2146,6 @@ cifs_async_writev(struct cifs_writedata *wdata)
WRITE_REQ *smb = NULL;
int wct;
struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink);
- struct inode *inode = wdata->cfile->dentry->d_inode;
struct kvec *iov = NULL;
if (tcon->ses->capabilities & CAP_LARGE_FILES) {
@@ -2146,8 +2169,8 @@ cifs_async_writev(struct cifs_writedata *wdata)
goto async_writev_out;
}
- smb->hdr.Pid = cpu_to_le16((__u16)wdata->cfile->pid);
- smb->hdr.PidHigh = cpu_to_le16((__u16)(wdata->cfile->pid >> 16));
+ smb->hdr.Pid = cpu_to_le16((__u16)wdata->pid);
+ smb->hdr.PidHigh = cpu_to_le16((__u16)(wdata->pid >> 16));
smb->AndXCommand = 0xFF; /* none */
smb->Fid = wdata->cfile->netfid;
@@ -2165,15 +2188,13 @@ cifs_async_writev(struct cifs_writedata *wdata)
iov[0].iov_len = be32_to_cpu(smb->hdr.smb_buf_length) + 4 + 1;
iov[0].iov_base = smb;
- /* marshal up the pages into iov array */
- wdata->bytes = 0;
- for (i = 0; i < wdata->nr_pages; i++) {
- iov[i + 1].iov_len = min(inode->i_size -
- page_offset(wdata->pages[i]),
- (loff_t)PAGE_CACHE_SIZE);
- iov[i + 1].iov_base = kmap(wdata->pages[i]);
- wdata->bytes += iov[i + 1].iov_len;
- }
+ /*
+ * This function should marshal up the page array into the kvec
+ * array, reserving [0] for the header. It should kmap the pages
+ * and set the iov_len properly for each one. It may also set
+ * wdata->bytes too.
+ */
+ wdata->marshal_iov(iov, wdata);
cFYI(1, "async write at %llu %u bytes", wdata->offset, wdata->bytes);
@@ -2418,8 +2439,7 @@ CIFSSMBLock(const int xid, struct cifs_tcon *tcon,
(struct smb_hdr *) pSMB, &bytes_returned);
cifs_small_buf_release(pSMB);
} else {
- rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *)pSMB,
- timeout);
+ rc = SendReceiveNoRsp(xid, tcon->ses, (char *)pSMB, timeout);
/* SMB buffer freed by function above */
}
cifs_stats_inc(&tcon->num_locks);
@@ -2586,7 +2606,7 @@ CIFSSMBClose(const int xid, struct cifs_tcon *tcon, int smb_file_id)
pSMB->FileID = (__u16) smb_file_id;
pSMB->LastWriteTime = 0xFFFFFFFF;
pSMB->ByteCount = 0;
- rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
+ rc = SendReceiveNoRsp(xid, tcon->ses, (char *) pSMB, 0);
cifs_stats_inc(&tcon->num_closes);
if (rc) {
if (rc != -EINTR) {
@@ -2615,7 +2635,7 @@ CIFSSMBFlush(const int xid, struct cifs_tcon *tcon, int smb_file_id)
pSMB->FileID = (__u16) smb_file_id;
pSMB->ByteCount = 0;
- rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
+ rc = SendReceiveNoRsp(xid, tcon->ses, (char *) pSMB, 0);
cifs_stats_inc(&tcon->num_flushes);
if (rc)
cERROR(1, "Send error in Flush = %d", rc);
@@ -2649,16 +2669,16 @@ renameRetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- cifsConvertToUCS((__le16 *) pSMB->OldFileName, fromName,
- PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) pSMB->OldFileName, fromName,
+ PATH_MAX, nls_codepage, remap);
name_len++; /* trailing null */
name_len *= 2;
pSMB->OldFileName[name_len] = 0x04; /* pad */
/* protocol requires ASCII signature byte on Unicode string */
pSMB->OldFileName[name_len + 1] = 0x00;
name_len2 =
- cifsConvertToUCS((__le16 *)&pSMB->OldFileName[name_len + 2],
- toName, PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *)&pSMB->OldFileName[name_len+2],
+ toName, PATH_MAX, nls_codepage, remap);
name_len2 += 1 /* trailing null */ + 1 /* Signature word */ ;
name_len2 *= 2; /* convert to bytes */
} else { /* BB improve the check for buffer overruns BB */
@@ -2738,10 +2758,12 @@ int CIFSSMBRenameOpenFile(const int xid, struct cifs_tcon *pTcon,
/* unicode only call */
if (target_name == NULL) {
sprintf(dummy_string, "cifs%x", pSMB->hdr.Mid);
- len_of_str = cifsConvertToUCS((__le16 *)rename_info->target_name,
+ len_of_str =
+ cifsConvertToUTF16((__le16 *)rename_info->target_name,
dummy_string, 24, nls_codepage, remap);
} else {
- len_of_str = cifsConvertToUCS((__le16 *)rename_info->target_name,
+ len_of_str =
+ cifsConvertToUTF16((__le16 *)rename_info->target_name,
target_name, PATH_MAX, nls_codepage,
remap);
}
@@ -2795,17 +2817,17 @@ copyRetry:
pSMB->Flags = cpu_to_le16(flags & COPY_TREE);
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
- name_len = cifsConvertToUCS((__le16 *) pSMB->OldFileName,
- fromName, PATH_MAX, nls_codepage,
- remap);
+ name_len = cifsConvertToUTF16((__le16 *) pSMB->OldFileName,
+ fromName, PATH_MAX, nls_codepage,
+ remap);
name_len++; /* trailing null */
name_len *= 2;
pSMB->OldFileName[name_len] = 0x04; /* pad */
/* protocol requires ASCII signature byte on Unicode string */
pSMB->OldFileName[name_len + 1] = 0x00;
name_len2 =
- cifsConvertToUCS((__le16 *)&pSMB->OldFileName[name_len + 2],
- toName, PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *)&pSMB->OldFileName[name_len+2],
+ toName, PATH_MAX, nls_codepage, remap);
name_len2 += 1 /* trailing null */ + 1 /* Signature word */ ;
name_len2 *= 2; /* convert to bytes */
} else { /* BB improve the check for buffer overruns BB */
@@ -2861,9 +2883,9 @@ createSymLinkRetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- cifs_strtoUCS((__le16 *) pSMB->FileName, fromName, PATH_MAX
- /* find define for this maxpathcomponent */
- , nls_codepage);
+ cifs_strtoUTF16((__le16 *) pSMB->FileName, fromName,
+ /* find define for this maxpathcomponent */
+ PATH_MAX, nls_codepage);
name_len++; /* trailing null */
name_len *= 2;
@@ -2885,9 +2907,9 @@ createSymLinkRetry:
data_offset = (char *) (&pSMB->hdr.Protocol) + offset;
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len_target =
- cifs_strtoUCS((__le16 *) data_offset, toName, PATH_MAX
- /* find define for this maxpathcomponent */
- , nls_codepage);
+ cifs_strtoUTF16((__le16 *) data_offset, toName, PATH_MAX
+ /* find define for this maxpathcomponent */
+ , nls_codepage);
name_len_target++; /* trailing null */
name_len_target *= 2;
} else { /* BB improve the check for buffer overruns BB */
@@ -2949,8 +2971,8 @@ createHardLinkRetry:
return rc;
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
- name_len = cifsConvertToUCS((__le16 *) pSMB->FileName, toName,
- PATH_MAX, nls_codepage, remap);
+ name_len = cifsConvertToUTF16((__le16 *) pSMB->FileName, toName,
+ PATH_MAX, nls_codepage, remap);
name_len++; /* trailing null */
name_len *= 2;
@@ -2972,8 +2994,8 @@ createHardLinkRetry:
data_offset = (char *) (&pSMB->hdr.Protocol) + offset;
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len_target =
- cifsConvertToUCS((__le16 *) data_offset, fromName, PATH_MAX,
- nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) data_offset, fromName,
+ PATH_MAX, nls_codepage, remap);
name_len_target++; /* trailing null */
name_len_target *= 2;
} else { /* BB improve the check for buffer overruns BB */
@@ -3042,8 +3064,8 @@ winCreateHardLinkRetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- cifsConvertToUCS((__le16 *) pSMB->OldFileName, fromName,
- PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) pSMB->OldFileName, fromName,
+ PATH_MAX, nls_codepage, remap);
name_len++; /* trailing null */
name_len *= 2;
@@ -3051,8 +3073,8 @@ winCreateHardLinkRetry:
pSMB->OldFileName[name_len] = 0x04;
pSMB->OldFileName[name_len + 1] = 0x00; /* pad */
name_len2 =
- cifsConvertToUCS((__le16 *)&pSMB->OldFileName[name_len + 2],
- toName, PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *)&pSMB->OldFileName[name_len+2],
+ toName, PATH_MAX, nls_codepage, remap);
name_len2 += 1 /* trailing null */ + 1 /* Signature word */ ;
name_len2 *= 2; /* convert to bytes */
} else { /* BB improve the check for buffer overruns BB */
@@ -3108,8 +3130,8 @@ querySymLinkRetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- cifs_strtoUCS((__le16 *) pSMB->FileName, searchName,
- PATH_MAX, nls_codepage);
+ cifs_strtoUTF16((__le16 *) pSMB->FileName, searchName,
+ PATH_MAX, nls_codepage);
name_len++; /* trailing null */
name_len *= 2;
} else { /* BB improve the check for buffer overruns BB */
@@ -3166,8 +3188,8 @@ querySymLinkRetry:
is_unicode = false;
/* BB FIXME investigate remapping reserved chars here */
- *symlinkinfo = cifs_strndup_from_ucs(data_start, count,
- is_unicode, nls_codepage);
+ *symlinkinfo = cifs_strndup_from_utf16(data_start,
+ count, is_unicode, nls_codepage);
if (!*symlinkinfo)
rc = -ENOMEM;
}
@@ -3450,8 +3472,9 @@ queryAclRetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- cifsConvertToUCS((__le16 *) pSMB->FileName, searchName,
- PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) pSMB->FileName,
+ searchName, PATH_MAX, nls_codepage,
+ remap);
name_len++; /* trailing null */
name_len *= 2;
pSMB->FileName[name_len] = 0;
@@ -3537,8 +3560,8 @@ setAclRetry:
return rc;
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- cifsConvertToUCS((__le16 *) pSMB->FileName, fileName,
- PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) pSMB->FileName, fileName,
+ PATH_MAX, nls_codepage, remap);
name_len++; /* trailing null */
name_len *= 2;
} else { /* BB improve the check for buffer overruns BB */
@@ -3869,13 +3892,12 @@ CIFSSMBSetCIFSACL(const int xid, struct cifs_tcon *tcon, __u16 fid,
int rc = 0;
int bytes_returned = 0;
SET_SEC_DESC_REQ *pSMB = NULL;
- NTRANSACT_RSP *pSMBr = NULL;
+ void *pSMBr;
setCifsAclRetry:
- rc = smb_init(SMB_COM_NT_TRANSACT, 19, tcon, (void **) &pSMB,
- (void **) &pSMBr);
+ rc = smb_init(SMB_COM_NT_TRANSACT, 19, tcon, (void **) &pSMB, &pSMBr);
if (rc)
- return (rc);
+ return rc;
pSMB->MaxSetupCount = 0;
pSMB->Reserved = 0;
@@ -3903,9 +3925,8 @@ setCifsAclRetry:
pSMB->AclFlags = cpu_to_le32(aclflag);
if (pntsd && acllen) {
- memcpy((char *) &pSMBr->hdr.Protocol + data_offset,
- (char *) pntsd,
- acllen);
+ memcpy((char *)pSMBr + offsetof(struct smb_hdr, Protocol) +
+ data_offset, pntsd, acllen);
inc_rfc1001_len(pSMB, byte_count + data_count);
} else
inc_rfc1001_len(pSMB, byte_count);
@@ -3948,8 +3969,9 @@ QInfRetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- cifsConvertToUCS((__le16 *) pSMB->FileName, searchName,
- PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) pSMB->FileName,
+ searchName, PATH_MAX, nls_codepage,
+ remap);
name_len++; /* trailing null */
name_len *= 2;
} else {
@@ -4086,8 +4108,8 @@ QPathInfoRetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- cifsConvertToUCS((__le16 *) pSMB->FileName, searchName,
- PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) pSMB->FileName, searchName,
+ PATH_MAX, nls_codepage, remap);
name_len++; /* trailing null */
name_len *= 2;
} else { /* BB improve the check for buffer overruns BB */
@@ -4255,8 +4277,8 @@ UnixQPathInfoRetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- cifsConvertToUCS((__le16 *) pSMB->FileName, searchName,
- PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) pSMB->FileName, searchName,
+ PATH_MAX, nls_codepage, remap);
name_len++; /* trailing null */
name_len *= 2;
} else { /* BB improve the check for buffer overruns BB */
@@ -4344,8 +4366,8 @@ findFirstRetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- cifsConvertToUCS((__le16 *) pSMB->FileName, searchName,
- PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) pSMB->FileName, searchName,
+ PATH_MAX, nls_codepage, remap);
/* We can not add the asterik earlier in case
it got remapped to 0xF03A as if it were part of the
directory name instead of a wildcard */
@@ -4619,7 +4641,7 @@ CIFSFindClose(const int xid, struct cifs_tcon *tcon,
pSMB->FileID = searchHandle;
pSMB->ByteCount = 0;
- rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
+ rc = SendReceiveNoRsp(xid, tcon->ses, (char *) pSMB, 0);
if (rc)
cERROR(1, "Send error in FindClose = %d", rc);
@@ -4656,8 +4678,9 @@ GetInodeNumberRetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- cifsConvertToUCS((__le16 *) pSMB->FileName, searchName,
- PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) pSMB->FileName,
+ searchName, PATH_MAX, nls_codepage,
+ remap);
name_len++; /* trailing null */
name_len *= 2;
} else { /* BB improve the check for buffer overruns BB */
@@ -4794,9 +4817,9 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
rc = -ENOMEM;
goto parse_DFS_referrals_exit;
}
- cifsConvertToUCS((__le16 *) tmp, searchName,
- PATH_MAX, nls_codepage, remap);
- node->path_consumed = cifs_ucs2_bytes(tmp,
+ cifsConvertToUTF16((__le16 *) tmp, searchName,
+ PATH_MAX, nls_codepage, remap);
+ node->path_consumed = cifs_utf16_bytes(tmp,
le16_to_cpu(pSMBr->PathConsumed),
nls_codepage);
kfree(tmp);
@@ -4809,8 +4832,8 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
/* copy DfsPath */
temp = (char *)ref + le16_to_cpu(ref->DfsPathOffset);
max_len = data_end - temp;
- node->path_name = cifs_strndup_from_ucs(temp, max_len,
- is_unicode, nls_codepage);
+ node->path_name = cifs_strndup_from_utf16(temp, max_len,
+ is_unicode, nls_codepage);
if (!node->path_name) {
rc = -ENOMEM;
goto parse_DFS_referrals_exit;
@@ -4819,8 +4842,8 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
/* copy link target UNC */
temp = (char *)ref + le16_to_cpu(ref->NetworkAddressOffset);
max_len = data_end - temp;
- node->node_name = cifs_strndup_from_ucs(temp, max_len,
- is_unicode, nls_codepage);
+ node->node_name = cifs_strndup_from_utf16(temp, max_len,
+ is_unicode, nls_codepage);
if (!node->node_name)
rc = -ENOMEM;
}
@@ -4873,8 +4896,9 @@ getDFSRetry:
if (ses->capabilities & CAP_UNICODE) {
pSMB->hdr.Flags2 |= SMBFLG2_UNICODE;
name_len =
- cifsConvertToUCS((__le16 *) pSMB->RequestFileName,
- searchName, PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) pSMB->RequestFileName,
+ searchName, PATH_MAX, nls_codepage,
+ remap);
name_len++; /* trailing null */
name_len *= 2;
} else { /* BB improve the check for buffer overruns BB */
@@ -5506,8 +5530,8 @@ SetEOFRetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- cifsConvertToUCS((__le16 *) pSMB->FileName, fileName,
- PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) pSMB->FileName, fileName,
+ PATH_MAX, nls_codepage, remap);
name_len++; /* trailing null */
name_len *= 2;
} else { /* BB improve the check for buffer overruns BB */
@@ -5638,7 +5662,7 @@ CIFSSMBSetFileSize(const int xid, struct cifs_tcon *tcon, __u64 size,
pSMB->Reserved4 = 0;
inc_rfc1001_len(pSMB, byte_count);
pSMB->ByteCount = cpu_to_le16(byte_count);
- rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
+ rc = SendReceiveNoRsp(xid, tcon->ses, (char *) pSMB, 0);
if (rc) {
cFYI(1, "Send error in SetFileInfo (SetFileSize) = %d", rc);
}
@@ -5682,7 +5706,8 @@ CIFSSMBSetFileInfo(const int xid, struct cifs_tcon *tcon,
param_offset = offsetof(struct smb_com_transaction2_sfi_req, Fid) - 4;
offset = param_offset + params;
- data_offset = (char *) (&pSMB->hdr.Protocol) + offset;
+ data_offset = (char *)pSMB +
+ offsetof(struct smb_hdr, Protocol) + offset;
count = sizeof(FILE_BASIC_INFO);
pSMB->MaxParameterCount = cpu_to_le16(2);
@@ -5707,7 +5732,7 @@ CIFSSMBSetFileInfo(const int xid, struct cifs_tcon *tcon,
inc_rfc1001_len(pSMB, byte_count);
pSMB->ByteCount = cpu_to_le16(byte_count);
memcpy(data_offset, data, sizeof(FILE_BASIC_INFO));
- rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
+ rc = SendReceiveNoRsp(xid, tcon->ses, (char *) pSMB, 0);
if (rc)
cFYI(1, "Send error in Set Time (SetFileInfo) = %d", rc);
@@ -5766,7 +5791,7 @@ CIFSSMBSetFileDisposition(const int xid, struct cifs_tcon *tcon,
inc_rfc1001_len(pSMB, byte_count);
pSMB->ByteCount = cpu_to_le16(byte_count);
*data_offset = delete_file ? 1 : 0;
- rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
+ rc = SendReceiveNoRsp(xid, tcon->ses, (char *) pSMB, 0);
if (rc)
cFYI(1, "Send error in SetFileDisposition = %d", rc);
@@ -5796,8 +5821,8 @@ SetTimesRetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- cifsConvertToUCS((__le16 *) pSMB->FileName, fileName,
- PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) pSMB->FileName, fileName,
+ PATH_MAX, nls_codepage, remap);
name_len++; /* trailing null */
name_len *= 2;
} else { /* BB improve the check for buffer overruns BB */
@@ -5877,8 +5902,8 @@ SetAttrLgcyRetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- ConvertToUCS((__le16 *) pSMB->fileName, fileName,
- PATH_MAX, nls_codepage);
+ ConvertToUTF16((__le16 *) pSMB->fileName, fileName,
+ PATH_MAX, nls_codepage);
name_len++; /* trailing null */
name_len *= 2;
} else { /* BB improve the check for buffer overruns BB */
@@ -5951,7 +5976,7 @@ CIFSSMBUnixSetFileInfo(const int xid, struct cifs_tcon *tcon,
u16 fid, u32 pid_of_opener)
{
struct smb_com_transaction2_sfi_req *pSMB = NULL;
- FILE_UNIX_BASIC_INFO *data_offset;
+ char *data_offset;
int rc = 0;
u16 params, param_offset, offset, byte_count, count;
@@ -5973,8 +5998,9 @@ CIFSSMBUnixSetFileInfo(const int xid, struct cifs_tcon *tcon,
param_offset = offsetof(struct smb_com_transaction2_sfi_req, Fid) - 4;
offset = param_offset + params;
- data_offset = (FILE_UNIX_BASIC_INFO *)
- ((char *)(&pSMB->hdr.Protocol) + offset);
+ data_offset = (char *)pSMB +
+ offsetof(struct smb_hdr, Protocol) + offset;
+
count = sizeof(FILE_UNIX_BASIC_INFO);
pSMB->MaxParameterCount = cpu_to_le16(2);
@@ -5996,9 +6022,9 @@ CIFSSMBUnixSetFileInfo(const int xid, struct cifs_tcon *tcon,
inc_rfc1001_len(pSMB, byte_count);
pSMB->ByteCount = cpu_to_le16(byte_count);
- cifs_fill_unix_set_info(data_offset, args);
+ cifs_fill_unix_set_info((FILE_UNIX_BASIC_INFO *)data_offset, args);
- rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
+ rc = SendReceiveNoRsp(xid, tcon->ses, (char *) pSMB, 0);
if (rc)
cFYI(1, "Send error in Set Time (SetFileInfo) = %d", rc);
@@ -6030,8 +6056,8 @@ setPermsRetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- cifsConvertToUCS((__le16 *) pSMB->FileName, fileName,
- PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) pSMB->FileName, fileName,
+ PATH_MAX, nls_codepage, remap);
name_len++; /* trailing null */
name_len *= 2;
} else { /* BB improve the check for buffer overruns BB */
@@ -6123,8 +6149,8 @@ QAllEAsRetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
list_len =
- cifsConvertToUCS((__le16 *) pSMB->FileName, searchName,
- PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) pSMB->FileName, searchName,
+ PATH_MAX, nls_codepage, remap);
list_len++; /* trailing null */
list_len *= 2;
} else { /* BB improve the check for buffer overruns BB */
@@ -6301,8 +6327,8 @@ SetEARetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- cifsConvertToUCS((__le16 *) pSMB->FileName, fileName,
- PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) pSMB->FileName, fileName,
+ PATH_MAX, nls_codepage, remap);
name_len++; /* trailing null */
name_len *= 2;
} else { /* BB improve the check for buffer overruns BB */
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 4666780f315..d81e933a796 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -38,7 +38,10 @@
#include <asm/processor.h>
#include <linux/inet.h>
#include <linux/module.h>
+#include <keys/user-type.h>
#include <net/ipv6.h>
+#include <linux/parser.h>
+
#include "cifspdu.h"
#include "cifsglob.h"
#include "cifsproto.h"
@@ -62,6 +65,193 @@ extern mempool_t *cifs_req_poolp;
#define TLINK_ERROR_EXPIRE (1 * HZ)
#define TLINK_IDLE_EXPIRE (600 * HZ)
+enum {
+
+ /* Mount options that take no arguments */
+ Opt_user_xattr, Opt_nouser_xattr,
+ Opt_forceuid, Opt_noforceuid,
+ Opt_noblocksend, Opt_noautotune,
+ Opt_hard, Opt_soft, Opt_perm, Opt_noperm,
+ Opt_mapchars, Opt_nomapchars, Opt_sfu,
+ Opt_nosfu, Opt_nodfs, Opt_posixpaths,
+ Opt_noposixpaths, Opt_nounix,
+ Opt_nocase,
+ Opt_brl, Opt_nobrl,
+ Opt_forcemandatorylock, Opt_setuids,
+ Opt_nosetuids, Opt_dynperm, Opt_nodynperm,
+ Opt_nohard, Opt_nosoft,
+ Opt_nointr, Opt_intr,
+ Opt_nostrictsync, Opt_strictsync,
+ Opt_serverino, Opt_noserverino,
+ Opt_rwpidforward, Opt_cifsacl, Opt_nocifsacl,
+ Opt_acl, Opt_noacl, Opt_locallease,
+ Opt_sign, Opt_seal, Opt_direct,
+ Opt_strictcache, Opt_noac,
+ Opt_fsc, Opt_mfsymlinks,
+ Opt_multiuser, Opt_sloppy,
+
+ /* Mount options which take numeric value */
+ Opt_backupuid, Opt_backupgid, Opt_uid,
+ Opt_cruid, Opt_gid, Opt_file_mode,
+ Opt_dirmode, Opt_port,
+ Opt_rsize, Opt_wsize, Opt_actimeo,
+
+ /* Mount options which take string value */
+ Opt_user, Opt_pass, Opt_ip,
+ Opt_unc, Opt_domain,
+ Opt_srcaddr, Opt_prefixpath,
+ Opt_iocharset, Opt_sockopt,
+ Opt_netbiosname, Opt_servern,
+ Opt_ver, Opt_sec,
+
+ /* Mount options to be ignored */
+ Opt_ignore,
+
+ /* Options which could be blank */
+ Opt_blank_pass,
+
+ Opt_err
+};
+
+static const match_table_t cifs_mount_option_tokens = {
+
+ { Opt_user_xattr, "user_xattr" },
+ { Opt_nouser_xattr, "nouser_xattr" },
+ { Opt_forceuid, "forceuid" },
+ { Opt_noforceuid, "noforceuid" },
+ { Opt_noblocksend, "noblocksend" },
+ { Opt_noautotune, "noautotune" },
+ { Opt_hard, "hard" },
+ { Opt_soft, "soft" },
+ { Opt_perm, "perm" },
+ { Opt_noperm, "noperm" },
+ { Opt_mapchars, "mapchars" },
+ { Opt_nomapchars, "nomapchars" },
+ { Opt_sfu, "sfu" },
+ { Opt_nosfu, "nosfu" },
+ { Opt_nodfs, "nodfs" },
+ { Opt_posixpaths, "posixpaths" },
+ { Opt_noposixpaths, "noposixpaths" },
+ { Opt_nounix, "nounix" },
+ { Opt_nounix, "nolinux" },
+ { Opt_nocase, "nocase" },
+ { Opt_nocase, "ignorecase" },
+ { Opt_brl, "brl" },
+ { Opt_nobrl, "nobrl" },
+ { Opt_nobrl, "nolock" },
+ { Opt_forcemandatorylock, "forcemandatorylock" },
+ { Opt_forcemandatorylock, "forcemand" },
+ { Opt_setuids, "setuids" },
+ { Opt_nosetuids, "nosetuids" },
+ { Opt_dynperm, "dynperm" },
+ { Opt_nodynperm, "nodynperm" },
+ { Opt_nohard, "nohard" },
+ { Opt_nosoft, "nosoft" },
+ { Opt_nointr, "nointr" },
+ { Opt_intr, "intr" },
+ { Opt_nostrictsync, "nostrictsync" },
+ { Opt_strictsync, "strictsync" },
+ { Opt_serverino, "serverino" },
+ { Opt_noserverino, "noserverino" },
+ { Opt_rwpidforward, "rwpidforward" },
+ { Opt_cifsacl, "cifsacl" },
+ { Opt_nocifsacl, "nocifsacl" },
+ { Opt_acl, "acl" },
+ { Opt_noacl, "noacl" },
+ { Opt_locallease, "locallease" },
+ { Opt_sign, "sign" },
+ { Opt_seal, "seal" },
+ { Opt_direct, "direct" },
+ { Opt_direct, "forceddirectio" },
+ { Opt_strictcache, "strictcache" },
+ { Opt_noac, "noac" },
+ { Opt_fsc, "fsc" },
+ { Opt_mfsymlinks, "mfsymlinks" },
+ { Opt_multiuser, "multiuser" },
+ { Opt_sloppy, "sloppy" },
+
+ { Opt_backupuid, "backupuid=%s" },
+ { Opt_backupgid, "backupgid=%s" },
+ { Opt_uid, "uid=%s" },
+ { Opt_cruid, "cruid=%s" },
+ { Opt_gid, "gid=%s" },
+ { Opt_file_mode, "file_mode=%s" },
+ { Opt_dirmode, "dirmode=%s" },
+ { Opt_dirmode, "dir_mode=%s" },
+ { Opt_port, "port=%s" },
+ { Opt_rsize, "rsize=%s" },
+ { Opt_wsize, "wsize=%s" },
+ { Opt_actimeo, "actimeo=%s" },
+
+ { Opt_user, "user=%s" },
+ { Opt_user, "username=%s" },
+ { Opt_blank_pass, "pass=" },
+ { Opt_pass, "pass=%s" },
+ { Opt_pass, "password=%s" },
+ { Opt_ip, "ip=%s" },
+ { Opt_ip, "addr=%s" },
+ { Opt_unc, "unc=%s" },
+ { Opt_unc, "target=%s" },
+ { Opt_unc, "path=%s" },
+ { Opt_domain, "dom=%s" },
+ { Opt_domain, "domain=%s" },
+ { Opt_domain, "workgroup=%s" },
+ { Opt_srcaddr, "srcaddr=%s" },
+ { Opt_prefixpath, "prefixpath=%s" },
+ { Opt_iocharset, "iocharset=%s" },
+ { Opt_sockopt, "sockopt=%s" },
+ { Opt_netbiosname, "netbiosname=%s" },
+ { Opt_servern, "servern=%s" },
+ { Opt_ver, "ver=%s" },
+ { Opt_ver, "vers=%s" },
+ { Opt_ver, "version=%s" },
+ { Opt_sec, "sec=%s" },
+
+ { Opt_ignore, "cred" },
+ { Opt_ignore, "credentials" },
+ { Opt_ignore, "guest" },
+ { Opt_ignore, "rw" },
+ { Opt_ignore, "ro" },
+ { Opt_ignore, "suid" },
+ { Opt_ignore, "nosuid" },
+ { Opt_ignore, "exec" },
+ { Opt_ignore, "noexec" },
+ { Opt_ignore, "nodev" },
+ { Opt_ignore, "noauto" },
+ { Opt_ignore, "dev" },
+ { Opt_ignore, "mand" },
+ { Opt_ignore, "nomand" },
+ { Opt_ignore, "_netdev" },
+
+ { Opt_err, NULL }
+};
+
+enum {
+ Opt_sec_krb5, Opt_sec_krb5i, Opt_sec_krb5p,
+ Opt_sec_ntlmsspi, Opt_sec_ntlmssp,
+ Opt_ntlm, Opt_sec_ntlmi, Opt_sec_ntlmv2i,
+ Opt_sec_nontlm, Opt_sec_lanman,
+ Opt_sec_none,
+
+ Opt_sec_err
+};
+
+static const match_table_t cifs_secflavor_tokens = {
+ { Opt_sec_krb5, "krb5" },
+ { Opt_sec_krb5i, "krb5i" },
+ { Opt_sec_krb5p, "krb5p" },
+ { Opt_sec_ntlmsspi, "ntlmsspi" },
+ { Opt_sec_ntlmssp, "ntlmssp" },
+ { Opt_ntlm, "ntlm" },
+ { Opt_sec_ntlmi, "ntlmi" },
+ { Opt_sec_ntlmv2i, "ntlmv2i" },
+ { Opt_sec_nontlm, "nontlm" },
+ { Opt_sec_lanman, "lanman" },
+ { Opt_sec_none, "none" },
+
+ { Opt_sec_err, NULL }
+};
+
static int ip_connect(struct TCP_Server_Info *server);
static int generic_ip_connect(struct TCP_Server_Info *server);
static void tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink);
@@ -142,8 +332,8 @@ cifs_reconnect(struct TCP_Server_Info *server)
spin_lock(&GlobalMid_Lock);
list_for_each_safe(tmp, tmp2, &server->pending_mid_q) {
mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
- if (mid_entry->midState == MID_REQUEST_SUBMITTED)
- mid_entry->midState = MID_RETRY_NEEDED;
+ if (mid_entry->mid_state == MID_REQUEST_SUBMITTED)
+ mid_entry->mid_state = MID_RETRY_NEEDED;
list_move(&mid_entry->qhead, &retry_list);
}
spin_unlock(&GlobalMid_Lock);
@@ -182,8 +372,9 @@ cifs_reconnect(struct TCP_Server_Info *server)
-EINVAL = invalid transact2
*/
-static int check2ndT2(struct smb_hdr *pSMB)
+static int check2ndT2(char *buf)
{
+ struct smb_hdr *pSMB = (struct smb_hdr *)buf;
struct smb_t2_rsp *pSMBt;
int remaining;
__u16 total_data_size, data_in_this_rsp;
@@ -223,76 +414,92 @@ static int check2ndT2(struct smb_hdr *pSMB)
return remaining;
}
-static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB)
+static int coalesce_t2(char *second_buf, struct smb_hdr *target_hdr)
{
- struct smb_t2_rsp *pSMB2 = (struct smb_t2_rsp *)psecond;
- struct smb_t2_rsp *pSMBt = (struct smb_t2_rsp *)pTargetSMB;
- char *data_area_of_target;
- char *data_area_of_buf2;
+ struct smb_t2_rsp *pSMBs = (struct smb_t2_rsp *)second_buf;
+ struct smb_t2_rsp *pSMBt = (struct smb_t2_rsp *)target_hdr;
+ char *data_area_of_tgt;
+ char *data_area_of_src;
int remaining;
- unsigned int byte_count, total_in_buf;
- __u16 total_data_size, total_in_buf2;
+ unsigned int byte_count, total_in_tgt;
+ __u16 tgt_total_cnt, src_total_cnt, total_in_src;
- total_data_size = get_unaligned_le16(&pSMBt->t2_rsp.TotalDataCount);
+ src_total_cnt = get_unaligned_le16(&pSMBs->t2_rsp.TotalDataCount);
+ tgt_total_cnt = get_unaligned_le16(&pSMBt->t2_rsp.TotalDataCount);
- if (total_data_size !=
- get_unaligned_le16(&pSMB2->t2_rsp.TotalDataCount))
- cFYI(1, "total data size of primary and secondary t2 differ");
+ if (tgt_total_cnt != src_total_cnt)
+ cFYI(1, "total data count of primary and secondary t2 differ "
+ "source=%hu target=%hu", src_total_cnt, tgt_total_cnt);
- total_in_buf = get_unaligned_le16(&pSMBt->t2_rsp.DataCount);
+ total_in_tgt = get_unaligned_le16(&pSMBt->t2_rsp.DataCount);
- remaining = total_data_size - total_in_buf;
+ remaining = tgt_total_cnt - total_in_tgt;
- if (remaining < 0)
+ if (remaining < 0) {
+ cFYI(1, "Server sent too much data. tgt_total_cnt=%hu "
+ "total_in_tgt=%hu", tgt_total_cnt, total_in_tgt);
return -EPROTO;
+ }
- if (remaining == 0) /* nothing to do, ignore */
+ if (remaining == 0) {
+ /* nothing to do, ignore */
+ cFYI(1, "no more data remains");
return 0;
+ }
- total_in_buf2 = get_unaligned_le16(&pSMB2->t2_rsp.DataCount);
- if (remaining < total_in_buf2) {
+ total_in_src = get_unaligned_le16(&pSMBs->t2_rsp.DataCount);
+ if (remaining < total_in_src)
cFYI(1, "transact2 2nd response contains too much data");
- }
/* find end of first SMB data area */
- data_area_of_target = (char *)&pSMBt->hdr.Protocol +
+ data_area_of_tgt = (char *)&pSMBt->hdr.Protocol +
get_unaligned_le16(&pSMBt->t2_rsp.DataOffset);
- /* validate target area */
- data_area_of_buf2 = (char *)&pSMB2->hdr.Protocol +
- get_unaligned_le16(&pSMB2->t2_rsp.DataOffset);
+ /* validate target area */
+ data_area_of_src = (char *)&pSMBs->hdr.Protocol +
+ get_unaligned_le16(&pSMBs->t2_rsp.DataOffset);
- data_area_of_target += total_in_buf;
+ data_area_of_tgt += total_in_tgt;
- /* copy second buffer into end of first buffer */
- total_in_buf += total_in_buf2;
+ total_in_tgt += total_in_src;
/* is the result too big for the field? */
- if (total_in_buf > USHRT_MAX)
+ if (total_in_tgt > USHRT_MAX) {
+ cFYI(1, "coalesced DataCount too large (%u)", total_in_tgt);
return -EPROTO;
- put_unaligned_le16(total_in_buf, &pSMBt->t2_rsp.DataCount);
+ }
+ put_unaligned_le16(total_in_tgt, &pSMBt->t2_rsp.DataCount);
/* fix up the BCC */
- byte_count = get_bcc(pTargetSMB);
- byte_count += total_in_buf2;
+ byte_count = get_bcc(target_hdr);
+ byte_count += total_in_src;
/* is the result too big for the field? */
- if (byte_count > USHRT_MAX)
+ if (byte_count > USHRT_MAX) {
+ cFYI(1, "coalesced BCC too large (%u)", byte_count);
return -EPROTO;
- put_bcc(byte_count, pTargetSMB);
+ }
+ put_bcc(byte_count, target_hdr);
- byte_count = be32_to_cpu(pTargetSMB->smb_buf_length);
- byte_count += total_in_buf2;
+ byte_count = be32_to_cpu(target_hdr->smb_buf_length);
+ byte_count += total_in_src;
/* don't allow buffer to overflow */
- if (byte_count > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4)
+ if (byte_count > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) {
+ cFYI(1, "coalesced BCC exceeds buffer size (%u)", byte_count);
return -ENOBUFS;
- pTargetSMB->smb_buf_length = cpu_to_be32(byte_count);
+ }
+ target_hdr->smb_buf_length = cpu_to_be32(byte_count);
- memcpy(data_area_of_target, data_area_of_buf2, total_in_buf2);
+ /* copy second buffer into end of first buffer */
+ memcpy(data_area_of_tgt, data_area_of_src, total_in_src);
- if (remaining == total_in_buf2) {
- cFYI(1, "found the last secondary response");
- return 0; /* we are done */
- } else /* more responses to go */
+ if (remaining != total_in_src) {
+ /* more responses to go */
+ cFYI(1, "waiting for more secondary responses");
return 1;
+ }
+
+ /* we are done */
+ cFYI(1, "found the last secondary response");
+ return 0;
}
static void
@@ -317,7 +524,7 @@ cifs_echo_request(struct work_struct *work)
server->hostname);
requeue_echo:
- queue_delayed_work(system_nrt_wq, &server->echo, SMB_ECHO_INTERVAL);
+ queue_delayed_work(cifsiod_wq, &server->echo, SMB_ECHO_INTERVAL);
}
static bool
@@ -333,7 +540,7 @@ allocate_buffers(struct TCP_Server_Info *server)
}
} else if (server->large_buf) {
/* we are reusing a dirty large buf, clear its start */
- memset(server->bigbuf, 0, sizeof(struct smb_hdr));
+ memset(server->bigbuf, 0, header_size());
}
if (!server->smallbuf) {
@@ -347,7 +554,7 @@ allocate_buffers(struct TCP_Server_Info *server)
/* beginning of smb buffer is cleared in our buf_get */
} else {
/* if existing small buf clear beginning */
- memset(server->smallbuf, 0, sizeof(struct smb_hdr));
+ memset(server->smallbuf, 0, header_size());
}
return true;
@@ -356,12 +563,22 @@ allocate_buffers(struct TCP_Server_Info *server)
static bool
server_unresponsive(struct TCP_Server_Info *server)
{
- if (echo_retries > 0 && server->tcpStatus == CifsGood &&
- time_after(jiffies, server->lstrp +
- (echo_retries * SMB_ECHO_INTERVAL))) {
+ /*
+ * We need to wait 2 echo intervals to make sure we handle such
+ * situations right:
+ * 1s client sends a normal SMB request
+ * 2s client gets a response
+ * 30s echo workqueue job pops, and decides we got a response recently
+ * and don't need to send another
+ * ...
+ * 65s kernel_recvmsg times out, and we see that we haven't gotten
+ * a response in >60s.
+ */
+ if (server->tcpStatus == CifsGood &&
+ time_after(jiffies, server->lstrp + 2 * SMB_ECHO_INTERVAL)) {
cERROR(1, "Server %s has not responded in %d seconds. "
"Reconnecting...", server->hostname,
- (echo_retries * SMB_ECHO_INTERVAL / HZ));
+ (2 * SMB_ECHO_INTERVAL) / HZ);
cifs_reconnect(server);
wake_up(&server->response_q);
return true;
@@ -539,15 +756,16 @@ is_smb_response(struct TCP_Server_Info *server, unsigned char type)
}
static struct mid_q_entry *
-find_mid(struct TCP_Server_Info *server, struct smb_hdr *buf)
+find_mid(struct TCP_Server_Info *server, char *buffer)
{
+ struct smb_hdr *buf = (struct smb_hdr *)buffer;
struct mid_q_entry *mid;
spin_lock(&GlobalMid_Lock);
list_for_each_entry(mid, &server->pending_mid_q, qhead) {
if (mid->mid == buf->Mid &&
- mid->midState == MID_REQUEST_SUBMITTED &&
- mid->command == buf->Command) {
+ mid->mid_state == MID_REQUEST_SUBMITTED &&
+ le16_to_cpu(mid->command) == buf->Command) {
spin_unlock(&GlobalMid_Lock);
return mid;
}
@@ -564,16 +782,16 @@ dequeue_mid(struct mid_q_entry *mid, bool malformed)
#endif
spin_lock(&GlobalMid_Lock);
if (!malformed)
- mid->midState = MID_RESPONSE_RECEIVED;
+ mid->mid_state = MID_RESPONSE_RECEIVED;
else
- mid->midState = MID_RESPONSE_MALFORMED;
+ mid->mid_state = MID_RESPONSE_MALFORMED;
list_del_init(&mid->qhead);
spin_unlock(&GlobalMid_Lock);
}
static void
handle_mid(struct mid_q_entry *mid, struct TCP_Server_Info *server,
- struct smb_hdr *buf, int malformed)
+ char *buf, int malformed)
{
if (malformed == 0 && check2ndT2(buf) > 0) {
mid->multiRsp = true;
@@ -593,13 +811,13 @@ handle_mid(struct mid_q_entry *mid, struct TCP_Server_Info *server,
} else {
/* Have first buffer */
mid->resp_buf = buf;
- mid->largeBuf = true;
+ mid->large_buf = true;
server->bigbuf = NULL;
}
return;
}
mid->resp_buf = buf;
- mid->largeBuf = server->large_buf;
+ mid->large_buf = server->large_buf;
/* Was previous buf put in mpx struct for multi-rsp? */
if (!mid->multiRsp) {
/* smb buffer will be freed by user thread */
@@ -625,19 +843,11 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server)
spin_unlock(&GlobalMid_Lock);
wake_up_all(&server->response_q);
- /*
- * Check if we have blocked requests that need to free. Note that
- * cifs_max_pending is normally 50, but can be set at module install
- * time to as little as two.
- */
- spin_lock(&GlobalMid_Lock);
- if (atomic_read(&server->inFlight) >= cifs_max_pending)
- atomic_set(&server->inFlight, cifs_max_pending - 1);
- /*
- * We do not want to set the max_pending too low or we could end up
- * with the counter going negative.
- */
- spin_unlock(&GlobalMid_Lock);
+ /* check if we have blocked requests that need to free */
+ spin_lock(&server->req_lock);
+ if (server->credits <= 0)
+ server->credits = 1;
+ spin_unlock(&server->req_lock);
/*
* Although there should not be any requests blocked on this queue it
* can not hurt to be paranoid and try to wake up requests that may
@@ -663,8 +873,8 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server)
spin_lock(&GlobalMid_Lock);
list_for_each_safe(tmp, tmp2, &server->pending_mid_q) {
mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
- cFYI(1, "Clearing mid 0x%x", mid_entry->mid);
- mid_entry->midState = MID_SHUTDOWN;
+ cFYI(1, "Clearing mid 0x%llx", mid_entry->mid);
+ mid_entry->mid_state = MID_SHUTDOWN;
list_move(&mid_entry->qhead, &dispose_list);
}
spin_unlock(&GlobalMid_Lock);
@@ -672,7 +882,7 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server)
/* now walk dispose list and issue callbacks */
list_for_each_safe(tmp, tmp2, &dispose_list) {
mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
- cFYI(1, "Callback mid 0x%x", mid_entry->mid);
+ cFYI(1, "Callback mid 0x%llx", mid_entry->mid);
list_del_init(&mid_entry->qhead);
mid_entry->callback(mid_entry);
}
@@ -712,11 +922,10 @@ standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid)
{
int length;
char *buf = server->smallbuf;
- struct smb_hdr *smb_buffer = (struct smb_hdr *)buf;
- unsigned int pdu_length = be32_to_cpu(smb_buffer->smb_buf_length);
+ unsigned int pdu_length = get_rfc1002_length(buf);
/* make sure this will fit in a large buffer */
- if (pdu_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) {
+ if (pdu_length > CIFSMaxBufSize + max_header_size() - 4) {
cERROR(1, "SMB response too long (%u bytes)",
pdu_length);
cifs_reconnect(server);
@@ -727,20 +936,18 @@ standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid)
/* switch to large buffer if too big for a small one */
if (pdu_length > MAX_CIFS_SMALL_BUFFER_SIZE - 4) {
server->large_buf = true;
- memcpy(server->bigbuf, server->smallbuf, server->total_read);
+ memcpy(server->bigbuf, buf, server->total_read);
buf = server->bigbuf;
- smb_buffer = (struct smb_hdr *)buf;
}
/* now read the rest */
- length = cifs_read_from_socket(server,
- buf + sizeof(struct smb_hdr) - 1,
- pdu_length - sizeof(struct smb_hdr) + 1 + 4);
+ length = cifs_read_from_socket(server, buf + header_size() - 1,
+ pdu_length - header_size() + 1 + 4);
if (length < 0)
return length;
server->total_read += length;
- dump_smb(smb_buffer, server->total_read);
+ dump_smb(buf, server->total_read);
/*
* We know that we received enough to get to the MID as we
@@ -751,15 +958,16 @@ standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid)
* 48 bytes is enough to display the header and a little bit
* into the payload for debugging purposes.
*/
- length = checkSMB(smb_buffer, smb_buffer->Mid, server->total_read);
+ length = checkSMB(buf, server->total_read);
if (length != 0)
cifs_dump_mem("Bad SMB: ", buf,
min_t(unsigned int, server->total_read, 48));
- if (mid)
- handle_mid(mid, server, smb_buffer, length);
+ if (!mid)
+ return length;
- return length;
+ handle_mid(mid, server, buf, length);
+ return 0;
}
static int
@@ -769,7 +977,6 @@ cifs_demultiplex_thread(void *p)
struct TCP_Server_Info *server = p;
unsigned int pdu_length;
char *buf = NULL;
- struct smb_hdr *smb_buffer = NULL;
struct task_struct *task_to_wake = NULL;
struct mid_q_entry *mid_entry;
@@ -790,7 +997,6 @@ cifs_demultiplex_thread(void *p)
continue;
server->large_buf = false;
- smb_buffer = (struct smb_hdr *)server->smallbuf;
buf = server->smallbuf;
pdu_length = 4; /* enough to get RFC1001 header */
@@ -803,14 +1009,14 @@ cifs_demultiplex_thread(void *p)
* The right amount was read from socket - 4 bytes,
* so we can now interpret the length field.
*/
- pdu_length = be32_to_cpu(smb_buffer->smb_buf_length);
+ pdu_length = get_rfc1002_length(buf);
cFYI(1, "RFC1002 header 0x%x", pdu_length);
if (!is_smb_response(server, buf[0]))
continue;
/* make sure we have enough to get to the MID */
- if (pdu_length < sizeof(struct smb_hdr) - 1 - 4) {
+ if (pdu_length < header_size() - 1 - 4) {
cERROR(1, "SMB response too short (%u bytes)",
pdu_length);
cifs_reconnect(server);
@@ -820,12 +1026,12 @@ cifs_demultiplex_thread(void *p)
/* read down to the MID */
length = cifs_read_from_socket(server, buf + 4,
- sizeof(struct smb_hdr) - 1 - 4);
+ header_size() - 1 - 4);
if (length < 0)
continue;
server->total_read += length;
- mid_entry = find_mid(server, smb_buffer);
+ mid_entry = find_mid(server, buf);
if (!mid_entry || !mid_entry->receive)
length = standard_receive3(server, mid_entry);
@@ -835,22 +1041,19 @@ cifs_demultiplex_thread(void *p)
if (length < 0)
continue;
- if (server->large_buf) {
+ if (server->large_buf)
buf = server->bigbuf;
- smb_buffer = (struct smb_hdr *)buf;
- }
server->lstrp = jiffies;
if (mid_entry != NULL) {
if (!mid_entry->multiRsp || mid_entry->multiEnd)
mid_entry->callback(mid_entry);
- } else if (!is_valid_oplock_break(smb_buffer, server)) {
+ } else if (!is_valid_oplock_break(buf, server)) {
cERROR(1, "No task to wake, unknown frame received! "
"NumMids %d", atomic_read(&midCount));
- cifs_dump_mem("Received Data is: ", buf,
- sizeof(struct smb_hdr));
+ cifs_dump_mem("Received Data is: ", buf, header_size());
#ifdef CONFIG_CIFS_DEBUG2
- cifs_dump_detail(smb_buffer);
+ cifs_dump_detail(buf);
cifs_dump_mids(server);
#endif /* CIFS_DEBUG2 */
@@ -906,23 +1109,95 @@ extract_hostname(const char *unc)
return dst;
}
+static int get_option_ul(substring_t args[], unsigned long *option)
+{
+ int rc;
+ char *string;
+
+ string = match_strdup(args);
+ if (string == NULL)
+ return -ENOMEM;
+ rc = kstrtoul(string, 10, option);
+ kfree(string);
+
+ return rc;
+}
+
+
+static int cifs_parse_security_flavors(char *value,
+ struct smb_vol *vol)
+{
+
+ substring_t args[MAX_OPT_ARGS];
+
+ switch (match_token(value, cifs_secflavor_tokens, args)) {
+ case Opt_sec_krb5:
+ vol->secFlg |= CIFSSEC_MAY_KRB5;
+ break;
+ case Opt_sec_krb5i:
+ vol->secFlg |= CIFSSEC_MAY_KRB5 | CIFSSEC_MUST_SIGN;
+ break;
+ case Opt_sec_krb5p:
+ /* vol->secFlg |= CIFSSEC_MUST_SEAL | CIFSSEC_MAY_KRB5; */
+ cERROR(1, "Krb5 cifs privacy not supported");
+ break;
+ case Opt_sec_ntlmssp:
+ vol->secFlg |= CIFSSEC_MAY_NTLMSSP;
+ break;
+ case Opt_sec_ntlmsspi:
+ vol->secFlg |= CIFSSEC_MAY_NTLMSSP | CIFSSEC_MUST_SIGN;
+ break;
+ case Opt_ntlm:
+ /* ntlm is default so can be turned off too */
+ vol->secFlg |= CIFSSEC_MAY_NTLM;
+ break;
+ case Opt_sec_ntlmi:
+ vol->secFlg |= CIFSSEC_MAY_NTLM | CIFSSEC_MUST_SIGN;
+ break;
+ case Opt_sec_nontlm:
+ vol->secFlg |= CIFSSEC_MAY_NTLMV2;
+ break;
+ case Opt_sec_ntlmv2i:
+ vol->secFlg |= CIFSSEC_MAY_NTLMV2 | CIFSSEC_MUST_SIGN;
+ break;
+#ifdef CONFIG_CIFS_WEAK_PW_HASH
+ case Opt_sec_lanman:
+ vol->secFlg |= CIFSSEC_MAY_LANMAN;
+ break;
+#endif
+ case Opt_sec_none:
+ vol->nullauth = 1;
+ break;
+ default:
+ cERROR(1, "bad security option: %s", value);
+ return 1;
+ }
+
+ return 0;
+}
+
static int
cifs_parse_mount_options(const char *mountdata, const char *devname,
struct smb_vol *vol)
{
- char *value, *data, *end;
+ char *data, *end;
char *mountdata_copy = NULL, *options;
- int err;
unsigned int temp_len, i, j;
char separator[2];
short int override_uid = -1;
short int override_gid = -1;
bool uid_specified = false;
bool gid_specified = false;
+ bool sloppy = false;
+ char *invalid = NULL;
char *nodename = utsname()->nodename;
+ char *string = NULL;
+ char *tmp_end, *value;
+ char delim;
separator[0] = ',';
separator[1] = 0;
+ delim = separator[0];
/*
* does not have to be perfect mapping since field is
@@ -961,6 +1236,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
options = mountdata_copy;
end = options + strlen(options);
+
if (strncmp(options, "sep=", 4) == 0) {
if (options[4] != 0) {
separator[0] = options[4];
@@ -973,617 +1249,662 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
vol->backupgid_specified = false; /* no backup intent for a group */
while ((data = strsep(&options, separator)) != NULL) {
+ substring_t args[MAX_OPT_ARGS];
+ unsigned long option;
+ int token;
+
if (!*data)
continue;
- if ((value = strchr(data, '=')) != NULL)
- *value++ = '\0';
- /* Have to parse this before we parse for "user" */
- if (strnicmp(data, "user_xattr", 10) == 0) {
+ token = match_token(data, cifs_mount_option_tokens, args);
+
+ switch (token) {
+
+ /* Ingnore the following */
+ case Opt_ignore:
+ break;
+
+ /* Boolean values */
+ case Opt_user_xattr:
vol->no_xattr = 0;
- } else if (strnicmp(data, "nouser_xattr", 12) == 0) {
+ break;
+ case Opt_nouser_xattr:
vol->no_xattr = 1;
- } else if (strnicmp(data, "user", 4) == 0) {
- if (!value) {
- printk(KERN_WARNING
- "CIFS: invalid or missing username\n");
- goto cifs_parse_mount_err;
- } else if (!*value) {
- /* null user, ie anonymous, authentication */
- vol->nullauth = 1;
- }
- if (strnlen(value, MAX_USERNAME_SIZE) <
- MAX_USERNAME_SIZE) {
- vol->username = kstrdup(value, GFP_KERNEL);
- if (!vol->username) {
- printk(KERN_WARNING "CIFS: no memory "
- "for username\n");
- goto cifs_parse_mount_err;
- }
- } else {
- printk(KERN_WARNING "CIFS: username too long\n");
- goto cifs_parse_mount_err;
- }
- } else if (strnicmp(data, "pass", 4) == 0) {
- if (!value) {
- vol->password = NULL;
- continue;
- } else if (value[0] == 0) {
- /* check if string begins with double comma
- since that would mean the password really
- does start with a comma, and would not
- indicate an empty string */
- if (value[1] != separator[0]) {
- vol->password = NULL;
- continue;
- }
- }
- temp_len = strlen(value);
- /* removed password length check, NTLM passwords
- can be arbitrarily long */
-
- /* if comma in password, the string will be
- prematurely null terminated. Commas in password are
- specified across the cifs mount interface by a double
- comma ie ,, and a comma used as in other cases ie ','
- as a parameter delimiter/separator is single and due
- to the strsep above is temporarily zeroed. */
-
- /* NB: password legally can have multiple commas and
- the only illegal character in a password is null */
-
- if ((value[temp_len] == 0) &&
- (value + temp_len < end) &&
- (value[temp_len+1] == separator[0])) {
- /* reinsert comma */
- value[temp_len] = separator[0];
- temp_len += 2; /* move after second comma */
- while (value[temp_len] != 0) {
- if (value[temp_len] == separator[0]) {
- if (value[temp_len+1] ==
- separator[0]) {
- /* skip second comma */
- temp_len++;
- } else {
- /* single comma indicating start
- of next parm */
- break;
- }
- }
- temp_len++;
- }
- if (value[temp_len] == 0) {
- options = NULL;
- } else {
- value[temp_len] = 0;
- /* point option to start of next parm */
- options = value + temp_len + 1;
- }
- /* go from value to value + temp_len condensing
- double commas to singles. Note that this ends up
- allocating a few bytes too many, which is ok */
- vol->password = kzalloc(temp_len, GFP_KERNEL);
- if (vol->password == NULL) {
- printk(KERN_WARNING "CIFS: no memory "
- "for password\n");
- goto cifs_parse_mount_err;
- }
- for (i = 0, j = 0; i < temp_len; i++, j++) {
- vol->password[j] = value[i];
- if (value[i] == separator[0]
- && value[i+1] == separator[0]) {
- /* skip second comma */
- i++;
- }
- }
- vol->password[j] = 0;
- } else {
- vol->password = kzalloc(temp_len+1, GFP_KERNEL);
- if (vol->password == NULL) {
- printk(KERN_WARNING "CIFS: no memory "
- "for password\n");
- goto cifs_parse_mount_err;
- }
- strcpy(vol->password, value);
- }
- } else if (!strnicmp(data, "ip", 2) ||
- !strnicmp(data, "addr", 4)) {
- if (!value || !*value) {
- vol->UNCip = NULL;
- } else if (strnlen(value, INET6_ADDRSTRLEN) <
- INET6_ADDRSTRLEN) {
- vol->UNCip = kstrdup(value, GFP_KERNEL);
- if (!vol->UNCip) {
- printk(KERN_WARNING "CIFS: no memory "
- "for UNC IP\n");
- goto cifs_parse_mount_err;
- }
- } else {
- printk(KERN_WARNING "CIFS: ip address "
- "too long\n");
- goto cifs_parse_mount_err;
- }
- } else if (strnicmp(data, "sec", 3) == 0) {
- if (!value || !*value) {
- cERROR(1, "no security value specified");
- continue;
- } else if (strnicmp(value, "krb5i", 5) == 0) {
- vol->secFlg |= CIFSSEC_MAY_KRB5 |
- CIFSSEC_MUST_SIGN;
- } else if (strnicmp(value, "krb5p", 5) == 0) {
- /* vol->secFlg |= CIFSSEC_MUST_SEAL |
- CIFSSEC_MAY_KRB5; */
- cERROR(1, "Krb5 cifs privacy not supported");
- goto cifs_parse_mount_err;
- } else if (strnicmp(value, "krb5", 4) == 0) {
- vol->secFlg |= CIFSSEC_MAY_KRB5;
- } else if (strnicmp(value, "ntlmsspi", 8) == 0) {
- vol->secFlg |= CIFSSEC_MAY_NTLMSSP |
- CIFSSEC_MUST_SIGN;
- } else if (strnicmp(value, "ntlmssp", 7) == 0) {
- vol->secFlg |= CIFSSEC_MAY_NTLMSSP;
- } else if (strnicmp(value, "ntlmv2i", 7) == 0) {
- vol->secFlg |= CIFSSEC_MAY_NTLMV2 |
- CIFSSEC_MUST_SIGN;
- } else if (strnicmp(value, "ntlmv2", 6) == 0) {
- vol->secFlg |= CIFSSEC_MAY_NTLMV2;
- } else if (strnicmp(value, "ntlmi", 5) == 0) {
- vol->secFlg |= CIFSSEC_MAY_NTLM |
- CIFSSEC_MUST_SIGN;
- } else if (strnicmp(value, "ntlm", 4) == 0) {
- /* ntlm is default so can be turned off too */
- vol->secFlg |= CIFSSEC_MAY_NTLM;
- } else if (strnicmp(value, "nontlm", 6) == 0) {
- /* BB is there a better way to do this? */
- vol->secFlg |= CIFSSEC_MAY_NTLMV2;
-#ifdef CONFIG_CIFS_WEAK_PW_HASH
- } else if (strnicmp(value, "lanman", 6) == 0) {
- vol->secFlg |= CIFSSEC_MAY_LANMAN;
-#endif
- } else if (strnicmp(value, "none", 4) == 0) {
- vol->nullauth = 1;
- } else {
- cERROR(1, "bad security option: %s", value);
- goto cifs_parse_mount_err;
- }
- } else if (strnicmp(data, "vers", 3) == 0) {
- if (!value || !*value) {
- cERROR(1, "no protocol version specified"
- " after vers= mount option");
- } else if ((strnicmp(value, "cifs", 4) == 0) ||
- (strnicmp(value, "1", 1) == 0)) {
- /* this is the default */
- continue;
- }
- } else if ((strnicmp(data, "unc", 3) == 0)
- || (strnicmp(data, "target", 6) == 0)
- || (strnicmp(data, "path", 4) == 0)) {
- if (!value || !*value) {
- printk(KERN_WARNING "CIFS: invalid path to "
- "network resource\n");
- goto cifs_parse_mount_err;
- }
- if ((temp_len = strnlen(value, 300)) < 300) {
- vol->UNC = kmalloc(temp_len+1, GFP_KERNEL);
- if (vol->UNC == NULL)
- goto cifs_parse_mount_err;
- strcpy(vol->UNC, value);
- if (strncmp(vol->UNC, "//", 2) == 0) {
- vol->UNC[0] = '\\';
- vol->UNC[1] = '\\';
- } else if (strncmp(vol->UNC, "\\\\", 2) != 0) {
- printk(KERN_WARNING
- "CIFS: UNC Path does not begin "
- "with // or \\\\ \n");
- goto cifs_parse_mount_err;
- }
- } else {
- printk(KERN_WARNING "CIFS: UNC name too long\n");
- goto cifs_parse_mount_err;
- }
- } else if ((strnicmp(data, "domain", 3) == 0)
- || (strnicmp(data, "workgroup", 5) == 0)) {
- if (!value || !*value) {
- printk(KERN_WARNING "CIFS: invalid domain name\n");
- goto cifs_parse_mount_err;
- }
- /* BB are there cases in which a comma can be valid in
- a domain name and need special handling? */
- if (strnlen(value, 256) < 256) {
- vol->domainname = kstrdup(value, GFP_KERNEL);
- if (!vol->domainname) {
- printk(KERN_WARNING "CIFS: no memory "
- "for domainname\n");
- goto cifs_parse_mount_err;
- }
- cFYI(1, "Domain name set");
- } else {
- printk(KERN_WARNING "CIFS: domain name too "
- "long\n");
- goto cifs_parse_mount_err;
- }
- } else if (strnicmp(data, "srcaddr", 7) == 0) {
- vol->srcaddr.ss_family = AF_UNSPEC;
-
- if (!value || !*value) {
- printk(KERN_WARNING "CIFS: srcaddr value"
- " not specified.\n");
- goto cifs_parse_mount_err;
- }
- i = cifs_convert_address((struct sockaddr *)&vol->srcaddr,
- value, strlen(value));
- if (i == 0) {
- printk(KERN_WARNING "CIFS: Could not parse"
- " srcaddr: %s\n",
- value);
- goto cifs_parse_mount_err;
- }
- } else if (strnicmp(data, "prefixpath", 10) == 0) {
- if (!value || !*value) {
- printk(KERN_WARNING
- "CIFS: invalid path prefix\n");
- goto cifs_parse_mount_err;
- }
- if ((temp_len = strnlen(value, 1024)) < 1024) {
- if (value[0] != '/')
- temp_len++; /* missing leading slash */
- vol->prepath = kmalloc(temp_len+1, GFP_KERNEL);
- if (vol->prepath == NULL)
- goto cifs_parse_mount_err;
- if (value[0] != '/') {
- vol->prepath[0] = '/';
- strcpy(vol->prepath+1, value);
- } else
- strcpy(vol->prepath, value);
- cFYI(1, "prefix path %s", vol->prepath);
- } else {
- printk(KERN_WARNING "CIFS: prefix too long\n");
- goto cifs_parse_mount_err;
- }
- } else if (strnicmp(data, "iocharset", 9) == 0) {
- if (!value || !*value) {
- printk(KERN_WARNING "CIFS: invalid iocharset "
- "specified\n");
- goto cifs_parse_mount_err;
- }
- if (strnlen(value, 65) < 65) {
- if (strnicmp(value, "default", 7)) {
- vol->iocharset = kstrdup(value,
- GFP_KERNEL);
-
- if (!vol->iocharset) {
- printk(KERN_WARNING "CIFS: no "
- "memory for"
- "charset\n");
- goto cifs_parse_mount_err;
- }
- }
- /* if iocharset not set then load_nls_default
- is used by caller */
- cFYI(1, "iocharset set to %s", value);
- } else {
- printk(KERN_WARNING "CIFS: iocharset name "
- "too long.\n");
- goto cifs_parse_mount_err;
- }
- } else if (!strnicmp(data, "uid", 3) && value && *value) {
- vol->linux_uid = simple_strtoul(value, &value, 0);
- uid_specified = true;
- } else if (!strnicmp(data, "cruid", 5) && value && *value) {
- vol->cred_uid = simple_strtoul(value, &value, 0);
- } else if (!strnicmp(data, "forceuid", 8)) {
+ break;
+ case Opt_forceuid:
override_uid = 1;
- } else if (!strnicmp(data, "noforceuid", 10)) {
+ break;
+ case Opt_noforceuid:
override_uid = 0;
- } else if (!strnicmp(data, "gid", 3) && value && *value) {
- vol->linux_gid = simple_strtoul(value, &value, 0);
- gid_specified = true;
- } else if (!strnicmp(data, "forcegid", 8)) {
- override_gid = 1;
- } else if (!strnicmp(data, "noforcegid", 10)) {
- override_gid = 0;
- } else if (strnicmp(data, "file_mode", 4) == 0) {
- if (value && *value) {
- vol->file_mode =
- simple_strtoul(value, &value, 0);
- }
- } else if (strnicmp(data, "dir_mode", 4) == 0) {
- if (value && *value) {
- vol->dir_mode =
- simple_strtoul(value, &value, 0);
- }
- } else if (strnicmp(data, "dirmode", 4) == 0) {
- if (value && *value) {
- vol->dir_mode =
- simple_strtoul(value, &value, 0);
- }
- } else if (strnicmp(data, "port", 4) == 0) {
- if (value && *value) {
- vol->port =
- simple_strtoul(value, &value, 0);
- }
- } else if (strnicmp(data, "rsize", 5) == 0) {
- if (value && *value) {
- vol->rsize =
- simple_strtoul(value, &value, 0);
- }
- } else if (strnicmp(data, "wsize", 5) == 0) {
- if (value && *value) {
- vol->wsize =
- simple_strtoul(value, &value, 0);
- }
- } else if (strnicmp(data, "sockopt", 5) == 0) {
- if (!value || !*value) {
- cERROR(1, "no socket option specified");
- continue;
- } else if (strnicmp(value, "TCP_NODELAY", 11) == 0) {
- vol->sockopt_tcp_nodelay = 1;
- }
- } else if (strnicmp(data, "netbiosname", 4) == 0) {
- if (!value || !*value || (*value == ' ')) {
- cFYI(1, "invalid (empty) netbiosname");
- } else {
- memset(vol->source_rfc1001_name, 0x20,
- RFC1001_NAME_LEN);
- /*
- * FIXME: are there cases in which a comma can
- * be valid in workstation netbios name (and
- * need special handling)?
- */
- for (i = 0; i < RFC1001_NAME_LEN; i++) {
- /* don't ucase netbiosname for user */
- if (value[i] == 0)
- break;
- vol->source_rfc1001_name[i] = value[i];
- }
- /* The string has 16th byte zero still from
- set at top of the function */
- if (i == RFC1001_NAME_LEN && value[i] != 0)
- printk(KERN_WARNING "CIFS: netbiosname"
- " longer than 15 truncated.\n");
- }
- } else if (strnicmp(data, "servern", 7) == 0) {
- /* servernetbiosname specified override *SMBSERVER */
- if (!value || !*value || (*value == ' ')) {
- cFYI(1, "empty server netbiosname specified");
- } else {
- /* last byte, type, is 0x20 for servr type */
- memset(vol->target_rfc1001_name, 0x20,
- RFC1001_NAME_LEN_WITH_NULL);
-
- for (i = 0; i < 15; i++) {
- /* BB are there cases in which a comma can be
- valid in this workstation netbios name
- (and need special handling)? */
-
- /* user or mount helper must uppercase
- the netbiosname */
- if (value[i] == 0)
- break;
- else
- vol->target_rfc1001_name[i] =
- value[i];
- }
- /* The string has 16th byte zero still from
- set at top of the function */
- if (i == RFC1001_NAME_LEN && value[i] != 0)
- printk(KERN_WARNING "CIFS: server net"
- "biosname longer than 15 truncated.\n");
- }
- } else if (strnicmp(data, "actimeo", 7) == 0) {
- if (value && *value) {
- vol->actimeo = HZ * simple_strtoul(value,
- &value, 0);
- if (vol->actimeo > CIFS_MAX_ACTIMEO) {
- cERROR(1, "CIFS: attribute cache"
- "timeout too large");
- goto cifs_parse_mount_err;
- }
- }
- } else if (strnicmp(data, "credentials", 4) == 0) {
- /* ignore */
- } else if (strnicmp(data, "version", 3) == 0) {
- /* ignore */
- } else if (strnicmp(data, "guest", 5) == 0) {
- /* ignore */
- } else if (strnicmp(data, "rw", 2) == 0 && strlen(data) == 2) {
- /* ignore */
- } else if (strnicmp(data, "ro", 2) == 0) {
- /* ignore */
- } else if (strnicmp(data, "noblocksend", 11) == 0) {
+ break;
+ case Opt_noblocksend:
vol->noblocksnd = 1;
- } else if (strnicmp(data, "noautotune", 10) == 0) {
+ break;
+ case Opt_noautotune:
vol->noautotune = 1;
- } else if ((strnicmp(data, "suid", 4) == 0) ||
- (strnicmp(data, "nosuid", 6) == 0) ||
- (strnicmp(data, "exec", 4) == 0) ||
- (strnicmp(data, "noexec", 6) == 0) ||
- (strnicmp(data, "nodev", 5) == 0) ||
- (strnicmp(data, "noauto", 6) == 0) ||
- (strnicmp(data, "dev", 3) == 0)) {
- /* The mount tool or mount.cifs helper (if present)
- uses these opts to set flags, and the flags are read
- by the kernel vfs layer before we get here (ie
- before read super) so there is no point trying to
- parse these options again and set anything and it
- is ok to just ignore them */
- continue;
- } else if (strnicmp(data, "hard", 4) == 0) {
+ break;
+ case Opt_hard:
vol->retry = 1;
- } else if (strnicmp(data, "soft", 4) == 0) {
+ break;
+ case Opt_soft:
vol->retry = 0;
- } else if (strnicmp(data, "perm", 4) == 0) {
+ break;
+ case Opt_perm:
vol->noperm = 0;
- } else if (strnicmp(data, "noperm", 6) == 0) {
+ break;
+ case Opt_noperm:
vol->noperm = 1;
- } else if (strnicmp(data, "mapchars", 8) == 0) {
+ break;
+ case Opt_mapchars:
vol->remap = 1;
- } else if (strnicmp(data, "nomapchars", 10) == 0) {
+ break;
+ case Opt_nomapchars:
vol->remap = 0;
- } else if (strnicmp(data, "sfu", 3) == 0) {
+ break;
+ case Opt_sfu:
vol->sfu_emul = 1;
- } else if (strnicmp(data, "nosfu", 5) == 0) {
+ break;
+ case Opt_nosfu:
vol->sfu_emul = 0;
- } else if (strnicmp(data, "nodfs", 5) == 0) {
+ break;
+ case Opt_nodfs:
vol->nodfs = 1;
- } else if (strnicmp(data, "posixpaths", 10) == 0) {
+ break;
+ case Opt_posixpaths:
vol->posix_paths = 1;
- } else if (strnicmp(data, "noposixpaths", 12) == 0) {
+ break;
+ case Opt_noposixpaths:
vol->posix_paths = 0;
- } else if (strnicmp(data, "nounix", 6) == 0) {
- vol->no_linux_ext = 1;
- } else if (strnicmp(data, "nolinux", 7) == 0) {
+ break;
+ case Opt_nounix:
vol->no_linux_ext = 1;
- } else if ((strnicmp(data, "nocase", 6) == 0) ||
- (strnicmp(data, "ignorecase", 10) == 0)) {
+ break;
+ case Opt_nocase:
vol->nocase = 1;
- } else if (strnicmp(data, "mand", 4) == 0) {
- /* ignore */
- } else if (strnicmp(data, "nomand", 6) == 0) {
- /* ignore */
- } else if (strnicmp(data, "_netdev", 7) == 0) {
- /* ignore */
- } else if (strnicmp(data, "brl", 3) == 0) {
+ break;
+ case Opt_brl:
vol->nobrl = 0;
- } else if ((strnicmp(data, "nobrl", 5) == 0) ||
- (strnicmp(data, "nolock", 6) == 0)) {
+ break;
+ case Opt_nobrl:
vol->nobrl = 1;
- /* turn off mandatory locking in mode
- if remote locking is turned off since the
- local vfs will do advisory */
+ /*
+ * turn off mandatory locking in mode
+ * if remote locking is turned off since the
+ * local vfs will do advisory
+ */
if (vol->file_mode ==
(S_IALLUGO & ~(S_ISUID | S_IXGRP)))
vol->file_mode = S_IALLUGO;
- } else if (strnicmp(data, "forcemandatorylock", 9) == 0) {
- /* will take the shorter form "forcemand" as well */
- /* This mount option will force use of mandatory
- (DOS/Windows style) byte range locks, instead of
- using posix advisory byte range locks, even if the
- Unix extensions are available and posix locks would
- be supported otherwise. If Unix extensions are not
- negotiated this has no effect since mandatory locks
- would be used (mandatory locks is all that those
- those servers support) */
+ break;
+ case Opt_forcemandatorylock:
vol->mand_lock = 1;
- } else if (strnicmp(data, "setuids", 7) == 0) {
+ break;
+ case Opt_setuids:
vol->setuids = 1;
- } else if (strnicmp(data, "nosetuids", 9) == 0) {
+ break;
+ case Opt_nosetuids:
vol->setuids = 0;
- } else if (strnicmp(data, "dynperm", 7) == 0) {
+ break;
+ case Opt_dynperm:
vol->dynperm = true;
- } else if (strnicmp(data, "nodynperm", 9) == 0) {
+ break;
+ case Opt_nodynperm:
vol->dynperm = false;
- } else if (strnicmp(data, "nohard", 6) == 0) {
+ break;
+ case Opt_nohard:
vol->retry = 0;
- } else if (strnicmp(data, "nosoft", 6) == 0) {
+ break;
+ case Opt_nosoft:
vol->retry = 1;
- } else if (strnicmp(data, "nointr", 6) == 0) {
+ break;
+ case Opt_nointr:
vol->intr = 0;
- } else if (strnicmp(data, "intr", 4) == 0) {
+ break;
+ case Opt_intr:
vol->intr = 1;
- } else if (strnicmp(data, "nostrictsync", 12) == 0) {
+ break;
+ case Opt_nostrictsync:
vol->nostrictsync = 1;
- } else if (strnicmp(data, "strictsync", 10) == 0) {
+ break;
+ case Opt_strictsync:
vol->nostrictsync = 0;
- } else if (strnicmp(data, "serverino", 7) == 0) {
+ break;
+ case Opt_serverino:
vol->server_ino = 1;
- } else if (strnicmp(data, "noserverino", 9) == 0) {
+ break;
+ case Opt_noserverino:
vol->server_ino = 0;
- } else if (strnicmp(data, "rwpidforward", 12) == 0) {
+ break;
+ case Opt_rwpidforward:
vol->rwpidforward = 1;
- } else if (strnicmp(data, "cifsacl", 7) == 0) {
+ break;
+ case Opt_cifsacl:
vol->cifs_acl = 1;
- } else if (strnicmp(data, "nocifsacl", 9) == 0) {
+ break;
+ case Opt_nocifsacl:
vol->cifs_acl = 0;
- } else if (strnicmp(data, "acl", 3) == 0) {
+ break;
+ case Opt_acl:
vol->no_psx_acl = 0;
- } else if (strnicmp(data, "noacl", 5) == 0) {
+ break;
+ case Opt_noacl:
vol->no_psx_acl = 1;
- } else if (strnicmp(data, "locallease", 6) == 0) {
+ break;
+ case Opt_locallease:
vol->local_lease = 1;
- } else if (strnicmp(data, "sign", 4) == 0) {
+ break;
+ case Opt_sign:
vol->secFlg |= CIFSSEC_MUST_SIGN;
- } else if (strnicmp(data, "seal", 4) == 0) {
+ break;
+ case Opt_seal:
/* we do not do the following in secFlags because seal
- is a per tree connection (mount) not a per socket
- or per-smb connection option in the protocol */
- /* vol->secFlg |= CIFSSEC_MUST_SEAL; */
+ * is a per tree connection (mount) not a per socket
+ * or per-smb connection option in the protocol
+ * vol->secFlg |= CIFSSEC_MUST_SEAL;
+ */
vol->seal = 1;
- } else if (strnicmp(data, "direct", 6) == 0) {
- vol->direct_io = 1;
- } else if (strnicmp(data, "forcedirectio", 13) == 0) {
+ break;
+ case Opt_direct:
vol->direct_io = 1;
- } else if (strnicmp(data, "strictcache", 11) == 0) {
+ break;
+ case Opt_strictcache:
vol->strict_io = 1;
- } else if (strnicmp(data, "noac", 4) == 0) {
+ break;
+ case Opt_noac:
printk(KERN_WARNING "CIFS: Mount option noac not "
"supported. Instead set "
"/proc/fs/cifs/LookupCacheEnabled to 0\n");
- } else if (strnicmp(data, "fsc", 3) == 0) {
+ break;
+ case Opt_fsc:
#ifndef CONFIG_CIFS_FSCACHE
cERROR(1, "FS-Cache support needs CONFIG_CIFS_FSCACHE "
"kernel config option set");
goto cifs_parse_mount_err;
#endif
vol->fsc = true;
- } else if (strnicmp(data, "mfsymlinks", 10) == 0) {
+ break;
+ case Opt_mfsymlinks:
vol->mfsymlinks = true;
- } else if (strnicmp(data, "multiuser", 8) == 0) {
+ break;
+ case Opt_multiuser:
vol->multiuser = true;
- } else if (!strnicmp(data, "backupuid", 9) && value && *value) {
- err = kstrtouint(value, 0, &vol->backupuid);
- if (err < 0) {
+ break;
+ case Opt_sloppy:
+ sloppy = true;
+ break;
+
+ /* Numeric Values */
+ case Opt_backupuid:
+ if (get_option_ul(args, &option)) {
cERROR(1, "%s: Invalid backupuid value",
__func__);
goto cifs_parse_mount_err;
}
+ vol->backupuid = option;
vol->backupuid_specified = true;
- } else if (!strnicmp(data, "backupgid", 9) && value && *value) {
- err = kstrtouint(value, 0, &vol->backupgid);
- if (err < 0) {
+ break;
+ case Opt_backupgid:
+ if (get_option_ul(args, &option)) {
cERROR(1, "%s: Invalid backupgid value",
__func__);
goto cifs_parse_mount_err;
}
+ vol->backupgid = option;
vol->backupgid_specified = true;
- } else
- printk(KERN_WARNING "CIFS: Unknown mount option %s\n",
- data);
- }
- if (vol->UNC == NULL) {
- if (devname == NULL) {
- printk(KERN_WARNING "CIFS: Missing UNC name for mount "
- "target\n");
- goto cifs_parse_mount_err;
- }
- if ((temp_len = strnlen(devname, 300)) < 300) {
+ break;
+ case Opt_uid:
+ if (get_option_ul(args, &option)) {
+ cERROR(1, "%s: Invalid uid value",
+ __func__);
+ goto cifs_parse_mount_err;
+ }
+ vol->linux_uid = option;
+ uid_specified = true;
+ break;
+ case Opt_cruid:
+ if (get_option_ul(args, &option)) {
+ cERROR(1, "%s: Invalid cruid value",
+ __func__);
+ goto cifs_parse_mount_err;
+ }
+ vol->cred_uid = option;
+ break;
+ case Opt_gid:
+ if (get_option_ul(args, &option)) {
+ cERROR(1, "%s: Invalid gid value",
+ __func__);
+ goto cifs_parse_mount_err;
+ }
+ vol->linux_gid = option;
+ gid_specified = true;
+ break;
+ case Opt_file_mode:
+ if (get_option_ul(args, &option)) {
+ cERROR(1, "%s: Invalid file_mode value",
+ __func__);
+ goto cifs_parse_mount_err;
+ }
+ vol->file_mode = option;
+ break;
+ case Opt_dirmode:
+ if (get_option_ul(args, &option)) {
+ cERROR(1, "%s: Invalid dir_mode value",
+ __func__);
+ goto cifs_parse_mount_err;
+ }
+ vol->dir_mode = option;
+ break;
+ case Opt_port:
+ if (get_option_ul(args, &option)) {
+ cERROR(1, "%s: Invalid port value",
+ __func__);
+ goto cifs_parse_mount_err;
+ }
+ vol->port = option;
+ break;
+ case Opt_rsize:
+ if (get_option_ul(args, &option)) {
+ cERROR(1, "%s: Invalid rsize value",
+ __func__);
+ goto cifs_parse_mount_err;
+ }
+ vol->rsize = option;
+ break;
+ case Opt_wsize:
+ if (get_option_ul(args, &option)) {
+ cERROR(1, "%s: Invalid wsize value",
+ __func__);
+ goto cifs_parse_mount_err;
+ }
+ vol->wsize = option;
+ break;
+ case Opt_actimeo:
+ if (get_option_ul(args, &option)) {
+ cERROR(1, "%s: Invalid actimeo value",
+ __func__);
+ goto cifs_parse_mount_err;
+ }
+ vol->actimeo = HZ * option;
+ if (vol->actimeo > CIFS_MAX_ACTIMEO) {
+ cERROR(1, "CIFS: attribute cache"
+ "timeout too large");
+ goto cifs_parse_mount_err;
+ }
+ break;
+
+ /* String Arguments */
+
+ case Opt_user:
+ string = match_strdup(args);
+ if (string == NULL)
+ goto out_nomem;
+
+ if (!*string) {
+ /* null user, ie. anonymous authentication */
+ vol->nullauth = 1;
+ } else if (strnlen(string, MAX_USERNAME_SIZE) >
+ MAX_USERNAME_SIZE) {
+ printk(KERN_WARNING "CIFS: username too long\n");
+ goto cifs_parse_mount_err;
+ }
+ vol->username = kstrdup(string, GFP_KERNEL);
+ if (!vol->username) {
+ printk(KERN_WARNING "CIFS: no memory "
+ "for username\n");
+ goto cifs_parse_mount_err;
+ }
+ break;
+ case Opt_blank_pass:
+ vol->password = NULL;
+ break;
+ case Opt_pass:
+ /* passwords have to be handled differently
+ * to allow the character used for deliminator
+ * to be passed within them
+ */
+
+ /* Obtain the value string */
+ value = strchr(data, '=');
+ value++;
+
+ /* Set tmp_end to end of the string */
+ tmp_end = (char *) value + strlen(value);
+
+ /* Check if following character is the deliminator
+ * If yes, we have encountered a double deliminator
+ * reset the NULL character to the deliminator
+ */
+ if (tmp_end < end && tmp_end[1] == delim)
+ tmp_end[0] = delim;
+
+ /* Keep iterating until we get to a single deliminator
+ * OR the end
+ */
+ while ((tmp_end = strchr(tmp_end, delim)) != NULL &&
+ (tmp_end[1] == delim)) {
+ tmp_end = (char *) &tmp_end[2];
+ }
+
+ /* Reset var options to point to next element */
+ if (tmp_end) {
+ tmp_end[0] = '\0';
+ options = (char *) &tmp_end[1];
+ } else
+ /* Reached the end of the mount option string */
+ options = end;
+
+ /* Now build new password string */
+ temp_len = strlen(value);
+ vol->password = kzalloc(temp_len+1, GFP_KERNEL);
+ if (vol->password == NULL) {
+ printk(KERN_WARNING "CIFS: no memory "
+ "for password\n");
+ goto cifs_parse_mount_err;
+ }
+
+ for (i = 0, j = 0; i < temp_len; i++, j++) {
+ vol->password[j] = value[i];
+ if ((value[i] == delim) &&
+ value[i+1] == delim)
+ /* skip the second deliminator */
+ i++;
+ }
+ vol->password[j] = '\0';
+ break;
+ case Opt_ip:
+ string = match_strdup(args);
+ if (string == NULL)
+ goto out_nomem;
+
+ if (!*string) {
+ vol->UNCip = NULL;
+ } else if (strnlen(string, INET6_ADDRSTRLEN) >
+ INET6_ADDRSTRLEN) {
+ printk(KERN_WARNING "CIFS: ip address "
+ "too long\n");
+ goto cifs_parse_mount_err;
+ }
+ vol->UNCip = kstrdup(string, GFP_KERNEL);
+ if (!vol->UNCip) {
+ printk(KERN_WARNING "CIFS: no memory "
+ "for UNC IP\n");
+ goto cifs_parse_mount_err;
+ }
+ break;
+ case Opt_unc:
+ string = match_strdup(args);
+ if (string == NULL)
+ goto out_nomem;
+
+ if (!*string) {
+ printk(KERN_WARNING "CIFS: invalid path to "
+ "network resource\n");
+ goto cifs_parse_mount_err;
+ }
+
+ temp_len = strnlen(string, 300);
+ if (temp_len == 300) {
+ printk(KERN_WARNING "CIFS: UNC name too long\n");
+ goto cifs_parse_mount_err;
+ }
+
vol->UNC = kmalloc(temp_len+1, GFP_KERNEL);
- if (vol->UNC == NULL)
+ if (vol->UNC == NULL) {
+ printk(KERN_WARNING "CIFS: no memory for UNC\n");
goto cifs_parse_mount_err;
- strcpy(vol->UNC, devname);
- if (strncmp(vol->UNC, "//", 2) == 0) {
+ }
+ strcpy(vol->UNC, string);
+
+ if (strncmp(string, "//", 2) == 0) {
vol->UNC[0] = '\\';
vol->UNC[1] = '\\';
- } else if (strncmp(vol->UNC, "\\\\", 2) != 0) {
+ } else if (strncmp(string, "\\\\", 2) != 0) {
printk(KERN_WARNING "CIFS: UNC Path does not "
- "begin with // or \\\\ \n");
+ "begin with // or \\\\\n");
goto cifs_parse_mount_err;
}
- value = strpbrk(vol->UNC+2, "/\\");
- if (value)
- *value = '\\';
- } else {
- printk(KERN_WARNING "CIFS: UNC name too long\n");
+
+ break;
+ case Opt_domain:
+ string = match_strdup(args);
+ if (string == NULL)
+ goto out_nomem;
+
+ if (!*string) {
+ printk(KERN_WARNING "CIFS: invalid domain"
+ " name\n");
+ goto cifs_parse_mount_err;
+ } else if (strnlen(string, 256) == 256) {
+ printk(KERN_WARNING "CIFS: domain name too"
+ " long\n");
+ goto cifs_parse_mount_err;
+ }
+
+ vol->domainname = kstrdup(string, GFP_KERNEL);
+ if (!vol->domainname) {
+ printk(KERN_WARNING "CIFS: no memory "
+ "for domainname\n");
+ goto cifs_parse_mount_err;
+ }
+ cFYI(1, "Domain name set");
+ break;
+ case Opt_srcaddr:
+ string = match_strdup(args);
+ if (string == NULL)
+ goto out_nomem;
+
+ if (!*string) {
+ printk(KERN_WARNING "CIFS: srcaddr value not"
+ " specified\n");
+ goto cifs_parse_mount_err;
+ } else if (!cifs_convert_address(
+ (struct sockaddr *)&vol->srcaddr,
+ string, strlen(string))) {
+ printk(KERN_WARNING "CIFS: Could not parse"
+ " srcaddr: %s\n", string);
+ goto cifs_parse_mount_err;
+ }
+ break;
+ case Opt_prefixpath:
+ string = match_strdup(args);
+ if (string == NULL)
+ goto out_nomem;
+
+ if (!*string) {
+ printk(KERN_WARNING "CIFS: Invalid path"
+ " prefix\n");
+ goto cifs_parse_mount_err;
+ }
+ temp_len = strnlen(string, 1024);
+ if (string[0] != '/')
+ temp_len++; /* missing leading slash */
+ if (temp_len > 1024) {
+ printk(KERN_WARNING "CIFS: prefix too long\n");
+ goto cifs_parse_mount_err;
+ }
+
+ vol->prepath = kmalloc(temp_len+1, GFP_KERNEL);
+ if (vol->prepath == NULL) {
+ printk(KERN_WARNING "CIFS: no memory "
+ "for path prefix\n");
+ goto cifs_parse_mount_err;
+ }
+
+ if (string[0] != '/') {
+ vol->prepath[0] = '/';
+ strcpy(vol->prepath+1, string);
+ } else
+ strcpy(vol->prepath, string);
+
+ break;
+ case Opt_iocharset:
+ string = match_strdup(args);
+ if (string == NULL)
+ goto out_nomem;
+
+ if (!*string) {
+ printk(KERN_WARNING "CIFS: Invalid iocharset"
+ " specified\n");
+ goto cifs_parse_mount_err;
+ } else if (strnlen(string, 1024) >= 65) {
+ printk(KERN_WARNING "CIFS: iocharset name "
+ "too long.\n");
+ goto cifs_parse_mount_err;
+ }
+
+ if (strnicmp(string, "default", 7) != 0) {
+ vol->iocharset = kstrdup(string,
+ GFP_KERNEL);
+ if (!vol->iocharset) {
+ printk(KERN_WARNING "CIFS: no memory"
+ "for charset\n");
+ goto cifs_parse_mount_err;
+ }
+ }
+ /* if iocharset not set then load_nls_default
+ * is used by caller
+ */
+ cFYI(1, "iocharset set to %s", string);
+ break;
+ case Opt_sockopt:
+ string = match_strdup(args);
+ if (string == NULL)
+ goto out_nomem;
+
+ if (!*string) {
+ printk(KERN_WARNING "CIFS: No socket option"
+ " specified\n");
+ goto cifs_parse_mount_err;
+ }
+ if (strnicmp(string, "TCP_NODELAY", 11) == 0)
+ vol->sockopt_tcp_nodelay = 1;
+ break;
+ case Opt_netbiosname:
+ string = match_strdup(args);
+ if (string == NULL)
+ goto out_nomem;
+
+ if (!*string) {
+ printk(KERN_WARNING "CIFS: Invalid (empty)"
+ " netbiosname\n");
+ break;
+ }
+
+ memset(vol->source_rfc1001_name, 0x20,
+ RFC1001_NAME_LEN);
+ /*
+ * FIXME: are there cases in which a comma can
+ * be valid in workstation netbios name (and
+ * need special handling)?
+ */
+ for (i = 0; i < RFC1001_NAME_LEN; i++) {
+ /* don't ucase netbiosname for user */
+ if (string[i] == 0)
+ break;
+ vol->source_rfc1001_name[i] = string[i];
+ }
+ /* The string has 16th byte zero still from
+ * set at top of the function
+ */
+ if (i == RFC1001_NAME_LEN && string[i] != 0)
+ printk(KERN_WARNING "CIFS: netbiosname"
+ " longer than 15 truncated.\n");
+
+ break;
+ case Opt_servern:
+ /* servernetbiosname specified override *SMBSERVER */
+ string = match_strdup(args);
+ if (string == NULL)
+ goto out_nomem;
+
+ if (!*string) {
+ printk(KERN_WARNING "CIFS: Empty server"
+ " netbiosname specified\n");
+ break;
+ }
+ /* last byte, type, is 0x20 for servr type */
+ memset(vol->target_rfc1001_name, 0x20,
+ RFC1001_NAME_LEN_WITH_NULL);
+
+ /* BB are there cases in which a comma can be
+ valid in this workstation netbios name
+ (and need special handling)? */
+
+ /* user or mount helper must uppercase the
+ netbios name */
+ for (i = 0; i < 15; i++) {
+ if (string[i] == 0)
+ break;
+ vol->target_rfc1001_name[i] = string[i];
+ }
+ /* The string has 16th byte zero still from
+ set at top of the function */
+ if (i == RFC1001_NAME_LEN && string[i] != 0)
+ printk(KERN_WARNING "CIFS: server net"
+ "biosname longer than 15 truncated.\n");
+ break;
+ case Opt_ver:
+ string = match_strdup(args);
+ if (string == NULL)
+ goto out_nomem;
+
+ if (!*string) {
+ cERROR(1, "no protocol version specified"
+ " after vers= mount option");
+ goto cifs_parse_mount_err;
+ }
+
+ if (strnicmp(string, "cifs", 4) == 0 ||
+ strnicmp(string, "1", 1) == 0) {
+ /* This is the default */
+ break;
+ }
+ /* For all other value, error */
+ printk(KERN_WARNING "CIFS: Invalid version"
+ " specified\n");
goto cifs_parse_mount_err;
+ case Opt_sec:
+ string = match_strdup(args);
+ if (string == NULL)
+ goto out_nomem;
+
+ if (!*string) {
+ printk(KERN_WARNING "CIFS: no security flavor"
+ " specified\n");
+ break;
+ }
+
+ if (cifs_parse_security_flavors(string, vol) != 0)
+ goto cifs_parse_mount_err;
+ break;
+ default:
+ /*
+ * An option we don't recognize. Save it off for later
+ * if we haven't already found one
+ */
+ if (!invalid)
+ invalid = data;
+ break;
}
+ /* Free up any allocated string */
+ kfree(string);
+ string = NULL;
}
- if (vol->multiuser && !(vol->secFlg & CIFSSEC_MAY_KRB5)) {
- cERROR(1, "Multiuser mounts currently require krb5 "
- "authentication!");
+ if (!sloppy && invalid) {
+ printk(KERN_ERR "CIFS: Unknown mount option \"%s\"\n", invalid);
goto cifs_parse_mount_err;
}
+#ifndef CONFIG_KEYS
+ /* Muliuser mounts require CONFIG_KEYS support */
+ if (vol->multiuser) {
+ cERROR(1, "Multiuser mounts require kernels with "
+ "CONFIG_KEYS enabled.");
+ goto cifs_parse_mount_err;
+ }
+#endif
+
if (vol->UNCip == NULL)
vol->UNCip = &vol->UNC[2];
@@ -1602,7 +1923,10 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
kfree(mountdata_copy);
return 0;
+out_nomem:
+ printk(KERN_WARNING "Could not allocate temporary buffer\n");
cifs_parse_mount_err:
+ kfree(string);
kfree(mountdata_copy);
return 1;
}
@@ -1888,7 +2212,8 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
tcp_ses->noblocksnd = volume_info->noblocksnd;
tcp_ses->noautotune = volume_info->noautotune;
tcp_ses->tcp_nodelay = volume_info->sockopt_tcp_nodelay;
- atomic_set(&tcp_ses->inFlight, 0);
+ tcp_ses->in_flight = 0;
+ tcp_ses->credits = 1;
init_waitqueue_head(&tcp_ses->response_q);
init_waitqueue_head(&tcp_ses->request_q);
INIT_LIST_HEAD(&tcp_ses->pending_mid_q);
@@ -1953,7 +2278,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
cifs_fscache_get_client_cookie(tcp_ses);
/* queue echo request delayed work */
- queue_delayed_work(system_nrt_wq, &tcp_ses->echo, SMB_ECHO_INTERVAL);
+ queue_delayed_work(cifsiod_wq, &tcp_ses->echo, SMB_ECHO_INTERVAL);
return tcp_ses;
@@ -1981,10 +2306,16 @@ static int match_session(struct cifs_ses *ses, struct smb_vol *vol)
return 0;
break;
default:
+ /* NULL username means anonymous session */
+ if (ses->user_name == NULL) {
+ if (!vol->nullauth)
+ return 0;
+ break;
+ }
+
/* anything else takes username/password */
- if (ses->user_name == NULL)
- return 0;
- if (strncmp(ses->user_name, vol->username,
+ if (strncmp(ses->user_name,
+ vol->username ? vol->username : "",
MAX_USERNAME_SIZE))
return 0;
if (strlen(vol->username) != 0 &&
@@ -2039,6 +2370,132 @@ cifs_put_smb_ses(struct cifs_ses *ses)
cifs_put_tcp_session(server);
}
+#ifdef CONFIG_KEYS
+
+/* strlen("cifs:a:") + INET6_ADDRSTRLEN + 1 */
+#define CIFSCREDS_DESC_SIZE (7 + INET6_ADDRSTRLEN + 1)
+
+/* Populate username and pw fields from keyring if possible */
+static int
+cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses)
+{
+ int rc = 0;
+ char *desc, *delim, *payload;
+ ssize_t len;
+ struct key *key;
+ struct TCP_Server_Info *server = ses->server;
+ struct sockaddr_in *sa;
+ struct sockaddr_in6 *sa6;
+ struct user_key_payload *upayload;
+
+ desc = kmalloc(CIFSCREDS_DESC_SIZE, GFP_KERNEL);
+ if (!desc)
+ return -ENOMEM;
+
+ /* try to find an address key first */
+ switch (server->dstaddr.ss_family) {
+ case AF_INET:
+ sa = (struct sockaddr_in *)&server->dstaddr;
+ sprintf(desc, "cifs:a:%pI4", &sa->sin_addr.s_addr);
+ break;
+ case AF_INET6:
+ sa6 = (struct sockaddr_in6 *)&server->dstaddr;
+ sprintf(desc, "cifs:a:%pI6c", &sa6->sin6_addr.s6_addr);
+ break;
+ default:
+ cFYI(1, "Bad ss_family (%hu)", server->dstaddr.ss_family);
+ rc = -EINVAL;
+ goto out_err;
+ }
+
+ cFYI(1, "%s: desc=%s", __func__, desc);
+ key = request_key(&key_type_logon, desc, "");
+ if (IS_ERR(key)) {
+ if (!ses->domainName) {
+ cFYI(1, "domainName is NULL");
+ rc = PTR_ERR(key);
+ goto out_err;
+ }
+
+ /* didn't work, try to find a domain key */
+ sprintf(desc, "cifs:d:%s", ses->domainName);
+ cFYI(1, "%s: desc=%s", __func__, desc);
+ key = request_key(&key_type_logon, desc, "");
+ if (IS_ERR(key)) {
+ rc = PTR_ERR(key);
+ goto out_err;
+ }
+ }
+
+ down_read(&key->sem);
+ upayload = key->payload.data;
+ if (IS_ERR_OR_NULL(upayload)) {
+ rc = upayload ? PTR_ERR(upayload) : -EINVAL;
+ goto out_key_put;
+ }
+
+ /* find first : in payload */
+ payload = (char *)upayload->data;
+ delim = strnchr(payload, upayload->datalen, ':');
+ cFYI(1, "payload=%s", payload);
+ if (!delim) {
+ cFYI(1, "Unable to find ':' in payload (datalen=%d)",
+ upayload->datalen);
+ rc = -EINVAL;
+ goto out_key_put;
+ }
+
+ len = delim - payload;
+ if (len > MAX_USERNAME_SIZE || len <= 0) {
+ cFYI(1, "Bad value from username search (len=%zd)", len);
+ rc = -EINVAL;
+ goto out_key_put;
+ }
+
+ vol->username = kstrndup(payload, len, GFP_KERNEL);
+ if (!vol->username) {
+ cFYI(1, "Unable to allocate %zd bytes for username", len);
+ rc = -ENOMEM;
+ goto out_key_put;
+ }
+ cFYI(1, "%s: username=%s", __func__, vol->username);
+
+ len = key->datalen - (len + 1);
+ if (len > MAX_PASSWORD_SIZE || len <= 0) {
+ cFYI(1, "Bad len for password search (len=%zd)", len);
+ rc = -EINVAL;
+ kfree(vol->username);
+ vol->username = NULL;
+ goto out_key_put;
+ }
+
+ ++delim;
+ vol->password = kstrndup(delim, len, GFP_KERNEL);
+ if (!vol->password) {
+ cFYI(1, "Unable to allocate %zd bytes for password", len);
+ rc = -ENOMEM;
+ kfree(vol->username);
+ vol->username = NULL;
+ goto out_key_put;
+ }
+
+out_key_put:
+ up_read(&key->sem);
+ key_put(key);
+out_err:
+ kfree(desc);
+ cFYI(1, "%s: returning %d", __func__, rc);
+ return rc;
+}
+#else /* ! CONFIG_KEYS */
+static inline int
+cifs_set_cifscreds(struct smb_vol *vol __attribute__((unused)),
+ struct cifs_ses *ses __attribute__((unused)))
+{
+ return -ENOSYS;
+}
+#endif /* CONFIG_KEYS */
+
static bool warned_on_ntlm; /* globals init to false automatically */
static struct cifs_ses *
@@ -2914,18 +3371,33 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
#define CIFS_DEFAULT_IOSIZE (1024 * 1024)
/*
- * Windows only supports a max of 60k reads. Default to that when posix
- * extensions aren't in force.
+ * Windows only supports a max of 60kb reads and 65535 byte writes. Default to
+ * those values when posix extensions aren't in force. In actuality here, we
+ * use 65536 to allow for a write that is a multiple of 4k. Most servers seem
+ * to be ok with the extra byte even though Windows doesn't send writes that
+ * are that large.
+ *
+ * Citation:
+ *
+ * http://blogs.msdn.com/b/openspecification/archive/2009/04/10/smb-maximum-transmit-buffer-size-and-performance-tuning.aspx
*/
#define CIFS_DEFAULT_NON_POSIX_RSIZE (60 * 1024)
+#define CIFS_DEFAULT_NON_POSIX_WSIZE (65536)
static unsigned int
cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info)
{
__u64 unix_cap = le64_to_cpu(tcon->fsUnixInfo.Capability);
struct TCP_Server_Info *server = tcon->ses->server;
- unsigned int wsize = pvolume_info->wsize ? pvolume_info->wsize :
- CIFS_DEFAULT_IOSIZE;
+ unsigned int wsize;
+
+ /* start with specified wsize, or default */
+ if (pvolume_info->wsize)
+ wsize = pvolume_info->wsize;
+ else if (tcon->unix_ext && (unix_cap & CIFS_UNIX_LARGE_WRITE_CAP))
+ wsize = CIFS_DEFAULT_IOSIZE;
+ else
+ wsize = CIFS_DEFAULT_NON_POSIX_WSIZE;
/* can server support 24-bit write sizes? (via UNIX extensions) */
if (!tcon->unix_ext || !(unix_cap & CIFS_UNIX_LARGE_WRITE_CAP))
@@ -3136,10 +3608,9 @@ cifs_setup_volume_info(struct smb_vol *volume_info, char *mount_data,
return -EINVAL;
if (volume_info->nullauth) {
- cFYI(1, "null user");
- volume_info->username = kzalloc(1, GFP_KERNEL);
- if (volume_info->username == NULL)
- return -ENOMEM;
+ cFYI(1, "Anonymous login");
+ kfree(volume_info->username);
+ volume_info->username = NULL;
} else if (volume_info->username) {
/* BB fixme parse for domain name here */
cFYI(1, "Username: %s", volume_info->username);
@@ -3204,7 +3675,7 @@ cifs_ra_pages(struct cifs_sb_info *cifs_sb)
int
cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info)
{
- int rc = 0;
+ int rc;
int xid;
struct cifs_ses *pSesInfo;
struct cifs_tcon *tcon;
@@ -3231,6 +3702,7 @@ try_mount_again:
FreeXid(xid);
}
#endif
+ rc = 0;
tcon = NULL;
pSesInfo = NULL;
srvTcp = NULL;
@@ -3372,7 +3844,7 @@ remote_path_check:
tlink_rb_insert(&cifs_sb->tlink_tree, tlink);
spin_unlock(&cifs_sb->tlink_tree_lock);
- queue_delayed_work(system_nrt_wq, &cifs_sb->prune_tlinks,
+ queue_delayed_work(cifsiod_wq, &cifs_sb->prune_tlinks,
TLINK_IDLE_EXPIRE);
mount_fail_check:
@@ -3478,7 +3950,7 @@ CIFSTCon(unsigned int xid, struct cifs_ses *ses,
if (ses->capabilities & CAP_UNICODE) {
smb_buffer->Flags2 |= SMBFLG2_UNICODE;
length =
- cifs_strtoUCS((__le16 *) bcc_ptr, tree,
+ cifs_strtoUTF16((__le16 *) bcc_ptr, tree,
6 /* max utf8 char length in bytes */ *
(/* server len*/ + 256 /* share len */), nls_codepage);
bcc_ptr += 2 * length; /* convert num 16 bit words to bytes */
@@ -3533,7 +4005,7 @@ CIFSTCon(unsigned int xid, struct cifs_ses *ses,
/* mostly informational -- no need to fail on error here */
kfree(tcon->nativeFileSystem);
- tcon->nativeFileSystem = cifs_strndup_from_ucs(bcc_ptr,
+ tcon->nativeFileSystem = cifs_strndup_from_utf16(bcc_ptr,
bytes_left, is_unicode,
nls_codepage);
@@ -3592,9 +4064,11 @@ int cifs_negotiate_protocol(unsigned int xid, struct cifs_ses *ses)
if (server->maxBuf != 0)
return 0;
+ cifs_set_credits(server, 1);
rc = CIFSSMBNegotiate(xid, ses);
if (rc == -EAGAIN) {
/* retry only once on 1st time connection */
+ cifs_set_credits(server, 1);
rc = CIFSSMBNegotiate(xid, ses);
if (rc == -EAGAIN)
rc = -EHOSTDOWN;
@@ -3657,25 +4131,43 @@ int cifs_setup_session(unsigned int xid, struct cifs_ses *ses,
return rc;
}
+static int
+cifs_set_vol_auth(struct smb_vol *vol, struct cifs_ses *ses)
+{
+ switch (ses->server->secType) {
+ case Kerberos:
+ vol->secFlg = CIFSSEC_MUST_KRB5;
+ return 0;
+ case NTLMv2:
+ vol->secFlg = CIFSSEC_MUST_NTLMV2;
+ break;
+ case NTLM:
+ vol->secFlg = CIFSSEC_MUST_NTLM;
+ break;
+ case RawNTLMSSP:
+ vol->secFlg = CIFSSEC_MUST_NTLMSSP;
+ break;
+ case LANMAN:
+ vol->secFlg = CIFSSEC_MUST_LANMAN;
+ break;
+ }
+
+ return cifs_set_cifscreds(vol, ses);
+}
+
static struct cifs_tcon *
cifs_construct_tcon(struct cifs_sb_info *cifs_sb, uid_t fsuid)
{
+ int rc;
struct cifs_tcon *master_tcon = cifs_sb_master_tcon(cifs_sb);
struct cifs_ses *ses;
struct cifs_tcon *tcon = NULL;
struct smb_vol *vol_info;
- char username[28]; /* big enough for "krb50x" + hex of ULONG_MAX 6+16 */
- /* We used to have this as MAX_USERNAME which is */
- /* way too big now (256 instead of 32) */
vol_info = kzalloc(sizeof(*vol_info), GFP_KERNEL);
- if (vol_info == NULL) {
- tcon = ERR_PTR(-ENOMEM);
- goto out;
- }
+ if (vol_info == NULL)
+ return ERR_PTR(-ENOMEM);
- snprintf(username, sizeof(username), "krb50x%x", fsuid);
- vol_info->username = username;
vol_info->local_nls = cifs_sb->local_nls;
vol_info->linux_uid = fsuid;
vol_info->cred_uid = fsuid;
@@ -3685,8 +4177,11 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, uid_t fsuid)
vol_info->local_lease = master_tcon->local_lease;
vol_info->no_linux_ext = !master_tcon->unix_ext;
- /* FIXME: allow for other secFlg settings */
- vol_info->secFlg = CIFSSEC_MUST_KRB5;
+ rc = cifs_set_vol_auth(vol_info, master_tcon->ses);
+ if (rc) {
+ tcon = ERR_PTR(rc);
+ goto out;
+ }
/* get a reference for the same TCP session */
spin_lock(&cifs_tcp_ses_lock);
@@ -3709,6 +4204,8 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, uid_t fsuid)
if (ses->capabilities & CAP_UNIX)
reset_cifs_unix_caps(0, tcon, NULL, vol_info);
out:
+ kfree(vol_info->username);
+ kfree(vol_info->password);
kfree(vol_info);
return tcon;
@@ -3901,6 +4398,6 @@ cifs_prune_tlinks(struct work_struct *work)
}
spin_unlock(&cifs_sb->tlink_tree_lock);
- queue_delayed_work(system_nrt_wq, &cifs_sb->prune_tlinks,
+ queue_delayed_work(cifsiod_wq, &cifs_sb->prune_tlinks,
TLINK_IDLE_EXPIRE);
}
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index df8fecb5b99..d172c8ed901 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -171,7 +171,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, umode_t mode,
}
tcon = tlink_tcon(tlink);
- if (enable_oplocks)
+ if (tcon->ses->server->oplocks)
oplock = REQ_OPLOCK;
if (nd)
@@ -492,7 +492,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
{
int xid;
int rc = 0; /* to get around spurious gcc warning, set to zero here */
- __u32 oplock = 0;
+ __u32 oplock;
__u16 fileHandle = 0;
bool posix_open = false;
struct cifs_sb_info *cifs_sb;
@@ -518,6 +518,8 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
}
pTcon = tlink_tcon(tlink);
+ oplock = pTcon->ses->server->oplocks ? REQ_OPLOCK : 0;
+
/*
* Don't allow the separator character in a path component.
* The VFS will not allow "/", but "\" is allowed by posix.
@@ -584,10 +586,26 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
* If either that or op not supported returned, follow
* the normal lookup.
*/
- if ((rc == 0) || (rc == -ENOENT))
+ switch (rc) {
+ case 0:
+ /*
+ * The server may allow us to open things like
+ * FIFOs, but the client isn't set up to deal
+ * with that. If it's not a regular file, just
+ * close it and proceed as if it were a normal
+ * lookup.
+ */
+ if (newInode && !S_ISREG(newInode->i_mode)) {
+ CIFSSMBClose(xid, pTcon, fileHandle);
+ break;
+ }
+ case -ENOENT:
posix_open = true;
- else if ((rc == -EINVAL) || (rc != -EOPNOTSUPP))
+ case -EOPNOTSUPP:
+ break;
+ default:
pTcon->broken_posix_open = true;
+ }
}
if (!posix_open)
rc = cifs_get_inode_info_unix(&newInode, full_path,
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 4dd9283885e..fae765dac93 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -380,7 +380,7 @@ int cifs_open(struct inode *inode, struct file *file)
cFYI(1, "inode = 0x%p file flags are 0x%x for %s",
inode, file->f_flags, full_path);
- if (enable_oplocks)
+ if (tcon->ses->server->oplocks)
oplock = REQ_OPLOCK;
else
oplock = 0;
@@ -505,7 +505,7 @@ static int cifs_reopen_file(struct cifsFileInfo *pCifsFile, bool can_flush)
cFYI(1, "inode = 0x%p file flags 0x%x for %s",
inode, pCifsFile->f_flags, full_path);
- if (enable_oplocks)
+ if (tcon->ses->server->oplocks)
oplock = REQ_OPLOCK;
else
oplock = 0;
@@ -835,13 +835,21 @@ cifs_posix_lock_set(struct file *file, struct file_lock *flock)
if ((flock->fl_flags & FL_POSIX) == 0)
return rc;
+try_again:
mutex_lock(&cinode->lock_mutex);
if (!cinode->can_cache_brlcks) {
mutex_unlock(&cinode->lock_mutex);
return rc;
}
- rc = posix_lock_file_wait(file, flock);
+
+ rc = posix_lock_file(file, flock, NULL);
mutex_unlock(&cinode->lock_mutex);
+ if (rc == FILE_LOCK_DEFERRED) {
+ rc = wait_event_interruptible(flock->fl_wait, !flock->fl_next);
+ if (!rc)
+ goto try_again;
+ locks_delete_block(flock);
+ }
return rc;
}
@@ -920,16 +928,26 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile)
for (lockp = &inode->i_flock; *lockp != NULL; \
lockp = &(*lockp)->fl_next)
+struct lock_to_push {
+ struct list_head llist;
+ __u64 offset;
+ __u64 length;
+ __u32 pid;
+ __u16 netfid;
+ __u8 type;
+};
+
static int
cifs_push_posix_locks(struct cifsFileInfo *cfile)
{
struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
struct file_lock *flock, **before;
- struct cifsLockInfo *lck, *tmp;
+ unsigned int count = 0, i = 0;
int rc = 0, xid, type;
+ struct list_head locks_to_send, *el;
+ struct lock_to_push *lck, *tmp;
__u64 length;
- struct list_head locks_to_send;
xid = GetXid();
@@ -940,29 +958,56 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
return rc;
}
+ lock_flocks();
+ cifs_for_each_lock(cfile->dentry->d_inode, before) {
+ if ((*before)->fl_flags & FL_POSIX)
+ count++;
+ }
+ unlock_flocks();
+
INIT_LIST_HEAD(&locks_to_send);
+ /*
+ * Allocating count locks is enough because no FL_POSIX locks can be
+ * added to the list while we are holding cinode->lock_mutex that
+ * protects locking operations of this inode.
+ */
+ for (; i < count; i++) {
+ lck = kmalloc(sizeof(struct lock_to_push), GFP_KERNEL);
+ if (!lck) {
+ rc = -ENOMEM;
+ goto err_out;
+ }
+ list_add_tail(&lck->llist, &locks_to_send);
+ }
+
+ el = locks_to_send.next;
lock_flocks();
cifs_for_each_lock(cfile->dentry->d_inode, before) {
flock = *before;
+ if ((flock->fl_flags & FL_POSIX) == 0)
+ continue;
+ if (el == &locks_to_send) {
+ /*
+ * The list ended. We don't have enough allocated
+ * structures - something is really wrong.
+ */
+ cERROR(1, "Can't push all brlocks!");
+ break;
+ }
length = 1 + flock->fl_end - flock->fl_start;
if (flock->fl_type == F_RDLCK || flock->fl_type == F_SHLCK)
type = CIFS_RDLCK;
else
type = CIFS_WRLCK;
-
- lck = cifs_lock_init(flock->fl_start, length, type,
- cfile->netfid);
- if (!lck) {
- rc = -ENOMEM;
- goto send_locks;
- }
+ lck = list_entry(el, struct lock_to_push, llist);
lck->pid = flock->fl_pid;
-
- list_add_tail(&lck->llist, &locks_to_send);
+ lck->netfid = cfile->netfid;
+ lck->length = length;
+ lck->type = type;
+ lck->offset = flock->fl_start;
+ el = el->next;
}
-
-send_locks:
unlock_flocks();
list_for_each_entry_safe(lck, tmp, &locks_to_send, llist) {
@@ -979,11 +1024,18 @@ send_locks:
kfree(lck);
}
+out:
cinode->can_cache_brlcks = false;
mutex_unlock(&cinode->lock_mutex);
FreeXid(xid);
return rc;
+err_out:
+ list_for_each_entry_safe(lck, tmp, &locks_to_send, llist) {
+ list_del(&lck->llist);
+ kfree(lck);
+ }
+ goto out;
}
static int
@@ -1355,7 +1407,10 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *flock)
return rc;
}
-/* update the file size (if needed) after a write */
+/*
+ * update the file size (if needed) after a write. Should be called with
+ * the inode->i_lock held
+ */
void
cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
unsigned int bytes_written)
@@ -1427,7 +1482,9 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file, __u32 pid,
return rc;
}
} else {
+ spin_lock(&dentry->d_inode->i_lock);
cifs_update_eof(cifsi, *poffset, bytes_written);
+ spin_unlock(&dentry->d_inode->i_lock);
*poffset += bytes_written;
}
}
@@ -1604,6 +1661,27 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
return rc;
}
+/*
+ * Marshal up the iov array, reserving the first one for the header. Also,
+ * set wdata->bytes.
+ */
+static void
+cifs_writepages_marshal_iov(struct kvec *iov, struct cifs_writedata *wdata)
+{
+ int i;
+ struct inode *inode = wdata->cfile->dentry->d_inode;
+ loff_t size = i_size_read(inode);
+
+ /* marshal up the pages into iov array */
+ wdata->bytes = 0;
+ for (i = 0; i < wdata->nr_pages; i++) {
+ iov[i + 1].iov_len = min(size - page_offset(wdata->pages[i]),
+ (loff_t)PAGE_CACHE_SIZE);
+ iov[i + 1].iov_base = kmap(wdata->pages[i]);
+ wdata->bytes += iov[i + 1].iov_len;
+ }
+}
+
static int cifs_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
@@ -1640,7 +1718,8 @@ retry:
tofind = min((cifs_sb->wsize / PAGE_CACHE_SIZE) - 1,
end - index) + 1;
- wdata = cifs_writedata_alloc((unsigned int)tofind);
+ wdata = cifs_writedata_alloc((unsigned int)tofind,
+ cifs_writev_complete);
if (!wdata) {
rc = -ENOMEM;
break;
@@ -1747,6 +1826,7 @@ retry:
wdata->sync_mode = wbc->sync_mode;
wdata->nr_pages = nr_pages;
wdata->offset = page_offset(wdata->pages[0]);
+ wdata->marshal_iov = cifs_writepages_marshal_iov;
do {
if (wdata->cfile != NULL)
@@ -1758,6 +1838,7 @@ retry:
rc = -EBADF;
break;
}
+ wdata->pid = wdata->cfile->pid;
rc = cifs_async_writev(wdata);
} while (wbc->sync_mode == WB_SYNC_ALL && rc == -EAGAIN);
@@ -1999,7 +2080,7 @@ cifs_write_allocate_pages(struct page **pages, unsigned long num_pages)
unsigned long i;
for (i = 0; i < num_pages; i++) {
- pages[i] = alloc_page(__GFP_HIGHMEM);
+ pages[i] = alloc_page(GFP_KERNEL|__GFP_HIGHMEM);
if (!pages[i]) {
/*
* save number of pages we have already allocated and
@@ -2007,15 +2088,14 @@ cifs_write_allocate_pages(struct page **pages, unsigned long num_pages)
*/
num_pages = i;
rc = -ENOMEM;
- goto error;
+ break;
}
}
- return rc;
-
-error:
- for (i = 0; i < num_pages; i++)
- put_page(pages[i]);
+ if (rc) {
+ for (i = 0; i < num_pages; i++)
+ put_page(pages[i]);
+ }
return rc;
}
@@ -2026,9 +2106,7 @@ size_t get_numpages(const size_t wsize, const size_t len, size_t *cur_len)
size_t clen;
clen = min_t(const size_t, len, wsize);
- num_pages = clen / PAGE_CACHE_SIZE;
- if (clen % PAGE_CACHE_SIZE)
- num_pages++;
+ num_pages = DIV_ROUND_UP(clen, PAGE_SIZE);
if (cur_len)
*cur_len = clen;
@@ -2036,24 +2114,79 @@ size_t get_numpages(const size_t wsize, const size_t len, size_t *cur_len)
return num_pages;
}
+static void
+cifs_uncached_marshal_iov(struct kvec *iov, struct cifs_writedata *wdata)
+{
+ int i;
+ size_t bytes = wdata->bytes;
+
+ /* marshal up the pages into iov array */
+ for (i = 0; i < wdata->nr_pages; i++) {
+ iov[i + 1].iov_len = min_t(size_t, bytes, PAGE_SIZE);
+ iov[i + 1].iov_base = kmap(wdata->pages[i]);
+ bytes -= iov[i + 1].iov_len;
+ }
+}
+
+static void
+cifs_uncached_writev_complete(struct work_struct *work)
+{
+ int i;
+ struct cifs_writedata *wdata = container_of(work,
+ struct cifs_writedata, work);
+ struct inode *inode = wdata->cfile->dentry->d_inode;
+ struct cifsInodeInfo *cifsi = CIFS_I(inode);
+
+ spin_lock(&inode->i_lock);
+ cifs_update_eof(cifsi, wdata->offset, wdata->bytes);
+ if (cifsi->server_eof > inode->i_size)
+ i_size_write(inode, cifsi->server_eof);
+ spin_unlock(&inode->i_lock);
+
+ complete(&wdata->done);
+
+ if (wdata->result != -EAGAIN) {
+ for (i = 0; i < wdata->nr_pages; i++)
+ put_page(wdata->pages[i]);
+ }
+
+ kref_put(&wdata->refcount, cifs_writedata_release);
+}
+
+/* attempt to send write to server, retry on any -EAGAIN errors */
+static int
+cifs_uncached_retry_writev(struct cifs_writedata *wdata)
+{
+ int rc;
+
+ do {
+ if (wdata->cfile->invalidHandle) {
+ rc = cifs_reopen_file(wdata->cfile, false);
+ if (rc != 0)
+ continue;
+ }
+ rc = cifs_async_writev(wdata);
+ } while (rc == -EAGAIN);
+
+ return rc;
+}
+
static ssize_t
cifs_iovec_write(struct file *file, const struct iovec *iov,
unsigned long nr_segs, loff_t *poffset)
{
- unsigned int written;
- unsigned long num_pages, npages, i;
+ unsigned long nr_pages, i;
size_t copied, len, cur_len;
ssize_t total_written = 0;
- struct kvec *to_send;
- struct page **pages;
+ loff_t offset = *poffset;
struct iov_iter it;
- struct inode *inode;
struct cifsFileInfo *open_file;
- struct cifs_tcon *pTcon;
+ struct cifs_tcon *tcon;
struct cifs_sb_info *cifs_sb;
- struct cifs_io_parms io_parms;
- int xid, rc;
- __u32 pid;
+ struct cifs_writedata *wdata, *tmp;
+ struct list_head wdata_list;
+ int rc;
+ pid_t pid;
len = iov_length(iov, nr_segs);
if (!len)
@@ -2063,103 +2196,103 @@ cifs_iovec_write(struct file *file, const struct iovec *iov,
if (rc)
return rc;
+ INIT_LIST_HEAD(&wdata_list);
cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
- num_pages = get_numpages(cifs_sb->wsize, len, &cur_len);
-
- pages = kmalloc(sizeof(struct pages *)*num_pages, GFP_KERNEL);
- if (!pages)
- return -ENOMEM;
-
- to_send = kmalloc(sizeof(struct kvec)*(num_pages + 1), GFP_KERNEL);
- if (!to_send) {
- kfree(pages);
- return -ENOMEM;
- }
-
- rc = cifs_write_allocate_pages(pages, num_pages);
- if (rc) {
- kfree(pages);
- kfree(to_send);
- return rc;
- }
-
- xid = GetXid();
open_file = file->private_data;
+ tcon = tlink_tcon(open_file->tlink);
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
pid = open_file->pid;
else
pid = current->tgid;
- pTcon = tlink_tcon(open_file->tlink);
- inode = file->f_path.dentry->d_inode;
-
iov_iter_init(&it, iov, nr_segs, len, 0);
- npages = num_pages;
-
do {
- size_t save_len = cur_len;
- for (i = 0; i < npages; i++) {
- copied = min_t(const size_t, cur_len, PAGE_CACHE_SIZE);
- copied = iov_iter_copy_from_user(pages[i], &it, 0,
- copied);
+ size_t save_len;
+
+ nr_pages = get_numpages(cifs_sb->wsize, len, &cur_len);
+ wdata = cifs_writedata_alloc(nr_pages,
+ cifs_uncached_writev_complete);
+ if (!wdata) {
+ rc = -ENOMEM;
+ break;
+ }
+
+ rc = cifs_write_allocate_pages(wdata->pages, nr_pages);
+ if (rc) {
+ kfree(wdata);
+ break;
+ }
+
+ save_len = cur_len;
+ for (i = 0; i < nr_pages; i++) {
+ copied = min_t(const size_t, cur_len, PAGE_SIZE);
+ copied = iov_iter_copy_from_user(wdata->pages[i], &it,
+ 0, copied);
cur_len -= copied;
iov_iter_advance(&it, copied);
- to_send[i+1].iov_base = kmap(pages[i]);
- to_send[i+1].iov_len = copied;
}
-
cur_len = save_len - cur_len;
- do {
- if (open_file->invalidHandle) {
- rc = cifs_reopen_file(open_file, false);
- if (rc != 0)
- break;
- }
- io_parms.netfid = open_file->netfid;
- io_parms.pid = pid;
- io_parms.tcon = pTcon;
- io_parms.offset = *poffset;
- io_parms.length = cur_len;
- rc = CIFSSMBWrite2(xid, &io_parms, &written, to_send,
- npages, 0);
- } while (rc == -EAGAIN);
-
- for (i = 0; i < npages; i++)
- kunmap(pages[i]);
-
- if (written) {
- len -= written;
- total_written += written;
- cifs_update_eof(CIFS_I(inode), *poffset, written);
- *poffset += written;
- } else if (rc < 0) {
- if (!total_written)
- total_written = rc;
+ wdata->sync_mode = WB_SYNC_ALL;
+ wdata->nr_pages = nr_pages;
+ wdata->offset = (__u64)offset;
+ wdata->cfile = cifsFileInfo_get(open_file);
+ wdata->pid = pid;
+ wdata->bytes = cur_len;
+ wdata->marshal_iov = cifs_uncached_marshal_iov;
+ rc = cifs_uncached_retry_writev(wdata);
+ if (rc) {
+ kref_put(&wdata->refcount, cifs_writedata_release);
break;
}
- /* get length and number of kvecs of the next write */
- npages = get_numpages(cifs_sb->wsize, len, &cur_len);
+ list_add_tail(&wdata->list, &wdata_list);
+ offset += cur_len;
+ len -= cur_len;
} while (len > 0);
- if (total_written > 0) {
- spin_lock(&inode->i_lock);
- if (*poffset > inode->i_size)
- i_size_write(inode, *poffset);
- spin_unlock(&inode->i_lock);
+ /*
+ * If at least one write was successfully sent, then discard any rc
+ * value from the later writes. If the other write succeeds, then
+ * we'll end up returning whatever was written. If it fails, then
+ * we'll get a new rc value from that.
+ */
+ if (!list_empty(&wdata_list))
+ rc = 0;
+
+ /*
+ * Wait for and collect replies for any successful sends in order of
+ * increasing offset. Once an error is hit or we get a fatal signal
+ * while waiting, then return without waiting for any more replies.
+ */
+restart_loop:
+ list_for_each_entry_safe(wdata, tmp, &wdata_list, list) {
+ if (!rc) {
+ /* FIXME: freezable too? */
+ rc = wait_for_completion_killable(&wdata->done);
+ if (rc)
+ rc = -EINTR;
+ else if (wdata->result)
+ rc = wdata->result;
+ else
+ total_written += wdata->bytes;
+
+ /* resend call if it's a retryable error */
+ if (rc == -EAGAIN) {
+ rc = cifs_uncached_retry_writev(wdata);
+ goto restart_loop;
+ }
+ }
+ list_del_init(&wdata->list);
+ kref_put(&wdata->refcount, cifs_writedata_release);
}
- cifs_stats_bytes_written(pTcon, total_written);
- mark_inode_dirty_sync(inode);
+ if (total_written > 0)
+ *poffset += total_written;
- for (i = 0; i < num_pages; i++)
- put_page(pages[i]);
- kfree(to_send);
- kfree(pages);
- FreeXid(xid);
- return total_written;
+ cifs_stats_bytes_written(tcon, total_written);
+ return total_written ? total_written : (ssize_t)rc;
}
ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov,
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index a5f54b7d982..745da3d0653 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -534,6 +534,11 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
if (fattr->cf_cifsattrs & ATTR_DIRECTORY) {
fattr->cf_mode = S_IFDIR | cifs_sb->mnt_dir_mode;
fattr->cf_dtype = DT_DIR;
+ /*
+ * Server can return wrong NumberOfLinks value for directories
+ * when Unix extensions are disabled - fake it.
+ */
+ fattr->cf_nlink = 2;
} else {
fattr->cf_mode = S_IFREG | cifs_sb->mnt_file_mode;
fattr->cf_dtype = DT_REG;
@@ -541,9 +546,9 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
/* clear write bits if ATTR_READONLY is set */
if (fattr->cf_cifsattrs & ATTR_READONLY)
fattr->cf_mode &= ~(S_IWUGO);
- }
- fattr->cf_nlink = le32_to_cpu(info->NumberOfLinks);
+ fattr->cf_nlink = le32_to_cpu(info->NumberOfLinks);
+ }
fattr->cf_uid = cifs_sb->mnt_uid;
fattr->cf_gid = cifs_sb->mnt_gid;
@@ -1322,7 +1327,6 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, umode_t mode)
}
/*BB check (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID ) to see if need
to set uid/gid */
- inc_nlink(inode);
cifs_unix_basic_to_fattr(&fattr, pInfo, cifs_sb);
cifs_fill_uniqueid(inode->i_sb, &fattr);
@@ -1355,7 +1359,6 @@ mkdir_retry_old:
d_drop(direntry);
} else {
mkdir_get_info:
- inc_nlink(inode);
if (pTcon->unix_ext)
rc = cifs_get_inode_info_unix(&newinode, full_path,
inode->i_sb, xid);
@@ -1436,6 +1439,11 @@ mkdir_get_info:
}
}
mkdir_out:
+ /*
+ * Force revalidate to get parent dir info when needed since cached
+ * attributes are invalid now.
+ */
+ CIFS_I(inode)->time = 0;
kfree(full_path);
FreeXid(xid);
cifs_put_tlink(tlink);
@@ -1475,7 +1483,6 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
cifs_put_tlink(tlink);
if (!rc) {
- drop_nlink(inode);
spin_lock(&direntry->d_inode->i_lock);
i_size_write(direntry->d_inode, 0);
clear_nlink(direntry->d_inode);
@@ -1483,12 +1490,15 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
}
cifsInode = CIFS_I(direntry->d_inode);
- cifsInode->time = 0; /* force revalidate to go get info when
- needed */
+ /* force revalidate to go get info when needed */
+ cifsInode->time = 0;
cifsInode = CIFS_I(inode);
- cifsInode->time = 0; /* force revalidate to get parent dir info
- since cached search results now invalid */
+ /*
+ * Force revalidate to get parent dir info when needed since cached
+ * attributes are invalid now.
+ */
+ cifsInode->time = 0;
direntry->d_inode->i_ctime = inode->i_ctime = inode->i_mtime =
current_fs_time(inode->i_sb);
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 703ef5c6fdb..c29d1aa2c54 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -213,55 +213,62 @@ cifs_small_buf_release(void *buf_to_free)
}
/*
- Find a free multiplex id (SMB mid). Otherwise there could be
- mid collisions which might cause problems, demultiplexing the
- wrong response to this request. Multiplex ids could collide if
- one of a series requests takes much longer than the others, or
- if a very large number of long lived requests (byte range
- locks or FindNotify requests) are pending. No more than
- 64K-1 requests can be outstanding at one time. If no
- mids are available, return zero. A future optimization
- could make the combination of mids and uid the key we use
- to demultiplex on (rather than mid alone).
- In addition to the above check, the cifs demultiplex
- code already used the command code as a secondary
- check of the frame and if signing is negotiated the
- response would be discarded if the mid were the same
- but the signature was wrong. Since the mid is not put in the
- pending queue until later (when it is about to be dispatched)
- we do have to limit the number of outstanding requests
- to somewhat less than 64K-1 although it is hard to imagine
- so many threads being in the vfs at one time.
-*/
-__u16 GetNextMid(struct TCP_Server_Info *server)
+ * Find a free multiplex id (SMB mid). Otherwise there could be
+ * mid collisions which might cause problems, demultiplexing the
+ * wrong response to this request. Multiplex ids could collide if
+ * one of a series requests takes much longer than the others, or
+ * if a very large number of long lived requests (byte range
+ * locks or FindNotify requests) are pending. No more than
+ * 64K-1 requests can be outstanding at one time. If no
+ * mids are available, return zero. A future optimization
+ * could make the combination of mids and uid the key we use
+ * to demultiplex on (rather than mid alone).
+ * In addition to the above check, the cifs demultiplex
+ * code already used the command code as a secondary
+ * check of the frame and if signing is negotiated the
+ * response would be discarded if the mid were the same
+ * but the signature was wrong. Since the mid is not put in the
+ * pending queue until later (when it is about to be dispatched)
+ * we do have to limit the number of outstanding requests
+ * to somewhat less than 64K-1 although it is hard to imagine
+ * so many threads being in the vfs at one time.
+ */
+__u64 GetNextMid(struct TCP_Server_Info *server)
{
- __u16 mid = 0;
- __u16 last_mid;
+ __u64 mid = 0;
+ __u16 last_mid, cur_mid;
bool collision;
spin_lock(&GlobalMid_Lock);
- last_mid = server->CurrentMid; /* we do not want to loop forever */
- server->CurrentMid++;
- /* This nested loop looks more expensive than it is.
- In practice the list of pending requests is short,
- fewer than 50, and the mids are likely to be unique
- on the first pass through the loop unless some request
- takes longer than the 64 thousand requests before it
- (and it would also have to have been a request that
- did not time out) */
- while (server->CurrentMid != last_mid) {
+
+ /* mid is 16 bit only for CIFS/SMB */
+ cur_mid = (__u16)((server->CurrentMid) & 0xffff);
+ /* we do not want to loop forever */
+ last_mid = cur_mid;
+ cur_mid++;
+
+ /*
+ * This nested loop looks more expensive than it is.
+ * In practice the list of pending requests is short,
+ * fewer than 50, and the mids are likely to be unique
+ * on the first pass through the loop unless some request
+ * takes longer than the 64 thousand requests before it
+ * (and it would also have to have been a request that
+ * did not time out).
+ */
+ while (cur_mid != last_mid) {
struct mid_q_entry *mid_entry;
unsigned int num_mids;
collision = false;
- if (server->CurrentMid == 0)
- server->CurrentMid++;
+ if (cur_mid == 0)
+ cur_mid++;
num_mids = 0;
list_for_each_entry(mid_entry, &server->pending_mid_q, qhead) {
++num_mids;
- if (mid_entry->mid == server->CurrentMid &&
- mid_entry->midState == MID_REQUEST_SUBMITTED) {
+ if (mid_entry->mid == cur_mid &&
+ mid_entry->mid_state == MID_REQUEST_SUBMITTED) {
/* This mid is in use, try a different one */
collision = true;
break;
@@ -282,10 +289,11 @@ __u16 GetNextMid(struct TCP_Server_Info *server)
server->tcpStatus = CifsNeedReconnect;
if (!collision) {
- mid = server->CurrentMid;
+ mid = (__u64)cur_mid;
+ server->CurrentMid = mid;
break;
}
- server->CurrentMid++;
+ cur_mid++;
}
spin_unlock(&GlobalMid_Lock);
return mid;
@@ -420,8 +428,10 @@ check_smb_hdr(struct smb_hdr *smb, __u16 mid)
}
int
-checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int total_read)
+checkSMB(char *buf, unsigned int total_read)
{
+ struct smb_hdr *smb = (struct smb_hdr *)buf;
+ __u16 mid = smb->Mid;
__u32 rfclen = be32_to_cpu(smb->smb_buf_length);
__u32 clc_len; /* calculated length */
cFYI(0, "checkSMB Length: 0x%x, smb_buf_length: 0x%x",
@@ -502,8 +512,9 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int total_read)
}
bool
-is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
+is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv)
{
+ struct smb_hdr *buf = (struct smb_hdr *)buffer;
struct smb_com_lock_req *pSMB = (struct smb_com_lock_req *)buf;
struct list_head *tmp, *tmp1, *tmp2;
struct cifs_ses *ses;
@@ -584,7 +595,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
cifs_set_oplock_level(pCifsInode,
pSMB->OplockLevel ? OPLOCK_READ : 0);
- queue_work(system_nrt_wq,
+ queue_work(cifsiod_wq,
&netfile->oplock_break);
netfile->oplock_break_cancelled = false;
@@ -604,16 +615,15 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
}
void
-dump_smb(struct smb_hdr *smb_buf, int smb_buf_length)
+dump_smb(void *buf, int smb_buf_length)
{
int i, j;
char debug_line[17];
- unsigned char *buffer;
+ unsigned char *buffer = buf;
if (traceSMB == 0)
return;
- buffer = (unsigned char *) smb_buf;
for (i = 0, j = 0; i < smb_buf_length; i++, j++) {
if (i % 8 == 0) {
/* have reached the beginning of line */
@@ -690,3 +700,22 @@ backup_cred(struct cifs_sb_info *cifs_sb)
return false;
}
+
+void
+cifs_add_credits(struct TCP_Server_Info *server, const unsigned int add)
+{
+ spin_lock(&server->req_lock);
+ server->credits += add;
+ server->in_flight--;
+ spin_unlock(&server->req_lock);
+ wake_up(&server->request_q);
+}
+
+void
+cifs_set_credits(struct TCP_Server_Info *server, const int val)
+{
+ spin_lock(&server->req_lock);
+ server->credits = val;
+ server->oplocks = val > 1 ? enable_oplocks : false;
+ spin_unlock(&server->req_lock);
+}
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index 73e47e84b61..581c225f7f5 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -197,8 +197,7 @@ cifs_convert_address(struct sockaddr *dst, const char *src, int len)
memcpy(scope_id, pct + 1, slen);
scope_id[slen] = '\0';
- rc = strict_strtoul(scope_id, 0,
- (unsigned long *)&s6->sin6_scope_id);
+ rc = kstrtouint(scope_id, 0, &s6->sin6_scope_id);
rc = (rc == 0) ? 1 : 0;
}
@@ -836,8 +835,9 @@ ntstatus_to_dos(__u32 ntstatus, __u8 *eclass, __u16 *ecode)
}
int
-map_smb_to_linux_error(struct smb_hdr *smb, bool logErr)
+map_smb_to_linux_error(char *buf, bool logErr)
{
+ struct smb_hdr *smb = (struct smb_hdr *)buf;
unsigned int i;
int rc = -EIO; /* if transport error smb error may not be set */
__u8 smberrclass;
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index a090bbe6ee2..e2bbc683e01 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -647,10 +647,11 @@ static int cifs_filldir(char *find_entry, struct file *file, filldir_t filldir,
name.name = scratch_buf;
name.len =
- cifs_from_ucs2((char *)name.name, (__le16 *)de.name,
- UNICODE_NAME_MAX,
- min(de.namelen, (size_t)max_len), nlt,
- cifs_sb->mnt_cifs_flags &
+ cifs_from_utf16((char *)name.name, (__le16 *)de.name,
+ UNICODE_NAME_MAX,
+ min_t(size_t, de.namelen,
+ (size_t)max_len), nlt,
+ cifs_sb->mnt_cifs_flags &
CIFS_MOUNT_MAP_SPECIAL_CHR);
name.len -= nls_nullsize(nlt);
} else {
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 4ec3ee9d72c..551d0c2b973 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -167,16 +167,16 @@ unicode_oslm_strings(char **pbcc_area, const struct nls_table *nls_cp)
int bytes_ret = 0;
/* Copy OS version */
- bytes_ret = cifs_strtoUCS((__le16 *)bcc_ptr, "Linux version ", 32,
- nls_cp);
+ bytes_ret = cifs_strtoUTF16((__le16 *)bcc_ptr, "Linux version ", 32,
+ nls_cp);
bcc_ptr += 2 * bytes_ret;
- bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, init_utsname()->release,
- 32, nls_cp);
+ bytes_ret = cifs_strtoUTF16((__le16 *) bcc_ptr, init_utsname()->release,
+ 32, nls_cp);
bcc_ptr += 2 * bytes_ret;
bcc_ptr += 2; /* trailing null */
- bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, CIFS_NETWORK_OPSYS,
- 32, nls_cp);
+ bytes_ret = cifs_strtoUTF16((__le16 *) bcc_ptr, CIFS_NETWORK_OPSYS,
+ 32, nls_cp);
bcc_ptr += 2 * bytes_ret;
bcc_ptr += 2; /* trailing null */
@@ -197,8 +197,8 @@ static void unicode_domain_string(char **pbcc_area, struct cifs_ses *ses,
*(bcc_ptr+1) = 0;
bytes_ret = 0;
} else
- bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, ses->domainName,
- 256, nls_cp);
+ bytes_ret = cifs_strtoUTF16((__le16 *) bcc_ptr, ses->domainName,
+ 256, nls_cp);
bcc_ptr += 2 * bytes_ret;
bcc_ptr += 2; /* account for null terminator */
@@ -226,8 +226,8 @@ static void unicode_ssetup_strings(char **pbcc_area, struct cifs_ses *ses,
*bcc_ptr = 0;
*(bcc_ptr+1) = 0;
} else {
- bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, ses->user_name,
- MAX_USERNAME_SIZE, nls_cp);
+ bytes_ret = cifs_strtoUTF16((__le16 *) bcc_ptr, ses->user_name,
+ MAX_USERNAME_SIZE, nls_cp);
}
bcc_ptr += 2 * bytes_ret;
bcc_ptr += 2; /* account for null termination */
@@ -246,16 +246,15 @@ static void ascii_ssetup_strings(char **pbcc_area, struct cifs_ses *ses,
/* copy user */
/* BB what about null user mounts - check that we do this BB */
/* copy user */
- if (ses->user_name != NULL)
+ if (ses->user_name != NULL) {
strncpy(bcc_ptr, ses->user_name, MAX_USERNAME_SIZE);
+ bcc_ptr += strnlen(ses->user_name, MAX_USERNAME_SIZE);
+ }
/* else null user mount */
-
- bcc_ptr += strnlen(ses->user_name, MAX_USERNAME_SIZE);
*bcc_ptr = 0;
bcc_ptr++; /* account for null termination */
/* copy domain */
-
if (ses->domainName != NULL) {
strncpy(bcc_ptr, ses->domainName, 256);
bcc_ptr += strnlen(ses->domainName, 256);
@@ -287,7 +286,7 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifs_ses *ses,
cFYI(1, "bleft %d", bleft);
kfree(ses->serverOS);
- ses->serverOS = cifs_strndup_from_ucs(data, bleft, true, nls_cp);
+ ses->serverOS = cifs_strndup_from_utf16(data, bleft, true, nls_cp);
cFYI(1, "serverOS=%s", ses->serverOS);
len = (UniStrnlen((wchar_t *) data, bleft / 2) * 2) + 2;
data += len;
@@ -296,7 +295,7 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifs_ses *ses,
return;
kfree(ses->serverNOS);
- ses->serverNOS = cifs_strndup_from_ucs(data, bleft, true, nls_cp);
+ ses->serverNOS = cifs_strndup_from_utf16(data, bleft, true, nls_cp);
cFYI(1, "serverNOS=%s", ses->serverNOS);
len = (UniStrnlen((wchar_t *) data, bleft / 2) * 2) + 2;
data += len;
@@ -305,7 +304,7 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifs_ses *ses,
return;
kfree(ses->serverDomain);
- ses->serverDomain = cifs_strndup_from_ucs(data, bleft, true, nls_cp);
+ ses->serverDomain = cifs_strndup_from_utf16(data, bleft, true, nls_cp);
cFYI(1, "serverDomain=%s", ses->serverDomain);
return;
@@ -395,6 +394,10 @@ static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
ses->ntlmssp->server_flags = le32_to_cpu(pblob->NegotiateFlags);
tioffset = le32_to_cpu(pblob->TargetInfoArray.BufferOffset);
tilen = le16_to_cpu(pblob->TargetInfoArray.Length);
+ if (tioffset > blob_len || tioffset + tilen > blob_len) {
+ cERROR(1, "tioffset + tilen too high %u + %u", tioffset, tilen);
+ return -EINVAL;
+ }
if (tilen) {
ses->auth_key.response = kmalloc(tilen, GFP_KERNEL);
if (!ses->auth_key.response) {
@@ -502,8 +505,8 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
tmp += 2;
} else {
int len;
- len = cifs_strtoUCS((__le16 *)tmp, ses->domainName,
- MAX_USERNAME_SIZE, nls_cp);
+ len = cifs_strtoUTF16((__le16 *)tmp, ses->domainName,
+ MAX_USERNAME_SIZE, nls_cp);
len *= 2; /* unicode is 2 bytes each */
sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer);
sec_blob->DomainName.Length = cpu_to_le16(len);
@@ -518,8 +521,8 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
tmp += 2;
} else {
int len;
- len = cifs_strtoUCS((__le16 *)tmp, ses->user_name,
- MAX_USERNAME_SIZE, nls_cp);
+ len = cifs_strtoUTF16((__le16 *)tmp, ses->user_name,
+ MAX_USERNAME_SIZE, nls_cp);
len *= 2; /* unicode is 2 bytes each */
sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer);
sec_blob->UserName.Length = cpu_to_le16(len);
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index 80d85088193..d5cd9aa7eac 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -213,7 +213,7 @@ E_md4hash(const unsigned char *passwd, unsigned char *p16,
/* Password cannot be longer than 128 characters */
if (passwd) /* Password must be converted to NT unicode */
- len = cifs_strtoUCS(wpwd, passwd, 128, codepage);
+ len = cifs_strtoUTF16(wpwd, passwd, 128, codepage);
else {
len = 0;
*wpwd = 0; /* Ensure string is null terminated */
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 0cc9584f588..0961336513d 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -60,8 +60,8 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
memset(temp, 0, sizeof(struct mid_q_entry));
temp->mid = smb_buffer->Mid; /* always LE */
temp->pid = current->pid;
- temp->command = smb_buffer->Command;
- cFYI(1, "For smb_command %d", temp->command);
+ temp->command = cpu_to_le16(smb_buffer->Command);
+ cFYI(1, "For smb_command %d", smb_buffer->Command);
/* do_gettimeofday(&temp->when_sent);*/ /* easier to use jiffies */
/* when mid allocated can be before when sent */
temp->when_alloc = jiffies;
@@ -75,7 +75,7 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
}
atomic_inc(&midCount);
- temp->midState = MID_REQUEST_ALLOCATED;
+ temp->mid_state = MID_REQUEST_ALLOCATED;
return temp;
}
@@ -85,9 +85,9 @@ DeleteMidQEntry(struct mid_q_entry *midEntry)
#ifdef CONFIG_CIFS_STATS2
unsigned long now;
#endif
- midEntry->midState = MID_FREE;
+ midEntry->mid_state = MID_FREE;
atomic_dec(&midCount);
- if (midEntry->largeBuf)
+ if (midEntry->large_buf)
cifs_buf_release(midEntry->resp_buf);
else
cifs_small_buf_release(midEntry->resp_buf);
@@ -97,8 +97,8 @@ DeleteMidQEntry(struct mid_q_entry *midEntry)
something is wrong, unless it is quite a slow link or server */
if ((now - midEntry->when_alloc) > HZ) {
if ((cifsFYI & CIFS_TIMER) &&
- (midEntry->command != SMB_COM_LOCKING_ANDX)) {
- printk(KERN_DEBUG " CIFS slow rsp: cmd %d mid %d",
+ (midEntry->command != cpu_to_le16(SMB_COM_LOCKING_ANDX))) {
+ printk(KERN_DEBUG " CIFS slow rsp: cmd %d mid %llu",
midEntry->command, midEntry->mid);
printk(" A: 0x%lx S: 0x%lx R: 0x%lx\n",
now - midEntry->when_alloc,
@@ -126,11 +126,11 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
int rc = 0;
int i = 0;
struct msghdr smb_msg;
- struct smb_hdr *smb_buffer = iov[0].iov_base;
+ __be32 *buf_len = (__be32 *)(iov[0].iov_base);
unsigned int len = iov[0].iov_len;
unsigned int total_len;
int first_vec = 0;
- unsigned int smb_buf_length = be32_to_cpu(smb_buffer->smb_buf_length);
+ unsigned int smb_buf_length = get_rfc1002_length(iov[0].iov_base);
struct socket *ssocket = server->ssocket;
if (ssocket == NULL)
@@ -150,7 +150,7 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
total_len += iov[i].iov_len;
cFYI(1, "Sending smb: total_len %d", total_len);
- dump_smb(smb_buffer, len);
+ dump_smb(iov[0].iov_base, len);
i = 0;
while (total_len) {
@@ -158,24 +158,24 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
n_vec - first_vec, total_len);
if ((rc == -ENOSPC) || (rc == -EAGAIN)) {
i++;
- /* if blocking send we try 3 times, since each can block
- for 5 seconds. For nonblocking we have to try more
- but wait increasing amounts of time allowing time for
- socket to clear. The overall time we wait in either
- case to send on the socket is about 15 seconds.
- Similarly we wait for 15 seconds for
- a response from the server in SendReceive[2]
- for the server to send a response back for
- most types of requests (except SMB Write
- past end of file which can be slow, and
- blocking lock operations). NFS waits slightly longer
- than CIFS, but this can make it take longer for
- nonresponsive servers to be detected and 15 seconds
- is more than enough time for modern networks to
- send a packet. In most cases if we fail to send
- after the retries we will kill the socket and
- reconnect which may clear the network problem.
- */
+ /*
+ * If blocking send we try 3 times, since each can block
+ * for 5 seconds. For nonblocking we have to try more
+ * but wait increasing amounts of time allowing time for
+ * socket to clear. The overall time we wait in either
+ * case to send on the socket is about 15 seconds.
+ * Similarly we wait for 15 seconds for a response from
+ * the server in SendReceive[2] for the server to send
+ * a response back for most types of requests (except
+ * SMB Write past end of file which can be slow, and
+ * blocking lock operations). NFS waits slightly longer
+ * than CIFS, but this can make it take longer for
+ * nonresponsive servers to be detected and 15 seconds
+ * is more than enough time for modern networks to
+ * send a packet. In most cases if we fail to send
+ * after the retries we will kill the socket and
+ * reconnect which may clear the network problem.
+ */
if ((i >= 14) || (!server->noblocksnd && (i > 2))) {
cERROR(1, "sends on sock %p stuck for 15 seconds",
ssocket);
@@ -235,9 +235,8 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
else
rc = 0;
- /* Don't want to modify the buffer as a
- side effect of this call. */
- smb_buffer->smb_buf_length = cpu_to_be32(smb_buf_length);
+ /* Don't want to modify the buffer as a side effect of this call. */
+ *buf_len = cpu_to_be32(smb_buf_length);
return rc;
}
@@ -254,44 +253,60 @@ smb_send(struct TCP_Server_Info *server, struct smb_hdr *smb_buffer,
return smb_sendv(server, &iov, 1);
}
-static int wait_for_free_request(struct TCP_Server_Info *server,
- const int long_op)
+static int
+wait_for_free_credits(struct TCP_Server_Info *server, const int optype,
+ int *credits)
{
- if (long_op == CIFS_ASYNC_OP) {
+ int rc;
+
+ spin_lock(&server->req_lock);
+ if (optype == CIFS_ASYNC_OP) {
/* oplock breaks must not be held up */
- atomic_inc(&server->inFlight);
+ server->in_flight++;
+ *credits -= 1;
+ spin_unlock(&server->req_lock);
return 0;
}
- spin_lock(&GlobalMid_Lock);
while (1) {
- if (atomic_read(&server->inFlight) >= cifs_max_pending) {
- spin_unlock(&GlobalMid_Lock);
+ if (*credits <= 0) {
+ spin_unlock(&server->req_lock);
cifs_num_waiters_inc(server);
- wait_event(server->request_q,
- atomic_read(&server->inFlight)
- < cifs_max_pending);
+ rc = wait_event_killable(server->request_q,
+ has_credits(server, credits));
cifs_num_waiters_dec(server);
- spin_lock(&GlobalMid_Lock);
+ if (rc)
+ return rc;
+ spin_lock(&server->req_lock);
} else {
if (server->tcpStatus == CifsExiting) {
- spin_unlock(&GlobalMid_Lock);
+ spin_unlock(&server->req_lock);
return -ENOENT;
}
- /* can not count locking commands against total
- as they are allowed to block on server */
+ /*
+ * Can not count locking commands against total
+ * as they are allowed to block on server.
+ */
/* update # of requests on the wire to server */
- if (long_op != CIFS_BLOCKING_OP)
- atomic_inc(&server->inFlight);
- spin_unlock(&GlobalMid_Lock);
+ if (optype != CIFS_BLOCKING_OP) {
+ *credits -= 1;
+ server->in_flight++;
+ }
+ spin_unlock(&server->req_lock);
break;
}
}
return 0;
}
+static int
+wait_for_free_request(struct TCP_Server_Info *server, const int optype)
+{
+ return wait_for_free_credits(server, optype, get_credits_field(server));
+}
+
static int allocate_mid(struct cifs_ses *ses, struct smb_hdr *in_buf,
struct mid_q_entry **ppmidQ)
{
@@ -326,13 +341,40 @@ wait_for_response(struct TCP_Server_Info *server, struct mid_q_entry *midQ)
int error;
error = wait_event_freezekillable(server->response_q,
- midQ->midState != MID_REQUEST_SUBMITTED);
+ midQ->mid_state != MID_REQUEST_SUBMITTED);
if (error < 0)
return -ERESTARTSYS;
return 0;
}
+static int
+cifs_setup_async_request(struct TCP_Server_Info *server, struct kvec *iov,
+ unsigned int nvec, struct mid_q_entry **ret_mid)
+{
+ int rc;
+ struct smb_hdr *hdr = (struct smb_hdr *)iov[0].iov_base;
+ struct mid_q_entry *mid;
+
+ /* enable signing if server requires it */
+ if (server->sec_mode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
+ hdr->Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
+
+ mid = AllocMidQEntry(hdr, server);
+ if (mid == NULL)
+ return -ENOMEM;
+
+ /* put it on the pending_mid_q */
+ spin_lock(&GlobalMid_Lock);
+ list_add_tail(&mid->qhead, &server->pending_mid_q);
+ spin_unlock(&GlobalMid_Lock);
+
+ rc = cifs_sign_smb2(iov, nvec, server, &mid->sequence_number);
+ if (rc)
+ delete_mid(mid);
+ *ret_mid = mid;
+ return rc;
+}
/*
* Send a SMB request and set the callback function in the mid to handle
@@ -345,40 +387,24 @@ cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov,
{
int rc;
struct mid_q_entry *mid;
- struct smb_hdr *hdr = (struct smb_hdr *)iov[0].iov_base;
rc = wait_for_free_request(server, ignore_pend ? CIFS_ASYNC_OP : 0);
if (rc)
return rc;
- /* enable signing if server requires it */
- if (server->sec_mode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
- hdr->Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
-
mutex_lock(&server->srv_mutex);
- mid = AllocMidQEntry(hdr, server);
- if (mid == NULL) {
- mutex_unlock(&server->srv_mutex);
- atomic_dec(&server->inFlight);
- wake_up(&server->request_q);
- return -ENOMEM;
- }
-
- /* put it on the pending_mid_q */
- spin_lock(&GlobalMid_Lock);
- list_add_tail(&mid->qhead, &server->pending_mid_q);
- spin_unlock(&GlobalMid_Lock);
-
- rc = cifs_sign_smb2(iov, nvec, server, &mid->sequence_number);
+ rc = cifs_setup_async_request(server, iov, nvec, &mid);
if (rc) {
mutex_unlock(&server->srv_mutex);
- goto out_err;
+ cifs_add_credits(server, 1);
+ wake_up(&server->request_q);
+ return rc;
}
mid->receive = receive;
mid->callback = callback;
mid->callback_data = cbdata;
- mid->midState = MID_REQUEST_SUBMITTED;
+ mid->mid_state = MID_REQUEST_SUBMITTED;
cifs_in_send_inc(server);
rc = smb_sendv(server, iov, nvec);
@@ -392,7 +418,7 @@ cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov,
return rc;
out_err:
delete_mid(mid);
- atomic_dec(&server->inFlight);
+ cifs_add_credits(server, 1);
wake_up(&server->request_q);
return rc;
}
@@ -408,14 +434,14 @@ out_err:
*/
int
SendReceiveNoRsp(const unsigned int xid, struct cifs_ses *ses,
- struct smb_hdr *in_buf, int flags)
+ char *in_buf, int flags)
{
int rc;
struct kvec iov[1];
int resp_buf_type;
- iov[0].iov_base = (char *)in_buf;
- iov[0].iov_len = be32_to_cpu(in_buf->smb_buf_length) + 4;
+ iov[0].iov_base = in_buf;
+ iov[0].iov_len = get_rfc1002_length(in_buf) + 4;
flags |= CIFS_NO_RESP;
rc = SendReceive2(xid, ses, iov, 1, &resp_buf_type, flags);
cFYI(DBG2, "SendRcvNoRsp flags %d rc %d", flags, rc);
@@ -428,11 +454,11 @@ cifs_sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server)
{
int rc = 0;
- cFYI(1, "%s: cmd=%d mid=%d state=%d", __func__, mid->command,
- mid->mid, mid->midState);
+ cFYI(1, "%s: cmd=%d mid=%llu state=%d", __func__,
+ le16_to_cpu(mid->command), mid->mid, mid->mid_state);
spin_lock(&GlobalMid_Lock);
- switch (mid->midState) {
+ switch (mid->mid_state) {
case MID_RESPONSE_RECEIVED:
spin_unlock(&GlobalMid_Lock);
return rc;
@@ -447,8 +473,8 @@ cifs_sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server)
break;
default:
list_del_init(&mid->qhead);
- cERROR(1, "%s: invalid mid state mid=%d state=%d", __func__,
- mid->mid, mid->midState);
+ cERROR(1, "%s: invalid mid state mid=%llu state=%d", __func__,
+ mid->mid, mid->mid_state);
rc = -EIO;
}
spin_unlock(&GlobalMid_Lock);
@@ -498,7 +524,7 @@ int
cifs_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server,
bool log_error)
{
- unsigned int len = be32_to_cpu(mid->resp_buf->smb_buf_length) + 4;
+ unsigned int len = get_rfc1002_length(mid->resp_buf) + 4;
dump_smb(mid->resp_buf, min_t(u32, 92, len));
@@ -518,6 +544,24 @@ cifs_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server,
return map_smb_to_linux_error(mid->resp_buf, log_error);
}
+static int
+cifs_setup_request(struct cifs_ses *ses, struct kvec *iov,
+ unsigned int nvec, struct mid_q_entry **ret_mid)
+{
+ int rc;
+ struct smb_hdr *hdr = (struct smb_hdr *)iov[0].iov_base;
+ struct mid_q_entry *mid;
+
+ rc = allocate_mid(ses, hdr, &mid);
+ if (rc)
+ return rc;
+ rc = cifs_sign_smb2(iov, nvec, ses->server, &mid->sequence_number);
+ if (rc)
+ delete_mid(mid);
+ *ret_mid = mid;
+ return rc;
+}
+
int
SendReceive2(const unsigned int xid, struct cifs_ses *ses,
struct kvec *iov, int n_vec, int *pRespBufType /* ret */,
@@ -526,56 +570,53 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses,
int rc = 0;
int long_op;
struct mid_q_entry *midQ;
- struct smb_hdr *in_buf = iov[0].iov_base;
+ char *buf = iov[0].iov_base;
long_op = flags & CIFS_TIMEOUT_MASK;
*pRespBufType = CIFS_NO_BUFFER; /* no response buf yet */
if ((ses == NULL) || (ses->server == NULL)) {
- cifs_small_buf_release(in_buf);
+ cifs_small_buf_release(buf);
cERROR(1, "Null session");
return -EIO;
}
if (ses->server->tcpStatus == CifsExiting) {
- cifs_small_buf_release(in_buf);
+ cifs_small_buf_release(buf);
return -ENOENT;
}
- /* Ensure that we do not send more than 50 overlapping requests
- to the same server. We may make this configurable later or
- use ses->maxReq */
+ /*
+ * Ensure that we do not send more than 50 overlapping requests
+ * to the same server. We may make this configurable later or
+ * use ses->maxReq.
+ */
rc = wait_for_free_request(ses->server, long_op);
if (rc) {
- cifs_small_buf_release(in_buf);
+ cifs_small_buf_release(buf);
return rc;
}
- /* make sure that we sign in the same order that we send on this socket
- and avoid races inside tcp sendmsg code that could cause corruption
- of smb data */
+ /*
+ * Make sure that we sign in the same order that we send on this socket
+ * and avoid races inside tcp sendmsg code that could cause corruption
+ * of smb data.
+ */
mutex_lock(&ses->server->srv_mutex);
- rc = allocate_mid(ses, in_buf, &midQ);
+ rc = cifs_setup_request(ses, iov, n_vec, &midQ);
if (rc) {
mutex_unlock(&ses->server->srv_mutex);
- cifs_small_buf_release(in_buf);
+ cifs_small_buf_release(buf);
/* Update # of requests on wire to server */
- atomic_dec(&ses->server->inFlight);
- wake_up(&ses->server->request_q);
+ cifs_add_credits(ses->server, 1);
return rc;
}
- rc = cifs_sign_smb2(iov, n_vec, ses->server, &midQ->sequence_number);
- if (rc) {
- mutex_unlock(&ses->server->srv_mutex);
- cifs_small_buf_release(in_buf);
- goto out;
- }
- midQ->midState = MID_REQUEST_SUBMITTED;
+ midQ->mid_state = MID_REQUEST_SUBMITTED;
cifs_in_send_inc(ses->server);
rc = smb_sendv(ses->server, iov, n_vec);
cifs_in_send_dec(ses->server);
@@ -584,48 +625,47 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses,
mutex_unlock(&ses->server->srv_mutex);
if (rc < 0) {
- cifs_small_buf_release(in_buf);
+ cifs_small_buf_release(buf);
goto out;
}
if (long_op == CIFS_ASYNC_OP) {
- cifs_small_buf_release(in_buf);
+ cifs_small_buf_release(buf);
goto out;
}
rc = wait_for_response(ses->server, midQ);
if (rc != 0) {
- send_nt_cancel(ses->server, in_buf, midQ);
+ send_nt_cancel(ses->server, (struct smb_hdr *)buf, midQ);
spin_lock(&GlobalMid_Lock);
- if (midQ->midState == MID_REQUEST_SUBMITTED) {
+ if (midQ->mid_state == MID_REQUEST_SUBMITTED) {
midQ->callback = DeleteMidQEntry;
spin_unlock(&GlobalMid_Lock);
- cifs_small_buf_release(in_buf);
- atomic_dec(&ses->server->inFlight);
- wake_up(&ses->server->request_q);
+ cifs_small_buf_release(buf);
+ cifs_add_credits(ses->server, 1);
return rc;
}
spin_unlock(&GlobalMid_Lock);
}
- cifs_small_buf_release(in_buf);
+ cifs_small_buf_release(buf);
rc = cifs_sync_mid_result(midQ, ses->server);
if (rc != 0) {
- atomic_dec(&ses->server->inFlight);
- wake_up(&ses->server->request_q);
+ cifs_add_credits(ses->server, 1);
return rc;
}
- if (!midQ->resp_buf || midQ->midState != MID_RESPONSE_RECEIVED) {
+ if (!midQ->resp_buf || midQ->mid_state != MID_RESPONSE_RECEIVED) {
rc = -EIO;
cFYI(1, "Bad MID state?");
goto out;
}
- iov[0].iov_base = (char *)midQ->resp_buf;
- iov[0].iov_len = be32_to_cpu(midQ->resp_buf->smb_buf_length) + 4;
- if (midQ->largeBuf)
+ buf = (char *)midQ->resp_buf;
+ iov[0].iov_base = buf;
+ iov[0].iov_len = get_rfc1002_length(buf) + 4;
+ if (midQ->large_buf)
*pRespBufType = CIFS_LARGE_BUFFER;
else
*pRespBufType = CIFS_SMALL_BUFFER;
@@ -637,8 +677,7 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses,
midQ->resp_buf = NULL;
out:
delete_mid(midQ);
- atomic_dec(&ses->server->inFlight);
- wake_up(&ses->server->request_q);
+ cifs_add_credits(ses->server, 1);
return rc;
}
@@ -688,8 +727,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
if (rc) {
mutex_unlock(&ses->server->srv_mutex);
/* Update # of requests on wire to server */
- atomic_dec(&ses->server->inFlight);
- wake_up(&ses->server->request_q);
+ cifs_add_credits(ses->server, 1);
return rc;
}
@@ -699,7 +737,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
goto out;
}
- midQ->midState = MID_REQUEST_SUBMITTED;
+ midQ->mid_state = MID_REQUEST_SUBMITTED;
cifs_in_send_inc(ses->server);
rc = smb_send(ses->server, in_buf, be32_to_cpu(in_buf->smb_buf_length));
@@ -717,12 +755,11 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
if (rc != 0) {
send_nt_cancel(ses->server, in_buf, midQ);
spin_lock(&GlobalMid_Lock);
- if (midQ->midState == MID_REQUEST_SUBMITTED) {
+ if (midQ->mid_state == MID_REQUEST_SUBMITTED) {
/* no longer considered to be "in-flight" */
midQ->callback = DeleteMidQEntry;
spin_unlock(&GlobalMid_Lock);
- atomic_dec(&ses->server->inFlight);
- wake_up(&ses->server->request_q);
+ cifs_add_credits(ses->server, 1);
return rc;
}
spin_unlock(&GlobalMid_Lock);
@@ -730,25 +767,23 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
rc = cifs_sync_mid_result(midQ, ses->server);
if (rc != 0) {
- atomic_dec(&ses->server->inFlight);
- wake_up(&ses->server->request_q);
+ cifs_add_credits(ses->server, 1);
return rc;
}
if (!midQ->resp_buf || !out_buf ||
- midQ->midState != MID_RESPONSE_RECEIVED) {
+ midQ->mid_state != MID_RESPONSE_RECEIVED) {
rc = -EIO;
cERROR(1, "Bad MID state?");
goto out;
}
- *pbytes_returned = be32_to_cpu(midQ->resp_buf->smb_buf_length);
+ *pbytes_returned = get_rfc1002_length(midQ->resp_buf);
memcpy(out_buf, midQ->resp_buf, *pbytes_returned + 4);
rc = cifs_check_receive(midQ, ses->server, 0);
out:
delete_mid(midQ);
- atomic_dec(&ses->server->inFlight);
- wake_up(&ses->server->request_q);
+ cifs_add_credits(ses->server, 1);
return rc;
}
@@ -836,7 +871,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
return rc;
}
- midQ->midState = MID_REQUEST_SUBMITTED;
+ midQ->mid_state = MID_REQUEST_SUBMITTED;
cifs_in_send_inc(ses->server);
rc = smb_send(ses->server, in_buf, be32_to_cpu(in_buf->smb_buf_length));
cifs_in_send_dec(ses->server);
@@ -850,13 +885,13 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
/* Wait for a reply - allow signals to interrupt. */
rc = wait_event_interruptible(ses->server->response_q,
- (!(midQ->midState == MID_REQUEST_SUBMITTED)) ||
+ (!(midQ->mid_state == MID_REQUEST_SUBMITTED)) ||
((ses->server->tcpStatus != CifsGood) &&
(ses->server->tcpStatus != CifsNew)));
/* Were we interrupted by a signal ? */
if ((rc == -ERESTARTSYS) &&
- (midQ->midState == MID_REQUEST_SUBMITTED) &&
+ (midQ->mid_state == MID_REQUEST_SUBMITTED) &&
((ses->server->tcpStatus == CifsGood) ||
(ses->server->tcpStatus == CifsNew))) {
@@ -886,7 +921,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
if (rc) {
send_nt_cancel(ses->server, in_buf, midQ);
spin_lock(&GlobalMid_Lock);
- if (midQ->midState == MID_REQUEST_SUBMITTED) {
+ if (midQ->mid_state == MID_REQUEST_SUBMITTED) {
/* no longer considered to be "in-flight" */
midQ->callback = DeleteMidQEntry;
spin_unlock(&GlobalMid_Lock);
@@ -904,13 +939,13 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
return rc;
/* rcvd frame is ok */
- if (out_buf == NULL || midQ->midState != MID_RESPONSE_RECEIVED) {
+ if (out_buf == NULL || midQ->mid_state != MID_RESPONSE_RECEIVED) {
rc = -EIO;
cERROR(1, "Bad MID state?");
goto out;
}
- *pbytes_returned = be32_to_cpu(midQ->resp_buf->smb_buf_length);
+ *pbytes_returned = get_rfc1002_length(midQ->resp_buf);
memcpy(out_buf, midQ->resp_buf, *pbytes_returned + 4);
rc = cifs_check_receive(midQ, ses->server, 0);
out:
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index 45f07c46f3e..10d92cf57ab 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -105,7 +105,6 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
struct cifs_tcon *pTcon;
struct super_block *sb;
char *full_path;
- struct cifs_ntsd *pacl;
if (direntry == NULL)
return -EIO;
@@ -164,23 +163,24 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
} else if (strncmp(ea_name, CIFS_XATTR_CIFS_ACL,
strlen(CIFS_XATTR_CIFS_ACL)) == 0) {
+#ifdef CONFIG_CIFS_ACL
+ struct cifs_ntsd *pacl;
pacl = kmalloc(value_size, GFP_KERNEL);
if (!pacl) {
cFYI(1, "%s: Can't allocate memory for ACL",
__func__);
rc = -ENOMEM;
} else {
-#ifdef CONFIG_CIFS_ACL
memcpy(pacl, ea_value, value_size);
rc = set_cifs_acl(pacl, value_size,
direntry->d_inode, full_path, CIFS_ACL_DACL);
if (rc == 0) /* force revalidate of the inode */
CIFS_I(direntry->d_inode)->time = 0;
kfree(pacl);
+ }
#else
cFYI(1, "Set CIFS ACL not supported yet");
#endif /* CONFIG_CIFS_ACL */
- }
} else {
int temp;
temp = strncmp(ea_name, POSIX_ACL_XATTR_ACCESS,
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 5e2e1b3f068..2870597b5c9 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -21,7 +21,6 @@
#include <linux/vfs.h>
#include <linux/slab.h>
-#include <asm/system.h>
#include <asm/uaccess.h>
#include <linux/fs.h>
@@ -208,13 +207,12 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
if (IS_ERR(root)) {
error = PTR_ERR(root);
printk("Failure of coda_cnode_make for root: error %d\n", error);
- root = NULL;
goto error;
}
printk("coda_read_super: rootinode is %ld dev %s\n",
root->i_ino, root->i_sb->s_id);
- sb->s_root = d_alloc_root(root);
+ sb->s_root = d_make_root(root);
if (!sb->s_root) {
error = -EINVAL;
goto error;
@@ -222,9 +220,6 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
return 0;
error:
- if (root)
- iput(root);
-
mutex_lock(&vc->vc_mutex);
bdi_destroy(&vc->bdi);
vc->vc_sb = NULL;
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index 8f616e0e252..761d5b31b18 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -38,7 +38,6 @@
#include <linux/mutex.h>
#include <linux/device.h>
#include <asm/io.h>
-#include <asm/system.h>
#include <asm/poll.h>
#include <asm/uaccess.h>
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index 9727e0c5257..0c68fd31fbf 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -14,7 +14,6 @@
* improvements to the Coda project. Contact Peter Braam <coda@cs.cmu.edu>.
*/
-#include <asm/system.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/types.h>
diff --git a/fs/compat.c b/fs/compat.c
index fa9d721ecfe..f2944ace7a7 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -33,7 +33,6 @@
#include <linux/nfs4_mount.h>
#include <linux/syscalls.h>
#include <linux/ctype.h>
-#include <linux/module.h>
#include <linux/dirent.h>
#include <linux/fsnotify.h>
#include <linux/highuid.h>
@@ -131,41 +130,35 @@ asmlinkage long compat_sys_utimes(const char __user *filename, struct compat_tim
static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf)
{
- compat_ino_t ino = stat->ino;
- typeof(ubuf->st_uid) uid = 0;
- typeof(ubuf->st_gid) gid = 0;
- int err;
+ struct compat_stat tmp;
- SET_UID(uid, stat->uid);
- SET_GID(gid, stat->gid);
+ if (!old_valid_dev(stat->dev) || !old_valid_dev(stat->rdev))
+ return -EOVERFLOW;
- if ((u64) stat->size > MAX_NON_LFS ||
- !old_valid_dev(stat->dev) ||
- !old_valid_dev(stat->rdev))
+ memset(&tmp, 0, sizeof(tmp));
+ tmp.st_dev = old_encode_dev(stat->dev);
+ tmp.st_ino = stat->ino;
+ if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino)
return -EOVERFLOW;
- if (sizeof(ino) < sizeof(stat->ino) && ino != stat->ino)
+ tmp.st_mode = stat->mode;
+ tmp.st_nlink = stat->nlink;
+ if (tmp.st_nlink != stat->nlink)
return -EOVERFLOW;
-
- if (clear_user(ubuf, sizeof(*ubuf)))
- return -EFAULT;
-
- err = __put_user(old_encode_dev(stat->dev), &ubuf->st_dev);
- err |= __put_user(ino, &ubuf->st_ino);
- err |= __put_user(stat->mode, &ubuf->st_mode);
- err |= __put_user(stat->nlink, &ubuf->st_nlink);
- err |= __put_user(uid, &ubuf->st_uid);
- err |= __put_user(gid, &ubuf->st_gid);
- err |= __put_user(old_encode_dev(stat->rdev), &ubuf->st_rdev);
- err |= __put_user(stat->size, &ubuf->st_size);
- err |= __put_user(stat->atime.tv_sec, &ubuf->st_atime);
- err |= __put_user(stat->atime.tv_nsec, &ubuf->st_atime_nsec);
- err |= __put_user(stat->mtime.tv_sec, &ubuf->st_mtime);
- err |= __put_user(stat->mtime.tv_nsec, &ubuf->st_mtime_nsec);
- err |= __put_user(stat->ctime.tv_sec, &ubuf->st_ctime);
- err |= __put_user(stat->ctime.tv_nsec, &ubuf->st_ctime_nsec);
- err |= __put_user(stat->blksize, &ubuf->st_blksize);
- err |= __put_user(stat->blocks, &ubuf->st_blocks);
- return err;
+ SET_UID(tmp.st_uid, stat->uid);
+ SET_GID(tmp.st_gid, stat->gid);
+ tmp.st_rdev = old_encode_dev(stat->rdev);
+ if ((u64) stat->size > MAX_NON_LFS)
+ return -EOVERFLOW;
+ tmp.st_size = stat->size;
+ tmp.st_atime = stat->atime.tv_sec;
+ tmp.st_atime_nsec = stat->atime.tv_nsec;
+ tmp.st_mtime = stat->mtime.tv_sec;
+ tmp.st_mtime_nsec = stat->mtime.tv_nsec;
+ tmp.st_ctime = stat->ctime.tv_sec;
+ tmp.st_ctime_nsec = stat->ctime.tv_nsec;
+ tmp.st_blocks = stat->blocks;
+ tmp.st_blksize = stat->blksize;
+ return copy_to_user(ubuf, &tmp, sizeof(tmp)) ? -EFAULT : 0;
}
asmlinkage long compat_sys_newstat(const char __user * filename,
@@ -1177,10 +1170,9 @@ compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec,
}
asmlinkage ssize_t
-compat_sys_preadv(unsigned long fd, const struct compat_iovec __user *vec,
- unsigned long vlen, u32 pos_low, u32 pos_high)
+compat_sys_preadv64(unsigned long fd, const struct compat_iovec __user *vec,
+ unsigned long vlen, loff_t pos)
{
- loff_t pos = ((loff_t)pos_high << 32) | pos_low;
struct file *file;
int fput_needed;
ssize_t ret;
@@ -1197,6 +1189,14 @@ compat_sys_preadv(unsigned long fd, const struct compat_iovec __user *vec,
return ret;
}
+asmlinkage ssize_t
+compat_sys_preadv(unsigned long fd, const struct compat_iovec __user *vec,
+ unsigned long vlen, u32 pos_low, u32 pos_high)
+{
+ loff_t pos = ((loff_t)pos_high << 32) | pos_low;
+ return compat_sys_preadv64(fd, vec, vlen, pos);
+}
+
static size_t compat_writev(struct file *file,
const struct compat_iovec __user *vec,
unsigned long vlen, loff_t *pos)
@@ -1236,10 +1236,9 @@ compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec,
}
asmlinkage ssize_t
-compat_sys_pwritev(unsigned long fd, const struct compat_iovec __user *vec,
- unsigned long vlen, u32 pos_low, u32 pos_high)
+compat_sys_pwritev64(unsigned long fd, const struct compat_iovec __user *vec,
+ unsigned long vlen, loff_t pos)
{
- loff_t pos = ((loff_t)pos_high << 32) | pos_low;
struct file *file;
int fput_needed;
ssize_t ret;
@@ -1256,6 +1255,14 @@ compat_sys_pwritev(unsigned long fd, const struct compat_iovec __user *vec,
return ret;
}
+asmlinkage ssize_t
+compat_sys_pwritev(unsigned long fd, const struct compat_iovec __user *vec,
+ unsigned long vlen, u32 pos_low, u32 pos_high)
+{
+ loff_t pos = ((loff_t)pos_high << 32) | pos_low;
+ return compat_sys_pwritev64(fd, vec, vlen, pos);
+}
+
asmlinkage long
compat_sys_vmsplice(int fd, const struct compat_iovec __user *iov32,
unsigned int nr_segs, unsigned int flags)
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index a10e428b32b..debdfe0fc80 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -34,7 +34,7 @@
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/ppp_defs.h>
-#include <linux/if_ppp.h>
+#include <linux/ppp-ioctl.h>
#include <linux/if_pppox.h>
#include <linux/mtio.h>
#include <linux/auto_fs.h>
@@ -49,7 +49,6 @@
#include <linux/elevator.h>
#include <linux/rtc.h>
#include <linux/pci.h>
-#include <linux/module.h>
#include <linux/serial.h>
#include <linux/if_tun.h>
#include <linux/ctype.h>
@@ -105,6 +104,7 @@
#include <linux/hiddev.h>
+#define __DVB_CORE__
#include <linux/dvb/audio.h>
#include <linux/dvb/dmx.h>
#include <linux/dvb/frontend.h>
diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h
index ede857d20a0..b5f0a3b91f1 100644
--- a/fs/configfs/configfs_internal.h
+++ b/fs/configfs/configfs_internal.h
@@ -58,12 +58,11 @@ struct configfs_dirent {
extern struct mutex configfs_symlink_mutex;
extern spinlock_t configfs_dirent_lock;
-extern struct vfsmount * configfs_mount;
extern struct kmem_cache *configfs_dir_cachep;
extern int configfs_is_root(struct config_item *item);
-extern struct inode * configfs_new_inode(umode_t mode, struct configfs_dirent *);
+extern struct inode * configfs_new_inode(umode_t mode, struct configfs_dirent *, struct super_block *);
extern int configfs_create(struct dentry *, umode_t mode, int (*init)(struct inode *));
extern int configfs_inode_init(void);
extern void configfs_inode_exit(void);
@@ -80,15 +79,15 @@ extern const unsigned char * configfs_get_name(struct configfs_dirent *sd);
extern void configfs_drop_dentry(struct configfs_dirent *sd, struct dentry *parent);
extern int configfs_setattr(struct dentry *dentry, struct iattr *iattr);
-extern int configfs_pin_fs(void);
+extern struct dentry *configfs_pin_fs(void);
extern void configfs_release_fs(void);
extern struct rw_semaphore configfs_rename_sem;
-extern struct super_block * configfs_sb;
extern const struct file_operations configfs_dir_operations;
extern const struct file_operations configfs_file_operations;
extern const struct file_operations bin_fops;
extern const struct inode_operations configfs_dir_inode_operations;
+extern const struct inode_operations configfs_root_inode_operations;
extern const struct inode_operations configfs_symlink_inode_operations;
extern const struct dentry_operations configfs_dentry_ops;
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 5ddd7ebd9dc..7e6c52d8a20 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -264,11 +264,13 @@ static int init_symlink(struct inode * inode)
return 0;
}
-static int create_dir(struct config_item * k, struct dentry * p,
- struct dentry * d)
+static int create_dir(struct config_item *k, struct dentry *d)
{
int error;
umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO;
+ struct dentry *p = d->d_parent;
+
+ BUG_ON(!k);
error = configfs_dirent_exists(p->d_fsdata, d->d_name.name);
if (!error)
@@ -304,19 +306,7 @@ static int create_dir(struct config_item * k, struct dentry * p,
static int configfs_create_dir(struct config_item * item, struct dentry *dentry)
{
- struct dentry * parent;
- int error = 0;
-
- BUG_ON(!item);
-
- if (item->ci_parent)
- parent = item->ci_parent->ci_dentry;
- else if (configfs_mount)
- parent = configfs_mount->mnt_root;
- else
- return -EFAULT;
-
- error = create_dir(item,parent,dentry);
+ int error = create_dir(item, dentry);
if (!error)
item->ci_dentry = dentry;
return error;
@@ -1079,23 +1069,24 @@ int configfs_depend_item(struct configfs_subsystem *subsys,
int ret;
struct configfs_dirent *p, *root_sd, *subsys_sd = NULL;
struct config_item *s_item = &subsys->su_group.cg_item;
+ struct dentry *root;
/*
* Pin the configfs filesystem. This means we can safely access
* the root of the configfs filesystem.
*/
- ret = configfs_pin_fs();
- if (ret)
- return ret;
+ root = configfs_pin_fs();
+ if (IS_ERR(root))
+ return PTR_ERR(root);
/*
* Next, lock the root directory. We're going to check that the
* subsystem is really registered, and so we need to lock out
* configfs_[un]register_subsystem().
*/
- mutex_lock(&configfs_sb->s_root->d_inode->i_mutex);
+ mutex_lock(&root->d_inode->i_mutex);
- root_sd = configfs_sb->s_root->d_fsdata;
+ root_sd = root->d_fsdata;
list_for_each_entry(p, &root_sd->s_children, s_sibling) {
if (p->s_type & CONFIGFS_DIR) {
@@ -1129,7 +1120,7 @@ int configfs_depend_item(struct configfs_subsystem *subsys,
out_unlock_dirent_lock:
spin_unlock(&configfs_dirent_lock);
out_unlock_fs:
- mutex_unlock(&configfs_sb->s_root->d_inode->i_mutex);
+ mutex_unlock(&root->d_inode->i_mutex);
/*
* If we succeeded, the fs is pinned via other methods. If not,
@@ -1183,11 +1174,6 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode
struct module *subsys_owner = NULL, *new_item_owner = NULL;
char *name;
- if (dentry->d_parent == configfs_sb->s_root) {
- ret = -EPERM;
- goto out;
- }
-
sd = dentry->d_parent->d_fsdata;
/*
@@ -1359,9 +1345,6 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
struct module *subsys_owner = NULL, *dead_item_owner = NULL;
int ret;
- if (dentry->d_parent == configfs_sb->s_root)
- return -EPERM;
-
sd = dentry->d_fsdata;
if (sd->s_type & CONFIGFS_USET_DEFAULT)
return -EPERM;
@@ -1459,6 +1442,11 @@ const struct inode_operations configfs_dir_inode_operations = {
.setattr = configfs_setattr,
};
+const struct inode_operations configfs_root_inode_operations = {
+ .lookup = configfs_lookup,
+ .setattr = configfs_setattr,
+};
+
#if 0
int configfs_rename_dir(struct config_item * item, const char *new_name)
{
@@ -1546,6 +1534,7 @@ static inline unsigned char dt_type(struct configfs_dirent *sd)
static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
{
struct dentry *dentry = filp->f_path.dentry;
+ struct super_block *sb = dentry->d_sb;
struct configfs_dirent * parent_sd = dentry->d_fsdata;
struct configfs_dirent *cursor = filp->private_data;
struct list_head *p, *q = &cursor->s_sibling;
@@ -1608,7 +1597,7 @@ static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir
ino = inode->i_ino;
spin_unlock(&configfs_dirent_lock);
if (!inode)
- ino = iunique(configfs_sb, 2);
+ ino = iunique(sb, 2);
if (filldir(dirent, name, len, filp->f_pos, ino,
dt_type(next)) < 0)
@@ -1680,27 +1669,27 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys)
struct config_group *group = &subsys->su_group;
struct qstr name;
struct dentry *dentry;
+ struct dentry *root;
struct configfs_dirent *sd;
- err = configfs_pin_fs();
- if (err)
- return err;
+ root = configfs_pin_fs();
+ if (IS_ERR(root))
+ return PTR_ERR(root);
if (!group->cg_item.ci_name)
group->cg_item.ci_name = group->cg_item.ci_namebuf;
- sd = configfs_sb->s_root->d_fsdata;
+ sd = root->d_fsdata;
link_group(to_config_group(sd->s_element), group);
- mutex_lock_nested(&configfs_sb->s_root->d_inode->i_mutex,
- I_MUTEX_PARENT);
+ mutex_lock_nested(&root->d_inode->i_mutex, I_MUTEX_PARENT);
name.name = group->cg_item.ci_name;
name.len = strlen(name.name);
name.hash = full_name_hash(name.name, name.len);
err = -ENOMEM;
- dentry = d_alloc(configfs_sb->s_root, &name);
+ dentry = d_alloc(root, &name);
if (dentry) {
d_add(dentry, NULL);
@@ -1717,7 +1706,7 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys)
}
}
- mutex_unlock(&configfs_sb->s_root->d_inode->i_mutex);
+ mutex_unlock(&root->d_inode->i_mutex);
if (err) {
unlink_group(group);
@@ -1731,13 +1720,14 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys)
{
struct config_group *group = &subsys->su_group;
struct dentry *dentry = group->cg_item.ci_dentry;
+ struct dentry *root = dentry->d_sb->s_root;
- if (dentry->d_parent != configfs_sb->s_root) {
+ if (dentry->d_parent != root) {
printk(KERN_ERR "configfs: Tried to unregister non-subsystem!\n");
return;
}
- mutex_lock_nested(&configfs_sb->s_root->d_inode->i_mutex,
+ mutex_lock_nested(&root->d_inode->i_mutex,
I_MUTEX_PARENT);
mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD);
mutex_lock(&configfs_symlink_mutex);
@@ -1754,7 +1744,7 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys)
d_delete(dentry);
- mutex_unlock(&configfs_sb->s_root->d_inode->i_mutex);
+ mutex_unlock(&root->d_inode->i_mutex);
dput(dentry);
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 3ee36d41886..0074362d9f7 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -44,8 +44,6 @@
static struct lock_class_key default_group_class[MAX_LOCK_DEPTH];
#endif
-extern struct super_block * configfs_sb;
-
static const struct address_space_operations configfs_aops = {
.readpage = simple_readpage,
.write_begin = simple_write_begin,
@@ -132,9 +130,10 @@ static inline void set_inode_attr(struct inode * inode, struct iattr * iattr)
inode->i_ctime = iattr->ia_ctime;
}
-struct inode *configfs_new_inode(umode_t mode, struct configfs_dirent * sd)
+struct inode *configfs_new_inode(umode_t mode, struct configfs_dirent *sd,
+ struct super_block *s)
{
- struct inode * inode = new_inode(configfs_sb);
+ struct inode * inode = new_inode(s);
if (inode) {
inode->i_ino = get_next_ino();
inode->i_mapping->a_ops = &configfs_aops;
@@ -188,36 +187,35 @@ static void configfs_set_inode_lock_class(struct configfs_dirent *sd,
int configfs_create(struct dentry * dentry, umode_t mode, int (*init)(struct inode *))
{
int error = 0;
- struct inode * inode = NULL;
- if (dentry) {
- if (!dentry->d_inode) {
- struct configfs_dirent *sd = dentry->d_fsdata;
- if ((inode = configfs_new_inode(mode, sd))) {
- if (dentry->d_parent && dentry->d_parent->d_inode) {
- struct inode *p_inode = dentry->d_parent->d_inode;
- p_inode->i_mtime = p_inode->i_ctime = CURRENT_TIME;
- }
- configfs_set_inode_lock_class(sd, inode);
- goto Proceed;
- }
- else
- error = -ENOMEM;
- } else
- error = -EEXIST;
- } else
- error = -ENOENT;
- goto Done;
+ struct inode *inode = NULL;
+ struct configfs_dirent *sd;
+ struct inode *p_inode;
+
+ if (!dentry)
+ return -ENOENT;
+
+ if (dentry->d_inode)
+ return -EEXIST;
- Proceed:
- if (init)
+ sd = dentry->d_fsdata;
+ inode = configfs_new_inode(mode, sd, dentry->d_sb);
+ if (!inode)
+ return -ENOMEM;
+
+ p_inode = dentry->d_parent->d_inode;
+ p_inode->i_mtime = p_inode->i_ctime = CURRENT_TIME;
+ configfs_set_inode_lock_class(sd, inode);
+
+ if (init) {
error = init(inode);
- if (!error) {
- d_instantiate(dentry, inode);
- if (S_ISDIR(mode) || S_ISLNK(mode))
- dget(dentry); /* pin link and directory dentries in core */
- } else
- iput(inode);
- Done:
+ if (error) {
+ iput(inode);
+ return error;
+ }
+ }
+ d_instantiate(dentry, inode);
+ if (S_ISDIR(mode) || S_ISLNK(mode))
+ dget(dentry); /* pin link and directory dentries in core */
return error;
}
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index 276e15cafd5..aee0a7ebbd8 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -37,8 +37,7 @@
/* Random magic number */
#define CONFIGFS_MAGIC 0x62656570
-struct vfsmount * configfs_mount = NULL;
-struct super_block * configfs_sb = NULL;
+static struct vfsmount *configfs_mount = NULL;
struct kmem_cache *configfs_dir_cachep;
static int configfs_mnt_count = 0;
@@ -77,12 +76,11 @@ static int configfs_fill_super(struct super_block *sb, void *data, int silent)
sb->s_magic = CONFIGFS_MAGIC;
sb->s_op = &configfs_ops;
sb->s_time_gran = 1;
- configfs_sb = sb;
inode = configfs_new_inode(S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO,
- &configfs_root);
+ &configfs_root, sb);
if (inode) {
- inode->i_op = &configfs_dir_inode_operations;
+ inode->i_op = &configfs_root_inode_operations;
inode->i_fop = &configfs_dir_operations;
/* directory inodes start off with i_nlink == 2 (for "." entry) */
inc_nlink(inode);
@@ -91,10 +89,9 @@ static int configfs_fill_super(struct super_block *sb, void *data, int silent)
return -ENOMEM;
}
- root = d_alloc_root(inode);
+ root = d_make_root(inode);
if (!root) {
pr_debug("%s: could not get root dentry!\n",__func__);
- iput(inode);
return -ENOMEM;
}
config_group_init(&configfs_root_group);
@@ -118,10 +115,11 @@ static struct file_system_type configfs_fs_type = {
.kill_sb = kill_litter_super,
};
-int configfs_pin_fs(void)
+struct dentry *configfs_pin_fs(void)
{
- return simple_pin_fs(&configfs_fs_type, &configfs_mount,
+ int err = simple_pin_fs(&configfs_fs_type, &configfs_mount,
&configfs_mnt_count);
+ return err ? ERR_PTR(err) : configfs_mount->mnt_root;
}
void configfs_release_fs(void)
diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c
index 0f3eb41d920..cc9f2546ea4 100644
--- a/fs/configfs/symlink.c
+++ b/fs/configfs/symlink.c
@@ -110,13 +110,13 @@ out:
static int get_target(const char *symname, struct path *path,
- struct config_item **target)
+ struct config_item **target, struct super_block *sb)
{
int ret;
ret = kern_path(symname, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, path);
if (!ret) {
- if (path->dentry->d_sb == configfs_sb) {
+ if (path->dentry->d_sb == sb) {
*target = configfs_get_config_item(path->dentry);
if (!*target) {
ret = -ENOENT;
@@ -141,10 +141,6 @@ int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symna
struct config_item *target_item = NULL;
struct config_item_type *type;
- ret = -EPERM; /* What lack-of-symlink returns */
- if (dentry->d_parent == configfs_sb->s_root)
- goto out;
-
sd = dentry->d_parent->d_fsdata;
/*
* Fake invisibility if dir belongs to a group/default groups hierarchy
@@ -162,7 +158,7 @@ int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symna
!type->ct_item_ops->allow_link)
goto out_put;
- ret = get_target(symname, &path, &target_item);
+ ret = get_target(symname, &path, &target_item, dentry->d_sb);
if (ret)
goto out_put;
@@ -198,8 +194,6 @@ int configfs_unlink(struct inode *dir, struct dentry *dentry)
if (!(sd->s_type & CONFIGFS_ITEM_LINK))
goto out;
- BUG_ON(dentry->d_parent == configfs_sb->s_root);
-
sl = sd->s_element;
parent_item = configfs_get_config_item(dentry->d_parent);
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index a2ee8f9f5a3..d013c46402e 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -257,10 +257,10 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent)
/* Do sanity checks on the superblock */
if (super.magic != CRAMFS_MAGIC) {
- /* check for wrong endianess */
+ /* check for wrong endianness */
if (super.magic == CRAMFS_MAGIC_WEND) {
if (!silent)
- printk(KERN_ERR "cramfs: wrong endianess\n");
+ printk(KERN_ERR "cramfs: wrong endianness\n");
goto out;
}
@@ -270,7 +270,7 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent)
mutex_unlock(&read_mutex);
if (super.magic != CRAMFS_MAGIC) {
if (super.magic == CRAMFS_MAGIC_WEND && !silent)
- printk(KERN_ERR "cramfs: wrong endianess\n");
+ printk(KERN_ERR "cramfs: wrong endianness\n");
else if (!silent)
printk(KERN_ERR "cramfs: wrong magic\n");
goto out;
@@ -318,11 +318,9 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent)
root = get_cramfs_inode(sb, &super.root, 0);
if (IS_ERR(root))
goto out;
- sb->s_root = d_alloc_root(root);
- if (!sb->s_root) {
- iput(root);
+ sb->s_root = d_make_root(root);
+ if (!sb->s_root)
goto out;
- }
return 0;
out:
kfree(sbi);
diff --git a/fs/dcache.c b/fs/dcache.c
index 616fedff011..b60ddc41d78 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -23,7 +23,7 @@
#include <linux/init.h>
#include <linux/hash.h>
#include <linux/cache.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/mount.h>
#include <linux/file.h>
#include <asm/uaccess.h>
@@ -104,11 +104,11 @@ static unsigned int d_hash_shift __read_mostly;
static struct hlist_bl_head *dentry_hashtable __read_mostly;
-static inline struct hlist_bl_head *d_hash(struct dentry *parent,
- unsigned long hash)
+static inline struct hlist_bl_head *d_hash(const struct dentry *parent,
+ unsigned int hash)
{
- hash += ((unsigned long) parent ^ GOLDEN_RATIO_PRIME) / L1_CACHE_BYTES;
- hash = hash ^ ((hash ^ GOLDEN_RATIO_PRIME) >> D_HASHBITS);
+ hash += (unsigned long) parent / L1_CACHE_BYTES;
+ hash = hash + (hash >> D_HASHBITS);
return dentry_hashtable + (hash & D_HASHMASK);
}
@@ -137,6 +137,49 @@ int proc_nr_dentry(ctl_table *table, int write, void __user *buffer,
}
#endif
+/*
+ * Compare 2 name strings, return 0 if they match, otherwise non-zero.
+ * The strings are both count bytes long, and count is non-zero.
+ */
+static inline int dentry_cmp(const unsigned char *cs, size_t scount,
+ const unsigned char *ct, size_t tcount)
+{
+#ifdef CONFIG_DCACHE_WORD_ACCESS
+ unsigned long a,b,mask;
+
+ if (unlikely(scount != tcount))
+ return 1;
+
+ for (;;) {
+ a = *(unsigned long *)cs;
+ b = *(unsigned long *)ct;
+ if (tcount < sizeof(unsigned long))
+ break;
+ if (unlikely(a != b))
+ return 1;
+ cs += sizeof(unsigned long);
+ ct += sizeof(unsigned long);
+ tcount -= sizeof(unsigned long);
+ if (!tcount)
+ return 0;
+ }
+ mask = ~(~0ul << tcount*8);
+ return unlikely(!!((a ^ b) & mask));
+#else
+ if (scount != tcount)
+ return 1;
+
+ do {
+ if (*cs != *ct)
+ return 1;
+ cs++;
+ ct++;
+ tcount--;
+ } while (tcount);
+ return 0;
+#endif
+}
+
static void __d_free(struct rcu_head *head)
{
struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu);
@@ -1423,30 +1466,6 @@ struct dentry *d_instantiate_unique(struct dentry *entry, struct inode *inode)
EXPORT_SYMBOL(d_instantiate_unique);
-/**
- * d_alloc_root - allocate root dentry
- * @root_inode: inode to allocate the root for
- *
- * Allocate a root ("/") dentry for the inode given. The inode is
- * instantiated and returned. %NULL is returned if there is insufficient
- * memory or the inode passed is %NULL.
- */
-
-struct dentry * d_alloc_root(struct inode * root_inode)
-{
- struct dentry *res = NULL;
-
- if (root_inode) {
- static const struct qstr name = { .name = "/", .len = 1 };
-
- res = __d_alloc(root_inode->i_sb, &name);
- if (res)
- d_instantiate(res, root_inode);
- }
- return res;
-}
-EXPORT_SYMBOL(d_alloc_root);
-
struct dentry *d_make_root(struct inode *root_inode)
{
struct dentry *res = NULL;
@@ -1475,7 +1494,14 @@ static struct dentry * __d_find_any_alias(struct inode *inode)
return alias;
}
-static struct dentry * d_find_any_alias(struct inode *inode)
+/**
+ * d_find_any_alias - find any alias for a given inode
+ * @inode: inode to find an alias for
+ *
+ * If any aliases exist for the given inode, take and return a
+ * reference for one of them. If no aliases exist, return %NULL.
+ */
+struct dentry *d_find_any_alias(struct inode *inode)
{
struct dentry *de;
@@ -1484,7 +1510,7 @@ static struct dentry * d_find_any_alias(struct inode *inode)
spin_unlock(&inode->i_lock);
return de;
}
-
+EXPORT_SYMBOL(d_find_any_alias);
/**
* d_obtain_alias - find or allocate a dentry for a given inode
@@ -1687,7 +1713,7 @@ EXPORT_SYMBOL(d_add_ci);
* __d_lookup_rcu - search for a dentry (racy, store-free)
* @parent: parent dentry
* @name: qstr of name we wish to find
- * @seq: returns d_seq value at the point where the dentry was found
+ * @seqp: returns d_seq value at the point where the dentry was found
* @inode: returns dentry->d_inode when the inode was found valid.
* Returns: dentry, or NULL
*
@@ -1710,8 +1736,9 @@ EXPORT_SYMBOL(d_add_ci);
* child is looked up. Thus, an interlocking stepping of sequence lock checks
* is formed, giving integrity down the path walk.
*/
-struct dentry *__d_lookup_rcu(struct dentry *parent, struct qstr *name,
- unsigned *seq, struct inode **inode)
+struct dentry *__d_lookup_rcu(const struct dentry *parent,
+ const struct qstr *name,
+ unsigned *seqp, struct inode **inode)
{
unsigned int len = name->len;
unsigned int hash = name->hash;
@@ -1741,6 +1768,7 @@ struct dentry *__d_lookup_rcu(struct dentry *parent, struct qstr *name,
* See Documentation/filesystems/path-lookup.txt for more details.
*/
hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) {
+ unsigned seq;
struct inode *i;
const char *tname;
int tlen;
@@ -1749,7 +1777,7 @@ struct dentry *__d_lookup_rcu(struct dentry *parent, struct qstr *name,
continue;
seqretry:
- *seq = read_seqcount_begin(&dentry->d_seq);
+ seq = read_seqcount_begin(&dentry->d_seq);
if (dentry->d_parent != parent)
continue;
if (d_unhashed(dentry))
@@ -1764,7 +1792,7 @@ seqretry:
* edge of memory when walking. If we could load this
* atomically some other way, we could drop this check.
*/
- if (read_seqcount_retry(&dentry->d_seq, *seq))
+ if (read_seqcount_retry(&dentry->d_seq, seq))
goto seqretry;
if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) {
if (parent->d_op->d_compare(parent, *inode,
@@ -1781,6 +1809,7 @@ seqretry:
* order to do anything useful with the returned dentry
* anyway.
*/
+ *seqp = seq;
*inode = i;
return dentry;
}
@@ -2375,6 +2404,7 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
if (d_ancestor(alias, dentry)) {
/* Check for loops */
actual = ERR_PTR(-ELOOP);
+ spin_unlock(&inode->i_lock);
} else if (IS_ROOT(alias)) {
/* Is this an anonymous mountpoint that we
* could splice into our tree? */
@@ -2384,7 +2414,7 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
goto found;
} else {
/* Nope, but we must(!) avoid directory
- * aliasing */
+ * aliasing. This drops inode->i_lock */
actual = __d_unalias(inode, dentry, alias);
}
write_sequnlock(&rename_lock);
@@ -2961,7 +2991,7 @@ __setup("dhash_entries=", set_dhash_entries);
static void __init dcache_init_early(void)
{
- int loop;
+ unsigned int loop;
/* If hashes are distributed across NUMA nodes, defer
* hash allocation until vmalloc space is available.
@@ -2979,13 +3009,13 @@ static void __init dcache_init_early(void)
&d_hash_mask,
0);
- for (loop = 0; loop < (1 << d_hash_shift); loop++)
+ for (loop = 0; loop < (1U << d_hash_shift); loop++)
INIT_HLIST_BL_HEAD(dentry_hashtable + loop);
}
static void __init dcache_init(void)
{
- int loop;
+ unsigned int loop;
/*
* A constructor could be added for stable state like the lists,
@@ -3009,7 +3039,7 @@ static void __init dcache_init(void)
&d_hash_mask,
0);
- for (loop = 0; loop < (1 << d_hash_shift); loop++)
+ for (loop = 0; loop < (1U << d_hash_shift); loop++)
INIT_HLIST_BL_HEAD(dentry_hashtable + loop);
}
diff --git a/fs/dcookies.c b/fs/dcookies.c
index dda0dc702d1..17c77996782 100644
--- a/fs/dcookies.c
+++ b/fs/dcookies.c
@@ -13,7 +13,7 @@
*/
#include <linux/syscalls.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/mount.h>
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index f65d4455c5e..5dfafdd1dbd 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -33,18 +33,10 @@ static ssize_t default_write_file(struct file *file, const char __user *buf,
return count;
}
-static int default_open(struct inode *inode, struct file *file)
-{
- if (inode->i_private)
- file->private_data = inode->i_private;
-
- return 0;
-}
-
const struct file_operations debugfs_file_operations = {
.read = default_read_file,
.write = default_write_file,
- .open = default_open,
+ .open = simple_open,
.llseek = noop_llseek,
};
@@ -447,7 +439,7 @@ static ssize_t write_file_bool(struct file *file, const char __user *user_buf,
static const struct file_operations fops_bool = {
.read = read_file_bool,
.write = write_file_bool,
- .open = default_open,
+ .open = simple_open,
.llseek = default_llseek,
};
@@ -492,7 +484,7 @@ static ssize_t read_file_blob(struct file *file, char __user *user_buf,
static const struct file_operations fops_blob = {
.read = read_file_blob,
- .open = default_open,
+ .open = simple_open,
.llseek = default_llseek,
};
@@ -540,7 +532,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_blob);
* debugfs_print_regs32 - use seq_print to describe a set of registers
* @s: the seq_file structure being used to generate output
* @regs: an array if struct debugfs_reg32 structures
- * @mregs: the length of the above array
+ * @nregs: the length of the above array
* @base: the base address to be used in reading the registers
* @prefix: a string to be prefixed to every output line
*
@@ -611,7 +603,7 @@ static const struct file_operations fops_regset32 = {
* %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling
* code.
*/
-struct dentry *debugfs_create_regset32(const char *name, mode_t mode,
+struct dentry *debugfs_create_regset32(const char *name, umode_t mode,
struct dentry *parent,
struct debugfs_regset32 *regset)
{
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 956d5ddddf6..b80bc846a15 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -23,9 +23,13 @@
#include <linux/debugfs.h>
#include <linux/fsnotify.h>
#include <linux/string.h>
+#include <linux/seq_file.h>
+#include <linux/parser.h>
#include <linux/magic.h>
#include <linux/slab.h>
+#define DEBUGFS_DEFAULT_MODE 0755
+
static struct vfsmount *debugfs_mount;
static int debugfs_mount_count;
static bool debugfs_registered;
@@ -125,11 +129,154 @@ static inline int debugfs_positive(struct dentry *dentry)
return dentry->d_inode && !d_unhashed(dentry);
}
+struct debugfs_mount_opts {
+ uid_t uid;
+ gid_t gid;
+ umode_t mode;
+};
+
+enum {
+ Opt_uid,
+ Opt_gid,
+ Opt_mode,
+ Opt_err
+};
+
+static const match_table_t tokens = {
+ {Opt_uid, "uid=%u"},
+ {Opt_gid, "gid=%u"},
+ {Opt_mode, "mode=%o"},
+ {Opt_err, NULL}
+};
+
+struct debugfs_fs_info {
+ struct debugfs_mount_opts mount_opts;
+};
+
+static int debugfs_parse_options(char *data, struct debugfs_mount_opts *opts)
+{
+ substring_t args[MAX_OPT_ARGS];
+ int option;
+ int token;
+ char *p;
+
+ opts->mode = DEBUGFS_DEFAULT_MODE;
+
+ while ((p = strsep(&data, ",")) != NULL) {
+ if (!*p)
+ continue;
+
+ token = match_token(p, tokens, args);
+ switch (token) {
+ case Opt_uid:
+ if (match_int(&args[0], &option))
+ return -EINVAL;
+ opts->uid = option;
+ break;
+ case Opt_gid:
+ if (match_octal(&args[0], &option))
+ return -EINVAL;
+ opts->gid = option;
+ break;
+ case Opt_mode:
+ if (match_octal(&args[0], &option))
+ return -EINVAL;
+ opts->mode = option & S_IALLUGO;
+ break;
+ /*
+ * We might like to report bad mount options here;
+ * but traditionally debugfs has ignored all mount options
+ */
+ }
+ }
+
+ return 0;
+}
+
+static int debugfs_apply_options(struct super_block *sb)
+{
+ struct debugfs_fs_info *fsi = sb->s_fs_info;
+ struct inode *inode = sb->s_root->d_inode;
+ struct debugfs_mount_opts *opts = &fsi->mount_opts;
+
+ inode->i_mode &= ~S_IALLUGO;
+ inode->i_mode |= opts->mode;
+
+ inode->i_uid = opts->uid;
+ inode->i_gid = opts->gid;
+
+ return 0;
+}
+
+static int debugfs_remount(struct super_block *sb, int *flags, char *data)
+{
+ int err;
+ struct debugfs_fs_info *fsi = sb->s_fs_info;
+
+ err = debugfs_parse_options(data, &fsi->mount_opts);
+ if (err)
+ goto fail;
+
+ debugfs_apply_options(sb);
+
+fail:
+ return err;
+}
+
+static int debugfs_show_options(struct seq_file *m, struct dentry *root)
+{
+ struct debugfs_fs_info *fsi = root->d_sb->s_fs_info;
+ struct debugfs_mount_opts *opts = &fsi->mount_opts;
+
+ if (opts->uid != 0)
+ seq_printf(m, ",uid=%u", opts->uid);
+ if (opts->gid != 0)
+ seq_printf(m, ",gid=%u", opts->gid);
+ if (opts->mode != DEBUGFS_DEFAULT_MODE)
+ seq_printf(m, ",mode=%o", opts->mode);
+
+ return 0;
+}
+
+static const struct super_operations debugfs_super_operations = {
+ .statfs = simple_statfs,
+ .remount_fs = debugfs_remount,
+ .show_options = debugfs_show_options,
+};
+
static int debug_fill_super(struct super_block *sb, void *data, int silent)
{
static struct tree_descr debug_files[] = {{""}};
+ struct debugfs_fs_info *fsi;
+ int err;
+
+ save_mount_options(sb, data);
+
+ fsi = kzalloc(sizeof(struct debugfs_fs_info), GFP_KERNEL);
+ sb->s_fs_info = fsi;
+ if (!fsi) {
+ err = -ENOMEM;
+ goto fail;
+ }
+
+ err = debugfs_parse_options(data, &fsi->mount_opts);
+ if (err)
+ goto fail;
+
+ err = simple_fill_super(sb, DEBUGFS_MAGIC, debug_files);
+ if (err)
+ goto fail;
+
+ sb->s_op = &debugfs_super_operations;
+
+ debugfs_apply_options(sb);
+
+ return 0;
- return simple_fill_super(sb, DEBUGFS_MAGIC, debug_files);
+fail:
+ kfree(fsi);
+ sb->s_fs_info = NULL;
+ return err;
}
static struct dentry *debug_mount(struct file_system_type *fs_type,
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index c4e2a58a2e8..10f5e0b484d 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -36,7 +36,61 @@
#define DEVPTS_DEFAULT_PTMX_MODE 0000
#define PTMX_MINOR 2
-extern int pty_limit; /* Config limit on Unix98 ptys */
+/*
+ * sysctl support for setting limits on the number of Unix98 ptys allocated.
+ * Otherwise one can eat up all kernel memory by opening /dev/ptmx repeatedly.
+ */
+static int pty_limit = NR_UNIX98_PTY_DEFAULT;
+static int pty_reserve = NR_UNIX98_PTY_RESERVE;
+static int pty_limit_min;
+static int pty_limit_max = INT_MAX;
+static int pty_count;
+
+static struct ctl_table pty_table[] = {
+ {
+ .procname = "max",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .data = &pty_limit,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &pty_limit_min,
+ .extra2 = &pty_limit_max,
+ }, {
+ .procname = "reserve",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .data = &pty_reserve,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &pty_limit_min,
+ .extra2 = &pty_limit_max,
+ }, {
+ .procname = "nr",
+ .maxlen = sizeof(int),
+ .mode = 0444,
+ .data = &pty_count,
+ .proc_handler = proc_dointvec,
+ },
+ {}
+};
+
+static struct ctl_table pty_kern_table[] = {
+ {
+ .procname = "pty",
+ .mode = 0555,
+ .child = pty_table,
+ },
+ {}
+};
+
+static struct ctl_table pty_root_table[] = {
+ {
+ .procname = "kernel",
+ .mode = 0555,
+ .child = pty_kern_table,
+ },
+ {}
+};
+
static DEFINE_MUTEX(allocated_ptys_lock);
static struct vfsmount *devpts_mnt;
@@ -49,10 +103,11 @@ struct pts_mount_opts {
umode_t mode;
umode_t ptmxmode;
int newinstance;
+ int max;
};
enum {
- Opt_uid, Opt_gid, Opt_mode, Opt_ptmxmode, Opt_newinstance,
+ Opt_uid, Opt_gid, Opt_mode, Opt_ptmxmode, Opt_newinstance, Opt_max,
Opt_err
};
@@ -63,6 +118,7 @@ static const match_table_t tokens = {
#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
{Opt_ptmxmode, "ptmxmode=%o"},
{Opt_newinstance, "newinstance"},
+ {Opt_max, "max=%d"},
#endif
{Opt_err, NULL}
};
@@ -109,6 +165,7 @@ static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts)
opts->gid = 0;
opts->mode = DEVPTS_DEFAULT_MODE;
opts->ptmxmode = DEVPTS_DEFAULT_PTMX_MODE;
+ opts->max = NR_UNIX98_PTY_MAX;
/* newinstance makes sense only on initial mount */
if (op == PARSE_MOUNT)
@@ -152,6 +209,12 @@ static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts)
if (op == PARSE_MOUNT)
opts->newinstance = 1;
break;
+ case Opt_max:
+ if (match_int(&args[0], &option) ||
+ option < 0 || option > NR_UNIX98_PTY_MAX)
+ return -EINVAL;
+ opts->max = option;
+ break;
#endif
default:
printk(KERN_ERR "devpts: called with bogus options\n");
@@ -258,6 +321,8 @@ static int devpts_show_options(struct seq_file *seq, struct dentry *root)
seq_printf(seq, ",mode=%03o", opts->mode);
#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
seq_printf(seq, ",ptmxmode=%03o", opts->ptmxmode);
+ if (opts->max < NR_UNIX98_PTY_MAX)
+ seq_printf(seq, ",max=%d", opts->max);
#endif
return 0;
@@ -309,12 +374,11 @@ devpts_fill_super(struct super_block *s, void *data, int silent)
inode->i_fop = &simple_dir_operations;
set_nlink(inode, 2);
- s->s_root = d_alloc_root(inode);
+ s->s_root = d_make_root(inode);
if (s->s_root)
return 0;
printk(KERN_ERR "devpts: get root dentry failed\n");
- iput(inode);
fail:
return -ENOMEM;
@@ -438,6 +502,12 @@ retry:
return -ENOMEM;
mutex_lock(&allocated_ptys_lock);
+ if (pty_count >= pty_limit -
+ (fsi->mount_opts.newinstance ? pty_reserve : 0)) {
+ mutex_unlock(&allocated_ptys_lock);
+ return -ENOSPC;
+ }
+
ida_ret = ida_get_new(&fsi->allocated_ptys, &index);
if (ida_ret < 0) {
mutex_unlock(&allocated_ptys_lock);
@@ -446,11 +516,12 @@ retry:
return -EIO;
}
- if (index >= pty_limit) {
+ if (index >= fsi->mount_opts.max) {
ida_remove(&fsi->allocated_ptys, index);
mutex_unlock(&allocated_ptys_lock);
- return -EIO;
+ return -ENOSPC;
}
+ pty_count++;
mutex_unlock(&allocated_ptys_lock);
return index;
}
@@ -462,6 +533,7 @@ void devpts_kill_index(struct inode *ptmx_inode, int idx)
mutex_lock(&allocated_ptys_lock);
ida_remove(&fsi->allocated_ptys, idx);
+ pty_count--;
mutex_unlock(&allocated_ptys_lock);
}
@@ -558,11 +630,15 @@ void devpts_pty_kill(struct tty_struct *tty)
static int __init init_devpts_fs(void)
{
int err = register_filesystem(&devpts_fs_type);
+ struct ctl_table_header *table;
+
if (!err) {
+ table = register_sysctl_table(pty_root_table);
devpts_mnt = kern_mount(&devpts_fs_type);
if (IS_ERR(devpts_mnt)) {
err = PTR_ERR(devpts_mnt);
unregister_filesystem(&devpts_fs_type);
+ unregister_sysctl_table(table);
}
}
return err;
diff --git a/fs/direct-io.c b/fs/direct-io.c
index d740ab67ff6..f4aadd15b61 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -36,6 +36,7 @@
#include <linux/rwsem.h>
#include <linux/uio.h>
#include <linux/atomic.h>
+#include <linux/prefetch.h>
/*
* How many user pages to map in one call to get_user_pages(). This determines
@@ -172,7 +173,7 @@ void inode_dio_wait(struct inode *inode)
if (atomic_read(&inode->i_dio_count))
__inode_dio_wait(inode);
}
-EXPORT_SYMBOL_GPL(inode_dio_wait);
+EXPORT_SYMBOL(inode_dio_wait);
/*
* inode_dio_done - signal finish of a direct I/O requests
@@ -186,7 +187,7 @@ void inode_dio_done(struct inode *inode)
if (atomic_dec_and_test(&inode->i_dio_count))
wake_up_bit(&inode->i_state, __I_DIO_WAKEUP);
}
-EXPORT_SYMBOL_GPL(inode_dio_done);
+EXPORT_SYMBOL(inode_dio_done);
/*
* How many pages are in the queue?
@@ -580,9 +581,8 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio,
{
int ret;
sector_t fs_startblk; /* Into file, in filesystem-sized blocks */
+ sector_t fs_endblk; /* Into file, in filesystem-sized blocks */
unsigned long fs_count; /* Number of filesystem-sized blocks */
- unsigned long dio_count;/* Number of dio_block-sized blocks */
- unsigned long blkmask;
int create;
/*
@@ -593,11 +593,9 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio,
if (ret == 0) {
BUG_ON(sdio->block_in_file >= sdio->final_block_in_request);
fs_startblk = sdio->block_in_file >> sdio->blkfactor;
- dio_count = sdio->final_block_in_request - sdio->block_in_file;
- fs_count = dio_count >> sdio->blkfactor;
- blkmask = (1 << sdio->blkfactor) - 1;
- if (dio_count & blkmask)
- fs_count++;
+ fs_endblk = (sdio->final_block_in_request - 1) >>
+ sdio->blkfactor;
+ fs_count = fs_endblk - fs_startblk + 1;
map_bh->b_state = 0;
map_bh->b_size = fs_count << dio->inode->i_blkbits;
@@ -1090,8 +1088,8 @@ static inline int drop_refcount(struct dio *dio)
* individual fields and will generate much worse code. This is important
* for the whole file.
*/
-ssize_t
-__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
+static inline ssize_t
+do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
struct block_device *bdev, const struct iovec *iov, loff_t offset,
unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
dio_submit_t submit_io, int flags)
@@ -1100,7 +1098,6 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
size_t size;
unsigned long addr;
unsigned blkbits = inode->i_blkbits;
- unsigned bdev_blkbits = 0;
unsigned blocksize_mask = (1 << blkbits) - 1;
ssize_t retval = -EINVAL;
loff_t end = offset;
@@ -1113,12 +1110,14 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
if (rw & WRITE)
rw = WRITE_ODIRECT;
- if (bdev)
- bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev));
+ /*
+ * Avoid references to bdev if not absolutely needed to give
+ * the early prefetch in the caller enough time.
+ */
if (offset & blocksize_mask) {
if (bdev)
- blkbits = bdev_blkbits;
+ blkbits = blksize_bits(bdev_logical_block_size(bdev));
blocksize_mask = (1 << blkbits) - 1;
if (offset & blocksize_mask)
goto out;
@@ -1129,11 +1128,13 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
addr = (unsigned long)iov[seg].iov_base;
size = iov[seg].iov_len;
end += size;
- if ((addr & blocksize_mask) || (size & blocksize_mask)) {
+ if (unlikely((addr & blocksize_mask) ||
+ (size & blocksize_mask))) {
if (bdev)
- blkbits = bdev_blkbits;
+ blkbits = blksize_bits(
+ bdev_logical_block_size(bdev));
blocksize_mask = (1 << blkbits) - 1;
- if ((addr & blocksize_mask) || (size & blocksize_mask))
+ if ((addr & blocksize_mask) || (size & blocksize_mask))
goto out;
}
}
@@ -1316,6 +1317,30 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
out:
return retval;
}
+
+ssize_t
+__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
+ struct block_device *bdev, const struct iovec *iov, loff_t offset,
+ unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
+ dio_submit_t submit_io, int flags)
+{
+ /*
+ * The block device state is needed in the end to finally
+ * submit everything. Since it's likely to be cache cold
+ * prefetch it here as first thing to hide some of the
+ * latency.
+ *
+ * Attempt to prefetch the pieces we likely need later.
+ */
+ prefetch(&bdev->bd_disk->part_tbl);
+ prefetch(bdev->bd_queue);
+ prefetch((char *)bdev->bd_queue + SMP_CACHE_BYTES);
+
+ return do_blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
+ nr_segs, get_block, end_io,
+ submit_io, flags);
+}
+
EXPORT_SYMBOL(__blockdev_direct_IO);
static __init int dio_init(void)
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index 3dca2b39e83..1c9b08095f9 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -609,13 +609,6 @@ static const struct file_operations format3_fops = {
/*
* dump lkb's on the ls_waiters list
*/
-
-static int waiters_open(struct inode *inode, struct file *file)
-{
- file->private_data = inode->i_private;
- return 0;
-}
-
static ssize_t waiters_read(struct file *file, char __user *userbuf,
size_t count, loff_t *ppos)
{
@@ -644,7 +637,7 @@ static ssize_t waiters_read(struct file *file, char __user *userbuf,
static const struct file_operations waiters_fops = {
.owner = THIS_MODULE,
- .open = waiters_open,
+ .open = simple_open,
.read = waiters_read,
.llseek = default_llseek,
};
diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c
index 83641574b01..dc5eb598b81 100644
--- a/fs/dlm/dir.c
+++ b/fs/dlm/dir.c
@@ -351,11 +351,28 @@ int dlm_dir_lookup(struct dlm_ls *ls, int nodeid, char *name, int namelen,
static struct dlm_rsb *find_rsb_root(struct dlm_ls *ls, char *name, int len)
{
struct dlm_rsb *r;
+ uint32_t hash, bucket;
+ int rv;
+
+ hash = jhash(name, len, 0);
+ bucket = hash & (ls->ls_rsbtbl_size - 1);
+
+ spin_lock(&ls->ls_rsbtbl[bucket].lock);
+ rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[bucket].keep, name, len, 0, &r);
+ if (rv)
+ rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[bucket].toss,
+ name, len, 0, &r);
+ spin_unlock(&ls->ls_rsbtbl[bucket].lock);
+
+ if (!rv)
+ return r;
down_read(&ls->ls_root_sem);
list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
if (len == r->res_length && !memcmp(name, r->res_name, len)) {
up_read(&ls->ls_root_sem);
+ log_error(ls, "find_rsb_root revert to root_list %s",
+ r->res_name);
return r;
}
}
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index d47183043c5..fa5c07d51dc 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -411,8 +411,8 @@ static int rsb_cmp(struct dlm_rsb *r, const char *name, int nlen)
return memcmp(r->res_name, maxname, DLM_RESNAME_MAXLEN);
}
-static int search_rsb_tree(struct rb_root *tree, char *name, int len,
- unsigned int flags, struct dlm_rsb **r_ret)
+int dlm_search_rsb_tree(struct rb_root *tree, char *name, int len,
+ unsigned int flags, struct dlm_rsb **r_ret)
{
struct rb_node *node = tree->rb_node;
struct dlm_rsb *r;
@@ -474,12 +474,12 @@ static int _search_rsb(struct dlm_ls *ls, char *name, int len, int b,
struct dlm_rsb *r;
int error;
- error = search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, flags, &r);
+ error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, flags, &r);
if (!error) {
kref_get(&r->res_ref);
goto out;
}
- error = search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, flags, &r);
+ error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, flags, &r);
if (error)
goto out;
diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h
index 265017a7c3e..1a255307f6f 100644
--- a/fs/dlm/lock.h
+++ b/fs/dlm/lock.h
@@ -28,6 +28,9 @@ void dlm_scan_waiters(struct dlm_ls *ls);
void dlm_scan_timeout(struct dlm_ls *ls);
void dlm_adjust_timeouts(struct dlm_ls *ls);
+int dlm_search_rsb_tree(struct rb_root *tree, char *name, int len,
+ unsigned int flags, struct dlm_rsb **r_ret);
+
int dlm_purge_locks(struct dlm_ls *ls);
void dlm_purge_mstcpy_locks(struct dlm_rsb *r);
void dlm_grant_after_purge(struct dlm_ls *ls);
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 0b3109ee425..133ef6dc7cb 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -52,6 +52,7 @@
#include <linux/mutex.h>
#include <linux/sctp.h>
#include <linux/slab.h>
+#include <net/sctp/sctp.h>
#include <net/sctp/user.h>
#include <net/ipv6.h>
@@ -474,9 +475,6 @@ static void process_sctp_notification(struct connection *con,
int prim_len, ret;
int addr_len;
struct connection *new_con;
- sctp_peeloff_arg_t parg;
- int parglen = sizeof(parg);
- int err;
/*
* We get this before any data for an association.
@@ -525,23 +523,19 @@ static void process_sctp_notification(struct connection *con,
return;
/* Peel off a new sock */
- parg.associd = sn->sn_assoc_change.sac_assoc_id;
- ret = kernel_getsockopt(con->sock, IPPROTO_SCTP,
- SCTP_SOCKOPT_PEELOFF,
- (void *)&parg, &parglen);
+ sctp_lock_sock(con->sock->sk);
+ ret = sctp_do_peeloff(con->sock->sk,
+ sn->sn_assoc_change.sac_assoc_id,
+ &new_con->sock);
+ sctp_release_sock(con->sock->sk);
if (ret < 0) {
log_print("Can't peel off a socket for "
"connection %d to node %d: err=%d",
- parg.associd, nodeid, ret);
- return;
- }
- new_con->sock = sockfd_lookup(parg.sd, &err);
- if (!new_con->sock) {
- log_print("sockfd_lookup error %d", err);
+ (int)sn->sn_assoc_change.sac_assoc_id,
+ nodeid, ret);
return;
}
add_sock(new_con->sock, new_con);
- sockfd_put(new_con->sock);
log_print("connecting to %d sctp association %d",
nodeid, (int)sn->sn_assoc_change.sac_assoc_id);
@@ -1082,7 +1076,7 @@ static void init_local(void)
int i;
dlm_local_count = 0;
- for (i = 0; i < DLM_MAX_ADDR_COUNT - 1; i++) {
+ for (i = 0; i < DLM_MAX_ADDR_COUNT; i++) {
if (dlm_our_addr(&sas, i))
break;
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 2a834255c75..ea993128155 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -417,17 +417,6 @@ static int ecryptfs_encrypt_extent(struct page *enc_extent_page,
(unsigned long long)(extent_base + extent_offset), rc);
goto out;
}
- if (unlikely(ecryptfs_verbosity > 0)) {
- ecryptfs_printk(KERN_DEBUG, "Encrypting extent "
- "with iv:\n");
- ecryptfs_dump_hex(extent_iv, crypt_stat->iv_bytes);
- ecryptfs_printk(KERN_DEBUG, "First 8 bytes before "
- "encryption:\n");
- ecryptfs_dump_hex((char *)
- (page_address(page)
- + (extent_offset * crypt_stat->extent_size)),
- 8);
- }
rc = ecryptfs_encrypt_page_offset(crypt_stat, enc_extent_page, 0,
page, (extent_offset
* crypt_stat->extent_size),
@@ -440,14 +429,6 @@ static int ecryptfs_encrypt_extent(struct page *enc_extent_page,
goto out;
}
rc = 0;
- if (unlikely(ecryptfs_verbosity > 0)) {
- ecryptfs_printk(KERN_DEBUG, "Encrypt extent [0x%.16llx]; "
- "rc = [%d]\n",
- (unsigned long long)(extent_base + extent_offset), rc);
- ecryptfs_printk(KERN_DEBUG, "First 8 bytes after "
- "encryption:\n");
- ecryptfs_dump_hex((char *)(page_address(enc_extent_page)), 8);
- }
out:
return rc;
}
@@ -543,17 +524,6 @@ static int ecryptfs_decrypt_extent(struct page *page,
(unsigned long long)(extent_base + extent_offset), rc);
goto out;
}
- if (unlikely(ecryptfs_verbosity > 0)) {
- ecryptfs_printk(KERN_DEBUG, "Decrypting extent "
- "with iv:\n");
- ecryptfs_dump_hex(extent_iv, crypt_stat->iv_bytes);
- ecryptfs_printk(KERN_DEBUG, "First 8 bytes before "
- "decryption:\n");
- ecryptfs_dump_hex((char *)
- (page_address(enc_extent_page)
- + (extent_offset * crypt_stat->extent_size)),
- 8);
- }
rc = ecryptfs_decrypt_page_offset(crypt_stat, page,
(extent_offset
* crypt_stat->extent_size),
@@ -567,16 +537,6 @@ static int ecryptfs_decrypt_extent(struct page *page,
goto out;
}
rc = 0;
- if (unlikely(ecryptfs_verbosity > 0)) {
- ecryptfs_printk(KERN_DEBUG, "Decrypt extent [0x%.16llx]; "
- "rc = [%d]\n",
- (unsigned long long)(extent_base + extent_offset), rc);
- ecryptfs_printk(KERN_DEBUG, "First 8 bytes after "
- "decryption:\n");
- ecryptfs_dump_hex((char *)(page_address(page)
- + (extent_offset
- * crypt_stat->extent_size)), 8);
- }
out:
return rc;
}
@@ -1590,8 +1550,8 @@ int ecryptfs_read_and_validate_xattr_region(struct dentry *dentry,
*/
int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry)
{
- int rc = 0;
- char *page_virt = NULL;
+ int rc;
+ char *page_virt;
struct inode *ecryptfs_inode = ecryptfs_dentry->d_inode;
struct ecryptfs_crypt_stat *crypt_stat =
&ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat;
@@ -1616,11 +1576,13 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry)
ecryptfs_dentry,
ECRYPTFS_VALIDATE_HEADER_SIZE);
if (rc) {
+ /* metadata is not in the file header, so try xattrs */
memset(page_virt, 0, PAGE_CACHE_SIZE);
rc = ecryptfs_read_xattr_region(page_virt, ecryptfs_inode);
if (rc) {
printk(KERN_DEBUG "Valid eCryptfs headers not found in "
- "file header region or xattr region\n");
+ "file header region or xattr region, inode %lu\n",
+ ecryptfs_inode->i_ino);
rc = -EINVAL;
goto out;
}
@@ -1629,7 +1591,8 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry)
ECRYPTFS_DONT_VALIDATE_HEADER_SIZE);
if (rc) {
printk(KERN_DEBUG "Valid eCryptfs headers not found in "
- "file xattr region either\n");
+ "file xattr region either, inode %lu\n",
+ ecryptfs_inode->i_ino);
rc = -EINVAL;
}
if (crypt_stat->mount_crypt_stat->flags
@@ -1640,7 +1603,8 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry)
"crypto metadata only in the extended attribute "
"region, but eCryptfs was mounted without "
"xattr support enabled. eCryptfs will not treat "
- "this like an encrypted file.\n");
+ "this like an encrypted file, inode %lu\n",
+ ecryptfs_inode->i_ino);
rc = -EINVAL;
}
}
@@ -2026,6 +1990,17 @@ out:
return;
}
+static size_t ecryptfs_max_decoded_size(size_t encoded_size)
+{
+ /* Not exact; conservatively long. Every block of 4
+ * encoded characters decodes into a block of 3
+ * decoded characters. This segment of code provides
+ * the caller with the maximum amount of allocated
+ * space that @dst will need to point to in a
+ * subsequent call. */
+ return ((encoded_size + 1) * 3) / 4;
+}
+
/**
* ecryptfs_decode_from_filename
* @dst: If NULL, this function only sets @dst_size and returns. If
@@ -2044,13 +2019,7 @@ ecryptfs_decode_from_filename(unsigned char *dst, size_t *dst_size,
size_t dst_byte_offset = 0;
if (dst == NULL) {
- /* Not exact; conservatively long. Every block of 4
- * encoded characters decodes into a block of 3
- * decoded characters. This segment of code provides
- * the caller with the maximum amount of allocated
- * space that @dst will need to point to in a
- * subsequent call. */
- (*dst_size) = (((src_size + 1) * 3) / 4);
+ (*dst_size) = ecryptfs_max_decoded_size(src_size);
goto out;
}
while (src_byte_offset < src_size) {
@@ -2275,3 +2244,52 @@ out_free:
out:
return rc;
}
+
+#define ENC_NAME_MAX_BLOCKLEN_8_OR_16 143
+
+int ecryptfs_set_f_namelen(long *namelen, long lower_namelen,
+ struct ecryptfs_mount_crypt_stat *mount_crypt_stat)
+{
+ struct blkcipher_desc desc;
+ struct mutex *tfm_mutex;
+ size_t cipher_blocksize;
+ int rc;
+
+ if (!(mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)) {
+ (*namelen) = lower_namelen;
+ return 0;
+ }
+
+ rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&desc.tfm, &tfm_mutex,
+ mount_crypt_stat->global_default_fn_cipher_name);
+ if (unlikely(rc)) {
+ (*namelen) = 0;
+ return rc;
+ }
+
+ mutex_lock(tfm_mutex);
+ cipher_blocksize = crypto_blkcipher_blocksize(desc.tfm);
+ mutex_unlock(tfm_mutex);
+
+ /* Return an exact amount for the common cases */
+ if (lower_namelen == NAME_MAX
+ && (cipher_blocksize == 8 || cipher_blocksize == 16)) {
+ (*namelen) = ENC_NAME_MAX_BLOCKLEN_8_OR_16;
+ return 0;
+ }
+
+ /* Return a safe estimate for the uncommon cases */
+ (*namelen) = lower_namelen;
+ (*namelen) -= ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE;
+ /* Since this is the max decoded size, subtract 1 "decoded block" len */
+ (*namelen) = ecryptfs_max_decoded_size(*namelen) - 3;
+ (*namelen) -= ECRYPTFS_TAG_70_MAX_METADATA_SIZE;
+ (*namelen) -= ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES;
+ /* Worst case is that the filename is padded nearly a full block size */
+ (*namelen) -= cipher_blocksize - 1;
+
+ if ((*namelen) < 0)
+ (*namelen) = 0;
+
+ return 0;
+}
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index a9f29b12fbf..867b64c5d84 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -151,12 +151,21 @@ ecryptfs_get_key_payload_data(struct key *key)
* dentry name */
#define ECRYPTFS_TAG_73_PACKET_TYPE 0x49 /* FEK-encrypted filename as
* metadata */
+#define ECRYPTFS_MIN_PKT_LEN_SIZE 1 /* Min size to specify packet length */
+#define ECRYPTFS_MAX_PKT_LEN_SIZE 2 /* Pass at least this many bytes to
+ * ecryptfs_parse_packet_length() and
+ * ecryptfs_write_packet_length()
+ */
/* Constraint: ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES >=
* ECRYPTFS_MAX_IV_BYTES */
#define ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES 16
#define ECRYPTFS_NON_NULL 0x42 /* A reasonable substitute for NULL */
#define MD5_DIGEST_SIZE 16
#define ECRYPTFS_TAG_70_DIGEST_SIZE MD5_DIGEST_SIZE
+#define ECRYPTFS_TAG_70_MIN_METADATA_SIZE (1 + ECRYPTFS_MIN_PKT_LEN_SIZE \
+ + ECRYPTFS_SIG_SIZE + 1 + 1)
+#define ECRYPTFS_TAG_70_MAX_METADATA_SIZE (1 + ECRYPTFS_MAX_PKT_LEN_SIZE \
+ + ECRYPTFS_SIG_SIZE + 1 + 1)
#define ECRYPTFS_FEK_ENCRYPTED_FILENAME_PREFIX "ECRYPTFS_FEK_ENCRYPTED."
#define ECRYPTFS_FEK_ENCRYPTED_FILENAME_PREFIX_SIZE 23
#define ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX "ECRYPTFS_FNEK_ENCRYPTED."
@@ -696,6 +705,8 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
size_t *packet_size,
struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
char *data, size_t max_packet_size);
+int ecryptfs_set_f_namelen(long *namelen, long lower_namelen,
+ struct ecryptfs_mount_crypt_stat *mount_crypt_stat);
int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat,
loff_t offset);
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index d3f95f941c4..2b17f2f9b12 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -48,8 +48,7 @@ static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb,
unsigned long nr_segs, loff_t pos)
{
ssize_t rc;
- struct dentry *lower_dentry;
- struct vfsmount *lower_vfsmount;
+ struct path lower;
struct file *file = iocb->ki_filp;
rc = generic_file_aio_read(iocb, iov, nr_segs, pos);
@@ -60,9 +59,9 @@ static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb,
if (-EIOCBQUEUED == rc)
rc = wait_on_sync_kiocb(iocb);
if (rc >= 0) {
- lower_dentry = ecryptfs_dentry_to_lower(file->f_path.dentry);
- lower_vfsmount = ecryptfs_dentry_to_lower_mnt(file->f_path.dentry);
- touch_atime(lower_vfsmount, lower_dentry);
+ lower.dentry = ecryptfs_dentry_to_lower(file->f_path.dentry);
+ lower.mnt = ecryptfs_dentry_to_lower_mnt(file->f_path.dentry);
+ touch_atime(&lower);
}
return rc;
}
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 19a8ca4ab1d..ab35b113003 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -822,18 +822,6 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
size_t num_zeros = (PAGE_CACHE_SIZE
- (ia->ia_size & ~PAGE_CACHE_MASK));
-
- /*
- * XXX(truncate) this should really happen at the begginning
- * of ->setattr. But the code is too messy to that as part
- * of a larger patch. ecryptfs is also totally missing out
- * on the inode_change_ok check at the beginning of
- * ->setattr while would include this.
- */
- rc = inode_newsize_ok(inode, ia->ia_size);
- if (rc)
- goto out;
-
if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
truncate_setsize(inode, ia->ia_size);
lower_ia->ia_size = ia->ia_size;
@@ -883,6 +871,28 @@ out:
return rc;
}
+static int ecryptfs_inode_newsize_ok(struct inode *inode, loff_t offset)
+{
+ struct ecryptfs_crypt_stat *crypt_stat;
+ loff_t lower_oldsize, lower_newsize;
+
+ crypt_stat = &ecryptfs_inode_to_private(inode)->crypt_stat;
+ lower_oldsize = upper_size_to_lower_size(crypt_stat,
+ i_size_read(inode));
+ lower_newsize = upper_size_to_lower_size(crypt_stat, offset);
+ if (lower_newsize > lower_oldsize) {
+ /*
+ * The eCryptfs inode and the new *lower* size are mixed here
+ * because we may not have the lower i_mutex held and/or it may
+ * not be appropriate to call inode_newsize_ok() with inodes
+ * from other filesystems.
+ */
+ return inode_newsize_ok(inode, lower_newsize);
+ }
+
+ return 0;
+}
+
/**
* ecryptfs_truncate
* @dentry: The ecryptfs layer dentry
@@ -899,6 +909,10 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
struct iattr lower_ia = { .ia_valid = 0 };
int rc;
+ rc = ecryptfs_inode_newsize_ok(dentry->d_inode, new_length);
+ if (rc)
+ return rc;
+
rc = truncate_upper(dentry, &ia, &lower_ia);
if (!rc && lower_ia.ia_valid & ATTR_SIZE) {
struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
@@ -978,6 +992,16 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia)
}
}
mutex_unlock(&crypt_stat->cs_mutex);
+
+ rc = inode_change_ok(inode, ia);
+ if (rc)
+ goto out;
+ if (ia->ia_valid & ATTR_SIZE) {
+ rc = ecryptfs_inode_newsize_ok(inode, ia->ia_size);
+ if (rc)
+ goto out;
+ }
+
if (S_ISREG(inode->i_mode)) {
rc = filemap_write_and_wait(inode->i_mapping);
if (rc)
@@ -1061,6 +1085,8 @@ ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value,
}
rc = vfs_setxattr(lower_dentry, name, value, size, flags);
+ if (!rc)
+ fsstack_copy_attr_all(dentry->d_inode, lower_dentry->d_inode);
out:
return rc;
}
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index ac1ad48c237..2333203a120 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -109,7 +109,7 @@ int ecryptfs_parse_packet_length(unsigned char *data, size_t *size,
(*size) += ((unsigned char)(data[1]) + 192);
(*length_size) = 2;
} else if (data[0] == 255) {
- /* Five-byte length; we're not supposed to see this */
+ /* If support is added, adjust ECRYPTFS_MAX_PKT_LEN_SIZE */
ecryptfs_printk(KERN_ERR, "Five-byte packet length not "
"supported\n");
rc = -EINVAL;
@@ -126,7 +126,7 @@ out:
/**
* ecryptfs_write_packet_length
* @dest: The byte array target into which to write the length. Must
- * have at least 5 bytes allocated.
+ * have at least ECRYPTFS_MAX_PKT_LEN_SIZE bytes allocated.
* @size: The length to write.
* @packet_size_length: The number of bytes used to encode the packet
* length is written to this address.
@@ -146,6 +146,7 @@ int ecryptfs_write_packet_length(char *dest, size_t size,
dest[1] = ((size - 192) % 256);
(*packet_size_length) = 2;
} else {
+ /* If support is added, adjust ECRYPTFS_MAX_PKT_LEN_SIZE */
rc = -EINVAL;
ecryptfs_printk(KERN_WARNING,
"Unsupported packet size: [%zd]\n", size);
@@ -678,10 +679,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
* Octets N3-N4: Block-aligned encrypted filename
* - Consists of a minimum number of random characters, a \0
* separator, and then the filename */
- s->max_packet_size = (1 /* Tag 70 identifier */
- + 3 /* Max Tag 70 packet size */
- + ECRYPTFS_SIG_SIZE /* FNEK sig */
- + 1 /* Cipher identifier */
+ s->max_packet_size = (ECRYPTFS_TAG_70_MAX_METADATA_SIZE
+ s->block_aligned_filename_size);
if (dest == NULL) {
(*packet_size) = s->max_packet_size;
@@ -933,10 +931,10 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
goto out;
}
s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
- if (max_packet_size < (1 + 1 + ECRYPTFS_SIG_SIZE + 1 + 1)) {
+ if (max_packet_size < ECRYPTFS_TAG_70_MIN_METADATA_SIZE) {
printk(KERN_WARNING "%s: max_packet_size is [%zd]; it must be "
"at least [%d]\n", __func__, max_packet_size,
- (1 + 1 + ECRYPTFS_SIG_SIZE + 1 + 1));
+ ECRYPTFS_TAG_70_MIN_METADATA_SIZE);
rc = -EINVAL;
goto out;
}
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index b4a6befb121..68954937a07 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -550,9 +550,8 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags
if (IS_ERR(inode))
goto out_free;
- s->s_root = d_alloc_root(inode);
+ s->s_root = d_make_root(inode);
if (!s->s_root) {
- iput(inode);
rc = -ENOMEM;
goto out_free;
}
@@ -795,15 +794,10 @@ static int __init ecryptfs_init(void)
"Failed to allocate one or more kmem_cache objects\n");
goto out;
}
- rc = register_filesystem(&ecryptfs_fs_type);
- if (rc) {
- printk(KERN_ERR "Failed to register filesystem\n");
- goto out_free_kmem_caches;
- }
rc = do_sysfs_registration();
if (rc) {
printk(KERN_ERR "sysfs registration failed\n");
- goto out_unregister_filesystem;
+ goto out_free_kmem_caches;
}
rc = ecryptfs_init_kthread();
if (rc) {
@@ -824,19 +818,24 @@ static int __init ecryptfs_init(void)
"rc = [%d]\n", rc);
goto out_release_messaging;
}
+ rc = register_filesystem(&ecryptfs_fs_type);
+ if (rc) {
+ printk(KERN_ERR "Failed to register filesystem\n");
+ goto out_destroy_crypto;
+ }
if (ecryptfs_verbosity > 0)
printk(KERN_CRIT "eCryptfs verbosity set to %d. Secret values "
"will be written to the syslog!\n", ecryptfs_verbosity);
goto out;
+out_destroy_crypto:
+ ecryptfs_destroy_crypto();
out_release_messaging:
ecryptfs_release_messaging();
out_destroy_kthread:
ecryptfs_destroy_kthread();
out_do_sysfs_unregistration:
do_sysfs_unregistration();
-out_unregister_filesystem:
- unregister_filesystem(&ecryptfs_fs_type);
out_free_kmem_caches:
ecryptfs_free_kmem_caches();
out:
diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c
index 940a82e63dc..3a06f4043df 100644
--- a/fs/ecryptfs/miscdev.c
+++ b/fs/ecryptfs/miscdev.c
@@ -218,6 +218,29 @@ out_unlock:
return rc;
}
+/*
+ * miscdevfs packet format:
+ * Octet 0: Type
+ * Octets 1-4: network byte order msg_ctx->counter
+ * Octets 5-N0: Size of struct ecryptfs_message to follow
+ * Octets N0-N1: struct ecryptfs_message (including data)
+ *
+ * Octets 5-N1 not written if the packet type does not include a message
+ */
+#define PKT_TYPE_SIZE 1
+#define PKT_CTR_SIZE 4
+#define MIN_NON_MSG_PKT_SIZE (PKT_TYPE_SIZE + PKT_CTR_SIZE)
+#define MIN_MSG_PKT_SIZE (PKT_TYPE_SIZE + PKT_CTR_SIZE \
+ + ECRYPTFS_MIN_PKT_LEN_SIZE)
+/* 4 + ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES comes from tag 65 packet format */
+#define MAX_MSG_PKT_SIZE (PKT_TYPE_SIZE + PKT_CTR_SIZE \
+ + ECRYPTFS_MAX_PKT_LEN_SIZE \
+ + sizeof(struct ecryptfs_message) \
+ + 4 + ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES)
+#define PKT_TYPE_OFFSET 0
+#define PKT_CTR_OFFSET PKT_TYPE_SIZE
+#define PKT_LEN_OFFSET (PKT_TYPE_SIZE + PKT_CTR_SIZE)
+
/**
* ecryptfs_miscdev_read - format and send message from queue
* @file: fs/ecryptfs/euid miscdevfs handle (ignored)
@@ -237,7 +260,7 @@ ecryptfs_miscdev_read(struct file *file, char __user *buf, size_t count,
struct ecryptfs_daemon *daemon;
struct ecryptfs_msg_ctx *msg_ctx;
size_t packet_length_size;
- char packet_length[3];
+ char packet_length[ECRYPTFS_MAX_PKT_LEN_SIZE];
size_t i;
size_t total_length;
uid_t euid = current_euid();
@@ -305,15 +328,8 @@ check_list:
packet_length_size = 0;
msg_ctx->msg_size = 0;
}
- /* miscdevfs packet format:
- * Octet 0: Type
- * Octets 1-4: network byte order msg_ctx->counter
- * Octets 5-N0: Size of struct ecryptfs_message to follow
- * Octets N0-N1: struct ecryptfs_message (including data)
- *
- * Octets 5-N1 not written if the packet type does not
- * include a message */
- total_length = (1 + 4 + packet_length_size + msg_ctx->msg_size);
+ total_length = (PKT_TYPE_SIZE + PKT_CTR_SIZE + packet_length_size
+ + msg_ctx->msg_size);
if (count < total_length) {
rc = 0;
printk(KERN_WARNING "%s: Only given user buffer of "
@@ -324,9 +340,10 @@ check_list:
rc = -EFAULT;
if (put_user(msg_ctx->type, buf))
goto out_unlock_msg_ctx;
- if (put_user(cpu_to_be32(msg_ctx->counter), (__be32 __user *)(buf + 1)))
+ if (put_user(cpu_to_be32(msg_ctx->counter),
+ (__be32 __user *)(&buf[PKT_CTR_OFFSET])))
goto out_unlock_msg_ctx;
- i = 5;
+ i = PKT_TYPE_SIZE + PKT_CTR_SIZE;
if (msg_ctx->msg) {
if (copy_to_user(&buf[i], packet_length, packet_length_size))
goto out_unlock_msg_ctx;
@@ -391,12 +408,6 @@ out:
* @count: Amount of data in @buf
* @ppos: Pointer to offset in file (ignored)
*
- * miscdevfs packet format:
- * Octet 0: Type
- * Octets 1-4: network byte order msg_ctx->counter (0's for non-response)
- * Octets 5-N0: Size of struct ecryptfs_message to follow
- * Octets N0-N1: struct ecryptfs_message (including data)
- *
* Returns the number of bytes read from @buf
*/
static ssize_t
@@ -405,60 +416,78 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf,
{
__be32 counter_nbo;
u32 seq;
- size_t packet_size, packet_size_length, i;
- ssize_t sz = 0;
+ size_t packet_size, packet_size_length;
char *data;
uid_t euid = current_euid();
- int rc;
+ unsigned char packet_size_peek[ECRYPTFS_MAX_PKT_LEN_SIZE];
+ ssize_t rc;
- if (count == 0)
- goto out;
+ if (count == 0) {
+ return 0;
+ } else if (count == MIN_NON_MSG_PKT_SIZE) {
+ /* Likely a harmless MSG_HELO or MSG_QUIT - no packet length */
+ goto memdup;
+ } else if (count < MIN_MSG_PKT_SIZE || count > MAX_MSG_PKT_SIZE) {
+ printk(KERN_WARNING "%s: Acceptable packet size range is "
+ "[%d-%zu], but amount of data written is [%zu].",
+ __func__, MIN_MSG_PKT_SIZE, MAX_MSG_PKT_SIZE, count);
+ return -EINVAL;
+ }
+
+ if (copy_from_user(packet_size_peek, &buf[PKT_LEN_OFFSET],
+ sizeof(packet_size_peek))) {
+ printk(KERN_WARNING "%s: Error while inspecting packet size\n",
+ __func__);
+ return -EFAULT;
+ }
+ rc = ecryptfs_parse_packet_length(packet_size_peek, &packet_size,
+ &packet_size_length);
+ if (rc) {
+ printk(KERN_WARNING "%s: Error parsing packet length; "
+ "rc = [%zd]\n", __func__, rc);
+ return rc;
+ }
+
+ if ((PKT_TYPE_SIZE + PKT_CTR_SIZE + packet_size_length + packet_size)
+ != count) {
+ printk(KERN_WARNING "%s: Invalid packet size [%zu]\n", __func__,
+ packet_size);
+ return -EINVAL;
+ }
+
+memdup:
data = memdup_user(buf, count);
if (IS_ERR(data)) {
printk(KERN_ERR "%s: memdup_user returned error [%ld]\n",
__func__, PTR_ERR(data));
- goto out;
+ return PTR_ERR(data);
}
- sz = count;
- i = 0;
- switch (data[i++]) {
+ switch (data[PKT_TYPE_OFFSET]) {
case ECRYPTFS_MSG_RESPONSE:
- if (count < (1 + 4 + 1 + sizeof(struct ecryptfs_message))) {
+ if (count < (MIN_MSG_PKT_SIZE
+ + sizeof(struct ecryptfs_message))) {
printk(KERN_WARNING "%s: Minimum acceptable packet "
"size is [%zd], but amount of data written is "
"only [%zd]. Discarding response packet.\n",
__func__,
- (1 + 4 + 1 + sizeof(struct ecryptfs_message)),
- count);
+ (MIN_MSG_PKT_SIZE
+ + sizeof(struct ecryptfs_message)), count);
+ rc = -EINVAL;
goto out_free;
}
- memcpy(&counter_nbo, &data[i], 4);
+ memcpy(&counter_nbo, &data[PKT_CTR_OFFSET], PKT_CTR_SIZE);
seq = be32_to_cpu(counter_nbo);
- i += 4;
- rc = ecryptfs_parse_packet_length(&data[i], &packet_size,
- &packet_size_length);
+ rc = ecryptfs_miscdev_response(
+ &data[PKT_LEN_OFFSET + packet_size_length],
+ packet_size, euid, current_user_ns(),
+ task_pid(current), seq);
if (rc) {
- printk(KERN_WARNING "%s: Error parsing packet length; "
- "rc = [%d]\n", __func__, rc);
- goto out_free;
- }
- i += packet_size_length;
- if ((1 + 4 + packet_size_length + packet_size) != count) {
- printk(KERN_WARNING "%s: (1 + packet_size_length([%zd])"
- " + packet_size([%zd]))([%zd]) != "
- "count([%zd]). Invalid packet format.\n",
- __func__, packet_size_length, packet_size,
- (1 + packet_size_length + packet_size), count);
- goto out_free;
- }
- rc = ecryptfs_miscdev_response(&data[i], packet_size,
- euid, current_user_ns(),
- task_pid(current), seq);
- if (rc)
printk(KERN_WARNING "%s: Failed to deliver miscdev "
- "response to requesting operation; rc = [%d]\n",
+ "response to requesting operation; rc = [%zd]\n",
__func__, rc);
+ goto out_free;
+ }
break;
case ECRYPTFS_MSG_HELO:
case ECRYPTFS_MSG_QUIT:
@@ -467,12 +496,13 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf,
ecryptfs_printk(KERN_WARNING, "Dropping miscdev "
"message of unrecognized type [%d]\n",
data[0]);
- break;
+ rc = -EINVAL;
+ goto out_free;
}
+ rc = count;
out_free:
kfree(data);
-out:
- return sz;
+ return rc;
}
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 6a44148c5fb..a46b3a8fee1 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -57,6 +57,10 @@ struct page *ecryptfs_get_locked_page(struct inode *inode, loff_t index)
* @page: Page that is locked before this call is made
*
* Returns zero on success; non-zero otherwise
+ *
+ * This is where we encrypt the data and pass the encrypted data to
+ * the lower filesystem. In OpenPGP-compatible mode, we operate on
+ * entire underlying packets.
*/
static int ecryptfs_writepage(struct page *page, struct writeback_control *wbc)
{
@@ -146,7 +150,7 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page,
/* This is a header extent */
char *page_virt;
- page_virt = kmap_atomic(page, KM_USER0);
+ page_virt = kmap_atomic(page);
memset(page_virt, 0, PAGE_CACHE_SIZE);
/* TODO: Support more than one header extent */
if (view_extent_num == 0) {
@@ -159,7 +163,7 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page,
crypt_stat,
&written);
}
- kunmap_atomic(page_virt, KM_USER0);
+ kunmap_atomic(page_virt);
flush_dcache_page(page);
if (rc) {
printk(KERN_ERR "%s: Error reading xattr "
@@ -481,10 +485,6 @@ int ecryptfs_write_inode_size_to_metadata(struct inode *ecryptfs_inode)
* @copied: The amount of data copied
* @page: The eCryptfs page
* @fsdata: The fsdata (unused)
- *
- * This is where we encrypt the data and pass the encrypted data to
- * the lower filesystem. In OpenPGP-compatible mode, we operate on
- * entire underlying packets.
*/
static int ecryptfs_write_end(struct file *file,
struct address_space *mapping,
diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c
index 3745f7c2b9c..b2a34a192f4 100644
--- a/fs/ecryptfs/read_write.c
+++ b/fs/ecryptfs/read_write.c
@@ -130,13 +130,18 @@ int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset,
pgoff_t ecryptfs_page_idx = (pos >> PAGE_CACHE_SHIFT);
size_t start_offset_in_page = (pos & ~PAGE_CACHE_MASK);
size_t num_bytes = (PAGE_CACHE_SIZE - start_offset_in_page);
- size_t total_remaining_bytes = ((offset + size) - pos);
+ loff_t total_remaining_bytes = ((offset + size) - pos);
+
+ if (fatal_signal_pending(current)) {
+ rc = -EINTR;
+ break;
+ }
if (num_bytes > total_remaining_bytes)
num_bytes = total_remaining_bytes;
if (pos < offset) {
/* remaining zeros to write, up to destination offset */
- size_t total_remaining_zeros = (offset - pos);
+ loff_t total_remaining_zeros = (offset - pos);
if (num_bytes > total_remaining_zeros)
num_bytes = total_remaining_zeros;
@@ -151,7 +156,7 @@ int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset,
ecryptfs_page_idx, rc);
goto out;
}
- ecryptfs_page_virt = kmap_atomic(ecryptfs_page, KM_USER0);
+ ecryptfs_page_virt = kmap_atomic(ecryptfs_page);
/*
* pos: where we're now writing, offset: where the request was
@@ -174,7 +179,7 @@ int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset,
(data + data_offset), num_bytes);
data_offset += num_bytes;
}
- kunmap_atomic(ecryptfs_page_virt, KM_USER0);
+ kunmap_atomic(ecryptfs_page_virt);
flush_dcache_page(ecryptfs_page);
SetPageUptodate(ecryptfs_page);
unlock_page(ecryptfs_page);
@@ -193,15 +198,19 @@ int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset,
}
pos += num_bytes;
}
- if ((offset + size) > ecryptfs_file_size) {
- i_size_write(ecryptfs_inode, (offset + size));
+ if (pos > ecryptfs_file_size) {
+ i_size_write(ecryptfs_inode, pos);
if (crypt_stat->flags & ECRYPTFS_ENCRYPTED) {
- rc = ecryptfs_write_inode_size_to_metadata(
+ int rc2;
+
+ rc2 = ecryptfs_write_inode_size_to_metadata(
ecryptfs_inode);
- if (rc) {
+ if (rc2) {
printk(KERN_ERR "Problem with "
"ecryptfs_write_inode_size_to_metadata; "
- "rc = [%d]\n", rc);
+ "rc = [%d]\n", rc2);
+ if (!rc)
+ rc = rc2;
goto out;
}
}
@@ -273,76 +282,3 @@ int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs,
flush_dcache_page(page_for_ecryptfs);
return rc;
}
-
-#if 0
-/**
- * ecryptfs_read
- * @data: The virtual address into which to write the data read (and
- * possibly decrypted) from the lower file
- * @offset: The offset in the decrypted view of the file from which to
- * read into @data
- * @size: The number of bytes to read into @data
- * @ecryptfs_file: The eCryptfs file from which to read
- *
- * Read an arbitrary amount of data from an arbitrary location in the
- * eCryptfs page cache. This is done on an extent-by-extent basis;
- * individual extents are decrypted and read from the lower page
- * cache (via VFS reads). This function takes care of all the
- * address translation to locations in the lower filesystem.
- *
- * Returns zero on success; non-zero otherwise
- */
-int ecryptfs_read(char *data, loff_t offset, size_t size,
- struct file *ecryptfs_file)
-{
- struct inode *ecryptfs_inode = ecryptfs_file->f_dentry->d_inode;
- struct page *ecryptfs_page;
- char *ecryptfs_page_virt;
- loff_t ecryptfs_file_size = i_size_read(ecryptfs_inode);
- loff_t data_offset = 0;
- loff_t pos;
- int rc = 0;
-
- if ((offset + size) > ecryptfs_file_size) {
- rc = -EINVAL;
- printk(KERN_ERR "%s: Attempt to read data past the end of the "
- "file; offset = [%lld]; size = [%td]; "
- "ecryptfs_file_size = [%lld]\n",
- __func__, offset, size, ecryptfs_file_size);
- goto out;
- }
- pos = offset;
- while (pos < (offset + size)) {
- pgoff_t ecryptfs_page_idx = (pos >> PAGE_CACHE_SHIFT);
- size_t start_offset_in_page = (pos & ~PAGE_CACHE_MASK);
- size_t num_bytes = (PAGE_CACHE_SIZE - start_offset_in_page);
- size_t total_remaining_bytes = ((offset + size) - pos);
-
- if (num_bytes > total_remaining_bytes)
- num_bytes = total_remaining_bytes;
- ecryptfs_page = ecryptfs_get_locked_page(ecryptfs_inode,
- ecryptfs_page_idx);
- if (IS_ERR(ecryptfs_page)) {
- rc = PTR_ERR(ecryptfs_page);
- printk(KERN_ERR "%s: Error getting page at "
- "index [%ld] from eCryptfs inode "
- "mapping; rc = [%d]\n", __func__,
- ecryptfs_page_idx, rc);
- goto out;
- }
- ecryptfs_page_virt = kmap_atomic(ecryptfs_page, KM_USER0);
- memcpy((data + data_offset),
- ((char *)ecryptfs_page_virt + start_offset_in_page),
- num_bytes);
- kunmap_atomic(ecryptfs_page_virt, KM_USER0);
- flush_dcache_page(ecryptfs_page);
- SetPageUptodate(ecryptfs_page);
- unlock_page(ecryptfs_page);
- page_cache_release(ecryptfs_page);
- pos += num_bytes;
- data_offset += num_bytes;
- }
-out:
- return rc;
-}
-#endif /* 0 */
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index 9df7fd6e0c3..2dd946b636d 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -30,6 +30,8 @@
#include <linux/seq_file.h>
#include <linux/file.h>
#include <linux/crypto.h>
+#include <linux/statfs.h>
+#include <linux/magic.h>
#include "ecryptfs_kernel.h"
struct kmem_cache *ecryptfs_inode_info_cache;
@@ -102,10 +104,20 @@ static void ecryptfs_destroy_inode(struct inode *inode)
static int ecryptfs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
+ int rc;
if (!lower_dentry->d_sb->s_op->statfs)
return -ENOSYS;
- return lower_dentry->d_sb->s_op->statfs(lower_dentry, buf);
+
+ rc = lower_dentry->d_sb->s_op->statfs(lower_dentry, buf);
+ if (rc)
+ return rc;
+
+ buf->f_type = ECRYPTFS_SUPER_MAGIC;
+ rc = ecryptfs_set_f_namelen(&buf->f_namelen, buf->f_namelen,
+ &ecryptfs_superblock_to_private(dentry->d_sb)->mount_crypt_stat);
+
+ return rc;
}
/**
@@ -172,7 +184,6 @@ static int ecryptfs_show_options(struct seq_file *m, struct dentry *root)
const struct super_operations ecryptfs_sops = {
.alloc_inode = ecryptfs_alloc_inode,
.destroy_inode = ecryptfs_destroy_inode,
- .drop_inode = generic_drop_inode,
.statfs = ecryptfs_statfs,
.remount_fs = NULL,
.evict_inode = ecryptfs_evict_inode,
diff --git a/fs/efs/super.c b/fs/efs/super.c
index 981106429a9..e755ec746c6 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -317,10 +317,9 @@ static int efs_fill_super(struct super_block *s, void *d, int silent)
goto out_no_fs;
}
- s->s_root = d_alloc_root(root);
+ s->s_root = d_make_root(root);
if (!(s->s_root)) {
printk(KERN_ERR "EFS: get root dentry failed\n");
- iput(root);
ret = -ENOMEM;
goto out_no_fs;
}
diff --git a/fs/eventfd.c b/fs/eventfd.c
index d9a59177391..dba15fecf23 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -16,7 +16,7 @@
#include <linux/spinlock.h>
#include <linux/anon_inodes.h>
#include <linux/syscalls.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/kref.h>
#include <linux/eventfd.h>
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 828e750af23..739b0985b39 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -34,7 +34,6 @@
#include <linux/mutex.h>
#include <linux/anon_inodes.h>
#include <asm/uaccess.h>
-#include <asm/system.h>
#include <asm/io.h>
#include <asm/mman.h>
#include <linux/atomic.h>
@@ -197,6 +196,12 @@ struct eventpoll {
/* The user that created the eventpoll descriptor */
struct user_struct *user;
+
+ struct file *file;
+
+ /* used to optimize loop detection check */
+ int visited;
+ struct list_head visited_list_link;
};
/* Wait structure used by the poll hooks */
@@ -255,6 +260,15 @@ static struct kmem_cache *epi_cache __read_mostly;
/* Slab cache used to allocate "struct eppoll_entry" */
static struct kmem_cache *pwq_cache __read_mostly;
+/* Visited nodes during ep_loop_check(), so we can unset them when we finish */
+static LIST_HEAD(visited_list);
+
+/*
+ * List of files with newly added links, where we may need to limit the number
+ * of emanating paths. Protected by the epmutex.
+ */
+static LIST_HEAD(tfile_check_list);
+
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
@@ -276,6 +290,12 @@ ctl_table epoll_table[] = {
};
#endif /* CONFIG_SYSCTL */
+static const struct file_operations eventpoll_fops;
+
+static inline int is_file_epoll(struct file *f)
+{
+ return f->f_op == &eventpoll_fops;
+}
/* Setup the structure that is used as key for the RB tree */
static inline void ep_set_ffd(struct epoll_filefd *ffd,
@@ -299,6 +319,11 @@ static inline int ep_is_linked(struct list_head *p)
return !list_empty(p);
}
+static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_t *p)
+{
+ return container_of(p, struct eppoll_entry, wait);
+}
+
/* Get the "struct epitem" from a wait queue pointer */
static inline struct epitem *ep_item_from_wait(wait_queue_t *p)
{
@@ -401,6 +426,31 @@ out_unlock:
return error;
}
+/*
+ * As described in commit 0ccf831cb lockdep: annotate epoll
+ * the use of wait queues used by epoll is done in a very controlled
+ * manner. Wake ups can nest inside each other, but are never done
+ * with the same locking. For example:
+ *
+ * dfd = socket(...);
+ * efd1 = epoll_create();
+ * efd2 = epoll_create();
+ * epoll_ctl(efd1, EPOLL_CTL_ADD, dfd, ...);
+ * epoll_ctl(efd2, EPOLL_CTL_ADD, efd1, ...);
+ *
+ * When a packet arrives to the device underneath "dfd", the net code will
+ * issue a wake_up() on its poll wake list. Epoll (efd1) has installed a
+ * callback wakeup entry on that queue, and the wake_up() performed by the
+ * "dfd" net code will end up in ep_poll_callback(). At this point epoll
+ * (efd1) notices that it may have some event ready, so it needs to wake up
+ * the waiters on its poll wait list (efd2). So it calls ep_poll_safewake()
+ * that ends up in another wake_up(), after having checked about the
+ * recursion constraints. That are, no more than EP_MAX_POLLWAKE_NESTS, to
+ * avoid stack blasting.
+ *
+ * When CONFIG_DEBUG_LOCK_ALLOC is enabled, make sure lockdep can handle
+ * this special case of epoll.
+ */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
static inline void ep_wake_up_nested(wait_queue_head_t *wqueue,
unsigned long events, int subclass)
@@ -446,6 +496,18 @@ static void ep_poll_safewake(wait_queue_head_t *wq)
put_cpu();
}
+static void ep_remove_wait_queue(struct eppoll_entry *pwq)
+{
+ wait_queue_head_t *whead;
+
+ rcu_read_lock();
+ /* If it is cleared by POLLFREE, it should be rcu-safe */
+ whead = rcu_dereference(pwq->whead);
+ if (whead)
+ remove_wait_queue(whead, &pwq->wait);
+ rcu_read_unlock();
+}
+
/*
* This function unregisters poll callbacks from the associated file
* descriptor. Must be called with "mtx" held (or "epmutex" if called from
@@ -460,7 +522,7 @@ static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
pwq = list_first_entry(lsthead, struct eppoll_entry, llink);
list_del(&pwq->llink);
- remove_wait_queue(pwq->whead, &pwq->wait);
+ ep_remove_wait_queue(pwq);
kmem_cache_free(pwq_cache, pwq);
}
}
@@ -661,9 +723,12 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
void *priv)
{
struct epitem *epi, *tmp;
+ poll_table pt;
+ init_poll_funcptr(&pt, NULL);
list_for_each_entry_safe(epi, tmp, head, rdllink) {
- if (epi->ffd.file->f_op->poll(epi->ffd.file, NULL) &
+ pt._key = epi->event.events;
+ if (epi->ffd.file->f_op->poll(epi->ffd.file, &pt) &
epi->event.events)
return POLLIN | POLLRDNORM;
else {
@@ -711,12 +776,6 @@ static const struct file_operations eventpoll_fops = {
.llseek = noop_llseek,
};
-/* Fast test to see if the file is an eventpoll file */
-static inline int is_file_epoll(struct file *f)
-{
- return f->f_op == &eventpoll_fops;
-}
-
/*
* This is called from eventpoll_release() to unlink files from the eventpoll
* interface. We need to have this facility to cleanup correctly files that are
@@ -827,6 +886,17 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
struct epitem *epi = ep_item_from_wait(wait);
struct eventpoll *ep = epi->ep;
+ if ((unsigned long)key & POLLFREE) {
+ ep_pwq_from_wait(wait)->whead = NULL;
+ /*
+ * whead = NULL above can race with ep_remove_wait_queue()
+ * which can do another remove_wait_queue() after us, so we
+ * can't use __remove_wait_queue(). whead->lock is held by
+ * the caller.
+ */
+ list_del_init(&wait->task_list);
+ }
+
spin_lock_irqsave(&ep->lock, flags);
/*
@@ -926,6 +996,101 @@ static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
rb_insert_color(&epi->rbn, &ep->rbr);
}
+
+
+#define PATH_ARR_SIZE 5
+/*
+ * These are the number paths of length 1 to 5, that we are allowing to emanate
+ * from a single file of interest. For example, we allow 1000 paths of length
+ * 1, to emanate from each file of interest. This essentially represents the
+ * potential wakeup paths, which need to be limited in order to avoid massive
+ * uncontrolled wakeup storms. The common use case should be a single ep which
+ * is connected to n file sources. In this case each file source has 1 path
+ * of length 1. Thus, the numbers below should be more than sufficient. These
+ * path limits are enforced during an EPOLL_CTL_ADD operation, since a modify
+ * and delete can't add additional paths. Protected by the epmutex.
+ */
+static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 };
+static int path_count[PATH_ARR_SIZE];
+
+static int path_count_inc(int nests)
+{
+ /* Allow an arbitrary number of depth 1 paths */
+ if (nests == 0)
+ return 0;
+
+ if (++path_count[nests] > path_limits[nests])
+ return -1;
+ return 0;
+}
+
+static void path_count_init(void)
+{
+ int i;
+
+ for (i = 0; i < PATH_ARR_SIZE; i++)
+ path_count[i] = 0;
+}
+
+static int reverse_path_check_proc(void *priv, void *cookie, int call_nests)
+{
+ int error = 0;
+ struct file *file = priv;
+ struct file *child_file;
+ struct epitem *epi;
+
+ list_for_each_entry(epi, &file->f_ep_links, fllink) {
+ child_file = epi->ep->file;
+ if (is_file_epoll(child_file)) {
+ if (list_empty(&child_file->f_ep_links)) {
+ if (path_count_inc(call_nests)) {
+ error = -1;
+ break;
+ }
+ } else {
+ error = ep_call_nested(&poll_loop_ncalls,
+ EP_MAX_NESTS,
+ reverse_path_check_proc,
+ child_file, child_file,
+ current);
+ }
+ if (error != 0)
+ break;
+ } else {
+ printk(KERN_ERR "reverse_path_check_proc: "
+ "file is not an ep!\n");
+ }
+ }
+ return error;
+}
+
+/**
+ * reverse_path_check - The tfile_check_list is list of file *, which have
+ * links that are proposed to be newly added. We need to
+ * make sure that those added links don't add too many
+ * paths such that we will spend all our time waking up
+ * eventpoll objects.
+ *
+ * Returns: Returns zero if the proposed links don't create too many paths,
+ * -1 otherwise.
+ */
+static int reverse_path_check(void)
+{
+ int error = 0;
+ struct file *current_file;
+
+ /* let's call this for all tfiles */
+ list_for_each_entry(current_file, &tfile_check_list, f_tfile_llink) {
+ path_count_init();
+ error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
+ reverse_path_check_proc, current_file,
+ current_file, current);
+ if (error)
+ break;
+ }
+ return error;
+}
+
/*
* Must be called with "mtx" held.
*/
@@ -957,6 +1122,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
/* Initialize the poll table using the queue callback */
epq.epi = epi;
init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);
+ epq.pt._key = event->events;
/*
* Attach the item to the poll hooks and get current event bits.
@@ -987,6 +1153,11 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
*/
ep_rbtree_insert(ep, epi);
+ /* now check if we've created too many backpaths */
+ error = -EINVAL;
+ if (reverse_path_check())
+ goto error_remove_epi;
+
/* We have to drop the new item inside our item list to keep track of it */
spin_lock_irqsave(&ep->lock, flags);
@@ -1011,6 +1182,14 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
return 0;
+error_remove_epi:
+ spin_lock(&tfile->f_lock);
+ if (ep_is_linked(&epi->fllink))
+ list_del_init(&epi->fllink);
+ spin_unlock(&tfile->f_lock);
+
+ rb_erase(&epi->rbn, &ep->rbr);
+
error_unregister:
ep_unregister_pollwait(ep, epi);
@@ -1038,6 +1217,9 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
{
int pwake = 0;
unsigned int revents;
+ poll_table pt;
+
+ init_poll_funcptr(&pt, NULL);
/*
* Set the new event interest mask before calling f_op->poll();
@@ -1045,13 +1227,14 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
* f_op->poll() call and the new event set registering.
*/
epi->event.events = event->events;
+ pt._key = event->events;
epi->event.data = event->data; /* protected by mtx */
/*
* Get current event bits. We can safely use the file* here because
* its usage count has been increased by the caller of this function.
*/
- revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL);
+ revents = epi->ffd.file->f_op->poll(epi->ffd.file, &pt);
/*
* If the item is "hot" and it is not registered inside the ready
@@ -1086,6 +1269,9 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
unsigned int revents;
struct epitem *epi;
struct epoll_event __user *uevent;
+ poll_table pt;
+
+ init_poll_funcptr(&pt, NULL);
/*
* We can loop without lock because we are passed a task private list.
@@ -1098,7 +1284,8 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
list_del_init(&epi->rdllink);
- revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL) &
+ pt._key = epi->event.events;
+ revents = epi->ffd.file->f_op->poll(epi->ffd.file, &pt) &
epi->event.events;
/*
@@ -1275,18 +1462,36 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
int error = 0;
struct file *file = priv;
struct eventpoll *ep = file->private_data;
+ struct eventpoll *ep_tovisit;
struct rb_node *rbp;
struct epitem *epi;
mutex_lock_nested(&ep->mtx, call_nests + 1);
+ ep->visited = 1;
+ list_add(&ep->visited_list_link, &visited_list);
for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
epi = rb_entry(rbp, struct epitem, rbn);
if (unlikely(is_file_epoll(epi->ffd.file))) {
+ ep_tovisit = epi->ffd.file->private_data;
+ if (ep_tovisit->visited)
+ continue;
error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
- ep_loop_check_proc, epi->ffd.file,
- epi->ffd.file->private_data, current);
+ ep_loop_check_proc, epi->ffd.file,
+ ep_tovisit, current);
if (error != 0)
break;
+ } else {
+ /*
+ * If we've reached a file that is not associated with
+ * an ep, then we need to check if the newly added
+ * links are going to add too many wakeup paths. We do
+ * this by adding it to the tfile_check_list, if it's
+ * not already there, and calling reverse_path_check()
+ * during ep_insert().
+ */
+ if (list_empty(&epi->ffd.file->f_tfile_llink))
+ list_add(&epi->ffd.file->f_tfile_llink,
+ &tfile_check_list);
}
}
mutex_unlock(&ep->mtx);
@@ -1307,8 +1512,31 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
*/
static int ep_loop_check(struct eventpoll *ep, struct file *file)
{
- return ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
+ int ret;
+ struct eventpoll *ep_cur, *ep_next;
+
+ ret = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
ep_loop_check_proc, file, ep, current);
+ /* clear visited list */
+ list_for_each_entry_safe(ep_cur, ep_next, &visited_list,
+ visited_list_link) {
+ ep_cur->visited = 0;
+ list_del(&ep_cur->visited_list_link);
+ }
+ return ret;
+}
+
+static void clear_tfile_check_list(void)
+{
+ struct file *file;
+
+ /* first clear the tfile_check_list */
+ while (!list_empty(&tfile_check_list)) {
+ file = list_first_entry(&tfile_check_list, struct file,
+ f_tfile_llink);
+ list_del_init(&file->f_tfile_llink);
+ }
+ INIT_LIST_HEAD(&tfile_check_list);
}
/*
@@ -1316,8 +1544,9 @@ static int ep_loop_check(struct eventpoll *ep, struct file *file)
*/
SYSCALL_DEFINE1(epoll_create1, int, flags)
{
- int error;
+ int error, fd;
struct eventpoll *ep = NULL;
+ struct file *file;
/* Check the EPOLL_* constant for consistency. */
BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
@@ -1334,11 +1563,25 @@ SYSCALL_DEFINE1(epoll_create1, int, flags)
* Creates all the items needed to setup an eventpoll file. That is,
* a file structure and a free file descriptor.
*/
- error = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep,
+ fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC));
+ if (fd < 0) {
+ error = fd;
+ goto out_free_ep;
+ }
+ file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
O_RDWR | (flags & O_CLOEXEC));
- if (error < 0)
- ep_free(ep);
-
+ if (IS_ERR(file)) {
+ error = PTR_ERR(file);
+ goto out_free_fd;
+ }
+ fd_install(fd, file);
+ ep->file = file;
+ return fd;
+
+out_free_fd:
+ put_unused_fd(fd);
+out_free_ep:
+ ep_free(ep);
return error;
}
@@ -1404,21 +1647,27 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
/*
* When we insert an epoll file descriptor, inside another epoll file
* descriptor, there is the change of creating closed loops, which are
- * better be handled here, than in more critical paths.
+ * better be handled here, than in more critical paths. While we are
+ * checking for loops we also determine the list of files reachable
+ * and hang them on the tfile_check_list, so we can check that we
+ * haven't created too many possible wakeup paths.
*
- * We hold epmutex across the loop check and the insert in this case, in
- * order to prevent two separate inserts from racing and each doing the
- * insert "at the same time" such that ep_loop_check passes on both
- * before either one does the insert, thereby creating a cycle.
+ * We need to hold the epmutex across both ep_insert and ep_remove
+ * b/c we want to make sure we are looking at a coherent view of
+ * epoll network.
*/
- if (unlikely(is_file_epoll(tfile) && op == EPOLL_CTL_ADD)) {
+ if (op == EPOLL_CTL_ADD || op == EPOLL_CTL_DEL) {
mutex_lock(&epmutex);
did_lock_epmutex = 1;
- error = -ELOOP;
- if (ep_loop_check(ep, tfile) != 0)
- goto error_tgt_fput;
}
-
+ if (op == EPOLL_CTL_ADD) {
+ if (is_file_epoll(tfile)) {
+ error = -ELOOP;
+ if (ep_loop_check(ep, tfile) != 0)
+ goto error_tgt_fput;
+ } else
+ list_add(&tfile->f_tfile_llink, &tfile_check_list);
+ }
mutex_lock_nested(&ep->mtx, 0);
@@ -1437,6 +1686,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
error = ep_insert(ep, &epds, tfile, fd);
} else
error = -EEXIST;
+ clear_tfile_check_list();
break;
case EPOLL_CTL_DEL:
if (epi)
@@ -1455,7 +1705,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
mutex_unlock(&ep->mtx);
error_tgt_fput:
- if (unlikely(did_lock_epmutex))
+ if (did_lock_epmutex)
mutex_unlock(&epmutex);
fput(tfile);
diff --git a/fs/exec.c b/fs/exec.c
index aeb135c7ff5..b1fd2025e59 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -59,10 +59,13 @@
#include <asm/uaccess.h>
#include <asm/mmu_context.h>
#include <asm/tlb.h>
+#include <asm/exec.h>
#include <trace/events/task.h>
#include "internal.h"
+#include <trace/events/sched.h>
+
int core_uses_pid;
char core_pattern[CORENAME_MAX_SIZE] = "core";
unsigned int core_pipe_limit;
@@ -79,15 +82,13 @@ static atomic_t call_count = ATOMIC_INIT(1);
static LIST_HEAD(formats);
static DEFINE_RWLOCK(binfmt_lock);
-int __register_binfmt(struct linux_binfmt * fmt, int insert)
+void __register_binfmt(struct linux_binfmt * fmt, int insert)
{
- if (!fmt)
- return -EINVAL;
+ BUG_ON(!fmt);
write_lock(&binfmt_lock);
insert ? list_add(&fmt->lh, &formats) :
list_add_tail(&fmt->lh, &formats);
write_unlock(&binfmt_lock);
- return 0;
}
EXPORT_SYMBOL(__register_binfmt);
@@ -822,7 +823,7 @@ static int exec_mmap(struct mm_struct *mm)
/* Notify parent that we're no longer interested in the old VM */
tsk = current;
old_mm = current->mm;
- sync_mm_rss(tsk, old_mm);
+ sync_mm_rss(old_mm);
mm_release(tsk, old_mm);
if (old_mm) {
@@ -848,6 +849,7 @@ static int exec_mmap(struct mm_struct *mm)
if (old_mm) {
up_read(&old_mm->mmap_sem);
BUG_ON(active_mm != old_mm);
+ setmax_mm_hiwater_rss(&tsk->signal->maxrss, old_mm);
mm_update_next_owner(old_mm);
mmput(old_mm);
return 0;
@@ -975,8 +977,8 @@ static int de_thread(struct task_struct *tsk)
sig->notify_count = 0;
no_thread_group:
- if (current->mm)
- setmax_mm_hiwater_rss(&sig->maxrss, current->mm);
+ /* we have changed execution domain */
+ tsk->exit_signal = SIGCHLD;
exit_itimers(sig);
flush_itimer_signals();
@@ -1026,10 +1028,10 @@ static void flush_old_files(struct files_struct * files)
fdt = files_fdtable(files);
if (i >= fdt->max_fds)
break;
- set = fdt->close_on_exec->fds_bits[j];
+ set = fdt->close_on_exec[j];
if (!set)
continue;
- fdt->close_on_exec->fds_bits[j] = 0;
+ fdt->close_on_exec[j] = 0;
spin_unlock(&files->file_lock);
for ( ; set ; i++,set >>= 1) {
if (set & 1) {
@@ -1071,6 +1073,21 @@ void set_task_comm(struct task_struct *tsk, char *buf)
perf_event_comm(tsk);
}
+static void filename_to_taskname(char *tcomm, const char *fn, unsigned int len)
+{
+ int i, ch;
+
+ /* Copies the binary name from after last slash */
+ for (i = 0; (ch = *(fn++)) != '\0';) {
+ if (ch == '/')
+ i = 0; /* overwrite what we wrote */
+ else
+ if (i < len - 1)
+ tcomm[i++] = ch;
+ }
+ tcomm[i] = '\0';
+}
+
int flush_old_exec(struct linux_binprm * bprm)
{
int retval;
@@ -1085,6 +1102,7 @@ int flush_old_exec(struct linux_binprm * bprm)
set_mm_exe_file(bprm->mm, bprm->file);
+ filename_to_taskname(bprm->tcomm, bprm->filename, sizeof(bprm->tcomm));
/*
* Release all of the old mmap stuff
*/
@@ -1096,7 +1114,7 @@ int flush_old_exec(struct linux_binprm * bprm)
bprm->mm = NULL; /* We're using it now */
set_fs(USER_DS);
- current->flags &= ~(PF_RANDOMIZE | PF_KTHREAD);
+ current->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD);
flush_thread();
current->personality &= ~bprm->per_clear;
@@ -1116,10 +1134,6 @@ EXPORT_SYMBOL(would_dump);
void setup_new_exec(struct linux_binprm * bprm)
{
- int i, ch;
- const char *name;
- char tcomm[sizeof(current->comm)];
-
arch_pick_mmap_layout(current->mm);
/* This is the point of no return */
@@ -1130,18 +1144,7 @@ void setup_new_exec(struct linux_binprm * bprm)
else
set_dumpable(current->mm, suid_dumpable);
- name = bprm->filename;
-
- /* Copies the binary name from after last slash */
- for (i=0; (ch = *(name++)) != '\0';) {
- if (ch == '/')
- i = 0; /* overwrite what we wrote */
- else
- if (i < (sizeof(tcomm) - 1))
- tcomm[i++] = ch;
- }
- tcomm[i] = '\0';
- set_task_comm(current, tcomm);
+ set_task_comm(current, bprm->tcomm);
/* Set the new mm task size. We have to do that late because it may
* depend on TIF_32BIT which is only updated in flush_thread() on
@@ -1338,13 +1341,13 @@ int remove_arg_zero(struct linux_binprm *bprm)
ret = -EFAULT;
goto out;
}
- kaddr = kmap_atomic(page, KM_USER0);
+ kaddr = kmap_atomic(page);
for (; offset < PAGE_SIZE && kaddr[offset];
offset++, bprm->p++)
;
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
put_arg_page(page);
if (offset == PAGE_SIZE)
@@ -1368,7 +1371,7 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
unsigned int depth = bprm->recursion_depth;
int try,retval;
struct linux_binfmt *fmt;
- pid_t old_pid;
+ pid_t old_pid, old_vpid;
retval = security_bprm_check(bprm);
if (retval)
@@ -1379,8 +1382,9 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
return retval;
/* Need to fetch pid before load_binary changes it */
+ old_pid = current->pid;
rcu_read_lock();
- old_pid = task_pid_nr_ns(current, task_active_pid_ns(current->parent));
+ old_vpid = task_pid_nr_ns(current, task_active_pid_ns(current->parent));
rcu_read_unlock();
retval = -ENOENT;
@@ -1401,9 +1405,10 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
*/
bprm->recursion_depth = depth;
if (retval >= 0) {
- if (depth == 0)
- ptrace_event(PTRACE_EVENT_EXEC,
- old_pid);
+ if (depth == 0) {
+ trace_sched_process_exec(current, old_pid, bprm);
+ ptrace_event(PTRACE_EVENT_EXEC, old_vpid);
+ }
put_binfmt(fmt);
allow_write_access(bprm->file);
if (bprm->file)
@@ -1914,7 +1919,6 @@ static int coredump_wait(int exit_code, struct core_state *core_state)
{
struct task_struct *tsk = current;
struct mm_struct *mm = tsk->mm;
- struct completion *vfork_done;
int core_waiters = -EBUSY;
init_completion(&core_state->startup);
@@ -1926,22 +1930,9 @@ static int coredump_wait(int exit_code, struct core_state *core_state)
core_waiters = zap_threads(tsk, mm, core_state, exit_code);
up_write(&mm->mmap_sem);
- if (unlikely(core_waiters < 0))
- goto fail;
-
- /*
- * Make sure nobody is waiting for us to release the VM,
- * otherwise we can deadlock when we wait on each other
- */
- vfork_done = tsk->vfork_done;
- if (vfork_done) {
- tsk->vfork_done = NULL;
- complete(vfork_done);
- }
-
- if (core_waiters)
+ if (core_waiters > 0)
wait_for_completion(&core_state->startup);
-fail:
+
return core_waiters;
}
@@ -2077,8 +2068,8 @@ static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
fd_install(0, rp);
spin_lock(&cf->file_lock);
fdt = files_fdtable(cf);
- FD_SET(0, fdt->open_fds);
- FD_CLR(0, fdt->close_on_exec);
+ __set_open_fd(0, fdt);
+ __clear_close_on_exec(0, fdt);
spin_unlock(&cf->file_lock);
/* and disallow core files too */
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
index 80405836ba6..c61e62ac231 100644
--- a/fs/exofs/dir.c
+++ b/fs/exofs/dir.c
@@ -597,7 +597,7 @@ int exofs_make_empty(struct inode *inode, struct inode *parent)
goto fail;
}
- kaddr = kmap_atomic(page, KM_USER0);
+ kaddr = kmap_atomic(page);
de = (struct exofs_dir_entry *)kaddr;
de->name_len = 1;
de->rec_len = cpu_to_le16(EXOFS_DIR_REC_LEN(1));
@@ -611,7 +611,7 @@ int exofs_make_empty(struct inode *inode, struct inode *parent)
de->inode_no = cpu_to_le64(parent->i_ino);
memcpy(de->name, PARENT_DIR, sizeof(PARENT_DIR));
exofs_set_de_type(de, inode);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
err = exofs_commit_chunk(page, 0, chunk_size);
fail:
page_cache_release(page);
diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c
index 9dbf0c30103..fc7161d6bf6 100644
--- a/fs/exofs/namei.c
+++ b/fs/exofs/namei.c
@@ -143,9 +143,6 @@ static int exofs_link(struct dentry *old_dentry, struct inode *dir,
{
struct inode *inode = old_dentry->d_inode;
- if (inode->i_nlink >= EXOFS_LINK_MAX)
- return -EMLINK;
-
inode->i_ctime = CURRENT_TIME;
inode_inc_link_count(inode);
ihold(inode);
@@ -156,10 +153,7 @@ static int exofs_link(struct dentry *old_dentry, struct inode *dir,
static int exofs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
struct inode *inode;
- int err = -EMLINK;
-
- if (dir->i_nlink >= EXOFS_LINK_MAX)
- goto out;
+ int err;
inode_inc_link_count(dir);
@@ -275,11 +269,6 @@ static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (err)
goto out_dir;
} else {
- if (dir_de) {
- err = -EMLINK;
- if (new_dir->i_nlink >= EXOFS_LINK_MAX)
- goto out_dir;
- }
err = exofs_add_link(new_dentry, old_inode);
if (err)
goto out_dir;
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index d22cd168c6e..735ca06430a 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -389,7 +389,7 @@ static int exofs_sync_fs(struct super_block *sb, int wait)
ios->length = offsetof(struct exofs_fscb, s_dev_table_oid);
memset(fscb, 0, ios->length);
fscb->s_nextid = cpu_to_le64(sbi->s_nextid);
- fscb->s_numfiles = cpu_to_le32(sbi->s_numfiles);
+ fscb->s_numfiles = cpu_to_le64(sbi->s_numfiles);
fscb->s_magic = cpu_to_le16(sb->s_magic);
fscb->s_newfs = 0;
fscb->s_version = EXOFS_FSCB_VER;
@@ -529,7 +529,8 @@ static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev,
struct osd_dev_info *odi)
{
odi->systemid_len = le32_to_cpu(dt_dev->systemid_len);
- memcpy(odi->systemid, dt_dev->systemid, odi->systemid_len);
+ if (likely(odi->systemid_len))
+ memcpy(odi->systemid, dt_dev->systemid, OSD_SYSTEMID_LEN);
odi->osdname_len = le32_to_cpu(dt_dev->osdname_len);
odi->osdname = dt_dev->osdname;
@@ -565,7 +566,7 @@ int __alloc_dev_table(struct exofs_sb_info *sbi, unsigned numdevs,
aoded = kzalloc(sizeof(*aoded), GFP_KERNEL);
if (unlikely(!aoded)) {
- EXOFS_ERR("ERROR: faild allocating Device array[%d]\n",
+ EXOFS_ERR("ERROR: failed allocating Device array[%d]\n",
numdevs);
return -ENOMEM;
}
@@ -754,6 +755,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
sb->s_blocksize = EXOFS_BLKSIZE;
sb->s_blocksize_bits = EXOFS_BLKSHIFT;
sb->s_maxbytes = MAX_LFS_FILESIZE;
+ sb->s_max_links = EXOFS_LINK_MAX;
atomic_set(&sbi->s_curr_pending, 0);
sb->s_bdev = NULL;
sb->s_dev = 0;
@@ -818,9 +820,8 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
ret = PTR_ERR(root);
goto free_sbi;
}
- sb->s_root = d_alloc_root(root);
+ sb->s_root = d_make_root(root);
if (!sb->s_root) {
- iput(root);
EXOFS_ERR("ERROR: get root inode failed\n");
ret = -ENOMEM;
goto free_sbi;
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index d37df352d32..0f4f5c92925 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -645,7 +645,7 @@ int ext2_make_empty(struct inode *inode, struct inode *parent)
unlock_page(page);
goto fail;
}
- kaddr = kmap_atomic(page, KM_USER0);
+ kaddr = kmap_atomic(page);
memset(kaddr, 0, chunk_size);
de = (struct ext2_dir_entry_2 *)kaddr;
de->name_len = 1;
@@ -660,7 +660,7 @@ int ext2_make_empty(struct inode *inode, struct inode *parent)
de->inode = cpu_to_le32(parent->i_ino);
memcpy (de->name, "..\0", 4);
ext2_set_de_type (de, inode);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
err = ext2_commit_chunk(page, 0, chunk_size);
fail:
page_cache_release(page);
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 75ad433c669..0b2b4db5bdc 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -1,5 +1,636 @@
+/*
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ * from
+ *
+ * linux/include/linux/minix_fs.h
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ */
#include <linux/fs.h>
#include <linux/ext2_fs.h>
+#include <linux/blockgroup_lock.h>
+#include <linux/percpu_counter.h>
+#include <linux/rbtree.h>
+
+/* XXX Here for now... not interested in restructing headers JUST now */
+
+/* data type for block offset of block group */
+typedef int ext2_grpblk_t;
+
+/* data type for filesystem-wide blocks number */
+typedef unsigned long ext2_fsblk_t;
+
+#define E2FSBLK "%lu"
+
+struct ext2_reserve_window {
+ ext2_fsblk_t _rsv_start; /* First byte reserved */
+ ext2_fsblk_t _rsv_end; /* Last byte reserved or 0 */
+};
+
+struct ext2_reserve_window_node {
+ struct rb_node rsv_node;
+ __u32 rsv_goal_size;
+ __u32 rsv_alloc_hit;
+ struct ext2_reserve_window rsv_window;
+};
+
+struct ext2_block_alloc_info {
+ /* information about reservation window */
+ struct ext2_reserve_window_node rsv_window_node;
+ /*
+ * was i_next_alloc_block in ext2_inode_info
+ * is the logical (file-relative) number of the
+ * most-recently-allocated block in this file.
+ * We use this for detecting linearly ascending allocation requests.
+ */
+ __u32 last_alloc_logical_block;
+ /*
+ * Was i_next_alloc_goal in ext2_inode_info
+ * is the *physical* companion to i_next_alloc_block.
+ * it the the physical block number of the block which was most-recentl
+ * allocated to this file. This give us the goal (target) for the next
+ * allocation when we detect linearly ascending requests.
+ */
+ ext2_fsblk_t last_alloc_physical_block;
+};
+
+#define rsv_start rsv_window._rsv_start
+#define rsv_end rsv_window._rsv_end
+
+/*
+ * second extended-fs super-block data in memory
+ */
+struct ext2_sb_info {
+ unsigned long s_frag_size; /* Size of a fragment in bytes */
+ unsigned long s_frags_per_block;/* Number of fragments per block */
+ unsigned long s_inodes_per_block;/* Number of inodes per block */
+ unsigned long s_frags_per_group;/* Number of fragments in a group */
+ unsigned long s_blocks_per_group;/* Number of blocks in a group */
+ unsigned long s_inodes_per_group;/* Number of inodes in a group */
+ unsigned long s_itb_per_group; /* Number of inode table blocks per group */
+ unsigned long s_gdb_count; /* Number of group descriptor blocks */
+ unsigned long s_desc_per_block; /* Number of group descriptors per block */
+ unsigned long s_groups_count; /* Number of groups in the fs */
+ unsigned long s_overhead_last; /* Last calculated overhead */
+ unsigned long s_blocks_last; /* Last seen block count */
+ struct buffer_head * s_sbh; /* Buffer containing the super block */
+ struct ext2_super_block * s_es; /* Pointer to the super block in the buffer */
+ struct buffer_head ** s_group_desc;
+ unsigned long s_mount_opt;
+ unsigned long s_sb_block;
+ uid_t s_resuid;
+ gid_t s_resgid;
+ unsigned short s_mount_state;
+ unsigned short s_pad;
+ int s_addr_per_block_bits;
+ int s_desc_per_block_bits;
+ int s_inode_size;
+ int s_first_ino;
+ spinlock_t s_next_gen_lock;
+ u32 s_next_generation;
+ unsigned long s_dir_count;
+ u8 *s_debts;
+ struct percpu_counter s_freeblocks_counter;
+ struct percpu_counter s_freeinodes_counter;
+ struct percpu_counter s_dirs_counter;
+ struct blockgroup_lock *s_blockgroup_lock;
+ /* root of the per fs reservation window tree */
+ spinlock_t s_rsv_window_lock;
+ struct rb_root s_rsv_window_root;
+ struct ext2_reserve_window_node s_rsv_window_head;
+ /*
+ * s_lock protects against concurrent modifications of s_mount_state,
+ * s_blocks_last, s_overhead_last and the content of superblock's
+ * buffer pointed to by sbi->s_es.
+ *
+ * Note: It is used in ext2_show_options() to provide a consistent view
+ * of the mount options.
+ */
+ spinlock_t s_lock;
+};
+
+static inline spinlock_t *
+sb_bgl_lock(struct ext2_sb_info *sbi, unsigned int block_group)
+{
+ return bgl_lock_ptr(sbi->s_blockgroup_lock, block_group);
+}
+
+/*
+ * Define EXT2FS_DEBUG to produce debug messages
+ */
+#undef EXT2FS_DEBUG
+
+/*
+ * Define EXT2_RESERVATION to reserve data blocks for expanding files
+ */
+#define EXT2_DEFAULT_RESERVE_BLOCKS 8
+/*max window size: 1024(direct blocks) + 3([t,d]indirect blocks) */
+#define EXT2_MAX_RESERVE_BLOCKS 1027
+#define EXT2_RESERVE_WINDOW_NOT_ALLOCATED 0
+/*
+ * The second extended file system version
+ */
+#define EXT2FS_DATE "95/08/09"
+#define EXT2FS_VERSION "0.5b"
+
+/*
+ * Debug code
+ */
+#ifdef EXT2FS_DEBUG
+# define ext2_debug(f, a...) { \
+ printk ("EXT2-fs DEBUG (%s, %d): %s:", \
+ __FILE__, __LINE__, __func__); \
+ printk (f, ## a); \
+ }
+#else
+# define ext2_debug(f, a...) /**/
+#endif
+
+/*
+ * Special inode numbers
+ */
+#define EXT2_BAD_INO 1 /* Bad blocks inode */
+#define EXT2_ROOT_INO 2 /* Root inode */
+#define EXT2_BOOT_LOADER_INO 5 /* Boot loader inode */
+#define EXT2_UNDEL_DIR_INO 6 /* Undelete directory inode */
+
+/* First non-reserved inode for old ext2 filesystems */
+#define EXT2_GOOD_OLD_FIRST_INO 11
+
+static inline struct ext2_sb_info *EXT2_SB(struct super_block *sb)
+{
+ return sb->s_fs_info;
+}
+
+/*
+ * Macro-instructions used to manage several block sizes
+ */
+#define EXT2_MIN_BLOCK_SIZE 1024
+#define EXT2_MAX_BLOCK_SIZE 4096
+#define EXT2_MIN_BLOCK_LOG_SIZE 10
+#define EXT2_BLOCK_SIZE(s) ((s)->s_blocksize)
+#define EXT2_ADDR_PER_BLOCK(s) (EXT2_BLOCK_SIZE(s) / sizeof (__u32))
+#define EXT2_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits)
+#define EXT2_ADDR_PER_BLOCK_BITS(s) (EXT2_SB(s)->s_addr_per_block_bits)
+#define EXT2_INODE_SIZE(s) (EXT2_SB(s)->s_inode_size)
+#define EXT2_FIRST_INO(s) (EXT2_SB(s)->s_first_ino)
+
+/*
+ * Macro-instructions used to manage fragments
+ */
+#define EXT2_MIN_FRAG_SIZE 1024
+#define EXT2_MAX_FRAG_SIZE 4096
+#define EXT2_MIN_FRAG_LOG_SIZE 10
+#define EXT2_FRAG_SIZE(s) (EXT2_SB(s)->s_frag_size)
+#define EXT2_FRAGS_PER_BLOCK(s) (EXT2_SB(s)->s_frags_per_block)
+
+/*
+ * Structure of a blocks group descriptor
+ */
+struct ext2_group_desc
+{
+ __le32 bg_block_bitmap; /* Blocks bitmap block */
+ __le32 bg_inode_bitmap; /* Inodes bitmap block */
+ __le32 bg_inode_table; /* Inodes table block */
+ __le16 bg_free_blocks_count; /* Free blocks count */
+ __le16 bg_free_inodes_count; /* Free inodes count */
+ __le16 bg_used_dirs_count; /* Directories count */
+ __le16 bg_pad;
+ __le32 bg_reserved[3];
+};
+
+/*
+ * Macro-instructions used to manage group descriptors
+ */
+#define EXT2_BLOCKS_PER_GROUP(s) (EXT2_SB(s)->s_blocks_per_group)
+#define EXT2_DESC_PER_BLOCK(s) (EXT2_SB(s)->s_desc_per_block)
+#define EXT2_INODES_PER_GROUP(s) (EXT2_SB(s)->s_inodes_per_group)
+#define EXT2_DESC_PER_BLOCK_BITS(s) (EXT2_SB(s)->s_desc_per_block_bits)
+
+/*
+ * Constants relative to the data blocks
+ */
+#define EXT2_NDIR_BLOCKS 12
+#define EXT2_IND_BLOCK EXT2_NDIR_BLOCKS
+#define EXT2_DIND_BLOCK (EXT2_IND_BLOCK + 1)
+#define EXT2_TIND_BLOCK (EXT2_DIND_BLOCK + 1)
+#define EXT2_N_BLOCKS (EXT2_TIND_BLOCK + 1)
+
+/*
+ * Inode flags (GETFLAGS/SETFLAGS)
+ */
+#define EXT2_SECRM_FL FS_SECRM_FL /* Secure deletion */
+#define EXT2_UNRM_FL FS_UNRM_FL /* Undelete */
+#define EXT2_COMPR_FL FS_COMPR_FL /* Compress file */
+#define EXT2_SYNC_FL FS_SYNC_FL /* Synchronous updates */
+#define EXT2_IMMUTABLE_FL FS_IMMUTABLE_FL /* Immutable file */
+#define EXT2_APPEND_FL FS_APPEND_FL /* writes to file may only append */
+#define EXT2_NODUMP_FL FS_NODUMP_FL /* do not dump file */
+#define EXT2_NOATIME_FL FS_NOATIME_FL /* do not update atime */
+/* Reserved for compression usage... */
+#define EXT2_DIRTY_FL FS_DIRTY_FL
+#define EXT2_COMPRBLK_FL FS_COMPRBLK_FL /* One or more compressed clusters */
+#define EXT2_NOCOMP_FL FS_NOCOMP_FL /* Don't compress */
+#define EXT2_ECOMPR_FL FS_ECOMPR_FL /* Compression error */
+/* End compression flags --- maybe not all used */
+#define EXT2_BTREE_FL FS_BTREE_FL /* btree format dir */
+#define EXT2_INDEX_FL FS_INDEX_FL /* hash-indexed directory */
+#define EXT2_IMAGIC_FL FS_IMAGIC_FL /* AFS directory */
+#define EXT2_JOURNAL_DATA_FL FS_JOURNAL_DATA_FL /* Reserved for ext3 */
+#define EXT2_NOTAIL_FL FS_NOTAIL_FL /* file tail should not be merged */
+#define EXT2_DIRSYNC_FL FS_DIRSYNC_FL /* dirsync behaviour (directories only) */
+#define EXT2_TOPDIR_FL FS_TOPDIR_FL /* Top of directory hierarchies*/
+#define EXT2_RESERVED_FL FS_RESERVED_FL /* reserved for ext2 lib */
+
+#define EXT2_FL_USER_VISIBLE FS_FL_USER_VISIBLE /* User visible flags */
+#define EXT2_FL_USER_MODIFIABLE FS_FL_USER_MODIFIABLE /* User modifiable flags */
+
+/* Flags that should be inherited by new inodes from their parent. */
+#define EXT2_FL_INHERITED (EXT2_SECRM_FL | EXT2_UNRM_FL | EXT2_COMPR_FL |\
+ EXT2_SYNC_FL | EXT2_NODUMP_FL |\
+ EXT2_NOATIME_FL | EXT2_COMPRBLK_FL |\
+ EXT2_NOCOMP_FL | EXT2_JOURNAL_DATA_FL |\
+ EXT2_NOTAIL_FL | EXT2_DIRSYNC_FL)
+
+/* Flags that are appropriate for regular files (all but dir-specific ones). */
+#define EXT2_REG_FLMASK (~(EXT2_DIRSYNC_FL | EXT2_TOPDIR_FL))
+
+/* Flags that are appropriate for non-directories/regular files. */
+#define EXT2_OTHER_FLMASK (EXT2_NODUMP_FL | EXT2_NOATIME_FL)
+
+/* Mask out flags that are inappropriate for the given type of inode. */
+static inline __u32 ext2_mask_flags(umode_t mode, __u32 flags)
+{
+ if (S_ISDIR(mode))
+ return flags;
+ else if (S_ISREG(mode))
+ return flags & EXT2_REG_FLMASK;
+ else
+ return flags & EXT2_OTHER_FLMASK;
+}
+
+/*
+ * ioctl commands
+ */
+#define EXT2_IOC_GETFLAGS FS_IOC_GETFLAGS
+#define EXT2_IOC_SETFLAGS FS_IOC_SETFLAGS
+#define EXT2_IOC_GETVERSION FS_IOC_GETVERSION
+#define EXT2_IOC_SETVERSION FS_IOC_SETVERSION
+#define EXT2_IOC_GETRSVSZ _IOR('f', 5, long)
+#define EXT2_IOC_SETRSVSZ _IOW('f', 6, long)
+
+/*
+ * ioctl commands in 32 bit emulation
+ */
+#define EXT2_IOC32_GETFLAGS FS_IOC32_GETFLAGS
+#define EXT2_IOC32_SETFLAGS FS_IOC32_SETFLAGS
+#define EXT2_IOC32_GETVERSION FS_IOC32_GETVERSION
+#define EXT2_IOC32_SETVERSION FS_IOC32_SETVERSION
+
+/*
+ * Structure of an inode on the disk
+ */
+struct ext2_inode {
+ __le16 i_mode; /* File mode */
+ __le16 i_uid; /* Low 16 bits of Owner Uid */
+ __le32 i_size; /* Size in bytes */
+ __le32 i_atime; /* Access time */
+ __le32 i_ctime; /* Creation time */
+ __le32 i_mtime; /* Modification time */
+ __le32 i_dtime; /* Deletion Time */
+ __le16 i_gid; /* Low 16 bits of Group Id */
+ __le16 i_links_count; /* Links count */
+ __le32 i_blocks; /* Blocks count */
+ __le32 i_flags; /* File flags */
+ union {
+ struct {
+ __le32 l_i_reserved1;
+ } linux1;
+ struct {
+ __le32 h_i_translator;
+ } hurd1;
+ struct {
+ __le32 m_i_reserved1;
+ } masix1;
+ } osd1; /* OS dependent 1 */
+ __le32 i_block[EXT2_N_BLOCKS];/* Pointers to blocks */
+ __le32 i_generation; /* File version (for NFS) */
+ __le32 i_file_acl; /* File ACL */
+ __le32 i_dir_acl; /* Directory ACL */
+ __le32 i_faddr; /* Fragment address */
+ union {
+ struct {
+ __u8 l_i_frag; /* Fragment number */
+ __u8 l_i_fsize; /* Fragment size */
+ __u16 i_pad1;
+ __le16 l_i_uid_high; /* these 2 fields */
+ __le16 l_i_gid_high; /* were reserved2[0] */
+ __u32 l_i_reserved2;
+ } linux2;
+ struct {
+ __u8 h_i_frag; /* Fragment number */
+ __u8 h_i_fsize; /* Fragment size */
+ __le16 h_i_mode_high;
+ __le16 h_i_uid_high;
+ __le16 h_i_gid_high;
+ __le32 h_i_author;
+ } hurd2;
+ struct {
+ __u8 m_i_frag; /* Fragment number */
+ __u8 m_i_fsize; /* Fragment size */
+ __u16 m_pad1;
+ __u32 m_i_reserved2[2];
+ } masix2;
+ } osd2; /* OS dependent 2 */
+};
+
+#define i_size_high i_dir_acl
+
+#define i_reserved1 osd1.linux1.l_i_reserved1
+#define i_frag osd2.linux2.l_i_frag
+#define i_fsize osd2.linux2.l_i_fsize
+#define i_uid_low i_uid
+#define i_gid_low i_gid
+#define i_uid_high osd2.linux2.l_i_uid_high
+#define i_gid_high osd2.linux2.l_i_gid_high
+#define i_reserved2 osd2.linux2.l_i_reserved2
+
+/*
+ * File system states
+ */
+#define EXT2_VALID_FS 0x0001 /* Unmounted cleanly */
+#define EXT2_ERROR_FS 0x0002 /* Errors detected */
+
+/*
+ * Mount flags
+ */
+#define EXT2_MOUNT_CHECK 0x000001 /* Do mount-time checks */
+#define EXT2_MOUNT_OLDALLOC 0x000002 /* Don't use the new Orlov allocator */
+#define EXT2_MOUNT_GRPID 0x000004 /* Create files with directory's group */
+#define EXT2_MOUNT_DEBUG 0x000008 /* Some debugging messages */
+#define EXT2_MOUNT_ERRORS_CONT 0x000010 /* Continue on errors */
+#define EXT2_MOUNT_ERRORS_RO 0x000020 /* Remount fs ro on errors */
+#define EXT2_MOUNT_ERRORS_PANIC 0x000040 /* Panic on errors */
+#define EXT2_MOUNT_MINIX_DF 0x000080 /* Mimics the Minix statfs */
+#define EXT2_MOUNT_NOBH 0x000100 /* No buffer_heads */
+#define EXT2_MOUNT_NO_UID32 0x000200 /* Disable 32-bit UIDs */
+#define EXT2_MOUNT_XATTR_USER 0x004000 /* Extended user attributes */
+#define EXT2_MOUNT_POSIX_ACL 0x008000 /* POSIX Access Control Lists */
+#define EXT2_MOUNT_XIP 0x010000 /* Execute in place */
+#define EXT2_MOUNT_USRQUOTA 0x020000 /* user quota */
+#define EXT2_MOUNT_GRPQUOTA 0x040000 /* group quota */
+#define EXT2_MOUNT_RESERVATION 0x080000 /* Preallocation */
+
+
+#define clear_opt(o, opt) o &= ~EXT2_MOUNT_##opt
+#define set_opt(o, opt) o |= EXT2_MOUNT_##opt
+#define test_opt(sb, opt) (EXT2_SB(sb)->s_mount_opt & \
+ EXT2_MOUNT_##opt)
+/*
+ * Maximal mount counts between two filesystem checks
+ */
+#define EXT2_DFL_MAX_MNT_COUNT 20 /* Allow 20 mounts */
+#define EXT2_DFL_CHECKINTERVAL 0 /* Don't use interval check */
+
+/*
+ * Behaviour when detecting errors
+ */
+#define EXT2_ERRORS_CONTINUE 1 /* Continue execution */
+#define EXT2_ERRORS_RO 2 /* Remount fs read-only */
+#define EXT2_ERRORS_PANIC 3 /* Panic */
+#define EXT2_ERRORS_DEFAULT EXT2_ERRORS_CONTINUE
+
+/*
+ * Structure of the super block
+ */
+struct ext2_super_block {
+ __le32 s_inodes_count; /* Inodes count */
+ __le32 s_blocks_count; /* Blocks count */
+ __le32 s_r_blocks_count; /* Reserved blocks count */
+ __le32 s_free_blocks_count; /* Free blocks count */
+ __le32 s_free_inodes_count; /* Free inodes count */
+ __le32 s_first_data_block; /* First Data Block */
+ __le32 s_log_block_size; /* Block size */
+ __le32 s_log_frag_size; /* Fragment size */
+ __le32 s_blocks_per_group; /* # Blocks per group */
+ __le32 s_frags_per_group; /* # Fragments per group */
+ __le32 s_inodes_per_group; /* # Inodes per group */
+ __le32 s_mtime; /* Mount time */
+ __le32 s_wtime; /* Write time */
+ __le16 s_mnt_count; /* Mount count */
+ __le16 s_max_mnt_count; /* Maximal mount count */
+ __le16 s_magic; /* Magic signature */
+ __le16 s_state; /* File system state */
+ __le16 s_errors; /* Behaviour when detecting errors */
+ __le16 s_minor_rev_level; /* minor revision level */
+ __le32 s_lastcheck; /* time of last check */
+ __le32 s_checkinterval; /* max. time between checks */
+ __le32 s_creator_os; /* OS */
+ __le32 s_rev_level; /* Revision level */
+ __le16 s_def_resuid; /* Default uid for reserved blocks */
+ __le16 s_def_resgid; /* Default gid for reserved blocks */
+ /*
+ * These fields are for EXT2_DYNAMIC_REV superblocks only.
+ *
+ * Note: the difference between the compatible feature set and
+ * the incompatible feature set is that if there is a bit set
+ * in the incompatible feature set that the kernel doesn't
+ * know about, it should refuse to mount the filesystem.
+ *
+ * e2fsck's requirements are more strict; if it doesn't know
+ * about a feature in either the compatible or incompatible
+ * feature set, it must abort and not try to meddle with
+ * things it doesn't understand...
+ */
+ __le32 s_first_ino; /* First non-reserved inode */
+ __le16 s_inode_size; /* size of inode structure */
+ __le16 s_block_group_nr; /* block group # of this superblock */
+ __le32 s_feature_compat; /* compatible feature set */
+ __le32 s_feature_incompat; /* incompatible feature set */
+ __le32 s_feature_ro_compat; /* readonly-compatible feature set */
+ __u8 s_uuid[16]; /* 128-bit uuid for volume */
+ char s_volume_name[16]; /* volume name */
+ char s_last_mounted[64]; /* directory where last mounted */
+ __le32 s_algorithm_usage_bitmap; /* For compression */
+ /*
+ * Performance hints. Directory preallocation should only
+ * happen if the EXT2_COMPAT_PREALLOC flag is on.
+ */
+ __u8 s_prealloc_blocks; /* Nr of blocks to try to preallocate*/
+ __u8 s_prealloc_dir_blocks; /* Nr to preallocate for dirs */
+ __u16 s_padding1;
+ /*
+ * Journaling support valid if EXT3_FEATURE_COMPAT_HAS_JOURNAL set.
+ */
+ __u8 s_journal_uuid[16]; /* uuid of journal superblock */
+ __u32 s_journal_inum; /* inode number of journal file */
+ __u32 s_journal_dev; /* device number of journal file */
+ __u32 s_last_orphan; /* start of list of inodes to delete */
+ __u32 s_hash_seed[4]; /* HTREE hash seed */
+ __u8 s_def_hash_version; /* Default hash version to use */
+ __u8 s_reserved_char_pad;
+ __u16 s_reserved_word_pad;
+ __le32 s_default_mount_opts;
+ __le32 s_first_meta_bg; /* First metablock block group */
+ __u32 s_reserved[190]; /* Padding to the end of the block */
+};
+
+/*
+ * Codes for operating systems
+ */
+#define EXT2_OS_LINUX 0
+#define EXT2_OS_HURD 1
+#define EXT2_OS_MASIX 2
+#define EXT2_OS_FREEBSD 3
+#define EXT2_OS_LITES 4
+
+/*
+ * Revision levels
+ */
+#define EXT2_GOOD_OLD_REV 0 /* The good old (original) format */
+#define EXT2_DYNAMIC_REV 1 /* V2 format w/ dynamic inode sizes */
+
+#define EXT2_CURRENT_REV EXT2_GOOD_OLD_REV
+#define EXT2_MAX_SUPP_REV EXT2_DYNAMIC_REV
+
+#define EXT2_GOOD_OLD_INODE_SIZE 128
+
+/*
+ * Feature set definitions
+ */
+
+#define EXT2_HAS_COMPAT_FEATURE(sb,mask) \
+ ( EXT2_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask) )
+#define EXT2_HAS_RO_COMPAT_FEATURE(sb,mask) \
+ ( EXT2_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask) )
+#define EXT2_HAS_INCOMPAT_FEATURE(sb,mask) \
+ ( EXT2_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask) )
+#define EXT2_SET_COMPAT_FEATURE(sb,mask) \
+ EXT2_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask)
+#define EXT2_SET_RO_COMPAT_FEATURE(sb,mask) \
+ EXT2_SB(sb)->s_es->s_feature_ro_compat |= cpu_to_le32(mask)
+#define EXT2_SET_INCOMPAT_FEATURE(sb,mask) \
+ EXT2_SB(sb)->s_es->s_feature_incompat |= cpu_to_le32(mask)
+#define EXT2_CLEAR_COMPAT_FEATURE(sb,mask) \
+ EXT2_SB(sb)->s_es->s_feature_compat &= ~cpu_to_le32(mask)
+#define EXT2_CLEAR_RO_COMPAT_FEATURE(sb,mask) \
+ EXT2_SB(sb)->s_es->s_feature_ro_compat &= ~cpu_to_le32(mask)
+#define EXT2_CLEAR_INCOMPAT_FEATURE(sb,mask) \
+ EXT2_SB(sb)->s_es->s_feature_incompat &= ~cpu_to_le32(mask)
+
+#define EXT2_FEATURE_COMPAT_DIR_PREALLOC 0x0001
+#define EXT2_FEATURE_COMPAT_IMAGIC_INODES 0x0002
+#define EXT3_FEATURE_COMPAT_HAS_JOURNAL 0x0004
+#define EXT2_FEATURE_COMPAT_EXT_ATTR 0x0008
+#define EXT2_FEATURE_COMPAT_RESIZE_INO 0x0010
+#define EXT2_FEATURE_COMPAT_DIR_INDEX 0x0020
+#define EXT2_FEATURE_COMPAT_ANY 0xffffffff
+
+#define EXT2_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001
+#define EXT2_FEATURE_RO_COMPAT_LARGE_FILE 0x0002
+#define EXT2_FEATURE_RO_COMPAT_BTREE_DIR 0x0004
+#define EXT2_FEATURE_RO_COMPAT_ANY 0xffffffff
+
+#define EXT2_FEATURE_INCOMPAT_COMPRESSION 0x0001
+#define EXT2_FEATURE_INCOMPAT_FILETYPE 0x0002
+#define EXT3_FEATURE_INCOMPAT_RECOVER 0x0004
+#define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008
+#define EXT2_FEATURE_INCOMPAT_META_BG 0x0010
+#define EXT2_FEATURE_INCOMPAT_ANY 0xffffffff
+
+#define EXT2_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR
+#define EXT2_FEATURE_INCOMPAT_SUPP (EXT2_FEATURE_INCOMPAT_FILETYPE| \
+ EXT2_FEATURE_INCOMPAT_META_BG)
+#define EXT2_FEATURE_RO_COMPAT_SUPP (EXT2_FEATURE_RO_COMPAT_SPARSE_SUPER| \
+ EXT2_FEATURE_RO_COMPAT_LARGE_FILE| \
+ EXT2_FEATURE_RO_COMPAT_BTREE_DIR)
+#define EXT2_FEATURE_RO_COMPAT_UNSUPPORTED ~EXT2_FEATURE_RO_COMPAT_SUPP
+#define EXT2_FEATURE_INCOMPAT_UNSUPPORTED ~EXT2_FEATURE_INCOMPAT_SUPP
+
+/*
+ * Default values for user and/or group using reserved blocks
+ */
+#define EXT2_DEF_RESUID 0
+#define EXT2_DEF_RESGID 0
+
+/*
+ * Default mount options
+ */
+#define EXT2_DEFM_DEBUG 0x0001
+#define EXT2_DEFM_BSDGROUPS 0x0002
+#define EXT2_DEFM_XATTR_USER 0x0004
+#define EXT2_DEFM_ACL 0x0008
+#define EXT2_DEFM_UID16 0x0010
+ /* Not used by ext2, but reserved for use by ext3 */
+#define EXT3_DEFM_JMODE 0x0060
+#define EXT3_DEFM_JMODE_DATA 0x0020
+#define EXT3_DEFM_JMODE_ORDERED 0x0040
+#define EXT3_DEFM_JMODE_WBACK 0x0060
+
+/*
+ * Structure of a directory entry
+ */
+
+struct ext2_dir_entry {
+ __le32 inode; /* Inode number */
+ __le16 rec_len; /* Directory entry length */
+ __le16 name_len; /* Name length */
+ char name[]; /* File name, up to EXT2_NAME_LEN */
+};
+
+/*
+ * The new version of the directory entry. Since EXT2 structures are
+ * stored in intel byte order, and the name_len field could never be
+ * bigger than 255 chars, it's safe to reclaim the extra byte for the
+ * file_type field.
+ */
+struct ext2_dir_entry_2 {
+ __le32 inode; /* Inode number */
+ __le16 rec_len; /* Directory entry length */
+ __u8 name_len; /* Name length */
+ __u8 file_type;
+ char name[]; /* File name, up to EXT2_NAME_LEN */
+};
+
+/*
+ * Ext2 directory file types. Only the low 3 bits are used. The
+ * other bits are reserved for now.
+ */
+enum {
+ EXT2_FT_UNKNOWN = 0,
+ EXT2_FT_REG_FILE = 1,
+ EXT2_FT_DIR = 2,
+ EXT2_FT_CHRDEV = 3,
+ EXT2_FT_BLKDEV = 4,
+ EXT2_FT_FIFO = 5,
+ EXT2_FT_SOCK = 6,
+ EXT2_FT_SYMLINK = 7,
+ EXT2_FT_MAX
+};
+
+/*
+ * EXT2_DIR_PAD defines the directory entries boundaries
+ *
+ * NOTE: It must be a multiple of 4
+ */
+#define EXT2_DIR_PAD 4
+#define EXT2_DIR_ROUND (EXT2_DIR_PAD - 1)
+#define EXT2_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT2_DIR_ROUND) & \
+ ~EXT2_DIR_ROUND)
+#define EXT2_MAX_REC_LEN ((1<<16)-1)
+
+static inline void verify_offsets(void)
+{
+#define A(x,y) BUILD_BUG_ON(x != offsetof(struct ext2_super_block, y));
+ A(EXT2_SB_MAGIC_OFFSET, s_magic);
+ A(EXT2_SB_BLOCKS_OFFSET, s_blocks_count);
+ A(EXT2_SB_BSIZE_OFFSET, s_log_block_size);
+#undef A
+}
/*
* ext2 mount options
diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c
index 1089f760c84..2de655f5d62 100644
--- a/fs/ext2/ioctl.c
+++ b/fs/ext2/ioctl.c
@@ -77,10 +77,11 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
flags = flags & EXT2_FL_USER_MODIFIABLE;
flags |= oldflags & ~EXT2_FL_USER_MODIFIABLE;
ei->i_flags = flags;
- mutex_unlock(&inode->i_mutex);
ext2_set_inode_flags(inode);
inode->i_ctime = CURRENT_TIME_SEC;
+ mutex_unlock(&inode->i_mutex);
+
mark_inode_dirty(inode);
setflags_out:
mnt_drop_write_file(filp);
@@ -88,20 +89,29 @@ setflags_out:
}
case EXT2_IOC_GETVERSION:
return put_user(inode->i_generation, (int __user *) arg);
- case EXT2_IOC_SETVERSION:
+ case EXT2_IOC_SETVERSION: {
+ __u32 generation;
+
if (!inode_owner_or_capable(inode))
return -EPERM;
ret = mnt_want_write_file(filp);
if (ret)
return ret;
- if (get_user(inode->i_generation, (int __user *) arg)) {
+ if (get_user(generation, (int __user *) arg)) {
ret = -EFAULT;
- } else {
- inode->i_ctime = CURRENT_TIME_SEC;
- mark_inode_dirty(inode);
+ goto setversion_out;
}
+
+ mutex_lock(&inode->i_mutex);
+ inode->i_ctime = CURRENT_TIME_SEC;
+ inode->i_generation = generation;
+ mutex_unlock(&inode->i_mutex);
+
+ mark_inode_dirty(inode);
+setversion_out:
mnt_drop_write_file(filp);
return ret;
+ }
case EXT2_IOC_GETRSVSZ:
if (test_opt(inode->i_sb, RESERVATION)
&& S_ISREG(inode->i_mode)
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 080419814ba..dffb8653628 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -195,9 +195,6 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir,
struct inode *inode = old_dentry->d_inode;
int err;
- if (inode->i_nlink >= EXT2_LINK_MAX)
- return -EMLINK;
-
dquot_initialize(dir);
inode->i_ctime = CURRENT_TIME_SEC;
@@ -217,10 +214,7 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir,
static int ext2_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode)
{
struct inode * inode;
- int err = -EMLINK;
-
- if (dir->i_nlink >= EXT2_LINK_MAX)
- goto out;
+ int err;
dquot_initialize(dir);
@@ -346,11 +340,6 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
drop_nlink(new_inode);
inode_dec_link_count(new_inode);
} else {
- if (dir_de) {
- err = -EMLINK;
- if (new_dir->i_nlink >= EXT2_LINK_MAX)
- goto out_dir;
- }
err = ext2_add_link(new_dentry, old_inode);
if (err)
goto out_dir;
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 0090595beb2..e1025c7a437 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -919,6 +919,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
}
sb->s_maxbytes = ext2_max_size(sb->s_blocksize_bits);
+ sb->s_max_links = EXT2_LINK_MAX;
if (le32_to_cpu(es->s_rev_level) == EXT2_GOOD_OLD_REV) {
sbi->s_inode_size = EXT2_GOOD_OLD_INODE_SIZE;
@@ -1087,9 +1088,8 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
goto failed_mount3;
}
- sb->s_root = d_alloc_root(root);
+ sb->s_root = d_make_root(root);
if (!sb->s_root) {
- iput(root);
ext2_msg(sb, KERN_ERR, "error: get root inode failed");
ret = -ENOMEM;
goto failed_mount3;
diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c
index be7a8d02c9a..cfedb2cb0d8 100644
--- a/fs/ext2/xattr_security.c
+++ b/fs/ext2/xattr_security.c
@@ -3,10 +3,7 @@
* Handler for storing security labels as extended attributes.
*/
-#include <linux/slab.h>
-#include <linux/string.h>
-#include <linux/fs.h>
-#include <linux/ext2_fs.h>
+#include "ext2.h"
#include <linux/security.h>
#include "xattr.h"
diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c
index 2989467d359..7e192574c00 100644
--- a/fs/ext2/xattr_trusted.c
+++ b/fs/ext2/xattr_trusted.c
@@ -5,10 +5,7 @@
* Copyright (C) 2003 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
*/
-#include <linux/string.h>
-#include <linux/capability.h>
-#include <linux/fs.h>
-#include <linux/ext2_fs.h>
+#include "ext2.h"
#include "xattr.h"
static size_t
diff --git a/fs/ext2/xip.c b/fs/ext2/xip.c
index 322a56b2dfb..1c3312858fc 100644
--- a/fs/ext2/xip.c
+++ b/fs/ext2/xip.c
@@ -9,8 +9,6 @@
#include <linux/fs.h>
#include <linux/genhd.h>
#include <linux/buffer_head.h>
-#include <linux/ext2_fs_sb.h>
-#include <linux/ext2_fs.h>
#include <linux/blkdev.h>
#include "ext2.h"
#include "xip.h"
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index 3091f62e55b..c76832c8d19 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -4,13 +4,7 @@
* Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
*/
-#include <linux/init.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/capability.h>
-#include <linux/fs.h>
-#include <linux/ext3_jbd.h>
-#include <linux/ext3_fs.h>
+#include "ext3.h"
#include "xattr.h"
#include "acl.h"
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index a2038928f9a..baac1b129fb 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -11,17 +11,9 @@
* David S. Miller (davem@caip.rutgers.edu), 1995
*/
-#include <linux/time.h>
-#include <linux/capability.h>
-#include <linux/fs.h>
-#include <linux/slab.h>
-#include <linux/jbd.h>
-#include <linux/ext3_fs.h>
-#include <linux/ext3_jbd.h>
#include <linux/quotaops.h>
-#include <linux/buffer_head.h>
#include <linux/blkdev.h>
-#include <trace/events/ext3.h>
+#include "ext3.h"
/*
* balloc.c contains the blocks allocation and deallocation routines
@@ -1743,8 +1735,11 @@ allocated:
*errp = 0;
brelse(bitmap_bh);
- dquot_free_block(inode, *count-num);
- *count = num;
+
+ if (num < *count) {
+ dquot_free_block(inode, *count-num);
+ *count = num;
+ }
trace_ext3_allocate_blocks(inode, goal, num,
(unsigned long long)ret_block);
@@ -1970,7 +1965,7 @@ static ext3_grpblk_t ext3_trim_all_free(struct super_block *sb,
sbi = EXT3_SB(sb);
/* Walk through the whole group */
- while (start < max) {
+ while (start <= max) {
start = bitmap_search_next_usable_block(start, bitmap_bh, max);
if (start < 0)
break;
@@ -1980,7 +1975,7 @@ static ext3_grpblk_t ext3_trim_all_free(struct super_block *sb,
* Allocate contiguous free extents by setting bits in the
* block bitmap
*/
- while (next < max
+ while (next <= max
&& claim_block(sb_bgl_lock(sbi, group),
next, bitmap_bh)) {
next++;
@@ -2091,73 +2086,74 @@ err_out:
*/
int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range)
{
- ext3_grpblk_t last_block, first_block, free_blocks;
- unsigned long first_group, last_group;
- unsigned long group, ngroups;
+ ext3_grpblk_t last_block, first_block;
+ unsigned long group, first_group, last_group;
struct ext3_group_desc *gdp;
struct ext3_super_block *es = EXT3_SB(sb)->s_es;
- uint64_t start, len, minlen, trimmed;
+ uint64_t start, minlen, end, trimmed = 0;
+ ext3_fsblk_t first_data_blk =
+ le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block);
ext3_fsblk_t max_blks = le32_to_cpu(es->s_blocks_count);
int ret = 0;
- start = (range->start >> sb->s_blocksize_bits) +
- le32_to_cpu(es->s_first_data_block);
- len = range->len >> sb->s_blocksize_bits;
+ start = range->start >> sb->s_blocksize_bits;
+ end = start + (range->len >> sb->s_blocksize_bits) - 1;
minlen = range->minlen >> sb->s_blocksize_bits;
- trimmed = 0;
- if (unlikely(minlen > EXT3_BLOCKS_PER_GROUP(sb)))
+ if (unlikely(minlen > EXT3_BLOCKS_PER_GROUP(sb)) ||
+ unlikely(start >= max_blks))
return -EINVAL;
- if (start >= max_blks)
- return -EINVAL;
- if (start + len > max_blks)
- len = max_blks - start;
+ if (end >= max_blks)
+ end = max_blks - 1;
+ if (end <= first_data_blk)
+ goto out;
+ if (start < first_data_blk)
+ start = first_data_blk;
- ngroups = EXT3_SB(sb)->s_groups_count;
smp_rmb();
/* Determine first and last group to examine based on start and len */
ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) start,
&first_group, &first_block);
- ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) (start + len),
+ ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) end,
&last_group, &last_block);
- last_group = (last_group > ngroups - 1) ? ngroups - 1 : last_group;
- last_block = EXT3_BLOCKS_PER_GROUP(sb);
- if (first_group > last_group)
- return -EINVAL;
+ /* end now represents the last block to discard in this group */
+ end = EXT3_BLOCKS_PER_GROUP(sb) - 1;
for (group = first_group; group <= last_group; group++) {
gdp = ext3_get_group_desc(sb, group, NULL);
if (!gdp)
break;
- free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
- if (free_blocks < minlen)
- continue;
-
/*
* For all the groups except the last one, last block will
- * always be EXT3_BLOCKS_PER_GROUP(sb), so we only need to
- * change it for the last group in which case first_block +
- * len < EXT3_BLOCKS_PER_GROUP(sb).
+ * always be EXT3_BLOCKS_PER_GROUP(sb)-1, so we only need to
+ * change it for the last group, note that last_block is
+ * already computed earlier by ext3_get_group_no_and_offset()
*/
- if (first_block + len < EXT3_BLOCKS_PER_GROUP(sb))
- last_block = first_block + len;
- len -= last_block - first_block;
+ if (group == last_group)
+ end = last_block;
- ret = ext3_trim_all_free(sb, group, first_block,
- last_block, minlen);
- if (ret < 0)
- break;
+ if (le16_to_cpu(gdp->bg_free_blocks_count) >= minlen) {
+ ret = ext3_trim_all_free(sb, group, first_block,
+ end, minlen);
+ if (ret < 0)
+ break;
+ trimmed += ret;
+ }
- trimmed += ret;
+ /*
+ * For every group except the first one, we are sure
+ * that the first block to discard will be block #0.
+ */
first_block = 0;
}
- if (ret >= 0)
+ if (ret > 0)
ret = 0;
- range->len = trimmed * sb->s_blocksize;
+out:
+ range->len = trimmed * sb->s_blocksize;
return ret;
}
diff --git a/fs/ext3/bitmap.c b/fs/ext3/bitmap.c
index 6afc39d8025..909d13e2656 100644
--- a/fs/ext3/bitmap.c
+++ b/fs/ext3/bitmap.c
@@ -7,9 +7,7 @@
* Universite Pierre et Marie Curie (Paris VI)
*/
-#include <linux/buffer_head.h>
-#include <linux/jbd.h>
-#include <linux/ext3_fs.h>
+#include "ext3.h"
#ifdef EXT3FS_DEBUG
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index 34f0a072b93..cc761ad8fa5 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -21,12 +21,7 @@
*
*/
-#include <linux/fs.h>
-#include <linux/jbd.h>
-#include <linux/ext3_fs.h>
-#include <linux/buffer_head.h>
-#include <linux/slab.h>
-#include <linux/rbtree.h>
+#include "ext3.h"
static unsigned char ext3_filetype_table[] = {
DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
diff --git a/fs/ext3/ext3.h b/fs/ext3/ext3.h
new file mode 100644
index 00000000000..b6515fd7e56
--- /dev/null
+++ b/fs/ext3/ext3.h
@@ -0,0 +1,1322 @@
+/*
+ * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
+ *
+ * Copyright 1998--1999 Red Hat corp --- All Rights Reserved
+ *
+ * This file is part of the Linux kernel and is made available under
+ * the terms of the GNU General Public License, version 2, or at your
+ * option, any later version, incorporated herein by reference.
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ * from
+ *
+ * linux/include/linux/minix_fs.h
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ */
+
+#include <linux/fs.h>
+#include <linux/jbd.h>
+#include <linux/magic.h>
+#include <linux/bug.h>
+#include <linux/blockgroup_lock.h>
+
+/*
+ * The second extended filesystem constants/structures
+ */
+
+/*
+ * Define EXT3FS_DEBUG to produce debug messages
+ */
+#undef EXT3FS_DEBUG
+
+/*
+ * Define EXT3_RESERVATION to reserve data blocks for expanding files
+ */
+#define EXT3_DEFAULT_RESERVE_BLOCKS 8
+/*max window size: 1024(direct blocks) + 3([t,d]indirect blocks) */
+#define EXT3_MAX_RESERVE_BLOCKS 1027
+#define EXT3_RESERVE_WINDOW_NOT_ALLOCATED 0
+
+/*
+ * Debug code
+ */
+#ifdef EXT3FS_DEBUG
+#define ext3_debug(f, a...) \
+ do { \
+ printk (KERN_DEBUG "EXT3-fs DEBUG (%s, %d): %s:", \
+ __FILE__, __LINE__, __func__); \
+ printk (KERN_DEBUG f, ## a); \
+ } while (0)
+#else
+#define ext3_debug(f, a...) do {} while (0)
+#endif
+
+/*
+ * Special inodes numbers
+ */
+#define EXT3_BAD_INO 1 /* Bad blocks inode */
+#define EXT3_ROOT_INO 2 /* Root inode */
+#define EXT3_BOOT_LOADER_INO 5 /* Boot loader inode */
+#define EXT3_UNDEL_DIR_INO 6 /* Undelete directory inode */
+#define EXT3_RESIZE_INO 7 /* Reserved group descriptors inode */
+#define EXT3_JOURNAL_INO 8 /* Journal inode */
+
+/* First non-reserved inode for old ext3 filesystems */
+#define EXT3_GOOD_OLD_FIRST_INO 11
+
+/*
+ * Maximal count of links to a file
+ */
+#define EXT3_LINK_MAX 32000
+
+/*
+ * Macro-instructions used to manage several block sizes
+ */
+#define EXT3_MIN_BLOCK_SIZE 1024
+#define EXT3_MAX_BLOCK_SIZE 65536
+#define EXT3_MIN_BLOCK_LOG_SIZE 10
+#define EXT3_BLOCK_SIZE(s) ((s)->s_blocksize)
+#define EXT3_ADDR_PER_BLOCK(s) (EXT3_BLOCK_SIZE(s) / sizeof (__u32))
+#define EXT3_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits)
+#define EXT3_ADDR_PER_BLOCK_BITS(s) (EXT3_SB(s)->s_addr_per_block_bits)
+#define EXT3_INODE_SIZE(s) (EXT3_SB(s)->s_inode_size)
+#define EXT3_FIRST_INO(s) (EXT3_SB(s)->s_first_ino)
+
+/*
+ * Macro-instructions used to manage fragments
+ */
+#define EXT3_MIN_FRAG_SIZE 1024
+#define EXT3_MAX_FRAG_SIZE 4096
+#define EXT3_MIN_FRAG_LOG_SIZE 10
+#define EXT3_FRAG_SIZE(s) (EXT3_SB(s)->s_frag_size)
+#define EXT3_FRAGS_PER_BLOCK(s) (EXT3_SB(s)->s_frags_per_block)
+
+/*
+ * Structure of a blocks group descriptor
+ */
+struct ext3_group_desc
+{
+ __le32 bg_block_bitmap; /* Blocks bitmap block */
+ __le32 bg_inode_bitmap; /* Inodes bitmap block */
+ __le32 bg_inode_table; /* Inodes table block */
+ __le16 bg_free_blocks_count; /* Free blocks count */
+ __le16 bg_free_inodes_count; /* Free inodes count */
+ __le16 bg_used_dirs_count; /* Directories count */
+ __u16 bg_pad;
+ __le32 bg_reserved[3];
+};
+
+/*
+ * Macro-instructions used to manage group descriptors
+ */
+#define EXT3_BLOCKS_PER_GROUP(s) (EXT3_SB(s)->s_blocks_per_group)
+#define EXT3_DESC_PER_BLOCK(s) (EXT3_SB(s)->s_desc_per_block)
+#define EXT3_INODES_PER_GROUP(s) (EXT3_SB(s)->s_inodes_per_group)
+#define EXT3_DESC_PER_BLOCK_BITS(s) (EXT3_SB(s)->s_desc_per_block_bits)
+
+/*
+ * Constants relative to the data blocks
+ */
+#define EXT3_NDIR_BLOCKS 12
+#define EXT3_IND_BLOCK EXT3_NDIR_BLOCKS
+#define EXT3_DIND_BLOCK (EXT3_IND_BLOCK + 1)
+#define EXT3_TIND_BLOCK (EXT3_DIND_BLOCK + 1)
+#define EXT3_N_BLOCKS (EXT3_TIND_BLOCK + 1)
+
+/*
+ * Inode flags
+ */
+#define EXT3_SECRM_FL 0x00000001 /* Secure deletion */
+#define EXT3_UNRM_FL 0x00000002 /* Undelete */
+#define EXT3_COMPR_FL 0x00000004 /* Compress file */
+#define EXT3_SYNC_FL 0x00000008 /* Synchronous updates */
+#define EXT3_IMMUTABLE_FL 0x00000010 /* Immutable file */
+#define EXT3_APPEND_FL 0x00000020 /* writes to file may only append */
+#define EXT3_NODUMP_FL 0x00000040 /* do not dump file */
+#define EXT3_NOATIME_FL 0x00000080 /* do not update atime */
+/* Reserved for compression usage... */
+#define EXT3_DIRTY_FL 0x00000100
+#define EXT3_COMPRBLK_FL 0x00000200 /* One or more compressed clusters */
+#define EXT3_NOCOMPR_FL 0x00000400 /* Don't compress */
+#define EXT3_ECOMPR_FL 0x00000800 /* Compression error */
+/* End compression flags --- maybe not all used */
+#define EXT3_INDEX_FL 0x00001000 /* hash-indexed directory */
+#define EXT3_IMAGIC_FL 0x00002000 /* AFS directory */
+#define EXT3_JOURNAL_DATA_FL 0x00004000 /* file data should be journaled */
+#define EXT3_NOTAIL_FL 0x00008000 /* file tail should not be merged */
+#define EXT3_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */
+#define EXT3_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/
+#define EXT3_RESERVED_FL 0x80000000 /* reserved for ext3 lib */
+
+#define EXT3_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */
+#define EXT3_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */
+
+/* Flags that should be inherited by new inodes from their parent. */
+#define EXT3_FL_INHERITED (EXT3_SECRM_FL | EXT3_UNRM_FL | EXT3_COMPR_FL |\
+ EXT3_SYNC_FL | EXT3_NODUMP_FL |\
+ EXT3_NOATIME_FL | EXT3_COMPRBLK_FL |\
+ EXT3_NOCOMPR_FL | EXT3_JOURNAL_DATA_FL |\
+ EXT3_NOTAIL_FL | EXT3_DIRSYNC_FL)
+
+/* Flags that are appropriate for regular files (all but dir-specific ones). */
+#define EXT3_REG_FLMASK (~(EXT3_DIRSYNC_FL | EXT3_TOPDIR_FL))
+
+/* Flags that are appropriate for non-directories/regular files. */
+#define EXT3_OTHER_FLMASK (EXT3_NODUMP_FL | EXT3_NOATIME_FL)
+
+/* Mask out flags that are inappropriate for the given type of inode. */
+static inline __u32 ext3_mask_flags(umode_t mode, __u32 flags)
+{
+ if (S_ISDIR(mode))
+ return flags;
+ else if (S_ISREG(mode))
+ return flags & EXT3_REG_FLMASK;
+ else
+ return flags & EXT3_OTHER_FLMASK;
+}
+
+/* Used to pass group descriptor data when online resize is done */
+struct ext3_new_group_input {
+ __u32 group; /* Group number for this data */
+ __u32 block_bitmap; /* Absolute block number of block bitmap */
+ __u32 inode_bitmap; /* Absolute block number of inode bitmap */
+ __u32 inode_table; /* Absolute block number of inode table start */
+ __u32 blocks_count; /* Total number of blocks in this group */
+ __u16 reserved_blocks; /* Number of reserved blocks in this group */
+ __u16 unused;
+};
+
+/* The struct ext3_new_group_input in kernel space, with free_blocks_count */
+struct ext3_new_group_data {
+ __u32 group;
+ __u32 block_bitmap;
+ __u32 inode_bitmap;
+ __u32 inode_table;
+ __u32 blocks_count;
+ __u16 reserved_blocks;
+ __u16 unused;
+ __u32 free_blocks_count;
+};
+
+
+/*
+ * ioctl commands
+ */
+#define EXT3_IOC_GETFLAGS FS_IOC_GETFLAGS
+#define EXT3_IOC_SETFLAGS FS_IOC_SETFLAGS
+#define EXT3_IOC_GETVERSION _IOR('f', 3, long)
+#define EXT3_IOC_SETVERSION _IOW('f', 4, long)
+#define EXT3_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long)
+#define EXT3_IOC_GROUP_ADD _IOW('f', 8,struct ext3_new_group_input)
+#define EXT3_IOC_GETVERSION_OLD FS_IOC_GETVERSION
+#define EXT3_IOC_SETVERSION_OLD FS_IOC_SETVERSION
+#ifdef CONFIG_JBD_DEBUG
+#define EXT3_IOC_WAIT_FOR_READONLY _IOR('f', 99, long)
+#endif
+#define EXT3_IOC_GETRSVSZ _IOR('f', 5, long)
+#define EXT3_IOC_SETRSVSZ _IOW('f', 6, long)
+
+/*
+ * ioctl commands in 32 bit emulation
+ */
+#define EXT3_IOC32_GETFLAGS FS_IOC32_GETFLAGS
+#define EXT3_IOC32_SETFLAGS FS_IOC32_SETFLAGS
+#define EXT3_IOC32_GETVERSION _IOR('f', 3, int)
+#define EXT3_IOC32_SETVERSION _IOW('f', 4, int)
+#define EXT3_IOC32_GETRSVSZ _IOR('f', 5, int)
+#define EXT3_IOC32_SETRSVSZ _IOW('f', 6, int)
+#define EXT3_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int)
+#ifdef CONFIG_JBD_DEBUG
+#define EXT3_IOC32_WAIT_FOR_READONLY _IOR('f', 99, int)
+#endif
+#define EXT3_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION
+#define EXT3_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION
+
+
+/*
+ * Mount options
+ */
+struct ext3_mount_options {
+ unsigned long s_mount_opt;
+ uid_t s_resuid;
+ gid_t s_resgid;
+ unsigned long s_commit_interval;
+#ifdef CONFIG_QUOTA
+ int s_jquota_fmt;
+ char *s_qf_names[MAXQUOTAS];
+#endif
+};
+
+/*
+ * Structure of an inode on the disk
+ */
+struct ext3_inode {
+ __le16 i_mode; /* File mode */
+ __le16 i_uid; /* Low 16 bits of Owner Uid */
+ __le32 i_size; /* Size in bytes */
+ __le32 i_atime; /* Access time */
+ __le32 i_ctime; /* Creation time */
+ __le32 i_mtime; /* Modification time */
+ __le32 i_dtime; /* Deletion Time */
+ __le16 i_gid; /* Low 16 bits of Group Id */
+ __le16 i_links_count; /* Links count */
+ __le32 i_blocks; /* Blocks count */
+ __le32 i_flags; /* File flags */
+ union {
+ struct {
+ __u32 l_i_reserved1;
+ } linux1;
+ struct {
+ __u32 h_i_translator;
+ } hurd1;
+ struct {
+ __u32 m_i_reserved1;
+ } masix1;
+ } osd1; /* OS dependent 1 */
+ __le32 i_block[EXT3_N_BLOCKS];/* Pointers to blocks */
+ __le32 i_generation; /* File version (for NFS) */
+ __le32 i_file_acl; /* File ACL */
+ __le32 i_dir_acl; /* Directory ACL */
+ __le32 i_faddr; /* Fragment address */
+ union {
+ struct {
+ __u8 l_i_frag; /* Fragment number */
+ __u8 l_i_fsize; /* Fragment size */
+ __u16 i_pad1;
+ __le16 l_i_uid_high; /* these 2 fields */
+ __le16 l_i_gid_high; /* were reserved2[0] */
+ __u32 l_i_reserved2;
+ } linux2;
+ struct {
+ __u8 h_i_frag; /* Fragment number */
+ __u8 h_i_fsize; /* Fragment size */
+ __u16 h_i_mode_high;
+ __u16 h_i_uid_high;
+ __u16 h_i_gid_high;
+ __u32 h_i_author;
+ } hurd2;
+ struct {
+ __u8 m_i_frag; /* Fragment number */
+ __u8 m_i_fsize; /* Fragment size */
+ __u16 m_pad1;
+ __u32 m_i_reserved2[2];
+ } masix2;
+ } osd2; /* OS dependent 2 */
+ __le16 i_extra_isize;
+ __le16 i_pad1;
+};
+
+#define i_size_high i_dir_acl
+
+#define i_reserved1 osd1.linux1.l_i_reserved1
+#define i_frag osd2.linux2.l_i_frag
+#define i_fsize osd2.linux2.l_i_fsize
+#define i_uid_low i_uid
+#define i_gid_low i_gid
+#define i_uid_high osd2.linux2.l_i_uid_high
+#define i_gid_high osd2.linux2.l_i_gid_high
+#define i_reserved2 osd2.linux2.l_i_reserved2
+
+/*
+ * File system states
+ */
+#define EXT3_VALID_FS 0x0001 /* Unmounted cleanly */
+#define EXT3_ERROR_FS 0x0002 /* Errors detected */
+#define EXT3_ORPHAN_FS 0x0004 /* Orphans being recovered */
+
+/*
+ * Misc. filesystem flags
+ */
+#define EXT2_FLAGS_SIGNED_HASH 0x0001 /* Signed dirhash in use */
+#define EXT2_FLAGS_UNSIGNED_HASH 0x0002 /* Unsigned dirhash in use */
+#define EXT2_FLAGS_TEST_FILESYS 0x0004 /* to test development code */
+
+/*
+ * Mount flags
+ */
+#define EXT3_MOUNT_CHECK 0x00001 /* Do mount-time checks */
+/* EXT3_MOUNT_OLDALLOC was there */
+#define EXT3_MOUNT_GRPID 0x00004 /* Create files with directory's group */
+#define EXT3_MOUNT_DEBUG 0x00008 /* Some debugging messages */
+#define EXT3_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */
+#define EXT3_MOUNT_ERRORS_RO 0x00020 /* Remount fs ro on errors */
+#define EXT3_MOUNT_ERRORS_PANIC 0x00040 /* Panic on errors */
+#define EXT3_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */
+#define EXT3_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/
+#define EXT3_MOUNT_ABORT 0x00200 /* Fatal error detected */
+#define EXT3_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */
+#define EXT3_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */
+#define EXT3_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */
+#define EXT3_MOUNT_WRITEBACK_DATA 0x00C00 /* No data ordering */
+#define EXT3_MOUNT_UPDATE_JOURNAL 0x01000 /* Update the journal format */
+#define EXT3_MOUNT_NO_UID32 0x02000 /* Disable 32-bit UIDs */
+#define EXT3_MOUNT_XATTR_USER 0x04000 /* Extended user attributes */
+#define EXT3_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */
+#define EXT3_MOUNT_RESERVATION 0x10000 /* Preallocation */
+#define EXT3_MOUNT_BARRIER 0x20000 /* Use block barriers */
+#define EXT3_MOUNT_QUOTA 0x80000 /* Some quota option set */
+#define EXT3_MOUNT_USRQUOTA 0x100000 /* "old" user quota */
+#define EXT3_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */
+#define EXT3_MOUNT_DATA_ERR_ABORT 0x400000 /* Abort on file data write
+ * error in ordered mode */
+
+/* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
+#ifndef _LINUX_EXT2_FS_H
+#define clear_opt(o, opt) o &= ~EXT3_MOUNT_##opt
+#define set_opt(o, opt) o |= EXT3_MOUNT_##opt
+#define test_opt(sb, opt) (EXT3_SB(sb)->s_mount_opt & \
+ EXT3_MOUNT_##opt)
+#else
+#define EXT2_MOUNT_NOLOAD EXT3_MOUNT_NOLOAD
+#define EXT2_MOUNT_ABORT EXT3_MOUNT_ABORT
+#define EXT2_MOUNT_DATA_FLAGS EXT3_MOUNT_DATA_FLAGS
+#endif
+
+#define ext3_set_bit __set_bit_le
+#define ext3_set_bit_atomic ext2_set_bit_atomic
+#define ext3_clear_bit __clear_bit_le
+#define ext3_clear_bit_atomic ext2_clear_bit_atomic
+#define ext3_test_bit test_bit_le
+#define ext3_find_next_zero_bit find_next_zero_bit_le
+
+/*
+ * Maximal mount counts between two filesystem checks
+ */
+#define EXT3_DFL_MAX_MNT_COUNT 20 /* Allow 20 mounts */
+#define EXT3_DFL_CHECKINTERVAL 0 /* Don't use interval check */
+
+/*
+ * Behaviour when detecting errors
+ */
+#define EXT3_ERRORS_CONTINUE 1 /* Continue execution */
+#define EXT3_ERRORS_RO 2 /* Remount fs read-only */
+#define EXT3_ERRORS_PANIC 3 /* Panic */
+#define EXT3_ERRORS_DEFAULT EXT3_ERRORS_CONTINUE
+
+/*
+ * Structure of the super block
+ */
+struct ext3_super_block {
+/*00*/ __le32 s_inodes_count; /* Inodes count */
+ __le32 s_blocks_count; /* Blocks count */
+ __le32 s_r_blocks_count; /* Reserved blocks count */
+ __le32 s_free_blocks_count; /* Free blocks count */
+/*10*/ __le32 s_free_inodes_count; /* Free inodes count */
+ __le32 s_first_data_block; /* First Data Block */
+ __le32 s_log_block_size; /* Block size */
+ __le32 s_log_frag_size; /* Fragment size */
+/*20*/ __le32 s_blocks_per_group; /* # Blocks per group */
+ __le32 s_frags_per_group; /* # Fragments per group */
+ __le32 s_inodes_per_group; /* # Inodes per group */
+ __le32 s_mtime; /* Mount time */
+/*30*/ __le32 s_wtime; /* Write time */
+ __le16 s_mnt_count; /* Mount count */
+ __le16 s_max_mnt_count; /* Maximal mount count */
+ __le16 s_magic; /* Magic signature */
+ __le16 s_state; /* File system state */
+ __le16 s_errors; /* Behaviour when detecting errors */
+ __le16 s_minor_rev_level; /* minor revision level */
+/*40*/ __le32 s_lastcheck; /* time of last check */
+ __le32 s_checkinterval; /* max. time between checks */
+ __le32 s_creator_os; /* OS */
+ __le32 s_rev_level; /* Revision level */
+/*50*/ __le16 s_def_resuid; /* Default uid for reserved blocks */
+ __le16 s_def_resgid; /* Default gid for reserved blocks */
+ /*
+ * These fields are for EXT3_DYNAMIC_REV superblocks only.
+ *
+ * Note: the difference between the compatible feature set and
+ * the incompatible feature set is that if there is a bit set
+ * in the incompatible feature set that the kernel doesn't
+ * know about, it should refuse to mount the filesystem.
+ *
+ * e2fsck's requirements are more strict; if it doesn't know
+ * about a feature in either the compatible or incompatible
+ * feature set, it must abort and not try to meddle with
+ * things it doesn't understand...
+ */
+ __le32 s_first_ino; /* First non-reserved inode */
+ __le16 s_inode_size; /* size of inode structure */
+ __le16 s_block_group_nr; /* block group # of this superblock */
+ __le32 s_feature_compat; /* compatible feature set */
+/*60*/ __le32 s_feature_incompat; /* incompatible feature set */
+ __le32 s_feature_ro_compat; /* readonly-compatible feature set */
+/*68*/ __u8 s_uuid[16]; /* 128-bit uuid for volume */
+/*78*/ char s_volume_name[16]; /* volume name */
+/*88*/ char s_last_mounted[64]; /* directory where last mounted */
+/*C8*/ __le32 s_algorithm_usage_bitmap; /* For compression */
+ /*
+ * Performance hints. Directory preallocation should only
+ * happen if the EXT3_FEATURE_COMPAT_DIR_PREALLOC flag is on.
+ */
+ __u8 s_prealloc_blocks; /* Nr of blocks to try to preallocate*/
+ __u8 s_prealloc_dir_blocks; /* Nr to preallocate for dirs */
+ __le16 s_reserved_gdt_blocks; /* Per group desc for online growth */
+ /*
+ * Journaling support valid if EXT3_FEATURE_COMPAT_HAS_JOURNAL set.
+ */
+/*D0*/ __u8 s_journal_uuid[16]; /* uuid of journal superblock */
+/*E0*/ __le32 s_journal_inum; /* inode number of journal file */
+ __le32 s_journal_dev; /* device number of journal file */
+ __le32 s_last_orphan; /* start of list of inodes to delete */
+ __le32 s_hash_seed[4]; /* HTREE hash seed */
+ __u8 s_def_hash_version; /* Default hash version to use */
+ __u8 s_reserved_char_pad;
+ __u16 s_reserved_word_pad;
+ __le32 s_default_mount_opts;
+ __le32 s_first_meta_bg; /* First metablock block group */
+ __le32 s_mkfs_time; /* When the filesystem was created */
+ __le32 s_jnl_blocks[17]; /* Backup of the journal inode */
+ /* 64bit support valid if EXT4_FEATURE_COMPAT_64BIT */
+/*150*/ __le32 s_blocks_count_hi; /* Blocks count */
+ __le32 s_r_blocks_count_hi; /* Reserved blocks count */
+ __le32 s_free_blocks_count_hi; /* Free blocks count */
+ __le16 s_min_extra_isize; /* All inodes have at least # bytes */
+ __le16 s_want_extra_isize; /* New inodes should reserve # bytes */
+ __le32 s_flags; /* Miscellaneous flags */
+ __le16 s_raid_stride; /* RAID stride */
+ __le16 s_mmp_interval; /* # seconds to wait in MMP checking */
+ __le64 s_mmp_block; /* Block for multi-mount protection */
+ __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/
+ __u8 s_log_groups_per_flex; /* FLEX_BG group size */
+ __u8 s_reserved_char_pad2;
+ __le16 s_reserved_pad;
+ __u32 s_reserved[162]; /* Padding to the end of the block */
+};
+
+/* data type for block offset of block group */
+typedef int ext3_grpblk_t;
+
+/* data type for filesystem-wide blocks number */
+typedef unsigned long ext3_fsblk_t;
+
+#define E3FSBLK "%lu"
+
+struct ext3_reserve_window {
+ ext3_fsblk_t _rsv_start; /* First byte reserved */
+ ext3_fsblk_t _rsv_end; /* Last byte reserved or 0 */
+};
+
+struct ext3_reserve_window_node {
+ struct rb_node rsv_node;
+ __u32 rsv_goal_size;
+ __u32 rsv_alloc_hit;
+ struct ext3_reserve_window rsv_window;
+};
+
+struct ext3_block_alloc_info {
+ /* information about reservation window */
+ struct ext3_reserve_window_node rsv_window_node;
+ /*
+ * was i_next_alloc_block in ext3_inode_info
+ * is the logical (file-relative) number of the
+ * most-recently-allocated block in this file.
+ * We use this for detecting linearly ascending allocation requests.
+ */
+ __u32 last_alloc_logical_block;
+ /*
+ * Was i_next_alloc_goal in ext3_inode_info
+ * is the *physical* companion to i_next_alloc_block.
+ * it the physical block number of the block which was most-recentl
+ * allocated to this file. This give us the goal (target) for the next
+ * allocation when we detect linearly ascending requests.
+ */
+ ext3_fsblk_t last_alloc_physical_block;
+};
+
+#define rsv_start rsv_window._rsv_start
+#define rsv_end rsv_window._rsv_end
+
+/*
+ * third extended file system inode data in memory
+ */
+struct ext3_inode_info {
+ __le32 i_data[15]; /* unconverted */
+ __u32 i_flags;
+#ifdef EXT3_FRAGMENTS
+ __u32 i_faddr;
+ __u8 i_frag_no;
+ __u8 i_frag_size;
+#endif
+ ext3_fsblk_t i_file_acl;
+ __u32 i_dir_acl;
+ __u32 i_dtime;
+
+ /*
+ * i_block_group is the number of the block group which contains
+ * this file's inode. Constant across the lifetime of the inode,
+ * it is ued for making block allocation decisions - we try to
+ * place a file's data blocks near its inode block, and new inodes
+ * near to their parent directory's inode.
+ */
+ __u32 i_block_group;
+ unsigned long i_state_flags; /* Dynamic state flags for ext3 */
+
+ /* block reservation info */
+ struct ext3_block_alloc_info *i_block_alloc_info;
+
+ __u32 i_dir_start_lookup;
+#ifdef CONFIG_EXT3_FS_XATTR
+ /*
+ * Extended attributes can be read independently of the main file
+ * data. Taking i_mutex even when reading would cause contention
+ * between readers of EAs and writers of regular file data, so
+ * instead we synchronize on xattr_sem when reading or changing
+ * EAs.
+ */
+ struct rw_semaphore xattr_sem;
+#endif
+
+ struct list_head i_orphan; /* unlinked but open inodes */
+
+ /*
+ * i_disksize keeps track of what the inode size is ON DISK, not
+ * in memory. During truncate, i_size is set to the new size by
+ * the VFS prior to calling ext3_truncate(), but the filesystem won't
+ * set i_disksize to 0 until the truncate is actually under way.
+ *
+ * The intent is that i_disksize always represents the blocks which
+ * are used by this file. This allows recovery to restart truncate
+ * on orphans if we crash during truncate. We actually write i_disksize
+ * into the on-disk inode when writing inodes out, instead of i_size.
+ *
+ * The only time when i_disksize and i_size may be different is when
+ * a truncate is in progress. The only things which change i_disksize
+ * are ext3_get_block (growth) and ext3_truncate (shrinkth).
+ */
+ loff_t i_disksize;
+
+ /* on-disk additional length */
+ __u16 i_extra_isize;
+
+ /*
+ * truncate_mutex is for serialising ext3_truncate() against
+ * ext3_getblock(). In the 2.4 ext2 design, great chunks of inode's
+ * data tree are chopped off during truncate. We can't do that in
+ * ext3 because whenever we perform intermediate commits during
+ * truncate, the inode and all the metadata blocks *must* be in a
+ * consistent state which allows truncation of the orphans to restart
+ * during recovery. Hence we must fix the get_block-vs-truncate race
+ * by other means, so we have truncate_mutex.
+ */
+ struct mutex truncate_mutex;
+
+ /*
+ * Transactions that contain inode's metadata needed to complete
+ * fsync and fdatasync, respectively.
+ */
+ atomic_t i_sync_tid;
+ atomic_t i_datasync_tid;
+
+ struct inode vfs_inode;
+};
+
+/*
+ * third extended-fs super-block data in memory
+ */
+struct ext3_sb_info {
+ unsigned long s_frag_size; /* Size of a fragment in bytes */
+ unsigned long s_frags_per_block;/* Number of fragments per block */
+ unsigned long s_inodes_per_block;/* Number of inodes per block */
+ unsigned long s_frags_per_group;/* Number of fragments in a group */
+ unsigned long s_blocks_per_group;/* Number of blocks in a group */
+ unsigned long s_inodes_per_group;/* Number of inodes in a group */
+ unsigned long s_itb_per_group; /* Number of inode table blocks per group */
+ unsigned long s_gdb_count; /* Number of group descriptor blocks */
+ unsigned long s_desc_per_block; /* Number of group descriptors per block */
+ unsigned long s_groups_count; /* Number of groups in the fs */
+ unsigned long s_overhead_last; /* Last calculated overhead */
+ unsigned long s_blocks_last; /* Last seen block count */
+ struct buffer_head * s_sbh; /* Buffer containing the super block */
+ struct ext3_super_block * s_es; /* Pointer to the super block in the buffer */
+ struct buffer_head ** s_group_desc;
+ unsigned long s_mount_opt;
+ ext3_fsblk_t s_sb_block;
+ uid_t s_resuid;
+ gid_t s_resgid;
+ unsigned short s_mount_state;
+ unsigned short s_pad;
+ int s_addr_per_block_bits;
+ int s_desc_per_block_bits;
+ int s_inode_size;
+ int s_first_ino;
+ spinlock_t s_next_gen_lock;
+ u32 s_next_generation;
+ u32 s_hash_seed[4];
+ int s_def_hash_version;
+ int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */
+ struct percpu_counter s_freeblocks_counter;
+ struct percpu_counter s_freeinodes_counter;
+ struct percpu_counter s_dirs_counter;
+ struct blockgroup_lock *s_blockgroup_lock;
+
+ /* root of the per fs reservation window tree */
+ spinlock_t s_rsv_window_lock;
+ struct rb_root s_rsv_window_root;
+ struct ext3_reserve_window_node s_rsv_window_head;
+
+ /* Journaling */
+ struct inode * s_journal_inode;
+ struct journal_s * s_journal;
+ struct list_head s_orphan;
+ struct mutex s_orphan_lock;
+ struct mutex s_resize_lock;
+ unsigned long s_commit_interval;
+ struct block_device *journal_bdev;
+#ifdef CONFIG_QUOTA
+ char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */
+ int s_jquota_fmt; /* Format of quota to use */
+#endif
+};
+
+static inline spinlock_t *
+sb_bgl_lock(struct ext3_sb_info *sbi, unsigned int block_group)
+{
+ return bgl_lock_ptr(sbi->s_blockgroup_lock, block_group);
+}
+
+static inline struct ext3_sb_info * EXT3_SB(struct super_block *sb)
+{
+ return sb->s_fs_info;
+}
+static inline struct ext3_inode_info *EXT3_I(struct inode *inode)
+{
+ return container_of(inode, struct ext3_inode_info, vfs_inode);
+}
+
+static inline int ext3_valid_inum(struct super_block *sb, unsigned long ino)
+{
+ return ino == EXT3_ROOT_INO ||
+ ino == EXT3_JOURNAL_INO ||
+ ino == EXT3_RESIZE_INO ||
+ (ino >= EXT3_FIRST_INO(sb) &&
+ ino <= le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count));
+}
+
+/*
+ * Inode dynamic state flags
+ */
+enum {
+ EXT3_STATE_JDATA, /* journaled data exists */
+ EXT3_STATE_NEW, /* inode is newly created */
+ EXT3_STATE_XATTR, /* has in-inode xattrs */
+ EXT3_STATE_FLUSH_ON_CLOSE, /* flush dirty pages on close */
+};
+
+static inline int ext3_test_inode_state(struct inode *inode, int bit)
+{
+ return test_bit(bit, &EXT3_I(inode)->i_state_flags);
+}
+
+static inline void ext3_set_inode_state(struct inode *inode, int bit)
+{
+ set_bit(bit, &EXT3_I(inode)->i_state_flags);
+}
+
+static inline void ext3_clear_inode_state(struct inode *inode, int bit)
+{
+ clear_bit(bit, &EXT3_I(inode)->i_state_flags);
+}
+
+#define NEXT_ORPHAN(inode) EXT3_I(inode)->i_dtime
+
+/*
+ * Codes for operating systems
+ */
+#define EXT3_OS_LINUX 0
+#define EXT3_OS_HURD 1
+#define EXT3_OS_MASIX 2
+#define EXT3_OS_FREEBSD 3
+#define EXT3_OS_LITES 4
+
+/*
+ * Revision levels
+ */
+#define EXT3_GOOD_OLD_REV 0 /* The good old (original) format */
+#define EXT3_DYNAMIC_REV 1 /* V2 format w/ dynamic inode sizes */
+
+#define EXT3_CURRENT_REV EXT3_GOOD_OLD_REV
+#define EXT3_MAX_SUPP_REV EXT3_DYNAMIC_REV
+
+#define EXT3_GOOD_OLD_INODE_SIZE 128
+
+/*
+ * Feature set definitions
+ */
+
+#define EXT3_HAS_COMPAT_FEATURE(sb,mask) \
+ ( EXT3_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask) )
+#define EXT3_HAS_RO_COMPAT_FEATURE(sb,mask) \
+ ( EXT3_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask) )
+#define EXT3_HAS_INCOMPAT_FEATURE(sb,mask) \
+ ( EXT3_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask) )
+#define EXT3_SET_COMPAT_FEATURE(sb,mask) \
+ EXT3_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask)
+#define EXT3_SET_RO_COMPAT_FEATURE(sb,mask) \
+ EXT3_SB(sb)->s_es->s_feature_ro_compat |= cpu_to_le32(mask)
+#define EXT3_SET_INCOMPAT_FEATURE(sb,mask) \
+ EXT3_SB(sb)->s_es->s_feature_incompat |= cpu_to_le32(mask)
+#define EXT3_CLEAR_COMPAT_FEATURE(sb,mask) \
+ EXT3_SB(sb)->s_es->s_feature_compat &= ~cpu_to_le32(mask)
+#define EXT3_CLEAR_RO_COMPAT_FEATURE(sb,mask) \
+ EXT3_SB(sb)->s_es->s_feature_ro_compat &= ~cpu_to_le32(mask)
+#define EXT3_CLEAR_INCOMPAT_FEATURE(sb,mask) \
+ EXT3_SB(sb)->s_es->s_feature_incompat &= ~cpu_to_le32(mask)
+
+#define EXT3_FEATURE_COMPAT_DIR_PREALLOC 0x0001
+#define EXT3_FEATURE_COMPAT_IMAGIC_INODES 0x0002
+#define EXT3_FEATURE_COMPAT_HAS_JOURNAL 0x0004
+#define EXT3_FEATURE_COMPAT_EXT_ATTR 0x0008
+#define EXT3_FEATURE_COMPAT_RESIZE_INODE 0x0010
+#define EXT3_FEATURE_COMPAT_DIR_INDEX 0x0020
+
+#define EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001
+#define EXT3_FEATURE_RO_COMPAT_LARGE_FILE 0x0002
+#define EXT3_FEATURE_RO_COMPAT_BTREE_DIR 0x0004
+
+#define EXT3_FEATURE_INCOMPAT_COMPRESSION 0x0001
+#define EXT3_FEATURE_INCOMPAT_FILETYPE 0x0002
+#define EXT3_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */
+#define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */
+#define EXT3_FEATURE_INCOMPAT_META_BG 0x0010
+
+#define EXT3_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR
+#define EXT3_FEATURE_INCOMPAT_SUPP (EXT3_FEATURE_INCOMPAT_FILETYPE| \
+ EXT3_FEATURE_INCOMPAT_RECOVER| \
+ EXT3_FEATURE_INCOMPAT_META_BG)
+#define EXT3_FEATURE_RO_COMPAT_SUPP (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \
+ EXT3_FEATURE_RO_COMPAT_LARGE_FILE| \
+ EXT3_FEATURE_RO_COMPAT_BTREE_DIR)
+
+/*
+ * Default values for user and/or group using reserved blocks
+ */
+#define EXT3_DEF_RESUID 0
+#define EXT3_DEF_RESGID 0
+
+/*
+ * Default mount options
+ */
+#define EXT3_DEFM_DEBUG 0x0001
+#define EXT3_DEFM_BSDGROUPS 0x0002
+#define EXT3_DEFM_XATTR_USER 0x0004
+#define EXT3_DEFM_ACL 0x0008
+#define EXT3_DEFM_UID16 0x0010
+#define EXT3_DEFM_JMODE 0x0060
+#define EXT3_DEFM_JMODE_DATA 0x0020
+#define EXT3_DEFM_JMODE_ORDERED 0x0040
+#define EXT3_DEFM_JMODE_WBACK 0x0060
+
+/*
+ * Structure of a directory entry
+ */
+#define EXT3_NAME_LEN 255
+
+struct ext3_dir_entry {
+ __le32 inode; /* Inode number */
+ __le16 rec_len; /* Directory entry length */
+ __le16 name_len; /* Name length */
+ char name[EXT3_NAME_LEN]; /* File name */
+};
+
+/*
+ * The new version of the directory entry. Since EXT3 structures are
+ * stored in intel byte order, and the name_len field could never be
+ * bigger than 255 chars, it's safe to reclaim the extra byte for the
+ * file_type field.
+ */
+struct ext3_dir_entry_2 {
+ __le32 inode; /* Inode number */
+ __le16 rec_len; /* Directory entry length */
+ __u8 name_len; /* Name length */
+ __u8 file_type;
+ char name[EXT3_NAME_LEN]; /* File name */
+};
+
+/*
+ * Ext3 directory file types. Only the low 3 bits are used. The
+ * other bits are reserved for now.
+ */
+#define EXT3_FT_UNKNOWN 0
+#define EXT3_FT_REG_FILE 1
+#define EXT3_FT_DIR 2
+#define EXT3_FT_CHRDEV 3
+#define EXT3_FT_BLKDEV 4
+#define EXT3_FT_FIFO 5
+#define EXT3_FT_SOCK 6
+#define EXT3_FT_SYMLINK 7
+
+#define EXT3_FT_MAX 8
+
+/*
+ * EXT3_DIR_PAD defines the directory entries boundaries
+ *
+ * NOTE: It must be a multiple of 4
+ */
+#define EXT3_DIR_PAD 4
+#define EXT3_DIR_ROUND (EXT3_DIR_PAD - 1)
+#define EXT3_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT3_DIR_ROUND) & \
+ ~EXT3_DIR_ROUND)
+#define EXT3_MAX_REC_LEN ((1<<16)-1)
+
+/*
+ * Tests against MAX_REC_LEN etc were put in place for 64k block
+ * sizes; if that is not possible on this arch, we can skip
+ * those tests and speed things up.
+ */
+static inline unsigned ext3_rec_len_from_disk(__le16 dlen)
+{
+ unsigned len = le16_to_cpu(dlen);
+
+#if (PAGE_CACHE_SIZE >= 65536)
+ if (len == EXT3_MAX_REC_LEN)
+ return 1 << 16;
+#endif
+ return len;
+}
+
+static inline __le16 ext3_rec_len_to_disk(unsigned len)
+{
+#if (PAGE_CACHE_SIZE >= 65536)
+ if (len == (1 << 16))
+ return cpu_to_le16(EXT3_MAX_REC_LEN);
+ else if (len > (1 << 16))
+ BUG();
+#endif
+ return cpu_to_le16(len);
+}
+
+/*
+ * Hash Tree Directory indexing
+ * (c) Daniel Phillips, 2001
+ */
+
+#define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \
+ EXT3_FEATURE_COMPAT_DIR_INDEX) && \
+ (EXT3_I(dir)->i_flags & EXT3_INDEX_FL))
+#define EXT3_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT3_LINK_MAX)
+#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1)
+
+/* Legal values for the dx_root hash_version field: */
+
+#define DX_HASH_LEGACY 0
+#define DX_HASH_HALF_MD4 1
+#define DX_HASH_TEA 2
+#define DX_HASH_LEGACY_UNSIGNED 3
+#define DX_HASH_HALF_MD4_UNSIGNED 4
+#define DX_HASH_TEA_UNSIGNED 5
+
+/* hash info structure used by the directory hash */
+struct dx_hash_info
+{
+ u32 hash;
+ u32 minor_hash;
+ int hash_version;
+ u32 *seed;
+};
+
+#define EXT3_HTREE_EOF 0x7fffffff
+
+/*
+ * Control parameters used by ext3_htree_next_block
+ */
+#define HASH_NB_ALWAYS 1
+
+
+/*
+ * Describe an inode's exact location on disk and in memory
+ */
+struct ext3_iloc
+{
+ struct buffer_head *bh;
+ unsigned long offset;
+ unsigned long block_group;
+};
+
+static inline struct ext3_inode *ext3_raw_inode(struct ext3_iloc *iloc)
+{
+ return (struct ext3_inode *) (iloc->bh->b_data + iloc->offset);
+}
+
+/*
+ * This structure is stuffed into the struct file's private_data field
+ * for directories. It is where we put information so that we can do
+ * readdir operations in hash tree order.
+ */
+struct dir_private_info {
+ struct rb_root root;
+ struct rb_node *curr_node;
+ struct fname *extra_fname;
+ loff_t last_pos;
+ __u32 curr_hash;
+ __u32 curr_minor_hash;
+ __u32 next_hash;
+};
+
+/* calculate the first block number of the group */
+static inline ext3_fsblk_t
+ext3_group_first_block_no(struct super_block *sb, unsigned long group_no)
+{
+ return group_no * (ext3_fsblk_t)EXT3_BLOCKS_PER_GROUP(sb) +
+ le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block);
+}
+
+/*
+ * Special error return code only used by dx_probe() and its callers.
+ */
+#define ERR_BAD_DX_DIR -75000
+
+/*
+ * Function prototypes
+ */
+
+/*
+ * Ok, these declarations are also in <linux/kernel.h> but none of the
+ * ext3 source programs needs to include it so they are duplicated here.
+ */
+# define NORET_TYPE /**/
+# define ATTRIB_NORET __attribute__((noreturn))
+# define NORET_AND noreturn,
+
+/* balloc.c */
+extern int ext3_bg_has_super(struct super_block *sb, int group);
+extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group);
+extern ext3_fsblk_t ext3_new_block (handle_t *handle, struct inode *inode,
+ ext3_fsblk_t goal, int *errp);
+extern ext3_fsblk_t ext3_new_blocks (handle_t *handle, struct inode *inode,
+ ext3_fsblk_t goal, unsigned long *count, int *errp);
+extern void ext3_free_blocks (handle_t *handle, struct inode *inode,
+ ext3_fsblk_t block, unsigned long count);
+extern void ext3_free_blocks_sb (handle_t *handle, struct super_block *sb,
+ ext3_fsblk_t block, unsigned long count,
+ unsigned long *pdquot_freed_blocks);
+extern ext3_fsblk_t ext3_count_free_blocks (struct super_block *);
+extern void ext3_check_blocks_bitmap (struct super_block *);
+extern struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb,
+ unsigned int block_group,
+ struct buffer_head ** bh);
+extern int ext3_should_retry_alloc(struct super_block *sb, int *retries);
+extern void ext3_init_block_alloc_info(struct inode *);
+extern void ext3_rsv_window_add(struct super_block *sb, struct ext3_reserve_window_node *rsv);
+extern int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range);
+
+/* dir.c */
+extern int ext3_check_dir_entry(const char *, struct inode *,
+ struct ext3_dir_entry_2 *,
+ struct buffer_head *, unsigned long);
+extern int ext3_htree_store_dirent(struct file *dir_file, __u32 hash,
+ __u32 minor_hash,
+ struct ext3_dir_entry_2 *dirent);
+extern void ext3_htree_free_dir_info(struct dir_private_info *p);
+
+/* fsync.c */
+extern int ext3_sync_file(struct file *, loff_t, loff_t, int);
+
+/* hash.c */
+extern int ext3fs_dirhash(const char *name, int len, struct
+ dx_hash_info *hinfo);
+
+/* ialloc.c */
+extern struct inode * ext3_new_inode (handle_t *, struct inode *,
+ const struct qstr *, umode_t);
+extern void ext3_free_inode (handle_t *, struct inode *);
+extern struct inode * ext3_orphan_get (struct super_block *, unsigned long);
+extern unsigned long ext3_count_free_inodes (struct super_block *);
+extern unsigned long ext3_count_dirs (struct super_block *);
+extern void ext3_check_inodes_bitmap (struct super_block *);
+extern unsigned long ext3_count_free (struct buffer_head *, unsigned);
+
+
+/* inode.c */
+int ext3_forget(handle_t *handle, int is_metadata, struct inode *inode,
+ struct buffer_head *bh, ext3_fsblk_t blocknr);
+struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *);
+struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *);
+int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
+ sector_t iblock, unsigned long maxblocks, struct buffer_head *bh_result,
+ int create);
+
+extern struct inode *ext3_iget(struct super_block *, unsigned long);
+extern int ext3_write_inode (struct inode *, struct writeback_control *);
+extern int ext3_setattr (struct dentry *, struct iattr *);
+extern void ext3_evict_inode (struct inode *);
+extern int ext3_sync_inode (handle_t *, struct inode *);
+extern void ext3_discard_reservation (struct inode *);
+extern void ext3_dirty_inode(struct inode *, int);
+extern int ext3_change_inode_journal_flag(struct inode *, int);
+extern int ext3_get_inode_loc(struct inode *, struct ext3_iloc *);
+extern int ext3_can_truncate(struct inode *inode);
+extern void ext3_truncate(struct inode *inode);
+extern void ext3_set_inode_flags(struct inode *);
+extern void ext3_get_inode_flags(struct ext3_inode_info *);
+extern void ext3_set_aops(struct inode *inode);
+extern int ext3_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+ u64 start, u64 len);
+
+/* ioctl.c */
+extern long ext3_ioctl(struct file *, unsigned int, unsigned long);
+extern long ext3_compat_ioctl(struct file *, unsigned int, unsigned long);
+
+/* namei.c */
+extern int ext3_orphan_add(handle_t *, struct inode *);
+extern int ext3_orphan_del(handle_t *, struct inode *);
+extern int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash,
+ __u32 start_minor_hash, __u32 *next_hash);
+
+/* resize.c */
+extern int ext3_group_add(struct super_block *sb,
+ struct ext3_new_group_data *input);
+extern int ext3_group_extend(struct super_block *sb,
+ struct ext3_super_block *es,
+ ext3_fsblk_t n_blocks_count);
+
+/* super.c */
+extern __printf(3, 4)
+void ext3_error(struct super_block *, const char *, const char *, ...);
+extern void __ext3_std_error (struct super_block *, const char *, int);
+extern __printf(3, 4)
+void ext3_abort(struct super_block *, const char *, const char *, ...);
+extern __printf(3, 4)
+void ext3_warning(struct super_block *, const char *, const char *, ...);
+extern __printf(3, 4)
+void ext3_msg(struct super_block *, const char *, const char *, ...);
+extern void ext3_update_dynamic_rev (struct super_block *sb);
+
+#define ext3_std_error(sb, errno) \
+do { \
+ if ((errno)) \
+ __ext3_std_error((sb), __func__, (errno)); \
+} while (0)
+
+/*
+ * Inodes and files operations
+ */
+
+/* dir.c */
+extern const struct file_operations ext3_dir_operations;
+
+/* file.c */
+extern const struct inode_operations ext3_file_inode_operations;
+extern const struct file_operations ext3_file_operations;
+
+/* namei.c */
+extern const struct inode_operations ext3_dir_inode_operations;
+extern const struct inode_operations ext3_special_inode_operations;
+
+/* symlink.c */
+extern const struct inode_operations ext3_symlink_inode_operations;
+extern const struct inode_operations ext3_fast_symlink_inode_operations;
+
+#define EXT3_JOURNAL(inode) (EXT3_SB((inode)->i_sb)->s_journal)
+
+/* Define the number of blocks we need to account to a transaction to
+ * modify one block of data.
+ *
+ * We may have to touch one inode, one bitmap buffer, up to three
+ * indirection blocks, the group and superblock summaries, and the data
+ * block to complete the transaction. */
+
+#define EXT3_SINGLEDATA_TRANS_BLOCKS 8U
+
+/* Extended attribute operations touch at most two data buffers,
+ * two bitmap buffers, and two group summaries, in addition to the inode
+ * and the superblock, which are already accounted for. */
+
+#define EXT3_XATTR_TRANS_BLOCKS 6U
+
+/* Define the minimum size for a transaction which modifies data. This
+ * needs to take into account the fact that we may end up modifying two
+ * quota files too (one for the group, one for the user quota). The
+ * superblock only gets updated once, of course, so don't bother
+ * counting that again for the quota updates. */
+
+#define EXT3_DATA_TRANS_BLOCKS(sb) (EXT3_SINGLEDATA_TRANS_BLOCKS + \
+ EXT3_XATTR_TRANS_BLOCKS - 2 + \
+ EXT3_MAXQUOTAS_TRANS_BLOCKS(sb))
+
+/* Delete operations potentially hit one directory's namespace plus an
+ * entire inode, plus arbitrary amounts of bitmap/indirection data. Be
+ * generous. We can grow the delete transaction later if necessary. */
+
+#define EXT3_DELETE_TRANS_BLOCKS(sb) (EXT3_MAXQUOTAS_TRANS_BLOCKS(sb) + 64)
+
+/* Define an arbitrary limit for the amount of data we will anticipate
+ * writing to any given transaction. For unbounded transactions such as
+ * write(2) and truncate(2) we can write more than this, but we always
+ * start off at the maximum transaction size and grow the transaction
+ * optimistically as we go. */
+
+#define EXT3_MAX_TRANS_DATA 64U
+
+/* We break up a large truncate or write transaction once the handle's
+ * buffer credits gets this low, we need either to extend the
+ * transaction or to start a new one. Reserve enough space here for
+ * inode, bitmap, superblock, group and indirection updates for at least
+ * one block, plus two quota updates. Quota allocations are not
+ * needed. */
+
+#define EXT3_RESERVE_TRANS_BLOCKS 12U
+
+#define EXT3_INDEX_EXTRA_TRANS_BLOCKS 8
+
+#ifdef CONFIG_QUOTA
+/* Amount of blocks needed for quota update - we know that the structure was
+ * allocated so we need to update only inode+data */
+#define EXT3_QUOTA_TRANS_BLOCKS(sb) (test_opt(sb, QUOTA) ? 2 : 0)
+/* Amount of blocks needed for quota insert/delete - we do some block writes
+ * but inode, sb and group updates are done only once */
+#define EXT3_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\
+ (EXT3_SINGLEDATA_TRANS_BLOCKS-3)+3+DQUOT_INIT_REWRITE) : 0)
+#define EXT3_QUOTA_DEL_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_DEL_ALLOC*\
+ (EXT3_SINGLEDATA_TRANS_BLOCKS-3)+3+DQUOT_DEL_REWRITE) : 0)
+#else
+#define EXT3_QUOTA_TRANS_BLOCKS(sb) 0
+#define EXT3_QUOTA_INIT_BLOCKS(sb) 0
+#define EXT3_QUOTA_DEL_BLOCKS(sb) 0
+#endif
+#define EXT3_MAXQUOTAS_TRANS_BLOCKS(sb) (MAXQUOTAS*EXT3_QUOTA_TRANS_BLOCKS(sb))
+#define EXT3_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT3_QUOTA_INIT_BLOCKS(sb))
+#define EXT3_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT3_QUOTA_DEL_BLOCKS(sb))
+
+int
+ext3_mark_iloc_dirty(handle_t *handle,
+ struct inode *inode,
+ struct ext3_iloc *iloc);
+
+/*
+ * On success, We end up with an outstanding reference count against
+ * iloc->bh. This _must_ be cleaned up later.
+ */
+
+int ext3_reserve_inode_write(handle_t *handle, struct inode *inode,
+ struct ext3_iloc *iloc);
+
+int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode);
+
+/*
+ * Wrapper functions with which ext3 calls into JBD. The intent here is
+ * to allow these to be turned into appropriate stubs so ext3 can control
+ * ext2 filesystems, so ext2+ext3 systems only nee one fs. This work hasn't
+ * been done yet.
+ */
+
+static inline void ext3_journal_release_buffer(handle_t *handle,
+ struct buffer_head *bh)
+{
+ journal_release_buffer(handle, bh);
+}
+
+void ext3_journal_abort_handle(const char *caller, const char *err_fn,
+ struct buffer_head *bh, handle_t *handle, int err);
+
+int __ext3_journal_get_undo_access(const char *where, handle_t *handle,
+ struct buffer_head *bh);
+
+int __ext3_journal_get_write_access(const char *where, handle_t *handle,
+ struct buffer_head *bh);
+
+int __ext3_journal_forget(const char *where, handle_t *handle,
+ struct buffer_head *bh);
+
+int __ext3_journal_revoke(const char *where, handle_t *handle,
+ unsigned long blocknr, struct buffer_head *bh);
+
+int __ext3_journal_get_create_access(const char *where,
+ handle_t *handle, struct buffer_head *bh);
+
+int __ext3_journal_dirty_metadata(const char *where,
+ handle_t *handle, struct buffer_head *bh);
+
+#define ext3_journal_get_undo_access(handle, bh) \
+ __ext3_journal_get_undo_access(__func__, (handle), (bh))
+#define ext3_journal_get_write_access(handle, bh) \
+ __ext3_journal_get_write_access(__func__, (handle), (bh))
+#define ext3_journal_revoke(handle, blocknr, bh) \
+ __ext3_journal_revoke(__func__, (handle), (blocknr), (bh))
+#define ext3_journal_get_create_access(handle, bh) \
+ __ext3_journal_get_create_access(__func__, (handle), (bh))
+#define ext3_journal_dirty_metadata(handle, bh) \
+ __ext3_journal_dirty_metadata(__func__, (handle), (bh))
+#define ext3_journal_forget(handle, bh) \
+ __ext3_journal_forget(__func__, (handle), (bh))
+
+int ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh);
+
+handle_t *ext3_journal_start_sb(struct super_block *sb, int nblocks);
+int __ext3_journal_stop(const char *where, handle_t *handle);
+
+static inline handle_t *ext3_journal_start(struct inode *inode, int nblocks)
+{
+ return ext3_journal_start_sb(inode->i_sb, nblocks);
+}
+
+#define ext3_journal_stop(handle) \
+ __ext3_journal_stop(__func__, (handle))
+
+static inline handle_t *ext3_journal_current_handle(void)
+{
+ return journal_current_handle();
+}
+
+static inline int ext3_journal_extend(handle_t *handle, int nblocks)
+{
+ return journal_extend(handle, nblocks);
+}
+
+static inline int ext3_journal_restart(handle_t *handle, int nblocks)
+{
+ return journal_restart(handle, nblocks);
+}
+
+static inline int ext3_journal_blocks_per_page(struct inode *inode)
+{
+ return journal_blocks_per_page(inode);
+}
+
+static inline int ext3_journal_force_commit(journal_t *journal)
+{
+ return journal_force_commit(journal);
+}
+
+/* super.c */
+int ext3_force_commit(struct super_block *sb);
+
+static inline int ext3_should_journal_data(struct inode *inode)
+{
+ if (!S_ISREG(inode->i_mode))
+ return 1;
+ if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA)
+ return 1;
+ if (EXT3_I(inode)->i_flags & EXT3_JOURNAL_DATA_FL)
+ return 1;
+ return 0;
+}
+
+static inline int ext3_should_order_data(struct inode *inode)
+{
+ if (!S_ISREG(inode->i_mode))
+ return 0;
+ if (EXT3_I(inode)->i_flags & EXT3_JOURNAL_DATA_FL)
+ return 0;
+ if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA)
+ return 1;
+ return 0;
+}
+
+static inline int ext3_should_writeback_data(struct inode *inode)
+{
+ if (!S_ISREG(inode->i_mode))
+ return 0;
+ if (EXT3_I(inode)->i_flags & EXT3_JOURNAL_DATA_FL)
+ return 0;
+ if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA)
+ return 1;
+ return 0;
+}
+
+#include <trace/events/ext3.h>
diff --git a/fs/ext3/ext3_jbd.c b/fs/ext3/ext3_jbd.c
index d401f148d74..785a3261a26 100644
--- a/fs/ext3/ext3_jbd.c
+++ b/fs/ext3/ext3_jbd.c
@@ -2,7 +2,7 @@
* Interface between ext3 and JBD
*/
-#include <linux/ext3_jbd.h>
+#include "ext3.h"
int __ext3_journal_get_undo_access(const char *where, handle_t *handle,
struct buffer_head *bh)
diff --git a/fs/ext3/file.c b/fs/ext3/file.c
index 724df69847d..25cb413277e 100644
--- a/fs/ext3/file.c
+++ b/fs/ext3/file.c
@@ -18,12 +18,8 @@
* (jj@sunsite.ms.mff.cuni.cz)
*/
-#include <linux/time.h>
-#include <linux/fs.h>
-#include <linux/jbd.h>
#include <linux/quotaops.h>
-#include <linux/ext3_fs.h>
-#include <linux/ext3_jbd.h>
+#include "ext3.h"
#include "xattr.h"
#include "acl.h"
diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c
index 1860ed35632..d4dff278cbd 100644
--- a/fs/ext3/fsync.c
+++ b/fs/ext3/fsync.c
@@ -22,15 +22,9 @@
* we can depend on generic_block_fdatasync() to sync the data blocks.
*/
-#include <linux/time.h>
#include <linux/blkdev.h>
-#include <linux/fs.h>
-#include <linux/sched.h>
#include <linux/writeback.h>
-#include <linux/jbd.h>
-#include <linux/ext3_fs.h>
-#include <linux/ext3_jbd.h>
-#include <trace/events/ext3.h>
+#include "ext3.h"
/*
* akpm: A new design for ext3_sync_file().
diff --git a/fs/ext3/hash.c b/fs/ext3/hash.c
index 7d215b4d4f2..d10231ddcf8 100644
--- a/fs/ext3/hash.c
+++ b/fs/ext3/hash.c
@@ -9,9 +9,7 @@
* License.
*/
-#include <linux/fs.h>
-#include <linux/jbd.h>
-#include <linux/ext3_fs.h>
+#include "ext3.h"
#include <linux/cryptohash.h>
#define DELTA 0x9E3779B9
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index 1cde2843801..e3c39e4cec1 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -12,21 +12,10 @@
* David S. Miller (davem@caip.rutgers.edu), 1995
*/
-#include <linux/time.h>
-#include <linux/fs.h>
-#include <linux/jbd.h>
-#include <linux/ext3_fs.h>
-#include <linux/ext3_jbd.h>
-#include <linux/stat.h>
-#include <linux/string.h>
#include <linux/quotaops.h>
-#include <linux/buffer_head.h>
#include <linux/random.h>
-#include <linux/bitops.h>
-#include <trace/events/ext3.h>
-
-#include <asm/byteorder.h>
+#include "ext3.h"
#include "xattr.h"
#include "acl.h"
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 2d0afeca0b4..10d7812f602 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -22,22 +22,12 @@
* Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000
*/
-#include <linux/fs.h>
-#include <linux/time.h>
-#include <linux/ext3_jbd.h>
-#include <linux/jbd.h>
#include <linux/highuid.h>
-#include <linux/pagemap.h>
#include <linux/quotaops.h>
-#include <linux/string.h>
-#include <linux/buffer_head.h>
#include <linux/writeback.h>
#include <linux/mpage.h>
-#include <linux/uio.h>
-#include <linux/bio.h>
-#include <linux/fiemap.h>
#include <linux/namei.h>
-#include <trace/events/ext3.h>
+#include "ext3.h"
#include "xattr.h"
#include "acl.h"
@@ -756,6 +746,7 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode,
struct ext3_block_alloc_info *block_i;
ext3_fsblk_t current_block;
struct ext3_inode_info *ei = EXT3_I(inode);
+ struct timespec now;
block_i = ei->i_block_alloc_info;
/*
@@ -795,9 +786,11 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode,
}
/* We are done with atomic stuff, now do the rest of housekeeping */
-
- inode->i_ctime = CURRENT_TIME_SEC;
- ext3_mark_inode_dirty(handle, inode);
+ now = CURRENT_TIME_SEC;
+ if (!timespec_equal(&inode->i_ctime, &now) || !where->bh) {
+ inode->i_ctime = now;
+ ext3_mark_inode_dirty(handle, inode);
+ }
/* ext3_mark_inode_dirty already updated i_sync_tid */
atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid);
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
index 4af574ce4a4..677a5c27dc6 100644
--- a/fs/ext3/ioctl.c
+++ b/fs/ext3/ioctl.c
@@ -7,15 +7,10 @@
* Universite Pierre et Marie Curie (Paris VI)
*/
-#include <linux/fs.h>
-#include <linux/jbd.h>
-#include <linux/capability.h>
-#include <linux/ext3_fs.h>
-#include <linux/ext3_jbd.h>
#include <linux/mount.h>
-#include <linux/time.h>
#include <linux/compat.h>
#include <asm/uaccess.h>
+#include "ext3.h"
long ext3_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index e8e211795e9..d7940b24cf6 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -24,20 +24,8 @@
* Theodore Ts'o, 2002
*/
-#include <linux/fs.h>
-#include <linux/pagemap.h>
-#include <linux/jbd.h>
-#include <linux/time.h>
-#include <linux/ext3_fs.h>
-#include <linux/ext3_jbd.h>
-#include <linux/fcntl.h>
-#include <linux/stat.h>
-#include <linux/string.h>
#include <linux/quotaops.h>
-#include <linux/buffer_head.h>
-#include <linux/bio.h>
-#include <trace/events/ext3.h>
-
+#include "ext3.h"
#include "namei.h"
#include "xattr.h"
#include "acl.h"
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index 7916e4ce166..0f814f3450d 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -11,10 +11,7 @@
#define EXT3FS_DEBUG
-#include <linux/ext3_jbd.h>
-
-#include <linux/errno.h>
-#include <linux/slab.h>
+#include "ext3.h"
#define outside(b, first, last) ((b) < (first) || (b) >= (last))
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 726c7ef6cdf..cf0b5921cf0 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -17,22 +17,12 @@
*/
#include <linux/module.h>
-#include <linux/string.h>
-#include <linux/fs.h>
-#include <linux/time.h>
-#include <linux/jbd.h>
-#include <linux/ext3_fs.h>
-#include <linux/ext3_jbd.h>
-#include <linux/slab.h>
-#include <linux/init.h>
#include <linux/blkdev.h>
#include <linux/parser.h>
-#include <linux/buffer_head.h>
#include <linux/exportfs.h>
-#include <linux/vfs.h>
+#include <linux/statfs.h>
#include <linux/random.h>
#include <linux/mount.h>
-#include <linux/namei.h>
#include <linux/quotaops.h>
#include <linux/seq_file.h>
#include <linux/log2.h>
@@ -40,13 +30,13 @@
#include <asm/uaccess.h>
+#define CREATE_TRACE_POINTS
+
+#include "ext3.h"
#include "xattr.h"
#include "acl.h"
#include "namei.h"
-#define CREATE_TRACE_POINTS
-#include <trace/events/ext3.h>
-
#ifdef CONFIG_EXT3_DEFAULTS_TO_ORDERED
#define EXT3_MOUNT_DEFAULT_DATA_MODE EXT3_MOUNT_ORDERED_DATA
#else
@@ -2046,10 +2036,9 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
ext3_msg(sb, KERN_ERR, "error: corrupt root inode, run e2fsck");
goto failed_mount3;
}
- sb->s_root = d_alloc_root(root);
+ sb->s_root = d_make_root(root);
if (!sb->s_root) {
ext3_msg(sb, KERN_ERR, "error: get root dentry failed");
- iput(root);
ret = -ENOMEM;
goto failed_mount3;
}
diff --git a/fs/ext3/symlink.c b/fs/ext3/symlink.c
index 7c489820777..6b01c3eab1f 100644
--- a/fs/ext3/symlink.c
+++ b/fs/ext3/symlink.c
@@ -17,10 +17,8 @@
* ext3 symlink handling code
*/
-#include <linux/fs.h>
-#include <linux/jbd.h>
-#include <linux/ext3_fs.h>
#include <linux/namei.h>
+#include "ext3.h"
#include "xattr.h"
static void * ext3_follow_link(struct dentry *dentry, struct nameidata *nd)
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index d565759d82e..d22ebb7a4f5 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -50,14 +50,9 @@
* by the buffer lock.
*/
-#include <linux/init.h>
-#include <linux/fs.h>
-#include <linux/slab.h>
-#include <linux/ext3_jbd.h>
-#include <linux/ext3_fs.h>
+#include "ext3.h"
#include <linux/mbcache.h>
#include <linux/quotaops.h>
-#include <linux/rwsem.h>
#include "xattr.h"
#include "acl.h"
diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c
index ea26f2acab9..3387664ad70 100644
--- a/fs/ext3/xattr_security.c
+++ b/fs/ext3/xattr_security.c
@@ -3,12 +3,8 @@
* Handler for storing security labels as extended attributes.
*/
-#include <linux/slab.h>
-#include <linux/string.h>
-#include <linux/fs.h>
-#include <linux/ext3_jbd.h>
-#include <linux/ext3_fs.h>
#include <linux/security.h>
+#include "ext3.h"
#include "xattr.h"
static size_t
diff --git a/fs/ext3/xattr_trusted.c b/fs/ext3/xattr_trusted.c
index 2526a8829de..d75727cc67f 100644
--- a/fs/ext3/xattr_trusted.c
+++ b/fs/ext3/xattr_trusted.c
@@ -5,11 +5,7 @@
* Copyright (C) 2003 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
*/
-#include <linux/string.h>
-#include <linux/capability.h>
-#include <linux/fs.h>
-#include <linux/ext3_jbd.h>
-#include <linux/ext3_fs.h>
+#include "ext3.h"
#include "xattr.h"
static size_t
diff --git a/fs/ext3/xattr_user.c b/fs/ext3/xattr_user.c
index b32e473a1e3..5612af3567e 100644
--- a/fs/ext3/xattr_user.c
+++ b/fs/ext3/xattr_user.c
@@ -5,10 +5,7 @@
* Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
*/
-#include <linux/string.h>
-#include <linux/fs.h>
-#include <linux/ext3_jbd.h>
-#include <linux/ext3_fs.h>
+#include "ext3.h"
#include "xattr.h"
static size_t
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index f9e2cd8cf71..4bbd07a6fa1 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -336,10 +336,10 @@ err_out:
* Return buffer_head on success or NULL in case of failure.
*/
struct buffer_head *
-ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
+ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
{
struct ext4_group_desc *desc;
- struct buffer_head *bh = NULL;
+ struct buffer_head *bh;
ext4_fsblk_t bitmap_blk;
desc = ext4_get_group_desc(sb, block_group, NULL);
@@ -348,9 +348,9 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
bitmap_blk = ext4_block_bitmap(sb, desc);
bh = sb_getblk(sb, bitmap_blk);
if (unlikely(!bh)) {
- ext4_error(sb, "Cannot read block bitmap - "
- "block_group = %u, block_bitmap = %llu",
- block_group, bitmap_blk);
+ ext4_error(sb, "Cannot get buffer for block bitmap - "
+ "block_group = %u, block_bitmap = %llu",
+ block_group, bitmap_blk);
return NULL;
}
@@ -382,25 +382,50 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
return bh;
}
/*
- * submit the buffer_head for read. We can
- * safely mark the bitmap as uptodate now.
- * We do it here so the bitmap uptodate bit
- * get set with buffer lock held.
+ * submit the buffer_head for reading
*/
+ set_buffer_new(bh);
trace_ext4_read_block_bitmap_load(sb, block_group);
- set_bitmap_uptodate(bh);
- if (bh_submit_read(bh) < 0) {
- put_bh(bh);
+ bh->b_end_io = ext4_end_bitmap_read;
+ get_bh(bh);
+ submit_bh(READ, bh);
+ return bh;
+}
+
+/* Returns 0 on success, 1 on error */
+int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group,
+ struct buffer_head *bh)
+{
+ struct ext4_group_desc *desc;
+
+ if (!buffer_new(bh))
+ return 0;
+ desc = ext4_get_group_desc(sb, block_group, NULL);
+ if (!desc)
+ return 1;
+ wait_on_buffer(bh);
+ if (!buffer_uptodate(bh)) {
ext4_error(sb, "Cannot read block bitmap - "
- "block_group = %u, block_bitmap = %llu",
- block_group, bitmap_blk);
- return NULL;
+ "block_group = %u, block_bitmap = %llu",
+ block_group, (unsigned long long) bh->b_blocknr);
+ return 1;
}
+ clear_buffer_new(bh);
+ /* Panic or remount fs read-only if block bitmap is invalid */
ext4_valid_block_bitmap(sb, desc, block_group, bh);
- /*
- * file system mounted not to panic on error,
- * continue with corrupt bitmap
- */
+ return 0;
+}
+
+struct buffer_head *
+ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
+{
+ struct buffer_head *bh;
+
+ bh = ext4_read_block_bitmap_nowait(sb, block_group);
+ if (ext4_wait_block_bitmap(sb, block_group, bh)) {
+ put_bh(bh);
+ return NULL;
+ }
return bh;
}
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 164c56092e5..b8678620264 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -32,24 +32,8 @@ static unsigned char ext4_filetype_table[] = {
DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
};
-static int ext4_readdir(struct file *, void *, filldir_t);
static int ext4_dx_readdir(struct file *filp,
void *dirent, filldir_t filldir);
-static int ext4_release_dir(struct inode *inode,
- struct file *filp);
-
-const struct file_operations ext4_dir_operations = {
- .llseek = ext4_llseek,
- .read = generic_read_dir,
- .readdir = ext4_readdir, /* we take BKL. needed?*/
- .unlocked_ioctl = ext4_ioctl,
-#ifdef CONFIG_COMPAT
- .compat_ioctl = ext4_compat_ioctl,
-#endif
- .fsync = ext4_sync_file,
- .release = ext4_release_dir,
-};
-
static unsigned char get_dtype(struct super_block *sb, int filetype)
{
@@ -60,6 +44,26 @@ static unsigned char get_dtype(struct super_block *sb, int filetype)
return (ext4_filetype_table[filetype]);
}
+/**
+ * Check if the given dir-inode refers to an htree-indexed directory
+ * (or a directory which chould potentially get coverted to use htree
+ * indexing).
+ *
+ * Return 1 if it is a dx dir, 0 if not
+ */
+static int is_dx_dir(struct inode *inode)
+{
+ struct super_block *sb = inode->i_sb;
+
+ if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_COMPAT_DIR_INDEX) &&
+ ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) ||
+ ((inode->i_size >> sb->s_blocksize_bits) == 1)))
+ return 1;
+
+ return 0;
+}
+
/*
* Return 0 if the directory entry is OK, and 1 if there is a problem
*
@@ -91,17 +95,17 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
return 0;
if (filp)
- ext4_error_file(filp, function, line, bh ? bh->b_blocknr : 0,
+ ext4_error_file(filp, function, line, bh->b_blocknr,
"bad entry in directory: %s - offset=%u(%u), "
"inode=%u, rec_len=%d, name_len=%d",
- error_msg, (unsigned) (offset%bh->b_size),
+ error_msg, (unsigned) (offset % bh->b_size),
offset, le32_to_cpu(de->inode),
rlen, de->name_len);
else
- ext4_error_inode(dir, function, line, bh ? bh->b_blocknr : 0,
+ ext4_error_inode(dir, function, line, bh->b_blocknr,
"bad entry in directory: %s - offset=%u(%u), "
"inode=%u, rec_len=%d, name_len=%d",
- error_msg, (unsigned) (offset%bh->b_size),
+ error_msg, (unsigned) (offset % bh->b_size),
offset, le32_to_cpu(de->inode),
rlen, de->name_len);
@@ -115,18 +119,13 @@ static int ext4_readdir(struct file *filp,
unsigned int offset;
int i, stored;
struct ext4_dir_entry_2 *de;
- struct super_block *sb;
int err;
struct inode *inode = filp->f_path.dentry->d_inode;
+ struct super_block *sb = inode->i_sb;
int ret = 0;
int dir_has_error = 0;
- sb = inode->i_sb;
-
- if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
- EXT4_FEATURE_COMPAT_DIR_INDEX) &&
- ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) ||
- ((inode->i_size >> sb->s_blocksize_bits) == 1))) {
+ if (is_dx_dir(inode)) {
err = ext4_dx_readdir(filp, dirent, filldir);
if (err != ERR_BAD_DX_DIR) {
ret = err;
@@ -254,22 +253,134 @@ out:
return ret;
}
+static inline int is_32bit_api(void)
+{
+#ifdef CONFIG_COMPAT
+ return is_compat_task();
+#else
+ return (BITS_PER_LONG == 32);
+#endif
+}
+
/*
* These functions convert from the major/minor hash to an f_pos
- * value.
+ * value for dx directories
+ *
+ * Upper layer (for example NFS) should specify FMODE_32BITHASH or
+ * FMODE_64BITHASH explicitly. On the other hand, we allow ext4 to be mounted
+ * directly on both 32-bit and 64-bit nodes, under such case, neither
+ * FMODE_32BITHASH nor FMODE_64BITHASH is specified.
+ */
+static inline loff_t hash2pos(struct file *filp, __u32 major, __u32 minor)
+{
+ if ((filp->f_mode & FMODE_32BITHASH) ||
+ (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
+ return major >> 1;
+ else
+ return ((__u64)(major >> 1) << 32) | (__u64)minor;
+}
+
+static inline __u32 pos2maj_hash(struct file *filp, loff_t pos)
+{
+ if ((filp->f_mode & FMODE_32BITHASH) ||
+ (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
+ return (pos << 1) & 0xffffffff;
+ else
+ return ((pos >> 32) << 1) & 0xffffffff;
+}
+
+static inline __u32 pos2min_hash(struct file *filp, loff_t pos)
+{
+ if ((filp->f_mode & FMODE_32BITHASH) ||
+ (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
+ return 0;
+ else
+ return pos & 0xffffffff;
+}
+
+/*
+ * Return 32- or 64-bit end-of-file for dx directories
+ */
+static inline loff_t ext4_get_htree_eof(struct file *filp)
+{
+ if ((filp->f_mode & FMODE_32BITHASH) ||
+ (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
+ return EXT4_HTREE_EOF_32BIT;
+ else
+ return EXT4_HTREE_EOF_64BIT;
+}
+
+
+/*
+ * ext4_dir_llseek() based on generic_file_llseek() to handle both
+ * non-htree and htree directories, where the "offset" is in terms
+ * of the filename hash value instead of the byte offset.
*
- * Currently we only use major hash numer. This is unfortunate, but
- * on 32-bit machines, the same VFS interface is used for lseek and
- * llseek, so if we use the 64 bit offset, then the 32-bit versions of
- * lseek/telldir/seekdir will blow out spectacularly, and from within
- * the ext2 low-level routine, we don't know if we're being called by
- * a 64-bit version of the system call or the 32-bit version of the
- * system call. Worse yet, NFSv2 only allows for a 32-bit readdir
- * cookie. Sigh.
+ * NOTE: offsets obtained *before* ext4_set_inode_flag(dir, EXT4_INODE_INDEX)
+ * will be invalid once the directory was converted into a dx directory
*/
-#define hash2pos(major, minor) (major >> 1)
-#define pos2maj_hash(pos) ((pos << 1) & 0xffffffff)
-#define pos2min_hash(pos) (0)
+loff_t ext4_dir_llseek(struct file *file, loff_t offset, int origin)
+{
+ struct inode *inode = file->f_mapping->host;
+ loff_t ret = -EINVAL;
+ int dx_dir = is_dx_dir(inode);
+
+ mutex_lock(&inode->i_mutex);
+
+ /* NOTE: relative offsets with dx directories might not work
+ * as expected, as it is difficult to figure out the
+ * correct offset between dx hashes */
+
+ switch (origin) {
+ case SEEK_END:
+ if (unlikely(offset > 0))
+ goto out_err; /* not supported for directories */
+
+ /* so only negative offsets are left, does that have a
+ * meaning for directories at all? */
+ if (dx_dir)
+ offset += ext4_get_htree_eof(file);
+ else
+ offset += inode->i_size;
+ break;
+ case SEEK_CUR:
+ /*
+ * Here we special-case the lseek(fd, 0, SEEK_CUR)
+ * position-querying operation. Avoid rewriting the "same"
+ * f_pos value back to the file because a concurrent read(),
+ * write() or lseek() might have altered it
+ */
+ if (offset == 0) {
+ offset = file->f_pos;
+ goto out_ok;
+ }
+
+ offset += file->f_pos;
+ break;
+ }
+
+ if (unlikely(offset < 0))
+ goto out_err;
+
+ if (!dx_dir) {
+ if (offset > inode->i_sb->s_maxbytes)
+ goto out_err;
+ } else if (offset > ext4_get_htree_eof(file))
+ goto out_err;
+
+ /* Special lock needed here? */
+ if (offset != file->f_pos) {
+ file->f_pos = offset;
+ file->f_version = 0;
+ }
+
+out_ok:
+ ret = offset;
+out_err:
+ mutex_unlock(&inode->i_mutex);
+
+ return ret;
+}
/*
* This structure holds the nodes of the red-black tree used to store
@@ -330,15 +441,16 @@ static void free_rb_tree_fname(struct rb_root *root)
}
-static struct dir_private_info *ext4_htree_create_dir_info(loff_t pos)
+static struct dir_private_info *ext4_htree_create_dir_info(struct file *filp,
+ loff_t pos)
{
struct dir_private_info *p;
p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL);
if (!p)
return NULL;
- p->curr_hash = pos2maj_hash(pos);
- p->curr_minor_hash = pos2min_hash(pos);
+ p->curr_hash = pos2maj_hash(filp, pos);
+ p->curr_minor_hash = pos2min_hash(filp, pos);
return p;
}
@@ -425,11 +537,12 @@ static int call_filldir(struct file *filp, void *dirent,
sb = inode->i_sb;
if (!fname) {
- printk(KERN_ERR "EXT4-fs: call_filldir: called with "
- "null fname?!?\n");
+ ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: comm %s: "
+ "called with null fname?!?", __func__, __LINE__,
+ inode->i_ino, current->comm);
return 0;
}
- curr_pos = hash2pos(fname->hash, fname->minor_hash);
+ curr_pos = hash2pos(filp, fname->hash, fname->minor_hash);
while (fname) {
error = filldir(dirent, fname->name,
fname->name_len, curr_pos,
@@ -454,13 +567,13 @@ static int ext4_dx_readdir(struct file *filp,
int ret;
if (!info) {
- info = ext4_htree_create_dir_info(filp->f_pos);
+ info = ext4_htree_create_dir_info(filp, filp->f_pos);
if (!info)
return -ENOMEM;
filp->private_data = info;
}
- if (filp->f_pos == EXT4_HTREE_EOF)
+ if (filp->f_pos == ext4_get_htree_eof(filp))
return 0; /* EOF */
/* Some one has messed with f_pos; reset the world */
@@ -468,8 +581,8 @@ static int ext4_dx_readdir(struct file *filp,
free_rb_tree_fname(&info->root);
info->curr_node = NULL;
info->extra_fname = NULL;
- info->curr_hash = pos2maj_hash(filp->f_pos);
- info->curr_minor_hash = pos2min_hash(filp->f_pos);
+ info->curr_hash = pos2maj_hash(filp, filp->f_pos);
+ info->curr_minor_hash = pos2min_hash(filp, filp->f_pos);
}
/*
@@ -501,7 +614,7 @@ static int ext4_dx_readdir(struct file *filp,
if (ret < 0)
return ret;
if (ret == 0) {
- filp->f_pos = EXT4_HTREE_EOF;
+ filp->f_pos = ext4_get_htree_eof(filp);
break;
}
info->curr_node = rb_first(&info->root);
@@ -521,7 +634,7 @@ static int ext4_dx_readdir(struct file *filp,
info->curr_minor_hash = fname->minor_hash;
} else {
if (info->next_hash == ~0) {
- filp->f_pos = EXT4_HTREE_EOF;
+ filp->f_pos = ext4_get_htree_eof(filp);
break;
}
info->curr_hash = info->next_hash;
@@ -540,3 +653,15 @@ static int ext4_release_dir(struct inode *inode, struct file *filp)
return 0;
}
+
+const struct file_operations ext4_dir_operations = {
+ .llseek = ext4_dir_llseek,
+ .read = generic_read_dir,
+ .readdir = ext4_readdir,
+ .unlocked_ioctl = ext4_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = ext4_compat_ioctl,
+#endif
+ .fsync = ext4_sync_file,
+ .release = ext4_release_dir,
+};
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 513004fc3d8..ab2594a30f8 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -53,7 +53,7 @@
printk(KERN_DEBUG f, ## a); \
} while (0)
#else
-#define ext4_debug(f, a...) do {} while (0)
+#define ext4_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__)
#endif
#define EXT4_ERROR_INODE(inode, fmt, a...) \
@@ -184,6 +184,8 @@ struct mpage_da_data {
#define EXT4_IO_END_UNWRITTEN 0x0001
#define EXT4_IO_END_ERROR 0x0002
#define EXT4_IO_END_QUEUED 0x0004
+#define EXT4_IO_END_DIRECT 0x0008
+#define EXT4_IO_END_IN_FSYNC 0x0010
struct ext4_io_page {
struct page *p_page;
@@ -192,18 +194,25 @@ struct ext4_io_page {
#define MAX_IO_PAGES 128
+/*
+ * For converting uninitialized extents on a work queue.
+ *
+ * 'page' is only used from the writepage() path; 'pages' is only used for
+ * buffered writes; they are used to keep page references until conversion
+ * takes place. For AIO/DIO, neither field is filled in.
+ */
typedef struct ext4_io_end {
struct list_head list; /* per-file finished IO list */
struct inode *inode; /* file being written to */
unsigned int flag; /* unwritten or not */
- struct page *page; /* page struct for buffer write */
+ struct page *page; /* for writepage() path */
loff_t offset; /* offset in the file */
ssize_t size; /* size of the extent */
struct work_struct work; /* data work queue */
struct kiocb *iocb; /* iocb struct for AIO */
int result; /* error value for AIO */
- int num_io_pages;
- struct ext4_io_page *pages[MAX_IO_PAGES];
+ int num_io_pages; /* for writepages() */
+ struct ext4_io_page *pages[MAX_IO_PAGES]; /* for writepages() */
} ext4_io_end_t;
struct ext4_io_submit {
@@ -923,6 +932,7 @@ struct ext4_inode_info {
#define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */
#define EXT4_MOUNT_ERRORS_RO 0x00020 /* Remount fs ro on errors */
#define EXT4_MOUNT_ERRORS_PANIC 0x00040 /* Panic on errors */
+#define EXT4_MOUNT_ERRORS_MASK 0x00070
#define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */
#define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/
#define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */
@@ -941,7 +951,6 @@ struct ext4_inode_info {
#define EXT4_MOUNT_DIOREAD_NOLOCK 0x400000 /* Enable support for dio read nolocking */
#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */
#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
-#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */
#define EXT4_MOUNT_MBLK_IO_SUBMIT 0x4000000 /* multi-block io submits */
#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */
#define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */
@@ -1142,6 +1151,7 @@ struct ext4_sb_info {
unsigned int s_mount_opt;
unsigned int s_mount_opt2;
unsigned int s_mount_flags;
+ unsigned int s_def_mount_opt;
ext4_fsblk_t s_sb_block;
uid_t s_resuid;
gid_t s_resgid;
@@ -1420,8 +1430,9 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
#define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200
#define EXT4_FEATURE_INCOMPAT_EA_INODE 0x0400 /* EA in inode */
#define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */
-#define EXT4_FEATURE_INCOMPAT_INLINEDATA 0x2000 /* data in inode */
+#define EXT4_FEATURE_INCOMPAT_BG_USE_META_CSUM 0x2000 /* use crc32c for bg */
#define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000 /* >2GB or 3-lvl htree */
+#define EXT4_FEATURE_INCOMPAT_INLINEDATA 0x8000 /* data in inode */
#define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR
#define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \
@@ -1612,7 +1623,11 @@ struct dx_hash_info
u32 *seed;
};
-#define EXT4_HTREE_EOF 0x7fffffff
+
+/* 32 and 64 bit signed EOF for dx directories */
+#define EXT4_HTREE_EOF_32BIT ((1UL << (32 - 1)) - 1)
+#define EXT4_HTREE_EOF_64BIT ((1ULL << (64 - 1)) - 1)
+
/*
* Control parameters used by ext4_htree_next_block
@@ -1794,8 +1809,14 @@ extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
ext4_group_t block_group,
struct buffer_head ** bh);
extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);
-struct buffer_head *ext4_read_block_bitmap(struct super_block *sb,
- ext4_group_t block_group);
+
+extern struct buffer_head *ext4_read_block_bitmap_nowait(struct super_block *sb,
+ ext4_group_t block_group);
+extern int ext4_wait_block_bitmap(struct super_block *sb,
+ ext4_group_t block_group,
+ struct buffer_head *bh);
+extern struct buffer_head *ext4_read_block_bitmap(struct super_block *sb,
+ ext4_group_t block_group);
extern void ext4_init_block_bitmap(struct super_block *sb,
struct buffer_head *bh,
ext4_group_t group,
@@ -1841,6 +1862,7 @@ extern void ext4_check_inodes_bitmap(struct super_block *);
extern void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap);
extern int ext4_init_inode_table(struct super_block *sb,
ext4_group_t group, int barrier);
+extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate);
/* mballoc.c */
extern long ext4_mb_stats;
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index a52db3a69a3..0f58b86e3a0 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -47,9 +47,9 @@
*/
#define EXT_DEBUG__
#ifdef EXT_DEBUG
-#define ext_debug(a...) printk(a)
+#define ext_debug(fmt, ...) printk(fmt, ##__VA_ARGS__)
#else
-#define ext_debug(a...)
+#define ext_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__)
#endif
/*
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 5802fa1dab1..83b20fcf940 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -104,6 +104,78 @@
#define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb))
#define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb))
+/**
+ * struct ext4_journal_cb_entry - Base structure for callback information.
+ *
+ * This struct is a 'seed' structure for a using with your own callback
+ * structs. If you are using callbacks you must allocate one of these
+ * or another struct of your own definition which has this struct
+ * as it's first element and pass it to ext4_journal_callback_add().
+ */
+struct ext4_journal_cb_entry {
+ /* list information for other callbacks attached to the same handle */
+ struct list_head jce_list;
+
+ /* Function to call with this callback structure */
+ void (*jce_func)(struct super_block *sb,
+ struct ext4_journal_cb_entry *jce, int error);
+
+ /* user data goes here */
+};
+
+/**
+ * ext4_journal_callback_add: add a function to call after transaction commit
+ * @handle: active journal transaction handle to register callback on
+ * @func: callback function to call after the transaction has committed:
+ * @sb: superblock of current filesystem for transaction
+ * @jce: returned journal callback data
+ * @rc: journal state at commit (0 = transaction committed properly)
+ * @jce: journal callback data (internal and function private data struct)
+ *
+ * The registered function will be called in the context of the journal thread
+ * after the transaction for which the handle was created has completed.
+ *
+ * No locks are held when the callback function is called, so it is safe to
+ * call blocking functions from within the callback, but the callback should
+ * not block or run for too long, or the filesystem will be blocked waiting for
+ * the next transaction to commit. No journaling functions can be used, or
+ * there is a risk of deadlock.
+ *
+ * There is no guaranteed calling order of multiple registered callbacks on
+ * the same transaction.
+ */
+static inline void ext4_journal_callback_add(handle_t *handle,
+ void (*func)(struct super_block *sb,
+ struct ext4_journal_cb_entry *jce,
+ int rc),
+ struct ext4_journal_cb_entry *jce)
+{
+ struct ext4_sb_info *sbi =
+ EXT4_SB(handle->h_transaction->t_journal->j_private);
+
+ /* Add the jce to transaction's private list */
+ jce->jce_func = func;
+ spin_lock(&sbi->s_md_lock);
+ list_add_tail(&jce->jce_list, &handle->h_transaction->t_private_list);
+ spin_unlock(&sbi->s_md_lock);
+}
+
+/**
+ * ext4_journal_callback_del: delete a registered callback
+ * @handle: active journal transaction handle on which callback was registered
+ * @jce: registered journal callback entry to unregister
+ */
+static inline void ext4_journal_callback_del(handle_t *handle,
+ struct ext4_journal_cb_entry *jce)
+{
+ struct ext4_sb_info *sbi =
+ EXT4_SB(handle->h_transaction->t_journal->j_private);
+
+ spin_lock(&sbi->s_md_lock);
+ list_del_init(&jce->jce_list);
+ spin_unlock(&sbi->s_md_lock);
+}
+
int
ext4_mark_iloc_dirty(handle_t *handle,
struct inode *inode,
@@ -261,43 +333,45 @@ static inline void ext4_update_inode_fsync_trans(handle_t *handle,
/* super.c */
int ext4_force_commit(struct super_block *sb);
-static inline int ext4_should_journal_data(struct inode *inode)
+/*
+ * Ext4 inode journal modes
+ */
+#define EXT4_INODE_JOURNAL_DATA_MODE 0x01 /* journal data mode */
+#define EXT4_INODE_ORDERED_DATA_MODE 0x02 /* ordered data mode */
+#define EXT4_INODE_WRITEBACK_DATA_MODE 0x04 /* writeback data mode */
+
+static inline int ext4_inode_journal_mode(struct inode *inode)
{
if (EXT4_JOURNAL(inode) == NULL)
- return 0;
- if (!S_ISREG(inode->i_mode))
- return 1;
- if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
- return 1;
- if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
- return 1;
- return 0;
+ return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */
+ /* We do not support data journalling with delayed allocation */
+ if (!S_ISREG(inode->i_mode) ||
+ test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
+ return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */
+ if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) &&
+ !test_opt(inode->i_sb, DELALLOC))
+ return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */
+ if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
+ return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */
+ if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
+ return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */
+ else
+ BUG();
+}
+
+static inline int ext4_should_journal_data(struct inode *inode)
+{
+ return ext4_inode_journal_mode(inode) & EXT4_INODE_JOURNAL_DATA_MODE;
}
static inline int ext4_should_order_data(struct inode *inode)
{
- if (EXT4_JOURNAL(inode) == NULL)
- return 0;
- if (!S_ISREG(inode->i_mode))
- return 0;
- if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
- return 0;
- if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
- return 1;
- return 0;
+ return ext4_inode_journal_mode(inode) & EXT4_INODE_ORDERED_DATA_MODE;
}
static inline int ext4_should_writeback_data(struct inode *inode)
{
- if (EXT4_JOURNAL(inode) == NULL)
- return 1;
- if (!S_ISREG(inode->i_mode))
- return 0;
- if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
- return 0;
- if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
- return 1;
- return 0;
+ return ext4_inode_journal_mode(inode) & EXT4_INODE_WRITEBACK_DATA_MODE;
}
/*
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 74f23c292e1..1421938e679 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -44,6 +44,14 @@
#include <trace/events/ext4.h>
+/*
+ * used by extent splitting.
+ */
+#define EXT4_EXT_MAY_ZEROOUT 0x1 /* safe to zeroout if split fails \
+ due to ENOSPC */
+#define EXT4_EXT_MARK_UNINIT1 0x2 /* mark first half uninitialized */
+#define EXT4_EXT_MARK_UNINIT2 0x4 /* mark second half uninitialized */
+
static int ext4_split_extent(handle_t *handle,
struct inode *inode,
struct ext4_ext_path *path,
@@ -51,6 +59,13 @@ static int ext4_split_extent(handle_t *handle,
int split_flag,
int flags);
+static int ext4_split_extent_at(handle_t *handle,
+ struct inode *inode,
+ struct ext4_ext_path *path,
+ ext4_lblk_t split,
+ int split_flag,
+ int flags);
+
static int ext4_ext_truncate_extend_restart(handle_t *handle,
struct inode *inode,
int needed)
@@ -300,6 +315,8 @@ static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
ext4_fsblk_t block = ext4_ext_pblock(ext);
int len = ext4_ext_get_actual_len(ext);
+ if (len == 0)
+ return 0;
return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
}
@@ -2308,7 +2325,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
struct ext4_extent *ex;
/* the header must be checked already in ext4_ext_remove_space() */
- ext_debug("truncate since %u in leaf\n", start);
+ ext_debug("truncate since %u in leaf to %u\n", start, end);
if (!path[depth].p_hdr)
path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
eh = path[depth].p_hdr;
@@ -2343,14 +2360,17 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
ext_debug(" border %u:%u\n", a, b);
/* If this extent is beyond the end of the hole, skip it */
- if (end <= ex_ee_block) {
+ if (end < ex_ee_block) {
ex--;
ex_ee_block = le32_to_cpu(ex->ee_block);
ex_ee_len = ext4_ext_get_actual_len(ex);
continue;
} else if (b != ex_ee_block + ex_ee_len - 1) {
- EXT4_ERROR_INODE(inode," bad truncate %u:%u\n",
- start, end);
+ EXT4_ERROR_INODE(inode,
+ "can not handle truncate %u:%u "
+ "on extent %u:%u",
+ start, end, ex_ee_block,
+ ex_ee_block + ex_ee_len - 1);
err = -EIO;
goto out;
} else if (a != ex_ee_block) {
@@ -2482,7 +2502,8 @@ ext4_ext_more_to_rm(struct ext4_ext_path *path)
return 1;
}
-static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
+static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
+ ext4_lblk_t end)
{
struct super_block *sb = inode->i_sb;
int depth = ext_depth(inode);
@@ -2491,7 +2512,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
handle_t *handle;
int i, err;
- ext_debug("truncate since %u\n", start);
+ ext_debug("truncate since %u to %u\n", start, end);
/* probably first extent we're gonna free will be last in block */
handle = ext4_journal_start(inode, depth + 1);
@@ -2504,6 +2525,61 @@ again:
trace_ext4_ext_remove_space(inode, start, depth);
/*
+ * Check if we are removing extents inside the extent tree. If that
+ * is the case, we are going to punch a hole inside the extent tree
+ * so we have to check whether we need to split the extent covering
+ * the last block to remove so we can easily remove the part of it
+ * in ext4_ext_rm_leaf().
+ */
+ if (end < EXT_MAX_BLOCKS - 1) {
+ struct ext4_extent *ex;
+ ext4_lblk_t ee_block;
+
+ /* find extent for this block */
+ path = ext4_ext_find_extent(inode, end, NULL);
+ if (IS_ERR(path)) {
+ ext4_journal_stop(handle);
+ return PTR_ERR(path);
+ }
+ depth = ext_depth(inode);
+ ex = path[depth].p_ext;
+ if (!ex)
+ goto cont;
+
+ ee_block = le32_to_cpu(ex->ee_block);
+
+ /*
+ * See if the last block is inside the extent, if so split
+ * the extent at 'end' block so we can easily remove the
+ * tail of the first part of the split extent in
+ * ext4_ext_rm_leaf().
+ */
+ if (end >= ee_block &&
+ end < ee_block + ext4_ext_get_actual_len(ex) - 1) {
+ int split_flag = 0;
+
+ if (ext4_ext_is_uninitialized(ex))
+ split_flag = EXT4_EXT_MARK_UNINIT1 |
+ EXT4_EXT_MARK_UNINIT2;
+
+ /*
+ * Split the extent in two so that 'end' is the last
+ * block in the first new extent
+ */
+ err = ext4_split_extent_at(handle, inode, path,
+ end + 1, split_flag,
+ EXT4_GET_BLOCKS_PRE_IO |
+ EXT4_GET_BLOCKS_PUNCH_OUT_EXT);
+
+ if (err < 0)
+ goto out;
+ }
+ ext4_ext_drop_refs(path);
+ kfree(path);
+ }
+cont:
+
+ /*
* We start scanning from right side, freeing all the blocks
* after i_size and walking into the tree depth-wise.
*/
@@ -2515,6 +2591,7 @@ again:
}
path[0].p_depth = depth;
path[0].p_hdr = ext_inode_hdr(inode);
+
if (ext4_ext_check(inode, path[0].p_hdr, depth)) {
err = -EIO;
goto out;
@@ -2526,7 +2603,7 @@ again:
/* this is leaf block */
err = ext4_ext_rm_leaf(handle, inode, path,
&partial_cluster, start,
- EXT_MAX_BLOCKS - 1);
+ end);
/* root level has p_bh == NULL, brelse() eats this */
brelse(path[i].p_bh);
path[i].p_bh = NULL;
@@ -2651,17 +2728,17 @@ void ext4_ext_init(struct super_block *sb)
if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
#if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS)
- printk(KERN_INFO "EXT4-fs: file extents enabled");
+ printk(KERN_INFO "EXT4-fs: file extents enabled"
#ifdef AGGRESSIVE_TEST
- printk(", aggressive tests");
+ ", aggressive tests"
#endif
#ifdef CHECK_BINSEARCH
- printk(", check binsearch");
+ ", check binsearch"
#endif
#ifdef EXTENTS_STATS
- printk(", stats");
+ ", stats"
#endif
- printk("\n");
+ "\n");
#endif
#ifdef EXTENTS_STATS
spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock);
@@ -2709,14 +2786,6 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
}
/*
- * used by extent splitting.
- */
-#define EXT4_EXT_MAY_ZEROOUT 0x1 /* safe to zeroout if split fails \
- due to ENOSPC */
-#define EXT4_EXT_MARK_UNINIT1 0x2 /* mark first half uninitialized */
-#define EXT4_EXT_MARK_UNINIT2 0x4 /* mark second half uninitialized */
-
-/*
* ext4_split_extent_at() splits an extent at given block.
*
* @handle: the journal handle
@@ -3224,11 +3293,13 @@ static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
depth = ext_depth(inode);
eh = path[depth].p_hdr;
- if (unlikely(!eh->eh_entries)) {
- EXT4_ERROR_INODE(inode, "eh->eh_entries == 0 and "
- "EOFBLOCKS_FL set");
- return -EIO;
- }
+ /*
+ * We're going to remove EOFBLOCKS_FL entirely in future so we
+ * do not care for this case anymore. Simply remove the flag
+ * if there are no extents.
+ */
+ if (unlikely(!eh->eh_entries))
+ goto out;
last_ex = EXT_LAST_EXTENT(eh);
/*
* We should clear the EOFBLOCKS_FL flag if we are writing the
@@ -3252,6 +3323,7 @@ static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
for (i = depth-1; i >= 0; i--)
if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
return 0;
+out:
ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
return ext4_mark_inode_dirty(handle, inode);
}
@@ -3710,8 +3782,6 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
int free_on_err = 0, err = 0, depth, ret;
unsigned int allocated = 0, offset = 0;
unsigned int allocated_clusters = 0;
- unsigned int punched_out = 0;
- unsigned int result = 0;
struct ext4_allocation_request ar;
ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
ext4_lblk_t cluster_offset;
@@ -3721,8 +3791,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
/* check in cache */
- if (!(flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) &&
- ext4_ext_in_cache(inode, map->m_lblk, &newex)) {
+ if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) {
if (!newex.ee_start_lo && !newex.ee_start_hi) {
if ((sbi->s_cluster_ratio > 1) &&
ext4_find_delalloc_cluster(inode, map->m_lblk, 0))
@@ -3790,113 +3859,25 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
/* if found extent covers block, simply return it */
if (in_range(map->m_lblk, ee_block, ee_len)) {
- struct ext4_map_blocks punch_map;
- ext4_fsblk_t partial_cluster = 0;
-
newblock = map->m_lblk - ee_block + ee_start;
/* number of remaining blocks in the extent */
allocated = ee_len - (map->m_lblk - ee_block);
ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
ee_block, ee_len, newblock);
- if ((flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) == 0) {
- /*
- * Do not put uninitialized extent
- * in the cache
- */
- if (!ext4_ext_is_uninitialized(ex)) {
- ext4_ext_put_in_cache(inode, ee_block,
- ee_len, ee_start);
- goto out;
- }
- ret = ext4_ext_handle_uninitialized_extents(
- handle, inode, map, path, flags,
- allocated, newblock);
- return ret;
- }
-
- /*
- * Punch out the map length, but only to the
- * end of the extent
- */
- punched_out = allocated < map->m_len ?
- allocated : map->m_len;
-
/*
- * Sense extents need to be converted to
- * uninitialized, they must fit in an
- * uninitialized extent
+ * Do not put uninitialized extent
+ * in the cache
*/
- if (punched_out > EXT_UNINIT_MAX_LEN)
- punched_out = EXT_UNINIT_MAX_LEN;
-
- punch_map.m_lblk = map->m_lblk;
- punch_map.m_pblk = newblock;
- punch_map.m_len = punched_out;
- punch_map.m_flags = 0;
-
- /* Check to see if the extent needs to be split */
- if (punch_map.m_len != ee_len ||
- punch_map.m_lblk != ee_block) {
-
- ret = ext4_split_extent(handle, inode,
- path, &punch_map, 0,
- EXT4_GET_BLOCKS_PUNCH_OUT_EXT |
- EXT4_GET_BLOCKS_PRE_IO);
-
- if (ret < 0) {
- err = ret;
- goto out2;
- }
- /*
- * find extent for the block at
- * the start of the hole
- */
- ext4_ext_drop_refs(path);
- kfree(path);
-
- path = ext4_ext_find_extent(inode,
- map->m_lblk, NULL);
- if (IS_ERR(path)) {
- err = PTR_ERR(path);
- path = NULL;
- goto out2;
- }
-
- depth = ext_depth(inode);
- ex = path[depth].p_ext;
- ee_len = ext4_ext_get_actual_len(ex);
- ee_block = le32_to_cpu(ex->ee_block);
- ee_start = ext4_ext_pblock(ex);
-
- }
-
- ext4_ext_mark_uninitialized(ex);
-
- ext4_ext_invalidate_cache(inode);
-
- err = ext4_ext_rm_leaf(handle, inode, path,
- &partial_cluster, map->m_lblk,
- map->m_lblk + punched_out);
-
- if (!err && path->p_hdr->eh_entries == 0) {
- /*
- * Punch hole freed all of this sub tree,
- * so we need to correct eh_depth
- */
- err = ext4_ext_get_access(handle, inode, path);
- if (err == 0) {
- ext_inode_hdr(inode)->eh_depth = 0;
- ext_inode_hdr(inode)->eh_max =
- cpu_to_le16(ext4_ext_space_root(
- inode, 0));
-
- err = ext4_ext_dirty(
- handle, inode, path);
- }
+ if (!ext4_ext_is_uninitialized(ex)) {
+ ext4_ext_put_in_cache(inode, ee_block,
+ ee_len, ee_start);
+ goto out;
}
-
- goto out2;
+ ret = ext4_ext_handle_uninitialized_extents(
+ handle, inode, map, path, flags,
+ allocated, newblock);
+ return ret;
}
}
@@ -4165,13 +4146,11 @@ out2:
ext4_ext_drop_refs(path);
kfree(path);
}
- result = (flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) ?
- punched_out : allocated;
trace_ext4_ext_map_blocks_exit(inode, map->m_lblk,
- newblock, map->m_len, err ? err : result);
+ newblock, map->m_len, err ? err : allocated);
- return err ? err : result;
+ return err ? err : allocated;
}
void ext4_ext_truncate(struct inode *inode)
@@ -4228,7 +4207,7 @@ void ext4_ext_truncate(struct inode *inode)
last_block = (inode->i_size + sb->s_blocksize - 1)
>> EXT4_BLOCK_SIZE_BITS(sb);
- err = ext4_ext_remove_space(inode, last_block);
+ err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
/* In a multi-transaction truncate, we only make the final
* transaction synchronous.
@@ -4436,10 +4415,11 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
EXT4_GET_BLOCKS_IO_CONVERT_EXT);
if (ret <= 0) {
WARN_ON(ret <= 0);
- printk(KERN_ERR "%s: ext4_ext_map_blocks "
- "returned error inode#%lu, block=%u, "
- "max_blocks=%u", __func__,
- inode->i_ino, map.m_lblk, map.m_len);
+ ext4_msg(inode->i_sb, KERN_ERR,
+ "%s:%d: inode #%lu: block %u: len %u: "
+ "ext4_ext_map_blocks returned %d",
+ __func__, __LINE__, inode->i_ino, map.m_lblk,
+ map.m_len, ret);
}
ext4_mark_inode_dirty(handle, inode);
ret2 = ext4_journal_stop(handle);
@@ -4705,14 +4685,12 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
{
struct inode *inode = file->f_path.dentry->d_inode;
struct super_block *sb = inode->i_sb;
- struct ext4_ext_cache cache_ex;
- ext4_lblk_t first_block, last_block, num_blocks, iblock, max_blocks;
+ ext4_lblk_t first_block, stop_block;
struct address_space *mapping = inode->i_mapping;
- struct ext4_map_blocks map;
handle_t *handle;
loff_t first_page, last_page, page_len;
loff_t first_page_offset, last_page_offset;
- int ret, credits, blocks_released, err = 0;
+ int credits, err = 0;
/* No need to punch hole beyond i_size */
if (offset >= inode->i_size)
@@ -4728,10 +4706,6 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
offset;
}
- first_block = (offset + sb->s_blocksize - 1) >>
- EXT4_BLOCK_SIZE_BITS(sb);
- last_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
-
first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
last_page = (offset + length) >> PAGE_CACHE_SHIFT;
@@ -4810,7 +4784,6 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
}
}
-
/*
* If i_size is contained in the last page, we need to
* unmap and zero the partial page after i_size
@@ -4830,73 +4803,22 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
}
}
+ first_block = (offset + sb->s_blocksize - 1) >>
+ EXT4_BLOCK_SIZE_BITS(sb);
+ stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
+
/* If there are no blocks to remove, return now */
- if (first_block >= last_block)
+ if (first_block >= stop_block)
goto out;
down_write(&EXT4_I(inode)->i_data_sem);
ext4_ext_invalidate_cache(inode);
ext4_discard_preallocations(inode);
- /*
- * Loop over all the blocks and identify blocks
- * that need to be punched out
- */
- iblock = first_block;
- blocks_released = 0;
- while (iblock < last_block) {
- max_blocks = last_block - iblock;
- num_blocks = 1;
- memset(&map, 0, sizeof(map));
- map.m_lblk = iblock;
- map.m_len = max_blocks;
- ret = ext4_ext_map_blocks(handle, inode, &map,
- EXT4_GET_BLOCKS_PUNCH_OUT_EXT);
-
- if (ret > 0) {
- blocks_released += ret;
- num_blocks = ret;
- } else if (ret == 0) {
- /*
- * If map blocks could not find the block,
- * then it is in a hole. If the hole was
- * not already cached, then map blocks should
- * put it in the cache. So we can get the hole
- * out of the cache
- */
- memset(&cache_ex, 0, sizeof(cache_ex));
- if ((ext4_ext_check_cache(inode, iblock, &cache_ex)) &&
- !cache_ex.ec_start) {
-
- /* The hole is cached */
- num_blocks = cache_ex.ec_block +
- cache_ex.ec_len - iblock;
-
- } else {
- /* The block could not be identified */
- err = -EIO;
- break;
- }
- } else {
- /* Map blocks error */
- err = ret;
- break;
- }
-
- if (num_blocks == 0) {
- /* This condition should never happen */
- ext_debug("Block lookup failed");
- err = -EIO;
- break;
- }
-
- iblock += num_blocks;
- }
+ err = ext4_ext_remove_space(inode, first_block, stop_block - 1);
- if (blocks_released > 0) {
- ext4_ext_invalidate_cache(inode);
- ext4_discard_preallocations(inode);
- }
+ ext4_ext_invalidate_cache(inode);
+ ext4_discard_preallocations(inode);
if (IS_SYNC(inode))
ext4_handle_sync(handle);
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 00a2cb753ef..bb6c7d81131 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -89,6 +89,7 @@ int ext4_flush_completed_IO(struct inode *inode)
io = list_entry(ei->i_completed_io_list.next,
ext4_io_end_t, list);
list_del_init(&io->list);
+ io->flag |= EXT4_IO_END_IN_FSYNC;
/*
* Calling ext4_end_io_nolock() to convert completed
* IO to written.
@@ -108,6 +109,7 @@ int ext4_flush_completed_IO(struct inode *inode)
if (ret < 0)
ret2 = ret;
spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+ io->flag &= ~EXT4_IO_END_IN_FSYNC;
}
spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
return (ret2 < 0) ? ret2 : 0;
diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
index ac8f168c8ab..fa8e4911d35 100644
--- a/fs/ext4/hash.c
+++ b/fs/ext4/hash.c
@@ -200,8 +200,8 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
return -1;
}
hash = hash & ~1;
- if (hash == (EXT4_HTREE_EOF << 1))
- hash = (EXT4_HTREE_EOF-1) << 1;
+ if (hash == (EXT4_HTREE_EOF_32BIT << 1))
+ hash = (EXT4_HTREE_EOF_32BIT - 1) << 1;
hinfo->hash = hash;
hinfo->minor_hash = minor_hash;
return 0;
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 25d8c9781ad..409c2ee7750 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -92,6 +92,16 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb,
return EXT4_INODES_PER_GROUP(sb);
}
+void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate)
+{
+ if (uptodate) {
+ set_buffer_uptodate(bh);
+ set_bitmap_uptodate(bh);
+ }
+ unlock_buffer(bh);
+ put_bh(bh);
+}
+
/*
* Read the inode allocation bitmap for a given block_group, reading
* into the specified slot in the superblock's bitmap cache.
@@ -147,18 +157,18 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
return bh;
}
/*
- * submit the buffer_head for read. We can
- * safely mark the bitmap as uptodate now.
- * We do it here so the bitmap uptodate bit
- * get set with buffer lock held.
+ * submit the buffer_head for reading
*/
trace_ext4_load_inode_bitmap(sb, block_group);
- set_bitmap_uptodate(bh);
- if (bh_submit_read(bh) < 0) {
+ bh->b_end_io = ext4_end_bitmap_read;
+ get_bh(bh);
+ submit_bh(READ, bh);
+ wait_on_buffer(bh);
+ if (!buffer_uptodate(bh)) {
put_bh(bh);
ext4_error(sb, "Cannot read inode bitmap - "
- "block_group = %u, inode_bitmap = %llu",
- block_group, bitmap_blk);
+ "block_group = %u, inode_bitmap = %llu",
+ block_group, bitmap_blk);
return NULL;
}
return bh;
@@ -194,19 +204,20 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
struct ext4_sb_info *sbi;
int fatal = 0, err, count, cleared;
- if (atomic_read(&inode->i_count) > 1) {
- printk(KERN_ERR "ext4_free_inode: inode has count=%d\n",
- atomic_read(&inode->i_count));
+ if (!sb) {
+ printk(KERN_ERR "EXT4-fs: %s:%d: inode on "
+ "nonexistent device\n", __func__, __LINE__);
return;
}
- if (inode->i_nlink) {
- printk(KERN_ERR "ext4_free_inode: inode has nlink=%d\n",
- inode->i_nlink);
+ if (atomic_read(&inode->i_count) > 1) {
+ ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: count=%d",
+ __func__, __LINE__, inode->i_ino,
+ atomic_read(&inode->i_count));
return;
}
- if (!sb) {
- printk(KERN_ERR "ext4_free_inode: inode on "
- "nonexistent device\n");
+ if (inode->i_nlink) {
+ ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: nlink=%d\n",
+ __func__, __LINE__, inode->i_ino, inode->i_nlink);
return;
}
sbi = EXT4_SB(sb);
@@ -593,94 +604,6 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
}
/*
- * claim the inode from the inode bitmap. If the group
- * is uninit we need to take the groups's ext4_group_lock
- * and clear the uninit flag. The inode bitmap update
- * and group desc uninit flag clear should be done
- * after holding ext4_group_lock so that ext4_read_inode_bitmap
- * doesn't race with the ext4_claim_inode
- */
-static int ext4_claim_inode(struct super_block *sb,
- struct buffer_head *inode_bitmap_bh,
- unsigned long ino, ext4_group_t group, umode_t mode)
-{
- int free = 0, retval = 0, count;
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct ext4_group_info *grp = ext4_get_group_info(sb, group);
- struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
-
- /*
- * We have to be sure that new inode allocation does not race with
- * inode table initialization, because otherwise we may end up
- * allocating and writing new inode right before sb_issue_zeroout
- * takes place and overwriting our new inode with zeroes. So we
- * take alloc_sem to prevent it.
- */
- down_read(&grp->alloc_sem);
- ext4_lock_group(sb, group);
- if (ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data)) {
- /* not a free inode */
- retval = 1;
- goto err_ret;
- }
- ino++;
- if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
- ino > EXT4_INODES_PER_GROUP(sb)) {
- ext4_unlock_group(sb, group);
- up_read(&grp->alloc_sem);
- ext4_error(sb, "reserved inode or inode > inodes count - "
- "block_group = %u, inode=%lu", group,
- ino + group * EXT4_INODES_PER_GROUP(sb));
- return 1;
- }
- /* If we didn't allocate from within the initialized part of the inode
- * table then we need to initialize up to this inode. */
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
-
- if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
- gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
- /* When marking the block group with
- * ~EXT4_BG_INODE_UNINIT we don't want to depend
- * on the value of bg_itable_unused even though
- * mke2fs could have initialized the same for us.
- * Instead we calculated the value below
- */
-
- free = 0;
- } else {
- free = EXT4_INODES_PER_GROUP(sb) -
- ext4_itable_unused_count(sb, gdp);
- }
-
- /*
- * Check the relative inode number against the last used
- * relative inode number in this group. if it is greater
- * we need to update the bg_itable_unused count
- *
- */
- if (ino > free)
- ext4_itable_unused_set(sb, gdp,
- (EXT4_INODES_PER_GROUP(sb) - ino));
- }
- count = ext4_free_inodes_count(sb, gdp) - 1;
- ext4_free_inodes_set(sb, gdp, count);
- if (S_ISDIR(mode)) {
- count = ext4_used_dirs_count(sb, gdp) + 1;
- ext4_used_dirs_set(sb, gdp, count);
- if (sbi->s_log_groups_per_flex) {
- ext4_group_t f = ext4_flex_group(sbi, group);
-
- atomic_inc(&sbi->s_flex_groups[f].used_dirs);
- }
- }
- gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
-err_ret:
- ext4_unlock_group(sb, group);
- up_read(&grp->alloc_sem);
- return retval;
-}
-
-/*
* There are two policies for allocating an inode. If the new inode is
* a directory, then a forward search is made for a block group with both
* free space and a low directory-to-inode ratio; if that fails, then of
@@ -741,6 +664,11 @@ got_group:
if (ret2 == -1)
goto out;
+ /*
+ * Normally we will only go through one pass of this loop,
+ * unless we get unlucky and it turns out the group we selected
+ * had its last inode grabbed by someone else.
+ */
for (i = 0; i < ngroups; i++, ino = 0) {
err = -EIO;
@@ -757,51 +685,24 @@ repeat_in_this_group:
ino = ext4_find_next_zero_bit((unsigned long *)
inode_bitmap_bh->b_data,
EXT4_INODES_PER_GROUP(sb), ino);
-
- if (ino < EXT4_INODES_PER_GROUP(sb)) {
-
- BUFFER_TRACE(inode_bitmap_bh, "get_write_access");
- err = ext4_journal_get_write_access(handle,
- inode_bitmap_bh);
- if (err)
- goto fail;
-
- BUFFER_TRACE(group_desc_bh, "get_write_access");
- err = ext4_journal_get_write_access(handle,
- group_desc_bh);
- if (err)
- goto fail;
- if (!ext4_claim_inode(sb, inode_bitmap_bh,
- ino, group, mode)) {
- /* we won it */
- BUFFER_TRACE(inode_bitmap_bh,
- "call ext4_handle_dirty_metadata");
- err = ext4_handle_dirty_metadata(handle,
- NULL,
- inode_bitmap_bh);
- if (err)
- goto fail;
- /* zero bit is inode number 1*/
- ino++;
- goto got;
- }
- /* we lost it */
- ext4_handle_release_buffer(handle, inode_bitmap_bh);
- ext4_handle_release_buffer(handle, group_desc_bh);
-
- if (++ino < EXT4_INODES_PER_GROUP(sb))
- goto repeat_in_this_group;
+ if (ino >= EXT4_INODES_PER_GROUP(sb)) {
+ if (++group == ngroups)
+ group = 0;
+ continue;
}
-
- /*
- * This case is possible in concurrent environment. It is very
- * rare. We cannot repeat the find_group_xxx() call because
- * that will simply return the same blockgroup, because the
- * group descriptor metadata has not yet been updated.
- * So we just go onto the next blockgroup.
- */
- if (++group == ngroups)
- group = 0;
+ if (group == 0 && (ino+1) < EXT4_FIRST_INO(sb)) {
+ ext4_error(sb, "reserved inode found cleared - "
+ "inode=%lu", ino + 1);
+ continue;
+ }
+ ext4_lock_group(sb, group);
+ ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data);
+ ext4_unlock_group(sb, group);
+ ino++; /* the inode bitmap is zero-based */
+ if (!ret2)
+ goto got; /* we grabbed the inode! */
+ if (ino < EXT4_INODES_PER_GROUP(sb))
+ goto repeat_in_this_group;
}
err = -ENOSPC;
goto out;
@@ -838,6 +739,59 @@ got:
if (err)
goto fail;
}
+
+ BUFFER_TRACE(inode_bitmap_bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, inode_bitmap_bh);
+ if (err)
+ goto fail;
+
+ BUFFER_TRACE(group_desc_bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, group_desc_bh);
+ if (err)
+ goto fail;
+
+ /* Update the relevant bg descriptor fields */
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
+ int free;
+ struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+
+ down_read(&grp->alloc_sem); /* protect vs itable lazyinit */
+ ext4_lock_group(sb, group); /* while we modify the bg desc */
+ free = EXT4_INODES_PER_GROUP(sb) -
+ ext4_itable_unused_count(sb, gdp);
+ if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
+ gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
+ free = 0;
+ }
+ /*
+ * Check the relative inode number against the last used
+ * relative inode number in this group. if it is greater
+ * we need to update the bg_itable_unused count
+ */
+ if (ino > free)
+ ext4_itable_unused_set(sb, gdp,
+ (EXT4_INODES_PER_GROUP(sb) - ino));
+ up_read(&grp->alloc_sem);
+ }
+ ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1);
+ if (S_ISDIR(mode)) {
+ ext4_used_dirs_set(sb, gdp, ext4_used_dirs_count(sb, gdp) + 1);
+ if (sbi->s_log_groups_per_flex) {
+ ext4_group_t f = ext4_flex_group(sbi, group);
+
+ atomic_inc(&sbi->s_flex_groups[f].used_dirs);
+ }
+ }
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
+ gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
+ ext4_unlock_group(sb, group);
+ }
+
+ BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata");
+ err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh);
+ if (err)
+ goto fail;
+
BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata");
err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh);
if (err)
@@ -1101,7 +1055,7 @@ unsigned long ext4_count_dirs(struct super_block * sb)
* where it is called from on active part of filesystem is ext4lazyinit
* thread, so we do not need any special locks, however we have to prevent
* inode allocation from the current group, so we take alloc_sem lock, to
- * block ext4_claim_inode until we are finished.
+ * block ext4_new_inode() until we are finished.
*/
int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
int barrier)
@@ -1149,9 +1103,9 @@ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
sbi->s_inodes_per_block);
if ((used_blks < 0) || (used_blks > sbi->s_itb_per_group)) {
- ext4_error(sb, "Something is wrong with group %u\n"
- "Used itable blocks: %d"
- "itable unused count: %u\n",
+ ext4_error(sb, "Something is wrong with group %u: "
+ "used itable blocks: %d; "
+ "itable unused count: %u",
group, used_blks,
ext4_itable_unused_count(sb, gdp));
ret = 1;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index feaa82fe629..c77b0bd2c71 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -272,7 +272,7 @@ void ext4_da_update_reserve_space(struct inode *inode,
trace_ext4_da_update_reserve_space(inode, used, quota_claim);
if (unlikely(used > ei->i_reserved_data_blocks)) {
ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d "
- "with only %d reserved data blocks\n",
+ "with only %d reserved data blocks",
__func__, inode->i_ino, used,
ei->i_reserved_data_blocks);
WARN_ON(1);
@@ -1165,7 +1165,7 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
*/
ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: "
"ino %lu, to_free %d with only %d reserved "
- "data blocks\n", inode->i_ino, to_free,
+ "data blocks", inode->i_ino, to_free,
ei->i_reserved_data_blocks);
WARN_ON(1);
to_free = ei->i_reserved_data_blocks;
@@ -1428,20 +1428,22 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd)
static void ext4_print_free_blocks(struct inode *inode)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- printk(KERN_CRIT "Total free blocks count %lld\n",
+ struct super_block *sb = inode->i_sb;
+
+ ext4_msg(sb, KERN_CRIT, "Total free blocks count %lld",
EXT4_C2B(EXT4_SB(inode->i_sb),
ext4_count_free_clusters(inode->i_sb)));
- printk(KERN_CRIT "Free/Dirty block details\n");
- printk(KERN_CRIT "free_blocks=%lld\n",
+ ext4_msg(sb, KERN_CRIT, "Free/Dirty block details");
+ ext4_msg(sb, KERN_CRIT, "free_blocks=%lld",
(long long) EXT4_C2B(EXT4_SB(inode->i_sb),
percpu_counter_sum(&sbi->s_freeclusters_counter)));
- printk(KERN_CRIT "dirty_blocks=%lld\n",
+ ext4_msg(sb, KERN_CRIT, "dirty_blocks=%lld",
(long long) EXT4_C2B(EXT4_SB(inode->i_sb),
percpu_counter_sum(&sbi->s_dirtyclusters_counter)));
- printk(KERN_CRIT "Block reservation details\n");
- printk(KERN_CRIT "i_reserved_data_blocks=%u\n",
- EXT4_I(inode)->i_reserved_data_blocks);
- printk(KERN_CRIT "i_reserved_meta_blocks=%u\n",
+ ext4_msg(sb, KERN_CRIT, "Block reservation details");
+ ext4_msg(sb, KERN_CRIT, "i_reserved_data_blocks=%u",
+ EXT4_I(inode)->i_reserved_data_blocks);
+ ext4_msg(sb, KERN_CRIT, "i_reserved_meta_blocks=%u",
EXT4_I(inode)->i_reserved_meta_blocks);
return;
}
@@ -2482,13 +2484,14 @@ static int ext4_da_write_end(struct file *file,
int write_mode = (int)(unsigned long)fsdata;
if (write_mode == FALL_BACK_TO_NONDELALLOC) {
- if (ext4_should_order_data(inode)) {
+ switch (ext4_inode_journal_mode(inode)) {
+ case EXT4_INODE_ORDERED_DATA_MODE:
return ext4_ordered_write_end(file, mapping, pos,
len, copied, page, fsdata);
- } else if (ext4_should_writeback_data(inode)) {
+ case EXT4_INODE_WRITEBACK_DATA_MODE:
return ext4_writeback_write_end(file, mapping, pos,
len, copied, page, fsdata);
- } else {
+ default:
BUG();
}
}
@@ -2763,7 +2766,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
goto out;
ext_debug("ext4_end_io_dio(): io_end 0x%p "
- "for inode %lu, iocb 0x%p, offset %llu, size %llu\n",
+ "for inode %lu, iocb 0x%p, offset %llu, size %zd\n",
iocb->private, io_end->inode->i_ino, iocb, offset,
size);
@@ -2795,9 +2798,6 @@ out:
/* queue the work to convert unwritten extents to written */
queue_work(wq, &io_end->work);
-
- /* XXX: probably should move into the real I/O completion handler */
- inode_dio_done(inode);
}
static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
@@ -2811,8 +2811,9 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
goto out;
if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) {
- printk("sb umounted, discard end_io request for inode %lu\n",
- io_end->inode->i_ino);
+ ext4_msg(io_end->inode->i_sb, KERN_INFO,
+ "sb umounted, discard end_io request for inode %lu",
+ io_end->inode->i_ino);
ext4_free_io_end(io_end);
goto out;
}
@@ -2921,9 +2922,12 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
iocb->private = NULL;
EXT4_I(inode)->cur_aio_dio = NULL;
if (!is_sync_kiocb(iocb)) {
- iocb->private = ext4_init_io_end(inode, GFP_NOFS);
- if (!iocb->private)
+ ext4_io_end_t *io_end =
+ ext4_init_io_end(inode, GFP_NOFS);
+ if (!io_end)
return -ENOMEM;
+ io_end->flag |= EXT4_IO_END_DIRECT;
+ iocb->private = io_end;
/*
* we save the io structure for current async
* direct IO, so that later ext4_map_blocks()
@@ -2940,7 +2944,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
ext4_get_block_write,
ext4_end_io_dio,
NULL,
- DIO_LOCKING | DIO_SKIP_HOLES);
+ DIO_LOCKING);
if (iocb->private)
EXT4_I(inode)->cur_aio_dio = NULL;
/*
@@ -3086,18 +3090,25 @@ static const struct address_space_operations ext4_da_aops = {
void ext4_set_aops(struct inode *inode)
{
- if (ext4_should_order_data(inode) &&
- test_opt(inode->i_sb, DELALLOC))
- inode->i_mapping->a_ops = &ext4_da_aops;
- else if (ext4_should_order_data(inode))
- inode->i_mapping->a_ops = &ext4_ordered_aops;
- else if (ext4_should_writeback_data(inode) &&
- test_opt(inode->i_sb, DELALLOC))
- inode->i_mapping->a_ops = &ext4_da_aops;
- else if (ext4_should_writeback_data(inode))
- inode->i_mapping->a_ops = &ext4_writeback_aops;
- else
+ switch (ext4_inode_journal_mode(inode)) {
+ case EXT4_INODE_ORDERED_DATA_MODE:
+ if (test_opt(inode->i_sb, DELALLOC))
+ inode->i_mapping->a_ops = &ext4_da_aops;
+ else
+ inode->i_mapping->a_ops = &ext4_ordered_aops;
+ break;
+ case EXT4_INODE_WRITEBACK_DATA_MODE:
+ if (test_opt(inode->i_sb, DELALLOC))
+ inode->i_mapping->a_ops = &ext4_da_aops;
+ else
+ inode->i_mapping->a_ops = &ext4_writeback_aops;
+ break;
+ case EXT4_INODE_JOURNAL_DATA_MODE:
inode->i_mapping->a_ops = &ext4_journalled_aops;
+ break;
+ default:
+ BUG();
+ }
}
@@ -3329,16 +3340,16 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
{
struct inode *inode = file->f_path.dentry->d_inode;
if (!S_ISREG(inode->i_mode))
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
/* TODO: Add support for non extent hole punching */
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
}
if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) {
/* TODO: Add support for bigalloc file systems */
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
}
return ext4_ext_punch_hole(file, offset, length);
@@ -3924,10 +3935,8 @@ static int ext4_do_update_inode(handle_t *handle,
ext4_update_dynamic_rev(sb);
EXT4_SET_RO_COMPAT_FEATURE(sb,
EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
- sb->s_dirt = 1;
ext4_handle_sync(handle);
- err = ext4_handle_dirty_metadata(handle, NULL,
- EXT4_SB(sb)->s_sbh);
+ err = ext4_handle_dirty_super(handle, sb);
}
}
raw_inode->i_generation = cpu_to_le32(inode->i_generation);
@@ -4152,11 +4161,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
}
if (attr->ia_valid & ATTR_SIZE) {
- if (attr->ia_size != i_size_read(inode)) {
+ if (attr->ia_size != i_size_read(inode))
truncate_setsize(inode, attr->ia_size);
- ext4_truncate(inode);
- } else if (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
- ext4_truncate(inode);
+ ext4_truncate(inode);
}
if (!rc) {
@@ -4314,7 +4321,7 @@ int ext4_mark_iloc_dirty(handle_t *handle,
{
int err = 0;
- if (test_opt(inode->i_sb, I_VERSION))
+ if (IS_I_VERSION(inode))
inode_inc_iversion(inode);
/* the do_update_inode consumes one bh->b_count */
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index cb990b21c69..99ab428bcfa 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -21,6 +21,7 @@
* mballoc.c contains the multiblocks allocation routines
*/
+#include "ext4_jbd2.h"
#include "mballoc.h"
#include <linux/debugfs.h>
#include <linux/slab.h>
@@ -339,7 +340,7 @@
*/
static struct kmem_cache *ext4_pspace_cachep;
static struct kmem_cache *ext4_ac_cachep;
-static struct kmem_cache *ext4_free_ext_cachep;
+static struct kmem_cache *ext4_free_data_cachep;
/* We create slab caches for groupinfo data structures based on the
* superblock block size. There will be one per mounted filesystem for
@@ -357,7 +358,8 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
ext4_group_t group);
static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
ext4_group_t group);
-static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);
+static void ext4_free_data_callback(struct super_block *sb,
+ struct ext4_journal_cb_entry *jce, int rc);
static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
{
@@ -425,7 +427,7 @@ static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
{
char *bb;
- BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b));
+ BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
BUG_ON(max == NULL);
if (order > e4b->bd_blkbits + 1) {
@@ -436,10 +438,10 @@ static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
/* at order 0 we see each particular block */
if (order == 0) {
*max = 1 << (e4b->bd_blkbits + 3);
- return EXT4_MB_BITMAP(e4b);
+ return e4b->bd_bitmap;
}
- bb = EXT4_MB_BUDDY(e4b) + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order];
+ bb = e4b->bd_buddy + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order];
*max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order];
return bb;
@@ -588,7 +590,7 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
for (j = 0; j < (1 << order); j++) {
k = (i * (1 << order)) + j;
MB_CHECK_ASSERT(
- !mb_test_bit(k, EXT4_MB_BITMAP(e4b)));
+ !mb_test_bit(k, e4b->bd_bitmap));
}
count++;
}
@@ -782,7 +784,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
int groups_per_page;
int err = 0;
int i;
- ext4_group_t first_group;
+ ext4_group_t first_group, group;
int first_block;
struct super_block *sb;
struct buffer_head *bhs;
@@ -806,24 +808,23 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
/* allocate buffer_heads to read bitmaps */
if (groups_per_page > 1) {
- err = -ENOMEM;
i = sizeof(struct buffer_head *) * groups_per_page;
bh = kzalloc(i, GFP_NOFS);
- if (bh == NULL)
+ if (bh == NULL) {
+ err = -ENOMEM;
goto out;
+ }
} else
bh = &bhs;
first_group = page->index * blocks_per_page / 2;
/* read all groups the page covers into the cache */
- for (i = 0; i < groups_per_page; i++) {
- struct ext4_group_desc *desc;
-
- if (first_group + i >= ngroups)
+ for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
+ if (group >= ngroups)
break;
- grinfo = ext4_get_group_info(sb, first_group + i);
+ grinfo = ext4_get_group_info(sb, group);
/*
* If page is uptodate then we came here after online resize
* which added some new uninitialized group info structs, so
@@ -834,69 +835,21 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
bh[i] = NULL;
continue;
}
-
- err = -EIO;
- desc = ext4_get_group_desc(sb, first_group + i, NULL);
- if (desc == NULL)
- goto out;
-
- err = -ENOMEM;
- bh[i] = sb_getblk(sb, ext4_block_bitmap(sb, desc));
- if (bh[i] == NULL)
+ if (!(bh[i] = ext4_read_block_bitmap_nowait(sb, group))) {
+ err = -ENOMEM;
goto out;
-
- if (bitmap_uptodate(bh[i]))
- continue;
-
- lock_buffer(bh[i]);
- if (bitmap_uptodate(bh[i])) {
- unlock_buffer(bh[i]);
- continue;
- }
- ext4_lock_group(sb, first_group + i);
- if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
- ext4_init_block_bitmap(sb, bh[i],
- first_group + i, desc);
- set_bitmap_uptodate(bh[i]);
- set_buffer_uptodate(bh[i]);
- ext4_unlock_group(sb, first_group + i);
- unlock_buffer(bh[i]);
- continue;
}
- ext4_unlock_group(sb, first_group + i);
- if (buffer_uptodate(bh[i])) {
- /*
- * if not uninit if bh is uptodate,
- * bitmap is also uptodate
- */
- set_bitmap_uptodate(bh[i]);
- unlock_buffer(bh[i]);
- continue;
- }
- get_bh(bh[i]);
- /*
- * submit the buffer_head for read. We can
- * safely mark the bitmap as uptodate now.
- * We do it here so the bitmap uptodate bit
- * get set with buffer lock held.
- */
- set_bitmap_uptodate(bh[i]);
- bh[i]->b_end_io = end_buffer_read_sync;
- submit_bh(READ, bh[i]);
- mb_debug(1, "read bitmap for group %u\n", first_group + i);
+ mb_debug(1, "read bitmap for group %u\n", group);
}
/* wait for I/O completion */
- for (i = 0; i < groups_per_page; i++)
- if (bh[i])
- wait_on_buffer(bh[i]);
-
- err = -EIO;
- for (i = 0; i < groups_per_page; i++)
- if (bh[i] && !buffer_uptodate(bh[i]))
+ for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
+ if (bh[i] && ext4_wait_block_bitmap(sb, group, bh[i])) {
+ err = -EIO;
goto out;
+ }
+ }
- err = 0;
first_block = page->index * blocks_per_page;
for (i = 0; i < blocks_per_page; i++) {
int group;
@@ -1250,10 +1203,10 @@ static int mb_find_order_for_block(struct ext4_buddy *e4b, int block)
int order = 1;
void *bb;
- BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b));
+ BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
BUG_ON(block >= (1 << (e4b->bd_blkbits + 3)));
- bb = EXT4_MB_BUDDY(e4b);
+ bb = e4b->bd_buddy;
while (order <= e4b->bd_blkbits + 1) {
block = block >> 1;
if (!mb_test_bit(block, bb)) {
@@ -1323,9 +1276,9 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
/* let's maintain fragments counter */
if (first != 0)
- block = !mb_test_bit(first - 1, EXT4_MB_BITMAP(e4b));
+ block = !mb_test_bit(first - 1, e4b->bd_bitmap);
if (first + count < EXT4_SB(sb)->s_mb_maxs[0])
- max = !mb_test_bit(first + count, EXT4_MB_BITMAP(e4b));
+ max = !mb_test_bit(first + count, e4b->bd_bitmap);
if (block && max)
e4b->bd_info->bb_fragments--;
else if (!block && !max)
@@ -1336,7 +1289,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
block = first++;
order = 0;
- if (!mb_test_bit(block, EXT4_MB_BITMAP(e4b))) {
+ if (!mb_test_bit(block, e4b->bd_bitmap)) {
ext4_fsblk_t blocknr;
blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
@@ -1347,7 +1300,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
"freeing already freed block "
"(bit %u)", block);
}
- mb_clear_bit(block, EXT4_MB_BITMAP(e4b));
+ mb_clear_bit(block, e4b->bd_bitmap);
e4b->bd_info->bb_counters[order]++;
/* start of the buddy */
@@ -1429,7 +1382,7 @@ static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
break;
next = (block + 1) * (1 << order);
- if (mb_test_bit(next, EXT4_MB_BITMAP(e4b)))
+ if (mb_test_bit(next, e4b->bd_bitmap))
break;
order = mb_find_order_for_block(e4b, next);
@@ -1466,9 +1419,9 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
/* let's maintain fragments counter */
if (start != 0)
- mlen = !mb_test_bit(start - 1, EXT4_MB_BITMAP(e4b));
+ mlen = !mb_test_bit(start - 1, e4b->bd_bitmap);
if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0])
- max = !mb_test_bit(start + len, EXT4_MB_BITMAP(e4b));
+ max = !mb_test_bit(start + len, e4b->bd_bitmap);
if (mlen && max)
e4b->bd_info->bb_fragments++;
else if (!mlen && !max)
@@ -1511,7 +1464,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
}
mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);
- ext4_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0);
+ ext4_set_bits(e4b->bd_bitmap, ex->fe_start, len0);
mb_check_buddy(e4b);
return ret;
@@ -1810,7 +1763,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
struct ext4_buddy *e4b)
{
struct super_block *sb = ac->ac_sb;
- void *bitmap = EXT4_MB_BITMAP(e4b);
+ void *bitmap = e4b->bd_bitmap;
struct ext4_free_extent ex;
int i;
int free;
@@ -1870,7 +1823,7 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
{
struct super_block *sb = ac->ac_sb;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- void *bitmap = EXT4_MB_BITMAP(e4b);
+ void *bitmap = e4b->bd_bitmap;
struct ext4_free_extent ex;
ext4_fsblk_t first_group_block;
ext4_fsblk_t a;
@@ -2224,7 +2177,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
EXT4_DESC_PER_BLOCK_BITS(sb);
meta_group_info = kmalloc(metalen, GFP_KERNEL);
if (meta_group_info == NULL) {
- ext4_msg(sb, KERN_ERR, "EXT4-fs: can't allocate mem "
+ ext4_msg(sb, KERN_ERR, "can't allocate mem "
"for a buddy group");
goto exit_meta_group_info;
}
@@ -2238,7 +2191,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
meta_group_info[i] = kmem_cache_alloc(cachep, GFP_KERNEL);
if (meta_group_info[i] == NULL) {
- ext4_msg(sb, KERN_ERR, "EXT4-fs: can't allocate buddy mem");
+ ext4_msg(sb, KERN_ERR, "can't allocate buddy mem");
goto exit_group_info;
}
memset(meta_group_info[i], 0, kmem_cache_size(cachep));
@@ -2522,9 +2475,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
&ext4_mb_seq_groups_fops, sb);
- if (sbi->s_journal)
- sbi->s_journal->j_commit_callback = release_blocks_on_commit;
-
return 0;
out_free_locality_groups:
@@ -2637,58 +2587,55 @@ static inline int ext4_issue_discard(struct super_block *sb,
* This function is called by the jbd2 layer once the commit has finished,
* so we know we can free the blocks that were released with that commit.
*/
-static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
+static void ext4_free_data_callback(struct super_block *sb,
+ struct ext4_journal_cb_entry *jce,
+ int rc)
{
- struct super_block *sb = journal->j_private;
+ struct ext4_free_data *entry = (struct ext4_free_data *)jce;
struct ext4_buddy e4b;
struct ext4_group_info *db;
int err, count = 0, count2 = 0;
- struct ext4_free_data *entry;
- struct list_head *l, *ltmp;
- list_for_each_safe(l, ltmp, &txn->t_private_list) {
- entry = list_entry(l, struct ext4_free_data, list);
+ mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
+ entry->efd_count, entry->efd_group, entry);
- mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
- entry->count, entry->group, entry);
+ if (test_opt(sb, DISCARD))
+ ext4_issue_discard(sb, entry->efd_group,
+ entry->efd_start_cluster, entry->efd_count);
- if (test_opt(sb, DISCARD))
- ext4_issue_discard(sb, entry->group,
- entry->start_cluster, entry->count);
+ err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b);
+ /* we expect to find existing buddy because it's pinned */
+ BUG_ON(err != 0);
- err = ext4_mb_load_buddy(sb, entry->group, &e4b);
- /* we expect to find existing buddy because it's pinned */
- BUG_ON(err != 0);
- db = e4b.bd_info;
- /* there are blocks to put in buddy to make them really free */
- count += entry->count;
- count2++;
- ext4_lock_group(sb, entry->group);
- /* Take it out of per group rb tree */
- rb_erase(&entry->node, &(db->bb_free_root));
- mb_free_blocks(NULL, &e4b, entry->start_cluster, entry->count);
+ db = e4b.bd_info;
+ /* there are blocks to put in buddy to make them really free */
+ count += entry->efd_count;
+ count2++;
+ ext4_lock_group(sb, entry->efd_group);
+ /* Take it out of per group rb tree */
+ rb_erase(&entry->efd_node, &(db->bb_free_root));
+ mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count);
- /*
- * Clear the trimmed flag for the group so that the next
- * ext4_trim_fs can trim it.
- * If the volume is mounted with -o discard, online discard
- * is supported and the free blocks will be trimmed online.
- */
- if (!test_opt(sb, DISCARD))
- EXT4_MB_GRP_CLEAR_TRIMMED(db);
+ /*
+ * Clear the trimmed flag for the group so that the next
+ * ext4_trim_fs can trim it.
+ * If the volume is mounted with -o discard, online discard
+ * is supported and the free blocks will be trimmed online.
+ */
+ if (!test_opt(sb, DISCARD))
+ EXT4_MB_GRP_CLEAR_TRIMMED(db);
- if (!db->bb_free_root.rb_node) {
- /* No more items in the per group rb tree
- * balance refcounts from ext4_mb_free_metadata()
- */
- page_cache_release(e4b.bd_buddy_page);
- page_cache_release(e4b.bd_bitmap_page);
- }
- ext4_unlock_group(sb, entry->group);
- kmem_cache_free(ext4_free_ext_cachep, entry);
- ext4_mb_unload_buddy(&e4b);
+ if (!db->bb_free_root.rb_node) {
+ /* No more items in the per group rb tree
+ * balance refcounts from ext4_mb_free_metadata()
+ */
+ page_cache_release(e4b.bd_buddy_page);
+ page_cache_release(e4b.bd_bitmap_page);
}
+ ext4_unlock_group(sb, entry->efd_group);
+ kmem_cache_free(ext4_free_data_cachep, entry);
+ ext4_mb_unload_buddy(&e4b);
mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
}
@@ -2741,9 +2688,9 @@ int __init ext4_init_mballoc(void)
return -ENOMEM;
}
- ext4_free_ext_cachep = KMEM_CACHE(ext4_free_data,
- SLAB_RECLAIM_ACCOUNT);
- if (ext4_free_ext_cachep == NULL) {
+ ext4_free_data_cachep = KMEM_CACHE(ext4_free_data,
+ SLAB_RECLAIM_ACCOUNT);
+ if (ext4_free_data_cachep == NULL) {
kmem_cache_destroy(ext4_pspace_cachep);
kmem_cache_destroy(ext4_ac_cachep);
return -ENOMEM;
@@ -2761,7 +2708,7 @@ void ext4_exit_mballoc(void)
rcu_barrier();
kmem_cache_destroy(ext4_pspace_cachep);
kmem_cache_destroy(ext4_ac_cachep);
- kmem_cache_destroy(ext4_free_ext_cachep);
+ kmem_cache_destroy(ext4_free_data_cachep);
ext4_groupinfo_destroy_slabs();
ext4_remove_debugfs_entry();
}
@@ -2815,7 +2762,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
if (!ext4_data_block_valid(sbi, block, len)) {
ext4_error(sb, "Allocating blocks %llu-%llu which overlap "
- "fs metadata\n", block, block+len);
+ "fs metadata", block, block+len);
/* File system mounted not to panic on error
* Fix the bitmap and repeat the block allocation
* We leak some of the blocks here.
@@ -2911,7 +2858,8 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
int bsbits, max;
ext4_lblk_t end;
- loff_t size, orig_size, start_off;
+ loff_t size, start_off;
+ loff_t orig_size __maybe_unused;
ext4_lblk_t start;
struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
struct ext4_prealloc_space *pa;
@@ -3321,8 +3269,8 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
n = rb_first(&(grp->bb_free_root));
while (n) {
- entry = rb_entry(n, struct ext4_free_data, node);
- ext4_set_bits(bitmap, entry->start_cluster, entry->count);
+ entry = rb_entry(n, struct ext4_free_data, efd_node);
+ ext4_set_bits(bitmap, entry->efd_start_cluster, entry->efd_count);
n = rb_next(n);
}
return;
@@ -3916,11 +3864,11 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED))
return;
- ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: Can't allocate:"
+ ext4_msg(ac->ac_sb, KERN_ERR, "Can't allocate:"
" Allocation context details:");
- ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: status %d flags %d",
+ ext4_msg(ac->ac_sb, KERN_ERR, "status %d flags %d",
ac->ac_status, ac->ac_flags);
- ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: orig %lu/%lu/%lu@%lu, "
+ ext4_msg(ac->ac_sb, KERN_ERR, "orig %lu/%lu/%lu@%lu, "
"goal %lu/%lu/%lu@%lu, "
"best %lu/%lu/%lu@%lu cr %d",
(unsigned long)ac->ac_o_ex.fe_group,
@@ -3936,9 +3884,9 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
(unsigned long)ac->ac_b_ex.fe_len,
(unsigned long)ac->ac_b_ex.fe_logical,
(int)ac->ac_criteria);
- ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: %lu scanned, %d found",
+ ext4_msg(ac->ac_sb, KERN_ERR, "%lu scanned, %d found",
ac->ac_ex_scanned, ac->ac_found);
- ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: groups: ");
+ ext4_msg(ac->ac_sb, KERN_ERR, "groups: ");
ngroups = ext4_get_groups_count(sb);
for (i = 0; i < ngroups; i++) {
struct ext4_group_info *grp = ext4_get_group_info(sb, i);
@@ -4428,9 +4376,9 @@ out:
static int can_merge(struct ext4_free_data *entry1,
struct ext4_free_data *entry2)
{
- if ((entry1->t_tid == entry2->t_tid) &&
- (entry1->group == entry2->group) &&
- ((entry1->start_cluster + entry1->count) == entry2->start_cluster))
+ if ((entry1->efd_tid == entry2->efd_tid) &&
+ (entry1->efd_group == entry2->efd_group) &&
+ ((entry1->efd_start_cluster + entry1->efd_count) == entry2->efd_start_cluster))
return 1;
return 0;
}
@@ -4452,8 +4400,8 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
BUG_ON(e4b->bd_bitmap_page == NULL);
BUG_ON(e4b->bd_buddy_page == NULL);
- new_node = &new_entry->node;
- cluster = new_entry->start_cluster;
+ new_node = &new_entry->efd_node;
+ cluster = new_entry->efd_start_cluster;
if (!*n) {
/* first free block exent. We need to
@@ -4466,10 +4414,10 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
}
while (*n) {
parent = *n;
- entry = rb_entry(parent, struct ext4_free_data, node);
- if (cluster < entry->start_cluster)
+ entry = rb_entry(parent, struct ext4_free_data, efd_node);
+ if (cluster < entry->efd_start_cluster)
n = &(*n)->rb_left;
- else if (cluster >= (entry->start_cluster + entry->count))
+ else if (cluster >= (entry->efd_start_cluster + entry->efd_count))
n = &(*n)->rb_right;
else {
ext4_grp_locked_error(sb, group, 0,
@@ -4486,34 +4434,29 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
/* Now try to see the extent can be merged to left and right */
node = rb_prev(new_node);
if (node) {
- entry = rb_entry(node, struct ext4_free_data, node);
+ entry = rb_entry(node, struct ext4_free_data, efd_node);
if (can_merge(entry, new_entry)) {
- new_entry->start_cluster = entry->start_cluster;
- new_entry->count += entry->count;
+ new_entry->efd_start_cluster = entry->efd_start_cluster;
+ new_entry->efd_count += entry->efd_count;
rb_erase(node, &(db->bb_free_root));
- spin_lock(&sbi->s_md_lock);
- list_del(&entry->list);
- spin_unlock(&sbi->s_md_lock);
- kmem_cache_free(ext4_free_ext_cachep, entry);
+ ext4_journal_callback_del(handle, &entry->efd_jce);
+ kmem_cache_free(ext4_free_data_cachep, entry);
}
}
node = rb_next(new_node);
if (node) {
- entry = rb_entry(node, struct ext4_free_data, node);
+ entry = rb_entry(node, struct ext4_free_data, efd_node);
if (can_merge(new_entry, entry)) {
- new_entry->count += entry->count;
+ new_entry->efd_count += entry->efd_count;
rb_erase(node, &(db->bb_free_root));
- spin_lock(&sbi->s_md_lock);
- list_del(&entry->list);
- spin_unlock(&sbi->s_md_lock);
- kmem_cache_free(ext4_free_ext_cachep, entry);
+ ext4_journal_callback_del(handle, &entry->efd_jce);
+ kmem_cache_free(ext4_free_data_cachep, entry);
}
}
/* Add the extent to transaction's private list */
- spin_lock(&sbi->s_md_lock);
- list_add(&new_entry->list, &handle->h_transaction->t_private_list);
- spin_unlock(&sbi->s_md_lock);
+ ext4_journal_callback_add(handle, ext4_free_data_callback,
+ &new_entry->efd_jce);
return 0;
}
@@ -4691,15 +4634,15 @@ do_more:
* blocks being freed are metadata. these blocks shouldn't
* be used until this transaction is committed
*/
- new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
+ new_entry = kmem_cache_alloc(ext4_free_data_cachep, GFP_NOFS);
if (!new_entry) {
err = -ENOMEM;
goto error_return;
}
- new_entry->start_cluster = bit;
- new_entry->group = block_group;
- new_entry->count = count_clusters;
- new_entry->t_tid = handle->h_transaction->t_tid;
+ new_entry->efd_start_cluster = bit;
+ new_entry->efd_group = block_group;
+ new_entry->efd_count = count_clusters;
+ new_entry->efd_tid = handle->h_transaction->t_tid;
ext4_lock_group(sb, block_group);
mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
@@ -4971,11 +4914,11 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
start = (e4b.bd_info->bb_first_free > start) ?
e4b.bd_info->bb_first_free : start;
- while (start < max) {
- start = mb_find_next_zero_bit(bitmap, max, start);
- if (start >= max)
+ while (start <= max) {
+ start = mb_find_next_zero_bit(bitmap, max + 1, start);
+ if (start > max)
break;
- next = mb_find_next_bit(bitmap, max, start);
+ next = mb_find_next_bit(bitmap, max + 1, start);
if ((next - start) >= minblocks) {
ext4_trim_extent(sb, start,
@@ -5027,37 +4970,36 @@ out:
int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
{
struct ext4_group_info *grp;
- ext4_group_t first_group, last_group;
- ext4_group_t group, ngroups = ext4_get_groups_count(sb);
+ ext4_group_t group, first_group, last_group;
ext4_grpblk_t cnt = 0, first_cluster, last_cluster;
- uint64_t start, len, minlen, trimmed = 0;
+ uint64_t start, end, minlen, trimmed = 0;
ext4_fsblk_t first_data_blk =
le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
+ ext4_fsblk_t max_blks = ext4_blocks_count(EXT4_SB(sb)->s_es);
int ret = 0;
start = range->start >> sb->s_blocksize_bits;
- len = range->len >> sb->s_blocksize_bits;
+ end = start + (range->len >> sb->s_blocksize_bits) - 1;
minlen = range->minlen >> sb->s_blocksize_bits;
- if (unlikely(minlen > EXT4_CLUSTERS_PER_GROUP(sb)))
+ if (unlikely(minlen > EXT4_CLUSTERS_PER_GROUP(sb)) ||
+ unlikely(start >= max_blks))
return -EINVAL;
- if (start + len <= first_data_blk)
+ if (end >= max_blks)
+ end = max_blks - 1;
+ if (end <= first_data_blk)
goto out;
- if (start < first_data_blk) {
- len -= first_data_blk - start;
+ if (start < first_data_blk)
start = first_data_blk;
- }
- /* Determine first and last group to examine based on start and len */
+ /* Determine first and last group to examine based on start and end */
ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start,
&first_group, &first_cluster);
- ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) (start + len),
+ ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) end,
&last_group, &last_cluster);
- last_group = (last_group > ngroups - 1) ? ngroups - 1 : last_group;
- last_cluster = EXT4_CLUSTERS_PER_GROUP(sb);
- if (first_group > last_group)
- return -EINVAL;
+ /* end now represents the last cluster to discard in this group */
+ end = EXT4_CLUSTERS_PER_GROUP(sb) - 1;
for (group = first_group; group <= last_group; group++) {
grp = ext4_get_group_info(sb, group);
@@ -5069,31 +5011,35 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
}
/*
- * For all the groups except the last one, last block will
- * always be EXT4_BLOCKS_PER_GROUP(sb), so we only need to
- * change it for the last group in which case start +
- * len < EXT4_BLOCKS_PER_GROUP(sb).
+ * For all the groups except the last one, last cluster will
+ * always be EXT4_CLUSTERS_PER_GROUP(sb)-1, so we only need to
+ * change it for the last group, note that last_cluster is
+ * already computed earlier by ext4_get_group_no_and_offset()
*/
- if (first_cluster + len < EXT4_CLUSTERS_PER_GROUP(sb))
- last_cluster = first_cluster + len;
- len -= last_cluster - first_cluster;
+ if (group == last_group)
+ end = last_cluster;
if (grp->bb_free >= minlen) {
cnt = ext4_trim_all_free(sb, group, first_cluster,
- last_cluster, minlen);
+ end, minlen);
if (cnt < 0) {
ret = cnt;
break;
}
+ trimmed += cnt;
}
- trimmed += cnt;
+
+ /*
+ * For every group except the first one, we are sure
+ * that the first cluster to discard will be cluster #0.
+ */
first_cluster = 0;
}
- range->len = trimmed * sb->s_blocksize;
if (!ret)
atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen);
out:
+ range->len = trimmed * sb->s_blocksize;
return ret;
}
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 47705f3285e..c070618c21c 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -96,21 +96,23 @@ extern u8 mb_enable_debug;
struct ext4_free_data {
- /* this links the free block information from group_info */
- struct rb_node node;
+ /* MUST be the first member */
+ struct ext4_journal_cb_entry efd_jce;
+
+ /* ext4_free_data private data starts from here */
- /* this links the free block information from ext4_sb_info */
- struct list_head list;
+ /* this links the free block information from group_info */
+ struct rb_node efd_node;
/* group which free block extent belongs */
- ext4_group_t group;
+ ext4_group_t efd_group;
/* free block extent */
- ext4_grpblk_t start_cluster;
- ext4_grpblk_t count;
+ ext4_grpblk_t efd_start_cluster;
+ ext4_grpblk_t efd_count;
/* transaction which freed this extent */
- tid_t t_tid;
+ tid_t efd_tid;
};
struct ext4_prealloc_space {
@@ -210,8 +212,6 @@ struct ext4_buddy {
__u16 bd_blkbits;
ext4_group_t bd_group;
};
-#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap)
-#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy)
static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
struct ext4_free_extent *fex)
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index e7d6bb0acfa..f39f80f8f2c 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -471,7 +471,7 @@ int ext4_ext_migrate(struct inode *inode)
tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode,
S_IFREG, NULL, goal, owner);
if (IS_ERR(tmp_inode)) {
- retval = PTR_ERR(inode);
+ retval = PTR_ERR(tmp_inode);
ext4_journal_stop(handle);
return retval;
}
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index 7ea4ba4eff2..ed6548d8916 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -257,8 +257,8 @@ int ext4_multi_mount_protect(struct super_block *sb,
* If check_interval in MMP block is larger, use that instead of
* update_interval from the superblock.
*/
- if (mmp->mmp_check_interval > mmp_check_interval)
- mmp_check_interval = mmp->mmp_check_interval;
+ if (le16_to_cpu(mmp->mmp_check_interval) > mmp_check_interval)
+ mmp_check_interval = le16_to_cpu(mmp->mmp_check_interval);
seq = le32_to_cpu(mmp->mmp_seq);
if (seq == EXT4_MMP_SEQ_CLEAN)
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 2043f482375..349d7b3671c 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -468,7 +468,7 @@ fail2:
fail:
if (*err == ERR_BAD_DX_DIR)
ext4_warning(dir->i_sb,
- "Corrupt dir inode %ld, running e2fsck is "
+ "Corrupt dir inode %lu, running e2fsck is "
"recommended.", dir->i_ino);
return NULL;
}
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 47585189651..dcdeef169a6 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -110,6 +110,8 @@ int ext4_end_io_nolock(ext4_io_end_t *io)
if (io->iocb)
aio_complete(io->iocb, io->result, 0);
+ if (io->flag & EXT4_IO_END_DIRECT)
+ inode_dio_done(inode);
/* Wake up anyone waiting on unwritten extent conversion */
if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten))
wake_up_all(ext4_ioend_wq(io->inode));
@@ -127,12 +129,18 @@ static void ext4_end_io_work(struct work_struct *work)
unsigned long flags;
spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+ if (io->flag & EXT4_IO_END_IN_FSYNC)
+ goto requeue;
if (list_empty(&io->list)) {
spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
goto free;
}
if (!mutex_trylock(&inode->i_mutex)) {
+ bool was_queued;
+requeue:
+ was_queued = !!(io->flag & EXT4_IO_END_QUEUED);
+ io->flag |= EXT4_IO_END_QUEUED;
spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
/*
* Requeue the work instead of waiting so that the work
@@ -145,9 +153,8 @@ static void ext4_end_io_work(struct work_struct *work)
* yield the cpu if it sees an end_io request that has already
* been requeued.
*/
- if (io->flag & EXT4_IO_END_QUEUED)
+ if (was_queued)
yield();
- io->flag |= EXT4_IO_END_QUEUED;
return;
}
list_del_init(&io->list);
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index f9d948f0eb8..59fa0be2725 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -1163,8 +1163,11 @@ static void ext4_update_super(struct super_block *sb,
do_div(reserved_blocks, 100);
ext4_blocks_count_set(es, ext4_blocks_count(es) + blocks_count);
+ ext4_free_blocks_count_set(es, ext4_free_blocks_count(es) + free_blocks);
le32_add_cpu(&es->s_inodes_count, EXT4_INODES_PER_GROUP(sb) *
flex_gd->count);
+ le32_add_cpu(&es->s_free_inodes_count, EXT4_INODES_PER_GROUP(sb) *
+ flex_gd->count);
/*
* We need to protect s_groups_count against other CPUs seeing
@@ -1465,6 +1468,7 @@ static int ext4_group_extend_no_check(struct super_block *sb,
}
ext4_blocks_count_set(es, o_blocks_count + add);
+ ext4_free_blocks_count_set(es, ext4_free_blocks_count(es) + add);
ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count,
o_blocks_count + add);
/* We add the blocks to the bitmap and set the group need init bit */
@@ -1512,16 +1516,17 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
o_blocks_count = ext4_blocks_count(es);
if (test_opt(sb, DEBUG))
- printk(KERN_DEBUG "EXT4-fs: extending last group from %llu to %llu blocks\n",
- o_blocks_count, n_blocks_count);
+ ext4_msg(sb, KERN_DEBUG,
+ "extending last group from %llu to %llu blocks",
+ o_blocks_count, n_blocks_count);
if (n_blocks_count == 0 || n_blocks_count == o_blocks_count)
return 0;
if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
- printk(KERN_ERR "EXT4-fs: filesystem on %s:"
- " too large to resize to %llu blocks safely\n",
- sb->s_id, n_blocks_count);
+ ext4_msg(sb, KERN_ERR,
+ "filesystem too large to resize to %llu blocks safely",
+ n_blocks_count);
if (sizeof(sector_t) < 8)
ext4_warning(sb, "CONFIG_LBDAF not enabled");
return -EINVAL;
@@ -1582,7 +1587,7 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)
ext4_fsblk_t o_blocks_count;
ext4_group_t o_group;
ext4_group_t n_group;
- ext4_grpblk_t offset;
+ ext4_grpblk_t offset, add;
unsigned long n_desc_blocks;
unsigned long o_desc_blocks;
unsigned long desc_blocks;
@@ -1591,8 +1596,8 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)
o_blocks_count = ext4_blocks_count(es);
if (test_opt(sb, DEBUG))
- printk(KERN_DEBUG "EXT4-fs: resizing filesystem from %llu "
- "upto %llu blocks\n", o_blocks_count, n_blocks_count);
+ ext4_msg(sb, KERN_DEBUG, "resizing filesystem from %llu "
+ "to %llu blocks", o_blocks_count, n_blocks_count);
if (n_blocks_count < o_blocks_count) {
/* On-line shrinking not supported */
@@ -1605,7 +1610,7 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)
return 0;
ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &offset);
- ext4_get_group_no_and_offset(sb, o_blocks_count, &o_group, &offset);
+ ext4_get_group_no_and_offset(sb, o_blocks_count - 1, &o_group, &offset);
n_desc_blocks = (n_group + EXT4_DESC_PER_BLOCK(sb)) /
EXT4_DESC_PER_BLOCK(sb);
@@ -1634,10 +1639,12 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)
}
brelse(bh);
- if (offset != 0) {
- /* extend the last group */
- ext4_grpblk_t add;
- add = EXT4_BLOCKS_PER_GROUP(sb) - offset;
+ /* extend the last group */
+ if (n_group == o_group)
+ add = n_blocks_count - o_blocks_count;
+ else
+ add = EXT4_BLOCKS_PER_GROUP(sb) - (offset + 1);
+ if (add > 0) {
err = ext4_group_extend_no_check(sb, o_blocks_count, add);
if (err)
goto out;
@@ -1674,7 +1681,7 @@ out:
iput(resize_inode);
if (test_opt(sb, DEBUG))
- printk(KERN_DEBUG "EXT4-fs: resized filesystem from %llu "
- "upto %llu blocks\n", o_blocks_count, n_blocks_count);
+ ext4_msg(sb, KERN_DEBUG, "resized filesystem from %llu "
+ "upto %llu blocks", o_blocks_count, n_blocks_count);
return err;
}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 502c61fd739..ceebaf853be 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -62,6 +62,7 @@ static struct ext4_features *ext4_feat;
static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
unsigned long journal_devnum);
+static int ext4_show_options(struct seq_file *seq, struct dentry *root);
static int ext4_commit_super(struct super_block *sb, int sync);
static void ext4_mark_recovery_complete(struct super_block *sb,
struct ext4_super_block *es);
@@ -375,7 +376,7 @@ void ext4_journal_abort_handle(const char *caller, unsigned int line,
if (is_handle_aborted(handle))
return;
- printk(KERN_ERR "%s:%d: aborting transaction: %s in %s\n",
+ printk(KERN_ERR "EXT4-fs: %s:%d: aborting transaction: %s in %s\n",
caller, line, errstr, err_fn);
jbd2_journal_abort_handle(handle);
@@ -431,6 +432,22 @@ static int block_device_ejected(struct super_block *sb)
return bdi->dev == NULL;
}
+static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn)
+{
+ struct super_block *sb = journal->j_private;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int error = is_journal_aborted(journal);
+ struct ext4_journal_cb_entry *jce, *tmp;
+
+ spin_lock(&sbi->s_md_lock);
+ list_for_each_entry_safe(jce, tmp, &txn->t_private_list, jce_list) {
+ list_del_init(&jce->jce_list);
+ spin_unlock(&sbi->s_md_lock);
+ jce->jce_func(sb, jce, error);
+ spin_lock(&sbi->s_md_lock);
+ }
+ spin_unlock(&sbi->s_md_lock);
+}
/* Deal with the reporting of failure conditions on a filesystem such as
* inconsistencies detected or read IO failures.
@@ -498,11 +515,16 @@ void ext4_error_inode(struct inode *inode, const char *function,
va_start(args, fmt);
vaf.fmt = fmt;
vaf.va = &args;
- printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: inode #%lu: ",
- inode->i_sb->s_id, function, line, inode->i_ino);
if (block)
- printk(KERN_CONT "block %llu: ", block);
- printk(KERN_CONT "comm %s: %pV\n", current->comm, &vaf);
+ printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: "
+ "inode #%lu: block %llu: comm %s: %pV\n",
+ inode->i_sb->s_id, function, line, inode->i_ino,
+ block, current->comm, &vaf);
+ else
+ printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: "
+ "inode #%lu: comm %s: %pV\n",
+ inode->i_sb->s_id, function, line, inode->i_ino,
+ current->comm, &vaf);
va_end(args);
ext4_handle_error(inode->i_sb);
@@ -524,15 +546,21 @@ void ext4_error_file(struct file *file, const char *function,
path = d_path(&(file->f_path), pathname, sizeof(pathname));
if (IS_ERR(path))
path = "(unknown)";
- printk(KERN_CRIT
- "EXT4-fs error (device %s): %s:%d: inode #%lu: ",
- inode->i_sb->s_id, function, line, inode->i_ino);
- if (block)
- printk(KERN_CONT "block %llu: ", block);
va_start(args, fmt);
vaf.fmt = fmt;
vaf.va = &args;
- printk(KERN_CONT "comm %s: path %s: %pV\n", current->comm, path, &vaf);
+ if (block)
+ printk(KERN_CRIT
+ "EXT4-fs error (device %s): %s:%d: inode #%lu: "
+ "block %llu: comm %s: path %s: %pV\n",
+ inode->i_sb->s_id, function, line, inode->i_ino,
+ block, current->comm, path, &vaf);
+ else
+ printk(KERN_CRIT
+ "EXT4-fs error (device %s): %s:%d: inode #%lu: "
+ "comm %s: path %s: %pV\n",
+ inode->i_sb->s_id, function, line, inode->i_ino,
+ current->comm, path, &vaf);
va_end(args);
ext4_handle_error(inode->i_sb);
@@ -808,9 +836,6 @@ static void ext4_put_super(struct super_block *sb)
destroy_workqueue(sbi->dio_unwritten_wq);
lock_super(sb);
- if (sb->s_dirt)
- ext4_commit_super(sb, 1);
-
if (sbi->s_journal) {
err = jbd2_journal_destroy(sbi->s_journal);
sbi->s_journal = NULL;
@@ -827,9 +852,12 @@ static void ext4_put_super(struct super_block *sb)
if (!(sb->s_flags & MS_RDONLY)) {
EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
es->s_state = cpu_to_le16(sbi->s_mount_state);
- ext4_commit_super(sb, 1);
}
+ if (sb->s_dirt || !(sb->s_flags & MS_RDONLY))
+ ext4_commit_super(sb, 1);
+
if (sbi->s_proc) {
+ remove_proc_entry("options", sbi->s_proc);
remove_proc_entry(sb->s_id, ext4_proc_root);
}
kobject_del(&sbi->s_kobj);
@@ -990,180 +1018,6 @@ void ext4_clear_inode(struct inode *inode)
}
}
-static inline void ext4_show_quota_options(struct seq_file *seq,
- struct super_block *sb)
-{
-#if defined(CONFIG_QUOTA)
- struct ext4_sb_info *sbi = EXT4_SB(sb);
-
- if (sbi->s_jquota_fmt) {
- char *fmtname = "";
-
- switch (sbi->s_jquota_fmt) {
- case QFMT_VFS_OLD:
- fmtname = "vfsold";
- break;
- case QFMT_VFS_V0:
- fmtname = "vfsv0";
- break;
- case QFMT_VFS_V1:
- fmtname = "vfsv1";
- break;
- }
- seq_printf(seq, ",jqfmt=%s", fmtname);
- }
-
- if (sbi->s_qf_names[USRQUOTA])
- seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]);
-
- if (sbi->s_qf_names[GRPQUOTA])
- seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]);
-
- if (test_opt(sb, USRQUOTA))
- seq_puts(seq, ",usrquota");
-
- if (test_opt(sb, GRPQUOTA))
- seq_puts(seq, ",grpquota");
-#endif
-}
-
-/*
- * Show an option if
- * - it's set to a non-default value OR
- * - if the per-sb default is different from the global default
- */
-static int ext4_show_options(struct seq_file *seq, struct dentry *root)
-{
- int def_errors;
- unsigned long def_mount_opts;
- struct super_block *sb = root->d_sb;
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct ext4_super_block *es = sbi->s_es;
-
- def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
- def_errors = le16_to_cpu(es->s_errors);
-
- if (sbi->s_sb_block != 1)
- seq_printf(seq, ",sb=%llu", sbi->s_sb_block);
- if (test_opt(sb, MINIX_DF))
- seq_puts(seq, ",minixdf");
- if (test_opt(sb, GRPID) && !(def_mount_opts & EXT4_DEFM_BSDGROUPS))
- seq_puts(seq, ",grpid");
- if (!test_opt(sb, GRPID) && (def_mount_opts & EXT4_DEFM_BSDGROUPS))
- seq_puts(seq, ",nogrpid");
- if (sbi->s_resuid != EXT4_DEF_RESUID ||
- le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID) {
- seq_printf(seq, ",resuid=%u", sbi->s_resuid);
- }
- if (sbi->s_resgid != EXT4_DEF_RESGID ||
- le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID) {
- seq_printf(seq, ",resgid=%u", sbi->s_resgid);
- }
- if (test_opt(sb, ERRORS_RO)) {
- if (def_errors == EXT4_ERRORS_PANIC ||
- def_errors == EXT4_ERRORS_CONTINUE) {
- seq_puts(seq, ",errors=remount-ro");
- }
- }
- if (test_opt(sb, ERRORS_CONT) && def_errors != EXT4_ERRORS_CONTINUE)
- seq_puts(seq, ",errors=continue");
- if (test_opt(sb, ERRORS_PANIC) && def_errors != EXT4_ERRORS_PANIC)
- seq_puts(seq, ",errors=panic");
- if (test_opt(sb, NO_UID32) && !(def_mount_opts & EXT4_DEFM_UID16))
- seq_puts(seq, ",nouid32");
- if (test_opt(sb, DEBUG) && !(def_mount_opts & EXT4_DEFM_DEBUG))
- seq_puts(seq, ",debug");
-#ifdef CONFIG_EXT4_FS_XATTR
- if (test_opt(sb, XATTR_USER))
- seq_puts(seq, ",user_xattr");
- if (!test_opt(sb, XATTR_USER))
- seq_puts(seq, ",nouser_xattr");
-#endif
-#ifdef CONFIG_EXT4_FS_POSIX_ACL
- if (test_opt(sb, POSIX_ACL) && !(def_mount_opts & EXT4_DEFM_ACL))
- seq_puts(seq, ",acl");
- if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT4_DEFM_ACL))
- seq_puts(seq, ",noacl");
-#endif
- if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) {
- seq_printf(seq, ",commit=%u",
- (unsigned) (sbi->s_commit_interval / HZ));
- }
- if (sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME) {
- seq_printf(seq, ",min_batch_time=%u",
- (unsigned) sbi->s_min_batch_time);
- }
- if (sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) {
- seq_printf(seq, ",max_batch_time=%u",
- (unsigned) sbi->s_max_batch_time);
- }
-
- /*
- * We're changing the default of barrier mount option, so
- * let's always display its mount state so it's clear what its
- * status is.
- */
- seq_puts(seq, ",barrier=");
- seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0");
- if (test_opt(sb, JOURNAL_ASYNC_COMMIT))
- seq_puts(seq, ",journal_async_commit");
- else if (test_opt(sb, JOURNAL_CHECKSUM))
- seq_puts(seq, ",journal_checksum");
- if (test_opt(sb, I_VERSION))
- seq_puts(seq, ",i_version");
- if (!test_opt(sb, DELALLOC) &&
- !(def_mount_opts & EXT4_DEFM_NODELALLOC))
- seq_puts(seq, ",nodelalloc");
-
- if (!test_opt(sb, MBLK_IO_SUBMIT))
- seq_puts(seq, ",nomblk_io_submit");
- if (sbi->s_stripe)
- seq_printf(seq, ",stripe=%lu", sbi->s_stripe);
- /*
- * journal mode get enabled in different ways
- * So just print the value even if we didn't specify it
- */
- if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
- seq_puts(seq, ",data=journal");
- else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
- seq_puts(seq, ",data=ordered");
- else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
- seq_puts(seq, ",data=writeback");
-
- if (sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS)
- seq_printf(seq, ",inode_readahead_blks=%u",
- sbi->s_inode_readahead_blks);
-
- if (test_opt(sb, DATA_ERR_ABORT))
- seq_puts(seq, ",data_err=abort");
-
- if (test_opt(sb, NO_AUTO_DA_ALLOC))
- seq_puts(seq, ",noauto_da_alloc");
-
- if (test_opt(sb, DISCARD) && !(def_mount_opts & EXT4_DEFM_DISCARD))
- seq_puts(seq, ",discard");
-
- if (test_opt(sb, NOLOAD))
- seq_puts(seq, ",norecovery");
-
- if (test_opt(sb, DIOREAD_NOLOCK))
- seq_puts(seq, ",dioread_nolock");
-
- if (test_opt(sb, BLOCK_VALIDITY) &&
- !(def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY))
- seq_puts(seq, ",block_validity");
-
- if (!test_opt(sb, INIT_INODE_TABLE))
- seq_puts(seq, ",noinit_itable");
- else if (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT)
- seq_printf(seq, ",init_itable=%u",
- (unsigned) sbi->s_li_wait_mult);
-
- ext4_show_quota_options(seq, sb);
-
- return 0;
-}
-
static struct inode *ext4_nfs_get_inode(struct super_block *sb,
u64 ino, u32 generation)
{
@@ -1316,18 +1170,17 @@ static const struct export_operations ext4_export_ops = {
enum {
Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid,
Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro,
- Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov,
+ Opt_nouid32, Opt_debug, Opt_removed,
Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
- Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload, Opt_nobh, Opt_bh,
+ Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload,
Opt_commit, Opt_min_batch_time, Opt_max_batch_time,
- Opt_journal_update, Opt_journal_dev,
- Opt_journal_checksum, Opt_journal_async_commit,
+ Opt_journal_dev, Opt_journal_checksum, Opt_journal_async_commit,
Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
Opt_data_err_abort, Opt_data_err_ignore,
Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
- Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err,
- Opt_resize, Opt_usrquota, Opt_grpquota, Opt_i_version,
+ Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err,
+ Opt_usrquota, Opt_grpquota, Opt_i_version,
Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit,
Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
Opt_inode_readahead_blks, Opt_journal_ioprio,
@@ -1350,20 +1203,19 @@ static const match_table_t tokens = {
{Opt_err_ro, "errors=remount-ro"},
{Opt_nouid32, "nouid32"},
{Opt_debug, "debug"},
- {Opt_oldalloc, "oldalloc"},
- {Opt_orlov, "orlov"},
+ {Opt_removed, "oldalloc"},
+ {Opt_removed, "orlov"},
{Opt_user_xattr, "user_xattr"},
{Opt_nouser_xattr, "nouser_xattr"},
{Opt_acl, "acl"},
{Opt_noacl, "noacl"},
- {Opt_noload, "noload"},
{Opt_noload, "norecovery"},
- {Opt_nobh, "nobh"},
- {Opt_bh, "bh"},
+ {Opt_noload, "noload"},
+ {Opt_removed, "nobh"},
+ {Opt_removed, "bh"},
{Opt_commit, "commit=%u"},
{Opt_min_batch_time, "min_batch_time=%u"},
{Opt_max_batch_time, "max_batch_time=%u"},
- {Opt_journal_update, "journal=update"},
{Opt_journal_dev, "journal_dev=%u"},
{Opt_journal_checksum, "journal_checksum"},
{Opt_journal_async_commit, "journal_async_commit"},
@@ -1389,7 +1241,6 @@ static const match_table_t tokens = {
{Opt_nobarrier, "nobarrier"},
{Opt_i_version, "i_version"},
{Opt_stripe, "stripe=%u"},
- {Opt_resize, "resize"},
{Opt_delalloc, "delalloc"},
{Opt_nodelalloc, "nodelalloc"},
{Opt_mblk_io_submit, "mblk_io_submit"},
@@ -1408,6 +1259,11 @@ static const match_table_t tokens = {
{Opt_init_itable, "init_itable=%u"},
{Opt_init_itable, "init_itable"},
{Opt_noinit_itable, "noinit_itable"},
+ {Opt_removed, "check=none"}, /* mount option from ext2/3 */
+ {Opt_removed, "nocheck"}, /* mount option from ext2/3 */
+ {Opt_removed, "reservation"}, /* mount option from ext2/3 */
+ {Opt_removed, "noreservation"}, /* mount option from ext2/3 */
+ {Opt_removed, "journal=%u"}, /* mount option from ext2/3 */
{Opt_err, NULL},
};
@@ -1496,420 +1352,273 @@ static int clear_qf_name(struct super_block *sb, int qtype)
}
#endif
-static int parse_options(char *options, struct super_block *sb,
- unsigned long *journal_devnum,
- unsigned int *journal_ioprio,
- ext4_fsblk_t *n_blocks_count, int is_remount)
-{
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- char *p;
- substring_t args[MAX_OPT_ARGS];
- int data_opt = 0;
- int option;
+#define MOPT_SET 0x0001
+#define MOPT_CLEAR 0x0002
+#define MOPT_NOSUPPORT 0x0004
+#define MOPT_EXPLICIT 0x0008
+#define MOPT_CLEAR_ERR 0x0010
+#define MOPT_GTE0 0x0020
#ifdef CONFIG_QUOTA
- int qfmt;
+#define MOPT_Q 0
+#define MOPT_QFMT 0x0040
+#else
+#define MOPT_Q MOPT_NOSUPPORT
+#define MOPT_QFMT MOPT_NOSUPPORT
#endif
-
- if (!options)
- return 1;
-
- while ((p = strsep(&options, ",")) != NULL) {
- int token;
- if (!*p)
- continue;
-
- /*
- * Initialize args struct so we know whether arg was
- * found; some options take optional arguments.
- */
- args[0].to = args[0].from = NULL;
- token = match_token(p, tokens, args);
- switch (token) {
- case Opt_bsd_df:
- ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
- clear_opt(sb, MINIX_DF);
- break;
- case Opt_minix_df:
- ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
- set_opt(sb, MINIX_DF);
-
- break;
- case Opt_grpid:
- ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
- set_opt(sb, GRPID);
-
- break;
- case Opt_nogrpid:
- ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
- clear_opt(sb, GRPID);
-
- break;
- case Opt_resuid:
- if (match_int(&args[0], &option))
- return 0;
- sbi->s_resuid = option;
- break;
- case Opt_resgid:
- if (match_int(&args[0], &option))
- return 0;
- sbi->s_resgid = option;
- break;
- case Opt_sb:
- /* handled by get_sb_block() instead of here */
- /* *sb_block = match_int(&args[0]); */
- break;
- case Opt_err_panic:
- clear_opt(sb, ERRORS_CONT);
- clear_opt(sb, ERRORS_RO);
- set_opt(sb, ERRORS_PANIC);
- break;
- case Opt_err_ro:
- clear_opt(sb, ERRORS_CONT);
- clear_opt(sb, ERRORS_PANIC);
- set_opt(sb, ERRORS_RO);
- break;
- case Opt_err_cont:
- clear_opt(sb, ERRORS_RO);
- clear_opt(sb, ERRORS_PANIC);
- set_opt(sb, ERRORS_CONT);
- break;
- case Opt_nouid32:
- set_opt(sb, NO_UID32);
- break;
- case Opt_debug:
- set_opt(sb, DEBUG);
- break;
- case Opt_oldalloc:
- ext4_msg(sb, KERN_WARNING,
- "Ignoring deprecated oldalloc option");
- break;
- case Opt_orlov:
- ext4_msg(sb, KERN_WARNING,
- "Ignoring deprecated orlov option");
- break;
+#define MOPT_DATAJ 0x0080
+
+static const struct mount_opts {
+ int token;
+ int mount_opt;
+ int flags;
+} ext4_mount_opts[] = {
+ {Opt_minix_df, EXT4_MOUNT_MINIX_DF, MOPT_SET},
+ {Opt_bsd_df, EXT4_MOUNT_MINIX_DF, MOPT_CLEAR},
+ {Opt_grpid, EXT4_MOUNT_GRPID, MOPT_SET},
+ {Opt_nogrpid, EXT4_MOUNT_GRPID, MOPT_CLEAR},
+ {Opt_mblk_io_submit, EXT4_MOUNT_MBLK_IO_SUBMIT, MOPT_SET},
+ {Opt_nomblk_io_submit, EXT4_MOUNT_MBLK_IO_SUBMIT, MOPT_CLEAR},
+ {Opt_block_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_SET},
+ {Opt_noblock_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_CLEAR},
+ {Opt_dioread_nolock, EXT4_MOUNT_DIOREAD_NOLOCK, MOPT_SET},
+ {Opt_dioread_lock, EXT4_MOUNT_DIOREAD_NOLOCK, MOPT_CLEAR},
+ {Opt_discard, EXT4_MOUNT_DISCARD, MOPT_SET},
+ {Opt_nodiscard, EXT4_MOUNT_DISCARD, MOPT_CLEAR},
+ {Opt_delalloc, EXT4_MOUNT_DELALLOC, MOPT_SET | MOPT_EXPLICIT},
+ {Opt_nodelalloc, EXT4_MOUNT_DELALLOC, MOPT_CLEAR | MOPT_EXPLICIT},
+ {Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, MOPT_SET},
+ {Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT |
+ EXT4_MOUNT_JOURNAL_CHECKSUM), MOPT_SET},
+ {Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_SET},
+ {Opt_err_panic, EXT4_MOUNT_ERRORS_PANIC, MOPT_SET | MOPT_CLEAR_ERR},
+ {Opt_err_ro, EXT4_MOUNT_ERRORS_RO, MOPT_SET | MOPT_CLEAR_ERR},
+ {Opt_err_cont, EXT4_MOUNT_ERRORS_CONT, MOPT_SET | MOPT_CLEAR_ERR},
+ {Opt_data_err_abort, EXT4_MOUNT_DATA_ERR_ABORT, MOPT_SET},
+ {Opt_data_err_ignore, EXT4_MOUNT_DATA_ERR_ABORT, MOPT_CLEAR},
+ {Opt_barrier, EXT4_MOUNT_BARRIER, MOPT_SET},
+ {Opt_nobarrier, EXT4_MOUNT_BARRIER, MOPT_CLEAR},
+ {Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET},
+ {Opt_auto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_CLEAR},
+ {Opt_noinit_itable, EXT4_MOUNT_INIT_INODE_TABLE, MOPT_CLEAR},
+ {Opt_commit, 0, MOPT_GTE0},
+ {Opt_max_batch_time, 0, MOPT_GTE0},
+ {Opt_min_batch_time, 0, MOPT_GTE0},
+ {Opt_inode_readahead_blks, 0, MOPT_GTE0},
+ {Opt_init_itable, 0, MOPT_GTE0},
+ {Opt_stripe, 0, MOPT_GTE0},
+ {Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_DATAJ},
+ {Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_DATAJ},
+ {Opt_data_writeback, EXT4_MOUNT_WRITEBACK_DATA, MOPT_DATAJ},
#ifdef CONFIG_EXT4_FS_XATTR
- case Opt_user_xattr:
- set_opt(sb, XATTR_USER);
- break;
- case Opt_nouser_xattr:
- clear_opt(sb, XATTR_USER);
- break;
+ {Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET},
+ {Opt_nouser_xattr, EXT4_MOUNT_XATTR_USER, MOPT_CLEAR},
#else
- case Opt_user_xattr:
- case Opt_nouser_xattr:
- ext4_msg(sb, KERN_ERR, "(no)user_xattr options not supported");
- break;
+ {Opt_user_xattr, 0, MOPT_NOSUPPORT},
+ {Opt_nouser_xattr, 0, MOPT_NOSUPPORT},
#endif
#ifdef CONFIG_EXT4_FS_POSIX_ACL
- case Opt_acl:
- set_opt(sb, POSIX_ACL);
- break;
- case Opt_noacl:
- clear_opt(sb, POSIX_ACL);
- break;
+ {Opt_acl, EXT4_MOUNT_POSIX_ACL, MOPT_SET},
+ {Opt_noacl, EXT4_MOUNT_POSIX_ACL, MOPT_CLEAR},
#else
- case Opt_acl:
- case Opt_noacl:
- ext4_msg(sb, KERN_ERR, "(no)acl options not supported");
- break;
+ {Opt_acl, 0, MOPT_NOSUPPORT},
+ {Opt_noacl, 0, MOPT_NOSUPPORT},
#endif
- case Opt_journal_update:
- /* @@@ FIXME */
- /* Eventually we will want to be able to create
- a journal file here. For now, only allow the
- user to specify an existing inode to be the
- journal file. */
- if (is_remount) {
- ext4_msg(sb, KERN_ERR,
- "Cannot specify journal on remount");
- return 0;
- }
- set_opt(sb, UPDATE_JOURNAL);
- break;
- case Opt_journal_dev:
- if (is_remount) {
+ {Opt_nouid32, EXT4_MOUNT_NO_UID32, MOPT_SET},
+ {Opt_debug, EXT4_MOUNT_DEBUG, MOPT_SET},
+ {Opt_quota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, MOPT_SET | MOPT_Q},
+ {Opt_usrquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA,
+ MOPT_SET | MOPT_Q},
+ {Opt_grpquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_GRPQUOTA,
+ MOPT_SET | MOPT_Q},
+ {Opt_noquota, (EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA |
+ EXT4_MOUNT_GRPQUOTA), MOPT_CLEAR | MOPT_Q},
+ {Opt_usrjquota, 0, MOPT_Q},
+ {Opt_grpjquota, 0, MOPT_Q},
+ {Opt_offusrjquota, 0, MOPT_Q},
+ {Opt_offgrpjquota, 0, MOPT_Q},
+ {Opt_jqfmt_vfsold, QFMT_VFS_OLD, MOPT_QFMT},
+ {Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT},
+ {Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT},
+ {Opt_err, 0, 0}
+};
+
+static int handle_mount_opt(struct super_block *sb, char *opt, int token,
+ substring_t *args, unsigned long *journal_devnum,
+ unsigned int *journal_ioprio, int is_remount)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ const struct mount_opts *m;
+ int arg = 0;
+
+ if (args->from && match_int(args, &arg))
+ return -1;
+ switch (token) {
+ case Opt_noacl:
+ case Opt_nouser_xattr:
+ ext4_msg(sb, KERN_WARNING, deprecated_msg, opt, "3.5");
+ break;
+ case Opt_sb:
+ return 1; /* handled by get_sb_block() */
+ case Opt_removed:
+ ext4_msg(sb, KERN_WARNING,
+ "Ignoring removed %s option", opt);
+ return 1;
+ case Opt_resuid:
+ sbi->s_resuid = arg;
+ return 1;
+ case Opt_resgid:
+ sbi->s_resgid = arg;
+ return 1;
+ case Opt_abort:
+ sbi->s_mount_flags |= EXT4_MF_FS_ABORTED;
+ return 1;
+ case Opt_i_version:
+ sb->s_flags |= MS_I_VERSION;
+ return 1;
+ case Opt_journal_dev:
+ if (is_remount) {
+ ext4_msg(sb, KERN_ERR,
+ "Cannot specify journal on remount");
+ return -1;
+ }
+ *journal_devnum = arg;
+ return 1;
+ case Opt_journal_ioprio:
+ if (arg < 0 || arg > 7)
+ return -1;
+ *journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg);
+ return 1;
+ }
+
+ for (m = ext4_mount_opts; m->token != Opt_err; m++) {
+ if (token != m->token)
+ continue;
+ if (args->from && (m->flags & MOPT_GTE0) && (arg < 0))
+ return -1;
+ if (m->flags & MOPT_EXPLICIT)
+ set_opt2(sb, EXPLICIT_DELALLOC);
+ if (m->flags & MOPT_CLEAR_ERR)
+ clear_opt(sb, ERRORS_MASK);
+ if (token == Opt_noquota && sb_any_quota_loaded(sb)) {
+ ext4_msg(sb, KERN_ERR, "Cannot change quota "
+ "options when quota turned on");
+ return -1;
+ }
+
+ if (m->flags & MOPT_NOSUPPORT) {
+ ext4_msg(sb, KERN_ERR, "%s option not supported", opt);
+ } else if (token == Opt_commit) {
+ if (arg == 0)
+ arg = JBD2_DEFAULT_MAX_COMMIT_AGE;
+ sbi->s_commit_interval = HZ * arg;
+ } else if (token == Opt_max_batch_time) {
+ if (arg == 0)
+ arg = EXT4_DEF_MAX_BATCH_TIME;
+ sbi->s_max_batch_time = arg;
+ } else if (token == Opt_min_batch_time) {
+ sbi->s_min_batch_time = arg;
+ } else if (token == Opt_inode_readahead_blks) {
+ if (arg > (1 << 30))
+ return -1;
+ if (arg && !is_power_of_2(arg)) {
ext4_msg(sb, KERN_ERR,
- "Cannot specify journal on remount");
- return 0;
+ "EXT4-fs: inode_readahead_blks"
+ " must be a power of 2");
+ return -1;
}
- if (match_int(&args[0], &option))
- return 0;
- *journal_devnum = option;
- break;
- case Opt_journal_checksum:
- set_opt(sb, JOURNAL_CHECKSUM);
- break;
- case Opt_journal_async_commit:
- set_opt(sb, JOURNAL_ASYNC_COMMIT);
- set_opt(sb, JOURNAL_CHECKSUM);
- break;
- case Opt_noload:
- set_opt(sb, NOLOAD);
- break;
- case Opt_commit:
- if (match_int(&args[0], &option))
- return 0;
- if (option < 0)
- return 0;
- if (option == 0)
- option = JBD2_DEFAULT_MAX_COMMIT_AGE;
- sbi->s_commit_interval = HZ * option;
- break;
- case Opt_max_batch_time:
- if (match_int(&args[0], &option))
- return 0;
- if (option < 0)
- return 0;
- if (option == 0)
- option = EXT4_DEF_MAX_BATCH_TIME;
- sbi->s_max_batch_time = option;
- break;
- case Opt_min_batch_time:
- if (match_int(&args[0], &option))
- return 0;
- if (option < 0)
- return 0;
- sbi->s_min_batch_time = option;
- break;
- case Opt_data_journal:
- data_opt = EXT4_MOUNT_JOURNAL_DATA;
- goto datacheck;
- case Opt_data_ordered:
- data_opt = EXT4_MOUNT_ORDERED_DATA;
- goto datacheck;
- case Opt_data_writeback:
- data_opt = EXT4_MOUNT_WRITEBACK_DATA;
- datacheck:
+ sbi->s_inode_readahead_blks = arg;
+ } else if (token == Opt_init_itable) {
+ set_opt(sb, INIT_INODE_TABLE);
+ if (!args->from)
+ arg = EXT4_DEF_LI_WAIT_MULT;
+ sbi->s_li_wait_mult = arg;
+ } else if (token == Opt_stripe) {
+ sbi->s_stripe = arg;
+ } else if (m->flags & MOPT_DATAJ) {
if (is_remount) {
if (!sbi->s_journal)
ext4_msg(sb, KERN_WARNING, "Remounting file system with no journal so ignoring journalled data option");
- else if (test_opt(sb, DATA_FLAGS) != data_opt) {
+ else if (test_opt(sb, DATA_FLAGS) !=
+ m->mount_opt) {
ext4_msg(sb, KERN_ERR,
- "Cannot change data mode on remount");
- return 0;
+ "Cannot change data mode on remount");
+ return -1;
}
} else {
clear_opt(sb, DATA_FLAGS);
- sbi->s_mount_opt |= data_opt;
+ sbi->s_mount_opt |= m->mount_opt;
}
- break;
- case Opt_data_err_abort:
- set_opt(sb, DATA_ERR_ABORT);
- break;
- case Opt_data_err_ignore:
- clear_opt(sb, DATA_ERR_ABORT);
- break;
#ifdef CONFIG_QUOTA
- case Opt_usrjquota:
+ } else if (token == Opt_usrjquota) {
if (!set_qf_name(sb, USRQUOTA, &args[0]))
- return 0;
- break;
- case Opt_grpjquota:
+ return -1;
+ } else if (token == Opt_grpjquota) {
if (!set_qf_name(sb, GRPQUOTA, &args[0]))
- return 0;
- break;
- case Opt_offusrjquota:
+ return -1;
+ } else if (token == Opt_offusrjquota) {
if (!clear_qf_name(sb, USRQUOTA))
- return 0;
- break;
- case Opt_offgrpjquota:
+ return -1;
+ } else if (token == Opt_offgrpjquota) {
if (!clear_qf_name(sb, GRPQUOTA))
- return 0;
- break;
-
- case Opt_jqfmt_vfsold:
- qfmt = QFMT_VFS_OLD;
- goto set_qf_format;
- case Opt_jqfmt_vfsv0:
- qfmt = QFMT_VFS_V0;
- goto set_qf_format;
- case Opt_jqfmt_vfsv1:
- qfmt = QFMT_VFS_V1;
-set_qf_format:
+ return -1;
+ } else if (m->flags & MOPT_QFMT) {
if (sb_any_quota_loaded(sb) &&
- sbi->s_jquota_fmt != qfmt) {
- ext4_msg(sb, KERN_ERR, "Cannot change "
- "journaled quota options when "
- "quota turned on");
- return 0;
+ sbi->s_jquota_fmt != m->mount_opt) {
+ ext4_msg(sb, KERN_ERR, "Cannot "
+ "change journaled quota options "
+ "when quota turned on");
+ return -1;
}
- sbi->s_jquota_fmt = qfmt;
- break;
- case Opt_quota:
- case Opt_usrquota:
- set_opt(sb, QUOTA);
- set_opt(sb, USRQUOTA);
- break;
- case Opt_grpquota:
- set_opt(sb, QUOTA);
- set_opt(sb, GRPQUOTA);
- break;
- case Opt_noquota:
- if (sb_any_quota_loaded(sb)) {
- ext4_msg(sb, KERN_ERR, "Cannot change quota "
- "options when quota turned on");
- return 0;
- }
- clear_opt(sb, QUOTA);
- clear_opt(sb, USRQUOTA);
- clear_opt(sb, GRPQUOTA);
- break;
-#else
- case Opt_quota:
- case Opt_usrquota:
- case Opt_grpquota:
- ext4_msg(sb, KERN_ERR,
- "quota options not supported");
- break;
- case Opt_usrjquota:
- case Opt_grpjquota:
- case Opt_offusrjquota:
- case Opt_offgrpjquota:
- case Opt_jqfmt_vfsold:
- case Opt_jqfmt_vfsv0:
- case Opt_jqfmt_vfsv1:
- ext4_msg(sb, KERN_ERR,
- "journaled quota options not supported");
- break;
- case Opt_noquota:
- break;
+ sbi->s_jquota_fmt = m->mount_opt;
#endif
- case Opt_abort:
- sbi->s_mount_flags |= EXT4_MF_FS_ABORTED;
- break;
- case Opt_nobarrier:
- clear_opt(sb, BARRIER);
- break;
- case Opt_barrier:
- if (args[0].from) {
- if (match_int(&args[0], &option))
- return 0;
- } else
- option = 1; /* No argument, default to 1 */
- if (option)
- set_opt(sb, BARRIER);
- else
- clear_opt(sb, BARRIER);
- break;
- case Opt_ignore:
- break;
- case Opt_resize:
- if (!is_remount) {
- ext4_msg(sb, KERN_ERR,
- "resize option only available "
- "for remount");
- return 0;
- }
- if (match_int(&args[0], &option) != 0)
- return 0;
- *n_blocks_count = option;
- break;
- case Opt_nobh:
- ext4_msg(sb, KERN_WARNING,
- "Ignoring deprecated nobh option");
- break;
- case Opt_bh:
- ext4_msg(sb, KERN_WARNING,
- "Ignoring deprecated bh option");
- break;
- case Opt_i_version:
- set_opt(sb, I_VERSION);
- sb->s_flags |= MS_I_VERSION;
- break;
- case Opt_nodelalloc:
- clear_opt(sb, DELALLOC);
- clear_opt2(sb, EXPLICIT_DELALLOC);
- break;
- case Opt_mblk_io_submit:
- set_opt(sb, MBLK_IO_SUBMIT);
- break;
- case Opt_nomblk_io_submit:
- clear_opt(sb, MBLK_IO_SUBMIT);
- break;
- case Opt_stripe:
- if (match_int(&args[0], &option))
- return 0;
- if (option < 0)
- return 0;
- sbi->s_stripe = option;
- break;
- case Opt_delalloc:
- set_opt(sb, DELALLOC);
- set_opt2(sb, EXPLICIT_DELALLOC);
- break;
- case Opt_block_validity:
- set_opt(sb, BLOCK_VALIDITY);
- break;
- case Opt_noblock_validity:
- clear_opt(sb, BLOCK_VALIDITY);
- break;
- case Opt_inode_readahead_blks:
- if (match_int(&args[0], &option))
- return 0;
- if (option < 0 || option > (1 << 30))
- return 0;
- if (option && !is_power_of_2(option)) {
- ext4_msg(sb, KERN_ERR,
- "EXT4-fs: inode_readahead_blks"
- " must be a power of 2");
- return 0;
+ } else {
+ if (!args->from)
+ arg = 1;
+ if (m->flags & MOPT_CLEAR)
+ arg = !arg;
+ else if (unlikely(!(m->flags & MOPT_SET))) {
+ ext4_msg(sb, KERN_WARNING,
+ "buggy handling of option %s", opt);
+ WARN_ON(1);
+ return -1;
}
- sbi->s_inode_readahead_blks = option;
- break;
- case Opt_journal_ioprio:
- if (match_int(&args[0], &option))
- return 0;
- if (option < 0 || option > 7)
- break;
- *journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE,
- option);
- break;
- case Opt_noauto_da_alloc:
- set_opt(sb, NO_AUTO_DA_ALLOC);
- break;
- case Opt_auto_da_alloc:
- if (args[0].from) {
- if (match_int(&args[0], &option))
- return 0;
- } else
- option = 1; /* No argument, default to 1 */
- if (option)
- clear_opt(sb, NO_AUTO_DA_ALLOC);
+ if (arg != 0)
+ sbi->s_mount_opt |= m->mount_opt;
else
- set_opt(sb,NO_AUTO_DA_ALLOC);
- break;
- case Opt_discard:
- set_opt(sb, DISCARD);
- break;
- case Opt_nodiscard:
- clear_opt(sb, DISCARD);
- break;
- case Opt_dioread_nolock:
- set_opt(sb, DIOREAD_NOLOCK);
- break;
- case Opt_dioread_lock:
- clear_opt(sb, DIOREAD_NOLOCK);
- break;
- case Opt_init_itable:
- set_opt(sb, INIT_INODE_TABLE);
- if (args[0].from) {
- if (match_int(&args[0], &option))
- return 0;
- } else
- option = EXT4_DEF_LI_WAIT_MULT;
- if (option < 0)
- return 0;
- sbi->s_li_wait_mult = option;
- break;
- case Opt_noinit_itable:
- clear_opt(sb, INIT_INODE_TABLE);
- break;
- default:
- ext4_msg(sb, KERN_ERR,
- "Unrecognized mount option \"%s\" "
- "or missing value", p);
- return 0;
+ sbi->s_mount_opt &= ~m->mount_opt;
}
+ return 1;
+ }
+ ext4_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" "
+ "or missing value", opt);
+ return -1;
+}
+
+static int parse_options(char *options, struct super_block *sb,
+ unsigned long *journal_devnum,
+ unsigned int *journal_ioprio,
+ int is_remount)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ char *p;
+ substring_t args[MAX_OPT_ARGS];
+ int token;
+
+ if (!options)
+ return 1;
+
+ while ((p = strsep(&options, ",")) != NULL) {
+ if (!*p)
+ continue;
+ /*
+ * Initialize args struct so we know whether arg was
+ * found; some options take optional arguments.
+ */
+ args[0].to = args[0].from = 0;
+ token = match_token(p, tokens, args);
+ if (handle_mount_opt(sb, p, token, args, journal_devnum,
+ journal_ioprio, is_remount) < 0)
+ return 0;
}
#ifdef CONFIG_QUOTA
if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
@@ -1942,6 +1651,160 @@ set_qf_format:
return 1;
}
+static inline void ext4_show_quota_options(struct seq_file *seq,
+ struct super_block *sb)
+{
+#if defined(CONFIG_QUOTA)
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ if (sbi->s_jquota_fmt) {
+ char *fmtname = "";
+
+ switch (sbi->s_jquota_fmt) {
+ case QFMT_VFS_OLD:
+ fmtname = "vfsold";
+ break;
+ case QFMT_VFS_V0:
+ fmtname = "vfsv0";
+ break;
+ case QFMT_VFS_V1:
+ fmtname = "vfsv1";
+ break;
+ }
+ seq_printf(seq, ",jqfmt=%s", fmtname);
+ }
+
+ if (sbi->s_qf_names[USRQUOTA])
+ seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]);
+
+ if (sbi->s_qf_names[GRPQUOTA])
+ seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]);
+
+ if (test_opt(sb, USRQUOTA))
+ seq_puts(seq, ",usrquota");
+
+ if (test_opt(sb, GRPQUOTA))
+ seq_puts(seq, ",grpquota");
+#endif
+}
+
+static const char *token2str(int token)
+{
+ static const struct match_token *t;
+
+ for (t = tokens; t->token != Opt_err; t++)
+ if (t->token == token && !strchr(t->pattern, '='))
+ break;
+ return t->pattern;
+}
+
+/*
+ * Show an option if
+ * - it's set to a non-default value OR
+ * - if the per-sb default is different from the global default
+ */
+static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
+ int nodefs)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_super_block *es = sbi->s_es;
+ int def_errors, def_mount_opt = nodefs ? 0 : sbi->s_def_mount_opt;
+ const struct mount_opts *m;
+ char sep = nodefs ? '\n' : ',';
+
+#define SEQ_OPTS_PUTS(str) seq_printf(seq, "%c" str, sep)
+#define SEQ_OPTS_PRINT(str, arg) seq_printf(seq, "%c" str, sep, arg)
+
+ if (sbi->s_sb_block != 1)
+ SEQ_OPTS_PRINT("sb=%llu", sbi->s_sb_block);
+
+ for (m = ext4_mount_opts; m->token != Opt_err; m++) {
+ int want_set = m->flags & MOPT_SET;
+ if (((m->flags & (MOPT_SET|MOPT_CLEAR)) == 0) ||
+ (m->flags & MOPT_CLEAR_ERR))
+ continue;
+ if (!(m->mount_opt & (sbi->s_mount_opt ^ def_mount_opt)))
+ continue; /* skip if same as the default */
+ if ((want_set &&
+ (sbi->s_mount_opt & m->mount_opt) != m->mount_opt) ||
+ (!want_set && (sbi->s_mount_opt & m->mount_opt)))
+ continue; /* select Opt_noFoo vs Opt_Foo */
+ SEQ_OPTS_PRINT("%s", token2str(m->token));
+ }
+
+ if (nodefs || sbi->s_resuid != EXT4_DEF_RESUID ||
+ le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID)
+ SEQ_OPTS_PRINT("resuid=%u", sbi->s_resuid);
+ if (nodefs || sbi->s_resgid != EXT4_DEF_RESGID ||
+ le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID)
+ SEQ_OPTS_PRINT("resgid=%u", sbi->s_resgid);
+ def_errors = nodefs ? -1 : le16_to_cpu(es->s_errors);
+ if (test_opt(sb, ERRORS_RO) && def_errors != EXT4_ERRORS_RO)
+ SEQ_OPTS_PUTS("errors=remount-ro");
+ if (test_opt(sb, ERRORS_CONT) && def_errors != EXT4_ERRORS_CONTINUE)
+ SEQ_OPTS_PUTS("errors=continue");
+ if (test_opt(sb, ERRORS_PANIC) && def_errors != EXT4_ERRORS_PANIC)
+ SEQ_OPTS_PUTS("errors=panic");
+ if (nodefs || sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ)
+ SEQ_OPTS_PRINT("commit=%lu", sbi->s_commit_interval / HZ);
+ if (nodefs || sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME)
+ SEQ_OPTS_PRINT("min_batch_time=%u", sbi->s_min_batch_time);
+ if (nodefs || sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME)
+ SEQ_OPTS_PRINT("max_batch_time=%u", sbi->s_max_batch_time);
+ if (sb->s_flags & MS_I_VERSION)
+ SEQ_OPTS_PUTS("i_version");
+ if (nodefs || sbi->s_stripe)
+ SEQ_OPTS_PRINT("stripe=%lu", sbi->s_stripe);
+ if (EXT4_MOUNT_DATA_FLAGS & (sbi->s_mount_opt ^ def_mount_opt)) {
+ if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
+ SEQ_OPTS_PUTS("data=journal");
+ else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
+ SEQ_OPTS_PUTS("data=ordered");
+ else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
+ SEQ_OPTS_PUTS("data=writeback");
+ }
+ if (nodefs ||
+ sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS)
+ SEQ_OPTS_PRINT("inode_readahead_blks=%u",
+ sbi->s_inode_readahead_blks);
+
+ if (nodefs || (test_opt(sb, INIT_INODE_TABLE) &&
+ (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT)))
+ SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult);
+
+ ext4_show_quota_options(seq, sb);
+ return 0;
+}
+
+static int ext4_show_options(struct seq_file *seq, struct dentry *root)
+{
+ return _ext4_show_options(seq, root->d_sb, 0);
+}
+
+static int options_seq_show(struct seq_file *seq, void *offset)
+{
+ struct super_block *sb = seq->private;
+ int rc;
+
+ seq_puts(seq, (sb->s_flags & MS_RDONLY) ? "ro" : "rw");
+ rc = _ext4_show_options(seq, sb, 1);
+ seq_puts(seq, "\n");
+ return rc;
+}
+
+static int options_open_fs(struct inode *inode, struct file *file)
+{
+ return single_open(file, options_seq_show, PDE(inode)->data);
+}
+
+static const struct file_operations ext4_seq_options_fops = {
+ .owner = THIS_MODULE,
+ .open = options_open_fs,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
int read_only)
{
@@ -2945,7 +2808,7 @@ static int ext4_run_lazyinit_thread(void)
ext4_clear_request_list();
kfree(ext4_li_info);
ext4_li_info = NULL;
- printk(KERN_CRIT "EXT4: error %d creating inode table "
+ printk(KERN_CRIT "EXT4-fs: error %d creating inode table "
"initialization thread\n",
err);
return err;
@@ -3183,11 +3046,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
set_opt(sb, INIT_INODE_TABLE);
if (def_mount_opts & EXT4_DEFM_DEBUG)
set_opt(sb, DEBUG);
- if (def_mount_opts & EXT4_DEFM_BSDGROUPS) {
- ext4_msg(sb, KERN_WARNING, deprecated_msg, "bsdgroups",
- "2.6.38");
+ if (def_mount_opts & EXT4_DEFM_BSDGROUPS)
set_opt(sb, GRPID);
- }
if (def_mount_opts & EXT4_DEFM_UID16)
set_opt(sb, NO_UID32);
/* xattr user namespace & acls are now defaulted on */
@@ -3240,13 +3100,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
if (!parse_options((char *) sbi->s_es->s_mount_opts, sb,
- &journal_devnum, &journal_ioprio, NULL, 0)) {
+ &journal_devnum, &journal_ioprio, 0)) {
ext4_msg(sb, KERN_WARNING,
"failed to parse options in superblock: %s",
sbi->s_es->s_mount_opts);
}
+ sbi->s_def_mount_opt = sbi->s_mount_opt;
if (!parse_options((char *) data, sb, &journal_devnum,
- &journal_ioprio, NULL, 0))
+ &journal_ioprio, 0))
goto failed_mount;
if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
@@ -3416,7 +3277,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
#else
es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
#endif
- sb->s_dirt = 1;
}
/* Handle clustersize */
@@ -3540,6 +3400,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
if (ext4_proc_root)
sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root);
+ if (sbi->s_proc)
+ proc_create_data("options", S_IRUGO, sbi->s_proc,
+ &ext4_seq_options_fops, sb);
+
bgl_lock_init(sbi->s_blockgroup_lock);
for (i = 0; i < db_count; i++) {
@@ -3694,6 +3558,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
}
set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
+ sbi->s_journal->j_commit_callback = ext4_journal_commit_callback;
+
/*
* The journal may have updated the bg summary counts, so we
* need to update the global counters.
@@ -3735,9 +3601,8 @@ no_journal:
iput(root);
goto failed_mount4;
}
- sb->s_root = d_alloc_root(root);
+ sb->s_root = d_make_root(root);
if (!sb->s_root) {
- iput(root);
ext4_msg(sb, KERN_ERR, "get root dentry failed");
ret = -ENOMEM;
goto failed_mount4;
@@ -3862,6 +3727,7 @@ failed_mount2:
ext4_kvfree(sbi->s_group_desc);
failed_mount:
if (sbi->s_proc) {
+ remove_proc_entry("options", sbi->s_proc);
remove_proc_entry(sb->s_id, ext4_proc_root);
}
#ifdef CONFIG_QUOTA
@@ -4091,15 +3957,6 @@ static int ext4_load_journal(struct super_block *sb,
if (!(journal->j_flags & JBD2_BARRIER))
ext4_msg(sb, KERN_INFO, "barriers disabled");
- if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) {
- err = jbd2_journal_update_format(journal);
- if (err) {
- ext4_msg(sb, KERN_ERR, "error updating journal");
- jbd2_journal_destroy(journal);
- return err;
- }
- }
-
if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER))
err = jbd2_journal_wipe(journal, !really_read_only);
if (!err) {
@@ -4386,7 +4243,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
{
struct ext4_super_block *es;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- ext4_fsblk_t n_blocks_count = 0;
unsigned long old_sb_flags;
struct ext4_mount_options old_opts;
int enable_quota = 0;
@@ -4419,8 +4275,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
/*
* Allow the "check" option to be passed as a remount option.
*/
- if (!parse_options(data, sb, NULL, &journal_ioprio,
- &n_blocks_count, 1)) {
+ if (!parse_options(data, sb, NULL, &journal_ioprio, 1)) {
err = -EINVAL;
goto restore_opts;
}
@@ -4438,8 +4293,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
}
- if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) ||
- n_blocks_count > ext4_blocks_count(es)) {
+ if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) {
if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) {
err = -EROFS;
goto restore_opts;
@@ -4514,8 +4368,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
if (sbi->s_journal)
ext4_clear_journal_err(sb, es);
sbi->s_mount_state = le16_to_cpu(es->s_state);
- if ((err = ext4_group_extend(sb, es, n_blocks_count)))
- goto restore_opts;
if (!ext4_setup_super(sb, es, 0))
sb->s_flags &= ~MS_RDONLY;
if (EXT4_HAS_INCOMPAT_FEATURE(sb,
@@ -5056,6 +4908,9 @@ static int __init ext4_init_fs(void)
{
int i, err;
+ ext4_li_info = NULL;
+ mutex_init(&ext4_li_mtx);
+
ext4_check_flag_values();
for (i = 0; i < EXT4_WQ_HASH_SZ; i++) {
@@ -5094,8 +4949,6 @@ static int __init ext4_init_fs(void)
if (err)
goto out;
- ext4_li_info = NULL;
- mutex_init(&ext4_li_mtx);
return 0;
out:
unregister_as_ext2();
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 93a00d89a22..e88748e55c0 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -82,8 +82,8 @@
printk("\n"); \
} while (0)
#else
-# define ea_idebug(f...)
-# define ea_bdebug(f...)
+# define ea_idebug(inode, fmt, ...) no_printk(fmt, ##__VA_ARGS__)
+# define ea_bdebug(bh, fmt, ...) no_printk(fmt, ##__VA_ARGS__)
#endif
static void ext4_xattr_cache_insert(struct buffer_head *);
@@ -158,13 +158,10 @@ ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end)
static inline int
ext4_xattr_check_block(struct buffer_head *bh)
{
- int error;
-
if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
BHDR(bh)->h_blocks != cpu_to_le32(1))
return -EIO;
- error = ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size);
- return error;
+ return ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size);
}
static inline int
@@ -220,7 +217,8 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
error = -ENODATA;
if (!EXT4_I(inode)->i_file_acl)
goto cleanup;
- ea_idebug(inode, "reading block %u", EXT4_I(inode)->i_file_acl);
+ ea_idebug(inode, "reading block %llu",
+ (unsigned long long)EXT4_I(inode)->i_file_acl);
bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
if (!bh)
goto cleanup;
@@ -363,7 +361,8 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
error = 0;
if (!EXT4_I(inode)->i_file_acl)
goto cleanup;
- ea_idebug(inode, "reading block %u", EXT4_I(inode)->i_file_acl);
+ ea_idebug(inode, "reading block %llu",
+ (unsigned long long)EXT4_I(inode)->i_file_acl);
bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
error = -EIO;
if (!bh)
@@ -487,18 +486,19 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
ext4_free_blocks(handle, inode, bh, 0, 1,
EXT4_FREE_BLOCKS_METADATA |
EXT4_FREE_BLOCKS_FORGET);
+ unlock_buffer(bh);
} else {
le32_add_cpu(&BHDR(bh)->h_refcount, -1);
+ if (ce)
+ mb_cache_entry_release(ce);
+ unlock_buffer(bh);
error = ext4_handle_dirty_metadata(handle, inode, bh);
if (IS_SYNC(inode))
ext4_handle_sync(handle);
dquot_free_block(inode, 1);
ea_bdebug(bh, "refcount now=%d; releasing",
le32_to_cpu(BHDR(bh)->h_refcount));
- if (ce)
- mb_cache_entry_release(ce);
}
- unlock_buffer(bh);
out:
ext4_std_error(inode->i_sb, error);
return;
@@ -834,7 +834,8 @@ inserted:
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS);
- ea_idebug(inode, "creating block %d", block);
+ ea_idebug(inode, "creating block %llu",
+ (unsigned long long)block);
new_bh = sb_getblk(sb, block);
if (!new_bh) {
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 3ab841054d5..21687e31acc 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -1496,11 +1496,13 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
root_inode->i_ino = MSDOS_ROOT_INO;
root_inode->i_version = 1;
error = fat_read_root(root_inode);
- if (error < 0)
+ if (error < 0) {
+ iput(root_inode);
goto out_fail;
+ }
error = -ENOMEM;
insert_inode_hash(root_inode);
- sb->s_root = d_alloc_root(root_inode);
+ sb->s_root = d_make_root(root_inode);
if (!sb->s_root) {
fat_msg(sb, KERN_ERR, "get root inode failed");
goto out_fail;
@@ -1516,8 +1518,6 @@ out_invalid:
out_fail:
if (fat_inode)
iput(fat_inode);
- if (root_inode)
- iput(root_inode);
unload_nls(sbi->nls_io);
unload_nls(sbi->nls_disk);
if (sbi->options.iocharset != fat_default_iocharset)
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index a81eb2367d3..98ae804f527 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -521,57 +521,46 @@ xlate_to_uni(const unsigned char *name, int len, unsigned char *outname,
op = &outname[*outlen * sizeof(wchar_t)];
} else {
- if (nls) {
- for (i = 0, ip = name, op = outname, *outlen = 0;
- i < len && *outlen <= FAT_LFN_LEN;
- *outlen += 1)
- {
- if (escape && (*ip == ':')) {
- if (i > len - 5)
- return -EINVAL;
- ec = 0;
- for (k = 1; k < 5; k++) {
- nc = ip[k];
- ec <<= 4;
- if (nc >= '0' && nc <= '9') {
- ec |= nc - '0';
- continue;
- }
- if (nc >= 'a' && nc <= 'f') {
- ec |= nc - ('a' - 10);
- continue;
- }
- if (nc >= 'A' && nc <= 'F') {
- ec |= nc - ('A' - 10);
- continue;
- }
- return -EINVAL;
+ for (i = 0, ip = name, op = outname, *outlen = 0;
+ i < len && *outlen < FAT_LFN_LEN;
+ *outlen += 1) {
+ if (escape && (*ip == ':')) {
+ if (i > len - 5)
+ return -EINVAL;
+ ec = 0;
+ for (k = 1; k < 5; k++) {
+ nc = ip[k];
+ ec <<= 4;
+ if (nc >= '0' && nc <= '9') {
+ ec |= nc - '0';
+ continue;
}
- *op++ = ec & 0xFF;
- *op++ = ec >> 8;
- ip += 5;
- i += 5;
- } else {
- if ((charlen = nls->char2uni(ip, len - i, (wchar_t *)op)) < 0)
- return -EINVAL;
- ip += charlen;
- i += charlen;
- op += 2;
+ if (nc >= 'a' && nc <= 'f') {
+ ec |= nc - ('a' - 10);
+ continue;
+ }
+ if (nc >= 'A' && nc <= 'F') {
+ ec |= nc - ('A' - 10);
+ continue;
+ }
+ return -EINVAL;
}
+ *op++ = ec & 0xFF;
+ *op++ = ec >> 8;
+ ip += 5;
+ i += 5;
+ } else {
+ charlen = nls->char2uni(ip, len - i,
+ (wchar_t *)op);
+ if (charlen < 0)
+ return -EINVAL;
+ ip += charlen;
+ i += charlen;
+ op += 2;
}
- if (i < len)
- return -ENAMETOOLONG;
- } else {
- for (i = 0, ip = name, op = outname, *outlen = 0;
- i < len && *outlen <= FAT_LFN_LEN;
- i++, *outlen += 1)
- {
- *op++ = *ip++;
- *op++ = 0;
- }
- if (i < len)
- return -ENAMETOOLONG;
}
+ if (i < len)
+ return -ENAMETOOLONG;
}
*longlen = *outlen;
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 22764c7c838..75e7c1f3a08 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -32,20 +32,20 @@ void set_close_on_exec(unsigned int fd, int flag)
spin_lock(&files->file_lock);
fdt = files_fdtable(files);
if (flag)
- FD_SET(fd, fdt->close_on_exec);
+ __set_close_on_exec(fd, fdt);
else
- FD_CLR(fd, fdt->close_on_exec);
+ __clear_close_on_exec(fd, fdt);
spin_unlock(&files->file_lock);
}
-static int get_close_on_exec(unsigned int fd)
+static bool get_close_on_exec(unsigned int fd)
{
struct files_struct *files = current->files;
struct fdtable *fdt;
- int res;
+ bool res;
rcu_read_lock();
fdt = files_fdtable(files);
- res = FD_ISSET(fd, fdt->close_on_exec);
+ res = close_on_exec(fd, fdt);
rcu_read_unlock();
return res;
}
@@ -90,15 +90,15 @@ SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags)
err = -EBUSY;
fdt = files_fdtable(files);
tofree = fdt->fd[newfd];
- if (!tofree && FD_ISSET(newfd, fdt->open_fds))
+ if (!tofree && fd_is_open(newfd, fdt))
goto out_unlock;
get_file(file);
rcu_assign_pointer(fdt->fd[newfd], file);
- FD_SET(newfd, fdt->open_fds);
+ __set_open_fd(newfd, fdt);
if (flags & O_CLOEXEC)
- FD_SET(newfd, fdt->close_on_exec);
+ __set_close_on_exec(newfd, fdt);
else
- FD_CLR(newfd, fdt->close_on_exec);
+ __clear_close_on_exec(newfd, fdt);
spin_unlock(&files->file_lock);
if (tofree)
diff --git a/fs/file.c b/fs/file.c
index 4c6992d8f3b..ba3f6053025 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -6,7 +6,7 @@
* Manage the dynamic fd arrays in the process files_struct.
*/
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
@@ -40,7 +40,7 @@ int sysctl_nr_open_max = 1024 * 1024; /* raised later */
*/
static DEFINE_PER_CPU(struct fdtable_defer, fdtable_defer_list);
-static void *alloc_fdmem(unsigned int size)
+static void *alloc_fdmem(size_t size)
{
/*
* Very large allocations can stress page reclaim, so fall back to
@@ -142,7 +142,7 @@ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt)
static struct fdtable * alloc_fdtable(unsigned int nr)
{
struct fdtable *fdt;
- char *data;
+ void *data;
/*
* Figure out how many fds we actually want to support in this fdtable.
@@ -172,14 +172,15 @@ static struct fdtable * alloc_fdtable(unsigned int nr)
data = alloc_fdmem(nr * sizeof(struct file *));
if (!data)
goto out_fdt;
- fdt->fd = (struct file **)data;
- data = alloc_fdmem(max_t(unsigned int,
+ fdt->fd = data;
+
+ data = alloc_fdmem(max_t(size_t,
2 * nr / BITS_PER_BYTE, L1_CACHE_BYTES));
if (!data)
goto out_arr;
- fdt->open_fds = (fd_set *)data;
+ fdt->open_fds = data;
data += nr / BITS_PER_BYTE;
- fdt->close_on_exec = (fd_set *)data;
+ fdt->close_on_exec = data;
fdt->next = NULL;
return fdt;
@@ -275,11 +276,11 @@ static int count_open_files(struct fdtable *fdt)
int i;
/* Find the last open fd */
- for (i = size/(8*sizeof(long)); i > 0; ) {
- if (fdt->open_fds->fds_bits[--i])
+ for (i = size / BITS_PER_LONG; i > 0; ) {
+ if (fdt->open_fds[--i])
break;
}
- i = (i+1) * 8 * sizeof(long);
+ i = (i + 1) * BITS_PER_LONG;
return i;
}
@@ -306,8 +307,8 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
newf->next_fd = 0;
new_fdt = &newf->fdtab;
new_fdt->max_fds = NR_OPEN_DEFAULT;
- new_fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init;
- new_fdt->open_fds = (fd_set *)&newf->open_fds_init;
+ new_fdt->close_on_exec = newf->close_on_exec_init;
+ new_fdt->open_fds = newf->open_fds_init;
new_fdt->fd = &newf->fd_array[0];
new_fdt->next = NULL;
@@ -350,10 +351,8 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
old_fds = old_fdt->fd;
new_fds = new_fdt->fd;
- memcpy(new_fdt->open_fds->fds_bits,
- old_fdt->open_fds->fds_bits, open_files/8);
- memcpy(new_fdt->close_on_exec->fds_bits,
- old_fdt->close_on_exec->fds_bits, open_files/8);
+ memcpy(new_fdt->open_fds, old_fdt->open_fds, open_files / 8);
+ memcpy(new_fdt->close_on_exec, old_fdt->close_on_exec, open_files / 8);
for (i = open_files; i != 0; i--) {
struct file *f = *old_fds++;
@@ -366,7 +365,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
* is partway through open(). So make sure that this
* fd is available to the new process.
*/
- FD_CLR(open_files - i, new_fdt->open_fds);
+ __clear_open_fd(open_files - i, new_fdt);
}
rcu_assign_pointer(*new_fds++, f);
}
@@ -379,11 +378,11 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
memset(new_fds, 0, size);
if (new_fdt->max_fds > open_files) {
- int left = (new_fdt->max_fds-open_files)/8;
- int start = open_files / (8 * sizeof(unsigned long));
+ int left = (new_fdt->max_fds - open_files) / 8;
+ int start = open_files / BITS_PER_LONG;
- memset(&new_fdt->open_fds->fds_bits[start], 0, left);
- memset(&new_fdt->close_on_exec->fds_bits[start], 0, left);
+ memset(&new_fdt->open_fds[start], 0, left);
+ memset(&new_fdt->close_on_exec[start], 0, left);
}
rcu_assign_pointer(newf->fdt, new_fdt);
@@ -419,8 +418,8 @@ struct files_struct init_files = {
.fdtab = {
.max_fds = NR_OPEN_DEFAULT,
.fd = &init_files.fd_array[0],
- .close_on_exec = (fd_set *)&init_files.close_on_exec_init,
- .open_fds = (fd_set *)&init_files.open_fds_init,
+ .close_on_exec = init_files.close_on_exec_init,
+ .open_fds = init_files.open_fds_init,
},
.file_lock = __SPIN_LOCK_UNLOCKED(init_task.file_lock),
};
@@ -443,8 +442,7 @@ repeat:
fd = files->next_fd;
if (fd < fdt->max_fds)
- fd = find_next_zero_bit(fdt->open_fds->fds_bits,
- fdt->max_fds, fd);
+ fd = find_next_zero_bit(fdt->open_fds, fdt->max_fds, fd);
error = expand_files(files, fd);
if (error < 0)
@@ -460,11 +458,11 @@ repeat:
if (start <= files->next_fd)
files->next_fd = fd + 1;
- FD_SET(fd, fdt->open_fds);
+ __set_open_fd(fd, fdt);
if (flags & O_CLOEXEC)
- FD_SET(fd, fdt->close_on_exec);
+ __set_close_on_exec(fd, fdt);
else
- FD_CLR(fd, fdt->close_on_exec);
+ __clear_close_on_exec(fd, fdt);
error = fd;
#if 1
/* Sanity check */
diff --git a/fs/file_table.c b/fs/file_table.c
index 20002e39754..70f2a0fd6ae 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -204,7 +204,7 @@ EXPORT_SYMBOL(alloc_file);
* to write to @file, along with access to write through
* its vfsmount.
*/
-void drop_file_write_access(struct file *file)
+static void drop_file_write_access(struct file *file)
{
struct vfsmount *mnt = file->f_path.mnt;
struct dentry *dentry = file->f_path.dentry;
@@ -219,7 +219,6 @@ void drop_file_write_access(struct file *file)
mnt_drop_write(mnt);
file_release_write(file);
}
-EXPORT_SYMBOL_GPL(drop_file_write_access);
/* the real guts of fput() - releasing the last reference to file
*/
diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c
index 9d1c9955838..d4fabd26084 100644
--- a/fs/freevxfs/vxfs_super.c
+++ b/fs/freevxfs/vxfs_super.c
@@ -224,9 +224,8 @@ static int vxfs_fill_super(struct super_block *sbp, void *dp, int silent)
ret = PTR_ERR(root);
goto out;
}
- sbp->s_root = d_alloc_root(root);
+ sbp->s_root = d_make_root(root);
if (!sbp->s_root) {
- iput(root);
printk(KERN_WARNING "vxfs: unable to get root dentry.\n");
goto out_free_ilist;
}
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index f855916657b..539f36cf3e4 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -14,7 +14,7 @@
*/
#include <linux/kernel.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/sched.h>
@@ -53,14 +53,6 @@ struct wb_writeback_work {
};
/*
- * Include the creation of the trace points after defining the
- * wb_writeback_work structure so that the definition remains local to this
- * file.
- */
-#define CREATE_TRACE_POINTS
-#include <trace/events/writeback.h>
-
-/*
* We don't actually have pdflush, but this one is exported though /proc...
*/
int nr_pdflush_threads;
@@ -92,6 +84,14 @@ static inline struct inode *wb_inode(struct list_head *head)
return list_entry(head, struct inode, i_wb_list);
}
+/*
+ * Include the creation of the trace points after defining the
+ * wb_writeback_work structure and inline functions so that the definition
+ * remains local to this file.
+ */
+#define CREATE_TRACE_POINTS
+#include <trace/events/writeback.h>
+
/* Wakeup flusher thread or forker thread to fork it. Requires bdi->wb_lock. */
static void bdi_wakeup_flusher(struct backing_dev_info *bdi)
{
@@ -256,7 +256,8 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t)
}
/*
- * Move expired dirty inodes from @delaying_queue to @dispatch_queue.
+ * Move expired (dirtied after work->older_than_this) dirty inodes from
+ * @delaying_queue to @dispatch_queue.
*/
static int move_expired_inodes(struct list_head *delaying_queue,
struct list_head *dispatch_queue,
@@ -1148,23 +1149,6 @@ out_unlock_inode:
}
EXPORT_SYMBOL(__mark_inode_dirty);
-/*
- * Write out a superblock's list of dirty inodes. A wait will be performed
- * upon no inodes, all inodes or the final one, depending upon sync_mode.
- *
- * If older_than_this is non-NULL, then only write out inodes which
- * had their first dirtying at a time earlier than *older_than_this.
- *
- * If `bdi' is non-zero then we're being asked to writeback a specific queue.
- * This function assumes that the blockdev superblock's inodes are backed by
- * a variety of queues, so all inodes are searched. For other superblocks,
- * assume that all inodes are backed by the same queue.
- *
- * The inodes to be written are parked on bdi->b_io. They are moved back onto
- * bdi->b_dirty as they are selected for writing. This way, none can be missed
- * on the writer throttling path, and we get decent balancing between many
- * throttled threads: we don't want them all piling up on inode_sync_wait.
- */
static void wait_sb_inodes(struct super_block *sb)
{
struct inode *inode, *old_inode = NULL;
@@ -1284,7 +1268,7 @@ int writeback_inodes_sb_if_idle(struct super_block *sb, enum wb_reason reason)
EXPORT_SYMBOL(writeback_inodes_sb_if_idle);
/**
- * writeback_inodes_sb_if_idle - start writeback if none underway
+ * writeback_inodes_sb_nr_if_idle - start writeback if none underway
* @sb: the superblock
* @nr: the number of pages to write
* @reason: reason why some writeback work was initiated
@@ -1364,8 +1348,6 @@ int write_inode_now(struct inode *inode, int sync)
ret = writeback_single_inode(inode, wb, &wbc);
spin_unlock(&inode->i_lock);
spin_unlock(&wb->list_lock);
- if (sync)
- inode_sync_wait(inode);
return ret;
}
EXPORT_SYMBOL(write_inode_now);
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index 78b519c1353..e159e682ad4 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -1,4 +1,4 @@
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/path.h>
@@ -26,11 +26,11 @@ void set_fs_root(struct fs_struct *fs, struct path *path)
{
struct path old_root;
+ path_get_longterm(path);
spin_lock(&fs->lock);
write_seqcount_begin(&fs->seq);
old_root = fs->root;
fs->root = *path;
- path_get_longterm(path);
write_seqcount_end(&fs->seq);
spin_unlock(&fs->lock);
if (old_root.dentry)
@@ -45,11 +45,11 @@ void set_fs_pwd(struct fs_struct *fs, struct path *path)
{
struct path old_pwd;
+ path_get_longterm(path);
spin_lock(&fs->lock);
write_seqcount_begin(&fs->seq);
old_pwd = fs->pwd;
fs->pwd = *path;
- path_get_longterm(path);
write_seqcount_end(&fs->seq);
spin_unlock(&fs->lock);
@@ -57,6 +57,14 @@ void set_fs_pwd(struct fs_struct *fs, struct path *path)
path_put_longterm(&old_pwd);
}
+static inline int replace_path(struct path *p, const struct path *old, const struct path *new)
+{
+ if (likely(p->dentry != old->dentry || p->mnt != old->mnt))
+ return 0;
+ *p = *new;
+ return 1;
+}
+
void chroot_fs_refs(struct path *old_root, struct path *new_root)
{
struct task_struct *g, *p;
@@ -68,21 +76,16 @@ void chroot_fs_refs(struct path *old_root, struct path *new_root)
task_lock(p);
fs = p->fs;
if (fs) {
+ int hits = 0;
spin_lock(&fs->lock);
write_seqcount_begin(&fs->seq);
- if (fs->root.dentry == old_root->dentry
- && fs->root.mnt == old_root->mnt) {
- path_get_longterm(new_root);
- fs->root = *new_root;
+ hits += replace_path(&fs->root, old_root, new_root);
+ hits += replace_path(&fs->pwd, old_root, new_root);
+ write_seqcount_end(&fs->seq);
+ while (hits--) {
count++;
- }
- if (fs->pwd.dentry == old_root->dentry
- && fs->pwd.mnt == old_root->mnt) {
path_get_longterm(new_root);
- fs->pwd = *new_root;
- count++;
}
- write_seqcount_end(&fs->seq);
spin_unlock(&fs->lock);
}
task_unlock(p);
@@ -107,10 +110,8 @@ void exit_fs(struct task_struct *tsk)
int kill;
task_lock(tsk);
spin_lock(&fs->lock);
- write_seqcount_begin(&fs->seq);
tsk->fs = NULL;
kill = !--fs->users;
- write_seqcount_end(&fs->seq);
spin_unlock(&fs->lock);
task_unlock(tsk);
if (kill)
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 2aaf3eaaf13..7df2b5e8fbe 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -838,10 +838,10 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep,
}
}
if (page) {
- void *mapaddr = kmap_atomic(page, KM_USER0);
+ void *mapaddr = kmap_atomic(page);
void *buf = mapaddr + offset;
offset += fuse_copy_do(cs, &buf, &count);
- kunmap_atomic(mapaddr, KM_USER0);
+ kunmap_atomic(mapaddr);
} else
offset += fuse_copy_do(cs, NULL, &count);
}
@@ -1378,7 +1378,59 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size,
down_read(&fc->killsb);
err = -ENOENT;
if (fc->sb)
- err = fuse_reverse_inval_entry(fc->sb, outarg.parent, &name);
+ err = fuse_reverse_inval_entry(fc->sb, outarg.parent, 0, &name);
+ up_read(&fc->killsb);
+ kfree(buf);
+ return err;
+
+err:
+ kfree(buf);
+ fuse_copy_finish(cs);
+ return err;
+}
+
+static int fuse_notify_delete(struct fuse_conn *fc, unsigned int size,
+ struct fuse_copy_state *cs)
+{
+ struct fuse_notify_delete_out outarg;
+ int err = -ENOMEM;
+ char *buf;
+ struct qstr name;
+
+ buf = kzalloc(FUSE_NAME_MAX + 1, GFP_KERNEL);
+ if (!buf)
+ goto err;
+
+ err = -EINVAL;
+ if (size < sizeof(outarg))
+ goto err;
+
+ err = fuse_copy_one(cs, &outarg, sizeof(outarg));
+ if (err)
+ goto err;
+
+ err = -ENAMETOOLONG;
+ if (outarg.namelen > FUSE_NAME_MAX)
+ goto err;
+
+ err = -EINVAL;
+ if (size != sizeof(outarg) + outarg.namelen + 1)
+ goto err;
+
+ name.name = buf;
+ name.len = outarg.namelen;
+ err = fuse_copy_one(cs, buf, outarg.namelen + 1);
+ if (err)
+ goto err;
+ fuse_copy_finish(cs);
+ buf[outarg.namelen] = 0;
+ name.hash = full_name_hash(name.name, name.len);
+
+ down_read(&fc->killsb);
+ err = -ENOENT;
+ if (fc->sb)
+ err = fuse_reverse_inval_entry(fc->sb, outarg.parent,
+ outarg.child, &name);
up_read(&fc->killsb);
kfree(buf);
return err;
@@ -1597,6 +1649,9 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
case FUSE_NOTIFY_RETRIEVE:
return fuse_notify_retrieve(fc, size, cs);
+ case FUSE_NOTIFY_DELETE:
+ return fuse_notify_delete(fc, size, cs);
+
default:
fuse_copy_finish(cs);
return -EINVAL;
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 5ddd6ea8f83..206632887bb 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -868,7 +868,7 @@ int fuse_update_attributes(struct inode *inode, struct kstat *stat,
}
int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
- struct qstr *name)
+ u64 child_nodeid, struct qstr *name)
{
int err = -ENOTDIR;
struct inode *parent;
@@ -895,8 +895,36 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
fuse_invalidate_attr(parent);
fuse_invalidate_entry(entry);
+
+ if (child_nodeid != 0 && entry->d_inode) {
+ mutex_lock(&entry->d_inode->i_mutex);
+ if (get_node_id(entry->d_inode) != child_nodeid) {
+ err = -ENOENT;
+ goto badentry;
+ }
+ if (d_mountpoint(entry)) {
+ err = -EBUSY;
+ goto badentry;
+ }
+ if (S_ISDIR(entry->d_inode->i_mode)) {
+ shrink_dcache_parent(entry);
+ if (!simple_empty(entry)) {
+ err = -ENOTEMPTY;
+ goto badentry;
+ }
+ entry->d_inode->i_flags |= S_DEAD;
+ }
+ dont_mount(entry);
+ clear_nlink(entry->d_inode);
+ err = 0;
+ badentry:
+ mutex_unlock(&entry->d_inode->i_mutex);
+ if (!err)
+ d_delete(entry);
+ } else {
+ err = 0;
+ }
dput(entry);
- err = 0;
unlock:
mutex_unlock(&parent->i_mutex);
@@ -1182,6 +1210,30 @@ static int fuse_dir_fsync(struct file *file, loff_t start, loff_t end,
return fuse_fsync_common(file, start, end, datasync, 1);
}
+static long fuse_dir_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ struct fuse_conn *fc = get_fuse_conn(file->f_mapping->host);
+
+ /* FUSE_IOCTL_DIR only supported for API version >= 7.18 */
+ if (fc->minor < 18)
+ return -ENOTTY;
+
+ return fuse_ioctl_common(file, cmd, arg, FUSE_IOCTL_DIR);
+}
+
+static long fuse_dir_compat_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ struct fuse_conn *fc = get_fuse_conn(file->f_mapping->host);
+
+ if (fc->minor < 18)
+ return -ENOTTY;
+
+ return fuse_ioctl_common(file, cmd, arg,
+ FUSE_IOCTL_COMPAT | FUSE_IOCTL_DIR);
+}
+
static bool update_mtime(unsigned ivalid)
{
/* Always update if mtime is explicitly set */
@@ -1596,6 +1648,8 @@ static const struct file_operations fuse_dir_operations = {
.open = fuse_dir_open,
.release = fuse_dir_release,
.fsync = fuse_dir_fsync,
+ .unlocked_ioctl = fuse_dir_ioctl,
+ .compat_ioctl = fuse_dir_compat_ioctl,
};
static const struct inode_operations fuse_common_inode_operations = {
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 0c84100acd4..a841868bf9c 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1555,48 +1555,16 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int origin)
loff_t retval;
struct inode *inode = file->f_path.dentry->d_inode;
- mutex_lock(&inode->i_mutex);
- if (origin != SEEK_CUR && origin != SEEK_SET) {
- retval = fuse_update_attributes(inode, NULL, file, NULL);
- if (retval)
- goto exit;
- }
+ /* No i_mutex protection necessary for SEEK_CUR and SEEK_SET */
+ if (origin == SEEK_CUR || origin == SEEK_SET)
+ return generic_file_llseek(file, offset, origin);
- switch (origin) {
- case SEEK_END:
- offset += i_size_read(inode);
- break;
- case SEEK_CUR:
- if (offset == 0) {
- retval = file->f_pos;
- goto exit;
- }
- offset += file->f_pos;
- break;
- case SEEK_DATA:
- if (offset >= i_size_read(inode)) {
- retval = -ENXIO;
- goto exit;
- }
- break;
- case SEEK_HOLE:
- if (offset >= i_size_read(inode)) {
- retval = -ENXIO;
- goto exit;
- }
- offset = i_size_read(inode);
- break;
- }
- retval = -EINVAL;
- if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) {
- if (offset != file->f_pos) {
- file->f_pos = offset;
- file->f_version = 0;
- }
- retval = offset;
- }
-exit:
+ mutex_lock(&inode->i_mutex);
+ retval = fuse_update_attributes(inode, NULL, file, NULL);
+ if (!retval)
+ retval = generic_file_llseek(file, offset, origin);
mutex_unlock(&inode->i_mutex);
+
return retval;
}
@@ -1808,7 +1776,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE);
err = -ENOMEM;
- pages = kzalloc(sizeof(pages[0]) * FUSE_MAX_PAGES_PER_REQ, GFP_KERNEL);
+ pages = kcalloc(FUSE_MAX_PAGES_PER_REQ, sizeof(pages[0]), GFP_KERNEL);
iov_page = (struct iovec *) __get_free_page(GFP_KERNEL);
if (!pages || !iov_page)
goto out;
@@ -1919,11 +1887,11 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV)
goto out;
- vaddr = kmap_atomic(pages[0], KM_USER0);
+ vaddr = kmap_atomic(pages[0]);
err = fuse_copy_ioctl_iovec(fc, iov_page, vaddr,
transferred, in_iovs + out_iovs,
(flags & FUSE_IOCTL_COMPAT) != 0);
- kunmap_atomic(vaddr, KM_USER0);
+ kunmap_atomic(vaddr);
if (err)
goto out;
@@ -1958,8 +1926,8 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
}
EXPORT_SYMBOL_GPL(fuse_do_ioctl);
-static long fuse_file_ioctl_common(struct file *file, unsigned int cmd,
- unsigned long arg, unsigned int flags)
+long fuse_ioctl_common(struct file *file, unsigned int cmd,
+ unsigned long arg, unsigned int flags)
{
struct inode *inode = file->f_dentry->d_inode;
struct fuse_conn *fc = get_fuse_conn(inode);
@@ -1976,13 +1944,13 @@ static long fuse_file_ioctl_common(struct file *file, unsigned int cmd,
static long fuse_file_ioctl(struct file *file, unsigned int cmd,
unsigned long arg)
{
- return fuse_file_ioctl_common(file, cmd, arg, 0);
+ return fuse_ioctl_common(file, cmd, arg, 0);
}
static long fuse_file_compat_ioctl(struct file *file, unsigned int cmd,
unsigned long arg)
{
- return fuse_file_ioctl_common(file, cmd, arg, FUSE_IOCTL_COMPAT);
+ return fuse_ioctl_common(file, cmd, arg, FUSE_IOCTL_COMPAT);
}
/*
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 1964da0257d..572cefc7801 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -755,9 +755,15 @@ int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid,
/**
* File-system tells the kernel to invalidate parent attributes and
* the dentry matching parent/name.
+ *
+ * If the child_nodeid is non-zero and:
+ * - matches the inode number for the dentry matching parent/name,
+ * - is not a mount point
+ * - is a file or oan empty directory
+ * then the dentry is unhashed (d_delete()).
*/
int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
- struct qstr *name);
+ u64 child_nodeid, struct qstr *name);
int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
bool isdir);
@@ -765,6 +771,8 @@ ssize_t fuse_direct_io(struct file *file, const char __user *buf,
size_t count, loff_t *ppos, int write);
long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
unsigned int flags);
+long fuse_ioctl_common(struct file *file, unsigned int cmd,
+ unsigned long arg, unsigned int flags);
unsigned fuse_file_poll(struct file *file, poll_table *wait);
int fuse_dev_release(struct inode *inode, struct file *file);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 64cf8d07393..4aec5995867 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -988,14 +988,9 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
err = -ENOMEM;
root = fuse_get_root_inode(sb, d.rootmode);
- if (!root)
+ root_dentry = d_make_root(root);
+ if (!root_dentry)
goto err_put_conn;
-
- root_dentry = d_alloc_root(root);
- if (!root_dentry) {
- iput(root);
- goto err_put_conn;
- }
/* only now - we want root dentry with NULL ->d_op */
sb->s_d_op = &fuse_dentry_operations;
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index c465ae066c6..eb08c9e43c2 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -1,10 +1,6 @@
config GFS2_FS
tristate "GFS2 file system support"
depends on (64BIT || LBDAF)
- select DLM if GFS2_FS_LOCKING_DLM
- select CONFIGFS_FS if GFS2_FS_LOCKING_DLM
- select SYSFS if GFS2_FS_LOCKING_DLM
- select IP_SCTP if DLM_SCTP
select FS_POSIX_ACL
select CRC32
select QUOTACTL
@@ -29,7 +25,8 @@ config GFS2_FS
config GFS2_FS_LOCKING_DLM
bool "GFS2 DLM locking"
- depends on (GFS2_FS!=n) && NET && INET && (IPV6 || IPV6=n) && HOTPLUG
+ depends on (GFS2_FS!=n) && NET && INET && (IPV6 || IPV6=n) && \
+ HOTPLUG && DLM && CONFIGFS_FS && SYSFS
help
Multiple node locking module for GFS2
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 501e5cba09b..9b2ff0e851b 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -434,12 +434,12 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
if (error)
return error;
- kaddr = kmap_atomic(page, KM_USER0);
+ kaddr = kmap_atomic(page);
if (dsize > (dibh->b_size - sizeof(struct gfs2_dinode)))
dsize = (dibh->b_size - sizeof(struct gfs2_dinode));
memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);
memset(kaddr + dsize, 0, PAGE_CACHE_SIZE - dsize);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
flush_dcache_page(page);
brelse(dibh);
SetPageUptodate(page);
@@ -542,9 +542,9 @@ int gfs2_internal_read(struct gfs2_inode *ip, struct file_ra_state *ra_state,
page = read_cache_page(mapping, index, __gfs2_readpage, NULL);
if (IS_ERR(page))
return PTR_ERR(page);
- p = kmap_atomic(page, KM_USER0);
+ p = kmap_atomic(page);
memcpy(buf + copied, p + offset, amt);
- kunmap_atomic(p, KM_USER0);
+ kunmap_atomic(p);
mark_page_accessed(page);
page_cache_release(page);
copied += amt;
@@ -788,11 +788,11 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
unsigned char *buf = dibh->b_data + sizeof(struct gfs2_dinode);
BUG_ON((pos + len) > (dibh->b_size - sizeof(struct gfs2_dinode)));
- kaddr = kmap_atomic(page, KM_USER0);
+ kaddr = kmap_atomic(page);
memcpy(buf + pos, kaddr + pos, copied);
memset(kaddr + pos + copied, 0, len - copied);
flush_dcache_page(page);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
if (!PageUptodate(page))
SetPageUptodate(page);
@@ -807,7 +807,7 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
if (inode == sdp->sd_rindex) {
adjust_fs_space(inode);
- ip->i_gh.gh_flags |= GL_NOCACHE;
+ sdp->sd_rindex_uptodate = 0;
}
brelse(dibh);
@@ -873,7 +873,7 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
if (inode == sdp->sd_rindex) {
adjust_fs_space(inode);
- ip->i_gh.gh_flags |= GL_NOCACHE;
+ sdp->sd_rindex_uptodate = 0;
}
brelse(dibh);
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 14a70401597..03c04febe26 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -60,7 +60,7 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
int release = 0;
if (!page || page->index) {
- page = grab_cache_page(inode->i_mapping, 0);
+ page = find_or_create_page(inode->i_mapping, 0, GFP_NOFS);
if (!page)
return -ENOMEM;
release = 1;
@@ -724,7 +724,11 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
int metadata;
unsigned int revokes = 0;
int x;
- int error = 0;
+ int error;
+
+ error = gfs2_rindex_update(sdp);
+ if (error)
+ return error;
if (!*top)
sm->sm_first = 0;
@@ -930,7 +934,7 @@ static int gfs2_block_truncate_page(struct address_space *mapping, loff_t from)
struct page *page;
int err;
- page = grab_cache_page(mapping, index);
+ page = find_or_create_page(mapping, index, GFP_NOFS);
if (!page)
return 0;
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index c35573abd37..a836056343f 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -1844,6 +1844,10 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
unsigned int x, size = len * sizeof(u64);
int error;
+ error = gfs2_rindex_update(sdp);
+ if (error)
+ return error;
+
memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
ht = kzalloc(size, GFP_NOFS);
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index c5fb3597f69..a3d2c9ee8d6 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -18,7 +18,6 @@
#include <linux/mount.h>
#include <linux/fs.h>
#include <linux/gfs2_ondisk.h>
-#include <linux/ext2_fs.h>
#include <linux/falloc.h>
#include <linux/swap.h>
#include <linux/crc32.h>
@@ -313,6 +312,8 @@ static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return gfs2_get_flags(filp, (u32 __user *)arg);
case FS_IOC_SETFLAGS:
return gfs2_set_flags(filp, (u32 __user *)arg);
+ case FITRIM:
+ return gfs2_fitrim(filp, (void __user *)arg);
}
return -ENOTTY;
}
@@ -674,6 +675,7 @@ static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
struct gfs2_inode *ip = GFS2_I(inode);
struct buffer_head *dibh;
int error;
+ loff_t size = len;
unsigned int nr_blks;
sector_t lblock = offset >> inode->i_blkbits;
@@ -707,8 +709,8 @@ static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
goto out;
}
}
- if (offset + len > inode->i_size && !(mode & FALLOC_FL_KEEP_SIZE))
- i_size_write(inode, offset + len);
+ if (offset + size > inode->i_size && !(mode & FALLOC_FL_KEEP_SIZE))
+ i_size_write(inode, offset + size);
mark_inode_dirty(inode);
@@ -777,12 +779,14 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
if (unlikely(error))
goto out_uninit;
- if (!gfs2_write_alloc_required(ip, offset, len))
- goto out_unlock;
-
while (len > 0) {
if (len < bytes)
bytes = len;
+ if (!gfs2_write_alloc_required(ip, offset, bytes)) {
+ len -= bytes;
+ offset += bytes;
+ continue;
+ }
qa = gfs2_qadata_get(ip);
if (!qa) {
error = -ENOMEM;
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 88e8a23d002..dab2526071c 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -29,6 +29,7 @@
#include <linux/rcupdate.h>
#include <linux/rculist_bl.h>
#include <linux/bit_spinlock.h>
+#include <linux/percpu.h>
#include "gfs2.h"
#include "incore.h"
@@ -167,14 +168,19 @@ void gfs2_glock_add_to_lru(struct gfs2_glock *gl)
spin_unlock(&lru_lock);
}
-static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl)
+static void __gfs2_glock_remove_from_lru(struct gfs2_glock *gl)
{
- spin_lock(&lru_lock);
if (!list_empty(&gl->gl_lru)) {
list_del_init(&gl->gl_lru);
atomic_dec(&lru_count);
clear_bit(GLF_LRU, &gl->gl_flags);
}
+}
+
+static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl)
+{
+ spin_lock(&lru_lock);
+ __gfs2_glock_remove_from_lru(gl);
spin_unlock(&lru_lock);
}
@@ -217,11 +223,12 @@ void gfs2_glock_put(struct gfs2_glock *gl)
struct gfs2_sbd *sdp = gl->gl_sbd;
struct address_space *mapping = gfs2_glock2aspace(gl);
- if (atomic_dec_and_test(&gl->gl_ref)) {
+ if (atomic_dec_and_lock(&gl->gl_ref, &lru_lock)) {
+ __gfs2_glock_remove_from_lru(gl);
+ spin_unlock(&lru_lock);
spin_lock_bucket(gl->gl_hash);
hlist_bl_del_rcu(&gl->gl_list);
spin_unlock_bucket(gl->gl_hash);
- gfs2_glock_remove_from_lru(gl);
GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders));
GLOCK_BUG_ON(gl, mapping && mapping->nrpages);
trace_gfs2_glock_put(gl);
@@ -537,6 +544,11 @@ __acquires(&gl->gl_spin)
do_error(gl, 0); /* Fail queued try locks */
}
gl->gl_req = target;
+ set_bit(GLF_BLOCKING, &gl->gl_flags);
+ if ((gl->gl_req == LM_ST_UNLOCKED) ||
+ (gl->gl_state == LM_ST_EXCLUSIVE) ||
+ (lck_flags & (LM_FLAG_TRY|LM_FLAG_TRY_1CB)))
+ clear_bit(GLF_BLOCKING, &gl->gl_flags);
spin_unlock(&gl->gl_spin);
if (glops->go_xmote_th)
glops->go_xmote_th(gl);
@@ -738,6 +750,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
return -ENOMEM;
atomic_inc(&sdp->sd_glock_disposal);
+ gl->gl_sbd = sdp;
gl->gl_flags = 0;
gl->gl_name = name;
atomic_set(&gl->gl_ref, 1);
@@ -746,12 +759,17 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
gl->gl_demote_state = LM_ST_EXCLUSIVE;
gl->gl_hash = hash;
gl->gl_ops = glops;
- snprintf(gl->gl_strname, GDLM_STRNAME_BYTES, "%8x%16llx", name.ln_type, (unsigned long long)number);
+ gl->gl_dstamp = ktime_set(0, 0);
+ preempt_disable();
+ /* We use the global stats to estimate the initial per-glock stats */
+ gl->gl_stats = this_cpu_ptr(sdp->sd_lkstats)->lkstats[glops->go_type];
+ preempt_enable();
+ gl->gl_stats.stats[GFS2_LKS_DCOUNT] = 0;
+ gl->gl_stats.stats[GFS2_LKS_QCOUNT] = 0;
memset(&gl->gl_lksb, 0, sizeof(struct dlm_lksb));
gl->gl_lksb.sb_lvbptr = gl->gl_lvb;
gl->gl_tchange = jiffies;
gl->gl_object = NULL;
- gl->gl_sbd = sdp;
gl->gl_hold_time = GL_GLOCK_DFT_HOLD;
INIT_DELAYED_WORK(&gl->gl_work, glock_work_func);
INIT_WORK(&gl->gl_delete, delete_work_func);
@@ -993,6 +1011,8 @@ fail:
}
set_bit(GLF_QUEUED, &gl->gl_flags);
trace_gfs2_glock_queue(gh, 1);
+ gfs2_glstats_inc(gl, GFS2_LKS_QCOUNT);
+ gfs2_sbstats_inc(gl, GFS2_LKS_QCOUNT);
if (likely(insert_pt == NULL)) {
list_add_tail(&gh->gh_list, &gl->gl_holders);
if (unlikely(gh->gh_flags & LM_FLAG_PRIORITY))
@@ -1353,7 +1373,7 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
spin_lock(&gl->gl_spin);
gl->gl_reply = ret;
- if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags))) {
+ if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags))) {
if (gfs2_should_freeze(gl)) {
set_bit(GLF_FROZEN, &gl->gl_flags);
spin_unlock(&gl->gl_spin);
@@ -1652,6 +1672,8 @@ static const char *gflags2str(char *buf, const struct gfs2_glock *gl)
*p++ = 'L';
if (gl->gl_object)
*p++ = 'o';
+ if (test_bit(GLF_BLOCKING, gflags))
+ *p++ = 'b';
*p = 0;
return buf;
}
@@ -1708,8 +1730,78 @@ out:
return error;
}
+static int gfs2_glstats_seq_show(struct seq_file *seq, void *iter_ptr)
+{
+ struct gfs2_glock *gl = iter_ptr;
+
+ seq_printf(seq, "G: n:%u/%llx rtt:%lld/%lld rttb:%lld/%lld irt:%lld/%lld dcnt: %lld qcnt: %lld\n",
+ gl->gl_name.ln_type,
+ (unsigned long long)gl->gl_name.ln_number,
+ (long long)gl->gl_stats.stats[GFS2_LKS_SRTT],
+ (long long)gl->gl_stats.stats[GFS2_LKS_SRTTVAR],
+ (long long)gl->gl_stats.stats[GFS2_LKS_SRTTB],
+ (long long)gl->gl_stats.stats[GFS2_LKS_SRTTVARB],
+ (long long)gl->gl_stats.stats[GFS2_LKS_SIRT],
+ (long long)gl->gl_stats.stats[GFS2_LKS_SIRTVAR],
+ (long long)gl->gl_stats.stats[GFS2_LKS_DCOUNT],
+ (long long)gl->gl_stats.stats[GFS2_LKS_QCOUNT]);
+ return 0;
+}
+
+static const char *gfs2_gltype[] = {
+ "type",
+ "reserved",
+ "nondisk",
+ "inode",
+ "rgrp",
+ "meta",
+ "iopen",
+ "flock",
+ "plock",
+ "quota",
+ "journal",
+};
+
+static const char *gfs2_stype[] = {
+ [GFS2_LKS_SRTT] = "srtt",
+ [GFS2_LKS_SRTTVAR] = "srttvar",
+ [GFS2_LKS_SRTTB] = "srttb",
+ [GFS2_LKS_SRTTVARB] = "srttvarb",
+ [GFS2_LKS_SIRT] = "sirt",
+ [GFS2_LKS_SIRTVAR] = "sirtvar",
+ [GFS2_LKS_DCOUNT] = "dlm",
+ [GFS2_LKS_QCOUNT] = "queue",
+};
+
+#define GFS2_NR_SBSTATS (ARRAY_SIZE(gfs2_gltype) * ARRAY_SIZE(gfs2_stype))
+
+static int gfs2_sbstats_seq_show(struct seq_file *seq, void *iter_ptr)
+{
+ struct gfs2_glock_iter *gi = seq->private;
+ struct gfs2_sbd *sdp = gi->sdp;
+ unsigned index = gi->hash >> 3;
+ unsigned subindex = gi->hash & 0x07;
+ s64 value;
+ int i;
+
+ if (index == 0 && subindex != 0)
+ return 0;
+ seq_printf(seq, "%-10s %8s:", gfs2_gltype[index],
+ (index == 0) ? "cpu": gfs2_stype[subindex]);
+ for_each_possible_cpu(i) {
+ const struct gfs2_pcpu_lkstats *lkstats = per_cpu_ptr(sdp->sd_lkstats, i);
+ if (index == 0) {
+ value = i;
+ } else {
+ value = lkstats->lkstats[index - 1].stats[subindex];
+ }
+ seq_printf(seq, " %15lld", (long long)value);
+ }
+ seq_putc(seq, '\n');
+ return 0;
+}
int __init gfs2_glock_init(void)
{
@@ -1822,6 +1914,35 @@ static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr)
return dump_glock(seq, iter_ptr);
}
+static void *gfs2_sbstats_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct gfs2_glock_iter *gi = seq->private;
+
+ gi->hash = *pos;
+ if (*pos >= GFS2_NR_SBSTATS)
+ return NULL;
+ preempt_disable();
+ return SEQ_START_TOKEN;
+}
+
+static void *gfs2_sbstats_seq_next(struct seq_file *seq, void *iter_ptr,
+ loff_t *pos)
+{
+ struct gfs2_glock_iter *gi = seq->private;
+ (*pos)++;
+ gi->hash++;
+ if (gi->hash >= GFS2_NR_SBSTATS) {
+ preempt_enable();
+ return NULL;
+ }
+ return SEQ_START_TOKEN;
+}
+
+static void gfs2_sbstats_seq_stop(struct seq_file *seq, void *iter_ptr)
+{
+ preempt_enable();
+}
+
static const struct seq_operations gfs2_glock_seq_ops = {
.start = gfs2_glock_seq_start,
.next = gfs2_glock_seq_next,
@@ -1829,7 +1950,21 @@ static const struct seq_operations gfs2_glock_seq_ops = {
.show = gfs2_glock_seq_show,
};
-static int gfs2_debugfs_open(struct inode *inode, struct file *file)
+static const struct seq_operations gfs2_glstats_seq_ops = {
+ .start = gfs2_glock_seq_start,
+ .next = gfs2_glock_seq_next,
+ .stop = gfs2_glock_seq_stop,
+ .show = gfs2_glstats_seq_show,
+};
+
+static const struct seq_operations gfs2_sbstats_seq_ops = {
+ .start = gfs2_sbstats_seq_start,
+ .next = gfs2_sbstats_seq_next,
+ .stop = gfs2_sbstats_seq_stop,
+ .show = gfs2_sbstats_seq_show,
+};
+
+static int gfs2_glocks_open(struct inode *inode, struct file *file)
{
int ret = seq_open_private(file, &gfs2_glock_seq_ops,
sizeof(struct gfs2_glock_iter));
@@ -1841,9 +1976,49 @@ static int gfs2_debugfs_open(struct inode *inode, struct file *file)
return ret;
}
-static const struct file_operations gfs2_debug_fops = {
+static int gfs2_glstats_open(struct inode *inode, struct file *file)
+{
+ int ret = seq_open_private(file, &gfs2_glstats_seq_ops,
+ sizeof(struct gfs2_glock_iter));
+ if (ret == 0) {
+ struct seq_file *seq = file->private_data;
+ struct gfs2_glock_iter *gi = seq->private;
+ gi->sdp = inode->i_private;
+ }
+ return ret;
+}
+
+static int gfs2_sbstats_open(struct inode *inode, struct file *file)
+{
+ int ret = seq_open_private(file, &gfs2_sbstats_seq_ops,
+ sizeof(struct gfs2_glock_iter));
+ if (ret == 0) {
+ struct seq_file *seq = file->private_data;
+ struct gfs2_glock_iter *gi = seq->private;
+ gi->sdp = inode->i_private;
+ }
+ return ret;
+}
+
+static const struct file_operations gfs2_glocks_fops = {
+ .owner = THIS_MODULE,
+ .open = gfs2_glocks_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+
+static const struct file_operations gfs2_glstats_fops = {
.owner = THIS_MODULE,
- .open = gfs2_debugfs_open,
+ .open = gfs2_glstats_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+
+static const struct file_operations gfs2_sbstats_fops = {
+ .owner = THIS_MODULE,
+ .open = gfs2_sbstats_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release_private,
@@ -1857,20 +2032,45 @@ int gfs2_create_debugfs_file(struct gfs2_sbd *sdp)
sdp->debugfs_dentry_glocks = debugfs_create_file("glocks",
S_IFREG | S_IRUGO,
sdp->debugfs_dir, sdp,
- &gfs2_debug_fops);
+ &gfs2_glocks_fops);
if (!sdp->debugfs_dentry_glocks)
- return -ENOMEM;
+ goto fail;
+
+ sdp->debugfs_dentry_glstats = debugfs_create_file("glstats",
+ S_IFREG | S_IRUGO,
+ sdp->debugfs_dir, sdp,
+ &gfs2_glstats_fops);
+ if (!sdp->debugfs_dentry_glstats)
+ goto fail;
+
+ sdp->debugfs_dentry_sbstats = debugfs_create_file("sbstats",
+ S_IFREG | S_IRUGO,
+ sdp->debugfs_dir, sdp,
+ &gfs2_sbstats_fops);
+ if (!sdp->debugfs_dentry_sbstats)
+ goto fail;
return 0;
+fail:
+ gfs2_delete_debugfs_file(sdp);
+ return -ENOMEM;
}
void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp)
{
- if (sdp && sdp->debugfs_dir) {
+ if (sdp->debugfs_dir) {
if (sdp->debugfs_dentry_glocks) {
debugfs_remove(sdp->debugfs_dentry_glocks);
sdp->debugfs_dentry_glocks = NULL;
}
+ if (sdp->debugfs_dentry_glstats) {
+ debugfs_remove(sdp->debugfs_dentry_glstats);
+ sdp->debugfs_dentry_glstats = NULL;
+ }
+ if (sdp->debugfs_dentry_sbstats) {
+ debugfs_remove(sdp->debugfs_dentry_sbstats);
+ sdp->debugfs_dentry_sbstats = NULL;
+ }
debugfs_remove(sdp->debugfs_dir);
sdp->debugfs_dir = NULL;
}
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 2553b858a72..307ac31df78 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -121,8 +121,11 @@ enum {
struct lm_lockops {
const char *lm_proto_name;
- int (*lm_mount) (struct gfs2_sbd *sdp, const char *fsname);
- void (*lm_unmount) (struct gfs2_sbd *sdp);
+ int (*lm_mount) (struct gfs2_sbd *sdp, const char *table);
+ void (*lm_first_done) (struct gfs2_sbd *sdp);
+ void (*lm_recovery_result) (struct gfs2_sbd *sdp, unsigned int jid,
+ unsigned int result);
+ void (*lm_unmount) (struct gfs2_sbd *sdp);
void (*lm_withdraw) (struct gfs2_sbd *sdp);
void (*lm_put_lock) (struct gfs2_glock *gl);
int (*lm_lock) (struct gfs2_glock *gl, unsigned int req_state,
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index e1d3bb59945..47d0bda5ac2 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -19,6 +19,8 @@
#include <linux/rculist_bl.h>
#include <linux/completion.h>
#include <linux/rbtree.h>
+#include <linux/ktime.h>
+#include <linux/percpu.h>
#define DIO_WAIT 0x00000010
#define DIO_METADATA 0x00000020
@@ -139,8 +141,45 @@ struct gfs2_bufdata {
#define GDLM_STRNAME_BYTES 25
#define GDLM_LVB_SIZE 32
+/*
+ * ls_recover_flags:
+ *
+ * DFL_BLOCK_LOCKS: dlm is in recovery and will grant locks that had been
+ * held by failed nodes whose journals need recovery. Those locks should
+ * only be used for journal recovery until the journal recovery is done.
+ * This is set by the dlm recover_prep callback and cleared by the
+ * gfs2_control thread when journal recovery is complete. To avoid
+ * races between recover_prep setting and gfs2_control clearing, recover_spin
+ * is held while changing this bit and reading/writing recover_block
+ * and recover_start.
+ *
+ * DFL_NO_DLM_OPS: dlm lockspace ops/callbacks are not being used.
+ *
+ * DFL_FIRST_MOUNT: this node is the first to mount this fs and is doing
+ * recovery of all journals before allowing other nodes to mount the fs.
+ * This is cleared when FIRST_MOUNT_DONE is set.
+ *
+ * DFL_FIRST_MOUNT_DONE: this node was the first mounter, and has finished
+ * recovery of all journals, and now allows other nodes to mount the fs.
+ *
+ * DFL_MOUNT_DONE: gdlm_mount has completed successfully and cleared
+ * BLOCK_LOCKS for the first time. The gfs2_control thread should now
+ * control clearing BLOCK_LOCKS for further recoveries.
+ *
+ * DFL_UNMOUNT: gdlm_unmount sets to keep sdp off gfs2_control_wq.
+ *
+ * DFL_DLM_RECOVERY: set while dlm is in recovery, between recover_prep()
+ * and recover_done(), i.e. set while recover_block == recover_start.
+ */
+
enum {
DFL_BLOCK_LOCKS = 0,
+ DFL_NO_DLM_OPS = 1,
+ DFL_FIRST_MOUNT = 2,
+ DFL_FIRST_MOUNT_DONE = 3,
+ DFL_MOUNT_DONE = 4,
+ DFL_UNMOUNT = 5,
+ DFL_DLM_RECOVERY = 6,
};
struct lm_lockname {
@@ -168,6 +207,22 @@ struct gfs2_glock_operations {
};
enum {
+ GFS2_LKS_SRTT = 0, /* Non blocking smoothed round trip time */
+ GFS2_LKS_SRTTVAR = 1, /* Non blocking smoothed variance */
+ GFS2_LKS_SRTTB = 2, /* Blocking smoothed round trip time */
+ GFS2_LKS_SRTTVARB = 3, /* Blocking smoothed variance */
+ GFS2_LKS_SIRT = 4, /* Smoothed Inter-request time */
+ GFS2_LKS_SIRTVAR = 5, /* Smoothed Inter-request variance */
+ GFS2_LKS_DCOUNT = 6, /* Count of dlm requests */
+ GFS2_LKS_QCOUNT = 7, /* Count of gfs2_holder queues */
+ GFS2_NR_LKSTATS
+};
+
+struct gfs2_lkstats {
+ s64 stats[GFS2_NR_LKSTATS];
+};
+
+enum {
/* States */
HIF_HOLDER = 6, /* Set for gh that "holds" the glock */
HIF_FIRST = 7,
@@ -201,10 +256,12 @@ enum {
GLF_QUEUED = 12,
GLF_LRU = 13,
GLF_OBJECT = 14, /* Used only for tracing */
+ GLF_BLOCKING = 15,
};
struct gfs2_glock {
struct hlist_bl_node gl_list;
+ struct gfs2_sbd *gl_sbd;
unsigned long gl_flags; /* GLF_... */
struct lm_lockname gl_name;
atomic_t gl_ref;
@@ -224,16 +281,14 @@ struct gfs2_glock {
struct list_head gl_holders;
const struct gfs2_glock_operations *gl_ops;
- char gl_strname[GDLM_STRNAME_BYTES];
+ ktime_t gl_dstamp;
+ struct gfs2_lkstats gl_stats;
struct dlm_lksb gl_lksb;
char gl_lvb[32];
unsigned long gl_tchange;
void *gl_object;
struct list_head gl_lru;
-
- struct gfs2_sbd *gl_sbd;
-
struct list_head gl_ail_list;
atomic_t gl_ail_count;
atomic_t gl_revokes;
@@ -392,6 +447,7 @@ struct gfs2_jdesc {
#define JDF_RECOVERY 1
unsigned int jd_jid;
unsigned int jd_blocks;
+ int jd_recover_error;
};
struct gfs2_statfs_change_host {
@@ -461,6 +517,7 @@ enum {
SDF_NORECOVERY = 4,
SDF_DEMOTE = 5,
SDF_NOJOURNALID = 6,
+ SDF_RORECOVERY = 7, /* read only recovery */
};
#define GFS2_FSNAME_LEN 256
@@ -499,18 +556,36 @@ struct gfs2_sb_host {
struct lm_lockstruct {
int ls_jid;
unsigned int ls_first;
- unsigned int ls_first_done;
unsigned int ls_nodir;
const struct lm_lockops *ls_ops;
- unsigned long ls_flags;
dlm_lockspace_t *ls_dlm;
- int ls_recover_jid_done;
- int ls_recover_jid_status;
+ int ls_recover_jid_done; /* These two are deprecated, */
+ int ls_recover_jid_status; /* used previously by gfs_controld */
+
+ struct dlm_lksb ls_mounted_lksb; /* mounted_lock */
+ struct dlm_lksb ls_control_lksb; /* control_lock */
+ char ls_control_lvb[GDLM_LVB_SIZE]; /* control_lock lvb */
+ struct completion ls_sync_wait; /* {control,mounted}_{lock,unlock} */
+
+ spinlock_t ls_recover_spin; /* protects following fields */
+ unsigned long ls_recover_flags; /* DFL_ */
+ uint32_t ls_recover_mount; /* gen in first recover_done cb */
+ uint32_t ls_recover_start; /* gen in last recover_done cb */
+ uint32_t ls_recover_block; /* copy recover_start in last recover_prep */
+ uint32_t ls_recover_size; /* size of recover_submit, recover_result */
+ uint32_t *ls_recover_submit; /* gen in last recover_slot cb per jid */
+ uint32_t *ls_recover_result; /* result of last jid recovery */
+};
+
+struct gfs2_pcpu_lkstats {
+ /* One struct for each glock type */
+ struct gfs2_lkstats lkstats[10];
};
struct gfs2_sbd {
struct super_block *sd_vfs;
+ struct gfs2_pcpu_lkstats __percpu *sd_lkstats;
struct kobject sd_kobj;
unsigned long sd_flags; /* SDF_... */
struct gfs2_sb_host sd_sb;
@@ -544,6 +619,7 @@ struct gfs2_sbd {
wait_queue_head_t sd_glock_wait;
atomic_t sd_glock_disposal;
struct completion sd_locking_init;
+ struct delayed_work sd_control_work;
/* Inode Stuff */
@@ -568,7 +644,6 @@ struct gfs2_sbd {
int sd_rindex_uptodate;
spinlock_t sd_rindex_spin;
- struct mutex sd_rindex_mutex;
struct rb_root sd_rindex_tree;
unsigned int sd_rgrps;
unsigned int sd_max_rg_data;
@@ -673,8 +748,23 @@ struct gfs2_sbd {
unsigned long sd_last_warning;
struct dentry *debugfs_dir; /* debugfs directory */
- struct dentry *debugfs_dentry_glocks; /* for debugfs */
+ struct dentry *debugfs_dentry_glocks;
+ struct dentry *debugfs_dentry_glstats;
+ struct dentry *debugfs_dentry_sbstats;
};
+static inline void gfs2_glstats_inc(struct gfs2_glock *gl, int which)
+{
+ gl->gl_stats.stats[which]++;
+}
+
+static inline void gfs2_sbstats_inc(const struct gfs2_glock *gl, int which)
+{
+ const struct gfs2_sbd *sdp = gl->gl_sbd;
+ preempt_disable();
+ this_cpu_ptr(sdp->sd_lkstats)->lkstats[gl->gl_name.ln_type].stats[which]++;
+ preempt_enable();
+}
+
#endif /* __INCORE_DOT_H__ */
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 017960cf1d7..a9ba2444e07 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -391,10 +391,6 @@ static int alloc_dinode(struct gfs2_inode *dip, u64 *no_addr, u64 *generation)
int error;
int dblocks = 1;
- error = gfs2_rindex_update(sdp);
- if (error)
- fs_warn(sdp, "rindex update returns %d\n", error);
-
error = gfs2_inplace_reserve(dip, RES_DINODE);
if (error)
goto out;
@@ -599,9 +595,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
error = gfs2_meta_inode_buffer(ip, &dibh);
if (error)
goto fail_end_trans;
- inc_nlink(&ip->i_inode);
- if (S_ISDIR(ip->i_inode.i_mode))
- inc_nlink(&ip->i_inode);
+ set_nlink(&ip->i_inode, S_ISDIR(ip->i_inode.i_mode) ? 2 : 1);
gfs2_trans_add_bh(ip->i_gl, dibh, 1);
gfs2_dinode_out(ip, dibh->b_data);
brelse(dibh);
@@ -1037,14 +1031,21 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry)
struct buffer_head *bh;
struct gfs2_holder ghs[3];
struct gfs2_rgrpd *rgd;
- int error = -EROFS;
+ int error;
+
+ error = gfs2_rindex_update(sdp);
+ if (error)
+ return error;
+
+ error = -EROFS;
gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1);
- rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr);
+ rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr, 1);
if (!rgd)
goto out_inodes;
+
gfs2_holder_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + 2);
@@ -1229,6 +1230,10 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
return 0;
}
+ error = gfs2_rindex_update(sdp);
+ if (error)
+ return error;
+
if (odip != ndip) {
error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE,
0, &r_gh);
@@ -1260,7 +1265,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
* this is the case of the target file already existing
* so we unlink before doing the rename
*/
- nrgd = gfs2_blk2rgrpd(sdp, nip->i_no_addr);
+ nrgd = gfs2_blk2rgrpd(sdp, nip->i_no_addr, 1);
if (nrgd)
gfs2_holder_init(nrgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh++);
}
@@ -1350,7 +1355,6 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
error = alloc_required;
if (error < 0)
goto out_gunlock;
- error = 0;
if (alloc_required) {
struct gfs2_qadata *qa = gfs2_qadata_get(ndip);
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index ce85b62bc0a..f8411bd1b80 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -1,6 +1,6 @@
/*
* Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
- * Copyright (C) 2004-2009 Red Hat, Inc. All rights reserved.
+ * Copyright 2004-2011 Red Hat, Inc.
*
* This copyrighted material is made available to anyone wishing to use,
* modify, copy, or redistribute it subject to the terms and conditions
@@ -11,18 +11,113 @@
#include <linux/dlm.h>
#include <linux/slab.h>
#include <linux/types.h>
+#include <linux/delay.h>
#include <linux/gfs2_ondisk.h>
#include "incore.h"
#include "glock.h"
#include "util.h"
+#include "sys.h"
+#include "trace_gfs2.h"
+extern struct workqueue_struct *gfs2_control_wq;
+/**
+ * gfs2_update_stats - Update time based stats
+ * @mv: Pointer to mean/variance structure to update
+ * @sample: New data to include
+ *
+ * @delta is the difference between the current rtt sample and the
+ * running average srtt. We add 1/8 of that to the srtt in order to
+ * update the current srtt estimate. The varience estimate is a bit
+ * more complicated. We subtract the abs value of the @delta from
+ * the current variance estimate and add 1/4 of that to the running
+ * total.
+ *
+ * Note that the index points at the array entry containing the smoothed
+ * mean value, and the variance is always in the following entry
+ *
+ * Reference: TCP/IP Illustrated, vol 2, p. 831,832
+ * All times are in units of integer nanoseconds. Unlike the TCP/IP case,
+ * they are not scaled fixed point.
+ */
+
+static inline void gfs2_update_stats(struct gfs2_lkstats *s, unsigned index,
+ s64 sample)
+{
+ s64 delta = sample - s->stats[index];
+ s->stats[index] += (delta >> 3);
+ index++;
+ s->stats[index] += ((abs64(delta) - s->stats[index]) >> 2);
+}
+
+/**
+ * gfs2_update_reply_times - Update locking statistics
+ * @gl: The glock to update
+ *
+ * This assumes that gl->gl_dstamp has been set earlier.
+ *
+ * The rtt (lock round trip time) is an estimate of the time
+ * taken to perform a dlm lock request. We update it on each
+ * reply from the dlm.
+ *
+ * The blocking flag is set on the glock for all dlm requests
+ * which may potentially block due to lock requests from other nodes.
+ * DLM requests where the current lock state is exclusive, the
+ * requested state is null (or unlocked) or where the TRY or
+ * TRY_1CB flags are set are classified as non-blocking. All
+ * other DLM requests are counted as (potentially) blocking.
+ */
+static inline void gfs2_update_reply_times(struct gfs2_glock *gl)
+{
+ struct gfs2_pcpu_lkstats *lks;
+ const unsigned gltype = gl->gl_name.ln_type;
+ unsigned index = test_bit(GLF_BLOCKING, &gl->gl_flags) ?
+ GFS2_LKS_SRTTB : GFS2_LKS_SRTT;
+ s64 rtt;
+
+ preempt_disable();
+ rtt = ktime_to_ns(ktime_sub(ktime_get_real(), gl->gl_dstamp));
+ lks = this_cpu_ptr(gl->gl_sbd->sd_lkstats);
+ gfs2_update_stats(&gl->gl_stats, index, rtt); /* Local */
+ gfs2_update_stats(&lks->lkstats[gltype], index, rtt); /* Global */
+ preempt_enable();
+
+ trace_gfs2_glock_lock_time(gl, rtt);
+}
+
+/**
+ * gfs2_update_request_times - Update locking statistics
+ * @gl: The glock to update
+ *
+ * The irt (lock inter-request times) measures the average time
+ * between requests to the dlm. It is updated immediately before
+ * each dlm call.
+ */
+
+static inline void gfs2_update_request_times(struct gfs2_glock *gl)
+{
+ struct gfs2_pcpu_lkstats *lks;
+ const unsigned gltype = gl->gl_name.ln_type;
+ ktime_t dstamp;
+ s64 irt;
+
+ preempt_disable();
+ dstamp = gl->gl_dstamp;
+ gl->gl_dstamp = ktime_get_real();
+ irt = ktime_to_ns(ktime_sub(gl->gl_dstamp, dstamp));
+ lks = this_cpu_ptr(gl->gl_sbd->sd_lkstats);
+ gfs2_update_stats(&gl->gl_stats, GFS2_LKS_SIRT, irt); /* Local */
+ gfs2_update_stats(&lks->lkstats[gltype], GFS2_LKS_SIRT, irt); /* Global */
+ preempt_enable();
+}
+
static void gdlm_ast(void *arg)
{
struct gfs2_glock *gl = arg;
unsigned ret = gl->gl_state;
+ gfs2_update_reply_times(gl);
BUG_ON(gl->gl_lksb.sb_flags & DLM_SBF_DEMOTED);
if (gl->gl_lksb.sb_flags & DLM_SBF_VALNOTVALID)
@@ -108,7 +203,7 @@ static int make_mode(const unsigned int lmstate)
static u32 make_flags(const u32 lkid, const unsigned int gfs_flags,
const int req)
{
- u32 lkf = 0;
+ u32 lkf = DLM_LKF_VALBLK;
if (gfs_flags & LM_FLAG_TRY)
lkf |= DLM_LKF_NOQUEUE;
@@ -135,26 +230,43 @@ static u32 make_flags(const u32 lkid, const unsigned int gfs_flags,
if (lkid != 0)
lkf |= DLM_LKF_CONVERT;
- lkf |= DLM_LKF_VALBLK;
-
return lkf;
}
+static void gfs2_reverse_hex(char *c, u64 value)
+{
+ while (value) {
+ *c-- = hex_asc[value & 0x0f];
+ value >>= 4;
+ }
+}
+
static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state,
unsigned int flags)
{
struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;
int req;
u32 lkf;
+ char strname[GDLM_STRNAME_BYTES] = "";
req = make_mode(req_state);
lkf = make_flags(gl->gl_lksb.sb_lkid, flags, req);
-
+ gfs2_glstats_inc(gl, GFS2_LKS_DCOUNT);
+ gfs2_sbstats_inc(gl, GFS2_LKS_DCOUNT);
+ if (gl->gl_lksb.sb_lkid) {
+ gfs2_update_request_times(gl);
+ } else {
+ memset(strname, ' ', GDLM_STRNAME_BYTES - 1);
+ strname[GDLM_STRNAME_BYTES - 1] = '\0';
+ gfs2_reverse_hex(strname + 7, gl->gl_name.ln_type);
+ gfs2_reverse_hex(strname + 23, gl->gl_name.ln_number);
+ gl->gl_dstamp = ktime_get_real();
+ }
/*
* Submit the actual lock request.
*/
- return dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, gl->gl_strname,
+ return dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, strname,
GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast);
}
@@ -169,6 +281,10 @@ static void gdlm_put_lock(struct gfs2_glock *gl)
return;
}
+ clear_bit(GLF_BLOCKING, &gl->gl_flags);
+ gfs2_glstats_inc(gl, GFS2_LKS_DCOUNT);
+ gfs2_sbstats_inc(gl, GFS2_LKS_DCOUNT);
+ gfs2_update_request_times(gl);
error = dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_VALBLK,
NULL, gl);
if (error) {
@@ -185,34 +301,1002 @@ static void gdlm_cancel(struct gfs2_glock *gl)
dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_CANCEL, NULL, gl);
}
-static int gdlm_mount(struct gfs2_sbd *sdp, const char *fsname)
+/*
+ * dlm/gfs2 recovery coordination using dlm_recover callbacks
+ *
+ * 1. dlm_controld sees lockspace members change
+ * 2. dlm_controld blocks dlm-kernel locking activity
+ * 3. dlm_controld within dlm-kernel notifies gfs2 (recover_prep)
+ * 4. dlm_controld starts and finishes its own user level recovery
+ * 5. dlm_controld starts dlm-kernel dlm_recoverd to do kernel recovery
+ * 6. dlm_recoverd notifies gfs2 of failed nodes (recover_slot)
+ * 7. dlm_recoverd does its own lock recovery
+ * 8. dlm_recoverd unblocks dlm-kernel locking activity
+ * 9. dlm_recoverd notifies gfs2 when done (recover_done with new generation)
+ * 10. gfs2_control updates control_lock lvb with new generation and jid bits
+ * 11. gfs2_control enqueues journals for gfs2_recover to recover (maybe none)
+ * 12. gfs2_recover dequeues and recovers journals of failed nodes
+ * 13. gfs2_recover provides recovery results to gfs2_control (recovery_result)
+ * 14. gfs2_control updates control_lock lvb jid bits for recovered journals
+ * 15. gfs2_control unblocks normal locking when all journals are recovered
+ *
+ * - failures during recovery
+ *
+ * recover_prep() may set BLOCK_LOCKS (step 3) again before gfs2_control
+ * clears BLOCK_LOCKS (step 15), e.g. another node fails while still
+ * recovering for a prior failure. gfs2_control needs a way to detect
+ * this so it can leave BLOCK_LOCKS set in step 15. This is managed using
+ * the recover_block and recover_start values.
+ *
+ * recover_done() provides a new lockspace generation number each time it
+ * is called (step 9). This generation number is saved as recover_start.
+ * When recover_prep() is called, it sets BLOCK_LOCKS and sets
+ * recover_block = recover_start. So, while recover_block is equal to
+ * recover_start, BLOCK_LOCKS should remain set. (recover_spin must
+ * be held around the BLOCK_LOCKS/recover_block/recover_start logic.)
+ *
+ * - more specific gfs2 steps in sequence above
+ *
+ * 3. recover_prep sets BLOCK_LOCKS and sets recover_block = recover_start
+ * 6. recover_slot records any failed jids (maybe none)
+ * 9. recover_done sets recover_start = new generation number
+ * 10. gfs2_control sets control_lock lvb = new gen + bits for failed jids
+ * 12. gfs2_recover does journal recoveries for failed jids identified above
+ * 14. gfs2_control clears control_lock lvb bits for recovered jids
+ * 15. gfs2_control checks if recover_block == recover_start (step 3 occured
+ * again) then do nothing, otherwise if recover_start > recover_block
+ * then clear BLOCK_LOCKS.
+ *
+ * - parallel recovery steps across all nodes
+ *
+ * All nodes attempt to update the control_lock lvb with the new generation
+ * number and jid bits, but only the first to get the control_lock EX will
+ * do so; others will see that it's already done (lvb already contains new
+ * generation number.)
+ *
+ * . All nodes get the same recover_prep/recover_slot/recover_done callbacks
+ * . All nodes attempt to set control_lock lvb gen + bits for the new gen
+ * . One node gets control_lock first and writes the lvb, others see it's done
+ * . All nodes attempt to recover jids for which they see control_lock bits set
+ * . One node succeeds for a jid, and that one clears the jid bit in the lvb
+ * . All nodes will eventually see all lvb bits clear and unblock locks
+ *
+ * - is there a problem with clearing an lvb bit that should be set
+ * and missing a journal recovery?
+ *
+ * 1. jid fails
+ * 2. lvb bit set for step 1
+ * 3. jid recovered for step 1
+ * 4. jid taken again (new mount)
+ * 5. jid fails (for step 4)
+ * 6. lvb bit set for step 5 (will already be set)
+ * 7. lvb bit cleared for step 3
+ *
+ * This is not a problem because the failure in step 5 does not
+ * require recovery, because the mount in step 4 could not have
+ * progressed far enough to unblock locks and access the fs. The
+ * control_mount() function waits for all recoveries to be complete
+ * for the latest lockspace generation before ever unblocking locks
+ * and returning. The mount in step 4 waits until the recovery in
+ * step 1 is done.
+ *
+ * - special case of first mounter: first node to mount the fs
+ *
+ * The first node to mount a gfs2 fs needs to check all the journals
+ * and recover any that need recovery before other nodes are allowed
+ * to mount the fs. (Others may begin mounting, but they must wait
+ * for the first mounter to be done before taking locks on the fs
+ * or accessing the fs.) This has two parts:
+ *
+ * 1. The mounted_lock tells a node it's the first to mount the fs.
+ * Each node holds the mounted_lock in PR while it's mounted.
+ * Each node tries to acquire the mounted_lock in EX when it mounts.
+ * If a node is granted the mounted_lock EX it means there are no
+ * other mounted nodes (no PR locks exist), and it is the first mounter.
+ * The mounted_lock is demoted to PR when first recovery is done, so
+ * others will fail to get an EX lock, but will get a PR lock.
+ *
+ * 2. The control_lock blocks others in control_mount() while the first
+ * mounter is doing first mount recovery of all journals.
+ * A mounting node needs to acquire control_lock in EX mode before
+ * it can proceed. The first mounter holds control_lock in EX while doing
+ * the first mount recovery, blocking mounts from other nodes, then demotes
+ * control_lock to NL when it's done (others_may_mount/first_done),
+ * allowing other nodes to continue mounting.
+ *
+ * first mounter:
+ * control_lock EX/NOQUEUE success
+ * mounted_lock EX/NOQUEUE success (no other PR, so no other mounters)
+ * set first=1
+ * do first mounter recovery
+ * mounted_lock EX->PR
+ * control_lock EX->NL, write lvb generation
+ *
+ * other mounter:
+ * control_lock EX/NOQUEUE success (if fail -EAGAIN, retry)
+ * mounted_lock EX/NOQUEUE fail -EAGAIN (expected due to other mounters PR)
+ * mounted_lock PR/NOQUEUE success
+ * read lvb generation
+ * control_lock EX->NL
+ * set first=0
+ *
+ * - mount during recovery
+ *
+ * If a node mounts while others are doing recovery (not first mounter),
+ * the mounting node will get its initial recover_done() callback without
+ * having seen any previous failures/callbacks.
+ *
+ * It must wait for all recoveries preceding its mount to be finished
+ * before it unblocks locks. It does this by repeating the "other mounter"
+ * steps above until the lvb generation number is >= its mount generation
+ * number (from initial recover_done) and all lvb bits are clear.
+ *
+ * - control_lock lvb format
+ *
+ * 4 bytes generation number: the latest dlm lockspace generation number
+ * from recover_done callback. Indicates the jid bitmap has been updated
+ * to reflect all slot failures through that generation.
+ * 4 bytes unused.
+ * GDLM_LVB_SIZE-8 bytes of jid bit map. If bit N is set, it indicates
+ * that jid N needs recovery.
+ */
+
+#define JID_BITMAP_OFFSET 8 /* 4 byte generation number + 4 byte unused */
+
+static void control_lvb_read(struct lm_lockstruct *ls, uint32_t *lvb_gen,
+ char *lvb_bits)
+{
+ uint32_t gen;
+ memcpy(lvb_bits, ls->ls_control_lvb, GDLM_LVB_SIZE);
+ memcpy(&gen, lvb_bits, sizeof(uint32_t));
+ *lvb_gen = le32_to_cpu(gen);
+}
+
+static void control_lvb_write(struct lm_lockstruct *ls, uint32_t lvb_gen,
+ char *lvb_bits)
+{
+ uint32_t gen;
+ memcpy(ls->ls_control_lvb, lvb_bits, GDLM_LVB_SIZE);
+ gen = cpu_to_le32(lvb_gen);
+ memcpy(ls->ls_control_lvb, &gen, sizeof(uint32_t));
+}
+
+static int all_jid_bits_clear(char *lvb)
+{
+ int i;
+ for (i = JID_BITMAP_OFFSET; i < GDLM_LVB_SIZE; i++) {
+ if (lvb[i])
+ return 0;
+ }
+ return 1;
+}
+
+static void sync_wait_cb(void *arg)
+{
+ struct lm_lockstruct *ls = arg;
+ complete(&ls->ls_sync_wait);
+}
+
+static int sync_unlock(struct gfs2_sbd *sdp, struct dlm_lksb *lksb, char *name)
{
struct lm_lockstruct *ls = &sdp->sd_lockstruct;
int error;
- if (fsname == NULL) {
- fs_info(sdp, "no fsname found\n");
- return -EINVAL;
+ error = dlm_unlock(ls->ls_dlm, lksb->sb_lkid, 0, lksb, ls);
+ if (error) {
+ fs_err(sdp, "%s lkid %x error %d\n",
+ name, lksb->sb_lkid, error);
+ return error;
+ }
+
+ wait_for_completion(&ls->ls_sync_wait);
+
+ if (lksb->sb_status != -DLM_EUNLOCK) {
+ fs_err(sdp, "%s lkid %x status %d\n",
+ name, lksb->sb_lkid, lksb->sb_status);
+ return -1;
+ }
+ return 0;
+}
+
+static int sync_lock(struct gfs2_sbd *sdp, int mode, uint32_t flags,
+ unsigned int num, struct dlm_lksb *lksb, char *name)
+{
+ struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+ char strname[GDLM_STRNAME_BYTES];
+ int error, status;
+
+ memset(strname, 0, GDLM_STRNAME_BYTES);
+ snprintf(strname, GDLM_STRNAME_BYTES, "%8x%16x", LM_TYPE_NONDISK, num);
+
+ error = dlm_lock(ls->ls_dlm, mode, lksb, flags,
+ strname, GDLM_STRNAME_BYTES - 1,
+ 0, sync_wait_cb, ls, NULL);
+ if (error) {
+ fs_err(sdp, "%s lkid %x flags %x mode %d error %d\n",
+ name, lksb->sb_lkid, flags, mode, error);
+ return error;
+ }
+
+ wait_for_completion(&ls->ls_sync_wait);
+
+ status = lksb->sb_status;
+
+ if (status && status != -EAGAIN) {
+ fs_err(sdp, "%s lkid %x flags %x mode %d status %d\n",
+ name, lksb->sb_lkid, flags, mode, status);
+ }
+
+ return status;
+}
+
+static int mounted_unlock(struct gfs2_sbd *sdp)
+{
+ struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+ return sync_unlock(sdp, &ls->ls_mounted_lksb, "mounted_lock");
+}
+
+static int mounted_lock(struct gfs2_sbd *sdp, int mode, uint32_t flags)
+{
+ struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+ return sync_lock(sdp, mode, flags, GFS2_MOUNTED_LOCK,
+ &ls->ls_mounted_lksb, "mounted_lock");
+}
+
+static int control_unlock(struct gfs2_sbd *sdp)
+{
+ struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+ return sync_unlock(sdp, &ls->ls_control_lksb, "control_lock");
+}
+
+static int control_lock(struct gfs2_sbd *sdp, int mode, uint32_t flags)
+{
+ struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+ return sync_lock(sdp, mode, flags, GFS2_CONTROL_LOCK,
+ &ls->ls_control_lksb, "control_lock");
+}
+
+static void gfs2_control_func(struct work_struct *work)
+{
+ struct gfs2_sbd *sdp = container_of(work, struct gfs2_sbd, sd_control_work.work);
+ struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+ char lvb_bits[GDLM_LVB_SIZE];
+ uint32_t block_gen, start_gen, lvb_gen, flags;
+ int recover_set = 0;
+ int write_lvb = 0;
+ int recover_size;
+ int i, error;
+
+ spin_lock(&ls->ls_recover_spin);
+ /*
+ * No MOUNT_DONE means we're still mounting; control_mount()
+ * will set this flag, after which this thread will take over
+ * all further clearing of BLOCK_LOCKS.
+ *
+ * FIRST_MOUNT means this node is doing first mounter recovery,
+ * for which recovery control is handled by
+ * control_mount()/control_first_done(), not this thread.
+ */
+ if (!test_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags) ||
+ test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags)) {
+ spin_unlock(&ls->ls_recover_spin);
+ return;
+ }
+ block_gen = ls->ls_recover_block;
+ start_gen = ls->ls_recover_start;
+ spin_unlock(&ls->ls_recover_spin);
+
+ /*
+ * Equal block_gen and start_gen implies we are between
+ * recover_prep and recover_done callbacks, which means
+ * dlm recovery is in progress and dlm locking is blocked.
+ * There's no point trying to do any work until recover_done.
+ */
+
+ if (block_gen == start_gen)
+ return;
+
+ /*
+ * Propagate recover_submit[] and recover_result[] to lvb:
+ * dlm_recoverd adds to recover_submit[] jids needing recovery
+ * gfs2_recover adds to recover_result[] journal recovery results
+ *
+ * set lvb bit for jids in recover_submit[] if the lvb has not
+ * yet been updated for the generation of the failure
+ *
+ * clear lvb bit for jids in recover_result[] if the result of
+ * the journal recovery is SUCCESS
+ */
+
+ error = control_lock(sdp, DLM_LOCK_EX, DLM_LKF_CONVERT|DLM_LKF_VALBLK);
+ if (error) {
+ fs_err(sdp, "control lock EX error %d\n", error);
+ return;
+ }
+
+ control_lvb_read(ls, &lvb_gen, lvb_bits);
+
+ spin_lock(&ls->ls_recover_spin);
+ if (block_gen != ls->ls_recover_block ||
+ start_gen != ls->ls_recover_start) {
+ fs_info(sdp, "recover generation %u block1 %u %u\n",
+ start_gen, block_gen, ls->ls_recover_block);
+ spin_unlock(&ls->ls_recover_spin);
+ control_lock(sdp, DLM_LOCK_NL, DLM_LKF_CONVERT);
+ return;
+ }
+
+ recover_size = ls->ls_recover_size;
+
+ if (lvb_gen <= start_gen) {
+ /*
+ * Clear lvb bits for jids we've successfully recovered.
+ * Because all nodes attempt to recover failed journals,
+ * a journal can be recovered multiple times successfully
+ * in succession. Only the first will really do recovery,
+ * the others find it clean, but still report a successful
+ * recovery. So, another node may have already recovered
+ * the jid and cleared the lvb bit for it.
+ */
+ for (i = 0; i < recover_size; i++) {
+ if (ls->ls_recover_result[i] != LM_RD_SUCCESS)
+ continue;
+
+ ls->ls_recover_result[i] = 0;
+
+ if (!test_bit_le(i, lvb_bits + JID_BITMAP_OFFSET))
+ continue;
+
+ __clear_bit_le(i, lvb_bits + JID_BITMAP_OFFSET);
+ write_lvb = 1;
+ }
+ }
+
+ if (lvb_gen == start_gen) {
+ /*
+ * Failed slots before start_gen are already set in lvb.
+ */
+ for (i = 0; i < recover_size; i++) {
+ if (!ls->ls_recover_submit[i])
+ continue;
+ if (ls->ls_recover_submit[i] < lvb_gen)
+ ls->ls_recover_submit[i] = 0;
+ }
+ } else if (lvb_gen < start_gen) {
+ /*
+ * Failed slots before start_gen are not yet set in lvb.
+ */
+ for (i = 0; i < recover_size; i++) {
+ if (!ls->ls_recover_submit[i])
+ continue;
+ if (ls->ls_recover_submit[i] < start_gen) {
+ ls->ls_recover_submit[i] = 0;
+ __set_bit_le(i, lvb_bits + JID_BITMAP_OFFSET);
+ }
+ }
+ /* even if there are no bits to set, we need to write the
+ latest generation to the lvb */
+ write_lvb = 1;
+ } else {
+ /*
+ * we should be getting a recover_done() for lvb_gen soon
+ */
+ }
+ spin_unlock(&ls->ls_recover_spin);
+
+ if (write_lvb) {
+ control_lvb_write(ls, start_gen, lvb_bits);
+ flags = DLM_LKF_CONVERT | DLM_LKF_VALBLK;
+ } else {
+ flags = DLM_LKF_CONVERT;
+ }
+
+ error = control_lock(sdp, DLM_LOCK_NL, flags);
+ if (error) {
+ fs_err(sdp, "control lock NL error %d\n", error);
+ return;
+ }
+
+ /*
+ * Everyone will see jid bits set in the lvb, run gfs2_recover_set(),
+ * and clear a jid bit in the lvb if the recovery is a success.
+ * Eventually all journals will be recovered, all jid bits will
+ * be cleared in the lvb, and everyone will clear BLOCK_LOCKS.
+ */
+
+ for (i = 0; i < recover_size; i++) {
+ if (test_bit_le(i, lvb_bits + JID_BITMAP_OFFSET)) {
+ fs_info(sdp, "recover generation %u jid %d\n",
+ start_gen, i);
+ gfs2_recover_set(sdp, i);
+ recover_set++;
+ }
+ }
+ if (recover_set)
+ return;
+
+ /*
+ * No more jid bits set in lvb, all recovery is done, unblock locks
+ * (unless a new recover_prep callback has occured blocking locks
+ * again while working above)
+ */
+
+ spin_lock(&ls->ls_recover_spin);
+ if (ls->ls_recover_block == block_gen &&
+ ls->ls_recover_start == start_gen) {
+ clear_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags);
+ spin_unlock(&ls->ls_recover_spin);
+ fs_info(sdp, "recover generation %u done\n", start_gen);
+ gfs2_glock_thaw(sdp);
+ } else {
+ fs_info(sdp, "recover generation %u block2 %u %u\n",
+ start_gen, block_gen, ls->ls_recover_block);
+ spin_unlock(&ls->ls_recover_spin);
+ }
+}
+
+static int control_mount(struct gfs2_sbd *sdp)
+{
+ struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+ char lvb_bits[GDLM_LVB_SIZE];
+ uint32_t start_gen, block_gen, mount_gen, lvb_gen;
+ int mounted_mode;
+ int retries = 0;
+ int error;
+
+ memset(&ls->ls_mounted_lksb, 0, sizeof(struct dlm_lksb));
+ memset(&ls->ls_control_lksb, 0, sizeof(struct dlm_lksb));
+ memset(&ls->ls_control_lvb, 0, GDLM_LVB_SIZE);
+ ls->ls_control_lksb.sb_lvbptr = ls->ls_control_lvb;
+ init_completion(&ls->ls_sync_wait);
+
+ set_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags);
+
+ error = control_lock(sdp, DLM_LOCK_NL, DLM_LKF_VALBLK);
+ if (error) {
+ fs_err(sdp, "control_mount control_lock NL error %d\n", error);
+ return error;
+ }
+
+ error = mounted_lock(sdp, DLM_LOCK_NL, 0);
+ if (error) {
+ fs_err(sdp, "control_mount mounted_lock NL error %d\n", error);
+ control_unlock(sdp);
+ return error;
+ }
+ mounted_mode = DLM_LOCK_NL;
+
+restart:
+ if (retries++ && signal_pending(current)) {
+ error = -EINTR;
+ goto fail;
+ }
+
+ /*
+ * We always start with both locks in NL. control_lock is
+ * demoted to NL below so we don't need to do it here.
+ */
+
+ if (mounted_mode != DLM_LOCK_NL) {
+ error = mounted_lock(sdp, DLM_LOCK_NL, DLM_LKF_CONVERT);
+ if (error)
+ goto fail;
+ mounted_mode = DLM_LOCK_NL;
}
- error = dlm_new_lockspace(fsname, NULL,
- DLM_LSFL_FS | DLM_LSFL_NEWEXCL |
- (ls->ls_nodir ? DLM_LSFL_NODIR : 0),
- GDLM_LVB_SIZE, NULL, NULL, NULL, &ls->ls_dlm);
+ /*
+ * Other nodes need to do some work in dlm recovery and gfs2_control
+ * before the recover_done and control_lock will be ready for us below.
+ * A delay here is not required but often avoids having to retry.
+ */
+
+ msleep_interruptible(500);
+
+ /*
+ * Acquire control_lock in EX and mounted_lock in either EX or PR.
+ * control_lock lvb keeps track of any pending journal recoveries.
+ * mounted_lock indicates if any other nodes have the fs mounted.
+ */
+
+ error = control_lock(sdp, DLM_LOCK_EX, DLM_LKF_CONVERT|DLM_LKF_NOQUEUE|DLM_LKF_VALBLK);
+ if (error == -EAGAIN) {
+ goto restart;
+ } else if (error) {
+ fs_err(sdp, "control_mount control_lock EX error %d\n", error);
+ goto fail;
+ }
+
+ error = mounted_lock(sdp, DLM_LOCK_EX, DLM_LKF_CONVERT|DLM_LKF_NOQUEUE);
+ if (!error) {
+ mounted_mode = DLM_LOCK_EX;
+ goto locks_done;
+ } else if (error != -EAGAIN) {
+ fs_err(sdp, "control_mount mounted_lock EX error %d\n", error);
+ goto fail;
+ }
+
+ error = mounted_lock(sdp, DLM_LOCK_PR, DLM_LKF_CONVERT|DLM_LKF_NOQUEUE);
+ if (!error) {
+ mounted_mode = DLM_LOCK_PR;
+ goto locks_done;
+ } else {
+ /* not even -EAGAIN should happen here */
+ fs_err(sdp, "control_mount mounted_lock PR error %d\n", error);
+ goto fail;
+ }
+
+locks_done:
+ /*
+ * If we got both locks above in EX, then we're the first mounter.
+ * If not, then we need to wait for the control_lock lvb to be
+ * updated by other mounted nodes to reflect our mount generation.
+ *
+ * In simple first mounter cases, first mounter will see zero lvb_gen,
+ * but in cases where all existing nodes leave/fail before mounting
+ * nodes finish control_mount, then all nodes will be mounting and
+ * lvb_gen will be non-zero.
+ */
+
+ control_lvb_read(ls, &lvb_gen, lvb_bits);
+
+ if (lvb_gen == 0xFFFFFFFF) {
+ /* special value to force mount attempts to fail */
+ fs_err(sdp, "control_mount control_lock disabled\n");
+ error = -EINVAL;
+ goto fail;
+ }
+
+ if (mounted_mode == DLM_LOCK_EX) {
+ /* first mounter, keep both EX while doing first recovery */
+ spin_lock(&ls->ls_recover_spin);
+ clear_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags);
+ set_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags);
+ set_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags);
+ spin_unlock(&ls->ls_recover_spin);
+ fs_info(sdp, "first mounter control generation %u\n", lvb_gen);
+ return 0;
+ }
+
+ error = control_lock(sdp, DLM_LOCK_NL, DLM_LKF_CONVERT);
+ if (error)
+ goto fail;
+
+ /*
+ * We are not first mounter, now we need to wait for the control_lock
+ * lvb generation to be >= the generation from our first recover_done
+ * and all lvb bits to be clear (no pending journal recoveries.)
+ */
+
+ if (!all_jid_bits_clear(lvb_bits)) {
+ /* journals need recovery, wait until all are clear */
+ fs_info(sdp, "control_mount wait for journal recovery\n");
+ goto restart;
+ }
+
+ spin_lock(&ls->ls_recover_spin);
+ block_gen = ls->ls_recover_block;
+ start_gen = ls->ls_recover_start;
+ mount_gen = ls->ls_recover_mount;
+
+ if (lvb_gen < mount_gen) {
+ /* wait for mounted nodes to update control_lock lvb to our
+ generation, which might include new recovery bits set */
+ fs_info(sdp, "control_mount wait1 block %u start %u mount %u "
+ "lvb %u flags %lx\n", block_gen, start_gen, mount_gen,
+ lvb_gen, ls->ls_recover_flags);
+ spin_unlock(&ls->ls_recover_spin);
+ goto restart;
+ }
+
+ if (lvb_gen != start_gen) {
+ /* wait for mounted nodes to update control_lock lvb to the
+ latest recovery generation */
+ fs_info(sdp, "control_mount wait2 block %u start %u mount %u "
+ "lvb %u flags %lx\n", block_gen, start_gen, mount_gen,
+ lvb_gen, ls->ls_recover_flags);
+ spin_unlock(&ls->ls_recover_spin);
+ goto restart;
+ }
+
+ if (block_gen == start_gen) {
+ /* dlm recovery in progress, wait for it to finish */
+ fs_info(sdp, "control_mount wait3 block %u start %u mount %u "
+ "lvb %u flags %lx\n", block_gen, start_gen, mount_gen,
+ lvb_gen, ls->ls_recover_flags);
+ spin_unlock(&ls->ls_recover_spin);
+ goto restart;
+ }
+
+ clear_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags);
+ set_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags);
+ memset(ls->ls_recover_submit, 0, ls->ls_recover_size*sizeof(uint32_t));
+ memset(ls->ls_recover_result, 0, ls->ls_recover_size*sizeof(uint32_t));
+ spin_unlock(&ls->ls_recover_spin);
+ return 0;
+
+fail:
+ mounted_unlock(sdp);
+ control_unlock(sdp);
+ return error;
+}
+
+static int dlm_recovery_wait(void *word)
+{
+ schedule();
+ return 0;
+}
+
+static int control_first_done(struct gfs2_sbd *sdp)
+{
+ struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+ char lvb_bits[GDLM_LVB_SIZE];
+ uint32_t start_gen, block_gen;
+ int error;
+
+restart:
+ spin_lock(&ls->ls_recover_spin);
+ start_gen = ls->ls_recover_start;
+ block_gen = ls->ls_recover_block;
+
+ if (test_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags) ||
+ !test_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags) ||
+ !test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags)) {
+ /* sanity check, should not happen */
+ fs_err(sdp, "control_first_done start %u block %u flags %lx\n",
+ start_gen, block_gen, ls->ls_recover_flags);
+ spin_unlock(&ls->ls_recover_spin);
+ control_unlock(sdp);
+ return -1;
+ }
+
+ if (start_gen == block_gen) {
+ /*
+ * Wait for the end of a dlm recovery cycle to switch from
+ * first mounter recovery. We can ignore any recover_slot
+ * callbacks between the recover_prep and next recover_done
+ * because we are still the first mounter and any failed nodes
+ * have not fully mounted, so they don't need recovery.
+ */
+ spin_unlock(&ls->ls_recover_spin);
+ fs_info(sdp, "control_first_done wait gen %u\n", start_gen);
+
+ wait_on_bit(&ls->ls_recover_flags, DFL_DLM_RECOVERY,
+ dlm_recovery_wait, TASK_UNINTERRUPTIBLE);
+ goto restart;
+ }
+
+ clear_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags);
+ set_bit(DFL_FIRST_MOUNT_DONE, &ls->ls_recover_flags);
+ memset(ls->ls_recover_submit, 0, ls->ls_recover_size*sizeof(uint32_t));
+ memset(ls->ls_recover_result, 0, ls->ls_recover_size*sizeof(uint32_t));
+ spin_unlock(&ls->ls_recover_spin);
+
+ memset(lvb_bits, 0, sizeof(lvb_bits));
+ control_lvb_write(ls, start_gen, lvb_bits);
+
+ error = mounted_lock(sdp, DLM_LOCK_PR, DLM_LKF_CONVERT);
+ if (error)
+ fs_err(sdp, "control_first_done mounted PR error %d\n", error);
+
+ error = control_lock(sdp, DLM_LOCK_NL, DLM_LKF_CONVERT|DLM_LKF_VALBLK);
+ if (error)
+ fs_err(sdp, "control_first_done control NL error %d\n", error);
+
+ return error;
+}
+
+/*
+ * Expand static jid arrays if necessary (by increments of RECOVER_SIZE_INC)
+ * to accomodate the largest slot number. (NB dlm slot numbers start at 1,
+ * gfs2 jids start at 0, so jid = slot - 1)
+ */
+
+#define RECOVER_SIZE_INC 16
+
+static int set_recover_size(struct gfs2_sbd *sdp, struct dlm_slot *slots,
+ int num_slots)
+{
+ struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+ uint32_t *submit = NULL;
+ uint32_t *result = NULL;
+ uint32_t old_size, new_size;
+ int i, max_jid;
+
+ max_jid = 0;
+ for (i = 0; i < num_slots; i++) {
+ if (max_jid < slots[i].slot - 1)
+ max_jid = slots[i].slot - 1;
+ }
+
+ old_size = ls->ls_recover_size;
+
+ if (old_size >= max_jid + 1)
+ return 0;
+
+ new_size = old_size + RECOVER_SIZE_INC;
+
+ submit = kzalloc(new_size * sizeof(uint32_t), GFP_NOFS);
+ result = kzalloc(new_size * sizeof(uint32_t), GFP_NOFS);
+ if (!submit || !result) {
+ kfree(submit);
+ kfree(result);
+ return -ENOMEM;
+ }
+
+ spin_lock(&ls->ls_recover_spin);
+ memcpy(submit, ls->ls_recover_submit, old_size * sizeof(uint32_t));
+ memcpy(result, ls->ls_recover_result, old_size * sizeof(uint32_t));
+ kfree(ls->ls_recover_submit);
+ kfree(ls->ls_recover_result);
+ ls->ls_recover_submit = submit;
+ ls->ls_recover_result = result;
+ ls->ls_recover_size = new_size;
+ spin_unlock(&ls->ls_recover_spin);
+ return 0;
+}
+
+static void free_recover_size(struct lm_lockstruct *ls)
+{
+ kfree(ls->ls_recover_submit);
+ kfree(ls->ls_recover_result);
+ ls->ls_recover_submit = NULL;
+ ls->ls_recover_result = NULL;
+ ls->ls_recover_size = 0;
+}
+
+/* dlm calls before it does lock recovery */
+
+static void gdlm_recover_prep(void *arg)
+{
+ struct gfs2_sbd *sdp = arg;
+ struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+
+ spin_lock(&ls->ls_recover_spin);
+ ls->ls_recover_block = ls->ls_recover_start;
+ set_bit(DFL_DLM_RECOVERY, &ls->ls_recover_flags);
+
+ if (!test_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags) ||
+ test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags)) {
+ spin_unlock(&ls->ls_recover_spin);
+ return;
+ }
+ set_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags);
+ spin_unlock(&ls->ls_recover_spin);
+}
+
+/* dlm calls after recover_prep has been completed on all lockspace members;
+ identifies slot/jid of failed member */
+
+static void gdlm_recover_slot(void *arg, struct dlm_slot *slot)
+{
+ struct gfs2_sbd *sdp = arg;
+ struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+ int jid = slot->slot - 1;
+
+ spin_lock(&ls->ls_recover_spin);
+ if (ls->ls_recover_size < jid + 1) {
+ fs_err(sdp, "recover_slot jid %d gen %u short size %d",
+ jid, ls->ls_recover_block, ls->ls_recover_size);
+ spin_unlock(&ls->ls_recover_spin);
+ return;
+ }
+
+ if (ls->ls_recover_submit[jid]) {
+ fs_info(sdp, "recover_slot jid %d gen %u prev %u",
+ jid, ls->ls_recover_block, ls->ls_recover_submit[jid]);
+ }
+ ls->ls_recover_submit[jid] = ls->ls_recover_block;
+ spin_unlock(&ls->ls_recover_spin);
+}
+
+/* dlm calls after recover_slot and after it completes lock recovery */
+
+static void gdlm_recover_done(void *arg, struct dlm_slot *slots, int num_slots,
+ int our_slot, uint32_t generation)
+{
+ struct gfs2_sbd *sdp = arg;
+ struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+
+ /* ensure the ls jid arrays are large enough */
+ set_recover_size(sdp, slots, num_slots);
+
+ spin_lock(&ls->ls_recover_spin);
+ ls->ls_recover_start = generation;
+
+ if (!ls->ls_recover_mount) {
+ ls->ls_recover_mount = generation;
+ ls->ls_jid = our_slot - 1;
+ }
+
+ if (!test_bit(DFL_UNMOUNT, &ls->ls_recover_flags))
+ queue_delayed_work(gfs2_control_wq, &sdp->sd_control_work, 0);
+
+ clear_bit(DFL_DLM_RECOVERY, &ls->ls_recover_flags);
+ smp_mb__after_clear_bit();
+ wake_up_bit(&ls->ls_recover_flags, DFL_DLM_RECOVERY);
+ spin_unlock(&ls->ls_recover_spin);
+}
+
+/* gfs2_recover thread has a journal recovery result */
+
+static void gdlm_recovery_result(struct gfs2_sbd *sdp, unsigned int jid,
+ unsigned int result)
+{
+ struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+
+ if (test_bit(DFL_NO_DLM_OPS, &ls->ls_recover_flags))
+ return;
+
+ /* don't care about the recovery of own journal during mount */
+ if (jid == ls->ls_jid)
+ return;
+
+ spin_lock(&ls->ls_recover_spin);
+ if (test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags)) {
+ spin_unlock(&ls->ls_recover_spin);
+ return;
+ }
+ if (ls->ls_recover_size < jid + 1) {
+ fs_err(sdp, "recovery_result jid %d short size %d",
+ jid, ls->ls_recover_size);
+ spin_unlock(&ls->ls_recover_spin);
+ return;
+ }
+
+ fs_info(sdp, "recover jid %d result %s\n", jid,
+ result == LM_RD_GAVEUP ? "busy" : "success");
+
+ ls->ls_recover_result[jid] = result;
+
+ /* GAVEUP means another node is recovering the journal; delay our
+ next attempt to recover it, to give the other node a chance to
+ finish before trying again */
+
+ if (!test_bit(DFL_UNMOUNT, &ls->ls_recover_flags))
+ queue_delayed_work(gfs2_control_wq, &sdp->sd_control_work,
+ result == LM_RD_GAVEUP ? HZ : 0);
+ spin_unlock(&ls->ls_recover_spin);
+}
+
+const struct dlm_lockspace_ops gdlm_lockspace_ops = {
+ .recover_prep = gdlm_recover_prep,
+ .recover_slot = gdlm_recover_slot,
+ .recover_done = gdlm_recover_done,
+};
+
+static int gdlm_mount(struct gfs2_sbd *sdp, const char *table)
+{
+ struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+ char cluster[GFS2_LOCKNAME_LEN];
+ const char *fsname;
+ uint32_t flags;
+ int error, ops_result;
+
+ /*
+ * initialize everything
+ */
+
+ INIT_DELAYED_WORK(&sdp->sd_control_work, gfs2_control_func);
+ spin_lock_init(&ls->ls_recover_spin);
+ ls->ls_recover_flags = 0;
+ ls->ls_recover_mount = 0;
+ ls->ls_recover_start = 0;
+ ls->ls_recover_block = 0;
+ ls->ls_recover_size = 0;
+ ls->ls_recover_submit = NULL;
+ ls->ls_recover_result = NULL;
+
+ error = set_recover_size(sdp, NULL, 0);
if (error)
- printk(KERN_ERR "dlm_new_lockspace error %d", error);
+ goto fail;
+
+ /*
+ * prepare dlm_new_lockspace args
+ */
+
+ fsname = strchr(table, ':');
+ if (!fsname) {
+ fs_info(sdp, "no fsname found\n");
+ error = -EINVAL;
+ goto fail_free;
+ }
+ memset(cluster, 0, sizeof(cluster));
+ memcpy(cluster, table, strlen(table) - strlen(fsname));
+ fsname++;
+
+ flags = DLM_LSFL_FS | DLM_LSFL_NEWEXCL;
+ if (ls->ls_nodir)
+ flags |= DLM_LSFL_NODIR;
+
+ /*
+ * create/join lockspace
+ */
+ error = dlm_new_lockspace(fsname, cluster, flags, GDLM_LVB_SIZE,
+ &gdlm_lockspace_ops, sdp, &ops_result,
+ &ls->ls_dlm);
+ if (error) {
+ fs_err(sdp, "dlm_new_lockspace error %d\n", error);
+ goto fail_free;
+ }
+
+ if (ops_result < 0) {
+ /*
+ * dlm does not support ops callbacks,
+ * old dlm_controld/gfs_controld are used, try without ops.
+ */
+ fs_info(sdp, "dlm lockspace ops not used\n");
+ free_recover_size(ls);
+ set_bit(DFL_NO_DLM_OPS, &ls->ls_recover_flags);
+ return 0;
+ }
+
+ if (!test_bit(SDF_NOJOURNALID, &sdp->sd_flags)) {
+ fs_err(sdp, "dlm lockspace ops disallow jid preset\n");
+ error = -EINVAL;
+ goto fail_release;
+ }
+
+ /*
+ * control_mount() uses control_lock to determine first mounter,
+ * and for later mounts, waits for any recoveries to be cleared.
+ */
+
+ error = control_mount(sdp);
+ if (error) {
+ fs_err(sdp, "mount control error %d\n", error);
+ goto fail_release;
+ }
+
+ ls->ls_first = !!test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags);
+ clear_bit(SDF_NOJOURNALID, &sdp->sd_flags);
+ smp_mb__after_clear_bit();
+ wake_up_bit(&sdp->sd_flags, SDF_NOJOURNALID);
+ return 0;
+
+fail_release:
+ dlm_release_lockspace(ls->ls_dlm, 2);
+fail_free:
+ free_recover_size(ls);
+fail:
return error;
}
+static void gdlm_first_done(struct gfs2_sbd *sdp)
+{
+ struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+ int error;
+
+ if (test_bit(DFL_NO_DLM_OPS, &ls->ls_recover_flags))
+ return;
+
+ error = control_first_done(sdp);
+ if (error)
+ fs_err(sdp, "mount first_done error %d\n", error);
+}
+
static void gdlm_unmount(struct gfs2_sbd *sdp)
{
struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+ if (test_bit(DFL_NO_DLM_OPS, &ls->ls_recover_flags))
+ goto release;
+
+ /* wait for gfs2_control_wq to be done with this mount */
+
+ spin_lock(&ls->ls_recover_spin);
+ set_bit(DFL_UNMOUNT, &ls->ls_recover_flags);
+ spin_unlock(&ls->ls_recover_spin);
+ flush_delayed_work_sync(&sdp->sd_control_work);
+
+ /* mounted_lock and control_lock will be purged in dlm recovery */
+release:
if (ls->ls_dlm) {
dlm_release_lockspace(ls->ls_dlm, 2);
ls->ls_dlm = NULL;
}
+
+ free_recover_size(ls);
}
static const match_table_t dlm_tokens = {
@@ -226,6 +1310,8 @@ static const match_table_t dlm_tokens = {
const struct lm_lockops gfs2_dlm_ops = {
.lm_proto_name = "lock_dlm",
.lm_mount = gdlm_mount,
+ .lm_first_done = gdlm_first_done,
+ .lm_recovery_result = gdlm_recovery_result,
.lm_unmount = gdlm_unmount,
.lm_put_lock = gdlm_put_lock,
.lm_lock = gdlm_lock,
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 756fae9eaf8..4752eadc7f6 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -19,6 +19,7 @@
#include <linux/freezer.h>
#include <linux/bio.h>
#include <linux/writeback.h>
+#include <linux/list_sort.h>
#include "gfs2.h"
#include "incore.h"
@@ -358,7 +359,7 @@ retry:
return 0;
}
-static u64 log_bmap(struct gfs2_sbd *sdp, unsigned int lbn)
+u64 gfs2_log_bmap(struct gfs2_sbd *sdp, unsigned int lbn)
{
struct gfs2_journal_extent *je;
@@ -467,8 +468,8 @@ static unsigned int current_tail(struct gfs2_sbd *sdp)
void gfs2_log_incr_head(struct gfs2_sbd *sdp)
{
- if (sdp->sd_log_flush_head == sdp->sd_log_tail)
- BUG_ON(sdp->sd_log_flush_head != sdp->sd_log_head);
+ BUG_ON((sdp->sd_log_flush_head == sdp->sd_log_tail) &&
+ (sdp->sd_log_flush_head != sdp->sd_log_head));
if (++sdp->sd_log_flush_head == sdp->sd_jdesc->jd_blocks) {
sdp->sd_log_flush_head = 0;
@@ -476,99 +477,6 @@ void gfs2_log_incr_head(struct gfs2_sbd *sdp)
}
}
-/**
- * gfs2_log_write_endio - End of I/O for a log buffer
- * @bh: The buffer head
- * @uptodate: I/O Status
- *
- */
-
-static void gfs2_log_write_endio(struct buffer_head *bh, int uptodate)
-{
- struct gfs2_sbd *sdp = bh->b_private;
- bh->b_private = NULL;
-
- end_buffer_write_sync(bh, uptodate);
- if (atomic_dec_and_test(&sdp->sd_log_in_flight))
- wake_up(&sdp->sd_log_flush_wait);
-}
-
-/**
- * gfs2_log_get_buf - Get and initialize a buffer to use for log control data
- * @sdp: The GFS2 superblock
- *
- * Returns: the buffer_head
- */
-
-struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp)
-{
- u64 blkno = log_bmap(sdp, sdp->sd_log_flush_head);
- struct buffer_head *bh;
-
- bh = sb_getblk(sdp->sd_vfs, blkno);
- lock_buffer(bh);
- memset(bh->b_data, 0, bh->b_size);
- set_buffer_uptodate(bh);
- clear_buffer_dirty(bh);
- gfs2_log_incr_head(sdp);
- atomic_inc(&sdp->sd_log_in_flight);
- bh->b_private = sdp;
- bh->b_end_io = gfs2_log_write_endio;
-
- return bh;
-}
-
-/**
- * gfs2_fake_write_endio -
- * @bh: The buffer head
- * @uptodate: The I/O Status
- *
- */
-
-static void gfs2_fake_write_endio(struct buffer_head *bh, int uptodate)
-{
- struct buffer_head *real_bh = bh->b_private;
- struct gfs2_bufdata *bd = real_bh->b_private;
- struct gfs2_sbd *sdp = bd->bd_gl->gl_sbd;
-
- end_buffer_write_sync(bh, uptodate);
- free_buffer_head(bh);
- unlock_buffer(real_bh);
- brelse(real_bh);
- if (atomic_dec_and_test(&sdp->sd_log_in_flight))
- wake_up(&sdp->sd_log_flush_wait);
-}
-
-/**
- * gfs2_log_fake_buf - Build a fake buffer head to write metadata buffer to log
- * @sdp: the filesystem
- * @data: the data the buffer_head should point to
- *
- * Returns: the log buffer descriptor
- */
-
-struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp,
- struct buffer_head *real)
-{
- u64 blkno = log_bmap(sdp, sdp->sd_log_flush_head);
- struct buffer_head *bh;
-
- bh = alloc_buffer_head(GFP_NOFS | __GFP_NOFAIL);
- atomic_set(&bh->b_count, 1);
- bh->b_state = (1 << BH_Mapped) | (1 << BH_Uptodate) | (1 << BH_Lock);
- set_bh_page(bh, real->b_page, bh_offset(real));
- bh->b_blocknr = blkno;
- bh->b_size = sdp->sd_sb.sb_bsize;
- bh->b_bdev = sdp->sd_vfs->s_bdev;
- bh->b_private = real;
- bh->b_end_io = gfs2_fake_write_endio;
-
- gfs2_log_incr_head(sdp);
- atomic_inc(&sdp->sd_log_in_flight);
-
- return bh;
-}
-
static void log_pull_tail(struct gfs2_sbd *sdp, unsigned int new_tail)
{
unsigned int dist = log_distance(sdp, new_tail, sdp->sd_log_tail);
@@ -583,66 +491,8 @@ static void log_pull_tail(struct gfs2_sbd *sdp, unsigned int new_tail)
sdp->sd_log_tail = new_tail;
}
-/**
- * log_write_header - Get and initialize a journal header buffer
- * @sdp: The GFS2 superblock
- *
- * Returns: the initialized log buffer descriptor
- */
-static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
-{
- u64 blkno = log_bmap(sdp, sdp->sd_log_flush_head);
- struct buffer_head *bh;
- struct gfs2_log_header *lh;
- unsigned int tail;
- u32 hash;
-
- bh = sb_getblk(sdp->sd_vfs, blkno);
- lock_buffer(bh);
- memset(bh->b_data, 0, bh->b_size);
- set_buffer_uptodate(bh);
- clear_buffer_dirty(bh);
-
- gfs2_ail1_empty(sdp);
- tail = current_tail(sdp);
-
- lh = (struct gfs2_log_header *)bh->b_data;
- memset(lh, 0, sizeof(struct gfs2_log_header));
- lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
- lh->lh_header.mh_type = cpu_to_be32(GFS2_METATYPE_LH);
- lh->lh_header.__pad0 = cpu_to_be64(0);
- lh->lh_header.mh_format = cpu_to_be32(GFS2_FORMAT_LH);
- lh->lh_header.mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid);
- lh->lh_sequence = cpu_to_be64(sdp->sd_log_sequence++);
- lh->lh_flags = cpu_to_be32(flags);
- lh->lh_tail = cpu_to_be32(tail);
- lh->lh_blkno = cpu_to_be32(sdp->sd_log_flush_head);
- hash = gfs2_disk_hash(bh->b_data, sizeof(struct gfs2_log_header));
- lh->lh_hash = cpu_to_be32(hash);
-
- bh->b_end_io = end_buffer_write_sync;
- get_bh(bh);
- if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
- submit_bh(WRITE_SYNC | REQ_META | REQ_PRIO, bh);
- else
- submit_bh(WRITE_FLUSH_FUA | REQ_META, bh);
- wait_on_buffer(bh);
-
- if (!buffer_uptodate(bh))
- gfs2_io_error_bh(sdp, bh);
- brelse(bh);
-
- if (sdp->sd_log_tail != tail)
- log_pull_tail(sdp, tail);
- else
- gfs2_assert_withdraw(sdp, !pull);
-
- sdp->sd_log_idle = (tail == sdp->sd_log_flush_head);
- gfs2_log_incr_head(sdp);
-}
-
-static void log_flush_commit(struct gfs2_sbd *sdp)
+static void log_flush_wait(struct gfs2_sbd *sdp)
{
DEFINE_WAIT(wait);
@@ -655,8 +505,20 @@ static void log_flush_commit(struct gfs2_sbd *sdp)
} while(atomic_read(&sdp->sd_log_in_flight));
finish_wait(&sdp->sd_log_flush_wait, &wait);
}
+}
+
+static int bd_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+ struct gfs2_bufdata *bda, *bdb;
- log_write_header(sdp, 0, 0);
+ bda = list_entry(a, struct gfs2_bufdata, bd_le.le_list);
+ bdb = list_entry(b, struct gfs2_bufdata, bd_le.le_list);
+
+ if (bda->bd_bh->b_blocknr < bdb->bd_bh->b_blocknr)
+ return -1;
+ if (bda->bd_bh->b_blocknr > bdb->bd_bh->b_blocknr)
+ return 1;
+ return 0;
}
static void gfs2_ordered_write(struct gfs2_sbd *sdp)
@@ -666,6 +528,7 @@ static void gfs2_ordered_write(struct gfs2_sbd *sdp)
LIST_HEAD(written);
gfs2_log_lock(sdp);
+ list_sort(NULL, &sdp->sd_log_le_ordered, &bd_cmp);
while (!list_empty(&sdp->sd_log_le_ordered)) {
bd = list_entry(sdp->sd_log_le_ordered.next, struct gfs2_bufdata, bd_le.le_list);
list_move(&bd->bd_le.le_list, &written);
@@ -711,6 +574,68 @@ static void gfs2_ordered_wait(struct gfs2_sbd *sdp)
}
/**
+ * log_write_header - Get and initialize a journal header buffer
+ * @sdp: The GFS2 superblock
+ *
+ * Returns: the initialized log buffer descriptor
+ */
+
+static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
+{
+ u64 blkno = gfs2_log_bmap(sdp, sdp->sd_log_flush_head);
+ struct buffer_head *bh;
+ struct gfs2_log_header *lh;
+ unsigned int tail;
+ u32 hash;
+
+ bh = sb_getblk(sdp->sd_vfs, blkno);
+ lock_buffer(bh);
+ memset(bh->b_data, 0, bh->b_size);
+ set_buffer_uptodate(bh);
+ clear_buffer_dirty(bh);
+
+ gfs2_ail1_empty(sdp);
+ tail = current_tail(sdp);
+
+ lh = (struct gfs2_log_header *)bh->b_data;
+ memset(lh, 0, sizeof(struct gfs2_log_header));
+ lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
+ lh->lh_header.mh_type = cpu_to_be32(GFS2_METATYPE_LH);
+ lh->lh_header.__pad0 = cpu_to_be64(0);
+ lh->lh_header.mh_format = cpu_to_be32(GFS2_FORMAT_LH);
+ lh->lh_header.mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid);
+ lh->lh_sequence = cpu_to_be64(sdp->sd_log_sequence++);
+ lh->lh_flags = cpu_to_be32(flags);
+ lh->lh_tail = cpu_to_be32(tail);
+ lh->lh_blkno = cpu_to_be32(sdp->sd_log_flush_head);
+ hash = gfs2_disk_hash(bh->b_data, sizeof(struct gfs2_log_header));
+ lh->lh_hash = cpu_to_be32(hash);
+
+ bh->b_end_io = end_buffer_write_sync;
+ get_bh(bh);
+ if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) {
+ gfs2_ordered_wait(sdp);
+ log_flush_wait(sdp);
+ submit_bh(WRITE_SYNC | REQ_META | REQ_PRIO, bh);
+ } else {
+ submit_bh(WRITE_FLUSH_FUA | REQ_META, bh);
+ }
+ wait_on_buffer(bh);
+
+ if (!buffer_uptodate(bh))
+ gfs2_io_error_bh(sdp, bh);
+ brelse(bh);
+
+ if (sdp->sd_log_tail != tail)
+ log_pull_tail(sdp, tail);
+ else
+ gfs2_assert_withdraw(sdp, !pull);
+
+ sdp->sd_log_idle = (tail == sdp->sd_log_flush_head);
+ gfs2_log_incr_head(sdp);
+}
+
+/**
* gfs2_log_flush - flush incore transaction(s)
* @sdp: the filesystem
* @gl: The glock structure to flush. If NULL, flush the whole incore log
@@ -753,11 +678,10 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
gfs2_ordered_write(sdp);
lops_before_commit(sdp);
- gfs2_ordered_wait(sdp);
- if (sdp->sd_log_head != sdp->sd_log_flush_head)
- log_flush_commit(sdp);
- else if (sdp->sd_log_tail != current_tail(sdp) && !sdp->sd_log_idle){
+ if (sdp->sd_log_head != sdp->sd_log_flush_head) {
+ log_write_header(sdp, 0, 0);
+ } else if (sdp->sd_log_tail != current_tail(sdp) && !sdp->sd_log_idle){
gfs2_log_lock(sdp);
atomic_dec(&sdp->sd_log_blks_free); /* Adjust for unreserved buffer */
trace_gfs2_log_blocks(sdp, -1);
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index ab0621698b7..ff07454b582 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -53,10 +53,7 @@ extern unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
extern int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks);
extern void gfs2_log_incr_head(struct gfs2_sbd *sdp);
-
-extern struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp);
-extern struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp,
- struct buffer_head *real);
+extern u64 gfs2_log_bmap(struct gfs2_sbd *sdp, unsigned int lbn);
extern void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl);
extern void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans);
extern void gfs2_remove_from_ail(struct gfs2_bufdata *bd);
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 0301be655b1..6b1efb594d9 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -12,6 +12,7 @@
#include <linux/spinlock.h>
#include <linux/completion.h>
#include <linux/buffer_head.h>
+#include <linux/mempool.h>
#include <linux/gfs2_ondisk.h>
#include <linux/bio.h>
#include <linux/fs.h>
@@ -76,7 +77,7 @@ static void maybe_release_space(struct gfs2_bufdata *bd)
if (bi->bi_clone == 0)
return;
if (sdp->sd_args.ar_discard)
- gfs2_rgrp_send_discards(sdp, rgd->rd_data0, bd->bd_bh, bi);
+ gfs2_rgrp_send_discards(sdp, rgd->rd_data0, bd->bd_bh, bi, 1, NULL);
memcpy(bi->bi_clone + bi->bi_offset,
bd->bd_bh->b_data + bi->bi_offset, bi->bi_len);
clear_bit(GBF_FULL, &bi->bi_flags);
@@ -143,6 +144,98 @@ static inline __be64 *bh_ptr_end(struct buffer_head *bh)
return (__force __be64 *)(bh->b_data + bh->b_size);
}
+/**
+ * gfs2_log_write_endio - End of I/O for a log buffer
+ * @bh: The buffer head
+ * @uptodate: I/O Status
+ *
+ */
+
+static void gfs2_log_write_endio(struct buffer_head *bh, int uptodate)
+{
+ struct gfs2_sbd *sdp = bh->b_private;
+ bh->b_private = NULL;
+
+ end_buffer_write_sync(bh, uptodate);
+ if (atomic_dec_and_test(&sdp->sd_log_in_flight))
+ wake_up(&sdp->sd_log_flush_wait);
+}
+
+/**
+ * gfs2_log_get_buf - Get and initialize a buffer to use for log control data
+ * @sdp: The GFS2 superblock
+ *
+ * tReturns: the buffer_head
+ */
+
+static struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp)
+{
+ u64 blkno = gfs2_log_bmap(sdp, sdp->sd_log_flush_head);
+ struct buffer_head *bh;
+
+ bh = sb_getblk(sdp->sd_vfs, blkno);
+ lock_buffer(bh);
+ memset(bh->b_data, 0, bh->b_size);
+ set_buffer_uptodate(bh);
+ clear_buffer_dirty(bh);
+ gfs2_log_incr_head(sdp);
+ atomic_inc(&sdp->sd_log_in_flight);
+ bh->b_private = sdp;
+ bh->b_end_io = gfs2_log_write_endio;
+
+ return bh;
+}
+
+/**
+ * gfs2_fake_write_endio -
+ * @bh: The buffer head
+ * @uptodate: The I/O Status
+ *
+ */
+
+static void gfs2_fake_write_endio(struct buffer_head *bh, int uptodate)
+{
+ struct buffer_head *real_bh = bh->b_private;
+ struct gfs2_bufdata *bd = real_bh->b_private;
+ struct gfs2_sbd *sdp = bd->bd_gl->gl_sbd;
+
+ end_buffer_write_sync(bh, uptodate);
+ mempool_free(bh, gfs2_bh_pool);
+ unlock_buffer(real_bh);
+ brelse(real_bh);
+ if (atomic_dec_and_test(&sdp->sd_log_in_flight))
+ wake_up(&sdp->sd_log_flush_wait);
+}
+
+/**
+ * gfs2_log_fake_buf - Build a fake buffer head to write metadata buffer to log
+ * @sdp: the filesystem
+ * @data: the data the buffer_head should point to
+ *
+ * Returns: the log buffer descriptor
+ */
+
+static struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp,
+ struct buffer_head *real)
+{
+ u64 blkno = gfs2_log_bmap(sdp, sdp->sd_log_flush_head);
+ struct buffer_head *bh;
+
+ bh = mempool_alloc(gfs2_bh_pool, GFP_NOFS);
+ atomic_set(&bh->b_count, 1);
+ bh->b_state = (1 << BH_Mapped) | (1 << BH_Uptodate) | (1 << BH_Lock);
+ set_bh_page(bh, real->b_page, bh_offset(real));
+ bh->b_blocknr = blkno;
+ bh->b_size = sdp->sd_sb.sb_bsize;
+ bh->b_bdev = sdp->sd_vfs->s_bdev;
+ bh->b_private = real;
+ bh->b_end_io = gfs2_fake_write_endio;
+
+ gfs2_log_incr_head(sdp);
+ atomic_inc(&sdp->sd_log_in_flight);
+
+ return bh;
+}
static struct buffer_head *gfs2_get_log_desc(struct gfs2_sbd *sdp, u32 ld_type)
{
@@ -553,11 +646,11 @@ static void gfs2_check_magic(struct buffer_head *bh)
__be32 *ptr;
clear_buffer_escaped(bh);
- kaddr = kmap_atomic(bh->b_page, KM_USER0);
+ kaddr = kmap_atomic(bh->b_page);
ptr = kaddr + bh_offset(bh);
if (*ptr == cpu_to_be32(GFS2_MAGIC))
set_buffer_escaped(bh);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
}
static void gfs2_write_blocks(struct gfs2_sbd *sdp, struct buffer_head *bh,
@@ -594,10 +687,10 @@ static void gfs2_write_blocks(struct gfs2_sbd *sdp, struct buffer_head *bh,
if (buffer_escaped(bd->bd_bh)) {
void *kaddr;
bh1 = gfs2_log_get_buf(sdp);
- kaddr = kmap_atomic(bd->bd_bh->b_page, KM_USER0);
+ kaddr = kmap_atomic(bd->bd_bh->b_page);
memcpy(bh1->b_data, kaddr + bh_offset(bd->bd_bh),
bh1->b_size);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
*(__be32 *)bh1->b_data = 0;
clear_buffer_escaped(bd->bd_bh);
unlock_buffer(bd->bd_bh);
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index c150298e2d8..754426b1e52 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -17,6 +17,7 @@
#include <linux/rcupdate.h>
#include <linux/rculist_bl.h>
#include <linux/atomic.h>
+#include <linux/mempool.h>
#include "gfs2.h"
#include "incore.h"
@@ -28,6 +29,8 @@
#include "recovery.h"
#include "dir.h"
+struct workqueue_struct *gfs2_control_wq;
+
static struct shrinker qd_shrinker = {
.shrink = gfs2_shrink_qd_memory,
.seeks = DEFAULT_SEEKS,
@@ -67,6 +70,16 @@ static void gfs2_init_gl_aspace_once(void *foo)
address_space_init_once(mapping);
}
+static void *gfs2_bh_alloc(gfp_t mask, void *data)
+{
+ return alloc_buffer_head(mask);
+}
+
+static void gfs2_bh_free(void *ptr, void *data)
+{
+ return free_buffer_head(ptr);
+}
+
/**
* init_gfs2_fs - Register GFS2 as a filesystem
*
@@ -146,12 +159,25 @@ static int __init init_gfs2_fs(void)
if (!gfs_recovery_wq)
goto fail_wq;
+ gfs2_control_wq = alloc_workqueue("gfs2_control",
+ WQ_NON_REENTRANT | WQ_UNBOUND | WQ_FREEZABLE, 0);
+ if (!gfs2_control_wq)
+ goto fail_recovery;
+
+ gfs2_bh_pool = mempool_create(1024, gfs2_bh_alloc, gfs2_bh_free, NULL);
+ if (!gfs2_bh_pool)
+ goto fail_control;
+
gfs2_register_debugfs();
printk("GFS2 installed\n");
return 0;
+fail_control:
+ destroy_workqueue(gfs2_control_wq);
+fail_recovery:
+ destroy_workqueue(gfs_recovery_wq);
fail_wq:
unregister_filesystem(&gfs2meta_fs_type);
fail_unregister:
@@ -195,9 +221,11 @@ static void __exit exit_gfs2_fs(void)
unregister_filesystem(&gfs2_fs_type);
unregister_filesystem(&gfs2meta_fs_type);
destroy_workqueue(gfs_recovery_wq);
+ destroy_workqueue(gfs2_control_wq);
rcu_barrier();
+ mempool_destroy(gfs2_bh_pool);
kmem_cache_destroy(gfs2_quotad_cachep);
kmem_cache_destroy(gfs2_rgrpd_cachep);
kmem_cache_destroy(gfs2_bufdata_cachep);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index fe72e79e6ff..6f3a18f9e17 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -68,6 +68,12 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
sb->s_fs_info = sdp;
sdp->sd_vfs = sb;
+ sdp->sd_lkstats = alloc_percpu(struct gfs2_pcpu_lkstats);
+ if (!sdp->sd_lkstats) {
+ kfree(sdp);
+ return NULL;
+ }
+
set_bit(SDF_NOJOURNALID, &sdp->sd_flags);
gfs2_tune_init(&sdp->sd_tune);
@@ -77,7 +83,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
spin_lock_init(&sdp->sd_statfs_spin);
spin_lock_init(&sdp->sd_rindex_spin);
- mutex_init(&sdp->sd_rindex_mutex);
sdp->sd_rindex_tree.rb_node = NULL;
INIT_LIST_HEAD(&sdp->sd_jindex_list);
@@ -431,10 +436,9 @@ static int gfs2_lookup_root(struct super_block *sb, struct dentry **dptr,
fs_err(sdp, "can't read in %s inode: %ld\n", name, PTR_ERR(inode));
return PTR_ERR(inode);
}
- dentry = d_alloc_root(inode);
+ dentry = d_make_root(inode);
if (!dentry) {
fs_err(sdp, "can't alloc %s dentry\n", name);
- iput(inode);
return -ENOMEM;
}
*dptr = dentry;
@@ -562,8 +566,12 @@ static void gfs2_others_may_mount(struct gfs2_sbd *sdp)
{
char *message = "FIRSTMOUNT=Done";
char *envp[] = { message, NULL };
- struct lm_lockstruct *ls = &sdp->sd_lockstruct;
- ls->ls_first_done = 1;
+
+ fs_info(sdp, "first mount done, others may mount\n");
+
+ if (sdp->sd_lockstruct.ls_ops->lm_first_done)
+ sdp->sd_lockstruct.ls_ops->lm_first_done(sdp);
+
kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp);
}
@@ -796,6 +804,11 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
fs_err(sdp, "can't get quota file inode: %d\n", error);
goto fail_rindex;
}
+
+ error = gfs2_rindex_update(sdp);
+ if (error)
+ goto fail_qinode;
+
return 0;
fail_qinode:
@@ -944,7 +957,6 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
struct gfs2_args *args = &sdp->sd_args;
const char *proto = sdp->sd_proto_name;
const char *table = sdp->sd_table_name;
- const char *fsname;
char *o, *options;
int ret;
@@ -1004,21 +1016,12 @@ hostdata_error:
}
}
- if (sdp->sd_args.ar_spectator)
- snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.s", table);
- else
- snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.%u", table,
- sdp->sd_lockstruct.ls_jid);
-
- fsname = strchr(table, ':');
- if (fsname)
- fsname++;
if (lm->lm_mount == NULL) {
fs_info(sdp, "Now mounting FS...\n");
complete_all(&sdp->sd_locking_init);
return 0;
}
- ret = lm->lm_mount(sdp, fsname);
+ ret = lm->lm_mount(sdp, table);
if (ret == 0)
fs_info(sdp, "Joined cluster. Now mounting FS...\n");
complete_all(&sdp->sd_locking_init);
@@ -1084,7 +1087,7 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
if (sdp->sd_args.ar_spectator) {
sb->s_flags |= MS_RDONLY;
- set_bit(SDF_NORECOVERY, &sdp->sd_flags);
+ set_bit(SDF_RORECOVERY, &sdp->sd_flags);
}
if (sdp->sd_args.ar_posix_acl)
sb->s_flags |= MS_POSIXACL;
@@ -1124,6 +1127,8 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
if (error)
goto fail;
+ snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s", sdp->sd_table_name);
+
gfs2_create_debugfs_file(sdp);
error = gfs2_sys_fs_add(sdp);
@@ -1160,6 +1165,13 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
goto fail_sb;
}
+ if (sdp->sd_args.ar_spectator)
+ snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.s",
+ sdp->sd_table_name);
+ else
+ snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.%u",
+ sdp->sd_table_name, sdp->sd_lockstruct.ls_jid);
+
error = init_inodes(sdp, DO);
if (error)
goto fail_sb;
@@ -1213,6 +1225,7 @@ fail_sys:
gfs2_sys_fs_del(sdp);
fail:
gfs2_delete_debugfs_file(sdp);
+ free_percpu(sdp->sd_lkstats);
kfree(sdp);
sb->s_fs_info = NULL;
return error;
@@ -1385,6 +1398,7 @@ static void gfs2_kill_sb(struct super_block *sb)
shrink_dcache_sb(sb);
kill_block_super(sb);
gfs2_delete_debugfs_file(sdp);
+ free_percpu(sdp->sd_lkstats);
kfree(sdp);
}
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index a45b21b0391..6019da3dcae 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -681,7 +681,7 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
ptr = qp;
nbytes = sizeof(struct gfs2_quota);
get_a_page:
- page = grab_cache_page(mapping, index);
+ page = find_or_create_page(mapping, index, GFP_NOFS);
if (!page)
return -ENOMEM;
@@ -720,12 +720,12 @@ get_a_page:
gfs2_trans_add_bh(ip->i_gl, bh, 0);
- kaddr = kmap_atomic(page, KM_USER0);
+ kaddr = kmap_atomic(page);
if (offset + sizeof(struct gfs2_quota) > PAGE_CACHE_SIZE)
nbytes = PAGE_CACHE_SIZE - offset;
memcpy(kaddr + offset, ptr, nbytes);
flush_dcache_page(page);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
unlock_page(page);
page_cache_release(page);
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index f2a02edcac8..963b2d75200 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -436,12 +436,16 @@ static void gfs2_recovery_done(struct gfs2_sbd *sdp, unsigned int jid,
char env_status[20];
char *envp[] = { env_jid, env_status, NULL };
struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+
ls->ls_recover_jid_done = jid;
ls->ls_recover_jid_status = message;
sprintf(env_jid, "JID=%d", jid);
sprintf(env_status, "RECOVERY=%s",
message == LM_RD_SUCCESS ? "Done" : "Failed");
kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp);
+
+ if (sdp->sd_lockstruct.ls_ops->lm_recovery_result)
+ sdp->sd_lockstruct.ls_ops->lm_recovery_result(sdp, jid, message);
}
void gfs2_recover_func(struct work_struct *work)
@@ -512,7 +516,9 @@ void gfs2_recover_func(struct work_struct *work)
if (error)
goto fail_gunlock_ji;
- if (test_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags)) {
+ if (test_bit(SDF_RORECOVERY, &sdp->sd_flags)) {
+ ro = 1;
+ } else if (test_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags)) {
if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))
ro = 1;
} else {
@@ -577,6 +583,7 @@ fail_gunlock_j:
fs_info(sdp, "jid=%u: %s\n", jd->jd_jid, (error) ? "Failed" : "Done");
fail:
+ jd->jd_recover_error = error;
gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP);
done:
clear_bit(JDF_RECOVERY, &jd->jd_flags);
@@ -605,6 +612,6 @@ int gfs2_recover_journal(struct gfs2_jdesc *jd, bool wait)
wait_on_bit(&jd->jd_flags, JDF_RECOVERY, gfs2_recovery_wait,
TASK_UNINTERRUPTIBLE);
- return 0;
+ return wait ? jd->jd_recover_error : 0;
}
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 22234627f68..3df65c9ab73 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -327,23 +327,31 @@ static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block)
* Returns: The resource group, or NULL if not found
*/
-struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk)
+struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk, bool exact)
{
- struct rb_node **newn;
+ struct rb_node *n, *next;
struct gfs2_rgrpd *cur;
spin_lock(&sdp->sd_rindex_spin);
- newn = &sdp->sd_rindex_tree.rb_node;
- while (*newn) {
- cur = rb_entry(*newn, struct gfs2_rgrpd, rd_node);
+ n = sdp->sd_rindex_tree.rb_node;
+ while (n) {
+ cur = rb_entry(n, struct gfs2_rgrpd, rd_node);
+ next = NULL;
if (blk < cur->rd_addr)
- newn = &((*newn)->rb_left);
+ next = n->rb_left;
else if (blk >= cur->rd_data0 + cur->rd_data)
- newn = &((*newn)->rb_right);
- else {
+ next = n->rb_right;
+ if (next == NULL) {
spin_unlock(&sdp->sd_rindex_spin);
+ if (exact) {
+ if (blk < cur->rd_addr)
+ return NULL;
+ if (blk >= cur->rd_data0 + cur->rd_data)
+ return NULL;
+ }
return cur;
}
+ n = next;
}
spin_unlock(&sdp->sd_rindex_spin);
@@ -532,7 +540,6 @@ u64 gfs2_ri_total(struct gfs2_sbd *sdp)
struct file_ra_state ra_state;
int error, rgrps;
- mutex_lock(&sdp->sd_rindex_mutex);
file_ra_state_init(&ra_state, inode->i_mapping);
for (rgrps = 0;; rgrps++) {
loff_t pos = rgrps * sizeof(struct gfs2_rindex);
@@ -545,11 +552,10 @@ u64 gfs2_ri_total(struct gfs2_sbd *sdp)
break;
total_data += be32_to_cpu(((struct gfs2_rindex *)buf)->ri_data);
}
- mutex_unlock(&sdp->sd_rindex_mutex);
return total_data;
}
-static void rgd_insert(struct gfs2_rgrpd *rgd)
+static int rgd_insert(struct gfs2_rgrpd *rgd)
{
struct gfs2_sbd *sdp = rgd->rd_sbd;
struct rb_node **newn = &sdp->sd_rindex_tree.rb_node, *parent = NULL;
@@ -565,11 +571,13 @@ static void rgd_insert(struct gfs2_rgrpd *rgd)
else if (rgd->rd_addr > cur->rd_addr)
newn = &((*newn)->rb_right);
else
- return;
+ return -EEXIST;
}
rb_link_node(&rgd->rd_node, parent, newn);
rb_insert_color(&rgd->rd_node, &sdp->sd_rindex_tree);
+ sdp->sd_rgrps++;
+ return 0;
}
/**
@@ -623,10 +631,13 @@ static int read_rindex_entry(struct gfs2_inode *ip,
if (rgd->rd_data > sdp->sd_max_rg_data)
sdp->sd_max_rg_data = rgd->rd_data;
spin_lock(&sdp->sd_rindex_spin);
- rgd_insert(rgd);
- sdp->sd_rgrps++;
+ error = rgd_insert(rgd);
spin_unlock(&sdp->sd_rindex_spin);
- return error;
+ if (!error)
+ return 0;
+
+ error = 0; /* someone else read in the rgrp; free it and ignore it */
+ gfs2_glock_put(rgd->rd_gl);
fail:
kfree(rgd->rd_bits);
@@ -683,20 +694,22 @@ int gfs2_rindex_update(struct gfs2_sbd *sdp)
struct gfs2_glock *gl = ip->i_gl;
struct gfs2_holder ri_gh;
int error = 0;
+ int unlock_required = 0;
/* Read new copy from disk if we don't have the latest */
if (!sdp->sd_rindex_uptodate) {
- mutex_lock(&sdp->sd_rindex_mutex);
- error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, &ri_gh);
- if (error)
- return error;
+ if (!gfs2_glock_is_locked_by_me(gl)) {
+ error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, &ri_gh);
+ if (error)
+ return error;
+ unlock_required = 1;
+ }
if (!sdp->sd_rindex_uptodate)
error = gfs2_ri_update(ip);
- gfs2_glock_dq_uninit(&ri_gh);
- mutex_unlock(&sdp->sd_rindex_mutex);
+ if (unlock_required)
+ gfs2_glock_dq_uninit(&ri_gh);
}
-
return error;
}
@@ -805,9 +818,9 @@ void gfs2_rgrp_go_unlock(struct gfs2_holder *gh)
}
-void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
+int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
struct buffer_head *bh,
- const struct gfs2_bitmap *bi)
+ const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed)
{
struct super_block *sb = sdp->sd_vfs;
struct block_device *bdev = sb->s_bdev;
@@ -818,11 +831,19 @@ void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
sector_t nr_sects = 0;
int rv;
unsigned int x;
+ u32 trimmed = 0;
+ u8 diff;
for (x = 0; x < bi->bi_len; x++) {
- const u8 *orig = bh->b_data + bi->bi_offset + x;
- const u8 *clone = bi->bi_clone + bi->bi_offset + x;
- u8 diff = ~(*orig | (*orig >> 1)) & (*clone | (*clone >> 1));
+ const u8 *clone = bi->bi_clone ? bi->bi_clone : bi->bi_bh->b_data;
+ clone += bi->bi_offset;
+ clone += x;
+ if (bh) {
+ const u8 *orig = bh->b_data + bi->bi_offset + x;
+ diff = ~(*orig | (*orig >> 1)) & (*clone | (*clone >> 1));
+ } else {
+ diff = ~(*clone | (*clone >> 1));
+ }
diff &= 0x55;
if (diff == 0)
continue;
@@ -833,11 +854,14 @@ void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
if (nr_sects == 0)
goto start_new_extent;
if ((start + nr_sects) != blk) {
- rv = blkdev_issue_discard(bdev, start,
- nr_sects, GFP_NOFS,
- 0);
- if (rv)
- goto fail;
+ if (nr_sects >= minlen) {
+ rv = blkdev_issue_discard(bdev,
+ start, nr_sects,
+ GFP_NOFS, 0);
+ if (rv)
+ goto fail;
+ trimmed += nr_sects;
+ }
nr_sects = 0;
start_new_extent:
start = blk;
@@ -848,15 +872,108 @@ start_new_extent:
blk += sects_per_blk;
}
}
- if (nr_sects) {
+ if (nr_sects >= minlen) {
rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS, 0);
if (rv)
goto fail;
+ trimmed += nr_sects;
}
- return;
+ if (ptrimmed)
+ *ptrimmed = trimmed;
+ return 0;
+
fail:
- fs_warn(sdp, "error %d on discard request, turning discards off for this filesystem", rv);
+ if (sdp->sd_args.ar_discard)
+ fs_warn(sdp, "error %d on discard request, turning discards off for this filesystem", rv);
sdp->sd_args.ar_discard = 0;
+ return -EIO;
+}
+
+/**
+ * gfs2_fitrim - Generate discard requests for unused bits of the filesystem
+ * @filp: Any file on the filesystem
+ * @argp: Pointer to the arguments (also used to pass result)
+ *
+ * Returns: 0 on success, otherwise error code
+ */
+
+int gfs2_fitrim(struct file *filp, void __user *argp)
+{
+ struct inode *inode = filp->f_dentry->d_inode;
+ struct gfs2_sbd *sdp = GFS2_SB(inode);
+ struct request_queue *q = bdev_get_queue(sdp->sd_vfs->s_bdev);
+ struct buffer_head *bh;
+ struct gfs2_rgrpd *rgd;
+ struct gfs2_rgrpd *rgd_end;
+ struct gfs2_holder gh;
+ struct fstrim_range r;
+ int ret = 0;
+ u64 amt;
+ u64 trimmed = 0;
+ unsigned int x;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (!blk_queue_discard(q))
+ return -EOPNOTSUPP;
+
+ if (argp == NULL) {
+ r.start = 0;
+ r.len = ULLONG_MAX;
+ r.minlen = 0;
+ } else if (copy_from_user(&r, argp, sizeof(r)))
+ return -EFAULT;
+
+ ret = gfs2_rindex_update(sdp);
+ if (ret)
+ return ret;
+
+ rgd = gfs2_blk2rgrpd(sdp, r.start, 0);
+ rgd_end = gfs2_blk2rgrpd(sdp, r.start + r.len, 0);
+
+ while (1) {
+
+ ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &gh);
+ if (ret)
+ goto out;
+
+ if (!(rgd->rd_flags & GFS2_RGF_TRIMMED)) {
+ /* Trim each bitmap in the rgrp */
+ for (x = 0; x < rgd->rd_length; x++) {
+ struct gfs2_bitmap *bi = rgd->rd_bits + x;
+ ret = gfs2_rgrp_send_discards(sdp, rgd->rd_data0, NULL, bi, r.minlen, &amt);
+ if (ret) {
+ gfs2_glock_dq_uninit(&gh);
+ goto out;
+ }
+ trimmed += amt;
+ }
+
+ /* Mark rgrp as having been trimmed */
+ ret = gfs2_trans_begin(sdp, RES_RG_HDR, 0);
+ if (ret == 0) {
+ bh = rgd->rd_bits[0].bi_bh;
+ rgd->rd_flags |= GFS2_RGF_TRIMMED;
+ gfs2_trans_add_bh(rgd->rd_gl, bh, 1);
+ gfs2_rgrp_out(rgd, bh->b_data);
+ gfs2_trans_end(sdp);
+ }
+ }
+ gfs2_glock_dq_uninit(&gh);
+
+ if (rgd == rgd_end)
+ break;
+
+ rgd = gfs2_rgrpd_get_next(rgd);
+ }
+
+out:
+ r.len = trimmed << 9;
+ if (argp && copy_to_user(argp, &r, sizeof(r)))
+ return -EFAULT;
+
+ return ret;
}
/**
@@ -1003,7 +1120,7 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, ip->i_goal))
rgd = begin = ip->i_rgd;
else
- rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal);
+ rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1);
if (rgd == NULL)
return -EBADSLT;
@@ -1108,9 +1225,9 @@ void gfs2_inplace_release(struct gfs2_inode *ip)
{
struct gfs2_blkreserv *rs = ip->i_res;
- gfs2_blkrsv_put(ip);
if (rs->rs_rgd_gh.gh_gl)
gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
+ gfs2_blkrsv_put(ip);
}
/**
@@ -1288,7 +1405,7 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
u32 length, rgrp_blk, buf_blk;
unsigned int buf;
- rgd = gfs2_blk2rgrpd(sdp, bstart);
+ rgd = gfs2_blk2rgrpd(sdp, bstart, 1);
if (!rgd) {
if (gfs2_consist(sdp))
fs_err(sdp, "block = %llu\n", (unsigned long long)bstart);
@@ -1469,7 +1586,7 @@ void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta)
return;
trace_gfs2_block_alloc(ip, bstart, blen, GFS2_BLKST_FREE);
rgd->rd_free += blen;
-
+ rgd->rd_flags &= ~GFS2_RGF_TRIMMED;
gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
@@ -1555,14 +1672,9 @@ int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type)
{
struct gfs2_rgrpd *rgd;
struct gfs2_holder rgd_gh;
- int error;
-
- error = gfs2_rindex_update(sdp);
- if (error)
- return error;
+ int error = -EINVAL;
- error = -EINVAL;
- rgd = gfs2_blk2rgrpd(sdp, no_addr);
+ rgd = gfs2_blk2rgrpd(sdp, no_addr, 1);
if (!rgd)
goto fail;
@@ -1605,7 +1717,7 @@ void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist,
if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, block))
rgd = ip->i_rgd;
else
- rgd = gfs2_blk2rgrpd(sdp, block);
+ rgd = gfs2_blk2rgrpd(sdp, block, 1);
if (!rgd) {
fs_err(sdp, "rlist_add: no rgrp for block %llu\n", (unsigned long long)block);
return;
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index ceec9106cdf..b4b10f4de25 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -11,6 +11,7 @@
#define __RGRP_DOT_H__
#include <linux/slab.h>
+#include <linux/uaccess.h>
struct gfs2_rgrpd;
struct gfs2_sbd;
@@ -18,7 +19,7 @@ struct gfs2_holder;
extern void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd);
-extern struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk);
+extern struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk, bool exact);
extern struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp);
extern struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd);
@@ -62,8 +63,9 @@ extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state);
extern void gfs2_rlist_free(struct gfs2_rgrp_list *rlist);
extern u64 gfs2_ri_total(struct gfs2_sbd *sdp);
extern int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl);
-extern void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
- struct buffer_head *bh,
- const struct gfs2_bitmap *bi);
+extern int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
+ struct buffer_head *bh,
+ const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed);
+extern int gfs2_fitrim(struct file *filp, void __user *argp);
#endif /* __RGRP_DOT_H__ */
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 4553ce515f6..6172fa77ad5 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1417,7 +1417,7 @@ static int gfs2_dinode_dealloc(struct gfs2_inode *ip)
if (error)
goto out;
- rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr);
+ rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr, 1);
if (!rgd) {
gfs2_consist_inode(ip);
error = -EIO;
@@ -1557,6 +1557,7 @@ out:
end_writeback(inode);
gfs2_dir_hash_inval(ip);
ip->i_gl->gl_object = NULL;
+ flush_delayed_work_sync(&ip->i_gl->gl_work);
gfs2_glock_add_to_lru(ip->i_gl);
gfs2_glock_put(ip->i_gl);
ip->i_gl = NULL;
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 443cabcfcd2..d33172c291b 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -298,7 +298,7 @@ static ssize_t block_show(struct gfs2_sbd *sdp, char *buf)
ssize_t ret;
int val = 0;
- if (test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags))
+ if (test_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags))
val = 1;
ret = sprintf(buf, "%d\n", val);
return ret;
@@ -313,9 +313,9 @@ static ssize_t block_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
val = simple_strtol(buf, NULL, 0);
if (val == 1)
- set_bit(DFL_BLOCK_LOCKS, &ls->ls_flags);
+ set_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags);
else if (val == 0) {
- clear_bit(DFL_BLOCK_LOCKS, &ls->ls_flags);
+ clear_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags);
smp_mb__after_clear_bit();
gfs2_glock_thaw(sdp);
} else {
@@ -350,8 +350,8 @@ static ssize_t lkfirst_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
goto out;
if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL)
goto out;
- sdp->sd_lockstruct.ls_first = first;
- rv = 0;
+ sdp->sd_lockstruct.ls_first = first;
+ rv = 0;
out:
spin_unlock(&sdp->sd_jindex_spin);
return rv ? rv : len;
@@ -360,19 +360,14 @@ out:
static ssize_t first_done_show(struct gfs2_sbd *sdp, char *buf)
{
struct lm_lockstruct *ls = &sdp->sd_lockstruct;
- return sprintf(buf, "%d\n", ls->ls_first_done);
+ return sprintf(buf, "%d\n", !!test_bit(DFL_FIRST_MOUNT_DONE, &ls->ls_recover_flags));
}
-static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
+int gfs2_recover_set(struct gfs2_sbd *sdp, unsigned jid)
{
- unsigned jid;
struct gfs2_jdesc *jd;
int rv;
- rv = sscanf(buf, "%u", &jid);
- if (rv != 1)
- return -EINVAL;
-
rv = -ESHUTDOWN;
spin_lock(&sdp->sd_jindex_spin);
if (test_bit(SDF_NORECOVERY, &sdp->sd_flags))
@@ -389,6 +384,20 @@ static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
}
out:
spin_unlock(&sdp->sd_jindex_spin);
+ return rv;
+}
+
+static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
+{
+ unsigned jid;
+ int rv;
+
+ rv = sscanf(buf, "%u", &jid);
+ if (rv != 1)
+ return -EINVAL;
+
+ rv = gfs2_recover_set(sdp, jid);
+
return rv ? rv : len;
}
diff --git a/fs/gfs2/sys.h b/fs/gfs2/sys.h
index e94560e836d..79182d6ad6a 100644
--- a/fs/gfs2/sys.h
+++ b/fs/gfs2/sys.h
@@ -19,5 +19,7 @@ void gfs2_sys_fs_del(struct gfs2_sbd *sdp);
int gfs2_sys_init(void);
void gfs2_sys_uninit(void);
+int gfs2_recover_set(struct gfs2_sbd *sdp, unsigned jid);
+
#endif /* __SYS_DOT_H__ */
diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h
index 5d07609ec57..dfa89cd7553 100644
--- a/fs/gfs2/trace_gfs2.h
+++ b/fs/gfs2/trace_gfs2.h
@@ -11,6 +11,7 @@
#include <linux/dlmconstants.h>
#include <linux/gfs2_ondisk.h>
#include <linux/writeback.h>
+#include <linux/ktime.h>
#include "incore.h"
#include "glock.h"
@@ -43,7 +44,8 @@
{(1UL << GLF_FROZEN), "F" }, \
{(1UL << GLF_QUEUED), "q" }, \
{(1UL << GLF_LRU), "L" }, \
- {(1UL << GLF_OBJECT), "o" })
+ {(1UL << GLF_OBJECT), "o" }, \
+ {(1UL << GLF_BLOCKING), "b" })
#ifndef NUMPTY
#define NUMPTY
@@ -236,6 +238,62 @@ TRACE_EVENT(gfs2_glock_queue,
glock_trace_name(__entry->state))
);
+/* DLM sends a reply to GFS2 */
+TRACE_EVENT(gfs2_glock_lock_time,
+
+ TP_PROTO(const struct gfs2_glock *gl, s64 tdiff),
+
+ TP_ARGS(gl, tdiff),
+
+ TP_STRUCT__entry(
+ __field( dev_t, dev )
+ __field( u64, glnum )
+ __field( u32, gltype )
+ __field( int, status )
+ __field( char, flags )
+ __field( s64, tdiff )
+ __field( s64, srtt )
+ __field( s64, srttvar )
+ __field( s64, srttb )
+ __field( s64, srttvarb )
+ __field( s64, sirt )
+ __field( s64, sirtvar )
+ __field( s64, dcount )
+ __field( s64, qcount )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = gl->gl_sbd->sd_vfs->s_dev;
+ __entry->glnum = gl->gl_name.ln_number;
+ __entry->gltype = gl->gl_name.ln_type;
+ __entry->status = gl->gl_lksb.sb_status;
+ __entry->flags = gl->gl_lksb.sb_flags;
+ __entry->tdiff = tdiff;
+ __entry->srtt = gl->gl_stats.stats[GFS2_LKS_SRTT];
+ __entry->srttvar = gl->gl_stats.stats[GFS2_LKS_SRTTVAR];
+ __entry->srttb = gl->gl_stats.stats[GFS2_LKS_SRTTB];
+ __entry->srttvarb = gl->gl_stats.stats[GFS2_LKS_SRTTVARB];
+ __entry->sirt = gl->gl_stats.stats[GFS2_LKS_SIRT];
+ __entry->sirtvar = gl->gl_stats.stats[GFS2_LKS_SIRTVAR];
+ __entry->dcount = gl->gl_stats.stats[GFS2_LKS_DCOUNT];
+ __entry->qcount = gl->gl_stats.stats[GFS2_LKS_QCOUNT];
+ ),
+
+ TP_printk("%u,%u glock %d:%lld status:%d flags:%02x tdiff:%lld srtt:%lld/%lld srttb:%lld/%lld sirt:%lld/%lld dcnt:%lld qcnt:%lld",
+ MAJOR(__entry->dev), MINOR(__entry->dev), __entry->gltype,
+ (unsigned long long)__entry->glnum,
+ __entry->status, __entry->flags,
+ (long long)__entry->tdiff,
+ (long long)__entry->srtt,
+ (long long)__entry->srttvar,
+ (long long)__entry->srttb,
+ (long long)__entry->srttvarb,
+ (long long)__entry->sirt,
+ (long long)__entry->sirtvar,
+ (long long)__entry->dcount,
+ (long long)__entry->qcount)
+);
+
/* Section 2 - Log/journal
*
* Objectives:
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index 53511291fe3..9e7765e8e7b 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -25,6 +25,7 @@ struct kmem_cache *gfs2_inode_cachep __read_mostly;
struct kmem_cache *gfs2_bufdata_cachep __read_mostly;
struct kmem_cache *gfs2_rgrpd_cachep __read_mostly;
struct kmem_cache *gfs2_quotad_cachep __read_mostly;
+mempool_t *gfs2_bh_pool __read_mostly;
void gfs2_assert_i(struct gfs2_sbd *sdp)
{
diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h
index b432e04600d..a4ce76c67db 100644
--- a/fs/gfs2/util.h
+++ b/fs/gfs2/util.h
@@ -10,6 +10,8 @@
#ifndef __UTIL_DOT_H__
#define __UTIL_DOT_H__
+#include <linux/mempool.h>
+
#include "incore.h"
#define fs_printk(level, fs, fmt, arg...) \
@@ -150,6 +152,7 @@ extern struct kmem_cache *gfs2_inode_cachep;
extern struct kmem_cache *gfs2_bufdata_cachep;
extern struct kmem_cache *gfs2_rgrpd_cachep;
extern struct kmem_cache *gfs2_quotad_cachep;
+extern mempool_t *gfs2_bh_pool;
static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt,
unsigned int *p)
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index e9636591b5d..927f4df874a 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -238,6 +238,10 @@ static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
unsigned int x;
int error;
+ error = gfs2_rindex_update(sdp);
+ if (error)
+ return error;
+
if (GFS2_EA_IS_STUFFED(ea))
return 0;
@@ -251,7 +255,7 @@ static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
if (!blks)
return 0;
- rgd = gfs2_blk2rgrpd(sdp, bn);
+ rgd = gfs2_blk2rgrpd(sdp, bn, 1);
if (!rgd) {
gfs2_consist_inode(ip);
return -EIO;
@@ -1330,6 +1334,10 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip)
unsigned int x;
int error;
+ error = gfs2_rindex_update(sdp);
+ if (error)
+ return error;
+
memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, &indbh);
@@ -1439,7 +1447,11 @@ static int ea_dealloc_block(struct gfs2_inode *ip)
struct gfs2_holder gh;
int error;
- rgd = gfs2_blk2rgrpd(sdp, ip->i_eattr);
+ error = gfs2_rindex_update(sdp);
+ if (error)
+ return error;
+
+ rgd = gfs2_blk2rgrpd(sdp, ip->i_eattr, 1);
if (!rgd) {
gfs2_consist_inode(ip);
return -EIO;
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 8137fb3e678..7b4c537d6e1 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -430,15 +430,13 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent)
sb->s_d_op = &hfs_dentry_operations;
res = -ENOMEM;
- sb->s_root = d_alloc_root(root_inode);
+ sb->s_root = d_make_root(root_inode);
if (!sb->s_root)
- goto bail_iput;
+ goto bail_no_root;
/* everything's okay */
return 0;
-bail_iput:
- iput(root_inode);
bail_no_root:
printk(KERN_ERR "hfs: get root inode failed.\n");
bail:
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 21a5b7fc6db..4e75ac646fe 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -317,6 +317,11 @@ static inline unsigned short hfsplus_min_io_size(struct super_block *sb)
/*
+ * hfs+-specific ioctl for making the filesystem bootable
+ */
+#define HFSPLUS_IOC_BLESS _IO('h', 0x80)
+
+/*
* Functions in any *.c used in other files
*/
diff --git a/fs/hfsplus/hfsplus_raw.h b/fs/hfsplus/hfsplus_raw.h
index 927cdd6d5bf..921967e5abb 100644
--- a/fs/hfsplus/hfsplus_raw.h
+++ b/fs/hfsplus/hfsplus_raw.h
@@ -117,7 +117,7 @@ struct hfsplus_vh {
__be32 write_count;
__be64 encodings_bmp;
- u8 finder_info[32];
+ u32 finder_info[8];
struct hfsplus_fork_raw alloc_file;
struct hfsplus_fork_raw ext_file;
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 6643b242bdd..82b69ee4dac 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -193,6 +193,7 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir,
mutex_init(&hip->extents_lock);
hip->extent_state = 0;
hip->flags = 0;
+ hip->userflags = 0;
set_bit(HFSPLUS_I_RSRC, &hip->flags);
err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
@@ -400,6 +401,7 @@ struct inode *hfsplus_new_inode(struct super_block *sb, umode_t mode)
atomic_set(&hip->opencnt, 0);
hip->extent_state = 0;
hip->flags = 0;
+ hip->userflags = 0;
memset(hip->first_extents, 0, sizeof(hfsplus_extent_rec));
memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec));
hip->alloc_blocks = 0;
diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c
index f66c7655b3f..c640ba57074 100644
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -20,6 +20,38 @@
#include <asm/uaccess.h>
#include "hfsplus_fs.h"
+/*
+ * "Blessing" an HFS+ filesystem writes metadata to the superblock informing
+ * the platform firmware which file to boot from
+ */
+static int hfsplus_ioctl_bless(struct file *file, int __user *user_flags)
+{
+ struct dentry *dentry = file->f_path.dentry;
+ struct inode *inode = dentry->d_inode;
+ struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb);
+ struct hfsplus_vh *vh = sbi->s_vhdr;
+ struct hfsplus_vh *bvh = sbi->s_backup_vhdr;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ mutex_lock(&sbi->vh_mutex);
+
+ /* Directory containing the bootable system */
+ vh->finder_info[0] = bvh->finder_info[0] =
+ cpu_to_be32(parent_ino(dentry));
+
+ /* Bootloader */
+ vh->finder_info[1] = bvh->finder_info[1] = cpu_to_be32(inode->i_ino);
+
+ /* Per spec, the OS X system folder - same as finder_info[0] here */
+ vh->finder_info[5] = bvh->finder_info[5] =
+ cpu_to_be32(parent_ino(dentry));
+
+ mutex_unlock(&sbi->vh_mutex);
+ return 0;
+}
+
static int hfsplus_ioctl_getflags(struct file *file, int __user *user_flags)
{
struct inode *inode = file->f_path.dentry->d_inode;
@@ -108,6 +140,8 @@ long hfsplus_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
return hfsplus_ioctl_getflags(file, argp);
case HFSPLUS_IOC_EXT2_SETFLAGS:
return hfsplus_ioctl_setflags(file, argp);
+ case HFSPLUS_IOC_BLESS:
+ return hfsplus_ioctl_bless(file, argp);
default:
return -ENOTTY;
}
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 427682ca9e4..ceb1c281eef 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -465,6 +465,13 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
goto out_put_alloc_file;
}
+ sb->s_d_op = &hfsplus_dentry_operations;
+ sb->s_root = d_make_root(root);
+ if (!sb->s_root) {
+ err = -ENOMEM;
+ goto out_put_alloc_file;
+ }
+
str.len = sizeof(HFSP_HIDDENDIR_NAME) - 1;
str.name = HFSP_HIDDENDIR_NAME;
err = hfs_find_init(sbi->cat_tree, &fd);
@@ -515,13 +522,6 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
}
}
- sb->s_d_op = &hfsplus_dentry_operations;
- sb->s_root = d_alloc_root(root);
- if (!sb->s_root) {
- err = -ENOMEM;
- goto out_put_hidden_dir;
- }
-
unload_nls(sbi->nls);
sbi->nls = nls;
return 0;
@@ -529,7 +529,8 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
out_put_hidden_dir:
iput(sbi->hidden_dir);
out_put_root:
- iput(root);
+ dput(sb->s_root);
+ sb->s_root = NULL;
out_put_alloc_file:
iput(sbi->alloc_file);
out_close_cat_tree:
diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h
index 3cbfa93cd78..1fe731337f0 100644
--- a/fs/hostfs/hostfs.h
+++ b/fs/hostfs/hostfs.h
@@ -67,7 +67,8 @@ extern int access_file(char *path, int r, int w, int x);
extern int open_file(char *path, int r, int w, int append);
extern void *open_dir(char *path, int *err_out);
extern char *read_dir(void *stream, unsigned long long *pos,
- unsigned long long *ino_out, int *len_out);
+ unsigned long long *ino_out, int *len_out,
+ unsigned int *type_out);
extern void close_file(void *stream);
extern int replace_file(int oldfd, int fd);
extern void close_dir(void *stream);
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index e130bd46d67..07c516bfea7 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -283,6 +283,7 @@ int hostfs_readdir(struct file *file, void *ent, filldir_t filldir)
char *name;
unsigned long long next, ino;
int error, len;
+ unsigned int type;
name = dentry_name(file->f_path.dentry);
if (name == NULL)
@@ -292,9 +293,9 @@ int hostfs_readdir(struct file *file, void *ent, filldir_t filldir)
if (dir == NULL)
return -error;
next = file->f_pos;
- while ((name = read_dir(dir, &next, &ino, &len)) != NULL) {
+ while ((name = read_dir(dir, &next, &ino, &len, &type)) != NULL) {
error = (*filldir)(ent, name, len, file->f_pos,
- ino, DT_UNKNOWN);
+ ino, type);
if (error) break;
file->f_pos = next;
}
@@ -966,9 +967,9 @@ static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent)
}
err = -ENOMEM;
- sb->s_root = d_alloc_root(root_inode);
+ sb->s_root = d_make_root(root_inode);
if (sb->s_root == NULL)
- goto out_put;
+ goto out;
return 0;
diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c
index dd7bc38a382..a74ad0d371c 100644
--- a/fs/hostfs/hostfs_user.c
+++ b/fs/hostfs/hostfs_user.c
@@ -98,7 +98,8 @@ void *open_dir(char *path, int *err_out)
}
char *read_dir(void *stream, unsigned long long *pos,
- unsigned long long *ino_out, int *len_out)
+ unsigned long long *ino_out, int *len_out,
+ unsigned int *type_out)
{
DIR *dir = stream;
struct dirent *ent;
@@ -109,6 +110,7 @@ char *read_dir(void *stream, unsigned long long *pos,
return NULL;
*len_out = strlen(ent->d_name);
*ino_out = ent->d_ino;
+ *type_out = ent->d_type;
*pos = telldir(dir);
return ent->d_name;
}
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index 3690467c944..54f6eccb79d 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -625,11 +625,9 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
hpfs_init_inode(root);
hpfs_read_inode(root);
unlock_new_inode(root);
- s->s_root = d_alloc_root(root);
- if (!s->s_root) {
- iput(root);
+ s->s_root = d_make_root(root);
+ if (!s->s_root)
goto bail0;
- }
/*
* find the root directory's . pointer & finish filling in the inode
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index d92f4ce8092..a80e45a690a 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -726,17 +726,12 @@ static int hppfs_fill_super(struct super_block *sb, void *d, int silent)
err = -ENOMEM;
root_inode = get_inode(sb, dget(proc_mnt->mnt_root));
- if (!root_inode)
- goto out_mntput;
-
- sb->s_root = d_alloc_root(root_inode);
+ sb->s_root = d_make_root(root_inode);
if (!sb->s_root)
- goto out_iput;
+ goto out_mntput;
return 0;
- out_iput:
- iput(root_inode);
out_mntput:
mntput(proc_mnt);
out:
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index e425ad9d049..28cf06e4ec8 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -41,6 +41,25 @@ const struct file_operations hugetlbfs_file_operations;
static const struct inode_operations hugetlbfs_dir_inode_operations;
static const struct inode_operations hugetlbfs_inode_operations;
+struct hugetlbfs_config {
+ uid_t uid;
+ gid_t gid;
+ umode_t mode;
+ long nr_blocks;
+ long nr_inodes;
+ struct hstate *hstate;
+};
+
+struct hugetlbfs_inode_info {
+ struct shared_policy policy;
+ struct inode vfs_inode;
+};
+
+static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode)
+{
+ return container_of(inode, struct hugetlbfs_inode_info, vfs_inode);
+}
+
static struct backing_dev_info hugetlbfs_backing_dev_info = {
.name = "hugetlbfs",
.ra_pages = 0, /* No readahead */
@@ -154,10 +173,12 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
return addr;
}
- start_addr = mm->free_area_cache;
-
- if (len <= mm->cached_hole_size)
+ if (len > mm->cached_hole_size)
+ start_addr = mm->free_area_cache;
+ else {
start_addr = TASK_UNMAPPED_BASE;
+ mm->cached_hole_size = 0;
+ }
full_search:
addr = ALIGN(start_addr, huge_page_size(h));
@@ -171,13 +192,18 @@ full_search:
*/
if (start_addr != TASK_UNMAPPED_BASE) {
start_addr = TASK_UNMAPPED_BASE;
+ mm->cached_hole_size = 0;
goto full_search;
}
return -ENOMEM;
}
- if (!vma || addr + len <= vma->vm_start)
+ if (!vma || addr + len <= vma->vm_start) {
+ mm->free_area_cache = addr + len;
return addr;
+ }
+ if (addr + mm->cached_hole_size < vma->vm_start)
+ mm->cached_hole_size = vma->vm_start - addr;
addr = ALIGN(vma->vm_end, huge_page_size(h));
}
}
@@ -238,17 +264,10 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf,
loff_t isize;
ssize_t retval = 0;
- mutex_lock(&inode->i_mutex);
-
/* validate length */
if (len == 0)
goto out;
- isize = i_size_read(inode);
- if (!isize)
- goto out;
-
- end_index = (isize - 1) >> huge_page_shift(h);
for (;;) {
struct page *page;
unsigned long nr, ret;
@@ -256,18 +275,21 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf,
/* nr is the maximum number of bytes to copy from this page */
nr = huge_page_size(h);
+ isize = i_size_read(inode);
+ if (!isize)
+ goto out;
+ end_index = (isize - 1) >> huge_page_shift(h);
if (index >= end_index) {
if (index > end_index)
goto out;
nr = ((isize - 1) & ~huge_page_mask(h)) + 1;
- if (nr <= offset) {
+ if (nr <= offset)
goto out;
- }
}
nr = nr - offset;
/* Find the page */
- page = find_get_page(mapping, index);
+ page = find_lock_page(mapping, index);
if (unlikely(page == NULL)) {
/*
* We have a HOLE, zero out the user-buffer for the
@@ -279,17 +301,18 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf,
else
ra = 0;
} else {
+ unlock_page(page);
+
/*
* We have the page, copy it to user space buffer.
*/
ra = hugetlbfs_read_actor(page, offset, buf, len, nr);
ret = ra;
+ page_cache_release(page);
}
if (ra < 0) {
if (retval == 0)
retval = ra;
- if (page)
- page_cache_release(page);
goto out;
}
@@ -299,16 +322,12 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf,
index += offset >> huge_page_shift(h);
offset &= ~huge_page_mask(h);
- if (page)
- page_cache_release(page);
-
/* short read or no more work */
if ((ret != nr) || (len == 0))
break;
}
out:
*ppos = ((loff_t)index << huge_page_shift(h)) + offset;
- mutex_unlock(&inode->i_mutex);
return retval;
}
@@ -583,7 +602,8 @@ static int hugetlbfs_set_page_dirty(struct page *page)
}
static int hugetlbfs_migrate_page(struct address_space *mapping,
- struct page *newpage, struct page *page)
+ struct page *newpage, struct page *page,
+ enum migrate_mode mode)
{
int rc;
@@ -606,9 +626,15 @@ static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
spin_lock(&sbinfo->stat_lock);
/* If no limits set, just report 0 for max/free/used
* blocks, like simple_statfs() */
- if (sbinfo->max_blocks >= 0) {
- buf->f_blocks = sbinfo->max_blocks;
- buf->f_bavail = buf->f_bfree = sbinfo->free_blocks;
+ if (sbinfo->spool) {
+ long free_pages;
+
+ spin_lock(&sbinfo->spool->lock);
+ buf->f_blocks = sbinfo->spool->max_hpages;
+ free_pages = sbinfo->spool->max_hpages
+ - sbinfo->spool->used_hpages;
+ buf->f_bavail = buf->f_bfree = free_pages;
+ spin_unlock(&sbinfo->spool->lock);
buf->f_files = sbinfo->max_inodes;
buf->f_ffree = sbinfo->free_inodes;
}
@@ -624,6 +650,10 @@ static void hugetlbfs_put_super(struct super_block *sb)
if (sbi) {
sb->s_fs_info = NULL;
+
+ if (sbi->spool)
+ hugepage_put_subpool(sbi->spool);
+
kfree(sbi);
}
}
@@ -830,8 +860,6 @@ bad_val:
static int
hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)
{
- struct inode * inode;
- struct dentry * root;
int ret;
struct hugetlbfs_config config;
struct hugetlbfs_sb_info *sbinfo;
@@ -854,60 +882,31 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)
sb->s_fs_info = sbinfo;
sbinfo->hstate = config.hstate;
spin_lock_init(&sbinfo->stat_lock);
- sbinfo->max_blocks = config.nr_blocks;
- sbinfo->free_blocks = config.nr_blocks;
sbinfo->max_inodes = config.nr_inodes;
sbinfo->free_inodes = config.nr_inodes;
+ sbinfo->spool = NULL;
+ if (config.nr_blocks != -1) {
+ sbinfo->spool = hugepage_new_subpool(config.nr_blocks);
+ if (!sbinfo->spool)
+ goto out_free;
+ }
sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_blocksize = huge_page_size(config.hstate);
sb->s_blocksize_bits = huge_page_shift(config.hstate);
sb->s_magic = HUGETLBFS_MAGIC;
sb->s_op = &hugetlbfs_ops;
sb->s_time_gran = 1;
- inode = hugetlbfs_get_root(sb, &config);
- if (!inode)
- goto out_free;
-
- root = d_alloc_root(inode);
- if (!root) {
- iput(inode);
+ sb->s_root = d_make_root(hugetlbfs_get_root(sb, &config));
+ if (!sb->s_root)
goto out_free;
- }
- sb->s_root = root;
return 0;
out_free:
+ if (sbinfo->spool)
+ kfree(sbinfo->spool);
kfree(sbinfo);
return -ENOMEM;
}
-int hugetlb_get_quota(struct address_space *mapping, long delta)
-{
- int ret = 0;
- struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(mapping->host->i_sb);
-
- if (sbinfo->free_blocks > -1) {
- spin_lock(&sbinfo->stat_lock);
- if (sbinfo->free_blocks - delta >= 0)
- sbinfo->free_blocks -= delta;
- else
- ret = -ENOMEM;
- spin_unlock(&sbinfo->stat_lock);
- }
-
- return ret;
-}
-
-void hugetlb_put_quota(struct address_space *mapping, long delta)
-{
- struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(mapping->host->i_sb);
-
- if (sbinfo->free_blocks > -1) {
- spin_lock(&sbinfo->stat_lock);
- sbinfo->free_blocks += delta;
- spin_unlock(&sbinfo->stat_lock);
- }
-}
-
static struct dentry *hugetlbfs_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
{
@@ -927,8 +926,8 @@ static int can_do_hugetlb_shm(void)
return capable(CAP_IPC_LOCK) || in_group_p(sysctl_hugetlb_shm_group);
}
-struct file *hugetlb_file_setup(const char *name, size_t size,
- vm_flags_t acctflag,
+struct file *hugetlb_file_setup(const char *name, unsigned long addr,
+ size_t size, vm_flags_t acctflag,
struct user_struct **user, int creat_flags)
{
int error = -ENOMEM;
@@ -937,6 +936,8 @@ struct file *hugetlb_file_setup(const char *name, size_t size,
struct path path;
struct dentry *root;
struct qstr quick_string;
+ struct hstate *hstate;
+ unsigned long num_pages;
*user = NULL;
if (!hugetlbfs_vfsmount)
@@ -945,7 +946,11 @@ struct file *hugetlb_file_setup(const char *name, size_t size,
if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) {
*user = current_user();
if (user_shm_lock(size, *user)) {
- printk_once(KERN_WARNING "Using mlock ulimits for SHM_HUGETLB is deprecated\n");
+ task_lock(current);
+ printk_once(KERN_WARNING
+ "%s (%d): Using mlock ulimits for SHM_HUGETLB is deprecated\n",
+ current->comm, current->pid);
+ task_unlock(current);
} else {
*user = NULL;
return ERR_PTR(-EPERM);
@@ -966,10 +971,12 @@ struct file *hugetlb_file_setup(const char *name, size_t size,
if (!inode)
goto out_dentry;
+ hstate = hstate_inode(inode);
+ size += addr & ~huge_page_mask(hstate);
+ num_pages = ALIGN(size, huge_page_size(hstate)) >>
+ huge_page_shift(hstate);
error = -ENOMEM;
- if (hugetlb_reserve_pages(inode, 0,
- size >> huge_page_shift(hstate_inode(inode)), NULL,
- acctflag))
+ if (hugetlb_reserve_pages(inode, 0, num_pages, NULL, acctflag))
goto out_inode;
d_instantiate(path.dentry, inode);
@@ -1005,6 +1012,7 @@ static int __init init_hugetlbfs_fs(void)
if (error)
return error;
+ error = -ENOMEM;
hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache",
sizeof(struct hugetlbfs_inode_info),
0, 0, init_once);
@@ -1025,8 +1033,7 @@ static int __init init_hugetlbfs_fs(void)
error = PTR_ERR(vfsmount);
out:
- if (error)
- kmem_cache_destroy(hugetlbfs_inode_cachep);
+ kmem_cache_destroy(hugetlbfs_inode_cachep);
out2:
bdi_destroy(&hugetlbfs_backing_dev_info);
return error;
diff --git a/fs/inode.c b/fs/inode.c
index 4fa4f0916af..9f4f5fecc09 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -2,29 +2,19 @@
* (C) 1997 Linus Torvalds
* (C) 1999 Andrea Arcangeli <andrea@suse.de> (dynamic inode allocation)
*/
+#include <linux/export.h>
#include <linux/fs.h>
#include <linux/mm.h>
-#include <linux/dcache.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/writeback.h>
-#include <linux/module.h>
#include <linux/backing-dev.h>
-#include <linux/wait.h>
-#include <linux/rwsem.h>
#include <linux/hash.h>
#include <linux/swap.h>
#include <linux/security.h>
-#include <linux/pagemap.h>
#include <linux/cdev.h>
#include <linux/bootmem.h>
#include <linux/fsnotify.h>
#include <linux/mount.h>
-#include <linux/async.h>
#include <linux/posix_acl.h>
#include <linux/prefetch.h>
-#include <linux/ima.h>
-#include <linux/cred.h>
#include <linux/buffer_head.h> /* for inode_has_buffers */
#include <linux/ratelimit.h>
#include "internal.h"
@@ -322,9 +312,6 @@ EXPORT_SYMBOL(clear_nlink);
void set_nlink(struct inode *inode, unsigned int nlink)
{
if (!nlink) {
- printk_ratelimited(KERN_INFO
- "set_nlink() clearing i_nlink on %s inode %li\n",
- inode->i_sb->s_type->name, inode->i_ino);
clear_nlink(inode);
} else {
/* Yes, some filesystems do change nlink from zero to one */
@@ -941,8 +928,7 @@ void lockdep_annotate_inode_mutex_key(struct inode *inode)
struct file_system_type *type = inode->i_sb->s_type;
/* Set new key only if filesystem hasn't already changed it */
- if (!lockdep_match_class(&inode->i_mutex,
- &type->i_mutex_key)) {
+ if (lockdep_match_class(&inode->i_mutex, &type->i_mutex_key)) {
/*
* ensure nobody is actually holding i_mutex
*/
@@ -969,6 +955,7 @@ void unlock_new_inode(struct inode *inode)
spin_lock(&inode->i_lock);
WARN_ON(!(inode->i_state & I_NEW));
inode->i_state &= ~I_NEW;
+ smp_mb();
wake_up_bit(&inode->i_state, __I_NEW);
spin_unlock(&inode->i_lock);
}
@@ -1372,17 +1359,6 @@ int generic_delete_inode(struct inode *inode)
EXPORT_SYMBOL(generic_delete_inode);
/*
- * Normal UNIX filesystem behaviour: delete the
- * inode when the usage count drops to zero, and
- * i_nlink is zero.
- */
-int generic_drop_inode(struct inode *inode)
-{
- return !inode->i_nlink || inode_unhashed(inode);
-}
-EXPORT_SYMBOL_GPL(generic_drop_inode);
-
-/*
* Called when we're dropping the last reference
* to an inode.
*
@@ -1513,9 +1489,10 @@ static int relatime_need_update(struct vfsmount *mnt, struct inode *inode,
* This function automatically handles read only file systems and media,
* as well as the "noatime" flag and inode specific "noatime" markers.
*/
-void touch_atime(struct vfsmount *mnt, struct dentry *dentry)
+void touch_atime(struct path *path)
{
- struct inode *inode = dentry->d_inode;
+ struct vfsmount *mnt = path->mnt;
+ struct inode *inode = path->dentry->d_inode;
struct timespec now;
if (inode->i_flags & S_NOATIME)
@@ -1654,7 +1631,7 @@ __setup("ihash_entries=", set_ihash_entries);
*/
void __init inode_init_early(void)
{
- int loop;
+ unsigned int loop;
/* If hashes are distributed across NUMA nodes, defer
* hash allocation until vmalloc space is available.
@@ -1672,13 +1649,13 @@ void __init inode_init_early(void)
&i_hash_mask,
0);
- for (loop = 0; loop < (1 << i_hash_shift); loop++)
+ for (loop = 0; loop < (1U << i_hash_shift); loop++)
INIT_HLIST_HEAD(&inode_hashtable[loop]);
}
void __init inode_init(void)
{
- int loop;
+ unsigned int loop;
/* inode slab cache */
inode_cachep = kmem_cache_create("inode_cache",
@@ -1702,7 +1679,7 @@ void __init inode_init(void)
&i_hash_mask,
0);
- for (loop = 0; loop < (1 << i_hash_shift); loop++)
+ for (loop = 0; loop < (1U << i_hash_shift); loop++)
INIT_HLIST_HEAD(&inode_hashtable[loop]);
}
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 066836e8184..29167bebe87 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -10,7 +10,7 @@
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/security.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/uaccess.h>
#include <linux/writeback.h>
#include <linux/buffer_head.h>
diff --git a/fs/ioprio.c b/fs/ioprio.c
index f79dab83e17..0f1b9515213 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -48,28 +48,12 @@ int set_task_ioprio(struct task_struct *task, int ioprio)
if (err)
return err;
- task_lock(task);
- do {
- ioc = task->io_context;
- /* see wmb() in current_io_context() */
- smp_read_barrier_depends();
- if (ioc)
- break;
-
- ioc = alloc_io_context(GFP_ATOMIC, -1);
- if (!ioc) {
- err = -ENOMEM;
- break;
- }
- task->io_context = ioc;
- } while (1);
-
- if (!err) {
- ioc->ioprio = ioprio;
- ioc->ioprio_changed = 1;
+ ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
+ if (ioc) {
+ ioc_ioprio_changed(ioc, ioprio);
+ put_io_context(ioc);
}
- task_unlock(task);
return err;
}
EXPORT_SYMBOL_GPL(set_task_ioprio);
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index bd62c76fb5d..29037c365ba 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -947,9 +947,8 @@ root_found:
s->s_d_op = &isofs_dentry_ops[table];
/* get the root dentry */
- s->s_root = d_alloc_root(inode);
+ s->s_root = d_make_root(inode);
if (!(s->s_root)) {
- iput(inode);
error = -ENOMEM;
goto out_no_inode;
}
diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index 5d1a00a5041..05f0754f2b4 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -453,8 +453,6 @@ out:
*
* Return <0 on error, 0 on success, 1 if there was nothing to clean up.
*
- * Called with the journal lock held.
- *
* This is the only part of the journaling code which really needs to be
* aware of transaction aborts. Checkpointing involves writing to the
* main filesystem area rather than to the journal, so it can proceed
@@ -472,13 +470,14 @@ int cleanup_journal_tail(journal_t *journal)
if (is_journal_aborted(journal))
return 1;
- /* OK, work out the oldest transaction remaining in the log, and
+ /*
+ * OK, work out the oldest transaction remaining in the log, and
* the log block it starts at.
*
* If the log is now empty, we need to work out which is the
* next transaction ID we will write, and where it will
- * start. */
-
+ * start.
+ */
spin_lock(&journal->j_state_lock);
spin_lock(&journal->j_list_lock);
transaction = journal->j_checkpoint_transactions;
@@ -504,7 +503,25 @@ int cleanup_journal_tail(journal_t *journal)
spin_unlock(&journal->j_state_lock);
return 1;
}
+ spin_unlock(&journal->j_state_lock);
+
+ /*
+ * We need to make sure that any blocks that were recently written out
+ * --- perhaps by log_do_checkpoint() --- are flushed out before we
+ * drop the transactions from the journal. It's unlikely this will be
+ * necessary, especially with an appropriately sized journal, but we
+ * need this to guarantee correctness. Fortunately
+ * cleanup_journal_tail() doesn't get called all that often.
+ */
+ if (journal->j_flags & JFS_BARRIER)
+ blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
+ spin_lock(&journal->j_state_lock);
+ if (!tid_gt(first_tid, journal->j_tail_sequence)) {
+ spin_unlock(&journal->j_state_lock);
+ /* Someone else cleaned up journal so return 0 */
+ return 0;
+ }
/* OK, update the superblock to recover the freed space.
* Physical blocks come first: have we wrapped beyond the end of
* the log? */
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 59c09f9541b..0971e921780 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -129,6 +129,8 @@ static int kjournald(void *arg)
setup_timer(&journal->j_commit_timer, commit_timeout,
(unsigned long)current);
+ set_freezable();
+
/* Record that the journal thread is running */
journal->j_task = current;
wake_up(&journal->j_wait_done_commit);
@@ -328,7 +330,7 @@ repeat:
new_offset = offset_in_page(jh2bh(jh_in)->b_data);
}
- mapped_data = kmap_atomic(new_page, KM_USER0);
+ mapped_data = kmap_atomic(new_page);
/*
* Check for escaping
*/
@@ -337,7 +339,7 @@ repeat:
need_copy_out = 1;
do_escape = 1;
}
- kunmap_atomic(mapped_data, KM_USER0);
+ kunmap_atomic(mapped_data);
/*
* Do we need to do a data copy?
@@ -354,9 +356,9 @@ repeat:
}
jh_in->b_frozen_data = tmp;
- mapped_data = kmap_atomic(new_page, KM_USER0);
+ mapped_data = kmap_atomic(new_page);
memcpy(tmp, mapped_data + new_offset, jh2bh(jh_in)->b_size);
- kunmap_atomic(mapped_data, KM_USER0);
+ kunmap_atomic(mapped_data);
new_page = virt_to_page(tmp);
new_offset = offset_in_page(tmp);
@@ -368,9 +370,9 @@ repeat:
* copying, we can finally do so.
*/
if (do_escape) {
- mapped_data = kmap_atomic(new_page, KM_USER0);
+ mapped_data = kmap_atomic(new_page);
*((unsigned int *)(mapped_data + new_offset)) = 0;
- kunmap_atomic(mapped_data, KM_USER0);
+ kunmap_atomic(mapped_data);
}
set_bh_page(new_bh, new_page, new_offset);
diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c
index 5b43e96788e..008bf062fd2 100644
--- a/fs/jbd/recovery.c
+++ b/fs/jbd/recovery.c
@@ -20,6 +20,7 @@
#include <linux/fs.h>
#include <linux/jbd.h>
#include <linux/errno.h>
+#include <linux/blkdev.h>
#endif
/*
@@ -263,6 +264,9 @@ int journal_recover(journal_t *journal)
err2 = sync_blockdev(journal->j_fs_dev);
if (!err)
err = err2;
+ /* Flush disk caches to get replayed data on the permanent storage */
+ if (journal->j_flags & JFS_BARRIER)
+ blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
return err;
}
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 7fce94b04bc..b2a7e5244e3 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -718,9 +718,9 @@ done:
"Possible IO failure.\n");
page = jh2bh(jh)->b_page;
offset = offset_in_page(jh2bh(jh)->b_data);
- source = kmap_atomic(page, KM_USER0);
+ source = kmap_atomic(page);
memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size);
- kunmap_atomic(source, KM_USER0);
+ kunmap_atomic(source);
}
jbd_unlock_bh_state(bh);
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index d49d202903f..c78841ee81c 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -88,14 +88,13 @@ static inline void __buffer_relink_io(struct journal_head *jh)
* whole transaction.
*
* Requires j_list_lock
- * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
*/
static int __try_to_free_cp_buf(struct journal_head *jh)
{
int ret = 0;
struct buffer_head *bh = jh2bh(jh);
- if (jh->b_jlist == BJ_None && !buffer_locked(bh) &&
+ if (jh->b_transaction == NULL && !buffer_locked(bh) &&
!buffer_dirty(bh) && !buffer_write_io_error(bh)) {
/*
* Get our reference so that bh cannot be freed before
@@ -104,11 +103,8 @@ static int __try_to_free_cp_buf(struct journal_head *jh)
get_bh(bh);
JBUFFER_TRACE(jh, "remove from checkpoint list");
ret = __jbd2_journal_remove_checkpoint(jh) + 1;
- jbd_unlock_bh_state(bh);
BUFFER_TRACE(bh, "release");
__brelse(bh);
- } else {
- jbd_unlock_bh_state(bh);
}
return ret;
}
@@ -180,21 +176,6 @@ void __jbd2_log_wait_for_space(journal_t *journal)
}
/*
- * We were unable to perform jbd_trylock_bh_state() inside j_list_lock.
- * The caller must restart a list walk. Wait for someone else to run
- * jbd_unlock_bh_state().
- */
-static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh)
- __releases(journal->j_list_lock)
-{
- get_bh(bh);
- spin_unlock(&journal->j_list_lock);
- jbd_lock_bh_state(bh);
- jbd_unlock_bh_state(bh);
- put_bh(bh);
-}
-
-/*
* Clean up transaction's list of buffers submitted for io.
* We wait for any pending IO to complete and remove any clean
* buffers. Note that we take the buffers in the opposite ordering
@@ -222,15 +203,9 @@ restart:
while (!released && transaction->t_checkpoint_io_list) {
jh = transaction->t_checkpoint_io_list;
bh = jh2bh(jh);
- if (!jbd_trylock_bh_state(bh)) {
- jbd_sync_bh(journal, bh);
- spin_lock(&journal->j_list_lock);
- goto restart;
- }
get_bh(bh);
if (buffer_locked(bh)) {
spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh);
wait_on_buffer(bh);
/* the journal_head may have gone by now */
BUFFER_TRACE(bh, "brelse");
@@ -246,7 +221,6 @@ restart:
* it has been written out and so we can drop it from the list
*/
released = __jbd2_journal_remove_checkpoint(jh);
- jbd_unlock_bh_state(bh);
__brelse(bh);
}
@@ -266,7 +240,6 @@ __flush_batch(journal_t *journal, int *batch_count)
for (i = 0; i < *batch_count; i++) {
struct buffer_head *bh = journal->j_chkpt_bhs[i];
- clear_buffer_jwrite(bh);
BUFFER_TRACE(bh, "brelse");
__brelse(bh);
}
@@ -281,7 +254,6 @@ __flush_batch(journal_t *journal, int *batch_count)
* be written out.
*
* Called with j_list_lock held and drops it if 1 is returned
- * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
*/
static int __process_buffer(journal_t *journal, struct journal_head *jh,
int *batch_count, transaction_t *transaction)
@@ -292,7 +264,6 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
if (buffer_locked(bh)) {
get_bh(bh);
spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh);
wait_on_buffer(bh);
/* the journal_head may have gone by now */
BUFFER_TRACE(bh, "brelse");
@@ -304,7 +275,6 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
transaction->t_chp_stats.cs_forced_to_close++;
spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh);
if (unlikely(journal->j_flags & JBD2_UNMOUNT))
/*
* The journal thread is dead; so starting and
@@ -323,11 +293,9 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
if (unlikely(buffer_write_io_error(bh)))
ret = -EIO;
get_bh(bh);
- J_ASSERT_JH(jh, !buffer_jbddirty(bh));
BUFFER_TRACE(bh, "remove from checkpoint");
__jbd2_journal_remove_checkpoint(jh);
spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh);
__brelse(bh);
} else {
/*
@@ -340,10 +308,8 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
BUFFER_TRACE(bh, "queue");
get_bh(bh);
J_ASSERT_BH(bh, !buffer_jwrite(bh));
- set_buffer_jwrite(bh);
journal->j_chkpt_bhs[*batch_count] = bh;
__buffer_relink_io(jh);
- jbd_unlock_bh_state(bh);
transaction->t_chp_stats.cs_written++;
(*batch_count)++;
if (*batch_count == JBD2_NR_BATCH) {
@@ -407,15 +373,7 @@ restart:
int retry = 0, err;
while (!retry && transaction->t_checkpoint_list) {
- struct buffer_head *bh;
-
jh = transaction->t_checkpoint_list;
- bh = jh2bh(jh);
- if (!jbd_trylock_bh_state(bh)) {
- jbd_sync_bh(journal, bh);
- retry = 1;
- break;
- }
retry = __process_buffer(journal, jh, &batch_count,
transaction);
if (retry < 0 && !result)
@@ -478,79 +436,28 @@ out:
int jbd2_cleanup_journal_tail(journal_t *journal)
{
- transaction_t * transaction;
tid_t first_tid;
- unsigned long blocknr, freed;
+ unsigned long blocknr;
if (is_journal_aborted(journal))
return 1;
- /* OK, work out the oldest transaction remaining in the log, and
- * the log block it starts at.
- *
- * If the log is now empty, we need to work out which is the
- * next transaction ID we will write, and where it will
- * start. */
-
- write_lock(&journal->j_state_lock);
- spin_lock(&journal->j_list_lock);
- transaction = journal->j_checkpoint_transactions;
- if (transaction) {
- first_tid = transaction->t_tid;
- blocknr = transaction->t_log_start;
- } else if ((transaction = journal->j_committing_transaction) != NULL) {
- first_tid = transaction->t_tid;
- blocknr = transaction->t_log_start;
- } else if ((transaction = journal->j_running_transaction) != NULL) {
- first_tid = transaction->t_tid;
- blocknr = journal->j_head;
- } else {
- first_tid = journal->j_transaction_sequence;
- blocknr = journal->j_head;
- }
- spin_unlock(&journal->j_list_lock);
- J_ASSERT(blocknr != 0);
-
- /* If the oldest pinned transaction is at the tail of the log
- already then there's not much we can do right now. */
- if (journal->j_tail_sequence == first_tid) {
- write_unlock(&journal->j_state_lock);
+ if (!jbd2_journal_get_log_tail(journal, &first_tid, &blocknr))
return 1;
- }
-
- /* OK, update the superblock to recover the freed space.
- * Physical blocks come first: have we wrapped beyond the end of
- * the log? */
- freed = blocknr - journal->j_tail;
- if (blocknr < journal->j_tail)
- freed = freed + journal->j_last - journal->j_first;
-
- trace_jbd2_cleanup_journal_tail(journal, first_tid, blocknr, freed);
- jbd_debug(1,
- "Cleaning journal tail from %d to %d (offset %lu), "
- "freeing %lu\n",
- journal->j_tail_sequence, first_tid, blocknr, freed);
-
- journal->j_free += freed;
- journal->j_tail_sequence = first_tid;
- journal->j_tail = blocknr;
- write_unlock(&journal->j_state_lock);
+ J_ASSERT(blocknr != 0);
/*
- * If there is an external journal, we need to make sure that
- * any data blocks that were recently written out --- perhaps
- * by jbd2_log_do_checkpoint() --- are flushed out before we
- * drop the transactions from the external journal. It's
- * unlikely this will be necessary, especially with a
- * appropriately sized journal, but we need this to guarantee
- * correctness. Fortunately jbd2_cleanup_journal_tail()
- * doesn't get called all that often.
+ * We need to make sure that any blocks that were recently written out
+ * --- perhaps by jbd2_log_do_checkpoint() --- are flushed out before
+ * we drop the transactions from the journal. It's unlikely this will
+ * be necessary, especially with an appropriately sized journal, but we
+ * need this to guarantee correctness. Fortunately
+ * jbd2_cleanup_journal_tail() doesn't get called all that often.
*/
- if ((journal->j_fs_dev != journal->j_dev) &&
- (journal->j_flags & JBD2_BARRIER))
+ if (journal->j_flags & JBD2_BARRIER)
blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
- if (!(journal->j_flags & JBD2_ABORT))
- jbd2_journal_update_superblock(journal, 1);
+
+ __jbd2_update_log_tail(journal, first_tid, blocknr);
return 0;
}
@@ -582,15 +489,12 @@ static int journal_clean_one_cp_list(struct journal_head *jh, int *released)
do {
jh = next_jh;
next_jh = jh->b_cpnext;
- /* Use trylock because of the ranking */
- if (jbd_trylock_bh_state(jh2bh(jh))) {
- ret = __try_to_free_cp_buf(jh);
- if (ret) {
- freed++;
- if (ret == 2) {
- *released = 1;
- return freed;
- }
+ ret = __try_to_free_cp_buf(jh);
+ if (ret) {
+ freed++;
+ if (ret == 2) {
+ *released = 1;
+ return freed;
}
}
/*
@@ -673,9 +577,7 @@ out:
* The function can free jh and bh.
*
* This function is called with j_list_lock held.
- * This function is called with jbd_lock_bh_state(jh2bh(jh))
*/
-
int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
{
struct transaction_chp_stats_s *stats;
@@ -722,7 +624,7 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
transaction->t_tid, stats);
__jbd2_journal_drop_transaction(journal, transaction);
- kfree(transaction);
+ jbd2_journal_free_transaction(transaction);
/* Just in case anybody was waiting for more transactions to be
checkpointed... */
@@ -797,5 +699,7 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact
J_ASSERT(journal->j_committing_transaction != transaction);
J_ASSERT(journal->j_running_transaction != transaction);
+ trace_jbd2_drop_transaction(journal, transaction);
+
jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid);
}
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 5069b847515..806525a7269 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -28,7 +28,6 @@
#include <linux/blkdev.h>
#include <linux/bitops.h>
#include <trace/events/jbd2.h>
-#include <asm/system.h>
/*
* Default IO end handler for temporary BJ_IO buffer_heads.
@@ -286,10 +285,10 @@ static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
char *addr;
__u32 checksum;
- addr = kmap_atomic(page, KM_USER0);
+ addr = kmap_atomic(page);
checksum = crc32_be(crc32_sum,
(void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
- kunmap_atomic(addr, KM_USER0);
+ kunmap_atomic(addr);
return checksum;
}
@@ -331,6 +330,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
struct buffer_head *cbh = NULL; /* For transactional checksums */
__u32 crc32_sum = ~0;
struct blk_plug plug;
+ /* Tail of the journal */
+ unsigned long first_block;
+ tid_t first_tid;
+ int update_tail;
/*
* First job: lock down the current transaction and wait for
@@ -340,7 +343,18 @@ void jbd2_journal_commit_transaction(journal_t *journal)
/* Do we need to erase the effects of a prior jbd2_journal_flush? */
if (journal->j_flags & JBD2_FLUSHED) {
jbd_debug(3, "super block updated\n");
- jbd2_journal_update_superblock(journal, 1);
+ mutex_lock(&journal->j_checkpoint_mutex);
+ /*
+ * We hold j_checkpoint_mutex so tail cannot change under us.
+ * We don't need any special data guarantees for writing sb
+ * since journal is empty and it is ok for write to be
+ * flushed only with transaction commit.
+ */
+ jbd2_journal_update_sb_log_tail(journal,
+ journal->j_tail_sequence,
+ journal->j_tail,
+ WRITE_SYNC);
+ mutex_unlock(&journal->j_checkpoint_mutex);
} else {
jbd_debug(3, "superblock not updated\n");
}
@@ -677,10 +691,30 @@ start_journal_io:
err = 0;
}
+ /*
+ * Get current oldest transaction in the log before we issue flush
+ * to the filesystem device. After the flush we can be sure that
+ * blocks of all older transactions are checkpointed to persistent
+ * storage and we will be safe to update journal start in the
+ * superblock with the numbers we get here.
+ */
+ update_tail =
+ jbd2_journal_get_log_tail(journal, &first_tid, &first_block);
+
write_lock(&journal->j_state_lock);
+ if (update_tail) {
+ long freed = first_block - journal->j_tail;
+
+ if (first_block < journal->j_tail)
+ freed += journal->j_last - journal->j_first;
+ /* Update tail only if we free significant amount of space */
+ if (freed < journal->j_maxlen / 4)
+ update_tail = 0;
+ }
J_ASSERT(commit_transaction->t_state == T_COMMIT);
commit_transaction->t_state = T_COMMIT_DFLUSH;
write_unlock(&journal->j_state_lock);
+
/*
* If the journal is not located on the file system device,
* then we must flush the file system device before we issue
@@ -831,6 +865,14 @@ wait_for_iobuf:
if (err)
jbd2_journal_abort(journal, err);
+ /*
+ * Now disk caches for filesystem device are flushed so we are safe to
+ * erase checkpointed transactions from the log by updating journal
+ * superblock.
+ */
+ if (update_tail)
+ jbd2_update_log_tail(journal, first_tid, first_block);
+
/* End of a transaction! Finally, we can do checkpoint
processing: any buffers committed as a result of this
transaction can be removed from any checkpoint list it was on
@@ -1048,7 +1090,7 @@ restart_loop:
jbd_debug(1, "JBD2: commit %d complete, head %d\n",
journal->j_commit_sequence, journal->j_tail_sequence);
if (to_free)
- kfree(commit_transaction);
+ jbd2_journal_free_transaction(commit_transaction);
wake_up(&journal->j_wait_done_commit);
}
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index c0a5f9f1b12..1afb701622b 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -50,7 +50,6 @@
#include <asm/uaccess.h>
#include <asm/page.h>
-#include <asm/system.h>
EXPORT_SYMBOL(jbd2_journal_extend);
EXPORT_SYMBOL(jbd2_journal_stop);
@@ -71,7 +70,6 @@ EXPORT_SYMBOL(jbd2_journal_revoke);
EXPORT_SYMBOL(jbd2_journal_init_dev);
EXPORT_SYMBOL(jbd2_journal_init_inode);
-EXPORT_SYMBOL(jbd2_journal_update_format);
EXPORT_SYMBOL(jbd2_journal_check_used_features);
EXPORT_SYMBOL(jbd2_journal_check_available_features);
EXPORT_SYMBOL(jbd2_journal_set_features);
@@ -96,7 +94,6 @@ EXPORT_SYMBOL(jbd2_journal_release_jbd_inode);
EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
EXPORT_SYMBOL(jbd2_inode_cache);
-static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
static void __journal_abort_soft (journal_t *journal, int errno);
static int jbd2_journal_create_slab(size_t slab_size);
@@ -139,6 +136,8 @@ static int kjournald2(void *arg)
setup_timer(&journal->j_commit_timer, commit_timeout,
(unsigned long)current);
+ set_freezable();
+
/* Record that the journal thread is running */
journal->j_task = current;
wake_up(&journal->j_wait_done_commit);
@@ -345,7 +344,7 @@ repeat:
new_offset = offset_in_page(jh2bh(jh_in)->b_data);
}
- mapped_data = kmap_atomic(new_page, KM_USER0);
+ mapped_data = kmap_atomic(new_page);
/*
* Fire data frozen trigger if data already wasn't frozen. Do this
* before checking for escaping, as the trigger may modify the magic
@@ -364,7 +363,7 @@ repeat:
need_copy_out = 1;
do_escape = 1;
}
- kunmap_atomic(mapped_data, KM_USER0);
+ kunmap_atomic(mapped_data);
/*
* Do we need to do a data copy?
@@ -385,9 +384,9 @@ repeat:
}
jh_in->b_frozen_data = tmp;
- mapped_data = kmap_atomic(new_page, KM_USER0);
+ mapped_data = kmap_atomic(new_page);
memcpy(tmp, mapped_data + new_offset, jh2bh(jh_in)->b_size);
- kunmap_atomic(mapped_data, KM_USER0);
+ kunmap_atomic(mapped_data);
new_page = virt_to_page(tmp);
new_offset = offset_in_page(tmp);
@@ -406,9 +405,9 @@ repeat:
* copying, we can finally do so.
*/
if (do_escape) {
- mapped_data = kmap_atomic(new_page, KM_USER0);
+ mapped_data = kmap_atomic(new_page);
*((unsigned int *)(mapped_data + new_offset)) = 0;
- kunmap_atomic(mapped_data, KM_USER0);
+ kunmap_atomic(mapped_data);
}
set_bh_page(new_bh, new_page, new_offset);
@@ -744,6 +743,98 @@ struct journal_head *jbd2_journal_get_descriptor_buffer(journal_t *journal)
return jbd2_journal_add_journal_head(bh);
}
+/*
+ * Return tid of the oldest transaction in the journal and block in the journal
+ * where the transaction starts.
+ *
+ * If the journal is now empty, return which will be the next transaction ID
+ * we will write and where will that transaction start.
+ *
+ * The return value is 0 if journal tail cannot be pushed any further, 1 if
+ * it can.
+ */
+int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid,
+ unsigned long *block)
+{
+ transaction_t *transaction;
+ int ret;
+
+ read_lock(&journal->j_state_lock);
+ spin_lock(&journal->j_list_lock);
+ transaction = journal->j_checkpoint_transactions;
+ if (transaction) {
+ *tid = transaction->t_tid;
+ *block = transaction->t_log_start;
+ } else if ((transaction = journal->j_committing_transaction) != NULL) {
+ *tid = transaction->t_tid;
+ *block = transaction->t_log_start;
+ } else if ((transaction = journal->j_running_transaction) != NULL) {
+ *tid = transaction->t_tid;
+ *block = journal->j_head;
+ } else {
+ *tid = journal->j_transaction_sequence;
+ *block = journal->j_head;
+ }
+ ret = tid_gt(*tid, journal->j_tail_sequence);
+ spin_unlock(&journal->j_list_lock);
+ read_unlock(&journal->j_state_lock);
+
+ return ret;
+}
+
+/*
+ * Update information in journal structure and in on disk journal superblock
+ * about log tail. This function does not check whether information passed in
+ * really pushes log tail further. It's responsibility of the caller to make
+ * sure provided log tail information is valid (e.g. by holding
+ * j_checkpoint_mutex all the time between computing log tail and calling this
+ * function as is the case with jbd2_cleanup_journal_tail()).
+ *
+ * Requires j_checkpoint_mutex
+ */
+void __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
+{
+ unsigned long freed;
+
+ BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
+
+ /*
+ * We cannot afford for write to remain in drive's caches since as
+ * soon as we update j_tail, next transaction can start reusing journal
+ * space and if we lose sb update during power failure we'd replay
+ * old transaction with possibly newly overwritten data.
+ */
+ jbd2_journal_update_sb_log_tail(journal, tid, block, WRITE_FUA);
+ write_lock(&journal->j_state_lock);
+ freed = block - journal->j_tail;
+ if (block < journal->j_tail)
+ freed += journal->j_last - journal->j_first;
+
+ trace_jbd2_update_log_tail(journal, tid, block, freed);
+ jbd_debug(1,
+ "Cleaning journal tail from %d to %d (offset %lu), "
+ "freeing %lu\n",
+ journal->j_tail_sequence, tid, block, freed);
+
+ journal->j_free += freed;
+ journal->j_tail_sequence = tid;
+ journal->j_tail = block;
+ write_unlock(&journal->j_state_lock);
+}
+
+/*
+ * This is a variaon of __jbd2_update_log_tail which checks for validity of
+ * provided log tail and locks j_checkpoint_mutex. So it is safe against races
+ * with other threads updating log tail.
+ */
+void jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
+{
+ mutex_lock(&journal->j_checkpoint_mutex);
+ if (tid_gt(tid, journal->j_tail_sequence))
+ __jbd2_update_log_tail(journal, tid, block);
+ mutex_unlock(&journal->j_checkpoint_mutex);
+}
+
struct jbd2_stats_proc_session {
journal_t *journal;
struct transaction_stats_s *stats;
@@ -1112,40 +1203,45 @@ static int journal_reset(journal_t *journal)
journal->j_max_transaction_buffers = journal->j_maxlen / 4;
- /* Add the dynamic fields and write it to disk. */
- jbd2_journal_update_superblock(journal, 1);
- return jbd2_journal_start_thread(journal);
-}
-
-/**
- * void jbd2_journal_update_superblock() - Update journal sb on disk.
- * @journal: The journal to update.
- * @wait: Set to '0' if you don't want to wait for IO completion.
- *
- * Update a journal's dynamic superblock fields and write it to disk,
- * optionally waiting for the IO to complete.
- */
-void jbd2_journal_update_superblock(journal_t *journal, int wait)
-{
- journal_superblock_t *sb = journal->j_superblock;
- struct buffer_head *bh = journal->j_sb_buffer;
-
/*
* As a special case, if the on-disk copy is already marked as needing
- * no recovery (s_start == 0) and there are no outstanding transactions
- * in the filesystem, then we can safely defer the superblock update
- * until the next commit by setting JBD2_FLUSHED. This avoids
+ * no recovery (s_start == 0), then we can safely defer the superblock
+ * update until the next commit by setting JBD2_FLUSHED. This avoids
* attempting a write to a potential-readonly device.
*/
- if (sb->s_start == 0 && journal->j_tail_sequence ==
- journal->j_transaction_sequence) {
+ if (sb->s_start == 0) {
jbd_debug(1, "JBD2: Skipping superblock update on recovered sb "
"(start %ld, seq %d, errno %d)\n",
journal->j_tail, journal->j_tail_sequence,
journal->j_errno);
- goto out;
+ journal->j_flags |= JBD2_FLUSHED;
+ } else {
+ /* Lock here to make assertions happy... */
+ mutex_lock(&journal->j_checkpoint_mutex);
+ /*
+ * Update log tail information. We use WRITE_FUA since new
+ * transaction will start reusing journal space and so we
+ * must make sure information about current log tail is on
+ * disk before that.
+ */
+ jbd2_journal_update_sb_log_tail(journal,
+ journal->j_tail_sequence,
+ journal->j_tail,
+ WRITE_FUA);
+ mutex_unlock(&journal->j_checkpoint_mutex);
}
+ return jbd2_journal_start_thread(journal);
+}
+static void jbd2_write_superblock(journal_t *journal, int write_op)
+{
+ struct buffer_head *bh = journal->j_sb_buffer;
+ int ret;
+
+ trace_jbd2_write_superblock(journal, write_op);
+ if (!(journal->j_flags & JBD2_BARRIER))
+ write_op &= ~(REQ_FUA | REQ_FLUSH);
+ lock_buffer(bh);
if (buffer_write_io_error(bh)) {
/*
* Oh, dear. A previous attempt to write the journal
@@ -1161,48 +1257,106 @@ void jbd2_journal_update_superblock(journal_t *journal, int wait)
clear_buffer_write_io_error(bh);
set_buffer_uptodate(bh);
}
+ get_bh(bh);
+ bh->b_end_io = end_buffer_write_sync;
+ ret = submit_bh(write_op, bh);
+ wait_on_buffer(bh);
+ if (buffer_write_io_error(bh)) {
+ clear_buffer_write_io_error(bh);
+ set_buffer_uptodate(bh);
+ ret = -EIO;
+ }
+ if (ret) {
+ printk(KERN_ERR "JBD2: Error %d detected when updating "
+ "journal superblock for %s.\n", ret,
+ journal->j_devname);
+ }
+}
+/**
+ * jbd2_journal_update_sb_log_tail() - Update log tail in journal sb on disk.
+ * @journal: The journal to update.
+ * @tail_tid: TID of the new transaction at the tail of the log
+ * @tail_block: The first block of the transaction at the tail of the log
+ * @write_op: With which operation should we write the journal sb
+ *
+ * Update a journal's superblock information about log tail and write it to
+ * disk, waiting for the IO to complete.
+ */
+void jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid,
+ unsigned long tail_block, int write_op)
+{
+ journal_superblock_t *sb = journal->j_superblock;
+
+ BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
+ jbd_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n",
+ tail_block, tail_tid);
+
+ sb->s_sequence = cpu_to_be32(tail_tid);
+ sb->s_start = cpu_to_be32(tail_block);
+
+ jbd2_write_superblock(journal, write_op);
+
+ /* Log is no longer empty */
+ write_lock(&journal->j_state_lock);
+ WARN_ON(!sb->s_sequence);
+ journal->j_flags &= ~JBD2_FLUSHED;
+ write_unlock(&journal->j_state_lock);
+}
+
+/**
+ * jbd2_mark_journal_empty() - Mark on disk journal as empty.
+ * @journal: The journal to update.
+ *
+ * Update a journal's dynamic superblock fields to show that journal is empty.
+ * Write updated superblock to disk waiting for IO to complete.
+ */
+static void jbd2_mark_journal_empty(journal_t *journal)
+{
+ journal_superblock_t *sb = journal->j_superblock;
+
+ BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
read_lock(&journal->j_state_lock);
- jbd_debug(1, "JBD2: updating superblock (start %ld, seq %d, errno %d)\n",
- journal->j_tail, journal->j_tail_sequence, journal->j_errno);
+ jbd_debug(1, "JBD2: Marking journal as empty (seq %d)\n",
+ journal->j_tail_sequence);
sb->s_sequence = cpu_to_be32(journal->j_tail_sequence);
- sb->s_start = cpu_to_be32(journal->j_tail);
- sb->s_errno = cpu_to_be32(journal->j_errno);
+ sb->s_start = cpu_to_be32(0);
read_unlock(&journal->j_state_lock);
- BUFFER_TRACE(bh, "marking dirty");
- mark_buffer_dirty(bh);
- if (wait) {
- sync_dirty_buffer(bh);
- if (buffer_write_io_error(bh)) {
- printk(KERN_ERR "JBD2: I/O error detected "
- "when updating journal superblock for %s.\n",
- journal->j_devname);
- clear_buffer_write_io_error(bh);
- set_buffer_uptodate(bh);
- }
- } else
- write_dirty_buffer(bh, WRITE);
-
-out:
- /* If we have just flushed the log (by marking s_start==0), then
- * any future commit will have to be careful to update the
- * superblock again to re-record the true start of the log. */
+ jbd2_write_superblock(journal, WRITE_FUA);
+ /* Log is no longer empty */
write_lock(&journal->j_state_lock);
- if (sb->s_start)
- journal->j_flags &= ~JBD2_FLUSHED;
- else
- journal->j_flags |= JBD2_FLUSHED;
+ journal->j_flags |= JBD2_FLUSHED;
write_unlock(&journal->j_state_lock);
}
+
+/**
+ * jbd2_journal_update_sb_errno() - Update error in the journal.
+ * @journal: The journal to update.
+ *
+ * Update a journal's errno. Write updated superblock to disk waiting for IO
+ * to complete.
+ */
+static void jbd2_journal_update_sb_errno(journal_t *journal)
+{
+ journal_superblock_t *sb = journal->j_superblock;
+
+ read_lock(&journal->j_state_lock);
+ jbd_debug(1, "JBD2: updating superblock error (errno %d)\n",
+ journal->j_errno);
+ sb->s_errno = cpu_to_be32(journal->j_errno);
+ read_unlock(&journal->j_state_lock);
+
+ jbd2_write_superblock(journal, WRITE_SYNC);
+}
+
/*
* Read the superblock for a given journal, performing initial
* validation of the format.
*/
-
static int journal_get_superblock(journal_t *journal)
{
struct buffer_head *bh;
@@ -1396,14 +1550,11 @@ int jbd2_journal_destroy(journal_t *journal)
if (journal->j_sb_buffer) {
if (!is_journal_aborted(journal)) {
- /* We can now mark the journal as empty. */
- journal->j_tail = 0;
- journal->j_tail_sequence =
- ++journal->j_transaction_sequence;
- jbd2_journal_update_superblock(journal, 1);
- } else {
+ mutex_lock(&journal->j_checkpoint_mutex);
+ jbd2_mark_journal_empty(journal);
+ mutex_unlock(&journal->j_checkpoint_mutex);
+ } else
err = -EIO;
- }
brelse(journal->j_sb_buffer);
}
@@ -1550,61 +1701,6 @@ void jbd2_journal_clear_features(journal_t *journal, unsigned long compat,
EXPORT_SYMBOL(jbd2_journal_clear_features);
/**
- * int jbd2_journal_update_format () - Update on-disk journal structure.
- * @journal: Journal to act on.
- *
- * Given an initialised but unloaded journal struct, poke about in the
- * on-disk structure to update it to the most recent supported version.
- */
-int jbd2_journal_update_format (journal_t *journal)
-{
- journal_superblock_t *sb;
- int err;
-
- err = journal_get_superblock(journal);
- if (err)
- return err;
-
- sb = journal->j_superblock;
-
- switch (be32_to_cpu(sb->s_header.h_blocktype)) {
- case JBD2_SUPERBLOCK_V2:
- return 0;
- case JBD2_SUPERBLOCK_V1:
- return journal_convert_superblock_v1(journal, sb);
- default:
- break;
- }
- return -EINVAL;
-}
-
-static int journal_convert_superblock_v1(journal_t *journal,
- journal_superblock_t *sb)
-{
- int offset, blocksize;
- struct buffer_head *bh;
-
- printk(KERN_WARNING
- "JBD2: Converting superblock from version 1 to 2.\n");
-
- /* Pre-initialise new fields to zero */
- offset = ((char *) &(sb->s_feature_compat)) - ((char *) sb);
- blocksize = be32_to_cpu(sb->s_blocksize);
- memset(&sb->s_feature_compat, 0, blocksize-offset);
-
- sb->s_nr_users = cpu_to_be32(1);
- sb->s_header.h_blocktype = cpu_to_be32(JBD2_SUPERBLOCK_V2);
- journal->j_format_version = 2;
-
- bh = journal->j_sb_buffer;
- BUFFER_TRACE(bh, "marking dirty");
- mark_buffer_dirty(bh);
- sync_dirty_buffer(bh);
- return 0;
-}
-
-
-/**
* int jbd2_journal_flush () - Flush journal
* @journal: Journal to act on.
*
@@ -1617,7 +1713,6 @@ int jbd2_journal_flush(journal_t *journal)
{
int err = 0;
transaction_t *transaction = NULL;
- unsigned long old_tail;
write_lock(&journal->j_state_lock);
@@ -1652,6 +1747,7 @@ int jbd2_journal_flush(journal_t *journal)
if (is_journal_aborted(journal))
return -EIO;
+ mutex_lock(&journal->j_checkpoint_mutex);
jbd2_cleanup_journal_tail(journal);
/* Finally, mark the journal as really needing no recovery.
@@ -1659,14 +1755,9 @@ int jbd2_journal_flush(journal_t *journal)
* the magic code for a fully-recovered superblock. Any future
* commits of data to the journal will restore the current
* s_start value. */
+ jbd2_mark_journal_empty(journal);
+ mutex_unlock(&journal->j_checkpoint_mutex);
write_lock(&journal->j_state_lock);
- old_tail = journal->j_tail;
- journal->j_tail = 0;
- write_unlock(&journal->j_state_lock);
- jbd2_journal_update_superblock(journal, 1);
- write_lock(&journal->j_state_lock);
- journal->j_tail = old_tail;
-
J_ASSERT(!journal->j_running_transaction);
J_ASSERT(!journal->j_committing_transaction);
J_ASSERT(!journal->j_checkpoint_transactions);
@@ -1706,8 +1797,12 @@ int jbd2_journal_wipe(journal_t *journal, int write)
write ? "Clearing" : "Ignoring");
err = jbd2_journal_skip_recovery(journal);
- if (write)
- jbd2_journal_update_superblock(journal, 1);
+ if (write) {
+ /* Lock to make assertions happy... */
+ mutex_lock(&journal->j_checkpoint_mutex);
+ jbd2_mark_journal_empty(journal);
+ mutex_unlock(&journal->j_checkpoint_mutex);
+ }
no_recovery:
return err;
@@ -1757,7 +1852,7 @@ static void __journal_abort_soft (journal_t *journal, int errno)
__jbd2_journal_abort_hard(journal);
if (errno)
- jbd2_journal_update_superblock(journal, 1);
+ jbd2_journal_update_sb_errno(journal);
}
/**
@@ -2015,7 +2110,7 @@ static struct kmem_cache *jbd2_journal_head_cache;
static atomic_t nr_journal_heads = ATOMIC_INIT(0);
#endif
-static int journal_init_jbd2_journal_head_cache(void)
+static int jbd2_journal_init_journal_head_cache(void)
{
int retval;
@@ -2033,7 +2128,7 @@ static int journal_init_jbd2_journal_head_cache(void)
return retval;
}
-static void jbd2_journal_destroy_jbd2_journal_head_cache(void)
+static void jbd2_journal_destroy_journal_head_cache(void)
{
if (jbd2_journal_head_cache) {
kmem_cache_destroy(jbd2_journal_head_cache);
@@ -2321,7 +2416,7 @@ static void __exit jbd2_remove_jbd_stats_proc_entry(void)
struct kmem_cache *jbd2_handle_cache, *jbd2_inode_cache;
-static int __init journal_init_handle_cache(void)
+static int __init jbd2_journal_init_handle_cache(void)
{
jbd2_handle_cache = KMEM_CACHE(jbd2_journal_handle, SLAB_TEMPORARY);
if (jbd2_handle_cache == NULL) {
@@ -2356,17 +2451,20 @@ static int __init journal_init_caches(void)
ret = jbd2_journal_init_revoke_caches();
if (ret == 0)
- ret = journal_init_jbd2_journal_head_cache();
+ ret = jbd2_journal_init_journal_head_cache();
+ if (ret == 0)
+ ret = jbd2_journal_init_handle_cache();
if (ret == 0)
- ret = journal_init_handle_cache();
+ ret = jbd2_journal_init_transaction_cache();
return ret;
}
static void jbd2_journal_destroy_caches(void)
{
jbd2_journal_destroy_revoke_caches();
- jbd2_journal_destroy_jbd2_journal_head_cache();
+ jbd2_journal_destroy_journal_head_cache();
jbd2_journal_destroy_handle_cache();
+ jbd2_journal_destroy_transaction_cache();
jbd2_journal_destroy_slabs();
}
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index da6d7baf139..c1a03354a22 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -21,6 +21,7 @@
#include <linux/jbd2.h>
#include <linux/errno.h>
#include <linux/crc32.h>
+#include <linux/blkdev.h>
#endif
/*
@@ -265,7 +266,9 @@ int jbd2_journal_recover(journal_t *journal)
err2 = sync_blockdev(journal->j_fs_dev);
if (!err)
err = err2;
-
+ /* Make sure all replayed data is on permanent storage */
+ if (journal->j_flags & JBD2_BARRIER)
+ blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
return err;
}
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index 30b2867d6cc..6973705d6a3 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -208,17 +208,13 @@ int __init jbd2_journal_init_revoke_caches(void)
J_ASSERT(!jbd2_revoke_record_cache);
J_ASSERT(!jbd2_revoke_table_cache);
- jbd2_revoke_record_cache = kmem_cache_create("jbd2_revoke_record",
- sizeof(struct jbd2_revoke_record_s),
- 0,
- SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY,
- NULL);
+ jbd2_revoke_record_cache = KMEM_CACHE(jbd2_revoke_record_s,
+ SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY);
if (!jbd2_revoke_record_cache)
goto record_cache_failure;
- jbd2_revoke_table_cache = kmem_cache_create("jbd2_revoke_table",
- sizeof(struct jbd2_revoke_table_s),
- 0, SLAB_TEMPORARY, NULL);
+ jbd2_revoke_table_cache = KMEM_CACHE(jbd2_revoke_table_s,
+ SLAB_TEMPORARY);
if (!jbd2_revoke_table_cache)
goto table_cache_failure;
return 0;
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 35ae096bed5..ddcd3549c6c 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -33,6 +33,35 @@
static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);
static void __jbd2_journal_unfile_buffer(struct journal_head *jh);
+static struct kmem_cache *transaction_cache;
+int __init jbd2_journal_init_transaction_cache(void)
+{
+ J_ASSERT(!transaction_cache);
+ transaction_cache = kmem_cache_create("jbd2_transaction_s",
+ sizeof(transaction_t),
+ 0,
+ SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY,
+ NULL);
+ if (transaction_cache)
+ return 0;
+ return -ENOMEM;
+}
+
+void jbd2_journal_destroy_transaction_cache(void)
+{
+ if (transaction_cache) {
+ kmem_cache_destroy(transaction_cache);
+ transaction_cache = NULL;
+ }
+}
+
+void jbd2_journal_free_transaction(transaction_t *transaction)
+{
+ if (unlikely(ZERO_OR_NULL_PTR(transaction)))
+ return;
+ kmem_cache_free(transaction_cache, transaction);
+}
+
/*
* jbd2_get_transaction: obtain a new transaction_t object.
*
@@ -133,7 +162,8 @@ static int start_this_handle(journal_t *journal, handle_t *handle,
alloc_transaction:
if (!journal->j_running_transaction) {
- new_transaction = kzalloc(sizeof(*new_transaction), gfp_mask);
+ new_transaction = kmem_cache_alloc(transaction_cache,
+ gfp_mask | __GFP_ZERO);
if (!new_transaction) {
/*
* If __GFP_FS is not present, then we may be
@@ -162,7 +192,7 @@ repeat:
if (is_journal_aborted(journal) ||
(journal->j_errno != 0 && !(journal->j_flags & JBD2_ACK_ERR))) {
read_unlock(&journal->j_state_lock);
- kfree(new_transaction);
+ jbd2_journal_free_transaction(new_transaction);
return -EROFS;
}
@@ -284,7 +314,7 @@ repeat:
read_unlock(&journal->j_state_lock);
lock_map_acquire(&handle->h_lockdep_map);
- kfree(new_transaction);
+ jbd2_journal_free_transaction(new_transaction);
return 0;
}
@@ -783,12 +813,12 @@ done:
"Possible IO failure.\n");
page = jh2bh(jh)->b_page;
offset = offset_in_page(jh2bh(jh)->b_data);
- source = kmap_atomic(page, KM_USER0);
+ source = kmap_atomic(page);
/* Fire data frozen trigger just before we copy the data */
jbd2_buffer_frozen_trigger(jh, source + offset,
jh->b_triggers);
memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size);
- kunmap_atomic(source, KM_USER0);
+ kunmap_atomic(source);
/*
* Now that the frozen data is saved off, we need to store
@@ -1549,9 +1579,9 @@ __blist_del_buffer(struct journal_head **list, struct journal_head *jh)
* of these pointers, it could go bad. Generally the caller needs to re-read
* the pointer from the transaction_t.
*
- * Called under j_list_lock. The journal may not be locked.
+ * Called under j_list_lock.
*/
-void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
+static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
{
struct journal_head **list = NULL;
transaction_t *transaction;
@@ -1646,10 +1676,8 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
spin_lock(&journal->j_list_lock);
if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
/* written-back checkpointed metadata buffer */
- if (jh->b_jlist == BJ_None) {
- JBUFFER_TRACE(jh, "remove from checkpoint list");
- __jbd2_journal_remove_checkpoint(jh);
- }
+ JBUFFER_TRACE(jh, "remove from checkpoint list");
+ __jbd2_journal_remove_checkpoint(jh);
}
spin_unlock(&journal->j_list_lock);
out:
@@ -1949,6 +1977,8 @@ zap_buffer_unlocked:
clear_buffer_mapped(bh);
clear_buffer_req(bh);
clear_buffer_new(bh);
+ clear_buffer_delay(bh);
+ clear_buffer_unwritten(bh);
bh->b_bdev = NULL;
return may_free;
}
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index 926d02068a1..922f146e423 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -9,6 +9,8 @@
*
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/fs.h>
diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c
index 404111b016c..2b60ce1996a 100644
--- a/fs/jffs2/background.c
+++ b/fs/jffs2/background.c
@@ -10,6 +10,8 @@
*
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/kernel.h>
#include <linux/jffs2.h>
#include <linux/mtd/mtd.h>
@@ -42,12 +44,13 @@ int jffs2_start_garbage_collect_thread(struct jffs2_sb_info *c)
tsk = kthread_run(jffs2_garbage_collect_thread, c, "jffs2_gcd_mtd%d", c->mtd->index);
if (IS_ERR(tsk)) {
- printk(KERN_WARNING "fork failed for JFFS2 garbage collect thread: %ld\n", -PTR_ERR(tsk));
+ pr_warn("fork failed for JFFS2 garbage collect thread: %ld\n",
+ -PTR_ERR(tsk));
complete(&c->gc_thread_exit);
ret = PTR_ERR(tsk);
} else {
/* Wait for it... */
- D1(printk(KERN_DEBUG "JFFS2: Garbage collect thread is pid %d\n", tsk->pid));
+ jffs2_dbg(1, "Garbage collect thread is pid %d\n", tsk->pid);
wait_for_completion(&c->gc_thread_start);
ret = tsk->pid;
}
@@ -60,7 +63,7 @@ void jffs2_stop_garbage_collect_thread(struct jffs2_sb_info *c)
int wait = 0;
spin_lock(&c->erase_completion_lock);
if (c->gc_task) {
- D1(printk(KERN_DEBUG "jffs2: Killing GC task %d\n", c->gc_task->pid));
+ jffs2_dbg(1, "Killing GC task %d\n", c->gc_task->pid);
send_sig(SIGKILL, c->gc_task, 1);
wait = 1;
}
@@ -90,7 +93,7 @@ static int jffs2_garbage_collect_thread(void *_c)
if (!jffs2_thread_should_wake(c)) {
set_current_state (TASK_INTERRUPTIBLE);
spin_unlock(&c->erase_completion_lock);
- D1(printk(KERN_DEBUG "jffs2_garbage_collect_thread sleeping...\n"));
+ jffs2_dbg(1, "%s(): sleeping...\n", __func__);
schedule();
} else
spin_unlock(&c->erase_completion_lock);
@@ -109,7 +112,7 @@ static int jffs2_garbage_collect_thread(void *_c)
schedule_timeout_interruptible(msecs_to_jiffies(50));
if (kthread_should_stop()) {
- D1(printk(KERN_DEBUG "jffs2_garbage_collect_thread(): kthread_stop() called.\n"));
+ jffs2_dbg(1, "%s(): kthread_stop() called\n", __func__);
goto die;
}
@@ -126,28 +129,32 @@ static int jffs2_garbage_collect_thread(void *_c)
switch(signr) {
case SIGSTOP:
- D1(printk(KERN_DEBUG "jffs2_garbage_collect_thread(): SIGSTOP received.\n"));
+ jffs2_dbg(1, "%s(): SIGSTOP received\n",
+ __func__);
set_current_state(TASK_STOPPED);
schedule();
break;
case SIGKILL:
- D1(printk(KERN_DEBUG "jffs2_garbage_collect_thread(): SIGKILL received.\n"));
+ jffs2_dbg(1, "%s(): SIGKILL received\n",
+ __func__);
goto die;
case SIGHUP:
- D1(printk(KERN_DEBUG "jffs2_garbage_collect_thread(): SIGHUP received.\n"));
+ jffs2_dbg(1, "%s(): SIGHUP received\n",
+ __func__);
break;
default:
- D1(printk(KERN_DEBUG "jffs2_garbage_collect_thread(): signal %ld received\n", signr));
+ jffs2_dbg(1, "%s(): signal %ld received\n",
+ __func__, signr);
}
}
/* We don't want SIGHUP to interrupt us. STOP and KILL are OK though. */
disallow_signal(SIGHUP);
- D1(printk(KERN_DEBUG "jffs2_garbage_collect_thread(): pass\n"));
+ jffs2_dbg(1, "%s(): pass\n", __func__);
if (jffs2_garbage_collect_pass(c) == -ENOSPC) {
- printk(KERN_NOTICE "No space for garbage collection. Aborting GC thread\n");
+ pr_notice("No space for garbage collection. Aborting GC thread\n");
goto die;
}
}
diff --git a/fs/jffs2/build.c b/fs/jffs2/build.c
index 3005ec4520a..a3750f902ad 100644
--- a/fs/jffs2/build.c
+++ b/fs/jffs2/build.c
@@ -10,6 +10,8 @@
*
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/slab.h>
@@ -307,8 +309,8 @@ static void jffs2_calc_trigger_levels(struct jffs2_sb_info *c)
trying to GC to make more space. It'll be a fruitless task */
c->nospc_dirty_size = c->sector_size + (c->flash_size / 100);
- dbg_fsbuild("JFFS2 trigger levels (size %d KiB, block size %d KiB, %d blocks)\n",
- c->flash_size / 1024, c->sector_size / 1024, c->nr_blocks);
+ dbg_fsbuild("trigger levels (size %d KiB, block size %d KiB, %d blocks)\n",
+ c->flash_size / 1024, c->sector_size / 1024, c->nr_blocks);
dbg_fsbuild("Blocks required to allow deletion: %d (%d KiB)\n",
c->resv_blocks_deletion, c->resv_blocks_deletion*c->sector_size/1024);
dbg_fsbuild("Blocks required to allow writes: %d (%d KiB)\n",
diff --git a/fs/jffs2/compr.c b/fs/jffs2/compr.c
index 5b6c9d1a2fb..4849a4c9a0e 100644
--- a/fs/jffs2/compr.c
+++ b/fs/jffs2/compr.c
@@ -12,6 +12,8 @@
*
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include "compr.h"
static DEFINE_SPINLOCK(jffs2_compressor_list_lock);
@@ -79,7 +81,7 @@ static int jffs2_selected_compress(u8 compr, unsigned char *data_in,
output_buf = kmalloc(*cdatalen, GFP_KERNEL);
if (!output_buf) {
- printk(KERN_WARNING "JFFS2: No memory for compressor allocation. Compression failed.\n");
+ pr_warn("No memory for compressor allocation. Compression failed.\n");
return ret;
}
orig_slen = *datalen;
@@ -188,7 +190,8 @@ uint16_t jffs2_compress(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
tmp_buf = kmalloc(orig_slen, GFP_KERNEL);
spin_lock(&jffs2_compressor_list_lock);
if (!tmp_buf) {
- printk(KERN_WARNING "JFFS2: No memory for compressor allocation. (%d bytes)\n", orig_slen);
+ pr_warn("No memory for compressor allocation. (%d bytes)\n",
+ orig_slen);
continue;
}
else {
@@ -235,7 +238,7 @@ uint16_t jffs2_compress(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
cpage_out, datalen, cdatalen);
break;
default:
- printk(KERN_ERR "JFFS2: unknown compression mode.\n");
+ pr_err("unknown compression mode\n");
}
if (ret == JFFS2_COMPR_NONE) {
@@ -277,7 +280,8 @@ int jffs2_decompress(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
ret = this->decompress(cdata_in, data_out, cdatalen, datalen);
spin_lock(&jffs2_compressor_list_lock);
if (ret) {
- printk(KERN_WARNING "Decompressor \"%s\" returned %d\n", this->name, ret);
+ pr_warn("Decompressor \"%s\" returned %d\n",
+ this->name, ret);
}
else {
this->stat_decompr_blocks++;
@@ -287,7 +291,7 @@ int jffs2_decompress(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
return ret;
}
}
- printk(KERN_WARNING "JFFS2 compression type 0x%02x not available.\n", comprtype);
+ pr_warn("compression type 0x%02x not available\n", comprtype);
spin_unlock(&jffs2_compressor_list_lock);
return -EIO;
}
@@ -299,7 +303,7 @@ int jffs2_register_compressor(struct jffs2_compressor *comp)
struct jffs2_compressor *this;
if (!comp->name) {
- printk(KERN_WARNING "NULL compressor name at registering JFFS2 compressor. Failed.\n");
+ pr_warn("NULL compressor name at registering JFFS2 compressor. Failed.\n");
return -1;
}
comp->compr_buf_size=0;
@@ -309,7 +313,7 @@ int jffs2_register_compressor(struct jffs2_compressor *comp)
comp->stat_compr_new_size=0;
comp->stat_compr_blocks=0;
comp->stat_decompr_blocks=0;
- D1(printk(KERN_DEBUG "Registering JFFS2 compressor \"%s\"\n", comp->name));
+ jffs2_dbg(1, "Registering JFFS2 compressor \"%s\"\n", comp->name);
spin_lock(&jffs2_compressor_list_lock);
@@ -332,15 +336,15 @@ out:
int jffs2_unregister_compressor(struct jffs2_compressor *comp)
{
- D2(struct jffs2_compressor *this;)
+ D2(struct jffs2_compressor *this);
- D1(printk(KERN_DEBUG "Unregistering JFFS2 compressor \"%s\"\n", comp->name));
+ jffs2_dbg(1, "Unregistering JFFS2 compressor \"%s\"\n", comp->name);
spin_lock(&jffs2_compressor_list_lock);
if (comp->usecount) {
spin_unlock(&jffs2_compressor_list_lock);
- printk(KERN_WARNING "JFFS2: Compressor modul is in use. Unregister failed.\n");
+ pr_warn("Compressor module is in use. Unregister failed.\n");
return -1;
}
list_del(&comp->list);
@@ -377,17 +381,17 @@ int __init jffs2_compressors_init(void)
/* Setting default compression mode */
#ifdef CONFIG_JFFS2_CMODE_NONE
jffs2_compression_mode = JFFS2_COMPR_MODE_NONE;
- D1(printk(KERN_INFO "JFFS2: default compression mode: none\n");)
+ jffs2_dbg(1, "default compression mode: none\n");
#else
#ifdef CONFIG_JFFS2_CMODE_SIZE
jffs2_compression_mode = JFFS2_COMPR_MODE_SIZE;
- D1(printk(KERN_INFO "JFFS2: default compression mode: size\n");)
+ jffs2_dbg(1, "default compression mode: size\n");
#else
#ifdef CONFIG_JFFS2_CMODE_FAVOURLZO
jffs2_compression_mode = JFFS2_COMPR_MODE_FAVOURLZO;
- D1(printk(KERN_INFO "JFFS2: default compression mode: favourlzo\n");)
+ jffs2_dbg(1, "default compression mode: favourlzo\n");
#else
- D1(printk(KERN_INFO "JFFS2: default compression mode: priority\n");)
+ jffs2_dbg(1, "default compression mode: priority\n");
#endif
#endif
#endif
diff --git a/fs/jffs2/compr_lzo.c b/fs/jffs2/compr_lzo.c
index af186ee674d..c553bd6506d 100644
--- a/fs/jffs2/compr_lzo.c
+++ b/fs/jffs2/compr_lzo.c
@@ -33,7 +33,6 @@ static int __init alloc_workspace(void)
lzo_compress_buf = vmalloc(lzo1x_worst_compress(PAGE_SIZE));
if (!lzo_mem || !lzo_compress_buf) {
- printk(KERN_WARNING "Failed to allocate lzo deflate workspace\n");
free_workspace();
return -ENOMEM;
}
diff --git a/fs/jffs2/compr_rubin.c b/fs/jffs2/compr_rubin.c
index 9e7cec808c4..92e0644bf86 100644
--- a/fs/jffs2/compr_rubin.c
+++ b/fs/jffs2/compr_rubin.c
@@ -10,6 +10,8 @@
*
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/string.h>
#include <linux/types.h>
#include <linux/jffs2.h>
diff --git a/fs/jffs2/compr_zlib.c b/fs/jffs2/compr_zlib.c
index 5a001020c54..0b9a1e44e83 100644
--- a/fs/jffs2/compr_zlib.c
+++ b/fs/jffs2/compr_zlib.c
@@ -14,6 +14,8 @@
#error "The userspace support got too messy and was removed. Update your mkfs.jffs2"
#endif
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/kernel.h>
#include <linux/zlib.h>
#include <linux/zutil.h>
@@ -42,18 +44,18 @@ static int __init alloc_workspaces(void)
{
def_strm.workspace = vmalloc(zlib_deflate_workspacesize(MAX_WBITS,
MAX_MEM_LEVEL));
- if (!def_strm.workspace) {
- printk(KERN_WARNING "Failed to allocate %d bytes for deflate workspace\n", zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL));
+ if (!def_strm.workspace)
return -ENOMEM;
- }
- D1(printk(KERN_DEBUG "Allocated %d bytes for deflate workspace\n", zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL)));
+
+ jffs2_dbg(1, "Allocated %d bytes for deflate workspace\n",
+ zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL));
inf_strm.workspace = vmalloc(zlib_inflate_workspacesize());
if (!inf_strm.workspace) {
- printk(KERN_WARNING "Failed to allocate %d bytes for inflate workspace\n", zlib_inflate_workspacesize());
vfree(def_strm.workspace);
return -ENOMEM;
}
- D1(printk(KERN_DEBUG "Allocated %d bytes for inflate workspace\n", zlib_inflate_workspacesize()));
+ jffs2_dbg(1, "Allocated %d bytes for inflate workspace\n",
+ zlib_inflate_workspacesize());
return 0;
}
@@ -79,7 +81,7 @@ static int jffs2_zlib_compress(unsigned char *data_in,
mutex_lock(&deflate_mutex);
if (Z_OK != zlib_deflateInit(&def_strm, 3)) {
- printk(KERN_WARNING "deflateInit failed\n");
+ pr_warn("deflateInit failed\n");
mutex_unlock(&deflate_mutex);
return -1;
}
@@ -93,13 +95,14 @@ static int jffs2_zlib_compress(unsigned char *data_in,
while (def_strm.total_out < *dstlen - STREAM_END_SPACE && def_strm.total_in < *sourcelen) {
def_strm.avail_out = *dstlen - (def_strm.total_out + STREAM_END_SPACE);
def_strm.avail_in = min((unsigned)(*sourcelen-def_strm.total_in), def_strm.avail_out);
- D1(printk(KERN_DEBUG "calling deflate with avail_in %d, avail_out %d\n",
- def_strm.avail_in, def_strm.avail_out));
+ jffs2_dbg(1, "calling deflate with avail_in %d, avail_out %d\n",
+ def_strm.avail_in, def_strm.avail_out);
ret = zlib_deflate(&def_strm, Z_PARTIAL_FLUSH);
- D1(printk(KERN_DEBUG "deflate returned with avail_in %d, avail_out %d, total_in %ld, total_out %ld\n",
- def_strm.avail_in, def_strm.avail_out, def_strm.total_in, def_strm.total_out));
+ jffs2_dbg(1, "deflate returned with avail_in %d, avail_out %d, total_in %ld, total_out %ld\n",
+ def_strm.avail_in, def_strm.avail_out,
+ def_strm.total_in, def_strm.total_out);
if (ret != Z_OK) {
- D1(printk(KERN_DEBUG "deflate in loop returned %d\n", ret));
+ jffs2_dbg(1, "deflate in loop returned %d\n", ret);
zlib_deflateEnd(&def_strm);
mutex_unlock(&deflate_mutex);
return -1;
@@ -111,20 +114,20 @@ static int jffs2_zlib_compress(unsigned char *data_in,
zlib_deflateEnd(&def_strm);
if (ret != Z_STREAM_END) {
- D1(printk(KERN_DEBUG "final deflate returned %d\n", ret));
+ jffs2_dbg(1, "final deflate returned %d\n", ret);
ret = -1;
goto out;
}
if (def_strm.total_out >= def_strm.total_in) {
- D1(printk(KERN_DEBUG "zlib compressed %ld bytes into %ld; failing\n",
- def_strm.total_in, def_strm.total_out));
+ jffs2_dbg(1, "zlib compressed %ld bytes into %ld; failing\n",
+ def_strm.total_in, def_strm.total_out);
ret = -1;
goto out;
}
- D1(printk(KERN_DEBUG "zlib compressed %ld bytes into %ld\n",
- def_strm.total_in, def_strm.total_out));
+ jffs2_dbg(1, "zlib compressed %ld bytes into %ld\n",
+ def_strm.total_in, def_strm.total_out);
*dstlen = def_strm.total_out;
*sourcelen = def_strm.total_in;
@@ -157,18 +160,18 @@ static int jffs2_zlib_decompress(unsigned char *data_in,
((data_in[0] & 0x0f) == Z_DEFLATED) &&
!(((data_in[0]<<8) + data_in[1]) % 31)) {
- D2(printk(KERN_DEBUG "inflate skipping adler32\n"));
+ jffs2_dbg(2, "inflate skipping adler32\n");
wbits = -((data_in[0] >> 4) + 8);
inf_strm.next_in += 2;
inf_strm.avail_in -= 2;
} else {
/* Let this remain D1 for now -- it should never happen */
- D1(printk(KERN_DEBUG "inflate not skipping adler32\n"));
+ jffs2_dbg(1, "inflate not skipping adler32\n");
}
if (Z_OK != zlib_inflateInit2(&inf_strm, wbits)) {
- printk(KERN_WARNING "inflateInit failed\n");
+ pr_warn("inflateInit failed\n");
mutex_unlock(&inflate_mutex);
return 1;
}
@@ -176,7 +179,7 @@ static int jffs2_zlib_decompress(unsigned char *data_in,
while((ret = zlib_inflate(&inf_strm, Z_FINISH)) == Z_OK)
;
if (ret != Z_STREAM_END) {
- printk(KERN_NOTICE "inflate returned %d\n", ret);
+ pr_notice("inflate returned %d\n", ret);
}
zlib_inflateEnd(&inf_strm);
mutex_unlock(&inflate_mutex);
diff --git a/fs/jffs2/debug.c b/fs/jffs2/debug.c
index e0b76c87a91..1090eb64b90 100644
--- a/fs/jffs2/debug.c
+++ b/fs/jffs2/debug.c
@@ -10,6 +10,8 @@
*
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/pagemap.h>
@@ -261,12 +263,15 @@ void __jffs2_dbg_superblock_counts(struct jffs2_sb_info *c)
bad += c->sector_size;
}
-#define check(sz) \
- if (sz != c->sz##_size) { \
- printk(KERN_WARNING #sz "_size mismatch counted 0x%x, c->" #sz "_size 0x%x\n", \
- sz, c->sz##_size); \
- dump = 1; \
- }
+#define check(sz) \
+do { \
+ if (sz != c->sz##_size) { \
+ pr_warn("%s_size mismatch counted 0x%x, c->%s_size 0x%x\n", \
+ #sz, sz, #sz, c->sz##_size); \
+ dump = 1; \
+ } \
+} while (0)
+
check(free);
check(dirty);
check(used);
@@ -274,11 +279,12 @@ void __jffs2_dbg_superblock_counts(struct jffs2_sb_info *c)
check(unchecked);
check(bad);
check(erasing);
+
#undef check
if (nr_counted != c->nr_blocks) {
- printk(KERN_WARNING "%s counted only 0x%x blocks of 0x%x. Where are the others?\n",
- __func__, nr_counted, c->nr_blocks);
+ pr_warn("%s counted only 0x%x blocks of 0x%x. Where are the others?\n",
+ __func__, nr_counted, c->nr_blocks);
dump = 1;
}
diff --git a/fs/jffs2/debug.h b/fs/jffs2/debug.h
index c4f8eef5ca6..4fd9be4cbc9 100644
--- a/fs/jffs2/debug.h
+++ b/fs/jffs2/debug.h
@@ -51,6 +51,7 @@
* superseded by nicer dbg_xxx() macros...
*/
#if CONFIG_JFFS2_FS_DEBUG > 0
+#define DEBUG
#define D1(x) x
#else
#define D1(x)
@@ -62,50 +63,33 @@
#define D2(x)
#endif
+#define jffs2_dbg(level, fmt, ...) \
+do { \
+ if (CONFIG_JFFS2_FS_DEBUG >= level) \
+ pr_debug(fmt, ##__VA_ARGS__); \
+} while (0)
+
/* The prefixes of JFFS2 messages */
+#define JFFS2_DBG KERN_DEBUG
#define JFFS2_DBG_PREFIX "[JFFS2 DBG]"
-#define JFFS2_ERR_PREFIX "JFFS2 error:"
-#define JFFS2_WARN_PREFIX "JFFS2 warning:"
-#define JFFS2_NOTICE_PREFIX "JFFS2 notice:"
-
-#define JFFS2_ERR KERN_ERR
-#define JFFS2_WARN KERN_WARNING
-#define JFFS2_NOT KERN_NOTICE
-#define JFFS2_DBG KERN_DEBUG
-
#define JFFS2_DBG_MSG_PREFIX JFFS2_DBG JFFS2_DBG_PREFIX
-#define JFFS2_ERR_MSG_PREFIX JFFS2_ERR JFFS2_ERR_PREFIX
-#define JFFS2_WARN_MSG_PREFIX JFFS2_WARN JFFS2_WARN_PREFIX
-#define JFFS2_NOTICE_MSG_PREFIX JFFS2_NOT JFFS2_NOTICE_PREFIX
/* JFFS2 message macros */
-#define JFFS2_ERROR(fmt, ...) \
- do { \
- printk(JFFS2_ERR_MSG_PREFIX \
- " (%d) %s: " fmt, task_pid_nr(current), \
- __func__ , ##__VA_ARGS__); \
- } while(0)
+#define JFFS2_ERROR(fmt, ...) \
+ pr_err("error: (%d) %s: " fmt, \
+ task_pid_nr(current), __func__, ##__VA_ARGS__)
#define JFFS2_WARNING(fmt, ...) \
- do { \
- printk(JFFS2_WARN_MSG_PREFIX \
- " (%d) %s: " fmt, task_pid_nr(current), \
- __func__ , ##__VA_ARGS__); \
- } while(0)
+ pr_warn("warning: (%d) %s: " fmt, \
+ task_pid_nr(current), __func__, ##__VA_ARGS__)
#define JFFS2_NOTICE(fmt, ...) \
- do { \
- printk(JFFS2_NOTICE_MSG_PREFIX \
- " (%d) %s: " fmt, task_pid_nr(current), \
- __func__ , ##__VA_ARGS__); \
- } while(0)
+ pr_notice("notice: (%d) %s: " fmt, \
+ task_pid_nr(current), __func__, ##__VA_ARGS__)
#define JFFS2_DEBUG(fmt, ...) \
- do { \
- printk(JFFS2_DBG_MSG_PREFIX \
- " (%d) %s: " fmt, task_pid_nr(current), \
- __func__ , ##__VA_ARGS__); \
- } while(0)
+ printk(KERN_DEBUG "[JFFS2 DBG] (%d) %s: " fmt, \
+ task_pid_nr(current), __func__, ##__VA_ARGS__)
/*
* We split our debugging messages on several parts, depending on the JFFS2
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 973ac5822bd..b56018896d5 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -10,6 +10,8 @@
*
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/fs.h>
@@ -79,7 +81,7 @@ static struct dentry *jffs2_lookup(struct inode *dir_i, struct dentry *target,
uint32_t ino = 0;
struct inode *inode = NULL;
- D1(printk(KERN_DEBUG "jffs2_lookup()\n"));
+ jffs2_dbg(1, "jffs2_lookup()\n");
if (target->d_name.len > JFFS2_MAX_NAME_LEN)
return ERR_PTR(-ENAMETOOLONG);
@@ -103,7 +105,7 @@ static struct dentry *jffs2_lookup(struct inode *dir_i, struct dentry *target,
if (ino) {
inode = jffs2_iget(dir_i->i_sb, ino);
if (IS_ERR(inode))
- printk(KERN_WARNING "iget() failed for ino #%u\n", ino);
+ pr_warn("iget() failed for ino #%u\n", ino);
}
return d_splice_alias(inode, target);
@@ -119,21 +121,22 @@ static int jffs2_readdir(struct file *filp, void *dirent, filldir_t filldir)
struct jffs2_full_dirent *fd;
unsigned long offset, curofs;
- D1(printk(KERN_DEBUG "jffs2_readdir() for dir_i #%lu\n", filp->f_path.dentry->d_inode->i_ino));
+ jffs2_dbg(1, "jffs2_readdir() for dir_i #%lu\n",
+ filp->f_path.dentry->d_inode->i_ino);
f = JFFS2_INODE_INFO(inode);
offset = filp->f_pos;
if (offset == 0) {
- D1(printk(KERN_DEBUG "Dirent 0: \".\", ino #%lu\n", inode->i_ino));
+ jffs2_dbg(1, "Dirent 0: \".\", ino #%lu\n", inode->i_ino);
if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0)
goto out;
offset++;
}
if (offset == 1) {
unsigned long pino = parent_ino(filp->f_path.dentry);
- D1(printk(KERN_DEBUG "Dirent 1: \"..\", ino #%lu\n", pino));
+ jffs2_dbg(1, "Dirent 1: \"..\", ino #%lu\n", pino);
if (filldir(dirent, "..", 2, 1, pino, DT_DIR) < 0)
goto out;
offset++;
@@ -146,16 +149,18 @@ static int jffs2_readdir(struct file *filp, void *dirent, filldir_t filldir)
curofs++;
/* First loop: curofs = 2; offset = 2 */
if (curofs < offset) {
- D2(printk(KERN_DEBUG "Skipping dirent: \"%s\", ino #%u, type %d, because curofs %ld < offset %ld\n",
- fd->name, fd->ino, fd->type, curofs, offset));
+ jffs2_dbg(2, "Skipping dirent: \"%s\", ino #%u, type %d, because curofs %ld < offset %ld\n",
+ fd->name, fd->ino, fd->type, curofs, offset);
continue;
}
if (!fd->ino) {
- D2(printk(KERN_DEBUG "Skipping deletion dirent \"%s\"\n", fd->name));
+ jffs2_dbg(2, "Skipping deletion dirent \"%s\"\n",
+ fd->name);
offset++;
continue;
}
- D2(printk(KERN_DEBUG "Dirent %ld: \"%s\", ino #%u, type %d\n", offset, fd->name, fd->ino, fd->type));
+ jffs2_dbg(2, "Dirent %ld: \"%s\", ino #%u, type %d\n",
+ offset, fd->name, fd->ino, fd->type);
if (filldir(dirent, fd->name, strlen(fd->name), offset, fd->ino, fd->type) < 0)
break;
offset++;
@@ -184,12 +189,12 @@ static int jffs2_create(struct inode *dir_i, struct dentry *dentry,
c = JFFS2_SB_INFO(dir_i->i_sb);
- D1(printk(KERN_DEBUG "jffs2_create()\n"));
+ jffs2_dbg(1, "%s()\n", __func__);
inode = jffs2_new_inode(dir_i, mode, ri);
if (IS_ERR(inode)) {
- D1(printk(KERN_DEBUG "jffs2_new_inode() failed\n"));
+ jffs2_dbg(1, "jffs2_new_inode() failed\n");
jffs2_free_raw_inode(ri);
return PTR_ERR(inode);
}
@@ -217,9 +222,9 @@ static int jffs2_create(struct inode *dir_i, struct dentry *dentry,
jffs2_free_raw_inode(ri);
- D1(printk(KERN_DEBUG "jffs2_create: Created ino #%lu with mode %o, nlink %d(%d). nrpages %ld\n",
- inode->i_ino, inode->i_mode, inode->i_nlink,
- f->inocache->pino_nlink, inode->i_mapping->nrpages));
+ jffs2_dbg(1, "%s(): Created ino #%lu with mode %o, nlink %d(%d). nrpages %ld\n",
+ __func__, inode->i_ino, inode->i_mode, inode->i_nlink,
+ f->inocache->pino_nlink, inode->i_mapping->nrpages);
d_instantiate(dentry, inode);
unlock_new_inode(inode);
@@ -362,14 +367,15 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
/* We use f->target field to store the target path. */
f->target = kmemdup(target, targetlen + 1, GFP_KERNEL);
if (!f->target) {
- printk(KERN_WARNING "Can't allocate %d bytes of memory\n", targetlen + 1);
+ pr_warn("Can't allocate %d bytes of memory\n", targetlen + 1);
mutex_unlock(&f->sem);
jffs2_complete_reservation(c);
ret = -ENOMEM;
goto fail;
}
- D1(printk(KERN_DEBUG "jffs2_symlink: symlink's target '%s' cached\n", (char *)f->target));
+ jffs2_dbg(1, "%s(): symlink's target '%s' cached\n",
+ __func__, (char *)f->target);
/* No data here. Only a metadata node, which will be
obsoleted by the first data write
@@ -856,7 +862,8 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
f->inocache->pino_nlink++;
mutex_unlock(&f->sem);
- printk(KERN_NOTICE "jffs2_rename(): Link succeeded, unlink failed (err %d). You now have a hard link\n", ret);
+ pr_notice("%s(): Link succeeded, unlink failed (err %d). You now have a hard link\n",
+ __func__, ret);
/* Might as well let the VFS know */
d_instantiate(new_dentry, old_dentry->d_inode);
ihold(old_dentry->d_inode);
diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c
index a01cdad6aad..4a6cf289be2 100644
--- a/fs/jffs2/erase.c
+++ b/fs/jffs2/erase.c
@@ -10,6 +10,8 @@
*
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/mtd/mtd.h>
@@ -46,11 +48,12 @@ static void jffs2_erase_block(struct jffs2_sb_info *c,
#else /* Linux */
struct erase_info *instr;
- D1(printk(KERN_DEBUG "jffs2_erase_block(): erase block %#08x (range %#08x-%#08x)\n",
- jeb->offset, jeb->offset, jeb->offset + c->sector_size));
+ jffs2_dbg(1, "%s(): erase block %#08x (range %#08x-%#08x)\n",
+ __func__,
+ jeb->offset, jeb->offset, jeb->offset + c->sector_size);
instr = kmalloc(sizeof(struct erase_info) + sizeof(struct erase_priv_struct), GFP_KERNEL);
if (!instr) {
- printk(KERN_WARNING "kmalloc for struct erase_info in jffs2_erase_block failed. Refiling block for later\n");
+ pr_warn("kmalloc for struct erase_info in jffs2_erase_block failed. Refiling block for later\n");
mutex_lock(&c->erase_free_sem);
spin_lock(&c->erase_completion_lock);
list_move(&jeb->list, &c->erase_pending_list);
@@ -69,7 +72,6 @@ static void jffs2_erase_block(struct jffs2_sb_info *c,
instr->len = c->sector_size;
instr->callback = jffs2_erase_callback;
instr->priv = (unsigned long)(&instr[1]);
- instr->fail_addr = MTD_FAIL_ADDR_UNKNOWN;
((struct erase_priv_struct *)instr->priv)->jeb = jeb;
((struct erase_priv_struct *)instr->priv)->c = c;
@@ -84,7 +86,8 @@ static void jffs2_erase_block(struct jffs2_sb_info *c,
if (ret == -ENOMEM || ret == -EAGAIN) {
/* Erase failed immediately. Refile it on the list */
- D1(printk(KERN_DEBUG "Erase at 0x%08x failed: %d. Refiling on erase_pending_list\n", jeb->offset, ret));
+ jffs2_dbg(1, "Erase at 0x%08x failed: %d. Refiling on erase_pending_list\n",
+ jeb->offset, ret);
mutex_lock(&c->erase_free_sem);
spin_lock(&c->erase_completion_lock);
list_move(&jeb->list, &c->erase_pending_list);
@@ -97,9 +100,11 @@ static void jffs2_erase_block(struct jffs2_sb_info *c,
}
if (ret == -EROFS)
- printk(KERN_WARNING "Erase at 0x%08x failed immediately: -EROFS. Is the sector locked?\n", jeb->offset);
+ pr_warn("Erase at 0x%08x failed immediately: -EROFS. Is the sector locked?\n",
+ jeb->offset);
else
- printk(KERN_WARNING "Erase at 0x%08x failed immediately: errno %d\n", jeb->offset, ret);
+ pr_warn("Erase at 0x%08x failed immediately: errno %d\n",
+ jeb->offset, ret);
jffs2_erase_failed(c, jeb, bad_offset);
}
@@ -125,13 +130,14 @@ int jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count)
work_done++;
if (!--count) {
- D1(printk(KERN_DEBUG "Count reached. jffs2_erase_pending_blocks leaving\n"));
+ jffs2_dbg(1, "Count reached. jffs2_erase_pending_blocks leaving\n");
goto done;
}
} else if (!list_empty(&c->erase_pending_list)) {
jeb = list_entry(c->erase_pending_list.next, struct jffs2_eraseblock, list);
- D1(printk(KERN_DEBUG "Starting erase of pending block 0x%08x\n", jeb->offset));
+ jffs2_dbg(1, "Starting erase of pending block 0x%08x\n",
+ jeb->offset);
list_del(&jeb->list);
c->erasing_size += c->sector_size;
c->wasted_size -= jeb->wasted_size;
@@ -159,13 +165,13 @@ int jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count)
spin_unlock(&c->erase_completion_lock);
mutex_unlock(&c->erase_free_sem);
done:
- D1(printk(KERN_DEBUG "jffs2_erase_pending_blocks completed\n"));
+ jffs2_dbg(1, "jffs2_erase_pending_blocks completed\n");
return work_done;
}
static void jffs2_erase_succeeded(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb)
{
- D1(printk(KERN_DEBUG "Erase completed successfully at 0x%08x\n", jeb->offset));
+ jffs2_dbg(1, "Erase completed successfully at 0x%08x\n", jeb->offset);
mutex_lock(&c->erase_free_sem);
spin_lock(&c->erase_completion_lock);
list_move_tail(&jeb->list, &c->erase_complete_list);
@@ -214,7 +220,7 @@ static void jffs2_erase_callback(struct erase_info *instr)
struct erase_priv_struct *priv = (void *)instr->priv;
if(instr->state != MTD_ERASE_DONE) {
- printk(KERN_WARNING "Erase at 0x%08llx finished, but state != MTD_ERASE_DONE. State is 0x%x instead.\n",
+ pr_warn("Erase at 0x%08llx finished, but state != MTD_ERASE_DONE. State is 0x%x instead.\n",
(unsigned long long)instr->addr, instr->state);
jffs2_erase_failed(priv->c, priv->jeb, instr->fail_addr);
} else {
@@ -269,8 +275,8 @@ static inline void jffs2_remove_node_refs_from_ino_list(struct jffs2_sb_info *c,
return;
}
- D1(printk(KERN_DEBUG "Removed nodes in range 0x%08x-0x%08x from ino #%u\n",
- jeb->offset, jeb->offset + c->sector_size, ic->ino));
+ jffs2_dbg(1, "Removed nodes in range 0x%08x-0x%08x from ino #%u\n",
+ jeb->offset, jeb->offset + c->sector_size, ic->ino);
D2({
int i=0;
@@ -281,7 +287,7 @@ static inline void jffs2_remove_node_refs_from_ino_list(struct jffs2_sb_info *c,
printk(KERN_DEBUG);
while(this) {
- printk(KERN_CONT "0x%08x(%d)->",
+ pr_cont("0x%08x(%d)->",
ref_offset(this), ref_flags(this));
if (++i == 5) {
printk(KERN_DEBUG);
@@ -289,7 +295,7 @@ static inline void jffs2_remove_node_refs_from_ino_list(struct jffs2_sb_info *c,
}
this = this->next_in_ino;
}
- printk(KERN_CONT "\n");
+ pr_cont("\n");
});
switch (ic->class) {
@@ -310,7 +316,8 @@ static inline void jffs2_remove_node_refs_from_ino_list(struct jffs2_sb_info *c,
void jffs2_free_jeb_node_refs(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb)
{
struct jffs2_raw_node_ref *block, *ref;
- D1(printk(KERN_DEBUG "Freeing all node refs for eraseblock offset 0x%08x\n", jeb->offset));
+ jffs2_dbg(1, "Freeing all node refs for eraseblock offset 0x%08x\n",
+ jeb->offset);
block = ref = jeb->first_node;
@@ -335,19 +342,20 @@ static int jffs2_block_check_erase(struct jffs2_sb_info *c, struct jffs2_erasebl
void *ebuf;
uint32_t ofs;
size_t retlen;
- int ret = -EIO;
+ int ret;
unsigned long *wordebuf;
ret = mtd_point(c->mtd, jeb->offset, c->sector_size, &retlen,
&ebuf, NULL);
if (ret != -EOPNOTSUPP) {
if (ret) {
- D1(printk(KERN_DEBUG "MTD point failed %d\n", ret));
+ jffs2_dbg(1, "MTD point failed %d\n", ret);
goto do_flash_read;
}
if (retlen < c->sector_size) {
/* Don't muck about if it won't let us point to the whole erase sector */
- D1(printk(KERN_DEBUG "MTD point returned len too short: 0x%zx\n", retlen));
+ jffs2_dbg(1, "MTD point returned len too short: 0x%zx\n",
+ retlen);
mtd_unpoint(c->mtd, jeb->offset, retlen);
goto do_flash_read;
}
@@ -359,8 +367,10 @@ static int jffs2_block_check_erase(struct jffs2_sb_info *c, struct jffs2_erasebl
} while(--retlen);
mtd_unpoint(c->mtd, jeb->offset, c->sector_size);
if (retlen) {
- printk(KERN_WARNING "Newly-erased block contained word 0x%lx at offset 0x%08tx\n",
- *wordebuf, jeb->offset + c->sector_size-retlen*sizeof(*wordebuf));
+ pr_warn("Newly-erased block contained word 0x%lx at offset 0x%08tx\n",
+ *wordebuf,
+ jeb->offset +
+ c->sector_size-retlen * sizeof(*wordebuf));
return -EIO;
}
return 0;
@@ -368,11 +378,12 @@ static int jffs2_block_check_erase(struct jffs2_sb_info *c, struct jffs2_erasebl
do_flash_read:
ebuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
if (!ebuf) {
- printk(KERN_WARNING "Failed to allocate page buffer for verifying erase at 0x%08x. Refiling\n", jeb->offset);
+ pr_warn("Failed to allocate page buffer for verifying erase at 0x%08x. Refiling\n",
+ jeb->offset);
return -EAGAIN;
}
- D1(printk(KERN_DEBUG "Verifying erase at 0x%08x\n", jeb->offset));
+ jffs2_dbg(1, "Verifying erase at 0x%08x\n", jeb->offset);
for (ofs = jeb->offset; ofs < jeb->offset + c->sector_size; ) {
uint32_t readlen = min((uint32_t)PAGE_SIZE, jeb->offset + c->sector_size - ofs);
@@ -382,12 +393,14 @@ static int jffs2_block_check_erase(struct jffs2_sb_info *c, struct jffs2_erasebl
ret = mtd_read(c->mtd, ofs, readlen, &retlen, ebuf);
if (ret) {
- printk(KERN_WARNING "Read of newly-erased block at 0x%08x failed: %d. Putting on bad_list\n", ofs, ret);
+ pr_warn("Read of newly-erased block at 0x%08x failed: %d. Putting on bad_list\n",
+ ofs, ret);
ret = -EIO;
goto fail;
}
if (retlen != readlen) {
- printk(KERN_WARNING "Short read from newly-erased block at 0x%08x. Wanted %d, got %zd\n", ofs, readlen, retlen);
+ pr_warn("Short read from newly-erased block at 0x%08x. Wanted %d, got %zd\n",
+ ofs, readlen, retlen);
ret = -EIO;
goto fail;
}
@@ -396,7 +409,8 @@ static int jffs2_block_check_erase(struct jffs2_sb_info *c, struct jffs2_erasebl
unsigned long *datum = ebuf + i;
if (*datum + 1) {
*bad_offset += i;
- printk(KERN_WARNING "Newly-erased block contained word 0x%lx at offset 0x%08x\n", *datum, *bad_offset);
+ pr_warn("Newly-erased block contained word 0x%lx at offset 0x%08x\n",
+ *datum, *bad_offset);
ret = -EIO;
goto fail;
}
@@ -422,7 +436,7 @@ static void jffs2_mark_erased_block(struct jffs2_sb_info *c, struct jffs2_eraseb
}
/* Write the erase complete marker */
- D1(printk(KERN_DEBUG "Writing erased marker to block at 0x%08x\n", jeb->offset));
+ jffs2_dbg(1, "Writing erased marker to block at 0x%08x\n", jeb->offset);
bad_offset = jeb->offset;
/* Cleanmarker in oob area or no cleanmarker at all ? */
@@ -451,10 +465,10 @@ static void jffs2_mark_erased_block(struct jffs2_sb_info *c, struct jffs2_eraseb
if (ret || retlen != sizeof(marker)) {
if (ret)
- printk(KERN_WARNING "Write clean marker to block at 0x%08x failed: %d\n",
+ pr_warn("Write clean marker to block at 0x%08x failed: %d\n",
jeb->offset, ret);
else
- printk(KERN_WARNING "Short write to newly-erased block at 0x%08x: Wanted %zd, got %zd\n",
+ pr_warn("Short write to newly-erased block at 0x%08x: Wanted %zd, got %zd\n",
jeb->offset, sizeof(marker), retlen);
goto filebad;
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index 61e6723535b..db3889ba881 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -10,6 +10,8 @@
*
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/time.h>
@@ -85,7 +87,8 @@ static int jffs2_do_readpage_nolock (struct inode *inode, struct page *pg)
unsigned char *pg_buf;
int ret;
- D2(printk(KERN_DEBUG "jffs2_do_readpage_nolock(): ino #%lu, page at offset 0x%lx\n", inode->i_ino, pg->index << PAGE_CACHE_SHIFT));
+ jffs2_dbg(2, "%s(): ino #%lu, page at offset 0x%lx\n",
+ __func__, inode->i_ino, pg->index << PAGE_CACHE_SHIFT);
BUG_ON(!PageLocked(pg));
@@ -105,7 +108,7 @@ static int jffs2_do_readpage_nolock (struct inode *inode, struct page *pg)
flush_dcache_page(pg);
kunmap(pg);
- D2(printk(KERN_DEBUG "readpage finished\n"));
+ jffs2_dbg(2, "readpage finished\n");
return ret;
}
@@ -144,7 +147,7 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
return -ENOMEM;
*pagep = pg;
- D1(printk(KERN_DEBUG "jffs2_write_begin()\n"));
+ jffs2_dbg(1, "%s()\n", __func__);
if (pageofs > inode->i_size) {
/* Make new hole frag from old EOF to new page */
@@ -153,8 +156,8 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
struct jffs2_full_dnode *fn;
uint32_t alloc_len;
- D1(printk(KERN_DEBUG "Writing new hole frag 0x%x-0x%x between current EOF and new page\n",
- (unsigned int)inode->i_size, pageofs));
+ jffs2_dbg(1, "Writing new hole frag 0x%x-0x%x between current EOF and new page\n",
+ (unsigned int)inode->i_size, pageofs);
ret = jffs2_reserve_space(c, sizeof(ri), &alloc_len,
ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE);
@@ -198,7 +201,8 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
f->metadata = NULL;
}
if (ret) {
- D1(printk(KERN_DEBUG "Eep. add_full_dnode_to_inode() failed in write_begin, returned %d\n", ret));
+ jffs2_dbg(1, "Eep. add_full_dnode_to_inode() failed in write_begin, returned %d\n",
+ ret);
jffs2_mark_node_obsolete(c, fn->raw);
jffs2_free_full_dnode(fn);
jffs2_complete_reservation(c);
@@ -222,7 +226,7 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
if (ret)
goto out_page;
}
- D1(printk(KERN_DEBUG "end write_begin(). pg->flags %lx\n", pg->flags));
+ jffs2_dbg(1, "end write_begin(). pg->flags %lx\n", pg->flags);
return ret;
out_page:
@@ -248,8 +252,9 @@ static int jffs2_write_end(struct file *filp, struct address_space *mapping,
int ret = 0;
uint32_t writtenlen = 0;
- D1(printk(KERN_DEBUG "jffs2_write_end(): ino #%lu, page at 0x%lx, range %d-%d, flags %lx\n",
- inode->i_ino, pg->index << PAGE_CACHE_SHIFT, start, end, pg->flags));
+ jffs2_dbg(1, "%s(): ino #%lu, page at 0x%lx, range %d-%d, flags %lx\n",
+ __func__, inode->i_ino, pg->index << PAGE_CACHE_SHIFT,
+ start, end, pg->flags);
/* We need to avoid deadlock with page_cache_read() in
jffs2_garbage_collect_pass(). So the page must be
@@ -268,7 +273,8 @@ static int jffs2_write_end(struct file *filp, struct address_space *mapping,
ri = jffs2_alloc_raw_inode();
if (!ri) {
- D1(printk(KERN_DEBUG "jffs2_write_end(): Allocation of raw inode failed\n"));
+ jffs2_dbg(1, "%s(): Allocation of raw inode failed\n",
+ __func__);
unlock_page(pg);
page_cache_release(pg);
return -ENOMEM;
@@ -315,13 +321,14 @@ static int jffs2_write_end(struct file *filp, struct address_space *mapping,
/* generic_file_write has written more to the page cache than we've
actually written to the medium. Mark the page !Uptodate so that
it gets reread */
- D1(printk(KERN_DEBUG "jffs2_write_end(): Not all bytes written. Marking page !uptodate\n"));
+ jffs2_dbg(1, "%s(): Not all bytes written. Marking page !uptodate\n",
+ __func__);
SetPageError(pg);
ClearPageUptodate(pg);
}
- D1(printk(KERN_DEBUG "jffs2_write_end() returning %d\n",
- writtenlen > 0 ? writtenlen : ret));
+ jffs2_dbg(1, "%s() returning %d\n",
+ __func__, writtenlen > 0 ? writtenlen : ret);
unlock_page(pg);
page_cache_release(pg);
return writtenlen > 0 ? writtenlen : ret;
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 2e0123867cb..bb6f993ebca 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -10,6 +10,8 @@
*
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/capability.h>
#include <linux/kernel.h>
#include <linux/sched.h>
@@ -39,7 +41,7 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr)
int ret;
int alloc_type = ALLOC_NORMAL;
- D1(printk(KERN_DEBUG "jffs2_setattr(): ino #%lu\n", inode->i_ino));
+ jffs2_dbg(1, "%s(): ino #%lu\n", __func__, inode->i_ino);
/* Special cases - we don't want more than one data node
for these types on the medium at any time. So setattr
@@ -50,7 +52,8 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr)
/* For these, we don't actually need to read the old node */
mdatalen = jffs2_encode_dev(&dev, inode->i_rdev);
mdata = (char *)&dev;
- D1(printk(KERN_DEBUG "jffs2_setattr(): Writing %d bytes of kdev_t\n", mdatalen));
+ jffs2_dbg(1, "%s(): Writing %d bytes of kdev_t\n",
+ __func__, mdatalen);
} else if (S_ISLNK(inode->i_mode)) {
mutex_lock(&f->sem);
mdatalen = f->metadata->size;
@@ -66,7 +69,8 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr)
return ret;
}
mutex_unlock(&f->sem);
- D1(printk(KERN_DEBUG "jffs2_setattr(): Writing %d bytes of symlink target\n", mdatalen));
+ jffs2_dbg(1, "%s(): Writing %d bytes of symlink target\n",
+ __func__, mdatalen);
}
ri = jffs2_alloc_raw_inode();
@@ -233,7 +237,8 @@ void jffs2_evict_inode (struct inode *inode)
struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
- D1(printk(KERN_DEBUG "jffs2_evict_inode(): ino #%lu mode %o\n", inode->i_ino, inode->i_mode));
+ jffs2_dbg(1, "%s(): ino #%lu mode %o\n",
+ __func__, inode->i_ino, inode->i_mode);
truncate_inode_pages(&inode->i_data, 0);
end_writeback(inode);
jffs2_do_clear_inode(c, f);
@@ -249,7 +254,7 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino)
dev_t rdev = 0;
int ret;
- D1(printk(KERN_DEBUG "jffs2_iget(): ino == %lu\n", ino));
+ jffs2_dbg(1, "%s(): ino == %lu\n", __func__, ino);
inode = iget_locked(sb, ino);
if (!inode)
@@ -317,14 +322,16 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino)
/* Read the device numbers from the media */
if (f->metadata->size != sizeof(jdev.old_id) &&
f->metadata->size != sizeof(jdev.new_id)) {
- printk(KERN_NOTICE "Device node has strange size %d\n", f->metadata->size);
+ pr_notice("Device node has strange size %d\n",
+ f->metadata->size);
goto error_io;
}
- D1(printk(KERN_DEBUG "Reading device numbers from flash\n"));
+ jffs2_dbg(1, "Reading device numbers from flash\n");
ret = jffs2_read_dnode(c, f, f->metadata, (char *)&jdev, 0, f->metadata->size);
if (ret < 0) {
/* Eep */
- printk(KERN_NOTICE "Read device numbers for inode %lu failed\n", (unsigned long)inode->i_ino);
+ pr_notice("Read device numbers for inode %lu failed\n",
+ (unsigned long)inode->i_ino);
goto error;
}
if (f->metadata->size == sizeof(jdev.old_id))
@@ -339,12 +346,13 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino)
break;
default:
- printk(KERN_WARNING "jffs2_read_inode(): Bogus imode %o for ino %lu\n", inode->i_mode, (unsigned long)inode->i_ino);
+ pr_warn("%s(): Bogus i_mode %o for ino %lu\n",
+ __func__, inode->i_mode, (unsigned long)inode->i_ino);
}
mutex_unlock(&f->sem);
- D1(printk(KERN_DEBUG "jffs2_read_inode() returning\n"));
+ jffs2_dbg(1, "jffs2_read_inode() returning\n");
unlock_new_inode(inode);
return inode;
@@ -362,11 +370,13 @@ void jffs2_dirty_inode(struct inode *inode, int flags)
struct iattr iattr;
if (!(inode->i_state & I_DIRTY_DATASYNC)) {
- D2(printk(KERN_DEBUG "jffs2_dirty_inode() not calling setattr() for ino #%lu\n", inode->i_ino));
+ jffs2_dbg(2, "%s(): not calling setattr() for ino #%lu\n",
+ __func__, inode->i_ino);
return;
}
- D1(printk(KERN_DEBUG "jffs2_dirty_inode() calling setattr() for ino #%lu\n", inode->i_ino));
+ jffs2_dbg(1, "%s(): calling setattr() for ino #%lu\n",
+ __func__, inode->i_ino);
iattr.ia_valid = ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_MTIME|ATTR_CTIME;
iattr.ia_mode = inode->i_mode;
@@ -414,7 +424,8 @@ struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, struct jffs2_r
struct jffs2_inode_info *f;
int ret;
- D1(printk(KERN_DEBUG "jffs2_new_inode(): dir_i %ld, mode 0x%x\n", dir_i->i_ino, mode));
+ jffs2_dbg(1, "%s(): dir_i %ld, mode 0x%x\n",
+ __func__, dir_i->i_ino, mode);
c = JFFS2_SB_INFO(sb);
@@ -504,11 +515,11 @@ int jffs2_do_fill_super(struct super_block *sb, void *data, int silent)
#ifndef CONFIG_JFFS2_FS_WRITEBUFFER
if (c->mtd->type == MTD_NANDFLASH) {
- printk(KERN_ERR "jffs2: Cannot operate on NAND flash unless jffs2 NAND support is compiled in.\n");
+ pr_err("Cannot operate on NAND flash unless jffs2 NAND support is compiled in\n");
return -EINVAL;
}
if (c->mtd->type == MTD_DATAFLASH) {
- printk(KERN_ERR "jffs2: Cannot operate on DataFlash unless jffs2 DataFlash support is compiled in.\n");
+ pr_err("Cannot operate on DataFlash unless jffs2 DataFlash support is compiled in\n");
return -EINVAL;
}
#endif
@@ -522,12 +533,13 @@ int jffs2_do_fill_super(struct super_block *sb, void *data, int silent)
*/
if ((c->sector_size * blocks) != c->flash_size) {
c->flash_size = c->sector_size * blocks;
- printk(KERN_INFO "jffs2: Flash size not aligned to erasesize, reducing to %dKiB\n",
+ pr_info("Flash size not aligned to erasesize, reducing to %dKiB\n",
c->flash_size / 1024);
}
if (c->flash_size < 5*c->sector_size) {
- printk(KERN_ERR "jffs2: Too few erase blocks (%d)\n", c->flash_size / c->sector_size);
+ pr_err("Too few erase blocks (%d)\n",
+ c->flash_size / c->sector_size);
return -EINVAL;
}
@@ -550,20 +562,20 @@ int jffs2_do_fill_super(struct super_block *sb, void *data, int silent)
if ((ret = jffs2_do_mount_fs(c)))
goto out_inohash;
- D1(printk(KERN_DEBUG "jffs2_do_fill_super(): Getting root inode\n"));
+ jffs2_dbg(1, "%s(): Getting root inode\n", __func__);
root_i = jffs2_iget(sb, 1);
if (IS_ERR(root_i)) {
- D1(printk(KERN_WARNING "get root inode failed\n"));
+ jffs2_dbg(1, "get root inode failed\n");
ret = PTR_ERR(root_i);
goto out_root;
}
ret = -ENOMEM;
- D1(printk(KERN_DEBUG "jffs2_do_fill_super(): d_alloc_root()\n"));
- sb->s_root = d_alloc_root(root_i);
+ jffs2_dbg(1, "%s(): d_make_root()\n", __func__);
+ sb->s_root = d_make_root(root_i);
if (!sb->s_root)
- goto out_root_i;
+ goto out_root;
sb->s_maxbytes = 0xFFFFFFFF;
sb->s_blocksize = PAGE_CACHE_SIZE;
@@ -573,8 +585,6 @@ int jffs2_do_fill_super(struct super_block *sb, void *data, int silent)
jffs2_start_garbage_collect_thread(c);
return 0;
- out_root_i:
- iput(root_i);
out_root:
jffs2_free_ino_caches(c);
jffs2_free_raw_node_refs(c);
@@ -620,20 +630,21 @@ struct jffs2_inode_info *jffs2_gc_fetch_inode(struct jffs2_sb_info *c,
*/
inode = ilookup(OFNI_BS_2SFFJ(c), inum);
if (!inode) {
- D1(printk(KERN_DEBUG "ilookup() failed for ino #%u; inode is probably deleted.\n",
- inum));
+ jffs2_dbg(1, "ilookup() failed for ino #%u; inode is probably deleted.\n",
+ inum);
spin_lock(&c->inocache_lock);
ic = jffs2_get_ino_cache(c, inum);
if (!ic) {
- D1(printk(KERN_DEBUG "Inode cache for ino #%u is gone.\n", inum));
+ jffs2_dbg(1, "Inode cache for ino #%u is gone\n",
+ inum);
spin_unlock(&c->inocache_lock);
return NULL;
}
if (ic->state != INO_STATE_CHECKEDABSENT) {
/* Wait for progress. Don't just loop */
- D1(printk(KERN_DEBUG "Waiting for ino #%u in state %d\n",
- ic->ino, ic->state));
+ jffs2_dbg(1, "Waiting for ino #%u in state %d\n",
+ ic->ino, ic->state);
sleep_on_spinunlock(&c->inocache_wq, &c->inocache_lock);
} else {
spin_unlock(&c->inocache_lock);
@@ -651,8 +662,8 @@ struct jffs2_inode_info *jffs2_gc_fetch_inode(struct jffs2_sb_info *c,
return ERR_CAST(inode);
}
if (is_bad_inode(inode)) {
- printk(KERN_NOTICE "Eep. read_inode() failed for ino #%u. unlinked %d\n",
- inum, unlinked);
+ pr_notice("Eep. read_inode() failed for ino #%u. unlinked %d\n",
+ inum, unlinked);
/* NB. This will happen again. We need to do something appropriate here. */
iput(inode);
return ERR_PTR(-EIO);
diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c
index 31dce611337..ad271c70aa2 100644
--- a/fs/jffs2/gc.c
+++ b/fs/jffs2/gc.c
@@ -10,6 +10,8 @@
*
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/kernel.h>
#include <linux/mtd/mtd.h>
#include <linux/slab.h>
@@ -51,44 +53,44 @@ static struct jffs2_eraseblock *jffs2_find_gc_block(struct jffs2_sb_info *c)
number of free blocks is low. */
again:
if (!list_empty(&c->bad_used_list) && c->nr_free_blocks > c->resv_blocks_gcbad) {
- D1(printk(KERN_DEBUG "Picking block from bad_used_list to GC next\n"));
+ jffs2_dbg(1, "Picking block from bad_used_list to GC next\n");
nextlist = &c->bad_used_list;
} else if (n < 50 && !list_empty(&c->erasable_list)) {
/* Note that most of them will have gone directly to be erased.
So don't favour the erasable_list _too_ much. */
- D1(printk(KERN_DEBUG "Picking block from erasable_list to GC next\n"));
+ jffs2_dbg(1, "Picking block from erasable_list to GC next\n");
nextlist = &c->erasable_list;
} else if (n < 110 && !list_empty(&c->very_dirty_list)) {
/* Most of the time, pick one off the very_dirty list */
- D1(printk(KERN_DEBUG "Picking block from very_dirty_list to GC next\n"));
+ jffs2_dbg(1, "Picking block from very_dirty_list to GC next\n");
nextlist = &c->very_dirty_list;
} else if (n < 126 && !list_empty(&c->dirty_list)) {
- D1(printk(KERN_DEBUG "Picking block from dirty_list to GC next\n"));
+ jffs2_dbg(1, "Picking block from dirty_list to GC next\n");
nextlist = &c->dirty_list;
} else if (!list_empty(&c->clean_list)) {
- D1(printk(KERN_DEBUG "Picking block from clean_list to GC next\n"));
+ jffs2_dbg(1, "Picking block from clean_list to GC next\n");
nextlist = &c->clean_list;
} else if (!list_empty(&c->dirty_list)) {
- D1(printk(KERN_DEBUG "Picking block from dirty_list to GC next (clean_list was empty)\n"));
+ jffs2_dbg(1, "Picking block from dirty_list to GC next (clean_list was empty)\n");
nextlist = &c->dirty_list;
} else if (!list_empty(&c->very_dirty_list)) {
- D1(printk(KERN_DEBUG "Picking block from very_dirty_list to GC next (clean_list and dirty_list were empty)\n"));
+ jffs2_dbg(1, "Picking block from very_dirty_list to GC next (clean_list and dirty_list were empty)\n");
nextlist = &c->very_dirty_list;
} else if (!list_empty(&c->erasable_list)) {
- D1(printk(KERN_DEBUG "Picking block from erasable_list to GC next (clean_list and {very_,}dirty_list were empty)\n"));
+ jffs2_dbg(1, "Picking block from erasable_list to GC next (clean_list and {very_,}dirty_list were empty)\n");
nextlist = &c->erasable_list;
} else if (!list_empty(&c->erasable_pending_wbuf_list)) {
/* There are blocks are wating for the wbuf sync */
- D1(printk(KERN_DEBUG "Synching wbuf in order to reuse erasable_pending_wbuf_list blocks\n"));
+ jffs2_dbg(1, "Synching wbuf in order to reuse erasable_pending_wbuf_list blocks\n");
spin_unlock(&c->erase_completion_lock);
jffs2_flush_wbuf_pad(c);
spin_lock(&c->erase_completion_lock);
goto again;
} else {
/* Eep. All were empty */
- D1(printk(KERN_NOTICE "jffs2: No clean, dirty _or_ erasable blocks to GC from! Where are they all?\n"));
+ jffs2_dbg(1, "No clean, dirty _or_ erasable blocks to GC from! Where are they all?\n");
return NULL;
}
@@ -97,13 +99,15 @@ again:
c->gcblock = ret;
ret->gc_node = ret->first_node;
if (!ret->gc_node) {
- printk(KERN_WARNING "Eep. ret->gc_node for block at 0x%08x is NULL\n", ret->offset);
+ pr_warn("Eep. ret->gc_node for block at 0x%08x is NULL\n",
+ ret->offset);
BUG();
}
/* Have we accidentally picked a clean block with wasted space ? */
if (ret->wasted_size) {
- D1(printk(KERN_DEBUG "Converting wasted_size %08x to dirty_size\n", ret->wasted_size));
+ jffs2_dbg(1, "Converting wasted_size %08x to dirty_size\n",
+ ret->wasted_size);
ret->dirty_size += ret->wasted_size;
c->wasted_size -= ret->wasted_size;
c->dirty_size += ret->wasted_size;
@@ -140,8 +144,8 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
/* checked_ino is protected by the alloc_sem */
if (c->checked_ino > c->highest_ino && xattr) {
- printk(KERN_CRIT "Checked all inodes but still 0x%x bytes of unchecked space?\n",
- c->unchecked_size);
+ pr_crit("Checked all inodes but still 0x%x bytes of unchecked space?\n",
+ c->unchecked_size);
jffs2_dbg_dump_block_lists_nolock(c);
spin_unlock(&c->erase_completion_lock);
mutex_unlock(&c->alloc_sem);
@@ -163,8 +167,8 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
}
if (!ic->pino_nlink) {
- D1(printk(KERN_DEBUG "Skipping check of ino #%d with nlink/pino zero\n",
- ic->ino));
+ jffs2_dbg(1, "Skipping check of ino #%d with nlink/pino zero\n",
+ ic->ino);
spin_unlock(&c->inocache_lock);
jffs2_xattr_delete_inode(c, ic);
continue;
@@ -172,13 +176,15 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
switch(ic->state) {
case INO_STATE_CHECKEDABSENT:
case INO_STATE_PRESENT:
- D1(printk(KERN_DEBUG "Skipping ino #%u already checked\n", ic->ino));
+ jffs2_dbg(1, "Skipping ino #%u already checked\n",
+ ic->ino);
spin_unlock(&c->inocache_lock);
continue;
case INO_STATE_GC:
case INO_STATE_CHECKING:
- printk(KERN_WARNING "Inode #%u is in state %d during CRC check phase!\n", ic->ino, ic->state);
+ pr_warn("Inode #%u is in state %d during CRC check phase!\n",
+ ic->ino, ic->state);
spin_unlock(&c->inocache_lock);
BUG();
@@ -186,7 +192,8 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
/* We need to wait for it to finish, lest we move on
and trigger the BUG() above while we haven't yet
finished checking all its nodes */
- D1(printk(KERN_DEBUG "Waiting for ino #%u to finish reading\n", ic->ino));
+ jffs2_dbg(1, "Waiting for ino #%u to finish reading\n",
+ ic->ino);
/* We need to come back again for the _same_ inode. We've
made no progress in this case, but that should be OK */
c->checked_ino--;
@@ -204,11 +211,13 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
ic->state = INO_STATE_CHECKING;
spin_unlock(&c->inocache_lock);
- D1(printk(KERN_DEBUG "jffs2_garbage_collect_pass() triggering inode scan of ino#%u\n", ic->ino));
+ jffs2_dbg(1, "%s(): triggering inode scan of ino#%u\n",
+ __func__, ic->ino);
ret = jffs2_do_crccheck_inode(c, ic);
if (ret)
- printk(KERN_WARNING "Returned error for crccheck of ino #%u. Expect badness...\n", ic->ino);
+ pr_warn("Returned error for crccheck of ino #%u. Expect badness...\n",
+ ic->ino);
jffs2_set_inocache_state(c, ic, INO_STATE_CHECKEDABSENT);
mutex_unlock(&c->alloc_sem);
@@ -220,11 +229,11 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
!list_empty(&c->erase_pending_list)) {
spin_unlock(&c->erase_completion_lock);
mutex_unlock(&c->alloc_sem);
- D1(printk(KERN_DEBUG "jffs2_garbage_collect_pass() erasing pending blocks\n"));
+ jffs2_dbg(1, "%s(): erasing pending blocks\n", __func__);
if (jffs2_erase_pending_blocks(c, 1))
return 0;
- D1(printk(KERN_DEBUG "No progress from erasing blocks; doing GC anyway\n"));
+ jffs2_dbg(1, "No progress from erasing block; doing GC anyway\n");
spin_lock(&c->erase_completion_lock);
mutex_lock(&c->alloc_sem);
}
@@ -242,13 +251,14 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
mutex_unlock(&c->alloc_sem);
return -EAGAIN;
}
- D1(printk(KERN_NOTICE "jffs2: Couldn't find erase block to garbage collect!\n"));
+ jffs2_dbg(1, "Couldn't find erase block to garbage collect!\n");
spin_unlock(&c->erase_completion_lock);
mutex_unlock(&c->alloc_sem);
return -EIO;
}
- D1(printk(KERN_DEBUG "GC from block %08x, used_size %08x, dirty_size %08x, free_size %08x\n", jeb->offset, jeb->used_size, jeb->dirty_size, jeb->free_size));
+ jffs2_dbg(1, "GC from block %08x, used_size %08x, dirty_size %08x, free_size %08x\n",
+ jeb->offset, jeb->used_size, jeb->dirty_size, jeb->free_size);
D1(if (c->nextblock)
printk(KERN_DEBUG "Nextblock at %08x, used_size %08x, dirty_size %08x, wasted_size %08x, free_size %08x\n", c->nextblock->offset, c->nextblock->used_size, c->nextblock->dirty_size, c->nextblock->wasted_size, c->nextblock->free_size));
@@ -261,12 +271,14 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
gcblock_dirty = jeb->dirty_size;
while(ref_obsolete(raw)) {
- D1(printk(KERN_DEBUG "Node at 0x%08x is obsolete... skipping\n", ref_offset(raw)));
+ jffs2_dbg(1, "Node at 0x%08x is obsolete... skipping\n",
+ ref_offset(raw));
raw = ref_next(raw);
if (unlikely(!raw)) {
- printk(KERN_WARNING "eep. End of raw list while still supposedly nodes to GC\n");
- printk(KERN_WARNING "erase block at 0x%08x. free_size 0x%08x, dirty_size 0x%08x, used_size 0x%08x\n",
- jeb->offset, jeb->free_size, jeb->dirty_size, jeb->used_size);
+ pr_warn("eep. End of raw list while still supposedly nodes to GC\n");
+ pr_warn("erase block at 0x%08x. free_size 0x%08x, dirty_size 0x%08x, used_size 0x%08x\n",
+ jeb->offset, jeb->free_size,
+ jeb->dirty_size, jeb->used_size);
jeb->gc_node = raw;
spin_unlock(&c->erase_completion_lock);
mutex_unlock(&c->alloc_sem);
@@ -275,7 +287,8 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
}
jeb->gc_node = raw;
- D1(printk(KERN_DEBUG "Going to garbage collect node at 0x%08x\n", ref_offset(raw)));
+ jffs2_dbg(1, "Going to garbage collect node at 0x%08x\n",
+ ref_offset(raw));
if (!raw->next_in_ino) {
/* Inode-less node. Clean marker, snapshot or something like that */
@@ -316,7 +329,9 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
spin_unlock(&c->erase_completion_lock);
- D1(printk(KERN_DEBUG "jffs2_garbage_collect_pass collecting from block @0x%08x. Node @0x%08x(%d), ino #%u\n", jeb->offset, ref_offset(raw), ref_flags(raw), ic->ino));
+ jffs2_dbg(1, "%s(): collecting from block @0x%08x. Node @0x%08x(%d), ino #%u\n",
+ __func__, jeb->offset, ref_offset(raw), ref_flags(raw),
+ ic->ino);
/* Three possibilities:
1. Inode is already in-core. We must iget it and do proper
@@ -336,8 +351,8 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
if (ref_flags(raw) == REF_PRISTINE)
ic->state = INO_STATE_GC;
else {
- D1(printk(KERN_DEBUG "Ino #%u is absent but node not REF_PRISTINE. Reading.\n",
- ic->ino));
+ jffs2_dbg(1, "Ino #%u is absent but node not REF_PRISTINE. Reading.\n",
+ ic->ino);
}
break;
@@ -353,8 +368,8 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
we're holding the alloc_sem, no other garbage collection
can happen.
*/
- printk(KERN_CRIT "Inode #%u already in state %d in jffs2_garbage_collect_pass()!\n",
- ic->ino, ic->state);
+ pr_crit("Inode #%u already in state %d in jffs2_garbage_collect_pass()!\n",
+ ic->ino, ic->state);
mutex_unlock(&c->alloc_sem);
spin_unlock(&c->inocache_lock);
BUG();
@@ -367,8 +382,8 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
drop the alloc_sem before sleeping. */
mutex_unlock(&c->alloc_sem);
- D1(printk(KERN_DEBUG "jffs2_garbage_collect_pass() waiting for ino #%u in state %d\n",
- ic->ino, ic->state));
+ jffs2_dbg(1, "%s(): waiting for ino #%u in state %d\n",
+ __func__, ic->ino, ic->state);
sleep_on_spinunlock(&c->inocache_wq, &c->inocache_lock);
/* And because we dropped the alloc_sem we must start again from the
beginning. Ponder chance of livelock here -- we're returning success
@@ -433,7 +448,8 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
test_gcnode:
if (jeb->dirty_size == gcblock_dirty && !ref_obsolete(jeb->gc_node)) {
/* Eep. This really should never happen. GC is broken */
- printk(KERN_ERR "Error garbage collecting node at %08x!\n", ref_offset(jeb->gc_node));
+ pr_err("Error garbage collecting node at %08x!\n",
+ ref_offset(jeb->gc_node));
ret = -ENOSPC;
}
release_sem:
@@ -445,7 +461,8 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
eraseit:
if (c->gcblock && !c->gcblock->used_size) {
- D1(printk(KERN_DEBUG "Block at 0x%08x completely obsoleted by GC. Moving to erase_pending_list\n", c->gcblock->offset));
+ jffs2_dbg(1, "Block at 0x%08x completely obsoleted by GC. Moving to erase_pending_list\n",
+ c->gcblock->offset);
/* We're GC'ing an empty block? */
list_add_tail(&c->gcblock->list, &c->erase_pending_list);
c->gcblock = NULL;
@@ -475,12 +492,12 @@ static int jffs2_garbage_collect_live(struct jffs2_sb_info *c, struct jffs2_era
if (c->gcblock != jeb) {
spin_unlock(&c->erase_completion_lock);
- D1(printk(KERN_DEBUG "GC block is no longer gcblock. Restart\n"));
+ jffs2_dbg(1, "GC block is no longer gcblock. Restart\n");
goto upnout;
}
if (ref_obsolete(raw)) {
spin_unlock(&c->erase_completion_lock);
- D1(printk(KERN_DEBUG "node to be GC'd was obsoleted in the meantime.\n"));
+ jffs2_dbg(1, "node to be GC'd was obsoleted in the meantime.\n");
/* They'll call again */
goto upnout;
}
@@ -536,10 +553,10 @@ static int jffs2_garbage_collect_live(struct jffs2_sb_info *c, struct jffs2_era
} else if (fd) {
ret = jffs2_garbage_collect_deletion_dirent(c, jeb, f, fd);
} else {
- printk(KERN_WARNING "Raw node at 0x%08x wasn't in node lists for ino #%u\n",
- ref_offset(raw), f->inocache->ino);
+ pr_warn("Raw node at 0x%08x wasn't in node lists for ino #%u\n",
+ ref_offset(raw), f->inocache->ino);
if (ref_obsolete(raw)) {
- printk(KERN_WARNING "But it's obsolete so we don't mind too much\n");
+ pr_warn("But it's obsolete so we don't mind too much\n");
} else {
jffs2_dbg_dump_node(c, ref_offset(raw));
BUG();
@@ -562,7 +579,8 @@ static int jffs2_garbage_collect_pristine(struct jffs2_sb_info *c,
uint32_t crc, rawlen;
int retried = 0;
- D1(printk(KERN_DEBUG "Going to GC REF_PRISTINE node at 0x%08x\n", ref_offset(raw)));
+ jffs2_dbg(1, "Going to GC REF_PRISTINE node at 0x%08x\n",
+ ref_offset(raw));
alloclen = rawlen = ref_totlen(c, c->gcblock, raw);
@@ -595,8 +613,8 @@ static int jffs2_garbage_collect_pristine(struct jffs2_sb_info *c,
crc = crc32(0, node, sizeof(struct jffs2_unknown_node)-4);
if (je32_to_cpu(node->u.hdr_crc) != crc) {
- printk(KERN_WARNING "Header CRC failed on REF_PRISTINE node at 0x%08x: Read 0x%08x, calculated 0x%08x\n",
- ref_offset(raw), je32_to_cpu(node->u.hdr_crc), crc);
+ pr_warn("Header CRC failed on REF_PRISTINE node at 0x%08x: Read 0x%08x, calculated 0x%08x\n",
+ ref_offset(raw), je32_to_cpu(node->u.hdr_crc), crc);
goto bail;
}
@@ -604,16 +622,18 @@ static int jffs2_garbage_collect_pristine(struct jffs2_sb_info *c,
case JFFS2_NODETYPE_INODE:
crc = crc32(0, node, sizeof(node->i)-8);
if (je32_to_cpu(node->i.node_crc) != crc) {
- printk(KERN_WARNING "Node CRC failed on REF_PRISTINE data node at 0x%08x: Read 0x%08x, calculated 0x%08x\n",
- ref_offset(raw), je32_to_cpu(node->i.node_crc), crc);
+ pr_warn("Node CRC failed on REF_PRISTINE data node at 0x%08x: Read 0x%08x, calculated 0x%08x\n",
+ ref_offset(raw), je32_to_cpu(node->i.node_crc),
+ crc);
goto bail;
}
if (je32_to_cpu(node->i.dsize)) {
crc = crc32(0, node->i.data, je32_to_cpu(node->i.csize));
if (je32_to_cpu(node->i.data_crc) != crc) {
- printk(KERN_WARNING "Data CRC failed on REF_PRISTINE data node at 0x%08x: Read 0x%08x, calculated 0x%08x\n",
- ref_offset(raw), je32_to_cpu(node->i.data_crc), crc);
+ pr_warn("Data CRC failed on REF_PRISTINE data node at 0x%08x: Read 0x%08x, calculated 0x%08x\n",
+ ref_offset(raw),
+ je32_to_cpu(node->i.data_crc), crc);
goto bail;
}
}
@@ -622,21 +642,24 @@ static int jffs2_garbage_collect_pristine(struct jffs2_sb_info *c,
case JFFS2_NODETYPE_DIRENT:
crc = crc32(0, node, sizeof(node->d)-8);
if (je32_to_cpu(node->d.node_crc) != crc) {
- printk(KERN_WARNING "Node CRC failed on REF_PRISTINE dirent node at 0x%08x: Read 0x%08x, calculated 0x%08x\n",
- ref_offset(raw), je32_to_cpu(node->d.node_crc), crc);
+ pr_warn("Node CRC failed on REF_PRISTINE dirent node at 0x%08x: Read 0x%08x, calculated 0x%08x\n",
+ ref_offset(raw),
+ je32_to_cpu(node->d.node_crc), crc);
goto bail;
}
if (strnlen(node->d.name, node->d.nsize) != node->d.nsize) {
- printk(KERN_WARNING "Name in dirent node at 0x%08x contains zeroes\n", ref_offset(raw));
+ pr_warn("Name in dirent node at 0x%08x contains zeroes\n",
+ ref_offset(raw));
goto bail;
}
if (node->d.nsize) {
crc = crc32(0, node->d.name, node->d.nsize);
if (je32_to_cpu(node->d.name_crc) != crc) {
- printk(KERN_WARNING "Name CRC failed on REF_PRISTINE dirent node at 0x%08x: Read 0x%08x, calculated 0x%08x\n",
- ref_offset(raw), je32_to_cpu(node->d.name_crc), crc);
+ pr_warn("Name CRC failed on REF_PRISTINE dirent node at 0x%08x: Read 0x%08x, calculated 0x%08x\n",
+ ref_offset(raw),
+ je32_to_cpu(node->d.name_crc), crc);
goto bail;
}
}
@@ -644,8 +667,8 @@ static int jffs2_garbage_collect_pristine(struct jffs2_sb_info *c,
default:
/* If it's inode-less, we don't _know_ what it is. Just copy it intact */
if (ic) {
- printk(KERN_WARNING "Unknown node type for REF_PRISTINE node at 0x%08x: 0x%04x\n",
- ref_offset(raw), je16_to_cpu(node->u.nodetype));
+ pr_warn("Unknown node type for REF_PRISTINE node at 0x%08x: 0x%04x\n",
+ ref_offset(raw), je16_to_cpu(node->u.nodetype));
goto bail;
}
}
@@ -657,12 +680,13 @@ static int jffs2_garbage_collect_pristine(struct jffs2_sb_info *c,
ret = jffs2_flash_write(c, phys_ofs, rawlen, &retlen, (char *)node);
if (ret || (retlen != rawlen)) {
- printk(KERN_NOTICE "Write of %d bytes at 0x%08x failed. returned %d, retlen %zd\n",
- rawlen, phys_ofs, ret, retlen);
+ pr_notice("Write of %d bytes at 0x%08x failed. returned %d, retlen %zd\n",
+ rawlen, phys_ofs, ret, retlen);
if (retlen) {
jffs2_add_physical_node_ref(c, phys_ofs | REF_OBSOLETE, rawlen, NULL);
} else {
- printk(KERN_NOTICE "Not marking the space at 0x%08x as dirty because the flash driver returned retlen zero\n", phys_ofs);
+ pr_notice("Not marking the space at 0x%08x as dirty because the flash driver returned retlen zero\n",
+ phys_ofs);
}
if (!retried) {
/* Try to reallocate space and retry */
@@ -671,7 +695,7 @@ static int jffs2_garbage_collect_pristine(struct jffs2_sb_info *c,
retried = 1;
- D1(printk(KERN_DEBUG "Retrying failed write of REF_PRISTINE node.\n"));
+ jffs2_dbg(1, "Retrying failed write of REF_PRISTINE node.\n");
jffs2_dbg_acct_sanity_check(c,jeb);
jffs2_dbg_acct_paranoia_check(c, jeb);
@@ -681,14 +705,16 @@ static int jffs2_garbage_collect_pristine(struct jffs2_sb_info *c,
it is only an upper estimation */
if (!ret) {
- D1(printk(KERN_DEBUG "Allocated space at 0x%08x to retry failed write.\n", phys_ofs));
+ jffs2_dbg(1, "Allocated space at 0x%08x to retry failed write.\n",
+ phys_ofs);
jffs2_dbg_acct_sanity_check(c,jeb);
jffs2_dbg_acct_paranoia_check(c, jeb);
goto retry;
}
- D1(printk(KERN_DEBUG "Failed to allocate space to retry failed write: %d!\n", ret));
+ jffs2_dbg(1, "Failed to allocate space to retry failed write: %d!\n",
+ ret);
}
if (!ret)
@@ -698,7 +724,8 @@ static int jffs2_garbage_collect_pristine(struct jffs2_sb_info *c,
jffs2_add_physical_node_ref(c, phys_ofs | REF_PRISTINE, rawlen, ic);
jffs2_mark_node_obsolete(c, raw);
- D1(printk(KERN_DEBUG "WHEEE! GC REF_PRISTINE node at 0x%08x succeeded\n", ref_offset(raw)));
+ jffs2_dbg(1, "WHEEE! GC REF_PRISTINE node at 0x%08x succeeded\n",
+ ref_offset(raw));
out_node:
kfree(node);
@@ -725,29 +752,32 @@ static int jffs2_garbage_collect_metadata(struct jffs2_sb_info *c, struct jffs2_
/* For these, we don't actually need to read the old node */
mdatalen = jffs2_encode_dev(&dev, JFFS2_F_I_RDEV(f));
mdata = (char *)&dev;
- D1(printk(KERN_DEBUG "jffs2_garbage_collect_metadata(): Writing %d bytes of kdev_t\n", mdatalen));
+ jffs2_dbg(1, "%s(): Writing %d bytes of kdev_t\n",
+ __func__, mdatalen);
} else if (S_ISLNK(JFFS2_F_I_MODE(f))) {
mdatalen = fn->size;
mdata = kmalloc(fn->size, GFP_KERNEL);
if (!mdata) {
- printk(KERN_WARNING "kmalloc of mdata failed in jffs2_garbage_collect_metadata()\n");
+ pr_warn("kmalloc of mdata failed in jffs2_garbage_collect_metadata()\n");
return -ENOMEM;
}
ret = jffs2_read_dnode(c, f, fn, mdata, 0, mdatalen);
if (ret) {
- printk(KERN_WARNING "read of old metadata failed in jffs2_garbage_collect_metadata(): %d\n", ret);
+ pr_warn("read of old metadata failed in jffs2_garbage_collect_metadata(): %d\n",
+ ret);
kfree(mdata);
return ret;
}
- D1(printk(KERN_DEBUG "jffs2_garbage_collect_metadata(): Writing %d bites of symlink target\n", mdatalen));
+ jffs2_dbg(1, "%s(): Writing %d bites of symlink target\n",
+ __func__, mdatalen);
}
ret = jffs2_reserve_space_gc(c, sizeof(ri) + mdatalen, &alloclen,
JFFS2_SUMMARY_INODE_SIZE);
if (ret) {
- printk(KERN_WARNING "jffs2_reserve_space_gc of %zd bytes for garbage_collect_metadata failed: %d\n",
- sizeof(ri)+ mdatalen, ret);
+ pr_warn("jffs2_reserve_space_gc of %zd bytes for garbage_collect_metadata failed: %d\n",
+ sizeof(ri) + mdatalen, ret);
goto out;
}
@@ -784,7 +814,7 @@ static int jffs2_garbage_collect_metadata(struct jffs2_sb_info *c, struct jffs2_
new_fn = jffs2_write_dnode(c, f, &ri, mdata, mdatalen, ALLOC_GC);
if (IS_ERR(new_fn)) {
- printk(KERN_WARNING "Error writing new dnode: %ld\n", PTR_ERR(new_fn));
+ pr_warn("Error writing new dnode: %ld\n", PTR_ERR(new_fn));
ret = PTR_ERR(new_fn);
goto out;
}
@@ -827,14 +857,15 @@ static int jffs2_garbage_collect_dirent(struct jffs2_sb_info *c, struct jffs2_er
ret = jffs2_reserve_space_gc(c, sizeof(rd)+rd.nsize, &alloclen,
JFFS2_SUMMARY_DIRENT_SIZE(rd.nsize));
if (ret) {
- printk(KERN_WARNING "jffs2_reserve_space_gc of %zd bytes for garbage_collect_dirent failed: %d\n",
- sizeof(rd)+rd.nsize, ret);
+ pr_warn("jffs2_reserve_space_gc of %zd bytes for garbage_collect_dirent failed: %d\n",
+ sizeof(rd)+rd.nsize, ret);
return ret;
}
new_fd = jffs2_write_dirent(c, f, &rd, fd->name, rd.nsize, ALLOC_GC);
if (IS_ERR(new_fd)) {
- printk(KERN_WARNING "jffs2_write_dirent in garbage_collect_dirent failed: %ld\n", PTR_ERR(new_fd));
+ pr_warn("jffs2_write_dirent in garbage_collect_dirent failed: %ld\n",
+ PTR_ERR(new_fd));
return PTR_ERR(new_fd);
}
jffs2_add_fd_to_list(c, new_fd, &f->dents);
@@ -887,19 +918,22 @@ static int jffs2_garbage_collect_deletion_dirent(struct jffs2_sb_info *c, struct
if (SECTOR_ADDR(raw->flash_offset) == SECTOR_ADDR(fd->raw->flash_offset))
continue;
- D1(printk(KERN_DEBUG "Check potential deletion dirent at %08x\n", ref_offset(raw)));
+ jffs2_dbg(1, "Check potential deletion dirent at %08x\n",
+ ref_offset(raw));
/* This is an obsolete node belonging to the same directory, and it's of the right
length. We need to take a closer look...*/
ret = jffs2_flash_read(c, ref_offset(raw), rawlen, &retlen, (char *)rd);
if (ret) {
- printk(KERN_WARNING "jffs2_g_c_deletion_dirent(): Read error (%d) reading obsolete node at %08x\n", ret, ref_offset(raw));
+ pr_warn("%s(): Read error (%d) reading obsolete node at %08x\n",
+ __func__, ret, ref_offset(raw));
/* If we can't read it, we don't need to continue to obsolete it. Continue */
continue;
}
if (retlen != rawlen) {
- printk(KERN_WARNING "jffs2_g_c_deletion_dirent(): Short read (%zd not %u) reading header from obsolete node at %08x\n",
- retlen, rawlen, ref_offset(raw));
+ pr_warn("%s(): Short read (%zd not %u) reading header from obsolete node at %08x\n",
+ __func__, retlen, rawlen,
+ ref_offset(raw));
continue;
}
@@ -923,8 +957,9 @@ static int jffs2_garbage_collect_deletion_dirent(struct jffs2_sb_info *c, struct
a new deletion dirent to replace it */
mutex_unlock(&c->erase_free_sem);
- D1(printk(KERN_DEBUG "Deletion dirent at %08x still obsoletes real dirent \"%s\" at %08x for ino #%u\n",
- ref_offset(fd->raw), fd->name, ref_offset(raw), je32_to_cpu(rd->ino)));
+ jffs2_dbg(1, "Deletion dirent at %08x still obsoletes real dirent \"%s\" at %08x for ino #%u\n",
+ ref_offset(fd->raw), fd->name,
+ ref_offset(raw), je32_to_cpu(rd->ino));
kfree(rd);
return jffs2_garbage_collect_dirent(c, jeb, f, fd);
@@ -947,7 +982,8 @@ static int jffs2_garbage_collect_deletion_dirent(struct jffs2_sb_info *c, struct
fdp = &(*fdp)->next;
}
if (!found) {
- printk(KERN_WARNING "Deletion dirent \"%s\" not found in list for ino #%u\n", fd->name, f->inocache->ino);
+ pr_warn("Deletion dirent \"%s\" not found in list for ino #%u\n",
+ fd->name, f->inocache->ino);
}
jffs2_mark_node_obsolete(c, fd->raw);
jffs2_free_full_dirent(fd);
@@ -964,8 +1000,8 @@ static int jffs2_garbage_collect_hole(struct jffs2_sb_info *c, struct jffs2_eras
uint32_t alloclen, ilen;
int ret;
- D1(printk(KERN_DEBUG "Writing replacement hole node for ino #%u from offset 0x%x to 0x%x\n",
- f->inocache->ino, start, end));
+ jffs2_dbg(1, "Writing replacement hole node for ino #%u from offset 0x%x to 0x%x\n",
+ f->inocache->ino, start, end);
memset(&ri, 0, sizeof(ri));
@@ -976,35 +1012,37 @@ static int jffs2_garbage_collect_hole(struct jffs2_sb_info *c, struct jffs2_eras
write it out again with the _same_ version as before */
ret = jffs2_flash_read(c, ref_offset(fn->raw), sizeof(ri), &readlen, (char *)&ri);
if (readlen != sizeof(ri) || ret) {
- printk(KERN_WARNING "Node read failed in jffs2_garbage_collect_hole. Ret %d, retlen %zd. Data will be lost by writing new hole node\n", ret, readlen);
+ pr_warn("Node read failed in jffs2_garbage_collect_hole. Ret %d, retlen %zd. Data will be lost by writing new hole node\n",
+ ret, readlen);
goto fill;
}
if (je16_to_cpu(ri.nodetype) != JFFS2_NODETYPE_INODE) {
- printk(KERN_WARNING "jffs2_garbage_collect_hole: Node at 0x%08x had node type 0x%04x instead of JFFS2_NODETYPE_INODE(0x%04x)\n",
- ref_offset(fn->raw),
- je16_to_cpu(ri.nodetype), JFFS2_NODETYPE_INODE);
+ pr_warn("%s(): Node at 0x%08x had node type 0x%04x instead of JFFS2_NODETYPE_INODE(0x%04x)\n",
+ __func__, ref_offset(fn->raw),
+ je16_to_cpu(ri.nodetype), JFFS2_NODETYPE_INODE);
return -EIO;
}
if (je32_to_cpu(ri.totlen) != sizeof(ri)) {
- printk(KERN_WARNING "jffs2_garbage_collect_hole: Node at 0x%08x had totlen 0x%x instead of expected 0x%zx\n",
- ref_offset(fn->raw),
- je32_to_cpu(ri.totlen), sizeof(ri));
+ pr_warn("%s(): Node at 0x%08x had totlen 0x%x instead of expected 0x%zx\n",
+ __func__, ref_offset(fn->raw),
+ je32_to_cpu(ri.totlen), sizeof(ri));
return -EIO;
}
crc = crc32(0, &ri, sizeof(ri)-8);
if (crc != je32_to_cpu(ri.node_crc)) {
- printk(KERN_WARNING "jffs2_garbage_collect_hole: Node at 0x%08x had CRC 0x%08x which doesn't match calculated CRC 0x%08x\n",
- ref_offset(fn->raw),
- je32_to_cpu(ri.node_crc), crc);
+ pr_warn("%s: Node at 0x%08x had CRC 0x%08x which doesn't match calculated CRC 0x%08x\n",
+ __func__, ref_offset(fn->raw),
+ je32_to_cpu(ri.node_crc), crc);
/* FIXME: We could possibly deal with this by writing new holes for each frag */
- printk(KERN_WARNING "Data in the range 0x%08x to 0x%08x of inode #%u will be lost\n",
- start, end, f->inocache->ino);
+ pr_warn("Data in the range 0x%08x to 0x%08x of inode #%u will be lost\n",
+ start, end, f->inocache->ino);
goto fill;
}
if (ri.compr != JFFS2_COMPR_ZERO) {
- printk(KERN_WARNING "jffs2_garbage_collect_hole: Node 0x%08x wasn't a hole node!\n", ref_offset(fn->raw));
- printk(KERN_WARNING "Data in the range 0x%08x to 0x%08x of inode #%u will be lost\n",
- start, end, f->inocache->ino);
+ pr_warn("%s(): Node 0x%08x wasn't a hole node!\n",
+ __func__, ref_offset(fn->raw));
+ pr_warn("Data in the range 0x%08x to 0x%08x of inode #%u will be lost\n",
+ start, end, f->inocache->ino);
goto fill;
}
} else {
@@ -1043,14 +1081,14 @@ static int jffs2_garbage_collect_hole(struct jffs2_sb_info *c, struct jffs2_eras
ret = jffs2_reserve_space_gc(c, sizeof(ri), &alloclen,
JFFS2_SUMMARY_INODE_SIZE);
if (ret) {
- printk(KERN_WARNING "jffs2_reserve_space_gc of %zd bytes for garbage_collect_hole failed: %d\n",
- sizeof(ri), ret);
+ pr_warn("jffs2_reserve_space_gc of %zd bytes for garbage_collect_hole failed: %d\n",
+ sizeof(ri), ret);
return ret;
}
new_fn = jffs2_write_dnode(c, f, &ri, NULL, 0, ALLOC_GC);
if (IS_ERR(new_fn)) {
- printk(KERN_WARNING "Error writing new hole node: %ld\n", PTR_ERR(new_fn));
+ pr_warn("Error writing new hole node: %ld\n", PTR_ERR(new_fn));
return PTR_ERR(new_fn);
}
if (je32_to_cpu(ri.version) == f->highest_version) {
@@ -1070,9 +1108,9 @@ static int jffs2_garbage_collect_hole(struct jffs2_sb_info *c, struct jffs2_eras
* above.)
*/
D1(if(unlikely(fn->frags <= 1)) {
- printk(KERN_WARNING "jffs2_garbage_collect_hole: Replacing fn with %d frag(s) but new ver %d != highest_version %d of ino #%d\n",
- fn->frags, je32_to_cpu(ri.version), f->highest_version,
- je32_to_cpu(ri.ino));
+ pr_warn("%s(): Replacing fn with %d frag(s) but new ver %d != highest_version %d of ino #%d\n",
+ __func__, fn->frags, je32_to_cpu(ri.version),
+ f->highest_version, je32_to_cpu(ri.ino));
});
/* This is a partially-overlapped hole node. Mark it REF_NORMAL not REF_PRISTINE */
@@ -1089,11 +1127,11 @@ static int jffs2_garbage_collect_hole(struct jffs2_sb_info *c, struct jffs2_eras
}
}
if (fn->frags) {
- printk(KERN_WARNING "jffs2_garbage_collect_hole: Old node still has frags!\n");
+ pr_warn("%s(): Old node still has frags!\n", __func__);
BUG();
}
if (!new_fn->frags) {
- printk(KERN_WARNING "jffs2_garbage_collect_hole: New node has no frags!\n");
+ pr_warn("%s(): New node has no frags!\n", __func__);
BUG();
}
@@ -1117,8 +1155,8 @@ static int jffs2_garbage_collect_dnode(struct jffs2_sb_info *c, struct jffs2_era
memset(&ri, 0, sizeof(ri));
- D1(printk(KERN_DEBUG "Writing replacement dnode for ino #%u from offset 0x%x to 0x%x\n",
- f->inocache->ino, start, end));
+ jffs2_dbg(1, "Writing replacement dnode for ino #%u from offset 0x%x to 0x%x\n",
+ f->inocache->ino, start, end);
orig_end = end;
orig_start = start;
@@ -1149,15 +1187,15 @@ static int jffs2_garbage_collect_dnode(struct jffs2_sb_info *c, struct jffs2_era
/* If the previous frag doesn't even reach the beginning, there's
excessive fragmentation. Just merge. */
if (frag->ofs > min) {
- D1(printk(KERN_DEBUG "Expanding down to cover partial frag (0x%x-0x%x)\n",
- frag->ofs, frag->ofs+frag->size));
+ jffs2_dbg(1, "Expanding down to cover partial frag (0x%x-0x%x)\n",
+ frag->ofs, frag->ofs+frag->size);
start = frag->ofs;
continue;
}
/* OK. This frag holds the first byte of the page. */
if (!frag->node || !frag->node->raw) {
- D1(printk(KERN_DEBUG "First frag in page is hole (0x%x-0x%x). Not expanding down.\n",
- frag->ofs, frag->ofs+frag->size));
+ jffs2_dbg(1, "First frag in page is hole (0x%x-0x%x). Not expanding down.\n",
+ frag->ofs, frag->ofs+frag->size);
break;
} else {
@@ -1171,19 +1209,25 @@ static int jffs2_garbage_collect_dnode(struct jffs2_sb_info *c, struct jffs2_era
jeb = &c->blocks[raw->flash_offset / c->sector_size];
if (jeb == c->gcblock) {
- D1(printk(KERN_DEBUG "Expanding down to cover frag (0x%x-0x%x) in gcblock at %08x\n",
- frag->ofs, frag->ofs+frag->size, ref_offset(raw)));
+ jffs2_dbg(1, "Expanding down to cover frag (0x%x-0x%x) in gcblock at %08x\n",
+ frag->ofs,
+ frag->ofs + frag->size,
+ ref_offset(raw));
start = frag->ofs;
break;
}
if (!ISDIRTY(jeb->dirty_size + jeb->wasted_size)) {
- D1(printk(KERN_DEBUG "Not expanding down to cover frag (0x%x-0x%x) in clean block %08x\n",
- frag->ofs, frag->ofs+frag->size, jeb->offset));
+ jffs2_dbg(1, "Not expanding down to cover frag (0x%x-0x%x) in clean block %08x\n",
+ frag->ofs,
+ frag->ofs + frag->size,
+ jeb->offset);
break;
}
- D1(printk(KERN_DEBUG "Expanding down to cover frag (0x%x-0x%x) in dirty block %08x\n",
- frag->ofs, frag->ofs+frag->size, jeb->offset));
+ jffs2_dbg(1, "Expanding down to cover frag (0x%x-0x%x) in dirty block %08x\n",
+ frag->ofs,
+ frag->ofs + frag->size,
+ jeb->offset);
start = frag->ofs;
break;
}
@@ -1199,15 +1243,15 @@ static int jffs2_garbage_collect_dnode(struct jffs2_sb_info *c, struct jffs2_era
/* If the previous frag doesn't even reach the beginning, there's lots
of fragmentation. Just merge. */
if (frag->ofs+frag->size < max) {
- D1(printk(KERN_DEBUG "Expanding up to cover partial frag (0x%x-0x%x)\n",
- frag->ofs, frag->ofs+frag->size));
+ jffs2_dbg(1, "Expanding up to cover partial frag (0x%x-0x%x)\n",
+ frag->ofs, frag->ofs+frag->size);
end = frag->ofs + frag->size;
continue;
}
if (!frag->node || !frag->node->raw) {
- D1(printk(KERN_DEBUG "Last frag in page is hole (0x%x-0x%x). Not expanding up.\n",
- frag->ofs, frag->ofs+frag->size));
+ jffs2_dbg(1, "Last frag in page is hole (0x%x-0x%x). Not expanding up.\n",
+ frag->ofs, frag->ofs+frag->size);
break;
} else {
@@ -1221,25 +1265,31 @@ static int jffs2_garbage_collect_dnode(struct jffs2_sb_info *c, struct jffs2_era
jeb = &c->blocks[raw->flash_offset / c->sector_size];
if (jeb == c->gcblock) {
- D1(printk(KERN_DEBUG "Expanding up to cover frag (0x%x-0x%x) in gcblock at %08x\n",
- frag->ofs, frag->ofs+frag->size, ref_offset(raw)));
+ jffs2_dbg(1, "Expanding up to cover frag (0x%x-0x%x) in gcblock at %08x\n",
+ frag->ofs,
+ frag->ofs + frag->size,
+ ref_offset(raw));
end = frag->ofs + frag->size;
break;
}
if (!ISDIRTY(jeb->dirty_size + jeb->wasted_size)) {
- D1(printk(KERN_DEBUG "Not expanding up to cover frag (0x%x-0x%x) in clean block %08x\n",
- frag->ofs, frag->ofs+frag->size, jeb->offset));
+ jffs2_dbg(1, "Not expanding up to cover frag (0x%x-0x%x) in clean block %08x\n",
+ frag->ofs,
+ frag->ofs + frag->size,
+ jeb->offset);
break;
}
- D1(printk(KERN_DEBUG "Expanding up to cover frag (0x%x-0x%x) in dirty block %08x\n",
- frag->ofs, frag->ofs+frag->size, jeb->offset));
+ jffs2_dbg(1, "Expanding up to cover frag (0x%x-0x%x) in dirty block %08x\n",
+ frag->ofs,
+ frag->ofs + frag->size,
+ jeb->offset);
end = frag->ofs + frag->size;
break;
}
}
- D1(printk(KERN_DEBUG "Expanded dnode to write from (0x%x-0x%x) to (0x%x-0x%x)\n",
- orig_start, orig_end, start, end));
+ jffs2_dbg(1, "Expanded dnode to write from (0x%x-0x%x) to (0x%x-0x%x)\n",
+ orig_start, orig_end, start, end);
D1(BUG_ON(end > frag_last(&f->fragtree)->ofs + frag_last(&f->fragtree)->size));
BUG_ON(end < orig_end);
@@ -1256,7 +1306,8 @@ static int jffs2_garbage_collect_dnode(struct jffs2_sb_info *c, struct jffs2_era
pg_ptr = jffs2_gc_fetch_page(c, f, start, &pg);
if (IS_ERR(pg_ptr)) {
- printk(KERN_WARNING "read_cache_page() returned error: %ld\n", PTR_ERR(pg_ptr));
+ pr_warn("read_cache_page() returned error: %ld\n",
+ PTR_ERR(pg_ptr));
return PTR_ERR(pg_ptr);
}
@@ -1270,8 +1321,8 @@ static int jffs2_garbage_collect_dnode(struct jffs2_sb_info *c, struct jffs2_era
&alloclen, JFFS2_SUMMARY_INODE_SIZE);
if (ret) {
- printk(KERN_WARNING "jffs2_reserve_space_gc of %zd bytes for garbage_collect_dnode failed: %d\n",
- sizeof(ri)+ JFFS2_MIN_DATA_LEN, ret);
+ pr_warn("jffs2_reserve_space_gc of %zd bytes for garbage_collect_dnode failed: %d\n",
+ sizeof(ri) + JFFS2_MIN_DATA_LEN, ret);
break;
}
cdatalen = min_t(uint32_t, alloclen - sizeof(ri), end - offset);
@@ -1308,7 +1359,8 @@ static int jffs2_garbage_collect_dnode(struct jffs2_sb_info *c, struct jffs2_era
jffs2_free_comprbuf(comprbuf, writebuf);
if (IS_ERR(new_fn)) {
- printk(KERN_WARNING "Error writing new dnode: %ld\n", PTR_ERR(new_fn));
+ pr_warn("Error writing new dnode: %ld\n",
+ PTR_ERR(new_fn));
ret = PTR_ERR(new_fn);
break;
}
diff --git a/fs/jffs2/malloc.c b/fs/jffs2/malloc.c
index c082868910f..4f47aa24b55 100644
--- a/fs/jffs2/malloc.c
+++ b/fs/jffs2/malloc.c
@@ -9,6 +9,8 @@
*
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/init.h>
diff --git a/fs/jffs2/nodelist.c b/fs/jffs2/nodelist.c
index 5e03233c236..975a1f562c1 100644
--- a/fs/jffs2/nodelist.c
+++ b/fs/jffs2/nodelist.c
@@ -9,6 +9,8 @@
*
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/fs.h>
@@ -687,8 +689,8 @@ int jffs2_scan_dirty_space(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb
if (!size)
return 0;
if (unlikely(size > jeb->free_size)) {
- printk(KERN_CRIT "Dirty space 0x%x larger then free_size 0x%x (wasted 0x%x)\n",
- size, jeb->free_size, jeb->wasted_size);
+ pr_crit("Dirty space 0x%x larger then free_size 0x%x (wasted 0x%x)\n",
+ size, jeb->free_size, jeb->wasted_size);
BUG();
}
/* REF_EMPTY_NODE is !obsolete, so that works OK */
@@ -726,8 +728,10 @@ static inline uint32_t __ref_totlen(struct jffs2_sb_info *c,
/* Last node in block. Use free_space */
if (unlikely(ref != jeb->last_node)) {
- printk(KERN_CRIT "ref %p @0x%08x is not jeb->last_node (%p @0x%08x)\n",
- ref, ref_offset(ref), jeb->last_node, jeb->last_node?ref_offset(jeb->last_node):0);
+ pr_crit("ref %p @0x%08x is not jeb->last_node (%p @0x%08x)\n",
+ ref, ref_offset(ref), jeb->last_node,
+ jeb->last_node ?
+ ref_offset(jeb->last_node) : 0);
BUG();
}
ref_end = jeb->offset + c->sector_size - jeb->free_size;
@@ -747,16 +751,20 @@ uint32_t __jffs2_ref_totlen(struct jffs2_sb_info *c, struct jffs2_eraseblock *je
if (!jeb)
jeb = &c->blocks[ref->flash_offset / c->sector_size];
- printk(KERN_CRIT "Totlen for ref at %p (0x%08x-0x%08x) miscalculated as 0x%x instead of %x\n",
- ref, ref_offset(ref), ref_offset(ref)+ref->__totlen,
- ret, ref->__totlen);
+ pr_crit("Totlen for ref at %p (0x%08x-0x%08x) miscalculated as 0x%x instead of %x\n",
+ ref, ref_offset(ref), ref_offset(ref) + ref->__totlen,
+ ret, ref->__totlen);
if (ref_next(ref)) {
- printk(KERN_CRIT "next %p (0x%08x-0x%08x)\n", ref_next(ref), ref_offset(ref_next(ref)),
- ref_offset(ref_next(ref))+ref->__totlen);
+ pr_crit("next %p (0x%08x-0x%08x)\n",
+ ref_next(ref), ref_offset(ref_next(ref)),
+ ref_offset(ref_next(ref)) + ref->__totlen);
} else
- printk(KERN_CRIT "No next ref. jeb->last_node is %p\n", jeb->last_node);
+ pr_crit("No next ref. jeb->last_node is %p\n",
+ jeb->last_node);
- printk(KERN_CRIT "jeb->wasted_size %x, dirty_size %x, used_size %x, free_size %x\n", jeb->wasted_size, jeb->dirty_size, jeb->used_size, jeb->free_size);
+ pr_crit("jeb->wasted_size %x, dirty_size %x, used_size %x, free_size %x\n",
+ jeb->wasted_size, jeb->dirty_size, jeb->used_size,
+ jeb->free_size);
#if defined(JFFS2_DBG_DUMPS) || defined(JFFS2_DBG_PARANOIA_CHECKS)
__jffs2_dbg_dump_node_refs_nolock(c, jeb);
diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c
index 694aa5b0350..6784d1e7a7e 100644
--- a/fs/jffs2/nodemgmt.c
+++ b/fs/jffs2/nodemgmt.c
@@ -9,6 +9,8 @@
*
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/kernel.h>
#include <linux/mtd/mtd.h>
#include <linux/compiler.h>
@@ -46,10 +48,10 @@ int jffs2_reserve_space(struct jffs2_sb_info *c, uint32_t minsize,
/* align it */
minsize = PAD(minsize);
- D1(printk(KERN_DEBUG "jffs2_reserve_space(): Requested 0x%x bytes\n", minsize));
+ jffs2_dbg(1, "%s(): Requested 0x%x bytes\n", __func__, minsize);
mutex_lock(&c->alloc_sem);
- D1(printk(KERN_DEBUG "jffs2_reserve_space(): alloc sem got\n"));
+ jffs2_dbg(1, "%s(): alloc sem got\n", __func__);
spin_lock(&c->erase_completion_lock);
@@ -73,11 +75,13 @@ int jffs2_reserve_space(struct jffs2_sb_info *c, uint32_t minsize,
dirty = c->dirty_size + c->erasing_size - c->nr_erasing_blocks * c->sector_size + c->unchecked_size;
if (dirty < c->nospc_dirty_size) {
if (prio == ALLOC_DELETION && c->nr_free_blocks + c->nr_erasing_blocks >= c->resv_blocks_deletion) {
- D1(printk(KERN_NOTICE "jffs2_reserve_space(): Low on dirty space to GC, but it's a deletion. Allowing...\n"));
+ jffs2_dbg(1, "%s(): Low on dirty space to GC, but it's a deletion. Allowing...\n",
+ __func__);
break;
}
- D1(printk(KERN_DEBUG "dirty size 0x%08x + unchecked_size 0x%08x < nospc_dirty_size 0x%08x, returning -ENOSPC\n",
- dirty, c->unchecked_size, c->sector_size));
+ jffs2_dbg(1, "dirty size 0x%08x + unchecked_size 0x%08x < nospc_dirty_size 0x%08x, returning -ENOSPC\n",
+ dirty, c->unchecked_size,
+ c->sector_size);
spin_unlock(&c->erase_completion_lock);
mutex_unlock(&c->alloc_sem);
@@ -96,12 +100,13 @@ int jffs2_reserve_space(struct jffs2_sb_info *c, uint32_t minsize,
avail = c->free_size + c->dirty_size + c->erasing_size + c->unchecked_size;
if ( (avail / c->sector_size) <= blocksneeded) {
if (prio == ALLOC_DELETION && c->nr_free_blocks + c->nr_erasing_blocks >= c->resv_blocks_deletion) {
- D1(printk(KERN_NOTICE "jffs2_reserve_space(): Low on possibly available space, but it's a deletion. Allowing...\n"));
+ jffs2_dbg(1, "%s(): Low on possibly available space, but it's a deletion. Allowing...\n",
+ __func__);
break;
}
- D1(printk(KERN_DEBUG "max. available size 0x%08x < blocksneeded * sector_size 0x%08x, returning -ENOSPC\n",
- avail, blocksneeded * c->sector_size));
+ jffs2_dbg(1, "max. available size 0x%08x < blocksneeded * sector_size 0x%08x, returning -ENOSPC\n",
+ avail, blocksneeded * c->sector_size);
spin_unlock(&c->erase_completion_lock);
mutex_unlock(&c->alloc_sem);
return -ENOSPC;
@@ -109,9 +114,14 @@ int jffs2_reserve_space(struct jffs2_sb_info *c, uint32_t minsize,
mutex_unlock(&c->alloc_sem);
- D1(printk(KERN_DEBUG "Triggering GC pass. nr_free_blocks %d, nr_erasing_blocks %d, free_size 0x%08x, dirty_size 0x%08x, wasted_size 0x%08x, used_size 0x%08x, erasing_size 0x%08x, bad_size 0x%08x (total 0x%08x of 0x%08x)\n",
- c->nr_free_blocks, c->nr_erasing_blocks, c->free_size, c->dirty_size, c->wasted_size, c->used_size, c->erasing_size, c->bad_size,
- c->free_size + c->dirty_size + c->wasted_size + c->used_size + c->erasing_size + c->bad_size, c->flash_size));
+ jffs2_dbg(1, "Triggering GC pass. nr_free_blocks %d, nr_erasing_blocks %d, free_size 0x%08x, dirty_size 0x%08x, wasted_size 0x%08x, used_size 0x%08x, erasing_size 0x%08x, bad_size 0x%08x (total 0x%08x of 0x%08x)\n",
+ c->nr_free_blocks, c->nr_erasing_blocks,
+ c->free_size, c->dirty_size, c->wasted_size,
+ c->used_size, c->erasing_size, c->bad_size,
+ c->free_size + c->dirty_size +
+ c->wasted_size + c->used_size +
+ c->erasing_size + c->bad_size,
+ c->flash_size);
spin_unlock(&c->erase_completion_lock);
ret = jffs2_garbage_collect_pass(c);
@@ -124,7 +134,8 @@ int jffs2_reserve_space(struct jffs2_sb_info *c, uint32_t minsize,
DECLARE_WAITQUEUE(wait, current);
set_current_state(TASK_UNINTERRUPTIBLE);
add_wait_queue(&c->erase_wait, &wait);
- D1(printk(KERN_DEBUG "%s waiting for erase to complete\n", __func__));
+ jffs2_dbg(1, "%s waiting for erase to complete\n",
+ __func__);
spin_unlock(&c->erase_completion_lock);
schedule();
@@ -144,7 +155,7 @@ int jffs2_reserve_space(struct jffs2_sb_info *c, uint32_t minsize,
ret = jffs2_do_reserve_space(c, minsize, len, sumsize);
if (ret) {
- D1(printk(KERN_DEBUG "jffs2_reserve_space: ret is %d\n", ret));
+ jffs2_dbg(1, "%s(): ret is %d\n", __func__, ret);
}
}
spin_unlock(&c->erase_completion_lock);
@@ -161,13 +172,14 @@ int jffs2_reserve_space_gc(struct jffs2_sb_info *c, uint32_t minsize,
int ret = -EAGAIN;
minsize = PAD(minsize);
- D1(printk(KERN_DEBUG "jffs2_reserve_space_gc(): Requested 0x%x bytes\n", minsize));
+ jffs2_dbg(1, "%s(): Requested 0x%x bytes\n", __func__, minsize);
spin_lock(&c->erase_completion_lock);
while(ret == -EAGAIN) {
ret = jffs2_do_reserve_space(c, minsize, len, sumsize);
if (ret) {
- D1(printk(KERN_DEBUG "jffs2_reserve_space_gc: looping, ret is %d\n", ret));
+ jffs2_dbg(1, "%s(): looping, ret is %d\n",
+ __func__, ret);
}
}
spin_unlock(&c->erase_completion_lock);
@@ -184,8 +196,8 @@ static void jffs2_close_nextblock(struct jffs2_sb_info *c, struct jffs2_eraseblo
{
if (c->nextblock == NULL) {
- D1(printk(KERN_DEBUG "jffs2_close_nextblock: Erase block at 0x%08x has already been placed in a list\n",
- jeb->offset));
+ jffs2_dbg(1, "%s(): Erase block at 0x%08x has already been placed in a list\n",
+ __func__, jeb->offset);
return;
}
/* Check, if we have a dirty block now, or if it was dirty already */
@@ -195,17 +207,20 @@ static void jffs2_close_nextblock(struct jffs2_sb_info *c, struct jffs2_eraseblo
jeb->dirty_size += jeb->wasted_size;
jeb->wasted_size = 0;
if (VERYDIRTY(c, jeb->dirty_size)) {
- D1(printk(KERN_DEBUG "Adding full erase block at 0x%08x to very_dirty_list (free 0x%08x, dirty 0x%08x, used 0x%08x\n",
- jeb->offset, jeb->free_size, jeb->dirty_size, jeb->used_size));
+ jffs2_dbg(1, "Adding full erase block at 0x%08x to very_dirty_list (free 0x%08x, dirty 0x%08x, used 0x%08x\n",
+ jeb->offset, jeb->free_size, jeb->dirty_size,
+ jeb->used_size);
list_add_tail(&jeb->list, &c->very_dirty_list);
} else {
- D1(printk(KERN_DEBUG "Adding full erase block at 0x%08x to dirty_list (free 0x%08x, dirty 0x%08x, used 0x%08x\n",
- jeb->offset, jeb->free_size, jeb->dirty_size, jeb->used_size));
+ jffs2_dbg(1, "Adding full erase block at 0x%08x to dirty_list (free 0x%08x, dirty 0x%08x, used 0x%08x\n",
+ jeb->offset, jeb->free_size, jeb->dirty_size,
+ jeb->used_size);
list_add_tail(&jeb->list, &c->dirty_list);
}
} else {
- D1(printk(KERN_DEBUG "Adding full erase block at 0x%08x to clean_list (free 0x%08x, dirty 0x%08x, used 0x%08x\n",
- jeb->offset, jeb->free_size, jeb->dirty_size, jeb->used_size));
+ jffs2_dbg(1, "Adding full erase block at 0x%08x to clean_list (free 0x%08x, dirty 0x%08x, used 0x%08x\n",
+ jeb->offset, jeb->free_size, jeb->dirty_size,
+ jeb->used_size);
list_add_tail(&jeb->list, &c->clean_list);
}
c->nextblock = NULL;
@@ -230,13 +245,14 @@ static int jffs2_find_nextblock(struct jffs2_sb_info *c)
list_move_tail(&ejeb->list, &c->erase_pending_list);
c->nr_erasing_blocks++;
jffs2_garbage_collect_trigger(c);
- D1(printk(KERN_DEBUG "jffs2_find_nextblock: Triggering erase of erasable block at 0x%08x\n",
- ejeb->offset));
+ jffs2_dbg(1, "%s(): Triggering erase of erasable block at 0x%08x\n",
+ __func__, ejeb->offset);
}
if (!c->nr_erasing_blocks &&
!list_empty(&c->erasable_pending_wbuf_list)) {
- D1(printk(KERN_DEBUG "jffs2_find_nextblock: Flushing write buffer\n"));
+ jffs2_dbg(1, "%s(): Flushing write buffer\n",
+ __func__);
/* c->nextblock is NULL, no update to c->nextblock allowed */
spin_unlock(&c->erase_completion_lock);
jffs2_flush_wbuf_pad(c);
@@ -248,9 +264,11 @@ static int jffs2_find_nextblock(struct jffs2_sb_info *c)
if (!c->nr_erasing_blocks) {
/* Ouch. We're in GC, or we wouldn't have got here.
And there's no space left. At all. */
- printk(KERN_CRIT "Argh. No free space left for GC. nr_erasing_blocks is %d. nr_free_blocks is %d. (erasableempty: %s, erasingempty: %s, erasependingempty: %s)\n",
- c->nr_erasing_blocks, c->nr_free_blocks, list_empty(&c->erasable_list)?"yes":"no",
- list_empty(&c->erasing_list)?"yes":"no", list_empty(&c->erase_pending_list)?"yes":"no");
+ pr_crit("Argh. No free space left for GC. nr_erasing_blocks is %d. nr_free_blocks is %d. (erasableempty: %s, erasingempty: %s, erasependingempty: %s)\n",
+ c->nr_erasing_blocks, c->nr_free_blocks,
+ list_empty(&c->erasable_list) ? "yes" : "no",
+ list_empty(&c->erasing_list) ? "yes" : "no",
+ list_empty(&c->erase_pending_list) ? "yes" : "no");
return -ENOSPC;
}
@@ -278,7 +296,8 @@ static int jffs2_find_nextblock(struct jffs2_sb_info *c)
c->wbuf_ofs = 0xffffffff;
#endif
- D1(printk(KERN_DEBUG "jffs2_find_nextblock(): new nextblock = 0x%08x\n", c->nextblock->offset));
+ jffs2_dbg(1, "%s(): new nextblock = 0x%08x\n",
+ __func__, c->nextblock->offset);
return 0;
}
@@ -345,7 +364,8 @@ static int jffs2_do_reserve_space(struct jffs2_sb_info *c, uint32_t minsize,
if (jffs2_wbuf_dirty(c)) {
spin_unlock(&c->erase_completion_lock);
- D1(printk(KERN_DEBUG "jffs2_do_reserve_space: Flushing write buffer\n"));
+ jffs2_dbg(1, "%s(): Flushing write buffer\n",
+ __func__);
jffs2_flush_wbuf_pad(c);
spin_lock(&c->erase_completion_lock);
jeb = c->nextblock;
@@ -387,7 +407,8 @@ static int jffs2_do_reserve_space(struct jffs2_sb_info *c, uint32_t minsize,
jeb = c->nextblock;
if (jeb->free_size != c->sector_size - c->cleanmarker_size) {
- printk(KERN_WARNING "Eep. Block 0x%08x taken from free_list had free_size of 0x%08x!!\n", jeb->offset, jeb->free_size);
+ pr_warn("Eep. Block 0x%08x taken from free_list had free_size of 0x%08x!!\n",
+ jeb->offset, jeb->free_size);
goto restart;
}
}
@@ -408,8 +429,9 @@ static int jffs2_do_reserve_space(struct jffs2_sb_info *c, uint32_t minsize,
spin_lock(&c->erase_completion_lock);
}
- D1(printk(KERN_DEBUG "jffs2_do_reserve_space(): Giving 0x%x bytes at 0x%x\n",
- *len, jeb->offset + (c->sector_size - jeb->free_size)));
+ jffs2_dbg(1, "%s(): Giving 0x%x bytes at 0x%x\n",
+ __func__,
+ *len, jeb->offset + (c->sector_size - jeb->free_size));
return 0;
}
@@ -434,20 +456,22 @@ struct jffs2_raw_node_ref *jffs2_add_physical_node_ref(struct jffs2_sb_info *c,
jeb = &c->blocks[ofs / c->sector_size];
- D1(printk(KERN_DEBUG "jffs2_add_physical_node_ref(): Node at 0x%x(%d), size 0x%x\n",
- ofs & ~3, ofs & 3, len));
+ jffs2_dbg(1, "%s(): Node at 0x%x(%d), size 0x%x\n",
+ __func__, ofs & ~3, ofs & 3, len);
#if 1
/* Allow non-obsolete nodes only to be added at the end of c->nextblock,
if c->nextblock is set. Note that wbuf.c will file obsolete nodes
even after refiling c->nextblock */
if ((c->nextblock || ((ofs & 3) != REF_OBSOLETE))
&& (jeb != c->nextblock || (ofs & ~3) != jeb->offset + (c->sector_size - jeb->free_size))) {
- printk(KERN_WARNING "argh. node added in wrong place at 0x%08x(%d)\n", ofs & ~3, ofs & 3);
+ pr_warn("argh. node added in wrong place at 0x%08x(%d)\n",
+ ofs & ~3, ofs & 3);
if (c->nextblock)
- printk(KERN_WARNING "nextblock 0x%08x", c->nextblock->offset);
+ pr_warn("nextblock 0x%08x", c->nextblock->offset);
else
- printk(KERN_WARNING "No nextblock");
- printk(", expected at %08x\n", jeb->offset + (c->sector_size - jeb->free_size));
+ pr_warn("No nextblock");
+ pr_cont(", expected at %08x\n",
+ jeb->offset + (c->sector_size - jeb->free_size));
return ERR_PTR(-EINVAL);
}
#endif
@@ -457,8 +481,9 @@ struct jffs2_raw_node_ref *jffs2_add_physical_node_ref(struct jffs2_sb_info *c,
if (!jeb->free_size && !jeb->dirty_size && !ISDIRTY(jeb->wasted_size)) {
/* If it lives on the dirty_list, jffs2_reserve_space will put it there */
- D1(printk(KERN_DEBUG "Adding full erase block at 0x%08x to clean_list (free 0x%08x, dirty 0x%08x, used 0x%08x\n",
- jeb->offset, jeb->free_size, jeb->dirty_size, jeb->used_size));
+ jffs2_dbg(1, "Adding full erase block at 0x%08x to clean_list (free 0x%08x, dirty 0x%08x, used 0x%08x\n",
+ jeb->offset, jeb->free_size, jeb->dirty_size,
+ jeb->used_size);
if (jffs2_wbuf_dirty(c)) {
/* Flush the last write in the block if it's outstanding */
spin_unlock(&c->erase_completion_lock);
@@ -480,7 +505,7 @@ struct jffs2_raw_node_ref *jffs2_add_physical_node_ref(struct jffs2_sb_info *c,
void jffs2_complete_reservation(struct jffs2_sb_info *c)
{
- D1(printk(KERN_DEBUG "jffs2_complete_reservation()\n"));
+ jffs2_dbg(1, "jffs2_complete_reservation()\n");
spin_lock(&c->erase_completion_lock);
jffs2_garbage_collect_trigger(c);
spin_unlock(&c->erase_completion_lock);
@@ -493,7 +518,7 @@ static inline int on_list(struct list_head *obj, struct list_head *head)
list_for_each(this, head) {
if (this == obj) {
- D1(printk("%p is on list at %p\n", obj, head));
+ jffs2_dbg(1, "%p is on list at %p\n", obj, head);
return 1;
}
@@ -511,16 +536,18 @@ void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref
uint32_t freed_len;
if(unlikely(!ref)) {
- printk(KERN_NOTICE "EEEEEK. jffs2_mark_node_obsolete called with NULL node\n");
+ pr_notice("EEEEEK. jffs2_mark_node_obsolete called with NULL node\n");
return;
}
if (ref_obsolete(ref)) {
- D1(printk(KERN_DEBUG "jffs2_mark_node_obsolete called with already obsolete node at 0x%08x\n", ref_offset(ref)));
+ jffs2_dbg(1, "%s(): called with already obsolete node at 0x%08x\n",
+ __func__, ref_offset(ref));
return;
}
blocknr = ref->flash_offset / c->sector_size;
if (blocknr >= c->nr_blocks) {
- printk(KERN_NOTICE "raw node at 0x%08x is off the end of device!\n", ref->flash_offset);
+ pr_notice("raw node at 0x%08x is off the end of device!\n",
+ ref->flash_offset);
BUG();
}
jeb = &c->blocks[blocknr];
@@ -542,27 +569,31 @@ void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref
if (ref_flags(ref) == REF_UNCHECKED) {
D1(if (unlikely(jeb->unchecked_size < freed_len)) {
- printk(KERN_NOTICE "raw unchecked node of size 0x%08x freed from erase block %d at 0x%08x, but unchecked_size was already 0x%08x\n",
- freed_len, blocknr, ref->flash_offset, jeb->used_size);
+ pr_notice("raw unchecked node of size 0x%08x freed from erase block %d at 0x%08x, but unchecked_size was already 0x%08x\n",
+ freed_len, blocknr,
+ ref->flash_offset, jeb->used_size);
BUG();
})
- D1(printk(KERN_DEBUG "Obsoleting previously unchecked node at 0x%08x of len %x: ", ref_offset(ref), freed_len));
+ jffs2_dbg(1, "Obsoleting previously unchecked node at 0x%08x of len %x\n",
+ ref_offset(ref), freed_len);
jeb->unchecked_size -= freed_len;
c->unchecked_size -= freed_len;
} else {
D1(if (unlikely(jeb->used_size < freed_len)) {
- printk(KERN_NOTICE "raw node of size 0x%08x freed from erase block %d at 0x%08x, but used_size was already 0x%08x\n",
- freed_len, blocknr, ref->flash_offset, jeb->used_size);
+ pr_notice("raw node of size 0x%08x freed from erase block %d at 0x%08x, but used_size was already 0x%08x\n",
+ freed_len, blocknr,
+ ref->flash_offset, jeb->used_size);
BUG();
})
- D1(printk(KERN_DEBUG "Obsoleting node at 0x%08x of len %#x: ", ref_offset(ref), freed_len));
+ jffs2_dbg(1, "Obsoleting node at 0x%08x of len %#x: ",
+ ref_offset(ref), freed_len);
jeb->used_size -= freed_len;
c->used_size -= freed_len;
}
// Take care, that wasted size is taken into concern
if ((jeb->dirty_size || ISDIRTY(jeb->wasted_size + freed_len)) && jeb != c->nextblock) {
- D1(printk("Dirtying\n"));
+ jffs2_dbg(1, "Dirtying\n");
addedsize = freed_len;
jeb->dirty_size += freed_len;
c->dirty_size += freed_len;
@@ -570,12 +601,12 @@ void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref
/* Convert wasted space to dirty, if not a bad block */
if (jeb->wasted_size) {
if (on_list(&jeb->list, &c->bad_used_list)) {
- D1(printk(KERN_DEBUG "Leaving block at %08x on the bad_used_list\n",
- jeb->offset));
+ jffs2_dbg(1, "Leaving block at %08x on the bad_used_list\n",
+ jeb->offset);
addedsize = 0; /* To fool the refiling code later */
} else {
- D1(printk(KERN_DEBUG "Converting %d bytes of wasted space to dirty in block at %08x\n",
- jeb->wasted_size, jeb->offset));
+ jffs2_dbg(1, "Converting %d bytes of wasted space to dirty in block at %08x\n",
+ jeb->wasted_size, jeb->offset);
addedsize += jeb->wasted_size;
jeb->dirty_size += jeb->wasted_size;
c->dirty_size += jeb->wasted_size;
@@ -584,7 +615,7 @@ void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref
}
}
} else {
- D1(printk("Wasting\n"));
+ jffs2_dbg(1, "Wasting\n");
addedsize = 0;
jeb->wasted_size += freed_len;
c->wasted_size += freed_len;
@@ -606,50 +637,57 @@ void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref
}
if (jeb == c->nextblock) {
- D2(printk(KERN_DEBUG "Not moving nextblock 0x%08x to dirty/erase_pending list\n", jeb->offset));
+ jffs2_dbg(2, "Not moving nextblock 0x%08x to dirty/erase_pending list\n",
+ jeb->offset);
} else if (!jeb->used_size && !jeb->unchecked_size) {
if (jeb == c->gcblock) {
- D1(printk(KERN_DEBUG "gcblock at 0x%08x completely dirtied. Clearing gcblock...\n", jeb->offset));
+ jffs2_dbg(1, "gcblock at 0x%08x completely dirtied. Clearing gcblock...\n",
+ jeb->offset);
c->gcblock = NULL;
} else {
- D1(printk(KERN_DEBUG "Eraseblock at 0x%08x completely dirtied. Removing from (dirty?) list...\n", jeb->offset));
+ jffs2_dbg(1, "Eraseblock at 0x%08x completely dirtied. Removing from (dirty?) list...\n",
+ jeb->offset);
list_del(&jeb->list);
}
if (jffs2_wbuf_dirty(c)) {
- D1(printk(KERN_DEBUG "...and adding to erasable_pending_wbuf_list\n"));
+ jffs2_dbg(1, "...and adding to erasable_pending_wbuf_list\n");
list_add_tail(&jeb->list, &c->erasable_pending_wbuf_list);
} else {
if (jiffies & 127) {
/* Most of the time, we just erase it immediately. Otherwise we
spend ages scanning it on mount, etc. */
- D1(printk(KERN_DEBUG "...and adding to erase_pending_list\n"));
+ jffs2_dbg(1, "...and adding to erase_pending_list\n");
list_add_tail(&jeb->list, &c->erase_pending_list);
c->nr_erasing_blocks++;
jffs2_garbage_collect_trigger(c);
} else {
/* Sometimes, however, we leave it elsewhere so it doesn't get
immediately reused, and we spread the load a bit. */
- D1(printk(KERN_DEBUG "...and adding to erasable_list\n"));
+ jffs2_dbg(1, "...and adding to erasable_list\n");
list_add_tail(&jeb->list, &c->erasable_list);
}
}
- D1(printk(KERN_DEBUG "Done OK\n"));
+ jffs2_dbg(1, "Done OK\n");
} else if (jeb == c->gcblock) {
- D2(printk(KERN_DEBUG "Not moving gcblock 0x%08x to dirty_list\n", jeb->offset));
+ jffs2_dbg(2, "Not moving gcblock 0x%08x to dirty_list\n",
+ jeb->offset);
} else if (ISDIRTY(jeb->dirty_size) && !ISDIRTY(jeb->dirty_size - addedsize)) {
- D1(printk(KERN_DEBUG "Eraseblock at 0x%08x is freshly dirtied. Removing from clean list...\n", jeb->offset));
+ jffs2_dbg(1, "Eraseblock at 0x%08x is freshly dirtied. Removing from clean list...\n",
+ jeb->offset);
list_del(&jeb->list);
- D1(printk(KERN_DEBUG "...and adding to dirty_list\n"));
+ jffs2_dbg(1, "...and adding to dirty_list\n");
list_add_tail(&jeb->list, &c->dirty_list);
} else if (VERYDIRTY(c, jeb->dirty_size) &&
!VERYDIRTY(c, jeb->dirty_size - addedsize)) {
- D1(printk(KERN_DEBUG "Eraseblock at 0x%08x is now very dirty. Removing from dirty list...\n", jeb->offset));
+ jffs2_dbg(1, "Eraseblock at 0x%08x is now very dirty. Removing from dirty list...\n",
+ jeb->offset);
list_del(&jeb->list);
- D1(printk(KERN_DEBUG "...and adding to very_dirty_list\n"));
+ jffs2_dbg(1, "...and adding to very_dirty_list\n");
list_add_tail(&jeb->list, &c->very_dirty_list);
} else {
- D1(printk(KERN_DEBUG "Eraseblock at 0x%08x not moved anywhere. (free 0x%08x, dirty 0x%08x, used 0x%08x)\n",
- jeb->offset, jeb->free_size, jeb->dirty_size, jeb->used_size));
+ jffs2_dbg(1, "Eraseblock at 0x%08x not moved anywhere. (free 0x%08x, dirty 0x%08x, used 0x%08x)\n",
+ jeb->offset, jeb->free_size, jeb->dirty_size,
+ jeb->used_size);
}
spin_unlock(&c->erase_completion_lock);
@@ -665,33 +703,40 @@ void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref
the block hasn't _already_ been erased, and that 'ref' itself hasn't been freed yet
by jffs2_free_jeb_node_refs() in erase.c. Which is nice. */
- D1(printk(KERN_DEBUG "obliterating obsoleted node at 0x%08x\n", ref_offset(ref)));
+ jffs2_dbg(1, "obliterating obsoleted node at 0x%08x\n",
+ ref_offset(ref));
ret = jffs2_flash_read(c, ref_offset(ref), sizeof(n), &retlen, (char *)&n);
if (ret) {
- printk(KERN_WARNING "Read error reading from obsoleted node at 0x%08x: %d\n", ref_offset(ref), ret);
+ pr_warn("Read error reading from obsoleted node at 0x%08x: %d\n",
+ ref_offset(ref), ret);
goto out_erase_sem;
}
if (retlen != sizeof(n)) {
- printk(KERN_WARNING "Short read from obsoleted node at 0x%08x: %zd\n", ref_offset(ref), retlen);
+ pr_warn("Short read from obsoleted node at 0x%08x: %zd\n",
+ ref_offset(ref), retlen);
goto out_erase_sem;
}
if (PAD(je32_to_cpu(n.totlen)) != PAD(freed_len)) {
- printk(KERN_WARNING "Node totlen on flash (0x%08x) != totlen from node ref (0x%08x)\n", je32_to_cpu(n.totlen), freed_len);
+ pr_warn("Node totlen on flash (0x%08x) != totlen from node ref (0x%08x)\n",
+ je32_to_cpu(n.totlen), freed_len);
goto out_erase_sem;
}
if (!(je16_to_cpu(n.nodetype) & JFFS2_NODE_ACCURATE)) {
- D1(printk(KERN_DEBUG "Node at 0x%08x was already marked obsolete (nodetype 0x%04x)\n", ref_offset(ref), je16_to_cpu(n.nodetype)));
+ jffs2_dbg(1, "Node at 0x%08x was already marked obsolete (nodetype 0x%04x)\n",
+ ref_offset(ref), je16_to_cpu(n.nodetype));
goto out_erase_sem;
}
/* XXX FIXME: This is ugly now */
n.nodetype = cpu_to_je16(je16_to_cpu(n.nodetype) & ~JFFS2_NODE_ACCURATE);
ret = jffs2_flash_write(c, ref_offset(ref), sizeof(n), &retlen, (char *)&n);
if (ret) {
- printk(KERN_WARNING "Write error in obliterating obsoleted node at 0x%08x: %d\n", ref_offset(ref), ret);
+ pr_warn("Write error in obliterating obsoleted node at 0x%08x: %d\n",
+ ref_offset(ref), ret);
goto out_erase_sem;
}
if (retlen != sizeof(n)) {
- printk(KERN_WARNING "Short write in obliterating obsoleted node at 0x%08x: %zd\n", ref_offset(ref), retlen);
+ pr_warn("Short write in obliterating obsoleted node at 0x%08x: %zd\n",
+ ref_offset(ref), retlen);
goto out_erase_sem;
}
@@ -751,8 +796,8 @@ int jffs2_thread_should_wake(struct jffs2_sb_info *c)
return 1;
if (c->unchecked_size) {
- D1(printk(KERN_DEBUG "jffs2_thread_should_wake(): unchecked_size %d, checked_ino #%d\n",
- c->unchecked_size, c->checked_ino));
+ jffs2_dbg(1, "jffs2_thread_should_wake(): unchecked_size %d, checked_ino #%d\n",
+ c->unchecked_size, c->checked_ino);
return 1;
}
@@ -780,8 +825,9 @@ int jffs2_thread_should_wake(struct jffs2_sb_info *c)
}
}
- D1(printk(KERN_DEBUG "jffs2_thread_should_wake(): nr_free_blocks %d, nr_erasing_blocks %d, dirty_size 0x%x, vdirty_blocks %d: %s\n",
- c->nr_free_blocks, c->nr_erasing_blocks, c->dirty_size, nr_very_dirty, ret?"yes":"no"));
+ jffs2_dbg(1, "%s(): nr_free_blocks %d, nr_erasing_blocks %d, dirty_size 0x%x, vdirty_blocks %d: %s\n",
+ __func__, c->nr_free_blocks, c->nr_erasing_blocks,
+ c->dirty_size, nr_very_dirty, ret ? "yes" : "no");
return ret;
}
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index ab65ee3ec85..1cd3aec9d9a 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -76,7 +76,7 @@ static inline void jffs2_init_inode_info(struct jffs2_inode_info *f)
#define jffs2_write_nand_cleanmarker(c,jeb) (-EIO)
#define jffs2_flash_write(c, ofs, len, retlen, buf) jffs2_flash_direct_write(c, ofs, len, retlen, buf)
-#define jffs2_flash_read(c, ofs, len, retlen, buf) ((c)->mtd->read((c)->mtd, ofs, len, retlen, buf))
+#define jffs2_flash_read(c, ofs, len, retlen, buf) (mtd_read((c)->mtd, ofs, len, retlen, buf))
#define jffs2_flush_wbuf_pad(c) ({ do{} while(0); (void)(c), 0; })
#define jffs2_flush_wbuf_gc(c, i) ({ do{} while(0); (void)(c), (void) i, 0; })
#define jffs2_write_nand_badblock(c,jeb,bad_offset) (1)
@@ -108,8 +108,6 @@ static inline void jffs2_init_inode_info(struct jffs2_inode_info *f)
#define jffs2_cleanmarker_oob(c) (c->mtd->type == MTD_NANDFLASH)
-#define jffs2_flash_write_oob(c, ofs, len, retlen, buf) ((c)->mtd->write_oob((c)->mtd, ofs, len, retlen, buf))
-#define jffs2_flash_read_oob(c, ofs, len, retlen, buf) ((c)->mtd->read_oob((c)->mtd, ofs, len, retlen, buf))
#define jffs2_wbuf_dirty(c) (!!(c)->wbuf_len)
/* wbuf.c */
diff --git a/fs/jffs2/read.c b/fs/jffs2/read.c
index 3f39be1b045..0b042b1fc82 100644
--- a/fs/jffs2/read.c
+++ b/fs/jffs2/read.c
@@ -9,6 +9,8 @@
*
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/crc32.h>
@@ -36,24 +38,25 @@ int jffs2_read_dnode(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
ret = jffs2_flash_read(c, ref_offset(fd->raw), sizeof(*ri), &readlen, (char *)ri);
if (ret) {
jffs2_free_raw_inode(ri);
- printk(KERN_WARNING "Error reading node from 0x%08x: %d\n", ref_offset(fd->raw), ret);
+ pr_warn("Error reading node from 0x%08x: %d\n",
+ ref_offset(fd->raw), ret);
return ret;
}
if (readlen != sizeof(*ri)) {
jffs2_free_raw_inode(ri);
- printk(KERN_WARNING "Short read from 0x%08x: wanted 0x%zx bytes, got 0x%zx\n",
- ref_offset(fd->raw), sizeof(*ri), readlen);
+ pr_warn("Short read from 0x%08x: wanted 0x%zx bytes, got 0x%zx\n",
+ ref_offset(fd->raw), sizeof(*ri), readlen);
return -EIO;
}
crc = crc32(0, ri, sizeof(*ri)-8);
- D1(printk(KERN_DEBUG "Node read from %08x: node_crc %08x, calculated CRC %08x. dsize %x, csize %x, offset %x, buf %p\n",
+ jffs2_dbg(1, "Node read from %08x: node_crc %08x, calculated CRC %08x. dsize %x, csize %x, offset %x, buf %p\n",
ref_offset(fd->raw), je32_to_cpu(ri->node_crc),
crc, je32_to_cpu(ri->dsize), je32_to_cpu(ri->csize),
- je32_to_cpu(ri->offset), buf));
+ je32_to_cpu(ri->offset), buf);
if (crc != je32_to_cpu(ri->node_crc)) {
- printk(KERN_WARNING "Node CRC %08x != calculated CRC %08x for node at %08x\n",
- je32_to_cpu(ri->node_crc), crc, ref_offset(fd->raw));
+ pr_warn("Node CRC %08x != calculated CRC %08x for node at %08x\n",
+ je32_to_cpu(ri->node_crc), crc, ref_offset(fd->raw));
ret = -EIO;
goto out_ri;
}
@@ -66,8 +69,8 @@ int jffs2_read_dnode(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
}
D1(if(ofs + len > je32_to_cpu(ri->dsize)) {
- printk(KERN_WARNING "jffs2_read_dnode() asked for %d bytes at %d from %d-byte node\n",
- len, ofs, je32_to_cpu(ri->dsize));
+ pr_warn("jffs2_read_dnode() asked for %d bytes at %d from %d-byte node\n",
+ len, ofs, je32_to_cpu(ri->dsize));
ret = -EINVAL;
goto out_ri;
});
@@ -107,8 +110,8 @@ int jffs2_read_dnode(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
decomprbuf = readbuf;
}
- D2(printk(KERN_DEBUG "Read %d bytes to %p\n", je32_to_cpu(ri->csize),
- readbuf));
+ jffs2_dbg(2, "Read %d bytes to %p\n", je32_to_cpu(ri->csize),
+ readbuf);
ret = jffs2_flash_read(c, (ref_offset(fd->raw)) + sizeof(*ri),
je32_to_cpu(ri->csize), &readlen, readbuf);
@@ -119,18 +122,19 @@ int jffs2_read_dnode(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
crc = crc32(0, readbuf, je32_to_cpu(ri->csize));
if (crc != je32_to_cpu(ri->data_crc)) {
- printk(KERN_WARNING "Data CRC %08x != calculated CRC %08x for node at %08x\n",
- je32_to_cpu(ri->data_crc), crc, ref_offset(fd->raw));
+ pr_warn("Data CRC %08x != calculated CRC %08x for node at %08x\n",
+ je32_to_cpu(ri->data_crc), crc, ref_offset(fd->raw));
ret = -EIO;
goto out_decomprbuf;
}
- D2(printk(KERN_DEBUG "Data CRC matches calculated CRC %08x\n", crc));
+ jffs2_dbg(2, "Data CRC matches calculated CRC %08x\n", crc);
if (ri->compr != JFFS2_COMPR_NONE) {
- D2(printk(KERN_DEBUG "Decompress %d bytes from %p to %d bytes at %p\n",
- je32_to_cpu(ri->csize), readbuf, je32_to_cpu(ri->dsize), decomprbuf));
+ jffs2_dbg(2, "Decompress %d bytes from %p to %d bytes at %p\n",
+ je32_to_cpu(ri->csize), readbuf,
+ je32_to_cpu(ri->dsize), decomprbuf);
ret = jffs2_decompress(c, f, ri->compr | (ri->usercompr << 8), readbuf, decomprbuf, je32_to_cpu(ri->csize), je32_to_cpu(ri->dsize));
if (ret) {
- printk(KERN_WARNING "Error: jffs2_decompress returned %d\n", ret);
+ pr_warn("Error: jffs2_decompress returned %d\n", ret);
goto out_decomprbuf;
}
}
@@ -157,8 +161,8 @@ int jffs2_read_inode_range(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
struct jffs2_node_frag *frag;
int ret;
- D1(printk(KERN_DEBUG "jffs2_read_inode_range: ino #%u, range 0x%08x-0x%08x\n",
- f->inocache->ino, offset, offset+len));
+ jffs2_dbg(1, "%s(): ino #%u, range 0x%08x-0x%08x\n",
+ __func__, f->inocache->ino, offset, offset + len);
frag = jffs2_lookup_node_frag(&f->fragtree, offset);
@@ -168,22 +172,27 @@ int jffs2_read_inode_range(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
* (or perhaps is before it, if we've been asked to read off the
* end of the file). */
while(offset < end) {
- D2(printk(KERN_DEBUG "jffs2_read_inode_range: offset %d, end %d\n", offset, end));
+ jffs2_dbg(2, "%s(): offset %d, end %d\n",
+ __func__, offset, end);
if (unlikely(!frag || frag->ofs > offset ||
frag->ofs + frag->size <= offset)) {
uint32_t holesize = end - offset;
if (frag && frag->ofs > offset) {
- D1(printk(KERN_NOTICE "Eep. Hole in ino #%u fraglist. frag->ofs = 0x%08x, offset = 0x%08x\n", f->inocache->ino, frag->ofs, offset));
+ jffs2_dbg(1, "Eep. Hole in ino #%u fraglist. frag->ofs = 0x%08x, offset = 0x%08x\n",
+ f->inocache->ino, frag->ofs, offset);
holesize = min(holesize, frag->ofs - offset);
}
- D1(printk(KERN_DEBUG "Filling non-frag hole from %d-%d\n", offset, offset+holesize));
+ jffs2_dbg(1, "Filling non-frag hole from %d-%d\n",
+ offset, offset + holesize);
memset(buf, 0, holesize);
buf += holesize;
offset += holesize;
continue;
} else if (unlikely(!frag->node)) {
uint32_t holeend = min(end, frag->ofs + frag->size);
- D1(printk(KERN_DEBUG "Filling frag hole from %d-%d (frag 0x%x 0x%x)\n", offset, holeend, frag->ofs, frag->ofs + frag->size));
+ jffs2_dbg(1, "Filling frag hole from %d-%d (frag 0x%x 0x%x)\n",
+ offset, holeend, frag->ofs,
+ frag->ofs + frag->size);
memset(buf, 0, holeend - offset);
buf += holeend - offset;
offset = holeend;
@@ -195,20 +204,23 @@ int jffs2_read_inode_range(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
fragofs = offset - frag->ofs;
readlen = min(frag->size - fragofs, end - offset);
- D1(printk(KERN_DEBUG "Reading %d-%d from node at 0x%08x (%d)\n",
- frag->ofs+fragofs, frag->ofs+fragofs+readlen,
- ref_offset(frag->node->raw), ref_flags(frag->node->raw)));
+ jffs2_dbg(1, "Reading %d-%d from node at 0x%08x (%d)\n",
+ frag->ofs+fragofs,
+ frag->ofs + fragofs+readlen,
+ ref_offset(frag->node->raw),
+ ref_flags(frag->node->raw));
ret = jffs2_read_dnode(c, f, frag->node, buf, fragofs + frag->ofs - frag->node->ofs, readlen);
- D2(printk(KERN_DEBUG "node read done\n"));
+ jffs2_dbg(2, "node read done\n");
if (ret) {
- D1(printk(KERN_DEBUG"jffs2_read_inode_range error %d\n",ret));
+ jffs2_dbg(1, "%s(): error %d\n",
+ __func__, ret);
memset(buf, 0, readlen);
return ret;
}
buf += readlen;
offset += readlen;
frag = frag_next(frag);
- D2(printk(KERN_DEBUG "node read was OK. Looping\n"));
+ jffs2_dbg(2, "node read was OK. Looping\n");
}
}
return 0;
diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c
index 3093ac4fb24..dc0437e8476 100644
--- a/fs/jffs2/readinode.c
+++ b/fs/jffs2/readinode.c
@@ -9,6 +9,8 @@
*
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/slab.h>
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index f99464833bb..7654e87b042 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -9,6 +9,8 @@
*
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/slab.h>
@@ -22,15 +24,15 @@
#define DEFAULT_EMPTY_SCAN_SIZE 256
-#define noisy_printk(noise, args...) do { \
- if (*(noise)) { \
- printk(KERN_NOTICE args); \
- (*(noise))--; \
- if (!(*(noise))) { \
- printk(KERN_NOTICE "Further such events for this erase block will not be printed\n"); \
- } \
- } \
-} while(0)
+#define noisy_printk(noise, fmt, ...) \
+do { \
+ if (*(noise)) { \
+ pr_notice(fmt, ##__VA_ARGS__); \
+ (*(noise))--; \
+ if (!(*(noise))) \
+ pr_notice("Further such events for this erase block will not be printed\n"); \
+ } \
+} while (0)
static uint32_t pseudo_random;
@@ -96,18 +98,17 @@ int jffs2_scan_medium(struct jffs2_sb_info *c)
#ifndef __ECOS
size_t pointlen, try_size;
- if (c->mtd->point) {
- ret = mtd_point(c->mtd, 0, c->mtd->size, &pointlen,
- (void **)&flashbuf, NULL);
- if (!ret && pointlen < c->mtd->size) {
- /* Don't muck about if it won't let us point to the whole flash */
- D1(printk(KERN_DEBUG "MTD point returned len too short: 0x%zx\n", pointlen));
- mtd_unpoint(c->mtd, 0, pointlen);
- flashbuf = NULL;
- }
- if (ret && ret != -EOPNOTSUPP)
- D1(printk(KERN_DEBUG "MTD point failed %d\n", ret));
+ ret = mtd_point(c->mtd, 0, c->mtd->size, &pointlen,
+ (void **)&flashbuf, NULL);
+ if (!ret && pointlen < c->mtd->size) {
+ /* Don't muck about if it won't let us point to the whole flash */
+ jffs2_dbg(1, "MTD point returned len too short: 0x%zx\n",
+ pointlen);
+ mtd_unpoint(c->mtd, 0, pointlen);
+ flashbuf = NULL;
}
+ if (ret && ret != -EOPNOTSUPP)
+ jffs2_dbg(1, "MTD point failed %d\n", ret);
#endif
if (!flashbuf) {
/* For NAND it's quicker to read a whole eraseblock at a time,
@@ -117,15 +118,15 @@ int jffs2_scan_medium(struct jffs2_sb_info *c)
else
try_size = PAGE_SIZE;
- D1(printk(KERN_DEBUG "Trying to allocate readbuf of %zu "
- "bytes\n", try_size));
+ jffs2_dbg(1, "Trying to allocate readbuf of %zu "
+ "bytes\n", try_size);
flashbuf = mtd_kmalloc_up_to(c->mtd, &try_size);
if (!flashbuf)
return -ENOMEM;
- D1(printk(KERN_DEBUG "Allocated readbuf of %zu bytes\n",
- try_size));
+ jffs2_dbg(1, "Allocated readbuf of %zu bytes\n",
+ try_size);
buf_size = (uint32_t)try_size;
}
@@ -178,7 +179,8 @@ int jffs2_scan_medium(struct jffs2_sb_info *c)
c->nr_free_blocks++;
} else {
/* Dirt */
- D1(printk(KERN_DEBUG "Adding all-dirty block at 0x%08x to erase_pending_list\n", jeb->offset));
+ jffs2_dbg(1, "Adding all-dirty block at 0x%08x to erase_pending_list\n",
+ jeb->offset);
list_add(&jeb->list, &c->erase_pending_list);
c->nr_erasing_blocks++;
}
@@ -205,7 +207,8 @@ int jffs2_scan_medium(struct jffs2_sb_info *c)
}
/* update collected summary information for the current nextblock */
jffs2_sum_move_collected(c, s);
- D1(printk(KERN_DEBUG "jffs2_scan_medium(): new nextblock = 0x%08x\n", jeb->offset));
+ jffs2_dbg(1, "%s(): new nextblock = 0x%08x\n",
+ __func__, jeb->offset);
c->nextblock = jeb;
} else {
ret = file_dirty(c, jeb);
@@ -217,20 +220,21 @@ int jffs2_scan_medium(struct jffs2_sb_info *c)
case BLK_STATE_ALLDIRTY:
/* Nothing valid - not even a clean marker. Needs erasing. */
/* For now we just put it on the erasing list. We'll start the erases later */
- D1(printk(KERN_NOTICE "JFFS2: Erase block at 0x%08x is not formatted. It will be erased\n", jeb->offset));
+ jffs2_dbg(1, "Erase block at 0x%08x is not formatted. It will be erased\n",
+ jeb->offset);
list_add(&jeb->list, &c->erase_pending_list);
c->nr_erasing_blocks++;
break;
case BLK_STATE_BADBLOCK:
- D1(printk(KERN_NOTICE "JFFS2: Block at 0x%08x is bad\n", jeb->offset));
+ jffs2_dbg(1, "Block at 0x%08x is bad\n", jeb->offset);
list_add(&jeb->list, &c->bad_list);
c->bad_size += c->sector_size;
c->free_size -= c->sector_size;
bad_blocks++;
break;
default:
- printk(KERN_WARNING "jffs2_scan_medium(): unknown block state\n");
+ pr_warn("%s(): unknown block state\n", __func__);
BUG();
}
}
@@ -250,16 +254,17 @@ int jffs2_scan_medium(struct jffs2_sb_info *c)
uint32_t skip = c->nextblock->free_size % c->wbuf_pagesize;
- D1(printk(KERN_DEBUG "jffs2_scan_medium(): Skipping %d bytes in nextblock to ensure page alignment\n",
- skip));
+ jffs2_dbg(1, "%s(): Skipping %d bytes in nextblock to ensure page alignment\n",
+ __func__, skip);
jffs2_prealloc_raw_node_refs(c, c->nextblock, 1);
jffs2_scan_dirty_space(c, c->nextblock, skip);
}
#endif
if (c->nr_erasing_blocks) {
if ( !c->used_size && ((c->nr_free_blocks+empty_blocks+bad_blocks)!= c->nr_blocks || bad_blocks == c->nr_blocks) ) {
- printk(KERN_NOTICE "Cowardly refusing to erase blocks on filesystem with no valid JFFS2 nodes\n");
- printk(KERN_NOTICE "empty_blocks %d, bad_blocks %d, c->nr_blocks %d\n",empty_blocks,bad_blocks,c->nr_blocks);
+ pr_notice("Cowardly refusing to erase blocks on filesystem with no valid JFFS2 nodes\n");
+ pr_notice("empty_blocks %d, bad_blocks %d, c->nr_blocks %d\n",
+ empty_blocks, bad_blocks, c->nr_blocks);
ret = -EIO;
goto out;
}
@@ -287,11 +292,13 @@ static int jffs2_fill_scan_buf(struct jffs2_sb_info *c, void *buf,
ret = jffs2_flash_read(c, ofs, len, &retlen, buf);
if (ret) {
- D1(printk(KERN_WARNING "mtd->read(0x%x bytes from 0x%x) returned %d\n", len, ofs, ret));
+ jffs2_dbg(1, "mtd->read(0x%x bytes from 0x%x) returned %d\n",
+ len, ofs, ret);
return ret;
}
if (retlen < len) {
- D1(printk(KERN_WARNING "Read at 0x%x gave only 0x%zx bytes\n", ofs, retlen));
+ jffs2_dbg(1, "Read at 0x%x gave only 0x%zx bytes\n",
+ ofs, retlen);
return -EIO;
}
return 0;
@@ -368,7 +375,7 @@ static int jffs2_scan_xattr_node(struct jffs2_sb_info *c, struct jffs2_erasebloc
if (jffs2_sum_active())
jffs2_sum_add_xattr_mem(s, rx, ofs - jeb->offset);
- dbg_xattr("scaning xdatum at %#08x (xid=%u, version=%u)\n",
+ dbg_xattr("scanning xdatum at %#08x (xid=%u, version=%u)\n",
ofs, xd->xid, xd->version);
return 0;
}
@@ -449,7 +456,7 @@ static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblo
ofs = jeb->offset;
prevofs = jeb->offset - 1;
- D1(printk(KERN_DEBUG "jffs2_scan_eraseblock(): Scanning block at 0x%x\n", ofs));
+ jffs2_dbg(1, "%s(): Scanning block at 0x%x\n", __func__, ofs);
#ifdef CONFIG_JFFS2_FS_WRITEBUFFER
if (jffs2_cleanmarker_oob(c)) {
@@ -459,7 +466,7 @@ static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblo
return BLK_STATE_BADBLOCK;
ret = jffs2_check_nand_cleanmarker(c, jeb);
- D2(printk(KERN_NOTICE "jffs_check_nand_cleanmarker returned %d\n",ret));
+ jffs2_dbg(2, "jffs_check_nand_cleanmarker returned %d\n", ret);
/* Even if it's not found, we still scan to see
if the block is empty. We use this information
@@ -561,7 +568,8 @@ static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblo
if (jffs2_cleanmarker_oob(c)) {
/* scan oob, take care of cleanmarker */
int ret = jffs2_check_oob_empty(c, jeb, cleanmarkerfound);
- D2(printk(KERN_NOTICE "jffs2_check_oob_empty returned %d\n",ret));
+ jffs2_dbg(2, "jffs2_check_oob_empty returned %d\n",
+ ret);
switch (ret) {
case 0: return cleanmarkerfound ? BLK_STATE_CLEANMARKER : BLK_STATE_ALLFF;
case 1: return BLK_STATE_ALLDIRTY;
@@ -569,15 +577,16 @@ static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblo
}
}
#endif
- D1(printk(KERN_DEBUG "Block at 0x%08x is empty (erased)\n", jeb->offset));
+ jffs2_dbg(1, "Block at 0x%08x is empty (erased)\n",
+ jeb->offset);
if (c->cleanmarker_size == 0)
return BLK_STATE_CLEANMARKER; /* don't bother with re-erase */
else
return BLK_STATE_ALLFF; /* OK to erase if all blocks are like this */
}
if (ofs) {
- D1(printk(KERN_DEBUG "Free space at %08x ends at %08x\n", jeb->offset,
- jeb->offset + ofs));
+ jffs2_dbg(1, "Free space at %08x ends at %08x\n", jeb->offset,
+ jeb->offset + ofs);
if ((err = jffs2_prealloc_raw_node_refs(c, jeb, 1)))
return err;
if ((err = jffs2_scan_dirty_space(c, jeb, ofs)))
@@ -604,12 +613,13 @@ scan_more:
cond_resched();
if (ofs & 3) {
- printk(KERN_WARNING "Eep. ofs 0x%08x not word-aligned!\n", ofs);
+ pr_warn("Eep. ofs 0x%08x not word-aligned!\n", ofs);
ofs = PAD(ofs);
continue;
}
if (ofs == prevofs) {
- printk(KERN_WARNING "ofs 0x%08x has already been seen. Skipping\n", ofs);
+ pr_warn("ofs 0x%08x has already been seen. Skipping\n",
+ ofs);
if ((err = jffs2_scan_dirty_space(c, jeb, 4)))
return err;
ofs += 4;
@@ -618,8 +628,10 @@ scan_more:
prevofs = ofs;
if (jeb->offset + c->sector_size < ofs + sizeof(*node)) {
- D1(printk(KERN_DEBUG "Fewer than %zd bytes left to end of block. (%x+%x<%x+%zx) Not reading\n", sizeof(struct jffs2_unknown_node),
- jeb->offset, c->sector_size, ofs, sizeof(*node)));
+ jffs2_dbg(1, "Fewer than %zd bytes left to end of block. (%x+%x<%x+%zx) Not reading\n",
+ sizeof(struct jffs2_unknown_node),
+ jeb->offset, c->sector_size, ofs,
+ sizeof(*node));
if ((err = jffs2_scan_dirty_space(c, jeb, (jeb->offset + c->sector_size)-ofs)))
return err;
break;
@@ -627,8 +639,9 @@ scan_more:
if (buf_ofs + buf_len < ofs + sizeof(*node)) {
buf_len = min_t(uint32_t, buf_size, jeb->offset + c->sector_size - ofs);
- D1(printk(KERN_DEBUG "Fewer than %zd bytes (node header) left to end of buf. Reading 0x%x at 0x%08x\n",
- sizeof(struct jffs2_unknown_node), buf_len, ofs));
+ jffs2_dbg(1, "Fewer than %zd bytes (node header) left to end of buf. Reading 0x%x at 0x%08x\n",
+ sizeof(struct jffs2_unknown_node),
+ buf_len, ofs);
err = jffs2_fill_scan_buf(c, buf, ofs, buf_len);
if (err)
return err;
@@ -645,13 +658,13 @@ scan_more:
ofs += 4;
scan_end = min_t(uint32_t, EMPTY_SCAN_SIZE(c->sector_size)/8, buf_len);
- D1(printk(KERN_DEBUG "Found empty flash at 0x%08x\n", ofs));
+ jffs2_dbg(1, "Found empty flash at 0x%08x\n", ofs);
more_empty:
inbuf_ofs = ofs - buf_ofs;
while (inbuf_ofs < scan_end) {
if (unlikely(*(uint32_t *)(&buf[inbuf_ofs]) != 0xffffffff)) {
- printk(KERN_WARNING "Empty flash at 0x%08x ends at 0x%08x\n",
- empty_start, ofs);
+ pr_warn("Empty flash at 0x%08x ends at 0x%08x\n",
+ empty_start, ofs);
if ((err = jffs2_scan_dirty_space(c, jeb, ofs-empty_start)))
return err;
goto scan_more;
@@ -661,13 +674,15 @@ scan_more:
ofs += 4;
}
/* Ran off end. */
- D1(printk(KERN_DEBUG "Empty flash to end of buffer at 0x%08x\n", ofs));
+ jffs2_dbg(1, "Empty flash to end of buffer at 0x%08x\n",
+ ofs);
/* If we're only checking the beginning of a block with a cleanmarker,
bail now */
if (buf_ofs == jeb->offset && jeb->used_size == PAD(c->cleanmarker_size) &&
c->cleanmarker_size && !jeb->dirty_size && !ref_next(jeb->first_node)) {
- D1(printk(KERN_DEBUG "%d bytes at start of block seems clean... assuming all clean\n", EMPTY_SCAN_SIZE(c->sector_size)));
+ jffs2_dbg(1, "%d bytes at start of block seems clean... assuming all clean\n",
+ EMPTY_SCAN_SIZE(c->sector_size));
return BLK_STATE_CLEANMARKER;
}
if (!buf_size && (scan_end != buf_len)) {/* XIP/point case */
@@ -680,13 +695,14 @@ scan_more:
if (!buf_len) {
/* No more to read. Break out of main loop without marking
this range of empty space as dirty (because it's not) */
- D1(printk(KERN_DEBUG "Empty flash at %08x runs to end of block. Treating as free_space\n",
- empty_start));
+ jffs2_dbg(1, "Empty flash at %08x runs to end of block. Treating as free_space\n",
+ empty_start);
break;
}
/* point never reaches here */
scan_end = buf_len;
- D1(printk(KERN_DEBUG "Reading another 0x%x at 0x%08x\n", buf_len, ofs));
+ jffs2_dbg(1, "Reading another 0x%x at 0x%08x\n",
+ buf_len, ofs);
err = jffs2_fill_scan_buf(c, buf, ofs, buf_len);
if (err)
return err;
@@ -695,22 +711,23 @@ scan_more:
}
if (ofs == jeb->offset && je16_to_cpu(node->magic) == KSAMTIB_CIGAM_2SFFJ) {
- printk(KERN_WARNING "Magic bitmask is backwards at offset 0x%08x. Wrong endian filesystem?\n", ofs);
+ pr_warn("Magic bitmask is backwards at offset 0x%08x. Wrong endian filesystem?\n",
+ ofs);
if ((err = jffs2_scan_dirty_space(c, jeb, 4)))
return err;
ofs += 4;
continue;
}
if (je16_to_cpu(node->magic) == JFFS2_DIRTY_BITMASK) {
- D1(printk(KERN_DEBUG "Dirty bitmask at 0x%08x\n", ofs));
+ jffs2_dbg(1, "Dirty bitmask at 0x%08x\n", ofs);
if ((err = jffs2_scan_dirty_space(c, jeb, 4)))
return err;
ofs += 4;
continue;
}
if (je16_to_cpu(node->magic) == JFFS2_OLD_MAGIC_BITMASK) {
- printk(KERN_WARNING "Old JFFS2 bitmask found at 0x%08x\n", ofs);
- printk(KERN_WARNING "You cannot use older JFFS2 filesystems with newer kernels\n");
+ pr_warn("Old JFFS2 bitmask found at 0x%08x\n", ofs);
+ pr_warn("You cannot use older JFFS2 filesystems with newer kernels\n");
if ((err = jffs2_scan_dirty_space(c, jeb, 4)))
return err;
ofs += 4;
@@ -718,7 +735,8 @@ scan_more:
}
if (je16_to_cpu(node->magic) != JFFS2_MAGIC_BITMASK) {
/* OK. We're out of possibilities. Whinge and move on */
- noisy_printk(&noise, "jffs2_scan_eraseblock(): Magic bitmask 0x%04x not found at 0x%08x: 0x%04x instead\n",
+ noisy_printk(&noise, "%s(): Magic bitmask 0x%04x not found at 0x%08x: 0x%04x instead\n",
+ __func__,
JFFS2_MAGIC_BITMASK, ofs,
je16_to_cpu(node->magic));
if ((err = jffs2_scan_dirty_space(c, jeb, 4)))
@@ -733,7 +751,8 @@ scan_more:
hdr_crc = crc32(0, &crcnode, sizeof(crcnode)-4);
if (hdr_crc != je32_to_cpu(node->hdr_crc)) {
- noisy_printk(&noise, "jffs2_scan_eraseblock(): Node at 0x%08x {0x%04x, 0x%04x, 0x%08x) has invalid CRC 0x%08x (calculated 0x%08x)\n",
+ noisy_printk(&noise, "%s(): Node at 0x%08x {0x%04x, 0x%04x, 0x%08x) has invalid CRC 0x%08x (calculated 0x%08x)\n",
+ __func__,
ofs, je16_to_cpu(node->magic),
je16_to_cpu(node->nodetype),
je32_to_cpu(node->totlen),
@@ -747,9 +766,9 @@ scan_more:
if (ofs + je32_to_cpu(node->totlen) > jeb->offset + c->sector_size) {
/* Eep. Node goes over the end of the erase block. */
- printk(KERN_WARNING "Node at 0x%08x with length 0x%08x would run over the end of the erase block\n",
- ofs, je32_to_cpu(node->totlen));
- printk(KERN_WARNING "Perhaps the file system was created with the wrong erase size?\n");
+ pr_warn("Node at 0x%08x with length 0x%08x would run over the end of the erase block\n",
+ ofs, je32_to_cpu(node->totlen));
+ pr_warn("Perhaps the file system was created with the wrong erase size?\n");
if ((err = jffs2_scan_dirty_space(c, jeb, 4)))
return err;
ofs += 4;
@@ -758,7 +777,8 @@ scan_more:
if (!(je16_to_cpu(node->nodetype) & JFFS2_NODE_ACCURATE)) {
/* Wheee. This is an obsoleted node */
- D2(printk(KERN_DEBUG "Node at 0x%08x is obsolete. Skipping\n", ofs));
+ jffs2_dbg(2, "Node at 0x%08x is obsolete. Skipping\n",
+ ofs);
if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(node->totlen)))))
return err;
ofs += PAD(je32_to_cpu(node->totlen));
@@ -769,8 +789,9 @@ scan_more:
case JFFS2_NODETYPE_INODE:
if (buf_ofs + buf_len < ofs + sizeof(struct jffs2_raw_inode)) {
buf_len = min_t(uint32_t, buf_size, jeb->offset + c->sector_size - ofs);
- D1(printk(KERN_DEBUG "Fewer than %zd bytes (inode node) left to end of buf. Reading 0x%x at 0x%08x\n",
- sizeof(struct jffs2_raw_inode), buf_len, ofs));
+ jffs2_dbg(1, "Fewer than %zd bytes (inode node) left to end of buf. Reading 0x%x at 0x%08x\n",
+ sizeof(struct jffs2_raw_inode),
+ buf_len, ofs);
err = jffs2_fill_scan_buf(c, buf, ofs, buf_len);
if (err)
return err;
@@ -785,8 +806,9 @@ scan_more:
case JFFS2_NODETYPE_DIRENT:
if (buf_ofs + buf_len < ofs + je32_to_cpu(node->totlen)) {
buf_len = min_t(uint32_t, buf_size, jeb->offset + c->sector_size - ofs);
- D1(printk(KERN_DEBUG "Fewer than %d bytes (dirent node) left to end of buf. Reading 0x%x at 0x%08x\n",
- je32_to_cpu(node->totlen), buf_len, ofs));
+ jffs2_dbg(1, "Fewer than %d bytes (dirent node) left to end of buf. Reading 0x%x at 0x%08x\n",
+ je32_to_cpu(node->totlen), buf_len,
+ ofs);
err = jffs2_fill_scan_buf(c, buf, ofs, buf_len);
if (err)
return err;
@@ -802,9 +824,9 @@ scan_more:
case JFFS2_NODETYPE_XATTR:
if (buf_ofs + buf_len < ofs + je32_to_cpu(node->totlen)) {
buf_len = min_t(uint32_t, buf_size, jeb->offset + c->sector_size - ofs);
- D1(printk(KERN_DEBUG "Fewer than %d bytes (xattr node)"
- " left to end of buf. Reading 0x%x at 0x%08x\n",
- je32_to_cpu(node->totlen), buf_len, ofs));
+ jffs2_dbg(1, "Fewer than %d bytes (xattr node) left to end of buf. Reading 0x%x at 0x%08x\n",
+ je32_to_cpu(node->totlen), buf_len,
+ ofs);
err = jffs2_fill_scan_buf(c, buf, ofs, buf_len);
if (err)
return err;
@@ -819,9 +841,9 @@ scan_more:
case JFFS2_NODETYPE_XREF:
if (buf_ofs + buf_len < ofs + je32_to_cpu(node->totlen)) {
buf_len = min_t(uint32_t, buf_size, jeb->offset + c->sector_size - ofs);
- D1(printk(KERN_DEBUG "Fewer than %d bytes (xref node)"
- " left to end of buf. Reading 0x%x at 0x%08x\n",
- je32_to_cpu(node->totlen), buf_len, ofs));
+ jffs2_dbg(1, "Fewer than %d bytes (xref node) left to end of buf. Reading 0x%x at 0x%08x\n",
+ je32_to_cpu(node->totlen), buf_len,
+ ofs);
err = jffs2_fill_scan_buf(c, buf, ofs, buf_len);
if (err)
return err;
@@ -836,15 +858,17 @@ scan_more:
#endif /* CONFIG_JFFS2_FS_XATTR */
case JFFS2_NODETYPE_CLEANMARKER:
- D1(printk(KERN_DEBUG "CLEANMARKER node found at 0x%08x\n", ofs));
+ jffs2_dbg(1, "CLEANMARKER node found at 0x%08x\n", ofs);
if (je32_to_cpu(node->totlen) != c->cleanmarker_size) {
- printk(KERN_NOTICE "CLEANMARKER node found at 0x%08x has totlen 0x%x != normal 0x%x\n",
- ofs, je32_to_cpu(node->totlen), c->cleanmarker_size);
+ pr_notice("CLEANMARKER node found at 0x%08x has totlen 0x%x != normal 0x%x\n",
+ ofs, je32_to_cpu(node->totlen),
+ c->cleanmarker_size);
if ((err = jffs2_scan_dirty_space(c, jeb, PAD(sizeof(struct jffs2_unknown_node)))))
return err;
ofs += PAD(sizeof(struct jffs2_unknown_node));
} else if (jeb->first_node) {
- printk(KERN_NOTICE "CLEANMARKER node found at 0x%08x, not first node in block (0x%08x)\n", ofs, jeb->offset);
+ pr_notice("CLEANMARKER node found at 0x%08x, not first node in block (0x%08x)\n",
+ ofs, jeb->offset);
if ((err = jffs2_scan_dirty_space(c, jeb, PAD(sizeof(struct jffs2_unknown_node)))))
return err;
ofs += PAD(sizeof(struct jffs2_unknown_node));
@@ -866,7 +890,8 @@ scan_more:
default:
switch (je16_to_cpu(node->nodetype) & JFFS2_COMPAT_MASK) {
case JFFS2_FEATURE_ROCOMPAT:
- printk(KERN_NOTICE "Read-only compatible feature node (0x%04x) found at offset 0x%08x\n", je16_to_cpu(node->nodetype), ofs);
+ pr_notice("Read-only compatible feature node (0x%04x) found at offset 0x%08x\n",
+ je16_to_cpu(node->nodetype), ofs);
c->flags |= JFFS2_SB_FLAG_RO;
if (!(jffs2_is_readonly(c)))
return -EROFS;
@@ -876,18 +901,21 @@ scan_more:
break;
case JFFS2_FEATURE_INCOMPAT:
- printk(KERN_NOTICE "Incompatible feature node (0x%04x) found at offset 0x%08x\n", je16_to_cpu(node->nodetype), ofs);
+ pr_notice("Incompatible feature node (0x%04x) found at offset 0x%08x\n",
+ je16_to_cpu(node->nodetype), ofs);
return -EINVAL;
case JFFS2_FEATURE_RWCOMPAT_DELETE:
- D1(printk(KERN_NOTICE "Unknown but compatible feature node (0x%04x) found at offset 0x%08x\n", je16_to_cpu(node->nodetype), ofs));
+ jffs2_dbg(1, "Unknown but compatible feature node (0x%04x) found at offset 0x%08x\n",
+ je16_to_cpu(node->nodetype), ofs);
if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(node->totlen)))))
return err;
ofs += PAD(je32_to_cpu(node->totlen));
break;
case JFFS2_FEATURE_RWCOMPAT_COPY: {
- D1(printk(KERN_NOTICE "Unknown but compatible feature node (0x%04x) found at offset 0x%08x\n", je16_to_cpu(node->nodetype), ofs));
+ jffs2_dbg(1, "Unknown but compatible feature node (0x%04x) found at offset 0x%08x\n",
+ je16_to_cpu(node->nodetype), ofs);
jffs2_link_node_ref(c, jeb, ofs | REF_PRISTINE, PAD(je32_to_cpu(node->totlen)), NULL);
@@ -908,8 +936,9 @@ scan_more:
}
}
- D1(printk(KERN_DEBUG "Block at 0x%08x: free 0x%08x, dirty 0x%08x, unchecked 0x%08x, used 0x%08x, wasted 0x%08x\n",
- jeb->offset,jeb->free_size, jeb->dirty_size, jeb->unchecked_size, jeb->used_size, jeb->wasted_size));
+ jffs2_dbg(1, "Block at 0x%08x: free 0x%08x, dirty 0x%08x, unchecked 0x%08x, used 0x%08x, wasted 0x%08x\n",
+ jeb->offset, jeb->free_size, jeb->dirty_size,
+ jeb->unchecked_size, jeb->used_size, jeb->wasted_size);
/* mark_node_obsolete can add to wasted !! */
if (jeb->wasted_size) {
@@ -935,7 +964,7 @@ struct jffs2_inode_cache *jffs2_scan_make_ino_cache(struct jffs2_sb_info *c, uin
ic = jffs2_alloc_inode_cache();
if (!ic) {
- printk(KERN_NOTICE "jffs2_scan_make_inode_cache(): allocation of inode cache failed\n");
+ pr_notice("%s(): allocation of inode cache failed\n", __func__);
return NULL;
}
memset(ic, 0, sizeof(*ic));
@@ -954,7 +983,7 @@ static int jffs2_scan_inode_node(struct jffs2_sb_info *c, struct jffs2_erasebloc
struct jffs2_inode_cache *ic;
uint32_t crc, ino = je32_to_cpu(ri->ino);
- D1(printk(KERN_DEBUG "jffs2_scan_inode_node(): Node at 0x%08x\n", ofs));
+ jffs2_dbg(1, "%s(): Node at 0x%08x\n", __func__, ofs);
/* We do very little here now. Just check the ino# to which we should attribute
this node; we can do all the CRC checking etc. later. There's a tradeoff here --
@@ -968,9 +997,8 @@ static int jffs2_scan_inode_node(struct jffs2_sb_info *c, struct jffs2_erasebloc
/* Check the node CRC in any case. */
crc = crc32(0, ri, sizeof(*ri)-8);
if (crc != je32_to_cpu(ri->node_crc)) {
- printk(KERN_NOTICE "jffs2_scan_inode_node(): CRC failed on "
- "node at 0x%08x: Read 0x%08x, calculated 0x%08x\n",
- ofs, je32_to_cpu(ri->node_crc), crc);
+ pr_notice("%s(): CRC failed on node at 0x%08x: Read 0x%08x, calculated 0x%08x\n",
+ __func__, ofs, je32_to_cpu(ri->node_crc), crc);
/*
* We believe totlen because the CRC on the node
* _header_ was OK, just the node itself failed.
@@ -989,10 +1017,10 @@ static int jffs2_scan_inode_node(struct jffs2_sb_info *c, struct jffs2_erasebloc
/* Wheee. It worked */
jffs2_link_node_ref(c, jeb, ofs | REF_UNCHECKED, PAD(je32_to_cpu(ri->totlen)), ic);
- D1(printk(KERN_DEBUG "Node is ino #%u, version %d. Range 0x%x-0x%x\n",
+ jffs2_dbg(1, "Node is ino #%u, version %d. Range 0x%x-0x%x\n",
je32_to_cpu(ri->ino), je32_to_cpu(ri->version),
je32_to_cpu(ri->offset),
- je32_to_cpu(ri->offset)+je32_to_cpu(ri->dsize)));
+ je32_to_cpu(ri->offset)+je32_to_cpu(ri->dsize));
pseudo_random += je32_to_cpu(ri->version);
@@ -1012,15 +1040,15 @@ static int jffs2_scan_dirent_node(struct jffs2_sb_info *c, struct jffs2_eraseblo
uint32_t crc;
int err;
- D1(printk(KERN_DEBUG "jffs2_scan_dirent_node(): Node at 0x%08x\n", ofs));
+ jffs2_dbg(1, "%s(): Node at 0x%08x\n", __func__, ofs);
/* We don't get here unless the node is still valid, so we don't have to
mask in the ACCURATE bit any more. */
crc = crc32(0, rd, sizeof(*rd)-8);
if (crc != je32_to_cpu(rd->node_crc)) {
- printk(KERN_NOTICE "jffs2_scan_dirent_node(): Node CRC failed on node at 0x%08x: Read 0x%08x, calculated 0x%08x\n",
- ofs, je32_to_cpu(rd->node_crc), crc);
+ pr_notice("%s(): Node CRC failed on node at 0x%08x: Read 0x%08x, calculated 0x%08x\n",
+ __func__, ofs, je32_to_cpu(rd->node_crc), crc);
/* We believe totlen because the CRC on the node _header_ was OK, just the node itself failed. */
if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(rd->totlen)))))
return err;
@@ -1032,7 +1060,7 @@ static int jffs2_scan_dirent_node(struct jffs2_sb_info *c, struct jffs2_eraseblo
/* Should never happen. Did. (OLPC trac #4184)*/
checkedlen = strnlen(rd->name, rd->nsize);
if (checkedlen < rd->nsize) {
- printk(KERN_ERR "Dirent at %08x has zeroes in name. Truncating to %d chars\n",
+ pr_err("Dirent at %08x has zeroes in name. Truncating to %d chars\n",
ofs, checkedlen);
}
fd = jffs2_alloc_full_dirent(checkedlen+1);
@@ -1044,9 +1072,10 @@ static int jffs2_scan_dirent_node(struct jffs2_sb_info *c, struct jffs2_eraseblo
crc = crc32(0, fd->name, rd->nsize);
if (crc != je32_to_cpu(rd->name_crc)) {
- printk(KERN_NOTICE "jffs2_scan_dirent_node(): Name CRC failed on node at 0x%08x: Read 0x%08x, calculated 0x%08x\n",
- ofs, je32_to_cpu(rd->name_crc), crc);
- D1(printk(KERN_NOTICE "Name for which CRC failed is (now) '%s', ino #%d\n", fd->name, je32_to_cpu(rd->ino)));
+ pr_notice("%s(): Name CRC failed on node at 0x%08x: Read 0x%08x, calculated 0x%08x\n",
+ __func__, ofs, je32_to_cpu(rd->name_crc), crc);
+ jffs2_dbg(1, "Name for which CRC failed is (now) '%s', ino #%d\n",
+ fd->name, je32_to_cpu(rd->ino));
jffs2_free_full_dirent(fd);
/* FIXME: Why do we believe totlen? */
/* We believe totlen because the CRC on the node _header_ was OK, just the name failed. */
diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c
index 0f20208df60..aca97f35b29 100644
--- a/fs/jffs2/security.c
+++ b/fs/jffs2/security.c
@@ -23,8 +23,8 @@
#include "nodelist.h"
/* ---- Initial Security Label(s) Attachment callback --- */
-int jffs2_initxattrs(struct inode *inode, const struct xattr *xattr_array,
- void *fs_info)
+static int jffs2_initxattrs(struct inode *inode,
+ const struct xattr *xattr_array, void *fs_info)
{
const struct xattr *xattr;
int err = 0;
diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c
index e537fb0e018..c522d098bb4 100644
--- a/fs/jffs2/summary.c
+++ b/fs/jffs2/summary.c
@@ -11,6 +11,8 @@
*
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/mtd/mtd.h>
@@ -442,13 +444,16 @@ static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eras
/* This should never happen, but https://dev.laptop.org/ticket/4184 */
checkedlen = strnlen(spd->name, spd->nsize);
if (!checkedlen) {
- printk(KERN_ERR "Dirent at %08x has zero at start of name. Aborting mount.\n",
- jeb->offset + je32_to_cpu(spd->offset));
+ pr_err("Dirent at %08x has zero at start of name. Aborting mount.\n",
+ jeb->offset +
+ je32_to_cpu(spd->offset));
return -EIO;
}
if (checkedlen < spd->nsize) {
- printk(KERN_ERR "Dirent at %08x has zeroes in name. Truncating to %d chars\n",
- jeb->offset + je32_to_cpu(spd->offset), checkedlen);
+ pr_err("Dirent at %08x has zeroes in name. Truncating to %d chars\n",
+ jeb->offset +
+ je32_to_cpu(spd->offset),
+ checkedlen);
}
@@ -808,8 +813,7 @@ static int jffs2_sum_write_data(struct jffs2_sb_info *c, struct jffs2_eraseblock
sum_ofs = jeb->offset + c->sector_size - jeb->free_size;
- dbg_summary("JFFS2: writing out data to flash to pos : 0x%08x\n",
- sum_ofs);
+ dbg_summary("writing out data to flash to pos : 0x%08x\n", sum_ofs);
ret = jffs2_flash_writev(c, vecs, 2, sum_ofs, &retlen, 0);
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index f2d96b5e64f..f9916f312bd 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -9,6 +9,8 @@
*
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/slab.h>
@@ -69,7 +71,7 @@ static void jffs2_write_super(struct super_block *sb)
sb->s_dirt = 0;
if (!(sb->s_flags & MS_RDONLY)) {
- D1(printk(KERN_DEBUG "jffs2_write_super()\n"));
+ jffs2_dbg(1, "%s()\n", __func__);
jffs2_flush_wbuf_gc(c, 0);
}
@@ -214,8 +216,8 @@ static int jffs2_parse_options(struct jffs2_sb_info *c, char *data)
JFFS2_COMPR_MODE_FORCEZLIB;
#endif
else {
- printk(KERN_ERR "JFFS2 Error: unknown compressor \"%s\"",
- name);
+ pr_err("Error: unknown compressor \"%s\"\n",
+ name);
kfree(name);
return -EINVAL;
}
@@ -223,8 +225,8 @@ static int jffs2_parse_options(struct jffs2_sb_info *c, char *data)
c->mount_opts.override_compr = true;
break;
default:
- printk(KERN_ERR "JFFS2 Error: unrecognized mount option '%s' or missing value\n",
- p);
+ pr_err("Error: unrecognized mount option '%s' or missing value\n",
+ p);
return -EINVAL;
}
}
@@ -266,9 +268,9 @@ static int jffs2_fill_super(struct super_block *sb, void *data, int silent)
struct jffs2_sb_info *c;
int ret;
- D1(printk(KERN_DEBUG "jffs2_get_sb_mtd():"
+ jffs2_dbg(1, "jffs2_get_sb_mtd():"
" New superblock for device %d (\"%s\")\n",
- sb->s_mtd->index, sb->s_mtd->name));
+ sb->s_mtd->index, sb->s_mtd->name);
c = kzalloc(sizeof(*c), GFP_KERNEL);
if (!c)
@@ -315,7 +317,7 @@ static void jffs2_put_super (struct super_block *sb)
{
struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
- D2(printk(KERN_DEBUG "jffs2: jffs2_put_super()\n"));
+ jffs2_dbg(2, "%s()\n", __func__);
if (sb->s_dirt)
jffs2_write_super(sb);
@@ -336,7 +338,7 @@ static void jffs2_put_super (struct super_block *sb)
kfree(c->inocache_list);
jffs2_clear_xattr_subsystem(c);
mtd_sync(c->mtd);
- D1(printk(KERN_DEBUG "jffs2_put_super returning\n"));
+ jffs2_dbg(1, "%s(): returning\n", __func__);
}
static void jffs2_kill_sb(struct super_block *sb)
@@ -371,7 +373,7 @@ static int __init init_jffs2_fs(void)
BUILD_BUG_ON(sizeof(struct jffs2_raw_inode) != 68);
BUILD_BUG_ON(sizeof(struct jffs2_raw_summary) != 32);
- printk(KERN_INFO "JFFS2 version 2.2."
+ pr_info("version 2.2."
#ifdef CONFIG_JFFS2_FS_WRITEBUFFER
" (NAND)"
#endif
@@ -386,22 +388,22 @@ static int __init init_jffs2_fs(void)
SLAB_MEM_SPREAD),
jffs2_i_init_once);
if (!jffs2_inode_cachep) {
- printk(KERN_ERR "JFFS2 error: Failed to initialise inode cache\n");
+ pr_err("error: Failed to initialise inode cache\n");
return -ENOMEM;
}
ret = jffs2_compressors_init();
if (ret) {
- printk(KERN_ERR "JFFS2 error: Failed to initialise compressors\n");
+ pr_err("error: Failed to initialise compressors\n");
goto out;
}
ret = jffs2_create_slab_caches();
if (ret) {
- printk(KERN_ERR "JFFS2 error: Failed to initialise slab caches\n");
+ pr_err("error: Failed to initialise slab caches\n");
goto out_compressors;
}
ret = register_filesystem(&jffs2_fs_type);
if (ret) {
- printk(KERN_ERR "JFFS2 error: Failed to register filesystem\n");
+ pr_err("error: Failed to register filesystem\n");
goto out_slab;
}
return 0;
diff --git a/fs/jffs2/symlink.c b/fs/jffs2/symlink.c
index e3035afb181..6e563332bb2 100644
--- a/fs/jffs2/symlink.c
+++ b/fs/jffs2/symlink.c
@@ -9,6 +9,8 @@
*
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/namei.h>
@@ -47,10 +49,11 @@ static void *jffs2_follow_link(struct dentry *dentry, struct nameidata *nd)
*/
if (!p) {
- printk(KERN_ERR "jffs2_follow_link(): can't find symlink target\n");
+ pr_err("%s(): can't find symlink target\n", __func__);
p = ERR_PTR(-EIO);
}
- D1(printk(KERN_DEBUG "jffs2_follow_link(): target path is '%s'\n", (char *) f->target));
+ jffs2_dbg(1, "%s(): target path is '%s'\n",
+ __func__, (char *)f->target);
nd_set_link(nd, p);
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index 30e8f47e8a2..74d9be19df3 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -11,6 +11,8 @@
*
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/mtd/mtd.h>
@@ -91,7 +93,7 @@ static void jffs2_wbuf_dirties_inode(struct jffs2_sb_info *c, uint32_t ino)
new = kmalloc(sizeof(*new), GFP_KERNEL);
if (!new) {
- D1(printk(KERN_DEBUG "No memory to allocate inodirty. Fallback to all considered dirty\n"));
+ jffs2_dbg(1, "No memory to allocate inodirty. Fallback to all considered dirty\n");
jffs2_clear_wbuf_ino_list(c);
c->wbuf_inodes = &inodirty_nomem;
return;
@@ -113,19 +115,20 @@ static inline void jffs2_refile_wbuf_blocks(struct jffs2_sb_info *c)
list_for_each_safe(this, next, &c->erasable_pending_wbuf_list) {
struct jffs2_eraseblock *jeb = list_entry(this, struct jffs2_eraseblock, list);
- D1(printk(KERN_DEBUG "Removing eraseblock at 0x%08x from erasable_pending_wbuf_list...\n", jeb->offset));
+ jffs2_dbg(1, "Removing eraseblock at 0x%08x from erasable_pending_wbuf_list...\n",
+ jeb->offset);
list_del(this);
if ((jiffies + (n++)) & 127) {
/* Most of the time, we just erase it immediately. Otherwise we
spend ages scanning it on mount, etc. */
- D1(printk(KERN_DEBUG "...and adding to erase_pending_list\n"));
+ jffs2_dbg(1, "...and adding to erase_pending_list\n");
list_add_tail(&jeb->list, &c->erase_pending_list);
c->nr_erasing_blocks++;
jffs2_garbage_collect_trigger(c);
} else {
/* Sometimes, however, we leave it elsewhere so it doesn't get
immediately reused, and we spread the load a bit. */
- D1(printk(KERN_DEBUG "...and adding to erasable_list\n"));
+ jffs2_dbg(1, "...and adding to erasable_list\n");
list_add_tail(&jeb->list, &c->erasable_list);
}
}
@@ -136,7 +139,7 @@ static inline void jffs2_refile_wbuf_blocks(struct jffs2_sb_info *c)
static void jffs2_block_refile(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, int allow_empty)
{
- D1(printk("About to refile bad block at %08x\n", jeb->offset));
+ jffs2_dbg(1, "About to refile bad block at %08x\n", jeb->offset);
/* File the existing block on the bad_used_list.... */
if (c->nextblock == jeb)
@@ -144,12 +147,14 @@ static void jffs2_block_refile(struct jffs2_sb_info *c, struct jffs2_eraseblock
else /* Not sure this should ever happen... need more coffee */
list_del(&jeb->list);
if (jeb->first_node) {
- D1(printk("Refiling block at %08x to bad_used_list\n", jeb->offset));
+ jffs2_dbg(1, "Refiling block at %08x to bad_used_list\n",
+ jeb->offset);
list_add(&jeb->list, &c->bad_used_list);
} else {
BUG_ON(allow_empty == REFILE_NOTEMPTY);
/* It has to have had some nodes or we couldn't be here */
- D1(printk("Refiling block at %08x to erase_pending_list\n", jeb->offset));
+ jffs2_dbg(1, "Refiling block at %08x to erase_pending_list\n",
+ jeb->offset);
list_add(&jeb->list, &c->erase_pending_list);
c->nr_erasing_blocks++;
jffs2_garbage_collect_trigger(c);
@@ -230,10 +235,12 @@ static int jffs2_verify_write(struct jffs2_sb_info *c, unsigned char *buf,
ret = mtd_read(c->mtd, ofs, c->wbuf_pagesize, &retlen, c->wbuf_verify);
if (ret && ret != -EUCLEAN && ret != -EBADMSG) {
- printk(KERN_WARNING "jffs2_verify_write(): Read back of page at %08x failed: %d\n", c->wbuf_ofs, ret);
+ pr_warn("%s(): Read back of page at %08x failed: %d\n",
+ __func__, c->wbuf_ofs, ret);
return ret;
} else if (retlen != c->wbuf_pagesize) {
- printk(KERN_WARNING "jffs2_verify_write(): Read back of page at %08x gave short read: %zd not %d.\n", ofs, retlen, c->wbuf_pagesize);
+ pr_warn("%s(): Read back of page at %08x gave short read: %zd not %d\n",
+ __func__, ofs, retlen, c->wbuf_pagesize);
return -EIO;
}
if (!memcmp(buf, c->wbuf_verify, c->wbuf_pagesize))
@@ -246,12 +253,12 @@ static int jffs2_verify_write(struct jffs2_sb_info *c, unsigned char *buf,
else
eccstr = "OK or unused";
- printk(KERN_WARNING "Write verify error (ECC %s) at %08x. Wrote:\n",
- eccstr, c->wbuf_ofs);
+ pr_warn("Write verify error (ECC %s) at %08x. Wrote:\n",
+ eccstr, c->wbuf_ofs);
print_hex_dump(KERN_WARNING, "", DUMP_PREFIX_OFFSET, 16, 1,
c->wbuf, c->wbuf_pagesize, 0);
- printk(KERN_WARNING "Read back:\n");
+ pr_warn("Read back:\n");
print_hex_dump(KERN_WARNING, "", DUMP_PREFIX_OFFSET, 16, 1,
c->wbuf_verify, c->wbuf_pagesize, 0);
@@ -308,7 +315,7 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c)
if (!first_raw) {
/* All nodes were obsolete. Nothing to recover. */
- D1(printk(KERN_DEBUG "No non-obsolete nodes to be recovered. Just filing block bad\n"));
+ jffs2_dbg(1, "No non-obsolete nodes to be recovered. Just filing block bad\n");
c->wbuf_len = 0;
return;
}
@@ -331,7 +338,7 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c)
buf = kmalloc(end - start, GFP_KERNEL);
if (!buf) {
- printk(KERN_CRIT "Malloc failure in wbuf recovery. Data loss ensues.\n");
+ pr_crit("Malloc failure in wbuf recovery. Data loss ensues.\n");
goto read_failed;
}
@@ -346,7 +353,7 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c)
ret = 0;
if (ret || retlen != c->wbuf_ofs - start) {
- printk(KERN_CRIT "Old data are already lost in wbuf recovery. Data loss ensues.\n");
+ pr_crit("Old data are already lost in wbuf recovery. Data loss ensues.\n");
kfree(buf);
buf = NULL;
@@ -380,7 +387,7 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c)
/* ... and get an allocation of space from a shiny new block instead */
ret = jffs2_reserve_space_gc(c, end-start, &len, JFFS2_SUMMARY_NOSUM_SIZE);
if (ret) {
- printk(KERN_WARNING "Failed to allocate space for wbuf recovery. Data loss ensues.\n");
+ pr_warn("Failed to allocate space for wbuf recovery. Data loss ensues.\n");
kfree(buf);
return;
}
@@ -390,7 +397,7 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c)
ret = jffs2_prealloc_raw_node_refs(c, c->nextblock, nr_refile);
if (ret) {
- printk(KERN_WARNING "Failed to allocate node refs for wbuf recovery. Data loss ensues.\n");
+ pr_warn("Failed to allocate node refs for wbuf recovery. Data loss ensues.\n");
kfree(buf);
return;
}
@@ -406,13 +413,13 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c)
unsigned char *rewrite_buf = buf?:c->wbuf;
uint32_t towrite = (end-start) - ((end-start)%c->wbuf_pagesize);
- D1(printk(KERN_DEBUG "Write 0x%x bytes at 0x%08x in wbuf recover\n",
- towrite, ofs));
+ jffs2_dbg(1, "Write 0x%x bytes at 0x%08x in wbuf recover\n",
+ towrite, ofs);
#ifdef BREAKMEHEADER
static int breakme;
if (breakme++ == 20) {
- printk(KERN_NOTICE "Faking write error at 0x%08x\n", ofs);
+ pr_notice("Faking write error at 0x%08x\n", ofs);
breakme = 0;
mtd_write(c->mtd, ofs, towrite, &retlen, brokenbuf);
ret = -EIO;
@@ -423,7 +430,7 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c)
if (ret || retlen != towrite || jffs2_verify_write(c, rewrite_buf, ofs)) {
/* Argh. We tried. Really we did. */
- printk(KERN_CRIT "Recovery of wbuf failed due to a second write error\n");
+ pr_crit("Recovery of wbuf failed due to a second write error\n");
kfree(buf);
if (retlen)
@@ -431,7 +438,7 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c)
return;
}
- printk(KERN_NOTICE "Recovery of wbuf succeeded to %08x\n", ofs);
+ pr_notice("Recovery of wbuf succeeded to %08x\n", ofs);
c->wbuf_len = (end - start) - towrite;
c->wbuf_ofs = ofs + towrite;
@@ -459,8 +466,8 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c)
struct jffs2_raw_node_ref **adjust_ref = NULL;
struct jffs2_inode_info *f = NULL;
- D1(printk(KERN_DEBUG "Refiling block of %08x at %08x(%d) to %08x\n",
- rawlen, ref_offset(raw), ref_flags(raw), ofs));
+ jffs2_dbg(1, "Refiling block of %08x at %08x(%d) to %08x\n",
+ rawlen, ref_offset(raw), ref_flags(raw), ofs);
ic = jffs2_raw_ref_to_ic(raw);
@@ -540,7 +547,8 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c)
/* Fix up the original jeb now it's on the bad_list */
if (first_raw == jeb->first_node) {
- D1(printk(KERN_DEBUG "Failing block at %08x is now empty. Moving to erase_pending_list\n", jeb->offset));
+ jffs2_dbg(1, "Failing block at %08x is now empty. Moving to erase_pending_list\n",
+ jeb->offset);
list_move(&jeb->list, &c->erase_pending_list);
c->nr_erasing_blocks++;
jffs2_garbage_collect_trigger(c);
@@ -554,7 +562,8 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c)
spin_unlock(&c->erase_completion_lock);
- D1(printk(KERN_DEBUG "wbuf recovery completed OK. wbuf_ofs 0x%08x, len 0x%x\n", c->wbuf_ofs, c->wbuf_len));
+ jffs2_dbg(1, "wbuf recovery completed OK. wbuf_ofs 0x%08x, len 0x%x\n",
+ c->wbuf_ofs, c->wbuf_len);
}
@@ -579,7 +588,7 @@ static int __jffs2_flush_wbuf(struct jffs2_sb_info *c, int pad)
return 0;
if (!mutex_is_locked(&c->alloc_sem)) {
- printk(KERN_CRIT "jffs2_flush_wbuf() called with alloc_sem not locked!\n");
+ pr_crit("jffs2_flush_wbuf() called with alloc_sem not locked!\n");
BUG();
}
@@ -617,7 +626,7 @@ static int __jffs2_flush_wbuf(struct jffs2_sb_info *c, int pad)
#ifdef BREAKME
static int breakme;
if (breakme++ == 20) {
- printk(KERN_NOTICE "Faking write error at 0x%08x\n", c->wbuf_ofs);
+ pr_notice("Faking write error at 0x%08x\n", c->wbuf_ofs);
breakme = 0;
mtd_write(c->mtd, c->wbuf_ofs, c->wbuf_pagesize, &retlen,
brokenbuf);
@@ -629,11 +638,11 @@ static int __jffs2_flush_wbuf(struct jffs2_sb_info *c, int pad)
&retlen, c->wbuf);
if (ret) {
- printk(KERN_WARNING "jffs2_flush_wbuf(): Write failed with %d\n", ret);
+ pr_warn("jffs2_flush_wbuf(): Write failed with %d\n", ret);
goto wfail;
} else if (retlen != c->wbuf_pagesize) {
- printk(KERN_WARNING "jffs2_flush_wbuf(): Write was short: %zd instead of %d\n",
- retlen, c->wbuf_pagesize);
+ pr_warn("jffs2_flush_wbuf(): Write was short: %zd instead of %d\n",
+ retlen, c->wbuf_pagesize);
ret = -EIO;
goto wfail;
} else if ((ret = jffs2_verify_write(c, c->wbuf, c->wbuf_ofs))) {
@@ -647,17 +656,18 @@ static int __jffs2_flush_wbuf(struct jffs2_sb_info *c, int pad)
if (pad) {
uint32_t waste = c->wbuf_pagesize - c->wbuf_len;
- D1(printk(KERN_DEBUG "jffs2_flush_wbuf() adjusting free_size of %sblock at %08x\n",
- (wbuf_jeb==c->nextblock)?"next":"", wbuf_jeb->offset));
+ jffs2_dbg(1, "jffs2_flush_wbuf() adjusting free_size of %sblock at %08x\n",
+ (wbuf_jeb == c->nextblock) ? "next" : "",
+ wbuf_jeb->offset);
/* wbuf_pagesize - wbuf_len is the amount of space that's to be
padded. If there is less free space in the block than that,
something screwed up */
if (wbuf_jeb->free_size < waste) {
- printk(KERN_CRIT "jffs2_flush_wbuf(): Accounting error. wbuf at 0x%08x has 0x%03x bytes, 0x%03x left.\n",
- c->wbuf_ofs, c->wbuf_len, waste);
- printk(KERN_CRIT "jffs2_flush_wbuf(): But free_size for block at 0x%08x is only 0x%08x\n",
- wbuf_jeb->offset, wbuf_jeb->free_size);
+ pr_crit("jffs2_flush_wbuf(): Accounting error. wbuf at 0x%08x has 0x%03x bytes, 0x%03x left.\n",
+ c->wbuf_ofs, c->wbuf_len, waste);
+ pr_crit("jffs2_flush_wbuf(): But free_size for block at 0x%08x is only 0x%08x\n",
+ wbuf_jeb->offset, wbuf_jeb->free_size);
BUG();
}
@@ -694,14 +704,14 @@ int jffs2_flush_wbuf_gc(struct jffs2_sb_info *c, uint32_t ino)
uint32_t old_wbuf_len;
int ret = 0;
- D1(printk(KERN_DEBUG "jffs2_flush_wbuf_gc() called for ino #%u...\n", ino));
+ jffs2_dbg(1, "jffs2_flush_wbuf_gc() called for ino #%u...\n", ino);
if (!c->wbuf)
return 0;
mutex_lock(&c->alloc_sem);
if (!jffs2_wbuf_pending_for_ino(c, ino)) {
- D1(printk(KERN_DEBUG "Ino #%d not pending in wbuf. Returning\n", ino));
+ jffs2_dbg(1, "Ino #%d not pending in wbuf. Returning\n", ino);
mutex_unlock(&c->alloc_sem);
return 0;
}
@@ -711,7 +721,8 @@ int jffs2_flush_wbuf_gc(struct jffs2_sb_info *c, uint32_t ino)
if (c->unchecked_size) {
/* GC won't make any progress for a while */
- D1(printk(KERN_DEBUG "jffs2_flush_wbuf_gc() padding. Not finished checking\n"));
+ jffs2_dbg(1, "%s(): padding. Not finished checking\n",
+ __func__);
down_write(&c->wbuf_sem);
ret = __jffs2_flush_wbuf(c, PAD_ACCOUNTING);
/* retry flushing wbuf in case jffs2_wbuf_recover
@@ -724,7 +735,7 @@ int jffs2_flush_wbuf_gc(struct jffs2_sb_info *c, uint32_t ino)
mutex_unlock(&c->alloc_sem);
- D1(printk(KERN_DEBUG "jffs2_flush_wbuf_gc() calls gc pass\n"));
+ jffs2_dbg(1, "%s(): calls gc pass\n", __func__);
ret = jffs2_garbage_collect_pass(c);
if (ret) {
@@ -742,7 +753,7 @@ int jffs2_flush_wbuf_gc(struct jffs2_sb_info *c, uint32_t ino)
mutex_lock(&c->alloc_sem);
}
- D1(printk(KERN_DEBUG "jffs2_flush_wbuf_gc() ends...\n"));
+ jffs2_dbg(1, "%s(): ends...\n", __func__);
mutex_unlock(&c->alloc_sem);
return ret;
@@ -811,9 +822,8 @@ int jffs2_flash_writev(struct jffs2_sb_info *c, const struct kvec *invecs,
if (SECTOR_ADDR(to) != SECTOR_ADDR(c->wbuf_ofs)) {
/* It's a write to a new block */
if (c->wbuf_len) {
- D1(printk(KERN_DEBUG "jffs2_flash_writev() to 0x%lx "
- "causes flush of wbuf at 0x%08x\n",
- (unsigned long)to, c->wbuf_ofs));
+ jffs2_dbg(1, "%s(): to 0x%lx causes flush of wbuf at 0x%08x\n",
+ __func__, (unsigned long)to, c->wbuf_ofs);
ret = __jffs2_flush_wbuf(c, PAD_NOACCOUNT);
if (ret)
goto outerr;
@@ -825,11 +835,11 @@ int jffs2_flash_writev(struct jffs2_sb_info *c, const struct kvec *invecs,
if (to != PAD(c->wbuf_ofs + c->wbuf_len)) {
/* We're not writing immediately after the writebuffer. Bad. */
- printk(KERN_CRIT "jffs2_flash_writev(): Non-contiguous write "
- "to %08lx\n", (unsigned long)to);
+ pr_crit("%s(): Non-contiguous write to %08lx\n",
+ __func__, (unsigned long)to);
if (c->wbuf_len)
- printk(KERN_CRIT "wbuf was previously %08x-%08x\n",
- c->wbuf_ofs, c->wbuf_ofs+c->wbuf_len);
+ pr_crit("wbuf was previously %08x-%08x\n",
+ c->wbuf_ofs, c->wbuf_ofs + c->wbuf_len);
BUG();
}
@@ -957,8 +967,8 @@ int jffs2_flash_read(struct jffs2_sb_info *c, loff_t ofs, size_t len, size_t *re
if ( (ret == -EBADMSG || ret == -EUCLEAN) && (*retlen == len) ) {
if (ret == -EBADMSG)
- printk(KERN_WARNING "mtd->read(0x%zx bytes from 0x%llx)"
- " returned ECC error\n", len, ofs);
+ pr_warn("mtd->read(0x%zx bytes from 0x%llx) returned ECC error\n",
+ len, ofs);
/*
* We have the raw data without ECC correction in the buffer,
* maybe we are lucky and all data or parts are correct. We
@@ -1034,9 +1044,8 @@ int jffs2_check_oob_empty(struct jffs2_sb_info *c,
ret = mtd_read_oob(c->mtd, jeb->offset, &ops);
if (ret || ops.oobretlen != ops.ooblen) {
- printk(KERN_ERR "cannot read OOB for EB at %08x, requested %zd"
- " bytes, read %zd bytes, error %d\n",
- jeb->offset, ops.ooblen, ops.oobretlen, ret);
+ pr_err("cannot read OOB for EB at %08x, requested %zd bytes, read %zd bytes, error %d\n",
+ jeb->offset, ops.ooblen, ops.oobretlen, ret);
if (!ret)
ret = -EIO;
return ret;
@@ -1048,8 +1057,8 @@ int jffs2_check_oob_empty(struct jffs2_sb_info *c,
continue;
if (ops.oobbuf[i] != 0xFF) {
- D2(printk(KERN_DEBUG "Found %02x at %x in OOB for "
- "%08x\n", ops.oobbuf[i], i, jeb->offset));
+ jffs2_dbg(2, "Found %02x at %x in OOB for "
+ "%08x\n", ops.oobbuf[i], i, jeb->offset);
return 1;
}
}
@@ -1077,9 +1086,8 @@ int jffs2_check_nand_cleanmarker(struct jffs2_sb_info *c,
ret = mtd_read_oob(c->mtd, jeb->offset, &ops);
if (ret || ops.oobretlen != ops.ooblen) {
- printk(KERN_ERR "cannot read OOB for EB at %08x, requested %zd"
- " bytes, read %zd bytes, error %d\n",
- jeb->offset, ops.ooblen, ops.oobretlen, ret);
+ pr_err("cannot read OOB for EB at %08x, requested %zd bytes, read %zd bytes, error %d\n",
+ jeb->offset, ops.ooblen, ops.oobretlen, ret);
if (!ret)
ret = -EIO;
return ret;
@@ -1103,9 +1111,8 @@ int jffs2_write_nand_cleanmarker(struct jffs2_sb_info *c,
ret = mtd_write_oob(c->mtd, jeb->offset, &ops);
if (ret || ops.oobretlen != ops.ooblen) {
- printk(KERN_ERR "cannot write OOB for EB at %08x, requested %zd"
- " bytes, read %zd bytes, error %d\n",
- jeb->offset, ops.ooblen, ops.oobretlen, ret);
+ pr_err("cannot write OOB for EB at %08x, requested %zd bytes, read %zd bytes, error %d\n",
+ jeb->offset, ops.ooblen, ops.oobretlen, ret);
if (!ret)
ret = -EIO;
return ret;
@@ -1130,11 +1137,12 @@ int jffs2_write_nand_badblock(struct jffs2_sb_info *c, struct jffs2_eraseblock *
if( ++jeb->bad_count < MAX_ERASE_FAILURES)
return 0;
- printk(KERN_WARNING "JFFS2: marking eraseblock at %08x\n as bad", bad_offset);
+ pr_warn("marking eraseblock at %08x as bad\n", bad_offset);
ret = mtd_block_markbad(c->mtd, bad_offset);
if (ret) {
- D1(printk(KERN_WARNING "jffs2_write_nand_badblock(): Write failed for block at %08x: error %d\n", jeb->offset, ret));
+ jffs2_dbg(1, "%s(): Write failed for block at %08x: error %d\n",
+ __func__, jeb->offset, ret);
return ret;
}
return 1;
@@ -1151,11 +1159,11 @@ int jffs2_nand_flash_setup(struct jffs2_sb_info *c)
c->cleanmarker_size = 0;
if (!oinfo || oinfo->oobavail == 0) {
- printk(KERN_ERR "inconsistent device description\n");
+ pr_err("inconsistent device description\n");
return -EINVAL;
}
- D1(printk(KERN_DEBUG "JFFS2 using OOB on NAND\n"));
+ jffs2_dbg(1, "using OOB on NAND\n");
c->oobavail = oinfo->oobavail;
@@ -1222,7 +1230,7 @@ int jffs2_dataflash_setup(struct jffs2_sb_info *c) {
if ((c->flash_size % c->sector_size) != 0) {
c->flash_size = (c->flash_size / c->sector_size) * c->sector_size;
- printk(KERN_WARNING "JFFS2 flash size adjusted to %dKiB\n", c->flash_size);
+ pr_warn("flash size adjusted to %dKiB\n", c->flash_size);
};
c->wbuf_ofs = 0xFFFFFFFF;
@@ -1239,7 +1247,8 @@ int jffs2_dataflash_setup(struct jffs2_sb_info *c) {
}
#endif
- printk(KERN_INFO "JFFS2 write-buffering enabled buffer (%d) erasesize (%d)\n", c->wbuf_pagesize, c->sector_size);
+ pr_info("write-buffering enabled buffer (%d) erasesize (%d)\n",
+ c->wbuf_pagesize, c->sector_size);
return 0;
}
@@ -1297,7 +1306,8 @@ int jffs2_ubivol_setup(struct jffs2_sb_info *c) {
if (!c->wbuf)
return -ENOMEM;
- printk(KERN_INFO "JFFS2 write-buffering enabled buffer (%d) erasesize (%d)\n", c->wbuf_pagesize, c->sector_size);
+ pr_info("write-buffering enabled buffer (%d) erasesize (%d)\n",
+ c->wbuf_pagesize, c->sector_size);
return 0;
}
diff --git a/fs/jffs2/write.c b/fs/jffs2/write.c
index 30d175b6d29..b634de4c810 100644
--- a/fs/jffs2/write.c
+++ b/fs/jffs2/write.c
@@ -9,6 +9,8 @@
*
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/crc32.h>
@@ -36,7 +38,7 @@ int jffs2_do_new_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
f->inocache->state = INO_STATE_PRESENT;
jffs2_add_ino_cache(c, f->inocache);
- D1(printk(KERN_DEBUG "jffs2_do_new_inode(): Assigned ino# %d\n", f->inocache->ino));
+ jffs2_dbg(1, "%s(): Assigned ino# %d\n", __func__, f->inocache->ino);
ri->ino = cpu_to_je32(f->inocache->ino);
ri->magic = cpu_to_je16(JFFS2_MAGIC_BITMASK);
@@ -68,7 +70,7 @@ struct jffs2_full_dnode *jffs2_write_dnode(struct jffs2_sb_info *c, struct jffs2
unsigned long cnt = 2;
D1(if(je32_to_cpu(ri->hdr_crc) != crc32(0, ri, sizeof(struct jffs2_unknown_node)-4)) {
- printk(KERN_CRIT "Eep. CRC not correct in jffs2_write_dnode()\n");
+ pr_crit("Eep. CRC not correct in jffs2_write_dnode()\n");
BUG();
}
);
@@ -78,7 +80,9 @@ struct jffs2_full_dnode *jffs2_write_dnode(struct jffs2_sb_info *c, struct jffs2
vecs[1].iov_len = datalen;
if (je32_to_cpu(ri->totlen) != sizeof(*ri) + datalen) {
- printk(KERN_WARNING "jffs2_write_dnode: ri->totlen (0x%08x) != sizeof(*ri) (0x%08zx) + datalen (0x%08x)\n", je32_to_cpu(ri->totlen), sizeof(*ri), datalen);
+ pr_warn("%s(): ri->totlen (0x%08x) != sizeof(*ri) (0x%08zx) + datalen (0x%08x)\n",
+ __func__, je32_to_cpu(ri->totlen),
+ sizeof(*ri), datalen);
}
fn = jffs2_alloc_full_dnode();
@@ -95,9 +99,9 @@ struct jffs2_full_dnode *jffs2_write_dnode(struct jffs2_sb_info *c, struct jffs2
if ((alloc_mode!=ALLOC_GC) && (je32_to_cpu(ri->version) < f->highest_version)) {
BUG_ON(!retried);
- D1(printk(KERN_DEBUG "jffs2_write_dnode : dnode_version %d, "
- "highest version %d -> updating dnode\n",
- je32_to_cpu(ri->version), f->highest_version));
+ jffs2_dbg(1, "%s(): dnode_version %d, highest version %d -> updating dnode\n",
+ __func__,
+ je32_to_cpu(ri->version), f->highest_version);
ri->version = cpu_to_je32(++f->highest_version);
ri->node_crc = cpu_to_je32(crc32(0, ri, sizeof(*ri)-8));
}
@@ -106,8 +110,8 @@ struct jffs2_full_dnode *jffs2_write_dnode(struct jffs2_sb_info *c, struct jffs2
(alloc_mode==ALLOC_GC)?0:f->inocache->ino);
if (ret || (retlen != sizeof(*ri) + datalen)) {
- printk(KERN_NOTICE "Write of %zd bytes at 0x%08x failed. returned %d, retlen %zd\n",
- sizeof(*ri)+datalen, flash_ofs, ret, retlen);
+ pr_notice("Write of %zd bytes at 0x%08x failed. returned %d, retlen %zd\n",
+ sizeof(*ri) + datalen, flash_ofs, ret, retlen);
/* Mark the space as dirtied */
if (retlen) {
@@ -118,7 +122,8 @@ struct jffs2_full_dnode *jffs2_write_dnode(struct jffs2_sb_info *c, struct jffs2
this node */
jffs2_add_physical_node_ref(c, flash_ofs | REF_OBSOLETE, PAD(sizeof(*ri)+datalen), NULL);
} else {
- printk(KERN_NOTICE "Not marking the space at 0x%08x as dirty because the flash driver returned retlen zero\n", flash_ofs);
+ pr_notice("Not marking the space at 0x%08x as dirty because the flash driver returned retlen zero\n",
+ flash_ofs);
}
if (!retried && alloc_mode != ALLOC_NORETRY) {
/* Try to reallocate space and retry */
@@ -127,7 +132,7 @@ struct jffs2_full_dnode *jffs2_write_dnode(struct jffs2_sb_info *c, struct jffs2
retried = 1;
- D1(printk(KERN_DEBUG "Retrying failed write.\n"));
+ jffs2_dbg(1, "Retrying failed write.\n");
jffs2_dbg_acct_sanity_check(c,jeb);
jffs2_dbg_acct_paranoia_check(c, jeb);
@@ -147,14 +152,16 @@ struct jffs2_full_dnode *jffs2_write_dnode(struct jffs2_sb_info *c, struct jffs2
if (!ret) {
flash_ofs = write_ofs(c);
- D1(printk(KERN_DEBUG "Allocated space at 0x%08x to retry failed write.\n", flash_ofs));
+ jffs2_dbg(1, "Allocated space at 0x%08x to retry failed write.\n",
+ flash_ofs);
jffs2_dbg_acct_sanity_check(c,jeb);
jffs2_dbg_acct_paranoia_check(c, jeb);
goto retry;
}
- D1(printk(KERN_DEBUG "Failed to allocate space to retry failed write: %d!\n", ret));
+ jffs2_dbg(1, "Failed to allocate space to retry failed write: %d!\n",
+ ret);
}
/* Release the full_dnode which is now useless, and return */
jffs2_free_full_dnode(fn);
@@ -183,10 +190,10 @@ struct jffs2_full_dnode *jffs2_write_dnode(struct jffs2_sb_info *c, struct jffs2
fn->size = je32_to_cpu(ri->dsize);
fn->frags = 0;
- D1(printk(KERN_DEBUG "jffs2_write_dnode wrote node at 0x%08x(%d) with dsize 0x%x, csize 0x%x, node_crc 0x%08x, data_crc 0x%08x, totlen 0x%08x\n",
+ jffs2_dbg(1, "jffs2_write_dnode wrote node at 0x%08x(%d) with dsize 0x%x, csize 0x%x, node_crc 0x%08x, data_crc 0x%08x, totlen 0x%08x\n",
flash_ofs & ~3, flash_ofs & 3, je32_to_cpu(ri->dsize),
je32_to_cpu(ri->csize), je32_to_cpu(ri->node_crc),
- je32_to_cpu(ri->data_crc), je32_to_cpu(ri->totlen)));
+ je32_to_cpu(ri->data_crc), je32_to_cpu(ri->totlen));
if (retried) {
jffs2_dbg_acct_sanity_check(c,NULL);
@@ -206,22 +213,23 @@ struct jffs2_full_dirent *jffs2_write_dirent(struct jffs2_sb_info *c, struct jff
int retried = 0;
int ret;
- D1(printk(KERN_DEBUG "jffs2_write_dirent(ino #%u, name at *0x%p \"%s\"->ino #%u, name_crc 0x%08x)\n",
+ jffs2_dbg(1, "%s(ino #%u, name at *0x%p \"%s\"->ino #%u, name_crc 0x%08x)\n",
+ __func__,
je32_to_cpu(rd->pino), name, name, je32_to_cpu(rd->ino),
- je32_to_cpu(rd->name_crc)));
+ je32_to_cpu(rd->name_crc));
D1(if(je32_to_cpu(rd->hdr_crc) != crc32(0, rd, sizeof(struct jffs2_unknown_node)-4)) {
- printk(KERN_CRIT "Eep. CRC not correct in jffs2_write_dirent()\n");
+ pr_crit("Eep. CRC not correct in jffs2_write_dirent()\n");
BUG();
});
if (strnlen(name, namelen) != namelen) {
/* This should never happen, but seems to have done on at least one
occasion: https://dev.laptop.org/ticket/4184 */
- printk(KERN_CRIT "Error in jffs2_write_dirent() -- name contains zero bytes!\n");
- printk(KERN_CRIT "Directory inode #%u, name at *0x%p \"%s\"->ino #%u, name_crc 0x%08x\n",
- je32_to_cpu(rd->pino), name, name, je32_to_cpu(rd->ino),
- je32_to_cpu(rd->name_crc));
+ pr_crit("Error in jffs2_write_dirent() -- name contains zero bytes!\n");
+ pr_crit("Directory inode #%u, name at *0x%p \"%s\"->ino #%u, name_crc 0x%08x\n",
+ je32_to_cpu(rd->pino), name, name, je32_to_cpu(rd->ino),
+ je32_to_cpu(rd->name_crc));
WARN_ON(1);
return ERR_PTR(-EIO);
}
@@ -249,9 +257,9 @@ struct jffs2_full_dirent *jffs2_write_dirent(struct jffs2_sb_info *c, struct jff
if ((alloc_mode!=ALLOC_GC) && (je32_to_cpu(rd->version) < f->highest_version)) {
BUG_ON(!retried);
- D1(printk(KERN_DEBUG "jffs2_write_dirent : dirent_version %d, "
- "highest version %d -> updating dirent\n",
- je32_to_cpu(rd->version), f->highest_version));
+ jffs2_dbg(1, "%s(): dirent_version %d, highest version %d -> updating dirent\n",
+ __func__,
+ je32_to_cpu(rd->version), f->highest_version);
rd->version = cpu_to_je32(++f->highest_version);
fd->version = je32_to_cpu(rd->version);
rd->node_crc = cpu_to_je32(crc32(0, rd, sizeof(*rd)-8));
@@ -260,13 +268,14 @@ struct jffs2_full_dirent *jffs2_write_dirent(struct jffs2_sb_info *c, struct jff
ret = jffs2_flash_writev(c, vecs, 2, flash_ofs, &retlen,
(alloc_mode==ALLOC_GC)?0:je32_to_cpu(rd->pino));
if (ret || (retlen != sizeof(*rd) + namelen)) {
- printk(KERN_NOTICE "Write of %zd bytes at 0x%08x failed. returned %d, retlen %zd\n",
- sizeof(*rd)+namelen, flash_ofs, ret, retlen);
+ pr_notice("Write of %zd bytes at 0x%08x failed. returned %d, retlen %zd\n",
+ sizeof(*rd) + namelen, flash_ofs, ret, retlen);
/* Mark the space as dirtied */
if (retlen) {
jffs2_add_physical_node_ref(c, flash_ofs | REF_OBSOLETE, PAD(sizeof(*rd)+namelen), NULL);
} else {
- printk(KERN_NOTICE "Not marking the space at 0x%08x as dirty because the flash driver returned retlen zero\n", flash_ofs);
+ pr_notice("Not marking the space at 0x%08x as dirty because the flash driver returned retlen zero\n",
+ flash_ofs);
}
if (!retried) {
/* Try to reallocate space and retry */
@@ -275,7 +284,7 @@ struct jffs2_full_dirent *jffs2_write_dirent(struct jffs2_sb_info *c, struct jff
retried = 1;
- D1(printk(KERN_DEBUG "Retrying failed write.\n"));
+ jffs2_dbg(1, "Retrying failed write.\n");
jffs2_dbg_acct_sanity_check(c,jeb);
jffs2_dbg_acct_paranoia_check(c, jeb);
@@ -295,12 +304,14 @@ struct jffs2_full_dirent *jffs2_write_dirent(struct jffs2_sb_info *c, struct jff
if (!ret) {
flash_ofs = write_ofs(c);
- D1(printk(KERN_DEBUG "Allocated space at 0x%08x to retry failed write.\n", flash_ofs));
+ jffs2_dbg(1, "Allocated space at 0x%08x to retry failed write\n",
+ flash_ofs);
jffs2_dbg_acct_sanity_check(c,jeb);
jffs2_dbg_acct_paranoia_check(c, jeb);
goto retry;
}
- D1(printk(KERN_DEBUG "Failed to allocate space to retry failed write: %d!\n", ret));
+ jffs2_dbg(1, "Failed to allocate space to retry failed write: %d!\n",
+ ret);
}
/* Release the full_dnode which is now useless, and return */
jffs2_free_full_dirent(fd);
@@ -333,8 +344,8 @@ int jffs2_write_inode_range(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
int ret = 0;
uint32_t writtenlen = 0;
- D1(printk(KERN_DEBUG "jffs2_write_inode_range(): Ino #%u, ofs 0x%x, len 0x%x\n",
- f->inocache->ino, offset, writelen));
+ jffs2_dbg(1, "%s(): Ino #%u, ofs 0x%x, len 0x%x\n",
+ __func__, f->inocache->ino, offset, writelen);
while(writelen) {
struct jffs2_full_dnode *fn;
@@ -345,12 +356,13 @@ int jffs2_write_inode_range(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
int retried = 0;
retry:
- D2(printk(KERN_DEBUG "jffs2_commit_write() loop: 0x%x to write to 0x%x\n", writelen, offset));
+ jffs2_dbg(2, "jffs2_commit_write() loop: 0x%x to write to 0x%x\n",
+ writelen, offset);
ret = jffs2_reserve_space(c, sizeof(*ri) + JFFS2_MIN_DATA_LEN,
&alloclen, ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE);
if (ret) {
- D1(printk(KERN_DEBUG "jffs2_reserve_space returned %d\n", ret));
+ jffs2_dbg(1, "jffs2_reserve_space returned %d\n", ret);
break;
}
mutex_lock(&f->sem);
@@ -386,7 +398,7 @@ int jffs2_write_inode_range(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
if (!retried) {
/* Write error to be retried */
retried = 1;
- D1(printk(KERN_DEBUG "Retrying node write in jffs2_write_inode_range()\n"));
+ jffs2_dbg(1, "Retrying node write in jffs2_write_inode_range()\n");
goto retry;
}
break;
@@ -399,7 +411,8 @@ int jffs2_write_inode_range(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
}
if (ret) {
/* Eep */
- D1(printk(KERN_DEBUG "Eep. add_full_dnode_to_inode() failed in commit_write, returned %d\n", ret));
+ jffs2_dbg(1, "Eep. add_full_dnode_to_inode() failed in commit_write, returned %d\n",
+ ret);
jffs2_mark_node_obsolete(c, fn->raw);
jffs2_free_full_dnode(fn);
@@ -410,11 +423,11 @@ int jffs2_write_inode_range(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
mutex_unlock(&f->sem);
jffs2_complete_reservation(c);
if (!datalen) {
- printk(KERN_WARNING "Eep. We didn't actually write any data in jffs2_write_inode_range()\n");
+ pr_warn("Eep. We didn't actually write any data in jffs2_write_inode_range()\n");
ret = -EIO;
break;
}
- D1(printk(KERN_DEBUG "increasing writtenlen by %d\n", datalen));
+ jffs2_dbg(1, "increasing writtenlen by %d\n", datalen);
writtenlen += datalen;
offset += datalen;
writelen -= datalen;
@@ -439,7 +452,7 @@ int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f,
*/
ret = jffs2_reserve_space(c, sizeof(*ri), &alloclen, ALLOC_NORMAL,
JFFS2_SUMMARY_INODE_SIZE);
- D1(printk(KERN_DEBUG "jffs2_do_create(): reserved 0x%x bytes\n", alloclen));
+ jffs2_dbg(1, "%s(): reserved 0x%x bytes\n", __func__, alloclen);
if (ret)
return ret;
@@ -450,11 +463,11 @@ int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f,
fn = jffs2_write_dnode(c, f, ri, NULL, 0, ALLOC_NORMAL);
- D1(printk(KERN_DEBUG "jffs2_do_create created file with mode 0x%x\n",
- jemode_to_cpu(ri->mode)));
+ jffs2_dbg(1, "jffs2_do_create created file with mode 0x%x\n",
+ jemode_to_cpu(ri->mode));
if (IS_ERR(fn)) {
- D1(printk(KERN_DEBUG "jffs2_write_dnode() failed\n"));
+ jffs2_dbg(1, "jffs2_write_dnode() failed\n");
/* Eeek. Wave bye bye */
mutex_unlock(&f->sem);
jffs2_complete_reservation(c);
@@ -480,7 +493,7 @@ int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f,
if (ret) {
/* Eep. */
- D1(printk(KERN_DEBUG "jffs2_reserve_space() for dirent failed\n"));
+ jffs2_dbg(1, "jffs2_reserve_space() for dirent failed\n");
return ret;
}
@@ -597,8 +610,8 @@ int jffs2_do_unlink(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f,
!memcmp(fd->name, name, namelen) &&
!fd->name[namelen]) {
- D1(printk(KERN_DEBUG "Marking old dirent node (ino #%u) @%08x obsolete\n",
- fd->ino, ref_offset(fd->raw)));
+ jffs2_dbg(1, "Marking old dirent node (ino #%u) @%08x obsolete\n",
+ fd->ino, ref_offset(fd->raw));
jffs2_mark_node_obsolete(c, fd->raw);
/* We don't want to remove it from the list immediately,
because that screws up getdents()/seek() semantics even
@@ -627,11 +640,13 @@ int jffs2_do_unlink(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f,
dead_f->dents = fd->next;
if (fd->ino) {
- printk(KERN_WARNING "Deleting inode #%u with active dentry \"%s\"->ino #%u\n",
- dead_f->inocache->ino, fd->name, fd->ino);
+ pr_warn("Deleting inode #%u with active dentry \"%s\"->ino #%u\n",
+ dead_f->inocache->ino,
+ fd->name, fd->ino);
} else {
- D1(printk(KERN_DEBUG "Removing deletion dirent for \"%s\" from dir ino #%u\n",
- fd->name, dead_f->inocache->ino));
+ jffs2_dbg(1, "Removing deletion dirent for \"%s\" from dir ino #%u\n",
+ fd->name,
+ dead_f->inocache->ino);
}
if (fd->raw)
jffs2_mark_node_obsolete(c, fd->raw);
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index 3e93cdd1900..b55b803eddc 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -9,6 +9,8 @@
*
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/fs.h>
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 5f7c160ea64..07c91ca6017 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -220,12 +220,6 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, umode_t mode)
dquot_initialize(dip);
- /* link count overflow on parent directory ? */
- if (dip->i_nlink == JFS_LINK_MAX) {
- rc = -EMLINK;
- goto out1;
- }
-
/*
* search parent directory for entry/freespace
* (dtSearch() returns parent directory page pinned)
@@ -806,9 +800,6 @@ static int jfs_link(struct dentry *old_dentry,
jfs_info("jfs_link: %s %s", old_dentry->d_name.name,
dentry->d_name.name);
- if (ip->i_nlink == JFS_LINK_MAX)
- return -EMLINK;
-
dquot_initialize(dir);
tid = txBegin(ip->i_sb, 0);
@@ -1138,10 +1129,6 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
rc = -ENOTEMPTY;
goto out3;
}
- } else if ((new_dir != old_dir) &&
- (new_dir->i_nlink == JFS_LINK_MAX)) {
- rc = -EMLINK;
- goto out3;
}
} else if (new_ip) {
IWRITE_LOCK(new_ip, RDWRLOCK_NORMAL);
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 682bca642f3..4a82950f412 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -441,6 +441,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
return -ENOMEM;
sb->s_fs_info = sbi;
+ sb->s_max_links = JFS_LINK_MAX;
sbi->sb = sb;
sbi->uid = sbi->gid = sbi->umask = -1;
@@ -521,7 +522,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
ret = PTR_ERR(inode);
goto out_no_rw;
}
- sb->s_root = d_alloc_root(inode);
+ sb->s_root = d_make_root(inode);
if (!sb->s_root)
goto out_no_root;
@@ -539,7 +540,6 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
out_no_root:
jfs_err("jfs_read_super: get root dentry failed");
- iput(inode);
out_no_rw:
rc = jfs_umount(sb);
@@ -860,8 +860,14 @@ static int __init init_jfs_fs(void)
jfs_proc_init();
#endif
- return register_filesystem(&jfs_fs_type);
+ rc = register_filesystem(&jfs_fs_type);
+ if (!rc)
+ return 0;
+#ifdef PROC_FS_JFS
+ jfs_proc_clean();
+#endif
+ kthread_stop(jfsSyncThread);
kill_committask:
for (i = 0; i < commit_threads; i++)
kthread_stop(jfsCommitThread[i]);
diff --git a/fs/libfs.c b/fs/libfs.c
index 5b2dbb3ba4f..18d08f5db53 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -3,7 +3,7 @@
* Library for filesystems writers.
*/
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/mount.h>
@@ -264,6 +264,13 @@ Enomem:
return ERR_PTR(-ENOMEM);
}
+int simple_open(struct inode *inode, struct file *file)
+{
+ if (inode->i_private)
+ file->private_data = inode->i_private;
+ return 0;
+}
+
int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
{
struct inode *inode = old_dentry->d_inode;
@@ -491,11 +498,9 @@ int simple_fill_super(struct super_block *s, unsigned long magic,
inode->i_op = &simple_dir_inode_operations;
inode->i_fop = &simple_dir_operations;
set_nlink(inode, 2);
- root = d_alloc_root(inode);
- if (!root) {
- iput(inode);
+ root = d_make_root(inode);
+ if (!root)
return -ENOMEM;
- }
for (i = 0; !files->name || files->name[0]; i++, files++) {
if (!files->name)
continue;
@@ -524,6 +529,7 @@ int simple_fill_super(struct super_block *s, unsigned long magic,
return 0;
out:
d_genocide(root);
+ shrink_dcache_parent(root);
dput(root);
return -ENOMEM;
}
@@ -536,7 +542,7 @@ int simple_pin_fs(struct file_system_type *type, struct vfsmount **mount, int *c
spin_lock(&pin_fs_lock);
if (unlikely(!*mount)) {
spin_unlock(&pin_fs_lock);
- mnt = vfs_kern_mount(type, 0, type->name, NULL);
+ mnt = vfs_kern_mount(type, MS_KERNMOUNT, type->name, NULL);
if (IS_ERR(mnt))
return PTR_ERR(mnt);
spin_lock(&pin_fs_lock);
@@ -986,6 +992,7 @@ EXPORT_SYMBOL(simple_dir_operations);
EXPORT_SYMBOL(simple_empty);
EXPORT_SYMBOL(simple_fill_super);
EXPORT_SYMBOL(simple_getattr);
+EXPORT_SYMBOL(simple_open);
EXPORT_SYMBOL(simple_link);
EXPORT_SYMBOL(simple_lookup);
EXPORT_SYMBOL(simple_pin_fs);
diff --git a/fs/lockd/clnt4xdr.c b/fs/lockd/clnt4xdr.c
index f848b52c67b..3ddcbb1c0a4 100644
--- a/fs/lockd/clnt4xdr.c
+++ b/fs/lockd/clnt4xdr.c
@@ -598,7 +598,7 @@ static struct rpc_procinfo nlm4_procedures[] = {
PROC(GRANTED_RES, res, norep),
};
-struct rpc_version nlm_version4 = {
+const struct rpc_version nlm_version4 = {
.number = 4,
.nrprocs = ARRAY_SIZE(nlm4_procedures),
.procs = nlm4_procedures,
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index 8d4ea8351e3..ba1dc2eebd1 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -62,7 +62,8 @@ struct nlm_host *nlmclnt_init(const struct nlmclnt_initdata *nlm_init)
host = nlmclnt_lookup_host(nlm_init->address, nlm_init->addrlen,
nlm_init->protocol, nlm_version,
- nlm_init->hostname, nlm_init->noresvport);
+ nlm_init->hostname, nlm_init->noresvport,
+ nlm_init->net);
if (host == NULL) {
lockd_down();
return ERR_PTR(-ENOLCK);
diff --git a/fs/lockd/clntxdr.c b/fs/lockd/clntxdr.c
index 180ac34feb9..3d35e3e80c1 100644
--- a/fs/lockd/clntxdr.c
+++ b/fs/lockd/clntxdr.c
@@ -596,19 +596,19 @@ static struct rpc_procinfo nlm_procedures[] = {
PROC(GRANTED_RES, res, norep),
};
-static struct rpc_version nlm_version1 = {
+static const struct rpc_version nlm_version1 = {
.number = 1,
.nrprocs = ARRAY_SIZE(nlm_procedures),
.procs = nlm_procedures,
};
-static struct rpc_version nlm_version3 = {
+static const struct rpc_version nlm_version3 = {
.number = 3,
.nrprocs = ARRAY_SIZE(nlm_procedures),
.procs = nlm_procedures,
};
-static struct rpc_version *nlm_versions[] = {
+static const struct rpc_version *nlm_versions[] = {
[1] = &nlm_version1,
[3] = &nlm_version3,
#ifdef CONFIG_LOCKD_V4
@@ -618,7 +618,7 @@ static struct rpc_version *nlm_versions[] = {
static struct rpc_stat nlm_rpc_stats;
-struct rpc_program nlm_program = {
+const struct rpc_program nlm_program = {
.name = "lockd",
.number = NLM_PROGRAM,
.nrvers = ARRAY_SIZE(nlm_versions),
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 6f29836ec0c..eb75ca7c2d6 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -17,6 +17,8 @@
#include <linux/lockd/lockd.h>
#include <linux/mutex.h>
+#include <linux/sunrpc/svc_xprt.h>
+
#include <net/ipv6.h>
#define NLMDBG_FACILITY NLMDBG_HOSTCACHE
@@ -54,6 +56,7 @@ struct nlm_lookup_host_info {
const char *hostname; /* remote's hostname */
const size_t hostname_len; /* it's length */
const int noresvport; /* use non-priv port */
+ struct net *net; /* network namespace to bind */
};
/*
@@ -155,6 +158,7 @@ static struct nlm_host *nlm_alloc_host(struct nlm_lookup_host_info *ni,
INIT_LIST_HEAD(&host->h_reclaim);
host->h_nsmhandle = nsm;
host->h_addrbuf = nsm->sm_addrbuf;
+ host->net = ni->net;
out:
return host;
@@ -206,7 +210,8 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap,
const unsigned short protocol,
const u32 version,
const char *hostname,
- int noresvport)
+ int noresvport,
+ struct net *net)
{
struct nlm_lookup_host_info ni = {
.server = 0,
@@ -217,6 +222,7 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap,
.hostname = hostname,
.hostname_len = strlen(hostname),
.noresvport = noresvport,
+ .net = net,
};
struct hlist_head *chain;
struct hlist_node *pos;
@@ -231,6 +237,8 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap,
chain = &nlm_client_hosts[nlm_hash_address(sap)];
hlist_for_each_entry(host, pos, chain, h_hash) {
+ if (host->net != net)
+ continue;
if (!rpc_cmp_addr(nlm_addr(host), sap))
continue;
@@ -318,6 +326,7 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp,
struct nsm_handle *nsm = NULL;
struct sockaddr *src_sap = svc_daddr(rqstp);
size_t src_len = rqstp->rq_daddrlen;
+ struct net *net = rqstp->rq_xprt->xpt_net;
struct nlm_lookup_host_info ni = {
.server = 1,
.sap = svc_addr(rqstp),
@@ -326,6 +335,7 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp,
.version = rqstp->rq_vers,
.hostname = hostname,
.hostname_len = hostname_len,
+ .net = net,
};
dprintk("lockd: %s(host='%*s', vers=%u, proto=%s)\n", __func__,
@@ -339,6 +349,8 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp,
chain = &nlm_server_hosts[nlm_hash_address(ni.sap)];
hlist_for_each_entry(host, pos, chain, h_hash) {
+ if (host->net != net)
+ continue;
if (!rpc_cmp_addr(nlm_addr(host), ni.sap))
continue;
@@ -431,7 +443,7 @@ nlm_bind_host(struct nlm_host *host)
.to_retries = 5U,
};
struct rpc_create_args args = {
- .net = &init_net,
+ .net = host->net,
.protocol = host->h_proto,
.address = nlm_addr(host),
.addrsize = host->h_addrlen,
@@ -553,12 +565,8 @@ void nlm_host_rebooted(const struct nlm_reboot *info)
nsm_release(nsm);
}
-/*
- * Shut down the hosts module.
- * Note that this routine is called only at server shutdown time.
- */
void
-nlm_shutdown_hosts(void)
+nlm_shutdown_hosts_net(struct net *net)
{
struct hlist_head *chain;
struct hlist_node *pos;
@@ -570,6 +578,8 @@ nlm_shutdown_hosts(void)
/* First, make all hosts eligible for gc */
dprintk("lockd: nuking all hosts...\n");
for_each_host(host, pos, chain, nlm_server_hosts) {
+ if (net && host->net != net)
+ continue;
host->h_expires = jiffies - 1;
if (host->h_rpcclnt) {
rpc_shutdown_client(host->h_rpcclnt);
@@ -580,15 +590,29 @@ nlm_shutdown_hosts(void)
/* Then, perform a garbage collection pass */
nlm_gc_hosts();
mutex_unlock(&nlm_host_mutex);
+}
+
+/*
+ * Shut down the hosts module.
+ * Note that this routine is called only at server shutdown time.
+ */
+void
+nlm_shutdown_hosts(void)
+{
+ struct hlist_head *chain;
+ struct hlist_node *pos;
+ struct nlm_host *host;
+
+ nlm_shutdown_hosts_net(NULL);
/* complain if any hosts are left */
if (nrhosts != 0) {
printk(KERN_WARNING "lockd: couldn't shutdown host module!\n");
dprintk("lockd: %lu hosts left:\n", nrhosts);
for_each_host(host, pos, chain, nlm_server_hosts) {
- dprintk(" %s (cnt %d use %d exp %ld)\n",
+ dprintk(" %s (cnt %d use %d exp %ld net %p)\n",
host->h_name, atomic_read(&host->h_count),
- host->h_inuse, host->h_expires);
+ host->h_inuse, host->h_expires, host->net);
}
}
}
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 23d7451b293..7ef14b3c5be 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -47,7 +47,7 @@ struct nsm_res {
u32 state;
};
-static struct rpc_program nsm_program;
+static const struct rpc_program nsm_program;
static LIST_HEAD(nsm_handles);
static DEFINE_SPINLOCK(nsm_lock);
@@ -55,21 +55,21 @@ static DEFINE_SPINLOCK(nsm_lock);
* Local NSM state
*/
u32 __read_mostly nsm_local_state;
-int __read_mostly nsm_use_hostnames;
+bool __read_mostly nsm_use_hostnames;
static inline struct sockaddr *nsm_addr(const struct nsm_handle *nsm)
{
return (struct sockaddr *)&nsm->sm_addr;
}
-static struct rpc_clnt *nsm_create(void)
+static struct rpc_clnt *nsm_create(struct net *net)
{
struct sockaddr_in sin = {
.sin_family = AF_INET,
.sin_addr.s_addr = htonl(INADDR_LOOPBACK),
};
struct rpc_create_args args = {
- .net = &init_net,
+ .net = net,
.protocol = XPRT_TRANSPORT_UDP,
.address = (struct sockaddr *)&sin,
.addrsize = sizeof(sin),
@@ -83,7 +83,8 @@ static struct rpc_clnt *nsm_create(void)
return rpc_create(&args);
}
-static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res)
+static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res,
+ struct net *net)
{
struct rpc_clnt *clnt;
int status;
@@ -99,7 +100,7 @@ static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res)
.rpc_resp = res,
};
- clnt = nsm_create();
+ clnt = nsm_create(net);
if (IS_ERR(clnt)) {
status = PTR_ERR(clnt);
dprintk("lockd: failed to create NSM upcall transport, "
@@ -149,7 +150,7 @@ int nsm_monitor(const struct nlm_host *host)
*/
nsm->sm_mon_name = nsm_use_hostnames ? nsm->sm_name : nsm->sm_addrbuf;
- status = nsm_mon_unmon(nsm, NSMPROC_MON, &res);
+ status = nsm_mon_unmon(nsm, NSMPROC_MON, &res, host->net);
if (unlikely(res.status != 0))
status = -EIO;
if (unlikely(status < 0)) {
@@ -183,7 +184,7 @@ void nsm_unmonitor(const struct nlm_host *host)
&& nsm->sm_monitored && !nsm->sm_sticky) {
dprintk("lockd: nsm_unmonitor(%s)\n", nsm->sm_name);
- status = nsm_mon_unmon(nsm, NSMPROC_UNMON, &res);
+ status = nsm_mon_unmon(nsm, NSMPROC_UNMON, &res, host->net);
if (res.status != 0)
status = -EIO;
if (status < 0)
@@ -534,19 +535,19 @@ static struct rpc_procinfo nsm_procedures[] = {
},
};
-static struct rpc_version nsm_version1 = {
+static const struct rpc_version nsm_version1 = {
.number = 1,
.nrprocs = ARRAY_SIZE(nsm_procedures),
.procs = nsm_procedures
};
-static struct rpc_version * nsm_version[] = {
+static const struct rpc_version *nsm_version[] = {
[1] = &nsm_version1,
};
static struct rpc_stat nsm_stats;
-static struct rpc_program nsm_program = {
+static const struct rpc_program nsm_program = {
.name = "statd",
.number = NSM_PROGRAM,
.nrvers = ARRAY_SIZE(nsm_version),
diff --git a/fs/lockd/netns.h b/fs/lockd/netns.h
new file mode 100644
index 00000000000..ce227e0fbc5
--- /dev/null
+++ b/fs/lockd/netns.h
@@ -0,0 +1,12 @@
+#ifndef __LOCKD_NETNS_H__
+#define __LOCKD_NETNS_H__
+
+#include <net/netns/generic.h>
+
+struct lockd_net {
+ unsigned int nlmsvc_users;
+};
+
+extern int lockd_net_id;
+
+#endif
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index c061b9aa7dd..f49b9afc443 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -35,6 +35,8 @@
#include <linux/lockd/lockd.h>
#include <linux/nfs.h>
+#include "netns.h"
+
#define NLMDBG_FACILITY NLMDBG_SVC
#define LOCKD_BUFSIZE (1024 + NLMSVC_XDRSIZE)
#define ALLOWED_SIGS (sigmask(SIGKILL))
@@ -50,6 +52,8 @@ static struct task_struct *nlmsvc_task;
static struct svc_rqst *nlmsvc_rqst;
unsigned long nlmsvc_timeout;
+int lockd_net_id;
+
/*
* These can be set at insmod time (useful for NFS as root filesystem),
* and also changed through the sysctl interface. -- Jamie Lokier, Aug 2003
@@ -189,27 +193,29 @@ lockd(void *vrqstp)
}
static int create_lockd_listener(struct svc_serv *serv, const char *name,
- const int family, const unsigned short port)
+ struct net *net, const int family,
+ const unsigned short port)
{
struct svc_xprt *xprt;
- xprt = svc_find_xprt(serv, name, family, 0);
+ xprt = svc_find_xprt(serv, name, net, family, 0);
if (xprt == NULL)
- return svc_create_xprt(serv, name, &init_net, family, port,
+ return svc_create_xprt(serv, name, net, family, port,
SVC_SOCK_DEFAULTS);
svc_xprt_put(xprt);
return 0;
}
-static int create_lockd_family(struct svc_serv *serv, const int family)
+static int create_lockd_family(struct svc_serv *serv, struct net *net,
+ const int family)
{
int err;
- err = create_lockd_listener(serv, "udp", family, nlm_udpport);
+ err = create_lockd_listener(serv, "udp", net, family, nlm_udpport);
if (err < 0)
return err;
- return create_lockd_listener(serv, "tcp", family, nlm_tcpport);
+ return create_lockd_listener(serv, "tcp", net, family, nlm_tcpport);
}
/*
@@ -222,16 +228,16 @@ static int create_lockd_family(struct svc_serv *serv, const int family)
* Returns zero if all listeners are available; otherwise a
* negative errno value is returned.
*/
-static int make_socks(struct svc_serv *serv)
+static int make_socks(struct svc_serv *serv, struct net *net)
{
static int warned;
int err;
- err = create_lockd_family(serv, PF_INET);
+ err = create_lockd_family(serv, net, PF_INET);
if (err < 0)
goto out_err;
- err = create_lockd_family(serv, PF_INET6);
+ err = create_lockd_family(serv, net, PF_INET6);
if (err < 0 && err != -EAFNOSUPPORT)
goto out_err;
@@ -245,6 +251,47 @@ out_err:
return err;
}
+static int lockd_up_net(struct net *net)
+{
+ struct lockd_net *ln = net_generic(net, lockd_net_id);
+ struct svc_serv *serv = nlmsvc_rqst->rq_server;
+ int error;
+
+ if (ln->nlmsvc_users)
+ return 0;
+
+ error = svc_rpcb_setup(serv, net);
+ if (error)
+ goto err_rpcb;
+
+ error = make_socks(serv, net);
+ if (error < 0)
+ goto err_socks;
+ return 0;
+
+err_socks:
+ svc_rpcb_cleanup(serv, net);
+err_rpcb:
+ return error;
+}
+
+static void lockd_down_net(struct net *net)
+{
+ struct lockd_net *ln = net_generic(net, lockd_net_id);
+ struct svc_serv *serv = nlmsvc_rqst->rq_server;
+
+ if (ln->nlmsvc_users) {
+ if (--ln->nlmsvc_users == 0) {
+ nlm_shutdown_hosts_net(net);
+ svc_shutdown_net(serv, net);
+ }
+ } else {
+ printk(KERN_ERR "lockd_down_net: no users! task=%p, net=%p\n",
+ nlmsvc_task, net);
+ BUG();
+ }
+}
+
/*
* Bring up the lockd process if it's not already up.
*/
@@ -252,13 +299,16 @@ int lockd_up(void)
{
struct svc_serv *serv;
int error = 0;
+ struct net *net = current->nsproxy->net_ns;
mutex_lock(&nlmsvc_mutex);
/*
* Check whether we're already up and running.
*/
- if (nlmsvc_rqst)
+ if (nlmsvc_rqst) {
+ error = lockd_up_net(net);
goto out;
+ }
/*
* Sanity check: if there's no pid,
@@ -275,7 +325,7 @@ int lockd_up(void)
goto out;
}
- error = make_socks(serv);
+ error = make_socks(serv, net);
if (error < 0)
goto destroy_and_out;
@@ -313,8 +363,12 @@ int lockd_up(void)
destroy_and_out:
svc_destroy(serv);
out:
- if (!error)
+ if (!error) {
+ struct lockd_net *ln = net_generic(net, lockd_net_id);
+
+ ln->nlmsvc_users++;
nlmsvc_users++;
+ }
mutex_unlock(&nlmsvc_mutex);
return error;
}
@@ -328,8 +382,10 @@ lockd_down(void)
{
mutex_lock(&nlmsvc_mutex);
if (nlmsvc_users) {
- if (--nlmsvc_users)
+ if (--nlmsvc_users) {
+ lockd_down_net(current->nsproxy->net_ns);
goto out;
+ }
} else {
printk(KERN_ERR "lockd_down: no users! task=%p\n",
nlmsvc_task);
@@ -440,7 +496,7 @@ static int param_set_##name(const char *val, struct kernel_param *kp) \
__typeof__(type) num = which_strtol(val, &endp, 0); \
if (endp == val || *endp || num < (min) || num > (max)) \
return -EINVAL; \
- *((int *) kp->arg) = num; \
+ *((type *) kp->arg) = num; \
return 0; \
}
@@ -497,24 +553,55 @@ module_param_call(nlm_tcpport, param_set_port, param_get_int,
module_param(nsm_use_hostnames, bool, 0644);
module_param(nlm_max_connections, uint, 0644);
+static int lockd_init_net(struct net *net)
+{
+ return 0;
+}
+
+static void lockd_exit_net(struct net *net)
+{
+}
+
+static struct pernet_operations lockd_net_ops = {
+ .init = lockd_init_net,
+ .exit = lockd_exit_net,
+ .id = &lockd_net_id,
+ .size = sizeof(struct lockd_net),
+};
+
+
/*
* Initialising and terminating the module.
*/
static int __init init_nlm(void)
{
+ int err;
+
#ifdef CONFIG_SYSCTL
+ err = -ENOMEM;
nlm_sysctl_table = register_sysctl_table(nlm_sysctl_root);
- return nlm_sysctl_table ? 0 : -ENOMEM;
-#else
+ if (nlm_sysctl_table == NULL)
+ goto err_sysctl;
+#endif
+ err = register_pernet_subsys(&lockd_net_ops);
+ if (err)
+ goto err_pernet;
return 0;
+
+err_pernet:
+#ifdef CONFIG_SYSCTL
+ unregister_sysctl_table(nlm_sysctl_table);
#endif
+err_sysctl:
+ return err;
}
static void __exit exit_nlm(void)
{
/* FIXME: delete all NLM clients */
nlm_shutdown_hosts();
+ unregister_pernet_subsys(&lockd_net_ops);
#ifdef CONFIG_SYSCTL
unregister_sysctl_table(nlm_sysctl_table);
#endif
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index f0179c3745d..e46353f41a4 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -46,7 +46,6 @@ static void nlmsvc_remove_block(struct nlm_block *block);
static int nlmsvc_setgrantargs(struct nlm_rqst *call, struct nlm_lock *lock);
static void nlmsvc_freegrantargs(struct nlm_rqst *call);
static const struct rpc_call_ops nlmsvc_grant_ops;
-static const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie);
/*
* The list of blocked locks to retry
@@ -54,6 +53,35 @@ static const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie);
static LIST_HEAD(nlm_blocked);
static DEFINE_SPINLOCK(nlm_blocked_lock);
+#ifdef LOCKD_DEBUG
+static const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie)
+{
+ /*
+ * We can get away with a static buffer because we're only
+ * called with BKL held.
+ */
+ static char buf[2*NLM_MAXCOOKIELEN+1];
+ unsigned int i, len = sizeof(buf);
+ char *p = buf;
+
+ len--; /* allow for trailing \0 */
+ if (len < 3)
+ return "???";
+ for (i = 0 ; i < cookie->len ; i++) {
+ if (len < 2) {
+ strcpy(p-3, "...");
+ break;
+ }
+ sprintf(p, "%02x", cookie->data[i]);
+ p += 2;
+ len -= 2;
+ }
+ *p = '\0';
+
+ return buf;
+}
+#endif
+
/*
* Insert a blocked lock into the global list
*/
@@ -935,32 +963,3 @@ nlmsvc_retry_blocked(void)
return timeout;
}
-
-#ifdef RPC_DEBUG
-static const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie)
-{
- /*
- * We can get away with a static buffer because we're only
- * called with BKL held.
- */
- static char buf[2*NLM_MAXCOOKIELEN+1];
- unsigned int i, len = sizeof(buf);
- char *p = buf;
-
- len--; /* allow for trailing \0 */
- if (len < 3)
- return "???";
- for (i = 0 ; i < cookie->len ; i++) {
- if (len < 2) {
- strcpy(p-3, "...");
- break;
- }
- sprintf(p, "%02x", cookie->data[i]);
- p += 2;
- len -= 2;
- }
- *p = '\0';
-
- return buf;
-}
-#endif
diff --git a/fs/locks.c b/fs/locks.c
index 637694bf3a0..0d68f1f8179 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -510,12 +510,13 @@ static void __locks_delete_block(struct file_lock *waiter)
/*
*/
-static void locks_delete_block(struct file_lock *waiter)
+void locks_delete_block(struct file_lock *waiter)
{
lock_flocks();
__locks_delete_block(waiter);
unlock_flocks();
}
+EXPORT_SYMBOL(locks_delete_block);
/* Insert waiter into blocker's block list.
* We use a circular list so that processes can be easily woken up in
diff --git a/fs/logfs/dev_mtd.c b/fs/logfs/dev_mtd.c
index e97404d611e..9c501449450 100644
--- a/fs/logfs/dev_mtd.c
+++ b/fs/logfs/dev_mtd.c
@@ -152,9 +152,6 @@ static struct page *logfs_mtd_find_first_sb(struct super_block *sb, u64 *ofs)
filler_t *filler = logfs_mtd_readpage;
struct mtd_info *mtd = super->s_mtd;
- if (!mtd_can_have_bb(mtd))
- return NULL;
-
*ofs = 0;
while (mtd_block_isbad(mtd, *ofs)) {
*ofs += mtd->erasesize;
@@ -172,9 +169,6 @@ static struct page *logfs_mtd_find_last_sb(struct super_block *sb, u64 *ofs)
filler_t *filler = logfs_mtd_readpage;
struct mtd_info *mtd = super->s_mtd;
- if (!mtd_can_have_bb(mtd))
- return NULL;
-
*ofs = mtd->size - mtd->erasesize;
while (mtd_block_isbad(mtd, *ofs)) {
*ofs -= mtd->erasesize;
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index 501043e8966..bea5d1b9954 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -71,7 +71,7 @@ static int write_dir(struct inode *dir, struct logfs_disk_dentry *dd,
static int write_inode(struct inode *inode)
{
- return __logfs_write_inode(inode, WF_LOCK);
+ return __logfs_write_inode(inode, NULL, WF_LOCK);
}
static s64 dir_seek_data(struct inode *inode, s64 pos)
@@ -177,17 +177,17 @@ static struct page *logfs_get_dd_page(struct inode *dir, struct dentry *dentry)
(filler_t *)logfs_readpage, NULL);
if (IS_ERR(page))
return page;
- dd = kmap_atomic(page, KM_USER0);
+ dd = kmap_atomic(page);
BUG_ON(dd->namelen == 0);
if (name->len != be16_to_cpu(dd->namelen) ||
memcmp(name->name, dd->name, name->len)) {
- kunmap_atomic(dd, KM_USER0);
+ kunmap_atomic(dd);
page_cache_release(page);
continue;
}
- kunmap_atomic(dd, KM_USER0);
+ kunmap_atomic(dd);
return page;
}
return NULL;
@@ -365,9 +365,9 @@ static struct dentry *logfs_lookup(struct inode *dir, struct dentry *dentry,
return NULL;
}
index = page->index;
- dd = kmap_atomic(page, KM_USER0);
+ dd = kmap_atomic(page);
ino = be64_to_cpu(dd->ino);
- kunmap_atomic(dd, KM_USER0);
+ kunmap_atomic(dd);
page_cache_release(page);
inode = logfs_iget(dir->i_sb, ino);
@@ -402,12 +402,12 @@ static int logfs_write_dir(struct inode *dir, struct dentry *dentry,
if (!page)
return -ENOMEM;
- dd = kmap_atomic(page, KM_USER0);
+ dd = kmap_atomic(page);
memset(dd, 0, sizeof(*dd));
dd->ino = cpu_to_be64(inode->i_ino);
dd->type = logfs_type(inode);
logfs_set_name(dd, &dentry->d_name);
- kunmap_atomic(dd, KM_USER0);
+ kunmap_atomic(dd);
err = logfs_write_buf(dir, page, WF_LOCK);
unlock_page(page);
@@ -558,9 +558,6 @@ static int logfs_link(struct dentry *old_dentry, struct inode *dir,
{
struct inode *inode = old_dentry->d_inode;
- if (inode->i_nlink >= LOGFS_LINK_MAX)
- return -EMLINK;
-
inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
ihold(inode);
inc_nlink(inode);
@@ -579,9 +576,9 @@ static int logfs_get_dd(struct inode *dir, struct dentry *dentry,
if (IS_ERR(page))
return PTR_ERR(page);
*pos = page->index;
- map = kmap_atomic(page, KM_USER0);
+ map = kmap_atomic(page);
memcpy(dd, map, sizeof(*dd));
- kunmap_atomic(map, KM_USER0);
+ kunmap_atomic(map);
page_cache_release(page);
return 0;
}
diff --git a/fs/logfs/file.c b/fs/logfs/file.c
index b548c87a86f..3886cded283 100644
--- a/fs/logfs/file.c
+++ b/fs/logfs/file.c
@@ -230,7 +230,9 @@ int logfs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
return ret;
mutex_lock(&inode->i_mutex);
+ logfs_get_wblocks(sb, NULL, WF_LOCK);
logfs_write_anchor(sb);
+ logfs_put_wblocks(sb, NULL, WF_LOCK);
mutex_unlock(&inode->i_mutex);
return 0;
diff --git a/fs/logfs/gc.c b/fs/logfs/gc.c
index caa4419285d..d4efb061bdc 100644
--- a/fs/logfs/gc.c
+++ b/fs/logfs/gc.c
@@ -367,7 +367,7 @@ static struct gc_candidate *get_candidate(struct super_block *sb)
int i, max_dist;
struct gc_candidate *cand = NULL, *this;
- max_dist = min(no_free_segments(sb), LOGFS_NO_AREAS);
+ max_dist = min(no_free_segments(sb), LOGFS_NO_AREAS - 1);
for (i = max_dist; i >= 0; i--) {
this = first_in_list(&super->s_low_list[i]);
diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
index 388df1aa35e..a422f42238b 100644
--- a/fs/logfs/inode.c
+++ b/fs/logfs/inode.c
@@ -286,7 +286,7 @@ static int logfs_write_inode(struct inode *inode, struct writeback_control *wbc)
if (logfs_inode(inode)->li_flags & LOGFS_IF_STILLBORN)
return 0;
- ret = __logfs_write_inode(inode, flags);
+ ret = __logfs_write_inode(inode, NULL, flags);
LOGFS_BUG_ON(ret, inode->i_sb);
return ret;
}
@@ -363,7 +363,9 @@ static void logfs_init_once(void *_li)
static int logfs_sync_fs(struct super_block *sb, int wait)
{
+ logfs_get_wblocks(sb, NULL, WF_LOCK);
logfs_write_anchor(sb);
+ logfs_put_wblocks(sb, NULL, WF_LOCK);
return 0;
}
diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c
index 9da29706f91..1e1c369df22 100644
--- a/fs/logfs/journal.c
+++ b/fs/logfs/journal.c
@@ -612,7 +612,6 @@ static size_t __logfs_write_je(struct super_block *sb, void *buf, u16 type,
if (len == 0)
return logfs_write_header(super, header, 0, type);
- BUG_ON(len > sb->s_blocksize);
compr_len = logfs_compress(buf, data, len, sb->s_blocksize);
if (compr_len < 0 || type == JE_ANCHOR) {
memcpy(data, buf, len);
diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h
index 926373866a5..5f093760946 100644
--- a/fs/logfs/logfs.h
+++ b/fs/logfs/logfs.h
@@ -528,7 +528,7 @@ void logfs_destroy_inode_cache(void);
void logfs_set_blocks(struct inode *inode, u64 no);
/* these logically belong into inode.c but actually reside in readwrite.c */
int logfs_read_inode(struct inode *inode);
-int __logfs_write_inode(struct inode *inode, long flags);
+int __logfs_write_inode(struct inode *inode, struct page *, long flags);
void logfs_evict_inode(struct inode *inode);
/* journal.c */
@@ -577,6 +577,8 @@ void initialize_block_counters(struct page *page, struct logfs_block *block,
__be64 *array, int page_is_empty);
int logfs_exist_block(struct inode *inode, u64 bix);
int get_page_reserve(struct inode *inode, struct page *page);
+void logfs_get_wblocks(struct super_block *sb, struct page *page, int lock);
+void logfs_put_wblocks(struct super_block *sb, struct page *page, int lock);
extern struct logfs_block_ops indirect_block_ops;
/* segment.c */
@@ -594,6 +596,7 @@ int logfs_init_mapping(struct super_block *sb);
void logfs_sync_area(struct logfs_area *area);
void logfs_sync_segments(struct super_block *sb);
void freeseg(struct super_block *sb, u32 segno);
+void free_areas(struct super_block *sb);
/* area handling */
int logfs_init_areas(struct super_block *sb);
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
index 2ac4217b790..e3ab5e5a904 100644
--- a/fs/logfs/readwrite.c
+++ b/fs/logfs/readwrite.c
@@ -244,8 +244,7 @@ static void preunlock_page(struct super_block *sb, struct page *page, int lock)
* is waiting for s_write_mutex. We annotate this fact by setting PG_pre_locked
* in addition to PG_locked.
*/
-static void logfs_get_wblocks(struct super_block *sb, struct page *page,
- int lock)
+void logfs_get_wblocks(struct super_block *sb, struct page *page, int lock)
{
struct logfs_super *super = logfs_super(sb);
@@ -260,8 +259,7 @@ static void logfs_get_wblocks(struct super_block *sb, struct page *page,
}
}
-static void logfs_put_wblocks(struct super_block *sb, struct page *page,
- int lock)
+void logfs_put_wblocks(struct super_block *sb, struct page *page, int lock)
{
struct logfs_super *super = logfs_super(sb);
@@ -424,7 +422,7 @@ static void inode_write_block(struct logfs_block *block)
if (inode->i_ino == LOGFS_INO_MASTER)
logfs_write_anchor(inode->i_sb);
else {
- ret = __logfs_write_inode(inode, 0);
+ ret = __logfs_write_inode(inode, NULL, 0);
/* see indirect_write_block comment */
BUG_ON(ret);
}
@@ -519,9 +517,9 @@ static int indirect_write_alias(struct super_block *sb,
ino = page->mapping->host->i_ino;
logfs_unpack_index(page->index, &bix, &level);
- child = kmap_atomic(page, KM_USER0);
+ child = kmap_atomic(page);
val = child[pos];
- kunmap_atomic(child, KM_USER0);
+ kunmap_atomic(child);
err = write_one_alias(sb, ino, bix, level, pos, val);
if (err)
return err;
@@ -560,8 +558,13 @@ static void inode_free_block(struct super_block *sb, struct logfs_block *block)
static void indirect_free_block(struct super_block *sb,
struct logfs_block *block)
{
- ClearPagePrivate(block->page);
- block->page->private = 0;
+ struct page *page = block->page;
+
+ if (PagePrivate(page)) {
+ ClearPagePrivate(page);
+ page_cache_release(page);
+ set_page_private(page, 0);
+ }
__free_block(sb, block);
}
@@ -650,8 +653,11 @@ static void alloc_data_block(struct inode *inode, struct page *page)
logfs_unpack_index(page->index, &bix, &level);
block = __alloc_block(inode->i_sb, inode->i_ino, bix, level);
block->page = page;
+
SetPagePrivate(page);
- page->private = (unsigned long)block;
+ page_cache_get(page);
+ set_page_private(page, (unsigned long) block);
+
block->ops = &indirect_block_ops;
}
@@ -667,9 +673,9 @@ static void alloc_indirect_block(struct inode *inode, struct page *page,
alloc_data_block(inode, page);
block = logfs_block(page);
- array = kmap_atomic(page, KM_USER0);
+ array = kmap_atomic(page);
initialize_block_counters(page, block, array, page_is_empty);
- kunmap_atomic(array, KM_USER0);
+ kunmap_atomic(array);
}
static void block_set_pointer(struct page *page, int index, u64 ptr)
@@ -679,10 +685,10 @@ static void block_set_pointer(struct page *page, int index, u64 ptr)
u64 oldptr;
BUG_ON(!block);
- array = kmap_atomic(page, KM_USER0);
+ array = kmap_atomic(page);
oldptr = be64_to_cpu(array[index]);
array[index] = cpu_to_be64(ptr);
- kunmap_atomic(array, KM_USER0);
+ kunmap_atomic(array);
SetPageUptodate(page);
block->full += !!(ptr & LOGFS_FULLY_POPULATED)
@@ -695,9 +701,9 @@ static u64 block_get_pointer(struct page *page, int index)
__be64 *block;
u64 ptr;
- block = kmap_atomic(page, KM_USER0);
+ block = kmap_atomic(page);
ptr = be64_to_cpu(block[index]);
- kunmap_atomic(block, KM_USER0);
+ kunmap_atomic(block);
return ptr;
}
@@ -844,7 +850,7 @@ static u64 seek_holedata_loop(struct inode *inode, u64 bix, int data)
}
slot = get_bits(bix, SUBLEVEL(level));
- rblock = kmap_atomic(page, KM_USER0);
+ rblock = kmap_atomic(page);
while (slot < LOGFS_BLOCK_FACTOR) {
if (data && (rblock[slot] != 0))
break;
@@ -855,12 +861,12 @@ static u64 seek_holedata_loop(struct inode *inode, u64 bix, int data)
bix &= ~(increment - 1);
}
if (slot >= LOGFS_BLOCK_FACTOR) {
- kunmap_atomic(rblock, KM_USER0);
+ kunmap_atomic(rblock);
logfs_put_read_page(page);
return bix;
}
bofs = be64_to_cpu(rblock[slot]);
- kunmap_atomic(rblock, KM_USER0);
+ kunmap_atomic(rblock);
logfs_put_read_page(page);
if (!bofs) {
BUG_ON(data);
@@ -1570,11 +1576,15 @@ int logfs_write_buf(struct inode *inode, struct page *page, long flags)
static int __logfs_delete(struct inode *inode, struct page *page)
{
long flags = WF_DELETE;
+ int err;
inode->i_ctime = inode->i_mtime = CURRENT_TIME;
if (page->index < I0_BLOCKS)
return logfs_write_direct(inode, page, flags);
+ err = grow_inode(inode, page->index, 0);
+ if (err)
+ return err;
return logfs_write_rec(inode, page, page->index, 0, flags);
}
@@ -1623,7 +1633,7 @@ int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs,
if (inode->i_ino == LOGFS_INO_MASTER)
logfs_write_anchor(inode->i_sb);
else {
- err = __logfs_write_inode(inode, flags);
+ err = __logfs_write_inode(inode, page, flags);
}
}
}
@@ -1873,7 +1883,7 @@ int logfs_truncate(struct inode *inode, u64 target)
logfs_get_wblocks(sb, NULL, 1);
err = __logfs_truncate(inode, size);
if (!err)
- err = __logfs_write_inode(inode, 0);
+ err = __logfs_write_inode(inode, NULL, 0);
logfs_put_wblocks(sb, NULL, 1);
}
@@ -1901,8 +1911,11 @@ static void move_page_to_inode(struct inode *inode, struct page *page)
li->li_block = block;
block->page = NULL;
- page->private = 0;
- ClearPagePrivate(page);
+ if (PagePrivate(page)) {
+ ClearPagePrivate(page);
+ page_cache_release(page);
+ set_page_private(page, 0);
+ }
}
static void move_inode_to_page(struct page *page, struct inode *inode)
@@ -1918,8 +1931,12 @@ static void move_inode_to_page(struct page *page, struct inode *inode)
BUG_ON(PagePrivate(page));
block->ops = &indirect_block_ops;
block->page = page;
- page->private = (unsigned long)block;
- SetPagePrivate(page);
+
+ if (!PagePrivate(page)) {
+ SetPagePrivate(page);
+ page_cache_get(page);
+ set_page_private(page, (unsigned long) block);
+ }
block->inode = NULL;
li->li_block = NULL;
@@ -1944,9 +1961,9 @@ int logfs_read_inode(struct inode *inode)
if (IS_ERR(page))
return PTR_ERR(page);
- di = kmap_atomic(page, KM_USER0);
+ di = kmap_atomic(page);
logfs_disk_to_inode(di, inode);
- kunmap_atomic(di, KM_USER0);
+ kunmap_atomic(di);
move_page_to_inode(inode, page);
page_cache_release(page);
return 0;
@@ -1965,9 +1982,9 @@ static struct page *inode_to_page(struct inode *inode)
if (!page)
return NULL;
- di = kmap_atomic(page, KM_USER0);
+ di = kmap_atomic(page);
logfs_inode_to_disk(inode, di);
- kunmap_atomic(di, KM_USER0);
+ kunmap_atomic(di);
move_inode_to_page(page, inode);
return page;
}
@@ -2024,13 +2041,13 @@ static void logfs_mod_segment_entry(struct super_block *sb, u32 segno,
if (write)
alloc_indirect_block(inode, page, 0);
- se = kmap_atomic(page, KM_USER0);
+ se = kmap_atomic(page);
change_se(se + child_no, arg);
if (write) {
logfs_set_alias(sb, logfs_block(page), child_no);
BUG_ON((int)be32_to_cpu(se[child_no].valid) > super->s_segsize);
}
- kunmap_atomic(se, KM_USER0);
+ kunmap_atomic(se);
logfs_put_write_page(page);
}
@@ -2106,14 +2123,14 @@ void logfs_set_segment_unreserved(struct super_block *sb, u32 segno, u32 ec)
ec_level);
}
-int __logfs_write_inode(struct inode *inode, long flags)
+int __logfs_write_inode(struct inode *inode, struct page *page, long flags)
{
struct super_block *sb = inode->i_sb;
int ret;
- logfs_get_wblocks(sb, NULL, flags & WF_LOCK);
+ logfs_get_wblocks(sb, page, flags & WF_LOCK);
ret = do_write_inode(inode);
- logfs_put_wblocks(sb, NULL, flags & WF_LOCK);
+ logfs_put_wblocks(sb, page, flags & WF_LOCK);
return ret;
}
@@ -2228,10 +2245,10 @@ int logfs_inode_write(struct inode *inode, const void *buf, size_t count,
if (!page)
return -ENOMEM;
- pagebuf = kmap_atomic(page, KM_USER0);
+ pagebuf = kmap_atomic(page);
memcpy(pagebuf, buf, count);
flush_dcache_page(page);
- kunmap_atomic(pagebuf, KM_USER0);
+ kunmap_atomic(pagebuf);
if (i_size_read(inode) < pos + LOGFS_BLOCKSIZE)
i_size_write(inode, pos + LOGFS_BLOCKSIZE);
diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c
index 9d518735325..e28d090c98d 100644
--- a/fs/logfs/segment.c
+++ b/fs/logfs/segment.c
@@ -86,7 +86,11 @@ int __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
BUG_ON(!page); /* FIXME: reserve a pool */
SetPageUptodate(page);
memcpy(page_address(page) + offset, buf, copylen);
- SetPagePrivate(page);
+
+ if (!PagePrivate(page)) {
+ SetPagePrivate(page);
+ page_cache_get(page);
+ }
page_cache_release(page);
buf += copylen;
@@ -110,7 +114,10 @@ static void pad_partial_page(struct logfs_area *area)
page = get_mapping_page(sb, index, 0);
BUG_ON(!page); /* FIXME: reserve a pool */
memset(page_address(page) + offset, 0xff, len);
- SetPagePrivate(page);
+ if (!PagePrivate(page)) {
+ SetPagePrivate(page);
+ page_cache_get(page);
+ }
page_cache_release(page);
}
}
@@ -130,7 +137,10 @@ static void pad_full_pages(struct logfs_area *area)
BUG_ON(!page); /* FIXME: reserve a pool */
SetPageUptodate(page);
memset(page_address(page), 0xff, PAGE_CACHE_SIZE);
- SetPagePrivate(page);
+ if (!PagePrivate(page)) {
+ SetPagePrivate(page);
+ page_cache_get(page);
+ }
page_cache_release(page);
index++;
no_indizes--;
@@ -485,8 +495,12 @@ static void move_btree_to_page(struct inode *inode, struct page *page,
mempool_free(item, super->s_alias_pool);
}
block->page = page;
- SetPagePrivate(page);
- page->private = (unsigned long)block;
+
+ if (!PagePrivate(page)) {
+ SetPagePrivate(page);
+ page_cache_get(page);
+ set_page_private(page, (unsigned long) block);
+ }
block->ops = &indirect_block_ops;
initialize_block_counters(page, block, data, 0);
}
@@ -529,15 +543,19 @@ void move_page_to_btree(struct page *page)
BUG_ON(!item); /* mempool empty */
memset(item, 0, sizeof(*item));
- child = kmap_atomic(page, KM_USER0);
+ child = kmap_atomic(page);
item->val = child[pos];
- kunmap_atomic(child, KM_USER0);
+ kunmap_atomic(child);
item->child_no = pos;
list_add(&item->list, &block->item_list);
}
block->page = NULL;
- ClearPagePrivate(page);
- page->private = 0;
+
+ if (PagePrivate(page)) {
+ ClearPagePrivate(page);
+ page_cache_release(page);
+ set_page_private(page, 0);
+ }
block->ops = &btree_block_ops;
err = alias_tree_insert(block->sb, block->ino, block->bix, block->level,
block);
@@ -702,7 +720,10 @@ void freeseg(struct super_block *sb, u32 segno)
page = find_get_page(mapping, ofs >> PAGE_SHIFT);
if (!page)
continue;
- ClearPagePrivate(page);
+ if (PagePrivate(page)) {
+ ClearPagePrivate(page);
+ page_cache_release(page);
+ }
page_cache_release(page);
}
}
@@ -841,6 +862,16 @@ static void free_area(struct logfs_area *area)
kfree(area);
}
+void free_areas(struct super_block *sb)
+{
+ struct logfs_super *super = logfs_super(sb);
+ int i;
+
+ for_each_area(i)
+ free_area(super->s_area[i]);
+ free_area(super->s_journal_area);
+}
+
static struct logfs_area *alloc_area(struct super_block *sb)
{
struct logfs_area *area;
@@ -923,10 +954,6 @@ err:
void logfs_cleanup_areas(struct super_block *sb)
{
struct logfs_super *super = logfs_super(sb);
- int i;
btree_grim_visitor128(&super->s_object_alias_tree, 0, kill_alias);
- for_each_area(i)
- free_area(super->s_area[i]);
- free_area(super->s_journal_area);
}
diff --git a/fs/logfs/super.c b/fs/logfs/super.c
index e795c234ea3..97bca623d89 100644
--- a/fs/logfs/super.c
+++ b/fs/logfs/super.c
@@ -315,11 +315,9 @@ static int logfs_get_sb_final(struct super_block *sb)
if (IS_ERR(rootdir))
goto fail;
- sb->s_root = d_alloc_root(rootdir);
- if (!sb->s_root) {
- iput(rootdir);
+ sb->s_root = d_make_root(rootdir);
+ if (!sb->s_root)
goto fail;
- }
/* at that point we know that ->put_super() will be called */
super->s_erase_page = alloc_pages(GFP_KERNEL, 0);
@@ -486,14 +484,15 @@ static void logfs_kill_sb(struct super_block *sb)
/* Alias entries slow down mount, so evict as many as possible */
sync_filesystem(sb);
logfs_write_anchor(sb);
+ free_areas(sb);
/*
* From this point on alias entries are simply dropped - and any
* writes to the object store are considered bugs.
*/
- super->s_flags |= LOGFS_SB_FLAG_SHUTDOWN;
log_super("LogFS: Now in shutdown\n");
generic_shutdown_super(sb);
+ super->s_flags |= LOGFS_SB_FLAG_SHUTDOWN;
BUG_ON(super->s_dirty_used_bytes || super->s_dirty_free_bytes);
@@ -541,6 +540,7 @@ static struct dentry *logfs_get_sb_device(struct logfs_super *super,
* the filesystem incompatible with 32bit systems.
*/
sb->s_maxbytes = (1ull << 43) - 1;
+ sb->s_max_links = LOGFS_LINK_MAX;
sb->s_op = &logfs_super_operations;
sb->s_flags = flags | MS_NOATIME;
@@ -626,7 +626,10 @@ static int __init logfs_init(void)
if (ret)
goto out2;
- return register_filesystem(&logfs_fs_type);
+ ret = register_filesystem(&logfs_fs_type);
+ if (!ret)
+ return 0;
+ logfs_destroy_inode_cache();
out2:
logfs_compr_exit();
out1:
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index 085a9262c69..685b2d981b8 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -335,7 +335,7 @@ int minix_make_empty(struct inode *inode, struct inode *dir)
goto fail;
}
- kaddr = kmap_atomic(page, KM_USER0);
+ kaddr = kmap_atomic(page);
memset(kaddr, 0, PAGE_CACHE_SIZE);
if (sbi->s_version == MINIX_V3) {
@@ -355,7 +355,7 @@ int minix_make_empty(struct inode *inode, struct inode *dir)
de->inode = dir->i_ino;
strcpy(de->name, "..");
}
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
err = dir_commit_chunk(page, 0, 2 * sbi->s_dirsize);
fail:
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index fa8b612b8ce..fcb05d2c6b5 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -190,24 +190,24 @@ static int minix_fill_super(struct super_block *s, void *data, int silent)
sbi->s_version = MINIX_V1;
sbi->s_dirsize = 16;
sbi->s_namelen = 14;
- sbi->s_link_max = MINIX_LINK_MAX;
+ s->s_max_links = MINIX_LINK_MAX;
} else if (s->s_magic == MINIX_SUPER_MAGIC2) {
sbi->s_version = MINIX_V1;
sbi->s_dirsize = 32;
sbi->s_namelen = 30;
- sbi->s_link_max = MINIX_LINK_MAX;
+ s->s_max_links = MINIX_LINK_MAX;
} else if (s->s_magic == MINIX2_SUPER_MAGIC) {
sbi->s_version = MINIX_V2;
sbi->s_nzones = ms->s_zones;
sbi->s_dirsize = 16;
sbi->s_namelen = 14;
- sbi->s_link_max = MINIX2_LINK_MAX;
+ s->s_max_links = MINIX2_LINK_MAX;
} else if (s->s_magic == MINIX2_SUPER_MAGIC2) {
sbi->s_version = MINIX_V2;
sbi->s_nzones = ms->s_zones;
sbi->s_dirsize = 32;
sbi->s_namelen = 30;
- sbi->s_link_max = MINIX2_LINK_MAX;
+ s->s_max_links = MINIX2_LINK_MAX;
} else if ( *(__u16 *)(bh->b_data + 24) == MINIX3_SUPER_MAGIC) {
m3s = (struct minix3_super_block *) bh->b_data;
s->s_magic = m3s->s_magic;
@@ -221,9 +221,9 @@ static int minix_fill_super(struct super_block *s, void *data, int silent)
sbi->s_dirsize = 64;
sbi->s_namelen = 60;
sbi->s_version = MINIX_V3;
- sbi->s_link_max = MINIX2_LINK_MAX;
sbi->s_mount_state = MINIX_VALID_FS;
sb_set_blocksize(s, m3s->s_blocksize);
+ s->s_max_links = MINIX2_LINK_MAX;
} else
goto out_no_fs;
@@ -254,14 +254,6 @@ static int minix_fill_super(struct super_block *s, void *data, int silent)
minix_set_bit(0,sbi->s_imap[0]->b_data);
minix_set_bit(0,sbi->s_zmap[0]->b_data);
- /* set up enough so that it can read an inode */
- s->s_op = &minix_sops;
- root_inode = minix_iget(s, MINIX_ROOT_INO);
- if (IS_ERR(root_inode)) {
- ret = PTR_ERR(root_inode);
- goto out_no_root;
- }
-
/* Apparently minix can create filesystems that allocate more blocks for
* the bitmaps than needed. We simply ignore that, but verify it didn't
* create one with not enough blocks and bail out if so.
@@ -270,7 +262,7 @@ static int minix_fill_super(struct super_block *s, void *data, int silent)
if (sbi->s_imap_blocks < block) {
printk("MINIX-fs: file system does not have enough "
"imap blocks allocated. Refusing to mount\n");
- goto out_iput;
+ goto out_no_bitmap;
}
block = minix_blocks_needed(
@@ -279,13 +271,21 @@ static int minix_fill_super(struct super_block *s, void *data, int silent)
if (sbi->s_zmap_blocks < block) {
printk("MINIX-fs: file system does not have enough "
"zmap blocks allocated. Refusing to mount.\n");
- goto out_iput;
+ goto out_no_bitmap;
+ }
+
+ /* set up enough so that it can read an inode */
+ s->s_op = &minix_sops;
+ root_inode = minix_iget(s, MINIX_ROOT_INO);
+ if (IS_ERR(root_inode)) {
+ ret = PTR_ERR(root_inode);
+ goto out_no_root;
}
ret = -ENOMEM;
- s->s_root = d_alloc_root(root_inode);
+ s->s_root = d_make_root(root_inode);
if (!s->s_root)
- goto out_iput;
+ goto out_no_root;
if (!(s->s_flags & MS_RDONLY)) {
if (sbi->s_version != MINIX_V3) /* s_state is now out from V3 sb */
@@ -301,10 +301,6 @@ static int minix_fill_super(struct super_block *s, void *data, int silent)
return 0;
-out_iput:
- iput(root_inode);
- goto out_freemap;
-
out_no_root:
if (!silent)
printk("MINIX-fs: get root inode failed\n");
diff --git a/fs/minix/minix.h b/fs/minix/minix.h
index c889ef0aa57..1ebd1185462 100644
--- a/fs/minix/minix.h
+++ b/fs/minix/minix.h
@@ -34,7 +34,6 @@ struct minix_sb_info {
unsigned long s_max_size;
int s_dirsize;
int s_namelen;
- int s_link_max;
struct buffer_head ** s_imap;
struct buffer_head ** s_zmap;
struct buffer_head * s_sbh;
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index 2f76e38c206..2d0ee178630 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -94,9 +94,6 @@ static int minix_link(struct dentry * old_dentry, struct inode * dir,
{
struct inode *inode = old_dentry->d_inode;
- if (inode->i_nlink >= minix_sb(inode->i_sb)->s_link_max)
- return -EMLINK;
-
inode->i_ctime = CURRENT_TIME_SEC;
inode_inc_link_count(inode);
ihold(inode);
@@ -106,10 +103,7 @@ static int minix_link(struct dentry * old_dentry, struct inode * dir,
static int minix_mkdir(struct inode * dir, struct dentry *dentry, umode_t mode)
{
struct inode * inode;
- int err = -EMLINK;
-
- if (dir->i_nlink >= minix_sb(dir->i_sb)->s_link_max)
- goto out;
+ int err;
inode_inc_link_count(dir);
@@ -181,7 +175,6 @@ static int minix_rmdir(struct inode * dir, struct dentry *dentry)
static int minix_rename(struct inode * old_dir, struct dentry *old_dentry,
struct inode * new_dir, struct dentry *new_dentry)
{
- struct minix_sb_info * info = minix_sb(old_dir->i_sb);
struct inode * old_inode = old_dentry->d_inode;
struct inode * new_inode = new_dentry->d_inode;
struct page * dir_page = NULL;
@@ -219,11 +212,6 @@ static int minix_rename(struct inode * old_dir, struct dentry *old_dentry,
drop_nlink(new_inode);
inode_dec_link_count(new_inode);
} else {
- if (dir_de) {
- err = -EMLINK;
- if (new_dir->i_nlink >= info->s_link_max)
- goto out_dir;
- }
err = minix_add_link(new_dentry, old_inode);
if (err)
goto out_dir;
diff --git a/fs/mpage.c b/fs/mpage.c
index fdfae9fa98c..0face1c4d4c 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -13,7 +13,7 @@
*/
#include <linux/kernel.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/mm.h>
#include <linux/kdev_t.h>
#include <linux/gfp.h>
@@ -371,9 +371,6 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages,
sector_t last_block_in_bio = 0;
struct buffer_head map_bh;
unsigned long first_logical_block = 0;
- struct blk_plug plug;
-
- blk_start_plug(&plug);
map_bh.b_state = 0;
map_bh.b_size = 0;
@@ -395,7 +392,6 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages,
BUG_ON(!list_empty(pages));
if (bio)
mpage_bio_submit(READ, bio);
- blk_finish_plug(&plug);
return 0;
}
EXPORT_SYMBOL(mpage_readpages);
diff --git a/fs/namei.c b/fs/namei.c
index c283a1ec008..0062dd17eb5 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -15,7 +15,7 @@
*/
#include <linux/init.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/namei.h>
@@ -140,21 +140,19 @@ static int do_getname(const char __user *filename, char *page)
static char *getname_flags(const char __user *filename, int flags, int *empty)
{
- char *tmp, *result;
+ char *result = __getname();
+ int retval;
- result = ERR_PTR(-ENOMEM);
- tmp = __getname();
- if (tmp) {
- int retval = do_getname(filename, tmp);
+ if (!result)
+ return ERR_PTR(-ENOMEM);
- result = tmp;
- if (retval < 0) {
- if (retval == -ENOENT && empty)
- *empty = 1;
- if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) {
- __putname(tmp);
- result = ERR_PTR(retval);
- }
+ retval = do_getname(filename, result);
+ if (retval < 0) {
+ if (retval == -ENOENT && empty)
+ *empty = 1;
+ if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) {
+ __putname(result);
+ return ERR_PTR(retval);
}
}
audit_getname(result);
@@ -163,7 +161,7 @@ static char *getname_flags(const char __user *filename, int flags, int *empty)
char *getname(const char __user * filename)
{
- return getname_flags(filename, 0, 0);
+ return getname_flags(filename, 0, NULL);
}
#ifdef CONFIG_AUDITSYSCALL
@@ -644,7 +642,7 @@ follow_link(struct path *link, struct nameidata *nd, void **p)
cond_resched();
current->total_link_count++;
- touch_atime(link->mnt, dentry);
+ touch_atime(link);
nd_set_link(nd, NULL);
error = security_inode_follow_link(link->dentry, nd);
@@ -1056,51 +1054,65 @@ static void follow_dotdot(struct nameidata *nd)
}
/*
- * Allocate a dentry with name and parent, and perform a parent
- * directory ->lookup on it. Returns the new dentry, or ERR_PTR
- * on error. parent->d_inode->i_mutex must be held. d_lookup must
- * have verified that no child exists while under i_mutex.
+ * This looks up the name in dcache, possibly revalidates the old dentry and
+ * allocates a new one if not found or not valid. In the need_lookup argument
+ * returns whether i_op->lookup is necessary.
+ *
+ * dir->d_inode->i_mutex must be held
*/
-static struct dentry *d_alloc_and_lookup(struct dentry *parent,
- struct qstr *name, struct nameidata *nd)
+static struct dentry *lookup_dcache(struct qstr *name, struct dentry *dir,
+ struct nameidata *nd, bool *need_lookup)
{
- struct inode *inode = parent->d_inode;
struct dentry *dentry;
- struct dentry *old;
+ int error;
- /* Don't create child dentry for a dead directory. */
- if (unlikely(IS_DEADDIR(inode)))
- return ERR_PTR(-ENOENT);
+ *need_lookup = false;
+ dentry = d_lookup(dir, name);
+ if (dentry) {
+ if (d_need_lookup(dentry)) {
+ *need_lookup = true;
+ } else if (dentry->d_flags & DCACHE_OP_REVALIDATE) {
+ error = d_revalidate(dentry, nd);
+ if (unlikely(error <= 0)) {
+ if (error < 0) {
+ dput(dentry);
+ return ERR_PTR(error);
+ } else if (!d_invalidate(dentry)) {
+ dput(dentry);
+ dentry = NULL;
+ }
+ }
+ }
+ }
- dentry = d_alloc(parent, name);
- if (unlikely(!dentry))
- return ERR_PTR(-ENOMEM);
+ if (!dentry) {
+ dentry = d_alloc(dir, name);
+ if (unlikely(!dentry))
+ return ERR_PTR(-ENOMEM);
- old = inode->i_op->lookup(inode, dentry, nd);
- if (unlikely(old)) {
- dput(dentry);
- dentry = old;
+ *need_lookup = true;
}
return dentry;
}
/*
- * We already have a dentry, but require a lookup to be performed on the parent
- * directory to fill in d_inode. Returns the new dentry, or ERR_PTR on error.
- * parent->d_inode->i_mutex must be held. d_lookup must have verified that no
- * child exists while under i_mutex.
+ * Call i_op->lookup on the dentry. The dentry must be negative but may be
+ * hashed if it was pouplated with DCACHE_NEED_LOOKUP.
+ *
+ * dir->d_inode->i_mutex must be held
*/
-static struct dentry *d_inode_lookup(struct dentry *parent, struct dentry *dentry,
- struct nameidata *nd)
+static struct dentry *lookup_real(struct inode *dir, struct dentry *dentry,
+ struct nameidata *nd)
{
- struct inode *inode = parent->d_inode;
struct dentry *old;
/* Don't create child dentry for a dead directory. */
- if (unlikely(IS_DEADDIR(inode)))
+ if (unlikely(IS_DEADDIR(dir))) {
+ dput(dentry);
return ERR_PTR(-ENOENT);
+ }
- old = inode->i_op->lookup(inode, dentry, nd);
+ old = dir->i_op->lookup(dir, dentry, nd);
if (unlikely(old)) {
dput(dentry);
dentry = old;
@@ -1108,6 +1120,19 @@ static struct dentry *d_inode_lookup(struct dentry *parent, struct dentry *dentr
return dentry;
}
+static struct dentry *__lookup_hash(struct qstr *name,
+ struct dentry *base, struct nameidata *nd)
+{
+ bool need_lookup;
+ struct dentry *dentry;
+
+ dentry = lookup_dcache(name, base, nd, &need_lookup);
+ if (!need_lookup)
+ return dentry;
+
+ return lookup_real(base->d_inode, dentry, nd);
+}
+
/*
* It's more convoluted than I'd like it to be, but... it's still fairly
* small and for now I'd prefer to have fast path as straight as possible.
@@ -1139,6 +1164,8 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
return -ECHILD;
nd->seq = seq;
+ if (unlikely(d_need_lookup(dentry)))
+ goto unlazy;
if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) {
status = d_revalidate(dentry, nd);
if (unlikely(status <= 0)) {
@@ -1147,8 +1174,6 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
goto unlazy;
}
}
- if (unlikely(d_need_lookup(dentry)))
- goto unlazy;
path->mnt = mnt;
path->dentry = dentry;
if (unlikely(!__follow_mount_rcu(nd, path, inode)))
@@ -1163,38 +1188,14 @@ unlazy:
dentry = __d_lookup(parent, name);
}
- if (dentry && unlikely(d_need_lookup(dentry))) {
+ if (unlikely(!dentry))
+ goto need_lookup;
+
+ if (unlikely(d_need_lookup(dentry))) {
dput(dentry);
- dentry = NULL;
- }
-retry:
- if (unlikely(!dentry)) {
- struct inode *dir = parent->d_inode;
- BUG_ON(nd->inode != dir);
-
- mutex_lock(&dir->i_mutex);
- dentry = d_lookup(parent, name);
- if (likely(!dentry)) {
- dentry = d_alloc_and_lookup(parent, name, nd);
- if (IS_ERR(dentry)) {
- mutex_unlock(&dir->i_mutex);
- return PTR_ERR(dentry);
- }
- /* known good */
- need_reval = 0;
- status = 1;
- } else if (unlikely(d_need_lookup(dentry))) {
- dentry = d_inode_lookup(parent, dentry, nd);
- if (IS_ERR(dentry)) {
- mutex_unlock(&dir->i_mutex);
- return PTR_ERR(dentry);
- }
- /* known good */
- need_reval = 0;
- status = 1;
- }
- mutex_unlock(&dir->i_mutex);
+ goto need_lookup;
}
+
if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE) && need_reval)
status = d_revalidate(dentry, nd);
if (unlikely(status <= 0)) {
@@ -1204,12 +1205,10 @@ retry:
}
if (!d_invalidate(dentry)) {
dput(dentry);
- dentry = NULL;
- need_reval = 1;
- goto retry;
+ goto need_lookup;
}
}
-
+done:
path->mnt = mnt;
path->dentry = dentry;
err = follow_managed(path, nd->flags);
@@ -1221,6 +1220,16 @@ retry:
nd->flags |= LOOKUP_JUMPED;
*inode = path->dentry->d_inode;
return 0;
+
+need_lookup:
+ BUG_ON(nd->inode != parent->d_inode);
+
+ mutex_lock(&parent->d_inode->i_mutex);
+ dentry = __lookup_hash(name, parent, nd);
+ mutex_unlock(&parent->d_inode->i_mutex);
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+ goto done;
}
static inline int may_lookup(struct nameidata *nd)
@@ -1375,6 +1384,128 @@ static inline int can_lookup(struct inode *inode)
}
/*
+ * We can do the critical dentry name comparison and hashing
+ * operations one word at a time, but we are limited to:
+ *
+ * - Architectures with fast unaligned word accesses. We could
+ * do a "get_unaligned()" if this helps and is sufficiently
+ * fast.
+ *
+ * - Little-endian machines (so that we can generate the mask
+ * of low bytes efficiently). Again, we *could* do a byte
+ * swapping load on big-endian architectures if that is not
+ * expensive enough to make the optimization worthless.
+ *
+ * - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we
+ * do not trap on the (extremely unlikely) case of a page
+ * crossing operation.
+ *
+ * - Furthermore, we need an efficient 64-bit compile for the
+ * 64-bit case in order to generate the "number of bytes in
+ * the final mask". Again, that could be replaced with a
+ * efficient population count instruction or similar.
+ */
+#ifdef CONFIG_DCACHE_WORD_ACCESS
+
+#include <asm/word-at-a-time.h>
+
+#ifdef CONFIG_64BIT
+
+static inline unsigned int fold_hash(unsigned long hash)
+{
+ hash += hash >> (8*sizeof(int));
+ return hash;
+}
+
+#else /* 32-bit case */
+
+#define fold_hash(x) (x)
+
+#endif
+
+unsigned int full_name_hash(const unsigned char *name, unsigned int len)
+{
+ unsigned long a, mask;
+ unsigned long hash = 0;
+
+ for (;;) {
+ a = *(unsigned long *)name;
+ if (len < sizeof(unsigned long))
+ break;
+ hash += a;
+ hash *= 9;
+ name += sizeof(unsigned long);
+ len -= sizeof(unsigned long);
+ if (!len)
+ goto done;
+ }
+ mask = ~(~0ul << len*8);
+ hash += mask & a;
+done:
+ return fold_hash(hash);
+}
+EXPORT_SYMBOL(full_name_hash);
+
+/*
+ * Calculate the length and hash of the path component, and
+ * return the length of the component;
+ */
+static inline unsigned long hash_name(const char *name, unsigned int *hashp)
+{
+ unsigned long a, mask, hash, len;
+
+ hash = a = 0;
+ len = -sizeof(unsigned long);
+ do {
+ hash = (hash + a) * 9;
+ len += sizeof(unsigned long);
+ a = *(unsigned long *)(name+len);
+ /* Do we have any NUL or '/' bytes in this word? */
+ mask = has_zero(a) | has_zero(a ^ REPEAT_BYTE('/'));
+ } while (!mask);
+
+ /* The mask *below* the first high bit set */
+ mask = (mask - 1) & ~mask;
+ mask >>= 7;
+ hash += a & mask;
+ *hashp = fold_hash(hash);
+
+ return len + count_masked_bytes(mask);
+}
+
+#else
+
+unsigned int full_name_hash(const unsigned char *name, unsigned int len)
+{
+ unsigned long hash = init_name_hash();
+ while (len--)
+ hash = partial_name_hash(*name++, hash);
+ return end_name_hash(hash);
+}
+EXPORT_SYMBOL(full_name_hash);
+
+/*
+ * We know there's a real path component here of at least
+ * one character.
+ */
+static inline unsigned long hash_name(const char *name, unsigned int *hashp)
+{
+ unsigned long hash = init_name_hash();
+ unsigned long len = 0, c;
+
+ c = (unsigned char)*name;
+ do {
+ len++;
+ hash = partial_name_hash(c, hash);
+ c = (unsigned char)name[len];
+ } while (c && c != '/');
+ *hashp = end_name_hash(hash);
+ return len;
+}
+
+#endif
+
+/*
* Name resolution.
* This is the basic name resolution function, turning a pathname into
* the final dentry. We expect 'base' to be positive and a directory.
@@ -1394,31 +1525,22 @@ static int link_path_walk(const char *name, struct nameidata *nd)
/* At this point we know we have a real path component. */
for(;;) {
- unsigned long hash;
struct qstr this;
- unsigned int c;
+ long len;
int type;
err = may_lookup(nd);
if (err)
break;
+ len = hash_name(name, &this.hash);
this.name = name;
- c = *(const unsigned char *)name;
-
- hash = init_name_hash();
- do {
- name++;
- hash = partial_name_hash(c, hash);
- c = *(const unsigned char *)name;
- } while (c && (c != '/'));
- this.len = name - (const char *) this.name;
- this.hash = end_name_hash(hash);
+ this.len = len;
type = LAST_NORM;
- if (this.name[0] == '.') switch (this.len) {
+ if (name[0] == '.') switch (len) {
case 2:
- if (this.name[1] == '.') {
+ if (name[1] == '.') {
type = LAST_DOTDOT;
nd->flags |= LOOKUP_JUMPED;
}
@@ -1437,12 +1559,18 @@ static int link_path_walk(const char *name, struct nameidata *nd)
}
}
- /* remove trailing slashes? */
- if (!c)
+ if (!name[len])
goto last_component;
- while (*++name == '/');
- if (!*name)
+ /*
+ * If it wasn't NUL, we know it was '/'. Skip that
+ * slash, and continue until no more slashes.
+ */
+ do {
+ len++;
+ } while (unlikely(name[len] == '/'));
+ if (!name[len])
goto last_component;
+ name += len;
err = walk_component(nd, &next, &this, type, LOOKUP_FOLLOW);
if (err < 0)
@@ -1698,59 +1826,6 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
return err;
}
-static struct dentry *__lookup_hash(struct qstr *name,
- struct dentry *base, struct nameidata *nd)
-{
- struct inode *inode = base->d_inode;
- struct dentry *dentry;
- int err;
-
- err = inode_permission(inode, MAY_EXEC);
- if (err)
- return ERR_PTR(err);
-
- /*
- * Don't bother with __d_lookup: callers are for creat as
- * well as unlink, so a lot of the time it would cost
- * a double lookup.
- */
- dentry = d_lookup(base, name);
-
- if (dentry && d_need_lookup(dentry)) {
- /*
- * __lookup_hash is called with the parent dir's i_mutex already
- * held, so we are good to go here.
- */
- dentry = d_inode_lookup(base, dentry, nd);
- if (IS_ERR(dentry))
- return dentry;
- }
-
- if (dentry && (dentry->d_flags & DCACHE_OP_REVALIDATE)) {
- int status = d_revalidate(dentry, nd);
- if (unlikely(status <= 0)) {
- /*
- * The dentry failed validation.
- * If d_revalidate returned 0 attempt to invalidate
- * the dentry otherwise d_revalidate is asking us
- * to return a fail status.
- */
- if (status < 0) {
- dput(dentry);
- return ERR_PTR(status);
- } else if (!d_invalidate(dentry)) {
- dput(dentry);
- dentry = NULL;
- }
- }
- }
-
- if (!dentry)
- dentry = d_alloc_and_lookup(base, name, nd);
-
- return dentry;
-}
-
/*
* Restricted form of lookup. Doesn't follow links, single-component only,
* needs parent already locked. Doesn't follow mounts.
@@ -1775,24 +1850,22 @@ static struct dentry *lookup_hash(struct nameidata *nd)
struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
{
struct qstr this;
- unsigned long hash;
unsigned int c;
+ int err;
WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex));
this.name = name;
this.len = len;
+ this.hash = full_name_hash(name, len);
if (!len)
return ERR_PTR(-EACCES);
- hash = init_name_hash();
while (len--) {
c = *(const unsigned char *)name++;
if (c == '/' || c == '\0')
return ERR_PTR(-EACCES);
- hash = partial_name_hash(c, hash);
}
- this.hash = end_name_hash(hash);
/*
* See if the low-level filesystem might want
* to use its own hash..
@@ -1803,6 +1876,10 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
return ERR_PTR(err);
}
+ err = inode_permission(base->d_inode, MAY_EXEC);
+ if (err)
+ return ERR_PTR(err);
+
return __lookup_hash(&this, base, NULL);
}
@@ -1827,7 +1904,7 @@ int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
int user_path_at(int dfd, const char __user *name, unsigned flags,
struct path *path)
{
- return user_path_at_empty(dfd, name, flags, path, 0);
+ return user_path_at_empty(dfd, name, flags, path, NULL);
}
static int user_path_parent(int dfd, const char __user *path,
@@ -2140,7 +2217,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
/* sayonara */
error = complete_walk(nd);
if (error)
- return ERR_PTR(-ECHILD);
+ return ERR_PTR(error);
error = -ENOTDIR;
if (nd->flags & LOOKUP_DIRECTORY) {
@@ -2239,7 +2316,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
/* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... */
error = complete_walk(nd);
if (error)
- goto exit;
+ return ERR_PTR(error);
error = -EISDIR;
if (S_ISDIR(nd->inode->i_mode))
goto exit;
@@ -2547,6 +2624,7 @@ SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, d
int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
int error = may_create(dir, dentry);
+ unsigned max_links = dir->i_sb->s_max_links;
if (error)
return error;
@@ -2559,6 +2637,9 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
if (error)
return error;
+ if (max_links && dir->i_nlink >= max_links)
+ return -EMLINK;
+
error = dir->i_op->mkdir(dir, dentry, mode);
if (!error)
fsnotify_mkdir(dir, dentry);
@@ -2600,7 +2681,7 @@ SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode)
/*
* The dentry_unhash() helper will try to drop the dentry early: we
- * should have a usage count of 2 if we're the only user of this
+ * should have a usage count of 1 if we're the only user of this
* dentry, and if that is true (possibly after pruning the dcache),
* then we drop the dentry now.
*
@@ -2889,6 +2970,7 @@ SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newn
int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry)
{
struct inode *inode = old_dentry->d_inode;
+ unsigned max_links = dir->i_sb->s_max_links;
int error;
if (!inode)
@@ -2919,6 +3001,8 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
/* Make sure we don't allow creating hardlink to an unlinked file */
if (inode->i_nlink == 0)
error = -ENOENT;
+ else if (max_links && inode->i_nlink >= max_links)
+ error = -EMLINK;
else
error = dir->i_op->link(old_dentry, dir, new_dentry);
mutex_unlock(&inode->i_mutex);
@@ -3028,6 +3112,7 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
{
int error = 0;
struct inode *target = new_dentry->d_inode;
+ unsigned max_links = new_dir->i_sb->s_max_links;
/*
* If we are going to change the parent - check write permissions,
@@ -3051,6 +3136,11 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
if (d_mountpoint(old_dentry) || d_mountpoint(new_dentry))
goto out;
+ error = -EMLINK;
+ if (max_links && !target && new_dir != old_dir &&
+ new_dir->i_nlink >= max_links)
+ goto out;
+
if (target)
shrink_dcache_parent(new_dentry);
error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
@@ -3349,9 +3439,9 @@ retry:
if (err)
goto fail;
- kaddr = kmap_atomic(page, KM_USER0);
+ kaddr = kmap_atomic(page);
memcpy(kaddr, symname, len-1);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
err = pagecache_write_end(NULL, mapping, 0, len-1, len-1,
page, fsdata);
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index 64a326418aa..3ff5fcc1528 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -7,7 +7,6 @@
*/
#include <asm/uaccess.h>
-#include <asm/system.h>
#include <linux/time.h>
#include <linux/kernel.h>
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 3d1e34f8a68..87484fb8d17 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -11,7 +11,6 @@
#include <linux/module.h>
-#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/byteorder.h>
@@ -716,13 +715,11 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
if (!root_inode)
goto out_disconnect;
DPRINTK("ncp_fill_super: root vol=%d\n", NCP_FINFO(root_inode)->volNumber);
- sb->s_root = d_alloc_root(root_inode);
+ sb->s_root = d_make_root(root_inode);
if (!sb->s_root)
- goto out_no_root;
+ goto out_disconnect;
return 0;
-out_no_root:
- iput(root_inode);
out_disconnect:
ncp_lock_server(server);
ncp_disconnect(server);
diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c
index e5d71b27a5b..be20a7e171a 100644
--- a/fs/ncpfs/mmap.c
+++ b/fs/ncpfs/mmap.c
@@ -19,7 +19,6 @@
#include <linux/memcontrol.h>
#include <asm/uaccess.h>
-#include <asm/system.h>
#include "ncp_fs.h"
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index dbcd82126ae..2a0e6c59914 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -64,6 +64,7 @@ config NFS_V4
bool "NFS client support for NFS version 4"
depends on NFS_FS
select SUNRPC_GSS
+ select KEYS
help
This option enables support for version 4 of the NFS protocol
(RFC 3530) in the kernel's NFS client.
@@ -98,6 +99,18 @@ config PNFS_OBJLAYOUT
depends on NFS_FS && NFS_V4_1 && SCSI_OSD_ULD
default m
+config NFS_V4_1_IMPLEMENTATION_ID_DOMAIN
+ string "NFSv4.1 Implementation ID Domain"
+ depends on NFS_V4_1
+ default "kernel.org"
+ help
+ This option defines the domain portion of the implementation ID that
+ may be sent in the NFS exchange_id operation. The value must be in
+ the format of a DNS domain name and should be set to the DNS domain
+ name of the distribution.
+ If the NFS client is unchanged from the upstream kernel, this
+ option should be set to the default "kernel.org".
+
config ROOT_NFS
bool "Root file system on NFS"
depends on NFS_FS=y && IP_PNP
@@ -130,16 +143,10 @@ config NFS_USE_KERNEL_DNS
bool
depends on NFS_V4 && !NFS_USE_LEGACY_DNS
select DNS_RESOLVER
- select KEYS
default y
-config NFS_USE_NEW_IDMAPPER
- bool "Use the new idmapper upcall routine"
- depends on NFS_V4 && KEYS
- help
- Say Y here if you want NFS to use the new idmapper upcall functions.
- You will need /sbin/request-key (usually provided by the keyutils
- package). For details, read
- <file:Documentation/filesystems/nfs/idmapper.txt>.
-
- If you are unsure, say N.
+config NFS_DEBUG
+ bool
+ depends on NFS_FS && SUNRPC_DEBUG
+ select CRC32
+ default y
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 281ae95932c..9c94297bb70 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -46,9 +46,6 @@ MODULE_LICENSE("GPL");
MODULE_AUTHOR("Andy Adamson <andros@citi.umich.edu>");
MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver");
-struct dentry *bl_device_pipe;
-wait_queue_head_t bl_wq;
-
static void print_page(struct page *page)
{
dprintk("PRINTPAGE page %p\n", page);
@@ -90,9 +87,9 @@ static int is_writable(struct pnfs_block_extent *be, sector_t isect)
*/
struct parallel_io {
struct kref refcnt;
- struct rpc_call_ops call_ops;
- void (*pnfs_callback) (void *data);
+ void (*pnfs_callback) (void *data, int num_se);
void *data;
+ int bse_count;
};
static inline struct parallel_io *alloc_parallel(void *data)
@@ -103,6 +100,7 @@ static inline struct parallel_io *alloc_parallel(void *data)
if (rv) {
rv->data = data;
kref_init(&rv->refcnt);
+ rv->bse_count = 0;
}
return rv;
}
@@ -117,7 +115,7 @@ static void destroy_parallel(struct kref *kref)
struct parallel_io *p = container_of(kref, struct parallel_io, refcnt);
dprintk("%s enter\n", __func__);
- p->pnfs_callback(p->data);
+ p->pnfs_callback(p->data, p->bse_count);
kfree(p);
}
@@ -146,14 +144,19 @@ static struct bio *bl_alloc_init_bio(int npg, sector_t isect,
{
struct bio *bio;
+ npg = min(npg, BIO_MAX_PAGES);
bio = bio_alloc(GFP_NOIO, npg);
- if (!bio)
- return NULL;
+ if (!bio && (current->flags & PF_MEMALLOC)) {
+ while (!bio && (npg /= 2))
+ bio = bio_alloc(GFP_NOIO, npg);
+ }
- bio->bi_sector = isect - be->be_f_offset + be->be_v_offset;
- bio->bi_bdev = be->be_mdev;
- bio->bi_end_io = end_io;
- bio->bi_private = par;
+ if (bio) {
+ bio->bi_sector = isect - be->be_f_offset + be->be_v_offset;
+ bio->bi_bdev = be->be_mdev;
+ bio->bi_end_io = end_io;
+ bio->bi_private = par;
+ }
return bio;
}
@@ -212,22 +215,15 @@ static void bl_read_cleanup(struct work_struct *work)
}
static void
-bl_end_par_io_read(void *data)
+bl_end_par_io_read(void *data, int unused)
{
struct nfs_read_data *rdata = data;
+ rdata->task.tk_status = rdata->pnfs_error;
INIT_WORK(&rdata->task.u.tk_work, bl_read_cleanup);
schedule_work(&rdata->task.u.tk_work);
}
-/* We don't want normal .rpc_call_done callback used, so we replace it
- * with this stub.
- */
-static void bl_rpc_do_nothing(struct rpc_task *task, void *calldata)
-{
- return;
-}
-
static enum pnfs_try_status
bl_read_pagelist(struct nfs_read_data *rdata)
{
@@ -237,18 +233,15 @@ bl_read_pagelist(struct nfs_read_data *rdata)
sector_t isect, extent_length = 0;
struct parallel_io *par;
loff_t f_offset = rdata->args.offset;
- size_t count = rdata->args.count;
struct page **pages = rdata->args.pages;
int pg_index = rdata->args.pgbase >> PAGE_CACHE_SHIFT;
- dprintk("%s enter nr_pages %u offset %lld count %Zd\n", __func__,
- rdata->npages, f_offset, count);
+ dprintk("%s enter nr_pages %u offset %lld count %u\n", __func__,
+ rdata->npages, f_offset, (unsigned int)rdata->args.count);
par = alloc_parallel(rdata);
if (!par)
goto use_mds;
- par->call_ops = *rdata->mds_ops;
- par->call_ops.rpc_call_done = bl_rpc_do_nothing;
par->pnfs_callback = bl_end_par_io_read;
/* At this point, we can no longer jump to use_mds */
@@ -322,6 +315,7 @@ static void mark_extents_written(struct pnfs_block_layout *bl,
{
sector_t isect, end;
struct pnfs_block_extent *be;
+ struct pnfs_block_short_extent *se;
dprintk("%s(%llu, %u)\n", __func__, offset, count);
if (count == 0)
@@ -334,8 +328,11 @@ static void mark_extents_written(struct pnfs_block_layout *bl,
be = bl_find_get_extent(bl, isect, NULL);
BUG_ON(!be); /* FIXME */
len = min(end, be->be_f_offset + be->be_length) - isect;
- if (be->be_state == PNFS_BLOCK_INVALID_DATA)
- bl_mark_for_commit(be, isect, len); /* What if fails? */
+ if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
+ se = bl_pop_one_short_extent(be->be_inval);
+ BUG_ON(!se);
+ bl_mark_for_commit(be, isect, len, se);
+ }
isect += len;
bl_put_extent(be);
}
@@ -357,7 +354,8 @@ static void bl_end_io_write_zero(struct bio *bio, int err)
end_page_writeback(page);
page_cache_release(page);
} while (bvec >= bio->bi_io_vec);
- if (!uptodate) {
+
+ if (unlikely(!uptodate)) {
if (!wdata->pnfs_error)
wdata->pnfs_error = -EIO;
pnfs_set_lo_fail(wdata->lseg);
@@ -366,7 +364,6 @@ static void bl_end_io_write_zero(struct bio *bio, int err)
put_parallel(par);
}
-/* This is basically copied from mpage_end_io_read */
static void bl_end_io_write(struct bio *bio, int err)
{
struct parallel_io *par = bio->bi_private;
@@ -392,7 +389,7 @@ static void bl_write_cleanup(struct work_struct *work)
dprintk("%s enter\n", __func__);
task = container_of(work, struct rpc_task, u.tk_work);
wdata = container_of(task, struct nfs_write_data, task);
- if (!wdata->pnfs_error) {
+ if (likely(!wdata->pnfs_error)) {
/* Marks for LAYOUTCOMMIT */
mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
wdata->args.offset, wdata->args.count);
@@ -401,11 +398,16 @@ static void bl_write_cleanup(struct work_struct *work)
}
/* Called when last of bios associated with a bl_write_pagelist call finishes */
-static void bl_end_par_io_write(void *data)
+static void bl_end_par_io_write(void *data, int num_se)
{
struct nfs_write_data *wdata = data;
- wdata->task.tk_status = 0;
+ if (unlikely(wdata->pnfs_error)) {
+ bl_free_short_extents(&BLK_LSEG2EXT(wdata->lseg)->bl_inval,
+ num_se);
+ }
+
+ wdata->task.tk_status = wdata->pnfs_error;
wdata->verf.committed = NFS_FILE_SYNC;
INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup);
schedule_work(&wdata->task.u.tk_work);
@@ -484,6 +486,55 @@ cleanup:
return ret;
}
+/* Find or create a zeroing page marked being writeback.
+ * Return ERR_PTR on error, NULL to indicate skip this page and page itself
+ * to indicate write out.
+ */
+static struct page *
+bl_find_get_zeroing_page(struct inode *inode, pgoff_t index,
+ struct pnfs_block_extent *cow_read)
+{
+ struct page *page;
+ int locked = 0;
+ page = find_get_page(inode->i_mapping, index);
+ if (page)
+ goto check_page;
+
+ page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
+ if (unlikely(!page)) {
+ dprintk("%s oom\n", __func__);
+ return ERR_PTR(-ENOMEM);
+ }
+ locked = 1;
+
+check_page:
+ /* PageDirty: Other will write this out
+ * PageWriteback: Other is writing this out
+ * PageUptodate: It was read before
+ */
+ if (PageDirty(page) || PageWriteback(page)) {
+ print_page(page);
+ if (locked)
+ unlock_page(page);
+ page_cache_release(page);
+ return NULL;
+ }
+
+ if (!locked) {
+ lock_page(page);
+ locked = 1;
+ goto check_page;
+ }
+ if (!PageUptodate(page)) {
+ /* New page, readin or zero it */
+ init_page_for_write(page, cow_read);
+ }
+ set_page_writeback(page);
+ unlock_page(page);
+
+ return page;
+}
+
static enum pnfs_try_status
bl_write_pagelist(struct nfs_write_data *wdata, int sync)
{
@@ -508,9 +559,7 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync)
*/
par = alloc_parallel(wdata);
if (!par)
- return PNFS_NOT_ATTEMPTED;
- par->call_ops = *wdata->mds_ops;
- par->call_ops.rpc_call_done = bl_rpc_do_nothing;
+ goto out_mds;
par->pnfs_callback = bl_end_par_io_write;
/* At this point, have to be more careful with error handling */
@@ -518,12 +567,15 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync)
be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg), isect, &cow_read);
if (!be || !is_writable(be, isect)) {
dprintk("%s no matching extents!\n", __func__);
- wdata->pnfs_error = -EINVAL;
- goto out;
+ goto out_mds;
}
/* First page inside INVALID extent */
if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
+ if (likely(!bl_push_one_short_extent(be->be_inval)))
+ par->bse_count++;
+ else
+ goto out_mds;
temp = offset >> PAGE_CACHE_SHIFT;
npg_zero = do_div(temp, npg_per_block);
isect = (sector_t) (((offset - npg_zero * PAGE_CACHE_SIZE) &
@@ -543,36 +595,16 @@ fill_invalid_ext:
dprintk("%s zero %dth page: index %lu isect %llu\n",
__func__, npg_zero, index,
(unsigned long long)isect);
- page =
- find_or_create_page(wdata->inode->i_mapping, index,
- GFP_NOFS);
- if (!page) {
- dprintk("%s oom\n", __func__);
- wdata->pnfs_error = -ENOMEM;
+ page = bl_find_get_zeroing_page(wdata->inode, index,
+ cow_read);
+ if (unlikely(IS_ERR(page))) {
+ wdata->pnfs_error = PTR_ERR(page);
goto out;
- }
-
- /* PageDirty: Other will write this out
- * PageWriteback: Other is writing this out
- * PageUptodate: It was read before
- * sector_initialized: already written out
- */
- if (PageDirty(page) || PageWriteback(page)) {
- print_page(page);
- unlock_page(page);
- page_cache_release(page);
+ } else if (page == NULL)
goto next_page;
- }
- if (!PageUptodate(page)) {
- /* New page, readin or zero it */
- init_page_for_write(page, cow_read);
- }
- set_page_writeback(page);
- unlock_page(page);
ret = bl_mark_sectors_init(be->be_inval, isect,
- PAGE_CACHE_SECTORS,
- NULL);
+ PAGE_CACHE_SECTORS);
if (unlikely(ret)) {
dprintk("%s bl_mark_sectors_init fail %d\n",
__func__, ret);
@@ -581,6 +613,19 @@ fill_invalid_ext:
wdata->pnfs_error = ret;
goto out;
}
+ if (likely(!bl_push_one_short_extent(be->be_inval)))
+ par->bse_count++;
+ else {
+ end_page_writeback(page);
+ page_cache_release(page);
+ wdata->pnfs_error = -ENOMEM;
+ goto out;
+ }
+ /* FIXME: This should be done in bi_end_io */
+ mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
+ page->index << PAGE_CACHE_SHIFT,
+ PAGE_CACHE_SIZE);
+
bio = bl_add_page_to_bio(bio, npg_zero, WRITE,
isect, page, be,
bl_end_io_write_zero, par);
@@ -589,10 +634,6 @@ fill_invalid_ext:
bio = NULL;
goto out;
}
- /* FIXME: This should be done in bi_end_io */
- mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
- page->index << PAGE_CACHE_SHIFT,
- PAGE_CACHE_SIZE);
next_page:
isect += PAGE_CACHE_SECTORS;
extent_length -= PAGE_CACHE_SECTORS;
@@ -616,13 +657,21 @@ next_page:
wdata->pnfs_error = -EINVAL;
goto out;
}
+ if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
+ if (likely(!bl_push_one_short_extent(
+ be->be_inval)))
+ par->bse_count++;
+ else {
+ wdata->pnfs_error = -ENOMEM;
+ goto out;
+ }
+ }
extent_length = be->be_length -
(isect - be->be_f_offset);
}
if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
ret = bl_mark_sectors_init(be->be_inval, isect,
- PAGE_CACHE_SECTORS,
- NULL);
+ PAGE_CACHE_SECTORS);
if (unlikely(ret)) {
dprintk("%s bl_mark_sectors_init fail %d\n",
__func__, ret);
@@ -664,6 +713,10 @@ out:
bl_submit_bio(WRITE, bio);
put_parallel(par);
return PNFS_ATTEMPTED;
+out_mds:
+ bl_put_extent(be);
+ kfree(par);
+ return PNFS_NOT_ATTEMPTED;
}
/* FIXME - range ignored */
@@ -690,11 +743,17 @@ static void
release_inval_marks(struct pnfs_inval_markings *marks)
{
struct pnfs_inval_tracking *pos, *temp;
+ struct pnfs_block_short_extent *se, *stemp;
list_for_each_entry_safe(pos, temp, &marks->im_tree.mtt_stub, it_link) {
list_del(&pos->it_link);
kfree(pos);
}
+
+ list_for_each_entry_safe(se, stemp, &marks->im_extents, bse_node) {
+ list_del(&se->bse_node);
+ kfree(se);
+ }
return;
}
@@ -779,16 +838,13 @@ bl_cleanup_layoutcommit(struct nfs4_layoutcommit_data *lcdata)
static void free_blk_mountid(struct block_mount_id *mid)
{
if (mid) {
- struct pnfs_block_dev *dev;
- spin_lock(&mid->bm_lock);
- while (!list_empty(&mid->bm_devlist)) {
- dev = list_first_entry(&mid->bm_devlist,
- struct pnfs_block_dev,
- bm_node);
+ struct pnfs_block_dev *dev, *tmp;
+
+ /* No need to take bm_lock as we are last user freeing bm_devlist */
+ list_for_each_entry_safe(dev, tmp, &mid->bm_devlist, bm_node) {
list_del(&dev->bm_node);
bl_free_block_dev(dev);
}
- spin_unlock(&mid->bm_lock);
kfree(mid);
}
}
@@ -965,10 +1021,128 @@ static const struct rpc_pipe_ops bl_upcall_ops = {
.destroy_msg = bl_pipe_destroy_msg,
};
+static struct dentry *nfs4blocklayout_register_sb(struct super_block *sb,
+ struct rpc_pipe *pipe)
+{
+ struct dentry *dir, *dentry;
+
+ dir = rpc_d_lookup_sb(sb, NFS_PIPE_DIRNAME);
+ if (dir == NULL)
+ return ERR_PTR(-ENOENT);
+ dentry = rpc_mkpipe_dentry(dir, "blocklayout", NULL, pipe);
+ dput(dir);
+ return dentry;
+}
+
+static void nfs4blocklayout_unregister_sb(struct super_block *sb,
+ struct rpc_pipe *pipe)
+{
+ if (pipe->dentry)
+ rpc_unlink(pipe->dentry);
+}
+
+static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event,
+ void *ptr)
+{
+ struct super_block *sb = ptr;
+ struct net *net = sb->s_fs_info;
+ struct nfs_net *nn = net_generic(net, nfs_net_id);
+ struct dentry *dentry;
+ int ret = 0;
+
+ if (!try_module_get(THIS_MODULE))
+ return 0;
+
+ if (nn->bl_device_pipe == NULL) {
+ module_put(THIS_MODULE);
+ return 0;
+ }
+
+ switch (event) {
+ case RPC_PIPEFS_MOUNT:
+ dentry = nfs4blocklayout_register_sb(sb, nn->bl_device_pipe);
+ if (IS_ERR(dentry)) {
+ ret = PTR_ERR(dentry);
+ break;
+ }
+ nn->bl_device_pipe->dentry = dentry;
+ break;
+ case RPC_PIPEFS_UMOUNT:
+ if (nn->bl_device_pipe->dentry)
+ nfs4blocklayout_unregister_sb(sb, nn->bl_device_pipe);
+ break;
+ default:
+ ret = -ENOTSUPP;
+ break;
+ }
+ module_put(THIS_MODULE);
+ return ret;
+}
+
+static struct notifier_block nfs4blocklayout_block = {
+ .notifier_call = rpc_pipefs_event,
+};
+
+static struct dentry *nfs4blocklayout_register_net(struct net *net,
+ struct rpc_pipe *pipe)
+{
+ struct super_block *pipefs_sb;
+ struct dentry *dentry;
+
+ pipefs_sb = rpc_get_sb_net(net);
+ if (!pipefs_sb)
+ return NULL;
+ dentry = nfs4blocklayout_register_sb(pipefs_sb, pipe);
+ rpc_put_sb_net(net);
+ return dentry;
+}
+
+static void nfs4blocklayout_unregister_net(struct net *net,
+ struct rpc_pipe *pipe)
+{
+ struct super_block *pipefs_sb;
+
+ pipefs_sb = rpc_get_sb_net(net);
+ if (pipefs_sb) {
+ nfs4blocklayout_unregister_sb(pipefs_sb, pipe);
+ rpc_put_sb_net(net);
+ }
+}
+
+static int nfs4blocklayout_net_init(struct net *net)
+{
+ struct nfs_net *nn = net_generic(net, nfs_net_id);
+ struct dentry *dentry;
+
+ init_waitqueue_head(&nn->bl_wq);
+ nn->bl_device_pipe = rpc_mkpipe_data(&bl_upcall_ops, 0);
+ if (IS_ERR(nn->bl_device_pipe))
+ return PTR_ERR(nn->bl_device_pipe);
+ dentry = nfs4blocklayout_register_net(net, nn->bl_device_pipe);
+ if (IS_ERR(dentry)) {
+ rpc_destroy_pipe_data(nn->bl_device_pipe);
+ return PTR_ERR(dentry);
+ }
+ nn->bl_device_pipe->dentry = dentry;
+ return 0;
+}
+
+static void nfs4blocklayout_net_exit(struct net *net)
+{
+ struct nfs_net *nn = net_generic(net, nfs_net_id);
+
+ nfs4blocklayout_unregister_net(net, nn->bl_device_pipe);
+ rpc_destroy_pipe_data(nn->bl_device_pipe);
+ nn->bl_device_pipe = NULL;
+}
+
+static struct pernet_operations nfs4blocklayout_net_ops = {
+ .init = nfs4blocklayout_net_init,
+ .exit = nfs4blocklayout_net_exit,
+};
+
static int __init nfs4blocklayout_init(void)
{
- struct vfsmount *mnt;
- struct path path;
int ret;
dprintk("%s: NFSv4 Block Layout Driver Registering...\n", __func__);
@@ -977,32 +1151,17 @@ static int __init nfs4blocklayout_init(void)
if (ret)
goto out;
- init_waitqueue_head(&bl_wq);
-
- mnt = rpc_get_mount();
- if (IS_ERR(mnt)) {
- ret = PTR_ERR(mnt);
+ ret = rpc_pipefs_notifier_register(&nfs4blocklayout_block);
+ if (ret)
goto out_remove;
- }
-
- ret = vfs_path_lookup(mnt->mnt_root,
- mnt,
- NFS_PIPE_DIRNAME, 0, &path);
+ ret = register_pernet_subsys(&nfs4blocklayout_net_ops);
if (ret)
- goto out_putrpc;
-
- bl_device_pipe = rpc_mkpipe(path.dentry, "blocklayout", NULL,
- &bl_upcall_ops, 0);
- path_put(&path);
- if (IS_ERR(bl_device_pipe)) {
- ret = PTR_ERR(bl_device_pipe);
- goto out_putrpc;
- }
+ goto out_notifier;
out:
return ret;
-out_putrpc:
- rpc_put_mount();
+out_notifier:
+ rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
out_remove:
pnfs_unregister_layoutdriver(&blocklayout_type);
return ret;
@@ -1013,9 +1172,9 @@ static void __exit nfs4blocklayout_exit(void)
dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n",
__func__);
+ rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
+ unregister_pernet_subsys(&nfs4blocklayout_net_ops);
pnfs_unregister_layoutdriver(&blocklayout_type);
- rpc_unlink(bl_device_pipe);
- rpc_put_mount();
}
MODULE_ALIAS("nfs-layouttype4-3");
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index 42acf7ef599..03350690118 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -37,6 +37,7 @@
#include <linux/sunrpc/rpc_pipe_fs.h>
#include "../pnfs.h"
+#include "../netns.h"
#define PAGE_CACHE_SECTORS (PAGE_CACHE_SIZE >> SECTOR_SHIFT)
#define PAGE_CACHE_SECTOR_SHIFT (PAGE_CACHE_SHIFT - SECTOR_SHIFT)
@@ -50,6 +51,7 @@ struct pnfs_block_dev {
struct list_head bm_node;
struct nfs4_deviceid bm_mdevid; /* associated devid */
struct block_device *bm_mdev; /* meta device itself */
+ struct net *net;
};
enum exstate4 {
@@ -70,6 +72,7 @@ struct pnfs_inval_markings {
spinlock_t im_lock;
struct my_tree im_tree; /* Sectors that need LAYOUTCOMMIT */
sector_t im_block_size; /* Server blocksize in sectors */
+ struct list_head im_extents; /* Short extents for INVAL->RW conversion */
};
struct pnfs_inval_tracking {
@@ -105,6 +108,7 @@ BL_INIT_INVAL_MARKS(struct pnfs_inval_markings *marks, sector_t blocksize)
{
spin_lock_init(&marks->im_lock);
INIT_LIST_HEAD(&marks->im_tree.mtt_stub);
+ INIT_LIST_HEAD(&marks->im_extents);
marks->im_block_size = blocksize;
marks->im_tree.mtt_step_size = min((sector_t)PAGE_CACHE_SECTORS,
blocksize);
@@ -149,9 +153,9 @@ BLK_LSEG2EXT(struct pnfs_layout_segment *lseg)
return BLK_LO2EXT(lseg->pls_layout);
}
-struct bl_dev_msg {
- int32_t status;
- uint32_t major, minor;
+struct bl_pipe_msg {
+ struct rpc_pipe_msg msg;
+ wait_queue_head_t *bl_wq;
};
struct bl_msg_hdr {
@@ -159,9 +163,6 @@ struct bl_msg_hdr {
u16 totallen; /* length of entire message, including hdr itself */
};
-extern struct dentry *bl_device_pipe;
-extern wait_queue_head_t bl_wq;
-
#define BL_DEVICE_UMOUNT 0x0 /* Umount--delete devices */
#define BL_DEVICE_MOUNT 0x1 /* Mount--create devices*/
#define BL_DEVICE_REQUEST_INIT 0x0 /* Start request */
@@ -186,8 +187,7 @@ struct pnfs_block_extent *
bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect,
struct pnfs_block_extent **cow_read);
int bl_mark_sectors_init(struct pnfs_inval_markings *marks,
- sector_t offset, sector_t length,
- sector_t **pages);
+ sector_t offset, sector_t length);
void bl_put_extent(struct pnfs_block_extent *be);
struct pnfs_block_extent *bl_alloc_extent(void);
int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect);
@@ -200,6 +200,11 @@ void clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
int bl_add_merge_extent(struct pnfs_block_layout *bl,
struct pnfs_block_extent *new);
int bl_mark_for_commit(struct pnfs_block_extent *be,
- sector_t offset, sector_t length);
+ sector_t offset, sector_t length,
+ struct pnfs_block_short_extent *new);
+int bl_push_one_short_extent(struct pnfs_inval_markings *marks);
+struct pnfs_block_short_extent *
+bl_pop_one_short_extent(struct pnfs_inval_markings *marks);
+void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free);
#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */
diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c
index d08ba9107fd..a5c88a554d9 100644
--- a/fs/nfs/blocklayout/blocklayoutdev.c
+++ b/fs/nfs/blocklayout/blocklayoutdev.c
@@ -46,7 +46,7 @@ static int decode_sector_number(__be32 **rp, sector_t *sp)
*rp = xdr_decode_hyper(*rp, &s);
if (s & 0x1ff) {
- printk(KERN_WARNING "%s: sector not aligned\n", __func__);
+ printk(KERN_WARNING "NFS: %s: sector not aligned\n", __func__);
return -1;
}
*sp = s >> SECTOR_SHIFT;
@@ -79,27 +79,30 @@ int nfs4_blkdev_put(struct block_device *bdev)
return blkdev_put(bdev, FMODE_READ);
}
-static struct bl_dev_msg bl_mount_reply;
-
ssize_t bl_pipe_downcall(struct file *filp, const char __user *src,
size_t mlen)
{
+ struct nfs_net *nn = net_generic(filp->f_dentry->d_sb->s_fs_info,
+ nfs_net_id);
+
if (mlen != sizeof (struct bl_dev_msg))
return -EINVAL;
- if (copy_from_user(&bl_mount_reply, src, mlen) != 0)
+ if (copy_from_user(&nn->bl_mount_reply, src, mlen) != 0)
return -EFAULT;
- wake_up(&bl_wq);
+ wake_up(&nn->bl_wq);
return mlen;
}
void bl_pipe_destroy_msg(struct rpc_pipe_msg *msg)
{
+ struct bl_pipe_msg *bl_pipe_msg = container_of(msg, struct bl_pipe_msg, msg);
+
if (msg->errno >= 0)
return;
- wake_up(&bl_wq);
+ wake_up(bl_pipe_msg->bl_wq);
}
/*
@@ -111,29 +114,33 @@ nfs4_blk_decode_device(struct nfs_server *server,
{
struct pnfs_block_dev *rv;
struct block_device *bd = NULL;
- struct rpc_pipe_msg msg;
+ struct bl_pipe_msg bl_pipe_msg;
+ struct rpc_pipe_msg *msg = &bl_pipe_msg.msg;
struct bl_msg_hdr bl_msg = {
.type = BL_DEVICE_MOUNT,
.totallen = dev->mincount,
};
uint8_t *dataptr;
DECLARE_WAITQUEUE(wq, current);
- struct bl_dev_msg *reply = &bl_mount_reply;
int offset, len, i, rc;
+ struct net *net = server->nfs_client->net;
+ struct nfs_net *nn = net_generic(net, nfs_net_id);
+ struct bl_dev_msg *reply = &nn->bl_mount_reply;
dprintk("%s CREATING PIPEFS MESSAGE\n", __func__);
dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data,
dev->mincount);
- memset(&msg, 0, sizeof(msg));
- msg.data = kzalloc(sizeof(bl_msg) + dev->mincount, GFP_NOFS);
- if (!msg.data) {
+ bl_pipe_msg.bl_wq = &nn->bl_wq;
+ memset(msg, 0, sizeof(*msg));
+ msg->data = kzalloc(sizeof(bl_msg) + dev->mincount, GFP_NOFS);
+ if (!msg->data) {
rv = ERR_PTR(-ENOMEM);
goto out;
}
- memcpy(msg.data, &bl_msg, sizeof(bl_msg));
- dataptr = (uint8_t *) msg.data;
+ memcpy(msg->data, &bl_msg, sizeof(bl_msg));
+ dataptr = (uint8_t *) msg->data;
len = dev->mincount;
offset = sizeof(bl_msg);
for (i = 0; len > 0; i++) {
@@ -142,13 +149,13 @@ nfs4_blk_decode_device(struct nfs_server *server,
len -= PAGE_CACHE_SIZE;
offset += PAGE_CACHE_SIZE;
}
- msg.len = sizeof(bl_msg) + dev->mincount;
+ msg->len = sizeof(bl_msg) + dev->mincount;
dprintk("%s CALLING USERSPACE DAEMON\n", __func__);
- add_wait_queue(&bl_wq, &wq);
- rc = rpc_queue_upcall(bl_device_pipe->d_inode, &msg);
+ add_wait_queue(&nn->bl_wq, &wq);
+ rc = rpc_queue_upcall(nn->bl_device_pipe, msg);
if (rc < 0) {
- remove_wait_queue(&bl_wq, &wq);
+ remove_wait_queue(&nn->bl_wq, &wq);
rv = ERR_PTR(rc);
goto out;
}
@@ -156,7 +163,7 @@ nfs4_blk_decode_device(struct nfs_server *server,
set_current_state(TASK_UNINTERRUPTIBLE);
schedule();
__set_current_state(TASK_RUNNING);
- remove_wait_queue(&bl_wq, &wq);
+ remove_wait_queue(&nn->bl_wq, &wq);
if (reply->status != BL_DEVICE_REQUEST_PROC) {
dprintk("%s failed to open device: %d\n",
@@ -181,13 +188,14 @@ nfs4_blk_decode_device(struct nfs_server *server,
rv->bm_mdev = bd;
memcpy(&rv->bm_mdevid, &dev->dev_id, sizeof(struct nfs4_deviceid));
+ rv->net = net;
dprintk("%s Created device %s with bd_block_size %u\n",
__func__,
bd->bd_disk->disk_name,
bd->bd_block_size);
out:
- kfree(msg.data);
+ kfree(msg->data);
return rv;
}
diff --git a/fs/nfs/blocklayout/blocklayoutdm.c b/fs/nfs/blocklayout/blocklayoutdm.c
index d055c755807..737d839bc17 100644
--- a/fs/nfs/blocklayout/blocklayoutdm.c
+++ b/fs/nfs/blocklayout/blocklayoutdm.c
@@ -38,9 +38,10 @@
#define NFSDBG_FACILITY NFSDBG_PNFS_LD
-static void dev_remove(dev_t dev)
+static void dev_remove(struct net *net, dev_t dev)
{
- struct rpc_pipe_msg msg;
+ struct bl_pipe_msg bl_pipe_msg;
+ struct rpc_pipe_msg *msg = &bl_pipe_msg.msg;
struct bl_dev_msg bl_umount_request;
struct bl_msg_hdr bl_msg = {
.type = BL_DEVICE_UMOUNT,
@@ -48,36 +49,38 @@ static void dev_remove(dev_t dev)
};
uint8_t *dataptr;
DECLARE_WAITQUEUE(wq, current);
+ struct nfs_net *nn = net_generic(net, nfs_net_id);
dprintk("Entering %s\n", __func__);
- memset(&msg, 0, sizeof(msg));
- msg.data = kzalloc(1 + sizeof(bl_umount_request), GFP_NOFS);
- if (!msg.data)
+ bl_pipe_msg.bl_wq = &nn->bl_wq;
+ memset(msg, 0, sizeof(*msg));
+ msg->data = kzalloc(1 + sizeof(bl_umount_request), GFP_NOFS);
+ if (!msg->data)
goto out;
memset(&bl_umount_request, 0, sizeof(bl_umount_request));
bl_umount_request.major = MAJOR(dev);
bl_umount_request.minor = MINOR(dev);
- memcpy(msg.data, &bl_msg, sizeof(bl_msg));
- dataptr = (uint8_t *) msg.data;
+ memcpy(msg->data, &bl_msg, sizeof(bl_msg));
+ dataptr = (uint8_t *) msg->data;
memcpy(&dataptr[sizeof(bl_msg)], &bl_umount_request, sizeof(bl_umount_request));
- msg.len = sizeof(bl_msg) + bl_msg.totallen;
+ msg->len = sizeof(bl_msg) + bl_msg.totallen;
- add_wait_queue(&bl_wq, &wq);
- if (rpc_queue_upcall(bl_device_pipe->d_inode, &msg) < 0) {
- remove_wait_queue(&bl_wq, &wq);
+ add_wait_queue(&nn->bl_wq, &wq);
+ if (rpc_queue_upcall(nn->bl_device_pipe, msg) < 0) {
+ remove_wait_queue(&nn->bl_wq, &wq);
goto out;
}
set_current_state(TASK_UNINTERRUPTIBLE);
schedule();
__set_current_state(TASK_RUNNING);
- remove_wait_queue(&bl_wq, &wq);
+ remove_wait_queue(&nn->bl_wq, &wq);
out:
- kfree(msg.data);
+ kfree(msg->data);
}
/*
@@ -90,10 +93,10 @@ static void nfs4_blk_metadev_release(struct pnfs_block_dev *bdev)
dprintk("%s Releasing\n", __func__);
rv = nfs4_blkdev_put(bdev->bm_mdev);
if (rv)
- printk(KERN_ERR "%s nfs4_blkdev_put returns %d\n",
+ printk(KERN_ERR "NFS: %s nfs4_blkdev_put returns %d\n",
__func__, rv);
- dev_remove(bdev->bm_mdev->bd_dev);
+ dev_remove(bdev->net, bdev->bm_mdev->bd_dev);
}
void bl_free_block_dev(struct pnfs_block_dev *bdev)
diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c
index 19fa7b0b8c0..1f9a6032796 100644
--- a/fs/nfs/blocklayout/extents.c
+++ b/fs/nfs/blocklayout/extents.c
@@ -110,13 +110,7 @@ static int _add_entry(struct my_tree *tree, u64 s, int32_t tag,
return 0;
} else {
struct pnfs_inval_tracking *new;
- if (storage)
- new = storage;
- else {
- new = kmalloc(sizeof(*new), GFP_NOFS);
- if (!new)
- return -ENOMEM;
- }
+ new = storage;
new->it_sector = s;
new->it_tags = (1 << tag);
list_add(&new->it_link, &pos->it_link);
@@ -139,11 +133,13 @@ static int _set_range(struct my_tree *tree, int32_t tag, u64 s, u64 length)
}
/* Ensure that future operations on given range of tree will not malloc */
-static int _preload_range(struct my_tree *tree, u64 offset, u64 length)
+static int _preload_range(struct pnfs_inval_markings *marks,
+ u64 offset, u64 length)
{
u64 start, end, s;
int count, i, used = 0, status = -ENOMEM;
struct pnfs_inval_tracking **storage;
+ struct my_tree *tree = &marks->im_tree;
dprintk("%s(%llu, %llu) enter\n", __func__, offset, length);
start = normalize(offset, tree->mtt_step_size);
@@ -151,7 +147,7 @@ static int _preload_range(struct my_tree *tree, u64 offset, u64 length)
count = (int)(end - start) / (int)tree->mtt_step_size;
/* Pre-malloc what memory we might need */
- storage = kmalloc(sizeof(*storage) * count, GFP_NOFS);
+ storage = kcalloc(count, sizeof(*storage), GFP_NOFS);
if (!storage)
return -ENOMEM;
for (i = 0; i < count; i++) {
@@ -161,12 +157,11 @@ static int _preload_range(struct my_tree *tree, u64 offset, u64 length)
goto out_cleanup;
}
- /* Now need lock - HOW??? */
-
+ spin_lock_bh(&marks->im_lock);
for (s = start; s < end; s += tree->mtt_step_size)
used += _add_entry(tree, s, INTERNAL_EXISTS, storage[used]);
+ spin_unlock_bh(&marks->im_lock);
- /* Unlock - HOW??? */
status = 0;
out_cleanup:
@@ -179,41 +174,14 @@ static int _preload_range(struct my_tree *tree, u64 offset, u64 length)
return status;
}
-static void set_needs_init(sector_t *array, sector_t offset)
-{
- sector_t *p = array;
-
- dprintk("%s enter\n", __func__);
- if (!p)
- return;
- while (*p < offset)
- p++;
- if (*p == offset)
- return;
- else if (*p == ~0) {
- *p++ = offset;
- *p = ~0;
- return;
- } else {
- sector_t *save = p;
- dprintk("%s Adding %llu\n", __func__, (u64)offset);
- while (*p != ~0)
- p++;
- p++;
- memmove(save + 1, save, (char *)p - (char *)save);
- *save = offset;
- return;
- }
-}
-
/* We are relying on page lock to serialize this */
int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect)
{
int rv;
- spin_lock(&marks->im_lock);
+ spin_lock_bh(&marks->im_lock);
rv = _has_tag(&marks->im_tree, isect, EXTENT_INITIALIZED);
- spin_unlock(&marks->im_lock);
+ spin_unlock_bh(&marks->im_lock);
return rv;
}
@@ -253,78 +221,39 @@ static int is_range_written(struct pnfs_inval_markings *marks,
{
int rv;
- spin_lock(&marks->im_lock);
+ spin_lock_bh(&marks->im_lock);
rv = _range_has_tag(&marks->im_tree, start, end, EXTENT_WRITTEN);
- spin_unlock(&marks->im_lock);
+ spin_unlock_bh(&marks->im_lock);
return rv;
}
/* Marks sectors in [offest, offset_length) as having been initialized.
* All lengths are step-aligned, where step is min(pagesize, blocksize).
- * Notes where partial block is initialized, and helps prepare it for
- * complete initialization later.
+ * Currently assumes offset is page-aligned
*/
-/* Currently assumes offset is page-aligned */
int bl_mark_sectors_init(struct pnfs_inval_markings *marks,
- sector_t offset, sector_t length,
- sector_t **pages)
+ sector_t offset, sector_t length)
{
- sector_t s, start, end;
- sector_t *array = NULL; /* Pages to mark */
+ sector_t start, end;
dprintk("%s(offset=%llu,len=%llu) enter\n",
__func__, (u64)offset, (u64)length);
- s = max((sector_t) 3,
- 2 * (marks->im_block_size / (PAGE_CACHE_SECTORS)));
- dprintk("%s set max=%llu\n", __func__, (u64)s);
- if (pages) {
- array = kmalloc(s * sizeof(sector_t), GFP_NOFS);
- if (!array)
- goto outerr;
- array[0] = ~0;
- }
start = normalize(offset, marks->im_block_size);
end = normalize_up(offset + length, marks->im_block_size);
- if (_preload_range(&marks->im_tree, start, end - start))
+ if (_preload_range(marks, start, end - start))
goto outerr;
- spin_lock(&marks->im_lock);
-
- for (s = normalize_up(start, PAGE_CACHE_SECTORS);
- s < offset; s += PAGE_CACHE_SECTORS) {
- dprintk("%s pre-area pages\n", __func__);
- /* Portion of used block is not initialized */
- if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED))
- set_needs_init(array, s);
- }
+ spin_lock_bh(&marks->im_lock);
if (_set_range(&marks->im_tree, EXTENT_INITIALIZED, offset, length))
goto out_unlock;
- for (s = normalize_up(offset + length, PAGE_CACHE_SECTORS);
- s < end; s += PAGE_CACHE_SECTORS) {
- dprintk("%s post-area pages\n", __func__);
- if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED))
- set_needs_init(array, s);
- }
-
- spin_unlock(&marks->im_lock);
+ spin_unlock_bh(&marks->im_lock);
- if (pages) {
- if (array[0] == ~0) {
- kfree(array);
- *pages = NULL;
- } else
- *pages = array;
- }
return 0;
- out_unlock:
- spin_unlock(&marks->im_lock);
- outerr:
- if (pages) {
- kfree(array);
- *pages = NULL;
- }
+out_unlock:
+ spin_unlock_bh(&marks->im_lock);
+outerr:
return -ENOMEM;
}
@@ -338,9 +267,9 @@ static int mark_written_sectors(struct pnfs_inval_markings *marks,
dprintk("%s(offset=%llu,len=%llu) enter\n", __func__,
(u64)offset, (u64)length);
- spin_lock(&marks->im_lock);
+ spin_lock_bh(&marks->im_lock);
status = _set_range(&marks->im_tree, EXTENT_WRITTEN, offset, length);
- spin_unlock(&marks->im_lock);
+ spin_unlock_bh(&marks->im_lock);
return status;
}
@@ -440,20 +369,18 @@ static void add_to_commitlist(struct pnfs_block_layout *bl,
/* Note the range described by offset, length is guaranteed to be contained
* within be.
+ * new will be freed, either by this function or add_to_commitlist if they
+ * decide not to use it, or after LAYOUTCOMMIT uses it in the commitlist.
*/
int bl_mark_for_commit(struct pnfs_block_extent *be,
- sector_t offset, sector_t length)
+ sector_t offset, sector_t length,
+ struct pnfs_block_short_extent *new)
{
sector_t new_end, end = offset + length;
- struct pnfs_block_short_extent *new;
struct pnfs_block_layout *bl = container_of(be->be_inval,
struct pnfs_block_layout,
bl_inval);
- new = kmalloc(sizeof(*new), GFP_NOFS);
- if (!new)
- return -ENOMEM;
-
mark_written_sectors(be->be_inval, offset, length);
/* We want to add the range to commit list, but it must be
* block-normalized, and verified that the normalized range has
@@ -483,9 +410,6 @@ int bl_mark_for_commit(struct pnfs_block_extent *be,
new->bse_mdev = be->be_mdev;
spin_lock(&bl->bl_ext_lock);
- /* new will be freed, either by add_to_commitlist if it decides not
- * to use it, or after LAYOUTCOMMIT uses it in the commitlist.
- */
add_to_commitlist(bl, new);
spin_unlock(&bl->bl_ext_lock);
return 0;
@@ -933,3 +857,53 @@ clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
}
}
}
+
+int bl_push_one_short_extent(struct pnfs_inval_markings *marks)
+{
+ struct pnfs_block_short_extent *new;
+
+ new = kmalloc(sizeof(*new), GFP_NOFS);
+ if (unlikely(!new))
+ return -ENOMEM;
+
+ spin_lock_bh(&marks->im_lock);
+ list_add(&new->bse_node, &marks->im_extents);
+ spin_unlock_bh(&marks->im_lock);
+
+ return 0;
+}
+
+struct pnfs_block_short_extent *
+bl_pop_one_short_extent(struct pnfs_inval_markings *marks)
+{
+ struct pnfs_block_short_extent *rv = NULL;
+
+ spin_lock_bh(&marks->im_lock);
+ if (!list_empty(&marks->im_extents)) {
+ rv = list_entry((&marks->im_extents)->next,
+ struct pnfs_block_short_extent, bse_node);
+ list_del_init(&rv->bse_node);
+ }
+ spin_unlock_bh(&marks->im_lock);
+
+ return rv;
+}
+
+void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free)
+{
+ struct pnfs_block_short_extent *se = NULL, *tmp;
+
+ if (num_to_free <= 0)
+ return;
+
+ spin_lock(&marks->im_lock);
+ list_for_each_entry_safe(se, tmp, &marks->im_extents, bse_node) {
+ list_del(&se->bse_node);
+ kfree(se);
+ if (--num_to_free == 0)
+ break;
+ }
+ spin_unlock(&marks->im_lock);
+
+ BUG_ON(num_to_free > 0);
+}
diff --git a/fs/nfs/cache_lib.c b/fs/nfs/cache_lib.c
index c98b439332f..dded2636811 100644
--- a/fs/nfs/cache_lib.c
+++ b/fs/nfs/cache_lib.c
@@ -13,6 +13,7 @@
#include <linux/slab.h>
#include <linux/sunrpc/cache.h>
#include <linux/sunrpc/rpc_pipe_fs.h>
+#include <net/net_namespace.h>
#include "cache_lib.h"
@@ -111,30 +112,54 @@ int nfs_cache_wait_for_upcall(struct nfs_cache_defer_req *dreq)
return 0;
}
-int nfs_cache_register(struct cache_detail *cd)
+int nfs_cache_register_sb(struct super_block *sb, struct cache_detail *cd)
{
- struct vfsmount *mnt;
- struct path path;
int ret;
+ struct dentry *dir;
- mnt = rpc_get_mount();
- if (IS_ERR(mnt))
- return PTR_ERR(mnt);
- ret = vfs_path_lookup(mnt->mnt_root, mnt, "/cache", 0, &path);
- if (ret)
- goto err;
- ret = sunrpc_cache_register_pipefs(path.dentry, cd->name, 0600, cd);
- path_put(&path);
- if (!ret)
- return ret;
-err:
- rpc_put_mount();
+ dir = rpc_d_lookup_sb(sb, "cache");
+ BUG_ON(dir == NULL);
+ ret = sunrpc_cache_register_pipefs(dir, cd->name, 0600, cd);
+ dput(dir);
return ret;
}
-void nfs_cache_unregister(struct cache_detail *cd)
+int nfs_cache_register_net(struct net *net, struct cache_detail *cd)
{
- sunrpc_cache_unregister_pipefs(cd);
- rpc_put_mount();
+ struct super_block *pipefs_sb;
+ int ret = 0;
+
+ pipefs_sb = rpc_get_sb_net(net);
+ if (pipefs_sb) {
+ ret = nfs_cache_register_sb(pipefs_sb, cd);
+ rpc_put_sb_net(net);
+ }
+ return ret;
+}
+
+void nfs_cache_unregister_sb(struct super_block *sb, struct cache_detail *cd)
+{
+ if (cd->u.pipefs.dir)
+ sunrpc_cache_unregister_pipefs(cd);
+}
+
+void nfs_cache_unregister_net(struct net *net, struct cache_detail *cd)
+{
+ struct super_block *pipefs_sb;
+
+ pipefs_sb = rpc_get_sb_net(net);
+ if (pipefs_sb) {
+ nfs_cache_unregister_sb(pipefs_sb, cd);
+ rpc_put_sb_net(net);
+ }
+}
+
+void nfs_cache_init(struct cache_detail *cd)
+{
+ sunrpc_init_cache_detail(cd);
}
+void nfs_cache_destroy(struct cache_detail *cd)
+{
+ sunrpc_destroy_cache_detail(cd);
+}
diff --git a/fs/nfs/cache_lib.h b/fs/nfs/cache_lib.h
index 7cf6cafcc00..317db95e37f 100644
--- a/fs/nfs/cache_lib.h
+++ b/fs/nfs/cache_lib.h
@@ -23,5 +23,11 @@ extern struct nfs_cache_defer_req *nfs_cache_defer_req_alloc(void);
extern void nfs_cache_defer_req_put(struct nfs_cache_defer_req *dreq);
extern int nfs_cache_wait_for_upcall(struct nfs_cache_defer_req *dreq);
-extern int nfs_cache_register(struct cache_detail *cd);
-extern void nfs_cache_unregister(struct cache_detail *cd);
+extern void nfs_cache_init(struct cache_detail *cd);
+extern void nfs_cache_destroy(struct cache_detail *cd);
+extern int nfs_cache_register_net(struct net *net, struct cache_detail *cd);
+extern void nfs_cache_unregister_net(struct net *net, struct cache_detail *cd);
+extern int nfs_cache_register_sb(struct super_block *sb,
+ struct cache_detail *cd);
+extern void nfs_cache_unregister_sb(struct super_block *sb,
+ struct cache_detail *cd);
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 516f3375e06..eb95f5091c1 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -85,7 +85,7 @@ nfs4_callback_svc(void *vrqstp)
}
if (err < 0) {
if (err != preverr) {
- printk(KERN_WARNING "%s: unexpected error "
+ printk(KERN_WARNING "NFS: %s: unexpected error "
"from svc_recv (%d)\n", __func__, err);
preverr = err;
}
@@ -101,12 +101,12 @@ nfs4_callback_svc(void *vrqstp)
/*
* Prepare to bring up the NFSv4 callback service
*/
-struct svc_rqst *
-nfs4_callback_up(struct svc_serv *serv)
+static struct svc_rqst *
+nfs4_callback_up(struct svc_serv *serv, struct rpc_xprt *xprt)
{
int ret;
- ret = svc_create_xprt(serv, "tcp", &init_net, PF_INET,
+ ret = svc_create_xprt(serv, "tcp", xprt->xprt_net, PF_INET,
nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
if (ret <= 0)
goto out_err;
@@ -114,7 +114,7 @@ nfs4_callback_up(struct svc_serv *serv)
dprintk("NFS: Callback listener port = %u (af %u)\n",
nfs_callback_tcpport, PF_INET);
- ret = svc_create_xprt(serv, "tcp", &init_net, PF_INET6,
+ ret = svc_create_xprt(serv, "tcp", xprt->xprt_net, PF_INET6,
nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
if (ret > 0) {
nfs_callback_tcpport6 = ret;
@@ -172,7 +172,7 @@ nfs41_callback_svc(void *vrqstp)
/*
* Bring up the NFSv4.1 callback service
*/
-struct svc_rqst *
+static struct svc_rqst *
nfs41_callback_up(struct svc_serv *serv, struct rpc_xprt *xprt)
{
struct svc_rqst *rqstp;
@@ -183,7 +183,7 @@ nfs41_callback_up(struct svc_serv *serv, struct rpc_xprt *xprt)
* fore channel connection.
* Returns the input port (0) and sets the svc_serv bc_xprt on success
*/
- ret = svc_create_xprt(serv, "tcp-bc", &init_net, PF_INET, 0,
+ ret = svc_create_xprt(serv, "tcp-bc", xprt->xprt_net, PF_INET, 0,
SVC_SOCK_ANONYMOUS);
if (ret < 0) {
rqstp = ERR_PTR(ret);
@@ -269,7 +269,7 @@ int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt)
serv, xprt, &rqstp, &callback_svc);
if (!minorversion_setup) {
/* v4.0 callback setup */
- rqstp = nfs4_callback_up(serv);
+ rqstp = nfs4_callback_up(serv, xprt);
callback_svc = nfs4_callback_svc;
}
@@ -332,7 +332,6 @@ void nfs_callback_down(int minorversion)
int
check_gss_callback_principal(struct nfs_client *clp, struct svc_rqst *rqstp)
{
- struct rpc_clnt *r = clp->cl_rpcclient;
char *p = svc_gss_principal(rqstp);
if (rqstp->rq_authop->flavour != RPC_AUTH_GSS)
@@ -353,7 +352,7 @@ check_gss_callback_principal(struct nfs_client *clp, struct svc_rqst *rqstp)
if (memcmp(p, "nfs@", 4) != 0)
return 0;
p += 4;
- if (strcmp(p, r->cl_server) != 0)
+ if (strcmp(p, clp->cl_hostname) != 0)
return 0;
return 1;
}
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index 07df5f1d85e..a5527c90a5a 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -38,7 +38,8 @@ enum nfs4_callback_opnum {
struct cb_process_state {
__be32 drc_status;
struct nfs_client *clp;
- int slotid;
+ u32 slotid;
+ struct net *net;
};
struct cb_compound_hdr_arg {
@@ -162,7 +163,7 @@ struct cb_layoutrecallargs {
};
};
-extern unsigned nfs4_callback_layoutrecall(
+extern __be32 nfs4_callback_layoutrecall(
struct cb_layoutrecallargs *args,
void *dummy, struct cb_process_state *cps);
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 54cea8ad5a7..1b5d809a105 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -8,6 +8,7 @@
#include <linux/nfs4.h>
#include <linux/nfs_fs.h>
#include <linux/slab.h>
+#include <linux/rcupdate.h>
#include "nfs4_fs.h"
#include "callback.h"
#include "delegation.h"
@@ -33,7 +34,7 @@ __be32 nfs4_callback_getattr(struct cb_getattrargs *args,
res->bitmap[0] = res->bitmap[1] = 0;
res->status = htonl(NFS4ERR_BADHANDLE);
- dprintk("NFS: GETATTR callback request from %s\n",
+ dprintk_rcu("NFS: GETATTR callback request from %s\n",
rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR));
inode = nfs_delegation_find_inode(cps->clp, &args->fh);
@@ -73,7 +74,7 @@ __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy,
if (!cps->clp) /* Always set for v4.0. Set in cb_sequence for v4.1 */
goto out;
- dprintk("NFS: RECALL callback request from %s\n",
+ dprintk_rcu("NFS: RECALL callback request from %s\n",
rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR));
res = htonl(NFS4ERR_BADHANDLE);
@@ -86,8 +87,7 @@ __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy,
res = 0;
break;
case -ENOENT:
- if (res != 0)
- res = htonl(NFS4ERR_BAD_STATEID);
+ res = htonl(NFS4ERR_BAD_STATEID);
break;
default:
res = htonl(NFS4ERR_RESOURCE);
@@ -98,52 +98,64 @@ out:
return res;
}
-int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid)
-{
- if (delegation == NULL || memcmp(delegation->stateid.data, stateid->data,
- sizeof(delegation->stateid.data)) != 0)
- return 0;
- return 1;
-}
-
#if defined(CONFIG_NFS_V4_1)
-static u32 initiate_file_draining(struct nfs_client *clp,
- struct cb_layoutrecallargs *args)
+/*
+ * Lookup a layout by filehandle.
+ *
+ * Note: gets a refcount on the layout hdr and on its respective inode.
+ * Caller must put the layout hdr and the inode.
+ *
+ * TODO: keep track of all layouts (and delegations) in a hash table
+ * hashed by filehandle.
+ */
+static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp, struct nfs_fh *fh)
{
struct nfs_server *server;
- struct pnfs_layout_hdr *lo;
struct inode *ino;
- bool found = false;
- u32 rv = NFS4ERR_NOMATCHING_LAYOUT;
- LIST_HEAD(free_me_list);
+ struct pnfs_layout_hdr *lo;
- spin_lock(&clp->cl_lock);
- rcu_read_lock();
list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
list_for_each_entry(lo, &server->layouts, plh_layouts) {
- if (nfs_compare_fh(&args->cbl_fh,
- &NFS_I(lo->plh_inode)->fh))
+ if (nfs_compare_fh(fh, &NFS_I(lo->plh_inode)->fh))
continue;
ino = igrab(lo->plh_inode);
if (!ino)
continue;
- found = true;
- /* Without this, layout can be freed as soon
- * as we release cl_lock.
- */
get_layout_hdr(lo);
- break;
+ return lo;
}
- if (found)
- break;
}
+
+ return NULL;
+}
+
+static struct pnfs_layout_hdr * get_layout_by_fh(struct nfs_client *clp, struct nfs_fh *fh)
+{
+ struct pnfs_layout_hdr *lo;
+
+ spin_lock(&clp->cl_lock);
+ rcu_read_lock();
+ lo = get_layout_by_fh_locked(clp, fh);
rcu_read_unlock();
spin_unlock(&clp->cl_lock);
- if (!found)
+ return lo;
+}
+
+static u32 initiate_file_draining(struct nfs_client *clp,
+ struct cb_layoutrecallargs *args)
+{
+ struct inode *ino;
+ struct pnfs_layout_hdr *lo;
+ u32 rv = NFS4ERR_NOMATCHING_LAYOUT;
+ LIST_HEAD(free_me_list);
+
+ lo = get_layout_by_fh(clp, &args->cbl_fh);
+ if (!lo)
return NFS4ERR_NOMATCHING_LAYOUT;
+ ino = lo->plh_inode;
spin_lock(&ino->i_lock);
if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
mark_matching_lsegs_invalid(lo, &free_me_list,
@@ -213,17 +225,13 @@ static u32 initiate_bulk_draining(struct nfs_client *clp,
static u32 do_callback_layoutrecall(struct nfs_client *clp,
struct cb_layoutrecallargs *args)
{
- u32 res = NFS4ERR_DELAY;
+ u32 res;
dprintk("%s enter, type=%i\n", __func__, args->cbl_recall_type);
- if (test_and_set_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state))
- goto out;
if (args->cbl_recall_type == RETURN_FILE)
res = initiate_file_draining(clp, args);
else
res = initiate_bulk_draining(clp, args);
- clear_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state);
-out:
dprintk("%s returning %i\n", __func__, res);
return res;
@@ -303,21 +311,6 @@ out:
return res;
}
-int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid)
-{
- if (delegation == NULL)
- return 0;
-
- if (stateid->stateid.seqid != 0)
- return 0;
- if (memcmp(&delegation->stateid.stateid.other,
- &stateid->stateid.other,
- NFS4_STATEID_OTHER_SIZE))
- return 0;
-
- return 1;
-}
-
/*
* Validate the sequenceID sent by the server.
* Return success if the sequenceID is one more than what we last saw on
@@ -441,7 +434,7 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
int i;
__be32 status = htonl(NFS4ERR_BADSESSION);
- clp = nfs4_find_client_sessionid(args->csa_addr, &args->csa_sessionid);
+ clp = nfs4_find_client_sessionid(cps->net, args->csa_addr, &args->csa_sessionid);
if (clp == NULL)
goto out;
@@ -517,7 +510,7 @@ __be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy,
if (!cps->clp) /* set in cb_sequence */
goto out;
- dprintk("NFS: RECALL_ANY callback request from %s\n",
+ dprintk_rcu("NFS: RECALL_ANY callback request from %s\n",
rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR));
status = cpu_to_be32(NFS4ERR_INVAL);
@@ -552,7 +545,7 @@ __be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy,
if (!cps->clp) /* set in cb_sequence */
goto out;
- dprintk("NFS: CB_RECALL_SLOT request from %s target max slots %d\n",
+ dprintk_rcu("NFS: CB_RECALL_SLOT request from %s target max slots %d\n",
rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR),
args->crsa_target_max_slots);
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 726e59a9e50..95bfc243992 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -9,6 +9,8 @@
#include <linux/sunrpc/svc.h>
#include <linux/nfs4.h>
#include <linux/nfs_fs.h>
+#include <linux/ratelimit.h>
+#include <linux/printk.h>
#include <linux/slab.h>
#include <linux/sunrpc/bc_xprt.h>
#include "nfs4_fs.h"
@@ -73,7 +75,7 @@ static __be32 *read_buf(struct xdr_stream *xdr, int nbytes)
p = xdr_inline_decode(xdr, nbytes);
if (unlikely(p == NULL))
- printk(KERN_WARNING "NFSv4 callback reply buffer overflowed!\n");
+ printk(KERN_WARNING "NFS: NFSv4 callback reply buffer overflowed!\n");
return p;
}
@@ -138,10 +140,10 @@ static __be32 decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
{
__be32 *p;
- p = read_buf(xdr, 16);
+ p = read_buf(xdr, NFS4_STATEID_SIZE);
if (unlikely(p == NULL))
return htonl(NFS4ERR_RESOURCE);
- memcpy(stateid->data, p, 16);
+ memcpy(stateid, p, NFS4_STATEID_SIZE);
return 0;
}
@@ -155,7 +157,7 @@ static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound
return status;
/* We do not like overly long tags! */
if (hdr->taglen > CB_OP_TAGLEN_MAXSZ - 12) {
- printk("NFSv4 CALLBACK %s: client sent tag of length %u\n",
+ printk("NFS: NFSv4 CALLBACK %s: client sent tag of length %u\n",
__func__, hdr->taglen);
return htonl(NFS4ERR_RESOURCE);
}
@@ -167,7 +169,7 @@ static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound
if (hdr->minorversion <= 1) {
hdr->cb_ident = ntohl(*p++); /* ignored by v4.1 */
} else {
- printk(KERN_WARNING "%s: NFSv4 server callback with "
+ pr_warn_ratelimited("NFS: %s: NFSv4 server callback with "
"illegal minor version %u!\n",
__func__, hdr->minorversion);
return htonl(NFS4ERR_MINOR_VERS_MISMATCH);
@@ -305,6 +307,10 @@ __be32 decode_devicenotify_args(struct svc_rqst *rqstp,
n = ntohl(*p++);
if (n <= 0)
goto out;
+ if (n > ULONG_MAX / sizeof(*args->devs)) {
+ status = htonl(NFS4ERR_BADXDR);
+ goto out;
+ }
args->devs = kmalloc(n * sizeof(*args->devs), GFP_KERNEL);
if (!args->devs) {
@@ -755,14 +761,14 @@ static void nfs4_callback_free_slot(struct nfs4_session *session)
* Let the state manager know callback processing done.
* A single slot, so highest used slotid is either 0 or -1
*/
- tbl->highest_used_slotid = -1;
+ tbl->highest_used_slotid = NFS4_NO_SLOT;
nfs4_check_drain_bc_complete(session);
spin_unlock(&tbl->slot_tbl_lock);
}
static void nfs4_cb_free_slot(struct cb_process_state *cps)
{
- if (cps->slotid != -1)
+ if (cps->slotid != NFS4_NO_SLOT)
nfs4_callback_free_slot(cps->clp->cl_session);
}
@@ -856,7 +862,8 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
struct cb_process_state cps = {
.drc_status = 0,
.clp = NULL,
- .slotid = -1,
+ .slotid = NFS4_NO_SLOT,
+ .net = rqstp->rq_xprt->xpt_net,
};
unsigned int nops = 0;
@@ -872,7 +879,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
return rpc_garbage_args;
if (hdr_arg.minorversion == 0) {
- cps.clp = nfs4_find_client_ident(hdr_arg.cb_ident);
+ cps.clp = nfs4_find_client_ident(rqstp->rq_xprt->xpt_net, hdr_arg.cb_ident);
if (!cps.clp || !check_gss_callback_principal(cps.clp, rqstp))
return rpc_drop_reply;
}
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 277dfaf2e99..da7b5e4ff9e 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -36,11 +36,13 @@
#include <linux/inet.h>
#include <linux/in6.h>
#include <linux/slab.h>
+#include <linux/idr.h>
#include <net/ipv6.h>
#include <linux/nfs_xdr.h>
#include <linux/sunrpc/bc_xprt.h>
+#include <linux/nsproxy.h>
+#include <linux/pid_namespace.h>
-#include <asm/system.h>
#include "nfs4_fs.h"
#include "callback.h"
@@ -49,15 +51,12 @@
#include "internal.h"
#include "fscache.h"
#include "pnfs.h"
+#include "netns.h"
#define NFSDBG_FACILITY NFSDBG_CLIENT
-static DEFINE_SPINLOCK(nfs_client_lock);
-static LIST_HEAD(nfs_client_list);
-static LIST_HEAD(nfs_volume_list);
static DECLARE_WAIT_QUEUE_HEAD(nfs_client_active_wq);
#ifdef CONFIG_NFS_V4
-static DEFINE_IDR(cb_ident_idr); /* Protected by nfs_client_lock */
/*
* Get a unique NFSv4.0 callback identifier which will be used
@@ -66,15 +65,16 @@ static DEFINE_IDR(cb_ident_idr); /* Protected by nfs_client_lock */
static int nfs_get_cb_ident_idr(struct nfs_client *clp, int minorversion)
{
int ret = 0;
+ struct nfs_net *nn = net_generic(clp->net, nfs_net_id);
if (clp->rpc_ops->version != 4 || minorversion != 0)
return ret;
retry:
- if (!idr_pre_get(&cb_ident_idr, GFP_KERNEL))
+ if (!idr_pre_get(&nn->cb_ident_idr, GFP_KERNEL))
return -ENOMEM;
- spin_lock(&nfs_client_lock);
- ret = idr_get_new(&cb_ident_idr, clp, &clp->cl_cb_ident);
- spin_unlock(&nfs_client_lock);
+ spin_lock(&nn->nfs_client_lock);
+ ret = idr_get_new(&nn->cb_ident_idr, clp, &clp->cl_cb_ident);
+ spin_unlock(&nn->nfs_client_lock);
if (ret == -EAGAIN)
goto retry;
return ret;
@@ -84,12 +84,12 @@ retry:
/*
* Turn off NFSv4 uid/gid mapping when using AUTH_SYS
*/
-static int nfs4_disable_idmapping = 1;
+static bool nfs4_disable_idmapping = true;
/*
* RPC cruft for NFS
*/
-static struct rpc_version *nfs_version[5] = {
+static const struct rpc_version *nfs_version[5] = {
[2] = &nfs_version2,
#ifdef CONFIG_NFS_V3
[3] = &nfs_version3,
@@ -99,7 +99,7 @@ static struct rpc_version *nfs_version[5] = {
#endif
};
-struct rpc_program nfs_program = {
+const struct rpc_program nfs_program = {
.name = "nfs",
.number = NFS_PROGRAM,
.nrvers = ARRAY_SIZE(nfs_version),
@@ -115,11 +115,11 @@ struct rpc_stat nfs_rpcstat = {
#ifdef CONFIG_NFS_V3_ACL
static struct rpc_stat nfsacl_rpcstat = { &nfsacl_program };
-static struct rpc_version * nfsacl_version[] = {
+static const struct rpc_version *nfsacl_version[] = {
[3] = &nfsacl_version3,
};
-struct rpc_program nfsacl_program = {
+const struct rpc_program nfsacl_program = {
.name = "nfsacl",
.number = NFS_ACL_PROGRAM,
.nrvers = ARRAY_SIZE(nfsacl_version),
@@ -135,6 +135,7 @@ struct nfs_client_initdata {
const struct nfs_rpc_ops *rpc_ops;
int proto;
u32 minorversion;
+ struct net *net;
};
/*
@@ -171,6 +172,7 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
clp->cl_rpcclient = ERR_PTR(-EINVAL);
clp->cl_proto = cl_init->proto;
+ clp->net = get_net(cl_init->net);
#ifdef CONFIG_NFS_V4
err = nfs_get_cb_ident_idr(clp, cl_init->minorversion);
@@ -202,8 +204,11 @@ error_0:
#ifdef CONFIG_NFS_V4_1
static void nfs4_shutdown_session(struct nfs_client *clp)
{
- if (nfs4_has_session(clp))
+ if (nfs4_has_session(clp)) {
+ nfs4_deviceid_purge_client(clp);
nfs4_destroy_session(clp->cl_session);
+ }
+
}
#else /* CONFIG_NFS_V4_1 */
static void nfs4_shutdown_session(struct nfs_client *clp)
@@ -233,16 +238,20 @@ static void nfs4_shutdown_client(struct nfs_client *clp)
}
/* idr_remove_all is not needed as all id's are removed by nfs_put_client */
-void nfs_cleanup_cb_ident_idr(void)
+void nfs_cleanup_cb_ident_idr(struct net *net)
{
- idr_destroy(&cb_ident_idr);
+ struct nfs_net *nn = net_generic(net, nfs_net_id);
+
+ idr_destroy(&nn->cb_ident_idr);
}
/* nfs_client_lock held */
static void nfs_cb_idr_remove_locked(struct nfs_client *clp)
{
+ struct nfs_net *nn = net_generic(clp->net, nfs_net_id);
+
if (clp->cl_cb_ident)
- idr_remove(&cb_ident_idr, clp->cl_cb_ident);
+ idr_remove(&nn->cb_ident_idr, clp->cl_cb_ident);
}
static void pnfs_init_server(struct nfs_server *server)
@@ -260,7 +269,7 @@ static void nfs4_shutdown_client(struct nfs_client *clp)
{
}
-void nfs_cleanup_cb_ident_idr(void)
+void nfs_cleanup_cb_ident_idr(struct net *net)
{
}
@@ -292,10 +301,10 @@ static void nfs_free_client(struct nfs_client *clp)
if (clp->cl_machine_cred != NULL)
put_rpccred(clp->cl_machine_cred);
- nfs4_deviceid_purge_client(clp);
-
+ put_net(clp->net);
kfree(clp->cl_hostname);
kfree(clp->server_scope);
+ kfree(clp->impl_id);
kfree(clp);
dprintk("<-- nfs_free_client()\n");
@@ -306,15 +315,18 @@ static void nfs_free_client(struct nfs_client *clp)
*/
void nfs_put_client(struct nfs_client *clp)
{
+ struct nfs_net *nn;
+
if (!clp)
return;
dprintk("--> nfs_put_client({%d})\n", atomic_read(&clp->cl_count));
+ nn = net_generic(clp->net, nfs_net_id);
- if (atomic_dec_and_lock(&clp->cl_count, &nfs_client_lock)) {
+ if (atomic_dec_and_lock(&clp->cl_count, &nn->nfs_client_lock)) {
list_del(&clp->cl_share_link);
nfs_cb_idr_remove_locked(clp);
- spin_unlock(&nfs_client_lock);
+ spin_unlock(&nn->nfs_client_lock);
BUG_ON(!list_empty(&clp->cl_superblocks));
@@ -392,6 +404,7 @@ static int nfs_sockaddr_cmp_ip4(const struct sockaddr *sa1,
(sin1->sin_port == sin2->sin_port);
}
+#if defined(CONFIG_NFS_V4_1)
/*
* Test if two socket addresses represent the same actual socket,
* by comparing (only) relevant fields, excluding the port number.
@@ -410,6 +423,7 @@ static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
}
return 0;
}
+#endif /* CONFIG_NFS_V4_1 */
/*
* Test if two socket addresses represent the same actual socket,
@@ -430,10 +444,10 @@ static int nfs_sockaddr_cmp(const struct sockaddr *sa1,
return 0;
}
+#if defined(CONFIG_NFS_V4_1)
/* Common match routine for v4.0 and v4.1 callback services */
-bool
-nfs4_cb_match_client(const struct sockaddr *addr, struct nfs_client *clp,
- u32 minorversion)
+static bool nfs4_cb_match_client(const struct sockaddr *addr,
+ struct nfs_client *clp, u32 minorversion)
{
struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr;
@@ -453,6 +467,7 @@ nfs4_cb_match_client(const struct sockaddr *addr, struct nfs_client *clp,
return true;
}
+#endif /* CONFIG_NFS_V4_1 */
/*
* Find an nfs_client on the list that matches the initialisation data
@@ -462,8 +477,9 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat
{
struct nfs_client *clp;
const struct sockaddr *sap = data->addr;
+ struct nfs_net *nn = net_generic(data->net, nfs_net_id);
- list_for_each_entry(clp, &nfs_client_list, cl_share_link) {
+ list_for_each_entry(clp, &nn->nfs_client_list, cl_share_link) {
const struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr;
/* Don't match clients that failed to initialise properly */
if (clp->cl_cons_state < 0)
@@ -501,13 +517,14 @@ nfs_get_client(const struct nfs_client_initdata *cl_init,
{
struct nfs_client *clp, *new = NULL;
int error;
+ struct nfs_net *nn = net_generic(cl_init->net, nfs_net_id);
dprintk("--> nfs_get_client(%s,v%u)\n",
cl_init->hostname ?: "", cl_init->rpc_ops->version);
/* see if the client already exists */
do {
- spin_lock(&nfs_client_lock);
+ spin_lock(&nn->nfs_client_lock);
clp = nfs_match_client(cl_init);
if (clp)
@@ -515,7 +532,7 @@ nfs_get_client(const struct nfs_client_initdata *cl_init,
if (new)
goto install_client;
- spin_unlock(&nfs_client_lock);
+ spin_unlock(&nn->nfs_client_lock);
new = nfs_alloc_client(cl_init);
} while (!IS_ERR(new));
@@ -526,8 +543,8 @@ nfs_get_client(const struct nfs_client_initdata *cl_init,
/* install a new client and return with it unready */
install_client:
clp = new;
- list_add(&clp->cl_share_link, &nfs_client_list);
- spin_unlock(&nfs_client_lock);
+ list_add(&clp->cl_share_link, &nn->nfs_client_list);
+ spin_unlock(&nn->nfs_client_lock);
error = cl_init->rpc_ops->init_client(clp, timeparms, ip_addr,
authflavour, noresvport);
@@ -542,7 +559,7 @@ install_client:
* - make sure it's ready before returning
*/
found_client:
- spin_unlock(&nfs_client_lock);
+ spin_unlock(&nn->nfs_client_lock);
if (new)
nfs_free_client(new);
@@ -642,7 +659,7 @@ static int nfs_create_rpc_client(struct nfs_client *clp,
{
struct rpc_clnt *clnt = NULL;
struct rpc_create_args args = {
- .net = &init_net,
+ .net = clp->net,
.protocol = clp->cl_proto,
.address = (struct sockaddr *)&clp->cl_addr,
.addrsize = clp->cl_addrlen,
@@ -696,6 +713,7 @@ static int nfs_start_lockd(struct nfs_server *server)
.nfs_version = clp->rpc_ops->version,
.noresvport = server->flags & NFS_MOUNT_NORESVPORT ?
1 : 0,
+ .net = clp->net,
};
if (nlm_init.nfs_version > 3)
@@ -831,6 +849,7 @@ static int nfs_init_server(struct nfs_server *server,
.addrlen = data->nfs_server.addrlen,
.rpc_ops = &nfs_v2_clientops,
.proto = data->nfs_server.protocol,
+ .net = data->net,
};
struct rpc_timeout timeparms;
struct nfs_client *clp;
@@ -1029,25 +1048,30 @@ static void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_serve
static void nfs_server_insert_lists(struct nfs_server *server)
{
struct nfs_client *clp = server->nfs_client;
+ struct nfs_net *nn = net_generic(clp->net, nfs_net_id);
- spin_lock(&nfs_client_lock);
+ spin_lock(&nn->nfs_client_lock);
list_add_tail_rcu(&server->client_link, &clp->cl_superblocks);
- list_add_tail(&server->master_link, &nfs_volume_list);
+ list_add_tail(&server->master_link, &nn->nfs_volume_list);
clear_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state);
- spin_unlock(&nfs_client_lock);
+ spin_unlock(&nn->nfs_client_lock);
}
static void nfs_server_remove_lists(struct nfs_server *server)
{
struct nfs_client *clp = server->nfs_client;
+ struct nfs_net *nn;
- spin_lock(&nfs_client_lock);
+ if (clp == NULL)
+ return;
+ nn = net_generic(clp->net, nfs_net_id);
+ spin_lock(&nn->nfs_client_lock);
list_del_rcu(&server->client_link);
- if (clp && list_empty(&clp->cl_superblocks))
+ if (list_empty(&clp->cl_superblocks))
set_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state);
list_del(&server->master_link);
- spin_unlock(&nfs_client_lock);
+ spin_unlock(&nn->nfs_client_lock);
synchronize_rcu();
}
@@ -1086,6 +1110,8 @@ static struct nfs_server *nfs_alloc_server(void)
return NULL;
}
+ ida_init(&server->openowner_id);
+ ida_init(&server->lockowner_id);
pnfs_init_server(server);
return server;
@@ -1111,6 +1137,8 @@ void nfs_free_server(struct nfs_server *server)
nfs_put_client(server->nfs_client);
+ ida_destroy(&server->lockowner_id);
+ ida_destroy(&server->openowner_id);
nfs_free_iostats(server->io_stats);
bdi_destroy(&server->backing_dev_info);
kfree(server);
@@ -1189,45 +1217,19 @@ error:
/*
* NFSv4.0 callback thread helper
*
- * Find a client by IP address, protocol version, and minorversion
- *
- * Called from the pg_authenticate method. The callback identifier
- * is not used as it has not been decoded.
- *
- * Returns NULL if no such client
- */
-struct nfs_client *
-nfs4_find_client_no_ident(const struct sockaddr *addr)
-{
- struct nfs_client *clp;
-
- spin_lock(&nfs_client_lock);
- list_for_each_entry(clp, &nfs_client_list, cl_share_link) {
- if (nfs4_cb_match_client(addr, clp, 0) == false)
- continue;
- atomic_inc(&clp->cl_count);
- spin_unlock(&nfs_client_lock);
- return clp;
- }
- spin_unlock(&nfs_client_lock);
- return NULL;
-}
-
-/*
- * NFSv4.0 callback thread helper
- *
* Find a client by callback identifier
*/
struct nfs_client *
-nfs4_find_client_ident(int cb_ident)
+nfs4_find_client_ident(struct net *net, int cb_ident)
{
struct nfs_client *clp;
+ struct nfs_net *nn = net_generic(net, nfs_net_id);
- spin_lock(&nfs_client_lock);
- clp = idr_find(&cb_ident_idr, cb_ident);
+ spin_lock(&nn->nfs_client_lock);
+ clp = idr_find(&nn->cb_ident_idr, cb_ident);
if (clp)
atomic_inc(&clp->cl_count);
- spin_unlock(&nfs_client_lock);
+ spin_unlock(&nn->nfs_client_lock);
return clp;
}
@@ -1240,13 +1242,14 @@ nfs4_find_client_ident(int cb_ident)
* Returns NULL if no such client
*/
struct nfs_client *
-nfs4_find_client_sessionid(const struct sockaddr *addr,
+nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr,
struct nfs4_sessionid *sid)
{
struct nfs_client *clp;
+ struct nfs_net *nn = net_generic(net, nfs_net_id);
- spin_lock(&nfs_client_lock);
- list_for_each_entry(clp, &nfs_client_list, cl_share_link) {
+ spin_lock(&nn->nfs_client_lock);
+ list_for_each_entry(clp, &nn->nfs_client_list, cl_share_link) {
if (nfs4_cb_match_client(addr, clp, 1) == false)
continue;
@@ -1259,17 +1262,17 @@ nfs4_find_client_sessionid(const struct sockaddr *addr,
continue;
atomic_inc(&clp->cl_count);
- spin_unlock(&nfs_client_lock);
+ spin_unlock(&nn->nfs_client_lock);
return clp;
}
- spin_unlock(&nfs_client_lock);
+ spin_unlock(&nn->nfs_client_lock);
return NULL;
}
#else /* CONFIG_NFS_V4_1 */
struct nfs_client *
-nfs4_find_client_sessionid(const struct sockaddr *addr,
+nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr,
struct nfs4_sessionid *sid)
{
return NULL;
@@ -1284,16 +1287,18 @@ static int nfs4_init_callback(struct nfs_client *clp)
int error;
if (clp->rpc_ops->version == 4) {
+ struct rpc_xprt *xprt;
+
+ xprt = rcu_dereference_raw(clp->cl_rpcclient->cl_xprt);
+
if (nfs4_has_session(clp)) {
- error = xprt_setup_backchannel(
- clp->cl_rpcclient->cl_xprt,
+ error = xprt_setup_backchannel(xprt,
NFS41_BC_MIN_CALLBACKS);
if (error < 0)
return error;
}
- error = nfs_callback_up(clp->cl_mvops->minor_version,
- clp->cl_rpcclient->cl_xprt);
+ error = nfs_callback_up(clp->cl_mvops->minor_version, xprt);
if (error < 0) {
dprintk("%s: failed to start callback. Error = %d\n",
__func__, error);
@@ -1344,6 +1349,7 @@ int nfs4_init_client(struct nfs_client *clp,
rpc_authflavor_t authflavour,
int noresvport)
{
+ char buf[INET6_ADDRSTRLEN + 1];
int error;
if (clp->cl_cons_state == NFS_CS_READY) {
@@ -1359,6 +1365,20 @@ int nfs4_init_client(struct nfs_client *clp,
1, noresvport);
if (error < 0)
goto error;
+
+ /* If no clientaddr= option was specified, find a usable cb address */
+ if (ip_addr == NULL) {
+ struct sockaddr_storage cb_addr;
+ struct sockaddr *sap = (struct sockaddr *)&cb_addr;
+
+ error = rpc_localaddr(clp->cl_rpcclient, sap, sizeof(cb_addr));
+ if (error < 0)
+ goto error;
+ error = rpc_ntop(sap, buf, sizeof(buf));
+ if (error < 0)
+ goto error;
+ ip_addr = (const char *)buf;
+ }
strlcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr));
error = nfs_idmap_new(clp);
@@ -1393,7 +1413,7 @@ static int nfs4_set_client(struct nfs_server *server,
const char *ip_addr,
rpc_authflavor_t authflavour,
int proto, const struct rpc_timeout *timeparms,
- u32 minorversion)
+ u32 minorversion, struct net *net)
{
struct nfs_client_initdata cl_init = {
.hostname = hostname,
@@ -1402,6 +1422,7 @@ static int nfs4_set_client(struct nfs_server *server,
.rpc_ops = &nfs_v4_clientops,
.proto = proto,
.minorversion = minorversion,
+ .net = net,
};
struct nfs_client *clp;
int error;
@@ -1453,6 +1474,7 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
.rpc_ops = &nfs_v4_clientops,
.proto = ds_proto,
.minorversion = mds_clp->cl_minorversion,
+ .net = mds_clp->net,
};
struct rpc_timeout ds_timeout = {
.to_initval = 15 * HZ,
@@ -1580,7 +1602,8 @@ static int nfs4_init_server(struct nfs_server *server,
data->auth_flavors[0],
data->nfs_server.protocol,
&timeparms,
- data->minorversion);
+ data->minorversion,
+ data->net);
if (error < 0)
goto error;
@@ -1675,9 +1698,10 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
data->addrlen,
parent_client->cl_ipaddr,
data->authflavor,
- parent_server->client->cl_xprt->prot,
+ rpc_protocol(parent_server->client),
parent_server->client->cl_timeout,
- parent_client->cl_mvops->minor_version);
+ parent_client->cl_mvops->minor_version,
+ parent_client->net);
if (error < 0)
goto error;
@@ -1770,6 +1794,18 @@ out_free_server:
return ERR_PTR(error);
}
+void nfs_clients_init(struct net *net)
+{
+ struct nfs_net *nn = net_generic(net, nfs_net_id);
+
+ INIT_LIST_HEAD(&nn->nfs_client_list);
+ INIT_LIST_HEAD(&nn->nfs_volume_list);
+#ifdef CONFIG_NFS_V4
+ idr_init(&nn->cb_ident_idr);
+#endif
+ spin_lock_init(&nn->nfs_client_lock);
+}
+
#ifdef CONFIG_PROC_FS
static struct proc_dir_entry *proc_fs_nfs;
@@ -1823,13 +1859,15 @@ static int nfs_server_list_open(struct inode *inode, struct file *file)
{
struct seq_file *m;
int ret;
+ struct pid_namespace *pid_ns = file->f_dentry->d_sb->s_fs_info;
+ struct net *net = pid_ns->child_reaper->nsproxy->net_ns;
ret = seq_open(file, &nfs_server_list_ops);
if (ret < 0)
return ret;
m = file->private_data;
- m->private = PDE(inode)->data;
+ m->private = net;
return 0;
}
@@ -1839,9 +1877,11 @@ static int nfs_server_list_open(struct inode *inode, struct file *file)
*/
static void *nfs_server_list_start(struct seq_file *m, loff_t *_pos)
{
+ struct nfs_net *nn = net_generic(m->private, nfs_net_id);
+
/* lock the list against modification */
- spin_lock(&nfs_client_lock);
- return seq_list_start_head(&nfs_client_list, *_pos);
+ spin_lock(&nn->nfs_client_lock);
+ return seq_list_start_head(&nn->nfs_client_list, *_pos);
}
/*
@@ -1849,7 +1889,9 @@ static void *nfs_server_list_start(struct seq_file *m, loff_t *_pos)
*/
static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos)
{
- return seq_list_next(v, &nfs_client_list, pos);
+ struct nfs_net *nn = net_generic(p->private, nfs_net_id);
+
+ return seq_list_next(v, &nn->nfs_client_list, pos);
}
/*
@@ -1857,7 +1899,9 @@ static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos)
*/
static void nfs_server_list_stop(struct seq_file *p, void *v)
{
- spin_unlock(&nfs_client_lock);
+ struct nfs_net *nn = net_generic(p->private, nfs_net_id);
+
+ spin_unlock(&nn->nfs_client_lock);
}
/*
@@ -1866,9 +1910,10 @@ static void nfs_server_list_stop(struct seq_file *p, void *v)
static int nfs_server_list_show(struct seq_file *m, void *v)
{
struct nfs_client *clp;
+ struct nfs_net *nn = net_generic(m->private, nfs_net_id);
/* display header on line 1 */
- if (v == &nfs_client_list) {
+ if (v == &nn->nfs_client_list) {
seq_puts(m, "NV SERVER PORT USE HOSTNAME\n");
return 0;
}
@@ -1880,12 +1925,14 @@ static int nfs_server_list_show(struct seq_file *m, void *v)
if (clp->cl_cons_state != NFS_CS_READY)
return 0;
+ rcu_read_lock();
seq_printf(m, "v%u %s %s %3d %s\n",
clp->rpc_ops->version,
rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR),
rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT),
atomic_read(&clp->cl_count),
clp->cl_hostname);
+ rcu_read_unlock();
return 0;
}
@@ -1897,13 +1944,15 @@ static int nfs_volume_list_open(struct inode *inode, struct file *file)
{
struct seq_file *m;
int ret;
+ struct pid_namespace *pid_ns = file->f_dentry->d_sb->s_fs_info;
+ struct net *net = pid_ns->child_reaper->nsproxy->net_ns;
ret = seq_open(file, &nfs_volume_list_ops);
if (ret < 0)
return ret;
m = file->private_data;
- m->private = PDE(inode)->data;
+ m->private = net;
return 0;
}
@@ -1913,9 +1962,11 @@ static int nfs_volume_list_open(struct inode *inode, struct file *file)
*/
static void *nfs_volume_list_start(struct seq_file *m, loff_t *_pos)
{
+ struct nfs_net *nn = net_generic(m->private, nfs_net_id);
+
/* lock the list against modification */
- spin_lock(&nfs_client_lock);
- return seq_list_start_head(&nfs_volume_list, *_pos);
+ spin_lock(&nn->nfs_client_lock);
+ return seq_list_start_head(&nn->nfs_volume_list, *_pos);
}
/*
@@ -1923,7 +1974,9 @@ static void *nfs_volume_list_start(struct seq_file *m, loff_t *_pos)
*/
static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos)
{
- return seq_list_next(v, &nfs_volume_list, pos);
+ struct nfs_net *nn = net_generic(p->private, nfs_net_id);
+
+ return seq_list_next(v, &nn->nfs_volume_list, pos);
}
/*
@@ -1931,7 +1984,9 @@ static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos)
*/
static void nfs_volume_list_stop(struct seq_file *p, void *v)
{
- spin_unlock(&nfs_client_lock);
+ struct nfs_net *nn = net_generic(p->private, nfs_net_id);
+
+ spin_unlock(&nn->nfs_client_lock);
}
/*
@@ -1942,9 +1997,10 @@ static int nfs_volume_list_show(struct seq_file *m, void *v)
struct nfs_server *server;
struct nfs_client *clp;
char dev[8], fsid[17];
+ struct nfs_net *nn = net_generic(m->private, nfs_net_id);
/* display header on line 1 */
- if (v == &nfs_volume_list) {
+ if (v == &nn->nfs_volume_list) {
seq_puts(m, "NV SERVER PORT DEV FSID FSC\n");
return 0;
}
@@ -1959,6 +2015,7 @@ static int nfs_volume_list_show(struct seq_file *m, void *v)
(unsigned long long) server->fsid.major,
(unsigned long long) server->fsid.minor);
+ rcu_read_lock();
seq_printf(m, "v%u %s %s %-7s %-17s %s\n",
clp->rpc_ops->version,
rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR),
@@ -1966,6 +2023,7 @@ static int nfs_volume_list_show(struct seq_file *m, void *v)
dev,
fsid,
nfs_server_fscache_state(server));
+ rcu_read_unlock();
return 0;
}
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 7f265406980..89af1d26927 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -105,7 +105,7 @@ again:
continue;
if (!test_bit(NFS_DELEGATED_STATE, &state->flags))
continue;
- if (memcmp(state->stateid.data, stateid->data, sizeof(state->stateid.data)) != 0)
+ if (!nfs4_stateid_match(&state->stateid, stateid))
continue;
get_nfs_open_context(ctx);
spin_unlock(&inode->i_lock);
@@ -139,8 +139,7 @@ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred,
if (delegation != NULL) {
spin_lock(&delegation->lock);
if (delegation->inode != NULL) {
- memcpy(delegation->stateid.data, res->delegation.data,
- sizeof(delegation->stateid.data));
+ nfs4_stateid_copy(&delegation->stateid, &res->delegation);
delegation->type = res->delegation_type;
delegation->maxsize = res->maxsize;
oldcred = delegation->cred;
@@ -236,8 +235,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
delegation = kmalloc(sizeof(*delegation), GFP_NOFS);
if (delegation == NULL)
return -ENOMEM;
- memcpy(delegation->stateid.data, res->delegation.data,
- sizeof(delegation->stateid.data));
+ nfs4_stateid_copy(&delegation->stateid, &res->delegation);
delegation->type = res->delegation_type;
delegation->maxsize = res->maxsize;
delegation->change_attr = inode->i_version;
@@ -250,19 +248,22 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
old_delegation = rcu_dereference_protected(nfsi->delegation,
lockdep_is_held(&clp->cl_lock));
if (old_delegation != NULL) {
- if (memcmp(&delegation->stateid, &old_delegation->stateid,
- sizeof(old_delegation->stateid)) == 0 &&
+ if (nfs4_stateid_match(&delegation->stateid,
+ &old_delegation->stateid) &&
delegation->type == old_delegation->type) {
goto out;
}
/*
* Deal with broken servers that hand out two
* delegations for the same file.
+ * Allow for upgrades to a WRITE delegation, but
+ * nothing else.
*/
dfprintk(FILE, "%s: server %s handed out "
"a duplicate delegation!\n",
__func__, clp->cl_hostname);
- if (delegation->type <= old_delegation->type) {
+ if (delegation->type == old_delegation->type ||
+ !(delegation->type & FMODE_WRITE)) {
freeme = delegation;
delegation = NULL;
goto out;
@@ -455,17 +456,24 @@ static void nfs_client_mark_return_all_delegation_types(struct nfs_client *clp,
rcu_read_unlock();
}
-static void nfs_client_mark_return_all_delegations(struct nfs_client *clp)
-{
- nfs_client_mark_return_all_delegation_types(clp, FMODE_READ|FMODE_WRITE);
-}
-
static void nfs_delegation_run_state_manager(struct nfs_client *clp)
{
if (test_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state))
nfs4_schedule_state_manager(clp);
}
+void nfs_remove_bad_delegation(struct inode *inode)
+{
+ struct nfs_delegation *delegation;
+
+ delegation = nfs_detach_delegation(NFS_I(inode), NFS_SERVER(inode));
+ if (delegation) {
+ nfs_inode_find_state_and_recover(inode, &delegation->stateid);
+ nfs_free_delegation(delegation);
+ }
+}
+EXPORT_SYMBOL_GPL(nfs_remove_bad_delegation);
+
/**
* nfs_expire_all_delegation_types
* @clp: client to process
@@ -488,18 +496,6 @@ void nfs_expire_all_delegations(struct nfs_client *clp)
nfs_expire_all_delegation_types(clp, FMODE_READ|FMODE_WRITE);
}
-/**
- * nfs_handle_cb_pathdown - return all delegations after NFS4ERR_CB_PATH_DOWN
- * @clp: client to process
- *
- */
-void nfs_handle_cb_pathdown(struct nfs_client *clp)
-{
- if (clp == NULL)
- return;
- nfs_client_mark_return_all_delegations(clp);
-}
-
static void nfs_mark_return_unreferenced_delegations(struct nfs_server *server)
{
struct nfs_delegation *delegation;
@@ -531,7 +527,7 @@ void nfs_expire_unreferenced_delegations(struct nfs_client *clp)
/**
* nfs_async_inode_return_delegation - asynchronously return a delegation
* @inode: inode to process
- * @stateid: state ID information from CB_RECALL arguments
+ * @stateid: state ID information
*
* Returns zero on success, or a negative errno value.
*/
@@ -545,7 +541,7 @@ int nfs_async_inode_return_delegation(struct inode *inode,
rcu_read_lock();
delegation = rcu_dereference(NFS_I(inode)->delegation);
- if (!clp->cl_mvops->validate_stateid(delegation, stateid)) {
+ if (!clp->cl_mvops->match_stateid(&delegation->stateid, stateid)) {
rcu_read_unlock();
return -ENOENT;
}
@@ -684,21 +680,25 @@ int nfs_delegations_present(struct nfs_client *clp)
* nfs4_copy_delegation_stateid - Copy inode's state ID information
* @dst: stateid data structure to fill in
* @inode: inode to check
+ * @flags: delegation type requirement
*
- * Returns one and fills in "dst->data" * if inode had a delegation,
- * otherwise zero is returned.
+ * Returns "true" and fills in "dst->data" * if inode had a delegation,
+ * otherwise "false" is returned.
*/
-int nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode)
+bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode,
+ fmode_t flags)
{
struct nfs_inode *nfsi = NFS_I(inode);
struct nfs_delegation *delegation;
- int ret = 0;
+ bool ret;
+ flags &= FMODE_READ|FMODE_WRITE;
rcu_read_lock();
delegation = rcu_dereference(nfsi->delegation);
- if (delegation != NULL) {
- memcpy(dst->data, delegation->stateid.data, sizeof(dst->data));
- ret = 1;
+ ret = (delegation != NULL && (delegation->type & flags) == flags);
+ if (ret) {
+ nfs4_stateid_copy(dst, &delegation->stateid);
+ nfs_mark_delegation_referenced(delegation);
}
rcu_read_unlock();
return ret;
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index d9322e490c5..cd6a7a8dada 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -42,9 +42,9 @@ void nfs_super_return_all_delegations(struct super_block *sb);
void nfs_expire_all_delegations(struct nfs_client *clp);
void nfs_expire_all_delegation_types(struct nfs_client *clp, fmode_t flags);
void nfs_expire_unreferenced_delegations(struct nfs_client *clp);
-void nfs_handle_cb_pathdown(struct nfs_client *clp);
int nfs_client_return_marked_delegations(struct nfs_client *clp);
int nfs_delegations_present(struct nfs_client *clp);
+void nfs_remove_bad_delegation(struct inode *inode);
void nfs_delegation_mark_reclaim(struct nfs_client *clp);
void nfs_delegation_reap_unclaimed(struct nfs_client *clp);
@@ -53,7 +53,7 @@ void nfs_delegation_reap_unclaimed(struct nfs_client *clp);
int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid, int issync);
int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid);
int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl);
-int nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode);
+bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode, fmode_t flags);
void nfs_mark_delegation_referenced(struct nfs_delegation *delegation);
int nfs_have_delegation(struct inode *inode, fmode_t flags);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index fd9a872fada..4aaf0316d76 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -207,7 +207,7 @@ struct nfs_cache_array_entry {
};
struct nfs_cache_array {
- unsigned int size;
+ int size;
int eof_index;
u64 last_cookie;
struct nfs_cache_array_entry array[0];
@@ -260,10 +260,10 @@ void nfs_readdir_clear_array(struct page *page)
struct nfs_cache_array *array;
int i;
- array = kmap_atomic(page, KM_USER0);
+ array = kmap_atomic(page);
for (i = 0; i < array->size; i++)
kfree(array->array[i].string.name);
- kunmap_atomic(array, KM_USER0);
+ kunmap_atomic(array);
}
/*
@@ -1429,6 +1429,7 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
}
open_flags = nd->intent.open.flags;
+ attr.ia_valid = 0;
ctx = create_nfs_open_context(dentry, open_flags);
res = ERR_CAST(ctx);
@@ -1437,11 +1438,14 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
if (nd->flags & LOOKUP_CREATE) {
attr.ia_mode = nd->intent.open.create_mode;
- attr.ia_valid = ATTR_MODE;
+ attr.ia_valid |= ATTR_MODE;
attr.ia_mode &= ~current_umask();
- } else {
+ } else
open_flags &= ~(O_EXCL | O_CREAT);
- attr.ia_valid = 0;
+
+ if (open_flags & O_TRUNC) {
+ attr.ia_valid |= ATTR_SIZE;
+ attr.ia_size = 0;
}
/* Open the file on the server */
@@ -1495,6 +1499,7 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
struct inode *inode;
struct inode *dir;
struct nfs_open_context *ctx;
+ struct iattr attr;
int openflags, ret = 0;
if (nd->flags & LOOKUP_RCU)
@@ -1523,19 +1528,27 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
/* We cannot do exclusive creation on a positive dentry */
if ((openflags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL))
goto no_open_dput;
- /* We can't create new files, or truncate existing ones here */
- openflags &= ~(O_CREAT|O_EXCL|O_TRUNC);
+ /* We can't create new files here */
+ openflags &= ~(O_CREAT|O_EXCL);
ctx = create_nfs_open_context(dentry, openflags);
ret = PTR_ERR(ctx);
if (IS_ERR(ctx))
goto out;
+
+ attr.ia_valid = 0;
+ if (openflags & O_TRUNC) {
+ attr.ia_valid |= ATTR_SIZE;
+ attr.ia_size = 0;
+ nfs_wb_all(inode);
+ }
+
/*
* Note: we're not holding inode->i_mutex and so may be racing with
* operations that change the directory. We therefore save the
* change attribute *before* we do the RPC call.
*/
- inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, NULL);
+ inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
switch (ret) {
@@ -1870,11 +1883,11 @@ static int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *sym
if (!page)
return -ENOMEM;
- kaddr = kmap_atomic(page, KM_USER0);
+ kaddr = kmap_atomic(page);
memcpy(kaddr, symname, pathlen);
if (pathlen < PAGE_SIZE)
memset(kaddr + pathlen, 0, PAGE_SIZE - pathlen);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
error = NFS_PROTO(dir)->symlink(dir, dentry, page, pathlen, &attr);
if (error != 0) {
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 1940f1a56a5..481be7f7bdd 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -51,7 +51,6 @@
#include <linux/nfs_page.h>
#include <linux/sunrpc/clnt.h>
-#include <asm/system.h>
#include <asm/uaccess.h>
#include <linux/atomic.h>
@@ -265,9 +264,7 @@ static void nfs_direct_read_release(void *calldata)
}
static const struct rpc_call_ops nfs_read_direct_ops = {
-#if defined(CONFIG_NFS_V4_1)
.rpc_call_prepare = nfs_read_prepare,
-#endif /* CONFIG_NFS_V4_1 */
.rpc_call_done = nfs_direct_read_result,
.rpc_release = nfs_direct_read_release,
};
@@ -554,9 +551,7 @@ static void nfs_direct_commit_release(void *calldata)
}
static const struct rpc_call_ops nfs_commit_direct_ops = {
-#if defined(CONFIG_NFS_V4_1)
.rpc_call_prepare = nfs_write_prepare,
-#endif /* CONFIG_NFS_V4_1 */
.rpc_call_done = nfs_direct_commit_result,
.rpc_release = nfs_direct_commit_release,
};
@@ -696,9 +691,7 @@ out_unlock:
}
static const struct rpc_call_ops nfs_write_direct_ops = {
-#if defined(CONFIG_NFS_V4_1)
.rpc_call_prepare = nfs_write_prepare,
-#endif /* CONFIG_NFS_V4_1 */
.rpc_call_done = nfs_direct_write_result,
.rpc_release = nfs_direct_write_release,
};
diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
index a6e711ad130..b3924b8a600 100644
--- a/fs/nfs/dns_resolve.c
+++ b/fs/nfs/dns_resolve.c
@@ -10,8 +10,9 @@
#include <linux/sunrpc/clnt.h>
#include <linux/dns_resolver.h>
+#include "dns_resolve.h"
-ssize_t nfs_dns_resolve_name(char *name, size_t namelen,
+ssize_t nfs_dns_resolve_name(struct net *net, char *name, size_t namelen,
struct sockaddr *sa, size_t salen)
{
ssize_t ret;
@@ -20,7 +21,7 @@ ssize_t nfs_dns_resolve_name(char *name, size_t namelen,
ip_len = dns_query(NULL, name, namelen, NULL, &ip_addr, NULL);
if (ip_len > 0)
- ret = rpc_pton(ip_addr, ip_len, sa, salen);
+ ret = rpc_pton(net, ip_addr, ip_len, sa, salen);
else
ret = -ESRCH;
kfree(ip_addr);
@@ -40,15 +41,15 @@ ssize_t nfs_dns_resolve_name(char *name, size_t namelen,
#include <linux/sunrpc/clnt.h>
#include <linux/sunrpc/cache.h>
#include <linux/sunrpc/svcauth.h>
+#include <linux/sunrpc/rpc_pipe_fs.h>
#include "dns_resolve.h"
#include "cache_lib.h"
+#include "netns.h"
#define NFS_DNS_HASHBITS 4
#define NFS_DNS_HASHTBL_SIZE (1 << NFS_DNS_HASHBITS)
-static struct cache_head *nfs_dns_table[NFS_DNS_HASHTBL_SIZE];
-
struct nfs_dns_ent {
struct cache_head h;
@@ -224,7 +225,7 @@ static int nfs_dns_parse(struct cache_detail *cd, char *buf, int buflen)
len = qword_get(&buf, buf1, sizeof(buf1));
if (len <= 0)
goto out;
- key.addrlen = rpc_pton(buf1, len,
+ key.addrlen = rpc_pton(cd->net, buf1, len,
(struct sockaddr *)&key.addr,
sizeof(key.addr));
@@ -259,21 +260,6 @@ out:
return ret;
}
-static struct cache_detail nfs_dns_resolve = {
- .owner = THIS_MODULE,
- .hash_size = NFS_DNS_HASHTBL_SIZE,
- .hash_table = nfs_dns_table,
- .name = "dns_resolve",
- .cache_put = nfs_dns_ent_put,
- .cache_upcall = nfs_dns_upcall,
- .cache_parse = nfs_dns_parse,
- .cache_show = nfs_dns_show,
- .match = nfs_dns_match,
- .init = nfs_dns_ent_init,
- .update = nfs_dns_ent_update,
- .alloc = nfs_dns_ent_alloc,
-};
-
static int do_cache_lookup(struct cache_detail *cd,
struct nfs_dns_ent *key,
struct nfs_dns_ent **item,
@@ -336,8 +322,8 @@ out:
return ret;
}
-ssize_t nfs_dns_resolve_name(char *name, size_t namelen,
- struct sockaddr *sa, size_t salen)
+ssize_t nfs_dns_resolve_name(struct net *net, char *name,
+ size_t namelen, struct sockaddr *sa, size_t salen)
{
struct nfs_dns_ent key = {
.hostname = name,
@@ -345,28 +331,118 @@ ssize_t nfs_dns_resolve_name(char *name, size_t namelen,
};
struct nfs_dns_ent *item = NULL;
ssize_t ret;
+ struct nfs_net *nn = net_generic(net, nfs_net_id);
- ret = do_cache_lookup_wait(&nfs_dns_resolve, &key, &item);
+ ret = do_cache_lookup_wait(nn->nfs_dns_resolve, &key, &item);
if (ret == 0) {
if (salen >= item->addrlen) {
memcpy(sa, &item->addr, item->addrlen);
ret = item->addrlen;
} else
ret = -EOVERFLOW;
- cache_put(&item->h, &nfs_dns_resolve);
+ cache_put(&item->h, nn->nfs_dns_resolve);
} else if (ret == -ENOENT)
ret = -ESRCH;
return ret;
}
+int nfs_dns_resolver_cache_init(struct net *net)
+{
+ int err = -ENOMEM;
+ struct nfs_net *nn = net_generic(net, nfs_net_id);
+ struct cache_detail *cd;
+ struct cache_head **tbl;
+
+ cd = kzalloc(sizeof(struct cache_detail), GFP_KERNEL);
+ if (cd == NULL)
+ goto err_cd;
+
+ tbl = kzalloc(NFS_DNS_HASHTBL_SIZE * sizeof(struct cache_head *),
+ GFP_KERNEL);
+ if (tbl == NULL)
+ goto err_tbl;
+
+ cd->owner = THIS_MODULE,
+ cd->hash_size = NFS_DNS_HASHTBL_SIZE,
+ cd->hash_table = tbl,
+ cd->name = "dns_resolve",
+ cd->cache_put = nfs_dns_ent_put,
+ cd->cache_upcall = nfs_dns_upcall,
+ cd->cache_parse = nfs_dns_parse,
+ cd->cache_show = nfs_dns_show,
+ cd->match = nfs_dns_match,
+ cd->init = nfs_dns_ent_init,
+ cd->update = nfs_dns_ent_update,
+ cd->alloc = nfs_dns_ent_alloc,
+
+ nfs_cache_init(cd);
+ err = nfs_cache_register_net(net, cd);
+ if (err)
+ goto err_reg;
+ nn->nfs_dns_resolve = cd;
+ return 0;
+
+err_reg:
+ nfs_cache_destroy(cd);
+ kfree(cd->hash_table);
+err_tbl:
+ kfree(cd);
+err_cd:
+ return err;
+}
+
+void nfs_dns_resolver_cache_destroy(struct net *net)
+{
+ struct nfs_net *nn = net_generic(net, nfs_net_id);
+ struct cache_detail *cd = nn->nfs_dns_resolve;
+
+ nfs_cache_unregister_net(net, cd);
+ nfs_cache_destroy(cd);
+ kfree(cd->hash_table);
+ kfree(cd);
+}
+
+static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event,
+ void *ptr)
+{
+ struct super_block *sb = ptr;
+ struct net *net = sb->s_fs_info;
+ struct nfs_net *nn = net_generic(net, nfs_net_id);
+ struct cache_detail *cd = nn->nfs_dns_resolve;
+ int ret = 0;
+
+ if (cd == NULL)
+ return 0;
+
+ if (!try_module_get(THIS_MODULE))
+ return 0;
+
+ switch (event) {
+ case RPC_PIPEFS_MOUNT:
+ ret = nfs_cache_register_sb(sb, cd);
+ break;
+ case RPC_PIPEFS_UMOUNT:
+ nfs_cache_unregister_sb(sb, cd);
+ break;
+ default:
+ ret = -ENOTSUPP;
+ break;
+ }
+ module_put(THIS_MODULE);
+ return ret;
+}
+
+static struct notifier_block nfs_dns_resolver_block = {
+ .notifier_call = rpc_pipefs_event,
+};
+
int nfs_dns_resolver_init(void)
{
- return nfs_cache_register(&nfs_dns_resolve);
+ return rpc_pipefs_notifier_register(&nfs_dns_resolver_block);
}
void nfs_dns_resolver_destroy(void)
{
- nfs_cache_unregister(&nfs_dns_resolve);
+ rpc_pipefs_notifier_unregister(&nfs_dns_resolver_block);
}
-
#endif
diff --git a/fs/nfs/dns_resolve.h b/fs/nfs/dns_resolve.h
index 199bb5543a9..2e4f596d292 100644
--- a/fs/nfs/dns_resolve.h
+++ b/fs/nfs/dns_resolve.h
@@ -15,12 +15,22 @@ static inline int nfs_dns_resolver_init(void)
static inline void nfs_dns_resolver_destroy(void)
{}
+
+static inline int nfs_dns_resolver_cache_init(struct net *net)
+{
+ return 0;
+}
+
+static inline void nfs_dns_resolver_cache_destroy(struct net *net)
+{}
#else
extern int nfs_dns_resolver_init(void);
extern void nfs_dns_resolver_destroy(void);
+extern int nfs_dns_resolver_cache_init(struct net *net);
+extern void nfs_dns_resolver_cache_destroy(struct net *net);
#endif
-extern ssize_t nfs_dns_resolve_name(char *name, size_t namelen,
- struct sockaddr *sa, size_t salen);
+extern ssize_t nfs_dns_resolve_name(struct net *net, char *name,
+ size_t namelen, struct sockaddr *sa, size_t salen);
#endif
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index c43a452f7da..aa9b709fd32 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -30,7 +30,6 @@
#include <linux/swap.h>
#include <asm/uaccess.h>
-#include <asm/system.h>
#include "delegation.h"
#include "internal.h"
@@ -530,6 +529,8 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
if (mapping != dentry->d_inode->i_mapping)
goto out_unlock;
+ wait_on_page_writeback(page);
+
pagelen = nfs_page_length(page);
if (pagelen == 0)
goto out_unlock;
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index 419119c371b..ae65c16b367 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -327,7 +327,7 @@ void nfs_fscache_reset_inode_cookie(struct inode *inode)
{
struct nfs_inode *nfsi = NFS_I(inode);
struct nfs_server *nfss = NFS_SERVER(inode);
- struct fscache_cookie *old = nfsi->fscache;
+ NFS_IFDEBUG(struct fscache_cookie *old = nfsi->fscache);
nfs_fscache_inode_lock(inode);
if (nfsi->fscache) {
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index dcb61548887..4ca6f5c8038 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -32,7 +32,6 @@
#include <linux/namei.h>
#include <linux/security.h>
-#include <asm/system.h>
#include <asm/uaccess.h>
#include "nfs4_fs.h"
@@ -49,11 +48,9 @@ static int nfs_superblock_set_dummy_root(struct super_block *sb, struct inode *i
{
/* The mntroot acts as the dummy root dentry for this superblock */
if (sb->s_root == NULL) {
- sb->s_root = d_alloc_root(inode);
- if (sb->s_root == NULL) {
- iput(inode);
+ sb->s_root = d_make_root(inode);
+ if (sb->s_root == NULL)
return -ENOMEM;
- }
ihold(inode);
/*
* Ensure that this dentry is invisible to d_find_alias().
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 2c05f1991e1..b7f348bb618 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -34,11 +34,29 @@
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <linux/types.h>
-#include <linux/string.h>
-#include <linux/kernel.h>
-#include <linux/slab.h>
+#include <linux/parser.h>
+#include <linux/fs.h>
#include <linux/nfs_idmap.h>
+#include <net/net_namespace.h>
+#include <linux/sunrpc/rpc_pipe_fs.h>
#include <linux/nfs_fs.h>
+#include <linux/nfs_fs_sb.h>
+#include <linux/key.h>
+#include <linux/keyctl.h>
+#include <linux/key-type.h>
+#include <keys/user-type.h>
+#include <linux/module.h>
+
+#include "internal.h"
+#include "netns.h"
+
+#define NFS_UINT_MAXLEN 11
+
+/* Default cache timeout is 10 minutes */
+unsigned int nfs_idmap_cache_timeout = 600;
+static const struct cred *id_resolver_cache;
+static struct key_type key_type_id_resolver_legacy;
+
/**
* nfs_fattr_init_names - initialise the nfs_fattr owner_name/group_name fields
@@ -142,24 +160,7 @@ static int nfs_map_numeric_to_string(__u32 id, char *buf, size_t buflen)
return snprintf(buf, buflen, "%u", id);
}
-#ifdef CONFIG_NFS_USE_NEW_IDMAPPER
-
-#include <linux/cred.h>
-#include <linux/sunrpc/sched.h>
-#include <linux/nfs4.h>
-#include <linux/nfs_fs_sb.h>
-#include <linux/keyctl.h>
-#include <linux/key-type.h>
-#include <linux/rcupdate.h>
-#include <linux/err.h>
-
-#include <keys/user-type.h>
-
-#define NFS_UINT_MAXLEN 11
-
-const struct cred *id_resolver_cache;
-
-struct key_type key_type_id_resolver = {
+static struct key_type key_type_id_resolver = {
.name = "id_resolver",
.instantiate = user_instantiate,
.match = user_match,
@@ -169,13 +170,14 @@ struct key_type key_type_id_resolver = {
.read = user_read,
};
-int nfs_idmap_init(void)
+static int nfs_idmap_init_keyring(void)
{
struct cred *cred;
struct key *keyring;
int ret = 0;
- printk(KERN_NOTICE "Registering the %s key type\n", key_type_id_resolver.name);
+ printk(KERN_NOTICE "NFS: Registering the %s key type\n",
+ key_type_id_resolver.name);
cred = prepare_kernel_cred(NULL);
if (!cred)
@@ -198,6 +200,7 @@ int nfs_idmap_init(void)
if (ret < 0)
goto failed_put_key;
+ set_bit(KEY_FLAG_ROOT_CAN_CLEAR, &keyring->flags);
cred->thread_keyring = keyring;
cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
id_resolver_cache = cred;
@@ -210,7 +213,7 @@ failed_put_cred:
return ret;
}
-void nfs_idmap_quit(void)
+static void nfs_idmap_quit_keyring(void)
{
key_revoke(id_resolver_cache->thread_keyring);
unregister_key_type(&key_type_id_resolver);
@@ -245,8 +248,10 @@ static ssize_t nfs_idmap_get_desc(const char *name, size_t namelen,
return desclen;
}
-static ssize_t nfs_idmap_request_key(const char *name, size_t namelen,
- const char *type, void *data, size_t data_size)
+static ssize_t nfs_idmap_request_key(struct key_type *key_type,
+ const char *name, size_t namelen,
+ const char *type, void *data,
+ size_t data_size, struct idmap *idmap)
{
const struct cred *saved_cred;
struct key *rkey;
@@ -259,8 +264,12 @@ static ssize_t nfs_idmap_request_key(const char *name, size_t namelen,
goto out;
saved_cred = override_creds(id_resolver_cache);
- rkey = request_key(&key_type_id_resolver, desc, "");
+ if (idmap)
+ rkey = request_key_with_auxdata(key_type, desc, "", 0, idmap);
+ else
+ rkey = request_key(&key_type_id_resolver, desc, "");
revert_creds(saved_cred);
+
kfree(desc);
if (IS_ERR(rkey)) {
ret = PTR_ERR(rkey);
@@ -293,31 +302,46 @@ out:
return ret;
}
+static ssize_t nfs_idmap_get_key(const char *name, size_t namelen,
+ const char *type, void *data,
+ size_t data_size, struct idmap *idmap)
+{
+ ssize_t ret = nfs_idmap_request_key(&key_type_id_resolver,
+ name, namelen, type, data,
+ data_size, NULL);
+ if (ret < 0) {
+ ret = nfs_idmap_request_key(&key_type_id_resolver_legacy,
+ name, namelen, type, data,
+ data_size, idmap);
+ }
+ return ret;
+}
/* ID -> Name */
-static ssize_t nfs_idmap_lookup_name(__u32 id, const char *type, char *buf, size_t buflen)
+static ssize_t nfs_idmap_lookup_name(__u32 id, const char *type, char *buf,
+ size_t buflen, struct idmap *idmap)
{
char id_str[NFS_UINT_MAXLEN];
int id_len;
ssize_t ret;
id_len = snprintf(id_str, sizeof(id_str), "%u", id);
- ret = nfs_idmap_request_key(id_str, id_len, type, buf, buflen);
+ ret = nfs_idmap_get_key(id_str, id_len, type, buf, buflen, idmap);
if (ret < 0)
return -EINVAL;
return ret;
}
/* Name -> ID */
-static int nfs_idmap_lookup_id(const char *name, size_t namelen,
- const char *type, __u32 *id)
+static int nfs_idmap_lookup_id(const char *name, size_t namelen, const char *type,
+ __u32 *id, struct idmap *idmap)
{
char id_str[NFS_UINT_MAXLEN];
long id_long;
ssize_t data_size;
int ret = 0;
- data_size = nfs_idmap_request_key(name, namelen, type, id_str, NFS_UINT_MAXLEN);
+ data_size = nfs_idmap_get_key(name, namelen, type, id_str, NFS_UINT_MAXLEN, idmap);
if (data_size <= 0) {
ret = -EINVAL;
} else {
@@ -327,114 +351,103 @@ static int nfs_idmap_lookup_id(const char *name, size_t namelen,
return ret;
}
-int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid)
-{
- if (nfs_map_string_to_numeric(name, namelen, uid))
- return 0;
- return nfs_idmap_lookup_id(name, namelen, "uid", uid);
-}
-
-int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *gid)
-{
- if (nfs_map_string_to_numeric(name, namelen, gid))
- return 0;
- return nfs_idmap_lookup_id(name, namelen, "gid", gid);
-}
-
-int nfs_map_uid_to_name(const struct nfs_server *server, __u32 uid, char *buf, size_t buflen)
-{
- int ret = -EINVAL;
-
- if (!(server->caps & NFS_CAP_UIDGID_NOMAP))
- ret = nfs_idmap_lookup_name(uid, "user", buf, buflen);
- if (ret < 0)
- ret = nfs_map_numeric_to_string(uid, buf, buflen);
- return ret;
-}
-int nfs_map_gid_to_group(const struct nfs_server *server, __u32 gid, char *buf, size_t buflen)
-{
- int ret = -EINVAL;
+/* idmap classic begins here */
+module_param(nfs_idmap_cache_timeout, int, 0644);
- if (!(server->caps & NFS_CAP_UIDGID_NOMAP))
- ret = nfs_idmap_lookup_name(gid, "group", buf, buflen);
- if (ret < 0)
- ret = nfs_map_numeric_to_string(gid, buf, buflen);
- return ret;
-}
-
-#else /* CONFIG_NFS_USE_NEW_IDMAPPER not defined */
-
-#include <linux/module.h>
-#include <linux/mutex.h>
-#include <linux/init.h>
-#include <linux/socket.h>
-#include <linux/in.h>
-#include <linux/sched.h>
-#include <linux/sunrpc/clnt.h>
-#include <linux/workqueue.h>
-#include <linux/sunrpc/rpc_pipe_fs.h>
-
-#include <linux/nfs_fs.h>
-
-#include "nfs4_fs.h"
-
-#define IDMAP_HASH_SZ 128
-
-/* Default cache timeout is 10 minutes */
-unsigned int nfs_idmap_cache_timeout = 600 * HZ;
-
-static int param_set_idmap_timeout(const char *val, struct kernel_param *kp)
-{
- char *endp;
- int num = simple_strtol(val, &endp, 0);
- int jif = num * HZ;
- if (endp == val || *endp || num < 0 || jif < num)
- return -EINVAL;
- *((int *)kp->arg) = jif;
- return 0;
-}
-
-module_param_call(idmap_cache_timeout, param_set_idmap_timeout, param_get_int,
- &nfs_idmap_cache_timeout, 0644);
-
-struct idmap_hashent {
- unsigned long ih_expires;
- __u32 ih_id;
- size_t ih_namelen;
- char ih_name[IDMAP_NAMESZ];
+struct idmap {
+ struct rpc_pipe *idmap_pipe;
+ struct key_construction *idmap_key_cons;
};
-struct idmap_hashtable {
- __u8 h_type;
- struct idmap_hashent h_entries[IDMAP_HASH_SZ];
+enum {
+ Opt_find_uid, Opt_find_gid, Opt_find_user, Opt_find_group, Opt_find_err
};
-struct idmap {
- struct dentry *idmap_dentry;
- wait_queue_head_t idmap_wq;
- struct idmap_msg idmap_im;
- struct mutex idmap_lock; /* Serializes upcalls */
- struct mutex idmap_im_lock; /* Protects the hashtable */
- struct idmap_hashtable idmap_user_hash;
- struct idmap_hashtable idmap_group_hash;
+static const match_table_t nfs_idmap_tokens = {
+ { Opt_find_uid, "uid:%s" },
+ { Opt_find_gid, "gid:%s" },
+ { Opt_find_user, "user:%s" },
+ { Opt_find_group, "group:%s" },
+ { Opt_find_err, NULL }
};
+static int nfs_idmap_legacy_upcall(struct key_construction *, const char *, void *);
static ssize_t idmap_pipe_downcall(struct file *, const char __user *,
size_t);
static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *);
-static unsigned int fnvhash32(const void *, size_t);
-
static const struct rpc_pipe_ops idmap_upcall_ops = {
.upcall = rpc_pipe_generic_upcall,
.downcall = idmap_pipe_downcall,
.destroy_msg = idmap_pipe_destroy_msg,
};
+static struct key_type key_type_id_resolver_legacy = {
+ .name = "id_resolver",
+ .instantiate = user_instantiate,
+ .match = user_match,
+ .revoke = user_revoke,
+ .destroy = user_destroy,
+ .describe = user_describe,
+ .read = user_read,
+ .request_key = nfs_idmap_legacy_upcall,
+};
+
+static void __nfs_idmap_unregister(struct rpc_pipe *pipe)
+{
+ if (pipe->dentry)
+ rpc_unlink(pipe->dentry);
+}
+
+static int __nfs_idmap_register(struct dentry *dir,
+ struct idmap *idmap,
+ struct rpc_pipe *pipe)
+{
+ struct dentry *dentry;
+
+ dentry = rpc_mkpipe_dentry(dir, "idmap", idmap, pipe);
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+ pipe->dentry = dentry;
+ return 0;
+}
+
+static void nfs_idmap_unregister(struct nfs_client *clp,
+ struct rpc_pipe *pipe)
+{
+ struct net *net = clp->net;
+ struct super_block *pipefs_sb;
+
+ pipefs_sb = rpc_get_sb_net(net);
+ if (pipefs_sb) {
+ __nfs_idmap_unregister(pipe);
+ rpc_put_sb_net(net);
+ }
+}
+
+static int nfs_idmap_register(struct nfs_client *clp,
+ struct idmap *idmap,
+ struct rpc_pipe *pipe)
+{
+ struct net *net = clp->net;
+ struct super_block *pipefs_sb;
+ int err = 0;
+
+ pipefs_sb = rpc_get_sb_net(net);
+ if (pipefs_sb) {
+ if (clp->cl_rpcclient->cl_dentry)
+ err = __nfs_idmap_register(clp->cl_rpcclient->cl_dentry,
+ idmap, pipe);
+ rpc_put_sb_net(net);
+ }
+ return err;
+}
+
int
nfs_idmap_new(struct nfs_client *clp)
{
struct idmap *idmap;
+ struct rpc_pipe *pipe;
int error;
BUG_ON(clp->cl_idmap != NULL);
@@ -443,19 +456,19 @@ nfs_idmap_new(struct nfs_client *clp)
if (idmap == NULL)
return -ENOMEM;
- idmap->idmap_dentry = rpc_mkpipe(clp->cl_rpcclient->cl_path.dentry,
- "idmap", idmap, &idmap_upcall_ops, 0);
- if (IS_ERR(idmap->idmap_dentry)) {
- error = PTR_ERR(idmap->idmap_dentry);
+ pipe = rpc_mkpipe_data(&idmap_upcall_ops, 0);
+ if (IS_ERR(pipe)) {
+ error = PTR_ERR(pipe);
kfree(idmap);
return error;
}
-
- mutex_init(&idmap->idmap_lock);
- mutex_init(&idmap->idmap_im_lock);
- init_waitqueue_head(&idmap->idmap_wq);
- idmap->idmap_user_hash.h_type = IDMAP_TYPE_USER;
- idmap->idmap_group_hash.h_type = IDMAP_TYPE_GROUP;
+ error = nfs_idmap_register(clp, idmap, pipe);
+ if (error) {
+ rpc_destroy_pipe_data(pipe);
+ kfree(idmap);
+ return error;
+ }
+ idmap->idmap_pipe = pipe;
clp->cl_idmap = idmap;
return 0;
@@ -468,211 +481,220 @@ nfs_idmap_delete(struct nfs_client *clp)
if (!idmap)
return;
- rpc_unlink(idmap->idmap_dentry);
+ nfs_idmap_unregister(clp, idmap->idmap_pipe);
+ rpc_destroy_pipe_data(idmap->idmap_pipe);
clp->cl_idmap = NULL;
kfree(idmap);
}
-/*
- * Helper routines for manipulating the hashtable
- */
-static inline struct idmap_hashent *
-idmap_name_hash(struct idmap_hashtable* h, const char *name, size_t len)
-{
- return &h->h_entries[fnvhash32(name, len) % IDMAP_HASH_SZ];
-}
-
-static struct idmap_hashent *
-idmap_lookup_name(struct idmap_hashtable *h, const char *name, size_t len)
+static int __rpc_pipefs_event(struct nfs_client *clp, unsigned long event,
+ struct super_block *sb)
{
- struct idmap_hashent *he = idmap_name_hash(h, name, len);
+ int err = 0;
- if (he->ih_namelen != len || memcmp(he->ih_name, name, len) != 0)
- return NULL;
- if (time_after(jiffies, he->ih_expires))
- return NULL;
- return he;
+ switch (event) {
+ case RPC_PIPEFS_MOUNT:
+ BUG_ON(clp->cl_rpcclient->cl_dentry == NULL);
+ err = __nfs_idmap_register(clp->cl_rpcclient->cl_dentry,
+ clp->cl_idmap,
+ clp->cl_idmap->idmap_pipe);
+ break;
+ case RPC_PIPEFS_UMOUNT:
+ if (clp->cl_idmap->idmap_pipe) {
+ struct dentry *parent;
+
+ parent = clp->cl_idmap->idmap_pipe->dentry->d_parent;
+ __nfs_idmap_unregister(clp->cl_idmap->idmap_pipe);
+ /*
+ * Note: This is a dirty hack. SUNRPC hook has been
+ * called already but simple_rmdir() call for the
+ * directory returned with error because of idmap pipe
+ * inside. Thus now we have to remove this directory
+ * here.
+ */
+ if (rpc_rmdir(parent))
+ printk(KERN_ERR "NFS: %s: failed to remove "
+ "clnt dir!\n", __func__);
+ }
+ break;
+ default:
+ printk(KERN_ERR "NFS: %s: unknown event: %ld\n", __func__,
+ event);
+ return -ENOTSUPP;
+ }
+ return err;
+}
+
+static struct nfs_client *nfs_get_client_for_event(struct net *net, int event)
+{
+ struct nfs_net *nn = net_generic(net, nfs_net_id);
+ struct dentry *cl_dentry;
+ struct nfs_client *clp;
+
+ spin_lock(&nn->nfs_client_lock);
+ list_for_each_entry(clp, &nn->nfs_client_list, cl_share_link) {
+ if (clp->rpc_ops != &nfs_v4_clientops)
+ continue;
+ cl_dentry = clp->cl_idmap->idmap_pipe->dentry;
+ if (((event == RPC_PIPEFS_MOUNT) && cl_dentry) ||
+ ((event == RPC_PIPEFS_UMOUNT) && !cl_dentry))
+ continue;
+ atomic_inc(&clp->cl_count);
+ spin_unlock(&nn->nfs_client_lock);
+ return clp;
+ }
+ spin_unlock(&nn->nfs_client_lock);
+ return NULL;
}
-static inline struct idmap_hashent *
-idmap_id_hash(struct idmap_hashtable* h, __u32 id)
+static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event,
+ void *ptr)
{
- return &h->h_entries[fnvhash32(&id, sizeof(id)) % IDMAP_HASH_SZ];
-}
+ struct super_block *sb = ptr;
+ struct nfs_client *clp;
+ int error = 0;
-static struct idmap_hashent *
-idmap_lookup_id(struct idmap_hashtable *h, __u32 id)
-{
- struct idmap_hashent *he = idmap_id_hash(h, id);
- if (he->ih_id != id || he->ih_namelen == 0)
- return NULL;
- if (time_after(jiffies, he->ih_expires))
- return NULL;
- return he;
+ while ((clp = nfs_get_client_for_event(sb->s_fs_info, event))) {
+ error = __rpc_pipefs_event(clp, event, sb);
+ nfs_put_client(clp);
+ if (error)
+ break;
+ }
+ return error;
}
-/*
- * Routines for allocating new entries in the hashtable.
- * For now, we just have 1 entry per bucket, so it's all
- * pretty trivial.
- */
-static inline struct idmap_hashent *
-idmap_alloc_name(struct idmap_hashtable *h, char *name, size_t len)
-{
- return idmap_name_hash(h, name, len);
-}
+#define PIPEFS_NFS_PRIO 1
+
+static struct notifier_block nfs_idmap_block = {
+ .notifier_call = rpc_pipefs_event,
+ .priority = SUNRPC_PIPEFS_NFS_PRIO,
+};
-static inline struct idmap_hashent *
-idmap_alloc_id(struct idmap_hashtable *h, __u32 id)
+int nfs_idmap_init(void)
{
- return idmap_id_hash(h, id);
+ int ret;
+ ret = nfs_idmap_init_keyring();
+ if (ret != 0)
+ goto out;
+ ret = rpc_pipefs_notifier_register(&nfs_idmap_block);
+ if (ret != 0)
+ nfs_idmap_quit_keyring();
+out:
+ return ret;
}
-static void
-idmap_update_entry(struct idmap_hashent *he, const char *name,
- size_t namelen, __u32 id)
+void nfs_idmap_quit(void)
{
- he->ih_id = id;
- memcpy(he->ih_name, name, namelen);
- he->ih_name[namelen] = '\0';
- he->ih_namelen = namelen;
- he->ih_expires = jiffies + nfs_idmap_cache_timeout;
+ rpc_pipefs_notifier_unregister(&nfs_idmap_block);
+ nfs_idmap_quit_keyring();
}
-/*
- * Name -> ID
- */
-static int
-nfs_idmap_id(struct idmap *idmap, struct idmap_hashtable *h,
- const char *name, size_t namelen, __u32 *id)
+static int nfs_idmap_prepare_message(char *desc, struct idmap_msg *im,
+ struct rpc_pipe_msg *msg)
{
- struct rpc_pipe_msg msg;
- struct idmap_msg *im;
- struct idmap_hashent *he;
- DECLARE_WAITQUEUE(wq, current);
- int ret = -EIO;
-
- im = &idmap->idmap_im;
-
- /*
- * String sanity checks
- * Note that the userland daemon expects NUL terminated strings
- */
- for (;;) {
- if (namelen == 0)
- return -EINVAL;
- if (name[namelen-1] != '\0')
- break;
- namelen--;
- }
- if (namelen >= IDMAP_NAMESZ)
- return -EINVAL;
+ substring_t substr;
+ int token, ret;
- mutex_lock(&idmap->idmap_lock);
- mutex_lock(&idmap->idmap_im_lock);
-
- he = idmap_lookup_name(h, name, namelen);
- if (he != NULL) {
- *id = he->ih_id;
- ret = 0;
- goto out;
- }
+ memset(im, 0, sizeof(*im));
+ memset(msg, 0, sizeof(*msg));
- memset(im, 0, sizeof(*im));
- memcpy(im->im_name, name, namelen);
+ im->im_type = IDMAP_TYPE_GROUP;
+ token = match_token(desc, nfs_idmap_tokens, &substr);
- im->im_type = h->h_type;
- im->im_conv = IDMAP_CONV_NAMETOID;
+ switch (token) {
+ case Opt_find_uid:
+ im->im_type = IDMAP_TYPE_USER;
+ case Opt_find_gid:
+ im->im_conv = IDMAP_CONV_NAMETOID;
+ ret = match_strlcpy(im->im_name, &substr, IDMAP_NAMESZ);
+ break;
- memset(&msg, 0, sizeof(msg));
- msg.data = im;
- msg.len = sizeof(*im);
+ case Opt_find_user:
+ im->im_type = IDMAP_TYPE_USER;
+ case Opt_find_group:
+ im->im_conv = IDMAP_CONV_IDTONAME;
+ ret = match_int(&substr, &im->im_id);
+ break;
- add_wait_queue(&idmap->idmap_wq, &wq);
- if (rpc_queue_upcall(idmap->idmap_dentry->d_inode, &msg) < 0) {
- remove_wait_queue(&idmap->idmap_wq, &wq);
+ default:
+ ret = -EINVAL;
goto out;
}
- set_current_state(TASK_UNINTERRUPTIBLE);
- mutex_unlock(&idmap->idmap_im_lock);
- schedule();
- __set_current_state(TASK_RUNNING);
- remove_wait_queue(&idmap->idmap_wq, &wq);
- mutex_lock(&idmap->idmap_im_lock);
+ msg->data = im;
+ msg->len = sizeof(struct idmap_msg);
- if (im->im_status & IDMAP_STATUS_SUCCESS) {
- *id = im->im_id;
- ret = 0;
- }
-
- out:
- memset(im, 0, sizeof(*im));
- mutex_unlock(&idmap->idmap_im_lock);
- mutex_unlock(&idmap->idmap_lock);
+out:
return ret;
}
-/*
- * ID -> Name
- */
-static int
-nfs_idmap_name(struct idmap *idmap, struct idmap_hashtable *h,
- __u32 id, char *name)
+static int nfs_idmap_legacy_upcall(struct key_construction *cons,
+ const char *op,
+ void *aux)
{
- struct rpc_pipe_msg msg;
+ struct rpc_pipe_msg *msg;
struct idmap_msg *im;
- struct idmap_hashent *he;
- DECLARE_WAITQUEUE(wq, current);
- int ret = -EIO;
- unsigned int len;
-
- im = &idmap->idmap_im;
+ struct idmap *idmap = (struct idmap *)aux;
+ struct key *key = cons->key;
+ int ret;
- mutex_lock(&idmap->idmap_lock);
- mutex_lock(&idmap->idmap_im_lock);
+ /* msg and im are freed in idmap_pipe_destroy_msg */
+ msg = kmalloc(sizeof(*msg), GFP_KERNEL);
+ if (IS_ERR(msg)) {
+ ret = PTR_ERR(msg);
+ goto out0;
+ }
- he = idmap_lookup_id(h, id);
- if (he) {
- memcpy(name, he->ih_name, he->ih_namelen);
- ret = he->ih_namelen;
- goto out;
+ im = kmalloc(sizeof(*im), GFP_KERNEL);
+ if (IS_ERR(im)) {
+ ret = PTR_ERR(im);
+ goto out1;
}
- memset(im, 0, sizeof(*im));
- im->im_type = h->h_type;
- im->im_conv = IDMAP_CONV_IDTONAME;
- im->im_id = id;
+ ret = nfs_idmap_prepare_message(key->description, im, msg);
+ if (ret < 0)
+ goto out2;
- memset(&msg, 0, sizeof(msg));
- msg.data = im;
- msg.len = sizeof(*im);
+ idmap->idmap_key_cons = cons;
- add_wait_queue(&idmap->idmap_wq, &wq);
+ ret = rpc_queue_upcall(idmap->idmap_pipe, msg);
+ if (ret < 0)
+ goto out2;
- if (rpc_queue_upcall(idmap->idmap_dentry->d_inode, &msg) < 0) {
- remove_wait_queue(&idmap->idmap_wq, &wq);
- goto out;
- }
+ return ret;
+
+out2:
+ kfree(im);
+out1:
+ kfree(msg);
+out0:
+ key_revoke(cons->key);
+ key_revoke(cons->authkey);
+ return ret;
+}
+
+static int nfs_idmap_instantiate(struct key *key, struct key *authkey, char *data)
+{
+ return key_instantiate_and_link(key, data, strlen(data) + 1,
+ id_resolver_cache->thread_keyring,
+ authkey);
+}
- set_current_state(TASK_UNINTERRUPTIBLE);
- mutex_unlock(&idmap->idmap_im_lock);
- schedule();
- __set_current_state(TASK_RUNNING);
- remove_wait_queue(&idmap->idmap_wq, &wq);
- mutex_lock(&idmap->idmap_im_lock);
-
- if (im->im_status & IDMAP_STATUS_SUCCESS) {
- if ((len = strnlen(im->im_name, IDMAP_NAMESZ)) == 0)
- goto out;
- memcpy(name, im->im_name, len);
- ret = len;
+static int nfs_idmap_read_message(struct idmap_msg *im, struct key *key, struct key *authkey)
+{
+ char id_str[NFS_UINT_MAXLEN];
+ int ret = -EINVAL;
+
+ switch (im->im_conv) {
+ case IDMAP_CONV_NAMETOID:
+ sprintf(id_str, "%d", im->im_id);
+ ret = nfs_idmap_instantiate(key, authkey, id_str);
+ break;
+ case IDMAP_CONV_IDTONAME:
+ ret = nfs_idmap_instantiate(key, authkey, im->im_name);
+ break;
}
- out:
- memset(im, 0, sizeof(*im));
- mutex_unlock(&idmap->idmap_im_lock);
- mutex_unlock(&idmap->idmap_lock);
return ret;
}
@@ -681,115 +703,51 @@ idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
{
struct rpc_inode *rpci = RPC_I(filp->f_path.dentry->d_inode);
struct idmap *idmap = (struct idmap *)rpci->private;
- struct idmap_msg im_in, *im = &idmap->idmap_im;
- struct idmap_hashtable *h;
- struct idmap_hashent *he = NULL;
+ struct key_construction *cons = idmap->idmap_key_cons;
+ struct idmap_msg im;
size_t namelen_in;
int ret;
- if (mlen != sizeof(im_in))
- return -ENOSPC;
-
- if (copy_from_user(&im_in, src, mlen) != 0)
- return -EFAULT;
-
- mutex_lock(&idmap->idmap_im_lock);
-
- ret = mlen;
- im->im_status = im_in.im_status;
- /* If we got an error, terminate now, and wake up pending upcalls */
- if (!(im_in.im_status & IDMAP_STATUS_SUCCESS)) {
- wake_up(&idmap->idmap_wq);
+ if (mlen != sizeof(im)) {
+ ret = -ENOSPC;
goto out;
}
- /* Sanity checking of strings */
- ret = -EINVAL;
- namelen_in = strnlen(im_in.im_name, IDMAP_NAMESZ);
- if (namelen_in == 0 || namelen_in == IDMAP_NAMESZ)
+ if (copy_from_user(&im, src, mlen) != 0) {
+ ret = -EFAULT;
goto out;
+ }
- switch (im_in.im_type) {
- case IDMAP_TYPE_USER:
- h = &idmap->idmap_user_hash;
- break;
- case IDMAP_TYPE_GROUP:
- h = &idmap->idmap_group_hash;
- break;
- default:
- goto out;
+ if (!(im.im_status & IDMAP_STATUS_SUCCESS)) {
+ ret = mlen;
+ complete_request_key(idmap->idmap_key_cons, -ENOKEY);
+ goto out_incomplete;
}
- switch (im_in.im_conv) {
- case IDMAP_CONV_IDTONAME:
- /* Did we match the current upcall? */
- if (im->im_conv == IDMAP_CONV_IDTONAME
- && im->im_type == im_in.im_type
- && im->im_id == im_in.im_id) {
- /* Yes: copy string, including the terminating '\0' */
- memcpy(im->im_name, im_in.im_name, namelen_in);
- im->im_name[namelen_in] = '\0';
- wake_up(&idmap->idmap_wq);
- }
- he = idmap_alloc_id(h, im_in.im_id);
- break;
- case IDMAP_CONV_NAMETOID:
- /* Did we match the current upcall? */
- if (im->im_conv == IDMAP_CONV_NAMETOID
- && im->im_type == im_in.im_type
- && strnlen(im->im_name, IDMAP_NAMESZ) == namelen_in
- && memcmp(im->im_name, im_in.im_name, namelen_in) == 0) {
- im->im_id = im_in.im_id;
- wake_up(&idmap->idmap_wq);
- }
- he = idmap_alloc_name(h, im_in.im_name, namelen_in);
- break;
- default:
+ namelen_in = strnlen(im.im_name, IDMAP_NAMESZ);
+ if (namelen_in == 0 || namelen_in == IDMAP_NAMESZ) {
+ ret = -EINVAL;
goto out;
}
- /* If the entry is valid, also copy it to the cache */
- if (he != NULL)
- idmap_update_entry(he, im_in.im_name, namelen_in, im_in.im_id);
- ret = mlen;
+ ret = nfs_idmap_read_message(&im, cons->key, cons->authkey);
+ if (ret >= 0) {
+ key_set_timeout(cons->key, nfs_idmap_cache_timeout);
+ ret = mlen;
+ }
+
out:
- mutex_unlock(&idmap->idmap_im_lock);
+ complete_request_key(idmap->idmap_key_cons, ret);
+out_incomplete:
return ret;
}
static void
idmap_pipe_destroy_msg(struct rpc_pipe_msg *msg)
{
- struct idmap_msg *im = msg->data;
- struct idmap *idmap = container_of(im, struct idmap, idmap_im);
-
- if (msg->errno >= 0)
- return;
- mutex_lock(&idmap->idmap_im_lock);
- im->im_status = IDMAP_STATUS_LOOKUPFAIL;
- wake_up(&idmap->idmap_wq);
- mutex_unlock(&idmap->idmap_im_lock);
-}
-
-/*
- * Fowler/Noll/Vo hash
- * http://www.isthe.com/chongo/tech/comp/fnv/
- */
-
-#define FNV_P_32 ((unsigned int)0x01000193) /* 16777619 */
-#define FNV_1_32 ((unsigned int)0x811c9dc5) /* 2166136261 */
-
-static unsigned int fnvhash32(const void *buf, size_t buflen)
-{
- const unsigned char *p, *end = (const unsigned char *)buf + buflen;
- unsigned int hash = FNV_1_32;
-
- for (p = buf; p < end; p++) {
- hash *= FNV_P_32;
- hash ^= (unsigned int)*p;
- }
-
- return hash;
+ /* Free memory allocated in nfs_idmap_legacy_upcall() */
+ kfree(msg->data);
+ kfree(msg);
}
int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid)
@@ -798,16 +756,16 @@ int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_
if (nfs_map_string_to_numeric(name, namelen, uid))
return 0;
- return nfs_idmap_id(idmap, &idmap->idmap_user_hash, name, namelen, uid);
+ return nfs_idmap_lookup_id(name, namelen, "uid", uid, idmap);
}
-int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid)
+int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *gid)
{
struct idmap *idmap = server->nfs_client->cl_idmap;
- if (nfs_map_string_to_numeric(name, namelen, uid))
+ if (nfs_map_string_to_numeric(name, namelen, gid))
return 0;
- return nfs_idmap_id(idmap, &idmap->idmap_group_hash, name, namelen, uid);
+ return nfs_idmap_lookup_id(name, namelen, "gid", gid, idmap);
}
int nfs_map_uid_to_name(const struct nfs_server *server, __u32 uid, char *buf, size_t buflen)
@@ -816,21 +774,19 @@ int nfs_map_uid_to_name(const struct nfs_server *server, __u32 uid, char *buf, s
int ret = -EINVAL;
if (!(server->caps & NFS_CAP_UIDGID_NOMAP))
- ret = nfs_idmap_name(idmap, &idmap->idmap_user_hash, uid, buf);
+ ret = nfs_idmap_lookup_name(uid, "user", buf, buflen, idmap);
if (ret < 0)
ret = nfs_map_numeric_to_string(uid, buf, buflen);
return ret;
}
-int nfs_map_gid_to_group(const struct nfs_server *server, __u32 uid, char *buf, size_t buflen)
+int nfs_map_gid_to_group(const struct nfs_server *server, __u32 gid, char *buf, size_t buflen)
{
struct idmap *idmap = server->nfs_client->cl_idmap;
int ret = -EINVAL;
if (!(server->caps & NFS_CAP_UIDGID_NOMAP))
- ret = nfs_idmap_name(idmap, &idmap->idmap_group_hash, uid, buf);
+ ret = nfs_idmap_lookup_name(gid, "group", buf, buflen, idmap);
if (ret < 0)
- ret = nfs_map_numeric_to_string(uid, buf, buflen);
+ ret = nfs_map_numeric_to_string(gid, buf, buflen);
return ret;
}
-
-#endif /* CONFIG_NFS_USE_NEW_IDMAPPER */
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 25c3bfad795..e8bbfa5b350 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -39,8 +39,8 @@
#include <linux/slab.h>
#include <linux/compat.h>
#include <linux/freezer.h>
+#include <linux/crc32.h>
-#include <asm/system.h>
#include <asm/uaccess.h>
#include "nfs4_fs.h"
@@ -51,13 +51,14 @@
#include "fscache.h"
#include "dns_resolve.h"
#include "pnfs.h"
+#include "netns.h"
#define NFSDBG_FACILITY NFSDBG_VFS
#define NFS_64_BIT_INODE_NUMBERS_ENABLED 1
/* Default is to see 64-bit inode numbers */
-static int enable_ino64 = NFS_64_BIT_INODE_NUMBERS_ENABLED;
+static bool enable_ino64 = NFS_64_BIT_INODE_NUMBERS_ENABLED;
static void nfs_invalidate_inode(struct inode *);
static int nfs_update_inode(struct inode *, struct nfs_fattr *);
@@ -388,9 +389,10 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
unlock_new_inode(inode);
} else
nfs_refresh_inode(inode, fattr);
- dprintk("NFS: nfs_fhget(%s/%Ld ct=%d)\n",
+ dprintk("NFS: nfs_fhget(%s/%Ld fh_crc=0x%08x ct=%d)\n",
inode->i_sb->s_id,
(long long)NFS_FILEID(inode),
+ nfs_display_fhandle_hash(fh),
atomic_read(&inode->i_count));
out:
@@ -401,7 +403,7 @@ out_no_inode:
goto out;
}
-#define NFS_VALID_ATTRS (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_SIZE|ATTR_ATIME|ATTR_ATIME_SET|ATTR_MTIME|ATTR_MTIME_SET|ATTR_FILE)
+#define NFS_VALID_ATTRS (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_SIZE|ATTR_ATIME|ATTR_ATIME_SET|ATTR_MTIME|ATTR_MTIME_SET|ATTR_FILE|ATTR_OPEN)
int
nfs_setattr(struct dentry *dentry, struct iattr *attr)
@@ -423,7 +425,7 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
/* Optimization: if the end result is no change, don't RPC */
attr->ia_valid &= NFS_VALID_ATTRS;
- if ((attr->ia_valid & ~ATTR_FILE) == 0)
+ if ((attr->ia_valid & ~(ATTR_FILE|ATTR_OPEN)) == 0)
return 0;
/* Write all dirty data */
@@ -1044,6 +1046,67 @@ struct nfs_fh *nfs_alloc_fhandle(void)
return fh;
}
+#ifdef NFS_DEBUG
+/*
+ * _nfs_display_fhandle_hash - calculate the crc32 hash for the filehandle
+ * in the same way that wireshark does
+ *
+ * @fh: file handle
+ *
+ * For debugging only.
+ */
+u32 _nfs_display_fhandle_hash(const struct nfs_fh *fh)
+{
+ /* wireshark uses 32-bit AUTODIN crc and does a bitwise
+ * not on the result */
+ return ~crc32(0xFFFFFFFF, &fh->data[0], fh->size);
+}
+
+/*
+ * _nfs_display_fhandle - display an NFS file handle on the console
+ *
+ * @fh: file handle to display
+ * @caption: display caption
+ *
+ * For debugging only.
+ */
+void _nfs_display_fhandle(const struct nfs_fh *fh, const char *caption)
+{
+ unsigned short i;
+
+ if (fh == NULL || fh->size == 0) {
+ printk(KERN_DEFAULT "%s at %p is empty\n", caption, fh);
+ return;
+ }
+
+ printk(KERN_DEFAULT "%s at %p is %u bytes, crc: 0x%08x:\n",
+ caption, fh, fh->size, _nfs_display_fhandle_hash(fh));
+ for (i = 0; i < fh->size; i += 16) {
+ __be32 *pos = (__be32 *)&fh->data[i];
+
+ switch ((fh->size - i - 1) >> 2) {
+ case 0:
+ printk(KERN_DEFAULT " %08x\n",
+ be32_to_cpup(pos));
+ break;
+ case 1:
+ printk(KERN_DEFAULT " %08x %08x\n",
+ be32_to_cpup(pos), be32_to_cpup(pos + 1));
+ break;
+ case 2:
+ printk(KERN_DEFAULT " %08x %08x %08x\n",
+ be32_to_cpup(pos), be32_to_cpup(pos + 1),
+ be32_to_cpup(pos + 2));
+ break;
+ default:
+ printk(KERN_DEFAULT " %08x %08x %08x %08x\n",
+ be32_to_cpup(pos), be32_to_cpup(pos + 1),
+ be32_to_cpup(pos + 2), be32_to_cpup(pos + 3));
+ }
+ }
+}
+#endif
+
/**
* nfs_inode_attrs_need_update - check if the inode attributes need updating
* @inode - pointer to inode
@@ -1211,8 +1274,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
unsigned long now = jiffies;
unsigned long save_cache_validity;
- dfprintk(VFS, "NFS: %s(%s/%ld ct=%d info=0x%x)\n",
+ dfprintk(VFS, "NFS: %s(%s/%ld fh_crc=0x%08x ct=%d info=0x%x)\n",
__func__, inode->i_sb->s_id, inode->i_ino,
+ nfs_display_fhandle_hash(NFS_FH(inode)),
atomic_read(&inode->i_count), fattr->valid);
if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid)
@@ -1406,7 +1470,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
/*
* Big trouble! The inode has become a different object.
*/
- printk(KERN_DEBUG "%s: inode %ld mode changed, %07o to %07o\n",
+ printk(KERN_DEBUG "NFS: %s: inode %ld mode changed, %07o to %07o\n",
__func__, inode->i_ino, inode->i_mode, fattr->mode);
out_err:
/*
@@ -1495,7 +1559,7 @@ static void init_once(void *foo)
INIT_LIST_HEAD(&nfsi->open_files);
INIT_LIST_HEAD(&nfsi->access_cache_entry_lru);
INIT_LIST_HEAD(&nfsi->access_cache_inode_lru);
- INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC);
+ INIT_LIST_HEAD(&nfsi->commit_list);
nfsi->npages = 0;
nfsi->ncommit = 0;
atomic_set(&nfsi->silly_count, 1);
@@ -1552,6 +1616,28 @@ static void nfsiod_stop(void)
destroy_workqueue(wq);
}
+int nfs_net_id;
+EXPORT_SYMBOL_GPL(nfs_net_id);
+
+static int nfs_net_init(struct net *net)
+{
+ nfs_clients_init(net);
+ return nfs_dns_resolver_cache_init(net);
+}
+
+static void nfs_net_exit(struct net *net)
+{
+ nfs_dns_resolver_cache_destroy(net);
+ nfs_cleanup_cb_ident_idr(net);
+}
+
+static struct pernet_operations nfs_net_ops = {
+ .init = nfs_net_init,
+ .exit = nfs_net_exit,
+ .id = &nfs_net_id,
+ .size = sizeof(struct nfs_net),
+};
+
/*
* Initialize NFS
*/
@@ -1561,10 +1647,14 @@ static int __init init_nfs_fs(void)
err = nfs_idmap_init();
if (err < 0)
- goto out9;
+ goto out10;
err = nfs_dns_resolver_init();
if (err < 0)
+ goto out9;
+
+ err = register_pernet_subsys(&nfs_net_ops);
+ if (err < 0)
goto out8;
err = nfs_fscache_register();
@@ -1600,14 +1690,14 @@ static int __init init_nfs_fs(void)
goto out0;
#ifdef CONFIG_PROC_FS
- rpc_proc_register(&nfs_rpcstat);
+ rpc_proc_register(&init_net, &nfs_rpcstat);
#endif
if ((err = register_nfs_fs()) != 0)
goto out;
return 0;
out:
#ifdef CONFIG_PROC_FS
- rpc_proc_unregister("nfs");
+ rpc_proc_unregister(&init_net, "nfs");
#endif
nfs_destroy_directcache();
out0:
@@ -1625,10 +1715,12 @@ out5:
out6:
nfs_fscache_unregister();
out7:
- nfs_dns_resolver_destroy();
+ unregister_pernet_subsys(&nfs_net_ops);
out8:
- nfs_idmap_quit();
+ nfs_dns_resolver_destroy();
out9:
+ nfs_idmap_quit();
+out10:
return err;
}
@@ -1640,12 +1732,12 @@ static void __exit exit_nfs_fs(void)
nfs_destroy_inodecache();
nfs_destroy_nfspagecache();
nfs_fscache_unregister();
+ unregister_pernet_subsys(&nfs_net_ops);
nfs_dns_resolver_destroy();
nfs_idmap_quit();
#ifdef CONFIG_PROC_FS
- rpc_proc_unregister("nfs");
+ rpc_proc_unregister(&init_net, "nfs");
#endif
- nfs_cleanup_cb_ident_idr();
unregister_nfs_fs();
nfs_fs_proc_exit();
nfsiod_stop();
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 5ee92538b06..2476dc69365 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -123,6 +123,7 @@ struct nfs_parsed_mount_data {
} nfs_server;
struct security_mnt_opts lsm_opts;
+ struct net *net;
};
/* mount_clnt.c */
@@ -137,20 +138,22 @@ struct nfs_mount_request {
int noresvport;
unsigned int *auth_flav_len;
rpc_authflavor_t *auth_flavs;
+ struct net *net;
};
extern int nfs_mount(struct nfs_mount_request *info);
extern void nfs_umount(const struct nfs_mount_request *info);
/* client.c */
-extern struct rpc_program nfs_program;
+extern const struct rpc_program nfs_program;
+extern void nfs_clients_init(struct net *net);
-extern void nfs_cleanup_cb_ident_idr(void);
+extern void nfs_cleanup_cb_ident_idr(struct net *);
extern void nfs_put_client(struct nfs_client *);
-extern struct nfs_client *nfs4_find_client_no_ident(const struct sockaddr *);
-extern struct nfs_client *nfs4_find_client_ident(int);
+extern struct nfs_client *nfs4_find_client_ident(struct net *, int);
extern struct nfs_client *
-nfs4_find_client_sessionid(const struct sockaddr *, struct nfs4_sessionid *);
+nfs4_find_client_sessionid(struct net *, const struct sockaddr *,
+ struct nfs4_sessionid *);
extern struct nfs_server *nfs_create_server(
const struct nfs_parsed_mount_data *,
struct nfs_fh *);
@@ -329,10 +332,12 @@ void nfs_retry_commit(struct list_head *page_list,
void nfs_commit_clear_lock(struct nfs_inode *nfsi);
void nfs_commitdata_release(void *data);
void nfs_commit_release_pages(struct nfs_write_data *data);
+void nfs_request_add_commit_list(struct nfs_page *req, struct list_head *head);
+void nfs_request_remove_commit_list(struct nfs_page *req);
#ifdef CONFIG_MIGRATION
extern int nfs_migrate_page(struct address_space *,
- struct page *, struct page *);
+ struct page *, struct page *, enum migrate_mode);
#else
#define nfs_migrate_page NULL
#endif
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index d4c2d6b7507..8e65c7f1f87 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -16,7 +16,7 @@
#include <linux/nfs_fs.h>
#include "internal.h"
-#ifdef RPC_DEBUG
+#ifdef NFS_DEBUG
# define NFSDBG_FACILITY NFSDBG_MOUNT
#endif
@@ -67,7 +67,7 @@ enum {
MOUNTPROC3_EXPORT = 5,
};
-static struct rpc_program mnt_program;
+static const struct rpc_program mnt_program;
/*
* Defined by OpenGroup XNFS Version 3W, chapter 8
@@ -153,7 +153,7 @@ int nfs_mount(struct nfs_mount_request *info)
.rpc_resp = &result,
};
struct rpc_create_args args = {
- .net = &init_net,
+ .net = info->net,
.protocol = info->protocol,
.address = info->sap,
.addrsize = info->salen,
@@ -225,7 +225,7 @@ void nfs_umount(const struct nfs_mount_request *info)
.to_retries = 2,
};
struct rpc_create_args args = {
- .net = &init_net,
+ .net = info->net,
.protocol = IPPROTO_UDP,
.address = info->sap,
.addrsize = info->salen,
@@ -488,19 +488,19 @@ static struct rpc_procinfo mnt3_procedures[] = {
};
-static struct rpc_version mnt_version1 = {
+static const struct rpc_version mnt_version1 = {
.number = 1,
.nrprocs = ARRAY_SIZE(mnt_procedures),
.procs = mnt_procedures,
};
-static struct rpc_version mnt_version3 = {
+static const struct rpc_version mnt_version3 = {
.number = 3,
.nrprocs = ARRAY_SIZE(mnt3_procedures),
.procs = mnt3_procedures,
};
-static struct rpc_version *mnt_version[] = {
+static const struct rpc_version *mnt_version[] = {
NULL,
&mnt_version1,
NULL,
@@ -509,7 +509,7 @@ static struct rpc_version *mnt_version[] = {
static struct rpc_stat mnt_stats;
-static struct rpc_program mnt_program = {
+static const struct rpc_program mnt_program = {
.name = "mount",
.number = NFS_MNT_PROGRAM,
.nrvers = ARRAY_SIZE(mnt_version),
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 8102391bb37..1807866bb3a 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -276,7 +276,10 @@ out:
nfs_free_fattr(fattr);
nfs_free_fhandle(fh);
out_nofree:
- dprintk("<-- nfs_follow_mountpoint() = %p\n", mnt);
+ if (IS_ERR(mnt))
+ dprintk("<-- %s(): error %ld\n", __func__, PTR_ERR(mnt));
+ else
+ dprintk("<-- %s() = %p\n", __func__, mnt);
return mnt;
}
diff --git a/fs/nfs/netns.h b/fs/nfs/netns.h
new file mode 100644
index 00000000000..aa14ec303e9
--- /dev/null
+++ b/fs/nfs/netns.h
@@ -0,0 +1,27 @@
+#ifndef __NFS_NETNS_H__
+#define __NFS_NETNS_H__
+
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+
+struct bl_dev_msg {
+ int32_t status;
+ uint32_t major, minor;
+};
+
+struct nfs_net {
+ struct cache_detail *nfs_dns_resolve;
+ struct rpc_pipe *bl_device_pipe;
+ struct bl_dev_msg bl_mount_reply;
+ wait_queue_head_t bl_wq;
+ struct list_head nfs_client_list;
+ struct list_head nfs_volume_list;
+#ifdef CONFIG_NFS_V4
+ struct idr cb_ident_idr; /* Protected by nfs_client_lock */
+#endif
+ spinlock_t nfs_client_lock;
+};
+
+extern int nfs_net_id;
+
+#endif
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 792cb13a430..1f56000fabb 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -1150,7 +1150,7 @@ struct rpc_procinfo nfs_procedures[] = {
PROC(STATFS, fhandle, statfsres, 0),
};
-struct rpc_version nfs_version2 = {
+const struct rpc_version nfs_version2 = {
.number = 2,
.nrprocs = ARRAY_SIZE(nfs_procedures),
.procs = nfs_procedures
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 7ef23979896..e4498dc351a 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -192,7 +192,7 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
.pages = pages,
};
struct nfs3_getaclres res = {
- 0
+ NULL,
};
struct rpc_message msg = {
.rpc_argp = &args,
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 91943953a37..5242eae6711 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -428,6 +428,11 @@ nfs3_proc_unlink_setup(struct rpc_message *msg, struct inode *dir)
msg->rpc_proc = &nfs3_procedures[NFS3PROC_REMOVE];
}
+static void nfs3_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlinkdata *data)
+{
+ rpc_call_start(task);
+}
+
static int
nfs3_proc_unlink_done(struct rpc_task *task, struct inode *dir)
{
@@ -445,6 +450,11 @@ nfs3_proc_rename_setup(struct rpc_message *msg, struct inode *dir)
msg->rpc_proc = &nfs3_procedures[NFS3PROC_RENAME];
}
+static void nfs3_proc_rename_rpc_prepare(struct rpc_task *task, struct nfs_renamedata *data)
+{
+ rpc_call_start(task);
+}
+
static int
nfs3_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
struct inode *new_dir)
@@ -814,6 +824,11 @@ static void nfs3_proc_read_setup(struct nfs_read_data *data, struct rpc_message
msg->rpc_proc = &nfs3_procedures[NFS3PROC_READ];
}
+static void nfs3_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data)
+{
+ rpc_call_start(task);
+}
+
static int nfs3_write_done(struct rpc_task *task, struct nfs_write_data *data)
{
if (nfs3_async_handle_jukebox(task, data->inode))
@@ -828,6 +843,11 @@ static void nfs3_proc_write_setup(struct nfs_write_data *data, struct rpc_messag
msg->rpc_proc = &nfs3_procedures[NFS3PROC_WRITE];
}
+static void nfs3_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data)
+{
+ rpc_call_start(task);
+}
+
static int nfs3_commit_done(struct rpc_task *task, struct nfs_write_data *data)
{
if (nfs3_async_handle_jukebox(task, data->inode))
@@ -864,9 +884,11 @@ const struct nfs_rpc_ops nfs_v3_clientops = {
.create = nfs3_proc_create,
.remove = nfs3_proc_remove,
.unlink_setup = nfs3_proc_unlink_setup,
+ .unlink_rpc_prepare = nfs3_proc_unlink_rpc_prepare,
.unlink_done = nfs3_proc_unlink_done,
.rename = nfs3_proc_rename,
.rename_setup = nfs3_proc_rename_setup,
+ .rename_rpc_prepare = nfs3_proc_rename_rpc_prepare,
.rename_done = nfs3_proc_rename_done,
.link = nfs3_proc_link,
.symlink = nfs3_proc_symlink,
@@ -879,8 +901,10 @@ const struct nfs_rpc_ops nfs_v3_clientops = {
.pathconf = nfs3_proc_pathconf,
.decode_dirent = nfs3_decode_dirent,
.read_setup = nfs3_proc_read_setup,
+ .read_rpc_prepare = nfs3_proc_read_rpc_prepare,
.read_done = nfs3_read_done,
.write_setup = nfs3_proc_write_setup,
+ .write_rpc_prepare = nfs3_proc_write_rpc_prepare,
.write_done = nfs3_write_done,
.commit_setup = nfs3_proc_commit_setup,
.commit_done = nfs3_commit_done,
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 183c6b123d0..a77cc9a3ce5 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -2461,7 +2461,7 @@ struct rpc_procinfo nfs3_procedures[] = {
PROC(COMMIT, commit, commit, 5),
};
-struct rpc_version nfs_version3 = {
+const struct rpc_version nfs_version3 = {
.number = 3,
.nrprocs = ARRAY_SIZE(nfs3_procedures),
.procs = nfs3_procedures
@@ -2489,7 +2489,7 @@ static struct rpc_procinfo nfs3_acl_procedures[] = {
},
};
-struct rpc_version nfsacl_version3 = {
+const struct rpc_version nfsacl_version3 = {
.number = 3,
.nrprocs = sizeof(nfs3_acl_procedures)/
sizeof(nfs3_acl_procedures[0]),
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 4d7d0aedc10..97ecc863dd7 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -20,7 +20,6 @@ enum nfs4_client_state {
NFS4CLNT_RECLAIM_REBOOT,
NFS4CLNT_RECLAIM_NOGRACE,
NFS4CLNT_DELEGRETURN,
- NFS4CLNT_LAYOUTRECALL,
NFS4CLNT_SESSION_RESET,
NFS4CLNT_RECALL_SLOT,
NFS4CLNT_LEASE_CONFIRM,
@@ -44,7 +43,7 @@ struct nfs4_minor_version_ops {
struct nfs4_sequence_args *args,
struct nfs4_sequence_res *res,
int cache_reply);
- int (*validate_stateid)(struct nfs_delegation *,
+ bool (*match_stateid)(const nfs4_stateid *,
const nfs4_stateid *);
int (*find_root_sec)(struct nfs_server *, struct nfs_fh *,
struct nfs_fsinfo *);
@@ -53,26 +52,25 @@ struct nfs4_minor_version_ops {
const struct nfs4_state_maintenance_ops *state_renewal_ops;
};
-/*
- * struct rpc_sequence ensures that RPC calls are sent in the exact
- * order that they appear on the list.
- */
-struct rpc_sequence {
- struct rpc_wait_queue wait; /* RPC call delay queue */
- spinlock_t lock; /* Protects the list */
- struct list_head list; /* Defines sequence of RPC calls */
+struct nfs_unique_id {
+ struct rb_node rb_node;
+ __u64 id;
};
#define NFS_SEQID_CONFIRMED 1
struct nfs_seqid_counter {
- struct rpc_sequence *sequence;
+ int owner_id;
int flags;
u32 counter;
+ spinlock_t lock; /* Protects the list */
+ struct list_head list; /* Defines sequence of RPC calls */
+ struct rpc_wait_queue wait; /* RPC call delay queue */
};
struct nfs_seqid {
struct nfs_seqid_counter *sequence;
struct list_head list;
+ struct rpc_task *task;
};
static inline void nfs_confirm_seqid(struct nfs_seqid_counter *seqid, int status)
@@ -81,18 +79,12 @@ static inline void nfs_confirm_seqid(struct nfs_seqid_counter *seqid, int status
seqid->flags |= NFS_SEQID_CONFIRMED;
}
-struct nfs_unique_id {
- struct rb_node rb_node;
- __u64 id;
-};
-
/*
* NFS4 state_owners and lock_owners are simply labels for ordered
* sequences of RPC calls. Their sole purpose is to provide once-only
* semantics by allowing the server to identify replayed requests.
*/
struct nfs4_state_owner {
- struct nfs_unique_id so_owner_id;
struct nfs_server *so_server;
struct list_head so_lru;
unsigned long so_expires;
@@ -105,7 +97,6 @@ struct nfs4_state_owner {
unsigned long so_flags;
struct list_head so_states;
struct nfs_seqid_counter so_seqid;
- struct rpc_sequence so_sequence;
};
enum {
@@ -146,8 +137,6 @@ struct nfs4_lock_state {
#define NFS_LOCK_INITIALIZED 1
int ls_flags;
struct nfs_seqid_counter ls_seqid;
- struct rpc_sequence ls_sequence;
- struct nfs_unique_id ls_id;
nfs4_stateid ls_stateid;
atomic_t ls_count;
struct nfs4_lock_owner ls_owner;
@@ -193,6 +182,7 @@ struct nfs4_exception {
long timeout;
int retry;
struct nfs4_state *state;
+ struct inode *inode;
};
struct nfs4_state_recovery_ops {
@@ -224,7 +214,7 @@ extern int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait, boo
extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle);
extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
struct nfs4_fs_locations *fs_locations, struct page *page);
-extern void nfs4_release_lockowner(const struct nfs4_lock_state *);
+extern int nfs4_release_lockowner(struct nfs4_lock_state *);
extern const struct xattr_handler *nfs4_xattr_handlers[];
#if defined(CONFIG_NFS_V4_1)
@@ -233,12 +223,13 @@ static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *ser
return server->nfs_client->cl_session;
}
+extern bool nfs4_set_task_privileged(struct rpc_task *task, void *dummy);
extern int nfs4_setup_sequence(const struct nfs_server *server,
struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
- int cache_reply, struct rpc_task *task);
+ struct rpc_task *task);
extern int nfs41_setup_sequence(struct nfs4_session *session,
struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
- int cache_reply, struct rpc_task *task);
+ struct rpc_task *task);
extern void nfs4_destroy_session(struct nfs4_session *session);
extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp);
extern int nfs4_proc_create_session(struct nfs_client *);
@@ -269,7 +260,7 @@ static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *ser
static inline int nfs4_setup_sequence(const struct nfs_server *server,
struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
- int cache_reply, struct rpc_task *task)
+ struct rpc_task *task)
{
return 0;
}
@@ -319,7 +310,7 @@ static inline void nfs4_schedule_session_recovery(struct nfs4_session *session)
}
#endif /* CONFIG_NFS_V4_1 */
-extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *);
+extern struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *, gfp_t);
extern void nfs4_put_state_owner(struct nfs4_state_owner *);
extern void nfs4_purge_state_owners(struct nfs_server *);
extern struct nfs4_state * nfs4_get_open_state(struct inode *, struct nfs4_state_owner *);
@@ -327,6 +318,8 @@ extern void nfs4_put_open_state(struct nfs4_state *);
extern void nfs4_close_state(struct nfs4_state *, fmode_t);
extern void nfs4_close_sync(struct nfs4_state *, fmode_t);
extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t);
+extern void nfs_inode_find_state_and_recover(struct inode *inode,
+ const nfs4_stateid *stateid);
extern void nfs4_schedule_lease_recovery(struct nfs_client *);
extern void nfs4_schedule_state_manager(struct nfs_client *);
extern void nfs4_schedule_path_down_recovery(struct nfs_client *clp);
@@ -337,7 +330,8 @@ extern void nfs41_handle_server_scope(struct nfs_client *,
struct server_scope **);
extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp);
extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl);
-extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t, pid_t);
+extern void nfs4_select_rw_stateid(nfs4_stateid *, struct nfs4_state *,
+ fmode_t, fl_owner_t, pid_t);
extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask);
extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task);
@@ -346,6 +340,8 @@ extern void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid);
extern void nfs_release_seqid(struct nfs_seqid *seqid);
extern void nfs_free_seqid(struct nfs_seqid *seqid);
+extern void nfs4_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp);
+
extern const nfs4_stateid zero_stateid;
/* nfs4xdr.c */
@@ -357,6 +353,16 @@ struct nfs4_mount_data;
extern struct svc_version nfs4_callback_version1;
extern struct svc_version nfs4_callback_version4;
+static inline void nfs4_stateid_copy(nfs4_stateid *dst, const nfs4_stateid *src)
+{
+ memcpy(dst, src, sizeof(*dst));
+}
+
+static inline bool nfs4_stateid_match(const nfs4_stateid *dst, const nfs4_stateid *src)
+{
+ return memcmp(dst, src, sizeof(*dst)) == 0;
+}
+
#else
#define nfs4_close_state(a, b) do { } while (0)
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 71ec08617e2..5acfd9ea8a3 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -33,7 +33,10 @@
#include <linux/nfs_page.h>
#include <linux/module.h>
+#include <linux/sunrpc/metrics.h>
+
#include "internal.h"
+#include "delegation.h"
#include "nfs4filelayout.h"
#define NFSDBG_FACILITY NFSDBG_PNFS_LD
@@ -84,12 +87,27 @@ static int filelayout_async_handle_error(struct rpc_task *task,
struct nfs_client *clp,
int *reset)
{
+ struct nfs_server *mds_server = NFS_SERVER(state->inode);
+ struct nfs_client *mds_client = mds_server->nfs_client;
+
if (task->tk_status >= 0)
return 0;
-
*reset = 0;
switch (task->tk_status) {
+ /* MDS state errors */
+ case -NFS4ERR_DELEG_REVOKED:
+ case -NFS4ERR_ADMIN_REVOKED:
+ case -NFS4ERR_BAD_STATEID:
+ nfs_remove_bad_delegation(state->inode);
+ case -NFS4ERR_OPENMODE:
+ nfs4_schedule_stateid_recovery(mds_server, state);
+ goto wait_on_recovery;
+ case -NFS4ERR_EXPIRED:
+ nfs4_schedule_stateid_recovery(mds_server, state);
+ nfs4_schedule_lease_recovery(mds_client);
+ goto wait_on_recovery;
+ /* DS session errors */
case -NFS4ERR_BADSESSION:
case -NFS4ERR_BADSLOT:
case -NFS4ERR_BAD_HIGH_SLOT:
@@ -115,8 +133,14 @@ static int filelayout_async_handle_error(struct rpc_task *task,
*reset = 1;
break;
}
+out:
task->tk_status = 0;
return -EAGAIN;
+wait_on_recovery:
+ rpc_sleep_on(&mds_client->cl_rpcwaitq, task, NULL);
+ if (test_bit(NFS4CLNT_MANAGER_RUNNING, &mds_client->cl_state) == 0)
+ rpc_wake_up_queued_task(&mds_client->cl_rpcwaitq, task);
+ goto out;
}
/* NFS_PROTO call done callback routines */
@@ -173,7 +197,7 @@ static void filelayout_read_prepare(struct rpc_task *task, void *data)
if (nfs41_setup_sequence(rdata->ds_clp->cl_session,
&rdata->args.seq_args, &rdata->res.seq_res,
- 0, task))
+ task))
return;
rpc_call_start(task);
@@ -189,10 +213,18 @@ static void filelayout_read_call_done(struct rpc_task *task, void *data)
rdata->mds_ops->rpc_call_done(task, data);
}
+static void filelayout_read_count_stats(struct rpc_task *task, void *data)
+{
+ struct nfs_read_data *rdata = (struct nfs_read_data *)data;
+
+ rpc_count_iostats(task, NFS_SERVER(rdata->inode)->client->cl_metrics);
+}
+
static void filelayout_read_release(void *data)
{
struct nfs_read_data *rdata = (struct nfs_read_data *)data;
+ put_lseg(rdata->lseg);
rdata->mds_ops->rpc_release(data);
}
@@ -254,7 +286,7 @@ static void filelayout_write_prepare(struct rpc_task *task, void *data)
if (nfs41_setup_sequence(wdata->ds_clp->cl_session,
&wdata->args.seq_args, &wdata->res.seq_res,
- 0, task))
+ task))
return;
rpc_call_start(task);
@@ -268,10 +300,18 @@ static void filelayout_write_call_done(struct rpc_task *task, void *data)
wdata->mds_ops->rpc_call_done(task, data);
}
+static void filelayout_write_count_stats(struct rpc_task *task, void *data)
+{
+ struct nfs_write_data *wdata = (struct nfs_write_data *)data;
+
+ rpc_count_iostats(task, NFS_SERVER(wdata->inode)->client->cl_metrics);
+}
+
static void filelayout_write_release(void *data)
{
struct nfs_write_data *wdata = (struct nfs_write_data *)data;
+ put_lseg(wdata->lseg);
wdata->mds_ops->rpc_release(data);
}
@@ -282,24 +322,28 @@ static void filelayout_commit_release(void *data)
nfs_commit_release_pages(wdata);
if (atomic_dec_and_test(&NFS_I(wdata->inode)->commits_outstanding))
nfs_commit_clear_lock(NFS_I(wdata->inode));
+ put_lseg(wdata->lseg);
nfs_commitdata_release(wdata);
}
-struct rpc_call_ops filelayout_read_call_ops = {
+static const struct rpc_call_ops filelayout_read_call_ops = {
.rpc_call_prepare = filelayout_read_prepare,
.rpc_call_done = filelayout_read_call_done,
+ .rpc_count_stats = filelayout_read_count_stats,
.rpc_release = filelayout_read_release,
};
-struct rpc_call_ops filelayout_write_call_ops = {
+static const struct rpc_call_ops filelayout_write_call_ops = {
.rpc_call_prepare = filelayout_write_prepare,
.rpc_call_done = filelayout_write_call_done,
+ .rpc_count_stats = filelayout_write_count_stats,
.rpc_release = filelayout_write_release,
};
-struct rpc_call_ops filelayout_commit_call_ops = {
+static const struct rpc_call_ops filelayout_commit_call_ops = {
.rpc_call_prepare = filelayout_write_prepare,
.rpc_call_done = filelayout_write_call_done,
+ .rpc_count_stats = filelayout_write_count_stats,
.rpc_release = filelayout_commit_release,
};
@@ -367,7 +411,8 @@ filelayout_write_pagelist(struct nfs_write_data *data, int sync)
idx = nfs4_fl_calc_ds_index(lseg, j);
ds = nfs4_fl_prepare_ds(lseg, idx);
if (!ds) {
- printk(KERN_ERR "%s: prepare_ds failed, use MDS\n", __func__);
+ printk(KERN_ERR "NFS: %s: prepare_ds failed, use MDS\n",
+ __func__);
set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
return PNFS_NOT_ATTEMPTED;
@@ -575,7 +620,7 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo,
goto out_err_free;
fl->fh_array[i]->size = be32_to_cpup(p++);
if (sizeof(struct nfs_fh) < fl->fh_array[i]->size) {
- printk(KERN_ERR "Too big fh %d received %d\n",
+ printk(KERN_ERR "NFS: Too big fh %d received %d\n",
i, fl->fh_array[i]->size);
goto out_err_free;
}
@@ -640,14 +685,16 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid,
int size = (fl->stripe_type == STRIPE_SPARSE) ?
fl->dsaddr->ds_num : fl->dsaddr->stripe_count;
- fl->commit_buckets = kcalloc(size, sizeof(struct list_head), gfp_flags);
+ fl->commit_buckets = kcalloc(size, sizeof(struct nfs4_fl_commit_bucket), gfp_flags);
if (!fl->commit_buckets) {
filelayout_free_lseg(&fl->generic_hdr);
return NULL;
}
fl->number_of_buckets = size;
- for (i = 0; i < size; i++)
- INIT_LIST_HEAD(&fl->commit_buckets[i]);
+ for (i = 0; i < size; i++) {
+ INIT_LIST_HEAD(&fl->commit_buckets[i].written);
+ INIT_LIST_HEAD(&fl->commit_buckets[i].committing);
+ }
}
return &fl->generic_hdr;
}
@@ -679,7 +726,7 @@ filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
return (p_stripe == r_stripe);
}
-void
+static void
filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio,
struct nfs_page *req)
{
@@ -696,7 +743,7 @@ filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio,
nfs_pageio_reset_read_mds(pgio);
}
-void
+static void
filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio,
struct nfs_page *req)
{
@@ -725,11 +772,6 @@ static const struct nfs_pageio_ops filelayout_pg_write_ops = {
.pg_doio = pnfs_generic_pg_writepages,
};
-static bool filelayout_mark_pnfs_commit(struct pnfs_layout_segment *lseg)
-{
- return !FILELAYOUT_LSEG(lseg)->commit_through_mds;
-}
-
static u32 select_bucket_index(struct nfs4_filelayout_segment *fl, u32 j)
{
if (fl->stripe_type == STRIPE_SPARSE)
@@ -738,13 +780,48 @@ static u32 select_bucket_index(struct nfs4_filelayout_segment *fl, u32 j)
return j;
}
-struct list_head *filelayout_choose_commit_list(struct nfs_page *req)
+/* The generic layer is about to remove the req from the commit list.
+ * If this will make the bucket empty, it will need to put the lseg reference.
+ */
+static void
+filelayout_clear_request_commit(struct nfs_page *req)
+{
+ struct pnfs_layout_segment *freeme = NULL;
+ struct inode *inode = req->wb_context->dentry->d_inode;
+
+ spin_lock(&inode->i_lock);
+ if (!test_and_clear_bit(PG_COMMIT_TO_DS, &req->wb_flags))
+ goto out;
+ if (list_is_singular(&req->wb_list)) {
+ struct pnfs_layout_segment *lseg;
+
+ /* From here we can find the bucket, but for the moment,
+ * since there is only one relevant lseg...
+ */
+ list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list) {
+ if (lseg->pls_range.iomode == IOMODE_RW) {
+ freeme = lseg;
+ break;
+ }
+ }
+ }
+out:
+ nfs_request_remove_commit_list(req);
+ spin_unlock(&inode->i_lock);
+ put_lseg(freeme);
+}
+
+static struct list_head *
+filelayout_choose_commit_list(struct nfs_page *req,
+ struct pnfs_layout_segment *lseg)
{
- struct pnfs_layout_segment *lseg = req->wb_commit_lseg;
struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
u32 i, j;
struct list_head *list;
+ if (fl->commit_through_mds)
+ return &NFS_I(req->wb_context->dentry->d_inode)->commit_list;
+
/* Note that we are calling nfs4_fl_calc_j_index on each page
* that ends up being committed to a data server. An attractive
* alternative is to add a field to nfs_write_data and nfs_page
@@ -754,14 +831,30 @@ struct list_head *filelayout_choose_commit_list(struct nfs_page *req)
j = nfs4_fl_calc_j_index(lseg,
(loff_t)req->wb_index << PAGE_CACHE_SHIFT);
i = select_bucket_index(fl, j);
- list = &fl->commit_buckets[i];
+ list = &fl->commit_buckets[i].written;
if (list_empty(list)) {
- /* Non-empty buckets hold a reference on the lseg */
+ /* Non-empty buckets hold a reference on the lseg. That ref
+ * is normally transferred to the COMMIT call and released
+ * there. It could also be released if the last req is pulled
+ * off due to a rewrite, in which case it will be done in
+ * filelayout_remove_commit_req
+ */
get_lseg(lseg);
}
+ set_bit(PG_COMMIT_TO_DS, &req->wb_flags);
return list;
}
+static void
+filelayout_mark_request_commit(struct nfs_page *req,
+ struct pnfs_layout_segment *lseg)
+{
+ struct list_head *list;
+
+ list = filelayout_choose_commit_list(req, lseg);
+ nfs_request_add_commit_list(req, list);
+}
+
static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i)
{
struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
@@ -797,11 +890,12 @@ static int filelayout_initiate_commit(struct nfs_write_data *data, int how)
idx = calc_ds_index_from_commit(lseg, data->ds_commit_index);
ds = nfs4_fl_prepare_ds(lseg, idx);
if (!ds) {
- printk(KERN_ERR "%s: prepare_ds failed, use MDS\n", __func__);
+ printk(KERN_ERR "NFS: %s: prepare_ds failed, use MDS\n",
+ __func__);
set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
prepare_to_resend_writes(data);
- data->mds_ops->rpc_release(data);
+ filelayout_commit_release(data);
return -EAGAIN;
}
dprintk("%s ino %lu, how %d\n", __func__, data->inode->i_ino, how);
@@ -817,24 +911,87 @@ static int filelayout_initiate_commit(struct nfs_write_data *data, int how)
/*
* This is only useful while we are using whole file layouts.
*/
-static struct pnfs_layout_segment *find_only_write_lseg(struct inode *inode)
+static struct pnfs_layout_segment *
+find_only_write_lseg_locked(struct inode *inode)
{
- struct pnfs_layout_segment *lseg, *rv = NULL;
+ struct pnfs_layout_segment *lseg;
- spin_lock(&inode->i_lock);
list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list)
if (lseg->pls_range.iomode == IOMODE_RW)
- rv = get_lseg(lseg);
+ return lseg;
+ return NULL;
+}
+
+static struct pnfs_layout_segment *find_only_write_lseg(struct inode *inode)
+{
+ struct pnfs_layout_segment *rv;
+
+ spin_lock(&inode->i_lock);
+ rv = find_only_write_lseg_locked(inode);
+ if (rv)
+ get_lseg(rv);
spin_unlock(&inode->i_lock);
return rv;
}
-static int alloc_ds_commits(struct inode *inode, struct list_head *list)
+static int
+filelayout_scan_ds_commit_list(struct nfs4_fl_commit_bucket *bucket, int max,
+ spinlock_t *lock)
+{
+ struct list_head *src = &bucket->written;
+ struct list_head *dst = &bucket->committing;
+ struct nfs_page *req, *tmp;
+ int ret = 0;
+
+ list_for_each_entry_safe(req, tmp, src, wb_list) {
+ if (!nfs_lock_request(req))
+ continue;
+ if (cond_resched_lock(lock))
+ list_safe_reset_next(req, tmp, wb_list);
+ nfs_request_remove_commit_list(req);
+ clear_bit(PG_COMMIT_TO_DS, &req->wb_flags);
+ nfs_list_add_request(req, dst);
+ ret++;
+ if (ret == max)
+ break;
+ }
+ return ret;
+}
+
+/* Move reqs from written to committing lists, returning count of number moved.
+ * Note called with i_lock held.
+ */
+static int filelayout_scan_commit_lists(struct inode *inode, int max,
+ spinlock_t *lock)
+{
+ struct pnfs_layout_segment *lseg;
+ struct nfs4_filelayout_segment *fl;
+ int i, rv = 0, cnt;
+
+ lseg = find_only_write_lseg_locked(inode);
+ if (!lseg)
+ goto out_done;
+ fl = FILELAYOUT_LSEG(lseg);
+ if (fl->commit_through_mds)
+ goto out_done;
+ for (i = 0; i < fl->number_of_buckets && max != 0; i++) {
+ cnt = filelayout_scan_ds_commit_list(&fl->commit_buckets[i],
+ max, lock);
+ max -= cnt;
+ rv += cnt;
+ }
+out_done:
+ return rv;
+}
+
+static unsigned int
+alloc_ds_commits(struct inode *inode, struct list_head *list)
{
struct pnfs_layout_segment *lseg;
struct nfs4_filelayout_segment *fl;
struct nfs_write_data *data;
int i, j;
+ unsigned int nreq = 0;
/* Won't need this when non-whole file layout segments are supported
* instead we will use a pnfs_layout_hdr structure */
@@ -843,28 +1000,27 @@ static int alloc_ds_commits(struct inode *inode, struct list_head *list)
return 0;
fl = FILELAYOUT_LSEG(lseg);
for (i = 0; i < fl->number_of_buckets; i++) {
- if (list_empty(&fl->commit_buckets[i]))
+ if (list_empty(&fl->commit_buckets[i].committing))
continue;
data = nfs_commitdata_alloc();
if (!data)
- goto out_bad;
+ break;
data->ds_commit_index = i;
data->lseg = lseg;
list_add(&data->pages, list);
+ nreq++;
}
- put_lseg(lseg);
- return 0;
-out_bad:
+ /* Clean up on error */
for (j = i; j < fl->number_of_buckets; j++) {
- if (list_empty(&fl->commit_buckets[i]))
+ if (list_empty(&fl->commit_buckets[i].committing))
continue;
- nfs_retry_commit(&fl->commit_buckets[i], lseg);
+ nfs_retry_commit(&fl->commit_buckets[i].committing, lseg);
put_lseg(lseg); /* associated with emptying bucket */
}
put_lseg(lseg);
/* Caller will clean up entries put on list */
- return -ENOMEM;
+ return nreq;
}
/* This follows nfs_commit_list pretty closely */
@@ -874,40 +1030,40 @@ filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
{
struct nfs_write_data *data, *tmp;
LIST_HEAD(list);
+ unsigned int nreq = 0;
if (!list_empty(mds_pages)) {
data = nfs_commitdata_alloc();
- if (!data)
- goto out_bad;
- data->lseg = NULL;
- list_add(&data->pages, &list);
+ if (data != NULL) {
+ data->lseg = NULL;
+ list_add(&data->pages, &list);
+ nreq++;
+ } else
+ nfs_retry_commit(mds_pages, NULL);
}
- if (alloc_ds_commits(inode, &list))
- goto out_bad;
+ nreq += alloc_ds_commits(inode, &list);
+
+ if (nreq == 0) {
+ nfs_commit_clear_lock(NFS_I(inode));
+ goto out;
+ }
+
+ atomic_add(nreq, &NFS_I(inode)->commits_outstanding);
list_for_each_entry_safe(data, tmp, &list, pages) {
list_del_init(&data->pages);
- atomic_inc(&NFS_I(inode)->commits_outstanding);
if (!data->lseg) {
nfs_init_commit(data, mds_pages, NULL);
nfs_initiate_commit(data, NFS_CLIENT(inode),
data->mds_ops, how);
} else {
- nfs_init_commit(data, &FILELAYOUT_LSEG(data->lseg)->commit_buckets[data->ds_commit_index], data->lseg);
+ nfs_init_commit(data, &FILELAYOUT_LSEG(data->lseg)->commit_buckets[data->ds_commit_index].committing, data->lseg);
filelayout_initiate_commit(data, how);
}
}
- return 0;
- out_bad:
- list_for_each_entry_safe(data, tmp, &list, pages) {
- nfs_retry_commit(&data->pages, data->lseg);
- list_del_init(&data->pages);
- nfs_commit_free(data);
- }
- nfs_retry_commit(mds_pages, NULL);
- nfs_commit_clear_lock(NFS_I(inode));
- return -ENOMEM;
+out:
+ return PNFS_ATTEMPTED;
}
static void
@@ -924,8 +1080,9 @@ static struct pnfs_layoutdriver_type filelayout_type = {
.free_lseg = filelayout_free_lseg,
.pg_read_ops = &filelayout_pg_read_ops,
.pg_write_ops = &filelayout_pg_write_ops,
- .mark_pnfs_commit = filelayout_mark_pnfs_commit,
- .choose_commit_list = filelayout_choose_commit_list,
+ .mark_request_commit = filelayout_mark_request_commit,
+ .clear_request_commit = filelayout_clear_request_commit,
+ .scan_commit_lists = filelayout_scan_commit_lists,
.commit_pagelist = filelayout_commit_pagelist,
.read_pagelist = filelayout_read_pagelist,
.write_pagelist = filelayout_write_pagelist,
diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
index 2e42284253f..21190bb1f5e 100644
--- a/fs/nfs/nfs4filelayout.h
+++ b/fs/nfs/nfs4filelayout.h
@@ -74,6 +74,11 @@ struct nfs4_file_layout_dsaddr {
struct nfs4_pnfs_ds *ds_list[1];
};
+struct nfs4_fl_commit_bucket {
+ struct list_head written;
+ struct list_head committing;
+};
+
struct nfs4_filelayout_segment {
struct pnfs_layout_segment generic_hdr;
u32 stripe_type;
@@ -84,7 +89,7 @@ struct nfs4_filelayout_segment {
struct nfs4_file_layout_dsaddr *dsaddr; /* Point to GETDEVINFO data */
unsigned int num_fh;
struct nfs_fh **fh_array;
- struct list_head *commit_buckets; /* Sort commits to ds */
+ struct nfs4_fl_commit_bucket *commit_buckets; /* Sort commits to ds */
int number_of_buckets;
};
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index ed388aae968..a866bbd2890 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -45,7 +45,7 @@
* - incremented when a device id maps a data server already in the cache.
* - decremented when deviceid is removed from the cache.
*/
-DEFINE_SPINLOCK(nfs4_ds_cache_lock);
+static DEFINE_SPINLOCK(nfs4_ds_cache_lock);
static LIST_HEAD(nfs4_data_server_cache);
/* Debug routines */
@@ -108,58 +108,40 @@ same_sockaddr(struct sockaddr *addr1, struct sockaddr *addr2)
return false;
}
-/*
- * Lookup DS by addresses. The first matching address returns true.
- * nfs4_ds_cache_lock is held
- */
-static struct nfs4_pnfs_ds *
-_data_server_lookup_locked(struct list_head *dsaddrs)
+static bool
+_same_data_server_addrs_locked(const struct list_head *dsaddrs1,
+ const struct list_head *dsaddrs2)
{
- struct nfs4_pnfs_ds *ds;
struct nfs4_pnfs_ds_addr *da1, *da2;
- list_for_each_entry(da1, dsaddrs, da_node) {
- list_for_each_entry(ds, &nfs4_data_server_cache, ds_node) {
- list_for_each_entry(da2, &ds->ds_addrs, da_node) {
- if (same_sockaddr(
- (struct sockaddr *)&da1->da_addr,
- (struct sockaddr *)&da2->da_addr))
- return ds;
- }
- }
+ /* step through both lists, comparing as we go */
+ for (da1 = list_first_entry(dsaddrs1, typeof(*da1), da_node),
+ da2 = list_first_entry(dsaddrs2, typeof(*da2), da_node);
+ da1 != NULL && da2 != NULL;
+ da1 = list_entry(da1->da_node.next, typeof(*da1), da_node),
+ da2 = list_entry(da2->da_node.next, typeof(*da2), da_node)) {
+ if (!same_sockaddr((struct sockaddr *)&da1->da_addr,
+ (struct sockaddr *)&da2->da_addr))
+ return false;
}
- return NULL;
+ if (da1 == NULL && da2 == NULL)
+ return true;
+
+ return false;
}
/*
- * Compare two lists of addresses.
+ * Lookup DS by addresses. nfs4_ds_cache_lock is held
*/
-static bool
-_data_server_match_all_addrs_locked(struct list_head *dsaddrs1,
- struct list_head *dsaddrs2)
+static struct nfs4_pnfs_ds *
+_data_server_lookup_locked(const struct list_head *dsaddrs)
{
- struct nfs4_pnfs_ds_addr *da1, *da2;
- size_t count1 = 0,
- count2 = 0;
-
- list_for_each_entry(da1, dsaddrs1, da_node)
- count1++;
-
- list_for_each_entry(da2, dsaddrs2, da_node) {
- bool found = false;
- count2++;
- list_for_each_entry(da1, dsaddrs1, da_node) {
- if (same_sockaddr((struct sockaddr *)&da1->da_addr,
- (struct sockaddr *)&da2->da_addr)) {
- found = true;
- break;
- }
- }
- if (!found)
- return false;
- }
+ struct nfs4_pnfs_ds *ds;
- return (count1 == count2);
+ list_for_each_entry(ds, &nfs4_data_server_cache, ds_node)
+ if (_same_data_server_addrs_locked(&ds->ds_addrs, dsaddrs))
+ return ds;
+ return NULL;
}
/*
@@ -356,11 +338,6 @@ nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags)
dprintk("%s add new data server %s\n", __func__,
ds->ds_remotestr);
} else {
- if (!_data_server_match_all_addrs_locked(&tmp_ds->ds_addrs,
- dsaddrs)) {
- dprintk("%s: multipath address mismatch: %s != %s",
- __func__, tmp_ds->ds_remotestr, remotestr);
- }
kfree(remotestr);
kfree(ds);
atomic_inc(&tmp_ds->ds_count);
@@ -378,11 +355,11 @@ out:
* Currently only supports ipv4, ipv6 and one multi-path address.
*/
static struct nfs4_pnfs_ds_addr *
-decode_ds_addr(struct xdr_stream *streamp, gfp_t gfp_flags)
+decode_ds_addr(struct net *net, struct xdr_stream *streamp, gfp_t gfp_flags)
{
struct nfs4_pnfs_ds_addr *da = NULL;
char *buf, *portstr;
- u32 port;
+ __be16 port;
int nlen, rlen;
int tmp[2];
__be32 *p;
@@ -457,7 +434,7 @@ decode_ds_addr(struct xdr_stream *streamp, gfp_t gfp_flags)
INIT_LIST_HEAD(&da->da_node);
- if (!rpc_pton(buf, portstr-buf, (struct sockaddr *)&da->da_addr,
+ if (!rpc_pton(net, buf, portstr-buf, (struct sockaddr *)&da->da_addr,
sizeof(da->da_addr))) {
dprintk("%s: error parsing address %s\n", __func__, buf);
goto out_free_da;
@@ -554,7 +531,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
cnt = be32_to_cpup(p);
dprintk("%s stripe count %d\n", __func__, cnt);
if (cnt > NFS4_PNFS_MAX_STRIPE_CNT) {
- printk(KERN_WARNING "%s: stripe count %d greater than "
+ printk(KERN_WARNING "NFS: %s: stripe count %d greater than "
"supported maximum %d\n", __func__,
cnt, NFS4_PNFS_MAX_STRIPE_CNT);
goto out_err_free_scratch;
@@ -585,7 +562,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
num = be32_to_cpup(p);
dprintk("%s ds_num %u\n", __func__, num);
if (num > NFS4_PNFS_MAX_MULTI_CNT) {
- printk(KERN_WARNING "%s: multipath count %d greater than "
+ printk(KERN_WARNING "NFS: %s: multipath count %d greater than "
"supported maximum %d\n", __func__,
num, NFS4_PNFS_MAX_MULTI_CNT);
goto out_err_free_stripe_indices;
@@ -593,7 +570,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
/* validate stripe indices are all < num */
if (max_stripe_index >= num) {
- printk(KERN_WARNING "%s: stripe index %u >= num ds %u\n",
+ printk(KERN_WARNING "NFS: %s: stripe index %u >= num ds %u\n",
__func__, max_stripe_index, num);
goto out_err_free_stripe_indices;
}
@@ -625,7 +602,8 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
mp_count = be32_to_cpup(p); /* multipath count */
for (j = 0; j < mp_count; j++) {
- da = decode_ds_addr(&stream, gfp_flags);
+ da = decode_ds_addr(NFS_SERVER(ino)->nfs_client->net,
+ &stream, gfp_flags);
if (da)
list_add_tail(&da->da_node, &dsaddrs);
}
@@ -686,7 +664,7 @@ decode_and_add_device(struct inode *inode, struct pnfs_device *dev, gfp_t gfp_fl
new = decode_device(inode, dev, gfp_flags);
if (!new) {
- printk(KERN_WARNING "%s: Could not decode or add device\n",
+ printk(KERN_WARNING "NFS: %s: Could not decode or add device\n",
__func__);
return NULL;
}
@@ -835,7 +813,7 @@ nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx)
struct nfs4_pnfs_ds *ds = dsaddr->ds_list[ds_idx];
if (ds == NULL) {
- printk(KERN_ERR "%s: No data server for offset index %d\n",
+ printk(KERN_ERR "NFS: %s: No data server for offset index %d\n",
__func__, ds_idx);
return NULL;
}
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index bb80c49b653..9c8eca315f4 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -94,13 +94,14 @@ static int nfs4_validate_fspath(struct dentry *dentry,
}
static size_t nfs_parse_server_name(char *string, size_t len,
- struct sockaddr *sa, size_t salen)
+ struct sockaddr *sa, size_t salen, struct nfs_server *server)
{
+ struct net *net = rpc_net_ns(server->client);
ssize_t ret;
- ret = rpc_pton(string, len, sa, salen);
+ ret = rpc_pton(net, string, len, sa, salen);
if (ret == 0) {
- ret = nfs_dns_resolve_name(string, len, sa, salen);
+ ret = nfs_dns_resolve_name(net, string, len, sa, salen);
if (ret < 0)
ret = 0;
}
@@ -137,7 +138,8 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
continue;
mountdata->addrlen = nfs_parse_server_name(buf->data, buf->len,
- mountdata->addr, addr_bufsize);
+ mountdata->addr, addr_bufsize,
+ NFS_SB(mountdata->sb));
if (mountdata->addrlen == 0)
continue;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 75366dc8968..f82bde005a8 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -72,18 +72,21 @@
#define NFS4_MAX_LOOP_ON_RECOVER (10)
+static unsigned short max_session_slots = NFS4_DEF_SLOT_TABLE_SIZE;
+
struct nfs4_opendata;
static int _nfs4_proc_open(struct nfs4_opendata *data);
static int _nfs4_recover_proc_open(struct nfs4_opendata *data);
static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *);
+static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr);
static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr);
static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
struct nfs_fattr *fattr, struct iattr *sattr,
struct nfs4_state *state);
#ifdef CONFIG_NFS_V4_1
-static int nfs41_test_stateid(struct nfs_server *, struct nfs4_state *);
-static int nfs41_free_stateid(struct nfs_server *, struct nfs4_state *);
+static int nfs41_test_stateid(struct nfs_server *, nfs4_stateid *);
+static int nfs41_free_stateid(struct nfs_server *, nfs4_stateid *);
#endif
/* Prevent leaks of NFSv4 errors into userland */
static int nfs4_map_errors(int err)
@@ -193,7 +196,7 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent
* when talking to the server, we always send cookie 0
* instead of 1 or 2.
*/
- start = p = kmap_atomic(*readdir->pages, KM_USER0);
+ start = p = kmap_atomic(*readdir->pages);
if (cookie == 0) {
*p++ = xdr_one; /* next */
@@ -221,7 +224,7 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent
readdir->pgbase = (char *)p - (char *)start;
readdir->count -= readdir->pgbase;
- kunmap_atomic(start, KM_USER0);
+ kunmap_atomic(start);
}
static int nfs4_wait_clnt_recover(struct nfs_client *clp)
@@ -259,17 +262,29 @@ static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struc
{
struct nfs_client *clp = server->nfs_client;
struct nfs4_state *state = exception->state;
+ struct inode *inode = exception->inode;
int ret = errorcode;
exception->retry = 0;
switch(errorcode) {
case 0:
return 0;
+ case -NFS4ERR_OPENMODE:
+ if (inode && nfs_have_delegation(inode, FMODE_READ)) {
+ nfs_inode_return_delegation(inode);
+ exception->retry = 1;
+ return 0;
+ }
+ if (state == NULL)
+ break;
+ nfs4_schedule_stateid_recovery(server, state);
+ goto wait_on_recovery;
+ case -NFS4ERR_DELEG_REVOKED:
case -NFS4ERR_ADMIN_REVOKED:
case -NFS4ERR_BAD_STATEID:
- case -NFS4ERR_OPENMODE:
if (state == NULL)
break;
+ nfs_remove_bad_delegation(state->inode);
nfs4_schedule_stateid_recovery(server, state);
goto wait_on_recovery;
case -NFS4ERR_EXPIRED:
@@ -360,16 +375,14 @@ static void renew_lease(const struct nfs_server *server, unsigned long timestamp
* When updating highest_used_slotid there may be "holes" in the bitmap
* so we need to scan down from highest_used_slotid to 0 looking for the now
* highest slotid in use.
- * If none found, highest_used_slotid is set to -1.
+ * If none found, highest_used_slotid is set to NFS4_NO_SLOT.
*
* Must be called while holding tbl->slot_tbl_lock
*/
static void
-nfs4_free_slot(struct nfs4_slot_table *tbl, u8 free_slotid)
+nfs4_free_slot(struct nfs4_slot_table *tbl, u32 slotid)
{
- int slotid = free_slotid;
-
- BUG_ON(slotid < 0 || slotid >= NFS4_MAX_SLOT_TABLE);
+ BUG_ON(slotid >= NFS4_MAX_SLOT_TABLE);
/* clear used bit in bitmap */
__clear_bit(slotid, tbl->used_slots);
@@ -379,10 +392,16 @@ nfs4_free_slot(struct nfs4_slot_table *tbl, u8 free_slotid)
if (slotid < tbl->max_slots)
tbl->highest_used_slotid = slotid;
else
- tbl->highest_used_slotid = -1;
+ tbl->highest_used_slotid = NFS4_NO_SLOT;
}
- dprintk("%s: free_slotid %u highest_used_slotid %d\n", __func__,
- free_slotid, tbl->highest_used_slotid);
+ dprintk("%s: slotid %u highest_used_slotid %d\n", __func__,
+ slotid, tbl->highest_used_slotid);
+}
+
+bool nfs4_set_task_privileged(struct rpc_task *task, void *dummy)
+{
+ rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED);
+ return true;
}
/*
@@ -390,16 +409,13 @@ nfs4_free_slot(struct nfs4_slot_table *tbl, u8 free_slotid)
*/
static void nfs4_check_drain_fc_complete(struct nfs4_session *ses)
{
- struct rpc_task *task;
-
if (!test_bit(NFS4_SESSION_DRAINING, &ses->session_state)) {
- task = rpc_wake_up_next(&ses->fc_slot_table.slot_tbl_waitq);
- if (task)
- rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED);
+ rpc_wake_up_first(&ses->fc_slot_table.slot_tbl_waitq,
+ nfs4_set_task_privileged, NULL);
return;
}
- if (ses->fc_slot_table.highest_used_slotid != -1)
+ if (ses->fc_slot_table.highest_used_slotid != NFS4_NO_SLOT)
return;
dprintk("%s COMPLETE: Session Fore Channel Drained\n", __func__);
@@ -412,7 +428,7 @@ static void nfs4_check_drain_fc_complete(struct nfs4_session *ses)
void nfs4_check_drain_bc_complete(struct nfs4_session *ses)
{
if (!test_bit(NFS4_SESSION_DRAINING, &ses->session_state) ||
- ses->bc_slot_table.highest_used_slotid != -1)
+ ses->bc_slot_table.highest_used_slotid != NFS4_NO_SLOT)
return;
dprintk("%s COMPLETE: Session Back Channel Drained\n", __func__);
complete(&ses->bc_slot_table.complete);
@@ -507,25 +523,25 @@ static int nfs4_sequence_done(struct rpc_task *task,
* nfs4_find_slot looks for an unset bit in the used_slots bitmap.
* If found, we mark the slot as used, update the highest_used_slotid,
* and respectively set up the sequence operation args.
- * The slot number is returned if found, or NFS4_MAX_SLOT_TABLE otherwise.
+ * The slot number is returned if found, or NFS4_NO_SLOT otherwise.
*
* Note: must be called with under the slot_tbl_lock.
*/
-static u8
+static u32
nfs4_find_slot(struct nfs4_slot_table *tbl)
{
- int slotid;
- u8 ret_id = NFS4_MAX_SLOT_TABLE;
- BUILD_BUG_ON((u8)NFS4_MAX_SLOT_TABLE != (int)NFS4_MAX_SLOT_TABLE);
+ u32 slotid;
+ u32 ret_id = NFS4_NO_SLOT;
- dprintk("--> %s used_slots=%04lx highest_used=%d max_slots=%d\n",
+ dprintk("--> %s used_slots=%04lx highest_used=%u max_slots=%u\n",
__func__, tbl->used_slots[0], tbl->highest_used_slotid,
tbl->max_slots);
slotid = find_first_zero_bit(tbl->used_slots, tbl->max_slots);
if (slotid >= tbl->max_slots)
goto out;
__set_bit(slotid, tbl->used_slots);
- if (slotid > tbl->highest_used_slotid)
+ if (slotid > tbl->highest_used_slotid ||
+ tbl->highest_used_slotid == NFS4_NO_SLOT)
tbl->highest_used_slotid = slotid;
ret_id = slotid;
out:
@@ -534,15 +550,25 @@ out:
return ret_id;
}
+static void nfs41_init_sequence(struct nfs4_sequence_args *args,
+ struct nfs4_sequence_res *res, int cache_reply)
+{
+ args->sa_session = NULL;
+ args->sa_cache_this = 0;
+ if (cache_reply)
+ args->sa_cache_this = 1;
+ res->sr_session = NULL;
+ res->sr_slot = NULL;
+}
+
int nfs41_setup_sequence(struct nfs4_session *session,
struct nfs4_sequence_args *args,
struct nfs4_sequence_res *res,
- int cache_reply,
struct rpc_task *task)
{
struct nfs4_slot *slot;
struct nfs4_slot_table *tbl;
- u8 slotid;
+ u32 slotid;
dprintk("--> %s\n", __func__);
/* slot already allocated? */
@@ -570,7 +596,7 @@ int nfs41_setup_sequence(struct nfs4_session *session,
}
slotid = nfs4_find_slot(tbl);
- if (slotid == NFS4_MAX_SLOT_TABLE) {
+ if (slotid == NFS4_NO_SLOT) {
rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL);
spin_unlock(&tbl->slot_tbl_lock);
dprintk("<-- %s: no free slots\n", __func__);
@@ -582,7 +608,6 @@ int nfs41_setup_sequence(struct nfs4_session *session,
slot = tbl->slots + slotid;
args->sa_session = session;
args->sa_slotid = slotid;
- args->sa_cache_this = cache_reply;
dprintk("<-- %s slotid=%d seqid=%d\n", __func__, slotid, slot->seq_nr);
@@ -602,24 +627,19 @@ EXPORT_SYMBOL_GPL(nfs41_setup_sequence);
int nfs4_setup_sequence(const struct nfs_server *server,
struct nfs4_sequence_args *args,
struct nfs4_sequence_res *res,
- int cache_reply,
struct rpc_task *task)
{
struct nfs4_session *session = nfs4_get_session(server);
int ret = 0;
- if (session == NULL) {
- args->sa_session = NULL;
- res->sr_session = NULL;
+ if (session == NULL)
goto out;
- }
dprintk("--> %s clp %p session %p sr_slot %td\n",
__func__, session->clp, session, res->sr_slot ?
res->sr_slot - session->fc_slot_table.slots : -1);
- ret = nfs41_setup_sequence(session, args, res, cache_reply,
- task);
+ ret = nfs41_setup_sequence(session, args, res, task);
out:
dprintk("<-- %s status=%d\n", __func__, ret);
return ret;
@@ -629,7 +649,6 @@ struct nfs41_call_sync_data {
const struct nfs_server *seq_server;
struct nfs4_sequence_args *seq_args;
struct nfs4_sequence_res *seq_res;
- int cache_reply;
};
static void nfs41_call_sync_prepare(struct rpc_task *task, void *calldata)
@@ -639,7 +658,7 @@ static void nfs41_call_sync_prepare(struct rpc_task *task, void *calldata)
dprintk("--> %s data->seq_server %p\n", __func__, data->seq_server);
if (nfs4_setup_sequence(data->seq_server, data->seq_args,
- data->seq_res, data->cache_reply, task))
+ data->seq_res, task))
return;
rpc_call_start(task);
}
@@ -657,12 +676,12 @@ static void nfs41_call_sync_done(struct rpc_task *task, void *calldata)
nfs41_sequence_done(task, data->seq_res);
}
-struct rpc_call_ops nfs41_call_sync_ops = {
+static const struct rpc_call_ops nfs41_call_sync_ops = {
.rpc_call_prepare = nfs41_call_sync_prepare,
.rpc_call_done = nfs41_call_sync_done,
};
-struct rpc_call_ops nfs41_call_priv_sync_ops = {
+static const struct rpc_call_ops nfs41_call_priv_sync_ops = {
.rpc_call_prepare = nfs41_call_priv_sync_prepare,
.rpc_call_done = nfs41_call_sync_done,
};
@@ -672,7 +691,6 @@ static int nfs4_call_sync_sequence(struct rpc_clnt *clnt,
struct rpc_message *msg,
struct nfs4_sequence_args *args,
struct nfs4_sequence_res *res,
- int cache_reply,
int privileged)
{
int ret;
@@ -681,7 +699,6 @@ static int nfs4_call_sync_sequence(struct rpc_clnt *clnt,
.seq_server = server,
.seq_args = args,
.seq_res = res,
- .cache_reply = cache_reply,
};
struct rpc_task_setup task_setup = {
.rpc_client = clnt,
@@ -690,7 +707,6 @@ static int nfs4_call_sync_sequence(struct rpc_clnt *clnt,
.callback_data = &data
};
- res->sr_slot = NULL;
if (privileged)
task_setup.callback_ops = &nfs41_call_priv_sync_ops;
task = rpc_run_task(&task_setup);
@@ -710,10 +726,17 @@ int _nfs4_call_sync_session(struct rpc_clnt *clnt,
struct nfs4_sequence_res *res,
int cache_reply)
{
- return nfs4_call_sync_sequence(clnt, server, msg, args, res, cache_reply, 0);
+ nfs41_init_sequence(args, res, cache_reply);
+ return nfs4_call_sync_sequence(clnt, server, msg, args, res, 0);
}
#else
+static inline
+void nfs41_init_sequence(struct nfs4_sequence_args *args,
+ struct nfs4_sequence_res *res, int cache_reply)
+{
+}
+
static int nfs4_sequence_done(struct rpc_task *task,
struct nfs4_sequence_res *res)
{
@@ -728,7 +751,7 @@ int _nfs4_call_sync(struct rpc_clnt *clnt,
struct nfs4_sequence_res *res,
int cache_reply)
{
- args->sa_session = res->sr_session = NULL;
+ nfs41_init_sequence(args, res, cache_reply);
return rpc_call_sync(clnt, msg, 0);
}
@@ -815,20 +838,22 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
p->o_arg.open_flags = flags;
p->o_arg.fmode = fmode & (FMODE_READ|FMODE_WRITE);
p->o_arg.clientid = server->nfs_client->cl_clientid;
- p->o_arg.id = sp->so_owner_id.id;
+ p->o_arg.id = sp->so_seqid.owner_id;
p->o_arg.name = &dentry->d_name;
p->o_arg.server = server;
p->o_arg.bitmask = server->attr_bitmask;
p->o_arg.dir_bitmask = server->cache_consistency_bitmask;
p->o_arg.claim = NFS4_OPEN_CLAIM_NULL;
- if (flags & O_CREAT) {
- u32 *s;
+ if (attrs != NULL && attrs->ia_valid != 0) {
+ __be32 verf[2];
p->o_arg.u.attrs = &p->attrs;
memcpy(&p->attrs, attrs, sizeof(p->attrs));
- s = (u32 *) p->o_arg.u.verifier.data;
- s[0] = jiffies;
- s[1] = current->pid;
+
+ verf[0] = jiffies;
+ verf[1] = current->pid;
+ memcpy(p->o_arg.u.verifier.data, verf,
+ sizeof(p->o_arg.u.verifier.data));
}
p->c_arg.fh = &p->o_res.fh;
p->c_arg.stateid = &p->o_res.stateid;
@@ -878,7 +903,7 @@ static int can_open_cached(struct nfs4_state *state, fmode_t mode, int open_mode
{
int ret = 0;
- if (open_mode & O_EXCL)
+ if (open_mode & (O_EXCL|O_TRUNC))
goto out;
switch (mode & (FMODE_READ|FMODE_WRITE)) {
case FMODE_READ:
@@ -927,8 +952,8 @@ static void update_open_stateflags(struct nfs4_state *state, fmode_t fmode)
static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode)
{
if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0)
- memcpy(state->stateid.data, stateid->data, sizeof(state->stateid.data));
- memcpy(state->open_stateid.data, stateid->data, sizeof(state->open_stateid.data));
+ nfs4_stateid_copy(&state->stateid, stateid);
+ nfs4_stateid_copy(&state->open_stateid, stateid);
switch (fmode) {
case FMODE_READ:
set_bit(NFS_O_RDONLY_STATE, &state->flags);
@@ -956,7 +981,7 @@ static void __update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_s
*/
write_seqlock(&state->seqlock);
if (deleg_stateid != NULL) {
- memcpy(state->stateid.data, deleg_stateid->data, sizeof(state->stateid.data));
+ nfs4_stateid_copy(&state->stateid, deleg_stateid);
set_bit(NFS_DELEGATED_STATE, &state->flags);
}
if (open_stateid != NULL)
@@ -987,7 +1012,7 @@ static int update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stat
if (delegation == NULL)
delegation = &deleg_cur->stateid;
- else if (memcmp(deleg_cur->stateid.data, delegation->data, NFS4_STATEID_SIZE) != 0)
+ else if (!nfs4_stateid_match(&deleg_cur->stateid, delegation))
goto no_delegation_unlock;
nfs_mark_delegation_referenced(deleg_cur);
@@ -1026,7 +1051,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
struct nfs4_state *state = opendata->state;
struct nfs_inode *nfsi = NFS_I(state->inode);
struct nfs_delegation *delegation;
- int open_mode = opendata->o_arg.open_flags & O_EXCL;
+ int open_mode = opendata->o_arg.open_flags & (O_EXCL|O_TRUNC);
fmode_t fmode = opendata->o_arg.fmode;
nfs4_stateid stateid;
int ret = -EAGAIN;
@@ -1048,7 +1073,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
break;
}
/* Save the delegation */
- memcpy(stateid.data, delegation->stateid.data, sizeof(stateid.data));
+ nfs4_stateid_copy(&stateid, &delegation->stateid);
rcu_read_unlock();
ret = nfs_may_open(state->inode, state->owner->so_cred, open_mode);
if (ret != 0)
@@ -1090,6 +1115,7 @@ static struct nfs4_state *nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data
if (state == NULL)
goto err_put_inode;
if (data->o_res.delegation_type != 0) {
+ struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
int delegation_flags = 0;
rcu_read_lock();
@@ -1101,7 +1127,7 @@ static struct nfs4_state *nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data
pr_err_ratelimited("NFS: Broken NFSv4 server %s is "
"returning a delegation for "
"OPEN(CLAIM_DELEGATE_CUR)\n",
- NFS_CLIENT(inode)->cl_server);
+ clp->cl_hostname);
} else if ((delegation_flags & 1UL<<NFS_DELEGATION_NEED_RECLAIM) == 0)
nfs_inode_set_delegation(state->inode,
data->owner->so_cred,
@@ -1210,10 +1236,10 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *
* Check if we need to update the current stateid.
*/
if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0 &&
- memcmp(state->stateid.data, state->open_stateid.data, sizeof(state->stateid.data)) != 0) {
+ !nfs4_stateid_match(&state->stateid, &state->open_stateid)) {
write_seqlock(&state->seqlock);
if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0)
- memcpy(state->stateid.data, state->open_stateid.data, sizeof(state->stateid.data));
+ nfs4_stateid_copy(&state->stateid, &state->open_stateid);
write_sequnlock(&state->seqlock);
}
return 0;
@@ -1282,8 +1308,7 @@ static int _nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs
if (IS_ERR(opendata))
return PTR_ERR(opendata);
opendata->o_arg.claim = NFS4_OPEN_CLAIM_DELEGATE_CUR;
- memcpy(opendata->o_arg.u.delegation.data, stateid->data,
- sizeof(opendata->o_arg.u.delegation.data));
+ nfs4_stateid_copy(&opendata->o_arg.u.delegation, stateid);
ret = nfs4_open_recover(opendata, state);
nfs4_opendata_put(opendata);
return ret;
@@ -1319,8 +1344,11 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state
* The show must go on: exit, but mark the
* stateid as needing recovery.
*/
+ case -NFS4ERR_DELEG_REVOKED:
case -NFS4ERR_ADMIN_REVOKED:
case -NFS4ERR_BAD_STATEID:
+ nfs_inode_find_state_and_recover(state->inode,
+ stateid);
nfs4_schedule_stateid_recovery(server, state);
case -EKEYEXPIRED:
/*
@@ -1345,8 +1373,7 @@ static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata)
data->rpc_status = task->tk_status;
if (data->rpc_status == 0) {
- memcpy(data->o_res.stateid.data, data->c_res.stateid.data,
- sizeof(data->o_res.stateid.data));
+ nfs4_stateid_copy(&data->o_res.stateid, &data->c_res.stateid);
nfs_confirm_seqid(&data->owner->so_seqid, 0);
renew_lease(data->o_res.server, data->timestamp);
data->rpc_done = 1;
@@ -1440,7 +1467,7 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
rcu_read_unlock();
}
/* Update sequence id. */
- data->o_arg.id = sp->so_owner_id.id;
+ data->o_arg.id = sp->so_seqid.owner_id;
data->o_arg.clientid = sp->so_server->nfs_client->cl_clientid;
if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS) {
task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR];
@@ -1449,7 +1476,7 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
data->timestamp = jiffies;
if (nfs4_setup_sequence(data->o_arg.server,
&data->o_arg.seq_args,
- &data->o_res.seq_res, 1, task))
+ &data->o_res.seq_res, task))
return;
rpc_call_start(task);
return;
@@ -1551,6 +1578,7 @@ static int nfs4_run_open_task(struct nfs4_opendata *data, int isrecover)
};
int status;
+ nfs41_init_sequence(&o_arg->seq_args, &o_res->seq_res, 1);
kref_get(&data->kref);
data->rpc_done = 0;
data->rpc_status = 0;
@@ -1712,15 +1740,32 @@ static int nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *sta
}
#if defined(CONFIG_NFS_V4_1)
-static int nfs41_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state)
+static int nfs41_check_expired_stateid(struct nfs4_state *state, nfs4_stateid *stateid, unsigned int flags)
{
- int status;
+ int status = NFS_OK;
struct nfs_server *server = NFS_SERVER(state->inode);
- status = nfs41_test_stateid(server, state);
- if (status == NFS_OK)
- return 0;
- nfs41_free_stateid(server, state);
+ if (state->flags & flags) {
+ status = nfs41_test_stateid(server, stateid);
+ if (status != NFS_OK) {
+ nfs41_free_stateid(server, stateid);
+ state->flags &= ~flags;
+ }
+ }
+ return status;
+}
+
+static int nfs41_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state)
+{
+ int deleg_status, open_status;
+ int deleg_flags = 1 << NFS_DELEGATED_STATE;
+ int open_flags = (1 << NFS_O_RDONLY_STATE) | (1 << NFS_O_WRONLY_STATE) | (1 << NFS_O_RDWR_STATE);
+
+ deleg_status = nfs41_check_expired_stateid(state, &state->stateid, deleg_flags);
+ open_status = nfs41_check_expired_stateid(state, &state->open_stateid, open_flags);
+
+ if ((deleg_status == NFS_OK) && (open_status == NFS_OK))
+ return NFS_OK;
return nfs4_open_expired(sp, state);
}
#endif
@@ -1754,7 +1799,8 @@ static int _nfs4_do_open(struct inode *dir, struct dentry *dentry, fmode_t fmode
/* Protect against reboot recovery conflicts */
status = -ENOMEM;
- if (!(sp = nfs4_get_state_owner(server, cred))) {
+ sp = nfs4_get_state_owner(server, cred, GFP_KERNEL);
+ if (sp == NULL) {
dprintk("nfs4_do_open: nfs4_get_state_owner failed!\n");
goto out_err;
}
@@ -1829,7 +1875,7 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir, struct dentry *dentry,
* the user though...
*/
if (status == -NFS4ERR_BAD_SEQID) {
- printk(KERN_WARNING "NFS: v4 server %s "
+ pr_warn_ratelimited("NFS: v4 server %s "
" returned a bad sequence-id error!\n",
NFS_SERVER(dir)->nfs_client->cl_hostname);
exception.retry = 1;
@@ -1882,12 +1928,14 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
nfs_fattr_init(fattr);
- if (nfs4_copy_delegation_stateid(&arg.stateid, inode)) {
+ if (state != NULL) {
+ nfs4_select_rw_stateid(&arg.stateid, state, FMODE_WRITE,
+ current->files, current->tgid);
+ } else if (nfs4_copy_delegation_stateid(&arg.stateid, inode,
+ FMODE_WRITE)) {
/* Use that stateid */
- } else if (state != NULL) {
- nfs4_copy_stateid(&arg.stateid, state, current->files, current->tgid);
} else
- memcpy(&arg.stateid, &zero_stateid, sizeof(arg.stateid));
+ nfs4_stateid_copy(&arg.stateid, &zero_stateid);
status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
if (status == 0 && state != NULL)
@@ -1900,7 +1948,10 @@ static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
struct nfs4_state *state)
{
struct nfs_server *server = NFS_SERVER(inode);
- struct nfs4_exception exception = { };
+ struct nfs4_exception exception = {
+ .state = state,
+ .inode = inode,
+ };
int err;
do {
err = nfs4_handle_exception(server,
@@ -1954,6 +2005,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
struct nfs4_state *state = calldata->state;
struct nfs_server *server = NFS_SERVER(calldata->inode);
+ dprintk("%s: begin!\n", __func__);
if (!nfs4_sequence_done(task, &calldata->res.seq_res))
return;
/* hmm. we are done with the inode, and in the process of freeing
@@ -1981,6 +2033,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
}
nfs_release_seqid(calldata->arg.seqid);
nfs_refresh_inode(calldata->inode, calldata->res.fattr);
+ dprintk("%s: done, ret = %d!\n", __func__, task->tk_status);
}
static void nfs4_close_prepare(struct rpc_task *task, void *data)
@@ -1989,6 +2042,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
struct nfs4_state *state = calldata->state;
int call_close = 0;
+ dprintk("%s: begin!\n", __func__);
if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0)
return;
@@ -2013,7 +2067,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
if (!call_close) {
/* Note: exit _without_ calling nfs4_close_done */
task->tk_action = NULL;
- return;
+ goto out;
}
if (calldata->arg.fmode == 0) {
@@ -2022,17 +2076,20 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
pnfs_roc_drain(calldata->inode, &calldata->roc_barrier)) {
rpc_sleep_on(&NFS_SERVER(calldata->inode)->roc_rpcwaitq,
task, NULL);
- return;
+ goto out;
}
}
nfs_fattr_init(calldata->res.fattr);
calldata->timestamp = jiffies;
if (nfs4_setup_sequence(NFS_SERVER(calldata->inode),
- &calldata->arg.seq_args, &calldata->res.seq_res,
- 1, task))
- return;
+ &calldata->arg.seq_args,
+ &calldata->res.seq_res,
+ task))
+ goto out;
rpc_call_start(task);
+out:
+ dprintk("%s: done!\n", __func__);
}
static const struct rpc_call_ops nfs4_close_ops = {
@@ -2074,6 +2131,7 @@ int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc)
calldata = kzalloc(sizeof(*calldata), gfp_mask);
if (calldata == NULL)
goto out;
+ nfs41_init_sequence(&calldata->arg.seq_args, &calldata->res.seq_res, 1);
calldata->inode = state->inode;
calldata->state = state;
calldata->arg.fh = NFS_FH(state->inode);
@@ -2182,6 +2240,7 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE;
server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
server->acl_bitmask = res.acl_bitmask;
+ server->fh_expire_type = res.fh_expire_type;
}
return status;
@@ -2230,11 +2289,12 @@ static int nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle,
switch (err) {
case 0:
case -NFS4ERR_WRONGSEC:
- break;
+ goto out;
default:
err = nfs4_handle_exception(server, err, &exception);
}
} while (exception.retry);
+out:
return err;
}
@@ -2303,7 +2363,6 @@ static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
return nfs4_map_errors(status);
}
-static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr);
/*
* Get locations and (maybe) other attributes of a referral.
* Note that we'll actually follow the referral later when
@@ -2420,6 +2479,10 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
}
}
+ /* Deal with open(O_TRUNC) */
+ if (sattr->ia_valid & ATTR_OPEN)
+ sattr->ia_valid &= ~(ATTR_MTIME|ATTR_CTIME|ATTR_OPEN);
+
status = nfs4_do_setattr(inode, cred, fattr, sattr, state);
if (status == 0)
nfs_setattr_update_inode(inode, sattr);
@@ -2494,7 +2557,7 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry
struct nfs_server *server = NFS_SERVER(inode);
struct nfs4_accessargs args = {
.fh = NFS_FH(inode),
- .bitmask = server->attr_bitmask,
+ .bitmask = server->cache_consistency_bitmask,
};
struct nfs4_accessres res = {
.server = server,
@@ -2712,8 +2775,18 @@ static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir)
args->bitmask = server->cache_consistency_bitmask;
res->server = server;
- res->seq_res.sr_slot = NULL;
msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE];
+ nfs41_init_sequence(&args->seq_args, &res->seq_res, 1);
+}
+
+static void nfs4_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlinkdata *data)
+{
+ if (nfs4_setup_sequence(NFS_SERVER(data->dir),
+ &data->args.seq_args,
+ &data->res.seq_res,
+ task))
+ return;
+ rpc_call_start(task);
}
static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir)
@@ -2738,6 +2811,17 @@ static void nfs4_proc_rename_setup(struct rpc_message *msg, struct inode *dir)
msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME];
arg->bitmask = server->attr_bitmask;
res->server = server;
+ nfs41_init_sequence(&arg->seq_args, &res->seq_res, 1);
+}
+
+static void nfs4_proc_rename_rpc_prepare(struct rpc_task *task, struct nfs_renamedata *data)
+{
+ if (nfs4_setup_sequence(NFS_SERVER(data->old_dir),
+ &data->args.seq_args,
+ &data->res.seq_res,
+ task))
+ return;
+ rpc_call_start(task);
}
static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
@@ -3232,6 +3316,17 @@ static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message
data->timestamp = jiffies;
data->read_done_cb = nfs4_read_done_cb;
msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
+ nfs41_init_sequence(&data->args.seq_args, &data->res.seq_res, 0);
+}
+
+static void nfs4_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data)
+{
+ if (nfs4_setup_sequence(NFS_SERVER(data->inode),
+ &data->args.seq_args,
+ &data->res.seq_res,
+ task))
+ return;
+ rpc_call_start(task);
}
/* Reset the the nfs_read_data to send the read to the MDS. */
@@ -3305,6 +3400,17 @@ static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_messag
data->timestamp = jiffies;
msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE];
+ nfs41_init_sequence(&data->args.seq_args, &data->res.seq_res, 1);
+}
+
+static void nfs4_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data)
+{
+ if (nfs4_setup_sequence(NFS_SERVER(data->inode),
+ &data->args.seq_args,
+ &data->res.seq_res,
+ task))
+ return;
+ rpc_call_start(task);
}
static int nfs4_commit_done_cb(struct rpc_task *task, struct nfs_write_data *data)
@@ -3339,6 +3445,7 @@ static void nfs4_proc_commit_setup(struct nfs_write_data *data, struct rpc_messa
data->write_done_cb = nfs4_commit_done_cb;
data->res.server = server;
msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT];
+ nfs41_init_sequence(&data->args.seq_args, &data->res.seq_res, 1);
}
struct nfs4_renewdata {
@@ -3575,8 +3682,8 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu
}
if (npages > 1) {
/* for decoding across pages */
- args.acl_scratch = alloc_page(GFP_KERNEL);
- if (!args.acl_scratch)
+ res.acl_scratch = alloc_page(GFP_KERNEL);
+ if (!res.acl_scratch)
goto out_free;
}
args.acl_len = npages * PAGE_SIZE;
@@ -3587,7 +3694,7 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu
res.acl_flags |= NFS4_ACL_LEN_REQUEST;
resp_buf = page_address(pages[0]);
- dprintk("%s buf %p buflen %ld npages %d args.acl_len %ld\n",
+ dprintk("%s buf %p buflen %zu npages %d args.acl_len %zu\n",
__func__, buf, buflen, npages, args.acl_len);
ret = nfs4_call_sync(NFS_SERVER(inode)->client, NFS_SERVER(inode),
&msg, &args.seq_args, &res.seq_res, 0);
@@ -3605,15 +3712,15 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu
if (acl_len > buflen)
goto out_free;
_copy_from_pages(buf, pages, res.acl_data_offset,
- res.acl_len);
+ acl_len);
}
ret = acl_len;
out_free:
for (i = 0; i < npages; i++)
if (pages[i])
__free_page(pages[i]);
- if (args.acl_scratch)
- __free_page(args.acl_scratch);
+ if (res.acl_scratch)
+ __free_page(res.acl_scratch);
return ret;
}
@@ -3714,8 +3821,12 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
if (task->tk_status >= 0)
return 0;
switch(task->tk_status) {
+ case -NFS4ERR_DELEG_REVOKED:
case -NFS4ERR_ADMIN_REVOKED:
case -NFS4ERR_BAD_STATEID:
+ if (state == NULL)
+ break;
+ nfs_remove_bad_delegation(state->inode);
case -NFS4ERR_OPENMODE:
if (state == NULL)
break;
@@ -3764,6 +3875,16 @@ wait_on_recovery:
return -EAGAIN;
}
+static void nfs4_construct_boot_verifier(struct nfs_client *clp,
+ nfs4_verifier *bootverf)
+{
+ __be32 verf[2];
+
+ verf[0] = htonl((u32)clp->cl_boot_time.tv_sec);
+ verf[1] = htonl((u32)clp->cl_boot_time.tv_nsec);
+ memcpy(bootverf->data, verf, sizeof(bootverf->data));
+}
+
int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
unsigned short port, struct rpc_cred *cred,
struct nfs4_setclientid_res *res)
@@ -3780,15 +3901,13 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
.rpc_resp = res,
.rpc_cred = cred,
};
- __be32 *p;
int loop = 0;
int status;
- p = (__be32*)sc_verifier.data;
- *p++ = htonl((u32)clp->cl_boot_time.tv_sec);
- *p = htonl((u32)clp->cl_boot_time.tv_nsec);
+ nfs4_construct_boot_verifier(clp, &sc_verifier);
for(;;) {
+ rcu_read_lock();
setclientid.sc_name_len = scnprintf(setclientid.sc_name,
sizeof(setclientid.sc_name), "%s/%s %s %s %u",
clp->cl_ipaddr,
@@ -3805,6 +3924,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
setclientid.sc_uaddr_len = scnprintf(setclientid.sc_uaddr,
sizeof(setclientid.sc_uaddr), "%s.%u.%u",
clp->cl_ipaddr, port >> 8, port & 255);
+ rcu_read_unlock();
status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
if (status != -NFS4ERR_CLID_INUSE)
@@ -3891,7 +4011,7 @@ static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data)
if (nfs4_setup_sequence(d_data->res.server,
&d_data->args.seq_args,
- &d_data->res.seq_res, 1, task))
+ &d_data->res.seq_res, task))
return;
rpc_call_start(task);
}
@@ -3925,11 +4045,12 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
data = kzalloc(sizeof(*data), GFP_NOFS);
if (data == NULL)
return -ENOMEM;
+ nfs41_init_sequence(&data->args.seq_args, &data->res.seq_res, 1);
data->args.fhandle = &data->fh;
data->args.stateid = &data->stateid;
data->args.bitmask = server->attr_bitmask;
nfs_copy_fh(&data->fh, NFS_FH(inode));
- memcpy(&data->stateid, stateid, sizeof(data->stateid));
+ nfs4_stateid_copy(&data->stateid, stateid);
data->res.fattr = &data->fattr;
data->res.server = server;
nfs_fattr_init(data->res.fattr);
@@ -4016,7 +4137,7 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock
if (status != 0)
goto out;
lsp = request->fl_u.nfs4_fl.owner;
- arg.lock_owner.id = lsp->ls_id.id;
+ arg.lock_owner.id = lsp->ls_seqid.owner_id;
arg.lock_owner.s_dev = server->s_dev;
status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
switch (status) {
@@ -4112,9 +4233,8 @@ static void nfs4_locku_done(struct rpc_task *task, void *data)
return;
switch (task->tk_status) {
case 0:
- memcpy(calldata->lsp->ls_stateid.data,
- calldata->res.stateid.data,
- sizeof(calldata->lsp->ls_stateid.data));
+ nfs4_stateid_copy(&calldata->lsp->ls_stateid,
+ &calldata->res.stateid);
renew_lease(calldata->server, calldata->timestamp);
break;
case -NFS4ERR_BAD_STATEID:
@@ -4142,7 +4262,7 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data)
calldata->timestamp = jiffies;
if (nfs4_setup_sequence(calldata->server,
&calldata->arg.seq_args,
- &calldata->res.seq_res, 1, task))
+ &calldata->res.seq_res, task))
return;
rpc_call_start(task);
}
@@ -4182,6 +4302,7 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl,
return ERR_PTR(-ENOMEM);
}
+ nfs41_init_sequence(&data->arg.seq_args, &data->res.seq_res, 1);
msg.rpc_argp = &data->arg;
msg.rpc_resp = &data->res;
task_setup_data.callback_data = data;
@@ -4261,7 +4382,7 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,
goto out_free_seqid;
p->arg.lock_stateid = &lsp->ls_stateid;
p->arg.lock_owner.clientid = server->nfs_client->cl_clientid;
- p->arg.lock_owner.id = lsp->ls_id.id;
+ p->arg.lock_owner.id = lsp->ls_seqid.owner_id;
p->arg.lock_owner.s_dev = server->s_dev;
p->res.lock_seqid = p->arg.lock_seqid;
p->lsp = lsp;
@@ -4297,7 +4418,7 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata)
data->timestamp = jiffies;
if (nfs4_setup_sequence(data->server,
&data->arg.seq_args,
- &data->res.seq_res, 1, task))
+ &data->res.seq_res, task))
return;
rpc_call_start(task);
dprintk("%s: done!, ret = %d\n", __func__, data->rpc_status);
@@ -4326,8 +4447,7 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata)
goto out;
}
if (data->rpc_status == 0) {
- memcpy(data->lsp->ls_stateid.data, data->res.stateid.data,
- sizeof(data->lsp->ls_stateid.data));
+ nfs4_stateid_copy(&data->lsp->ls_stateid, &data->res.stateid);
data->lsp->ls_flags |= NFS_LOCK_INITIALIZED;
renew_lease(NFS_SERVER(data->ctx->dentry->d_inode), data->timestamp);
}
@@ -4415,6 +4535,7 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
data->arg.reclaim = NFS_LOCK_RECLAIM;
task_setup_data.callback_ops = &nfs4_recover_lock_ops;
}
+ nfs41_init_sequence(&data->arg.seq_args, &data->res.seq_res, 1);
msg.rpc_argp = &data->arg;
msg.rpc_resp = &data->res;
task_setup_data.callback_data = data;
@@ -4479,15 +4600,34 @@ out:
}
#if defined(CONFIG_NFS_V4_1)
-static int nfs41_lock_expired(struct nfs4_state *state, struct file_lock *request)
+static int nfs41_check_expired_locks(struct nfs4_state *state)
{
- int status;
+ int status, ret = NFS_OK;
+ struct nfs4_lock_state *lsp;
struct nfs_server *server = NFS_SERVER(state->inode);
- status = nfs41_test_stateid(server, state);
+ list_for_each_entry(lsp, &state->lock_states, ls_locks) {
+ if (lsp->ls_flags & NFS_LOCK_INITIALIZED) {
+ status = nfs41_test_stateid(server, &lsp->ls_stateid);
+ if (status != NFS_OK) {
+ nfs41_free_stateid(server, &lsp->ls_stateid);
+ lsp->ls_flags &= ~NFS_LOCK_INITIALIZED;
+ ret = status;
+ }
+ }
+ };
+
+ return ret;
+}
+
+static int nfs41_lock_expired(struct nfs4_state *state, struct file_lock *request)
+{
+ int status = NFS_OK;
+
+ if (test_bit(LK_STATE_IN_USE, &state->flags))
+ status = nfs41_check_expired_locks(state);
if (status == NFS_OK)
- return 0;
- nfs41_free_stateid(server, state);
+ return status;
return nfs4_lock_expired(state, request);
}
#endif
@@ -4523,7 +4663,8 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock
/* Note: we always want to sleep here! */
request->fl_flags = fl_flags | FL_SLEEP;
if (do_vfs_lock(request->fl_file, request) < 0)
- printk(KERN_WARNING "%s: VFS is out of sync with lock manager!\n", __func__);
+ printk(KERN_WARNING "NFS: %s: VFS is out of sync with lock "
+ "manager!\n", __func__);
out_unlock:
up_read(&nfsi->rwsem);
out:
@@ -4533,7 +4674,9 @@ out:
static int nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)
{
- struct nfs4_exception exception = { };
+ struct nfs4_exception exception = {
+ .state = state,
+ };
int err;
do {
@@ -4603,8 +4746,8 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl)
err = _nfs4_do_setlk(state, F_SETLK, fl, NFS_LOCK_NEW);
switch (err) {
default:
- printk(KERN_ERR "%s: unhandled error %d.\n",
- __func__, err);
+ printk(KERN_ERR "NFS: %s: unhandled error "
+ "%d.\n", __func__, err);
case 0:
case -ESTALE:
goto out;
@@ -4626,6 +4769,7 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl)
* The show must go on: exit, but mark the
* stateid as needing recovery.
*/
+ case -NFS4ERR_DELEG_REVOKED:
case -NFS4ERR_ADMIN_REVOKED:
case -NFS4ERR_BAD_STATEID:
case -NFS4ERR_OPENMODE:
@@ -4655,33 +4799,44 @@ out:
return err;
}
+struct nfs_release_lockowner_data {
+ struct nfs4_lock_state *lsp;
+ struct nfs_server *server;
+ struct nfs_release_lockowner_args args;
+};
+
static void nfs4_release_lockowner_release(void *calldata)
{
+ struct nfs_release_lockowner_data *data = calldata;
+ nfs4_free_lock_state(data->server, data->lsp);
kfree(calldata);
}
-const struct rpc_call_ops nfs4_release_lockowner_ops = {
+static const struct rpc_call_ops nfs4_release_lockowner_ops = {
.rpc_release = nfs4_release_lockowner_release,
};
-void nfs4_release_lockowner(const struct nfs4_lock_state *lsp)
+int nfs4_release_lockowner(struct nfs4_lock_state *lsp)
{
struct nfs_server *server = lsp->ls_state->owner->so_server;
- struct nfs_release_lockowner_args *args;
+ struct nfs_release_lockowner_data *data;
struct rpc_message msg = {
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RELEASE_LOCKOWNER],
};
if (server->nfs_client->cl_mvops->minor_version != 0)
- return;
- args = kmalloc(sizeof(*args), GFP_NOFS);
- if (!args)
- return;
- args->lock_owner.clientid = server->nfs_client->cl_clientid;
- args->lock_owner.id = lsp->ls_id.id;
- args->lock_owner.s_dev = server->s_dev;
- msg.rpc_argp = args;
- rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, args);
+ return -EINVAL;
+ data = kmalloc(sizeof(*data), GFP_NOFS);
+ if (!data)
+ return -ENOMEM;
+ data->lsp = lsp;
+ data->server = server;
+ data->args.lock_owner.clientid = server->nfs_client->cl_clientid;
+ data->args.lock_owner.id = lsp->ls_seqid.owner_id;
+ data->args.lock_owner.s_dev = server->s_dev;
+ msg.rpc_argp = &data->args;
+ rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, data);
+ return 0;
}
#define XATTR_NAME_NFSV4_ACL "system.nfs4_acl"
@@ -4727,11 +4882,11 @@ static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr)
if (!(((fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) ||
(fattr->valid & NFS_ATTR_FATTR_FILEID)) &&
(fattr->valid & NFS_ATTR_FATTR_FSID) &&
- (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)))
+ (fattr->valid & NFS_ATTR_FATTR_V4_LOCATIONS)))
return;
fattr->valid |= NFS_ATTR_FATTR_TYPE | NFS_ATTR_FATTR_MODE |
- NFS_ATTR_FATTR_NLINK;
+ NFS_ATTR_FATTR_NLINK | NFS_ATTR_FATTR_V4_REFERRAL;
fattr->mode = S_IFDIR | S_IRUGO | S_IXUGO;
fattr->nlink = 2;
}
@@ -4798,7 +4953,8 @@ static int _nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct
return status;
}
-int nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct nfs4_secinfo_flavors *flavors)
+static int nfs4_proc_secinfo(struct inode *dir, const struct qstr *name,
+ struct nfs4_secinfo_flavors *flavors)
{
struct nfs4_exception exception = { };
int err;
@@ -4852,6 +5008,7 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
{
nfs4_verifier verifier;
struct nfs41_exchange_id_args args = {
+ .verifier = &verifier,
.client = clp,
.flags = EXCHGID4_FLAG_SUPP_MOVED_REFER,
};
@@ -4865,15 +5022,11 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
.rpc_resp = &res,
.rpc_cred = cred,
};
- __be32 *p;
dprintk("--> %s\n", __func__);
BUG_ON(clp == NULL);
- p = (u32 *)verifier.data;
- *p++ = htonl((u32)clp->cl_boot_time.tv_sec);
- *p = htonl((u32)clp->cl_boot_time.tv_nsec);
- args.verifier = &verifier;
+ nfs4_construct_boot_verifier(clp, &verifier);
args.id_len = scnprintf(args.id, sizeof(args.id),
"%s/%s.%s/%u",
@@ -4883,14 +5036,29 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
clp->cl_rpcclient->cl_auth->au_flavor);
res.server_scope = kzalloc(sizeof(struct server_scope), GFP_KERNEL);
- if (unlikely(!res.server_scope))
- return -ENOMEM;
+ if (unlikely(!res.server_scope)) {
+ status = -ENOMEM;
+ goto out;
+ }
+
+ res.impl_id = kzalloc(sizeof(struct nfs41_impl_id), GFP_KERNEL);
+ if (unlikely(!res.impl_id)) {
+ status = -ENOMEM;
+ goto out_server_scope;
+ }
status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
if (!status)
status = nfs4_check_cl_exchange_flags(clp->cl_exchange_flags);
if (!status) {
+ /* use the most recent implementation id */
+ kfree(clp->impl_id);
+ clp->impl_id = res.impl_id;
+ } else
+ kfree(res.impl_id);
+
+ if (!status) {
if (clp->server_scope &&
!nfs41_same_server_scope(clp->server_scope,
res.server_scope)) {
@@ -4901,12 +5069,21 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
clp->server_scope = NULL;
}
- if (!clp->server_scope)
+ if (!clp->server_scope) {
clp->server_scope = res.server_scope;
- else
- kfree(res.server_scope);
+ goto out;
+ }
}
+out_server_scope:
+ kfree(res.server_scope);
+out:
+ if (clp->impl_id)
+ dprintk("%s: Server Implementation ID: "
+ "domain: %s, name: %s, date: %llu,%u\n",
+ __func__, clp->impl_id->domain, clp->impl_id->name,
+ clp->impl_id->date.seconds,
+ clp->impl_id->date.nseconds);
dprintk("<-- %s status= %d\n", __func__, status);
return status;
}
@@ -4930,7 +5107,7 @@ static void nfs4_get_lease_time_prepare(struct rpc_task *task,
since we're invoked within one */
ret = nfs41_setup_sequence(data->clp->cl_session,
&data->args->la_seq_args,
- &data->res->lr_seq_res, 0, task);
+ &data->res->lr_seq_res, task);
BUG_ON(ret == -EAGAIN);
rpc_call_start(task);
@@ -4963,7 +5140,7 @@ static void nfs4_get_lease_time_done(struct rpc_task *task, void *calldata)
dprintk("<-- %s\n", __func__);
}
-struct rpc_call_ops nfs4_get_lease_time_ops = {
+static const struct rpc_call_ops nfs4_get_lease_time_ops = {
.rpc_call_prepare = nfs4_get_lease_time_prepare,
.rpc_call_done = nfs4_get_lease_time_done,
};
@@ -4994,6 +5171,7 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo)
};
int status;
+ nfs41_init_sequence(&args.la_seq_args, &res.lr_seq_res, 0);
dprintk("--> %s\n", __func__);
task = rpc_run_task(&task_setup);
@@ -5008,37 +5186,53 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo)
return status;
}
+static struct nfs4_slot *nfs4_alloc_slots(u32 max_slots, gfp_t gfp_flags)
+{
+ return kcalloc(max_slots, sizeof(struct nfs4_slot), gfp_flags);
+}
+
+static void nfs4_add_and_init_slots(struct nfs4_slot_table *tbl,
+ struct nfs4_slot *new,
+ u32 max_slots,
+ u32 ivalue)
+{
+ struct nfs4_slot *old = NULL;
+ u32 i;
+
+ spin_lock(&tbl->slot_tbl_lock);
+ if (new) {
+ old = tbl->slots;
+ tbl->slots = new;
+ tbl->max_slots = max_slots;
+ }
+ tbl->highest_used_slotid = -1; /* no slot is currently used */
+ for (i = 0; i < tbl->max_slots; i++)
+ tbl->slots[i].seq_nr = ivalue;
+ spin_unlock(&tbl->slot_tbl_lock);
+ kfree(old);
+}
+
/*
- * Reset a slot table
+ * (re)Initialise a slot table
*/
-static int nfs4_reset_slot_table(struct nfs4_slot_table *tbl, u32 max_reqs,
- int ivalue)
+static int nfs4_realloc_slot_table(struct nfs4_slot_table *tbl, u32 max_reqs,
+ u32 ivalue)
{
struct nfs4_slot *new = NULL;
- int i;
- int ret = 0;
+ int ret = -ENOMEM;
dprintk("--> %s: max_reqs=%u, tbl->max_slots %d\n", __func__,
max_reqs, tbl->max_slots);
/* Does the newly negotiated max_reqs match the existing slot table? */
if (max_reqs != tbl->max_slots) {
- ret = -ENOMEM;
- new = kmalloc(max_reqs * sizeof(struct nfs4_slot),
- GFP_NOFS);
+ new = nfs4_alloc_slots(max_reqs, GFP_NOFS);
if (!new)
goto out;
- ret = 0;
- kfree(tbl->slots);
- }
- spin_lock(&tbl->slot_tbl_lock);
- if (new) {
- tbl->slots = new;
- tbl->max_slots = max_reqs;
}
- for (i = 0; i < tbl->max_slots; ++i)
- tbl->slots[i].seq_nr = ivalue;
- spin_unlock(&tbl->slot_tbl_lock);
+ ret = 0;
+
+ nfs4_add_and_init_slots(tbl, new, max_reqs, ivalue);
dprintk("%s: tbl=%p slots=%p max_slots=%d\n", __func__,
tbl, tbl->slots, tbl->max_slots);
out:
@@ -5061,36 +5255,6 @@ static void nfs4_destroy_slot_tables(struct nfs4_session *session)
}
/*
- * Initialize slot table
- */
-static int nfs4_init_slot_table(struct nfs4_slot_table *tbl,
- int max_slots, int ivalue)
-{
- struct nfs4_slot *slot;
- int ret = -ENOMEM;
-
- BUG_ON(max_slots > NFS4_MAX_SLOT_TABLE);
-
- dprintk("--> %s: max_reqs=%u\n", __func__, max_slots);
-
- slot = kcalloc(max_slots, sizeof(struct nfs4_slot), GFP_NOFS);
- if (!slot)
- goto out;
- ret = 0;
-
- spin_lock(&tbl->slot_tbl_lock);
- tbl->max_slots = max_slots;
- tbl->slots = slot;
- tbl->highest_used_slotid = -1; /* no slot is currently used */
- spin_unlock(&tbl->slot_tbl_lock);
- dprintk("%s: tbl=%p slots=%p max_slots=%d\n", __func__,
- tbl, tbl->slots, tbl->max_slots);
-out:
- dprintk("<-- %s: return %d\n", __func__, ret);
- return ret;
-}
-
-/*
* Initialize or reset the forechannel and backchannel tables
*/
static int nfs4_setup_session_slot_tables(struct nfs4_session *ses)
@@ -5101,25 +5265,16 @@ static int nfs4_setup_session_slot_tables(struct nfs4_session *ses)
dprintk("--> %s\n", __func__);
/* Fore channel */
tbl = &ses->fc_slot_table;
- if (tbl->slots == NULL) {
- status = nfs4_init_slot_table(tbl, ses->fc_attrs.max_reqs, 1);
- if (status) /* -ENOMEM */
- return status;
- } else {
- status = nfs4_reset_slot_table(tbl, ses->fc_attrs.max_reqs, 1);
- if (status)
- return status;
- }
+ status = nfs4_realloc_slot_table(tbl, ses->fc_attrs.max_reqs, 1);
+ if (status) /* -ENOMEM */
+ return status;
/* Back channel */
tbl = &ses->bc_slot_table;
- if (tbl->slots == NULL) {
- status = nfs4_init_slot_table(tbl, ses->bc_attrs.max_reqs, 0);
- if (status)
- /* Fore and back channel share a connection so get
- * both slot tables or neither */
- nfs4_destroy_slot_tables(ses);
- } else
- status = nfs4_reset_slot_table(tbl, ses->bc_attrs.max_reqs, 0);
+ status = nfs4_realloc_slot_table(tbl, ses->bc_attrs.max_reqs, 0);
+ if (status && tbl->slots == NULL)
+ /* Fore and back channel share a connection so get
+ * both slot tables or neither */
+ nfs4_destroy_slot_tables(ses);
return status;
}
@@ -5133,13 +5288,13 @@ struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp)
return NULL;
tbl = &session->fc_slot_table;
- tbl->highest_used_slotid = -1;
+ tbl->highest_used_slotid = NFS4_NO_SLOT;
spin_lock_init(&tbl->slot_tbl_lock);
rpc_init_priority_wait_queue(&tbl->slot_tbl_waitq, "ForeChannel Slot table");
init_completion(&tbl->complete);
tbl = &session->bc_slot_table;
- tbl->highest_used_slotid = -1;
+ tbl->highest_used_slotid = NFS4_NO_SLOT;
spin_lock_init(&tbl->slot_tbl_lock);
rpc_init_wait_queue(&tbl->slot_tbl_waitq, "BackChannel Slot table");
init_completion(&tbl->complete);
@@ -5152,11 +5307,16 @@ struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp)
void nfs4_destroy_session(struct nfs4_session *session)
{
+ struct rpc_xprt *xprt;
+
nfs4_proc_destroy_session(session);
+
+ rcu_read_lock();
+ xprt = rcu_dereference(session->clp->cl_rpcclient->cl_xprt);
+ rcu_read_unlock();
dprintk("%s Destroy backchannel for xprt %p\n",
- __func__, session->clp->cl_rpcclient->cl_xprt);
- xprt_destroy_backchannel(session->clp->cl_rpcclient->cl_xprt,
- NFS41_BC_MIN_CALLBACKS);
+ __func__, xprt);
+ xprt_destroy_backchannel(xprt, NFS41_BC_MIN_CALLBACKS);
nfs4_destroy_slot_tables(session);
kfree(session);
}
@@ -5184,7 +5344,7 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args)
args->fc_attrs.max_rqst_sz = mxrqst_sz;
args->fc_attrs.max_resp_sz = mxresp_sz;
args->fc_attrs.max_ops = NFS4_MAX_OPS;
- args->fc_attrs.max_reqs = session->clp->cl_rpcclient->cl_xprt->max_reqs;
+ args->fc_attrs.max_reqs = max_session_slots;
dprintk("%s: Fore Channel : max_rqst_sz=%u max_resp_sz=%u "
"max_ops=%u max_reqs=%u\n",
@@ -5224,6 +5384,8 @@ static int nfs4_verify_fore_channel_attrs(struct nfs41_create_session_args *args
return -EINVAL;
if (rcvd->max_reqs == 0)
return -EINVAL;
+ if (rcvd->max_reqs > NFS4_MAX_SLOT_TABLE)
+ rcvd->max_reqs = NFS4_MAX_SLOT_TABLE;
return 0;
}
@@ -5239,9 +5401,9 @@ static int nfs4_verify_back_channel_attrs(struct nfs41_create_session_args *args
if (rcvd->max_resp_sz_cached > sent->max_resp_sz_cached)
return -EINVAL;
/* These would render the backchannel useless: */
- if (rcvd->max_ops == 0)
+ if (rcvd->max_ops != sent->max_ops)
return -EINVAL;
- if (rcvd->max_reqs == 0)
+ if (rcvd->max_reqs != sent->max_reqs)
return -EINVAL;
return 0;
}
@@ -5344,7 +5506,7 @@ int nfs4_proc_destroy_session(struct nfs4_session *session)
if (status)
printk(KERN_WARNING
- "Got error %d from the server on DESTROY_SESSION. "
+ "NFS: Got error %d from the server on DESTROY_SESSION. "
"Session has been destroyed regardless...\n", status);
dprintk("<-- nfs4_proc_destroy_session\n");
@@ -5467,7 +5629,7 @@ static void nfs41_sequence_prepare(struct rpc_task *task, void *data)
args = task->tk_msg.rpc_argp;
res = task->tk_msg.rpc_resp;
- if (nfs41_setup_sequence(clp->cl_session, args, res, 0, task))
+ if (nfs41_setup_sequence(clp->cl_session, args, res, task))
return;
rpc_call_start(task);
}
@@ -5499,6 +5661,7 @@ static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, struct rpc_
nfs_put_client(clp);
return ERR_PTR(-ENOMEM);
}
+ nfs41_init_sequence(&calldata->args, &calldata->res, 0);
msg.rpc_argp = &calldata->args;
msg.rpc_resp = &calldata->res;
calldata->clp = clp;
@@ -5560,7 +5723,7 @@ static void nfs4_reclaim_complete_prepare(struct rpc_task *task, void *data)
rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED);
if (nfs41_setup_sequence(calldata->clp->cl_session,
&calldata->arg.seq_args,
- &calldata->res.seq_res, 0, task))
+ &calldata->res.seq_res, task))
return;
rpc_call_start(task);
@@ -5639,6 +5802,7 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp)
calldata->clp = clp;
calldata->arg.one_fs = 0;
+ nfs41_init_sequence(&calldata->arg.seq_args, &calldata->res.seq_res, 0);
msg.rpc_argp = &calldata->arg;
msg.rpc_resp = &calldata->res;
task_setup_data.callback_data = calldata;
@@ -5670,7 +5834,7 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
* to be no way to prevent it completely.
*/
if (nfs4_setup_sequence(server, &lgp->args.seq_args,
- &lgp->res.seq_res, 0, task))
+ &lgp->res.seq_res, task))
return;
if (pnfs_choose_layoutget_stateid(&lgp->args.stateid,
NFS_I(lgp->args.inode)->layout,
@@ -5745,6 +5909,7 @@ int nfs4_proc_layoutget(struct nfs4_layoutget *lgp)
lgp->res.layoutp = &lgp->args.layout;
lgp->res.seq_res.sr_slot = NULL;
+ nfs41_init_sequence(&lgp->args.seq_args, &lgp->res.seq_res, 0);
task = rpc_run_task(&task_setup_data);
if (IS_ERR(task))
return PTR_ERR(task);
@@ -5765,7 +5930,7 @@ nfs4_layoutreturn_prepare(struct rpc_task *task, void *calldata)
dprintk("--> %s\n", __func__);
if (nfs41_setup_sequence(lrp->clp->cl_session, &lrp->args.seq_args,
- &lrp->res.seq_res, 0, task))
+ &lrp->res.seq_res, task))
return;
rpc_call_start(task);
}
@@ -5831,6 +5996,7 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp)
int status;
dprintk("--> %s\n", __func__);
+ nfs41_init_sequence(&lrp->args.seq_args, &lrp->res.seq_res, 1);
task = rpc_run_task(&task_setup_data);
if (IS_ERR(task))
return PTR_ERR(task);
@@ -5931,7 +6097,7 @@ static void nfs4_layoutcommit_prepare(struct rpc_task *task, void *calldata)
struct nfs_server *server = NFS_SERVER(data->args.inode);
if (nfs4_setup_sequence(server, &data->args.seq_args,
- &data->res.seq_res, 1, task))
+ &data->res.seq_res, task))
return;
rpc_call_start(task);
}
@@ -5946,21 +6112,22 @@ nfs4_layoutcommit_done(struct rpc_task *task, void *calldata)
return;
switch (task->tk_status) { /* Just ignore these failures */
- case NFS4ERR_DELEG_REVOKED: /* layout was recalled */
- case NFS4ERR_BADIOMODE: /* no IOMODE_RW layout for range */
- case NFS4ERR_BADLAYOUT: /* no layout */
- case NFS4ERR_GRACE: /* loca_recalim always false */
+ case -NFS4ERR_DELEG_REVOKED: /* layout was recalled */
+ case -NFS4ERR_BADIOMODE: /* no IOMODE_RW layout for range */
+ case -NFS4ERR_BADLAYOUT: /* no layout */
+ case -NFS4ERR_GRACE: /* loca_recalim always false */
task->tk_status = 0;
- }
-
- if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) {
- rpc_restart_call_prepare(task);
- return;
- }
-
- if (task->tk_status == 0)
+ break;
+ case 0:
nfs_post_op_update_inode_force_wcc(data->args.inode,
data->res.fattr);
+ break;
+ default:
+ if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) {
+ rpc_restart_call_prepare(task);
+ return;
+ }
+ }
}
static void nfs4_layoutcommit_release(void *calldata)
@@ -6018,6 +6185,7 @@ nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync)
data->args.lastbytewritten,
data->args.inode->i_ino);
+ nfs41_init_sequence(&data->args.seq_args, &data->res.seq_res, 1);
task = rpc_run_task(&task_setup_data);
if (IS_ERR(task))
return PTR_ERR(task);
@@ -6063,11 +6231,12 @@ nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle,
case 0:
case -NFS4ERR_WRONGSEC:
case -NFS4ERR_NOTSUPP:
- break;
+ goto out;
default:
err = nfs4_handle_exception(server, err, &exception);
}
} while (exception.retry);
+out:
return err;
}
@@ -6111,11 +6280,12 @@ out_freepage:
out:
return err;
}
-static int _nfs41_test_stateid(struct nfs_server *server, struct nfs4_state *state)
+
+static int _nfs41_test_stateid(struct nfs_server *server, nfs4_stateid *stateid)
{
int status;
struct nfs41_test_stateid_args args = {
- .stateid = &state->stateid,
+ .stateid = stateid,
};
struct nfs41_test_stateid_res res;
struct rpc_message msg = {
@@ -6123,28 +6293,31 @@ static int _nfs41_test_stateid(struct nfs_server *server, struct nfs4_state *sta
.rpc_argp = &args,
.rpc_resp = &res,
};
- args.seq_args.sa_session = res.seq_res.sr_session = NULL;
- status = nfs4_call_sync_sequence(server->client, server, &msg, &args.seq_args, &res.seq_res, 0, 1);
+
+ nfs41_init_sequence(&args.seq_args, &res.seq_res, 0);
+ status = nfs4_call_sync_sequence(server->client, server, &msg, &args.seq_args, &res.seq_res, 1);
+
+ if (status == NFS_OK)
+ return res.status;
return status;
}
-static int nfs41_test_stateid(struct nfs_server *server, struct nfs4_state *state)
+static int nfs41_test_stateid(struct nfs_server *server, nfs4_stateid *stateid)
{
struct nfs4_exception exception = { };
int err;
do {
err = nfs4_handle_exception(server,
- _nfs41_test_stateid(server, state),
+ _nfs41_test_stateid(server, stateid),
&exception);
} while (exception.retry);
return err;
}
-static int _nfs4_free_stateid(struct nfs_server *server, struct nfs4_state *state)
+static int _nfs4_free_stateid(struct nfs_server *server, nfs4_stateid *stateid)
{
- int status;
struct nfs41_free_stateid_args args = {
- .stateid = &state->stateid,
+ .stateid = stateid,
};
struct nfs41_free_stateid_res res;
struct rpc_message msg = {
@@ -6153,25 +6326,46 @@ static int _nfs4_free_stateid(struct nfs_server *server, struct nfs4_state *stat
.rpc_resp = &res,
};
- args.seq_args.sa_session = res.seq_res.sr_session = NULL;
- status = nfs4_call_sync_sequence(server->client, server, &msg, &args.seq_args, &res.seq_res, 0, 1);
- return status;
+ nfs41_init_sequence(&args.seq_args, &res.seq_res, 0);
+ return nfs4_call_sync_sequence(server->client, server, &msg, &args.seq_args, &res.seq_res, 1);
}
-static int nfs41_free_stateid(struct nfs_server *server, struct nfs4_state *state)
+static int nfs41_free_stateid(struct nfs_server *server, nfs4_stateid *stateid)
{
struct nfs4_exception exception = { };
int err;
do {
err = nfs4_handle_exception(server,
- _nfs4_free_stateid(server, state),
+ _nfs4_free_stateid(server, stateid),
&exception);
} while (exception.retry);
return err;
}
+
+static bool nfs41_match_stateid(const nfs4_stateid *s1,
+ const nfs4_stateid *s2)
+{
+ if (memcmp(s1->other, s2->other, sizeof(s1->other)) != 0)
+ return false;
+
+ if (s1->seqid == s2->seqid)
+ return true;
+ if (s1->seqid == 0 || s2->seqid == 0)
+ return true;
+
+ return false;
+}
+
#endif /* CONFIG_NFS_V4_1 */
-struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = {
+static bool nfs4_match_stateid(const nfs4_stateid *s1,
+ const nfs4_stateid *s2)
+{
+ return nfs4_stateid_match(s1, s2);
+}
+
+
+static const struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = {
.owner_flag_bit = NFS_OWNER_RECLAIM_REBOOT,
.state_flag_bit = NFS_STATE_RECLAIM_REBOOT,
.recover_open = nfs4_open_reclaim,
@@ -6181,7 +6375,7 @@ struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = {
};
#if defined(CONFIG_NFS_V4_1)
-struct nfs4_state_recovery_ops nfs41_reboot_recovery_ops = {
+static const struct nfs4_state_recovery_ops nfs41_reboot_recovery_ops = {
.owner_flag_bit = NFS_OWNER_RECLAIM_REBOOT,
.state_flag_bit = NFS_STATE_RECLAIM_REBOOT,
.recover_open = nfs4_open_reclaim,
@@ -6192,7 +6386,7 @@ struct nfs4_state_recovery_ops nfs41_reboot_recovery_ops = {
};
#endif /* CONFIG_NFS_V4_1 */
-struct nfs4_state_recovery_ops nfs40_nograce_recovery_ops = {
+static const struct nfs4_state_recovery_ops nfs40_nograce_recovery_ops = {
.owner_flag_bit = NFS_OWNER_RECLAIM_NOGRACE,
.state_flag_bit = NFS_STATE_RECLAIM_NOGRACE,
.recover_open = nfs4_open_expired,
@@ -6202,7 +6396,7 @@ struct nfs4_state_recovery_ops nfs40_nograce_recovery_ops = {
};
#if defined(CONFIG_NFS_V4_1)
-struct nfs4_state_recovery_ops nfs41_nograce_recovery_ops = {
+static const struct nfs4_state_recovery_ops nfs41_nograce_recovery_ops = {
.owner_flag_bit = NFS_OWNER_RECLAIM_NOGRACE,
.state_flag_bit = NFS_STATE_RECLAIM_NOGRACE,
.recover_open = nfs41_open_expired,
@@ -6212,14 +6406,14 @@ struct nfs4_state_recovery_ops nfs41_nograce_recovery_ops = {
};
#endif /* CONFIG_NFS_V4_1 */
-struct nfs4_state_maintenance_ops nfs40_state_renewal_ops = {
+static const struct nfs4_state_maintenance_ops nfs40_state_renewal_ops = {
.sched_state_renewal = nfs4_proc_async_renew,
.get_state_renewal_cred_locked = nfs4_get_renew_cred_locked,
.renew_lease = nfs4_proc_renew,
};
#if defined(CONFIG_NFS_V4_1)
-struct nfs4_state_maintenance_ops nfs41_state_renewal_ops = {
+static const struct nfs4_state_maintenance_ops nfs41_state_renewal_ops = {
.sched_state_renewal = nfs41_proc_async_sequence,
.get_state_renewal_cred_locked = nfs4_get_machine_cred_locked,
.renew_lease = nfs4_proc_sequence,
@@ -6229,7 +6423,7 @@ struct nfs4_state_maintenance_ops nfs41_state_renewal_ops = {
static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = {
.minor_version = 0,
.call_sync = _nfs4_call_sync,
- .validate_stateid = nfs4_validate_delegation_stateid,
+ .match_stateid = nfs4_match_stateid,
.find_root_sec = nfs4_find_root_sec,
.reboot_recovery_ops = &nfs40_reboot_recovery_ops,
.nograce_recovery_ops = &nfs40_nograce_recovery_ops,
@@ -6240,7 +6434,7 @@ static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = {
static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = {
.minor_version = 1,
.call_sync = _nfs4_call_sync_session,
- .validate_stateid = nfs41_validate_delegation_stateid,
+ .match_stateid = nfs41_match_stateid,
.find_root_sec = nfs41_find_root_sec,
.reboot_recovery_ops = &nfs41_reboot_recovery_ops,
.nograce_recovery_ops = &nfs41_nograce_recovery_ops,
@@ -6280,9 +6474,11 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
.create = nfs4_proc_create,
.remove = nfs4_proc_remove,
.unlink_setup = nfs4_proc_unlink_setup,
+ .unlink_rpc_prepare = nfs4_proc_unlink_rpc_prepare,
.unlink_done = nfs4_proc_unlink_done,
.rename = nfs4_proc_rename,
.rename_setup = nfs4_proc_rename_setup,
+ .rename_rpc_prepare = nfs4_proc_rename_rpc_prepare,
.rename_done = nfs4_proc_rename_done,
.link = nfs4_proc_link,
.symlink = nfs4_proc_symlink,
@@ -6296,8 +6492,10 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
.set_capabilities = nfs4_server_capabilities,
.decode_dirent = nfs4_decode_dirent,
.read_setup = nfs4_proc_read_setup,
+ .read_rpc_prepare = nfs4_proc_read_rpc_prepare,
.read_done = nfs4_read_done,
.write_setup = nfs4_proc_write_setup,
+ .write_rpc_prepare = nfs4_proc_write_rpc_prepare,
.write_done = nfs4_write_done,
.commit_setup = nfs4_proc_commit_setup,
.commit_done = nfs4_commit_done,
@@ -6321,6 +6519,10 @@ const struct xattr_handler *nfs4_xattr_handlers[] = {
NULL
};
+module_param(max_session_slots, ushort, 0644);
+MODULE_PARM_DESC(max_session_slots, "Maximum number of outstanding NFSv4.1 "
+ "requests the client will negotiate");
+
/*
* Local variables:
* c-basic-offset: 8
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index a53f33b4ac3..0f43414eb25 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -146,6 +146,11 @@ struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp)
struct rpc_cred *cred = NULL;
struct nfs_server *server;
+ /* Use machine credentials if available */
+ cred = nfs4_get_machine_cred_locked(clp);
+ if (cred != NULL)
+ goto out;
+
rcu_read_lock();
list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
cred = nfs4_get_renew_cred_server_locked(server);
@@ -153,6 +158,8 @@ struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp)
break;
}
rcu_read_unlock();
+
+out:
return cred;
}
@@ -190,30 +197,29 @@ static int nfs41_setup_state_renewal(struct nfs_client *clp)
static void nfs4_end_drain_session(struct nfs_client *clp)
{
struct nfs4_session *ses = clp->cl_session;
+ struct nfs4_slot_table *tbl;
int max_slots;
if (ses == NULL)
return;
+ tbl = &ses->fc_slot_table;
if (test_and_clear_bit(NFS4_SESSION_DRAINING, &ses->session_state)) {
- spin_lock(&ses->fc_slot_table.slot_tbl_lock);
- max_slots = ses->fc_slot_table.max_slots;
+ spin_lock(&tbl->slot_tbl_lock);
+ max_slots = tbl->max_slots;
while (max_slots--) {
- struct rpc_task *task;
-
- task = rpc_wake_up_next(&ses->fc_slot_table.
- slot_tbl_waitq);
- if (!task)
+ if (rpc_wake_up_first(&tbl->slot_tbl_waitq,
+ nfs4_set_task_privileged,
+ NULL) == NULL)
break;
- rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED);
}
- spin_unlock(&ses->fc_slot_table.slot_tbl_lock);
+ spin_unlock(&tbl->slot_tbl_lock);
}
}
static int nfs4_wait_on_slot_tbl(struct nfs4_slot_table *tbl)
{
spin_lock(&tbl->slot_tbl_lock);
- if (tbl->highest_used_slotid != -1) {
+ if (tbl->highest_used_slotid != NFS4_NO_SLOT) {
INIT_COMPLETION(tbl->complete);
spin_unlock(&tbl->slot_tbl_lock);
return wait_for_completion_interruptible(&tbl->complete);
@@ -317,62 +323,6 @@ out:
return cred;
}
-static void nfs_alloc_unique_id_locked(struct rb_root *root,
- struct nfs_unique_id *new,
- __u64 minval, int maxbits)
-{
- struct rb_node **p, *parent;
- struct nfs_unique_id *pos;
- __u64 mask = ~0ULL;
-
- if (maxbits < 64)
- mask = (1ULL << maxbits) - 1ULL;
-
- /* Ensure distribution is more or less flat */
- get_random_bytes(&new->id, sizeof(new->id));
- new->id &= mask;
- if (new->id < minval)
- new->id += minval;
-retry:
- p = &root->rb_node;
- parent = NULL;
-
- while (*p != NULL) {
- parent = *p;
- pos = rb_entry(parent, struct nfs_unique_id, rb_node);
-
- if (new->id < pos->id)
- p = &(*p)->rb_left;
- else if (new->id > pos->id)
- p = &(*p)->rb_right;
- else
- goto id_exists;
- }
- rb_link_node(&new->rb_node, parent, p);
- rb_insert_color(&new->rb_node, root);
- return;
-id_exists:
- for (;;) {
- new->id++;
- if (new->id < minval || (new->id & mask) != new->id) {
- new->id = minval;
- break;
- }
- parent = rb_next(parent);
- if (parent == NULL)
- break;
- pos = rb_entry(parent, struct nfs_unique_id, rb_node);
- if (new->id < pos->id)
- break;
- }
- goto retry;
-}
-
-static void nfs_free_unique_id(struct rb_root *root, struct nfs_unique_id *id)
-{
- rb_erase(&id->rb_node, root);
-}
-
static struct nfs4_state_owner *
nfs4_find_state_owner_locked(struct nfs_server *server, struct rpc_cred *cred)
{
@@ -405,6 +355,7 @@ nfs4_insert_state_owner_locked(struct nfs4_state_owner *new)
struct rb_node **p = &server->state_owners.rb_node,
*parent = NULL;
struct nfs4_state_owner *sp;
+ int err;
while (*p != NULL) {
parent = *p;
@@ -421,8 +372,9 @@ nfs4_insert_state_owner_locked(struct nfs4_state_owner *new)
return sp;
}
}
- nfs_alloc_unique_id_locked(&server->openowner_id,
- &new->so_owner_id, 1, 64);
+ err = ida_get_new(&server->openowner_id, &new->so_seqid.owner_id);
+ if (err)
+ return ERR_PTR(err);
rb_link_node(&new->so_server_node, parent, p);
rb_insert_color(&new->so_server_node, &server->state_owners);
return new;
@@ -435,7 +387,23 @@ nfs4_remove_state_owner_locked(struct nfs4_state_owner *sp)
if (!RB_EMPTY_NODE(&sp->so_server_node))
rb_erase(&sp->so_server_node, &server->state_owners);
- nfs_free_unique_id(&server->openowner_id, &sp->so_owner_id);
+ ida_remove(&server->openowner_id, sp->so_seqid.owner_id);
+}
+
+static void
+nfs4_init_seqid_counter(struct nfs_seqid_counter *sc)
+{
+ sc->flags = 0;
+ sc->counter = 0;
+ spin_lock_init(&sc->lock);
+ INIT_LIST_HEAD(&sc->list);
+ rpc_init_wait_queue(&sc->wait, "Seqid_waitqueue");
+}
+
+static void
+nfs4_destroy_seqid_counter(struct nfs_seqid_counter *sc)
+{
+ rpc_destroy_wait_queue(&sc->wait);
}
/*
@@ -444,19 +412,20 @@ nfs4_remove_state_owner_locked(struct nfs4_state_owner *sp)
*
*/
static struct nfs4_state_owner *
-nfs4_alloc_state_owner(void)
+nfs4_alloc_state_owner(struct nfs_server *server,
+ struct rpc_cred *cred,
+ gfp_t gfp_flags)
{
struct nfs4_state_owner *sp;
- sp = kzalloc(sizeof(*sp),GFP_NOFS);
+ sp = kzalloc(sizeof(*sp), gfp_flags);
if (!sp)
return NULL;
+ sp->so_server = server;
+ sp->so_cred = get_rpccred(cred);
spin_lock_init(&sp->so_lock);
INIT_LIST_HEAD(&sp->so_states);
- rpc_init_wait_queue(&sp->so_sequence.wait, "Seqid_waitqueue");
- sp->so_seqid.sequence = &sp->so_sequence;
- spin_lock_init(&sp->so_sequence.lock);
- INIT_LIST_HEAD(&sp->so_sequence.list);
+ nfs4_init_seqid_counter(&sp->so_seqid);
atomic_set(&sp->so_count, 1);
INIT_LIST_HEAD(&sp->so_lru);
return sp;
@@ -478,7 +447,7 @@ nfs4_drop_state_owner(struct nfs4_state_owner *sp)
static void nfs4_free_state_owner(struct nfs4_state_owner *sp)
{
- rpc_destroy_wait_queue(&sp->so_sequence.wait);
+ nfs4_destroy_seqid_counter(&sp->so_seqid);
put_rpccred(sp->so_cred);
kfree(sp);
}
@@ -516,7 +485,8 @@ static void nfs4_gc_state_owners(struct nfs_server *server)
* Returns a pointer to an instantiated nfs4_state_owner struct, or NULL.
*/
struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server,
- struct rpc_cred *cred)
+ struct rpc_cred *cred,
+ gfp_t gfp_flags)
{
struct nfs_client *clp = server->nfs_client;
struct nfs4_state_owner *sp, *new;
@@ -526,20 +496,18 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server,
spin_unlock(&clp->cl_lock);
if (sp != NULL)
goto out;
- new = nfs4_alloc_state_owner();
+ new = nfs4_alloc_state_owner(server, cred, gfp_flags);
if (new == NULL)
goto out;
- new->so_server = server;
- new->so_cred = cred;
- spin_lock(&clp->cl_lock);
- sp = nfs4_insert_state_owner_locked(new);
- spin_unlock(&clp->cl_lock);
- if (sp == new)
- get_rpccred(cred);
- else {
- rpc_destroy_wait_queue(&new->so_sequence.wait);
- kfree(new);
- }
+ do {
+ if (ida_pre_get(&server->openowner_id, gfp_flags) == 0)
+ break;
+ spin_lock(&clp->cl_lock);
+ sp = nfs4_insert_state_owner_locked(new);
+ spin_unlock(&clp->cl_lock);
+ } while (sp == ERR_PTR(-EAGAIN));
+ if (sp != new)
+ nfs4_free_state_owner(new);
out:
nfs4_gc_state_owners(server);
return sp;
@@ -795,15 +763,11 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
{
struct nfs4_lock_state *lsp;
struct nfs_server *server = state->owner->so_server;
- struct nfs_client *clp = server->nfs_client;
lsp = kzalloc(sizeof(*lsp), GFP_NOFS);
if (lsp == NULL)
return NULL;
- rpc_init_wait_queue(&lsp->ls_sequence.wait, "lock_seqid_waitqueue");
- spin_lock_init(&lsp->ls_sequence.lock);
- INIT_LIST_HEAD(&lsp->ls_sequence.list);
- lsp->ls_seqid.sequence = &lsp->ls_sequence;
+ nfs4_init_seqid_counter(&lsp->ls_seqid);
atomic_set(&lsp->ls_count, 1);
lsp->ls_state = state;
lsp->ls_owner.lo_type = type;
@@ -815,25 +779,22 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
lsp->ls_owner.lo_u.posix_owner = fl_owner;
break;
default:
- kfree(lsp);
- return NULL;
+ goto out_free;
}
- spin_lock(&clp->cl_lock);
- nfs_alloc_unique_id_locked(&server->lockowner_id, &lsp->ls_id, 1, 64);
- spin_unlock(&clp->cl_lock);
+ lsp->ls_seqid.owner_id = ida_simple_get(&server->lockowner_id, 0, 0, GFP_NOFS);
+ if (lsp->ls_seqid.owner_id < 0)
+ goto out_free;
INIT_LIST_HEAD(&lsp->ls_locks);
return lsp;
+out_free:
+ kfree(lsp);
+ return NULL;
}
-static void nfs4_free_lock_state(struct nfs4_lock_state *lsp)
+void nfs4_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp)
{
- struct nfs_server *server = lsp->ls_state->owner->so_server;
- struct nfs_client *clp = server->nfs_client;
-
- spin_lock(&clp->cl_lock);
- nfs_free_unique_id(&server->lockowner_id, &lsp->ls_id);
- spin_unlock(&clp->cl_lock);
- rpc_destroy_wait_queue(&lsp->ls_sequence.wait);
+ ida_simple_remove(&server->lockowner_id, lsp->ls_seqid.owner_id);
+ nfs4_destroy_seqid_counter(&lsp->ls_seqid);
kfree(lsp);
}
@@ -865,7 +826,7 @@ static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_
}
spin_unlock(&state->state_lock);
if (new != NULL)
- nfs4_free_lock_state(new);
+ nfs4_free_lock_state(state->owner->so_server, new);
return lsp;
}
@@ -886,9 +847,11 @@ void nfs4_put_lock_state(struct nfs4_lock_state *lsp)
if (list_empty(&state->lock_states))
clear_bit(LK_STATE_IN_USE, &state->flags);
spin_unlock(&state->state_lock);
- if (lsp->ls_flags & NFS_LOCK_INITIALIZED)
- nfs4_release_lockowner(lsp);
- nfs4_free_lock_state(lsp);
+ if (lsp->ls_flags & NFS_LOCK_INITIALIZED) {
+ if (nfs4_release_lockowner(lsp) == 0)
+ return;
+ }
+ nfs4_free_lock_state(lsp->ls_state->owner->so_server, lsp);
}
static void nfs4_fl_copy_lock(struct file_lock *dst, struct file_lock *src)
@@ -918,7 +881,8 @@ int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl)
if (fl->fl_flags & FL_POSIX)
lsp = nfs4_get_lock_state(state, fl->fl_owner, 0, NFS4_POSIX_LOCK_TYPE);
else if (fl->fl_flags & FL_FLOCK)
- lsp = nfs4_get_lock_state(state, 0, fl->fl_pid, NFS4_FLOCK_LOCK_TYPE);
+ lsp = nfs4_get_lock_state(state, NULL, fl->fl_pid,
+ NFS4_FLOCK_LOCK_TYPE);
else
return -EINVAL;
if (lsp == NULL)
@@ -928,28 +892,49 @@ int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl)
return 0;
}
-/*
- * Byte-range lock aware utility to initialize the stateid of read/write
- * requests.
- */
-void nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid)
+static bool nfs4_copy_lock_stateid(nfs4_stateid *dst, struct nfs4_state *state,
+ fl_owner_t fl_owner, pid_t fl_pid)
{
struct nfs4_lock_state *lsp;
- int seq;
+ bool ret = false;
- do {
- seq = read_seqbegin(&state->seqlock);
- memcpy(dst, &state->stateid, sizeof(*dst));
- } while (read_seqretry(&state->seqlock, seq));
if (test_bit(LK_STATE_IN_USE, &state->flags) == 0)
- return;
+ goto out;
spin_lock(&state->state_lock);
lsp = __nfs4_find_lock_state(state, fl_owner, fl_pid, NFS4_ANY_LOCK_TYPE);
- if (lsp != NULL && (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0)
- memcpy(dst, &lsp->ls_stateid, sizeof(*dst));
+ if (lsp != NULL && (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0) {
+ nfs4_stateid_copy(dst, &lsp->ls_stateid);
+ ret = true;
+ }
spin_unlock(&state->state_lock);
nfs4_put_lock_state(lsp);
+out:
+ return ret;
+}
+
+static void nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state)
+{
+ int seq;
+
+ do {
+ seq = read_seqbegin(&state->seqlock);
+ nfs4_stateid_copy(dst, &state->stateid);
+ } while (read_seqretry(&state->seqlock, seq));
+}
+
+/*
+ * Byte-range lock aware utility to initialize the stateid of read/write
+ * requests.
+ */
+void nfs4_select_rw_stateid(nfs4_stateid *dst, struct nfs4_state *state,
+ fmode_t fmode, fl_owner_t fl_owner, pid_t fl_pid)
+{
+ if (nfs4_copy_delegation_stateid(dst, state->inode, fmode))
+ return;
+ if (nfs4_copy_lock_stateid(dst, state, fl_owner, fl_pid))
+ return;
+ nfs4_copy_open_stateid(dst, state);
}
struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask)
@@ -960,20 +945,28 @@ struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_m
if (new != NULL) {
new->sequence = counter;
INIT_LIST_HEAD(&new->list);
+ new->task = NULL;
}
return new;
}
void nfs_release_seqid(struct nfs_seqid *seqid)
{
- if (!list_empty(&seqid->list)) {
- struct rpc_sequence *sequence = seqid->sequence->sequence;
+ struct nfs_seqid_counter *sequence;
- spin_lock(&sequence->lock);
- list_del_init(&seqid->list);
- spin_unlock(&sequence->lock);
- rpc_wake_up(&sequence->wait);
+ if (list_empty(&seqid->list))
+ return;
+ sequence = seqid->sequence;
+ spin_lock(&sequence->lock);
+ list_del_init(&seqid->list);
+ if (!list_empty(&sequence->list)) {
+ struct nfs_seqid *next;
+
+ next = list_first_entry(&sequence->list,
+ struct nfs_seqid, list);
+ rpc_wake_up_queued_task(&sequence->wait, next->task);
}
+ spin_unlock(&sequence->lock);
}
void nfs_free_seqid(struct nfs_seqid *seqid)
@@ -989,14 +982,14 @@ void nfs_free_seqid(struct nfs_seqid *seqid)
*/
static void nfs_increment_seqid(int status, struct nfs_seqid *seqid)
{
- BUG_ON(list_first_entry(&seqid->sequence->sequence->list, struct nfs_seqid, list) != seqid);
+ BUG_ON(list_first_entry(&seqid->sequence->list, struct nfs_seqid, list) != seqid);
switch (status) {
case 0:
break;
case -NFS4ERR_BAD_SEQID:
if (seqid->sequence->flags & NFS_SEQID_CONFIRMED)
return;
- printk(KERN_WARNING "NFS: v4 server returned a bad"
+ pr_warn_ratelimited("NFS: v4 server returned a bad"
" sequence-id error on an"
" unconfirmed sequence %p!\n",
seqid->sequence);
@@ -1040,10 +1033,11 @@ void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid)
int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task)
{
- struct rpc_sequence *sequence = seqid->sequence->sequence;
+ struct nfs_seqid_counter *sequence = seqid->sequence;
int status = 0;
spin_lock(&sequence->lock);
+ seqid->task = task;
if (list_empty(&seqid->list))
list_add_tail(&seqid->list, &sequence->list);
if (list_first_entry(&sequence->list, struct nfs_seqid, list) == seqid)
@@ -1072,19 +1066,28 @@ static void nfs4_clear_state_manager_bit(struct nfs_client *clp)
void nfs4_schedule_state_manager(struct nfs_client *clp)
{
struct task_struct *task;
+ char buf[INET6_ADDRSTRLEN + sizeof("-manager") + 1];
if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0)
return;
__module_get(THIS_MODULE);
atomic_inc(&clp->cl_count);
- task = kthread_run(nfs4_run_state_manager, clp, "%s-manager",
- rpc_peeraddr2str(clp->cl_rpcclient,
- RPC_DISPLAY_ADDR));
- if (!IS_ERR(task))
- return;
- nfs4_clear_state_manager_bit(clp);
- nfs_put_client(clp);
- module_put(THIS_MODULE);
+
+ /* The rcu_read_lock() is not strictly necessary, as the state
+ * manager is the only thread that ever changes the rpc_xprt
+ * after it's initialized. At this point, we're single threaded. */
+ rcu_read_lock();
+ snprintf(buf, sizeof(buf), "%s-manager",
+ rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR));
+ rcu_read_unlock();
+ task = kthread_run(nfs4_run_state_manager, clp, buf);
+ if (IS_ERR(task)) {
+ printk(KERN_ERR "%s: kthread_run: %ld\n",
+ __func__, PTR_ERR(task));
+ nfs4_clear_state_manager_bit(clp);
+ nfs_put_client(clp);
+ module_put(THIS_MODULE);
+ }
}
/*
@@ -1098,10 +1101,25 @@ void nfs4_schedule_lease_recovery(struct nfs_client *clp)
set_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state);
nfs4_schedule_state_manager(clp);
}
+EXPORT_SYMBOL_GPL(nfs4_schedule_lease_recovery);
+
+/*
+ * nfs40_handle_cb_pathdown - return all delegations after NFS4ERR_CB_PATH_DOWN
+ * @clp: client to process
+ *
+ * Set the NFS4CLNT_LEASE_EXPIRED state in order to force a
+ * resend of the SETCLIENTID and hence re-establish the
+ * callback channel. Then return all existing delegations.
+ */
+static void nfs40_handle_cb_pathdown(struct nfs_client *clp)
+{
+ set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
+ nfs_expire_all_delegations(clp);
+}
void nfs4_schedule_path_down_recovery(struct nfs_client *clp)
{
- nfs_handle_cb_pathdown(clp);
+ nfs40_handle_cb_pathdown(clp);
nfs4_schedule_state_manager(clp);
}
@@ -1135,6 +1153,34 @@ void nfs4_schedule_stateid_recovery(const struct nfs_server *server, struct nfs4
nfs4_state_mark_reclaim_nograce(clp, state);
nfs4_schedule_state_manager(clp);
}
+EXPORT_SYMBOL_GPL(nfs4_schedule_stateid_recovery);
+
+void nfs_inode_find_state_and_recover(struct inode *inode,
+ const nfs4_stateid *stateid)
+{
+ struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
+ struct nfs_inode *nfsi = NFS_I(inode);
+ struct nfs_open_context *ctx;
+ struct nfs4_state *state;
+ bool found = false;
+
+ spin_lock(&inode->i_lock);
+ list_for_each_entry(ctx, &nfsi->open_files, list) {
+ state = ctx->state;
+ if (state == NULL)
+ continue;
+ if (!test_bit(NFS_DELEGATED_STATE, &state->flags))
+ continue;
+ if (!nfs4_stateid_match(&state->stateid, stateid))
+ continue;
+ nfs4_state_mark_reclaim_nograce(clp, state);
+ found = true;
+ }
+ spin_unlock(&inode->i_lock);
+ if (found)
+ nfs4_schedule_state_manager(clp);
+}
+
static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_recovery_ops *ops)
{
@@ -1173,8 +1219,8 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_
case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
goto out;
default:
- printk(KERN_ERR "%s: unhandled error %d. Zeroing state\n",
- __func__, status);
+ printk(KERN_ERR "NFS: %s: unhandled error %d. "
+ "Zeroing state\n", __func__, status);
case -ENOMEM:
case -NFS4ERR_DENIED:
case -NFS4ERR_RECLAIM_BAD:
@@ -1220,8 +1266,9 @@ restart:
spin_lock(&state->state_lock);
list_for_each_entry(lock, &state->lock_states, ls_locks) {
if (!(lock->ls_flags & NFS_LOCK_INITIALIZED))
- printk("%s: Lock reclaim failed!\n",
- __func__);
+ pr_warn_ratelimited("NFS: "
+ "%s: Lock reclaim "
+ "failed!\n", __func__);
}
spin_unlock(&state->state_lock);
nfs4_put_open_state(state);
@@ -1230,8 +1277,8 @@ restart:
}
switch (status) {
default:
- printk(KERN_ERR "%s: unhandled error %d. Zeroing state\n",
- __func__, status);
+ printk(KERN_ERR "NFS: %s: unhandled error %d. "
+ "Zeroing state\n", __func__, status);
case -ENOENT:
case -ENOMEM:
case -ESTALE:
@@ -1239,8 +1286,8 @@ restart:
* Open state on this file cannot be recovered
* All we can do is revert to using the zero stateid.
*/
- memset(state->stateid.data, 0,
- sizeof(state->stateid.data));
+ memset(&state->stateid, 0,
+ sizeof(state->stateid));
/* Mark the file as being 'closed' */
state->state = 0;
break;
@@ -1418,7 +1465,7 @@ static int nfs4_recovery_handle_error(struct nfs_client *clp, int error)
case 0:
break;
case -NFS4ERR_CB_PATH_DOWN:
- nfs_handle_cb_pathdown(clp);
+ nfs40_handle_cb_pathdown(clp);
break;
case -NFS4ERR_NO_GRACE:
nfs4_state_end_reclaim_reboot(clp);
@@ -1799,7 +1846,7 @@ static void nfs4_state_manager(struct nfs_client *clp)
} while (atomic_read(&clp->cl_count) > 1);
return;
out_error:
- printk(KERN_WARNING "Error: state manager failed on NFSv4 server %s"
+ pr_warn_ratelimited("NFS: state manager failed on NFSv4 server %s"
" with error %d\n", clp->cl_hostname, -status);
nfs4_end_drain_session(clp);
nfs4_clear_state_manager_bit(clp);
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 95e92e43840..c74fdb114b4 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -44,6 +44,8 @@
#include <linux/pagemap.h>
#include <linux/proc_fs.h>
#include <linux/kdev_t.h>
+#include <linux/module.h>
+#include <linux/utsname.h>
#include <linux/sunrpc/clnt.h>
#include <linux/sunrpc/msg_prot.h>
#include <linux/sunrpc/gss_api.h>
@@ -271,7 +273,12 @@ static int nfs4_stat_to_errno(int);
1 /* flags */ + \
1 /* spa_how */ + \
0 /* SP4_NONE (for now) */ + \
- 1 /* zero implemetation id array */)
+ 1 /* implementation id array of size 1 */ + \
+ 1 /* nii_domain */ + \
+ XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \
+ 1 /* nii_name */ + \
+ XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \
+ 3 /* nii_date */)
#define decode_exchange_id_maxsz (op_decode_hdr_maxsz + \
2 /* eir_clientid */ + \
1 /* eir_sequenceid */ + \
@@ -284,7 +291,11 @@ static int nfs4_stat_to_errno(int);
/* eir_server_scope<> */ \
XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 + \
1 /* eir_server_impl_id array length */ + \
- 0 /* ignored eir_server_impl_id contents */)
+ 1 /* nii_domain */ + \
+ XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \
+ 1 /* nii_name */ + \
+ XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \
+ 3 /* nii_date */)
#define encode_channel_attrs_maxsz (6 + 1 /* ca_rdma_ird.len (0) */)
#define decode_channel_attrs_maxsz (6 + \
1 /* ca_rdma_ird.len */ + \
@@ -838,6 +849,12 @@ const u32 nfs41_maxread_overhead = ((RPC_MAX_HEADER_WITH_AUTH +
XDR_UNIT);
#endif /* CONFIG_NFS_V4_1 */
+static unsigned short send_implementation_id = 1;
+
+module_param(send_implementation_id, ushort, 0644);
+MODULE_PARM_DESC(send_implementation_id,
+ "Send implementation ID with NFSv4.1 exchange_id");
+
static const umode_t nfs_type2fmt[] = {
[NF4BAD] = 0,
[NF4REG] = S_IFREG,
@@ -868,15 +885,44 @@ static __be32 *reserve_space(struct xdr_stream *xdr, size_t nbytes)
return p;
}
+static void encode_opaque_fixed(struct xdr_stream *xdr, const void *buf, size_t len)
+{
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, len);
+ xdr_encode_opaque_fixed(p, buf, len);
+}
+
static void encode_string(struct xdr_stream *xdr, unsigned int len, const char *str)
{
__be32 *p;
- p = xdr_reserve_space(xdr, 4 + len);
- BUG_ON(p == NULL);
+ p = reserve_space(xdr, 4 + len);
xdr_encode_opaque(p, str, len);
}
+static void encode_uint32(struct xdr_stream *xdr, u32 n)
+{
+ __be32 *p;
+
+ p = reserve_space(xdr, 4);
+ *p = cpu_to_be32(n);
+}
+
+static void encode_uint64(struct xdr_stream *xdr, u64 n)
+{
+ __be32 *p;
+
+ p = reserve_space(xdr, 8);
+ xdr_encode_hyper(p, n);
+}
+
+static void encode_nfs4_seqid(struct xdr_stream *xdr,
+ const struct nfs_seqid *seqid)
+{
+ encode_uint32(xdr, seqid->sequence->counter);
+}
+
static void encode_compound_hdr(struct xdr_stream *xdr,
struct rpc_rqst *req,
struct compound_hdr *hdr)
@@ -889,28 +935,37 @@ static void encode_compound_hdr(struct xdr_stream *xdr,
* but this is not required as a MUST for the server to do so. */
hdr->replen = RPC_REPHDRSIZE + auth->au_rslack + 3 + hdr->taglen;
- dprintk("encode_compound: tag=%.*s\n", (int)hdr->taglen, hdr->tag);
BUG_ON(hdr->taglen > NFS4_MAXTAGLEN);
- p = reserve_space(xdr, 4 + hdr->taglen + 8);
- p = xdr_encode_opaque(p, hdr->tag, hdr->taglen);
+ encode_string(xdr, hdr->taglen, hdr->tag);
+ p = reserve_space(xdr, 8);
*p++ = cpu_to_be32(hdr->minorversion);
hdr->nops_p = p;
*p = cpu_to_be32(hdr->nops);
}
+static void encode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 op,
+ uint32_t replen,
+ struct compound_hdr *hdr)
+{
+ encode_uint32(xdr, op);
+ hdr->nops++;
+ hdr->replen += replen;
+}
+
static void encode_nops(struct compound_hdr *hdr)
{
BUG_ON(hdr->nops > NFS4_MAX_OPS);
*hdr->nops_p = htonl(hdr->nops);
}
-static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *verf)
+static void encode_nfs4_stateid(struct xdr_stream *xdr, const nfs4_stateid *stateid)
{
- __be32 *p;
+ encode_opaque_fixed(xdr, stateid, NFS4_STATEID_SIZE);
+}
- p = xdr_reserve_space(xdr, NFS4_VERIFIER_SIZE);
- BUG_ON(p == NULL);
- xdr_encode_opaque_fixed(p, verf->data, NFS4_VERIFIER_SIZE);
+static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *verf)
+{
+ encode_opaque_fixed(xdr, verf->data, NFS4_VERIFIER_SIZE);
}
static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const struct nfs_server *server)
@@ -1023,7 +1078,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
* Now we backfill the bitmap and the attribute buffer length.
*/
if (len != ((char *)p - (char *)q) + 4) {
- printk(KERN_ERR "nfs: Attr length error, %u != %Zu\n",
+ printk(KERN_ERR "NFS: Attr length error, %u != %Zu\n",
len, ((char *)p - (char *)q) + 4);
BUG();
}
@@ -1037,46 +1092,33 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
static void encode_access(struct xdr_stream *xdr, u32 access, struct compound_hdr *hdr)
{
- __be32 *p;
-
- p = reserve_space(xdr, 8);
- *p++ = cpu_to_be32(OP_ACCESS);
- *p = cpu_to_be32(access);
- hdr->nops++;
- hdr->replen += decode_access_maxsz;
+ encode_op_hdr(xdr, OP_ACCESS, decode_access_maxsz, hdr);
+ encode_uint32(xdr, access);
}
static void encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg, struct compound_hdr *hdr)
{
- __be32 *p;
-
- p = reserve_space(xdr, 8+NFS4_STATEID_SIZE);
- *p++ = cpu_to_be32(OP_CLOSE);
- *p++ = cpu_to_be32(arg->seqid->sequence->counter);
- xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE);
- hdr->nops++;
- hdr->replen += decode_close_maxsz;
+ encode_op_hdr(xdr, OP_CLOSE, decode_close_maxsz, hdr);
+ encode_nfs4_seqid(xdr, arg->seqid);
+ encode_nfs4_stateid(xdr, arg->stateid);
}
static void encode_commit(struct xdr_stream *xdr, const struct nfs_writeargs *args, struct compound_hdr *hdr)
{
__be32 *p;
- p = reserve_space(xdr, 16);
- *p++ = cpu_to_be32(OP_COMMIT);
+ encode_op_hdr(xdr, OP_COMMIT, decode_commit_maxsz, hdr);
+ p = reserve_space(xdr, 12);
p = xdr_encode_hyper(p, args->offset);
*p = cpu_to_be32(args->count);
- hdr->nops++;
- hdr->replen += decode_commit_maxsz;
}
static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *create, struct compound_hdr *hdr)
{
__be32 *p;
- p = reserve_space(xdr, 8);
- *p++ = cpu_to_be32(OP_CREATE);
- *p = cpu_to_be32(create->ftype);
+ encode_op_hdr(xdr, OP_CREATE, decode_create_maxsz, hdr);
+ encode_uint32(xdr, create->ftype);
switch (create->ftype) {
case NF4LNK:
@@ -1096,9 +1138,6 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *
}
encode_string(xdr, create->name->len, create->name->name);
- hdr->nops++;
- hdr->replen += decode_create_maxsz;
-
encode_attrs(xdr, create->attrs, create->server);
}
@@ -1106,25 +1145,21 @@ static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct c
{
__be32 *p;
- p = reserve_space(xdr, 12);
- *p++ = cpu_to_be32(OP_GETATTR);
+ encode_op_hdr(xdr, OP_GETATTR, decode_getattr_maxsz, hdr);
+ p = reserve_space(xdr, 8);
*p++ = cpu_to_be32(1);
*p = cpu_to_be32(bitmap);
- hdr->nops++;
- hdr->replen += decode_getattr_maxsz;
}
static void encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm1, struct compound_hdr *hdr)
{
__be32 *p;
- p = reserve_space(xdr, 16);
- *p++ = cpu_to_be32(OP_GETATTR);
+ encode_op_hdr(xdr, OP_GETATTR, decode_getattr_maxsz, hdr);
+ p = reserve_space(xdr, 12);
*p++ = cpu_to_be32(2);
*p++ = cpu_to_be32(bm0);
*p = cpu_to_be32(bm1);
- hdr->nops++;
- hdr->replen += decode_getattr_maxsz;
}
static void
@@ -1134,8 +1169,7 @@ encode_getattr_three(struct xdr_stream *xdr,
{
__be32 *p;
- p = reserve_space(xdr, 4);
- *p = cpu_to_be32(OP_GETATTR);
+ encode_op_hdr(xdr, OP_GETATTR, decode_getattr_maxsz, hdr);
if (bm2) {
p = reserve_space(xdr, 16);
*p++ = cpu_to_be32(3);
@@ -1152,8 +1186,6 @@ encode_getattr_three(struct xdr_stream *xdr,
*p++ = cpu_to_be32(1);
*p = cpu_to_be32(bm0);
}
- hdr->nops++;
- hdr->replen += decode_getattr_maxsz;
}
static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
@@ -1179,23 +1211,13 @@ static void encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask, stru
static void encode_getfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
{
- __be32 *p;
-
- p = reserve_space(xdr, 4);
- *p = cpu_to_be32(OP_GETFH);
- hdr->nops++;
- hdr->replen += decode_getfh_maxsz;
+ encode_op_hdr(xdr, OP_GETFH, decode_getfh_maxsz, hdr);
}
static void encode_link(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr)
{
- __be32 *p;
-
- p = reserve_space(xdr, 8 + name->len);
- *p++ = cpu_to_be32(OP_LINK);
- xdr_encode_opaque(p, name->name, name->len);
- hdr->nops++;
- hdr->replen += decode_link_maxsz;
+ encode_op_hdr(xdr, OP_LINK, decode_link_maxsz, hdr);
+ encode_string(xdr, name->len, name->name);
}
static inline int nfs4_lock_type(struct file_lock *fl, int block)
@@ -1232,79 +1254,60 @@ static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args
{
__be32 *p;
- p = reserve_space(xdr, 32);
- *p++ = cpu_to_be32(OP_LOCK);
+ encode_op_hdr(xdr, OP_LOCK, decode_lock_maxsz, hdr);
+ p = reserve_space(xdr, 28);
*p++ = cpu_to_be32(nfs4_lock_type(args->fl, args->block));
*p++ = cpu_to_be32(args->reclaim);
p = xdr_encode_hyper(p, args->fl->fl_start);
p = xdr_encode_hyper(p, nfs4_lock_length(args->fl));
*p = cpu_to_be32(args->new_lock_owner);
if (args->new_lock_owner){
- p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4);
- *p++ = cpu_to_be32(args->open_seqid->sequence->counter);
- p = xdr_encode_opaque_fixed(p, args->open_stateid->data, NFS4_STATEID_SIZE);
- *p++ = cpu_to_be32(args->lock_seqid->sequence->counter);
+ encode_nfs4_seqid(xdr, args->open_seqid);
+ encode_nfs4_stateid(xdr, args->open_stateid);
+ encode_nfs4_seqid(xdr, args->lock_seqid);
encode_lockowner(xdr, &args->lock_owner);
}
else {
- p = reserve_space(xdr, NFS4_STATEID_SIZE+4);
- p = xdr_encode_opaque_fixed(p, args->lock_stateid->data, NFS4_STATEID_SIZE);
- *p = cpu_to_be32(args->lock_seqid->sequence->counter);
+ encode_nfs4_stateid(xdr, args->lock_stateid);
+ encode_nfs4_seqid(xdr, args->lock_seqid);
}
- hdr->nops++;
- hdr->replen += decode_lock_maxsz;
}
static void encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *args, struct compound_hdr *hdr)
{
__be32 *p;
- p = reserve_space(xdr, 24);
- *p++ = cpu_to_be32(OP_LOCKT);
+ encode_op_hdr(xdr, OP_LOCKT, decode_lockt_maxsz, hdr);
+ p = reserve_space(xdr, 20);
*p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0));
p = xdr_encode_hyper(p, args->fl->fl_start);
p = xdr_encode_hyper(p, nfs4_lock_length(args->fl));
encode_lockowner(xdr, &args->lock_owner);
- hdr->nops++;
- hdr->replen += decode_lockt_maxsz;
}
static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *args, struct compound_hdr *hdr)
{
__be32 *p;
- p = reserve_space(xdr, 12+NFS4_STATEID_SIZE+16);
- *p++ = cpu_to_be32(OP_LOCKU);
- *p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0));
- *p++ = cpu_to_be32(args->seqid->sequence->counter);
- p = xdr_encode_opaque_fixed(p, args->stateid->data, NFS4_STATEID_SIZE);
+ encode_op_hdr(xdr, OP_LOCKU, decode_locku_maxsz, hdr);
+ encode_uint32(xdr, nfs4_lock_type(args->fl, 0));
+ encode_nfs4_seqid(xdr, args->seqid);
+ encode_nfs4_stateid(xdr, args->stateid);
+ p = reserve_space(xdr, 16);
p = xdr_encode_hyper(p, args->fl->fl_start);
xdr_encode_hyper(p, nfs4_lock_length(args->fl));
- hdr->nops++;
- hdr->replen += decode_locku_maxsz;
}
static void encode_release_lockowner(struct xdr_stream *xdr, const struct nfs_lowner *lowner, struct compound_hdr *hdr)
{
- __be32 *p;
-
- p = reserve_space(xdr, 4);
- *p = cpu_to_be32(OP_RELEASE_LOCKOWNER);
+ encode_op_hdr(xdr, OP_RELEASE_LOCKOWNER, decode_release_lockowner_maxsz, hdr);
encode_lockowner(xdr, lowner);
- hdr->nops++;
- hdr->replen += decode_release_lockowner_maxsz;
}
static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr)
{
- int len = name->len;
- __be32 *p;
-
- p = reserve_space(xdr, 8 + len);
- *p++ = cpu_to_be32(OP_LOOKUP);
- xdr_encode_opaque(p, name->name, len);
- hdr->nops++;
- hdr->replen += decode_lookup_maxsz;
+ encode_op_hdr(xdr, OP_LOOKUP, decode_lookup_maxsz, hdr);
+ encode_string(xdr, name->len, name->name);
}
static void encode_share_access(struct xdr_stream *xdr, fmode_t fmode)
@@ -1335,9 +1338,7 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena
* opcode 4, seqid 4, share_access 4, share_deny 4, clientid 8, ownerlen 4,
* owner 4 = 32
*/
- p = reserve_space(xdr, 8);
- *p++ = cpu_to_be32(OP_OPEN);
- *p = cpu_to_be32(arg->seqid->sequence->counter);
+ encode_nfs4_seqid(xdr, arg->seqid);
encode_share_access(xdr, arg->fmode);
p = reserve_space(xdr, 32);
p = xdr_encode_hyper(p, arg->clientid);
@@ -1437,14 +1438,15 @@ static inline void encode_claim_delegate_cur(struct xdr_stream *xdr, const struc
{
__be32 *p;
- p = reserve_space(xdr, 4+NFS4_STATEID_SIZE);
- *p++ = cpu_to_be32(NFS4_OPEN_CLAIM_DELEGATE_CUR);
- xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE);
+ p = reserve_space(xdr, 4);
+ *p = cpu_to_be32(NFS4_OPEN_CLAIM_DELEGATE_CUR);
+ encode_nfs4_stateid(xdr, stateid);
encode_string(xdr, name->len, name->name);
}
static void encode_open(struct xdr_stream *xdr, const struct nfs_openargs *arg, struct compound_hdr *hdr)
{
+ encode_op_hdr(xdr, OP_OPEN, decode_open_maxsz, hdr);
encode_openhdr(xdr, arg);
encode_opentype(xdr, arg);
switch (arg->claim) {
@@ -1460,88 +1462,64 @@ static void encode_open(struct xdr_stream *xdr, const struct nfs_openargs *arg,
default:
BUG();
}
- hdr->nops++;
- hdr->replen += decode_open_maxsz;
}
static void encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_confirmargs *arg, struct compound_hdr *hdr)
{
- __be32 *p;
-
- p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4);
- *p++ = cpu_to_be32(OP_OPEN_CONFIRM);
- p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE);
- *p = cpu_to_be32(arg->seqid->sequence->counter);
- hdr->nops++;
- hdr->replen += decode_open_confirm_maxsz;
+ encode_op_hdr(xdr, OP_OPEN_CONFIRM, decode_open_confirm_maxsz, hdr);
+ encode_nfs4_stateid(xdr, arg->stateid);
+ encode_nfs4_seqid(xdr, arg->seqid);
}
static void encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_closeargs *arg, struct compound_hdr *hdr)
{
- __be32 *p;
-
- p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4);
- *p++ = cpu_to_be32(OP_OPEN_DOWNGRADE);
- p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE);
- *p = cpu_to_be32(arg->seqid->sequence->counter);
+ encode_op_hdr(xdr, OP_OPEN_DOWNGRADE, decode_open_downgrade_maxsz, hdr);
+ encode_nfs4_stateid(xdr, arg->stateid);
+ encode_nfs4_seqid(xdr, arg->seqid);
encode_share_access(xdr, arg->fmode);
- hdr->nops++;
- hdr->replen += decode_open_downgrade_maxsz;
}
static void
encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh, struct compound_hdr *hdr)
{
- int len = fh->size;
- __be32 *p;
-
- p = reserve_space(xdr, 8 + len);
- *p++ = cpu_to_be32(OP_PUTFH);
- xdr_encode_opaque(p, fh->data, len);
- hdr->nops++;
- hdr->replen += decode_putfh_maxsz;
+ encode_op_hdr(xdr, OP_PUTFH, decode_putfh_maxsz, hdr);
+ encode_string(xdr, fh->size, fh->data);
}
static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
{
- __be32 *p;
-
- p = reserve_space(xdr, 4);
- *p = cpu_to_be32(OP_PUTROOTFH);
- hdr->nops++;
- hdr->replen += decode_putrootfh_maxsz;
+ encode_op_hdr(xdr, OP_PUTROOTFH, decode_putrootfh_maxsz, hdr);
}
-static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx, const struct nfs_lock_context *l_ctx, int zero_seqid)
+static void encode_open_stateid(struct xdr_stream *xdr,
+ const struct nfs_open_context *ctx,
+ const struct nfs_lock_context *l_ctx,
+ fmode_t fmode,
+ int zero_seqid)
{
nfs4_stateid stateid;
- __be32 *p;
- p = reserve_space(xdr, NFS4_STATEID_SIZE);
if (ctx->state != NULL) {
- nfs4_copy_stateid(&stateid, ctx->state, l_ctx->lockowner, l_ctx->pid);
+ nfs4_select_rw_stateid(&stateid, ctx->state,
+ fmode, l_ctx->lockowner, l_ctx->pid);
if (zero_seqid)
- stateid.stateid.seqid = 0;
- xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE);
+ stateid.seqid = 0;
+ encode_nfs4_stateid(xdr, &stateid);
} else
- xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE);
+ encode_nfs4_stateid(xdr, &zero_stateid);
}
static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, struct compound_hdr *hdr)
{
__be32 *p;
- p = reserve_space(xdr, 4);
- *p = cpu_to_be32(OP_READ);
-
- encode_stateid(xdr, args->context, args->lock_context,
- hdr->minorversion);
+ encode_op_hdr(xdr, OP_READ, decode_read_maxsz, hdr);
+ encode_open_stateid(xdr, args->context, args->lock_context,
+ FMODE_READ, hdr->minorversion);
p = reserve_space(xdr, 12);
p = xdr_encode_hyper(p, args->offset);
*p = cpu_to_be32(args->count);
- hdr->nops++;
- hdr->replen += decode_read_maxsz;
}
static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req, struct compound_hdr *hdr)
@@ -1551,7 +1529,7 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
FATTR4_WORD1_MOUNTED_ON_FILEID,
};
uint32_t dircount = readdir->count >> 1;
- __be32 *p;
+ __be32 *p, verf[2];
if (readdir->plus) {
attrs[0] |= FATTR4_WORD0_TYPE|FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE|
@@ -1566,80 +1544,54 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
if (!(readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID))
attrs[0] |= FATTR4_WORD0_FILEID;
- p = reserve_space(xdr, 12+NFS4_VERIFIER_SIZE+20);
- *p++ = cpu_to_be32(OP_READDIR);
- p = xdr_encode_hyper(p, readdir->cookie);
- p = xdr_encode_opaque_fixed(p, readdir->verifier.data, NFS4_VERIFIER_SIZE);
+ encode_op_hdr(xdr, OP_READDIR, decode_readdir_maxsz, hdr);
+ encode_uint64(xdr, readdir->cookie);
+ encode_nfs4_verifier(xdr, &readdir->verifier);
+ p = reserve_space(xdr, 20);
*p++ = cpu_to_be32(dircount);
*p++ = cpu_to_be32(readdir->count);
*p++ = cpu_to_be32(2);
*p++ = cpu_to_be32(attrs[0] & readdir->bitmask[0]);
*p = cpu_to_be32(attrs[1] & readdir->bitmask[1]);
- hdr->nops++;
- hdr->replen += decode_readdir_maxsz;
+ memcpy(verf, readdir->verifier.data, sizeof(verf));
dprintk("%s: cookie = %Lu, verifier = %08x:%08x, bitmap = %08x:%08x\n",
__func__,
(unsigned long long)readdir->cookie,
- ((u32 *)readdir->verifier.data)[0],
- ((u32 *)readdir->verifier.data)[1],
+ verf[0], verf[1],
attrs[0] & readdir->bitmask[0],
attrs[1] & readdir->bitmask[1]);
}
static void encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink *readlink, struct rpc_rqst *req, struct compound_hdr *hdr)
{
- __be32 *p;
-
- p = reserve_space(xdr, 4);
- *p = cpu_to_be32(OP_READLINK);
- hdr->nops++;
- hdr->replen += decode_readlink_maxsz;
+ encode_op_hdr(xdr, OP_READLINK, decode_readlink_maxsz, hdr);
}
static void encode_remove(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr)
{
- __be32 *p;
-
- p = reserve_space(xdr, 8 + name->len);
- *p++ = cpu_to_be32(OP_REMOVE);
- xdr_encode_opaque(p, name->name, name->len);
- hdr->nops++;
- hdr->replen += decode_remove_maxsz;
+ encode_op_hdr(xdr, OP_REMOVE, decode_remove_maxsz, hdr);
+ encode_string(xdr, name->len, name->name);
}
static void encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, const struct qstr *newname, struct compound_hdr *hdr)
{
- __be32 *p;
-
- p = reserve_space(xdr, 4);
- *p = cpu_to_be32(OP_RENAME);
+ encode_op_hdr(xdr, OP_RENAME, decode_rename_maxsz, hdr);
encode_string(xdr, oldname->len, oldname->name);
encode_string(xdr, newname->len, newname->name);
- hdr->nops++;
- hdr->replen += decode_rename_maxsz;
}
-static void encode_renew(struct xdr_stream *xdr, const struct nfs_client *client_stateid, struct compound_hdr *hdr)
+static void encode_renew(struct xdr_stream *xdr, clientid4 clid,
+ struct compound_hdr *hdr)
{
- __be32 *p;
-
- p = reserve_space(xdr, 12);
- *p++ = cpu_to_be32(OP_RENEW);
- xdr_encode_hyper(p, client_stateid->cl_clientid);
- hdr->nops++;
- hdr->replen += decode_renew_maxsz;
+ encode_op_hdr(xdr, OP_RENEW, decode_renew_maxsz, hdr);
+ encode_uint64(xdr, clid);
}
static void
encode_restorefh(struct xdr_stream *xdr, struct compound_hdr *hdr)
{
- __be32 *p;
-
- p = reserve_space(xdr, 4);
- *p = cpu_to_be32(OP_RESTOREFH);
- hdr->nops++;
- hdr->replen += decode_restorefh_maxsz;
+ encode_op_hdr(xdr, OP_RESTOREFH, decode_restorefh_maxsz, hdr);
}
static void
@@ -1647,9 +1599,8 @@ encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compoun
{
__be32 *p;
- p = reserve_space(xdr, 4+NFS4_STATEID_SIZE);
- *p++ = cpu_to_be32(OP_SETATTR);
- xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE);
+ encode_op_hdr(xdr, OP_SETATTR, decode_setacl_maxsz, hdr);
+ encode_nfs4_stateid(xdr, &zero_stateid);
p = reserve_space(xdr, 2*4);
*p++ = cpu_to_be32(1);
*p = cpu_to_be32(FATTR4_WORD0_ACL);
@@ -1657,30 +1608,18 @@ encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compoun
p = reserve_space(xdr, 4);
*p = cpu_to_be32(arg->acl_len);
xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len);
- hdr->nops++;
- hdr->replen += decode_setacl_maxsz;
}
static void
encode_savefh(struct xdr_stream *xdr, struct compound_hdr *hdr)
{
- __be32 *p;
-
- p = reserve_space(xdr, 4);
- *p = cpu_to_be32(OP_SAVEFH);
- hdr->nops++;
- hdr->replen += decode_savefh_maxsz;
+ encode_op_hdr(xdr, OP_SAVEFH, decode_savefh_maxsz, hdr);
}
static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs *arg, const struct nfs_server *server, struct compound_hdr *hdr)
{
- __be32 *p;
-
- p = reserve_space(xdr, 4+NFS4_STATEID_SIZE);
- *p++ = cpu_to_be32(OP_SETATTR);
- xdr_encode_opaque_fixed(p, arg->stateid.data, NFS4_STATEID_SIZE);
- hdr->nops++;
- hdr->replen += decode_setattr_maxsz;
+ encode_op_hdr(xdr, OP_SETATTR, decode_setattr_maxsz, hdr);
+ encode_nfs4_stateid(xdr, &arg->stateid);
encode_attrs(xdr, arg->iap, server);
}
@@ -1688,9 +1627,8 @@ static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclie
{
__be32 *p;
- p = reserve_space(xdr, 4 + NFS4_VERIFIER_SIZE);
- *p++ = cpu_to_be32(OP_SETCLIENTID);
- xdr_encode_opaque_fixed(p, setclientid->sc_verifier->data, NFS4_VERIFIER_SIZE);
+ encode_op_hdr(xdr, OP_SETCLIENTID, decode_setclientid_maxsz, hdr);
+ encode_nfs4_verifier(xdr, setclientid->sc_verifier);
encode_string(xdr, setclientid->sc_name_len, setclientid->sc_name);
p = reserve_space(xdr, 4);
@@ -1699,31 +1637,23 @@ static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclie
encode_string(xdr, setclientid->sc_uaddr_len, setclientid->sc_uaddr);
p = reserve_space(xdr, 4);
*p = cpu_to_be32(setclientid->sc_cb_ident);
- hdr->nops++;
- hdr->replen += decode_setclientid_maxsz;
}
static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs4_setclientid_res *arg, struct compound_hdr *hdr)
{
- __be32 *p;
-
- p = reserve_space(xdr, 12 + NFS4_VERIFIER_SIZE);
- *p++ = cpu_to_be32(OP_SETCLIENTID_CONFIRM);
- p = xdr_encode_hyper(p, arg->clientid);
- xdr_encode_opaque_fixed(p, arg->confirm.data, NFS4_VERIFIER_SIZE);
- hdr->nops++;
- hdr->replen += decode_setclientid_confirm_maxsz;
+ encode_op_hdr(xdr, OP_SETCLIENTID_CONFIRM,
+ decode_setclientid_confirm_maxsz, hdr);
+ encode_uint64(xdr, arg->clientid);
+ encode_nfs4_verifier(xdr, &arg->confirm);
}
static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *args, struct compound_hdr *hdr)
{
__be32 *p;
- p = reserve_space(xdr, 4);
- *p = cpu_to_be32(OP_WRITE);
-
- encode_stateid(xdr, args->context, args->lock_context,
- hdr->minorversion);
+ encode_op_hdr(xdr, OP_WRITE, decode_write_maxsz, hdr);
+ encode_open_stateid(xdr, args->context, args->lock_context,
+ FMODE_WRITE, hdr->minorversion);
p = reserve_space(xdr, 16);
p = xdr_encode_hyper(p, args->offset);
@@ -1731,32 +1661,18 @@ static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *arg
*p = cpu_to_be32(args->count);
xdr_write_pages(xdr, args->pages, args->pgbase, args->count);
- hdr->nops++;
- hdr->replen += decode_write_maxsz;
}
static void encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *stateid, struct compound_hdr *hdr)
{
- __be32 *p;
-
- p = reserve_space(xdr, 4+NFS4_STATEID_SIZE);
-
- *p++ = cpu_to_be32(OP_DELEGRETURN);
- xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE);
- hdr->nops++;
- hdr->replen += decode_delegreturn_maxsz;
+ encode_op_hdr(xdr, OP_DELEGRETURN, decode_delegreturn_maxsz, hdr);
+ encode_nfs4_stateid(xdr, stateid);
}
static void encode_secinfo(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr)
{
- int len = name->len;
- __be32 *p;
-
- p = reserve_space(xdr, 8 + len);
- *p++ = cpu_to_be32(OP_SECINFO);
- xdr_encode_opaque(p, name->name, len);
- hdr->nops++;
- hdr->replen += decode_secinfo_maxsz;
+ encode_op_hdr(xdr, OP_SECINFO, decode_secinfo_maxsz, hdr);
+ encode_string(xdr, name->len, name->name);
}
#if defined(CONFIG_NFS_V4_1)
@@ -1766,19 +1682,39 @@ static void encode_exchange_id(struct xdr_stream *xdr,
struct compound_hdr *hdr)
{
__be32 *p;
+ char impl_name[NFS4_OPAQUE_LIMIT];
+ int len = 0;
- p = reserve_space(xdr, 4 + sizeof(args->verifier->data));
- *p++ = cpu_to_be32(OP_EXCHANGE_ID);
- xdr_encode_opaque_fixed(p, args->verifier->data, sizeof(args->verifier->data));
+ encode_op_hdr(xdr, OP_EXCHANGE_ID, decode_exchange_id_maxsz, hdr);
+ encode_nfs4_verifier(xdr, args->verifier);
encode_string(xdr, args->id_len, args->id);
p = reserve_space(xdr, 12);
*p++ = cpu_to_be32(args->flags);
*p++ = cpu_to_be32(0); /* zero length state_protect4_a */
- *p = cpu_to_be32(0); /* zero length implementation id array */
- hdr->nops++;
- hdr->replen += decode_exchange_id_maxsz;
+
+ if (send_implementation_id &&
+ sizeof(CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN) > 1 &&
+ sizeof(CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN)
+ <= NFS4_OPAQUE_LIMIT + 1)
+ len = snprintf(impl_name, sizeof(impl_name), "%s %s %s %s",
+ utsname()->sysname, utsname()->release,
+ utsname()->version, utsname()->machine);
+
+ if (len > 0) {
+ *p = cpu_to_be32(1); /* implementation id array length=1 */
+
+ encode_string(xdr,
+ sizeof(CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN) - 1,
+ CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN);
+ encode_string(xdr, len, impl_name);
+ /* just send zeros for nii_date - the date is in nii_name */
+ p = reserve_space(xdr, 12);
+ p = xdr_encode_hyper(p, 0);
+ *p = cpu_to_be32(0);
+ } else
+ *p = cpu_to_be32(0); /* implementation id array length=0 */
}
static void encode_create_session(struct xdr_stream *xdr,
@@ -1801,8 +1737,8 @@ static void encode_create_session(struct xdr_stream *xdr,
len = scnprintf(machine_name, sizeof(machine_name), "%s",
clp->cl_ipaddr);
- p = reserve_space(xdr, 20 + 2*28 + 20 + len + 12);
- *p++ = cpu_to_be32(OP_CREATE_SESSION);
+ encode_op_hdr(xdr, OP_CREATE_SESSION, decode_create_session_maxsz, hdr);
+ p = reserve_space(xdr, 16 + 2*28 + 20 + len + 12);
p = xdr_encode_hyper(p, clp->cl_clientid);
*p++ = cpu_to_be32(clp->cl_seqid); /*Sequence id */
*p++ = cpu_to_be32(args->flags); /*flags */
@@ -1835,33 +1771,22 @@ static void encode_create_session(struct xdr_stream *xdr,
*p++ = cpu_to_be32(0); /* UID */
*p++ = cpu_to_be32(0); /* GID */
*p = cpu_to_be32(0); /* No more gids */
- hdr->nops++;
- hdr->replen += decode_create_session_maxsz;
}
static void encode_destroy_session(struct xdr_stream *xdr,
struct nfs4_session *session,
struct compound_hdr *hdr)
{
- __be32 *p;
- p = reserve_space(xdr, 4 + NFS4_MAX_SESSIONID_LEN);
- *p++ = cpu_to_be32(OP_DESTROY_SESSION);
- xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
- hdr->nops++;
- hdr->replen += decode_destroy_session_maxsz;
+ encode_op_hdr(xdr, OP_DESTROY_SESSION, decode_destroy_session_maxsz, hdr);
+ encode_opaque_fixed(xdr, session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
}
static void encode_reclaim_complete(struct xdr_stream *xdr,
struct nfs41_reclaim_complete_args *args,
struct compound_hdr *hdr)
{
- __be32 *p;
-
- p = reserve_space(xdr, 8);
- *p++ = cpu_to_be32(OP_RECLAIM_COMPLETE);
- *p++ = cpu_to_be32(args->one_fs);
- hdr->nops++;
- hdr->replen += decode_reclaim_complete_maxsz;
+ encode_op_hdr(xdr, OP_RECLAIM_COMPLETE, decode_reclaim_complete_maxsz, hdr);
+ encode_uint32(xdr, args->one_fs);
}
#endif /* CONFIG_NFS_V4_1 */
@@ -1883,8 +1808,7 @@ static void encode_sequence(struct xdr_stream *xdr,
WARN_ON(args->sa_slotid == NFS4_MAX_SLOT_TABLE);
slot = tp->slots + args->sa_slotid;
- p = reserve_space(xdr, 4 + NFS4_MAX_SESSIONID_LEN + 16);
- *p++ = cpu_to_be32(OP_SEQUENCE);
+ encode_op_hdr(xdr, OP_SEQUENCE, decode_sequence_maxsz, hdr);
/*
* Sessionid + seqid + slotid + max slotid + cache_this
@@ -1898,13 +1822,12 @@ static void encode_sequence(struct xdr_stream *xdr,
((u32 *)session->sess_id.data)[3],
slot->seq_nr, args->sa_slotid,
tp->highest_used_slotid, args->sa_cache_this);
+ p = reserve_space(xdr, NFS4_MAX_SESSIONID_LEN + 16);
p = xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
*p++ = cpu_to_be32(slot->seq_nr);
*p++ = cpu_to_be32(args->sa_slotid);
*p++ = cpu_to_be32(tp->highest_used_slotid);
*p = cpu_to_be32(args->sa_cache_this);
- hdr->nops++;
- hdr->replen += decode_sequence_maxsz;
#endif /* CONFIG_NFS_V4_1 */
}
@@ -1919,14 +1842,12 @@ encode_getdevicelist(struct xdr_stream *xdr,
.data = "dummmmmy",
};
- p = reserve_space(xdr, 20);
- *p++ = cpu_to_be32(OP_GETDEVICELIST);
+ encode_op_hdr(xdr, OP_GETDEVICELIST, decode_getdevicelist_maxsz, hdr);
+ p = reserve_space(xdr, 16);
*p++ = cpu_to_be32(args->layoutclass);
*p++ = cpu_to_be32(NFS4_PNFS_GETDEVLIST_MAXNUM);
xdr_encode_hyper(p, 0ULL); /* cookie */
encode_nfs4_verifier(xdr, &dummy);
- hdr->nops++;
- hdr->replen += decode_getdevicelist_maxsz;
}
static void
@@ -1936,15 +1857,13 @@ encode_getdeviceinfo(struct xdr_stream *xdr,
{
__be32 *p;
- p = reserve_space(xdr, 16 + NFS4_DEVICEID4_SIZE);
- *p++ = cpu_to_be32(OP_GETDEVICEINFO);
+ encode_op_hdr(xdr, OP_GETDEVICEINFO, decode_getdeviceinfo_maxsz, hdr);
+ p = reserve_space(xdr, 12 + NFS4_DEVICEID4_SIZE);
p = xdr_encode_opaque_fixed(p, args->pdev->dev_id.data,
NFS4_DEVICEID4_SIZE);
*p++ = cpu_to_be32(args->pdev->layout_type);
*p++ = cpu_to_be32(args->pdev->pglen); /* gdia_maxcount */
*p++ = cpu_to_be32(0); /* bitmap length 0 */
- hdr->nops++;
- hdr->replen += decode_getdeviceinfo_maxsz;
}
static void
@@ -1954,16 +1873,16 @@ encode_layoutget(struct xdr_stream *xdr,
{
__be32 *p;
- p = reserve_space(xdr, 44 + NFS4_STATEID_SIZE);
- *p++ = cpu_to_be32(OP_LAYOUTGET);
+ encode_op_hdr(xdr, OP_LAYOUTGET, decode_layoutget_maxsz, hdr);
+ p = reserve_space(xdr, 36);
*p++ = cpu_to_be32(0); /* Signal layout available */
*p++ = cpu_to_be32(args->type);
*p++ = cpu_to_be32(args->range.iomode);
p = xdr_encode_hyper(p, args->range.offset);
p = xdr_encode_hyper(p, args->range.length);
p = xdr_encode_hyper(p, args->minlength);
- p = xdr_encode_opaque_fixed(p, &args->stateid.data, NFS4_STATEID_SIZE);
- *p = cpu_to_be32(args->maxcount);
+ encode_nfs4_stateid(xdr, &args->stateid);
+ encode_uint32(xdr, args->maxcount);
dprintk("%s: 1st type:0x%x iomode:%d off:%lu len:%lu mc:%d\n",
__func__,
@@ -1972,8 +1891,6 @@ encode_layoutget(struct xdr_stream *xdr,
(unsigned long)args->range.offset,
(unsigned long)args->range.length,
args->maxcount);
- hdr->nops++;
- hdr->replen += decode_layoutget_maxsz;
}
static int
@@ -1987,13 +1904,14 @@ encode_layoutcommit(struct xdr_stream *xdr,
dprintk("%s: lbw: %llu type: %d\n", __func__, args->lastbytewritten,
NFS_SERVER(args->inode)->pnfs_curr_ld->id);
- p = reserve_space(xdr, 44 + NFS4_STATEID_SIZE);
- *p++ = cpu_to_be32(OP_LAYOUTCOMMIT);
+ encode_op_hdr(xdr, OP_LAYOUTCOMMIT, decode_layoutcommit_maxsz, hdr);
+ p = reserve_space(xdr, 20);
/* Only whole file layouts */
p = xdr_encode_hyper(p, 0); /* offset */
p = xdr_encode_hyper(p, args->lastbytewritten + 1); /* length */
- *p++ = cpu_to_be32(0); /* reclaim */
- p = xdr_encode_opaque_fixed(p, args->stateid.data, NFS4_STATEID_SIZE);
+ *p = cpu_to_be32(0); /* reclaim */
+ encode_nfs4_stateid(xdr, &args->stateid);
+ p = reserve_space(xdr, 20);
*p++ = cpu_to_be32(1); /* newoffset = TRUE */
p = xdr_encode_hyper(p, args->lastbytewritten);
*p++ = cpu_to_be32(0); /* Never send time_modify_changed */
@@ -2002,13 +1920,9 @@ encode_layoutcommit(struct xdr_stream *xdr,
if (NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit)
NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit(
NFS_I(inode)->layout, xdr, args);
- else {
- p = reserve_space(xdr, 4);
- *p = cpu_to_be32(0); /* no layout-type payload */
- }
+ else
+ encode_uint32(xdr, 0); /* no layout-type payload */
- hdr->nops++;
- hdr->replen += decode_layoutcommit_maxsz;
return 0;
}
@@ -2019,27 +1933,23 @@ encode_layoutreturn(struct xdr_stream *xdr,
{
__be32 *p;
- p = reserve_space(xdr, 20);
- *p++ = cpu_to_be32(OP_LAYOUTRETURN);
+ encode_op_hdr(xdr, OP_LAYOUTRETURN, decode_layoutreturn_maxsz, hdr);
+ p = reserve_space(xdr, 16);
*p++ = cpu_to_be32(0); /* reclaim. always 0 for now */
*p++ = cpu_to_be32(args->layout_type);
*p++ = cpu_to_be32(IOMODE_ANY);
*p = cpu_to_be32(RETURN_FILE);
- p = reserve_space(xdr, 16 + NFS4_STATEID_SIZE);
+ p = reserve_space(xdr, 16);
p = xdr_encode_hyper(p, 0);
p = xdr_encode_hyper(p, NFS4_MAX_UINT64);
spin_lock(&args->inode->i_lock);
- xdr_encode_opaque_fixed(p, &args->stateid.data, NFS4_STATEID_SIZE);
+ encode_nfs4_stateid(xdr, &args->stateid);
spin_unlock(&args->inode->i_lock);
if (NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn) {
NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn(
NFS_I(args->inode)->layout, xdr, args);
- } else {
- p = reserve_space(xdr, 4);
- *p = cpu_to_be32(0);
- }
- hdr->nops++;
- hdr->replen += decode_layoutreturn_maxsz;
+ } else
+ encode_uint32(xdr, 0);
}
static int
@@ -2047,12 +1957,8 @@ encode_secinfo_no_name(struct xdr_stream *xdr,
const struct nfs41_secinfo_no_name_args *args,
struct compound_hdr *hdr)
{
- __be32 *p;
- p = reserve_space(xdr, 8);
- *p++ = cpu_to_be32(OP_SECINFO_NO_NAME);
- *p++ = cpu_to_be32(args->style);
- hdr->nops++;
- hdr->replen += decode_secinfo_no_name_maxsz;
+ encode_op_hdr(xdr, OP_SECINFO_NO_NAME, decode_secinfo_no_name_maxsz, hdr);
+ encode_uint32(xdr, args->style);
return 0;
}
@@ -2060,26 +1966,17 @@ static void encode_test_stateid(struct xdr_stream *xdr,
struct nfs41_test_stateid_args *args,
struct compound_hdr *hdr)
{
- __be32 *p;
-
- p = reserve_space(xdr, 8 + NFS4_STATEID_SIZE);
- *p++ = cpu_to_be32(OP_TEST_STATEID);
- *p++ = cpu_to_be32(1);
- xdr_encode_opaque_fixed(p, args->stateid->data, NFS4_STATEID_SIZE);
- hdr->nops++;
- hdr->replen += decode_test_stateid_maxsz;
+ encode_op_hdr(xdr, OP_TEST_STATEID, decode_test_stateid_maxsz, hdr);
+ encode_uint32(xdr, 1);
+ encode_nfs4_stateid(xdr, args->stateid);
}
static void encode_free_stateid(struct xdr_stream *xdr,
struct nfs41_free_stateid_args *args,
struct compound_hdr *hdr)
{
- __be32 *p;
- p = reserve_space(xdr, 4 + NFS4_STATEID_SIZE);
- *p++ = cpu_to_be32(OP_FREE_STATEID);
- xdr_encode_opaque_fixed(p, args->stateid->data, NFS4_STATEID_SIZE);
- hdr->nops++;
- hdr->replen += decode_free_stateid_maxsz;
+ encode_op_hdr(xdr, OP_FREE_STATEID, decode_free_stateid_maxsz, hdr);
+ encode_nfs4_stateid(xdr, args->stateid);
}
#endif /* CONFIG_NFS_V4_1 */
@@ -2522,7 +2419,6 @@ static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr,
xdr_inline_pages(&req->rq_rcv_buf, replen << 2,
args->acl_pages, args->acl_pgbase, args->acl_len);
- xdr_set_scratch_buffer(xdr, page_address(args->acl_scratch), PAGE_SIZE);
encode_nops(&hdr);
}
@@ -2634,6 +2530,7 @@ static void nfs4_xdr_enc_server_caps(struct rpc_rqst *req,
encode_sequence(xdr, &args->seq_args, &hdr);
encode_putfh(xdr, args->fhandle, &hdr);
encode_getattr_one(xdr, FATTR4_WORD0_SUPPORTED_ATTRS|
+ FATTR4_WORD0_FH_EXPIRE_TYPE|
FATTR4_WORD0_LINK_SUPPORT|
FATTR4_WORD0_SYMLINK_SUPPORT|
FATTR4_WORD0_ACLSUPPORT, &hdr);
@@ -2651,7 +2548,7 @@ static void nfs4_xdr_enc_renew(struct rpc_rqst *req, struct xdr_stream *xdr,
};
encode_compound_hdr(xdr, req, &hdr);
- encode_renew(xdr, clp, &hdr);
+ encode_renew(xdr, clp->cl_clientid, &hdr);
encode_nops(&hdr);
}
@@ -3181,6 +3078,28 @@ out_overflow:
return -EIO;
}
+static int decode_attr_fh_expire_type(struct xdr_stream *xdr,
+ uint32_t *bitmap, uint32_t *type)
+{
+ __be32 *p;
+
+ *type = 0;
+ if (unlikely(bitmap[0] & (FATTR4_WORD0_FH_EXPIRE_TYPE - 1U)))
+ return -EIO;
+ if (likely(bitmap[0] & FATTR4_WORD0_FH_EXPIRE_TYPE)) {
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ *type = be32_to_cpup(p);
+ bitmap[0] &= ~FATTR4_WORD0_FH_EXPIRE_TYPE;
+ }
+ dprintk("%s: expire type=0x%x\n", __func__, *type);
+ return 0;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
+}
+
static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *change)
{
__be32 *p;
@@ -3514,16 +3433,17 @@ static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path)
n = be32_to_cpup(p);
if (n == 0)
goto root_path;
- dprintk("path ");
+ dprintk("pathname4: ");
path->ncomponents = 0;
while (path->ncomponents < n) {
struct nfs4_string *component = &path->components[path->ncomponents];
status = decode_opaque_inline(xdr, &component->len, &component->data);
if (unlikely(status != 0))
goto out_eio;
- if (path->ncomponents != n)
- dprintk("/");
- dprintk("%s", component->data);
+ ifdebug (XDR)
+ pr_cont("%s%.*s ",
+ (path->ncomponents != n ? "/ " : ""),
+ component->len, component->data);
if (path->ncomponents < NFS4_PATHNAME_MAXCOMPONENTS)
path->ncomponents++;
else {
@@ -3532,14 +3452,13 @@ static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path)
}
}
out:
- dprintk("\n");
return status;
root_path:
/* a root pathname is sent as a zero component4 */
path->ncomponents = 1;
path->components[0].len=0;
path->components[0].data=NULL;
- dprintk("path /\n");
+ dprintk("pathname4: /\n");
goto out;
out_eio:
dprintk(" status %d", status);
@@ -3561,7 +3480,11 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
status = 0;
if (unlikely(!(bitmap[0] & FATTR4_WORD0_FS_LOCATIONS)))
goto out;
- dprintk("%s: fsroot ", __func__);
+ status = -EIO;
+ /* Ignore borken servers that return unrequested attrs */
+ if (unlikely(res == NULL))
+ goto out;
+ dprintk("%s: fsroot:\n", __func__);
status = decode_pathname(xdr, &res->fs_path);
if (unlikely(status != 0))
goto out;
@@ -3582,7 +3505,7 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
m = be32_to_cpup(p);
loc->nservers = 0;
- dprintk("%s: servers ", __func__);
+ dprintk("%s: servers:\n", __func__);
while (loc->nservers < m) {
struct nfs4_string *server = &loc->servers[loc->nservers];
status = decode_opaque_inline(xdr, &server->len, &server->data);
@@ -3614,7 +3537,7 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
res->nlocations++;
}
if (res->nlocations != 0)
- status = NFS_ATTR_FATTR_V4_REFERRAL;
+ status = NFS_ATTR_FATTR_V4_LOCATIONS;
out:
dprintk("%s: fs_locations done, error = %d\n", __func__, status);
return status;
@@ -4158,7 +4081,7 @@ static int decode_opaque_fixed(struct xdr_stream *xdr, void *buf, size_t len)
static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
{
- return decode_opaque_fixed(xdr, stateid->data, NFS4_STATEID_SIZE);
+ return decode_opaque_fixed(xdr, stateid, NFS4_STATEID_SIZE);
}
static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res)
@@ -4175,7 +4098,7 @@ static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res)
static int decode_verifier(struct xdr_stream *xdr, void *verifier)
{
- return decode_opaque_fixed(xdr, verifier, 8);
+ return decode_opaque_fixed(xdr, verifier, NFS4_VERIFIER_SIZE);
}
static int decode_commit(struct xdr_stream *xdr, struct nfs_writeres *res)
@@ -4225,6 +4148,9 @@ static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_re
goto xdr_error;
if ((status = decode_attr_supported(xdr, bitmap, res->attr_bitmask)) != 0)
goto xdr_error;
+ if ((status = decode_attr_fh_expire_type(xdr, bitmap,
+ &res->fh_expire_type)) != 0)
+ goto xdr_error;
if ((status = decode_attr_link_support(xdr, bitmap, &res->has_links)) != 0)
goto xdr_error;
if ((status = decode_attr_symlink_support(xdr, bitmap, &res->has_symlinks)) != 0)
@@ -4295,6 +4221,7 @@ xdr_error:
static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
struct nfs_fattr *fattr, struct nfs_fh *fh,
+ struct nfs4_fs_locations *fs_loc,
const struct nfs_server *server)
{
int status;
@@ -4342,9 +4269,7 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
goto xdr_error;
fattr->valid |= status;
- status = decode_attr_fs_locations(xdr, bitmap, container_of(fattr,
- struct nfs4_fs_locations,
- fattr));
+ status = decode_attr_fs_locations(xdr, bitmap, fs_loc);
if (status < 0)
goto xdr_error;
fattr->valid |= status;
@@ -4408,7 +4333,8 @@ xdr_error:
}
static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fattr,
- struct nfs_fh *fh, const struct nfs_server *server)
+ struct nfs_fh *fh, struct nfs4_fs_locations *fs_loc,
+ const struct nfs_server *server)
{
__be32 *savep;
uint32_t attrlen,
@@ -4427,7 +4353,7 @@ static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fat
if (status < 0)
goto xdr_error;
- status = decode_getfattr_attrs(xdr, bitmap, fattr, fh, server);
+ status = decode_getfattr_attrs(xdr, bitmap, fattr, fh, fs_loc, server);
if (status < 0)
goto xdr_error;
@@ -4440,7 +4366,7 @@ xdr_error:
static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
const struct nfs_server *server)
{
- return decode_getfattr_generic(xdr, fattr, NULL, server);
+ return decode_getfattr_generic(xdr, fattr, NULL, NULL, server);
}
/*
@@ -4464,8 +4390,8 @@ static int decode_first_pnfs_layout_type(struct xdr_stream *xdr,
return 0;
}
if (num > 1)
- printk(KERN_INFO "%s: Warning: Multiple pNFS layout drivers "
- "per filesystem not supported\n", __func__);
+ printk(KERN_INFO "NFS: %s: Warning: Multiple pNFS layout "
+ "drivers per filesystem not supported\n", __func__);
/* Decode and set first layout type, move xdr->p past unused types */
p = xdr_inline_decode(xdr, num * 4);
@@ -4864,17 +4790,16 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n
size_t hdrlen;
u32 recvd, pglen = rcvbuf->page_len;
int status;
+ __be32 verf[2];
status = decode_op_hdr(xdr, OP_READDIR);
if (!status)
status = decode_verifier(xdr, readdir->verifier.data);
if (unlikely(status))
return status;
+ memcpy(verf, readdir->verifier.data, sizeof(verf));
dprintk("%s: verifier = %08x:%08x\n",
- __func__,
- ((u32 *)readdir->verifier.data)[0],
- ((u32 *)readdir->verifier.data)[1]);
-
+ __func__, verf[0], verf[1]);
hdrlen = (char *) xdr->p - (char *) iov->iov_base;
recvd = rcvbuf->len - hdrlen;
@@ -5121,7 +5046,7 @@ static int decode_write(struct xdr_stream *xdr, struct nfs_writeres *res)
goto out_overflow;
res->count = be32_to_cpup(p++);
res->verf->committed = be32_to_cpup(p++);
- memcpy(res->verf->verifier, p, 8);
+ memcpy(res->verf->verifier, p, NFS4_VERIFIER_SIZE);
return 0;
out_overflow:
print_overflow_msg(__func__, xdr);
@@ -5215,6 +5140,7 @@ static int decode_exchange_id(struct xdr_stream *xdr,
char *dummy_str;
int status;
struct nfs_client *clp = res->client;
+ uint32_t impl_id_count;
status = decode_op_hdr(xdr, OP_EXCHANGE_ID);
if (status)
@@ -5256,11 +5182,38 @@ static int decode_exchange_id(struct xdr_stream *xdr,
memcpy(res->server_scope->server_scope, dummy_str, dummy);
res->server_scope->server_scope_sz = dummy;
- /* Throw away Implementation id array */
- status = decode_opaque_inline(xdr, &dummy, &dummy_str);
- if (unlikely(status))
- return status;
+ /* Implementation Id */
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ impl_id_count = be32_to_cpup(p++);
+
+ if (impl_id_count) {
+ /* nii_domain */
+ status = decode_opaque_inline(xdr, &dummy, &dummy_str);
+ if (unlikely(status))
+ return status;
+ if (unlikely(dummy > NFS4_OPAQUE_LIMIT))
+ return -EIO;
+ memcpy(res->impl_id->domain, dummy_str, dummy);
+
+ /* nii_name */
+ status = decode_opaque_inline(xdr, &dummy, &dummy_str);
+ if (unlikely(status))
+ return status;
+ if (unlikely(dummy > NFS4_OPAQUE_LIMIT))
+ return -EIO;
+ memcpy(res->impl_id->name, dummy_str, dummy);
+ /* nii_date */
+ p = xdr_inline_decode(xdr, 12);
+ if (unlikely(!p))
+ goto out_overflow;
+ p = xdr_decode_hyper(p, &res->impl_id->date.seconds);
+ res->impl_id->date.nseconds = be32_to_cpup(p);
+
+ /* if there's more than one entry, ignore the rest */
+ }
return 0;
out_overflow:
print_overflow_msg(__func__, xdr);
@@ -5286,8 +5239,8 @@ static int decode_chan_attrs(struct xdr_stream *xdr,
attrs->max_reqs = be32_to_cpup(p++);
nr_attrs = be32_to_cpup(p);
if (unlikely(nr_attrs > 1)) {
- printk(KERN_WARNING "%s: Invalid rdma channel attrs count %u\n",
- __func__, nr_attrs);
+ printk(KERN_WARNING "NFS: %s: Invalid rdma channel attrs "
+ "count %u\n", __func__, nr_attrs);
return -EINVAL;
}
if (nr_attrs == 1) {
@@ -5437,14 +5390,14 @@ static int decode_getdevicelist(struct xdr_stream *xdr,
p += 2;
/* Read verifier */
- p = xdr_decode_opaque_fixed(p, verftemp.verifier, 8);
+ p = xdr_decode_opaque_fixed(p, verftemp.verifier, NFS4_VERIFIER_SIZE);
res->num_devs = be32_to_cpup(p);
dprintk("%s: num_dev %d\n", __func__, res->num_devs);
if (res->num_devs > NFS4_PNFS_GETDEVLIST_MAXNUM) {
- printk(KERN_ERR "%s too many result dev_num %u\n",
+ printk(KERN_ERR "NFS: %s too many result dev_num %u\n",
__func__, res->num_devs);
return -EIO;
}
@@ -5538,11 +5491,14 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
status = decode_op_hdr(xdr, OP_LAYOUTGET);
if (status)
return status;
- p = xdr_inline_decode(xdr, 8 + NFS4_STATEID_SIZE);
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ res->return_on_close = be32_to_cpup(p);
+ decode_stateid(xdr, &res->stateid);
+ p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
goto out_overflow;
- res->return_on_close = be32_to_cpup(p++);
- p = xdr_decode_opaque_fixed(p, res->stateid.data, NFS4_STATEID_SIZE);
layout_count = be32_to_cpup(p);
if (!layout_count) {
dprintk("%s: server responded with empty layout array\n",
@@ -5667,7 +5623,8 @@ static int decode_test_stateid(struct xdr_stream *xdr,
if (unlikely(!p))
goto out_overflow;
res->status = be32_to_cpup(p++);
- return res->status;
+
+ return status;
out_overflow:
print_overflow_msg(__func__, xdr);
out:
@@ -6032,6 +5989,10 @@ nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
struct compound_hdr hdr;
int status;
+ if (res->acl_scratch != NULL) {
+ void *p = page_address(res->acl_scratch);
+ xdr_set_scratch_buffer(xdr, p, PAGE_SIZE);
+ }
status = decode_compound_hdr(xdr, &hdr);
if (status)
goto out;
@@ -6580,8 +6541,9 @@ static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req,
if (status)
goto out;
xdr_enter_page(xdr, PAGE_SIZE);
- status = decode_getfattr(xdr, &res->fs_locations->fattr,
- res->fs_locations->server);
+ status = decode_getfattr_generic(xdr, &res->fs_locations->fattr,
+ NULL, res->fs_locations,
+ res->fs_locations->server);
out:
return status;
}
@@ -6961,7 +6923,7 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
goto out_overflow;
if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh,
- entry->server) < 0)
+ NULL, entry->server) < 0)
goto out_overflow;
if (entry->fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID)
entry->ino = entry->fattr->mounted_on_fileid;
@@ -7109,7 +7071,7 @@ struct rpc_procinfo nfs4_procedures[] = {
#endif /* CONFIG_NFS_V4_1 */
};
-struct rpc_version nfs_version4 = {
+const struct rpc_version nfs_version4 = {
.number = 4,
.nrprocs = ARRAY_SIZE(nfs4_procedures),
.procs = nfs4_procedures
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index c4744e1d513..cd3c910d2d1 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -104,7 +104,7 @@ static char nfs_export_path[NFS_MAXPATHLEN + 1] __initdata = "";
/* server:export path string passed to super.c */
static char nfs_root_device[NFS_MAXPATHLEN + 1] __initdata = "";
-#ifdef RPC_DEBUG
+#ifdef NFS_DEBUG
/*
* When the "nfsrootdebug" kernel command line option is specified,
* enable debugging messages for NFSROOT.
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index 55d01280a60..4bff4a3dab4 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -137,6 +137,7 @@ static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay,
struct objio_dev_ent *ode;
struct osd_dev *od;
struct osd_dev_info odi;
+ bool retry_flag = true;
int err;
ode = _dev_list_find(NFS_SERVER(pnfslay->plh_inode), d_id);
@@ -171,10 +172,18 @@ static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay,
goto out;
}
+retry_lookup:
od = osduld_info_lookup(&odi);
if (unlikely(IS_ERR(od))) {
err = PTR_ERR(od);
dprintk("%s: osduld_info_lookup => %d\n", __func__, err);
+ if (err == -ENODEV && retry_flag) {
+ err = objlayout_autologin(deviceaddr);
+ if (likely(!err)) {
+ retry_flag = false;
+ goto retry_lookup;
+ }
+ }
goto out;
}
@@ -205,25 +214,36 @@ static void copy_single_comp(struct ore_components *oc, unsigned c,
int __alloc_objio_seg(unsigned numdevs, gfp_t gfp_flags,
struct objio_segment **pseg)
{
- struct __alloc_objio_segment {
- struct objio_segment olseg;
- struct ore_dev *ods[numdevs];
- struct ore_comp comps[numdevs];
- } *aolseg;
-
- aolseg = kzalloc(sizeof(*aolseg), gfp_flags);
- if (unlikely(!aolseg)) {
+/* This is the in memory structure of the objio_segment
+ *
+ * struct __alloc_objio_segment {
+ * struct objio_segment olseg;
+ * struct ore_dev *ods[numdevs];
+ * struct ore_comp comps[numdevs];
+ * } *aolseg;
+ * NOTE: The code as above compiles and runs perfectly. It is elegant,
+ * type safe and compact. At some Past time Linus has decided he does not
+ * like variable length arrays, For the sake of this principal we uglify
+ * the code as below.
+ */
+ struct objio_segment *lseg;
+ size_t lseg_size = sizeof(*lseg) +
+ numdevs * sizeof(lseg->oc.ods[0]) +
+ numdevs * sizeof(*lseg->oc.comps);
+
+ lseg = kzalloc(lseg_size, gfp_flags);
+ if (unlikely(!lseg)) {
dprintk("%s: Faild allocation numdevs=%d size=%zd\n", __func__,
- numdevs, sizeof(*aolseg));
+ numdevs, lseg_size);
return -ENOMEM;
}
- aolseg->olseg.oc.numdevs = numdevs;
- aolseg->olseg.oc.single_comp = EC_MULTPLE_COMPS;
- aolseg->olseg.oc.comps = aolseg->comps;
- aolseg->olseg.oc.ods = aolseg->ods;
+ lseg->oc.numdevs = numdevs;
+ lseg->oc.single_comp = EC_MULTPLE_COMPS;
+ lseg->oc.ods = (void *)(lseg + 1);
+ lseg->oc.comps = (void *)(lseg->oc.ods + numdevs);
- *pseg = &aolseg->olseg;
+ *pseg = lseg;
return 0;
}
@@ -582,10 +602,10 @@ objlayout_init(void)
if (ret)
printk(KERN_INFO
- "%s: Registering OSD pNFS Layout Driver failed: error=%d\n",
+ "NFS: %s: Registering OSD pNFS Layout Driver failed: error=%d\n",
__func__, ret);
else
- printk(KERN_INFO "%s: Registered OSD pNFS Layout Driver\n",
+ printk(KERN_INFO "NFS: %s: Registered OSD pNFS Layout Driver\n",
__func__);
return ret;
}
@@ -594,7 +614,7 @@ static void __exit
objlayout_exit(void)
{
pnfs_unregister_layoutdriver(&objlayout_type);
- printk(KERN_INFO "%s: Unregistered OSD pNFS Layout Driver\n",
+ printk(KERN_INFO "NFS: %s: Unregistered OSD pNFS Layout Driver\n",
__func__);
}
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
index b3c29039f5b..8d45f1c318c 100644
--- a/fs/nfs/objlayout/objlayout.c
+++ b/fs/nfs/objlayout/objlayout.c
@@ -37,6 +37,9 @@
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
+#include <linux/kmod.h>
+#include <linux/moduleparam.h>
+#include <linux/ratelimit.h>
#include <scsi/osd_initiator.h>
#include "objlayout.h"
@@ -156,7 +159,7 @@ last_byte_offset(u64 start, u64 len)
return end > start ? end - 1 : NFS4_MAX_UINT64;
}
-void _fix_verify_io_params(struct pnfs_layout_segment *lseg,
+static void _fix_verify_io_params(struct pnfs_layout_segment *lseg,
struct page ***p_pages, unsigned *p_pgbase,
u64 offset, unsigned long count)
{
@@ -490,9 +493,9 @@ encode_accumulated_error(struct objlayout *objlay, __be32 *p)
if (!ioerr->oer_errno)
continue;
- printk(KERN_ERR "%s: err[%d]: errno=%d is_write=%d "
- "dev(%llx:%llx) par=0x%llx obj=0x%llx "
- "offset=0x%llx length=0x%llx\n",
+ printk(KERN_ERR "NFS: %s: err[%d]: errno=%d "
+ "is_write=%d dev(%llx:%llx) par=0x%llx "
+ "obj=0x%llx offset=0x%llx length=0x%llx\n",
__func__, i, ioerr->oer_errno,
ioerr->oer_iswrite,
_DEVID_LO(&ioerr->oer_component.oid_device_id),
@@ -651,3 +654,134 @@ void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr)
__free_page(odi->page);
kfree(odi);
}
+
+enum {
+ OBJLAYOUT_MAX_URI_LEN = 256, OBJLAYOUT_MAX_OSDNAME_LEN = 64,
+ OBJLAYOUT_MAX_SYSID_HEX_LEN = OSD_SYSTEMID_LEN * 2 + 1,
+ OSD_LOGIN_UPCALL_PATHLEN = 256
+};
+
+static char osd_login_prog[OSD_LOGIN_UPCALL_PATHLEN] = "/sbin/osd_login";
+
+module_param_string(osd_login_prog, osd_login_prog, sizeof(osd_login_prog),
+ 0600);
+MODULE_PARM_DESC(osd_login_prog, "Path to the osd_login upcall program");
+
+struct __auto_login {
+ char uri[OBJLAYOUT_MAX_URI_LEN];
+ char osdname[OBJLAYOUT_MAX_OSDNAME_LEN];
+ char systemid_hex[OBJLAYOUT_MAX_SYSID_HEX_LEN];
+};
+
+static int __objlayout_upcall(struct __auto_login *login)
+{
+ static char *envp[] = { "HOME=/",
+ "TERM=linux",
+ "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
+ NULL
+ };
+ char *argv[8];
+ int ret;
+
+ if (unlikely(!osd_login_prog[0])) {
+ dprintk("%s: osd_login_prog is disabled\n", __func__);
+ return -EACCES;
+ }
+
+ dprintk("%s uri: %s\n", __func__, login->uri);
+ dprintk("%s osdname %s\n", __func__, login->osdname);
+ dprintk("%s systemid_hex %s\n", __func__, login->systemid_hex);
+
+ argv[0] = (char *)osd_login_prog;
+ argv[1] = "-u";
+ argv[2] = login->uri;
+ argv[3] = "-o";
+ argv[4] = login->osdname;
+ argv[5] = "-s";
+ argv[6] = login->systemid_hex;
+ argv[7] = NULL;
+
+ ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
+ /*
+ * Disable the upcall mechanism if we're getting an ENOENT or
+ * EACCES error. The admin can re-enable it on the fly by using
+ * sysfs to set the objlayoutdriver.osd_login_prog module parameter once
+ * the problem has been fixed.
+ */
+ if (ret == -ENOENT || ret == -EACCES) {
+ printk(KERN_ERR "PNFS-OBJ: %s was not found please set "
+ "objlayoutdriver.osd_login_prog kernel parameter!\n",
+ osd_login_prog);
+ osd_login_prog[0] = '\0';
+ }
+ dprintk("%s %s return value: %d\n", __func__, osd_login_prog, ret);
+
+ return ret;
+}
+
+/* Assume dest is all zeros */
+static void __copy_nfsS_and_zero_terminate(struct nfs4_string s,
+ char *dest, int max_len,
+ const char *var_name)
+{
+ if (!s.len)
+ return;
+
+ if (s.len >= max_len) {
+ pr_warn_ratelimited(
+ "objlayout_autologin: %s: s.len(%d) >= max_len(%d)",
+ var_name, s.len, max_len);
+ s.len = max_len - 1; /* space for null terminator */
+ }
+
+ memcpy(dest, s.data, s.len);
+}
+
+/* Assume sysid is all zeros */
+static void _sysid_2_hex(struct nfs4_string s,
+ char sysid[OBJLAYOUT_MAX_SYSID_HEX_LEN])
+{
+ int i;
+ char *cur;
+
+ if (!s.len)
+ return;
+
+ if (s.len != OSD_SYSTEMID_LEN) {
+ pr_warn_ratelimited(
+ "objlayout_autologin: systemid_len(%d) != OSD_SYSTEMID_LEN",
+ s.len);
+ if (s.len > OSD_SYSTEMID_LEN)
+ s.len = OSD_SYSTEMID_LEN;
+ }
+
+ cur = sysid;
+ for (i = 0; i < s.len; i++)
+ cur = hex_byte_pack(cur, s.data[i]);
+}
+
+int objlayout_autologin(struct pnfs_osd_deviceaddr *deviceaddr)
+{
+ int rc;
+ struct __auto_login login;
+
+ if (!deviceaddr->oda_targetaddr.ota_netaddr.r_addr.len)
+ return -ENODEV;
+
+ memset(&login, 0, sizeof(login));
+ __copy_nfsS_and_zero_terminate(
+ deviceaddr->oda_targetaddr.ota_netaddr.r_addr,
+ login.uri, sizeof(login.uri), "URI");
+
+ __copy_nfsS_and_zero_terminate(
+ deviceaddr->oda_osdname,
+ login.osdname, sizeof(login.osdname), "OSDNAME");
+
+ _sysid_2_hex(deviceaddr->oda_systemid, login.systemid_hex);
+
+ rc = __objlayout_upcall(&login);
+ if (rc > 0) /* script returns positive values */
+ rc = -ENODEV;
+
+ return rc;
+}
diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h
index 8ec34727ed2..880ba086be9 100644
--- a/fs/nfs/objlayout/objlayout.h
+++ b/fs/nfs/objlayout/objlayout.h
@@ -184,4 +184,6 @@ extern void objlayout_encode_layoutreturn(
struct xdr_stream *,
const struct nfs4_layoutreturn_args *);
+extern int objlayout_autologin(struct pnfs_osd_deviceaddr *deviceaddr);
+
#endif /* _OBJLAYOUT_H */
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 5668f7c54c4..d21fceaa9f6 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -13,6 +13,7 @@
#include <linux/file.h>
#include <linux/sched.h>
#include <linux/sunrpc/clnt.h>
+#include <linux/nfs.h>
#include <linux/nfs3.h>
#include <linux/nfs4.h>
#include <linux/nfs_page.h>
@@ -106,36 +107,6 @@ void nfs_unlock_request(struct nfs_page *req)
nfs_release_request(req);
}
-/**
- * nfs_set_page_tag_locked - Tag a request as locked
- * @req:
- */
-int nfs_set_page_tag_locked(struct nfs_page *req)
-{
- if (!nfs_lock_request_dontget(req))
- return 0;
- if (test_bit(PG_MAPPED, &req->wb_flags))
- radix_tree_tag_set(&NFS_I(req->wb_context->dentry->d_inode)->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED);
- return 1;
-}
-
-/**
- * nfs_clear_page_tag_locked - Clear request tag and wake up sleepers
- */
-void nfs_clear_page_tag_locked(struct nfs_page *req)
-{
- if (test_bit(PG_MAPPED, &req->wb_flags)) {
- struct inode *inode = req->wb_context->dentry->d_inode;
- struct nfs_inode *nfsi = NFS_I(inode);
-
- spin_lock(&inode->i_lock);
- radix_tree_tag_clear(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED);
- nfs_unlock_request(req);
- spin_unlock(&inode->i_lock);
- } else
- nfs_unlock_request(req);
-}
-
/*
* nfs_clear_request - Free up all resources allocated to the request
* @req:
@@ -425,67 +396,6 @@ void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index)
}
}
-#define NFS_SCAN_MAXENTRIES 16
-/**
- * nfs_scan_list - Scan a list for matching requests
- * @nfsi: NFS inode
- * @dst: Destination list
- * @idx_start: lower bound of page->index to scan
- * @npages: idx_start + npages sets the upper bound to scan.
- * @tag: tag to scan for
- *
- * Moves elements from one of the inode request lists.
- * If the number of requests is set to 0, the entire address_space
- * starting at index idx_start, is scanned.
- * The requests are *not* checked to ensure that they form a contiguous set.
- * You must be holding the inode's i_lock when calling this function
- */
-int nfs_scan_list(struct nfs_inode *nfsi,
- struct list_head *dst, pgoff_t idx_start,
- unsigned int npages, int tag)
-{
- struct nfs_page *pgvec[NFS_SCAN_MAXENTRIES];
- struct nfs_page *req;
- pgoff_t idx_end;
- int found, i;
- int res;
- struct list_head *list;
-
- res = 0;
- if (npages == 0)
- idx_end = ~0;
- else
- idx_end = idx_start + npages - 1;
-
- for (;;) {
- found = radix_tree_gang_lookup_tag(&nfsi->nfs_page_tree,
- (void **)&pgvec[0], idx_start,
- NFS_SCAN_MAXENTRIES, tag);
- if (found <= 0)
- break;
- for (i = 0; i < found; i++) {
- req = pgvec[i];
- if (req->wb_index > idx_end)
- goto out;
- idx_start = req->wb_index + 1;
- if (nfs_set_page_tag_locked(req)) {
- kref_get(&req->wb_kref);
- radix_tree_tag_clear(&nfsi->nfs_page_tree,
- req->wb_index, tag);
- list = pnfs_choose_commit_list(req, dst);
- nfs_list_add_request(req, list);
- res++;
- if (res == INT_MAX)
- goto out;
- }
- }
- /* for latency reduction */
- cond_resched_lock(&nfsi->vfs_inode.i_lock);
- }
-out:
- return res;
-}
-
int __init nfs_init_nfspagecache(void)
{
nfs_page_cachep = kmem_cache_create("nfs_page",
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 17149a49006..b5d45158694 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -101,8 +101,8 @@ set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh,
goto out_no_driver;
if (!(server->nfs_client->cl_exchange_flags &
(EXCHGID4_FLAG_USE_NON_PNFS | EXCHGID4_FLAG_USE_PNFS_MDS))) {
- printk(KERN_ERR "%s: id %u cl_exchange_flags 0x%x\n", __func__,
- id, server->nfs_client->cl_exchange_flags);
+ printk(KERN_ERR "NFS: %s: id %u cl_exchange_flags 0x%x\n",
+ __func__, id, server->nfs_client->cl_exchange_flags);
goto out_no_driver;
}
ld_type = find_pnfs_driver(id);
@@ -122,8 +122,8 @@ set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh,
server->pnfs_curr_ld = ld_type;
if (ld_type->set_layoutdriver
&& ld_type->set_layoutdriver(server, mntfh)) {
- printk(KERN_ERR "%s: Error initializing pNFS layout driver %u.\n",
- __func__, id);
+ printk(KERN_ERR "NFS: %s: Error initializing pNFS layout "
+ "driver %u.\n", __func__, id);
module_put(ld_type->owner);
goto out_no_driver;
}
@@ -143,11 +143,11 @@ pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
struct pnfs_layoutdriver_type *tmp;
if (ld_type->id == 0) {
- printk(KERN_ERR "%s id 0 is reserved\n", __func__);
+ printk(KERN_ERR "NFS: %s id 0 is reserved\n", __func__);
return status;
}
if (!ld_type->alloc_lseg || !ld_type->free_lseg) {
- printk(KERN_ERR "%s Layout driver must provide "
+ printk(KERN_ERR "NFS: %s Layout driver must provide "
"alloc_lseg and free_lseg.\n", __func__);
return status;
}
@@ -160,7 +160,7 @@ pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
dprintk("%s Registering id:%u name:%s\n", __func__, ld_type->id,
ld_type->name);
} else {
- printk(KERN_ERR "%s Module with id %d already loaded!\n",
+ printk(KERN_ERR "NFS: %s Module with id %d already loaded!\n",
__func__, ld_type->id);
}
spin_unlock(&pnfs_spinlock);
@@ -496,12 +496,12 @@ pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
{
u32 oldseq, newseq;
- oldseq = be32_to_cpu(lo->plh_stateid.stateid.seqid);
- newseq = be32_to_cpu(new->stateid.seqid);
+ oldseq = be32_to_cpu(lo->plh_stateid.seqid);
+ newseq = be32_to_cpu(new->seqid);
if ((int)(newseq - oldseq) > 0) {
- memcpy(&lo->plh_stateid, &new->stateid, sizeof(new->stateid));
+ nfs4_stateid_copy(&lo->plh_stateid, new);
if (update_barrier) {
- u32 new_barrier = be32_to_cpu(new->stateid.seqid);
+ u32 new_barrier = be32_to_cpu(new->seqid);
if ((int)(new_barrier - lo->plh_barrier))
lo->plh_barrier = new_barrier;
@@ -525,7 +525,7 @@ pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid,
int lget)
{
if ((stateid) &&
- (int)(lo->plh_barrier - be32_to_cpu(stateid->stateid.seqid)) >= 0)
+ (int)(lo->plh_barrier - be32_to_cpu(stateid->seqid)) >= 0)
return true;
return lo->plh_block_lgets ||
test_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags) ||
@@ -549,11 +549,10 @@ pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
do {
seq = read_seqbegin(&open_state->seqlock);
- memcpy(dst->data, open_state->stateid.data,
- sizeof(open_state->stateid.data));
+ nfs4_stateid_copy(dst, &open_state->stateid);
} while (read_seqretry(&open_state->seqlock, seq));
} else
- memcpy(dst->data, lo->plh_stateid.data, sizeof(lo->plh_stateid.data));
+ nfs4_stateid_copy(dst, &lo->plh_stateid);
spin_unlock(&lo->plh_inode->i_lock);
dprintk("<-- %s\n", __func__);
return status;
@@ -590,7 +589,7 @@ send_layoutget(struct pnfs_layout_hdr *lo,
max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
max_pages = max_resp_sz >> PAGE_SHIFT;
- pages = kzalloc(max_pages * sizeof(struct page *), gfp_flags);
+ pages = kcalloc(max_pages, sizeof(struct page *), gfp_flags);
if (!pages)
goto out_err_free;
@@ -760,7 +759,7 @@ bool pnfs_roc_drain(struct inode *ino, u32 *barrier)
}
if (!found) {
struct pnfs_layout_hdr *lo = nfsi->layout;
- u32 current_seqid = be32_to_cpu(lo->plh_stateid.stateid.seqid);
+ u32 current_seqid = be32_to_cpu(lo->plh_stateid.seqid);
/* Since close does not return a layout stateid for use as
* a barrier, we choose the worst-case barrier.
@@ -966,8 +965,7 @@ pnfs_update_layout(struct inode *ino,
}
/* Do we even need to bother with this? */
- if (test_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state) ||
- test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
+ if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
dprintk("%s matches recall, use MDS\n", __func__);
goto out_unlock;
}
@@ -1032,7 +1030,6 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
struct nfs4_layoutget_res *res = &lgp->res;
struct pnfs_layout_segment *lseg;
struct inode *ino = lo->plh_inode;
- struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
int status = 0;
/* Inject layout blob into I/O device driver */
@@ -1048,8 +1045,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
}
spin_lock(&ino->i_lock);
- if (test_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state) ||
- test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
+ if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
dprintk("%s forget reply due to recall\n", __func__);
goto out_forget_reply;
}
@@ -1214,6 +1210,7 @@ void pnfs_ld_write_done(struct nfs_write_data *data)
}
data->task.tk_status = pnfs_write_done_resend_to_mds(data->inode, &data->pages);
}
+ put_lseg(data->lseg);
data->mds_ops->rpc_release(data);
}
EXPORT_SYMBOL_GPL(pnfs_ld_write_done);
@@ -1227,6 +1224,7 @@ pnfs_write_through_mds(struct nfs_pageio_descriptor *desc,
nfs_list_add_request(data->req, &desc->pg_list);
nfs_pageio_reset_write_mds(desc);
desc->pg_recoalesce = 1;
+ put_lseg(data->lseg);
nfs_writedata_release(data);
}
@@ -1327,6 +1325,7 @@ void pnfs_ld_read_done(struct nfs_read_data *data)
data->mds_ops->rpc_call_done(&data->task, data);
} else
pnfs_ld_handle_read_error(data);
+ put_lseg(data->lseg);
data->mds_ops->rpc_release(data);
}
EXPORT_SYMBOL_GPL(pnfs_ld_read_done);
@@ -1530,8 +1529,7 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
end_pos = nfsi->layout->plh_lwb;
nfsi->layout->plh_lwb = 0;
- memcpy(&data->args.stateid.data, nfsi->layout->plh_stateid.data,
- sizeof(nfsi->layout->plh_stateid.data));
+ nfs4_stateid_copy(&data->args.stateid, &nfsi->layout->plh_stateid);
spin_unlock(&inode->i_lock);
data->args.inode = inode;
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 53d593a0a4f..442ebf68eee 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -94,11 +94,10 @@ struct pnfs_layoutdriver_type {
const struct nfs_pageio_ops *pg_read_ops;
const struct nfs_pageio_ops *pg_write_ops;
- /* Returns true if layoutdriver wants to divert this request to
- * driver's commit routine.
- */
- bool (*mark_pnfs_commit)(struct pnfs_layout_segment *lseg);
- struct list_head * (*choose_commit_list) (struct nfs_page *req);
+ void (*mark_request_commit) (struct nfs_page *req,
+ struct pnfs_layout_segment *lseg);
+ void (*clear_request_commit) (struct nfs_page *req);
+ int (*scan_commit_lists) (struct inode *inode, int max, spinlock_t *lock);
int (*commit_pagelist)(struct inode *inode, struct list_head *mds_pages, int how);
/*
@@ -229,7 +228,6 @@ struct nfs4_deviceid_node {
atomic_t ref;
};
-void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id);
struct nfs4_deviceid_node *nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *);
void nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *);
void nfs4_init_deviceid_node(struct nfs4_deviceid_node *,
@@ -262,20 +260,6 @@ static inline int pnfs_enabled_sb(struct nfs_server *nfss)
return nfss->pnfs_curr_ld != NULL;
}
-static inline void
-pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg)
-{
- if (lseg) {
- struct pnfs_layoutdriver_type *ld;
-
- ld = NFS_SERVER(req->wb_page->mapping->host)->pnfs_curr_ld;
- if (ld->mark_pnfs_commit && ld->mark_pnfs_commit(lseg)) {
- set_bit(PG_PNFS_COMMIT, &req->wb_flags);
- req->wb_commit_lseg = get_lseg(lseg);
- }
- }
-}
-
static inline int
pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int how)
{
@@ -284,27 +268,42 @@ pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int how)
return NFS_SERVER(inode)->pnfs_curr_ld->commit_pagelist(inode, mds_pages, how);
}
-static inline struct list_head *
-pnfs_choose_commit_list(struct nfs_page *req, struct list_head *mds)
+static inline bool
+pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg)
{
- struct list_head *rv;
+ struct inode *inode = req->wb_context->dentry->d_inode;
+ struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
- if (test_and_clear_bit(PG_PNFS_COMMIT, &req->wb_flags)) {
- struct inode *inode = req->wb_commit_lseg->pls_layout->plh_inode;
+ if (lseg == NULL || ld->mark_request_commit == NULL)
+ return false;
+ ld->mark_request_commit(req, lseg);
+ return true;
+}
- set_bit(NFS_INO_PNFS_COMMIT, &NFS_I(inode)->flags);
- rv = NFS_SERVER(inode)->pnfs_curr_ld->choose_commit_list(req);
- /* matched by ref taken when PG_PNFS_COMMIT is set */
- put_lseg(req->wb_commit_lseg);
- } else
- rv = mds;
- return rv;
+static inline bool
+pnfs_clear_request_commit(struct nfs_page *req)
+{
+ struct inode *inode = req->wb_context->dentry->d_inode;
+ struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
+
+ if (ld == NULL || ld->clear_request_commit == NULL)
+ return false;
+ ld->clear_request_commit(req);
+ return true;
}
-static inline void pnfs_clear_request_commit(struct nfs_page *req)
+static inline int
+pnfs_scan_commit_lists(struct inode *inode, int max, spinlock_t *lock)
{
- if (test_and_clear_bit(PG_PNFS_COMMIT, &req->wb_flags))
- put_lseg(req->wb_commit_lseg);
+ struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
+ int ret;
+
+ if (ld == NULL || ld->scan_commit_lists == NULL)
+ return 0;
+ ret = ld->scan_commit_lists(inode, max, lock);
+ if (ret != 0)
+ set_bit(NFS_INO_PNFS_COMMIT, &NFS_I(inode)->flags);
+ return ret;
}
/* Should the pNFS client commit and return the layout upon a setattr */
@@ -328,6 +327,13 @@ static inline int pnfs_return_layout(struct inode *ino)
return 0;
}
+#ifdef NFS_DEBUG
+void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id);
+#else
+static inline void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id)
+{
+}
+#endif /* NFS_DEBUG */
#else /* CONFIG_NFS_V4_1 */
static inline void pnfs_destroy_all_layouts(struct nfs_client *clp)
@@ -400,35 +406,35 @@ static inline bool pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, st
return false;
}
-static inline void
-pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg)
-{
-}
-
static inline int
pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int how)
{
return PNFS_NOT_ATTEMPTED;
}
-static inline struct list_head *
-pnfs_choose_commit_list(struct nfs_page *req, struct list_head *mds)
+static inline bool
+pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg)
{
- return mds;
+ return false;
}
-static inline void pnfs_clear_request_commit(struct nfs_page *req)
+static inline bool
+pnfs_clear_request_commit(struct nfs_page *req)
{
+ return false;
}
-static inline int pnfs_layoutcommit_inode(struct inode *inode, bool sync)
+static inline int
+pnfs_scan_commit_lists(struct inode *inode, int max, spinlock_t *lock)
{
return 0;
}
-static inline void nfs4_deviceid_purge_client(struct nfs_client *ncl)
+static inline int pnfs_layoutcommit_inode(struct inode *inode, bool sync)
{
+ return 0;
}
+
#endif /* CONFIG_NFS_V4_1 */
#endif /* FS_NFS_PNFS_H */
diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c
index 4f359d2a26e..73f701f1f4d 100644
--- a/fs/nfs/pnfs_dev.c
+++ b/fs/nfs/pnfs_dev.c
@@ -43,6 +43,7 @@
static struct hlist_head nfs4_deviceid_cache[NFS4_DEVICE_ID_HASH_SIZE];
static DEFINE_SPINLOCK(nfs4_deviceid_lock);
+#ifdef NFS_DEBUG
void
nfs4_print_deviceid(const struct nfs4_deviceid *id)
{
@@ -52,6 +53,7 @@ nfs4_print_deviceid(const struct nfs4_deviceid *id)
p[0], p[1], p[2], p[3]);
}
EXPORT_SYMBOL_GPL(nfs4_print_deviceid);
+#endif
static inline u32
nfs4_deviceid_hash(const struct nfs4_deviceid *id)
@@ -92,7 +94,7 @@ _lookup_deviceid(const struct pnfs_layoutdriver_type *ld,
* @clp nfs_client associated with deviceid
* @id deviceid to look up
*/
-struct nfs4_deviceid_node *
+static struct nfs4_deviceid_node *
_find_get_deviceid(const struct pnfs_layoutdriver_type *ld,
const struct nfs_client *clp, const struct nfs4_deviceid *id,
long hash)
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 0c672588fe5..b63b6f4d14f 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -358,6 +358,11 @@ nfs_proc_unlink_setup(struct rpc_message *msg, struct inode *dir)
msg->rpc_proc = &nfs_procedures[NFSPROC_REMOVE];
}
+static void nfs_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlinkdata *data)
+{
+ rpc_call_start(task);
+}
+
static int nfs_proc_unlink_done(struct rpc_task *task, struct inode *dir)
{
if (nfs_async_handle_expired_key(task))
@@ -372,6 +377,11 @@ nfs_proc_rename_setup(struct rpc_message *msg, struct inode *dir)
msg->rpc_proc = &nfs_procedures[NFSPROC_RENAME];
}
+static void nfs_proc_rename_rpc_prepare(struct rpc_task *task, struct nfs_renamedata *data)
+{
+ rpc_call_start(task);
+}
+
static int
nfs_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
struct inode *new_dir)
@@ -651,6 +661,11 @@ static void nfs_proc_read_setup(struct nfs_read_data *data, struct rpc_message *
msg->rpc_proc = &nfs_procedures[NFSPROC_READ];
}
+static void nfs_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data)
+{
+ rpc_call_start(task);
+}
+
static int nfs_write_done(struct rpc_task *task, struct nfs_write_data *data)
{
if (nfs_async_handle_expired_key(task))
@@ -668,6 +683,11 @@ static void nfs_proc_write_setup(struct nfs_write_data *data, struct rpc_message
msg->rpc_proc = &nfs_procedures[NFSPROC_WRITE];
}
+static void nfs_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data)
+{
+ rpc_call_start(task);
+}
+
static void
nfs_proc_commit_setup(struct nfs_write_data *data, struct rpc_message *msg)
{
@@ -721,9 +741,11 @@ const struct nfs_rpc_ops nfs_v2_clientops = {
.create = nfs_proc_create,
.remove = nfs_proc_remove,
.unlink_setup = nfs_proc_unlink_setup,
+ .unlink_rpc_prepare = nfs_proc_unlink_rpc_prepare,
.unlink_done = nfs_proc_unlink_done,
.rename = nfs_proc_rename,
.rename_setup = nfs_proc_rename_setup,
+ .rename_rpc_prepare = nfs_proc_rename_rpc_prepare,
.rename_done = nfs_proc_rename_done,
.link = nfs_proc_link,
.symlink = nfs_proc_symlink,
@@ -736,8 +758,10 @@ const struct nfs_rpc_ops nfs_v2_clientops = {
.pathconf = nfs_proc_pathconf,
.decode_dirent = nfs2_decode_dirent,
.read_setup = nfs_proc_read_setup,
+ .read_rpc_prepare = nfs_proc_read_rpc_prepare,
.read_done = nfs_read_done,
.write_setup = nfs_proc_write_setup,
+ .write_rpc_prepare = nfs_proc_write_rpc_prepare,
.write_done = nfs_write_done,
.commit_setup = nfs_proc_commit_setup,
.lock = nfs_proc_lock,
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index cfa175c223d..9a0e8ef4a40 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -20,7 +20,6 @@
#include <linux/nfs_page.h>
#include <linux/module.h>
-#include <asm/system.h>
#include "pnfs.h"
#include "nfs4_fs.h"
@@ -66,7 +65,6 @@ void nfs_readdata_free(struct nfs_read_data *p)
void nfs_readdata_release(struct nfs_read_data *rdata)
{
- put_lseg(rdata->lseg);
put_nfs_open_context(rdata->args.context);
nfs_readdata_free(rdata);
}
@@ -465,23 +463,14 @@ static void nfs_readpage_release_partial(void *calldata)
nfs_readdata_release(calldata);
}
-#if defined(CONFIG_NFS_V4_1)
void nfs_read_prepare(struct rpc_task *task, void *calldata)
{
struct nfs_read_data *data = calldata;
-
- if (nfs4_setup_sequence(NFS_SERVER(data->inode),
- &data->args.seq_args, &data->res.seq_res,
- 0, task))
- return;
- rpc_call_start(task);
+ NFS_PROTO(data->inode)->read_rpc_prepare(task, data);
}
-#endif /* CONFIG_NFS_V4_1 */
static const struct rpc_call_ops nfs_read_partial_ops = {
-#if defined(CONFIG_NFS_V4_1)
.rpc_call_prepare = nfs_read_prepare,
-#endif /* CONFIG_NFS_V4_1 */
.rpc_call_done = nfs_readpage_result_partial,
.rpc_release = nfs_readpage_release_partial,
};
@@ -545,9 +534,7 @@ static void nfs_readpage_release_full(void *calldata)
}
static const struct rpc_call_ops nfs_read_full_ops = {
-#if defined(CONFIG_NFS_V4_1)
.rpc_call_prepare = nfs_read_prepare,
-#endif /* CONFIG_NFS_V4_1 */
.rpc_call_done = nfs_readpage_result_full,
.rpc_release = nfs_readpage_release_full,
};
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 3dfa4f112c0..37412f706b3 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -52,8 +52,9 @@
#include <linux/nfs_xdr.h>
#include <linux/magic.h>
#include <linux/parser.h>
+#include <linux/nsproxy.h>
+#include <linux/rcupdate.h>
-#include <asm/system.h>
#include <asm/uaccess.h>
#include "nfs4_fs.h"
@@ -79,7 +80,6 @@ enum {
Opt_cto, Opt_nocto,
Opt_ac, Opt_noac,
Opt_lock, Opt_nolock,
- Opt_v2, Opt_v3, Opt_v4,
Opt_udp, Opt_tcp, Opt_rdma,
Opt_acl, Opt_noacl,
Opt_rdirplus, Opt_nordirplus,
@@ -97,10 +97,10 @@ enum {
Opt_namelen,
Opt_mountport,
Opt_mountvers,
- Opt_nfsvers,
Opt_minorversion,
/* Mount options that take string arguments */
+ Opt_nfsvers,
Opt_sec, Opt_proto, Opt_mountproto, Opt_mounthost,
Opt_addr, Opt_mountaddr, Opt_clientaddr,
Opt_lookupcache,
@@ -132,9 +132,6 @@ static const match_table_t nfs_mount_option_tokens = {
{ Opt_noac, "noac" },
{ Opt_lock, "lock" },
{ Opt_nolock, "nolock" },
- { Opt_v2, "v2" },
- { Opt_v3, "v3" },
- { Opt_v4, "v4" },
{ Opt_udp, "udp" },
{ Opt_tcp, "tcp" },
{ Opt_rdma, "rdma" },
@@ -163,9 +160,10 @@ static const match_table_t nfs_mount_option_tokens = {
{ Opt_namelen, "namlen=%s" },
{ Opt_mountport, "mountport=%s" },
{ Opt_mountvers, "mountvers=%s" },
+ { Opt_minorversion, "minorversion=%s" },
+
{ Opt_nfsvers, "nfsvers=%s" },
{ Opt_nfsvers, "vers=%s" },
- { Opt_minorversion, "minorversion=%s" },
{ Opt_sec, "sec=%s" },
{ Opt_proto, "proto=%s" },
@@ -179,6 +177,9 @@ static const match_table_t nfs_mount_option_tokens = {
{ Opt_fscache_uniq, "fsc=%s" },
{ Opt_local_lock, "local_lock=%s" },
+ /* The following needs to be listed after all other options */
+ { Opt_nfsvers, "v%s" },
+
{ Opt_err, NULL }
};
@@ -259,6 +260,22 @@ static match_table_t nfs_local_lock_tokens = {
{ Opt_local_lock_err, NULL }
};
+enum {
+ Opt_vers_2, Opt_vers_3, Opt_vers_4, Opt_vers_4_0,
+ Opt_vers_4_1,
+
+ Opt_vers_err
+};
+
+static match_table_t nfs_vers_tokens = {
+ { Opt_vers_2, "2" },
+ { Opt_vers_3, "3" },
+ { Opt_vers_4, "4" },
+ { Opt_vers_4_0, "4.0" },
+ { Opt_vers_4_1, "4.1" },
+
+ { Opt_vers_err, NULL }
+};
static void nfs_umount_begin(struct super_block *);
static int nfs_statfs(struct dentry *, struct kstatfs *);
@@ -620,7 +637,6 @@ static void nfs_show_nfsv4_options(struct seq_file *m, struct nfs_server *nfss,
struct nfs_client *clp = nfss->nfs_client;
seq_printf(m, ",clientaddr=%s", clp->cl_ipaddr);
- seq_printf(m, ",minorversion=%u", clp->cl_minorversion);
}
#else
static void nfs_show_nfsv4_options(struct seq_file *m, struct nfs_server *nfss,
@@ -629,6 +645,15 @@ static void nfs_show_nfsv4_options(struct seq_file *m, struct nfs_server *nfss,
}
#endif
+static void nfs_show_nfs_version(struct seq_file *m,
+ unsigned int version,
+ unsigned int minorversion)
+{
+ seq_printf(m, ",vers=%u", version);
+ if (version == 4)
+ seq_printf(m, ".%u", minorversion);
+}
+
/*
* Describe the mount options in force on this server representation
*/
@@ -656,7 +681,7 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
u32 version = clp->rpc_ops->version;
int local_flock, local_fcntl;
- seq_printf(m, ",vers=%u", version);
+ nfs_show_nfs_version(m, version, clp->cl_minorversion);
seq_printf(m, ",rsize=%u", nfss->rsize);
seq_printf(m, ",wsize=%u", nfss->wsize);
if (nfss->bsize != 0)
@@ -676,8 +701,10 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
else
seq_puts(m, nfs_infop->nostr);
}
+ rcu_read_lock();
seq_printf(m, ",proto=%s",
rpc_peeraddr2str(nfss->client, RPC_DISPLAY_NETID));
+ rcu_read_unlock();
if (version == 4) {
if (nfss->port != NFS_PORT)
seq_printf(m, ",port=%u", nfss->port);
@@ -726,9 +753,11 @@ static int nfs_show_options(struct seq_file *m, struct dentry *root)
nfs_show_mount_options(m, nfss, 0);
+ rcu_read_lock();
seq_printf(m, ",addr=%s",
rpc_peeraddr2str(nfss->nfs_client->cl_rpcclient,
RPC_DISPLAY_ADDR));
+ rcu_read_unlock();
return 0;
}
@@ -745,7 +774,6 @@ static void show_sessions(struct seq_file *m, struct nfs_server *server) {}
#endif
#endif
-#ifdef CONFIG_NFS_V4
#ifdef CONFIG_NFS_V4_1
static void show_pnfs(struct seq_file *m, struct nfs_server *server)
{
@@ -755,9 +783,26 @@ static void show_pnfs(struct seq_file *m, struct nfs_server *server)
else
seq_printf(m, "not configured");
}
+
+static void show_implementation_id(struct seq_file *m, struct nfs_server *nfss)
+{
+ if (nfss->nfs_client && nfss->nfs_client->impl_id) {
+ struct nfs41_impl_id *impl_id = nfss->nfs_client->impl_id;
+ seq_printf(m, "\n\timpl_id:\tname='%s',domain='%s',"
+ "date='%llu,%u'",
+ impl_id->name, impl_id->domain,
+ impl_id->date.seconds, impl_id->date.nseconds);
+ }
+}
#else
-static void show_pnfs(struct seq_file *m, struct nfs_server *server) {}
+#ifdef CONFIG_NFS_V4
+static void show_pnfs(struct seq_file *m, struct nfs_server *server)
+{
+}
#endif
+static void show_implementation_id(struct seq_file *m, struct nfs_server *nfss)
+{
+}
#endif
static int nfs_show_devname(struct seq_file *m, struct dentry *root)
@@ -806,6 +851,8 @@ static int nfs_show_stats(struct seq_file *m, struct dentry *root)
seq_printf(m, "\n\tage:\t%lu", (jiffies - nfss->mount_time) / HZ);
+ show_implementation_id(m, nfss);
+
seq_printf(m, "\n\tcaps:\t");
seq_printf(m, "caps=0x%x", nfss->caps);
seq_printf(m, ",wtmult=%u", nfss->wtmult);
@@ -908,6 +955,7 @@ static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(unsigned int ve
data->auth_flavor_len = 1;
data->version = version;
data->minorversion = 0;
+ data->net = current->nsproxy->net_ns;
security_init_mnt_opts(&data->lsm_opts);
}
return data;
@@ -1052,6 +1100,40 @@ static int nfs_parse_security_flavors(char *value,
return 1;
}
+static int nfs_parse_version_string(char *string,
+ struct nfs_parsed_mount_data *mnt,
+ substring_t *args)
+{
+ mnt->flags &= ~NFS_MOUNT_VER3;
+ switch (match_token(string, nfs_vers_tokens, args)) {
+ case Opt_vers_2:
+ mnt->version = 2;
+ break;
+ case Opt_vers_3:
+ mnt->flags |= NFS_MOUNT_VER3;
+ mnt->version = 3;
+ break;
+ case Opt_vers_4:
+ /* Backward compatibility option. In future,
+ * the mount program should always supply
+ * a NFSv4 minor version number.
+ */
+ mnt->version = 4;
+ break;
+ case Opt_vers_4_0:
+ mnt->version = 4;
+ mnt->minorversion = 0;
+ break;
+ case Opt_vers_4_1:
+ mnt->version = 4;
+ mnt->minorversion = 1;
+ break;
+ default:
+ return 0;
+ }
+ return 1;
+}
+
static int nfs_get_option_str(substring_t args[], char **option)
{
kfree(*option);
@@ -1157,18 +1239,6 @@ static int nfs_parse_mount_options(char *raw,
mnt->flags |= (NFS_MOUNT_LOCAL_FLOCK |
NFS_MOUNT_LOCAL_FCNTL);
break;
- case Opt_v2:
- mnt->flags &= ~NFS_MOUNT_VER3;
- mnt->version = 2;
- break;
- case Opt_v3:
- mnt->flags |= NFS_MOUNT_VER3;
- mnt->version = 3;
- break;
- case Opt_v4:
- mnt->flags &= ~NFS_MOUNT_VER3;
- mnt->version = 4;
- break;
case Opt_udp:
mnt->flags &= ~NFS_MOUNT_TCP;
mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP;
@@ -1295,26 +1365,6 @@ static int nfs_parse_mount_options(char *raw,
goto out_invalid_value;
mnt->mount_server.version = option;
break;
- case Opt_nfsvers:
- if (nfs_get_option_ul(args, &option))
- goto out_invalid_value;
- switch (option) {
- case NFS2_VERSION:
- mnt->flags &= ~NFS_MOUNT_VER3;
- mnt->version = 2;
- break;
- case NFS3_VERSION:
- mnt->flags |= NFS_MOUNT_VER3;
- mnt->version = 3;
- break;
- case NFS4_VERSION:
- mnt->flags &= ~NFS_MOUNT_VER3;
- mnt->version = 4;
- break;
- default:
- goto out_invalid_value;
- }
- break;
case Opt_minorversion:
if (nfs_get_option_ul(args, &option))
goto out_invalid_value;
@@ -1326,6 +1376,15 @@ static int nfs_parse_mount_options(char *raw,
/*
* options that take text values
*/
+ case Opt_nfsvers:
+ string = match_strdup(args);
+ if (string == NULL)
+ goto out_nomem;
+ rc = nfs_parse_version_string(string, mnt, args);
+ kfree(string);
+ if (!rc)
+ goto out_invalid_value;
+ break;
case Opt_sec:
string = match_strdup(args);
if (string == NULL)
@@ -1405,7 +1464,7 @@ static int nfs_parse_mount_options(char *raw,
if (string == NULL)
goto out_nomem;
mnt->nfs_server.addrlen =
- rpc_pton(string, strlen(string),
+ rpc_pton(mnt->net, string, strlen(string),
(struct sockaddr *)
&mnt->nfs_server.address,
sizeof(mnt->nfs_server.address));
@@ -1427,7 +1486,7 @@ static int nfs_parse_mount_options(char *raw,
if (string == NULL)
goto out_nomem;
mnt->mount_server.addrlen =
- rpc_pton(string, strlen(string),
+ rpc_pton(mnt->net, string, strlen(string),
(struct sockaddr *)
&mnt->mount_server.address,
sizeof(mnt->mount_server.address));
@@ -1516,6 +1575,9 @@ static int nfs_parse_mount_options(char *raw,
if (!sloppy && invalid_option)
return 0;
+ if (mnt->minorversion && mnt->version != 4)
+ goto out_minorversion_mismatch;
+
/*
* verify that any proto=/mountproto= options match the address
* familiies in the addr=/mountaddr= options.
@@ -1549,6 +1611,10 @@ out_invalid_address:
out_invalid_value:
printk(KERN_INFO "NFS: bad mount option value specified: %s\n", p);
return 0;
+out_minorversion_mismatch:
+ printk(KERN_INFO "NFS: mount option vers=%u does not support "
+ "minorversion=%u\n", mnt->version, mnt->minorversion);
+ return 0;
out_nomem:
printk(KERN_INFO "NFS: not enough memory to parse option\n");
return 0;
@@ -1622,6 +1688,7 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
.noresvport = args->flags & NFS_MOUNT_NORESVPORT,
.auth_flav_len = &server_authlist_len,
.auth_flavs = server_authlist,
+ .net = args->net,
};
int status;
@@ -2047,7 +2114,7 @@ static inline void nfs_initialise_sb(struct super_block *sb)
/* We probably want something more informative here */
snprintf(sb->s_id, sizeof(sb->s_id),
- "%x:%x", MAJOR(sb->s_dev), MINOR(sb->s_dev));
+ "%u:%u", MAJOR(sb->s_dev), MINOR(sb->s_dev));
if (sb->s_blocksize == 0)
sb->s_blocksize = nfs_block_bits(server->wsize,
@@ -2499,12 +2566,6 @@ static int nfs4_validate_text_mount_data(void *options,
return -EINVAL;
}
- if (args->client_address == NULL) {
- dfprintk(MOUNT,
- "NFS4: mount program didn't pass callback address\n");
- return -EINVAL;
- }
-
return nfs_parse_devname(dev_name,
&args->nfs_server.hostname,
NFS4_MAXNAMLEN,
@@ -2663,8 +2724,7 @@ nfs4_remote_mount(struct file_system_type *fs_type, int flags,
if (!s->s_root) {
/* initial superblock/root creation */
nfs4_fill_super(s);
- nfs_fscache_get_super_cookie(
- s, data ? data->fscache_uniq : NULL, NULL);
+ nfs_fscache_get_super_cookie(s, data->fscache_uniq, NULL);
}
mntroot = nfs4_get_root(s, mntfh, dev_name);
diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c
index 978aaeb8a09..ad4d2e787b2 100644
--- a/fs/nfs/sysctl.c
+++ b/fs/nfs/sysctl.c
@@ -32,7 +32,6 @@ static ctl_table nfs_cb_sysctls[] = {
.extra1 = (int *)&nfs_set_port_min,
.extra2 = (int *)&nfs_set_port_max,
},
-#ifndef CONFIG_NFS_USE_NEW_IDMAPPER
{
.procname = "idmap_cache_timeout",
.data = &nfs_idmap_cache_timeout,
@@ -40,7 +39,6 @@ static ctl_table nfs_cb_sysctls[] = {
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
-#endif /* CONFIG_NFS_USE_NEW_IDMAPPER */
#endif
{
.procname = "nfs_mountpoint_timeout",
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 4f9319a2e56..3210a03342f 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -20,15 +20,6 @@
#include "iostat.h"
#include "delegation.h"
-struct nfs_unlinkdata {
- struct hlist_node list;
- struct nfs_removeargs args;
- struct nfs_removeres res;
- struct inode *dir;
- struct rpc_cred *cred;
- struct nfs_fattr dir_attr;
-};
-
/**
* nfs_free_unlinkdata - release data from a sillydelete operation.
* @data: pointer to unlink structure.
@@ -107,25 +98,16 @@ static void nfs_async_unlink_release(void *calldata)
nfs_sb_deactive(sb);
}
-#if defined(CONFIG_NFS_V4_1)
-void nfs_unlink_prepare(struct rpc_task *task, void *calldata)
+static void nfs_unlink_prepare(struct rpc_task *task, void *calldata)
{
struct nfs_unlinkdata *data = calldata;
- struct nfs_server *server = NFS_SERVER(data->dir);
-
- if (nfs4_setup_sequence(server, &data->args.seq_args,
- &data->res.seq_res, 1, task))
- return;
- rpc_call_start(task);
+ NFS_PROTO(data->dir)->unlink_rpc_prepare(task, data);
}
-#endif /* CONFIG_NFS_V4_1 */
static const struct rpc_call_ops nfs_unlink_ops = {
.rpc_call_done = nfs_async_unlink_done,
.rpc_release = nfs_async_unlink_release,
-#if defined(CONFIG_NFS_V4_1)
.rpc_call_prepare = nfs_unlink_prepare,
-#endif /* CONFIG_NFS_V4_1 */
};
static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct nfs_unlinkdata *data)
@@ -341,18 +323,6 @@ nfs_cancel_async_unlink(struct dentry *dentry)
spin_unlock(&dentry->d_lock);
}
-struct nfs_renamedata {
- struct nfs_renameargs args;
- struct nfs_renameres res;
- struct rpc_cred *cred;
- struct inode *old_dir;
- struct dentry *old_dentry;
- struct nfs_fattr old_fattr;
- struct inode *new_dir;
- struct dentry *new_dentry;
- struct nfs_fattr new_fattr;
-};
-
/**
* nfs_async_rename_done - Sillyrename post-processing
* @task: rpc_task of the sillyrename
@@ -403,25 +373,16 @@ static void nfs_async_rename_release(void *calldata)
kfree(data);
}
-#if defined(CONFIG_NFS_V4_1)
static void nfs_rename_prepare(struct rpc_task *task, void *calldata)
{
struct nfs_renamedata *data = calldata;
- struct nfs_server *server = NFS_SERVER(data->old_dir);
-
- if (nfs4_setup_sequence(server, &data->args.seq_args,
- &data->res.seq_res, 1, task))
- return;
- rpc_call_start(task);
+ NFS_PROTO(data->old_dir)->rename_rpc_prepare(task, data);
}
-#endif /* CONFIG_NFS_V4_1 */
static const struct rpc_call_ops nfs_rename_ops = {
.rpc_call_done = nfs_async_rename_done,
.rpc_release = nfs_async_rename_release,
-#if defined(CONFIG_NFS_V4_1)
.rpc_call_prepare = nfs_rename_prepare,
-#endif /* CONFIG_NFS_V4_1 */
};
/**
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 0c3885255f9..2c68818f68a 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -100,7 +100,6 @@ void nfs_writedata_free(struct nfs_write_data *p)
void nfs_writedata_release(struct nfs_write_data *wdata)
{
- put_lseg(wdata->lseg);
put_nfs_open_context(wdata->args.context);
nfs_writedata_free(wdata);
}
@@ -236,10 +235,10 @@ static struct nfs_page *nfs_find_and_lock_request(struct page *page, bool nonblo
req = nfs_page_find_request_locked(page);
if (req == NULL)
break;
- if (nfs_set_page_tag_locked(req))
+ if (nfs_lock_request_dontget(req))
break;
/* Note: If we hold the page lock, as is the case in nfs_writepage,
- * then the call to nfs_set_page_tag_locked() will always
+ * then the call to nfs_lock_request_dontget() will always
* succeed provided that someone hasn't already marked the
* request as dirty (in which case we don't care).
*/
@@ -375,21 +374,14 @@ out_err:
/*
* Insert a write request into an inode
*/
-static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
+static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
{
struct nfs_inode *nfsi = NFS_I(inode);
- int error;
-
- error = radix_tree_preload(GFP_NOFS);
- if (error != 0)
- goto out;
/* Lock the request! */
nfs_lock_request_dontget(req);
spin_lock(&inode->i_lock);
- error = radix_tree_insert(&nfsi->nfs_page_tree, req->wb_index, req);
- BUG_ON(error);
if (!nfsi->npages && nfs_have_delegation(inode, FMODE_WRITE))
inode->i_version++;
set_bit(PG_MAPPED, &req->wb_flags);
@@ -397,12 +389,7 @@ static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
set_page_private(req->wb_page, (unsigned long)req);
nfsi->npages++;
kref_get(&req->wb_kref);
- radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index,
- NFS_PAGE_TAG_LOCKED);
spin_unlock(&inode->i_lock);
- radix_tree_preload_end();
-out:
- return error;
}
/*
@@ -419,7 +406,6 @@ static void nfs_inode_remove_request(struct nfs_page *req)
set_page_private(req->wb_page, 0);
ClearPagePrivate(req->wb_page);
clear_bit(PG_MAPPED, &req->wb_flags);
- radix_tree_delete(&nfsi->nfs_page_tree, req->wb_index);
nfsi->npages--;
spin_unlock(&inode->i_lock);
nfs_release_request(req);
@@ -432,39 +418,90 @@ nfs_mark_request_dirty(struct nfs_page *req)
}
#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
-/*
- * Add a request to the inode's commit list.
+/**
+ * nfs_request_add_commit_list - add request to a commit list
+ * @req: pointer to a struct nfs_page
+ * @head: commit list head
+ *
+ * This sets the PG_CLEAN bit, updates the inode global count of
+ * number of outstanding requests requiring a commit as well as
+ * the MM page stats.
+ *
+ * The caller must _not_ hold the inode->i_lock, but must be
+ * holding the nfs_page lock.
*/
-static void
-nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg)
+void
+nfs_request_add_commit_list(struct nfs_page *req, struct list_head *head)
{
struct inode *inode = req->wb_context->dentry->d_inode;
- struct nfs_inode *nfsi = NFS_I(inode);
- spin_lock(&inode->i_lock);
set_bit(PG_CLEAN, &(req)->wb_flags);
- radix_tree_tag_set(&nfsi->nfs_page_tree,
- req->wb_index,
- NFS_PAGE_TAG_COMMIT);
- nfsi->ncommit++;
+ spin_lock(&inode->i_lock);
+ nfs_list_add_request(req, head);
+ NFS_I(inode)->ncommit++;
spin_unlock(&inode->i_lock);
- pnfs_mark_request_commit(req, lseg);
inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE);
__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
}
+EXPORT_SYMBOL_GPL(nfs_request_add_commit_list);
-static int
+/**
+ * nfs_request_remove_commit_list - Remove request from a commit list
+ * @req: pointer to a nfs_page
+ *
+ * This clears the PG_CLEAN bit, and updates the inode global count of
+ * number of outstanding requests requiring a commit
+ * It does not update the MM page stats.
+ *
+ * The caller _must_ hold the inode->i_lock and the nfs_page lock.
+ */
+void
+nfs_request_remove_commit_list(struct nfs_page *req)
+{
+ struct inode *inode = req->wb_context->dentry->d_inode;
+
+ if (!test_and_clear_bit(PG_CLEAN, &(req)->wb_flags))
+ return;
+ nfs_list_remove_request(req);
+ NFS_I(inode)->ncommit--;
+}
+EXPORT_SYMBOL_GPL(nfs_request_remove_commit_list);
+
+
+/*
+ * Add a request to the inode's commit list.
+ */
+static void
+nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg)
+{
+ struct inode *inode = req->wb_context->dentry->d_inode;
+
+ if (pnfs_mark_request_commit(req, lseg))
+ return;
+ nfs_request_add_commit_list(req, &NFS_I(inode)->commit_list);
+}
+
+static void
+nfs_clear_page_commit(struct page *page)
+{
+ dec_zone_page_state(page, NR_UNSTABLE_NFS);
+ dec_bdi_stat(page->mapping->backing_dev_info, BDI_RECLAIMABLE);
+}
+
+static void
nfs_clear_request_commit(struct nfs_page *req)
{
- struct page *page = req->wb_page;
+ if (test_bit(PG_CLEAN, &req->wb_flags)) {
+ struct inode *inode = req->wb_context->dentry->d_inode;
- if (test_and_clear_bit(PG_CLEAN, &(req)->wb_flags)) {
- dec_zone_page_state(page, NR_UNSTABLE_NFS);
- dec_bdi_stat(page->mapping->backing_dev_info, BDI_RECLAIMABLE);
- return 1;
+ if (!pnfs_clear_request_commit(req)) {
+ spin_lock(&inode->i_lock);
+ nfs_request_remove_commit_list(req);
+ spin_unlock(&inode->i_lock);
+ }
+ nfs_clear_page_commit(req->wb_page);
}
- return 0;
}
static inline
@@ -491,15 +528,14 @@ int nfs_reschedule_unstable_write(struct nfs_page *req,
return 0;
}
#else
-static inline void
+static void
nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg)
{
}
-static inline int
+static void
nfs_clear_request_commit(struct nfs_page *req)
{
- return 0;
}
static inline
@@ -520,46 +556,65 @@ int nfs_reschedule_unstable_write(struct nfs_page *req,
static int
nfs_need_commit(struct nfs_inode *nfsi)
{
- return radix_tree_tagged(&nfsi->nfs_page_tree, NFS_PAGE_TAG_COMMIT);
+ return nfsi->ncommit > 0;
+}
+
+/* i_lock held by caller */
+static int
+nfs_scan_commit_list(struct list_head *src, struct list_head *dst, int max,
+ spinlock_t *lock)
+{
+ struct nfs_page *req, *tmp;
+ int ret = 0;
+
+ list_for_each_entry_safe(req, tmp, src, wb_list) {
+ if (!nfs_lock_request(req))
+ continue;
+ if (cond_resched_lock(lock))
+ list_safe_reset_next(req, tmp, wb_list);
+ nfs_request_remove_commit_list(req);
+ nfs_list_add_request(req, dst);
+ ret++;
+ if (ret == max)
+ break;
+ }
+ return ret;
}
/*
* nfs_scan_commit - Scan an inode for commit requests
* @inode: NFS inode to scan
* @dst: destination list
- * @idx_start: lower bound of page->index to scan.
- * @npages: idx_start + npages sets the upper bound to scan.
*
* Moves requests from the inode's 'commit' request list.
* The requests are *not* checked to ensure that they form a contiguous set.
*/
static int
-nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages)
+nfs_scan_commit(struct inode *inode, struct list_head *dst)
{
struct nfs_inode *nfsi = NFS_I(inode);
- int ret;
-
- if (!nfs_need_commit(nfsi))
- return 0;
+ int ret = 0;
spin_lock(&inode->i_lock);
- ret = nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT);
- if (ret > 0)
- nfsi->ncommit -= ret;
- spin_unlock(&inode->i_lock);
-
- if (nfs_need_commit(NFS_I(inode)))
- __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
+ if (nfsi->ncommit > 0) {
+ const int max = INT_MAX;
+ ret = nfs_scan_commit_list(&nfsi->commit_list, dst, max,
+ &inode->i_lock);
+ ret += pnfs_scan_commit_lists(inode, max - ret,
+ &inode->i_lock);
+ }
+ spin_unlock(&inode->i_lock);
return ret;
}
+
#else
static inline int nfs_need_commit(struct nfs_inode *nfsi)
{
return 0;
}
-static inline int nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages)
+static inline int nfs_scan_commit(struct inode *inode, struct list_head *dst)
{
return 0;
}
@@ -604,7 +659,7 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
|| end < req->wb_offset)
goto out_flushme;
- if (nfs_set_page_tag_locked(req))
+ if (nfs_lock_request_dontget(req))
break;
/* The request is locked, so wait and then retry */
@@ -616,13 +671,6 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
spin_lock(&inode->i_lock);
}
- if (nfs_clear_request_commit(req) &&
- radix_tree_tag_clear(&NFS_I(inode)->nfs_page_tree,
- req->wb_index, NFS_PAGE_TAG_COMMIT) != NULL) {
- NFS_I(inode)->ncommit--;
- pnfs_clear_request_commit(req);
- }
-
/* Okay, the request matches. Update the region */
if (offset < req->wb_offset) {
req->wb_offset = offset;
@@ -634,6 +682,7 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
req->wb_bytes = rqend - req->wb_offset;
out_unlock:
spin_unlock(&inode->i_lock);
+ nfs_clear_request_commit(req);
return req;
out_flushme:
spin_unlock(&inode->i_lock);
@@ -655,7 +704,6 @@ static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx,
{
struct inode *inode = page->mapping->host;
struct nfs_page *req;
- int error;
req = nfs_try_to_update_request(inode, page, offset, bytes);
if (req != NULL)
@@ -663,11 +711,7 @@ static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx,
req = nfs_create_request(ctx, inode, page, offset, bytes);
if (IS_ERR(req))
goto out;
- error = nfs_inode_add_request(inode, req);
- if (error != 0) {
- nfs_release_request(req);
- req = ERR_PTR(error);
- }
+ nfs_inode_add_request(inode, req);
out:
return req;
}
@@ -684,7 +728,7 @@ static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page,
nfs_grow_file(page, offset, count);
nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes);
nfs_mark_request_dirty(req);
- nfs_clear_page_tag_locked(req);
+ nfs_unlock_request(req);
return 0;
}
@@ -777,7 +821,7 @@ static void nfs_writepage_release(struct nfs_page *req,
if (PageError(req->wb_page) || !nfs_reschedule_unstable_write(req, data))
nfs_inode_remove_request(req);
- nfs_clear_page_tag_locked(req);
+ nfs_unlock_request(req);
nfs_end_page_writeback(page);
}
@@ -925,7 +969,7 @@ static void nfs_redirty_request(struct nfs_page *req)
struct page *page = req->wb_page;
nfs_mark_request_dirty(req);
- nfs_clear_page_tag_locked(req);
+ nfs_unlock_request(req);
nfs_end_page_writeback(page);
}
@@ -1128,23 +1172,14 @@ out:
nfs_writedata_release(calldata);
}
-#if defined(CONFIG_NFS_V4_1)
void nfs_write_prepare(struct rpc_task *task, void *calldata)
{
struct nfs_write_data *data = calldata;
-
- if (nfs4_setup_sequence(NFS_SERVER(data->inode),
- &data->args.seq_args,
- &data->res.seq_res, 1, task))
- return;
- rpc_call_start(task);
+ NFS_PROTO(data->inode)->write_rpc_prepare(task, data);
}
-#endif /* CONFIG_NFS_V4_1 */
static const struct rpc_call_ops nfs_write_partial_ops = {
-#if defined(CONFIG_NFS_V4_1)
.rpc_call_prepare = nfs_write_prepare,
-#endif /* CONFIG_NFS_V4_1 */
.rpc_call_done = nfs_writeback_done_partial,
.rpc_release = nfs_writeback_release_partial,
};
@@ -1199,16 +1234,14 @@ static void nfs_writeback_release_full(void *calldata)
remove_request:
nfs_inode_remove_request(req);
next:
- nfs_clear_page_tag_locked(req);
+ nfs_unlock_request(req);
nfs_end_page_writeback(page);
}
nfs_writedata_release(calldata);
}
static const struct rpc_call_ops nfs_write_full_ops = {
-#if defined(CONFIG_NFS_V4_1)
.rpc_call_prepare = nfs_write_prepare,
-#endif /* CONFIG_NFS_V4_1 */
.rpc_call_done = nfs_writeback_done_full,
.rpc_release = nfs_writeback_release_full,
};
@@ -1325,7 +1358,6 @@ void nfs_commitdata_release(void *data)
{
struct nfs_write_data *wdata = data;
- put_lseg(wdata->lseg);
put_nfs_open_context(wdata->args.context);
nfs_commit_free(wdata);
}
@@ -1411,7 +1443,7 @@ void nfs_retry_commit(struct list_head *page_list,
dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
dec_bdi_stat(req->wb_page->mapping->backing_dev_info,
BDI_RECLAIMABLE);
- nfs_clear_page_tag_locked(req);
+ nfs_unlock_request(req);
}
}
EXPORT_SYMBOL_GPL(nfs_retry_commit);
@@ -1460,7 +1492,7 @@ void nfs_commit_release_pages(struct nfs_write_data *data)
while (!list_empty(&data->pages)) {
req = nfs_list_entry(data->pages.next);
nfs_list_remove_request(req);
- nfs_clear_request_commit(req);
+ nfs_clear_page_commit(req->wb_page);
dprintk("NFS: commit (%s/%lld %d@%lld)",
req->wb_context->dentry->d_sb->s_id,
@@ -1486,7 +1518,7 @@ void nfs_commit_release_pages(struct nfs_write_data *data)
dprintk(" mismatch\n");
nfs_mark_request_dirty(req);
next:
- nfs_clear_page_tag_locked(req);
+ nfs_unlock_request(req);
}
}
EXPORT_SYMBOL_GPL(nfs_commit_release_pages);
@@ -1501,9 +1533,7 @@ static void nfs_commit_release(void *calldata)
}
static const struct rpc_call_ops nfs_commit_ops = {
-#if defined(CONFIG_NFS_V4_1)
.rpc_call_prepare = nfs_write_prepare,
-#endif /* CONFIG_NFS_V4_1 */
.rpc_call_done = nfs_commit_done,
.rpc_release = nfs_commit_release,
};
@@ -1517,7 +1547,7 @@ int nfs_commit_inode(struct inode *inode, int how)
res = nfs_commit_set_lock(NFS_I(inode), may_wait);
if (res <= 0)
goto out_mark_dirty;
- res = nfs_scan_commit(inode, &head, 0, 0);
+ res = nfs_scan_commit(inode, &head);
if (res) {
int error;
@@ -1635,6 +1665,7 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page)
if (req == NULL)
break;
if (nfs_lock_request_dontget(req)) {
+ nfs_clear_request_commit(req);
nfs_inode_remove_request(req);
/*
* In case nfs_inode_remove_request has marked the
@@ -1688,7 +1719,7 @@ out_error:
#ifdef CONFIG_MIGRATION
int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
- struct page *page)
+ struct page *page, enum migrate_mode mode)
{
/*
* If PagePrivate is set, then the page is currently associated with
@@ -1703,7 +1734,7 @@ int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
nfs_fscache_release_page(page, GFP_KERNEL);
- return migrate_page(mapping, newpage, page);
+ return migrate_page(mapping, newpage, page, mode);
}
#endif
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 10e6366608f..8df1ea4a6ff 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -80,3 +80,13 @@ config NFSD_V4
available from http://linux-nfs.org/.
If unsure, say N.
+
+config NFSD_FAULT_INJECTION
+ bool "NFS server manual fault injection"
+ depends on NFSD_V4 && DEBUG_KERNEL
+ help
+ This option enables support for manually injecting faults
+ into the NFS server. This is intended to be used for
+ testing error recovery on the NFS client.
+
+ If unsure, say N.
diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile
index 9b118ee2019..af32ef06b4f 100644
--- a/fs/nfsd/Makefile
+++ b/fs/nfsd/Makefile
@@ -6,6 +6,7 @@ obj-$(CONFIG_NFSD) += nfsd.o
nfsd-y := nfssvc.o nfsctl.o nfsproc.o nfsfh.o vfs.o \
export.o auth.o lockd.o nfscache.o nfsxdr.o stats.o
+nfsd-$(CONFIG_NFSD_FAULT_INJECTION) += fault_inject.o
nfsd-$(CONFIG_NFSD_V2_ACL) += nfs2acl.o
nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs3xdr.o
nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o
diff --git a/fs/nfsd/current_stateid.h b/fs/nfsd/current_stateid.h
new file mode 100644
index 00000000000..4123551208d
--- /dev/null
+++ b/fs/nfsd/current_stateid.h
@@ -0,0 +1,28 @@
+#ifndef _NFSD4_CURRENT_STATE_H
+#define _NFSD4_CURRENT_STATE_H
+
+#include "state.h"
+#include "xdr4.h"
+
+extern void clear_current_stateid(struct nfsd4_compound_state *cstate);
+/*
+ * functions to set current state id
+ */
+extern void nfsd4_set_opendowngradestateid(struct nfsd4_compound_state *cstate, struct nfsd4_open_downgrade *);
+extern void nfsd4_set_openstateid(struct nfsd4_compound_state *, struct nfsd4_open *);
+extern void nfsd4_set_lockstateid(struct nfsd4_compound_state *, struct nfsd4_lock *);
+extern void nfsd4_set_closestateid(struct nfsd4_compound_state *, struct nfsd4_close *);
+
+/*
+ * functions to consume current state id
+ */
+extern void nfsd4_get_opendowngradestateid(struct nfsd4_compound_state *cstate, struct nfsd4_open_downgrade *);
+extern void nfsd4_get_delegreturnstateid(struct nfsd4_compound_state *, struct nfsd4_delegreturn *);
+extern void nfsd4_get_freestateid(struct nfsd4_compound_state *, struct nfsd4_free_stateid *);
+extern void nfsd4_get_setattrstateid(struct nfsd4_compound_state *, struct nfsd4_setattr *);
+extern void nfsd4_get_closestateid(struct nfsd4_compound_state *, struct nfsd4_close *);
+extern void nfsd4_get_lockustateid(struct nfsd4_compound_state *, struct nfsd4_locku *);
+extern void nfsd4_get_readstateid(struct nfsd4_compound_state *, struct nfsd4_read *);
+extern void nfsd4_get_writestateid(struct nfsd4_compound_state *, struct nfsd4_write *);
+
+#endif /* _NFSD4_CURRENT_STATE_H */
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 62f3b9074e8..8e9689abbc0 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -87,7 +87,7 @@ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen)
struct svc_expkey key;
struct svc_expkey *ek = NULL;
- if (mesg[mlen-1] != '\n')
+ if (mesg[mlen - 1] != '\n')
return -EINVAL;
mesg[mlen-1] = 0;
@@ -1226,12 +1226,12 @@ nfsd_export_init(void)
int rv;
dprintk("nfsd: initializing export module.\n");
- rv = cache_register(&svc_export_cache);
+ rv = cache_register_net(&svc_export_cache, &init_net);
if (rv)
return rv;
- rv = cache_register(&svc_expkey_cache);
+ rv = cache_register_net(&svc_expkey_cache, &init_net);
if (rv)
- cache_unregister(&svc_export_cache);
+ cache_unregister_net(&svc_export_cache, &init_net);
return rv;
}
@@ -1255,8 +1255,8 @@ nfsd_export_shutdown(void)
dprintk("nfsd: shutting down export module.\n");
- cache_unregister(&svc_expkey_cache);
- cache_unregister(&svc_export_cache);
+ cache_unregister_net(&svc_expkey_cache, &init_net);
+ cache_unregister_net(&svc_export_cache, &init_net);
svcauth_unix_purge();
dprintk("nfsd: export shutdown complete.\n");
diff --git a/fs/nfsd/fault_inject.c b/fs/nfsd/fault_inject.c
new file mode 100644
index 00000000000..9559ce46873
--- /dev/null
+++ b/fs/nfsd/fault_inject.c
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2011 Bryan Schumaker <bjschuma@netapp.com>
+ *
+ * Uses debugfs to create fault injection points for client testing
+ */
+
+#include <linux/types.h>
+#include <linux/fs.h>
+#include <linux/debugfs.h>
+#include <linux/module.h>
+
+#include "state.h"
+#include "fault_inject.h"
+
+struct nfsd_fault_inject_op {
+ char *file;
+ void (*func)(u64);
+};
+
+static struct nfsd_fault_inject_op inject_ops[] = {
+ {
+ .file = "forget_clients",
+ .func = nfsd_forget_clients,
+ },
+ {
+ .file = "forget_locks",
+ .func = nfsd_forget_locks,
+ },
+ {
+ .file = "forget_openowners",
+ .func = nfsd_forget_openowners,
+ },
+ {
+ .file = "forget_delegations",
+ .func = nfsd_forget_delegations,
+ },
+ {
+ .file = "recall_delegations",
+ .func = nfsd_recall_delegations,
+ },
+};
+
+static long int NUM_INJECT_OPS = sizeof(inject_ops) / sizeof(struct nfsd_fault_inject_op);
+static struct dentry *debug_dir;
+
+static int nfsd_inject_set(void *op_ptr, u64 val)
+{
+ struct nfsd_fault_inject_op *op = op_ptr;
+
+ if (val == 0)
+ printk(KERN_INFO "NFSD Fault Injection: %s (all)", op->file);
+ else
+ printk(KERN_INFO "NFSD Fault Injection: %s (n = %llu)", op->file, val);
+
+ op->func(val);
+ return 0;
+}
+
+static int nfsd_inject_get(void *data, u64 *val)
+{
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(fops_nfsd, nfsd_inject_get, nfsd_inject_set, "%llu\n");
+
+void nfsd_fault_inject_cleanup(void)
+{
+ debugfs_remove_recursive(debug_dir);
+}
+
+int nfsd_fault_inject_init(void)
+{
+ unsigned int i;
+ struct nfsd_fault_inject_op *op;
+ umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
+
+ debug_dir = debugfs_create_dir("nfsd", NULL);
+ if (!debug_dir)
+ goto fail;
+
+ for (i = 0; i < NUM_INJECT_OPS; i++) {
+ op = &inject_ops[i];
+ if (!debugfs_create_file(op->file, mode, debug_dir, op, &fops_nfsd))
+ goto fail;
+ }
+ return 0;
+
+fail:
+ nfsd_fault_inject_cleanup();
+ return -ENOMEM;
+}
diff --git a/fs/nfsd/fault_inject.h b/fs/nfsd/fault_inject.h
new file mode 100644
index 00000000000..90bd0570956
--- /dev/null
+++ b/fs/nfsd/fault_inject.h
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2011 Bryan Schumaker <bjschuma@netapp.com>
+ *
+ * Function definitions for fault injection
+ */
+
+#ifndef LINUX_NFSD_FAULT_INJECT_H
+#define LINUX_NFSD_FAULT_INJECT_H
+
+#ifdef CONFIG_NFSD_FAULT_INJECTION
+int nfsd_fault_inject_init(void);
+void nfsd_fault_inject_cleanup(void);
+void nfsd_forget_clients(u64);
+void nfsd_forget_locks(u64);
+void nfsd_forget_openowners(u64);
+void nfsd_forget_delegations(u64);
+void nfsd_recall_delegations(u64);
+#else /* CONFIG_NFSD_FAULT_INJECTION */
+static inline int nfsd_fault_inject_init(void) { return 0; }
+static inline void nfsd_fault_inject_cleanup(void) {}
+static inline void nfsd_forget_clients(u64 num) {}
+static inline void nfsd_forget_locks(u64 num) {}
+static inline void nfsd_forget_openowners(u64 num) {}
+static inline void nfsd_forget_delegations(u64 num) {}
+static inline void nfsd_recall_delegations(u64 num) {}
+#endif /* CONFIG_NFSD_FAULT_INJECTION */
+
+#endif /* LINUX_NFSD_FAULT_INJECT_H */
diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
new file mode 100644
index 00000000000..12e0cff435b
--- /dev/null
+++ b/fs/nfsd/netns.h
@@ -0,0 +1,34 @@
+/*
+ * per net namespace data structures for nfsd
+ *
+ * Copyright (C) 2012, Jeff Layton <jlayton@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#ifndef __NFSD_NETNS_H__
+#define __NFSD_NETNS_H__
+
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+
+struct cld_net;
+
+struct nfsd_net {
+ struct cld_net *cld_net;
+};
+
+extern int nfsd_net_id;
+#endif /* __NFSD_NETNS_H__ */
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 6f3ebb48b12..c8e9f637153 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -605,24 +605,24 @@ static struct rpc_version nfs_cb_version4 = {
.procs = nfs4_cb_procedures
};
-static struct rpc_version *nfs_cb_version[] = {
+static const struct rpc_version *nfs_cb_version[] = {
&nfs_cb_version4,
};
-static struct rpc_program cb_program;
+static const struct rpc_program cb_program;
static struct rpc_stat cb_stats = {
.program = &cb_program
};
#define NFS4_CALLBACK 0x40000000
-static struct rpc_program cb_program = {
+static const struct rpc_program cb_program = {
.name = "nfs4_cb",
.number = NFS4_CALLBACK,
.nrvers = ARRAY_SIZE(nfs_cb_version),
.version = nfs_cb_version,
.stats = &cb_stats,
- .pipe_dir_name = "/nfsd4_cb",
+ .pipe_dir_name = "nfsd4_cb",
};
static int max_cb_time(void)
@@ -645,7 +645,6 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c
.timeout = &timeparms,
.program = &cb_program,
.version = 0,
- .authflavor = clp->cl_flavor,
.flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET),
};
struct rpc_clnt *client;
@@ -656,6 +655,7 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c
args.client_name = clp->cl_principal;
args.prognumber = conn->cb_prog,
args.protocol = XPRT_TRANSPORT_TCP;
+ args.authflavor = clp->cl_flavor;
clp->cl_cb_ident = conn->cb_ident;
} else {
if (!conn->cb_xprt)
@@ -665,6 +665,7 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c
args.bc_xprt = conn->cb_xprt;
args.prognumber = clp->cl_cb_session->se_cb_prog;
args.protocol = XPRT_TRANSPORT_BC_TCP;
+ args.authflavor = RPC_AUTH_UNIX;
}
/* Create RPC client */
client = rpc_create(&args);
@@ -754,9 +755,9 @@ static void do_probe_callback(struct nfs4_client *clp)
*/
void nfsd4_probe_callback(struct nfs4_client *clp)
{
- /* XXX: atomicity? Also, should we be using cl_cb_flags? */
+ /* XXX: atomicity? Also, should we be using cl_flags? */
clp->cl_cb_state = NFSD4_CB_UNKNOWN;
- set_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_cb_flags);
+ set_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags);
do_probe_callback(clp);
}
@@ -915,7 +916,7 @@ void nfsd4_destroy_callback_queue(void)
/* must be called under the state lock */
void nfsd4_shutdown_callback(struct nfs4_client *clp)
{
- set_bit(NFSD4_CLIENT_KILL, &clp->cl_cb_flags);
+ set_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags);
/*
* Note this won't actually result in a null callback;
* instead, nfsd4_do_callback_rpc() will detect the killed
@@ -966,15 +967,15 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb)
svc_xprt_put(clp->cl_cb_conn.cb_xprt);
clp->cl_cb_conn.cb_xprt = NULL;
}
- if (test_bit(NFSD4_CLIENT_KILL, &clp->cl_cb_flags))
+ if (test_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags))
return;
spin_lock(&clp->cl_lock);
/*
* Only serialized callback code is allowed to clear these
* flags; main nfsd code can only set them:
*/
- BUG_ON(!clp->cl_cb_flags);
- clear_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_cb_flags);
+ BUG_ON(!(clp->cl_flags & NFSD4_CLIENT_CB_FLAG_MASK));
+ clear_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags);
memcpy(&conn, &cb->cb_clp->cl_cb_conn, sizeof(struct nfs4_cb_conn));
c = __nfsd4_find_backchannel(clp);
if (c) {
@@ -986,7 +987,7 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb)
err = setup_callback_client(clp, &conn, ses);
if (err) {
- warn_no_callback_path(clp, err);
+ nfsd4_mark_cb_down(clp, err);
return;
}
/* Yay, the callback channel's back! Restart any callbacks: */
@@ -1000,7 +1001,7 @@ void nfsd4_do_callback_rpc(struct work_struct *w)
struct nfs4_client *clp = cb->cb_clp;
struct rpc_clnt *clnt;
- if (clp->cl_cb_flags)
+ if (clp->cl_flags & NFSD4_CLIENT_CB_FLAG_MASK)
nfsd4_process_cb_update(cb);
clnt = clp->cl_cb_client;
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index 55780a22fdb..322d11ce06a 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -36,10 +36,19 @@
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/slab.h>
+#include <net/net_namespace.h>
#include "idmap.h"
#include "nfsd.h"
/*
+ * Turn off idmapping when using AUTH_SYS.
+ */
+static bool nfs4_disable_idmapping = true;
+module_param(nfs4_disable_idmapping, bool, 0644);
+MODULE_PARM_DESC(nfs4_disable_idmapping,
+ "Turn off server's NFSv4 idmapping when using 'sec=sys'");
+
+/*
* Cache entry
*/
@@ -466,20 +475,20 @@ nfsd_idmap_init(void)
{
int rv;
- rv = cache_register(&idtoname_cache);
+ rv = cache_register_net(&idtoname_cache, &init_net);
if (rv)
return rv;
- rv = cache_register(&nametoid_cache);
+ rv = cache_register_net(&nametoid_cache, &init_net);
if (rv)
- cache_unregister(&idtoname_cache);
+ cache_unregister_net(&idtoname_cache, &init_net);
return rv;
}
void
nfsd_idmap_shutdown(void)
{
- cache_unregister(&idtoname_cache);
- cache_unregister(&nametoid_cache);
+ cache_unregister_net(&idtoname_cache, &init_net);
+ cache_unregister_net(&nametoid_cache, &init_net);
}
static int
@@ -560,28 +569,65 @@ idmap_id_to_name(struct svc_rqst *rqstp, int type, uid_t id, char *name)
return ret;
}
+static bool
+numeric_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, uid_t *id)
+{
+ int ret;
+ char buf[11];
+
+ if (namelen + 1 > sizeof(buf))
+ /* too long to represent a 32-bit id: */
+ return false;
+ /* Just to make sure it's null-terminated: */
+ memcpy(buf, name, namelen);
+ buf[namelen] = '\0';
+ ret = kstrtouint(name, 10, id);
+ return ret == 0;
+}
+
+static __be32
+do_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, uid_t *id)
+{
+ if (nfs4_disable_idmapping && rqstp->rq_flavor < RPC_AUTH_GSS)
+ if (numeric_name_to_id(rqstp, type, name, namelen, id))
+ return 0;
+ /*
+ * otherwise, fall through and try idmapping, for
+ * backwards compatibility with clients sending names:
+ */
+ return idmap_name_to_id(rqstp, type, name, namelen, id);
+}
+
+static int
+do_id_to_name(struct svc_rqst *rqstp, int type, uid_t id, char *name)
+{
+ if (nfs4_disable_idmapping && rqstp->rq_flavor < RPC_AUTH_GSS)
+ return sprintf(name, "%u", id);
+ return idmap_id_to_name(rqstp, type, id, name);
+}
+
__be32
nfsd_map_name_to_uid(struct svc_rqst *rqstp, const char *name, size_t namelen,
__u32 *id)
{
- return idmap_name_to_id(rqstp, IDMAP_TYPE_USER, name, namelen, id);
+ return do_name_to_id(rqstp, IDMAP_TYPE_USER, name, namelen, id);
}
__be32
nfsd_map_name_to_gid(struct svc_rqst *rqstp, const char *name, size_t namelen,
__u32 *id)
{
- return idmap_name_to_id(rqstp, IDMAP_TYPE_GROUP, name, namelen, id);
+ return do_name_to_id(rqstp, IDMAP_TYPE_GROUP, name, namelen, id);
}
int
nfsd_map_uid_to_name(struct svc_rqst *rqstp, __u32 id, char *name)
{
- return idmap_id_to_name(rqstp, IDMAP_TYPE_USER, id, name);
+ return do_id_to_name(rqstp, IDMAP_TYPE_USER, id, name);
}
int
nfsd_map_gid_to_name(struct svc_rqst *rqstp, __u32 id, char *name)
{
- return idmap_id_to_name(rqstp, IDMAP_TYPE_GROUP, id, name);
+ return do_id_to_name(rqstp, IDMAP_TYPE_GROUP, id, name);
}
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index c5e28ed8bca..2ed14dfd00a 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -39,6 +39,7 @@
#include "cache.h"
#include "xdr4.h"
#include "vfs.h"
+#include "current_stateid.h"
#define NFSDDBG_FACILITY NFSDDBG_PROC
@@ -192,10 +193,13 @@ static __be32 nfsd_check_obj_isreg(struct svc_fh *fh)
static __be32
do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open)
{
- struct svc_fh resfh;
+ struct svc_fh *resfh;
__be32 status;
- fh_init(&resfh, NFS4_FHSIZE);
+ resfh = kmalloc(sizeof(struct svc_fh), GFP_KERNEL);
+ if (!resfh)
+ return nfserr_jukebox;
+ fh_init(resfh, NFS4_FHSIZE);
open->op_truncate = 0;
if (open->op_create) {
@@ -220,7 +224,7 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
*/
status = do_nfsd_create(rqstp, current_fh, open->op_fname.data,
open->op_fname.len, &open->op_iattr,
- &resfh, open->op_createmode,
+ resfh, open->op_createmode,
(u32 *)open->op_verf.data,
&open->op_truncate, &open->op_created);
@@ -234,30 +238,29 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
FATTR4_WORD1_TIME_MODIFY);
} else {
status = nfsd_lookup(rqstp, current_fh,
- open->op_fname.data, open->op_fname.len, &resfh);
+ open->op_fname.data, open->op_fname.len, resfh);
fh_unlock(current_fh);
if (status)
goto out;
- status = nfsd_check_obj_isreg(&resfh);
+ status = nfsd_check_obj_isreg(resfh);
}
if (status)
goto out;
if (is_create_with_attrs(open) && open->op_acl != NULL)
- do_set_nfs4_acl(rqstp, &resfh, open->op_acl, open->op_bmval);
-
- set_change_info(&open->op_cinfo, current_fh);
- fh_dup2(current_fh, &resfh);
+ do_set_nfs4_acl(rqstp, resfh, open->op_acl, open->op_bmval);
/* set reply cache */
fh_copy_shallow(&open->op_openowner->oo_owner.so_replay.rp_openfh,
- &resfh.fh_handle);
+ &resfh->fh_handle);
if (!open->op_created)
- status = do_open_permission(rqstp, current_fh, open,
+ status = do_open_permission(rqstp, resfh, open,
NFSD_MAY_NOP);
-
+ set_change_info(&open->op_cinfo, current_fh);
+ fh_dup2(current_fh, resfh);
out:
- fh_put(&resfh);
+ fh_put(resfh);
+ kfree(resfh);
return status;
}
@@ -266,10 +269,6 @@ do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_
{
__be32 status;
- /* Only reclaims from previously confirmed clients are valid */
- if ((status = nfs4_check_open_reclaim(&open->op_clientid)))
- return status;
-
/* We don't know the target directory, and therefore can not
* set the change info
*/
@@ -314,16 +313,14 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (open->op_create && open->op_claim_type != NFS4_OPEN_CLAIM_NULL)
return nfserr_inval;
- /* We don't yet support WANT bits: */
- open->op_share_access &= NFS4_SHARE_ACCESS_MASK;
-
open->op_created = 0;
/*
* RFC5661 18.51.3
* Before RECLAIM_COMPLETE done, server should deny new lock
*/
if (nfsd4_has_session(cstate) &&
- !cstate->session->se_client->cl_firststate &&
+ !test_bit(NFSD4_CLIENT_RECLAIM_COMPLETE,
+ &cstate->session->se_client->cl_flags) &&
open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS)
return nfserr_grace;
@@ -373,6 +370,9 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
break;
case NFS4_OPEN_CLAIM_PREVIOUS:
open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
+ status = nfs4_check_open_reclaim(&open->op_clientid);
+ if (status)
+ goto out;
case NFS4_OPEN_CLAIM_FH:
case NFS4_OPEN_CLAIM_DELEG_CUR_FH:
status = do_open_fhandle(rqstp, &cstate->current_fh,
@@ -453,6 +453,10 @@ nfsd4_restorefh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
return nfserr_restorefh;
fh_dup2(&cstate->current_fh, &cstate->save_fh);
+ if (HAS_STATE_ID(cstate, SAVED_STATE_ID_FLAG)) {
+ memcpy(&cstate->current_stateid, &cstate->save_stateid, sizeof(stateid_t));
+ SET_STATE_ID(cstate, CURRENT_STATE_ID_FLAG);
+ }
return nfs_ok;
}
@@ -464,6 +468,10 @@ nfsd4_savefh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
return nfserr_nofilehandle;
fh_dup2(&cstate->save_fh, &cstate->current_fh);
+ if (HAS_STATE_ID(cstate, CURRENT_STATE_ID_FLAG)) {
+ memcpy(&cstate->save_stateid, &cstate->current_stateid, sizeof(stateid_t));
+ SET_STATE_ID(cstate, SAVED_STATE_ID_FLAG);
+ }
return nfs_ok;
}
@@ -482,14 +490,20 @@ nfsd4_access(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
&access->ac_supported);
}
+static void gen_boot_verifier(nfs4_verifier *verifier)
+{
+ __be32 verf[2];
+
+ verf[0] = (__be32)nfssvc_boot.tv_sec;
+ verf[1] = (__be32)nfssvc_boot.tv_usec;
+ memcpy(verifier->data, verf, sizeof(verifier->data));
+}
+
static __be32
nfsd4_commit(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfsd4_commit *commit)
{
- u32 *p = (u32 *)commit->co_verf.data;
- *p++ = nfssvc_boot.tv_sec;
- *p++ = nfssvc_boot.tv_usec;
-
+ gen_boot_verifier(&commit->co_verf);
return nfsd_commit(rqstp, &cstate->current_fh, commit->co_offset,
commit->co_count);
}
@@ -866,7 +880,6 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
{
stateid_t *stateid = &write->wr_stateid;
struct file *filp = NULL;
- u32 *p;
__be32 status = nfs_ok;
unsigned long cnt;
@@ -888,9 +901,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
cnt = write->wr_buflen;
write->wr_how_written = write->wr_stable_how;
- p = (u32 *)write->wr_verifier.data;
- *p++ = nfssvc_boot.tv_sec;
- *p++ = nfssvc_boot.tv_usec;
+ gen_boot_verifier(&write->wr_verifier);
status = nfsd_write(rqstp, &cstate->current_fh, filp,
write->wr_offset, rqstp->rq_vec, write->wr_vlen,
@@ -1001,6 +1012,8 @@ static inline void nfsd4_increment_op_stats(u32 opnum)
typedef __be32(*nfsd4op_func)(struct svc_rqst *, struct nfsd4_compound_state *,
void *);
typedef u32(*nfsd4op_rsize)(struct svc_rqst *, struct nfsd4_op *op);
+typedef void(*stateid_setter)(struct nfsd4_compound_state *, void *);
+typedef void(*stateid_getter)(struct nfsd4_compound_state *, void *);
enum nfsd4_op_flags {
ALLOWED_WITHOUT_FH = 1 << 0, /* No current filehandle required */
@@ -1026,6 +1039,10 @@ enum nfsd4_op_flags {
* the v4.0 case).
*/
OP_CACHEME = 1 << 6,
+ /*
+ * These are ops which clear current state id.
+ */
+ OP_CLEAR_STATEID = 1 << 7,
};
struct nfsd4_operation {
@@ -1034,11 +1051,15 @@ struct nfsd4_operation {
char *op_name;
/* Try to get response size before operation */
nfsd4op_rsize op_rsize_bop;
+ stateid_setter op_get_currentstateid;
+ stateid_getter op_set_currentstateid;
};
static struct nfsd4_operation nfsd4_ops[];
+#ifdef NFSD_DEBUG
static const char *nfsd4_op_name(unsigned opnum);
+#endif
/*
* Enforce NFSv4.1 COMPOUND ordering rules:
@@ -1216,13 +1237,23 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
if (op->status)
goto encode_op;
- if (opdesc->op_func)
+ if (opdesc->op_func) {
+ if (opdesc->op_get_currentstateid)
+ opdesc->op_get_currentstateid(cstate, &op->u);
op->status = opdesc->op_func(rqstp, cstate, &op->u);
- else
+ } else
BUG_ON(op->status == nfs_ok);
- if (!op->status && need_wrongsec_check(rqstp))
- op->status = check_nfsd_access(cstate->current_fh.fh_export, rqstp);
+ if (!op->status) {
+ if (opdesc->op_set_currentstateid)
+ opdesc->op_set_currentstateid(cstate, &op->u);
+
+ if (opdesc->op_flags & OP_CLEAR_STATEID)
+ clear_current_stateid(cstate);
+
+ if (need_wrongsec_check(rqstp))
+ op->status = check_nfsd_access(cstate->current_fh.fh_export, rqstp);
+ }
encode_op:
/* Only from SEQUENCE */
@@ -1414,6 +1445,8 @@ static struct nfsd4_operation nfsd4_ops[] = {
.op_flags = OP_MODIFIES_SOMETHING,
.op_name = "OP_CLOSE",
.op_rsize_bop = (nfsd4op_rsize)nfsd4_status_stateid_rsize,
+ .op_get_currentstateid = (stateid_getter)nfsd4_get_closestateid,
+ .op_set_currentstateid = (stateid_setter)nfsd4_set_closestateid,
},
[OP_COMMIT] = {
.op_func = (nfsd4op_func)nfsd4_commit,
@@ -1423,7 +1456,7 @@ static struct nfsd4_operation nfsd4_ops[] = {
},
[OP_CREATE] = {
.op_func = (nfsd4op_func)nfsd4_create,
- .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME,
+ .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME | OP_CLEAR_STATEID,
.op_name = "OP_CREATE",
.op_rsize_bop = (nfsd4op_rsize)nfsd4_create_rsize,
},
@@ -1432,6 +1465,7 @@ static struct nfsd4_operation nfsd4_ops[] = {
.op_flags = OP_MODIFIES_SOMETHING,
.op_name = "OP_DELEGRETURN",
.op_rsize_bop = nfsd4_only_status_rsize,
+ .op_get_currentstateid = (stateid_getter)nfsd4_get_delegreturnstateid,
},
[OP_GETATTR] = {
.op_func = (nfsd4op_func)nfsd4_getattr,
@@ -1454,6 +1488,7 @@ static struct nfsd4_operation nfsd4_ops[] = {
.op_flags = OP_MODIFIES_SOMETHING,
.op_name = "OP_LOCK",
.op_rsize_bop = (nfsd4op_rsize)nfsd4_lock_rsize,
+ .op_set_currentstateid = (stateid_setter)nfsd4_set_lockstateid,
},
[OP_LOCKT] = {
.op_func = (nfsd4op_func)nfsd4_lockt,
@@ -1464,15 +1499,16 @@ static struct nfsd4_operation nfsd4_ops[] = {
.op_flags = OP_MODIFIES_SOMETHING,
.op_name = "OP_LOCKU",
.op_rsize_bop = (nfsd4op_rsize)nfsd4_status_stateid_rsize,
+ .op_get_currentstateid = (stateid_getter)nfsd4_get_lockustateid,
},
[OP_LOOKUP] = {
.op_func = (nfsd4op_func)nfsd4_lookup,
- .op_flags = OP_HANDLES_WRONGSEC,
+ .op_flags = OP_HANDLES_WRONGSEC | OP_CLEAR_STATEID,
.op_name = "OP_LOOKUP",
},
[OP_LOOKUPP] = {
.op_func = (nfsd4op_func)nfsd4_lookupp,
- .op_flags = OP_HANDLES_WRONGSEC,
+ .op_flags = OP_HANDLES_WRONGSEC | OP_CLEAR_STATEID,
.op_name = "OP_LOOKUPP",
},
[OP_NVERIFY] = {
@@ -1484,6 +1520,7 @@ static struct nfsd4_operation nfsd4_ops[] = {
.op_flags = OP_HANDLES_WRONGSEC | OP_MODIFIES_SOMETHING,
.op_name = "OP_OPEN",
.op_rsize_bop = (nfsd4op_rsize)nfsd4_open_rsize,
+ .op_set_currentstateid = (stateid_setter)nfsd4_set_openstateid,
},
[OP_OPEN_CONFIRM] = {
.op_func = (nfsd4op_func)nfsd4_open_confirm,
@@ -1496,25 +1533,30 @@ static struct nfsd4_operation nfsd4_ops[] = {
.op_flags = OP_MODIFIES_SOMETHING,
.op_name = "OP_OPEN_DOWNGRADE",
.op_rsize_bop = (nfsd4op_rsize)nfsd4_status_stateid_rsize,
+ .op_get_currentstateid = (stateid_getter)nfsd4_get_opendowngradestateid,
+ .op_set_currentstateid = (stateid_setter)nfsd4_set_opendowngradestateid,
},
[OP_PUTFH] = {
.op_func = (nfsd4op_func)nfsd4_putfh,
.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
- | OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING,
+ | OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING
+ | OP_CLEAR_STATEID,
.op_name = "OP_PUTFH",
.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
},
[OP_PUTPUBFH] = {
.op_func = (nfsd4op_func)nfsd4_putrootfh,
.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
- | OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING,
+ | OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING
+ | OP_CLEAR_STATEID,
.op_name = "OP_PUTPUBFH",
.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
},
[OP_PUTROOTFH] = {
.op_func = (nfsd4op_func)nfsd4_putrootfh,
.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
- | OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING,
+ | OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING
+ | OP_CLEAR_STATEID,
.op_name = "OP_PUTROOTFH",
.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
},
@@ -1523,6 +1565,7 @@ static struct nfsd4_operation nfsd4_ops[] = {
.op_flags = OP_MODIFIES_SOMETHING,
.op_name = "OP_READ",
.op_rsize_bop = (nfsd4op_rsize)nfsd4_read_rsize,
+ .op_get_currentstateid = (stateid_getter)nfsd4_get_readstateid,
},
[OP_READDIR] = {
.op_func = (nfsd4op_func)nfsd4_readdir,
@@ -1577,6 +1620,7 @@ static struct nfsd4_operation nfsd4_ops[] = {
.op_name = "OP_SETATTR",
.op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME,
.op_rsize_bop = (nfsd4op_rsize)nfsd4_setattr_rsize,
+ .op_get_currentstateid = (stateid_getter)nfsd4_get_setattrstateid,
},
[OP_SETCLIENTID] = {
.op_func = (nfsd4op_func)nfsd4_setclientid,
@@ -1601,6 +1645,7 @@ static struct nfsd4_operation nfsd4_ops[] = {
.op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME,
.op_name = "OP_WRITE",
.op_rsize_bop = (nfsd4op_rsize)nfsd4_write_rsize,
+ .op_get_currentstateid = (stateid_getter)nfsd4_get_writestateid,
},
[OP_RELEASE_LOCKOWNER] = {
.op_func = (nfsd4op_func)nfsd4_release_lockowner,
@@ -1675,12 +1720,14 @@ static struct nfsd4_operation nfsd4_ops[] = {
},
};
+#ifdef NFSD_DEBUG
static const char *nfsd4_op_name(unsigned opnum)
{
if (opnum < ARRAY_SIZE(nfsd4_ops))
return nfsd4_ops[opnum].op_name;
return "unknown_operation";
}
+#endif
#define nfsd4_voidres nfsd4_voidargs
struct nfsd4_voidargs { int dummy; };
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 80a0be9ed00..4767429264a 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2004 The Regents of the University of Michigan.
+* Copyright (c) 2012 Jeff Layton <jlayton@redhat.com>
* All rights reserved.
*
* Andy Adamson <andros@citi.umich.edu>
@@ -36,16 +37,34 @@
#include <linux/namei.h>
#include <linux/crypto.h>
#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <net/net_namespace.h>
+#include <linux/sunrpc/rpc_pipe_fs.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/nfsd/cld.h>
#include "nfsd.h"
#include "state.h"
#include "vfs.h"
+#include "netns.h"
#define NFSDDBG_FACILITY NFSDDBG_PROC
+/* Declarations */
+struct nfsd4_client_tracking_ops {
+ int (*init)(struct net *);
+ void (*exit)(struct net *);
+ void (*create)(struct nfs4_client *);
+ void (*remove)(struct nfs4_client *);
+ int (*check)(struct nfs4_client *);
+ void (*grace_done)(struct net *, time_t);
+};
+
/* Globals */
static struct file *rec_file;
static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery";
+static struct nfsd4_client_tracking_ops *client_tracking_ops;
static int
nfs4_save_creds(const struct cred **original_creds)
@@ -117,7 +136,7 @@ out_no_tfm:
return status;
}
-int
+static void
nfsd4_create_clid_dir(struct nfs4_client *clp)
{
const struct cred *original_cred;
@@ -127,13 +146,13 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
dprintk("NFSD: nfsd4_create_clid_dir for \"%s\"\n", dname);
- if (!rec_file || clp->cl_firststate)
- return 0;
-
- clp->cl_firststate = 1;
+ if (test_and_set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
+ return;
+ if (!rec_file)
+ return;
status = nfs4_save_creds(&original_cred);
if (status < 0)
- return status;
+ return;
dir = rec_file->f_path.dentry;
/* lock the parent */
@@ -144,8 +163,15 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
status = PTR_ERR(dentry);
goto out_unlock;
}
- status = -EEXIST;
if (dentry->d_inode)
+ /*
+ * In the 4.1 case, where we're called from
+ * reclaim_complete(), records from the previous reboot
+ * may still be left, so this is OK.
+ *
+ * In the 4.0 case, we should never get here; but we may
+ * as well be forgiving and just succeed silently.
+ */
goto out_put;
status = mnt_want_write_file(rec_file);
if (status)
@@ -164,7 +190,6 @@ out_unlock:
" and is writeable", status,
user_recovery_dirname);
nfs4_reset_creds(original_cred);
- return status;
}
typedef int (recdir_func)(struct dentry *, struct dentry *);
@@ -259,19 +284,19 @@ out_unlock:
return status;
}
-void
+static void
nfsd4_remove_clid_dir(struct nfs4_client *clp)
{
const struct cred *original_cred;
int status;
- if (!rec_file || !clp->cl_firststate)
+ if (!rec_file || !test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
return;
status = mnt_want_write_file(rec_file);
if (status)
goto out;
- clp->cl_firststate = 0;
+ clear_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
status = nfs4_save_creds(&original_cred);
if (status < 0)
@@ -286,7 +311,6 @@ out:
if (status)
printk("NFSD: Failed to remove expired client state directory"
" %.*s\n", HEXDIR_LEN, clp->cl_recdir);
- return;
}
static int
@@ -305,8 +329,9 @@ purge_old(struct dentry *parent, struct dentry *child)
return 0;
}
-void
-nfsd4_recdir_purge_old(void) {
+static void
+nfsd4_recdir_purge_old(struct net *net, time_t boot_time)
+{
int status;
if (!rec_file)
@@ -337,7 +362,7 @@ load_recdir(struct dentry *parent, struct dentry *child)
return 0;
}
-int
+static int
nfsd4_recdir_load(void) {
int status;
@@ -355,8 +380,8 @@ nfsd4_recdir_load(void) {
* Hold reference to the recovery directory.
*/
-void
-nfsd4_init_recdir()
+static int
+nfsd4_init_recdir(void)
{
const struct cred *original_cred;
int status;
@@ -371,20 +396,44 @@ nfsd4_init_recdir()
printk("NFSD: Unable to change credentials to find recovery"
" directory: error %d\n",
status);
- return;
+ return status;
}
rec_file = filp_open(user_recovery_dirname, O_RDONLY | O_DIRECTORY, 0);
if (IS_ERR(rec_file)) {
printk("NFSD: unable to find recovery directory %s\n",
user_recovery_dirname);
+ status = PTR_ERR(rec_file);
rec_file = NULL;
}
nfs4_reset_creds(original_cred);
+ return status;
}
-void
+static int
+nfsd4_load_reboot_recovery_data(struct net *net)
+{
+ int status;
+
+ /* XXX: The legacy code won't work in a container */
+ if (net != &init_net) {
+ WARN(1, KERN_ERR "NFSD: attempt to initialize legacy client "
+ "tracking in a container!\n");
+ return -EINVAL;
+ }
+
+ nfs4_lock_state();
+ status = nfsd4_init_recdir();
+ if (!status)
+ status = nfsd4_recdir_load();
+ nfs4_unlock_state();
+ if (status)
+ printk(KERN_ERR "NFSD: Failure reading reboot recovery data\n");
+ return status;
+}
+
+static void
nfsd4_shutdown_recdir(void)
{
if (!rec_file)
@@ -393,6 +442,13 @@ nfsd4_shutdown_recdir(void)
rec_file = NULL;
}
+static void
+nfsd4_legacy_tracking_exit(struct net *net)
+{
+ nfs4_release_reclaim();
+ nfsd4_shutdown_recdir();
+}
+
/*
* Change the NFSv4 recovery directory to recdir.
*/
@@ -419,3 +475,572 @@ nfs4_recoverydir(void)
{
return user_recovery_dirname;
}
+
+static int
+nfsd4_check_legacy_client(struct nfs4_client *clp)
+{
+ /* did we already find that this client is stable? */
+ if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
+ return 0;
+
+ /* look for it in the reclaim hashtable otherwise */
+ if (nfsd4_find_reclaim_client(clp)) {
+ set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
+ return 0;
+ }
+
+ return -ENOENT;
+}
+
+static struct nfsd4_client_tracking_ops nfsd4_legacy_tracking_ops = {
+ .init = nfsd4_load_reboot_recovery_data,
+ .exit = nfsd4_legacy_tracking_exit,
+ .create = nfsd4_create_clid_dir,
+ .remove = nfsd4_remove_clid_dir,
+ .check = nfsd4_check_legacy_client,
+ .grace_done = nfsd4_recdir_purge_old,
+};
+
+/* Globals */
+#define NFSD_PIPE_DIR "nfsd"
+#define NFSD_CLD_PIPE "cld"
+
+/* per-net-ns structure for holding cld upcall info */
+struct cld_net {
+ struct rpc_pipe *cn_pipe;
+ spinlock_t cn_lock;
+ struct list_head cn_list;
+ unsigned int cn_xid;
+};
+
+struct cld_upcall {
+ struct list_head cu_list;
+ struct cld_net *cu_net;
+ struct task_struct *cu_task;
+ struct cld_msg cu_msg;
+};
+
+static int
+__cld_pipe_upcall(struct rpc_pipe *pipe, struct cld_msg *cmsg)
+{
+ int ret;
+ struct rpc_pipe_msg msg;
+
+ memset(&msg, 0, sizeof(msg));
+ msg.data = cmsg;
+ msg.len = sizeof(*cmsg);
+
+ /*
+ * Set task state before we queue the upcall. That prevents
+ * wake_up_process in the downcall from racing with schedule.
+ */
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ ret = rpc_queue_upcall(pipe, &msg);
+ if (ret < 0) {
+ set_current_state(TASK_RUNNING);
+ goto out;
+ }
+
+ schedule();
+ set_current_state(TASK_RUNNING);
+
+ if (msg.errno < 0)
+ ret = msg.errno;
+out:
+ return ret;
+}
+
+static int
+cld_pipe_upcall(struct rpc_pipe *pipe, struct cld_msg *cmsg)
+{
+ int ret;
+
+ /*
+ * -EAGAIN occurs when pipe is closed and reopened while there are
+ * upcalls queued.
+ */
+ do {
+ ret = __cld_pipe_upcall(pipe, cmsg);
+ } while (ret == -EAGAIN);
+
+ return ret;
+}
+
+static ssize_t
+cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
+{
+ struct cld_upcall *tmp, *cup;
+ struct cld_msg *cmsg = (struct cld_msg *)src;
+ uint32_t xid;
+ struct nfsd_net *nn = net_generic(filp->f_dentry->d_sb->s_fs_info,
+ nfsd_net_id);
+ struct cld_net *cn = nn->cld_net;
+
+ if (mlen != sizeof(*cmsg)) {
+ dprintk("%s: got %lu bytes, expected %lu\n", __func__, mlen,
+ sizeof(*cmsg));
+ return -EINVAL;
+ }
+
+ /* copy just the xid so we can try to find that */
+ if (copy_from_user(&xid, &cmsg->cm_xid, sizeof(xid)) != 0) {
+ dprintk("%s: error when copying xid from userspace", __func__);
+ return -EFAULT;
+ }
+
+ /* walk the list and find corresponding xid */
+ cup = NULL;
+ spin_lock(&cn->cn_lock);
+ list_for_each_entry(tmp, &cn->cn_list, cu_list) {
+ if (get_unaligned(&tmp->cu_msg.cm_xid) == xid) {
+ cup = tmp;
+ list_del_init(&cup->cu_list);
+ break;
+ }
+ }
+ spin_unlock(&cn->cn_lock);
+
+ /* couldn't find upcall? */
+ if (!cup) {
+ dprintk("%s: couldn't find upcall -- xid=%u\n", __func__, xid);
+ return -EINVAL;
+ }
+
+ if (copy_from_user(&cup->cu_msg, src, mlen) != 0)
+ return -EFAULT;
+
+ wake_up_process(cup->cu_task);
+ return mlen;
+}
+
+static void
+cld_pipe_destroy_msg(struct rpc_pipe_msg *msg)
+{
+ struct cld_msg *cmsg = msg->data;
+ struct cld_upcall *cup = container_of(cmsg, struct cld_upcall,
+ cu_msg);
+
+ /* errno >= 0 means we got a downcall */
+ if (msg->errno >= 0)
+ return;
+
+ wake_up_process(cup->cu_task);
+}
+
+static const struct rpc_pipe_ops cld_upcall_ops = {
+ .upcall = rpc_pipe_generic_upcall,
+ .downcall = cld_pipe_downcall,
+ .destroy_msg = cld_pipe_destroy_msg,
+};
+
+static struct dentry *
+nfsd4_cld_register_sb(struct super_block *sb, struct rpc_pipe *pipe)
+{
+ struct dentry *dir, *dentry;
+
+ dir = rpc_d_lookup_sb(sb, NFSD_PIPE_DIR);
+ if (dir == NULL)
+ return ERR_PTR(-ENOENT);
+ dentry = rpc_mkpipe_dentry(dir, NFSD_CLD_PIPE, NULL, pipe);
+ dput(dir);
+ return dentry;
+}
+
+static void
+nfsd4_cld_unregister_sb(struct rpc_pipe *pipe)
+{
+ if (pipe->dentry)
+ rpc_unlink(pipe->dentry);
+}
+
+static struct dentry *
+nfsd4_cld_register_net(struct net *net, struct rpc_pipe *pipe)
+{
+ struct super_block *sb;
+ struct dentry *dentry;
+
+ sb = rpc_get_sb_net(net);
+ if (!sb)
+ return NULL;
+ dentry = nfsd4_cld_register_sb(sb, pipe);
+ rpc_put_sb_net(net);
+ return dentry;
+}
+
+static void
+nfsd4_cld_unregister_net(struct net *net, struct rpc_pipe *pipe)
+{
+ struct super_block *sb;
+
+ sb = rpc_get_sb_net(net);
+ if (sb) {
+ nfsd4_cld_unregister_sb(pipe);
+ rpc_put_sb_net(net);
+ }
+}
+
+/* Initialize rpc_pipefs pipe for communication with client tracking daemon */
+static int
+nfsd4_init_cld_pipe(struct net *net)
+{
+ int ret;
+ struct dentry *dentry;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ struct cld_net *cn;
+
+ if (nn->cld_net)
+ return 0;
+
+ cn = kzalloc(sizeof(*cn), GFP_KERNEL);
+ if (!cn) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ cn->cn_pipe = rpc_mkpipe_data(&cld_upcall_ops, RPC_PIPE_WAIT_FOR_OPEN);
+ if (IS_ERR(cn->cn_pipe)) {
+ ret = PTR_ERR(cn->cn_pipe);
+ goto err;
+ }
+ spin_lock_init(&cn->cn_lock);
+ INIT_LIST_HEAD(&cn->cn_list);
+
+ dentry = nfsd4_cld_register_net(net, cn->cn_pipe);
+ if (IS_ERR(dentry)) {
+ ret = PTR_ERR(dentry);
+ goto err_destroy_data;
+ }
+
+ cn->cn_pipe->dentry = dentry;
+ nn->cld_net = cn;
+ return 0;
+
+err_destroy_data:
+ rpc_destroy_pipe_data(cn->cn_pipe);
+err:
+ kfree(cn);
+ printk(KERN_ERR "NFSD: unable to create nfsdcld upcall pipe (%d)\n",
+ ret);
+ return ret;
+}
+
+static void
+nfsd4_remove_cld_pipe(struct net *net)
+{
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ struct cld_net *cn = nn->cld_net;
+
+ nfsd4_cld_unregister_net(net, cn->cn_pipe);
+ rpc_destroy_pipe_data(cn->cn_pipe);
+ kfree(nn->cld_net);
+ nn->cld_net = NULL;
+}
+
+static struct cld_upcall *
+alloc_cld_upcall(struct cld_net *cn)
+{
+ struct cld_upcall *new, *tmp;
+
+ new = kzalloc(sizeof(*new), GFP_KERNEL);
+ if (!new)
+ return new;
+
+ /* FIXME: hard cap on number in flight? */
+restart_search:
+ spin_lock(&cn->cn_lock);
+ list_for_each_entry(tmp, &cn->cn_list, cu_list) {
+ if (tmp->cu_msg.cm_xid == cn->cn_xid) {
+ cn->cn_xid++;
+ spin_unlock(&cn->cn_lock);
+ goto restart_search;
+ }
+ }
+ new->cu_task = current;
+ new->cu_msg.cm_vers = CLD_UPCALL_VERSION;
+ put_unaligned(cn->cn_xid++, &new->cu_msg.cm_xid);
+ new->cu_net = cn;
+ list_add(&new->cu_list, &cn->cn_list);
+ spin_unlock(&cn->cn_lock);
+
+ dprintk("%s: allocated xid %u\n", __func__, new->cu_msg.cm_xid);
+
+ return new;
+}
+
+static void
+free_cld_upcall(struct cld_upcall *victim)
+{
+ struct cld_net *cn = victim->cu_net;
+
+ spin_lock(&cn->cn_lock);
+ list_del(&victim->cu_list);
+ spin_unlock(&cn->cn_lock);
+ kfree(victim);
+}
+
+/* Ask daemon to create a new record */
+static void
+nfsd4_cld_create(struct nfs4_client *clp)
+{
+ int ret;
+ struct cld_upcall *cup;
+ /* FIXME: determine net from clp */
+ struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
+ struct cld_net *cn = nn->cld_net;
+
+ /* Don't upcall if it's already stored */
+ if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
+ return;
+
+ cup = alloc_cld_upcall(cn);
+ if (!cup) {
+ ret = -ENOMEM;
+ goto out_err;
+ }
+
+ cup->cu_msg.cm_cmd = Cld_Create;
+ cup->cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len;
+ memcpy(cup->cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data,
+ clp->cl_name.len);
+
+ ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg);
+ if (!ret) {
+ ret = cup->cu_msg.cm_status;
+ set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
+ }
+
+ free_cld_upcall(cup);
+out_err:
+ if (ret)
+ printk(KERN_ERR "NFSD: Unable to create client "
+ "record on stable storage: %d\n", ret);
+}
+
+/* Ask daemon to create a new record */
+static void
+nfsd4_cld_remove(struct nfs4_client *clp)
+{
+ int ret;
+ struct cld_upcall *cup;
+ /* FIXME: determine net from clp */
+ struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
+ struct cld_net *cn = nn->cld_net;
+
+ /* Don't upcall if it's already removed */
+ if (!test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
+ return;
+
+ cup = alloc_cld_upcall(cn);
+ if (!cup) {
+ ret = -ENOMEM;
+ goto out_err;
+ }
+
+ cup->cu_msg.cm_cmd = Cld_Remove;
+ cup->cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len;
+ memcpy(cup->cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data,
+ clp->cl_name.len);
+
+ ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg);
+ if (!ret) {
+ ret = cup->cu_msg.cm_status;
+ clear_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
+ }
+
+ free_cld_upcall(cup);
+out_err:
+ if (ret)
+ printk(KERN_ERR "NFSD: Unable to remove client "
+ "record from stable storage: %d\n", ret);
+}
+
+/* Check for presence of a record, and update its timestamp */
+static int
+nfsd4_cld_check(struct nfs4_client *clp)
+{
+ int ret;
+ struct cld_upcall *cup;
+ /* FIXME: determine net from clp */
+ struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
+ struct cld_net *cn = nn->cld_net;
+
+ /* Don't upcall if one was already stored during this grace pd */
+ if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
+ return 0;
+
+ cup = alloc_cld_upcall(cn);
+ if (!cup) {
+ printk(KERN_ERR "NFSD: Unable to check client record on "
+ "stable storage: %d\n", -ENOMEM);
+ return -ENOMEM;
+ }
+
+ cup->cu_msg.cm_cmd = Cld_Check;
+ cup->cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len;
+ memcpy(cup->cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data,
+ clp->cl_name.len);
+
+ ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg);
+ if (!ret) {
+ ret = cup->cu_msg.cm_status;
+ set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
+ }
+
+ free_cld_upcall(cup);
+ return ret;
+}
+
+static void
+nfsd4_cld_grace_done(struct net *net, time_t boot_time)
+{
+ int ret;
+ struct cld_upcall *cup;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ struct cld_net *cn = nn->cld_net;
+
+ cup = alloc_cld_upcall(cn);
+ if (!cup) {
+ ret = -ENOMEM;
+ goto out_err;
+ }
+
+ cup->cu_msg.cm_cmd = Cld_GraceDone;
+ cup->cu_msg.cm_u.cm_gracetime = (int64_t)boot_time;
+ ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg);
+ if (!ret)
+ ret = cup->cu_msg.cm_status;
+
+ free_cld_upcall(cup);
+out_err:
+ if (ret)
+ printk(KERN_ERR "NFSD: Unable to end grace period: %d\n", ret);
+}
+
+static struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops = {
+ .init = nfsd4_init_cld_pipe,
+ .exit = nfsd4_remove_cld_pipe,
+ .create = nfsd4_cld_create,
+ .remove = nfsd4_cld_remove,
+ .check = nfsd4_cld_check,
+ .grace_done = nfsd4_cld_grace_done,
+};
+
+int
+nfsd4_client_tracking_init(struct net *net)
+{
+ int status;
+ struct path path;
+
+ if (!client_tracking_ops) {
+ client_tracking_ops = &nfsd4_cld_tracking_ops;
+ status = kern_path(nfs4_recoverydir(), LOOKUP_FOLLOW, &path);
+ if (!status) {
+ if (S_ISDIR(path.dentry->d_inode->i_mode))
+ client_tracking_ops =
+ &nfsd4_legacy_tracking_ops;
+ path_put(&path);
+ }
+ }
+
+ status = client_tracking_ops->init(net);
+ if (status) {
+ printk(KERN_WARNING "NFSD: Unable to initialize client "
+ "recovery tracking! (%d)\n", status);
+ client_tracking_ops = NULL;
+ }
+ return status;
+}
+
+void
+nfsd4_client_tracking_exit(struct net *net)
+{
+ if (client_tracking_ops) {
+ client_tracking_ops->exit(net);
+ client_tracking_ops = NULL;
+ }
+}
+
+void
+nfsd4_client_record_create(struct nfs4_client *clp)
+{
+ if (client_tracking_ops)
+ client_tracking_ops->create(clp);
+}
+
+void
+nfsd4_client_record_remove(struct nfs4_client *clp)
+{
+ if (client_tracking_ops)
+ client_tracking_ops->remove(clp);
+}
+
+int
+nfsd4_client_record_check(struct nfs4_client *clp)
+{
+ if (client_tracking_ops)
+ return client_tracking_ops->check(clp);
+
+ return -EOPNOTSUPP;
+}
+
+void
+nfsd4_record_grace_done(struct net *net, time_t boot_time)
+{
+ if (client_tracking_ops)
+ client_tracking_ops->grace_done(net, boot_time);
+}
+
+static int
+rpc_pipefs_event(struct notifier_block *nb, unsigned long event, void *ptr)
+{
+ struct super_block *sb = ptr;
+ struct net *net = sb->s_fs_info;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ struct cld_net *cn = nn->cld_net;
+ struct dentry *dentry;
+ int ret = 0;
+
+ if (!try_module_get(THIS_MODULE))
+ return 0;
+
+ if (!cn) {
+ module_put(THIS_MODULE);
+ return 0;
+ }
+
+ switch (event) {
+ case RPC_PIPEFS_MOUNT:
+ dentry = nfsd4_cld_register_sb(sb, cn->cn_pipe);
+ if (IS_ERR(dentry)) {
+ ret = PTR_ERR(dentry);
+ break;
+ }
+ cn->cn_pipe->dentry = dentry;
+ break;
+ case RPC_PIPEFS_UMOUNT:
+ if (cn->cn_pipe->dentry)
+ nfsd4_cld_unregister_sb(cn->cn_pipe);
+ break;
+ default:
+ ret = -ENOTSUPP;
+ break;
+ }
+ module_put(THIS_MODULE);
+ return ret;
+}
+
+struct notifier_block nfsd4_cld_block = {
+ .notifier_call = rpc_pipefs_event,
+};
+
+int
+register_cld_notifier(void)
+{
+ return rpc_pipefs_notifier_register(&nfsd4_cld_block);
+}
+
+void
+unregister_cld_notifier(void)
+{
+ rpc_pipefs_notifier_unregister(&nfsd4_cld_block);
+}
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 9ca16dc09e0..1841f8bf845 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -49,12 +49,24 @@
time_t nfsd4_lease = 90; /* default lease time */
time_t nfsd4_grace = 90;
static time_t boot_time;
-static stateid_t zerostateid; /* bits all 0 */
-static stateid_t onestateid; /* bits all 1 */
+
+#define all_ones {{~0,~0},~0}
+static const stateid_t one_stateid = {
+ .si_generation = ~0,
+ .si_opaque = all_ones,
+};
+static const stateid_t zero_stateid = {
+ /* all fields zero */
+};
+static const stateid_t currentstateid = {
+ .si_generation = 1,
+};
+
static u64 current_sessionid = 1;
-#define ZERO_STATEID(stateid) (!memcmp((stateid), &zerostateid, sizeof(stateid_t)))
-#define ONE_STATEID(stateid) (!memcmp((stateid), &onestateid, sizeof(stateid_t)))
+#define ZERO_STATEID(stateid) (!memcmp((stateid), &zero_stateid, sizeof(stateid_t)))
+#define ONE_STATEID(stateid) (!memcmp((stateid), &one_stateid, sizeof(stateid_t)))
+#define CURRENT_STATEID(stateid) (!memcmp((stateid), &currentstateid, sizeof(stateid_t)))
/* forward declarations */
static int check_for_locks(struct nfs4_file *filp, struct nfs4_lockowner *lowner);
@@ -83,6 +95,19 @@ nfs4_lock_state(void)
mutex_lock(&client_mutex);
}
+static void free_session(struct kref *);
+
+/* Must be called under the client_lock */
+static void nfsd4_put_session_locked(struct nfsd4_session *ses)
+{
+ kref_put(&ses->se_ref, free_session);
+}
+
+static void nfsd4_get_session(struct nfsd4_session *ses)
+{
+ kref_get(&ses->se_ref);
+}
+
void
nfs4_unlock_state(void)
{
@@ -133,21 +158,21 @@ unsigned int max_delegations;
* Open owner state (share locks)
*/
-/* hash tables for open owners */
-#define OPEN_OWNER_HASH_BITS 8
-#define OPEN_OWNER_HASH_SIZE (1 << OPEN_OWNER_HASH_BITS)
-#define OPEN_OWNER_HASH_MASK (OPEN_OWNER_HASH_SIZE - 1)
+/* hash tables for lock and open owners */
+#define OWNER_HASH_BITS 8
+#define OWNER_HASH_SIZE (1 << OWNER_HASH_BITS)
+#define OWNER_HASH_MASK (OWNER_HASH_SIZE - 1)
-static unsigned int open_ownerstr_hashval(u32 clientid, struct xdr_netobj *ownername)
+static unsigned int ownerstr_hashval(u32 clientid, struct xdr_netobj *ownername)
{
unsigned int ret;
ret = opaque_hashval(ownername->data, ownername->len);
ret += clientid;
- return ret & OPEN_OWNER_HASH_MASK;
+ return ret & OWNER_HASH_MASK;
}
-static struct list_head open_ownerstr_hashtbl[OPEN_OWNER_HASH_SIZE];
+static struct list_head ownerstr_hashtbl[OWNER_HASH_SIZE];
/* hash table for nfs4_file */
#define FILE_HASH_BITS 8
@@ -514,6 +539,7 @@ static void unhash_lockowner(struct nfs4_lockowner *lo)
list_del(&lo->lo_owner.so_strhash);
list_del(&lo->lo_perstateid);
+ list_del(&lo->lo_owner_ino_hash);
while (!list_empty(&lo->lo_owner.so_stateids)) {
stp = list_first_entry(&lo->lo_owner.so_stateids,
struct nfs4_ol_stateid, st_perstateowner);
@@ -596,12 +622,20 @@ hash_sessionid(struct nfs4_sessionid *sessionid)
return sid->sequence % SESSION_HASH_SIZE;
}
+#ifdef NFSD_DEBUG
static inline void
dump_sessionid(const char *fn, struct nfs4_sessionid *sessionid)
{
u32 *ptr = (u32 *)(&sessionid->data[0]);
dprintk("%s: %u:%u:%u:%u\n", fn, ptr[0], ptr[1], ptr[2], ptr[3]);
}
+#else
+static inline void
+dump_sessionid(const char *fn, struct nfs4_sessionid *sessionid)
+{
+}
+#endif
+
static void
gen_sessionid(struct nfsd4_session *ses)
@@ -823,11 +857,12 @@ static void nfsd4_del_conns(struct nfsd4_session *s)
spin_unlock(&clp->cl_lock);
}
-void free_session(struct kref *kref)
+static void free_session(struct kref *kref)
{
struct nfsd4_session *ses;
int mem;
+ BUG_ON(!spin_is_locked(&client_lock));
ses = container_of(kref, struct nfsd4_session, se_ref);
nfsd4_del_conns(ses);
spin_lock(&nfsd_drc_lock);
@@ -838,6 +873,13 @@ void free_session(struct kref *kref)
kfree(ses);
}
+void nfsd4_put_session(struct nfsd4_session *ses)
+{
+ spin_lock(&client_lock);
+ nfsd4_put_session_locked(ses);
+ spin_unlock(&client_lock);
+}
+
static struct nfsd4_session *alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp, struct nfsd4_create_session *cses)
{
struct nfsd4_session *new;
@@ -885,7 +927,9 @@ static struct nfsd4_session *alloc_init_session(struct svc_rqst *rqstp, struct n
status = nfsd4_new_conn_from_crses(rqstp, new);
/* whoops: benny points out, status is ignored! (err, or bogus) */
if (status) {
+ spin_lock(&client_lock);
free_session(&new->se_ref);
+ spin_unlock(&client_lock);
return NULL;
}
if (cses->flags & SESSION4_BACK_CHAN) {
@@ -985,12 +1029,11 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
clp = kzalloc(sizeof(struct nfs4_client), GFP_KERNEL);
if (clp == NULL)
return NULL;
- clp->cl_name.data = kmalloc(name.len, GFP_KERNEL);
+ clp->cl_name.data = kmemdup(name.data, name.len, GFP_KERNEL);
if (clp->cl_name.data == NULL) {
kfree(clp);
return NULL;
}
- memcpy(clp->cl_name.data, name.data, name.len);
clp->cl_name.len = name.len;
return clp;
}
@@ -998,12 +1041,13 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
static inline void
free_client(struct nfs4_client *clp)
{
+ BUG_ON(!spin_is_locked(&client_lock));
while (!list_empty(&clp->cl_sessions)) {
struct nfsd4_session *ses;
ses = list_entry(clp->cl_sessions.next, struct nfsd4_session,
se_perclnt);
list_del(&ses->se_perclnt);
- nfsd4_put_session(ses);
+ nfsd4_put_session_locked(ses);
}
if (clp->cl_cred.cr_group_info)
put_group_info(clp->cl_cred.cr_group_info);
@@ -1058,7 +1102,6 @@ expire_client(struct nfs4_client *clp)
spin_unlock(&recall_lock);
while (!list_empty(&reaplist)) {
dp = list_entry(reaplist.next, struct nfs4_delegation, dl_recall_lru);
- list_del_init(&dp->dl_recall_lru);
unhash_delegation(dp);
}
while (!list_empty(&clp->cl_openowners)) {
@@ -1131,12 +1174,12 @@ static void gen_clid(struct nfs4_client *clp)
static void gen_confirm(struct nfs4_client *clp)
{
+ __be32 verf[2];
static u32 i;
- u32 *p;
- p = (u32 *)clp->cl_confirm.data;
- *p++ = get_seconds();
- *p++ = i++;
+ verf[0] = (__be32)get_seconds();
+ verf[1] = (__be32)i++;
+ memcpy(clp->cl_confirm.data, verf, sizeof(clp->cl_confirm.data));
}
static struct nfs4_stid *find_stateid(struct nfs4_client *cl, stateid_t *t)
@@ -1173,7 +1216,9 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
if (princ) {
clp->cl_principal = kstrdup(princ, GFP_KERNEL);
if (clp->cl_principal == NULL) {
+ spin_lock(&client_lock);
free_client(clp);
+ spin_unlock(&client_lock);
return NULL;
}
}
@@ -1301,7 +1346,7 @@ gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, struct svc_r
else
goto out_err;
- conn->cb_addrlen = rpc_uaddr2sockaddr(se->se_callback_addr_val,
+ conn->cb_addrlen = rpc_uaddr2sockaddr(&init_net, se->se_callback_addr_val,
se->se_callback_addr_len,
(struct sockaddr *)&conn->cb_addr,
sizeof(conn->cb_addr));
@@ -1340,6 +1385,7 @@ nfsd4_store_cache_entry(struct nfsd4_compoundres *resp)
slot->sl_opcnt = resp->opcnt;
slot->sl_status = resp->cstate.status;
+ slot->sl_flags |= NFSD4_SLOT_INITIALIZED;
if (nfsd4_not_cached(resp)) {
slot->sl_datalen = 0;
return;
@@ -1367,15 +1413,12 @@ nfsd4_enc_sequence_replay(struct nfsd4_compoundargs *args,
struct nfsd4_op *op;
struct nfsd4_slot *slot = resp->cstate.slot;
- dprintk("--> %s resp->opcnt %d cachethis %u \n", __func__,
- resp->opcnt, resp->cstate.slot->sl_cachethis);
-
/* Encode the replayed sequence operation */
op = &args->ops[resp->opcnt - 1];
nfsd4_encode_operation(resp, op);
/* Return nfserr_retry_uncached_rep in next operation. */
- if (args->opcnt > 1 && slot->sl_cachethis == 0) {
+ if (args->opcnt > 1 && !(slot->sl_flags & NFSD4_SLOT_CACHETHIS)) {
op = &args->ops[resp->opcnt++];
op->status = nfserr_retry_uncached_rep;
nfsd4_encode_operation(resp, op);
@@ -1568,16 +1611,11 @@ check_slot_seqid(u32 seqid, u32 slot_seqid, int slot_inuse)
else
return nfserr_seq_misordered;
}
- /* Normal */
+ /* Note unsigned 32-bit arithmetic handles wraparound: */
if (likely(seqid == slot_seqid + 1))
return nfs_ok;
- /* Replay */
if (seqid == slot_seqid)
return nfserr_replay_cache;
- /* Wraparound */
- if (seqid == 1 && (slot_seqid + 1) == 0)
- return nfs_ok;
- /* Misordered replay or misordered new request */
return nfserr_seq_misordered;
}
@@ -1808,9 +1846,10 @@ nfsd4_destroy_session(struct svc_rqst *r,
nfsd4_probe_callback_sync(ses->se_client);
nfs4_unlock_state();
+ spin_lock(&client_lock);
nfsd4_del_conns(ses);
-
- nfsd4_put_session(ses);
+ nfsd4_put_session_locked(ses);
+ spin_unlock(&client_lock);
status = nfs_ok;
out:
dprintk("%s returns %d\n", __func__, ntohl(status));
@@ -1914,8 +1953,12 @@ nfsd4_sequence(struct svc_rqst *rqstp,
* sr_highest_slotid and the sr_target_slot id to maxslots */
seq->maxslots = session->se_fchannel.maxreqs;
- status = check_slot_seqid(seq->seqid, slot->sl_seqid, slot->sl_inuse);
+ status = check_slot_seqid(seq->seqid, slot->sl_seqid,
+ slot->sl_flags & NFSD4_SLOT_INUSE);
if (status == nfserr_replay_cache) {
+ status = nfserr_seq_misordered;
+ if (!(slot->sl_flags & NFSD4_SLOT_INITIALIZED))
+ goto out;
cstate->slot = slot;
cstate->session = session;
/* Return the cached reply status and set cstate->status
@@ -1931,9 +1974,12 @@ nfsd4_sequence(struct svc_rqst *rqstp,
conn = NULL;
/* Success! bump slot seqid */
- slot->sl_inuse = true;
slot->sl_seqid = seq->seqid;
- slot->sl_cachethis = seq->cachethis;
+ slot->sl_flags |= NFSD4_SLOT_INUSE;
+ if (seq->cachethis)
+ slot->sl_flags |= NFSD4_SLOT_CACHETHIS;
+ else
+ slot->sl_flags &= ~NFSD4_SLOT_CACHETHIS;
cstate->slot = slot;
cstate->session = session;
@@ -2023,7 +2069,8 @@ nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta
nfs4_lock_state();
status = nfserr_complete_already;
- if (cstate->session->se_client->cl_firststate)
+ if (test_and_set_bit(NFSD4_CLIENT_RECLAIM_COMPLETE,
+ &cstate->session->se_client->cl_flags))
goto out;
status = nfserr_stale_clientid;
@@ -2038,7 +2085,7 @@ nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta
goto out;
status = nfs_ok;
- nfsd4_create_clid_dir(cstate->session->se_client);
+ nfsd4_client_record_create(cstate->session->se_client);
out:
nfs4_unlock_state();
return status;
@@ -2233,7 +2280,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
conf = find_confirmed_client_by_str(unconf->cl_recdir,
hash);
if (conf) {
- nfsd4_remove_clid_dir(conf);
+ nfsd4_client_record_remove(conf);
expire_client(conf);
}
move_to_confirmed(unconf);
@@ -2301,7 +2348,7 @@ nfsd4_free_slabs(void)
nfsd4_free_slab(&deleg_slab);
}
-static int
+int
nfsd4_init_slabs(void)
{
openowner_slab = kmem_cache_create("nfsd4_openowners",
@@ -2373,7 +2420,7 @@ static inline void *alloc_stateowner(struct kmem_cache *slab, struct xdr_netobj
static void hash_openowner(struct nfs4_openowner *oo, struct nfs4_client *clp, unsigned int strhashval)
{
- list_add(&oo->oo_owner.so_strhash, &open_ownerstr_hashtbl[strhashval]);
+ list_add(&oo->oo_owner.so_strhash, &ownerstr_hashtbl[strhashval]);
list_add(&oo->oo_perclient, &clp->cl_openowners);
}
@@ -2436,7 +2483,9 @@ find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open)
struct nfs4_stateowner *so;
struct nfs4_openowner *oo;
- list_for_each_entry(so, &open_ownerstr_hashtbl[hashval], so_strhash) {
+ list_for_each_entry(so, &ownerstr_hashtbl[hashval], so_strhash) {
+ if (!so->so_is_open_owner)
+ continue;
if (same_owner_str(so, &open->op_owner, &open->op_clientid)) {
oo = openowner(so);
renew_client(oo->oo_owner.so_client);
@@ -2580,7 +2629,7 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate,
if (open->op_file == NULL)
return nfserr_jukebox;
- strhashval = open_ownerstr_hashval(clientid->cl_id, &open->op_owner);
+ strhashval = ownerstr_hashval(clientid->cl_id, &open->op_owner);
oo = find_openstateowner_str(strhashval, open);
open->op_openowner = oo;
if (!oo) {
@@ -2624,8 +2673,6 @@ nfs4_check_delegmode(struct nfs4_delegation *dp, int flags)
static int share_access_to_flags(u32 share_access)
{
- share_access &= ~NFS4_SHARE_WANT_MASK;
-
return share_access == NFS4_SHARE_ACCESS_READ ? RD_STATE : WR_STATE;
}
@@ -2767,10 +2814,9 @@ nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *c
static void
-nfs4_set_claim_prev(struct nfsd4_open *open)
+nfs4_set_claim_prev(struct nfsd4_open *open, bool has_session)
{
open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
- open->op_openowner->oo_owner.so_client->cl_firststate = 1;
}
/* Should we give out recallable state?: */
@@ -2846,6 +2892,27 @@ static int nfs4_set_delegation(struct nfs4_delegation *dp, int flag)
return 0;
}
+static void nfsd4_open_deleg_none_ext(struct nfsd4_open *open, int status)
+{
+ open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE_EXT;
+ if (status == -EAGAIN)
+ open->op_why_no_deleg = WND4_CONTENTION;
+ else {
+ open->op_why_no_deleg = WND4_RESOURCE;
+ switch (open->op_deleg_want) {
+ case NFS4_SHARE_WANT_READ_DELEG:
+ case NFS4_SHARE_WANT_WRITE_DELEG:
+ case NFS4_SHARE_WANT_ANY_DELEG:
+ break;
+ case NFS4_SHARE_WANT_CANCEL:
+ open->op_why_no_deleg = WND4_CANCELLED;
+ break;
+ case NFS4_SHARE_WANT_NO_DELEG:
+ BUG(); /* not supposed to get here */
+ }
+ }
+}
+
/*
* Attempt to hand out a delegation.
*/
@@ -2855,7 +2922,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_ol_
struct nfs4_delegation *dp;
struct nfs4_openowner *oo = container_of(stp->st_stateowner, struct nfs4_openowner, oo_owner);
int cb_up;
- int status, flag = 0;
+ int status = 0, flag = 0;
cb_up = nfsd4_cb_channel_good(oo->oo_owner.so_client);
flag = NFS4_OPEN_DELEGATE_NONE;
@@ -2896,11 +2963,16 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_ol_
dprintk("NFSD: delegation stateid=" STATEID_FMT "\n",
STATEID_VAL(&dp->dl_stid.sc_stateid));
out:
- if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS
- && flag == NFS4_OPEN_DELEGATE_NONE
- && open->op_delegate_type != NFS4_OPEN_DELEGATE_NONE)
- dprintk("NFSD: WARNING: refusing delegation reclaim\n");
open->op_delegate_type = flag;
+ if (flag == NFS4_OPEN_DELEGATE_NONE) {
+ if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS &&
+ open->op_delegate_type != NFS4_OPEN_DELEGATE_NONE)
+ dprintk("NFSD: WARNING: refusing delegation reclaim\n");
+
+ /* 4.1 client asking for a delegation? */
+ if (open->op_deleg_want)
+ nfsd4_open_deleg_none_ext(open, status);
+ }
return;
out_free:
nfs4_put_delegation(dp);
@@ -2909,6 +2981,24 @@ out_no_deleg:
goto out;
}
+static void nfsd4_deleg_xgrade_none_ext(struct nfsd4_open *open,
+ struct nfs4_delegation *dp)
+{
+ if (open->op_deleg_want == NFS4_SHARE_WANT_READ_DELEG &&
+ dp->dl_type == NFS4_OPEN_DELEGATE_WRITE) {
+ open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE_EXT;
+ open->op_why_no_deleg = WND4_NOT_SUPP_DOWNGRADE;
+ } else if (open->op_deleg_want == NFS4_SHARE_WANT_WRITE_DELEG &&
+ dp->dl_type == NFS4_OPEN_DELEGATE_WRITE) {
+ open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE_EXT;
+ open->op_why_no_deleg = WND4_NOT_SUPP_UPGRADE;
+ }
+ /* Otherwise the client must be confused wanting a delegation
+ * it already has, therefore we don't return
+ * NFS4_OPEN_DELEGATE_NONE_EXT and reason.
+ */
+}
+
/*
* called with nfs4_lock_state() held.
*/
@@ -2970,24 +3060,36 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
update_stateid(&stp->st_stid.sc_stateid);
memcpy(&open->op_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
- if (nfsd4_has_session(&resp->cstate))
+ if (nfsd4_has_session(&resp->cstate)) {
open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
+ if (open->op_deleg_want & NFS4_SHARE_WANT_NO_DELEG) {
+ open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE_EXT;
+ open->op_why_no_deleg = WND4_NOT_WANTED;
+ goto nodeleg;
+ }
+ }
+
/*
* Attempt to hand out a delegation. No error return, because the
* OPEN succeeds even if we fail.
*/
nfs4_open_delegation(current_fh, open, stp);
-
+nodeleg:
status = nfs_ok;
dprintk("%s: stateid=" STATEID_FMT "\n", __func__,
STATEID_VAL(&stp->st_stid.sc_stateid));
out:
+ /* 4.1 client trying to upgrade/downgrade delegation? */
+ if (open->op_delegate_type == NFS4_OPEN_DELEGATE_NONE && dp &&
+ open->op_deleg_want)
+ nfsd4_deleg_xgrade_none_ext(open, dp);
+
if (fp)
put_nfs4_file(fp);
if (status == 0 && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS)
- nfs4_set_claim_prev(open);
+ nfs4_set_claim_prev(open, nfsd4_has_session(&resp->cstate));
/*
* To finish the open response, we just need to set the rflags.
*/
@@ -3057,7 +3159,7 @@ static void
nfsd4_end_grace(void)
{
dprintk("NFSD: end of grace period\n");
- nfsd4_recdir_purge_old();
+ nfsd4_record_grace_done(&init_net, boot_time);
locks_end_grace(&nfsd4_manager);
/*
* Now that every NFSv4 client has had the chance to recover and
@@ -3106,7 +3208,7 @@ nfs4_laundromat(void)
clp = list_entry(pos, struct nfs4_client, cl_lru);
dprintk("NFSD: purging unused client (clientid %08x)\n",
clp->cl_clientid.cl_id);
- nfsd4_remove_clid_dir(clp);
+ nfsd4_client_record_remove(clp);
expire_client(clp);
}
spin_lock(&recall_lock);
@@ -3123,7 +3225,6 @@ nfs4_laundromat(void)
spin_unlock(&recall_lock);
list_for_each_safe(pos, next, &reaplist) {
dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
- list_del_init(&dp->dl_recall_lru);
unhash_delegation(dp);
}
test_val = nfsd4_lease;
@@ -3392,7 +3493,14 @@ __be32
nfsd4_test_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfsd4_test_stateid *test_stateid)
{
- /* real work is done during encoding */
+ struct nfsd4_test_stateid_id *stateid;
+ struct nfs4_client *cl = cstate->session->se_client;
+
+ nfs4_lock_state();
+ list_for_each_entry(stateid, &test_stateid->ts_stateid_list, ts_id_list)
+ stateid->ts_id_status = nfs4_validate_stateid(cl, &stateid->ts_id_stateid);
+ nfs4_unlock_state();
+
return nfs_ok;
}
@@ -3531,7 +3639,7 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
dprintk("NFSD: %s: success, seqid=%d stateid=" STATEID_FMT "\n",
__func__, oc->oc_seqid, STATEID_VAL(&stp->st_stid.sc_stateid));
- nfsd4_create_clid_dir(oo->oo_owner.so_client);
+ nfsd4_client_record_create(oo->oo_owner.so_client);
status = nfs_ok;
out:
if (!cstate->replay_owner)
@@ -3588,7 +3696,9 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
cstate->current_fh.fh_dentry->d_name.name);
/* We don't yet support WANT bits: */
- od->od_share_access &= NFS4_SHARE_ACCESS_MASK;
+ if (od->od_deleg_want)
+ dprintk("NFSD: %s: od_deleg_want=0x%x ignored\n", __func__,
+ od->od_deleg_want);
nfs4_lock_state();
status = nfs4_preprocess_confirmed_seqid_op(cstate, od->od_seqid,
@@ -3718,13 +3828,11 @@ out:
}
-/*
- * Lock owner state (byte-range locks)
- */
#define LOFF_OVERFLOW(start, len) ((u64)(len) > ~(u64)(start))
-#define LOCK_HASH_BITS 8
-#define LOCK_HASH_SIZE (1 << LOCK_HASH_BITS)
-#define LOCK_HASH_MASK (LOCK_HASH_SIZE - 1)
+
+#define LOCKOWNER_INO_HASH_BITS 8
+#define LOCKOWNER_INO_HASH_SIZE (1 << LOCKOWNER_INO_HASH_BITS)
+#define LOCKOWNER_INO_HASH_MASK (LOCKOWNER_INO_HASH_SIZE - 1)
static inline u64
end_offset(u64 start, u64 len)
@@ -3746,16 +3854,14 @@ last_byte_offset(u64 start, u64 len)
return end > start ? end - 1: NFS4_MAX_UINT64;
}
-static inline unsigned int
-lock_ownerstr_hashval(struct inode *inode, u32 cl_id,
- struct xdr_netobj *ownername)
+static unsigned int lockowner_ino_hashval(struct inode *inode, u32 cl_id, struct xdr_netobj *ownername)
{
return (file_hashval(inode) + cl_id
+ opaque_hashval(ownername->data, ownername->len))
- & LOCK_HASH_MASK;
+ & LOCKOWNER_INO_HASH_MASK;
}
-static struct list_head lock_ownerstr_hashtbl[LOCK_HASH_SIZE];
+static struct list_head lockowner_ino_hashtbl[LOCKOWNER_INO_HASH_SIZE];
/*
* TODO: Linux file offsets are _signed_ 64-bit quantities, which means that
@@ -3809,23 +3915,39 @@ nevermind:
deny->ld_type = NFS4_WRITE_LT;
}
+static bool same_lockowner_ino(struct nfs4_lockowner *lo, struct inode *inode, clientid_t *clid, struct xdr_netobj *owner)
+{
+ struct nfs4_ol_stateid *lst;
+
+ if (!same_owner_str(&lo->lo_owner, owner, clid))
+ return false;
+ lst = list_first_entry(&lo->lo_owner.so_stateids,
+ struct nfs4_ol_stateid, st_perstateowner);
+ return lst->st_file->fi_inode == inode;
+}
+
static struct nfs4_lockowner *
find_lockowner_str(struct inode *inode, clientid_t *clid,
struct xdr_netobj *owner)
{
- unsigned int hashval = lock_ownerstr_hashval(inode, clid->cl_id, owner);
- struct nfs4_stateowner *op;
+ unsigned int hashval = lockowner_ino_hashval(inode, clid->cl_id, owner);
+ struct nfs4_lockowner *lo;
- list_for_each_entry(op, &lock_ownerstr_hashtbl[hashval], so_strhash) {
- if (same_owner_str(op, owner, clid))
- return lockowner(op);
+ list_for_each_entry(lo, &lockowner_ino_hashtbl[hashval], lo_owner_ino_hash) {
+ if (same_lockowner_ino(lo, inode, clid, owner))
+ return lo;
}
return NULL;
}
static void hash_lockowner(struct nfs4_lockowner *lo, unsigned int strhashval, struct nfs4_client *clp, struct nfs4_ol_stateid *open_stp)
{
- list_add(&lo->lo_owner.so_strhash, &lock_ownerstr_hashtbl[strhashval]);
+ struct inode *inode = open_stp->st_file->fi_inode;
+ unsigned int inohash = lockowner_ino_hashval(inode,
+ clp->cl_clientid.cl_id, &lo->lo_owner.so_owner);
+
+ list_add(&lo->lo_owner.so_strhash, &ownerstr_hashtbl[strhashval]);
+ list_add(&lo->lo_owner_ino_hash, &lockowner_ino_hashtbl[inohash]);
list_add(&lo->lo_perstateid, &open_stp->st_lockowners);
}
@@ -3834,7 +3956,7 @@ static void hash_lockowner(struct nfs4_lockowner *lo, unsigned int strhashval, s
* Called in nfsd4_lock - therefore, OPEN and OPEN_CONFIRM (if needed) has
* occurred.
*
- * strhashval = lock_ownerstr_hashval
+ * strhashval = ownerstr_hashval
*/
static struct nfs4_lockowner *
@@ -3892,6 +4014,37 @@ static void get_lock_access(struct nfs4_ol_stateid *lock_stp, u32 access)
__set_bit(access, &lock_stp->st_access_bmap);
}
+__be32 lookup_or_create_lock_state(struct nfsd4_compound_state *cstate, struct nfs4_ol_stateid *ost, struct nfsd4_lock *lock, struct nfs4_ol_stateid **lst, bool *new)
+{
+ struct nfs4_file *fi = ost->st_file;
+ struct nfs4_openowner *oo = openowner(ost->st_stateowner);
+ struct nfs4_client *cl = oo->oo_owner.so_client;
+ struct nfs4_lockowner *lo;
+ unsigned int strhashval;
+
+ lo = find_lockowner_str(fi->fi_inode, &cl->cl_clientid, &lock->v.new.owner);
+ if (lo) {
+ if (!cstate->minorversion)
+ return nfserr_bad_seqid;
+ /* XXX: a lockowner always has exactly one stateid: */
+ *lst = list_first_entry(&lo->lo_owner.so_stateids,
+ struct nfs4_ol_stateid, st_perstateowner);
+ return nfs_ok;
+ }
+ strhashval = ownerstr_hashval(cl->cl_clientid.cl_id,
+ &lock->v.new.owner);
+ lo = alloc_init_lock_stateowner(strhashval, cl, ost, lock);
+ if (lo == NULL)
+ return nfserr_jukebox;
+ *lst = alloc_init_lock_stateid(lo, fi, ost);
+ if (*lst == NULL) {
+ release_lockowner(lo);
+ return nfserr_jukebox;
+ }
+ *new = true;
+ return nfs_ok;
+}
+
/*
* LOCK operation
*/
@@ -3907,7 +4060,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct file_lock file_lock;
struct file_lock conflock;
__be32 status = 0;
- unsigned int strhashval;
+ bool new_state = false;
int lkflg;
int err;
@@ -3933,10 +4086,15 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
* lock stateid.
*/
struct nfs4_ol_stateid *open_stp = NULL;
-
+
+ if (nfsd4_has_session(cstate))
+ /* See rfc 5661 18.10.3: given clientid is ignored: */
+ memcpy(&lock->v.new.clientid,
+ &cstate->session->se_client->cl_clientid,
+ sizeof(clientid_t));
+
status = nfserr_stale_clientid;
- if (!nfsd4_has_session(cstate) &&
- STALE_CLIENTID(&lock->lk_new_clientid))
+ if (STALE_CLIENTID(&lock->lk_new_clientid))
goto out;
/* validate and update open stateid and open seqid */
@@ -3948,25 +4106,12 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
goto out;
open_sop = openowner(open_stp->st_stateowner);
status = nfserr_bad_stateid;
- if (!nfsd4_has_session(cstate) &&
- !same_clid(&open_sop->oo_owner.so_client->cl_clientid,
+ if (!same_clid(&open_sop->oo_owner.so_client->cl_clientid,
&lock->v.new.clientid))
goto out;
- /* create lockowner and lock stateid */
- fp = open_stp->st_file;
- strhashval = lock_ownerstr_hashval(fp->fi_inode,
- open_sop->oo_owner.so_client->cl_clientid.cl_id,
- &lock->v.new.owner);
- /* XXX: Do we need to check for duplicate stateowners on
- * the same file, or should they just be allowed (and
- * create new stateids)? */
- status = nfserr_jukebox;
- lock_sop = alloc_init_lock_stateowner(strhashval,
- open_sop->oo_owner.so_client, open_stp, lock);
- if (lock_sop == NULL)
- goto out;
- lock_stp = alloc_init_lock_stateid(lock_sop, fp, open_stp);
- if (lock_stp == NULL)
+ status = lookup_or_create_lock_state(cstate, open_stp, lock,
+ &lock_stp, &new_state);
+ if (status)
goto out;
} else {
/* lock (lock owner + lock stateid) already exists */
@@ -3976,10 +4121,9 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
NFS4_LOCK_STID, &lock_stp);
if (status)
goto out;
- lock_sop = lockowner(lock_stp->st_stateowner);
- fp = lock_stp->st_file;
}
- /* lock_sop and lock_stp have been created or found */
+ lock_sop = lockowner(lock_stp->st_stateowner);
+ fp = lock_stp->st_file;
lkflg = setlkflg(lock->lk_type);
status = nfs4_check_openmode(lock_stp, lkflg);
@@ -4054,7 +4198,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
break;
}
out:
- if (status && lock->lk_is_new && lock_sop)
+ if (status && new_state)
release_lockowner(lock_sop);
if (!cstate->replay_owner)
nfs4_unlock_state();
@@ -4251,7 +4395,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
struct nfs4_ol_stateid *stp;
struct xdr_netobj *owner = &rlockowner->rl_owner;
struct list_head matches;
- int i;
+ unsigned int hashval = ownerstr_hashval(clid->cl_id, owner);
__be32 status;
dprintk("nfsd4_release_lockowner clientid: (%08x/%08x):\n",
@@ -4266,22 +4410,19 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
nfs4_lock_state();
status = nfserr_locks_held;
- /* XXX: we're doing a linear search through all the lockowners.
- * Yipes! For now we'll just hope clients aren't really using
- * release_lockowner much, but eventually we have to fix these
- * data structures. */
INIT_LIST_HEAD(&matches);
- for (i = 0; i < LOCK_HASH_SIZE; i++) {
- list_for_each_entry(sop, &lock_ownerstr_hashtbl[i], so_strhash) {
- if (!same_owner_str(sop, owner, clid))
- continue;
- list_for_each_entry(stp, &sop->so_stateids,
- st_perstateowner) {
- lo = lockowner(sop);
- if (check_for_locks(stp->st_file, lo))
- goto out;
- list_add(&lo->lo_list, &matches);
- }
+
+ list_for_each_entry(sop, &ownerstr_hashtbl[hashval], so_strhash) {
+ if (sop->so_is_open_owner)
+ continue;
+ if (!same_owner_str(sop, owner, clid))
+ continue;
+ list_for_each_entry(stp, &sop->so_stateids,
+ st_perstateowner) {
+ lo = lockowner(sop);
+ if (check_for_locks(stp->st_file, lo))
+ goto out;
+ list_add(&lo->lo_list, &matches);
}
}
/* Clients probably won't expect us to return with some (but not all)
@@ -4314,7 +4455,9 @@ nfs4_has_reclaimed_state(const char *name, bool use_exchange_id)
struct nfs4_client *clp;
clp = find_confirmed_client_by_str(name, strhashval);
- return clp ? 1 : 0;
+ if (!clp)
+ return 0;
+ return test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
}
/*
@@ -4338,7 +4481,7 @@ nfs4_client_to_reclaim(const char *name)
return 1;
}
-static void
+void
nfs4_release_reclaim(void)
{
struct nfs4_client_reclaim *crp = NULL;
@@ -4358,19 +4501,12 @@ nfs4_release_reclaim(void)
/*
* called from OPEN, CLAIM_PREVIOUS with a new clientid. */
-static struct nfs4_client_reclaim *
-nfs4_find_reclaim_client(clientid_t *clid)
+struct nfs4_client_reclaim *
+nfsd4_find_reclaim_client(struct nfs4_client *clp)
{
unsigned int strhashval;
- struct nfs4_client *clp;
struct nfs4_client_reclaim *crp = NULL;
-
- /* find clientid in conf_id_hashtbl */
- clp = find_confirmed_client(clid);
- if (clp == NULL)
- return NULL;
-
dprintk("NFSD: nfs4_find_reclaim_client for %.*s with recdir %s\n",
clp->cl_name.len, clp->cl_name.data,
clp->cl_recdir);
@@ -4391,19 +4527,137 @@ nfs4_find_reclaim_client(clientid_t *clid)
__be32
nfs4_check_open_reclaim(clientid_t *clid)
{
- return nfs4_find_reclaim_client(clid) ? nfs_ok : nfserr_reclaim_bad;
+ struct nfs4_client *clp;
+
+ /* find clientid in conf_id_hashtbl */
+ clp = find_confirmed_client(clid);
+ if (clp == NULL)
+ return nfserr_reclaim_bad;
+
+ return nfsd4_client_record_check(clp) ? nfserr_reclaim_bad : nfs_ok;
+}
+
+#ifdef CONFIG_NFSD_FAULT_INJECTION
+
+void nfsd_forget_clients(u64 num)
+{
+ struct nfs4_client *clp, *next;
+ int count = 0;
+
+ nfs4_lock_state();
+ list_for_each_entry_safe(clp, next, &client_lru, cl_lru) {
+ nfsd4_client_record_remove(clp);
+ expire_client(clp);
+ if (++count == num)
+ break;
+ }
+ nfs4_unlock_state();
+
+ printk(KERN_INFO "NFSD: Forgot %d clients", count);
+}
+
+static void release_lockowner_sop(struct nfs4_stateowner *sop)
+{
+ release_lockowner(lockowner(sop));
}
+static void release_openowner_sop(struct nfs4_stateowner *sop)
+{
+ release_openowner(openowner(sop));
+}
+
+static int nfsd_release_n_owners(u64 num, bool is_open_owner,
+ void (*release_sop)(struct nfs4_stateowner *))
+{
+ int i, count = 0;
+ struct nfs4_stateowner *sop, *next;
+
+ for (i = 0; i < OWNER_HASH_SIZE; i++) {
+ list_for_each_entry_safe(sop, next, &ownerstr_hashtbl[i], so_strhash) {
+ if (sop->so_is_open_owner != is_open_owner)
+ continue;
+ release_sop(sop);
+ if (++count == num)
+ return count;
+ }
+ }
+ return count;
+}
+
+void nfsd_forget_locks(u64 num)
+{
+ int count;
+
+ nfs4_lock_state();
+ count = nfsd_release_n_owners(num, false, release_lockowner_sop);
+ nfs4_unlock_state();
+
+ printk(KERN_INFO "NFSD: Forgot %d locks", count);
+}
+
+void nfsd_forget_openowners(u64 num)
+{
+ int count;
+
+ nfs4_lock_state();
+ count = nfsd_release_n_owners(num, true, release_openowner_sop);
+ nfs4_unlock_state();
+
+ printk(KERN_INFO "NFSD: Forgot %d open owners", count);
+}
+
+int nfsd_process_n_delegations(u64 num, void (*deleg_func)(struct nfs4_delegation *))
+{
+ int i, count = 0;
+ struct nfs4_file *fp, *fnext;
+ struct nfs4_delegation *dp, *dnext;
+
+ for (i = 0; i < FILE_HASH_SIZE; i++) {
+ list_for_each_entry_safe(fp, fnext, &file_hashtbl[i], fi_hash) {
+ list_for_each_entry_safe(dp, dnext, &fp->fi_delegations, dl_perfile) {
+ deleg_func(dp);
+ if (++count == num)
+ return count;
+ }
+ }
+ }
+
+ return count;
+}
+
+void nfsd_forget_delegations(u64 num)
+{
+ unsigned int count;
+
+ nfs4_lock_state();
+ count = nfsd_process_n_delegations(num, unhash_delegation);
+ nfs4_unlock_state();
+
+ printk(KERN_INFO "NFSD: Forgot %d delegations", count);
+}
+
+void nfsd_recall_delegations(u64 num)
+{
+ unsigned int count;
+
+ nfs4_lock_state();
+ spin_lock(&recall_lock);
+ count = nfsd_process_n_delegations(num, nfsd_break_one_deleg);
+ spin_unlock(&recall_lock);
+ nfs4_unlock_state();
+
+ printk(KERN_INFO "NFSD: Recalled %d delegations", count);
+}
+
+#endif /* CONFIG_NFSD_FAULT_INJECTION */
+
/* initialization to perform at module load time: */
-int
+void
nfs4_state_init(void)
{
- int i, status;
+ int i;
- status = nfsd4_init_slabs();
- if (status)
- return status;
for (i = 0; i < CLIENT_HASH_SIZE; i++) {
INIT_LIST_HEAD(&conf_id_hashtbl[i]);
INIT_LIST_HEAD(&conf_str_hashtbl[i]);
@@ -4416,31 +4670,15 @@ nfs4_state_init(void)
for (i = 0; i < FILE_HASH_SIZE; i++) {
INIT_LIST_HEAD(&file_hashtbl[i]);
}
- for (i = 0; i < OPEN_OWNER_HASH_SIZE; i++) {
- INIT_LIST_HEAD(&open_ownerstr_hashtbl[i]);
+ for (i = 0; i < OWNER_HASH_SIZE; i++) {
+ INIT_LIST_HEAD(&ownerstr_hashtbl[i]);
}
- for (i = 0; i < LOCK_HASH_SIZE; i++) {
- INIT_LIST_HEAD(&lock_ownerstr_hashtbl[i]);
- }
- memset(&onestateid, ~0, sizeof(stateid_t));
+ for (i = 0; i < LOCKOWNER_INO_HASH_SIZE; i++)
+ INIT_LIST_HEAD(&lockowner_ino_hashtbl[i]);
INIT_LIST_HEAD(&close_lru);
INIT_LIST_HEAD(&client_lru);
INIT_LIST_HEAD(&del_recall_lru);
reclaim_str_hashtbl_size = 0;
- return 0;
-}
-
-static void
-nfsd4_load_reboot_recovery_data(void)
-{
- int status;
-
- nfs4_lock_state();
- nfsd4_init_recdir();
- status = nfsd4_recdir_load();
- nfs4_unlock_state();
- if (status)
- printk("NFSD: Failure reading reboot recovery data\n");
}
/*
@@ -4466,21 +4704,34 @@ set_max_delegations(void)
/* initialization to perform when the nfsd service is started: */
-static int
-__nfs4_state_start(void)
+int
+nfs4_state_start(void)
{
int ret;
+ /*
+ * FIXME: For now, we hang most of the pernet global stuff off of
+ * init_net until nfsd is fully containerized. Eventually, we'll
+ * need to pass a net pointer into this function, take a reference
+ * to that instead and then do most of the rest of this on a per-net
+ * basis.
+ */
+ get_net(&init_net);
+ nfsd4_client_tracking_init(&init_net);
boot_time = get_seconds();
locks_start_grace(&nfsd4_manager);
printk(KERN_INFO "NFSD: starting %ld-second grace period\n",
nfsd4_grace);
ret = set_callback_cred();
- if (ret)
- return -ENOMEM;
+ if (ret) {
+ ret = -ENOMEM;
+ goto out_recovery;
+ }
laundry_wq = create_singlethread_workqueue("nfsd4");
- if (laundry_wq == NULL)
- return -ENOMEM;
+ if (laundry_wq == NULL) {
+ ret = -ENOMEM;
+ goto out_recovery;
+ }
ret = nfsd4_create_callback_queue();
if (ret)
goto out_free_laundry;
@@ -4489,16 +4740,12 @@ __nfs4_state_start(void)
return 0;
out_free_laundry:
destroy_workqueue(laundry_wq);
+out_recovery:
+ nfsd4_client_tracking_exit(&init_net);
+ put_net(&init_net);
return ret;
}
-int
-nfs4_state_start(void)
-{
- nfsd4_load_reboot_recovery_data();
- return __nfs4_state_start();
-}
-
static void
__nfs4_state_shutdown(void)
{
@@ -4526,11 +4773,11 @@ __nfs4_state_shutdown(void)
spin_unlock(&recall_lock);
list_for_each_safe(pos, next, &reaplist) {
dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
- list_del_init(&dp->dl_recall_lru);
unhash_delegation(dp);
}
- nfsd4_shutdown_recdir();
+ nfsd4_client_tracking_exit(&init_net);
+ put_net(&init_net);
}
void
@@ -4540,8 +4787,108 @@ nfs4_state_shutdown(void)
destroy_workqueue(laundry_wq);
locks_end_grace(&nfsd4_manager);
nfs4_lock_state();
- nfs4_release_reclaim();
__nfs4_state_shutdown();
nfs4_unlock_state();
nfsd4_destroy_callback_queue();
}
+
+static void
+get_stateid(struct nfsd4_compound_state *cstate, stateid_t *stateid)
+{
+ if (HAS_STATE_ID(cstate, CURRENT_STATE_ID_FLAG) && CURRENT_STATEID(stateid))
+ memcpy(stateid, &cstate->current_stateid, sizeof(stateid_t));
+}
+
+static void
+put_stateid(struct nfsd4_compound_state *cstate, stateid_t *stateid)
+{
+ if (cstate->minorversion) {
+ memcpy(&cstate->current_stateid, stateid, sizeof(stateid_t));
+ SET_STATE_ID(cstate, CURRENT_STATE_ID_FLAG);
+ }
+}
+
+void
+clear_current_stateid(struct nfsd4_compound_state *cstate)
+{
+ CLEAR_STATE_ID(cstate, CURRENT_STATE_ID_FLAG);
+}
+
+/*
+ * functions to set current state id
+ */
+void
+nfsd4_set_opendowngradestateid(struct nfsd4_compound_state *cstate, struct nfsd4_open_downgrade *odp)
+{
+ put_stateid(cstate, &odp->od_stateid);
+}
+
+void
+nfsd4_set_openstateid(struct nfsd4_compound_state *cstate, struct nfsd4_open *open)
+{
+ put_stateid(cstate, &open->op_stateid);
+}
+
+void
+nfsd4_set_closestateid(struct nfsd4_compound_state *cstate, struct nfsd4_close *close)
+{
+ put_stateid(cstate, &close->cl_stateid);
+}
+
+void
+nfsd4_set_lockstateid(struct nfsd4_compound_state *cstate, struct nfsd4_lock *lock)
+{
+ put_stateid(cstate, &lock->lk_resp_stateid);
+}
+
+/*
+ * functions to consume current state id
+ */
+
+void
+nfsd4_get_opendowngradestateid(struct nfsd4_compound_state *cstate, struct nfsd4_open_downgrade *odp)
+{
+ get_stateid(cstate, &odp->od_stateid);
+}
+
+void
+nfsd4_get_delegreturnstateid(struct nfsd4_compound_state *cstate, struct nfsd4_delegreturn *drp)
+{
+ get_stateid(cstate, &drp->dr_stateid);
+}
+
+void
+nfsd4_get_freestateid(struct nfsd4_compound_state *cstate, struct nfsd4_free_stateid *fsp)
+{
+ get_stateid(cstate, &fsp->fr_stateid);
+}
+
+void
+nfsd4_get_setattrstateid(struct nfsd4_compound_state *cstate, struct nfsd4_setattr *setattr)
+{
+ get_stateid(cstate, &setattr->sa_stateid);
+}
+
+void
+nfsd4_get_closestateid(struct nfsd4_compound_state *cstate, struct nfsd4_close *close)
+{
+ get_stateid(cstate, &close->cl_stateid);
+}
+
+void
+nfsd4_get_lockustateid(struct nfsd4_compound_state *cstate, struct nfsd4_locku *locku)
+{
+ get_stateid(cstate, &locku->lu_stateid);
+}
+
+void
+nfsd4_get_readstateid(struct nfsd4_compound_state *cstate, struct nfsd4_read *read)
+{
+ get_stateid(cstate, &read->rd_stateid);
+}
+
+void
+nfsd4_get_writestateid(struct nfsd4_compound_state *cstate, struct nfsd4_write *write)
+{
+ get_stateid(cstate, &write->wr_stateid);
+}
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index b6fa792d6b8..bcd8904ab1e 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -133,22 +133,6 @@ xdr_error: \
} \
} while (0)
-static void save_buf(struct nfsd4_compoundargs *argp, struct nfsd4_saved_compoundargs *savep)
-{
- savep->p = argp->p;
- savep->end = argp->end;
- savep->pagelen = argp->pagelen;
- savep->pagelist = argp->pagelist;
-}
-
-static void restore_buf(struct nfsd4_compoundargs *argp, struct nfsd4_saved_compoundargs *savep)
-{
- argp->p = savep->p;
- argp->end = savep->end;
- argp->pagelen = savep->pagelen;
- argp->pagelist = savep->pagelist;
-}
-
static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes)
{
/* We want more bytes than seem to be available.
@@ -215,10 +199,9 @@ defer_free(struct nfsd4_compoundargs *argp,
static char *savemem(struct nfsd4_compoundargs *argp, __be32 *p, int nbytes)
{
if (p == argp->tmp) {
- p = kmalloc(nbytes, GFP_KERNEL);
+ p = kmemdup(argp->tmp, nbytes, GFP_KERNEL);
if (!p)
return NULL;
- memcpy(p, argp->tmp, nbytes);
} else {
BUG_ON(p != argp->tmpp);
argp->tmpp = NULL;
@@ -639,14 +622,18 @@ nfsd4_decode_lookup(struct nfsd4_compoundargs *argp, struct nfsd4_lookup *lookup
DECODE_TAIL;
}
-static __be32 nfsd4_decode_share_access(struct nfsd4_compoundargs *argp, u32 *x)
+static __be32 nfsd4_decode_share_access(struct nfsd4_compoundargs *argp, u32 *share_access, u32 *deleg_want, u32 *deleg_when)
{
__be32 *p;
u32 w;
READ_BUF(4);
READ32(w);
- *x = w;
+ *share_access = w & NFS4_SHARE_ACCESS_MASK;
+ *deleg_want = w & NFS4_SHARE_WANT_MASK;
+ if (deleg_when)
+ *deleg_when = w & NFS4_SHARE_WHEN_MASK;
+
switch (w & NFS4_SHARE_ACCESS_MASK) {
case NFS4_SHARE_ACCESS_READ:
case NFS4_SHARE_ACCESS_WRITE:
@@ -674,6 +661,9 @@ static __be32 nfsd4_decode_share_access(struct nfsd4_compoundargs *argp, u32 *x)
w &= ~NFS4_SHARE_WANT_MASK;
if (!w)
return nfs_ok;
+
+ if (!deleg_when) /* open_downgrade */
+ return nfserr_inval;
switch (w) {
case NFS4_SHARE_SIGNAL_DELEG_WHEN_RESRC_AVAIL:
case NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED:
@@ -720,6 +710,7 @@ static __be32
nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
{
DECODE_HEAD;
+ u32 dummy;
memset(open->op_bmval, 0, sizeof(open->op_bmval));
open->op_iattr.ia_valid = 0;
@@ -728,7 +719,9 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
/* seqid, share_access, share_deny, clientid, ownerlen */
READ_BUF(4);
READ32(open->op_seqid);
- status = nfsd4_decode_share_access(argp, &open->op_share_access);
+ /* decode, yet ignore deleg_when until supported */
+ status = nfsd4_decode_share_access(argp, &open->op_share_access,
+ &open->op_deleg_want, &dummy);
if (status)
goto xdr_error;
status = nfsd4_decode_share_deny(argp, &open->op_share_deny);
@@ -756,14 +749,14 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
goto out;
break;
case NFS4_CREATE_EXCLUSIVE:
- READ_BUF(8);
- COPYMEM(open->op_verf.data, 8);
+ READ_BUF(NFS4_VERIFIER_SIZE);
+ COPYMEM(open->op_verf.data, NFS4_VERIFIER_SIZE);
break;
case NFS4_CREATE_EXCLUSIVE4_1:
if (argp->minorversion < 1)
goto xdr_error;
- READ_BUF(8);
- COPYMEM(open->op_verf.data, 8);
+ READ_BUF(NFS4_VERIFIER_SIZE);
+ COPYMEM(open->op_verf.data, NFS4_VERIFIER_SIZE);
status = nfsd4_decode_fattr(argp, open->op_bmval,
&open->op_iattr, &open->op_acl);
if (status)
@@ -849,7 +842,8 @@ nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, struct nfsd4_open_d
return status;
READ_BUF(4);
READ32(open_down->od_seqid);
- status = nfsd4_decode_share_access(argp, &open_down->od_share_access);
+ status = nfsd4_decode_share_access(argp, &open_down->od_share_access,
+ &open_down->od_deleg_want, NULL);
if (status)
return status;
status = nfsd4_decode_share_deny(argp, &open_down->od_share_deny);
@@ -995,8 +989,8 @@ nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, struct nfsd4_setclient
{
DECODE_HEAD;
- READ_BUF(8);
- COPYMEM(setclientid->se_verf.data, 8);
+ READ_BUF(NFS4_VERIFIER_SIZE);
+ COPYMEM(setclientid->se_verf.data, NFS4_VERIFIER_SIZE);
status = nfsd4_decode_opaque(argp, &setclientid->se_name);
if (status)
@@ -1021,9 +1015,9 @@ nfsd4_decode_setclientid_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_s
{
DECODE_HEAD;
- READ_BUF(8 + sizeof(nfs4_verifier));
+ READ_BUF(8 + NFS4_VERIFIER_SIZE);
COPYMEM(&scd_c->sc_clientid, 8);
- COPYMEM(&scd_c->sc_confirm, sizeof(nfs4_verifier));
+ COPYMEM(&scd_c->sc_confirm, NFS4_VERIFIER_SIZE);
DECODE_TAIL;
}
@@ -1386,26 +1380,29 @@ nfsd4_decode_sequence(struct nfsd4_compoundargs *argp,
static __be32
nfsd4_decode_test_stateid(struct nfsd4_compoundargs *argp, struct nfsd4_test_stateid *test_stateid)
{
- unsigned int nbytes;
- stateid_t si;
int i;
- __be32 *p;
- __be32 status;
+ __be32 *p, status;
+ struct nfsd4_test_stateid_id *stateid;
READ_BUF(4);
test_stateid->ts_num_ids = ntohl(*p++);
- nbytes = test_stateid->ts_num_ids * sizeof(stateid_t);
- if (nbytes > (u32)((char *)argp->end - (char *)argp->p))
- goto xdr_error;
-
- test_stateid->ts_saved_args = argp;
- save_buf(argp, &test_stateid->ts_savedp);
+ INIT_LIST_HEAD(&test_stateid->ts_stateid_list);
for (i = 0; i < test_stateid->ts_num_ids; i++) {
- status = nfsd4_decode_stateid(argp, &si);
+ stateid = kmalloc(sizeof(struct nfsd4_test_stateid_id), GFP_KERNEL);
+ if (!stateid) {
+ status = PTR_ERR(stateid);
+ goto out;
+ }
+
+ defer_free(argp, kfree, stateid);
+ INIT_LIST_HEAD(&stateid->ts_id_list);
+ list_add_tail(&stateid->ts_id_list, &test_stateid->ts_stateid_list);
+
+ status = nfsd4_decode_stateid(argp, &stateid->ts_id_stateid);
if (status)
- return status;
+ goto out;
}
status = 0;
@@ -2662,8 +2659,8 @@ nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
__be32 *p;
if (!nfserr) {
- RESERVE_SPACE(8);
- WRITEMEM(commit->co_verf.data, 8);
+ RESERVE_SPACE(NFS4_VERIFIER_SIZE);
+ WRITEMEM(commit->co_verf.data, NFS4_VERIFIER_SIZE);
ADJUST_ARGS();
}
return nfserr;
@@ -2852,6 +2849,20 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op
WRITE32(0); /* XXX: is NULL principal ok? */
ADJUST_ARGS();
break;
+ case NFS4_OPEN_DELEGATE_NONE_EXT: /* 4.1 */
+ switch (open->op_why_no_deleg) {
+ case WND4_CONTENTION:
+ case WND4_RESOURCE:
+ RESERVE_SPACE(8);
+ WRITE32(open->op_why_no_deleg);
+ WRITE32(0); /* deleg signaling not supported yet */
+ break;
+ default:
+ RESERVE_SPACE(4);
+ WRITE32(open->op_why_no_deleg);
+ }
+ ADJUST_ARGS();
+ break;
default:
BUG();
}
@@ -3009,7 +3020,7 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4
if (resp->xbuf->page_len)
return nfserr_resource;
- RESERVE_SPACE(8); /* verifier */
+ RESERVE_SPACE(NFS4_VERIFIER_SIZE);
savep = p;
/* XXX: Following NFSv3, we ignore the READDIR verifier for now. */
@@ -3210,9 +3221,9 @@ nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr, struct n
__be32 *p;
if (!nfserr) {
- RESERVE_SPACE(8 + sizeof(nfs4_verifier));
+ RESERVE_SPACE(8 + NFS4_VERIFIER_SIZE);
WRITEMEM(&scd->se_clientid, 8);
- WRITEMEM(&scd->se_confirm, sizeof(nfs4_verifier));
+ WRITEMEM(&scd->se_confirm, NFS4_VERIFIER_SIZE);
ADJUST_ARGS();
}
else if (nfserr == nfserr_clid_inuse) {
@@ -3233,7 +3244,7 @@ nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_w
RESERVE_SPACE(16);
WRITE32(write->wr_bytes_written);
WRITE32(write->wr_how_written);
- WRITEMEM(write->wr_verifier.data, 8);
+ WRITEMEM(write->wr_verifier.data, NFS4_VERIFIER_SIZE);
ADJUST_ARGS();
}
return nfserr;
@@ -3392,30 +3403,17 @@ __be32
nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, int nfserr,
struct nfsd4_test_stateid *test_stateid)
{
- struct nfsd4_compoundargs *argp;
- struct nfs4_client *cl = resp->cstate.session->se_client;
- stateid_t si;
+ struct nfsd4_test_stateid_id *stateid, *next;
__be32 *p;
- int i;
- int valid;
-
- restore_buf(test_stateid->ts_saved_args, &test_stateid->ts_savedp);
- argp = test_stateid->ts_saved_args;
- RESERVE_SPACE(4);
+ RESERVE_SPACE(4 + (4 * test_stateid->ts_num_ids));
*p++ = htonl(test_stateid->ts_num_ids);
- resp->p = p;
- nfs4_lock_state();
- for (i = 0; i < test_stateid->ts_num_ids; i++) {
- nfsd4_decode_stateid(argp, &si);
- valid = nfs4_validate_stateid(cl, &si);
- RESERVE_SPACE(4);
- *p++ = htonl(valid);
- resp->p = p;
+ list_for_each_entry_safe(stateid, next, &test_stateid->ts_stateid_list, ts_id_list) {
+ *p++ = htonl(stateid->ts_id_status);
}
- nfs4_unlock_state();
+ ADJUST_ARGS();
return nfserr;
}
@@ -3533,7 +3531,7 @@ int nfsd4_check_resp_size(struct nfsd4_compoundres *resp, u32 pad)
if (length > session->se_fchannel.maxresp_sz)
return nfserr_rep_too_big;
- if (slot->sl_cachethis == 1 &&
+ if ((slot->sl_flags & NFSD4_SLOT_CACHETHIS) &&
length > session->se_fchannel.maxresp_cached)
return nfserr_rep_too_big_to_cache;
@@ -3657,8 +3655,7 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo
if (nfsd4_has_session(cs)) {
if (cs->status != nfserr_replay_cache) {
nfsd4_store_cache_entry(resp);
- dprintk("%s: SET SLOT STATE TO AVAILABLE\n", __func__);
- cs->slot->sl_inuse = false;
+ cs->slot->sl_flags &= ~NFSD4_SLOT_INUSE;
}
/* Renew the clientid on success and on replay */
release_session_client(cs->session);
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index bb4a11d58a5..2c53be6d357 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -13,11 +13,14 @@
#include <linux/sunrpc/clnt.h>
#include <linux/sunrpc/gss_api.h>
#include <linux/sunrpc/gss_krb5_enctypes.h>
+#include <linux/sunrpc/rpc_pipe_fs.h>
#include <linux/module.h>
#include "idmap.h"
#include "nfsd.h"
#include "cache.h"
+#include "fault_inject.h"
+#include "netns.h"
/*
* We have a single directory with several nodes in it.
@@ -222,7 +225,7 @@ static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size)
if (qword_get(&buf, fo_path, size) < 0)
return -EINVAL;
- if (rpc_pton(fo_path, size, sap, salen) == 0)
+ if (rpc_pton(&init_net, fo_path, size, sap, salen) == 0)
return -EINVAL;
return nlmsvc_unlock_all_by_ip(sap);
@@ -721,7 +724,7 @@ static ssize_t __write_ports_addxprt(char *buf)
nfsd_serv->sv_nrthreads--;
return 0;
out_close:
- xprt = svc_find_xprt(nfsd_serv, transport, PF_INET, port);
+ xprt = svc_find_xprt(nfsd_serv, transport, &init_net, PF_INET, port);
if (xprt != NULL) {
svc_close_xprt(xprt);
svc_xprt_put(xprt);
@@ -747,7 +750,7 @@ static ssize_t __write_ports_delxprt(char *buf)
if (port < 1 || port > USHRT_MAX || nfsd_serv == NULL)
return -EINVAL;
- xprt = svc_find_xprt(nfsd_serv, transport, AF_UNSPEC, port);
+ xprt = svc_find_xprt(nfsd_serv, transport, &init_net, AF_UNSPEC, port);
if (xprt == NULL)
return -ENOTCONN;
@@ -1123,14 +1126,30 @@ static int create_proc_exports_entry(void)
}
#endif
+int nfsd_net_id;
+static struct pernet_operations nfsd_net_ops = {
+ .id = &nfsd_net_id,
+ .size = sizeof(struct nfsd_net),
+};
+
static int __init init_nfsd(void)
{
int retval;
printk(KERN_INFO "Installing knfsd (copyright (C) 1996 okir@monad.swb.de).\n");
- retval = nfs4_state_init(); /* nfs4 locking state */
+ retval = register_cld_notifier();
if (retval)
return retval;
+ retval = register_pernet_subsys(&nfsd_net_ops);
+ if (retval < 0)
+ goto out_unregister_notifier;
+ retval = nfsd4_init_slabs();
+ if (retval)
+ goto out_unregister_pernet;
+ nfs4_state_init();
+ retval = nfsd_fault_inject_init(); /* nfsd fault injection controls */
+ if (retval)
+ goto out_free_slabs;
nfsd_stat_init(); /* Statistics */
retval = nfsd_reply_cache_init();
if (retval)
@@ -1161,7 +1180,13 @@ out_free_cache:
nfsd_reply_cache_shutdown();
out_free_stat:
nfsd_stat_shutdown();
+ nfsd_fault_inject_cleanup();
+out_free_slabs:
nfsd4_free_slabs();
+out_unregister_pernet:
+ unregister_pernet_subsys(&nfsd_net_ops);
+out_unregister_notifier:
+ unregister_cld_notifier();
return retval;
}
@@ -1175,7 +1200,10 @@ static void __exit exit_nfsd(void)
nfsd_lockd_shutdown();
nfsd_idmap_shutdown();
nfsd4_free_slabs();
+ nfsd_fault_inject_cleanup();
unregister_filesystem(&nfsd_fs_type);
+ unregister_pernet_subsys(&nfsd_net_ops);
+ unregister_cld_notifier();
}
MODULE_AUTHOR("Olaf Kirch <okir@monad.swb.de>");
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 58134a23fdf..1671429ffa6 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -104,14 +104,16 @@ static inline int nfsd_v4client(struct svc_rqst *rq)
*/
#ifdef CONFIG_NFSD_V4
extern unsigned int max_delegations;
-int nfs4_state_init(void);
+void nfs4_state_init(void);
+int nfsd4_init_slabs(void);
void nfsd4_free_slabs(void);
int nfs4_state_start(void);
void nfs4_state_shutdown(void);
void nfs4_reset_lease(time_t leasetime);
int nfs4_reset_recoverydir(char *recdir);
#else
-static inline int nfs4_state_init(void) { return 0; }
+static inline void nfs4_state_init(void) { }
+static inline int nfsd4_init_slabs(void) { return 0; }
static inline void nfsd4_free_slabs(void) { }
static inline int nfs4_state_start(void) { return 0; }
static inline void nfs4_state_shutdown(void) { }
@@ -338,15 +340,15 @@ static inline u32 nfsd_suppattrs2(u32 minorversion)
}
/* These will return ERR_INVAL if specified in GETATTR or READDIR. */
-#define NFSD_WRITEONLY_ATTRS_WORD1 \
-(FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET)
+#define NFSD_WRITEONLY_ATTRS_WORD1 \
+ (FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET)
/* These are the only attrs allowed in CREATE/OPEN/SETATTR. */
-#define NFSD_WRITEABLE_ATTRS_WORD0 \
-(FATTR4_WORD0_SIZE | FATTR4_WORD0_ACL )
-#define NFSD_WRITEABLE_ATTRS_WORD1 \
-(FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP \
- | FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET)
+#define NFSD_WRITEABLE_ATTRS_WORD0 \
+ (FATTR4_WORD0_SIZE | FATTR4_WORD0_ACL)
+#define NFSD_WRITEABLE_ATTRS_WORD1 \
+ (FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP \
+ | FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET)
#define NFSD_WRITEABLE_ATTRS_WORD2 0
#define NFSD_SUPPATTR_EXCLCREAT_WORD0 \
@@ -362,12 +364,17 @@ static inline u32 nfsd_suppattrs2(u32 minorversion)
NFSD_WRITEABLE_ATTRS_WORD2
extern int nfsd4_is_junction(struct dentry *dentry);
-#else
+extern int register_cld_notifier(void);
+extern void unregister_cld_notifier(void);
+#else /* CONFIG_NFSD_V4 */
static inline int nfsd4_is_junction(struct dentry *dentry)
{
return 0;
}
+#define register_cld_notifier() 0
+#define unregister_cld_notifier() do { } while(0)
+
#endif /* CONFIG_NFSD_V4 */
#endif /* LINUX_NFSD_NFSD_H */
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index eda7d7e55e0..28dfad39f0c 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -251,13 +251,13 @@ static void nfsd_shutdown(void)
nfsd_up = false;
}
-static void nfsd_last_thread(struct svc_serv *serv)
+static void nfsd_last_thread(struct svc_serv *serv, struct net *net)
{
/* When last nfsd thread exits we need to do some clean-up */
nfsd_serv = NULL;
nfsd_shutdown();
- svc_rpcb_cleanup(serv);
+ svc_rpcb_cleanup(serv, net);
printk(KERN_WARNING "nfsd: last server has exited, flushing export "
"cache\n");
@@ -307,33 +307,37 @@ static void set_max_drc(void)
dprintk("%s nfsd_drc_max_mem %u \n", __func__, nfsd_drc_max_mem);
}
-int nfsd_create_serv(void)
+static int nfsd_get_default_max_blksize(void)
{
- int err = 0;
+ struct sysinfo i;
+ unsigned long long target;
+ unsigned long ret;
+
+ si_meminfo(&i);
+ target = (i.totalram - i.totalhigh) << PAGE_SHIFT;
+ /*
+ * Aim for 1/4096 of memory per thread This gives 1MB on 4Gig
+ * machines, but only uses 32K on 128M machines. Bottom out at
+ * 8K on 32M and smaller. Of course, this is only a default.
+ */
+ target >>= 12;
+
+ ret = NFSSVC_MAXBLKSIZE;
+ while (ret > target && ret >= 8*1024*2)
+ ret /= 2;
+ return ret;
+}
+int nfsd_create_serv(void)
+{
WARN_ON(!mutex_is_locked(&nfsd_mutex));
if (nfsd_serv) {
svc_get(nfsd_serv);
return 0;
}
- if (nfsd_max_blksize == 0) {
- /* choose a suitable default */
- struct sysinfo i;
- si_meminfo(&i);
- /* Aim for 1/4096 of memory per thread
- * This gives 1MB on 4Gig machines
- * But only uses 32K on 128M machines.
- * Bottom out at 8K on 32M and smaller.
- * Of course, this is only a default.
- */
- nfsd_max_blksize = NFSSVC_MAXBLKSIZE;
- i.totalram <<= PAGE_SHIFT - 12;
- while (nfsd_max_blksize > i.totalram &&
- nfsd_max_blksize >= 8*1024*2)
- nfsd_max_blksize /= 2;
- }
+ if (nfsd_max_blksize == 0)
+ nfsd_max_blksize = nfsd_get_default_max_blksize();
nfsd_reset_versions();
-
nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize,
nfsd_last_thread, nfsd, THIS_MODULE);
if (nfsd_serv == NULL)
@@ -341,7 +345,7 @@ int nfsd_create_serv(void)
set_max_drc();
do_gettimeofday(&nfssvc_boot); /* record boot time */
- return err;
+ return 0;
}
int nfsd_nrpools(void)
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index a3cf38476a1..89ab137d379 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -128,12 +128,14 @@ static inline struct nfs4_delegation *delegstateid(struct nfs4_stid *s)
(NFSD_CACHE_SIZE_SLOTS_PER_SESSION * NFSD_SLOT_CACHE_SIZE)
struct nfsd4_slot {
- bool sl_inuse;
- bool sl_cachethis;
- u16 sl_opcnt;
u32 sl_seqid;
__be32 sl_status;
u32 sl_datalen;
+ u16 sl_opcnt;
+#define NFSD4_SLOT_INUSE (1 << 0)
+#define NFSD4_SLOT_CACHETHIS (1 << 1)
+#define NFSD4_SLOT_INITIALIZED (1 << 2)
+ u8 sl_flags;
char sl_data[];
};
@@ -196,18 +198,7 @@ struct nfsd4_session {
struct nfsd4_slot *se_slots[]; /* forward channel slots */
};
-static inline void
-nfsd4_put_session(struct nfsd4_session *ses)
-{
- extern void free_session(struct kref *kref);
- kref_put(&ses->se_ref, free_session);
-}
-
-static inline void
-nfsd4_get_session(struct nfsd4_session *ses)
-{
- kref_get(&ses->se_ref);
-}
+extern void nfsd4_put_session(struct nfsd4_session *ses);
/* formatted contents of nfs4_sessionid */
struct nfsd4_sessionid {
@@ -245,14 +236,17 @@ struct nfs4_client {
struct svc_cred cl_cred; /* setclientid principal */
clientid_t cl_clientid; /* generated by server */
nfs4_verifier cl_confirm; /* generated by server */
- u32 cl_firststate; /* recovery dir creation */
u32 cl_minorversion;
/* for v4.0 and v4.1 callbacks: */
struct nfs4_cb_conn cl_cb_conn;
-#define NFSD4_CLIENT_CB_UPDATE 1
-#define NFSD4_CLIENT_KILL 2
- unsigned long cl_cb_flags;
+#define NFSD4_CLIENT_CB_UPDATE (0)
+#define NFSD4_CLIENT_CB_KILL (1)
+#define NFSD4_CLIENT_STABLE (2) /* client on stable storage */
+#define NFSD4_CLIENT_RECLAIM_COMPLETE (3) /* reclaim_complete done */
+#define NFSD4_CLIENT_CB_FLAG_MASK (1 << NFSD4_CLIENT_CB_UPDATE | \
+ 1 << NFSD4_CLIENT_CB_KILL)
+ unsigned long cl_flags;
struct rpc_clnt *cl_cb_client;
u32 cl_cb_ident;
#define NFSD4_CB_UP 0
@@ -366,6 +360,7 @@ struct nfs4_openowner {
struct nfs4_lockowner {
struct nfs4_stateowner lo_owner; /* must be first element */
+ struct list_head lo_owner_ino_hash; /* hash by owner,file */
struct list_head lo_perstateid; /* for lockowners only */
struct list_head lo_list; /* for temporary uses */
};
@@ -462,6 +457,8 @@ extern __be32 nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
extern void nfs4_lock_state(void);
extern void nfs4_unlock_state(void);
extern int nfs4_in_grace(void);
+extern void nfs4_release_reclaim(void);
+extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(struct nfs4_client *crp);
extern __be32 nfs4_check_open_reclaim(clientid_t *clid);
extern void nfs4_free_openowner(struct nfs4_openowner *);
extern void nfs4_free_lockowner(struct nfs4_lockowner *);
@@ -476,16 +473,17 @@ extern void nfsd4_destroy_callback_queue(void);
extern void nfsd4_shutdown_callback(struct nfs4_client *);
extern void nfs4_put_delegation(struct nfs4_delegation *dp);
extern __be32 nfs4_make_rec_clidname(char *clidname, struct xdr_netobj *clname);
-extern void nfsd4_init_recdir(void);
-extern int nfsd4_recdir_load(void);
-extern void nfsd4_shutdown_recdir(void);
extern int nfs4_client_to_reclaim(const char *name);
extern int nfs4_has_reclaimed_state(const char *name, bool use_exchange_id);
-extern void nfsd4_recdir_purge_old(void);
-extern int nfsd4_create_clid_dir(struct nfs4_client *clp);
-extern void nfsd4_remove_clid_dir(struct nfs4_client *clp);
extern void release_session_client(struct nfsd4_session *);
extern __be32 nfs4_validate_stateid(struct nfs4_client *, stateid_t *);
extern void nfsd4_purge_closed_stateid(struct nfs4_stateowner *);
+/* nfs4recover operations */
+extern int nfsd4_client_tracking_init(struct net *net);
+extern void nfsd4_client_tracking_exit(struct net *net);
+extern void nfsd4_client_record_create(struct nfs4_client *clp);
+extern void nfsd4_client_record_remove(struct nfs4_client *clp);
+extern int nfsd4_client_record_check(struct nfs4_client *clp);
+extern void nfsd4_record_grace_done(struct net *net, time_t boot_time);
#endif /* NFSD4_STATE_H */
diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c
index a2e2402b2af..6d4521feb6e 100644
--- a/fs/nfsd/stats.c
+++ b/fs/nfsd/stats.c
@@ -25,6 +25,7 @@
#include <linux/module.h>
#include <linux/sunrpc/stats.h>
#include <linux/nfsd/stats.h>
+#include <net/net_namespace.h>
#include "nfsd.h"
@@ -94,11 +95,11 @@ static const struct file_operations nfsd_proc_fops = {
void
nfsd_stat_init(void)
{
- svc_proc_register(&nfsd_svcstats, &nfsd_proc_fops);
+ svc_proc_register(&init_net, &nfsd_svcstats, &nfsd_proc_fops);
}
void
nfsd_stat_shutdown(void)
{
- svc_proc_unregister("nfsd");
+ svc_proc_unregister(&init_net, "nfsd");
}
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index d25a723b68a..296d671654d 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -594,8 +594,19 @@ nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, struct nfs4_ac
return error;
}
-#define NFSD_XATTR_JUNCTION_PREFIX XATTR_TRUSTED_PREFIX "junction."
-#define NFSD_XATTR_JUNCTION_TYPE NFSD_XATTR_JUNCTION_PREFIX "type"
+/*
+ * NFS junction information is stored in an extended attribute.
+ */
+#define NFSD_JUNCTION_XATTR_NAME XATTR_TRUSTED_PREFIX "junction.nfs"
+
+/**
+ * nfsd4_is_junction - Test if an object could be an NFS junction
+ *
+ * @dentry: object to test
+ *
+ * Returns 1 if "dentry" appears to contain NFS junction information.
+ * Otherwise 0 is returned.
+ */
int nfsd4_is_junction(struct dentry *dentry)
{
struct inode *inode = dentry->d_inode;
@@ -606,7 +617,7 @@ int nfsd4_is_junction(struct dentry *dentry)
return 0;
if (!(inode->i_mode & S_ISVTX))
return 0;
- if (vfs_getxattr(dentry, NFSD_XATTR_JUNCTION_TYPE, NULL, 0) <= 0)
+ if (vfs_getxattr(dentry, NFSD_JUNCTION_XATTR_NAME, NULL, 0) <= 0)
return 0;
return 1;
}
@@ -726,12 +737,13 @@ static int nfsd_open_break_lease(struct inode *inode, int access)
/*
* Open an existing file or directory.
- * The access argument indicates the type of open (read/write/lock)
+ * The may_flags argument indicates the type of open (read/write/lock)
+ * and additional flags.
* N.B. After this call fhp needs an fh_put
*/
__be32
nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
- int access, struct file **filp)
+ int may_flags, struct file **filp)
{
struct dentry *dentry;
struct inode *inode;
@@ -746,7 +758,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
* and (hopefully) checked permission - so allow OWNER_OVERRIDE
* in case a chmod has now revoked permission.
*/
- err = fh_verify(rqstp, fhp, type, access | NFSD_MAY_OWNER_OVERRIDE);
+ err = fh_verify(rqstp, fhp, type, may_flags | NFSD_MAY_OWNER_OVERRIDE);
if (err)
goto out;
@@ -757,7 +769,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
* or any access when mandatory locking enabled
*/
err = nfserr_perm;
- if (IS_APPEND(inode) && (access & NFSD_MAY_WRITE))
+ if (IS_APPEND(inode) && (may_flags & NFSD_MAY_WRITE))
goto out;
/*
* We must ignore files (but only files) which might have mandatory
@@ -770,12 +782,12 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
if (!inode->i_fop)
goto out;
- host_err = nfsd_open_break_lease(inode, access);
+ host_err = nfsd_open_break_lease(inode, may_flags);
if (host_err) /* NOMEM or WOULDBLOCK */
goto out_nfserr;
- if (access & NFSD_MAY_WRITE) {
- if (access & NFSD_MAY_READ)
+ if (may_flags & NFSD_MAY_WRITE) {
+ if (may_flags & NFSD_MAY_READ)
flags = O_RDWR|O_LARGEFILE;
else
flags = O_WRONLY|O_LARGEFILE;
@@ -784,8 +796,15 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
flags, current_cred());
if (IS_ERR(*filp))
host_err = PTR_ERR(*filp);
- else
- host_err = ima_file_check(*filp, access);
+ else {
+ host_err = ima_file_check(*filp, may_flags);
+
+ if (may_flags & NFSD_MAY_64BIT_COOKIE)
+ (*filp)->f_mode |= FMODE_64BITHASH;
+ else
+ (*filp)->f_mode |= FMODE_32BITHASH;
+ }
+
out_nfserr:
err = nfserrno(host_err);
out:
@@ -1530,30 +1549,31 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
__be32
nfsd_readlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *buf, int *lenp)
{
- struct dentry *dentry;
struct inode *inode;
mm_segment_t oldfs;
__be32 err;
int host_err;
+ struct path path;
err = fh_verify(rqstp, fhp, S_IFLNK, NFSD_MAY_NOP);
if (err)
goto out;
- dentry = fhp->fh_dentry;
- inode = dentry->d_inode;
+ path.mnt = fhp->fh_export->ex_path.mnt;
+ path.dentry = fhp->fh_dentry;
+ inode = path.dentry->d_inode;
err = nfserr_inval;
if (!inode->i_op->readlink)
goto out;
- touch_atime(fhp->fh_export->ex_path.mnt, dentry);
+ touch_atime(&path);
/* N.B. Why does this call need a get_fs()??
* Remove the set_fs and watch the fireworks:-) --okir
*/
oldfs = get_fs(); set_fs(KERNEL_DS);
- host_err = inode->i_op->readlink(dentry, buf, *lenp);
+ host_err = inode->i_op->readlink(path.dentry, buf, *lenp);
set_fs(oldfs);
if (host_err < 0)
@@ -2009,8 +2029,13 @@ nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp,
__be32 err;
struct file *file;
loff_t offset = *offsetp;
+ int may_flags = NFSD_MAY_READ;
+
+ /* NFSv2 only supports 32 bit cookies */
+ if (rqstp->rq_vers > 2)
+ may_flags |= NFSD_MAY_64BIT_COOKIE;
- err = nfsd_open(rqstp, fhp, S_IFDIR, NFSD_MAY_READ, &file);
+ err = nfsd_open(rqstp, fhp, S_IFDIR, may_flags, &file);
if (err)
goto out;
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index 1dcd238e11a..ec0611b2b73 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -27,6 +27,8 @@
#define NFSD_MAY_BYPASS_GSS 0x400
#define NFSD_MAY_READ_IF_EXEC 0x800
+#define NFSD_MAY_64BIT_COOKIE 0x1000 /* 64 bit readdir cookies for >= NFSv3 */
+
#define NFSD_MAY_CREATE (NFSD_MAY_EXEC|NFSD_MAY_WRITE)
#define NFSD_MAY_REMOVE (NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC)
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 2364747ee97..1b3501598ab 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -43,6 +43,13 @@
#define NFSD4_MAX_TAGLEN 128
#define XDR_LEN(n) (((n) + 3) & ~3)
+#define CURRENT_STATE_ID_FLAG (1<<0)
+#define SAVED_STATE_ID_FLAG (1<<1)
+
+#define SET_STATE_ID(c, f) ((c)->sid_flags |= (f))
+#define HAS_STATE_ID(c, f) ((c)->sid_flags & (f))
+#define CLEAR_STATE_ID(c, f) ((c)->sid_flags &= ~(f))
+
struct nfsd4_compound_state {
struct svc_fh current_fh;
struct svc_fh save_fh;
@@ -54,6 +61,10 @@ struct nfsd4_compound_state {
size_t iovlen;
u32 minorversion;
u32 status;
+ stateid_t current_stateid;
+ stateid_t save_stateid;
+ /* to indicate current and saved state id presents */
+ u32 sid_flags;
};
static inline bool nfsd4_has_session(struct nfsd4_compound_state *cs)
@@ -212,16 +223,19 @@ struct nfsd4_open {
struct xdr_netobj op_fname; /* request - everything but CLAIM_PREV */
u32 op_delegate_type; /* request - CLAIM_PREV only */
stateid_t op_delegate_stateid; /* request - response */
+ u32 op_why_no_deleg; /* response - DELEG_NONE_EXT only */
u32 op_create; /* request */
u32 op_createmode; /* request */
u32 op_bmval[3]; /* request */
struct iattr iattr; /* UNCHECKED4, GUARDED4, EXCLUSIVE4_1 */
- nfs4_verifier verf; /* EXCLUSIVE4 */
+ nfs4_verifier op_verf __attribute__((aligned(32)));
+ /* EXCLUSIVE4 */
clientid_t op_clientid; /* request */
struct xdr_netobj op_owner; /* request */
u32 op_seqid; /* request */
u32 op_share_access; /* request */
u32 op_share_deny; /* request */
+ u32 op_deleg_want; /* request */
stateid_t op_stateid; /* response */
u32 op_recall; /* recall */
struct nfsd4_change_info op_cinfo; /* response */
@@ -234,7 +248,6 @@ struct nfsd4_open {
struct nfs4_acl *op_acl;
};
#define op_iattr iattr
-#define op_verf verf
struct nfsd4_open_confirm {
stateid_t oc_req_stateid /* request */;
@@ -245,8 +258,9 @@ struct nfsd4_open_confirm {
struct nfsd4_open_downgrade {
stateid_t od_stateid;
u32 od_seqid;
- u32 od_share_access;
- u32 od_share_deny;
+ u32 od_share_access; /* request */
+ u32 od_deleg_want; /* request */
+ u32 od_share_deny; /* request */
};
@@ -343,10 +357,15 @@ struct nfsd4_saved_compoundargs {
struct page **pagelist;
};
+struct nfsd4_test_stateid_id {
+ __be32 ts_id_status;
+ stateid_t ts_id_stateid;
+ struct list_head ts_id_list;
+};
+
struct nfsd4_test_stateid {
__be32 ts_num_ids;
- struct nfsd4_compoundargs *ts_saved_args;
- struct nfsd4_saved_compoundargs ts_savedp;
+ struct list_head ts_stateid_list;
};
struct nfsd4_free_stateid {
@@ -503,7 +522,8 @@ static inline bool nfsd4_is_solo_sequence(struct nfsd4_compoundres *resp)
static inline bool nfsd4_not_cached(struct nfsd4_compoundres *resp)
{
- return !resp->cstate.slot->sl_cachethis || nfsd4_is_solo_sequence(resp);
+ return !(resp->cstate.slot->sl_flags & NFSD4_SLOT_CACHETHIS)
+ || nfsd4_is_solo_sequence(resp);
}
#define NFS4_SVC_XDRSIZE sizeof(struct nfsd4_compoundargs)
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index c9b342c8b50..dab5c4c6dfa 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -218,11 +218,11 @@ int nilfs_cpfile_get_checkpoint(struct inode *cpfile,
kaddr, 1);
mark_buffer_dirty(cp_bh);
- kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
+ kaddr = kmap_atomic(header_bh->b_page);
header = nilfs_cpfile_block_get_header(cpfile, header_bh,
kaddr);
le64_add_cpu(&header->ch_ncheckpoints, 1);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
mark_buffer_dirty(header_bh);
nilfs_mdt_mark_dirty(cpfile);
}
@@ -313,7 +313,7 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile,
continue;
}
- kaddr = kmap_atomic(cp_bh->b_page, KM_USER0);
+ kaddr = kmap_atomic(cp_bh->b_page);
cp = nilfs_cpfile_block_get_checkpoint(
cpfile, cno, cp_bh, kaddr);
nicps = 0;
@@ -334,7 +334,7 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile,
cpfile, cp_bh, kaddr, nicps);
if (count == 0) {
/* make hole */
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
brelse(cp_bh);
ret =
nilfs_cpfile_delete_checkpoint_block(
@@ -349,18 +349,18 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile,
}
}
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
brelse(cp_bh);
}
if (tnicps > 0) {
- kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
+ kaddr = kmap_atomic(header_bh->b_page);
header = nilfs_cpfile_block_get_header(cpfile, header_bh,
kaddr);
le64_add_cpu(&header->ch_ncheckpoints, -(u64)tnicps);
mark_buffer_dirty(header_bh);
nilfs_mdt_mark_dirty(cpfile);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
}
brelse(header_bh);
@@ -408,7 +408,7 @@ static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop,
continue; /* skip hole */
}
- kaddr = kmap_atomic(bh->b_page, KM_USER0);
+ kaddr = kmap_atomic(bh->b_page);
cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr);
for (i = 0; i < ncps && n < nci; i++, cp = (void *)cp + cpsz) {
if (!nilfs_checkpoint_invalid(cp)) {
@@ -418,7 +418,7 @@ static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop,
n++;
}
}
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
brelse(bh);
}
@@ -451,10 +451,10 @@ static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop,
ret = nilfs_cpfile_get_header_block(cpfile, &bh);
if (ret < 0)
goto out;
- kaddr = kmap_atomic(bh->b_page, KM_USER0);
+ kaddr = kmap_atomic(bh->b_page);
header = nilfs_cpfile_block_get_header(cpfile, bh, kaddr);
curr = le64_to_cpu(header->ch_snapshot_list.ssl_next);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
brelse(bh);
if (curr == 0) {
ret = 0;
@@ -472,7 +472,7 @@ static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop,
ret = 0; /* No snapshots (started from a hole block) */
goto out;
}
- kaddr = kmap_atomic(bh->b_page, KM_USER0);
+ kaddr = kmap_atomic(bh->b_page);
while (n < nci) {
cp = nilfs_cpfile_block_get_checkpoint(cpfile, curr, bh, kaddr);
curr = ~(__u64)0; /* Terminator */
@@ -488,7 +488,7 @@ static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop,
next_blkoff = nilfs_cpfile_get_blkoff(cpfile, next);
if (curr_blkoff != next_blkoff) {
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
brelse(bh);
ret = nilfs_cpfile_get_checkpoint_block(cpfile, next,
0, &bh);
@@ -496,12 +496,12 @@ static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop,
WARN_ON(ret == -ENOENT);
goto out;
}
- kaddr = kmap_atomic(bh->b_page, KM_USER0);
+ kaddr = kmap_atomic(bh->b_page);
}
curr = next;
curr_blkoff = next_blkoff;
}
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
brelse(bh);
*cnop = curr;
ret = n;
@@ -592,24 +592,24 @@ static int nilfs_cpfile_set_snapshot(struct inode *cpfile, __u64 cno)
ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh);
if (ret < 0)
goto out_sem;
- kaddr = kmap_atomic(cp_bh->b_page, KM_USER0);
+ kaddr = kmap_atomic(cp_bh->b_page);
cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);
if (nilfs_checkpoint_invalid(cp)) {
ret = -ENOENT;
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
goto out_cp;
}
if (nilfs_checkpoint_snapshot(cp)) {
ret = 0;
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
goto out_cp;
}
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
ret = nilfs_cpfile_get_header_block(cpfile, &header_bh);
if (ret < 0)
goto out_cp;
- kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
+ kaddr = kmap_atomic(header_bh->b_page);
header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr);
list = &header->ch_snapshot_list;
curr_bh = header_bh;
@@ -621,13 +621,13 @@ static int nilfs_cpfile_set_snapshot(struct inode *cpfile, __u64 cno)
prev_blkoff = nilfs_cpfile_get_blkoff(cpfile, prev);
curr = prev;
if (curr_blkoff != prev_blkoff) {
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
brelse(curr_bh);
ret = nilfs_cpfile_get_checkpoint_block(cpfile, curr,
0, &curr_bh);
if (ret < 0)
goto out_header;
- kaddr = kmap_atomic(curr_bh->b_page, KM_USER0);
+ kaddr = kmap_atomic(curr_bh->b_page);
}
curr_blkoff = prev_blkoff;
cp = nilfs_cpfile_block_get_checkpoint(
@@ -635,7 +635,7 @@ static int nilfs_cpfile_set_snapshot(struct inode *cpfile, __u64 cno)
list = &cp->cp_snapshot_list;
prev = le64_to_cpu(list->ssl_prev);
}
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
if (prev != 0) {
ret = nilfs_cpfile_get_checkpoint_block(cpfile, prev, 0,
@@ -647,29 +647,29 @@ static int nilfs_cpfile_set_snapshot(struct inode *cpfile, __u64 cno)
get_bh(prev_bh);
}
- kaddr = kmap_atomic(curr_bh->b_page, KM_USER0);
+ kaddr = kmap_atomic(curr_bh->b_page);
list = nilfs_cpfile_block_get_snapshot_list(
cpfile, curr, curr_bh, kaddr);
list->ssl_prev = cpu_to_le64(cno);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
- kaddr = kmap_atomic(cp_bh->b_page, KM_USER0);
+ kaddr = kmap_atomic(cp_bh->b_page);
cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);
cp->cp_snapshot_list.ssl_next = cpu_to_le64(curr);
cp->cp_snapshot_list.ssl_prev = cpu_to_le64(prev);
nilfs_checkpoint_set_snapshot(cp);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
- kaddr = kmap_atomic(prev_bh->b_page, KM_USER0);
+ kaddr = kmap_atomic(prev_bh->b_page);
list = nilfs_cpfile_block_get_snapshot_list(
cpfile, prev, prev_bh, kaddr);
list->ssl_next = cpu_to_le64(cno);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
- kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
+ kaddr = kmap_atomic(header_bh->b_page);
header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr);
le64_add_cpu(&header->ch_nsnapshots, 1);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
mark_buffer_dirty(prev_bh);
mark_buffer_dirty(curr_bh);
@@ -710,23 +710,23 @@ static int nilfs_cpfile_clear_snapshot(struct inode *cpfile, __u64 cno)
ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh);
if (ret < 0)
goto out_sem;
- kaddr = kmap_atomic(cp_bh->b_page, KM_USER0);
+ kaddr = kmap_atomic(cp_bh->b_page);
cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);
if (nilfs_checkpoint_invalid(cp)) {
ret = -ENOENT;
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
goto out_cp;
}
if (!nilfs_checkpoint_snapshot(cp)) {
ret = 0;
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
goto out_cp;
}
list = &cp->cp_snapshot_list;
next = le64_to_cpu(list->ssl_next);
prev = le64_to_cpu(list->ssl_prev);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
ret = nilfs_cpfile_get_header_block(cpfile, &header_bh);
if (ret < 0)
@@ -750,29 +750,29 @@ static int nilfs_cpfile_clear_snapshot(struct inode *cpfile, __u64 cno)
get_bh(prev_bh);
}
- kaddr = kmap_atomic(next_bh->b_page, KM_USER0);
+ kaddr = kmap_atomic(next_bh->b_page);
list = nilfs_cpfile_block_get_snapshot_list(
cpfile, next, next_bh, kaddr);
list->ssl_prev = cpu_to_le64(prev);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
- kaddr = kmap_atomic(prev_bh->b_page, KM_USER0);
+ kaddr = kmap_atomic(prev_bh->b_page);
list = nilfs_cpfile_block_get_snapshot_list(
cpfile, prev, prev_bh, kaddr);
list->ssl_next = cpu_to_le64(next);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
- kaddr = kmap_atomic(cp_bh->b_page, KM_USER0);
+ kaddr = kmap_atomic(cp_bh->b_page);
cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);
cp->cp_snapshot_list.ssl_next = cpu_to_le64(0);
cp->cp_snapshot_list.ssl_prev = cpu_to_le64(0);
nilfs_checkpoint_clear_snapshot(cp);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
- kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
+ kaddr = kmap_atomic(header_bh->b_page);
header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr);
le64_add_cpu(&header->ch_nsnapshots, -1);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
mark_buffer_dirty(next_bh);
mark_buffer_dirty(prev_bh);
@@ -829,13 +829,13 @@ int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno)
ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &bh);
if (ret < 0)
goto out;
- kaddr = kmap_atomic(bh->b_page, KM_USER0);
+ kaddr = kmap_atomic(bh->b_page);
cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr);
if (nilfs_checkpoint_invalid(cp))
ret = -ENOENT;
else
ret = nilfs_checkpoint_snapshot(cp);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
brelse(bh);
out:
@@ -912,12 +912,12 @@ int nilfs_cpfile_get_stat(struct inode *cpfile, struct nilfs_cpstat *cpstat)
ret = nilfs_cpfile_get_header_block(cpfile, &bh);
if (ret < 0)
goto out_sem;
- kaddr = kmap_atomic(bh->b_page, KM_USER0);
+ kaddr = kmap_atomic(bh->b_page);
header = nilfs_cpfile_block_get_header(cpfile, bh, kaddr);
cpstat->cs_cno = nilfs_mdt_cno(cpfile);
cpstat->cs_ncps = le64_to_cpu(header->ch_ncheckpoints);
cpstat->cs_nsss = le64_to_cpu(header->ch_nsnapshots);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
brelse(bh);
out_sem:
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index fcc2f869af1..b5c13f3576b 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -85,13 +85,13 @@ void nilfs_dat_commit_alloc(struct inode *dat, struct nilfs_palloc_req *req)
struct nilfs_dat_entry *entry;
void *kaddr;
- kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0);
+ kaddr = kmap_atomic(req->pr_entry_bh->b_page);
entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
req->pr_entry_bh, kaddr);
entry->de_start = cpu_to_le64(NILFS_CNO_MIN);
entry->de_end = cpu_to_le64(NILFS_CNO_MAX);
entry->de_blocknr = cpu_to_le64(0);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
nilfs_palloc_commit_alloc_entry(dat, req);
nilfs_dat_commit_entry(dat, req);
@@ -109,13 +109,13 @@ static void nilfs_dat_commit_free(struct inode *dat,
struct nilfs_dat_entry *entry;
void *kaddr;
- kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0);
+ kaddr = kmap_atomic(req->pr_entry_bh->b_page);
entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
req->pr_entry_bh, kaddr);
entry->de_start = cpu_to_le64(NILFS_CNO_MIN);
entry->de_end = cpu_to_le64(NILFS_CNO_MIN);
entry->de_blocknr = cpu_to_le64(0);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
nilfs_dat_commit_entry(dat, req);
nilfs_palloc_commit_free_entry(dat, req);
@@ -136,12 +136,12 @@ void nilfs_dat_commit_start(struct inode *dat, struct nilfs_palloc_req *req,
struct nilfs_dat_entry *entry;
void *kaddr;
- kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0);
+ kaddr = kmap_atomic(req->pr_entry_bh->b_page);
entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
req->pr_entry_bh, kaddr);
entry->de_start = cpu_to_le64(nilfs_mdt_cno(dat));
entry->de_blocknr = cpu_to_le64(blocknr);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
nilfs_dat_commit_entry(dat, req);
}
@@ -160,12 +160,12 @@ int nilfs_dat_prepare_end(struct inode *dat, struct nilfs_palloc_req *req)
return ret;
}
- kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0);
+ kaddr = kmap_atomic(req->pr_entry_bh->b_page);
entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
req->pr_entry_bh, kaddr);
start = le64_to_cpu(entry->de_start);
blocknr = le64_to_cpu(entry->de_blocknr);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
if (blocknr == 0) {
ret = nilfs_palloc_prepare_free_entry(dat, req);
@@ -186,7 +186,7 @@ void nilfs_dat_commit_end(struct inode *dat, struct nilfs_palloc_req *req,
sector_t blocknr;
void *kaddr;
- kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0);
+ kaddr = kmap_atomic(req->pr_entry_bh->b_page);
entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
req->pr_entry_bh, kaddr);
end = start = le64_to_cpu(entry->de_start);
@@ -196,7 +196,7 @@ void nilfs_dat_commit_end(struct inode *dat, struct nilfs_palloc_req *req,
}
entry->de_end = cpu_to_le64(end);
blocknr = le64_to_cpu(entry->de_blocknr);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
if (blocknr == 0)
nilfs_dat_commit_free(dat, req);
@@ -211,12 +211,12 @@ void nilfs_dat_abort_end(struct inode *dat, struct nilfs_palloc_req *req)
sector_t blocknr;
void *kaddr;
- kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0);
+ kaddr = kmap_atomic(req->pr_entry_bh->b_page);
entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
req->pr_entry_bh, kaddr);
start = le64_to_cpu(entry->de_start);
blocknr = le64_to_cpu(entry->de_blocknr);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
if (start == nilfs_mdt_cno(dat) && blocknr == 0)
nilfs_palloc_abort_free_entry(dat, req);
@@ -346,20 +346,20 @@ int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr)
}
}
- kaddr = kmap_atomic(entry_bh->b_page, KM_USER0);
+ kaddr = kmap_atomic(entry_bh->b_page);
entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr);
if (unlikely(entry->de_blocknr == cpu_to_le64(0))) {
printk(KERN_CRIT "%s: vbn = %llu, [%llu, %llu)\n", __func__,
(unsigned long long)vblocknr,
(unsigned long long)le64_to_cpu(entry->de_start),
(unsigned long long)le64_to_cpu(entry->de_end));
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
brelse(entry_bh);
return -EINVAL;
}
WARN_ON(blocknr == 0);
entry->de_blocknr = cpu_to_le64(blocknr);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
mark_buffer_dirty(entry_bh);
nilfs_mdt_mark_dirty(dat);
@@ -409,7 +409,7 @@ int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp)
}
}
- kaddr = kmap_atomic(entry_bh->b_page, KM_USER0);
+ kaddr = kmap_atomic(entry_bh->b_page);
entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr);
blocknr = le64_to_cpu(entry->de_blocknr);
if (blocknr == 0) {
@@ -419,7 +419,7 @@ int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp)
*blocknrp = blocknr;
out:
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
brelse(entry_bh);
return ret;
}
@@ -440,7 +440,7 @@ ssize_t nilfs_dat_get_vinfo(struct inode *dat, void *buf, unsigned visz,
0, &entry_bh);
if (ret < 0)
return ret;
- kaddr = kmap_atomic(entry_bh->b_page, KM_USER0);
+ kaddr = kmap_atomic(entry_bh->b_page);
/* last virtual block number in this block */
first = vinfo->vi_vblocknr;
do_div(first, entries_per_block);
@@ -456,7 +456,7 @@ ssize_t nilfs_dat_get_vinfo(struct inode *dat, void *buf, unsigned visz,
vinfo->vi_end = le64_to_cpu(entry->de_end);
vinfo->vi_blocknr = le64_to_cpu(entry->de_blocknr);
}
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
brelse(entry_bh);
}
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index ca35b3a46d1..df1a7fb238d 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -602,7 +602,7 @@ int nilfs_make_empty(struct inode *inode, struct inode *parent)
unlock_page(page);
goto fail;
}
- kaddr = kmap_atomic(page, KM_USER0);
+ kaddr = kmap_atomic(page);
memset(kaddr, 0, chunk_size);
de = (struct nilfs_dir_entry *)kaddr;
de->name_len = 1;
@@ -617,7 +617,7 @@ int nilfs_make_empty(struct inode *inode, struct inode *parent)
de->inode = cpu_to_le64(parent->i_ino);
memcpy(de->name, "..\0", 4);
nilfs_set_de_type(de, inode);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
nilfs_commit_chunk(page, mapping, 0, chunk_size);
fail:
page_cache_release(page);
diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c
index 684d76300a8..5a48df79d67 100644
--- a/fs/nilfs2/ifile.c
+++ b/fs/nilfs2/ifile.c
@@ -122,11 +122,11 @@ int nilfs_ifile_delete_inode(struct inode *ifile, ino_t ino)
return ret;
}
- kaddr = kmap_atomic(req.pr_entry_bh->b_page, KM_USER0);
+ kaddr = kmap_atomic(req.pr_entry_bh->b_page);
raw_inode = nilfs_palloc_block_get_entry(ifile, req.pr_entry_nr,
req.pr_entry_bh, kaddr);
raw_inode->i_flags = 0;
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
mark_buffer_dirty(req.pr_entry_bh);
brelse(req.pr_entry_bh);
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 886649627c3..2a70fce70c6 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -603,6 +603,8 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
nsegs = argv[4].v_nmembs;
if (argv[4].v_size != argsz[4])
goto out;
+ if (nsegs > UINT_MAX / sizeof(__u64))
+ goto out;
/*
* argv[4] points to segment numbers this ioctl cleans. We
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 800e8d78a83..f9897d09c69 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -58,12 +58,12 @@ nilfs_mdt_insert_new_block(struct inode *inode, unsigned long block,
set_buffer_mapped(bh);
- kaddr = kmap_atomic(bh->b_page, KM_USER0);
+ kaddr = kmap_atomic(bh->b_page);
memset(kaddr + bh_offset(bh), 0, 1 << inode->i_blkbits);
if (init_block)
init_block(inode, bh, kaddr);
flush_dcache_page(bh->b_page);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
set_buffer_uptodate(bh);
mark_buffer_dirty(bh);
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 1cd3f624dff..fce2bbee66d 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -193,9 +193,6 @@ static int nilfs_link(struct dentry *old_dentry, struct inode *dir,
struct nilfs_transaction_info ti;
int err;
- if (inode->i_nlink >= NILFS_LINK_MAX)
- return -EMLINK;
-
err = nilfs_transaction_begin(dir->i_sb, &ti, 1);
if (err)
return err;
@@ -219,9 +216,6 @@ static int nilfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
struct nilfs_transaction_info ti;
int err;
- if (dir->i_nlink >= NILFS_LINK_MAX)
- return -EMLINK;
-
err = nilfs_transaction_begin(dir->i_sb, &ti, 1);
if (err)
return err;
@@ -400,11 +394,6 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry,
drop_nlink(new_inode);
nilfs_mark_inode_dirty(new_inode);
} else {
- if (dir_de) {
- err = -EMLINK;
- if (new_dir->i_nlink >= NILFS_LINK_MAX)
- goto out_dir;
- }
err = nilfs_add_link(new_dentry, old_inode);
if (err)
goto out_dir;
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 65221a04c6f..3e7b2a0dc0c 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -119,11 +119,11 @@ void nilfs_copy_buffer(struct buffer_head *dbh, struct buffer_head *sbh)
struct page *spage = sbh->b_page, *dpage = dbh->b_page;
struct buffer_head *bh;
- kaddr0 = kmap_atomic(spage, KM_USER0);
- kaddr1 = kmap_atomic(dpage, KM_USER1);
+ kaddr0 = kmap_atomic(spage);
+ kaddr1 = kmap_atomic(dpage);
memcpy(kaddr1 + bh_offset(dbh), kaddr0 + bh_offset(sbh), sbh->b_size);
- kunmap_atomic(kaddr1, KM_USER1);
- kunmap_atomic(kaddr0, KM_USER0);
+ kunmap_atomic(kaddr1);
+ kunmap_atomic(kaddr0);
dbh->b_state = sbh->b_state & NILFS_BUFFER_INHERENT_BITS;
dbh->b_blocknr = sbh->b_blocknr;
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index a604ac0331b..f1626f5011c 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -493,9 +493,9 @@ static int nilfs_recovery_copy_block(struct the_nilfs *nilfs,
if (unlikely(!bh_org))
return -EIO;
- kaddr = kmap_atomic(page, KM_USER0);
+ kaddr = kmap_atomic(page);
memcpy(kaddr + bh_offset(bh_org), bh_org->b_data, bh_org->b_size);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
brelse(bh_org);
return 0;
}
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 850a7c0228f..dc9a913784a 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -227,9 +227,9 @@ static void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *segbuf,
crc = crc32_le(crc, bh->b_data, bh->b_size);
}
list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) {
- kaddr = kmap_atomic(bh->b_page, KM_USER0);
+ kaddr = kmap_atomic(bh->b_page);
crc = crc32_le(crc, kaddr + bh_offset(bh), bh->b_size);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
}
raw_sum->ss_datasum = cpu_to_le32(crc);
}
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index 0a0aba617d8..c5b7653a439 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -111,11 +111,11 @@ static void nilfs_sufile_mod_counter(struct buffer_head *header_bh,
struct nilfs_sufile_header *header;
void *kaddr;
- kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
+ kaddr = kmap_atomic(header_bh->b_page);
header = kaddr + bh_offset(header_bh);
le64_add_cpu(&header->sh_ncleansegs, ncleanadd);
le64_add_cpu(&header->sh_ndirtysegs, ndirtyadd);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
mark_buffer_dirty(header_bh);
}
@@ -319,11 +319,11 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
ret = nilfs_sufile_get_header_block(sufile, &header_bh);
if (ret < 0)
goto out_sem;
- kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
+ kaddr = kmap_atomic(header_bh->b_page);
header = kaddr + bh_offset(header_bh);
ncleansegs = le64_to_cpu(header->sh_ncleansegs);
last_alloc = le64_to_cpu(header->sh_last_alloc);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
nsegments = nilfs_sufile_get_nsegments(sufile);
maxsegnum = sui->allocmax;
@@ -356,7 +356,7 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
&su_bh);
if (ret < 0)
goto out_header;
- kaddr = kmap_atomic(su_bh->b_page, KM_USER0);
+ kaddr = kmap_atomic(su_bh->b_page);
su = nilfs_sufile_block_get_segment_usage(
sufile, segnum, su_bh, kaddr);
@@ -367,14 +367,14 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
continue;
/* found a clean segment */
nilfs_segment_usage_set_dirty(su);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
- kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
+ kaddr = kmap_atomic(header_bh->b_page);
header = kaddr + bh_offset(header_bh);
le64_add_cpu(&header->sh_ncleansegs, -1);
le64_add_cpu(&header->sh_ndirtysegs, 1);
header->sh_last_alloc = cpu_to_le64(segnum);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
sui->ncleansegs--;
mark_buffer_dirty(header_bh);
@@ -385,7 +385,7 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
goto out_header;
}
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
brelse(su_bh);
}
@@ -407,16 +407,16 @@ void nilfs_sufile_do_cancel_free(struct inode *sufile, __u64 segnum,
struct nilfs_segment_usage *su;
void *kaddr;
- kaddr = kmap_atomic(su_bh->b_page, KM_USER0);
+ kaddr = kmap_atomic(su_bh->b_page);
su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr);
if (unlikely(!nilfs_segment_usage_clean(su))) {
printk(KERN_WARNING "%s: segment %llu must be clean\n",
__func__, (unsigned long long)segnum);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
return;
}
nilfs_segment_usage_set_dirty(su);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
nilfs_sufile_mod_counter(header_bh, -1, 1);
NILFS_SUI(sufile)->ncleansegs--;
@@ -433,11 +433,11 @@ void nilfs_sufile_do_scrap(struct inode *sufile, __u64 segnum,
void *kaddr;
int clean, dirty;
- kaddr = kmap_atomic(su_bh->b_page, KM_USER0);
+ kaddr = kmap_atomic(su_bh->b_page);
su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr);
if (su->su_flags == cpu_to_le32(1UL << NILFS_SEGMENT_USAGE_DIRTY) &&
su->su_nblocks == cpu_to_le32(0)) {
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
return;
}
clean = nilfs_segment_usage_clean(su);
@@ -447,7 +447,7 @@ void nilfs_sufile_do_scrap(struct inode *sufile, __u64 segnum,
su->su_lastmod = cpu_to_le64(0);
su->su_nblocks = cpu_to_le32(0);
su->su_flags = cpu_to_le32(1UL << NILFS_SEGMENT_USAGE_DIRTY);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
nilfs_sufile_mod_counter(header_bh, clean ? (u64)-1 : 0, dirty ? 0 : 1);
NILFS_SUI(sufile)->ncleansegs -= clean;
@@ -464,12 +464,12 @@ void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum,
void *kaddr;
int sudirty;
- kaddr = kmap_atomic(su_bh->b_page, KM_USER0);
+ kaddr = kmap_atomic(su_bh->b_page);
su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr);
if (nilfs_segment_usage_clean(su)) {
printk(KERN_WARNING "%s: segment %llu is already clean\n",
__func__, (unsigned long long)segnum);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
return;
}
WARN_ON(nilfs_segment_usage_error(su));
@@ -477,7 +477,7 @@ void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum,
sudirty = nilfs_segment_usage_dirty(su);
nilfs_segment_usage_set_clean(su);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
mark_buffer_dirty(su_bh);
nilfs_sufile_mod_counter(header_bh, 1, sudirty ? (u64)-1 : 0);
@@ -525,13 +525,13 @@ int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum,
if (ret < 0)
goto out_sem;
- kaddr = kmap_atomic(bh->b_page, KM_USER0);
+ kaddr = kmap_atomic(bh->b_page);
su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr);
WARN_ON(nilfs_segment_usage_error(su));
if (modtime)
su->su_lastmod = cpu_to_le64(modtime);
su->su_nblocks = cpu_to_le32(nblocks);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
mark_buffer_dirty(bh);
nilfs_mdt_mark_dirty(sufile);
@@ -572,7 +572,7 @@ int nilfs_sufile_get_stat(struct inode *sufile, struct nilfs_sustat *sustat)
if (ret < 0)
goto out_sem;
- kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
+ kaddr = kmap_atomic(header_bh->b_page);
header = kaddr + bh_offset(header_bh);
sustat->ss_nsegs = nilfs_sufile_get_nsegments(sufile);
sustat->ss_ncleansegs = le64_to_cpu(header->sh_ncleansegs);
@@ -582,7 +582,7 @@ int nilfs_sufile_get_stat(struct inode *sufile, struct nilfs_sustat *sustat)
spin_lock(&nilfs->ns_last_segment_lock);
sustat->ss_prot_seq = nilfs->ns_prot_seq;
spin_unlock(&nilfs->ns_last_segment_lock);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
brelse(header_bh);
out_sem:
@@ -598,15 +598,15 @@ void nilfs_sufile_do_set_error(struct inode *sufile, __u64 segnum,
void *kaddr;
int suclean;
- kaddr = kmap_atomic(su_bh->b_page, KM_USER0);
+ kaddr = kmap_atomic(su_bh->b_page);
su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr);
if (nilfs_segment_usage_error(su)) {
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
return;
}
suclean = nilfs_segment_usage_clean(su);
nilfs_segment_usage_set_error(su);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
if (suclean) {
nilfs_sufile_mod_counter(header_bh, -1, 0);
@@ -675,7 +675,7 @@ static int nilfs_sufile_truncate_range(struct inode *sufile,
/* hole */
continue;
}
- kaddr = kmap_atomic(su_bh->b_page, KM_USER0);
+ kaddr = kmap_atomic(su_bh->b_page);
su = nilfs_sufile_block_get_segment_usage(
sufile, segnum, su_bh, kaddr);
su2 = su;
@@ -684,7 +684,7 @@ static int nilfs_sufile_truncate_range(struct inode *sufile,
~(1UL << NILFS_SEGMENT_USAGE_ERROR)) ||
nilfs_segment_is_active(nilfs, segnum + j)) {
ret = -EBUSY;
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
brelse(su_bh);
goto out_header;
}
@@ -696,7 +696,7 @@ static int nilfs_sufile_truncate_range(struct inode *sufile,
nc++;
}
}
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
if (nc > 0) {
mark_buffer_dirty(su_bh);
ncleaned += nc;
@@ -772,10 +772,10 @@ int nilfs_sufile_resize(struct inode *sufile, __u64 newnsegs)
sui->ncleansegs -= nsegs - newnsegs;
}
- kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
+ kaddr = kmap_atomic(header_bh->b_page);
header = kaddr + bh_offset(header_bh);
header->sh_ncleansegs = cpu_to_le64(sui->ncleansegs);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
mark_buffer_dirty(header_bh);
nilfs_mdt_mark_dirty(sufile);
@@ -840,7 +840,7 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf,
continue;
}
- kaddr = kmap_atomic(su_bh->b_page, KM_USER0);
+ kaddr = kmap_atomic(su_bh->b_page);
su = nilfs_sufile_block_get_segment_usage(
sufile, segnum, su_bh, kaddr);
for (j = 0; j < n;
@@ -853,7 +853,7 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf,
si->sui_flags |=
(1UL << NILFS_SEGMENT_USAGE_ACTIVE);
}
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
brelse(su_bh);
}
ret = nsegs;
@@ -902,10 +902,10 @@ int nilfs_sufile_read(struct super_block *sb, size_t susize,
goto failed;
sui = NILFS_SUI(sufile);
- kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
+ kaddr = kmap_atomic(header_bh->b_page);
header = kaddr + bh_offset(header_bh);
sui->ncleansegs = le64_to_cpu(header->sh_ncleansegs);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
brelse(header_bh);
sui->allocmax = nilfs_sufile_get_nsegments(sufile) - 1;
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 08e3d4f9df1..1099a76cee5 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -917,9 +917,8 @@ static int nilfs_get_root_dentry(struct super_block *sb,
if (root->cno == NILFS_CPTREE_CURRENT_CNO) {
dentry = d_find_alias(inode);
if (!dentry) {
- dentry = d_alloc_root(inode);
+ dentry = d_make_root(inode);
if (!dentry) {
- iput(inode);
ret = -ENOMEM;
goto failed_dentry;
}
@@ -1059,6 +1058,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent)
sb->s_export_op = &nilfs_export_ops;
sb->s_root = NULL;
sb->s_time_gran = 1;
+ sb->s_max_links = NILFS_LINK_MAX;
bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
sb->s_bdi = bdi ? : &default_backing_dev_info;
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index d3271409437..501b7f8b739 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -409,6 +409,12 @@ static int nilfs_store_disk_layout(struct the_nilfs *nilfs,
nilfs->ns_first_data_block = le64_to_cpu(sbp->s_first_data_block);
nilfs->ns_r_segments_percentage =
le32_to_cpu(sbp->s_r_segments_percentage);
+ if (nilfs->ns_r_segments_percentage < 1 ||
+ nilfs->ns_r_segments_percentage > 99) {
+ printk(KERN_ERR "NILFS: invalid reserved segments percentage.\n");
+ return -EINVAL;
+ }
+
nilfs_set_nsegments(nilfs, le64_to_cpu(sbp->s_nsegments));
nilfs->ns_crc_seed = le32_to_cpu(sbp->s_crc_seed);
return 0;
@@ -515,6 +521,7 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
brelse(sbh[1]);
sbh[1] = NULL;
sbp[1] = NULL;
+ valid[1] = 0;
swp = 0;
}
if (!valid[swp]) {
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index e14587d5568..f104d565b68 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -135,9 +135,6 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark)
mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
- /* 1 from caller and 1 for being on i_list/g_list */
- BUG_ON(atomic_read(&mark->refcnt) < 2);
-
spin_lock(&group->mark_lock);
if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) {
@@ -182,6 +179,11 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark)
iput(inode);
/*
+ * We don't necessarily have a ref on mark from caller so the above iput
+ * may have already destroyed it. Don't touch from now on.
+ */
+
+ /*
* it's possible that this group tried to destroy itself, but this
* this mark was simultaneously being freed by inode. If that's the
* case, we finish freeing the group here.
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index ee188158a22..c887b1378f7 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -447,7 +447,7 @@ struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask,
return event;
}
-__init int fsnotify_notification_init(void)
+static __init int fsnotify_notification_init(void)
{
fsnotify_event_cachep = KMEM_CACHE(fsnotify_event, SLAB_PANIC);
fsnotify_event_holder_cachep = KMEM_CACHE(fsnotify_event_holder, SLAB_PANIC);
@@ -461,4 +461,3 @@ __init int fsnotify_notification_init(void)
return 0;
}
subsys_initcall(fsnotify_notification_init);
-
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index 0b1e885b8cf..fa9c05f97af 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -94,11 +94,11 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
if (file_ofs < init_size)
ofs = init_size - file_ofs;
local_irq_save(flags);
- kaddr = kmap_atomic(page, KM_BIO_SRC_IRQ);
+ kaddr = kmap_atomic(page);
memset(kaddr + bh_offset(bh) + ofs, 0,
bh->b_size - ofs);
flush_dcache_page(page);
- kunmap_atomic(kaddr, KM_BIO_SRC_IRQ);
+ kunmap_atomic(kaddr);
local_irq_restore(flags);
}
} else {
@@ -147,11 +147,11 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
/* Should have been verified before we got here... */
BUG_ON(!recs);
local_irq_save(flags);
- kaddr = kmap_atomic(page, KM_BIO_SRC_IRQ);
+ kaddr = kmap_atomic(page);
for (i = 0; i < recs; i++)
post_read_mst_fixup((NTFS_RECORD*)(kaddr +
i * rec_size), rec_size);
- kunmap_atomic(kaddr, KM_BIO_SRC_IRQ);
+ kunmap_atomic(kaddr);
local_irq_restore(flags);
flush_dcache_page(page);
if (likely(page_uptodate && !PageError(page)))
@@ -504,7 +504,7 @@ retry_readpage:
/* Race with shrinking truncate. */
attr_len = i_size;
}
- addr = kmap_atomic(page, KM_USER0);
+ addr = kmap_atomic(page);
/* Copy the data to the page. */
memcpy(addr, (u8*)ctx->attr +
le16_to_cpu(ctx->attr->data.resident.value_offset),
@@ -512,7 +512,7 @@ retry_readpage:
/* Zero the remainder of the page. */
memset(addr + attr_len, 0, PAGE_CACHE_SIZE - attr_len);
flush_dcache_page(page);
- kunmap_atomic(addr, KM_USER0);
+ kunmap_atomic(addr);
put_unm_err_out:
ntfs_attr_put_search_ctx(ctx);
unm_err_out:
@@ -746,14 +746,14 @@ lock_retry_remap:
unsigned long *bpos, *bend;
/* Check if the buffer is zero. */
- kaddr = kmap_atomic(page, KM_USER0);
+ kaddr = kmap_atomic(page);
bpos = (unsigned long *)(kaddr + bh_offset(bh));
bend = (unsigned long *)((u8*)bpos + blocksize);
do {
if (unlikely(*bpos))
break;
} while (likely(++bpos < bend));
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
if (bpos == bend) {
/*
* Buffer is zero and sparse, no need to write
@@ -1495,14 +1495,14 @@ retry_writepage:
/* Shrinking cannot fail. */
BUG_ON(err);
}
- addr = kmap_atomic(page, KM_USER0);
+ addr = kmap_atomic(page);
/* Copy the data from the page to the mft record. */
memcpy((u8*)ctx->attr +
le16_to_cpu(ctx->attr->data.resident.value_offset),
addr, attr_len);
/* Zero out of bounds area in the page cache page. */
memset(addr + attr_len, 0, PAGE_CACHE_SIZE - attr_len);
- kunmap_atomic(addr, KM_USER0);
+ kunmap_atomic(addr);
flush_dcache_page(page);
flush_dcache_mft_record_page(ctx->ntfs_ino);
/* We are done with the page. */
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
index f14fde2b03d..a27e3fecefa 100644
--- a/fs/ntfs/attrib.c
+++ b/fs/ntfs/attrib.c
@@ -1,7 +1,7 @@
/**
* attrib.c - NTFS attribute operations. Part of the Linux-NTFS project.
*
- * Copyright (c) 2001-2007 Anton Altaparmakov
+ * Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc.
* Copyright (c) 2002 Richard Russon
*
* This program/include file is free software; you can redistribute it and/or
@@ -345,10 +345,10 @@ LCN ntfs_attr_vcn_to_lcn_nolock(ntfs_inode *ni, const VCN vcn,
unsigned long flags;
bool is_retry = false;
+ BUG_ON(!ni);
ntfs_debug("Entering for i_ino 0x%lx, vcn 0x%llx, %s_locked.",
ni->mft_no, (unsigned long long)vcn,
write_locked ? "write" : "read");
- BUG_ON(!ni);
BUG_ON(!NInoNonResident(ni));
BUG_ON(vcn < 0);
if (!ni->runlist.rl) {
@@ -469,9 +469,9 @@ runlist_element *ntfs_attr_find_vcn_nolock(ntfs_inode *ni, const VCN vcn,
int err = 0;
bool is_retry = false;
+ BUG_ON(!ni);
ntfs_debug("Entering for i_ino 0x%lx, vcn 0x%llx, with%s ctx.",
ni->mft_no, (unsigned long long)vcn, ctx ? "" : "out");
- BUG_ON(!ni);
BUG_ON(!NInoNonResident(ni));
BUG_ON(vcn < 0);
if (!ni->runlist.rl) {
@@ -1656,12 +1656,12 @@ int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size)
attr_size = le32_to_cpu(a->data.resident.value_length);
BUG_ON(attr_size != data_size);
if (page && !PageUptodate(page)) {
- kaddr = kmap_atomic(page, KM_USER0);
+ kaddr = kmap_atomic(page);
memcpy(kaddr, (u8*)a +
le16_to_cpu(a->data.resident.value_offset),
attr_size);
memset(kaddr + attr_size, 0, PAGE_CACHE_SIZE - attr_size);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
flush_dcache_page(page);
SetPageUptodate(page);
}
@@ -1806,9 +1806,9 @@ undo_err_out:
sizeof(a->data.resident.reserved));
/* Copy the data from the page back to the attribute value. */
if (page) {
- kaddr = kmap_atomic(page, KM_USER0);
+ kaddr = kmap_atomic(page);
memcpy((u8*)a + mp_ofs, kaddr, attr_size);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
}
/* Setup the allocated size in the ntfs inode in case it changed. */
write_lock_irqsave(&ni->size_lock, flags);
@@ -2540,10 +2540,10 @@ int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val)
size = PAGE_CACHE_SIZE;
if (idx == end)
size = end_ofs;
- kaddr = kmap_atomic(page, KM_USER0);
+ kaddr = kmap_atomic(page);
memset(kaddr + start_ofs, val, size - start_ofs);
flush_dcache_page(page);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
set_page_dirty(page);
page_cache_release(page);
balance_dirty_pages_ratelimited(mapping);
@@ -2561,10 +2561,10 @@ int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val)
"page (index 0x%lx).", idx);
return -ENOMEM;
}
- kaddr = kmap_atomic(page, KM_USER0);
+ kaddr = kmap_atomic(page);
memset(kaddr, val, PAGE_CACHE_SIZE);
flush_dcache_page(page);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
/*
* If the page has buffers, mark them uptodate since buffer
* state and not page state is definitive in 2.6 kernels.
@@ -2598,10 +2598,10 @@ int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val)
"(error, index 0x%lx).", idx);
return PTR_ERR(page);
}
- kaddr = kmap_atomic(page, KM_USER0);
+ kaddr = kmap_atomic(page);
memset(kaddr, val, end_ofs);
flush_dcache_page(page);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
set_page_dirty(page);
page_cache_release(page);
balance_dirty_pages_ratelimited(mapping);
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index c587e2d2718..8639169221c 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -704,7 +704,7 @@ map_buffer_cached:
u8 *kaddr;
unsigned pofs;
- kaddr = kmap_atomic(page, KM_USER0);
+ kaddr = kmap_atomic(page);
if (bh_pos < pos) {
pofs = bh_pos & ~PAGE_CACHE_MASK;
memset(kaddr + pofs, 0, pos - bh_pos);
@@ -713,7 +713,7 @@ map_buffer_cached:
pofs = end & ~PAGE_CACHE_MASK;
memset(kaddr + pofs, 0, bh_end - end);
}
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
flush_dcache_page(page);
}
continue;
@@ -1287,9 +1287,9 @@ static inline size_t ntfs_copy_from_user(struct page **pages,
len = PAGE_CACHE_SIZE - ofs;
if (len > bytes)
len = bytes;
- addr = kmap_atomic(*pages, KM_USER0);
+ addr = kmap_atomic(*pages);
left = __copy_from_user_inatomic(addr + ofs, buf, len);
- kunmap_atomic(addr, KM_USER0);
+ kunmap_atomic(addr);
if (unlikely(left)) {
/* Do it the slow way. */
addr = kmap(*pages);
@@ -1401,10 +1401,10 @@ static inline size_t ntfs_copy_from_user_iovec(struct page **pages,
len = PAGE_CACHE_SIZE - ofs;
if (len > bytes)
len = bytes;
- addr = kmap_atomic(*pages, KM_USER0);
+ addr = kmap_atomic(*pages);
copied = __ntfs_copy_from_user_iovec_inatomic(addr + ofs,
*iov, *iov_ofs, len);
- kunmap_atomic(addr, KM_USER0);
+ kunmap_atomic(addr);
if (unlikely(copied != len)) {
/* Do it the slow way. */
addr = kmap(*pages);
@@ -1691,7 +1691,7 @@ static int ntfs_commit_pages_after_write(struct page **pages,
BUG_ON(end > le32_to_cpu(a->length) -
le16_to_cpu(a->data.resident.value_offset));
kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset);
- kaddr = kmap_atomic(page, KM_USER0);
+ kaddr = kmap_atomic(page);
/* Copy the received data from the page to the mft record. */
memcpy(kattr + pos, kaddr + pos, bytes);
/* Update the attribute length if necessary. */
@@ -1713,7 +1713,7 @@ static int ntfs_commit_pages_after_write(struct page **pages,
flush_dcache_page(page);
SetPageUptodate(page);
}
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
/* Update initialized_size/i_size if necessary. */
read_lock_irqsave(&ni->size_lock, flags);
initialized_size = ni->initialized_size;
diff --git a/fs/ntfs/layout.h b/fs/ntfs/layout.h
index faece719086..809c0e6d8e0 100644
--- a/fs/ntfs/layout.h
+++ b/fs/ntfs/layout.h
@@ -2008,14 +2008,14 @@ typedef struct {
*
* When a directory is small enough to fit inside the index root then this
* is the only attribute describing the directory. When the directory is too
- * large to fit in the index root, on the other hand, two aditional attributes
+ * large to fit in the index root, on the other hand, two additional attributes
* are present: an index allocation attribute, containing sub-nodes of the B+
* directory tree (see below), and a bitmap attribute, describing which virtual
* cluster numbers (vcns) in the index allocation attribute are in use by an
* index block.
*
* NOTE: The root directory (FILE_root) contains an entry for itself. Other
- * dircetories do not contain entries for themselves, though.
+ * directories do not contain entries for themselves, though.
*/
typedef struct {
ATTR_TYPE type; /* Type of the indexed attribute. Is
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index 382857f9c7d..3014a36a255 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -1,7 +1,7 @@
/**
* mft.c - NTFS kernel mft record operations. Part of the Linux-NTFS project.
*
- * Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc.
+ * Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc.
* Copyright (c) 2002 Richard Russon
*
* This program/include file is free software; you can redistribute it and/or
@@ -1367,7 +1367,7 @@ static int ntfs_mft_bitmap_extend_allocation_nolock(ntfs_volume *vol)
ntfs_error(vol->sb, "Failed to merge runlists for mft "
"bitmap.");
if (ntfs_cluster_free_from_rl(vol, rl2)) {
- ntfs_error(vol->sb, "Failed to dealocate "
+ ntfs_error(vol->sb, "Failed to deallocate "
"allocated cluster.%s", es);
NVolSetErrors(vol);
}
@@ -1805,7 +1805,7 @@ static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol)
ntfs_error(vol->sb, "Failed to merge runlists for mft data "
"attribute.");
if (ntfs_cluster_free_from_rl(vol, rl2)) {
- ntfs_error(vol->sb, "Failed to dealocate clusters "
+ ntfs_error(vol->sb, "Failed to deallocate clusters "
"from the mft data attribute.%s", es);
NVolSetErrors(vol);
}
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 608be451609..b341492542c 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -1,7 +1,7 @@
/*
* super.c - NTFS kernel super block handling. Part of the Linux-NTFS project.
*
- * Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc.
+ * Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc.
* Copyright (c) 2001,2002 Richard Russon
*
* This program/include file is free software; you can redistribute it and/or
@@ -1239,7 +1239,6 @@ static int check_windows_hibernation_status(ntfs_volume *vol)
{
MFT_REF mref;
struct inode *vi;
- ntfs_inode *ni;
struct page *page;
u32 *kaddr, *kend;
ntfs_name *name = NULL;
@@ -1290,7 +1289,6 @@ static int check_windows_hibernation_status(ntfs_volume *vol)
"is not the system volume.", i_size_read(vi));
goto iput_out;
}
- ni = NTFS_I(vi);
page = ntfs_map_page(vi->i_mapping, 0);
if (IS_ERR(page)) {
ntfs_error(vol->sb, "Failed to read from hiberfil.sys.");
@@ -2475,7 +2473,7 @@ static s64 get_nr_free_clusters(ntfs_volume *vol)
nr_free -= PAGE_CACHE_SIZE * 8;
continue;
}
- kaddr = kmap_atomic(page, KM_USER0);
+ kaddr = kmap_atomic(page);
/*
* Subtract the number of set bits. If this
* is the last page and it is partial we don't really care as
@@ -2485,7 +2483,7 @@ static s64 get_nr_free_clusters(ntfs_volume *vol)
*/
nr_free -= bitmap_weight(kaddr,
PAGE_CACHE_SIZE * BITS_PER_BYTE);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
page_cache_release(page);
}
ntfs_debug("Finished reading $Bitmap, last index = 0x%lx.", index - 1);
@@ -2546,7 +2544,7 @@ static unsigned long __get_nr_free_mft_records(ntfs_volume *vol,
nr_free -= PAGE_CACHE_SIZE * 8;
continue;
}
- kaddr = kmap_atomic(page, KM_USER0);
+ kaddr = kmap_atomic(page);
/*
* Subtract the number of set bits. If this
* is the last page and it is partial we don't really care as
@@ -2556,7 +2554,7 @@ static unsigned long __get_nr_free_mft_records(ntfs_volume *vol,
*/
nr_free -= bitmap_weight(kaddr,
PAGE_CACHE_SIZE * BITS_PER_BYTE);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
page_cache_release(page);
}
ntfs_debug("Finished reading $MFT/$BITMAP, last index = 0x%lx.",
@@ -2910,9 +2908,10 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
ntfs_error(sb, "Failed to load system files.");
goto unl_upcase_iput_tmp_ino_err_out_now;
}
- if ((sb->s_root = d_alloc_root(vol->root_ino))) {
- /* We grab a reference, simulating an ntfs_iget(). */
- ihold(vol->root_ino);
+
+ /* We grab a reference, simulating an ntfs_iget(). */
+ ihold(vol->root_ino);
+ if ((sb->s_root = d_make_root(vol->root_ino))) {
ntfs_debug("Exiting, status successful.");
/* Release the default upcase if it has no users. */
mutex_lock(&ntfs_lock);
@@ -3160,6 +3159,8 @@ static int __init init_ntfs_fs(void)
}
printk(KERN_CRIT "NTFS: Failed to register NTFS filesystem driver!\n");
+ /* Unregister the ntfs sysctls. */
+ ntfs_sysctl(0);
sysctl_err_out:
kmem_cache_destroy(ntfs_big_inode_cache);
big_inode_err_out:
@@ -3198,7 +3199,7 @@ MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2011 Anton Altaparm
MODULE_VERSION(NTFS_VERSION);
MODULE_LICENSE("GPL");
#ifdef DEBUG
-module_param(debug_msgs, bool, 0);
+module_param(debug_msgs, bint, 0);
MODULE_PARM_DESC(debug_msgs, "Enable debug messages.");
#endif
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 78b68af3b0e..657743254eb 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -102,7 +102,7 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
* copy, the data is still good. */
if (buffer_jbd(buffer_cache_bh)
&& ocfs2_inode_is_new(inode)) {
- kaddr = kmap_atomic(bh_result->b_page, KM_USER0);
+ kaddr = kmap_atomic(bh_result->b_page);
if (!kaddr) {
mlog(ML_ERROR, "couldn't kmap!\n");
goto bail;
@@ -110,7 +110,7 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
memcpy(kaddr + (bh_result->b_size * iblock),
buffer_cache_bh->b_data,
bh_result->b_size);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
set_buffer_uptodate(bh_result);
}
brelse(buffer_cache_bh);
@@ -236,13 +236,13 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page,
return -EROFS;
}
- kaddr = kmap_atomic(page, KM_USER0);
+ kaddr = kmap_atomic(page);
if (size)
memcpy(kaddr, di->id2.i_data.id_data, size);
/* Clear the remaining part of the page */
memset(kaddr + size, 0, PAGE_CACHE_SIZE - size);
flush_dcache_page(page);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
SetPageUptodate(page);
@@ -689,7 +689,7 @@ static void ocfs2_clear_page_regions(struct page *page,
ocfs2_figure_cluster_boundaries(osb, cpos, &cluster_start, &cluster_end);
- kaddr = kmap_atomic(page, KM_USER0);
+ kaddr = kmap_atomic(page);
if (from || to) {
if (from > cluster_start)
@@ -700,7 +700,7 @@ static void ocfs2_clear_page_regions(struct page *page,
memset(kaddr + cluster_start, 0, cluster_end - cluster_start);
}
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
}
/*
@@ -1981,9 +1981,9 @@ static void ocfs2_write_end_inline(struct inode *inode, loff_t pos,
}
}
- kaddr = kmap_atomic(wc->w_target_page, KM_USER0);
+ kaddr = kmap_atomic(wc->w_target_page);
memcpy(di->id2.i_data.id_data + pos, kaddr + pos, *copied);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
trace_ocfs2_write_end_inline(
(unsigned long long)OCFS2_I(inode)->ip_blkno,
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index abfac0d7ae9..3b5825ef319 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -582,24 +582,14 @@ static int dlmfs_fill_super(struct super_block * sb,
void * data,
int silent)
{
- struct inode * inode;
- struct dentry * root;
-
sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_blocksize = PAGE_CACHE_SIZE;
sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
sb->s_magic = DLMFS_MAGIC;
sb->s_op = &dlmfs_ops;
- inode = dlmfs_get_root_inode(sb);
- if (!inode)
- return -ENOMEM;
-
- root = d_alloc_root(inode);
- if (!root) {
- iput(inode);
+ sb->s_root = d_make_root(dlmfs_get_root_inode(sb));
+ if (!sb->s_root)
return -ENOMEM;
- }
- sb->s_root = root;
return 0;
}
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index a6fda3c188a..a1a1bfd652c 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -28,8 +28,6 @@
#include "suballoc.h"
#include "move_extents.h"
-#include <linux/ext2_fs.h>
-
#define o2info_from_user(a, b) \
copy_from_user(&(a), (b), sizeof(a))
#define o2info_to_user(a, b) \
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index be244692550..a9856e3eaaf 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -1053,7 +1053,7 @@ static int ocfs2_rename(struct inode *old_dir,
handle_t *handle = NULL;
struct buffer_head *old_dir_bh = NULL;
struct buffer_head *new_dir_bh = NULL;
- nlink_t old_dir_nlink = old_dir->i_nlink;
+ u32 old_dir_nlink = old_dir->i_nlink;
struct ocfs2_dinode *old_di;
struct ocfs2_dir_lookup_result old_inode_dot_dot_res = { NULL, };
struct ocfs2_dir_lookup_result target_lookup_res = { NULL, };
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 604e12c4e97..68f4541c2db 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1154,19 +1154,19 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
}
status = ocfs2_mount_volume(sb);
- if (osb->root_inode)
- inode = igrab(osb->root_inode);
-
if (status < 0)
goto read_super_error;
+ if (osb->root_inode)
+ inode = igrab(osb->root_inode);
+
if (!inode) {
status = -EIO;
mlog_errno(status);
goto read_super_error;
}
- root = d_alloc_root(inode);
+ root = d_make_root(inode);
if (!root) {
status = -ENOMEM;
mlog_errno(status);
@@ -1220,9 +1220,6 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
read_super_error:
brelse(bh);
- if (inode)
- iput(inode);
-
if (osb) {
atomic_set(&osb->vol_state, VOLUME_DISABLED);
wake_up(&osb->osb_mount_event);
@@ -1627,21 +1624,17 @@ static int __init ocfs2_init(void)
init_waitqueue_head(&ocfs2__ioend_wq[i]);
status = init_ocfs2_uptodate_cache();
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
+ if (status < 0)
+ goto out1;
status = ocfs2_initialize_mem_caches();
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
+ if (status < 0)
+ goto out2;
ocfs2_wq = create_singlethread_workqueue("ocfs2_wq");
if (!ocfs2_wq) {
status = -ENOMEM;
- goto leave;
+ goto out3;
}
ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL);
@@ -1653,17 +1646,23 @@ static int __init ocfs2_init(void)
ocfs2_set_locking_protocol();
status = register_quota_format(&ocfs2_quota_format);
-leave:
- if (status < 0) {
- ocfs2_free_mem_caches();
- exit_ocfs2_uptodate_cache();
- mlog_errno(status);
- }
+ if (status < 0)
+ goto out4;
+ status = register_filesystem(&ocfs2_fs_type);
+ if (!status)
+ return 0;
- if (status >= 0) {
- return register_filesystem(&ocfs2_fs_type);
- } else
- return -1;
+ unregister_quota_format(&ocfs2_quota_format);
+out4:
+ destroy_workqueue(ocfs2_wq);
+ debugfs_remove(ocfs2_debugfs_root);
+out3:
+ ocfs2_free_mem_caches();
+out2:
+ exit_ocfs2_uptodate_cache();
+out1:
+ mlog_errno(status);
+ return status;
}
static void __exit ocfs2_exit(void)
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index 6065bb0ba20..dbc84222258 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -539,11 +539,9 @@ static int omfs_fill_super(struct super_block *sb, void *data, int silent)
goto out_brelse_bh2;
}
- sb->s_root = d_alloc_root(root);
- if (!sb->s_root) {
- iput(root);
+ sb->s_root = d_make_root(root);
+ if (!sb->s_root)
goto out_brelse_bh2;
- }
printk(KERN_DEBUG "omfs: Mounted volume %s\n", omfs_rb->r_name);
ret = 0;
diff --git a/fs/open.c b/fs/open.c
index 77becc04114..5720854156d 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -836,7 +836,7 @@ EXPORT_SYMBOL(dentry_open);
static void __put_unused_fd(struct files_struct *files, unsigned int fd)
{
struct fdtable *fdt = files_fdtable(files);
- __FD_CLR(fd, fdt->open_fds);
+ __clear_open_fd(fd, fdt);
if (fd < files->next_fd)
files->next_fd = fd;
}
@@ -1080,7 +1080,7 @@ SYSCALL_DEFINE1(close, unsigned int, fd)
if (!filp)
goto out_unlock;
rcu_assign_pointer(fdt->fd[fd], NULL);
- FD_CLR(fd, fdt->close_on_exec);
+ __clear_close_on_exec(fd, fdt);
__put_unused_fd(files, fd);
spin_unlock(&files->file_lock);
retval = filp_close(filp, files);
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index a88c03bc749..bc49c975d50 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -408,13 +408,12 @@ static int openprom_fill_super(struct super_block *s, void *data, int silent)
oi->type = op_inode_node;
oi->u.node = of_find_node_by_path("/");
- s->s_root = d_alloc_root(root_inode);
+ s->s_root = d_make_root(root_inode);
if (!s->s_root)
goto out_no_root_dentry;
return 0;
out_no_root_dentry:
- iput(root_inode);
ret = -ENOMEM;
out_no_root:
printk("openprom_fill_super: get root inode failed\n");
diff --git a/fs/pipe.c b/fs/pipe.c
index f0e485d54e6..25feaa3faac 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -13,6 +13,7 @@
#include <linux/fs.h>
#include <linux/log2.h>
#include <linux/mount.h>
+#include <linux/magic.h>
#include <linux/pipe_fs_i.h>
#include <linux/uio.h>
#include <linux/highmem.h>
@@ -230,7 +231,7 @@ void *generic_pipe_buf_map(struct pipe_inode_info *pipe,
{
if (atomic) {
buf->flags |= PIPE_BUF_FLAG_ATOMIC;
- return kmap_atomic(buf->page, KM_USER0);
+ return kmap_atomic(buf->page);
}
return kmap(buf->page);
@@ -251,7 +252,7 @@ void generic_pipe_buf_unmap(struct pipe_inode_info *pipe,
{
if (buf->flags & PIPE_BUF_FLAG_ATOMIC) {
buf->flags &= ~PIPE_BUF_FLAG_ATOMIC;
- kunmap_atomic(map_data, KM_USER0);
+ kunmap_atomic(map_data);
} else
kunmap(buf->page);
}
@@ -565,14 +566,14 @@ redo1:
iov_fault_in_pages_read(iov, chars);
redo2:
if (atomic)
- src = kmap_atomic(page, KM_USER0);
+ src = kmap_atomic(page);
else
src = kmap(page);
error = pipe_iov_copy_from_user(src, iov, chars,
atomic);
if (atomic)
- kunmap_atomic(src, KM_USER0);
+ kunmap_atomic(src);
else
kunmap(page);
@@ -1137,7 +1138,7 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages)
if (nr_pages < pipe->nrbufs)
return -EBUSY;
- bufs = kcalloc(nr_pages, sizeof(struct pipe_buffer), GFP_KERNEL);
+ bufs = kcalloc(nr_pages, sizeof(*bufs), GFP_KERNEL | __GFP_NOWARN);
if (unlikely(!bufs))
return -ENOMEM;
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index cea4623f1ed..5e325a42e33 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -18,7 +18,7 @@
#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/posix_acl.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/errno.h>
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 8c344f037bd..f9bd395b347 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -380,7 +380,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
state = *get_task_state(task);
vsize = eip = esp = 0;
- permitted = ptrace_may_access(task, PTRACE_MODE_READ);
+ permitted = ptrace_may_access(task, PTRACE_MODE_READ | PTRACE_MODE_NOAUDIT);
mm = get_task_mm(task);
if (mm) {
vsize = task_vsize(mm);
@@ -462,56 +462,56 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
/* convert nsec -> ticks */
start_time = nsec_to_clock_t(start_time);
- seq_printf(m, "%d (%s) %c %d %d %d %d %d %u %lu \
-%lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \
-%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld\n",
- pid_nr_ns(pid, ns),
- tcomm,
- state,
- ppid,
- pgid,
- sid,
- tty_nr,
- tty_pgrp,
- task->flags,
- min_flt,
- cmin_flt,
- maj_flt,
- cmaj_flt,
- cputime_to_clock_t(utime),
- cputime_to_clock_t(stime),
- cputime_to_clock_t(cutime),
- cputime_to_clock_t(cstime),
- priority,
- nice,
- num_threads,
- start_time,
- vsize,
- mm ? get_mm_rss(mm) : 0,
- rsslim,
- mm ? (permitted ? mm->start_code : 1) : 0,
- mm ? (permitted ? mm->end_code : 1) : 0,
- (permitted && mm) ? mm->start_stack : 0,
- esp,
- eip,
- /* The signal information here is obsolete.
- * It must be decimal for Linux 2.0 compatibility.
- * Use /proc/#/status for real-time signals.
- */
- task->pending.signal.sig[0] & 0x7fffffffUL,
- task->blocked.sig[0] & 0x7fffffffUL,
- sigign .sig[0] & 0x7fffffffUL,
- sigcatch .sig[0] & 0x7fffffffUL,
- wchan,
- 0UL,
- 0UL,
- task->exit_signal,
- task_cpu(task),
- task->rt_priority,
- task->policy,
- (unsigned long long)delayacct_blkio_ticks(task),
- cputime_to_clock_t(gtime),
- cputime_to_clock_t(cgtime));
+ seq_printf(m, "%d (%s) %c", pid_nr_ns(pid, ns), tcomm, state);
+ seq_put_decimal_ll(m, ' ', ppid);
+ seq_put_decimal_ll(m, ' ', pgid);
+ seq_put_decimal_ll(m, ' ', sid);
+ seq_put_decimal_ll(m, ' ', tty_nr);
+ seq_put_decimal_ll(m, ' ', tty_pgrp);
+ seq_put_decimal_ull(m, ' ', task->flags);
+ seq_put_decimal_ull(m, ' ', min_flt);
+ seq_put_decimal_ull(m, ' ', cmin_flt);
+ seq_put_decimal_ull(m, ' ', maj_flt);
+ seq_put_decimal_ull(m, ' ', cmaj_flt);
+ seq_put_decimal_ull(m, ' ', cputime_to_clock_t(utime));
+ seq_put_decimal_ull(m, ' ', cputime_to_clock_t(stime));
+ seq_put_decimal_ll(m, ' ', cputime_to_clock_t(cutime));
+ seq_put_decimal_ll(m, ' ', cputime_to_clock_t(cstime));
+ seq_put_decimal_ll(m, ' ', priority);
+ seq_put_decimal_ll(m, ' ', nice);
+ seq_put_decimal_ll(m, ' ', num_threads);
+ seq_put_decimal_ull(m, ' ', 0);
+ seq_put_decimal_ull(m, ' ', start_time);
+ seq_put_decimal_ull(m, ' ', vsize);
+ seq_put_decimal_ll(m, ' ', mm ? get_mm_rss(mm) : 0);
+ seq_put_decimal_ull(m, ' ', rsslim);
+ seq_put_decimal_ull(m, ' ', mm ? (permitted ? mm->start_code : 1) : 0);
+ seq_put_decimal_ull(m, ' ', mm ? (permitted ? mm->end_code : 1) : 0);
+ seq_put_decimal_ull(m, ' ', (permitted && mm) ? mm->start_stack : 0);
+ seq_put_decimal_ull(m, ' ', esp);
+ seq_put_decimal_ull(m, ' ', eip);
+ /* The signal information here is obsolete.
+ * It must be decimal for Linux 2.0 compatibility.
+ * Use /proc/#/status for real-time signals.
+ */
+ seq_put_decimal_ull(m, ' ', task->pending.signal.sig[0] & 0x7fffffffUL);
+ seq_put_decimal_ull(m, ' ', task->blocked.sig[0] & 0x7fffffffUL);
+ seq_put_decimal_ull(m, ' ', sigign.sig[0] & 0x7fffffffUL);
+ seq_put_decimal_ull(m, ' ', sigcatch.sig[0] & 0x7fffffffUL);
+ seq_put_decimal_ull(m, ' ', wchan);
+ seq_put_decimal_ull(m, ' ', 0);
+ seq_put_decimal_ull(m, ' ', 0);
+ seq_put_decimal_ll(m, ' ', task->exit_signal);
+ seq_put_decimal_ll(m, ' ', task_cpu(task));
+ seq_put_decimal_ull(m, ' ', task->rt_priority);
+ seq_put_decimal_ull(m, ' ', task->policy);
+ seq_put_decimal_ull(m, ' ', delayacct_blkio_ticks(task));
+ seq_put_decimal_ull(m, ' ', cputime_to_clock_t(gtime));
+ seq_put_decimal_ll(m, ' ', cputime_to_clock_t(cgtime));
+ seq_put_decimal_ull(m, ' ', (mm && permitted) ? mm->start_data : 0);
+ seq_put_decimal_ull(m, ' ', (mm && permitted) ? mm->end_data : 0);
+ seq_put_decimal_ull(m, ' ', (mm && permitted) ? mm->start_brk : 0);
+ seq_putc(m, '\n');
if (mm)
mmput(mm);
return 0;
@@ -539,8 +539,20 @@ int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns,
size = task_statm(mm, &shared, &text, &data, &resident);
mmput(mm);
}
- seq_printf(m, "%lu %lu %lu %lu 0 %lu 0\n",
- size, resident, shared, text, data);
+ /*
+ * For quick read, open code by putting numbers directly
+ * expected format is
+ * seq_printf(m, "%lu %lu %lu %lu 0 %lu 0\n",
+ * size, resident, shared, text, data);
+ */
+ seq_put_decimal_ull(m, 0, size);
+ seq_put_decimal_ull(m, ' ', resident);
+ seq_put_decimal_ull(m, ' ', shared);
+ seq_put_decimal_ull(m, ' ', text);
+ seq_put_decimal_ull(m, ' ', 0);
+ seq_put_decimal_ull(m, ' ', data);
+ seq_put_decimal_ull(m, ' ', 0);
+ seq_putc(m, '\n');
return 0;
}
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 8173dfd89cb..1c8b280146d 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -198,82 +198,9 @@ static int proc_root_link(struct dentry *dentry, struct path *path)
return result;
}
-static struct mm_struct *__check_mem_permission(struct task_struct *task)
-{
- struct mm_struct *mm;
-
- mm = get_task_mm(task);
- if (!mm)
- return ERR_PTR(-EINVAL);
-
- /*
- * A task can always look at itself, in case it chooses
- * to use system calls instead of load instructions.
- */
- if (task == current)
- return mm;
-
- /*
- * If current is actively ptrace'ing, and would also be
- * permitted to freshly attach with ptrace now, permit it.
- */
- if (task_is_stopped_or_traced(task)) {
- int match;
- rcu_read_lock();
- match = (ptrace_parent(task) == current);
- rcu_read_unlock();
- if (match && ptrace_may_access(task, PTRACE_MODE_ATTACH))
- return mm;
- }
-
- /*
- * No one else is allowed.
- */
- mmput(mm);
- return ERR_PTR(-EPERM);
-}
-
-/*
- * If current may access user memory in @task return a reference to the
- * corresponding mm, otherwise ERR_PTR.
- */
-static struct mm_struct *check_mem_permission(struct task_struct *task)
-{
- struct mm_struct *mm;
- int err;
-
- /*
- * Avoid racing if task exec's as we might get a new mm but validate
- * against old credentials.
- */
- err = mutex_lock_killable(&task->signal->cred_guard_mutex);
- if (err)
- return ERR_PTR(err);
-
- mm = __check_mem_permission(task);
- mutex_unlock(&task->signal->cred_guard_mutex);
-
- return mm;
-}
-
struct mm_struct *mm_for_maps(struct task_struct *task)
{
- struct mm_struct *mm;
- int err;
-
- err = mutex_lock_killable(&task->signal->cred_guard_mutex);
- if (err)
- return ERR_PTR(err);
-
- mm = get_task_mm(task);
- if (mm && mm != current->mm &&
- !ptrace_may_access(task, PTRACE_MODE_READ)) {
- mmput(mm);
- mm = ERR_PTR(-EACCES);
- }
- mutex_unlock(&task->signal->cred_guard_mutex);
-
- return mm;
+ return mm_access(task, PTRACE_MODE_READ);
}
static int proc_pid_cmdline(struct task_struct *task, char * buffer)
@@ -654,6 +581,8 @@ static int proc_pid_permission(struct inode *inode, int mask)
bool has_perms;
task = get_proc_task(inode);
+ if (!task)
+ return -ESRCH;
has_perms = has_pid_permissions(pid, task, 1);
put_task_struct(task);
@@ -750,133 +679,96 @@ static const struct file_operations proc_single_file_operations = {
static int mem_open(struct inode* inode, struct file* file)
{
- file->private_data = (void*)((long)current->self_exec_id);
- /* OK to pass negative loff_t, we can catch out-of-range */
- file->f_mode |= FMODE_UNSIGNED_OFFSET;
- return 0;
-}
-
-static ssize_t mem_read(struct file * file, char __user * buf,
- size_t count, loff_t *ppos)
-{
struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
- char *page;
- unsigned long src = *ppos;
- int ret = -ESRCH;
struct mm_struct *mm;
if (!task)
- goto out_no_task;
+ return -ESRCH;
- ret = -ENOMEM;
- page = (char *)__get_free_page(GFP_TEMPORARY);
- if (!page)
- goto out;
+ mm = mm_access(task, PTRACE_MODE_ATTACH);
+ put_task_struct(task);
- mm = check_mem_permission(task);
- ret = PTR_ERR(mm);
if (IS_ERR(mm))
- goto out_free;
-
- ret = -EIO;
-
- if (file->private_data != (void*)((long)current->self_exec_id))
- goto out_put;
-
- ret = 0;
-
- while (count > 0) {
- int this_len, retval;
+ return PTR_ERR(mm);
- this_len = (count > PAGE_SIZE) ? PAGE_SIZE : count;
- retval = access_remote_vm(mm, src, page, this_len, 0);
- if (!retval) {
- if (!ret)
- ret = -EIO;
- break;
- }
-
- if (copy_to_user(buf, page, retval)) {
- ret = -EFAULT;
- break;
- }
-
- ret += retval;
- src += retval;
- buf += retval;
- count -= retval;
+ if (mm) {
+ /* ensure this mm_struct can't be freed */
+ atomic_inc(&mm->mm_count);
+ /* but do not pin its memory */
+ mmput(mm);
}
- *ppos = src;
-out_put:
- mmput(mm);
-out_free:
- free_page((unsigned long) page);
-out:
- put_task_struct(task);
-out_no_task:
- return ret;
+ /* OK to pass negative loff_t, we can catch out-of-range */
+ file->f_mode |= FMODE_UNSIGNED_OFFSET;
+ file->private_data = mm;
+
+ return 0;
}
-static ssize_t mem_write(struct file * file, const char __user *buf,
- size_t count, loff_t *ppos)
+static ssize_t mem_rw(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos, int write)
{
- int copied;
+ struct mm_struct *mm = file->private_data;
+ unsigned long addr = *ppos;
+ ssize_t copied;
char *page;
- struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
- unsigned long dst = *ppos;
- struct mm_struct *mm;
- copied = -ESRCH;
- if (!task)
- goto out_no_task;
+ if (!mm)
+ return 0;
- copied = -ENOMEM;
page = (char *)__get_free_page(GFP_TEMPORARY);
if (!page)
- goto out_task;
-
- mm = check_mem_permission(task);
- copied = PTR_ERR(mm);
- if (IS_ERR(mm))
- goto out_free;
-
- copied = -EIO;
- if (file->private_data != (void *)((long)current->self_exec_id))
- goto out_mm;
+ return -ENOMEM;
copied = 0;
+ if (!atomic_inc_not_zero(&mm->mm_users))
+ goto free;
+
while (count > 0) {
- int this_len, retval;
+ int this_len = min_t(int, count, PAGE_SIZE);
- this_len = (count > PAGE_SIZE) ? PAGE_SIZE : count;
- if (copy_from_user(page, buf, this_len)) {
+ if (write && copy_from_user(page, buf, this_len)) {
copied = -EFAULT;
break;
}
- retval = access_remote_vm(mm, dst, page, this_len, 1);
- if (!retval) {
+
+ this_len = access_remote_vm(mm, addr, page, this_len, write);
+ if (!this_len) {
if (!copied)
copied = -EIO;
break;
}
- copied += retval;
- buf += retval;
- dst += retval;
- count -= retval;
+
+ if (!write && copy_to_user(buf, page, this_len)) {
+ copied = -EFAULT;
+ break;
+ }
+
+ buf += this_len;
+ addr += this_len;
+ copied += this_len;
+ count -= this_len;
}
- *ppos = dst;
+ *ppos = addr;
-out_mm:
mmput(mm);
-out_free:
+free:
free_page((unsigned long) page);
-out_task:
- put_task_struct(task);
-out_no_task:
return copied;
}
+static ssize_t mem_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ return mem_rw(file, buf, count, ppos, 0);
+}
+
+static ssize_t mem_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ return mem_rw(file, (char __user*)buf, count, ppos, 1);
+}
+
loff_t mem_lseek(struct file *file, loff_t offset, int orig)
{
switch (orig) {
@@ -893,11 +785,20 @@ loff_t mem_lseek(struct file *file, loff_t offset, int orig)
return file->f_pos;
}
+static int mem_release(struct inode *inode, struct file *file)
+{
+ struct mm_struct *mm = file->private_data;
+ if (mm)
+ mmdrop(mm);
+ return 0;
+}
+
static const struct file_operations proc_mem_operations = {
.llseek = mem_lseek,
.read = mem_read,
.write = mem_write,
.open = mem_open,
+ .release = mem_release,
};
static ssize_t environ_read(struct file *file, char __user *buf,
@@ -1197,9 +1098,6 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
ssize_t length;
uid_t loginuid;
- if (!capable(CAP_AUDIT_CONTROL))
- return -EPERM;
-
rcu_read_lock();
if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) {
rcu_read_unlock();
@@ -1228,7 +1126,7 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
goto out_free_page;
}
- length = audit_set_loginuid(current, loginuid);
+ length = audit_set_loginuid(loginuid);
if (likely(length == 0))
length = count;
@@ -1412,8 +1310,7 @@ sched_autogroup_write(struct file *file, const char __user *buf,
if (!p)
return -ESRCH;
- err = nice;
- err = proc_sched_autogroup_set_nice(p, &err);
+ err = proc_sched_autogroup_set_nice(p, nice);
if (err)
count = err;
@@ -1856,7 +1753,7 @@ static int proc_fd_info(struct inode *inode, struct path *path, char *info)
fdt = files_fdtable(files);
f_flags = file->f_flags & ~O_CLOEXEC;
- if (FD_ISSET(fd, fdt->close_on_exec))
+ if (close_on_exec(fd, fdt))
f_flags |= O_CLOEXEC;
if (path) {
@@ -3092,9 +2989,9 @@ static const struct pid_entry tgid_base_stuff[] = {
INF("cmdline", S_IRUGO, proc_pid_cmdline),
ONE("stat", S_IRUGO, proc_tgid_stat),
ONE("statm", S_IRUGO, proc_pid_statm),
- REG("maps", S_IRUGO, proc_maps_operations),
+ REG("maps", S_IRUGO, proc_pid_maps_operations),
#ifdef CONFIG_NUMA
- REG("numa_maps", S_IRUGO, proc_numa_maps_operations),
+ REG("numa_maps", S_IRUGO, proc_pid_numa_maps_operations),
#endif
REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations),
LNK("cwd", proc_cwd_link),
@@ -3105,7 +3002,7 @@ static const struct pid_entry tgid_base_stuff[] = {
REG("mountstats", S_IRUSR, proc_mountstats_operations),
#ifdef CONFIG_PROC_PAGE_MONITOR
REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
- REG("smaps", S_IRUGO, proc_smaps_operations),
+ REG("smaps", S_IRUGO, proc_pid_smaps_operations),
REG("pagemap", S_IRUGO, proc_pagemap_operations),
#endif
#ifdef CONFIG_SECURITY
@@ -3451,9 +3348,9 @@ static const struct pid_entry tid_base_stuff[] = {
INF("cmdline", S_IRUGO, proc_pid_cmdline),
ONE("stat", S_IRUGO, proc_tid_stat),
ONE("statm", S_IRUGO, proc_pid_statm),
- REG("maps", S_IRUGO, proc_maps_operations),
+ REG("maps", S_IRUGO, proc_tid_maps_operations),
#ifdef CONFIG_NUMA
- REG("numa_maps", S_IRUGO, proc_numa_maps_operations),
+ REG("numa_maps", S_IRUGO, proc_tid_numa_maps_operations),
#endif
REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations),
LNK("cwd", proc_cwd_link),
@@ -3463,7 +3360,7 @@ static const struct pid_entry tid_base_stuff[] = {
REG("mountinfo", S_IRUGO, proc_mountinfo_operations),
#ifdef CONFIG_PROC_PAGE_MONITOR
REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
- REG("smaps", S_IRUGO, proc_smaps_operations),
+ REG("smaps", S_IRUGO, proc_tid_smaps_operations),
REG("pagemap", S_IRUGO, proc_pagemap_operations),
#endif
#ifdef CONFIG_SECURITY
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 84fd3235a59..205c9228083 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -22,7 +22,6 @@
#include <linux/slab.h>
#include <linux/mount.h>
-#include <asm/system.h>
#include <asm/uaccess.h>
#include "internal.h"
@@ -486,8 +485,6 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
int proc_fill_super(struct super_block *s)
{
- struct inode * root_inode;
-
s->s_flags |= MS_NODIRATIME | MS_NOSUID | MS_NOEXEC;
s->s_blocksize = 1024;
s->s_blocksize_bits = 10;
@@ -496,19 +493,11 @@ int proc_fill_super(struct super_block *s)
s->s_time_gran = 1;
pde_get(&proc_root);
- root_inode = proc_get_inode(s, &proc_root);
- if (!root_inode)
- goto out_no_root;
- root_inode->i_uid = 0;
- root_inode->i_gid = 0;
- s->s_root = d_alloc_root(root_inode);
- if (!s->s_root)
- goto out_no_root;
- return 0;
+ s->s_root = d_make_root(proc_get_inode(s, &proc_root));
+ if (s->s_root)
+ return 0;
-out_no_root:
printk("proc_read_super: get root inode failed\n");
- iput(root_inode);
pde_put(&proc_root);
return -ENOMEM;
}
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 292577531ad..5f79bb8b4c6 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -10,12 +10,15 @@
*/
#include <linux/proc_fs.h>
+struct ctl_table_header;
extern struct proc_dir_entry proc_root;
#ifdef CONFIG_PROC_SYSCTL
extern int proc_sys_init(void);
+extern void sysctl_head_put(struct ctl_table_header *head);
#else
static inline void proc_sys_init(void) { }
+static inline void sysctl_head_put(struct ctl_table_header *head) { }
#endif
#ifdef CONFIG_NET
extern int proc_net_init(void);
@@ -53,9 +56,12 @@ extern int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task);
extern loff_t mem_lseek(struct file *file, loff_t offset, int orig);
-extern const struct file_operations proc_maps_operations;
-extern const struct file_operations proc_numa_maps_operations;
-extern const struct file_operations proc_smaps_operations;
+extern const struct file_operations proc_pid_maps_operations;
+extern const struct file_operations proc_tid_maps_operations;
+extern const struct file_operations proc_pid_numa_maps_operations;
+extern const struct file_operations proc_tid_numa_maps_operations;
+extern const struct file_operations proc_pid_smaps_operations;
+extern const struct file_operations proc_tid_smaps_operations;
extern const struct file_operations proc_clear_refs_operations;
extern const struct file_operations proc_pagemap_operations;
extern const struct file_operations proc_net_operations;
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index d245cb23dd7..86c67eee439 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -157,7 +157,8 @@ static int kcore_update_ram(void)
#ifdef CONFIG_SPARSEMEM_VMEMMAP
/* calculate vmemmap's address from given system ram pfn and register it */
-int get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head)
+static int
+get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head)
{
unsigned long pfn = __pa(ent->addr) >> PAGE_SHIFT;
unsigned long nr_pages = ent->size >> PAGE_SHIFT;
@@ -189,7 +190,8 @@ int get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head)
}
#else
-int get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head)
+static int
+get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head)
{
return 1;
}
@@ -513,7 +515,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
n = copy_to_user(buffer, (char *)start, tsz);
/*
- * We cannot distingush between fault on source
+ * We cannot distinguish between fault on source
* and fault on destination. When this happens
* we clear too and hope it will trigger the
* EFAULT again.
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index 27da860115c..0d9e23a39e4 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -53,7 +53,7 @@ static struct dentry *proc_ns_instantiate(struct inode *dir,
ei->ns_ops = ns_ops;
ei->ns = ns;
- dentry->d_op = &pid_dentry_operations;
+ d_set_d_op(dentry, &pid_dentry_operations);
d_add(dentry, inode);
/* Close the race of the process dying before we return the dentry */
if (pid_revalidate(dentry, NULL))
@@ -156,15 +156,15 @@ static struct dentry *proc_ns_dir_lookup(struct inode *dir,
if (!ptrace_may_access(task, PTRACE_MODE_READ))
goto out;
- last = &ns_entries[ARRAY_SIZE(ns_entries) - 1];
- for (entry = ns_entries; entry <= last; entry++) {
+ last = &ns_entries[ARRAY_SIZE(ns_entries)];
+ for (entry = ns_entries; entry < last; entry++) {
if (strlen((*entry)->name) != len)
continue;
if (!memcmp(dentry->d_name.name, (*entry)->name, len))
break;
}
error = ERR_PTR(-ENOENT);
- if (entry > last)
+ if (entry == last)
goto out;
error = proc_ns_instantiate(dir, dentry, task, *entry);
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 6d8e6a9e93a..7fcd0d60a96 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -115,6 +115,8 @@ u64 stable_page_flags(struct page *page)
u |= 1 << KPF_COMPOUND_TAIL;
if (PageHuge(page))
u |= 1 << KPF_HUGE;
+ else if (PageTransCompound(page))
+ u |= 1 << KPF_THP;
/*
* Caveats on high order pages: page->_count will only be set
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index a6b62173d4c..21d836f4029 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -6,7 +6,10 @@
#include <linux/poll.h>
#include <linux/proc_fs.h>
#include <linux/security.h>
+#include <linux/sched.h>
#include <linux/namei.h>
+#include <linux/mm.h>
+#include <linux/module.h>
#include "internal.h"
static const struct dentry_operations proc_sys_dentry_operations;
@@ -24,6 +27,371 @@ void proc_sys_poll_notify(struct ctl_table_poll *poll)
wake_up_interruptible(&poll->wait);
}
+static struct ctl_table root_table[] = {
+ {
+ .procname = "",
+ .mode = S_IFDIR|S_IRUGO|S_IXUGO,
+ },
+ { }
+};
+static struct ctl_table_root sysctl_table_root = {
+ .default_set.dir.header = {
+ {{.count = 1,
+ .nreg = 1,
+ .ctl_table = root_table }},
+ .ctl_table_arg = root_table,
+ .root = &sysctl_table_root,
+ .set = &sysctl_table_root.default_set,
+ },
+};
+
+static DEFINE_SPINLOCK(sysctl_lock);
+
+static void drop_sysctl_table(struct ctl_table_header *header);
+static int sysctl_follow_link(struct ctl_table_header **phead,
+ struct ctl_table **pentry, struct nsproxy *namespaces);
+static int insert_links(struct ctl_table_header *head);
+static void put_links(struct ctl_table_header *header);
+
+static void sysctl_print_dir(struct ctl_dir *dir)
+{
+ if (dir->header.parent)
+ sysctl_print_dir(dir->header.parent);
+ printk(KERN_CONT "%s/", dir->header.ctl_table[0].procname);
+}
+
+static int namecmp(const char *name1, int len1, const char *name2, int len2)
+{
+ int minlen;
+ int cmp;
+
+ minlen = len1;
+ if (minlen > len2)
+ minlen = len2;
+
+ cmp = memcmp(name1, name2, minlen);
+ if (cmp == 0)
+ cmp = len1 - len2;
+ return cmp;
+}
+
+/* Called under sysctl_lock */
+static struct ctl_table *find_entry(struct ctl_table_header **phead,
+ struct ctl_dir *dir, const char *name, int namelen)
+{
+ struct ctl_table_header *head;
+ struct ctl_table *entry;
+ struct rb_node *node = dir->root.rb_node;
+
+ while (node)
+ {
+ struct ctl_node *ctl_node;
+ const char *procname;
+ int cmp;
+
+ ctl_node = rb_entry(node, struct ctl_node, node);
+ head = ctl_node->header;
+ entry = &head->ctl_table[ctl_node - head->node];
+ procname = entry->procname;
+
+ cmp = namecmp(name, namelen, procname, strlen(procname));
+ if (cmp < 0)
+ node = node->rb_left;
+ else if (cmp > 0)
+ node = node->rb_right;
+ else {
+ *phead = head;
+ return entry;
+ }
+ }
+ return NULL;
+}
+
+static int insert_entry(struct ctl_table_header *head, struct ctl_table *entry)
+{
+ struct rb_node *node = &head->node[entry - head->ctl_table].node;
+ struct rb_node **p = &head->parent->root.rb_node;
+ struct rb_node *parent = NULL;
+ const char *name = entry->procname;
+ int namelen = strlen(name);
+
+ while (*p) {
+ struct ctl_table_header *parent_head;
+ struct ctl_table *parent_entry;
+ struct ctl_node *parent_node;
+ const char *parent_name;
+ int cmp;
+
+ parent = *p;
+ parent_node = rb_entry(parent, struct ctl_node, node);
+ parent_head = parent_node->header;
+ parent_entry = &parent_head->ctl_table[parent_node - parent_head->node];
+ parent_name = parent_entry->procname;
+
+ cmp = namecmp(name, namelen, parent_name, strlen(parent_name));
+ if (cmp < 0)
+ p = &(*p)->rb_left;
+ else if (cmp > 0)
+ p = &(*p)->rb_right;
+ else {
+ printk(KERN_ERR "sysctl duplicate entry: ");
+ sysctl_print_dir(head->parent);
+ printk(KERN_CONT "/%s\n", entry->procname);
+ return -EEXIST;
+ }
+ }
+
+ rb_link_node(node, parent, p);
+ return 0;
+}
+
+static void erase_entry(struct ctl_table_header *head, struct ctl_table *entry)
+{
+ struct rb_node *node = &head->node[entry - head->ctl_table].node;
+
+ rb_erase(node, &head->parent->root);
+}
+
+static void init_header(struct ctl_table_header *head,
+ struct ctl_table_root *root, struct ctl_table_set *set,
+ struct ctl_node *node, struct ctl_table *table)
+{
+ head->ctl_table = table;
+ head->ctl_table_arg = table;
+ head->used = 0;
+ head->count = 1;
+ head->nreg = 1;
+ head->unregistering = NULL;
+ head->root = root;
+ head->set = set;
+ head->parent = NULL;
+ head->node = node;
+ if (node) {
+ struct ctl_table *entry;
+ for (entry = table; entry->procname; entry++, node++) {
+ rb_init_node(&node->node);
+ node->header = head;
+ }
+ }
+}
+
+static void erase_header(struct ctl_table_header *head)
+{
+ struct ctl_table *entry;
+ for (entry = head->ctl_table; entry->procname; entry++)
+ erase_entry(head, entry);
+}
+
+static int insert_header(struct ctl_dir *dir, struct ctl_table_header *header)
+{
+ struct ctl_table *entry;
+ int err;
+
+ dir->header.nreg++;
+ header->parent = dir;
+ err = insert_links(header);
+ if (err)
+ goto fail_links;
+ for (entry = header->ctl_table; entry->procname; entry++) {
+ err = insert_entry(header, entry);
+ if (err)
+ goto fail;
+ }
+ return 0;
+fail:
+ erase_header(header);
+ put_links(header);
+fail_links:
+ header->parent = NULL;
+ drop_sysctl_table(&dir->header);
+ return err;
+}
+
+/* called under sysctl_lock */
+static int use_table(struct ctl_table_header *p)
+{
+ if (unlikely(p->unregistering))
+ return 0;
+ p->used++;
+ return 1;
+}
+
+/* called under sysctl_lock */
+static void unuse_table(struct ctl_table_header *p)
+{
+ if (!--p->used)
+ if (unlikely(p->unregistering))
+ complete(p->unregistering);
+}
+
+/* called under sysctl_lock, will reacquire if has to wait */
+static void start_unregistering(struct ctl_table_header *p)
+{
+ /*
+ * if p->used is 0, nobody will ever touch that entry again;
+ * we'll eliminate all paths to it before dropping sysctl_lock
+ */
+ if (unlikely(p->used)) {
+ struct completion wait;
+ init_completion(&wait);
+ p->unregistering = &wait;
+ spin_unlock(&sysctl_lock);
+ wait_for_completion(&wait);
+ spin_lock(&sysctl_lock);
+ } else {
+ /* anything non-NULL; we'll never dereference it */
+ p->unregistering = ERR_PTR(-EINVAL);
+ }
+ /*
+ * do not remove from the list until nobody holds it; walking the
+ * list in do_sysctl() relies on that.
+ */
+ erase_header(p);
+}
+
+static void sysctl_head_get(struct ctl_table_header *head)
+{
+ spin_lock(&sysctl_lock);
+ head->count++;
+ spin_unlock(&sysctl_lock);
+}
+
+void sysctl_head_put(struct ctl_table_header *head)
+{
+ spin_lock(&sysctl_lock);
+ if (!--head->count)
+ kfree_rcu(head, rcu);
+ spin_unlock(&sysctl_lock);
+}
+
+static struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head)
+{
+ if (!head)
+ BUG();
+ spin_lock(&sysctl_lock);
+ if (!use_table(head))
+ head = ERR_PTR(-ENOENT);
+ spin_unlock(&sysctl_lock);
+ return head;
+}
+
+static void sysctl_head_finish(struct ctl_table_header *head)
+{
+ if (!head)
+ return;
+ spin_lock(&sysctl_lock);
+ unuse_table(head);
+ spin_unlock(&sysctl_lock);
+}
+
+static struct ctl_table_set *
+lookup_header_set(struct ctl_table_root *root, struct nsproxy *namespaces)
+{
+ struct ctl_table_set *set = &root->default_set;
+ if (root->lookup)
+ set = root->lookup(root, namespaces);
+ return set;
+}
+
+static struct ctl_table *lookup_entry(struct ctl_table_header **phead,
+ struct ctl_dir *dir,
+ const char *name, int namelen)
+{
+ struct ctl_table_header *head;
+ struct ctl_table *entry;
+
+ spin_lock(&sysctl_lock);
+ entry = find_entry(&head, dir, name, namelen);
+ if (entry && use_table(head))
+ *phead = head;
+ else
+ entry = NULL;
+ spin_unlock(&sysctl_lock);
+ return entry;
+}
+
+static struct ctl_node *first_usable_entry(struct rb_node *node)
+{
+ struct ctl_node *ctl_node;
+
+ for (;node; node = rb_next(node)) {
+ ctl_node = rb_entry(node, struct ctl_node, node);
+ if (use_table(ctl_node->header))
+ return ctl_node;
+ }
+ return NULL;
+}
+
+static void first_entry(struct ctl_dir *dir,
+ struct ctl_table_header **phead, struct ctl_table **pentry)
+{
+ struct ctl_table_header *head = NULL;
+ struct ctl_table *entry = NULL;
+ struct ctl_node *ctl_node;
+
+ spin_lock(&sysctl_lock);
+ ctl_node = first_usable_entry(rb_first(&dir->root));
+ spin_unlock(&sysctl_lock);
+ if (ctl_node) {
+ head = ctl_node->header;
+ entry = &head->ctl_table[ctl_node - head->node];
+ }
+ *phead = head;
+ *pentry = entry;
+}
+
+static void next_entry(struct ctl_table_header **phead, struct ctl_table **pentry)
+{
+ struct ctl_table_header *head = *phead;
+ struct ctl_table *entry = *pentry;
+ struct ctl_node *ctl_node = &head->node[entry - head->ctl_table];
+
+ spin_lock(&sysctl_lock);
+ unuse_table(head);
+
+ ctl_node = first_usable_entry(rb_next(&ctl_node->node));
+ spin_unlock(&sysctl_lock);
+ head = NULL;
+ if (ctl_node) {
+ head = ctl_node->header;
+ entry = &head->ctl_table[ctl_node - head->node];
+ }
+ *phead = head;
+ *pentry = entry;
+}
+
+void register_sysctl_root(struct ctl_table_root *root)
+{
+}
+
+/*
+ * sysctl_perm does NOT grant the superuser all rights automatically, because
+ * some sysctl variables are readonly even to root.
+ */
+
+static int test_perm(int mode, int op)
+{
+ if (!current_euid())
+ mode >>= 6;
+ else if (in_egroup_p(0))
+ mode >>= 3;
+ if ((op & ~mode & (MAY_READ|MAY_WRITE|MAY_EXEC)) == 0)
+ return 0;
+ return -EACCES;
+}
+
+static int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op)
+{
+ int mode;
+
+ if (root->permissions)
+ mode = root->permissions(root, current->nsproxy, table);
+ else
+ mode = table->mode;
+
+ return test_perm(mode, op);
+}
+
static struct inode *proc_sys_make_inode(struct super_block *sb,
struct ctl_table_header *head, struct ctl_table *table)
{
@@ -43,13 +411,12 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
inode->i_mode = table->mode;
- if (!table->child) {
+ if (!S_ISDIR(table->mode)) {
inode->i_mode |= S_IFREG;
inode->i_op = &proc_sys_inode_operations;
inode->i_fop = &proc_sys_file_operations;
} else {
inode->i_mode |= S_IFDIR;
- clear_nlink(inode);
inode->i_op = &proc_sys_dir_operations;
inode->i_fop = &proc_sys_dir_file_operations;
}
@@ -57,70 +424,42 @@ out:
return inode;
}
-static struct ctl_table *find_in_table(struct ctl_table *p, struct qstr *name)
-{
- int len;
- for ( ; p->procname; p++) {
-
- if (!p->procname)
- continue;
-
- len = strlen(p->procname);
- if (len != name->len)
- continue;
-
- if (memcmp(p->procname, name->name, len) != 0)
- continue;
-
- /* I have a match */
- return p;
- }
- return NULL;
-}
-
static struct ctl_table_header *grab_header(struct inode *inode)
{
- if (PROC_I(inode)->sysctl)
- return sysctl_head_grab(PROC_I(inode)->sysctl);
- else
- return sysctl_head_next(NULL);
+ struct ctl_table_header *head = PROC_I(inode)->sysctl;
+ if (!head)
+ head = &sysctl_table_root.default_set.dir.header;
+ return sysctl_head_grab(head);
}
static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry,
struct nameidata *nd)
{
struct ctl_table_header *head = grab_header(dir);
- struct ctl_table *table = PROC_I(dir)->sysctl_entry;
struct ctl_table_header *h = NULL;
struct qstr *name = &dentry->d_name;
struct ctl_table *p;
struct inode *inode;
struct dentry *err = ERR_PTR(-ENOENT);
+ struct ctl_dir *ctl_dir;
+ int ret;
if (IS_ERR(head))
return ERR_CAST(head);
- if (table && !table->child) {
- WARN_ON(1);
- goto out;
- }
-
- table = table ? table->child : head->ctl_table;
-
- p = find_in_table(table, name);
- if (!p) {
- for (h = sysctl_head_next(NULL); h; h = sysctl_head_next(h)) {
- if (h->attached_to != table)
- continue;
- p = find_in_table(h->attached_by, name);
- if (p)
- break;
- }
- }
+ ctl_dir = container_of(head, struct ctl_dir, header);
+ p = lookup_entry(&h, ctl_dir, name->name, name->len);
if (!p)
goto out;
+ if (S_ISLNK(p->mode)) {
+ ret = sysctl_follow_link(&h, &p, current->nsproxy);
+ err = ERR_PTR(ret);
+ if (ret)
+ goto out;
+ }
+
err = ERR_PTR(-ENOMEM);
inode = proc_sys_make_inode(dir->i_sb, h ? h : head, p);
if (h)
@@ -188,20 +527,32 @@ static ssize_t proc_sys_write(struct file *filp, const char __user *buf,
static int proc_sys_open(struct inode *inode, struct file *filp)
{
+ struct ctl_table_header *head = grab_header(inode);
struct ctl_table *table = PROC_I(inode)->sysctl_entry;
+ /* sysctl was unregistered */
+ if (IS_ERR(head))
+ return PTR_ERR(head);
+
if (table->poll)
filp->private_data = proc_sys_poll_event(table->poll);
+ sysctl_head_finish(head);
+
return 0;
}
static unsigned int proc_sys_poll(struct file *filp, poll_table *wait)
{
struct inode *inode = filp->f_path.dentry->d_inode;
+ struct ctl_table_header *head = grab_header(inode);
struct ctl_table *table = PROC_I(inode)->sysctl_entry;
- unsigned long event = (unsigned long)filp->private_data;
unsigned int ret = DEFAULT_POLLMASK;
+ unsigned long event;
+
+ /* sysctl was unregistered */
+ if (IS_ERR(head))
+ return POLLERR | POLLHUP;
if (!table->proc_handler)
goto out;
@@ -209,6 +560,7 @@ static unsigned int proc_sys_poll(struct file *filp, poll_table *wait)
if (!table->poll)
goto out;
+ event = (unsigned long)filp->private_data;
poll_wait(filp, &table->poll->wait, wait);
if (event != atomic_read(&table->poll->event)) {
@@ -217,6 +569,8 @@ static unsigned int proc_sys_poll(struct file *filp, poll_table *wait)
}
out:
+ sysctl_head_finish(head);
+
return ret;
}
@@ -258,28 +612,45 @@ static int proc_sys_fill_cache(struct file *filp, void *dirent,
return !!filldir(dirent, qname.name, qname.len, filp->f_pos, ino, type);
}
+static int proc_sys_link_fill_cache(struct file *filp, void *dirent,
+ filldir_t filldir,
+ struct ctl_table_header *head,
+ struct ctl_table *table)
+{
+ int err, ret = 0;
+ head = sysctl_head_grab(head);
+
+ if (S_ISLNK(table->mode)) {
+ /* It is not an error if we can not follow the link ignore it */
+ err = sysctl_follow_link(&head, &table, current->nsproxy);
+ if (err)
+ goto out;
+ }
+
+ ret = proc_sys_fill_cache(filp, dirent, filldir, head, table);
+out:
+ sysctl_head_finish(head);
+ return ret;
+}
+
static int scan(struct ctl_table_header *head, ctl_table *table,
unsigned long *pos, struct file *file,
void *dirent, filldir_t filldir)
{
+ int res;
- for (; table->procname; table++, (*pos)++) {
- int res;
-
- /* Can't do anything without a proc name */
- if (!table->procname)
- continue;
-
- if (*pos < file->f_pos)
- continue;
+ if ((*pos)++ < file->f_pos)
+ return 0;
+ if (unlikely(S_ISLNK(table->mode)))
+ res = proc_sys_link_fill_cache(file, dirent, filldir, head, table);
+ else
res = proc_sys_fill_cache(file, dirent, filldir, head, table);
- if (res)
- return res;
- file->f_pos = *pos + 1;
- }
- return 0;
+ if (res == 0)
+ file->f_pos = *pos;
+
+ return res;
}
static int proc_sys_readdir(struct file *filp, void *dirent, filldir_t filldir)
@@ -287,20 +658,16 @@ static int proc_sys_readdir(struct file *filp, void *dirent, filldir_t filldir)
struct dentry *dentry = filp->f_path.dentry;
struct inode *inode = dentry->d_inode;
struct ctl_table_header *head = grab_header(inode);
- struct ctl_table *table = PROC_I(inode)->sysctl_entry;
struct ctl_table_header *h = NULL;
+ struct ctl_table *entry;
+ struct ctl_dir *ctl_dir;
unsigned long pos;
int ret = -EINVAL;
if (IS_ERR(head))
return PTR_ERR(head);
- if (table && !table->child) {
- WARN_ON(1);
- goto out;
- }
-
- table = table ? table->child : head->ctl_table;
+ ctl_dir = container_of(head, struct ctl_dir, header);
ret = 0;
/* Avoid a switch here: arm builds fail with missing __cmpdi2 */
@@ -318,14 +685,8 @@ static int proc_sys_readdir(struct file *filp, void *dirent, filldir_t filldir)
}
pos = 2;
- ret = scan(head, table, &pos, filp, dirent, filldir);
- if (ret)
- goto out;
-
- for (h = sysctl_head_next(NULL); h; h = sysctl_head_next(h)) {
- if (h->attached_to != table)
- continue;
- ret = scan(h, h->attached_by, &pos, filp, dirent, filldir);
+ for (first_entry(ctl_dir, &h, &entry); h; next_entry(&h, &entry)) {
+ ret = scan(h, entry, &pos, filp, dirent, filldir);
if (ret) {
sysctl_head_finish(h);
break;
@@ -445,6 +806,21 @@ static int proc_sys_delete(const struct dentry *dentry)
return !!PROC_I(dentry->d_inode)->sysctl->unregistering;
}
+static int sysctl_is_seen(struct ctl_table_header *p)
+{
+ struct ctl_table_set *set = p->set;
+ int res;
+ spin_lock(&sysctl_lock);
+ if (p->unregistering)
+ res = 0;
+ else if (!set->is_seen)
+ res = 1;
+ else
+ res = set->is_seen(set);
+ spin_unlock(&sysctl_lock);
+ return res;
+}
+
static int proc_sys_compare(const struct dentry *parent,
const struct inode *pinode,
const struct dentry *dentry, const struct inode *inode,
@@ -470,6 +846,753 @@ static const struct dentry_operations proc_sys_dentry_operations = {
.d_compare = proc_sys_compare,
};
+static struct ctl_dir *find_subdir(struct ctl_dir *dir,
+ const char *name, int namelen)
+{
+ struct ctl_table_header *head;
+ struct ctl_table *entry;
+
+ entry = find_entry(&head, dir, name, namelen);
+ if (!entry)
+ return ERR_PTR(-ENOENT);
+ if (!S_ISDIR(entry->mode))
+ return ERR_PTR(-ENOTDIR);
+ return container_of(head, struct ctl_dir, header);
+}
+
+static struct ctl_dir *new_dir(struct ctl_table_set *set,
+ const char *name, int namelen)
+{
+ struct ctl_table *table;
+ struct ctl_dir *new;
+ struct ctl_node *node;
+ char *new_name;
+
+ new = kzalloc(sizeof(*new) + sizeof(struct ctl_node) +
+ sizeof(struct ctl_table)*2 + namelen + 1,
+ GFP_KERNEL);
+ if (!new)
+ return NULL;
+
+ node = (struct ctl_node *)(new + 1);
+ table = (struct ctl_table *)(node + 1);
+ new_name = (char *)(table + 2);
+ memcpy(new_name, name, namelen);
+ new_name[namelen] = '\0';
+ table[0].procname = new_name;
+ table[0].mode = S_IFDIR|S_IRUGO|S_IXUGO;
+ init_header(&new->header, set->dir.header.root, set, node, table);
+
+ return new;
+}
+
+/**
+ * get_subdir - find or create a subdir with the specified name.
+ * @dir: Directory to create the subdirectory in
+ * @name: The name of the subdirectory to find or create
+ * @namelen: The length of name
+ *
+ * Takes a directory with an elevated reference count so we know that
+ * if we drop the lock the directory will not go away. Upon success
+ * the reference is moved from @dir to the returned subdirectory.
+ * Upon error an error code is returned and the reference on @dir is
+ * simply dropped.
+ */
+static struct ctl_dir *get_subdir(struct ctl_dir *dir,
+ const char *name, int namelen)
+{
+ struct ctl_table_set *set = dir->header.set;
+ struct ctl_dir *subdir, *new = NULL;
+ int err;
+
+ spin_lock(&sysctl_lock);
+ subdir = find_subdir(dir, name, namelen);
+ if (!IS_ERR(subdir))
+ goto found;
+ if (PTR_ERR(subdir) != -ENOENT)
+ goto failed;
+
+ spin_unlock(&sysctl_lock);
+ new = new_dir(set, name, namelen);
+ spin_lock(&sysctl_lock);
+ subdir = ERR_PTR(-ENOMEM);
+ if (!new)
+ goto failed;
+
+ /* Was the subdir added while we dropped the lock? */
+ subdir = find_subdir(dir, name, namelen);
+ if (!IS_ERR(subdir))
+ goto found;
+ if (PTR_ERR(subdir) != -ENOENT)
+ goto failed;
+
+ /* Nope. Use the our freshly made directory entry. */
+ err = insert_header(dir, &new->header);
+ subdir = ERR_PTR(err);
+ if (err)
+ goto failed;
+ subdir = new;
+found:
+ subdir->header.nreg++;
+failed:
+ if (unlikely(IS_ERR(subdir))) {
+ printk(KERN_ERR "sysctl could not get directory: ");
+ sysctl_print_dir(dir);
+ printk(KERN_CONT "/%*.*s %ld\n",
+ namelen, namelen, name, PTR_ERR(subdir));
+ }
+ drop_sysctl_table(&dir->header);
+ if (new)
+ drop_sysctl_table(&new->header);
+ spin_unlock(&sysctl_lock);
+ return subdir;
+}
+
+static struct ctl_dir *xlate_dir(struct ctl_table_set *set, struct ctl_dir *dir)
+{
+ struct ctl_dir *parent;
+ const char *procname;
+ if (!dir->header.parent)
+ return &set->dir;
+ parent = xlate_dir(set, dir->header.parent);
+ if (IS_ERR(parent))
+ return parent;
+ procname = dir->header.ctl_table[0].procname;
+ return find_subdir(parent, procname, strlen(procname));
+}
+
+static int sysctl_follow_link(struct ctl_table_header **phead,
+ struct ctl_table **pentry, struct nsproxy *namespaces)
+{
+ struct ctl_table_header *head;
+ struct ctl_table_root *root;
+ struct ctl_table_set *set;
+ struct ctl_table *entry;
+ struct ctl_dir *dir;
+ int ret;
+
+ ret = 0;
+ spin_lock(&sysctl_lock);
+ root = (*pentry)->data;
+ set = lookup_header_set(root, namespaces);
+ dir = xlate_dir(set, (*phead)->parent);
+ if (IS_ERR(dir))
+ ret = PTR_ERR(dir);
+ else {
+ const char *procname = (*pentry)->procname;
+ head = NULL;
+ entry = find_entry(&head, dir, procname, strlen(procname));
+ ret = -ENOENT;
+ if (entry && use_table(head)) {
+ unuse_table(*phead);
+ *phead = head;
+ *pentry = entry;
+ ret = 0;
+ }
+ }
+
+ spin_unlock(&sysctl_lock);
+ return ret;
+}
+
+static int sysctl_err(const char *path, struct ctl_table *table, char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ printk(KERN_ERR "sysctl table check failed: %s/%s %pV\n",
+ path, table->procname, &vaf);
+
+ va_end(args);
+ return -EINVAL;
+}
+
+static int sysctl_check_table(const char *path, struct ctl_table *table)
+{
+ int err = 0;
+ for (; table->procname; table++) {
+ if (table->child)
+ err = sysctl_err(path, table, "Not a file");
+
+ if ((table->proc_handler == proc_dostring) ||
+ (table->proc_handler == proc_dointvec) ||
+ (table->proc_handler == proc_dointvec_minmax) ||
+ (table->proc_handler == proc_dointvec_jiffies) ||
+ (table->proc_handler == proc_dointvec_userhz_jiffies) ||
+ (table->proc_handler == proc_dointvec_ms_jiffies) ||
+ (table->proc_handler == proc_doulongvec_minmax) ||
+ (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) {
+ if (!table->data)
+ err = sysctl_err(path, table, "No data");
+ if (!table->maxlen)
+ err = sysctl_err(path, table, "No maxlen");
+ }
+ if (!table->proc_handler)
+ err = sysctl_err(path, table, "No proc_handler");
+
+ if ((table->mode & (S_IRUGO|S_IWUGO)) != table->mode)
+ err = sysctl_err(path, table, "bogus .mode 0%o",
+ table->mode);
+ }
+ return err;
+}
+
+static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table *table,
+ struct ctl_table_root *link_root)
+{
+ struct ctl_table *link_table, *entry, *link;
+ struct ctl_table_header *links;
+ struct ctl_node *node;
+ char *link_name;
+ int nr_entries, name_bytes;
+
+ name_bytes = 0;
+ nr_entries = 0;
+ for (entry = table; entry->procname; entry++) {
+ nr_entries++;
+ name_bytes += strlen(entry->procname) + 1;
+ }
+
+ links = kzalloc(sizeof(struct ctl_table_header) +
+ sizeof(struct ctl_node)*nr_entries +
+ sizeof(struct ctl_table)*(nr_entries + 1) +
+ name_bytes,
+ GFP_KERNEL);
+
+ if (!links)
+ return NULL;
+
+ node = (struct ctl_node *)(links + 1);
+ link_table = (struct ctl_table *)(node + nr_entries);
+ link_name = (char *)&link_table[nr_entries + 1];
+
+ for (link = link_table, entry = table; entry->procname; link++, entry++) {
+ int len = strlen(entry->procname) + 1;
+ memcpy(link_name, entry->procname, len);
+ link->procname = link_name;
+ link->mode = S_IFLNK|S_IRWXUGO;
+ link->data = link_root;
+ link_name += len;
+ }
+ init_header(links, dir->header.root, dir->header.set, node, link_table);
+ links->nreg = nr_entries;
+
+ return links;
+}
+
+static bool get_links(struct ctl_dir *dir,
+ struct ctl_table *table, struct ctl_table_root *link_root)
+{
+ struct ctl_table_header *head;
+ struct ctl_table *entry, *link;
+
+ /* Are there links available for every entry in table? */
+ for (entry = table; entry->procname; entry++) {
+ const char *procname = entry->procname;
+ link = find_entry(&head, dir, procname, strlen(procname));
+ if (!link)
+ return false;
+ if (S_ISDIR(link->mode) && S_ISDIR(entry->mode))
+ continue;
+ if (S_ISLNK(link->mode) && (link->data == link_root))
+ continue;
+ return false;
+ }
+
+ /* The checks passed. Increase the registration count on the links */
+ for (entry = table; entry->procname; entry++) {
+ const char *procname = entry->procname;
+ link = find_entry(&head, dir, procname, strlen(procname));
+ head->nreg++;
+ }
+ return true;
+}
+
+static int insert_links(struct ctl_table_header *head)
+{
+ struct ctl_table_set *root_set = &sysctl_table_root.default_set;
+ struct ctl_dir *core_parent = NULL;
+ struct ctl_table_header *links;
+ int err;
+
+ if (head->set == root_set)
+ return 0;
+
+ core_parent = xlate_dir(root_set, head->parent);
+ if (IS_ERR(core_parent))
+ return 0;
+
+ if (get_links(core_parent, head->ctl_table, head->root))
+ return 0;
+
+ core_parent->header.nreg++;
+ spin_unlock(&sysctl_lock);
+
+ links = new_links(core_parent, head->ctl_table, head->root);
+
+ spin_lock(&sysctl_lock);
+ err = -ENOMEM;
+ if (!links)
+ goto out;
+
+ err = 0;
+ if (get_links(core_parent, head->ctl_table, head->root)) {
+ kfree(links);
+ goto out;
+ }
+
+ err = insert_header(core_parent, links);
+ if (err)
+ kfree(links);
+out:
+ drop_sysctl_table(&core_parent->header);
+ return err;
+}
+
+/**
+ * __register_sysctl_table - register a leaf sysctl table
+ * @set: Sysctl tree to register on
+ * @path: The path to the directory the sysctl table is in.
+ * @table: the top-level table structure
+ *
+ * Register a sysctl table hierarchy. @table should be a filled in ctl_table
+ * array. A completely 0 filled entry terminates the table.
+ *
+ * The members of the &struct ctl_table structure are used as follows:
+ *
+ * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not
+ * enter a sysctl file
+ *
+ * data - a pointer to data for use by proc_handler
+ *
+ * maxlen - the maximum size in bytes of the data
+ *
+ * mode - the file permissions for the /proc/sys file
+ *
+ * child - must be %NULL.
+ *
+ * proc_handler - the text handler routine (described below)
+ *
+ * extra1, extra2 - extra pointers usable by the proc handler routines
+ *
+ * Leaf nodes in the sysctl tree will be represented by a single file
+ * under /proc; non-leaf nodes will be represented by directories.
+ *
+ * There must be a proc_handler routine for any terminal nodes.
+ * Several default handlers are available to cover common cases -
+ *
+ * proc_dostring(), proc_dointvec(), proc_dointvec_jiffies(),
+ * proc_dointvec_userhz_jiffies(), proc_dointvec_minmax(),
+ * proc_doulongvec_ms_jiffies_minmax(), proc_doulongvec_minmax()
+ *
+ * It is the handler's job to read the input buffer from user memory
+ * and process it. The handler should return 0 on success.
+ *
+ * This routine returns %NULL on a failure to register, and a pointer
+ * to the table header on success.
+ */
+struct ctl_table_header *__register_sysctl_table(
+ struct ctl_table_set *set,
+ const char *path, struct ctl_table *table)
+{
+ struct ctl_table_root *root = set->dir.header.root;
+ struct ctl_table_header *header;
+ const char *name, *nextname;
+ struct ctl_dir *dir;
+ struct ctl_table *entry;
+ struct ctl_node *node;
+ int nr_entries = 0;
+
+ for (entry = table; entry->procname; entry++)
+ nr_entries++;
+
+ header = kzalloc(sizeof(struct ctl_table_header) +
+ sizeof(struct ctl_node)*nr_entries, GFP_KERNEL);
+ if (!header)
+ return NULL;
+
+ node = (struct ctl_node *)(header + 1);
+ init_header(header, root, set, node, table);
+ if (sysctl_check_table(path, table))
+ goto fail;
+
+ spin_lock(&sysctl_lock);
+ dir = &set->dir;
+ /* Reference moved down the diretory tree get_subdir */
+ dir->header.nreg++;
+ spin_unlock(&sysctl_lock);
+
+ /* Find the directory for the ctl_table */
+ for (name = path; name; name = nextname) {
+ int namelen;
+ nextname = strchr(name, '/');
+ if (nextname) {
+ namelen = nextname - name;
+ nextname++;
+ } else {
+ namelen = strlen(name);
+ }
+ if (namelen == 0)
+ continue;
+
+ dir = get_subdir(dir, name, namelen);
+ if (IS_ERR(dir))
+ goto fail;
+ }
+
+ spin_lock(&sysctl_lock);
+ if (insert_header(dir, header))
+ goto fail_put_dir_locked;
+
+ drop_sysctl_table(&dir->header);
+ spin_unlock(&sysctl_lock);
+
+ return header;
+
+fail_put_dir_locked:
+ drop_sysctl_table(&dir->header);
+ spin_unlock(&sysctl_lock);
+fail:
+ kfree(header);
+ dump_stack();
+ return NULL;
+}
+
+/**
+ * register_sysctl - register a sysctl table
+ * @path: The path to the directory the sysctl table is in.
+ * @table: the table structure
+ *
+ * Register a sysctl table. @table should be a filled in ctl_table
+ * array. A completely 0 filled entry terminates the table.
+ *
+ * See __register_sysctl_table for more details.
+ */
+struct ctl_table_header *register_sysctl(const char *path, struct ctl_table *table)
+{
+ return __register_sysctl_table(&sysctl_table_root.default_set,
+ path, table);
+}
+EXPORT_SYMBOL(register_sysctl);
+
+static char *append_path(const char *path, char *pos, const char *name)
+{
+ int namelen;
+ namelen = strlen(name);
+ if (((pos - path) + namelen + 2) >= PATH_MAX)
+ return NULL;
+ memcpy(pos, name, namelen);
+ pos[namelen] = '/';
+ pos[namelen + 1] = '\0';
+ pos += namelen + 1;
+ return pos;
+}
+
+static int count_subheaders(struct ctl_table *table)
+{
+ int has_files = 0;
+ int nr_subheaders = 0;
+ struct ctl_table *entry;
+
+ /* special case: no directory and empty directory */
+ if (!table || !table->procname)
+ return 1;
+
+ for (entry = table; entry->procname; entry++) {
+ if (entry->child)
+ nr_subheaders += count_subheaders(entry->child);
+ else
+ has_files = 1;
+ }
+ return nr_subheaders + has_files;
+}
+
+static int register_leaf_sysctl_tables(const char *path, char *pos,
+ struct ctl_table_header ***subheader, struct ctl_table_set *set,
+ struct ctl_table *table)
+{
+ struct ctl_table *ctl_table_arg = NULL;
+ struct ctl_table *entry, *files;
+ int nr_files = 0;
+ int nr_dirs = 0;
+ int err = -ENOMEM;
+
+ for (entry = table; entry->procname; entry++) {
+ if (entry->child)
+ nr_dirs++;
+ else
+ nr_files++;
+ }
+
+ files = table;
+ /* If there are mixed files and directories we need a new table */
+ if (nr_dirs && nr_files) {
+ struct ctl_table *new;
+ files = kzalloc(sizeof(struct ctl_table) * (nr_files + 1),
+ GFP_KERNEL);
+ if (!files)
+ goto out;
+
+ ctl_table_arg = files;
+ for (new = files, entry = table; entry->procname; entry++) {
+ if (entry->child)
+ continue;
+ *new = *entry;
+ new++;
+ }
+ }
+
+ /* Register everything except a directory full of subdirectories */
+ if (nr_files || !nr_dirs) {
+ struct ctl_table_header *header;
+ header = __register_sysctl_table(set, path, files);
+ if (!header) {
+ kfree(ctl_table_arg);
+ goto out;
+ }
+
+ /* Remember if we need to free the file table */
+ header->ctl_table_arg = ctl_table_arg;
+ **subheader = header;
+ (*subheader)++;
+ }
+
+ /* Recurse into the subdirectories. */
+ for (entry = table; entry->procname; entry++) {
+ char *child_pos;
+
+ if (!entry->child)
+ continue;
+
+ err = -ENAMETOOLONG;
+ child_pos = append_path(path, pos, entry->procname);
+ if (!child_pos)
+ goto out;
+
+ err = register_leaf_sysctl_tables(path, child_pos, subheader,
+ set, entry->child);
+ pos[0] = '\0';
+ if (err)
+ goto out;
+ }
+ err = 0;
+out:
+ /* On failure our caller will unregister all registered subheaders */
+ return err;
+}
+
+/**
+ * __register_sysctl_paths - register a sysctl table hierarchy
+ * @set: Sysctl tree to register on
+ * @path: The path to the directory the sysctl table is in.
+ * @table: the top-level table structure
+ *
+ * Register a sysctl table hierarchy. @table should be a filled in ctl_table
+ * array. A completely 0 filled entry terminates the table.
+ *
+ * See __register_sysctl_table for more details.
+ */
+struct ctl_table_header *__register_sysctl_paths(
+ struct ctl_table_set *set,
+ const struct ctl_path *path, struct ctl_table *table)
+{
+ struct ctl_table *ctl_table_arg = table;
+ int nr_subheaders = count_subheaders(table);
+ struct ctl_table_header *header = NULL, **subheaders, **subheader;
+ const struct ctl_path *component;
+ char *new_path, *pos;
+
+ pos = new_path = kmalloc(PATH_MAX, GFP_KERNEL);
+ if (!new_path)
+ return NULL;
+
+ pos[0] = '\0';
+ for (component = path; component->procname; component++) {
+ pos = append_path(new_path, pos, component->procname);
+ if (!pos)
+ goto out;
+ }
+ while (table->procname && table->child && !table[1].procname) {
+ pos = append_path(new_path, pos, table->procname);
+ if (!pos)
+ goto out;
+ table = table->child;
+ }
+ if (nr_subheaders == 1) {
+ header = __register_sysctl_table(set, new_path, table);
+ if (header)
+ header->ctl_table_arg = ctl_table_arg;
+ } else {
+ header = kzalloc(sizeof(*header) +
+ sizeof(*subheaders)*nr_subheaders, GFP_KERNEL);
+ if (!header)
+ goto out;
+
+ subheaders = (struct ctl_table_header **) (header + 1);
+ subheader = subheaders;
+ header->ctl_table_arg = ctl_table_arg;
+
+ if (register_leaf_sysctl_tables(new_path, pos, &subheader,
+ set, table))
+ goto err_register_leaves;
+ }
+
+out:
+ kfree(new_path);
+ return header;
+
+err_register_leaves:
+ while (subheader > subheaders) {
+ struct ctl_table_header *subh = *(--subheader);
+ struct ctl_table *table = subh->ctl_table_arg;
+ unregister_sysctl_table(subh);
+ kfree(table);
+ }
+ kfree(header);
+ header = NULL;
+ goto out;
+}
+
+/**
+ * register_sysctl_table_path - register a sysctl table hierarchy
+ * @path: The path to the directory the sysctl table is in.
+ * @table: the top-level table structure
+ *
+ * Register a sysctl table hierarchy. @table should be a filled in ctl_table
+ * array. A completely 0 filled entry terminates the table.
+ *
+ * See __register_sysctl_paths for more details.
+ */
+struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
+ struct ctl_table *table)
+{
+ return __register_sysctl_paths(&sysctl_table_root.default_set,
+ path, table);
+}
+EXPORT_SYMBOL(register_sysctl_paths);
+
+/**
+ * register_sysctl_table - register a sysctl table hierarchy
+ * @table: the top-level table structure
+ *
+ * Register a sysctl table hierarchy. @table should be a filled in ctl_table
+ * array. A completely 0 filled entry terminates the table.
+ *
+ * See register_sysctl_paths for more details.
+ */
+struct ctl_table_header *register_sysctl_table(struct ctl_table *table)
+{
+ static const struct ctl_path null_path[] = { {} };
+
+ return register_sysctl_paths(null_path, table);
+}
+EXPORT_SYMBOL(register_sysctl_table);
+
+static void put_links(struct ctl_table_header *header)
+{
+ struct ctl_table_set *root_set = &sysctl_table_root.default_set;
+ struct ctl_table_root *root = header->root;
+ struct ctl_dir *parent = header->parent;
+ struct ctl_dir *core_parent;
+ struct ctl_table *entry;
+
+ if (header->set == root_set)
+ return;
+
+ core_parent = xlate_dir(root_set, parent);
+ if (IS_ERR(core_parent))
+ return;
+
+ for (entry = header->ctl_table; entry->procname; entry++) {
+ struct ctl_table_header *link_head;
+ struct ctl_table *link;
+ const char *name = entry->procname;
+
+ link = find_entry(&link_head, core_parent, name, strlen(name));
+ if (link &&
+ ((S_ISDIR(link->mode) && S_ISDIR(entry->mode)) ||
+ (S_ISLNK(link->mode) && (link->data == root)))) {
+ drop_sysctl_table(link_head);
+ }
+ else {
+ printk(KERN_ERR "sysctl link missing during unregister: ");
+ sysctl_print_dir(parent);
+ printk(KERN_CONT "/%s\n", name);
+ }
+ }
+}
+
+static void drop_sysctl_table(struct ctl_table_header *header)
+{
+ struct ctl_dir *parent = header->parent;
+
+ if (--header->nreg)
+ return;
+
+ put_links(header);
+ start_unregistering(header);
+ if (!--header->count)
+ kfree_rcu(header, rcu);
+
+ if (parent)
+ drop_sysctl_table(&parent->header);
+}
+
+/**
+ * unregister_sysctl_table - unregister a sysctl table hierarchy
+ * @header: the header returned from register_sysctl_table
+ *
+ * Unregisters the sysctl table and all children. proc entries may not
+ * actually be removed until they are no longer used by anyone.
+ */
+void unregister_sysctl_table(struct ctl_table_header * header)
+{
+ int nr_subheaders;
+ might_sleep();
+
+ if (header == NULL)
+ return;
+
+ nr_subheaders = count_subheaders(header->ctl_table_arg);
+ if (unlikely(nr_subheaders > 1)) {
+ struct ctl_table_header **subheaders;
+ int i;
+
+ subheaders = (struct ctl_table_header **)(header + 1);
+ for (i = nr_subheaders -1; i >= 0; i--) {
+ struct ctl_table_header *subh = subheaders[i];
+ struct ctl_table *table = subh->ctl_table_arg;
+ unregister_sysctl_table(subh);
+ kfree(table);
+ }
+ kfree(header);
+ return;
+ }
+
+ spin_lock(&sysctl_lock);
+ drop_sysctl_table(header);
+ spin_unlock(&sysctl_lock);
+}
+EXPORT_SYMBOL(unregister_sysctl_table);
+
+void setup_sysctl_set(struct ctl_table_set *set,
+ struct ctl_table_root *root,
+ int (*is_seen)(struct ctl_table_set *))
+{
+ memset(set, 0, sizeof(*set));
+ set->is_seen = is_seen;
+ init_header(&set->dir.header, root, set, NULL, root_table);
+}
+
+void retire_sysctl_set(struct ctl_table_set *set)
+{
+ WARN_ON(!RB_EMPTY_ROOT(&set->dir.root));
+}
+
int __init proc_sys_init(void)
{
struct proc_dir_entry *proc_sys_root;
@@ -478,5 +1601,6 @@ int __init proc_sys_init(void)
proc_sys_root->proc_iops = &proc_sys_dir_operations;
proc_sys_root->proc_fops = &proc_sys_dir_file_operations;
proc_sys_root->nlink = 0;
- return 0;
+
+ return sysctl_init();
}
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 46a15d8a29c..eed44bfc85d 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -115,12 +115,13 @@ static struct dentry *proc_mount(struct file_system_type *fs_type,
if (IS_ERR(sb))
return ERR_CAST(sb);
+ if (!proc_parse_options(options, ns)) {
+ deactivate_locked_super(sb);
+ return ERR_PTR(-EINVAL);
+ }
+
if (!sb->s_root) {
sb->s_flags = flags;
- if (!proc_parse_options(options, ns)) {
- deactivate_locked_super(sb);
- return ERR_PTR(-EINVAL);
- }
err = proc_fill_super(sb);
if (err) {
deactivate_locked_super(sb);
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index d76ca6ae2b1..64c3b317236 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -18,19 +18,39 @@
#ifndef arch_irq_stat
#define arch_irq_stat() 0
#endif
-#ifndef arch_idle_time
-#define arch_idle_time(cpu) 0
-#endif
+
+#ifdef arch_idle_time
+
+static cputime64_t get_idle_time(int cpu)
+{
+ cputime64_t idle;
+
+ idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE];
+ if (cpu_online(cpu) && !nr_iowait_cpu(cpu))
+ idle += arch_idle_time(cpu);
+ return idle;
+}
+
+static cputime64_t get_iowait_time(int cpu)
+{
+ cputime64_t iowait;
+
+ iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT];
+ if (cpu_online(cpu) && nr_iowait_cpu(cpu))
+ iowait += arch_idle_time(cpu);
+ return iowait;
+}
+
+#else
static u64 get_idle_time(int cpu)
{
u64 idle, idle_time = get_cpu_idle_time_us(cpu, NULL);
- if (idle_time == -1ULL) {
+ if (idle_time == -1ULL)
/* !NO_HZ so we can rely on cpustat.idle */
idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE];
- idle += arch_idle_time(cpu);
- } else
+ else
idle = usecs_to_cputime64(idle_time);
return idle;
@@ -49,6 +69,8 @@ static u64 get_iowait_time(int cpu)
return iowait;
}
+#endif
+
static int show_stat(struct seq_file *p, void *v)
{
int i, j;
@@ -77,6 +99,8 @@ static int show_stat(struct seq_file *p, void *v)
steal += kcpustat_cpu(i).cpustat[CPUTIME_STEAL];
guest += kcpustat_cpu(i).cpustat[CPUTIME_GUEST];
guest_nice += kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE];
+ sum += kstat_cpu_irqs_sum(i);
+ sum += arch_irq_stat_cpu(i);
for (j = 0; j < NR_SOFTIRQS; j++) {
unsigned int softirq_stat = kstat_softirqs_cpu(j, i);
@@ -87,18 +111,19 @@ static int show_stat(struct seq_file *p, void *v)
}
sum += arch_irq_stat();
- seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu "
- "%llu\n",
- (unsigned long long)cputime64_to_clock_t(user),
- (unsigned long long)cputime64_to_clock_t(nice),
- (unsigned long long)cputime64_to_clock_t(system),
- (unsigned long long)cputime64_to_clock_t(idle),
- (unsigned long long)cputime64_to_clock_t(iowait),
- (unsigned long long)cputime64_to_clock_t(irq),
- (unsigned long long)cputime64_to_clock_t(softirq),
- (unsigned long long)cputime64_to_clock_t(steal),
- (unsigned long long)cputime64_to_clock_t(guest),
- (unsigned long long)cputime64_to_clock_t(guest_nice));
+ seq_puts(p, "cpu ");
+ seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(user));
+ seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(nice));
+ seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(system));
+ seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(idle));
+ seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(iowait));
+ seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(irq));
+ seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(softirq));
+ seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(steal));
+ seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest));
+ seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest_nice));
+ seq_putc(p, '\n');
+
for_each_online_cpu(i) {
/* Copy values here to work around gcc-2.95.3, gcc-2.96 */
user = kcpustat_cpu(i).cpustat[CPUTIME_USER];
@@ -111,26 +136,24 @@ static int show_stat(struct seq_file *p, void *v)
steal = kcpustat_cpu(i).cpustat[CPUTIME_STEAL];
guest = kcpustat_cpu(i).cpustat[CPUTIME_GUEST];
guest_nice = kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE];
- seq_printf(p,
- "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu "
- "%llu\n",
- i,
- (unsigned long long)cputime64_to_clock_t(user),
- (unsigned long long)cputime64_to_clock_t(nice),
- (unsigned long long)cputime64_to_clock_t(system),
- (unsigned long long)cputime64_to_clock_t(idle),
- (unsigned long long)cputime64_to_clock_t(iowait),
- (unsigned long long)cputime64_to_clock_t(irq),
- (unsigned long long)cputime64_to_clock_t(softirq),
- (unsigned long long)cputime64_to_clock_t(steal),
- (unsigned long long)cputime64_to_clock_t(guest),
- (unsigned long long)cputime64_to_clock_t(guest_nice));
+ seq_printf(p, "cpu%d", i);
+ seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(user));
+ seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(nice));
+ seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(system));
+ seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(idle));
+ seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(iowait));
+ seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(irq));
+ seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(softirq));
+ seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(steal));
+ seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest));
+ seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest_nice));
+ seq_putc(p, '\n');
}
seq_printf(p, "intr %llu", (unsigned long long)sum);
/* sum again ? it could be updated? */
for_each_irq_nr(j)
- seq_printf(p, " %u", kstat_irqs(j));
+ seq_put_decimal_ull(p, ' ', kstat_irqs(j));
seq_printf(p,
"\nctxt %llu\n"
@@ -147,7 +170,7 @@ static int show_stat(struct seq_file *p, void *v)
seq_printf(p, "softirq %llu", (unsigned long long)sum_softirq);
for (i = 0; i < NR_SOFTIRQS; i++)
- seq_printf(p, " %u", per_softirq_sums[i]);
+ seq_put_decimal_ull(p, ' ', per_softirq_sums[i]);
seq_putc(p, '\n');
return 0;
@@ -155,11 +178,14 @@ static int show_stat(struct seq_file *p, void *v)
static int stat_open(struct inode *inode, struct file *file)
{
- unsigned size = 4096 * (1 + num_possible_cpus() / 32);
+ unsigned size = 1024 + 128 * num_possible_cpus();
char *buf;
struct seq_file *m;
int res;
+ /* minimum size to display an interrupt count : 2 bytes */
+ size += 2 * nr_irqs;
+
/* don't ask for more than the kmalloc() max size */
if (size > KMALLOC_MAX_SIZE)
size = KMALLOC_MAX_SIZE;
@@ -171,7 +197,7 @@ static int stat_open(struct inode *inode, struct file *file)
if (!res) {
m = file->private_data;
m->buf = buf;
- m->size = size;
+ m->size = ksize(buf);
} else
kfree(buf);
return res;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index e418c5abdb0..2b9a7607cbd 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -209,16 +209,20 @@ static int do_maps_open(struct inode *inode, struct file *file,
return ret;
}
-static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
+static void
+show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
{
struct mm_struct *mm = vma->vm_mm;
struct file *file = vma->vm_file;
+ struct proc_maps_private *priv = m->private;
+ struct task_struct *task = priv->task;
vm_flags_t flags = vma->vm_flags;
unsigned long ino = 0;
unsigned long long pgoff = 0;
unsigned long start, end;
dev_t dev = 0;
int len;
+ const char *name = NULL;
if (file) {
struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
@@ -252,36 +256,57 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
if (file) {
pad_len_spaces(m, len);
seq_path(m, &file->f_path, "\n");
- } else {
- const char *name = arch_vma_name(vma);
- if (!name) {
- if (mm) {
- if (vma->vm_start <= mm->brk &&
- vma->vm_end >= mm->start_brk) {
- name = "[heap]";
- } else if (vma->vm_start <= mm->start_stack &&
- vma->vm_end >= mm->start_stack) {
- name = "[stack]";
- }
+ goto done;
+ }
+
+ name = arch_vma_name(vma);
+ if (!name) {
+ pid_t tid;
+
+ if (!mm) {
+ name = "[vdso]";
+ goto done;
+ }
+
+ if (vma->vm_start <= mm->brk &&
+ vma->vm_end >= mm->start_brk) {
+ name = "[heap]";
+ goto done;
+ }
+
+ tid = vm_is_stack(task, vma, is_pid);
+
+ if (tid != 0) {
+ /*
+ * Thread stack in /proc/PID/task/TID/maps or
+ * the main process stack.
+ */
+ if (!is_pid || (vma->vm_start <= mm->start_stack &&
+ vma->vm_end >= mm->start_stack)) {
+ name = "[stack]";
} else {
- name = "[vdso]";
+ /* Thread stack in /proc/PID/maps */
+ pad_len_spaces(m, len);
+ seq_printf(m, "[stack:%d]", tid);
}
}
- if (name) {
- pad_len_spaces(m, len);
- seq_puts(m, name);
- }
+ }
+
+done:
+ if (name) {
+ pad_len_spaces(m, len);
+ seq_puts(m, name);
}
seq_putc(m, '\n');
}
-static int show_map(struct seq_file *m, void *v)
+static int show_map(struct seq_file *m, void *v, int is_pid)
{
struct vm_area_struct *vma = v;
struct proc_maps_private *priv = m->private;
struct task_struct *task = priv->task;
- show_map_vma(m, vma);
+ show_map_vma(m, vma, is_pid);
if (m->count < m->size) /* vma is copied successfully */
m->version = (vma != get_gate_vma(task->mm))
@@ -289,20 +314,49 @@ static int show_map(struct seq_file *m, void *v)
return 0;
}
+static int show_pid_map(struct seq_file *m, void *v)
+{
+ return show_map(m, v, 1);
+}
+
+static int show_tid_map(struct seq_file *m, void *v)
+{
+ return show_map(m, v, 0);
+}
+
static const struct seq_operations proc_pid_maps_op = {
.start = m_start,
.next = m_next,
.stop = m_stop,
- .show = show_map
+ .show = show_pid_map
};
-static int maps_open(struct inode *inode, struct file *file)
+static const struct seq_operations proc_tid_maps_op = {
+ .start = m_start,
+ .next = m_next,
+ .stop = m_stop,
+ .show = show_tid_map
+};
+
+static int pid_maps_open(struct inode *inode, struct file *file)
{
return do_maps_open(inode, file, &proc_pid_maps_op);
}
-const struct file_operations proc_maps_operations = {
- .open = maps_open,
+static int tid_maps_open(struct inode *inode, struct file *file)
+{
+ return do_maps_open(inode, file, &proc_tid_maps_op);
+}
+
+const struct file_operations proc_pid_maps_operations = {
+ .open = pid_maps_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+
+const struct file_operations proc_tid_maps_operations = {
+ .open = tid_maps_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release_private,
@@ -394,21 +448,15 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
pte_t *pte;
spinlock_t *ptl;
- spin_lock(&walk->mm->page_table_lock);
- if (pmd_trans_huge(*pmd)) {
- if (pmd_trans_splitting(*pmd)) {
- spin_unlock(&walk->mm->page_table_lock);
- wait_split_huge_page(vma->anon_vma, pmd);
- } else {
- smaps_pte_entry(*(pte_t *)pmd, addr,
- HPAGE_PMD_SIZE, walk);
- spin_unlock(&walk->mm->page_table_lock);
- mss->anonymous_thp += HPAGE_PMD_SIZE;
- return 0;
- }
- } else {
+ if (pmd_trans_huge_lock(pmd, vma) == 1) {
+ smaps_pte_entry(*(pte_t *)pmd, addr, HPAGE_PMD_SIZE, walk);
spin_unlock(&walk->mm->page_table_lock);
+ mss->anonymous_thp += HPAGE_PMD_SIZE;
+ return 0;
}
+
+ if (pmd_trans_unstable(pmd))
+ return 0;
/*
* The mmap_sem held all the way back in m_start() is what
* keeps khugepaged out of here and from collapsing things
@@ -422,7 +470,7 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
return 0;
}
-static int show_smap(struct seq_file *m, void *v)
+static int show_smap(struct seq_file *m, void *v, int is_pid)
{
struct proc_maps_private *priv = m->private;
struct task_struct *task = priv->task;
@@ -440,7 +488,7 @@ static int show_smap(struct seq_file *m, void *v)
if (vma->vm_mm && !is_vm_hugetlb_page(vma))
walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk);
- show_map_vma(m, vma);
+ show_map_vma(m, vma, is_pid);
seq_printf(m,
"Size: %8lu kB\n"
@@ -479,20 +527,49 @@ static int show_smap(struct seq_file *m, void *v)
return 0;
}
+static int show_pid_smap(struct seq_file *m, void *v)
+{
+ return show_smap(m, v, 1);
+}
+
+static int show_tid_smap(struct seq_file *m, void *v)
+{
+ return show_smap(m, v, 0);
+}
+
static const struct seq_operations proc_pid_smaps_op = {
.start = m_start,
.next = m_next,
.stop = m_stop,
- .show = show_smap
+ .show = show_pid_smap
+};
+
+static const struct seq_operations proc_tid_smaps_op = {
+ .start = m_start,
+ .next = m_next,
+ .stop = m_stop,
+ .show = show_tid_smap
};
-static int smaps_open(struct inode *inode, struct file *file)
+static int pid_smaps_open(struct inode *inode, struct file *file)
{
return do_maps_open(inode, file, &proc_pid_smaps_op);
}
-const struct file_operations proc_smaps_operations = {
- .open = smaps_open,
+static int tid_smaps_open(struct inode *inode, struct file *file)
+{
+ return do_maps_open(inode, file, &proc_tid_smaps_op);
+}
+
+const struct file_operations proc_pid_smaps_operations = {
+ .open = pid_smaps_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+
+const struct file_operations proc_tid_smaps_operations = {
+ .open = tid_smaps_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release_private,
@@ -507,6 +584,8 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
struct page *page;
split_huge_page_pmd(walk->mm, pmd);
+ if (pmd_trans_unstable(pmd))
+ return 0;
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
for (; addr != end; pte++, addr += PAGE_SIZE) {
@@ -518,6 +597,9 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
if (!page)
continue;
+ if (PageReserved(page))
+ continue;
+
/* Clear accessed and referenced bits. */
ptep_test_and_clear_young(vma, addr, pte);
ClearPageReferenced(page);
@@ -595,11 +677,18 @@ const struct file_operations proc_clear_refs_operations = {
.llseek = noop_llseek,
};
+typedef struct {
+ u64 pme;
+} pagemap_entry_t;
+
struct pagemapread {
int pos, len;
- u64 *buffer;
+ pagemap_entry_t *buffer;
};
+#define PAGEMAP_WALK_SIZE (PMD_SIZE)
+#define PAGEMAP_WALK_MASK (PMD_MASK)
+
#define PM_ENTRY_BYTES sizeof(u64)
#define PM_STATUS_BITS 3
#define PM_STATUS_OFFSET (64 - PM_STATUS_BITS)
@@ -617,10 +706,15 @@ struct pagemapread {
#define PM_NOT_PRESENT PM_PSHIFT(PAGE_SHIFT)
#define PM_END_OF_BUFFER 1
-static int add_to_pagemap(unsigned long addr, u64 pfn,
+static inline pagemap_entry_t make_pme(u64 val)
+{
+ return (pagemap_entry_t) { .pme = val };
+}
+
+static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme,
struct pagemapread *pm)
{
- pm->buffer[pm->pos++] = pfn;
+ pm->buffer[pm->pos++] = *pme;
if (pm->pos >= pm->len)
return PM_END_OF_BUFFER;
return 0;
@@ -632,8 +726,10 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
struct pagemapread *pm = walk->private;
unsigned long addr;
int err = 0;
+ pagemap_entry_t pme = make_pme(PM_NOT_PRESENT);
+
for (addr = start; addr < end; addr += PAGE_SIZE) {
- err = add_to_pagemap(addr, PM_NOT_PRESENT, pm);
+ err = add_to_pagemap(addr, &pme, pm);
if (err)
break;
}
@@ -646,17 +742,35 @@ static u64 swap_pte_to_pagemap_entry(pte_t pte)
return swp_type(e) | (swp_offset(e) << MAX_SWAPFILES_SHIFT);
}
-static u64 pte_to_pagemap_entry(pte_t pte)
+static void pte_to_pagemap_entry(pagemap_entry_t *pme, pte_t pte)
{
- u64 pme = 0;
if (is_swap_pte(pte))
- pme = PM_PFRAME(swap_pte_to_pagemap_entry(pte))
- | PM_PSHIFT(PAGE_SHIFT) | PM_SWAP;
+ *pme = make_pme(PM_PFRAME(swap_pte_to_pagemap_entry(pte))
+ | PM_PSHIFT(PAGE_SHIFT) | PM_SWAP);
else if (pte_present(pte))
- pme = PM_PFRAME(pte_pfn(pte))
- | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT;
- return pme;
+ *pme = make_pme(PM_PFRAME(pte_pfn(pte))
+ | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT);
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme,
+ pmd_t pmd, int offset)
+{
+ /*
+ * Currently pmd for thp is always present because thp can not be
+ * swapped-out, migrated, or HWPOISONed (split in such cases instead.)
+ * This if-check is just to prepare for future implementation.
+ */
+ if (pmd_present(pmd))
+ *pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset)
+ | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT);
+}
+#else
+static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme,
+ pmd_t pmd, int offset)
+{
}
+#endif
static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
struct mm_walk *walk)
@@ -665,13 +779,28 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
struct pagemapread *pm = walk->private;
pte_t *pte;
int err = 0;
-
- split_huge_page_pmd(walk->mm, pmd);
+ pagemap_entry_t pme = make_pme(PM_NOT_PRESENT);
/* find the first VMA at or above 'addr' */
vma = find_vma(walk->mm, addr);
+ if (pmd_trans_huge_lock(pmd, vma) == 1) {
+ for (; addr != end; addr += PAGE_SIZE) {
+ unsigned long offset;
+
+ offset = (addr & ~PAGEMAP_WALK_MASK) >>
+ PAGE_SHIFT;
+ thp_pmd_to_pagemap_entry(&pme, *pmd, offset);
+ err = add_to_pagemap(addr, &pme, pm);
+ if (err)
+ break;
+ }
+ spin_unlock(&walk->mm->page_table_lock);
+ return err;
+ }
+
+ if (pmd_trans_unstable(pmd))
+ return 0;
for (; addr != end; addr += PAGE_SIZE) {
- u64 pfn = PM_NOT_PRESENT;
/* check to see if we've left 'vma' behind
* and need a new, higher one */
@@ -683,11 +812,11 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
if (vma && (vma->vm_start <= addr) &&
!is_vm_hugetlb_page(vma)) {
pte = pte_offset_map(pmd, addr);
- pfn = pte_to_pagemap_entry(*pte);
+ pte_to_pagemap_entry(&pme, *pte);
/* unmap before userspace copy */
pte_unmap(pte);
}
- err = add_to_pagemap(addr, pfn, pm);
+ err = add_to_pagemap(addr, &pme, pm);
if (err)
return err;
}
@@ -698,13 +827,12 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
}
#ifdef CONFIG_HUGETLB_PAGE
-static u64 huge_pte_to_pagemap_entry(pte_t pte, int offset)
+static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme,
+ pte_t pte, int offset)
{
- u64 pme = 0;
if (pte_present(pte))
- pme = PM_PFRAME(pte_pfn(pte) + offset)
- | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT;
- return pme;
+ *pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset)
+ | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT);
}
/* This function walks within one hugetlb entry in the single call */
@@ -714,12 +842,12 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
{
struct pagemapread *pm = walk->private;
int err = 0;
- u64 pfn;
+ pagemap_entry_t pme = make_pme(PM_NOT_PRESENT);
for (; addr != end; addr += PAGE_SIZE) {
int offset = (addr & ~hmask) >> PAGE_SHIFT;
- pfn = huge_pte_to_pagemap_entry(*pte, offset);
- err = add_to_pagemap(addr, pfn, pm);
+ huge_pte_to_pagemap_entry(&pme, *pte, offset);
+ err = add_to_pagemap(addr, &pme, pm);
if (err)
return err;
}
@@ -754,8 +882,6 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
* determine which areas of memory are actually mapped and llseek to
* skip over unmapped regions.
*/
-#define PAGEMAP_WALK_SIZE (PMD_SIZE)
-#define PAGEMAP_WALK_MASK (PMD_MASK)
static ssize_t pagemap_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
@@ -938,26 +1064,21 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
pte_t *pte;
md = walk->private;
- spin_lock(&walk->mm->page_table_lock);
- if (pmd_trans_huge(*pmd)) {
- if (pmd_trans_splitting(*pmd)) {
- spin_unlock(&walk->mm->page_table_lock);
- wait_split_huge_page(md->vma->anon_vma, pmd);
- } else {
- pte_t huge_pte = *(pte_t *)pmd;
- struct page *page;
-
- page = can_gather_numa_stats(huge_pte, md->vma, addr);
- if (page)
- gather_stats(page, md, pte_dirty(huge_pte),
- HPAGE_PMD_SIZE/PAGE_SIZE);
- spin_unlock(&walk->mm->page_table_lock);
- return 0;
- }
- } else {
+
+ if (pmd_trans_huge_lock(pmd, md->vma) == 1) {
+ pte_t huge_pte = *(pte_t *)pmd;
+ struct page *page;
+
+ page = can_gather_numa_stats(huge_pte, md->vma, addr);
+ if (page)
+ gather_stats(page, md, pte_dirty(huge_pte),
+ HPAGE_PMD_SIZE/PAGE_SIZE);
spin_unlock(&walk->mm->page_table_lock);
+ return 0;
}
+ if (pmd_trans_unstable(pmd))
+ return 0;
orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
do {
struct page *page = can_gather_numa_stats(*pte, md->vma, addr);
@@ -999,7 +1120,7 @@ static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask,
/*
* Display pages allocated per node and memory policy via /proc.
*/
-static int show_numa_map(struct seq_file *m, void *v)
+static int show_numa_map(struct seq_file *m, void *v, int is_pid)
{
struct numa_maps_private *numa_priv = m->private;
struct proc_maps_private *proc_priv = &numa_priv->proc_maps;
@@ -1036,9 +1157,19 @@ static int show_numa_map(struct seq_file *m, void *v)
seq_path(m, &file->f_path, "\n\t= ");
} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
seq_printf(m, " heap");
- } else if (vma->vm_start <= mm->start_stack &&
- vma->vm_end >= mm->start_stack) {
- seq_printf(m, " stack");
+ } else {
+ pid_t tid = vm_is_stack(proc_priv->task, vma, is_pid);
+ if (tid != 0) {
+ /*
+ * Thread stack in /proc/PID/task/TID/maps or
+ * the main process stack.
+ */
+ if (!is_pid || (vma->vm_start <= mm->start_stack &&
+ vma->vm_end >= mm->start_stack))
+ seq_printf(m, " stack");
+ else
+ seq_printf(m, " stack:%d", tid);
+ }
}
if (is_vm_hugetlb_page(vma))
@@ -1081,21 +1212,39 @@ out:
return 0;
}
+static int show_pid_numa_map(struct seq_file *m, void *v)
+{
+ return show_numa_map(m, v, 1);
+}
+
+static int show_tid_numa_map(struct seq_file *m, void *v)
+{
+ return show_numa_map(m, v, 0);
+}
+
static const struct seq_operations proc_pid_numa_maps_op = {
- .start = m_start,
- .next = m_next,
- .stop = m_stop,
- .show = show_numa_map,
+ .start = m_start,
+ .next = m_next,
+ .stop = m_stop,
+ .show = show_pid_numa_map,
+};
+
+static const struct seq_operations proc_tid_numa_maps_op = {
+ .start = m_start,
+ .next = m_next,
+ .stop = m_stop,
+ .show = show_tid_numa_map,
};
-static int numa_maps_open(struct inode *inode, struct file *file)
+static int numa_maps_open(struct inode *inode, struct file *file,
+ const struct seq_operations *ops)
{
struct numa_maps_private *priv;
int ret = -ENOMEM;
priv = kzalloc(sizeof(*priv), GFP_KERNEL);
if (priv) {
priv->proc_maps.pid = proc_pid(inode);
- ret = seq_open(file, &proc_pid_numa_maps_op);
+ ret = seq_open(file, ops);
if (!ret) {
struct seq_file *m = file->private_data;
m->private = priv;
@@ -1106,8 +1255,25 @@ static int numa_maps_open(struct inode *inode, struct file *file)
return ret;
}
-const struct file_operations proc_numa_maps_operations = {
- .open = numa_maps_open,
+static int pid_numa_maps_open(struct inode *inode, struct file *file)
+{
+ return numa_maps_open(inode, file, &proc_pid_numa_maps_op);
+}
+
+static int tid_numa_maps_open(struct inode *inode, struct file *file)
+{
+ return numa_maps_open(inode, file, &proc_tid_numa_maps_op);
+}
+
+const struct file_operations proc_pid_numa_maps_operations = {
+ .open = pid_numa_maps_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+
+const struct file_operations proc_tid_numa_maps_operations = {
+ .open = tid_numa_maps_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release_private,
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 980de547c07..74fe164d1b2 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -134,9 +134,11 @@ static void pad_len_spaces(struct seq_file *m, int len)
/*
* display a single VMA to a sequenced file
*/
-static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
+static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma,
+ int is_pid)
{
struct mm_struct *mm = vma->vm_mm;
+ struct proc_maps_private *priv = m->private;
unsigned long ino = 0;
struct file *file;
dev_t dev = 0;
@@ -168,10 +170,19 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
pad_len_spaces(m, len);
seq_path(m, &file->f_path, "");
} else if (mm) {
- if (vma->vm_start <= mm->start_stack &&
- vma->vm_end >= mm->start_stack) {
+ pid_t tid = vm_is_stack(priv->task, vma, is_pid);
+
+ if (tid != 0) {
pad_len_spaces(m, len);
- seq_puts(m, "[stack]");
+ /*
+ * Thread stack in /proc/PID/task/TID/maps or
+ * the main process stack.
+ */
+ if (!is_pid || (vma->vm_start <= mm->start_stack &&
+ vma->vm_end >= mm->start_stack))
+ seq_printf(m, "[stack]");
+ else
+ seq_printf(m, "[stack:%d]", tid);
}
}
@@ -182,11 +193,22 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
/*
* display mapping lines for a particular process's /proc/pid/maps
*/
-static int show_map(struct seq_file *m, void *_p)
+static int show_map(struct seq_file *m, void *_p, int is_pid)
{
struct rb_node *p = _p;
- return nommu_vma_show(m, rb_entry(p, struct vm_area_struct, vm_rb));
+ return nommu_vma_show(m, rb_entry(p, struct vm_area_struct, vm_rb),
+ is_pid);
+}
+
+static int show_pid_map(struct seq_file *m, void *_p)
+{
+ return show_map(m, _p, 1);
+}
+
+static int show_tid_map(struct seq_file *m, void *_p)
+{
+ return show_map(m, _p, 0);
}
static void *m_start(struct seq_file *m, loff_t *pos)
@@ -240,10 +262,18 @@ static const struct seq_operations proc_pid_maps_ops = {
.start = m_start,
.next = m_next,
.stop = m_stop,
- .show = show_map
+ .show = show_pid_map
+};
+
+static const struct seq_operations proc_tid_maps_ops = {
+ .start = m_start,
+ .next = m_next,
+ .stop = m_stop,
+ .show = show_tid_map
};
-static int maps_open(struct inode *inode, struct file *file)
+static int maps_open(struct inode *inode, struct file *file,
+ const struct seq_operations *ops)
{
struct proc_maps_private *priv;
int ret = -ENOMEM;
@@ -251,7 +281,7 @@ static int maps_open(struct inode *inode, struct file *file)
priv = kzalloc(sizeof(*priv), GFP_KERNEL);
if (priv) {
priv->pid = proc_pid(inode);
- ret = seq_open(file, &proc_pid_maps_ops);
+ ret = seq_open(file, ops);
if (!ret) {
struct seq_file *m = file->private_data;
m->private = priv;
@@ -262,8 +292,25 @@ static int maps_open(struct inode *inode, struct file *file)
return ret;
}
-const struct file_operations proc_maps_operations = {
- .open = maps_open,
+static int pid_maps_open(struct inode *inode, struct file *file)
+{
+ return maps_open(inode, file, &proc_pid_maps_ops);
+}
+
+static int tid_maps_open(struct inode *inode, struct file *file)
+{
+ return maps_open(inode, file, &proc_tid_maps_ops);
+}
+
+const struct file_operations proc_pid_maps_operations = {
+ .open = pid_maps_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+
+const struct file_operations proc_tid_maps_operations = {
+ .open = tid_maps_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release_private,
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index b0f450a2bb7..0d5071d2998 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -700,3 +700,26 @@ static int __init vmcore_init(void)
return 0;
}
module_init(vmcore_init)
+
+/* Cleanup function for vmcore module. */
+void vmcore_cleanup(void)
+{
+ struct list_head *pos, *next;
+
+ if (proc_vmcore) {
+ remove_proc_entry(proc_vmcore->name, proc_vmcore->parent);
+ proc_vmcore = NULL;
+ }
+
+ /* clear the vmcore list. */
+ list_for_each_safe(pos, next, &vmcore_list) {
+ struct vmcore *m;
+
+ m = list_entry(pos, struct vmcore, list);
+ list_del(&m->list);
+ kfree(m);
+ }
+ kfree(elfcorebuf);
+ elfcorebuf = NULL;
+}
+EXPORT_SYMBOL_GPL(vmcore_cleanup);
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index b3b426edb2f..19507889bb7 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -52,12 +52,6 @@ struct pstore_private {
char data[];
};
-static int pstore_file_open(struct inode *inode, struct file *file)
-{
- file->private_data = inode->i_private;
- return 0;
-}
-
static ssize_t pstore_file_read(struct file *file, char __user *userbuf,
size_t count, loff_t *ppos)
{
@@ -67,7 +61,7 @@ static ssize_t pstore_file_read(struct file *file, char __user *userbuf,
}
static const struct file_operations pstore_file_operations = {
- .open = pstore_file_open,
+ .open = simple_open,
.read = pstore_file_read,
.llseek = default_llseek,
};
@@ -105,26 +99,12 @@ static const struct inode_operations pstore_dir_inode_operations = {
.unlink = pstore_unlink,
};
-static struct inode *pstore_get_inode(struct super_block *sb,
- const struct inode *dir, int mode, dev_t dev)
+static struct inode *pstore_get_inode(struct super_block *sb)
{
struct inode *inode = new_inode(sb);
-
if (inode) {
inode->i_ino = get_next_ino();
- inode->i_uid = inode->i_gid = 0;
- inode->i_mode = mode;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
- switch (mode & S_IFMT) {
- case S_IFREG:
- inode->i_fop = &pstore_file_operations;
- break;
- case S_IFDIR:
- inode->i_op = &pstore_dir_inode_operations;
- inode->i_fop = &simple_dir_operations;
- inc_nlink(inode);
- break;
- }
}
return inode;
}
@@ -216,9 +196,11 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id,
return rc;
rc = -ENOMEM;
- inode = pstore_get_inode(pstore_sb, root->d_inode, S_IFREG | 0444, 0);
+ inode = pstore_get_inode(pstore_sb);
if (!inode)
goto fail;
+ inode->i_mode = S_IFREG | 0444;
+ inode->i_fop = &pstore_file_operations;
private = kmalloc(sizeof *private + size, GFP_KERNEL);
if (!private)
goto fail_alloc;
@@ -278,9 +260,7 @@ fail:
int pstore_fill_super(struct super_block *sb, void *data, int silent)
{
- struct inode *inode = NULL;
- struct dentry *root;
- int err;
+ struct inode *inode;
save_mount_options(sb, data);
@@ -295,27 +275,20 @@ int pstore_fill_super(struct super_block *sb, void *data, int silent)
parse_options(data);
- inode = pstore_get_inode(sb, NULL, S_IFDIR | 0755, 0);
- if (!inode) {
- err = -ENOMEM;
- goto fail;
- }
- /* override ramfs "dir" options so we catch unlink(2) */
- inode->i_op = &pstore_dir_inode_operations;
-
- root = d_alloc_root(inode);
- sb->s_root = root;
- if (!root) {
- err = -ENOMEM;
- goto fail;
+ inode = pstore_get_inode(sb);
+ if (inode) {
+ inode->i_mode = S_IFDIR | 0755;
+ inode->i_op = &pstore_dir_inode_operations;
+ inode->i_fop = &simple_dir_operations;
+ inc_nlink(inode);
}
+ sb->s_root = d_make_root(inode);
+ if (!sb->s_root)
+ return -ENOMEM;
pstore_get_records(0);
return 0;
-fail:
- iput(inode);
- return err;
}
static struct dentry *pstore_mount(struct file_system_type *fs_type,
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 9ec22d3b429..82c585f715e 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -68,9 +68,25 @@ void pstore_set_kmsg_bytes(int bytes)
/* Tag each group of saved records with a sequence number */
static int oopscount;
-static char *reason_str[] = {
- "Oops", "Panic", "Kexec", "Restart", "Halt", "Poweroff", "Emergency"
-};
+static const char *get_reason_str(enum kmsg_dump_reason reason)
+{
+ switch (reason) {
+ case KMSG_DUMP_PANIC:
+ return "Panic";
+ case KMSG_DUMP_OOPS:
+ return "Oops";
+ case KMSG_DUMP_EMERG:
+ return "Emergency";
+ case KMSG_DUMP_RESTART:
+ return "Restart";
+ case KMSG_DUMP_HALT:
+ return "Halt";
+ case KMSG_DUMP_POWEROFF:
+ return "Poweroff";
+ default:
+ return "Unknown";
+ }
+}
/*
* callback from kmsg_dump. (s2,l2) has the most recently
@@ -85,17 +101,15 @@ static void pstore_dump(struct kmsg_dumper *dumper,
unsigned long s1_start, s2_start;
unsigned long l1_cpy, l2_cpy;
unsigned long size, total = 0;
- char *dst, *why;
+ char *dst;
+ const char *why;
u64 id;
int hsize, ret;
unsigned int part = 1;
unsigned long flags = 0;
int is_locked = 0;
- if (reason < ARRAY_SIZE(reason_str))
- why = reason_str[reason];
- else
- why = "Unknown";
+ why = get_reason_str(reason);
if (in_nmi()) {
is_locked = spin_trylock(&psinfo->buf_lock);
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 2bfd987f485..552e994e3aa 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -52,38 +52,6 @@ static int qnx4_remount(struct super_block *sb, int *flags, char *data)
return 0;
}
-static struct buffer_head *qnx4_getblk(struct inode *inode, int nr,
- int create)
-{
- struct buffer_head *result = NULL;
-
- if ( nr >= 0 )
- nr = qnx4_block_map( inode, nr );
- if (nr) {
- result = sb_getblk(inode->i_sb, nr);
- return result;
- }
- return NULL;
-}
-
-struct buffer_head *qnx4_bread(struct inode *inode, int block, int create)
-{
- struct buffer_head *bh;
-
- bh = qnx4_getblk(inode, block, create);
- if (!bh || buffer_uptodate(bh)) {
- return bh;
- }
- ll_rw_block(READ, 1, &bh);
- wait_on_buffer(bh);
- if (buffer_uptodate(bh)) {
- return bh;
- }
- brelse(bh);
-
- return NULL;
-}
-
static int qnx4_get_block( struct inode *inode, sector_t iblock, struct buffer_head *bh, int create )
{
unsigned long phys;
@@ -98,23 +66,31 @@ static int qnx4_get_block( struct inode *inode, sector_t iblock, struct buffer_h
return 0;
}
+static inline u32 try_extent(qnx4_xtnt_t *extent, u32 *offset)
+{
+ u32 size = le32_to_cpu(extent->xtnt_size);
+ if (*offset < size)
+ return le32_to_cpu(extent->xtnt_blk) + *offset - 1;
+ *offset -= size;
+ return 0;
+}
+
unsigned long qnx4_block_map( struct inode *inode, long iblock )
{
int ix;
- long offset, i_xblk;
- unsigned long block = 0;
+ long i_xblk;
struct buffer_head *bh = NULL;
struct qnx4_xblk *xblk = NULL;
struct qnx4_inode_entry *qnx4_inode = qnx4_raw_inode(inode);
u16 nxtnt = le16_to_cpu(qnx4_inode->di_num_xtnts);
+ u32 offset = iblock;
+ u32 block = try_extent(&qnx4_inode->di_first_xtnt, &offset);
- if ( iblock < le32_to_cpu(qnx4_inode->di_first_xtnt.xtnt_size) ) {
+ if (block) {
// iblock is in the first extent. This is easy.
- block = le32_to_cpu(qnx4_inode->di_first_xtnt.xtnt_blk) + iblock - 1;
} else {
// iblock is beyond first extent. We have to follow the extent chain.
i_xblk = le32_to_cpu(qnx4_inode->di_xblk);
- offset = iblock - le32_to_cpu(qnx4_inode->di_first_xtnt.xtnt_size);
ix = 0;
while ( --nxtnt > 0 ) {
if ( ix == 0 ) {
@@ -130,12 +106,11 @@ unsigned long qnx4_block_map( struct inode *inode, long iblock )
return -EIO;
}
}
- if ( offset < le32_to_cpu(xblk->xblk_xtnts[ix].xtnt_size) ) {
+ block = try_extent(&xblk->xblk_xtnts[ix], &offset);
+ if (block) {
// got it!
- block = le32_to_cpu(xblk->xblk_xtnts[ix].xtnt_blk) + offset - 1;
break;
}
- offset -= le32_to_cpu(xblk->xblk_xtnts[ix].xtnt_size);
if ( ++ix >= xblk->xblk_num_xtnts ) {
i_xblk = le32_to_cpu(xblk->xblk_next_xblk);
ix = 0;
@@ -179,47 +154,33 @@ static const char *qnx4_checkroot(struct super_block *sb)
struct qnx4_inode_entry *rootdir;
int rd, rl;
int i, j;
- int found = 0;
- if (*(qnx4_sb(sb)->sb->RootDir.di_fname) != '/') {
+ if (*(qnx4_sb(sb)->sb->RootDir.di_fname) != '/')
return "no qnx4 filesystem (no root dir).";
- } else {
- QNX4DEBUG((KERN_NOTICE "QNX4 filesystem found on dev %s.\n", sb->s_id));
- rd = le32_to_cpu(qnx4_sb(sb)->sb->RootDir.di_first_xtnt.xtnt_blk) - 1;
- rl = le32_to_cpu(qnx4_sb(sb)->sb->RootDir.di_first_xtnt.xtnt_size);
- for (j = 0; j < rl; j++) {
- bh = sb_bread(sb, rd + j); /* root dir, first block */
- if (bh == NULL) {
- return "unable to read root entry.";
- }
- for (i = 0; i < QNX4_INODES_PER_BLOCK; i++) {
- rootdir = (struct qnx4_inode_entry *) (bh->b_data + i * QNX4_DIR_ENTRY_SIZE);
- if (rootdir->di_fname != NULL) {
- QNX4DEBUG((KERN_INFO "rootdir entry found : [%s]\n", rootdir->di_fname));
- if (!strcmp(rootdir->di_fname,
- QNX4_BMNAME)) {
- found = 1;
- qnx4_sb(sb)->BitMap = kmemdup(rootdir,
- sizeof(struct qnx4_inode_entry),
- GFP_KERNEL);
- if (!qnx4_sb(sb)->BitMap) {
- brelse (bh);
- return "not enough memory for bitmap inode";
- }/* keep bitmap inode known */
- break;
- }
- }
- }
+ QNX4DEBUG((KERN_NOTICE "QNX4 filesystem found on dev %s.\n", sb->s_id));
+ rd = le32_to_cpu(qnx4_sb(sb)->sb->RootDir.di_first_xtnt.xtnt_blk) - 1;
+ rl = le32_to_cpu(qnx4_sb(sb)->sb->RootDir.di_first_xtnt.xtnt_size);
+ for (j = 0; j < rl; j++) {
+ bh = sb_bread(sb, rd + j); /* root dir, first block */
+ if (bh == NULL)
+ return "unable to read root entry.";
+ rootdir = (struct qnx4_inode_entry *) bh->b_data;
+ for (i = 0; i < QNX4_INODES_PER_BLOCK; i++, rootdir++) {
+ QNX4DEBUG((KERN_INFO "rootdir entry found : [%s]\n", rootdir->di_fname));
+ if (strcmp(rootdir->di_fname, QNX4_BMNAME) != 0)
+ continue;
+ qnx4_sb(sb)->BitMap = kmemdup(rootdir,
+ sizeof(struct qnx4_inode_entry),
+ GFP_KERNEL);
brelse(bh);
- if (found != 0) {
- break;
- }
- }
- if (found == 0) {
- return "bitmap file not found.";
+ if (!qnx4_sb(sb)->BitMap)
+ return "not enough memory for bitmap inode";
+ /* keep bitmap inode known */
+ return NULL;
}
+ brelse(bh);
}
- return NULL;
+ return "bitmap file not found.";
}
static int qnx4_fill_super(struct super_block *s, void *data, int silent)
@@ -270,19 +231,19 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent)
if (IS_ERR(root)) {
printk(KERN_ERR "qnx4: get inode failed\n");
ret = PTR_ERR(root);
- goto out;
+ goto outb;
}
ret = -ENOMEM;
- s->s_root = d_alloc_root(root);
+ s->s_root = d_make_root(root);
if (s->s_root == NULL)
- goto outi;
+ goto outb;
brelse(bh);
return 0;
- outi:
- iput(root);
+ outb:
+ kfree(qs->BitMap);
out:
brelse(bh);
outnobh:
@@ -300,44 +261,17 @@ static void qnx4_put_super(struct super_block *sb)
return;
}
-static int qnx4_writepage(struct page *page, struct writeback_control *wbc)
-{
- return block_write_full_page(page,qnx4_get_block, wbc);
-}
-
static int qnx4_readpage(struct file *file, struct page *page)
{
return block_read_full_page(page,qnx4_get_block);
}
-static int qnx4_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
- struct page **pagep, void **fsdata)
-{
- struct qnx4_inode_info *qnx4_inode = qnx4_i(mapping->host);
- int ret;
-
- *pagep = NULL;
- ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
- qnx4_get_block,
- &qnx4_inode->mmu_private);
- if (unlikely(ret)) {
- loff_t isize = mapping->host->i_size;
- if (pos + len > isize)
- vmtruncate(mapping->host, isize);
- }
-
- return ret;
-}
static sector_t qnx4_bmap(struct address_space *mapping, sector_t block)
{
return generic_block_bmap(mapping,block,qnx4_get_block);
}
static const struct address_space_operations qnx4_aops = {
.readpage = qnx4_readpage,
- .writepage = qnx4_writepage,
- .write_begin = qnx4_write_begin,
- .write_end = generic_write_end,
.bmap = qnx4_bmap
};
diff --git a/fs/qnx4/namei.c b/fs/qnx4/namei.c
index 275327b5615..a512c0b30e8 100644
--- a/fs/qnx4/namei.c
+++ b/fs/qnx4/namei.c
@@ -39,10 +39,6 @@ static int qnx4_match(int len, const char *name,
} else {
namelen = QNX4_SHORT_NAME_MAX;
}
- /* "" means "." ---> so paths like "/usr/lib//libc.a" work */
- if (!len && (de->di_fname[0] == '.') && (de->di_fname[1] == '\0')) {
- return 1;
- }
thislen = strlen( de->di_fname );
if ( thislen > namelen )
thislen = namelen;
@@ -72,7 +68,9 @@ static struct buffer_head *qnx4_find_entry(int len, struct inode *dir,
block = offset = blkofs = 0;
while (blkofs * QNX4_BLOCK_SIZE + offset < dir->i_size) {
if (!bh) {
- bh = qnx4_bread(dir, blkofs, 0);
+ block = qnx4_block_map(dir, blkofs);
+ if (block)
+ bh = sb_bread(dir->i_sb, block);
if (!bh) {
blkofs++;
continue;
@@ -80,7 +78,6 @@ static struct buffer_head *qnx4_find_entry(int len, struct inode *dir,
}
*res_dir = (struct qnx4_inode_entry *) (bh->b_data + offset);
if (qnx4_match(len, name, bh, &offset)) {
- block = qnx4_block_map( dir, blkofs );
*ino = block * QNX4_INODES_PER_BLOCK +
(offset / QNX4_DIR_ENTRY_SIZE) - 1;
return bh;
diff --git a/fs/qnx4/qnx4.h b/fs/qnx4/qnx4.h
index 33a60858203..244d4620189 100644
--- a/fs/qnx4/qnx4.h
+++ b/fs/qnx4/qnx4.h
@@ -27,8 +27,6 @@ extern struct dentry *qnx4_lookup(struct inode *dir, struct dentry *dentry, stru
extern unsigned long qnx4_count_free_blocks(struct super_block *sb);
extern unsigned long qnx4_block_map(struct inode *inode, long iblock);
-extern struct buffer_head *qnx4_bread(struct inode *, int, int);
-
extern const struct inode_operations qnx4_dir_inode_operations;
extern const struct file_operations qnx4_dir_operations;
extern int qnx4_is_free(struct super_block *sb, long block);
diff --git a/fs/qnx6/Kconfig b/fs/qnx6/Kconfig
new file mode 100644
index 00000000000..edbba5c17cc
--- /dev/null
+++ b/fs/qnx6/Kconfig
@@ -0,0 +1,26 @@
+config QNX6FS_FS
+ tristate "QNX6 file system support (read only)"
+ depends on BLOCK && CRC32
+ help
+ This is the file system used by the real-time operating systems
+ QNX 6 (also called QNX RTP).
+ Further information is available at <http://www.qnx.com/>.
+ Say Y if you intend to mount QNX hard disks or floppies formatted
+ with a mkqnx6fs.
+ However, keep in mind that this currently is a readonly driver!
+
+ To compile this file system support as a module, choose M here: the
+ module will be called qnx6.
+
+ If you don't know whether you need it, then you don't need it:
+ answer N.
+
+config QNX6FS_DEBUG
+ bool "QNX6 debugging information"
+ depends on QNX6FS_FS
+ help
+ Turns on extended debugging output.
+
+ If you are not a developer working on the QNX6FS, you probably don't
+ want this:
+ answer N.
diff --git a/fs/qnx6/Makefile b/fs/qnx6/Makefile
new file mode 100644
index 00000000000..9dd06199afc
--- /dev/null
+++ b/fs/qnx6/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for the linux qnx4-filesystem routines.
+#
+
+obj-$(CONFIG_QNX6FS_FS) += qnx6.o
+
+qnx6-objs := inode.o dir.o namei.o super_mmi.o
diff --git a/fs/qnx6/README b/fs/qnx6/README
new file mode 100644
index 00000000000..116d622026c
--- /dev/null
+++ b/fs/qnx6/README
@@ -0,0 +1,8 @@
+
+ This is a snapshot of the QNX6 filesystem for Linux.
+ Please send diffs and remarks to <chaosman@ontika.net> .
+
+Credits :
+
+Al Viro <viro@ZenIV.linux.org.uk> (endless patience with me & support ;))
+Kai Bankett <chaosman@ontika.net> (Maintainer)
diff --git a/fs/qnx6/dir.c b/fs/qnx6/dir.c
new file mode 100644
index 00000000000..dc597353db3
--- /dev/null
+++ b/fs/qnx6/dir.c
@@ -0,0 +1,291 @@
+/*
+ * QNX6 file system, Linux implementation.
+ *
+ * Version : 1.0.0
+ *
+ * History :
+ *
+ * 01-02-2012 by Kai Bankett (chaosman@ontika.net) : first release.
+ * 16-02-2012 pagemap extension by Al Viro
+ *
+ */
+
+#include "qnx6.h"
+
+static unsigned qnx6_lfile_checksum(char *name, unsigned size)
+{
+ unsigned crc = 0;
+ char *end = name + size;
+ while (name < end) {
+ crc = ((crc >> 1) + *(name++)) ^
+ ((crc & 0x00000001) ? 0x80000000 : 0);
+ }
+ return crc;
+}
+
+static struct page *qnx6_get_page(struct inode *dir, unsigned long n)
+{
+ struct address_space *mapping = dir->i_mapping;
+ struct page *page = read_mapping_page(mapping, n, NULL);
+ if (!IS_ERR(page))
+ kmap(page);
+ return page;
+}
+
+static inline unsigned long dir_pages(struct inode *inode)
+{
+ return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT;
+}
+
+static unsigned last_entry(struct inode *inode, unsigned long page_nr)
+{
+ unsigned long last_byte = inode->i_size;
+ last_byte -= page_nr << PAGE_CACHE_SHIFT;
+ if (last_byte > PAGE_CACHE_SIZE)
+ last_byte = PAGE_CACHE_SIZE;
+ return last_byte / QNX6_DIR_ENTRY_SIZE;
+}
+
+static struct qnx6_long_filename *qnx6_longname(struct super_block *sb,
+ struct qnx6_long_dir_entry *de,
+ struct page **p)
+{
+ struct qnx6_sb_info *sbi = QNX6_SB(sb);
+ u32 s = fs32_to_cpu(sbi, de->de_long_inode); /* in block units */
+ u32 n = s >> (PAGE_CACHE_SHIFT - sb->s_blocksize_bits); /* in pages */
+ /* within page */
+ u32 offs = (s << sb->s_blocksize_bits) & ~PAGE_CACHE_MASK;
+ struct address_space *mapping = sbi->longfile->i_mapping;
+ struct page *page = read_mapping_page(mapping, n, NULL);
+ if (IS_ERR(page))
+ return ERR_CAST(page);
+ kmap(*p = page);
+ return (struct qnx6_long_filename *)(page_address(page) + offs);
+}
+
+static int qnx6_dir_longfilename(struct inode *inode,
+ struct qnx6_long_dir_entry *de,
+ void *dirent, loff_t pos,
+ unsigned de_inode, filldir_t filldir)
+{
+ struct qnx6_long_filename *lf;
+ struct super_block *s = inode->i_sb;
+ struct qnx6_sb_info *sbi = QNX6_SB(s);
+ struct page *page;
+ int lf_size;
+
+ if (de->de_size != 0xff) {
+ /* error - long filename entries always have size 0xff
+ in direntry */
+ printk(KERN_ERR "qnx6: invalid direntry size (%i).\n",
+ de->de_size);
+ return 0;
+ }
+ lf = qnx6_longname(s, de, &page);
+ if (IS_ERR(lf)) {
+ printk(KERN_ERR "qnx6:Error reading longname\n");
+ return 0;
+ }
+
+ lf_size = fs16_to_cpu(sbi, lf->lf_size);
+
+ if (lf_size > QNX6_LONG_NAME_MAX) {
+ QNX6DEBUG((KERN_INFO "file %s\n", lf->lf_fname));
+ printk(KERN_ERR "qnx6:Filename too long (%i)\n", lf_size);
+ qnx6_put_page(page);
+ return 0;
+ }
+
+ /* calc & validate longfilename checksum
+ mmi 3g filesystem does not have that checksum */
+ if (!test_opt(s, MMI_FS) && fs32_to_cpu(sbi, de->de_checksum) !=
+ qnx6_lfile_checksum(lf->lf_fname, lf_size))
+ printk(KERN_INFO "qnx6: long filename checksum error.\n");
+
+ QNX6DEBUG((KERN_INFO "qnx6_readdir:%.*s inode:%u\n",
+ lf_size, lf->lf_fname, de_inode));
+ if (filldir(dirent, lf->lf_fname, lf_size, pos, de_inode,
+ DT_UNKNOWN) < 0) {
+ qnx6_put_page(page);
+ return 0;
+ }
+
+ qnx6_put_page(page);
+ /* success */
+ return 1;
+}
+
+static int qnx6_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+ struct inode *inode = filp->f_path.dentry->d_inode;
+ struct super_block *s = inode->i_sb;
+ struct qnx6_sb_info *sbi = QNX6_SB(s);
+ loff_t pos = filp->f_pos & (QNX6_DIR_ENTRY_SIZE - 1);
+ unsigned long npages = dir_pages(inode);
+ unsigned long n = pos >> PAGE_CACHE_SHIFT;
+ unsigned start = (pos & ~PAGE_CACHE_MASK) / QNX6_DIR_ENTRY_SIZE;
+ bool done = false;
+
+ if (filp->f_pos >= inode->i_size)
+ return 0;
+
+ for ( ; !done && n < npages; n++, start = 0) {
+ struct page *page = qnx6_get_page(inode, n);
+ int limit = last_entry(inode, n);
+ struct qnx6_dir_entry *de;
+ int i = start;
+
+ if (IS_ERR(page)) {
+ printk(KERN_ERR "qnx6_readdir: read failed\n");
+ filp->f_pos = (n + 1) << PAGE_CACHE_SHIFT;
+ return PTR_ERR(page);
+ }
+ de = ((struct qnx6_dir_entry *)page_address(page)) + start;
+ for (; i < limit; i++, de++, pos += QNX6_DIR_ENTRY_SIZE) {
+ int size = de->de_size;
+ u32 no_inode = fs32_to_cpu(sbi, de->de_inode);
+
+ if (!no_inode || !size)
+ continue;
+
+ if (size > QNX6_SHORT_NAME_MAX) {
+ /* long filename detected
+ get the filename from long filename
+ structure / block */
+ if (!qnx6_dir_longfilename(inode,
+ (struct qnx6_long_dir_entry *)de,
+ dirent, pos, no_inode,
+ filldir)) {
+ done = true;
+ break;
+ }
+ } else {
+ QNX6DEBUG((KERN_INFO "qnx6_readdir:%.*s"
+ " inode:%u\n", size, de->de_fname,
+ no_inode));
+ if (filldir(dirent, de->de_fname, size,
+ pos, no_inode, DT_UNKNOWN)
+ < 0) {
+ done = true;
+ break;
+ }
+ }
+ }
+ qnx6_put_page(page);
+ }
+ filp->f_pos = pos;
+ return 0;
+}
+
+/*
+ * check if the long filename is correct.
+ */
+static unsigned qnx6_long_match(int len, const char *name,
+ struct qnx6_long_dir_entry *de, struct inode *dir)
+{
+ struct super_block *s = dir->i_sb;
+ struct qnx6_sb_info *sbi = QNX6_SB(s);
+ struct page *page;
+ int thislen;
+ struct qnx6_long_filename *lf = qnx6_longname(s, de, &page);
+
+ if (IS_ERR(lf))
+ return 0;
+
+ thislen = fs16_to_cpu(sbi, lf->lf_size);
+ if (len != thislen) {
+ qnx6_put_page(page);
+ return 0;
+ }
+ if (memcmp(name, lf->lf_fname, len) == 0) {
+ qnx6_put_page(page);
+ return fs32_to_cpu(sbi, de->de_inode);
+ }
+ qnx6_put_page(page);
+ return 0;
+}
+
+/*
+ * check if the filename is correct.
+ */
+static unsigned qnx6_match(struct super_block *s, int len, const char *name,
+ struct qnx6_dir_entry *de)
+{
+ struct qnx6_sb_info *sbi = QNX6_SB(s);
+ if (memcmp(name, de->de_fname, len) == 0)
+ return fs32_to_cpu(sbi, de->de_inode);
+ return 0;
+}
+
+
+unsigned qnx6_find_entry(int len, struct inode *dir, const char *name,
+ struct page **res_page)
+{
+ struct super_block *s = dir->i_sb;
+ struct qnx6_inode_info *ei = QNX6_I(dir);
+ struct page *page = NULL;
+ unsigned long start, n;
+ unsigned long npages = dir_pages(dir);
+ unsigned ino;
+ struct qnx6_dir_entry *de;
+ struct qnx6_long_dir_entry *lde;
+
+ *res_page = NULL;
+
+ if (npages == 0)
+ return 0;
+ start = ei->i_dir_start_lookup;
+ if (start >= npages)
+ start = 0;
+ n = start;
+
+ do {
+ page = qnx6_get_page(dir, n);
+ if (!IS_ERR(page)) {
+ int limit = last_entry(dir, n);
+ int i;
+
+ de = (struct qnx6_dir_entry *)page_address(page);
+ for (i = 0; i < limit; i++, de++) {
+ if (len <= QNX6_SHORT_NAME_MAX) {
+ /* short filename */
+ if (len != de->de_size)
+ continue;
+ ino = qnx6_match(s, len, name, de);
+ if (ino)
+ goto found;
+ } else if (de->de_size == 0xff) {
+ /* deal with long filename */
+ lde = (struct qnx6_long_dir_entry *)de;
+ ino = qnx6_long_match(len,
+ name, lde, dir);
+ if (ino)
+ goto found;
+ } else
+ printk(KERN_ERR "qnx6: undefined "
+ "filename size in inode.\n");
+ }
+ qnx6_put_page(page);
+ }
+
+ if (++n >= npages)
+ n = 0;
+ } while (n != start);
+ return 0;
+
+found:
+ *res_page = page;
+ ei->i_dir_start_lookup = n;
+ return ino;
+}
+
+const struct file_operations qnx6_dir_operations = {
+ .llseek = generic_file_llseek,
+ .read = generic_read_dir,
+ .readdir = qnx6_readdir,
+ .fsync = generic_file_fsync,
+};
+
+const struct inode_operations qnx6_dir_inode_operations = {
+ .lookup = qnx6_lookup,
+};
diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c
new file mode 100644
index 00000000000..e44012dc564
--- /dev/null
+++ b/fs/qnx6/inode.c
@@ -0,0 +1,698 @@
+/*
+ * QNX6 file system, Linux implementation.
+ *
+ * Version : 1.0.0
+ *
+ * History :
+ *
+ * 01-02-2012 by Kai Bankett (chaosman@ontika.net) : first release.
+ * 16-02-2012 pagemap extension by Al Viro
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/highuid.h>
+#include <linux/pagemap.h>
+#include <linux/buffer_head.h>
+#include <linux/writeback.h>
+#include <linux/statfs.h>
+#include <linux/parser.h>
+#include <linux/seq_file.h>
+#include <linux/mount.h>
+#include <linux/crc32.h>
+#include <linux/mpage.h>
+#include "qnx6.h"
+
+static const struct super_operations qnx6_sops;
+
+static void qnx6_put_super(struct super_block *sb);
+static struct inode *qnx6_alloc_inode(struct super_block *sb);
+static void qnx6_destroy_inode(struct inode *inode);
+static int qnx6_remount(struct super_block *sb, int *flags, char *data);
+static int qnx6_statfs(struct dentry *dentry, struct kstatfs *buf);
+static int qnx6_show_options(struct seq_file *seq, struct dentry *root);
+
+static const struct super_operations qnx6_sops = {
+ .alloc_inode = qnx6_alloc_inode,
+ .destroy_inode = qnx6_destroy_inode,
+ .put_super = qnx6_put_super,
+ .statfs = qnx6_statfs,
+ .remount_fs = qnx6_remount,
+ .show_options = qnx6_show_options,
+};
+
+static int qnx6_show_options(struct seq_file *seq, struct dentry *root)
+{
+ struct super_block *sb = root->d_sb;
+ struct qnx6_sb_info *sbi = QNX6_SB(sb);
+
+ if (sbi->s_mount_opt & QNX6_MOUNT_MMI_FS)
+ seq_puts(seq, ",mmi_fs");
+ return 0;
+}
+
+static int qnx6_remount(struct super_block *sb, int *flags, char *data)
+{
+ *flags |= MS_RDONLY;
+ return 0;
+}
+
+static unsigned qnx6_get_devblock(struct super_block *sb, __fs32 block)
+{
+ struct qnx6_sb_info *sbi = QNX6_SB(sb);
+ return fs32_to_cpu(sbi, block) + sbi->s_blks_off;
+}
+
+static unsigned qnx6_block_map(struct inode *inode, unsigned iblock);
+
+static int qnx6_get_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh, int create)
+{
+ unsigned phys;
+
+ QNX6DEBUG((KERN_INFO "qnx6: qnx6_get_block inode=[%ld] iblock=[%ld]\n",
+ inode->i_ino, (unsigned long)iblock));
+
+ phys = qnx6_block_map(inode, iblock);
+ if (phys) {
+ /* logical block is before EOF */
+ map_bh(bh, inode->i_sb, phys);
+ }
+ return 0;
+}
+
+static int qnx6_check_blockptr(__fs32 ptr)
+{
+ if (ptr == ~(__fs32)0) {
+ printk(KERN_ERR "qnx6: hit unused blockpointer.\n");
+ return 0;
+ }
+ return 1;
+}
+
+static int qnx6_readpage(struct file *file, struct page *page)
+{
+ return mpage_readpage(page, qnx6_get_block);
+}
+
+static int qnx6_readpages(struct file *file, struct address_space *mapping,
+ struct list_head *pages, unsigned nr_pages)
+{
+ return mpage_readpages(mapping, pages, nr_pages, qnx6_get_block);
+}
+
+/*
+ * returns the block number for the no-th element in the tree
+ * inodebits requred as there are multiple inodes in one inode block
+ */
+static unsigned qnx6_block_map(struct inode *inode, unsigned no)
+{
+ struct super_block *s = inode->i_sb;
+ struct qnx6_sb_info *sbi = QNX6_SB(s);
+ struct qnx6_inode_info *ei = QNX6_I(inode);
+ unsigned block = 0;
+ struct buffer_head *bh;
+ __fs32 ptr;
+ int levelptr;
+ int ptrbits = sbi->s_ptrbits;
+ int bitdelta;
+ u32 mask = (1 << ptrbits) - 1;
+ int depth = ei->di_filelevels;
+ int i;
+
+ bitdelta = ptrbits * depth;
+ levelptr = no >> bitdelta;
+
+ if (levelptr > QNX6_NO_DIRECT_POINTERS - 1) {
+ printk(KERN_ERR "qnx6:Requested file block number (%u) too big.",
+ no);
+ return 0;
+ }
+
+ block = qnx6_get_devblock(s, ei->di_block_ptr[levelptr]);
+
+ for (i = 0; i < depth; i++) {
+ bh = sb_bread(s, block);
+ if (!bh) {
+ printk(KERN_ERR "qnx6:Error reading block (%u)\n",
+ block);
+ return 0;
+ }
+ bitdelta -= ptrbits;
+ levelptr = (no >> bitdelta) & mask;
+ ptr = ((__fs32 *)bh->b_data)[levelptr];
+
+ if (!qnx6_check_blockptr(ptr))
+ return 0;
+
+ block = qnx6_get_devblock(s, ptr);
+ brelse(bh);
+ }
+ return block;
+}
+
+static int qnx6_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+ struct super_block *sb = dentry->d_sb;
+ struct qnx6_sb_info *sbi = QNX6_SB(sb);
+ u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
+
+ buf->f_type = sb->s_magic;
+ buf->f_bsize = sb->s_blocksize;
+ buf->f_blocks = fs32_to_cpu(sbi, sbi->sb->sb_num_blocks);
+ buf->f_bfree = fs32_to_cpu(sbi, sbi->sb->sb_free_blocks);
+ buf->f_files = fs32_to_cpu(sbi, sbi->sb->sb_num_inodes);
+ buf->f_ffree = fs32_to_cpu(sbi, sbi->sb->sb_free_inodes);
+ buf->f_bavail = buf->f_bfree;
+ buf->f_namelen = QNX6_LONG_NAME_MAX;
+ buf->f_fsid.val[0] = (u32)id;
+ buf->f_fsid.val[1] = (u32)(id >> 32);
+
+ return 0;
+}
+
+/*
+ * Check the root directory of the filesystem to make sure
+ * it really _is_ a qnx6 filesystem, and to check the size
+ * of the directory entry.
+ */
+static const char *qnx6_checkroot(struct super_block *s)
+{
+ static char match_root[2][3] = {".\0\0", "..\0"};
+ int i, error = 0;
+ struct qnx6_dir_entry *dir_entry;
+ struct inode *root = s->s_root->d_inode;
+ struct address_space *mapping = root->i_mapping;
+ struct page *page = read_mapping_page(mapping, 0, NULL);
+ if (IS_ERR(page))
+ return "error reading root directory";
+ kmap(page);
+ dir_entry = page_address(page);
+ for (i = 0; i < 2; i++) {
+ /* maximum 3 bytes - due to match_root limitation */
+ if (strncmp(dir_entry[i].de_fname, match_root[i], 3))
+ error = 1;
+ }
+ qnx6_put_page(page);
+ if (error)
+ return "error reading root directory.";
+ return NULL;
+}
+
+#ifdef CONFIG_QNX6FS_DEBUG
+void qnx6_superblock_debug(struct qnx6_super_block *sb, struct super_block *s)
+{
+ struct qnx6_sb_info *sbi = QNX6_SB(s);
+
+ QNX6DEBUG((KERN_INFO "magic: %08x\n",
+ fs32_to_cpu(sbi, sb->sb_magic)));
+ QNX6DEBUG((KERN_INFO "checksum: %08x\n",
+ fs32_to_cpu(sbi, sb->sb_checksum)));
+ QNX6DEBUG((KERN_INFO "serial: %llx\n",
+ fs64_to_cpu(sbi, sb->sb_serial)));
+ QNX6DEBUG((KERN_INFO "flags: %08x\n",
+ fs32_to_cpu(sbi, sb->sb_flags)));
+ QNX6DEBUG((KERN_INFO "blocksize: %08x\n",
+ fs32_to_cpu(sbi, sb->sb_blocksize)));
+ QNX6DEBUG((KERN_INFO "num_inodes: %08x\n",
+ fs32_to_cpu(sbi, sb->sb_num_inodes)));
+ QNX6DEBUG((KERN_INFO "free_inodes: %08x\n",
+ fs32_to_cpu(sbi, sb->sb_free_inodes)));
+ QNX6DEBUG((KERN_INFO "num_blocks: %08x\n",
+ fs32_to_cpu(sbi, sb->sb_num_blocks)));
+ QNX6DEBUG((KERN_INFO "free_blocks: %08x\n",
+ fs32_to_cpu(sbi, sb->sb_free_blocks)));
+ QNX6DEBUG((KERN_INFO "inode_levels: %02x\n",
+ sb->Inode.levels));
+}
+#endif
+
+enum {
+ Opt_mmifs,
+ Opt_err
+};
+
+static const match_table_t tokens = {
+ {Opt_mmifs, "mmi_fs"},
+ {Opt_err, NULL}
+};
+
+static int qnx6_parse_options(char *options, struct super_block *sb)
+{
+ char *p;
+ struct qnx6_sb_info *sbi = QNX6_SB(sb);
+ substring_t args[MAX_OPT_ARGS];
+
+ if (!options)
+ return 1;
+
+ while ((p = strsep(&options, ",")) != NULL) {
+ int token;
+ if (!*p)
+ continue;
+
+ token = match_token(p, tokens, args);
+ switch (token) {
+ case Opt_mmifs:
+ set_opt(sbi->s_mount_opt, MMI_FS);
+ break;
+ default:
+ return 0;
+ }
+ }
+ return 1;
+}
+
+static struct buffer_head *qnx6_check_first_superblock(struct super_block *s,
+ int offset, int silent)
+{
+ struct qnx6_sb_info *sbi = QNX6_SB(s);
+ struct buffer_head *bh;
+ struct qnx6_super_block *sb;
+
+ /* Check the superblock signatures
+ start with the first superblock */
+ bh = sb_bread(s, offset);
+ if (!bh) {
+ printk(KERN_ERR "qnx6: unable to read the first superblock\n");
+ return NULL;
+ }
+ sb = (struct qnx6_super_block *)bh->b_data;
+ if (fs32_to_cpu(sbi, sb->sb_magic) != QNX6_SUPER_MAGIC) {
+ sbi->s_bytesex = BYTESEX_BE;
+ if (fs32_to_cpu(sbi, sb->sb_magic) == QNX6_SUPER_MAGIC) {
+ /* we got a big endian fs */
+ QNX6DEBUG((KERN_INFO "qnx6: fs got different"
+ " endianess.\n"));
+ return bh;
+ } else
+ sbi->s_bytesex = BYTESEX_LE;
+ if (!silent) {
+ if (offset == 0) {
+ printk(KERN_ERR "qnx6: wrong signature (magic)"
+ " in superblock #1.\n");
+ } else {
+ printk(KERN_INFO "qnx6: wrong signature (magic)"
+ " at position (0x%lx) - will try"
+ " alternative position (0x0000).\n",
+ offset * s->s_blocksize);
+ }
+ }
+ brelse(bh);
+ return NULL;
+ }
+ return bh;
+}
+
+static struct inode *qnx6_private_inode(struct super_block *s,
+ struct qnx6_root_node *p);
+
+static int qnx6_fill_super(struct super_block *s, void *data, int silent)
+{
+ struct buffer_head *bh1 = NULL, *bh2 = NULL;
+ struct qnx6_super_block *sb1 = NULL, *sb2 = NULL;
+ struct qnx6_sb_info *sbi;
+ struct inode *root;
+ const char *errmsg;
+ struct qnx6_sb_info *qs;
+ int ret = -EINVAL;
+ u64 offset;
+ int bootblock_offset = QNX6_BOOTBLOCK_SIZE;
+
+ qs = kzalloc(sizeof(struct qnx6_sb_info), GFP_KERNEL);
+ if (!qs)
+ return -ENOMEM;
+ s->s_fs_info = qs;
+
+ /* Superblock always is 512 Byte long */
+ if (!sb_set_blocksize(s, QNX6_SUPERBLOCK_SIZE)) {
+ printk(KERN_ERR "qnx6: unable to set blocksize\n");
+ goto outnobh;
+ }
+
+ /* parse the mount-options */
+ if (!qnx6_parse_options((char *) data, s)) {
+ printk(KERN_ERR "qnx6: invalid mount options.\n");
+ goto outnobh;
+ }
+ if (test_opt(s, MMI_FS)) {
+ sb1 = qnx6_mmi_fill_super(s, silent);
+ if (sb1)
+ goto mmi_success;
+ else
+ goto outnobh;
+ }
+ sbi = QNX6_SB(s);
+ sbi->s_bytesex = BYTESEX_LE;
+ /* Check the superblock signatures
+ start with the first superblock */
+ bh1 = qnx6_check_first_superblock(s,
+ bootblock_offset / QNX6_SUPERBLOCK_SIZE, silent);
+ if (!bh1) {
+ /* try again without bootblock offset */
+ bh1 = qnx6_check_first_superblock(s, 0, silent);
+ if (!bh1) {
+ printk(KERN_ERR "qnx6: unable to read the first superblock\n");
+ goto outnobh;
+ }
+ /* seems that no bootblock at partition start */
+ bootblock_offset = 0;
+ }
+ sb1 = (struct qnx6_super_block *)bh1->b_data;
+
+#ifdef CONFIG_QNX6FS_DEBUG
+ qnx6_superblock_debug(sb1, s);
+#endif
+
+ /* checksum check - start at byte 8 and end at byte 512 */
+ if (fs32_to_cpu(sbi, sb1->sb_checksum) !=
+ crc32_be(0, (char *)(bh1->b_data + 8), 504)) {
+ printk(KERN_ERR "qnx6: superblock #1 checksum error\n");
+ goto out;
+ }
+
+ /* set new blocksize */
+ if (!sb_set_blocksize(s, fs32_to_cpu(sbi, sb1->sb_blocksize))) {
+ printk(KERN_ERR "qnx6: unable to set blocksize\n");
+ goto out;
+ }
+ /* blocksize invalidates bh - pull it back in */
+ brelse(bh1);
+ bh1 = sb_bread(s, bootblock_offset >> s->s_blocksize_bits);
+ if (!bh1)
+ goto outnobh;
+ sb1 = (struct qnx6_super_block *)bh1->b_data;
+
+ /* calculate second superblock blocknumber */
+ offset = fs32_to_cpu(sbi, sb1->sb_num_blocks) +
+ (bootblock_offset >> s->s_blocksize_bits) +
+ (QNX6_SUPERBLOCK_AREA >> s->s_blocksize_bits);
+
+ /* set bootblock offset */
+ sbi->s_blks_off = (bootblock_offset >> s->s_blocksize_bits) +
+ (QNX6_SUPERBLOCK_AREA >> s->s_blocksize_bits);
+
+ /* next the second superblock */
+ bh2 = sb_bread(s, offset);
+ if (!bh2) {
+ printk(KERN_ERR "qnx6: unable to read the second superblock\n");
+ goto out;
+ }
+ sb2 = (struct qnx6_super_block *)bh2->b_data;
+ if (fs32_to_cpu(sbi, sb2->sb_magic) != QNX6_SUPER_MAGIC) {
+ if (!silent)
+ printk(KERN_ERR "qnx6: wrong signature (magic)"
+ " in superblock #2.\n");
+ goto out;
+ }
+
+ /* checksum check - start at byte 8 and end at byte 512 */
+ if (fs32_to_cpu(sbi, sb2->sb_checksum) !=
+ crc32_be(0, (char *)(bh2->b_data + 8), 504)) {
+ printk(KERN_ERR "qnx6: superblock #2 checksum error\n");
+ goto out;
+ }
+
+ if (fs64_to_cpu(sbi, sb1->sb_serial) >=
+ fs64_to_cpu(sbi, sb2->sb_serial)) {
+ /* superblock #1 active */
+ sbi->sb_buf = bh1;
+ sbi->sb = (struct qnx6_super_block *)bh1->b_data;
+ brelse(bh2);
+ printk(KERN_INFO "qnx6: superblock #1 active\n");
+ } else {
+ /* superblock #2 active */
+ sbi->sb_buf = bh2;
+ sbi->sb = (struct qnx6_super_block *)bh2->b_data;
+ brelse(bh1);
+ printk(KERN_INFO "qnx6: superblock #2 active\n");
+ }
+mmi_success:
+ /* sanity check - limit maximum indirect pointer levels */
+ if (sb1->Inode.levels > QNX6_PTR_MAX_LEVELS) {
+ printk(KERN_ERR "qnx6: too many inode levels (max %i, sb %i)\n",
+ QNX6_PTR_MAX_LEVELS, sb1->Inode.levels);
+ goto out;
+ }
+ if (sb1->Longfile.levels > QNX6_PTR_MAX_LEVELS) {
+ printk(KERN_ERR "qnx6: too many longfilename levels"
+ " (max %i, sb %i)\n",
+ QNX6_PTR_MAX_LEVELS, sb1->Longfile.levels);
+ goto out;
+ }
+ s->s_op = &qnx6_sops;
+ s->s_magic = QNX6_SUPER_MAGIC;
+ s->s_flags |= MS_RDONLY; /* Yup, read-only yet */
+
+ /* ease the later tree level calculations */
+ sbi = QNX6_SB(s);
+ sbi->s_ptrbits = ilog2(s->s_blocksize / 4);
+ sbi->inodes = qnx6_private_inode(s, &sb1->Inode);
+ if (!sbi->inodes)
+ goto out;
+ sbi->longfile = qnx6_private_inode(s, &sb1->Longfile);
+ if (!sbi->longfile)
+ goto out1;
+
+ /* prefetch root inode */
+ root = qnx6_iget(s, QNX6_ROOT_INO);
+ if (IS_ERR(root)) {
+ printk(KERN_ERR "qnx6: get inode failed\n");
+ ret = PTR_ERR(root);
+ goto out2;
+ }
+
+ ret = -ENOMEM;
+ s->s_root = d_make_root(root);
+ if (!s->s_root)
+ goto out2;
+
+ ret = -EINVAL;
+ errmsg = qnx6_checkroot(s);
+ if (errmsg != NULL) {
+ if (!silent)
+ printk(KERN_ERR "qnx6: %s\n", errmsg);
+ goto out3;
+ }
+ return 0;
+
+out3:
+ dput(s->s_root);
+ s->s_root = NULL;
+out2:
+ iput(sbi->longfile);
+out1:
+ iput(sbi->inodes);
+out:
+ if (bh1)
+ brelse(bh1);
+ if (bh2)
+ brelse(bh2);
+outnobh:
+ kfree(qs);
+ s->s_fs_info = NULL;
+ return ret;
+}
+
+static void qnx6_put_super(struct super_block *sb)
+{
+ struct qnx6_sb_info *qs = QNX6_SB(sb);
+ brelse(qs->sb_buf);
+ iput(qs->longfile);
+ iput(qs->inodes);
+ kfree(qs);
+ sb->s_fs_info = NULL;
+ return;
+}
+
+static sector_t qnx6_bmap(struct address_space *mapping, sector_t block)
+{
+ return generic_block_bmap(mapping, block, qnx6_get_block);
+}
+static const struct address_space_operations qnx6_aops = {
+ .readpage = qnx6_readpage,
+ .readpages = qnx6_readpages,
+ .bmap = qnx6_bmap
+};
+
+static struct inode *qnx6_private_inode(struct super_block *s,
+ struct qnx6_root_node *p)
+{
+ struct inode *inode = new_inode(s);
+ if (inode) {
+ struct qnx6_inode_info *ei = QNX6_I(inode);
+ struct qnx6_sb_info *sbi = QNX6_SB(s);
+ inode->i_size = fs64_to_cpu(sbi, p->size);
+ memcpy(ei->di_block_ptr, p->ptr, sizeof(p->ptr));
+ ei->di_filelevels = p->levels;
+ inode->i_mode = S_IFREG | S_IRUSR; /* probably wrong */
+ inode->i_mapping->a_ops = &qnx6_aops;
+ }
+ return inode;
+}
+
+struct inode *qnx6_iget(struct super_block *sb, unsigned ino)
+{
+ struct qnx6_sb_info *sbi = QNX6_SB(sb);
+ struct qnx6_inode_entry *raw_inode;
+ struct inode *inode;
+ struct qnx6_inode_info *ei;
+ struct address_space *mapping;
+ struct page *page;
+ u32 n, offs;
+
+ inode = iget_locked(sb, ino);
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+ if (!(inode->i_state & I_NEW))
+ return inode;
+
+ ei = QNX6_I(inode);
+
+ inode->i_mode = 0;
+
+ if (ino == 0) {
+ printk(KERN_ERR "qnx6: bad inode number on dev %s: %u is "
+ "out of range\n",
+ sb->s_id, ino);
+ iget_failed(inode);
+ return ERR_PTR(-EIO);
+ }
+ n = (ino - 1) >> (PAGE_CACHE_SHIFT - QNX6_INODE_SIZE_BITS);
+ offs = (ino - 1) & (~PAGE_CACHE_MASK >> QNX6_INODE_SIZE_BITS);
+ mapping = sbi->inodes->i_mapping;
+ page = read_mapping_page(mapping, n, NULL);
+ if (IS_ERR(page)) {
+ printk(KERN_ERR "qnx6: major problem: unable to read inode from "
+ "dev %s\n", sb->s_id);
+ iget_failed(inode);
+ return ERR_CAST(page);
+ }
+ kmap(page);
+ raw_inode = ((struct qnx6_inode_entry *)page_address(page)) + offs;
+
+ inode->i_mode = fs16_to_cpu(sbi, raw_inode->di_mode);
+ inode->i_uid = (uid_t)fs32_to_cpu(sbi, raw_inode->di_uid);
+ inode->i_gid = (gid_t)fs32_to_cpu(sbi, raw_inode->di_gid);
+ inode->i_size = fs64_to_cpu(sbi, raw_inode->di_size);
+ inode->i_mtime.tv_sec = fs32_to_cpu(sbi, raw_inode->di_mtime);
+ inode->i_mtime.tv_nsec = 0;
+ inode->i_atime.tv_sec = fs32_to_cpu(sbi, raw_inode->di_atime);
+ inode->i_atime.tv_nsec = 0;
+ inode->i_ctime.tv_sec = fs32_to_cpu(sbi, raw_inode->di_ctime);
+ inode->i_ctime.tv_nsec = 0;
+
+ /* calc blocks based on 512 byte blocksize */
+ inode->i_blocks = (inode->i_size + 511) >> 9;
+
+ memcpy(&ei->di_block_ptr, &raw_inode->di_block_ptr,
+ sizeof(raw_inode->di_block_ptr));
+ ei->di_filelevels = raw_inode->di_filelevels;
+
+ if (S_ISREG(inode->i_mode)) {
+ inode->i_fop = &generic_ro_fops;
+ inode->i_mapping->a_ops = &qnx6_aops;
+ } else if (S_ISDIR(inode->i_mode)) {
+ inode->i_op = &qnx6_dir_inode_operations;
+ inode->i_fop = &qnx6_dir_operations;
+ inode->i_mapping->a_ops = &qnx6_aops;
+ } else if (S_ISLNK(inode->i_mode)) {
+ inode->i_op = &page_symlink_inode_operations;
+ inode->i_mapping->a_ops = &qnx6_aops;
+ } else
+ init_special_inode(inode, inode->i_mode, 0);
+ qnx6_put_page(page);
+ unlock_new_inode(inode);
+ return inode;
+}
+
+static struct kmem_cache *qnx6_inode_cachep;
+
+static struct inode *qnx6_alloc_inode(struct super_block *sb)
+{
+ struct qnx6_inode_info *ei;
+ ei = kmem_cache_alloc(qnx6_inode_cachep, GFP_KERNEL);
+ if (!ei)
+ return NULL;
+ return &ei->vfs_inode;
+}
+
+static void qnx6_i_callback(struct rcu_head *head)
+{
+ struct inode *inode = container_of(head, struct inode, i_rcu);
+ INIT_LIST_HEAD(&inode->i_dentry);
+ kmem_cache_free(qnx6_inode_cachep, QNX6_I(inode));
+}
+
+static void qnx6_destroy_inode(struct inode *inode)
+{
+ call_rcu(&inode->i_rcu, qnx6_i_callback);
+}
+
+static void init_once(void *foo)
+{
+ struct qnx6_inode_info *ei = (struct qnx6_inode_info *) foo;
+
+ inode_init_once(&ei->vfs_inode);
+}
+
+static int init_inodecache(void)
+{
+ qnx6_inode_cachep = kmem_cache_create("qnx6_inode_cache",
+ sizeof(struct qnx6_inode_info),
+ 0, (SLAB_RECLAIM_ACCOUNT|
+ SLAB_MEM_SPREAD),
+ init_once);
+ if (!qnx6_inode_cachep)
+ return -ENOMEM;
+ return 0;
+}
+
+static void destroy_inodecache(void)
+{
+ kmem_cache_destroy(qnx6_inode_cachep);
+}
+
+static struct dentry *qnx6_mount(struct file_system_type *fs_type,
+ int flags, const char *dev_name, void *data)
+{
+ return mount_bdev(fs_type, flags, dev_name, data, qnx6_fill_super);
+}
+
+static struct file_system_type qnx6_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "qnx6",
+ .mount = qnx6_mount,
+ .kill_sb = kill_block_super,
+ .fs_flags = FS_REQUIRES_DEV,
+};
+
+static int __init init_qnx6_fs(void)
+{
+ int err;
+
+ err = init_inodecache();
+ if (err)
+ return err;
+
+ err = register_filesystem(&qnx6_fs_type);
+ if (err) {
+ destroy_inodecache();
+ return err;
+ }
+
+ printk(KERN_INFO "QNX6 filesystem 1.0.0 registered.\n");
+ return 0;
+}
+
+static void __exit exit_qnx6_fs(void)
+{
+ unregister_filesystem(&qnx6_fs_type);
+ destroy_inodecache();
+}
+
+module_init(init_qnx6_fs)
+module_exit(exit_qnx6_fs)
+MODULE_LICENSE("GPL");
diff --git a/fs/qnx6/namei.c b/fs/qnx6/namei.c
new file mode 100644
index 00000000000..8a97289e04a
--- /dev/null
+++ b/fs/qnx6/namei.c
@@ -0,0 +1,42 @@
+/*
+ * QNX6 file system, Linux implementation.
+ *
+ * Version : 1.0.0
+ *
+ * History :
+ *
+ * 01-02-2012 by Kai Bankett (chaosman@ontika.net) : first release.
+ * 16-02-2012 pagemap extension by Al Viro
+ *
+ */
+
+#include "qnx6.h"
+
+struct dentry *qnx6_lookup(struct inode *dir, struct dentry *dentry,
+ struct nameidata *nd)
+{
+ unsigned ino;
+ struct page *page;
+ struct inode *foundinode = NULL;
+ const char *name = dentry->d_name.name;
+ int len = dentry->d_name.len;
+
+ if (len > QNX6_LONG_NAME_MAX)
+ return ERR_PTR(-ENAMETOOLONG);
+
+ ino = qnx6_find_entry(len, dir, name, &page);
+ if (ino) {
+ foundinode = qnx6_iget(dir->i_sb, ino);
+ qnx6_put_page(page);
+ if (IS_ERR(foundinode)) {
+ QNX6DEBUG((KERN_ERR "qnx6: lookup->iget -> "
+ " error %ld\n", PTR_ERR(foundinode)));
+ return ERR_CAST(foundinode);
+ }
+ } else {
+ QNX6DEBUG((KERN_INFO "qnx6_lookup: not found %s\n", name));
+ return NULL;
+ }
+ d_add(dentry, foundinode);
+ return NULL;
+}
diff --git a/fs/qnx6/qnx6.h b/fs/qnx6/qnx6.h
new file mode 100644
index 00000000000..6c5e02a0b6a
--- /dev/null
+++ b/fs/qnx6/qnx6.h
@@ -0,0 +1,135 @@
+/*
+ * QNX6 file system, Linux implementation.
+ *
+ * Version : 1.0.0
+ *
+ * History :
+ *
+ * 01-02-2012 by Kai Bankett (chaosman@ontika.net) : first release.
+ * 16-02-2012 page map extension by Al Viro
+ *
+ */
+
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+
+typedef __u16 __bitwise __fs16;
+typedef __u32 __bitwise __fs32;
+typedef __u64 __bitwise __fs64;
+
+#include <linux/qnx6_fs.h>
+
+#ifdef CONFIG_QNX6FS_DEBUG
+#define QNX6DEBUG(X) printk X
+#else
+#define QNX6DEBUG(X) (void) 0
+#endif
+
+struct qnx6_sb_info {
+ struct buffer_head *sb_buf; /* superblock buffer */
+ struct qnx6_super_block *sb; /* our superblock */
+ int s_blks_off; /* blkoffset fs-startpoint */
+ int s_ptrbits; /* indirect pointer bitfield */
+ unsigned long s_mount_opt; /* all mount options */
+ int s_bytesex; /* holds endianess info */
+ struct inode * inodes;
+ struct inode * longfile;
+};
+
+struct qnx6_inode_info {
+ __fs32 di_block_ptr[QNX6_NO_DIRECT_POINTERS];
+ __u8 di_filelevels;
+ __u32 i_dir_start_lookup;
+ struct inode vfs_inode;
+};
+
+extern struct inode *qnx6_iget(struct super_block *sb, unsigned ino);
+extern struct dentry *qnx6_lookup(struct inode *dir, struct dentry *dentry,
+ struct nameidata *nd);
+
+#ifdef CONFIG_QNX6FS_DEBUG
+extern void qnx6_superblock_debug(struct qnx6_super_block *,
+ struct super_block *);
+#endif
+
+extern const struct inode_operations qnx6_dir_inode_operations;
+extern const struct file_operations qnx6_dir_operations;
+
+static inline struct qnx6_sb_info *QNX6_SB(struct super_block *sb)
+{
+ return sb->s_fs_info;
+}
+
+static inline struct qnx6_inode_info *QNX6_I(struct inode *inode)
+{
+ return container_of(inode, struct qnx6_inode_info, vfs_inode);
+}
+
+#define clear_opt(o, opt) (o &= ~(QNX6_MOUNT_##opt))
+#define set_opt(o, opt) (o |= (QNX6_MOUNT_##opt))
+#define test_opt(sb, opt) (QNX6_SB(sb)->s_mount_opt & \
+ QNX6_MOUNT_##opt)
+enum {
+ BYTESEX_LE,
+ BYTESEX_BE,
+};
+
+static inline __u64 fs64_to_cpu(struct qnx6_sb_info *sbi, __fs64 n)
+{
+ if (sbi->s_bytesex == BYTESEX_LE)
+ return le64_to_cpu((__force __le64)n);
+ else
+ return be64_to_cpu((__force __be64)n);
+}
+
+static inline __fs64 cpu_to_fs64(struct qnx6_sb_info *sbi, __u64 n)
+{
+ if (sbi->s_bytesex == BYTESEX_LE)
+ return (__force __fs64)cpu_to_le64(n);
+ else
+ return (__force __fs64)cpu_to_be64(n);
+}
+
+static inline __u32 fs32_to_cpu(struct qnx6_sb_info *sbi, __fs32 n)
+{
+ if (sbi->s_bytesex == BYTESEX_LE)
+ return le32_to_cpu((__force __le32)n);
+ else
+ return be32_to_cpu((__force __be32)n);
+}
+
+static inline __fs32 cpu_to_fs32(struct qnx6_sb_info *sbi, __u32 n)
+{
+ if (sbi->s_bytesex == BYTESEX_LE)
+ return (__force __fs32)cpu_to_le32(n);
+ else
+ return (__force __fs32)cpu_to_be32(n);
+}
+
+static inline __u16 fs16_to_cpu(struct qnx6_sb_info *sbi, __fs16 n)
+{
+ if (sbi->s_bytesex == BYTESEX_LE)
+ return le16_to_cpu((__force __le16)n);
+ else
+ return be16_to_cpu((__force __be16)n);
+}
+
+static inline __fs16 cpu_to_fs16(struct qnx6_sb_info *sbi, __u16 n)
+{
+ if (sbi->s_bytesex == BYTESEX_LE)
+ return (__force __fs16)cpu_to_le16(n);
+ else
+ return (__force __fs16)cpu_to_be16(n);
+}
+
+extern struct qnx6_super_block *qnx6_mmi_fill_super(struct super_block *s,
+ int silent);
+
+static inline void qnx6_put_page(struct page *page)
+{
+ kunmap(page);
+ page_cache_release(page);
+}
+
+extern unsigned qnx6_find_entry(int len, struct inode *dir, const char *name,
+ struct page **res_page);
diff --git a/fs/qnx6/super_mmi.c b/fs/qnx6/super_mmi.c
new file mode 100644
index 00000000000..29c32cba62d
--- /dev/null
+++ b/fs/qnx6/super_mmi.c
@@ -0,0 +1,150 @@
+/*
+ * QNX6 file system, Linux implementation.
+ *
+ * Version : 1.0.0
+ *
+ * History :
+ *
+ * 01-02-2012 by Kai Bankett (chaosman@ontika.net) : first release.
+ *
+ */
+
+#include <linux/buffer_head.h>
+#include <linux/slab.h>
+#include <linux/crc32.h>
+#include "qnx6.h"
+
+static void qnx6_mmi_copy_sb(struct qnx6_super_block *qsb,
+ struct qnx6_mmi_super_block *sb)
+{
+ qsb->sb_magic = sb->sb_magic;
+ qsb->sb_checksum = sb->sb_checksum;
+ qsb->sb_serial = sb->sb_serial;
+ qsb->sb_blocksize = sb->sb_blocksize;
+ qsb->sb_num_inodes = sb->sb_num_inodes;
+ qsb->sb_free_inodes = sb->sb_free_inodes;
+ qsb->sb_num_blocks = sb->sb_num_blocks;
+ qsb->sb_free_blocks = sb->sb_free_blocks;
+
+ /* the rest of the superblock is the same */
+ memcpy(&qsb->Inode, &sb->Inode, sizeof(sb->Inode));
+ memcpy(&qsb->Bitmap, &sb->Bitmap, sizeof(sb->Bitmap));
+ memcpy(&qsb->Longfile, &sb->Longfile, sizeof(sb->Longfile));
+}
+
+struct qnx6_super_block *qnx6_mmi_fill_super(struct super_block *s, int silent)
+{
+ struct buffer_head *bh1, *bh2 = NULL;
+ struct qnx6_mmi_super_block *sb1, *sb2;
+ struct qnx6_super_block *qsb = NULL;
+ struct qnx6_sb_info *sbi;
+ __u64 offset;
+
+ /* Check the superblock signatures
+ start with the first superblock */
+ bh1 = sb_bread(s, 0);
+ if (!bh1) {
+ printk(KERN_ERR "qnx6: Unable to read first mmi superblock\n");
+ return NULL;
+ }
+ sb1 = (struct qnx6_mmi_super_block *)bh1->b_data;
+ sbi = QNX6_SB(s);
+ if (fs32_to_cpu(sbi, sb1->sb_magic) != QNX6_SUPER_MAGIC) {
+ if (!silent) {
+ printk(KERN_ERR "qnx6: wrong signature (magic) in"
+ " superblock #1.\n");
+ goto out;
+ }
+ }
+
+ /* checksum check - start at byte 8 and end at byte 512 */
+ if (fs32_to_cpu(sbi, sb1->sb_checksum) !=
+ crc32_be(0, (char *)(bh1->b_data + 8), 504)) {
+ printk(KERN_ERR "qnx6: superblock #1 checksum error\n");
+ goto out;
+ }
+
+ /* calculate second superblock blocknumber */
+ offset = fs32_to_cpu(sbi, sb1->sb_num_blocks) + QNX6_SUPERBLOCK_AREA /
+ fs32_to_cpu(sbi, sb1->sb_blocksize);
+
+ /* set new blocksize */
+ if (!sb_set_blocksize(s, fs32_to_cpu(sbi, sb1->sb_blocksize))) {
+ printk(KERN_ERR "qnx6: unable to set blocksize\n");
+ goto out;
+ }
+ /* blocksize invalidates bh - pull it back in */
+ brelse(bh1);
+ bh1 = sb_bread(s, 0);
+ if (!bh1)
+ goto out;
+ sb1 = (struct qnx6_mmi_super_block *)bh1->b_data;
+
+ /* read second superblock */
+ bh2 = sb_bread(s, offset);
+ if (!bh2) {
+ printk(KERN_ERR "qnx6: unable to read the second superblock\n");
+ goto out;
+ }
+ sb2 = (struct qnx6_mmi_super_block *)bh2->b_data;
+ if (fs32_to_cpu(sbi, sb2->sb_magic) != QNX6_SUPER_MAGIC) {
+ if (!silent)
+ printk(KERN_ERR "qnx6: wrong signature (magic) in"
+ " superblock #2.\n");
+ goto out;
+ }
+
+ /* checksum check - start at byte 8 and end at byte 512 */
+ if (fs32_to_cpu(sbi, sb2->sb_checksum)
+ != crc32_be(0, (char *)(bh2->b_data + 8), 504)) {
+ printk(KERN_ERR "qnx6: superblock #1 checksum error\n");
+ goto out;
+ }
+
+ qsb = kmalloc(sizeof(*qsb), GFP_KERNEL);
+ if (!qsb) {
+ printk(KERN_ERR "qnx6: unable to allocate memory.\n");
+ goto out;
+ }
+
+ if (fs64_to_cpu(sbi, sb1->sb_serial) >
+ fs64_to_cpu(sbi, sb2->sb_serial)) {
+ /* superblock #1 active */
+ qnx6_mmi_copy_sb(qsb, sb1);
+#ifdef CONFIG_QNX6FS_DEBUG
+ qnx6_superblock_debug(qsb, s);
+#endif
+ memcpy(bh1->b_data, qsb, sizeof(struct qnx6_super_block));
+
+ sbi->sb_buf = bh1;
+ sbi->sb = (struct qnx6_super_block *)bh1->b_data;
+ brelse(bh2);
+ printk(KERN_INFO "qnx6: superblock #1 active\n");
+ } else {
+ /* superblock #2 active */
+ qnx6_mmi_copy_sb(qsb, sb2);
+#ifdef CONFIG_QNX6FS_DEBUG
+ qnx6_superblock_debug(qsb, s);
+#endif
+ memcpy(bh2->b_data, qsb, sizeof(struct qnx6_super_block));
+
+ sbi->sb_buf = bh2;
+ sbi->sb = (struct qnx6_super_block *)bh2->b_data;
+ brelse(bh1);
+ printk(KERN_INFO "qnx6: superblock #2 active\n");
+ }
+ kfree(qsb);
+
+ /* offset for mmi_fs is just SUPERBLOCK_AREA bytes */
+ sbi->s_blks_off = QNX6_SUPERBLOCK_AREA / s->s_blocksize;
+
+ /* success */
+ return sbi->sb;
+
+out:
+ if (bh1 != NULL)
+ brelse(bh1);
+ if (bh2 != NULL)
+ brelse(bh2);
+ return NULL;
+}
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 5ec59b20cf7..d69a1d1d7e1 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -71,6 +71,7 @@
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/security.h>
+#include <linux/sched.h>
#include <linux/kmod.h>
#include <linux/namei.h>
#include <linux/capability.h>
@@ -1109,6 +1110,13 @@ static void dquot_decr_space(struct dquot *dquot, qsize_t number)
clear_bit(DQ_BLKS_B, &dquot->dq_flags);
}
+struct dquot_warn {
+ struct super_block *w_sb;
+ qid_t w_dq_id;
+ short w_dq_type;
+ short w_type;
+};
+
static int warning_issued(struct dquot *dquot, const int warntype)
{
int flag = (warntype == QUOTA_NL_BHARDWARN ||
@@ -1124,41 +1132,42 @@ static int warning_issued(struct dquot *dquot, const int warntype)
#ifdef CONFIG_PRINT_QUOTA_WARNING
static int flag_print_warnings = 1;
-static int need_print_warning(struct dquot *dquot)
+static int need_print_warning(struct dquot_warn *warn)
{
if (!flag_print_warnings)
return 0;
- switch (dquot->dq_type) {
+ switch (warn->w_dq_type) {
case USRQUOTA:
- return current_fsuid() == dquot->dq_id;
+ return current_fsuid() == warn->w_dq_id;
case GRPQUOTA:
- return in_group_p(dquot->dq_id);
+ return in_group_p(warn->w_dq_id);
}
return 0;
}
/* Print warning to user which exceeded quota */
-static void print_warning(struct dquot *dquot, const int warntype)
+static void print_warning(struct dquot_warn *warn)
{
char *msg = NULL;
struct tty_struct *tty;
+ int warntype = warn->w_type;
if (warntype == QUOTA_NL_IHARDBELOW ||
warntype == QUOTA_NL_ISOFTBELOW ||
warntype == QUOTA_NL_BHARDBELOW ||
- warntype == QUOTA_NL_BSOFTBELOW || !need_print_warning(dquot))
+ warntype == QUOTA_NL_BSOFTBELOW || !need_print_warning(warn))
return;
tty = get_current_tty();
if (!tty)
return;
- tty_write_message(tty, dquot->dq_sb->s_id);
+ tty_write_message(tty, warn->w_sb->s_id);
if (warntype == QUOTA_NL_ISOFTWARN || warntype == QUOTA_NL_BSOFTWARN)
tty_write_message(tty, ": warning, ");
else
tty_write_message(tty, ": write failed, ");
- tty_write_message(tty, quotatypes[dquot->dq_type]);
+ tty_write_message(tty, quotatypes[warn->w_dq_type]);
switch (warntype) {
case QUOTA_NL_IHARDWARN:
msg = " file limit reached.\r\n";
@@ -1184,26 +1193,34 @@ static void print_warning(struct dquot *dquot, const int warntype)
}
#endif
+static void prepare_warning(struct dquot_warn *warn, struct dquot *dquot,
+ int warntype)
+{
+ if (warning_issued(dquot, warntype))
+ return;
+ warn->w_type = warntype;
+ warn->w_sb = dquot->dq_sb;
+ warn->w_dq_id = dquot->dq_id;
+ warn->w_dq_type = dquot->dq_type;
+}
+
/*
* Write warnings to the console and send warning messages over netlink.
*
- * Note that this function can sleep.
+ * Note that this function can call into tty and networking code.
*/
-static void flush_warnings(struct dquot *const *dquots, char *warntype)
+static void flush_warnings(struct dquot_warn *warn)
{
- struct dquot *dq;
int i;
for (i = 0; i < MAXQUOTAS; i++) {
- dq = dquots[i];
- if (dq && warntype[i] != QUOTA_NL_NOWARN &&
- !warning_issued(dq, warntype[i])) {
+ if (warn[i].w_type == QUOTA_NL_NOWARN)
+ continue;
#ifdef CONFIG_PRINT_QUOTA_WARNING
- print_warning(dq, warntype[i]);
+ print_warning(&warn[i]);
#endif
- quota_send_warning(dq->dq_type, dq->dq_id,
- dq->dq_sb->s_dev, warntype[i]);
- }
+ quota_send_warning(warn[i].w_dq_type, warn[i].w_dq_id,
+ warn[i].w_sb->s_dev, warn[i].w_type);
}
}
@@ -1217,11 +1234,11 @@ static int ignore_hardlimit(struct dquot *dquot)
}
/* needs dq_data_lock */
-static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype)
+static int check_idq(struct dquot *dquot, qsize_t inodes,
+ struct dquot_warn *warn)
{
qsize_t newinodes = dquot->dq_dqb.dqb_curinodes + inodes;
- *warntype = QUOTA_NL_NOWARN;
if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type) ||
test_bit(DQ_FAKE_B, &dquot->dq_flags))
return 0;
@@ -1229,7 +1246,7 @@ static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype)
if (dquot->dq_dqb.dqb_ihardlimit &&
newinodes > dquot->dq_dqb.dqb_ihardlimit &&
!ignore_hardlimit(dquot)) {
- *warntype = QUOTA_NL_IHARDWARN;
+ prepare_warning(warn, dquot, QUOTA_NL_IHARDWARN);
return -EDQUOT;
}
@@ -1238,14 +1255,14 @@ static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype)
dquot->dq_dqb.dqb_itime &&
get_seconds() >= dquot->dq_dqb.dqb_itime &&
!ignore_hardlimit(dquot)) {
- *warntype = QUOTA_NL_ISOFTLONGWARN;
+ prepare_warning(warn, dquot, QUOTA_NL_ISOFTLONGWARN);
return -EDQUOT;
}
if (dquot->dq_dqb.dqb_isoftlimit &&
newinodes > dquot->dq_dqb.dqb_isoftlimit &&
dquot->dq_dqb.dqb_itime == 0) {
- *warntype = QUOTA_NL_ISOFTWARN;
+ prepare_warning(warn, dquot, QUOTA_NL_ISOFTWARN);
dquot->dq_dqb.dqb_itime = get_seconds() +
sb_dqopt(dquot->dq_sb)->info[dquot->dq_type].dqi_igrace;
}
@@ -1254,12 +1271,12 @@ static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype)
}
/* needs dq_data_lock */
-static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *warntype)
+static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc,
+ struct dquot_warn *warn)
{
qsize_t tspace;
struct super_block *sb = dquot->dq_sb;
- *warntype = QUOTA_NL_NOWARN;
if (!sb_has_quota_limits_enabled(sb, dquot->dq_type) ||
test_bit(DQ_FAKE_B, &dquot->dq_flags))
return 0;
@@ -1271,7 +1288,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
tspace > dquot->dq_dqb.dqb_bhardlimit &&
!ignore_hardlimit(dquot)) {
if (!prealloc)
- *warntype = QUOTA_NL_BHARDWARN;
+ prepare_warning(warn, dquot, QUOTA_NL_BHARDWARN);
return -EDQUOT;
}
@@ -1281,7 +1298,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
get_seconds() >= dquot->dq_dqb.dqb_btime &&
!ignore_hardlimit(dquot)) {
if (!prealloc)
- *warntype = QUOTA_NL_BSOFTLONGWARN;
+ prepare_warning(warn, dquot, QUOTA_NL_BSOFTLONGWARN);
return -EDQUOT;
}
@@ -1289,7 +1306,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
tspace > dquot->dq_dqb.dqb_bsoftlimit &&
dquot->dq_dqb.dqb_btime == 0) {
if (!prealloc) {
- *warntype = QUOTA_NL_BSOFTWARN;
+ prepare_warning(warn, dquot, QUOTA_NL_BSOFTWARN);
dquot->dq_dqb.dqb_btime = get_seconds() +
sb_dqopt(sb)->info[dquot->dq_type].dqi_bgrace;
}
@@ -1542,10 +1559,9 @@ static void inode_decr_space(struct inode *inode, qsize_t number, int reserve)
int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags)
{
int cnt, ret = 0;
- char warntype[MAXQUOTAS];
- int warn = flags & DQUOT_SPACE_WARN;
+ struct dquot_warn warn[MAXQUOTAS];
+ struct dquot **dquots = inode->i_dquot;
int reserve = flags & DQUOT_SPACE_RESERVE;
- int nofail = flags & DQUOT_SPACE_NOFAIL;
/*
* First test before acquiring mutex - solves deadlocks when we
@@ -1558,36 +1574,36 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags)
down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
for (cnt = 0; cnt < MAXQUOTAS; cnt++)
- warntype[cnt] = QUOTA_NL_NOWARN;
+ warn[cnt].w_type = QUOTA_NL_NOWARN;
spin_lock(&dq_data_lock);
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
- if (!inode->i_dquot[cnt])
+ if (!dquots[cnt])
continue;
- ret = check_bdq(inode->i_dquot[cnt], number, !warn,
- warntype+cnt);
- if (ret && !nofail) {
+ ret = check_bdq(dquots[cnt], number,
+ !(flags & DQUOT_SPACE_WARN), &warn[cnt]);
+ if (ret && !(flags & DQUOT_SPACE_NOFAIL)) {
spin_unlock(&dq_data_lock);
goto out_flush_warn;
}
}
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
- if (!inode->i_dquot[cnt])
+ if (!dquots[cnt])
continue;
if (reserve)
- dquot_resv_space(inode->i_dquot[cnt], number);
+ dquot_resv_space(dquots[cnt], number);
else
- dquot_incr_space(inode->i_dquot[cnt], number);
+ dquot_incr_space(dquots[cnt], number);
}
inode_incr_space(inode, number, reserve);
spin_unlock(&dq_data_lock);
if (reserve)
goto out_flush_warn;
- mark_all_dquot_dirty(inode->i_dquot);
+ mark_all_dquot_dirty(dquots);
out_flush_warn:
- flush_warnings(inode->i_dquot, warntype);
up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
+ flush_warnings(warn);
out:
return ret;
}
@@ -1599,36 +1615,37 @@ EXPORT_SYMBOL(__dquot_alloc_space);
int dquot_alloc_inode(const struct inode *inode)
{
int cnt, ret = 0;
- char warntype[MAXQUOTAS];
+ struct dquot_warn warn[MAXQUOTAS];
+ struct dquot * const *dquots = inode->i_dquot;
/* First test before acquiring mutex - solves deadlocks when we
* re-enter the quota code and are already holding the mutex */
if (!dquot_active(inode))
return 0;
for (cnt = 0; cnt < MAXQUOTAS; cnt++)
- warntype[cnt] = QUOTA_NL_NOWARN;
+ warn[cnt].w_type = QUOTA_NL_NOWARN;
down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
spin_lock(&dq_data_lock);
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
- if (!inode->i_dquot[cnt])
+ if (!dquots[cnt])
continue;
- ret = check_idq(inode->i_dquot[cnt], 1, warntype + cnt);
+ ret = check_idq(dquots[cnt], 1, &warn[cnt]);
if (ret)
goto warn_put_all;
}
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
- if (!inode->i_dquot[cnt])
+ if (!dquots[cnt])
continue;
- dquot_incr_inodes(inode->i_dquot[cnt], 1);
+ dquot_incr_inodes(dquots[cnt], 1);
}
warn_put_all:
spin_unlock(&dq_data_lock);
if (ret == 0)
- mark_all_dquot_dirty(inode->i_dquot);
- flush_warnings(inode->i_dquot, warntype);
+ mark_all_dquot_dirty(dquots);
up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
+ flush_warnings(warn);
return ret;
}
EXPORT_SYMBOL(dquot_alloc_inode);
@@ -1668,7 +1685,8 @@ EXPORT_SYMBOL(dquot_claim_space_nodirty);
void __dquot_free_space(struct inode *inode, qsize_t number, int flags)
{
unsigned int cnt;
- char warntype[MAXQUOTAS];
+ struct dquot_warn warn[MAXQUOTAS];
+ struct dquot **dquots = inode->i_dquot;
int reserve = flags & DQUOT_SPACE_RESERVE;
/* First test before acquiring mutex - solves deadlocks when we
@@ -1681,23 +1699,28 @@ void __dquot_free_space(struct inode *inode, qsize_t number, int flags)
down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
spin_lock(&dq_data_lock);
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
- if (!inode->i_dquot[cnt])
+ int wtype;
+
+ warn[cnt].w_type = QUOTA_NL_NOWARN;
+ if (!dquots[cnt])
continue;
- warntype[cnt] = info_bdq_free(inode->i_dquot[cnt], number);
+ wtype = info_bdq_free(dquots[cnt], number);
+ if (wtype != QUOTA_NL_NOWARN)
+ prepare_warning(&warn[cnt], dquots[cnt], wtype);
if (reserve)
- dquot_free_reserved_space(inode->i_dquot[cnt], number);
+ dquot_free_reserved_space(dquots[cnt], number);
else
- dquot_decr_space(inode->i_dquot[cnt], number);
+ dquot_decr_space(dquots[cnt], number);
}
inode_decr_space(inode, number, reserve);
spin_unlock(&dq_data_lock);
if (reserve)
goto out_unlock;
- mark_all_dquot_dirty(inode->i_dquot);
+ mark_all_dquot_dirty(dquots);
out_unlock:
- flush_warnings(inode->i_dquot, warntype);
up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
+ flush_warnings(warn);
}
EXPORT_SYMBOL(__dquot_free_space);
@@ -1707,7 +1730,8 @@ EXPORT_SYMBOL(__dquot_free_space);
void dquot_free_inode(const struct inode *inode)
{
unsigned int cnt;
- char warntype[MAXQUOTAS];
+ struct dquot_warn warn[MAXQUOTAS];
+ struct dquot * const *dquots = inode->i_dquot;
/* First test before acquiring mutex - solves deadlocks when we
* re-enter the quota code and are already holding the mutex */
@@ -1717,15 +1741,20 @@ void dquot_free_inode(const struct inode *inode)
down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
spin_lock(&dq_data_lock);
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
- if (!inode->i_dquot[cnt])
+ int wtype;
+
+ warn[cnt].w_type = QUOTA_NL_NOWARN;
+ if (!dquots[cnt])
continue;
- warntype[cnt] = info_idq_free(inode->i_dquot[cnt], 1);
- dquot_decr_inodes(inode->i_dquot[cnt], 1);
+ wtype = info_idq_free(dquots[cnt], 1);
+ if (wtype != QUOTA_NL_NOWARN)
+ prepare_warning(&warn[cnt], dquots[cnt], wtype);
+ dquot_decr_inodes(dquots[cnt], 1);
}
spin_unlock(&dq_data_lock);
- mark_all_dquot_dirty(inode->i_dquot);
- flush_warnings(inode->i_dquot, warntype);
+ mark_all_dquot_dirty(dquots);
up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
+ flush_warnings(warn);
}
EXPORT_SYMBOL(dquot_free_inode);
@@ -1746,16 +1775,20 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
struct dquot *transfer_from[MAXQUOTAS] = {};
int cnt, ret = 0;
char is_valid[MAXQUOTAS] = {};
- char warntype_to[MAXQUOTAS];
- char warntype_from_inodes[MAXQUOTAS], warntype_from_space[MAXQUOTAS];
+ struct dquot_warn warn_to[MAXQUOTAS];
+ struct dquot_warn warn_from_inodes[MAXQUOTAS];
+ struct dquot_warn warn_from_space[MAXQUOTAS];
/* First test before acquiring mutex - solves deadlocks when we
* re-enter the quota code and are already holding the mutex */
if (IS_NOQUOTA(inode))
return 0;
/* Initialize the arrays */
- for (cnt = 0; cnt < MAXQUOTAS; cnt++)
- warntype_to[cnt] = QUOTA_NL_NOWARN;
+ for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+ warn_to[cnt].w_type = QUOTA_NL_NOWARN;
+ warn_from_inodes[cnt].w_type = QUOTA_NL_NOWARN;
+ warn_from_space[cnt].w_type = QUOTA_NL_NOWARN;
+ }
down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
if (IS_NOQUOTA(inode)) { /* File without quota accounting? */
up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
@@ -1777,10 +1810,10 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
continue;
is_valid[cnt] = 1;
transfer_from[cnt] = inode->i_dquot[cnt];
- ret = check_idq(transfer_to[cnt], 1, warntype_to + cnt);
+ ret = check_idq(transfer_to[cnt], 1, &warn_to[cnt]);
if (ret)
goto over_quota;
- ret = check_bdq(transfer_to[cnt], space, 0, warntype_to + cnt);
+ ret = check_bdq(transfer_to[cnt], space, 0, &warn_to[cnt]);
if (ret)
goto over_quota;
}
@@ -1793,10 +1826,15 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
continue;
/* Due to IO error we might not have transfer_from[] structure */
if (transfer_from[cnt]) {
- warntype_from_inodes[cnt] =
- info_idq_free(transfer_from[cnt], 1);
- warntype_from_space[cnt] =
- info_bdq_free(transfer_from[cnt], space);
+ int wtype;
+ wtype = info_idq_free(transfer_from[cnt], 1);
+ if (wtype != QUOTA_NL_NOWARN)
+ prepare_warning(&warn_from_inodes[cnt],
+ transfer_from[cnt], wtype);
+ wtype = info_bdq_free(transfer_from[cnt], space);
+ if (wtype != QUOTA_NL_NOWARN)
+ prepare_warning(&warn_from_space[cnt],
+ transfer_from[cnt], wtype);
dquot_decr_inodes(transfer_from[cnt], 1);
dquot_decr_space(transfer_from[cnt], cur_space);
dquot_free_reserved_space(transfer_from[cnt],
@@ -1814,9 +1852,9 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
mark_all_dquot_dirty(transfer_from);
mark_all_dquot_dirty(transfer_to);
- flush_warnings(transfer_to, warntype_to);
- flush_warnings(transfer_from, warntype_from_inodes);
- flush_warnings(transfer_from, warntype_from_space);
+ flush_warnings(warn_to);
+ flush_warnings(warn_from_inodes);
+ flush_warnings(warn_from_space);
/* Pass back references to put */
for (cnt = 0; cnt < MAXQUOTAS; cnt++)
if (is_valid[cnt])
@@ -1825,7 +1863,7 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
over_quota:
spin_unlock(&dq_data_lock);
up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
- flush_warnings(transfer_to, warntype_to);
+ flush_warnings(warn_to);
return ret;
}
EXPORT_SYMBOL(__dquot_transfer);
@@ -2125,6 +2163,8 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
mutex_unlock(&dqopt->dqio_mutex);
goto out_file_init;
}
+ if (dqopt->flags & DQUOT_QUOTA_SYS_FILE)
+ dqopt->info[type].dqi_flags |= DQF_SYS_FILE;
mutex_unlock(&dqopt->dqio_mutex);
spin_lock(&dq_state_lock);
dqopt->flags |= dquot_state_flag(flags, type);
@@ -2464,7 +2504,7 @@ int dquot_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
spin_lock(&dq_data_lock);
ii->dqi_bgrace = mi->dqi_bgrace;
ii->dqi_igrace = mi->dqi_igrace;
- ii->dqi_flags = mi->dqi_flags & DQF_MASK;
+ ii->dqi_flags = mi->dqi_flags & DQF_GETINFO_MASK;
ii->dqi_valid = IIF_ALL;
spin_unlock(&dq_data_lock);
mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
@@ -2490,8 +2530,8 @@ int dquot_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
if (ii->dqi_valid & IIF_IGRACE)
mi->dqi_igrace = ii->dqi_igrace;
if (ii->dqi_valid & IIF_FLAGS)
- mi->dqi_flags = (mi->dqi_flags & ~DQF_MASK) |
- (ii->dqi_flags & DQF_MASK);
+ mi->dqi_flags = (mi->dqi_flags & ~DQF_SETINFO_MASK) |
+ (ii->dqi_flags & DQF_SETINFO_MASK);
spin_unlock(&dq_data_lock);
mark_info_dirty(sb, type);
/* Force write to disk */
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index 7898cd688a0..9a391204ca2 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -282,21 +282,35 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
case Q_XGETQUOTA:
return quota_getxquota(sb, type, id, addr);
case Q_XQUOTASYNC:
- /* caller already holds s_umount */
if (sb->s_flags & MS_RDONLY)
return -EROFS;
- writeback_inodes_sb(sb, WB_REASON_SYNC);
+ /* XFS quotas are fully coherent now, making this call a noop */
return 0;
default:
return -EINVAL;
}
}
+/* Return 1 if 'cmd' will block on frozen filesystem */
+static int quotactl_cmd_write(int cmd)
+{
+ switch (cmd) {
+ case Q_GETFMT:
+ case Q_GETINFO:
+ case Q_SYNC:
+ case Q_XGETQSTAT:
+ case Q_XGETQUOTA:
+ case Q_XQUOTASYNC:
+ return 0;
+ }
+ return 1;
+}
+
/*
* look up a superblock on which quota ops will be performed
* - use the name of a block device to find the superblock thereon
*/
-static struct super_block *quotactl_block(const char __user *special)
+static struct super_block *quotactl_block(const char __user *special, int cmd)
{
#ifdef CONFIG_BLOCK
struct block_device *bdev;
@@ -309,7 +323,10 @@ static struct super_block *quotactl_block(const char __user *special)
putname(tmp);
if (IS_ERR(bdev))
return ERR_CAST(bdev);
- sb = get_super(bdev);
+ if (quotactl_cmd_write(cmd))
+ sb = get_super_thawed(bdev);
+ else
+ sb = get_super(bdev);
bdput(bdev);
if (!sb)
return ERR_PTR(-ENODEV);
@@ -361,7 +378,7 @@ SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special,
pathp = &path;
}
- sb = quotactl_block(special);
+ sb = quotactl_block(special, cmds);
if (IS_ERR(sb)) {
ret = PTR_ERR(sb);
goto out;
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index aec766abe3a..a1fdabe21de 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -209,22 +209,19 @@ static int ramfs_parse_options(char *data, struct ramfs_mount_opts *opts)
int ramfs_fill_super(struct super_block *sb, void *data, int silent)
{
struct ramfs_fs_info *fsi;
- struct inode *inode = NULL;
- struct dentry *root;
+ struct inode *inode;
int err;
save_mount_options(sb, data);
fsi = kzalloc(sizeof(struct ramfs_fs_info), GFP_KERNEL);
sb->s_fs_info = fsi;
- if (!fsi) {
- err = -ENOMEM;
- goto fail;
- }
+ if (!fsi)
+ return -ENOMEM;
err = ramfs_parse_options(data, &fsi->mount_opts);
if (err)
- goto fail;
+ return err;
sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_blocksize = PAGE_CACHE_SIZE;
@@ -234,24 +231,11 @@ int ramfs_fill_super(struct super_block *sb, void *data, int silent)
sb->s_time_gran = 1;
inode = ramfs_get_inode(sb, NULL, S_IFDIR | fsi->mount_opts.mode, 0);
- if (!inode) {
- err = -ENOMEM;
- goto fail;
- }
-
- root = d_alloc_root(inode);
- sb->s_root = root;
- if (!root) {
- err = -ENOMEM;
- goto fail;
- }
+ sb->s_root = d_make_root(inode);
+ if (!sb->s_root)
+ return -ENOMEM;
return 0;
-fail:
- kfree(fsi);
- sb->s_fs_info = NULL;
- iput(inode);
- return err;
}
struct dentry *ramfs_mount(struct file_system_type *fs_type,
diff --git a/fs/read_write.c b/fs/read_write.c
index 5ad4248b0cd..ffc99d22e0a 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -11,7 +11,7 @@
#include <linux/uio.h>
#include <linux/fsnotify.h>
#include <linux/security.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/syscalls.h>
#include <linux/pagemap.h>
#include <linux/splice.h>
diff --git a/fs/readdir.c b/fs/readdir.c
index 356f71528ad..cc0a8227cdd 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -6,7 +6,7 @@
#include <linux/stddef.h>
#include <linux/kernel.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/time.h>
#include <linux/mm.h>
#include <linux/errno.h>
diff --git a/fs/reiserfs/acl.h b/fs/reiserfs/acl.h
new file mode 100644
index 00000000000..f096b80e73d
--- /dev/null
+++ b/fs/reiserfs/acl.h
@@ -0,0 +1,76 @@
+#include <linux/init.h>
+#include <linux/posix_acl.h>
+
+#define REISERFS_ACL_VERSION 0x0001
+
+typedef struct {
+ __le16 e_tag;
+ __le16 e_perm;
+ __le32 e_id;
+} reiserfs_acl_entry;
+
+typedef struct {
+ __le16 e_tag;
+ __le16 e_perm;
+} reiserfs_acl_entry_short;
+
+typedef struct {
+ __le32 a_version;
+} reiserfs_acl_header;
+
+static inline size_t reiserfs_acl_size(int count)
+{
+ if (count <= 4) {
+ return sizeof(reiserfs_acl_header) +
+ count * sizeof(reiserfs_acl_entry_short);
+ } else {
+ return sizeof(reiserfs_acl_header) +
+ 4 * sizeof(reiserfs_acl_entry_short) +
+ (count - 4) * sizeof(reiserfs_acl_entry);
+ }
+}
+
+static inline int reiserfs_acl_count(size_t size)
+{
+ ssize_t s;
+ size -= sizeof(reiserfs_acl_header);
+ s = size - 4 * sizeof(reiserfs_acl_entry_short);
+ if (s < 0) {
+ if (size % sizeof(reiserfs_acl_entry_short))
+ return -1;
+ return size / sizeof(reiserfs_acl_entry_short);
+ } else {
+ if (s % sizeof(reiserfs_acl_entry))
+ return -1;
+ return s / sizeof(reiserfs_acl_entry) + 4;
+ }
+}
+
+#ifdef CONFIG_REISERFS_FS_POSIX_ACL
+struct posix_acl *reiserfs_get_acl(struct inode *inode, int type);
+int reiserfs_acl_chmod(struct inode *inode);
+int reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th,
+ struct inode *dir, struct dentry *dentry,
+ struct inode *inode);
+int reiserfs_cache_default_acl(struct inode *dir);
+extern const struct xattr_handler reiserfs_posix_acl_default_handler;
+extern const struct xattr_handler reiserfs_posix_acl_access_handler;
+
+#else
+
+#define reiserfs_cache_default_acl(inode) 0
+#define reiserfs_get_acl NULL
+
+static inline int reiserfs_acl_chmod(struct inode *inode)
+{
+ return 0;
+}
+
+static inline int
+reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th,
+ const struct inode *dir, struct dentry *dentry,
+ struct inode *inode)
+{
+ return 0;
+}
+#endif
diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c
index 70de42f09f1..4c0c7d163d1 100644
--- a/fs/reiserfs/bitmap.c
+++ b/fs/reiserfs/bitmap.c
@@ -4,14 +4,12 @@
/* Reiserfs block (de)allocator, bitmap-based. */
#include <linux/time.h>
-#include <linux/reiserfs_fs.h>
+#include "reiserfs.h"
#include <linux/errno.h>
#include <linux/buffer_head.h>
#include <linux/kernel.h>
#include <linux/pagemap.h>
#include <linux/vmalloc.h>
-#include <linux/reiserfs_fs_sb.h>
-#include <linux/reiserfs_fs_i.h>
#include <linux/quotaops.h>
#include <linux/seq_file.h>
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index 133e9355dc6..66c53b642a8 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -5,7 +5,7 @@
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/fs.h>
-#include <linux/reiserfs_fs.h>
+#include "reiserfs.h"
#include <linux/stat.h>
#include <linux/buffer_head.h>
#include <linux/slab.h>
diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c
index 60c08044066..2b7882b508d 100644
--- a/fs/reiserfs/do_balan.c
+++ b/fs/reiserfs/do_balan.c
@@ -17,7 +17,7 @@
#include <asm/uaccess.h>
#include <linux/time.h>
-#include <linux/reiserfs_fs.h>
+#include "reiserfs.h"
#include <linux/buffer_head.h>
#include <linux/kernel.h>
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index ace635053a3..8375c922c0d 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -3,9 +3,9 @@
*/
#include <linux/time.h>
-#include <linux/reiserfs_fs.h>
-#include <linux/reiserfs_acl.h>
-#include <linux/reiserfs_xattr.h>
+#include "reiserfs.h"
+#include "acl.h"
+#include "xattr.h"
#include <asm/uaccess.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c
index 1e4250bc3a6..430e0658704 100644
--- a/fs/reiserfs/fix_node.c
+++ b/fs/reiserfs/fix_node.c
@@ -37,7 +37,7 @@
#include <linux/time.h>
#include <linux/slab.h>
#include <linux/string.h>
-#include <linux/reiserfs_fs.h>
+#include "reiserfs.h"
#include <linux/buffer_head.h>
/* To make any changes in the tree we find a node, that contains item
diff --git a/fs/reiserfs/hashes.c b/fs/reiserfs/hashes.c
index 6471c670743..91b0cc1242a 100644
--- a/fs/reiserfs/hashes.c
+++ b/fs/reiserfs/hashes.c
@@ -19,7 +19,7 @@
//
#include <linux/kernel.h>
-#include <linux/reiserfs_fs.h>
+#include "reiserfs.h"
#include <asm/types.h>
#define DELTA 0x9E3779B9
diff --git a/fs/reiserfs/ibalance.c b/fs/reiserfs/ibalance.c
index 2074fd95046..e1978fd895f 100644
--- a/fs/reiserfs/ibalance.c
+++ b/fs/reiserfs/ibalance.c
@@ -5,7 +5,7 @@
#include <asm/uaccess.h>
#include <linux/string.h>
#include <linux/time.h>
-#include <linux/reiserfs_fs.h>
+#include "reiserfs.h"
#include <linux/buffer_head.h>
/* this is one and only function that is used outside (do_balance.c) */
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 9e8cd5acd79..494c315c741 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -4,9 +4,9 @@
#include <linux/time.h>
#include <linux/fs.h>
-#include <linux/reiserfs_fs.h>
-#include <linux/reiserfs_acl.h>
-#include <linux/reiserfs_xattr.h>
+#include "reiserfs.h"
+#include "acl.h"
+#include "xattr.h"
#include <linux/exportfs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index 950e3d1b5c9..0c2185042d5 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -5,7 +5,7 @@
#include <linux/capability.h>
#include <linux/fs.h>
#include <linux/mount.h>
-#include <linux/reiserfs_fs.h>
+#include "reiserfs.h"
#include <linux/time.h>
#include <asm/uaccess.h>
#include <linux/pagemap.h>
diff --git a/fs/reiserfs/item_ops.c b/fs/reiserfs/item_ops.c
index 72cb1cc51b8..ee382ef3d30 100644
--- a/fs/reiserfs/item_ops.c
+++ b/fs/reiserfs/item_ops.c
@@ -3,7 +3,7 @@
*/
#include <linux/time.h>
-#include <linux/reiserfs_fs.h>
+#include "reiserfs.h"
// this contains item handlers for old item types: sd, direct,
// indirect, directory
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index c3cf54fd4de..b1a08573fe1 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -37,7 +37,7 @@
#include <linux/time.h>
#include <linux/semaphore.h>
#include <linux/vmalloc.h>
-#include <linux/reiserfs_fs.h>
+#include "reiserfs.h"
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fcntl.h>
@@ -51,7 +51,6 @@
#include <linux/uaccess.h>
#include <linux/slab.h>
-#include <asm/system.h>
/* gets a struct reiserfs_journal_list * from a list head */
#define JOURNAL_LIST_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \
diff --git a/fs/reiserfs/lbalance.c b/fs/reiserfs/lbalance.c
index 03d85cbf90b..79e5a8b4c22 100644
--- a/fs/reiserfs/lbalance.c
+++ b/fs/reiserfs/lbalance.c
@@ -5,7 +5,7 @@
#include <asm/uaccess.h>
#include <linux/string.h>
#include <linux/time.h>
-#include <linux/reiserfs_fs.h>
+#include "reiserfs.h"
#include <linux/buffer_head.h>
/* these are used in do_balance.c */
@@ -975,7 +975,7 @@ static int leaf_cut_entries(struct buffer_head *bh,
remove */
RFALSE(!is_direntry_le_ih(ih), "10180: item is not directory item");
RFALSE(I_ENTRY_COUNT(ih) < from + del_count,
- "10185: item contains not enough entries: entry_cout = %d, from = %d, to delete = %d",
+ "10185: item contains not enough entries: entry_count = %d, from = %d, to delete = %d",
I_ENTRY_COUNT(ih), from, del_count);
if (del_count == 0)
diff --git a/fs/reiserfs/lock.c b/fs/reiserfs/lock.c
index 7df1ce48203..d735bc8470e 100644
--- a/fs/reiserfs/lock.c
+++ b/fs/reiserfs/lock.c
@@ -1,4 +1,4 @@
-#include <linux/reiserfs_fs.h>
+#include "reiserfs.h"
#include <linux/mutex.h>
/*
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 14637886523..84e8a69cee9 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -14,9 +14,9 @@
#include <linux/time.h>
#include <linux/bitops.h>
#include <linux/slab.h>
-#include <linux/reiserfs_fs.h>
-#include <linux/reiserfs_acl.h>
-#include <linux/reiserfs_xattr.h>
+#include "reiserfs.h"
+#include "acl.h"
+#include "xattr.h"
#include <linux/quotaops.h>
#define INC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) { inc_nlink(i); if (i->i_nlink >= REISERFS_LINK_MAX) set_nlink(i, 1); }
diff --git a/fs/reiserfs/objectid.c b/fs/reiserfs/objectid.c
index 3a6de810bd6..f732d6a5251 100644
--- a/fs/reiserfs/objectid.c
+++ b/fs/reiserfs/objectid.c
@@ -5,8 +5,7 @@
#include <linux/string.h>
#include <linux/random.h>
#include <linux/time.h>
-#include <linux/reiserfs_fs.h>
-#include <linux/reiserfs_fs_sb.h>
+#include "reiserfs.h"
// find where objectid map starts
#define objectid_map(s,rs) (old_format_only (s) ? \
diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c
index 45de98b5946..c0b1112ab7e 100644
--- a/fs/reiserfs/prints.c
+++ b/fs/reiserfs/prints.c
@@ -4,7 +4,7 @@
#include <linux/time.h>
#include <linux/fs.h>
-#include <linux/reiserfs_fs.h>
+#include "reiserfs.h"
#include <linux/string.h>
#include <linux/buffer_head.h>
@@ -329,7 +329,7 @@ void reiserfs_debug(struct super_block *s, int level, const char *fmt, ...)
Numbering scheme for panic used by Vladimir and Anatoly( Hans completely ignores this scheme, and considers it
pointless complexity):
- panics in reiserfs_fs.h have numbers from 1000 to 1999
+ panics in reiserfs.h have numbers from 1000 to 1999
super.c 2000 to 2999
preserve.c (unused) 3000 to 3999
bitmap.c 4000 to 4999
diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c
index 7a9981196c1..2c1ade692cc 100644
--- a/fs/reiserfs/procfs.c
+++ b/fs/reiserfs/procfs.c
@@ -12,8 +12,7 @@
#include <linux/time.h>
#include <linux/seq_file.h>
#include <asm/uaccess.h>
-#include <linux/reiserfs_fs.h>
-#include <linux/reiserfs_fs_sb.h>
+#include "reiserfs.h"
#include <linux/init.h>
#include <linux/proc_fs.h>
diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h
new file mode 100644
index 00000000000..a59d2712633
--- /dev/null
+++ b/fs/reiserfs/reiserfs.h
@@ -0,0 +1,2923 @@
+/*
+ * Copyright 1996, 1997, 1998 Hans Reiser, see reiserfs/README for licensing and copyright details
+ */
+
+#include <linux/reiserfs_fs.h>
+
+#include <linux/slab.h>
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <linux/bug.h>
+#include <linux/workqueue.h>
+#include <asm/unaligned.h>
+#include <linux/bitops.h>
+#include <linux/proc_fs.h>
+#include <linux/buffer_head.h>
+
+/* the 32 bit compat definitions with int argument */
+#define REISERFS_IOC32_UNPACK _IOW(0xCD, 1, int)
+#define REISERFS_IOC32_GETFLAGS FS_IOC32_GETFLAGS
+#define REISERFS_IOC32_SETFLAGS FS_IOC32_SETFLAGS
+#define REISERFS_IOC32_GETVERSION FS_IOC32_GETVERSION
+#define REISERFS_IOC32_SETVERSION FS_IOC32_SETVERSION
+
+struct reiserfs_journal_list;
+
+/** bitmasks for i_flags field in reiserfs-specific part of inode */
+typedef enum {
+ /** this says what format of key do all items (but stat data) of
+ an object have. If this is set, that format is 3.6 otherwise
+ - 3.5 */
+ i_item_key_version_mask = 0x0001,
+ /** If this is unset, object has 3.5 stat data, otherwise, it has
+ 3.6 stat data with 64bit size, 32bit nlink etc. */
+ i_stat_data_version_mask = 0x0002,
+ /** file might need tail packing on close */
+ i_pack_on_close_mask = 0x0004,
+ /** don't pack tail of file */
+ i_nopack_mask = 0x0008,
+ /** If those is set, "safe link" was created for this file during
+ truncate or unlink. Safe link is used to avoid leakage of disk
+ space on crash with some files open, but unlinked. */
+ i_link_saved_unlink_mask = 0x0010,
+ i_link_saved_truncate_mask = 0x0020,
+ i_has_xattr_dir = 0x0040,
+ i_data_log = 0x0080,
+} reiserfs_inode_flags;
+
+struct reiserfs_inode_info {
+ __u32 i_key[4]; /* key is still 4 32 bit integers */
+ /** transient inode flags that are never stored on disk. Bitmasks
+ for this field are defined above. */
+ __u32 i_flags;
+
+ __u32 i_first_direct_byte; // offset of first byte stored in direct item.
+
+ /* copy of persistent inode flags read from sd_attrs. */
+ __u32 i_attrs;
+
+ int i_prealloc_block; /* first unused block of a sequence of unused blocks */
+ int i_prealloc_count; /* length of that sequence */
+ struct list_head i_prealloc_list; /* per-transaction list of inodes which
+ * have preallocated blocks */
+
+ unsigned new_packing_locality:1; /* new_packig_locality is created; new blocks
+ * for the contents of this directory should be
+ * displaced */
+
+ /* we use these for fsync or O_SYNC to decide which transaction
+ ** needs to be committed in order for this inode to be properly
+ ** flushed */
+ unsigned int i_trans_id;
+ struct reiserfs_journal_list *i_jl;
+ atomic_t openers;
+ struct mutex tailpack;
+#ifdef CONFIG_REISERFS_FS_XATTR
+ struct rw_semaphore i_xattr_sem;
+#endif
+ struct inode vfs_inode;
+};
+
+typedef enum {
+ reiserfs_attrs_cleared = 0x00000001,
+} reiserfs_super_block_flags;
+
+/* struct reiserfs_super_block accessors/mutators
+ * since this is a disk structure, it will always be in
+ * little endian format. */
+#define sb_block_count(sbp) (le32_to_cpu((sbp)->s_v1.s_block_count))
+#define set_sb_block_count(sbp,v) ((sbp)->s_v1.s_block_count = cpu_to_le32(v))
+#define sb_free_blocks(sbp) (le32_to_cpu((sbp)->s_v1.s_free_blocks))
+#define set_sb_free_blocks(sbp,v) ((sbp)->s_v1.s_free_blocks = cpu_to_le32(v))
+#define sb_root_block(sbp) (le32_to_cpu((sbp)->s_v1.s_root_block))
+#define set_sb_root_block(sbp,v) ((sbp)->s_v1.s_root_block = cpu_to_le32(v))
+
+#define sb_jp_journal_1st_block(sbp) \
+ (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_1st_block))
+#define set_sb_jp_journal_1st_block(sbp,v) \
+ ((sbp)->s_v1.s_journal.jp_journal_1st_block = cpu_to_le32(v))
+#define sb_jp_journal_dev(sbp) \
+ (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_dev))
+#define set_sb_jp_journal_dev(sbp,v) \
+ ((sbp)->s_v1.s_journal.jp_journal_dev = cpu_to_le32(v))
+#define sb_jp_journal_size(sbp) \
+ (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_size))
+#define set_sb_jp_journal_size(sbp,v) \
+ ((sbp)->s_v1.s_journal.jp_journal_size = cpu_to_le32(v))
+#define sb_jp_journal_trans_max(sbp) \
+ (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_trans_max))
+#define set_sb_jp_journal_trans_max(sbp,v) \
+ ((sbp)->s_v1.s_journal.jp_journal_trans_max = cpu_to_le32(v))
+#define sb_jp_journal_magic(sbp) \
+ (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_magic))
+#define set_sb_jp_journal_magic(sbp,v) \
+ ((sbp)->s_v1.s_journal.jp_journal_magic = cpu_to_le32(v))
+#define sb_jp_journal_max_batch(sbp) \
+ (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_max_batch))
+#define set_sb_jp_journal_max_batch(sbp,v) \
+ ((sbp)->s_v1.s_journal.jp_journal_max_batch = cpu_to_le32(v))
+#define sb_jp_jourmal_max_commit_age(sbp) \
+ (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_max_commit_age))
+#define set_sb_jp_journal_max_commit_age(sbp,v) \
+ ((sbp)->s_v1.s_journal.jp_journal_max_commit_age = cpu_to_le32(v))
+
+#define sb_blocksize(sbp) (le16_to_cpu((sbp)->s_v1.s_blocksize))
+#define set_sb_blocksize(sbp,v) ((sbp)->s_v1.s_blocksize = cpu_to_le16(v))
+#define sb_oid_maxsize(sbp) (le16_to_cpu((sbp)->s_v1.s_oid_maxsize))
+#define set_sb_oid_maxsize(sbp,v) ((sbp)->s_v1.s_oid_maxsize = cpu_to_le16(v))
+#define sb_oid_cursize(sbp) (le16_to_cpu((sbp)->s_v1.s_oid_cursize))
+#define set_sb_oid_cursize(sbp,v) ((sbp)->s_v1.s_oid_cursize = cpu_to_le16(v))
+#define sb_umount_state(sbp) (le16_to_cpu((sbp)->s_v1.s_umount_state))
+#define set_sb_umount_state(sbp,v) ((sbp)->s_v1.s_umount_state = cpu_to_le16(v))
+#define sb_fs_state(sbp) (le16_to_cpu((sbp)->s_v1.s_fs_state))
+#define set_sb_fs_state(sbp,v) ((sbp)->s_v1.s_fs_state = cpu_to_le16(v))
+#define sb_hash_function_code(sbp) \
+ (le32_to_cpu((sbp)->s_v1.s_hash_function_code))
+#define set_sb_hash_function_code(sbp,v) \
+ ((sbp)->s_v1.s_hash_function_code = cpu_to_le32(v))
+#define sb_tree_height(sbp) (le16_to_cpu((sbp)->s_v1.s_tree_height))
+#define set_sb_tree_height(sbp,v) ((sbp)->s_v1.s_tree_height = cpu_to_le16(v))
+#define sb_bmap_nr(sbp) (le16_to_cpu((sbp)->s_v1.s_bmap_nr))
+#define set_sb_bmap_nr(sbp,v) ((sbp)->s_v1.s_bmap_nr = cpu_to_le16(v))
+#define sb_version(sbp) (le16_to_cpu((sbp)->s_v1.s_version))
+#define set_sb_version(sbp,v) ((sbp)->s_v1.s_version = cpu_to_le16(v))
+
+#define sb_mnt_count(sbp) (le16_to_cpu((sbp)->s_mnt_count))
+#define set_sb_mnt_count(sbp, v) ((sbp)->s_mnt_count = cpu_to_le16(v))
+
+#define sb_reserved_for_journal(sbp) \
+ (le16_to_cpu((sbp)->s_v1.s_reserved_for_journal))
+#define set_sb_reserved_for_journal(sbp,v) \
+ ((sbp)->s_v1.s_reserved_for_journal = cpu_to_le16(v))
+
+/* LOGGING -- */
+
+/* These all interelate for performance.
+**
+** If the journal block count is smaller than n transactions, you lose speed.
+** I don't know what n is yet, I'm guessing 8-16.
+**
+** typical transaction size depends on the application, how often fsync is
+** called, and how many metadata blocks you dirty in a 30 second period.
+** The more small files (<16k) you use, the larger your transactions will
+** be.
+**
+** If your journal fills faster than dirty buffers get flushed to disk, it must flush them before allowing the journal
+** to wrap, which slows things down. If you need high speed meta data updates, the journal should be big enough
+** to prevent wrapping before dirty meta blocks get to disk.
+**
+** If the batch max is smaller than the transaction max, you'll waste space at the end of the journal
+** because journal_end sets the next transaction to start at 0 if the next transaction has any chance of wrapping.
+**
+** The large the batch max age, the better the speed, and the more meta data changes you'll lose after a crash.
+**
+*/
+
+/* don't mess with these for a while */
+ /* we have a node size define somewhere in reiserfs_fs.h. -Hans */
+#define JOURNAL_BLOCK_SIZE 4096 /* BUG gotta get rid of this */
+#define JOURNAL_MAX_CNODE 1500 /* max cnodes to allocate. */
+#define JOURNAL_HASH_SIZE 8192
+#define JOURNAL_NUM_BITMAPS 5 /* number of copies of the bitmaps to have floating. Must be >= 2 */
+
+/* One of these for every block in every transaction
+** Each one is in two hash tables. First, a hash of the current transaction, and after journal_end, a
+** hash of all the in memory transactions.
+** next and prev are used by the current transaction (journal_hash).
+** hnext and hprev are used by journal_list_hash. If a block is in more than one transaction, the journal_list_hash
+** links it in multiple times. This allows flush_journal_list to remove just the cnode belonging
+** to a given transaction.
+*/
+struct reiserfs_journal_cnode {
+ struct buffer_head *bh; /* real buffer head */
+ struct super_block *sb; /* dev of real buffer head */
+ __u32 blocknr; /* block number of real buffer head, == 0 when buffer on disk */
+ unsigned long state;
+ struct reiserfs_journal_list *jlist; /* journal list this cnode lives in */
+ struct reiserfs_journal_cnode *next; /* next in transaction list */
+ struct reiserfs_journal_cnode *prev; /* prev in transaction list */
+ struct reiserfs_journal_cnode *hprev; /* prev in hash list */
+ struct reiserfs_journal_cnode *hnext; /* next in hash list */
+};
+
+struct reiserfs_bitmap_node {
+ int id;
+ char *data;
+ struct list_head list;
+};
+
+struct reiserfs_list_bitmap {
+ struct reiserfs_journal_list *journal_list;
+ struct reiserfs_bitmap_node **bitmaps;
+};
+
+/*
+** one of these for each transaction. The most important part here is the j_realblock.
+** this list of cnodes is used to hash all the blocks in all the commits, to mark all the
+** real buffer heads dirty once all the commits hit the disk,
+** and to make sure every real block in a transaction is on disk before allowing the log area
+** to be overwritten */
+struct reiserfs_journal_list {
+ unsigned long j_start;
+ unsigned long j_state;
+ unsigned long j_len;
+ atomic_t j_nonzerolen;
+ atomic_t j_commit_left;
+ atomic_t j_older_commits_done; /* all commits older than this on disk */
+ struct mutex j_commit_mutex;
+ unsigned int j_trans_id;
+ time_t j_timestamp;
+ struct reiserfs_list_bitmap *j_list_bitmap;
+ struct buffer_head *j_commit_bh; /* commit buffer head */
+ struct reiserfs_journal_cnode *j_realblock;
+ struct reiserfs_journal_cnode *j_freedlist; /* list of buffers that were freed during this trans. free each of these on flush */
+ /* time ordered list of all active transactions */
+ struct list_head j_list;
+
+ /* time ordered list of all transactions we haven't tried to flush yet */
+ struct list_head j_working_list;
+
+ /* list of tail conversion targets in need of flush before commit */
+ struct list_head j_tail_bh_list;
+ /* list of data=ordered buffers in need of flush before commit */
+ struct list_head j_bh_list;
+ int j_refcount;
+};
+
+struct reiserfs_journal {
+ struct buffer_head **j_ap_blocks; /* journal blocks on disk */
+ struct reiserfs_journal_cnode *j_last; /* newest journal block */
+ struct reiserfs_journal_cnode *j_first; /* oldest journal block. start here for traverse */
+
+ struct block_device *j_dev_bd;
+ fmode_t j_dev_mode;
+ int j_1st_reserved_block; /* first block on s_dev of reserved area journal */
+
+ unsigned long j_state;
+ unsigned int j_trans_id;
+ unsigned long j_mount_id;
+ unsigned long j_start; /* start of current waiting commit (index into j_ap_blocks) */
+ unsigned long j_len; /* length of current waiting commit */
+ unsigned long j_len_alloc; /* number of buffers requested by journal_begin() */
+ atomic_t j_wcount; /* count of writers for current commit */
+ unsigned long j_bcount; /* batch count. allows turning X transactions into 1 */
+ unsigned long j_first_unflushed_offset; /* first unflushed transactions offset */
+ unsigned j_last_flush_trans_id; /* last fully flushed journal timestamp */
+ struct buffer_head *j_header_bh;
+
+ time_t j_trans_start_time; /* time this transaction started */
+ struct mutex j_mutex;
+ struct mutex j_flush_mutex;
+ wait_queue_head_t j_join_wait; /* wait for current transaction to finish before starting new one */
+ atomic_t j_jlock; /* lock for j_join_wait */
+ int j_list_bitmap_index; /* number of next list bitmap to use */
+ int j_must_wait; /* no more journal begins allowed. MUST sleep on j_join_wait */
+ int j_next_full_flush; /* next journal_end will flush all journal list */
+ int j_next_async_flush; /* next journal_end will flush all async commits */
+
+ int j_cnode_used; /* number of cnodes on the used list */
+ int j_cnode_free; /* number of cnodes on the free list */
+
+ unsigned int j_trans_max; /* max number of blocks in a transaction. */
+ unsigned int j_max_batch; /* max number of blocks to batch into a trans */
+ unsigned int j_max_commit_age; /* in seconds, how old can an async commit be */
+ unsigned int j_max_trans_age; /* in seconds, how old can a transaction be */
+ unsigned int j_default_max_commit_age; /* the default for the max commit age */
+
+ struct reiserfs_journal_cnode *j_cnode_free_list;
+ struct reiserfs_journal_cnode *j_cnode_free_orig; /* orig pointer returned from vmalloc */
+
+ struct reiserfs_journal_list *j_current_jl;
+ int j_free_bitmap_nodes;
+ int j_used_bitmap_nodes;
+
+ int j_num_lists; /* total number of active transactions */
+ int j_num_work_lists; /* number that need attention from kreiserfsd */
+
+ /* debugging to make sure things are flushed in order */
+ unsigned int j_last_flush_id;
+
+ /* debugging to make sure things are committed in order */
+ unsigned int j_last_commit_id;
+
+ struct list_head j_bitmap_nodes;
+ struct list_head j_dirty_buffers;
+ spinlock_t j_dirty_buffers_lock; /* protects j_dirty_buffers */
+
+ /* list of all active transactions */
+ struct list_head j_journal_list;
+ /* lists that haven't been touched by writeback attempts */
+ struct list_head j_working_list;
+
+ struct reiserfs_list_bitmap j_list_bitmap[JOURNAL_NUM_BITMAPS]; /* array of bitmaps to record the deleted blocks */
+ struct reiserfs_journal_cnode *j_hash_table[JOURNAL_HASH_SIZE]; /* hash table for real buffer heads in current trans */
+ struct reiserfs_journal_cnode *j_list_hash_table[JOURNAL_HASH_SIZE]; /* hash table for all the real buffer heads in all
+ the transactions */
+ struct list_head j_prealloc_list; /* list of inodes which have preallocated blocks */
+ int j_persistent_trans;
+ unsigned long j_max_trans_size;
+ unsigned long j_max_batch_size;
+
+ int j_errno;
+
+ /* when flushing ordered buffers, throttle new ordered writers */
+ struct delayed_work j_work;
+ struct super_block *j_work_sb;
+ atomic_t j_async_throttle;
+};
+
+enum journal_state_bits {
+ J_WRITERS_BLOCKED = 1, /* set when new writers not allowed */
+ J_WRITERS_QUEUED, /* set when log is full due to too many writers */
+ J_ABORTED, /* set when log is aborted */
+};
+
+#define JOURNAL_DESC_MAGIC "ReIsErLB" /* ick. magic string to find desc blocks in the journal */
+
+typedef __u32(*hashf_t) (const signed char *, int);
+
+struct reiserfs_bitmap_info {
+ __u32 free_count;
+};
+
+struct proc_dir_entry;
+
+#if defined( CONFIG_PROC_FS ) && defined( CONFIG_REISERFS_PROC_INFO )
+typedef unsigned long int stat_cnt_t;
+typedef struct reiserfs_proc_info_data {
+ spinlock_t lock;
+ int exiting;
+ int max_hash_collisions;
+
+ stat_cnt_t breads;
+ stat_cnt_t bread_miss;
+ stat_cnt_t search_by_key;
+ stat_cnt_t search_by_key_fs_changed;
+ stat_cnt_t search_by_key_restarted;
+
+ stat_cnt_t insert_item_restarted;
+ stat_cnt_t paste_into_item_restarted;
+ stat_cnt_t cut_from_item_restarted;
+ stat_cnt_t delete_solid_item_restarted;
+ stat_cnt_t delete_item_restarted;
+
+ stat_cnt_t leaked_oid;
+ stat_cnt_t leaves_removable;
+
+ /* balances per level. Use explicit 5 as MAX_HEIGHT is not visible yet. */
+ stat_cnt_t balance_at[5]; /* XXX */
+ /* sbk == search_by_key */
+ stat_cnt_t sbk_read_at[5]; /* XXX */
+ stat_cnt_t sbk_fs_changed[5];
+ stat_cnt_t sbk_restarted[5];
+ stat_cnt_t items_at[5]; /* XXX */
+ stat_cnt_t free_at[5]; /* XXX */
+ stat_cnt_t can_node_be_removed[5]; /* XXX */
+ long int lnum[5]; /* XXX */
+ long int rnum[5]; /* XXX */
+ long int lbytes[5]; /* XXX */
+ long int rbytes[5]; /* XXX */
+ stat_cnt_t get_neighbors[5];
+ stat_cnt_t get_neighbors_restart[5];
+ stat_cnt_t need_l_neighbor[5];
+ stat_cnt_t need_r_neighbor[5];
+
+ stat_cnt_t free_block;
+ struct __scan_bitmap_stats {
+ stat_cnt_t call;
+ stat_cnt_t wait;
+ stat_cnt_t bmap;
+ stat_cnt_t retry;
+ stat_cnt_t in_journal_hint;
+ stat_cnt_t in_journal_nohint;
+ stat_cnt_t stolen;
+ } scan_bitmap;
+ struct __journal_stats {
+ stat_cnt_t in_journal;
+ stat_cnt_t in_journal_bitmap;
+ stat_cnt_t in_journal_reusable;
+ stat_cnt_t lock_journal;
+ stat_cnt_t lock_journal_wait;
+ stat_cnt_t journal_being;
+ stat_cnt_t journal_relock_writers;
+ stat_cnt_t journal_relock_wcount;
+ stat_cnt_t mark_dirty;
+ stat_cnt_t mark_dirty_already;
+ stat_cnt_t mark_dirty_notjournal;
+ stat_cnt_t restore_prepared;
+ stat_cnt_t prepare;
+ stat_cnt_t prepare_retry;
+ } journal;
+} reiserfs_proc_info_data_t;
+#else
+typedef struct reiserfs_proc_info_data {
+} reiserfs_proc_info_data_t;
+#endif
+
+/* reiserfs union of in-core super block data */
+struct reiserfs_sb_info {
+ struct buffer_head *s_sbh; /* Buffer containing the super block */
+ /* both the comment and the choice of
+ name are unclear for s_rs -Hans */
+ struct reiserfs_super_block *s_rs; /* Pointer to the super block in the buffer */
+ struct reiserfs_bitmap_info *s_ap_bitmap;
+ struct reiserfs_journal *s_journal; /* pointer to journal information */
+ unsigned short s_mount_state; /* reiserfs state (valid, invalid) */
+
+ /* Serialize writers access, replace the old bkl */
+ struct mutex lock;
+ /* Owner of the lock (can be recursive) */
+ struct task_struct *lock_owner;
+ /* Depth of the lock, start from -1 like the bkl */
+ int lock_depth;
+
+ /* Comment? -Hans */
+ void (*end_io_handler) (struct buffer_head *, int);
+ hashf_t s_hash_function; /* pointer to function which is used
+ to sort names in directory. Set on
+ mount */
+ unsigned long s_mount_opt; /* reiserfs's mount options are set
+ here (currently - NOTAIL, NOLOG,
+ REPLAYONLY) */
+
+ struct { /* This is a structure that describes block allocator options */
+ unsigned long bits; /* Bitfield for enable/disable kind of options */
+ unsigned long large_file_size; /* size started from which we consider file to be a large one(in blocks) */
+ int border; /* percentage of disk, border takes */
+ int preallocmin; /* Minimal file size (in blocks) starting from which we do preallocations */
+ int preallocsize; /* Number of blocks we try to prealloc when file
+ reaches preallocmin size (in blocks) or
+ prealloc_list is empty. */
+ } s_alloc_options;
+
+ /* Comment? -Hans */
+ wait_queue_head_t s_wait;
+ /* To be obsoleted soon by per buffer seals.. -Hans */
+ atomic_t s_generation_counter; // increased by one every time the
+ // tree gets re-balanced
+ unsigned long s_properties; /* File system properties. Currently holds
+ on-disk FS format */
+
+ /* session statistics */
+ int s_disk_reads;
+ int s_disk_writes;
+ int s_fix_nodes;
+ int s_do_balance;
+ int s_unneeded_left_neighbor;
+ int s_good_search_by_key_reada;
+ int s_bmaps;
+ int s_bmaps_without_search;
+ int s_direct2indirect;
+ int s_indirect2direct;
+ /* set up when it's ok for reiserfs_read_inode2() to read from
+ disk inode with nlink==0. Currently this is only used during
+ finish_unfinished() processing at mount time */
+ int s_is_unlinked_ok;
+ reiserfs_proc_info_data_t s_proc_info_data;
+ struct proc_dir_entry *procdir;
+ int reserved_blocks; /* amount of blocks reserved for further allocations */
+ spinlock_t bitmap_lock; /* this lock on now only used to protect reserved_blocks variable */
+ struct dentry *priv_root; /* root of /.reiserfs_priv */
+ struct dentry *xattr_root; /* root of /.reiserfs_priv/xattrs */
+ int j_errno;
+#ifdef CONFIG_QUOTA
+ char *s_qf_names[MAXQUOTAS];
+ int s_jquota_fmt;
+#endif
+ char *s_jdev; /* Stored jdev for mount option showing */
+#ifdef CONFIG_REISERFS_CHECK
+
+ struct tree_balance *cur_tb; /*
+ * Detects whether more than one
+ * copy of tb exists per superblock
+ * as a means of checking whether
+ * do_balance is executing concurrently
+ * against another tree reader/writer
+ * on a same mount point.
+ */
+#endif
+};
+
+/* Definitions of reiserfs on-disk properties: */
+#define REISERFS_3_5 0
+#define REISERFS_3_6 1
+#define REISERFS_OLD_FORMAT 2
+
+enum reiserfs_mount_options {
+/* Mount options */
+ REISERFS_LARGETAIL, /* large tails will be created in a session */
+ REISERFS_SMALLTAIL, /* small (for files less than block size) tails will be created in a session */
+ REPLAYONLY, /* replay journal and return 0. Use by fsck */
+ REISERFS_CONVERT, /* -o conv: causes conversion of old
+ format super block to the new
+ format. If not specified - old
+ partition will be dealt with in a
+ manner of 3.5.x */
+
+/* -o hash={tea, rupasov, r5, detect} is meant for properly mounting
+** reiserfs disks from 3.5.19 or earlier. 99% of the time, this option
+** is not required. If the normal autodection code can't determine which
+** hash to use (because both hashes had the same value for a file)
+** use this option to force a specific hash. It won't allow you to override
+** the existing hash on the FS, so if you have a tea hash disk, and mount
+** with -o hash=rupasov, the mount will fail.
+*/
+ FORCE_TEA_HASH, /* try to force tea hash on mount */
+ FORCE_RUPASOV_HASH, /* try to force rupasov hash on mount */
+ FORCE_R5_HASH, /* try to force rupasov hash on mount */
+ FORCE_HASH_DETECT, /* try to detect hash function on mount */
+
+ REISERFS_DATA_LOG,
+ REISERFS_DATA_ORDERED,
+ REISERFS_DATA_WRITEBACK,
+
+/* used for testing experimental features, makes benchmarking new
+ features with and without more convenient, should never be used by
+ users in any code shipped to users (ideally) */
+
+ REISERFS_NO_BORDER,
+ REISERFS_NO_UNHASHED_RELOCATION,
+ REISERFS_HASHED_RELOCATION,
+ REISERFS_ATTRS,
+ REISERFS_XATTRS_USER,
+ REISERFS_POSIXACL,
+ REISERFS_EXPOSE_PRIVROOT,
+ REISERFS_BARRIER_NONE,
+ REISERFS_BARRIER_FLUSH,
+
+ /* Actions on error */
+ REISERFS_ERROR_PANIC,
+ REISERFS_ERROR_RO,
+ REISERFS_ERROR_CONTINUE,
+
+ REISERFS_USRQUOTA, /* User quota option specified */
+ REISERFS_GRPQUOTA, /* Group quota option specified */
+
+ REISERFS_TEST1,
+ REISERFS_TEST2,
+ REISERFS_TEST3,
+ REISERFS_TEST4,
+ REISERFS_UNSUPPORTED_OPT,
+};
+
+#define reiserfs_r5_hash(s) (REISERFS_SB(s)->s_mount_opt & (1 << FORCE_R5_HASH))
+#define reiserfs_rupasov_hash(s) (REISERFS_SB(s)->s_mount_opt & (1 << FORCE_RUPASOV_HASH))
+#define reiserfs_tea_hash(s) (REISERFS_SB(s)->s_mount_opt & (1 << FORCE_TEA_HASH))
+#define reiserfs_hash_detect(s) (REISERFS_SB(s)->s_mount_opt & (1 << FORCE_HASH_DETECT))
+#define reiserfs_no_border(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_NO_BORDER))
+#define reiserfs_no_unhashed_relocation(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_NO_UNHASHED_RELOCATION))
+#define reiserfs_hashed_relocation(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_HASHED_RELOCATION))
+#define reiserfs_test4(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_TEST4))
+
+#define have_large_tails(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_LARGETAIL))
+#define have_small_tails(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_SMALLTAIL))
+#define replay_only(s) (REISERFS_SB(s)->s_mount_opt & (1 << REPLAYONLY))
+#define reiserfs_attrs(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_ATTRS))
+#define old_format_only(s) (REISERFS_SB(s)->s_properties & (1 << REISERFS_3_5))
+#define convert_reiserfs(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_CONVERT))
+#define reiserfs_data_log(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_DATA_LOG))
+#define reiserfs_data_ordered(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_DATA_ORDERED))
+#define reiserfs_data_writeback(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_DATA_WRITEBACK))
+#define reiserfs_xattrs_user(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_XATTRS_USER))
+#define reiserfs_posixacl(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_POSIXACL))
+#define reiserfs_expose_privroot(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_EXPOSE_PRIVROOT))
+#define reiserfs_xattrs_optional(s) (reiserfs_xattrs_user(s) || reiserfs_posixacl(s))
+#define reiserfs_barrier_none(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_BARRIER_NONE))
+#define reiserfs_barrier_flush(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_BARRIER_FLUSH))
+
+#define reiserfs_error_panic(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_ERROR_PANIC))
+#define reiserfs_error_ro(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_ERROR_RO))
+
+void reiserfs_file_buffer(struct buffer_head *bh, int list);
+extern struct file_system_type reiserfs_fs_type;
+int reiserfs_resize(struct super_block *, unsigned long);
+
+#define CARRY_ON 0
+#define SCHEDULE_OCCURRED 1
+
+#define SB_BUFFER_WITH_SB(s) (REISERFS_SB(s)->s_sbh)
+#define SB_JOURNAL(s) (REISERFS_SB(s)->s_journal)
+#define SB_JOURNAL_1st_RESERVED_BLOCK(s) (SB_JOURNAL(s)->j_1st_reserved_block)
+#define SB_JOURNAL_LEN_FREE(s) (SB_JOURNAL(s)->j_journal_len_free)
+#define SB_AP_BITMAP(s) (REISERFS_SB(s)->s_ap_bitmap)
+
+#define SB_DISK_JOURNAL_HEAD(s) (SB_JOURNAL(s)->j_header_bh->)
+
+/* A safe version of the "bdevname", which returns the "s_id" field of
+ * a superblock or else "Null superblock" if the super block is NULL.
+ */
+static inline char *reiserfs_bdevname(struct super_block *s)
+{
+ return (s == NULL) ? "Null superblock" : s->s_id;
+}
+
+#define reiserfs_is_journal_aborted(journal) (unlikely (__reiserfs_is_journal_aborted (journal)))
+static inline int __reiserfs_is_journal_aborted(struct reiserfs_journal
+ *journal)
+{
+ return test_bit(J_ABORTED, &journal->j_state);
+}
+
+/*
+ * Locking primitives. The write lock is a per superblock
+ * special mutex that has properties close to the Big Kernel Lock
+ * which was used in the previous locking scheme.
+ */
+void reiserfs_write_lock(struct super_block *s);
+void reiserfs_write_unlock(struct super_block *s);
+int reiserfs_write_lock_once(struct super_block *s);
+void reiserfs_write_unlock_once(struct super_block *s, int lock_depth);
+
+#ifdef CONFIG_REISERFS_CHECK
+void reiserfs_lock_check_recursive(struct super_block *s);
+#else
+static inline void reiserfs_lock_check_recursive(struct super_block *s) { }
+#endif
+
+/*
+ * Several mutexes depend on the write lock.
+ * However sometimes we want to relax the write lock while we hold
+ * these mutexes, according to the release/reacquire on schedule()
+ * properties of the Bkl that were used.
+ * Reiserfs performances and locking were based on this scheme.
+ * Now that the write lock is a mutex and not the bkl anymore, doing so
+ * may result in a deadlock:
+ *
+ * A acquire write_lock
+ * A acquire j_commit_mutex
+ * A release write_lock and wait for something
+ * B acquire write_lock
+ * B can't acquire j_commit_mutex and sleep
+ * A can't acquire write lock anymore
+ * deadlock
+ *
+ * What we do here is avoiding such deadlock by playing the same game
+ * than the Bkl: if we can't acquire a mutex that depends on the write lock,
+ * we release the write lock, wait a bit and then retry.
+ *
+ * The mutexes concerned by this hack are:
+ * - The commit mutex of a journal list
+ * - The flush mutex
+ * - The journal lock
+ * - The inode mutex
+ */
+static inline void reiserfs_mutex_lock_safe(struct mutex *m,
+ struct super_block *s)
+{
+ reiserfs_lock_check_recursive(s);
+ reiserfs_write_unlock(s);
+ mutex_lock(m);
+ reiserfs_write_lock(s);
+}
+
+static inline void
+reiserfs_mutex_lock_nested_safe(struct mutex *m, unsigned int subclass,
+ struct super_block *s)
+{
+ reiserfs_lock_check_recursive(s);
+ reiserfs_write_unlock(s);
+ mutex_lock_nested(m, subclass);
+ reiserfs_write_lock(s);
+}
+
+static inline void
+reiserfs_down_read_safe(struct rw_semaphore *sem, struct super_block *s)
+{
+ reiserfs_lock_check_recursive(s);
+ reiserfs_write_unlock(s);
+ down_read(sem);
+ reiserfs_write_lock(s);
+}
+
+/*
+ * When we schedule, we usually want to also release the write lock,
+ * according to the previous bkl based locking scheme of reiserfs.
+ */
+static inline void reiserfs_cond_resched(struct super_block *s)
+{
+ if (need_resched()) {
+ reiserfs_write_unlock(s);
+ schedule();
+ reiserfs_write_lock(s);
+ }
+}
+
+struct fid;
+
+/* in reading the #defines, it may help to understand that they employ
+ the following abbreviations:
+
+ B = Buffer
+ I = Item header
+ H = Height within the tree (should be changed to LEV)
+ N = Number of the item in the node
+ STAT = stat data
+ DEH = Directory Entry Header
+ EC = Entry Count
+ E = Entry number
+ UL = Unsigned Long
+ BLKH = BLocK Header
+ UNFM = UNForMatted node
+ DC = Disk Child
+ P = Path
+
+ These #defines are named by concatenating these abbreviations,
+ where first comes the arguments, and last comes the return value,
+ of the macro.
+
+*/
+
+#define USE_INODE_GENERATION_COUNTER
+
+#define REISERFS_PREALLOCATE
+#define DISPLACE_NEW_PACKING_LOCALITIES
+#define PREALLOCATION_SIZE 9
+
+/* n must be power of 2 */
+#define _ROUND_UP(x,n) (((x)+(n)-1u) & ~((n)-1u))
+
+// to be ok for alpha and others we have to align structures to 8 byte
+// boundary.
+// FIXME: do not change 4 by anything else: there is code which relies on that
+#define ROUND_UP(x) _ROUND_UP(x,8LL)
+
+/* debug levels. Right now, CONFIG_REISERFS_CHECK means print all debug
+** messages.
+*/
+#define REISERFS_DEBUG_CODE 5 /* extra messages to help find/debug errors */
+
+void __reiserfs_warning(struct super_block *s, const char *id,
+ const char *func, const char *fmt, ...);
+#define reiserfs_warning(s, id, fmt, args...) \
+ __reiserfs_warning(s, id, __func__, fmt, ##args)
+/* assertions handling */
+
+/** always check a condition and panic if it's false. */
+#define __RASSERT(cond, scond, format, args...) \
+do { \
+ if (!(cond)) \
+ reiserfs_panic(NULL, "assertion failure", "(" #cond ") at " \
+ __FILE__ ":%i:%s: " format "\n", \
+ in_interrupt() ? -1 : task_pid_nr(current), \
+ __LINE__, __func__ , ##args); \
+} while (0)
+
+#define RASSERT(cond, format, args...) __RASSERT(cond, #cond, format, ##args)
+
+#if defined( CONFIG_REISERFS_CHECK )
+#define RFALSE(cond, format, args...) __RASSERT(!(cond), "!(" #cond ")", format, ##args)
+#else
+#define RFALSE( cond, format, args... ) do {;} while( 0 )
+#endif
+
+#define CONSTF __attribute_const__
+/*
+ * Disk Data Structures
+ */
+
+/***************************************************************************/
+/* SUPER BLOCK */
+/***************************************************************************/
+
+/*
+ * Structure of super block on disk, a version of which in RAM is often accessed as REISERFS_SB(s)->s_rs
+ * the version in RAM is part of a larger structure containing fields never written to disk.
+ */
+#define UNSET_HASH 0 // read_super will guess about, what hash names
+ // in directories were sorted with
+#define TEA_HASH 1
+#define YURA_HASH 2
+#define R5_HASH 3
+#define DEFAULT_HASH R5_HASH
+
+struct journal_params {
+ __le32 jp_journal_1st_block; /* where does journal start from on its
+ * device */
+ __le32 jp_journal_dev; /* journal device st_rdev */
+ __le32 jp_journal_size; /* size of the journal */
+ __le32 jp_journal_trans_max; /* max number of blocks in a transaction. */
+ __le32 jp_journal_magic; /* random value made on fs creation (this
+ * was sb_journal_block_count) */
+ __le32 jp_journal_max_batch; /* max number of blocks to batch into a
+ * trans */
+ __le32 jp_journal_max_commit_age; /* in seconds, how old can an async
+ * commit be */
+ __le32 jp_journal_max_trans_age; /* in seconds, how old can a transaction
+ * be */
+};
+
+/* this is the super from 3.5.X, where X >= 10 */
+struct reiserfs_super_block_v1 {
+ __le32 s_block_count; /* blocks count */
+ __le32 s_free_blocks; /* free blocks count */
+ __le32 s_root_block; /* root block number */
+ struct journal_params s_journal;
+ __le16 s_blocksize; /* block size */
+ __le16 s_oid_maxsize; /* max size of object id array, see
+ * get_objectid() commentary */
+ __le16 s_oid_cursize; /* current size of object id array */
+ __le16 s_umount_state; /* this is set to 1 when filesystem was
+ * umounted, to 2 - when not */
+ char s_magic[10]; /* reiserfs magic string indicates that
+ * file system is reiserfs:
+ * "ReIsErFs" or "ReIsEr2Fs" or "ReIsEr3Fs" */
+ __le16 s_fs_state; /* it is set to used by fsck to mark which
+ * phase of rebuilding is done */
+ __le32 s_hash_function_code; /* indicate, what hash function is being use
+ * to sort names in a directory*/
+ __le16 s_tree_height; /* height of disk tree */
+ __le16 s_bmap_nr; /* amount of bitmap blocks needed to address
+ * each block of file system */
+ __le16 s_version; /* this field is only reliable on filesystem
+ * with non-standard journal */
+ __le16 s_reserved_for_journal; /* size in blocks of journal area on main
+ * device, we need to keep after
+ * making fs with non-standard journal */
+} __attribute__ ((__packed__));
+
+#define SB_SIZE_V1 (sizeof(struct reiserfs_super_block_v1))
+
+/* this is the on disk super block */
+struct reiserfs_super_block {
+ struct reiserfs_super_block_v1 s_v1;
+ __le32 s_inode_generation;
+ __le32 s_flags; /* Right now used only by inode-attributes, if enabled */
+ unsigned char s_uuid[16]; /* filesystem unique identifier */
+ unsigned char s_label[16]; /* filesystem volume label */
+ __le16 s_mnt_count; /* Count of mounts since last fsck */
+ __le16 s_max_mnt_count; /* Maximum mounts before check */
+ __le32 s_lastcheck; /* Timestamp of last fsck */
+ __le32 s_check_interval; /* Interval between checks */
+ char s_unused[76]; /* zero filled by mkreiserfs and
+ * reiserfs_convert_objectid_map_v1()
+ * so any additions must be updated
+ * there as well. */
+} __attribute__ ((__packed__));
+
+#define SB_SIZE (sizeof(struct reiserfs_super_block))
+
+#define REISERFS_VERSION_1 0
+#define REISERFS_VERSION_2 2
+
+// on-disk super block fields converted to cpu form
+#define SB_DISK_SUPER_BLOCK(s) (REISERFS_SB(s)->s_rs)
+#define SB_V1_DISK_SUPER_BLOCK(s) (&(SB_DISK_SUPER_BLOCK(s)->s_v1))
+#define SB_BLOCKSIZE(s) \
+ le32_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_blocksize))
+#define SB_BLOCK_COUNT(s) \
+ le32_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_block_count))
+#define SB_FREE_BLOCKS(s) \
+ le32_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_free_blocks))
+#define SB_REISERFS_MAGIC(s) \
+ (SB_V1_DISK_SUPER_BLOCK(s)->s_magic)
+#define SB_ROOT_BLOCK(s) \
+ le32_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_root_block))
+#define SB_TREE_HEIGHT(s) \
+ le16_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_tree_height))
+#define SB_REISERFS_STATE(s) \
+ le16_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_umount_state))
+#define SB_VERSION(s) le16_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_version))
+#define SB_BMAP_NR(s) le16_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_bmap_nr))
+
+#define PUT_SB_BLOCK_COUNT(s, val) \
+ do { SB_V1_DISK_SUPER_BLOCK(s)->s_block_count = cpu_to_le32(val); } while (0)
+#define PUT_SB_FREE_BLOCKS(s, val) \
+ do { SB_V1_DISK_SUPER_BLOCK(s)->s_free_blocks = cpu_to_le32(val); } while (0)
+#define PUT_SB_ROOT_BLOCK(s, val) \
+ do { SB_V1_DISK_SUPER_BLOCK(s)->s_root_block = cpu_to_le32(val); } while (0)
+#define PUT_SB_TREE_HEIGHT(s, val) \
+ do { SB_V1_DISK_SUPER_BLOCK(s)->s_tree_height = cpu_to_le16(val); } while (0)
+#define PUT_SB_REISERFS_STATE(s, val) \
+ do { SB_V1_DISK_SUPER_BLOCK(s)->s_umount_state = cpu_to_le16(val); } while (0)
+#define PUT_SB_VERSION(s, val) \
+ do { SB_V1_DISK_SUPER_BLOCK(s)->s_version = cpu_to_le16(val); } while (0)
+#define PUT_SB_BMAP_NR(s, val) \
+ do { SB_V1_DISK_SUPER_BLOCK(s)->s_bmap_nr = cpu_to_le16 (val); } while (0)
+
+#define SB_ONDISK_JP(s) (&SB_V1_DISK_SUPER_BLOCK(s)->s_journal)
+#define SB_ONDISK_JOURNAL_SIZE(s) \
+ le32_to_cpu ((SB_ONDISK_JP(s)->jp_journal_size))
+#define SB_ONDISK_JOURNAL_1st_BLOCK(s) \
+ le32_to_cpu ((SB_ONDISK_JP(s)->jp_journal_1st_block))
+#define SB_ONDISK_JOURNAL_DEVICE(s) \
+ le32_to_cpu ((SB_ONDISK_JP(s)->jp_journal_dev))
+#define SB_ONDISK_RESERVED_FOR_JOURNAL(s) \
+ le16_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_reserved_for_journal))
+
+#define is_block_in_log_or_reserved_area(s, block) \
+ block >= SB_JOURNAL_1st_RESERVED_BLOCK(s) \
+ && block < SB_JOURNAL_1st_RESERVED_BLOCK(s) + \
+ ((!is_reiserfs_jr(SB_DISK_SUPER_BLOCK(s)) ? \
+ SB_ONDISK_JOURNAL_SIZE(s) + 1 : SB_ONDISK_RESERVED_FOR_JOURNAL(s)))
+
+int is_reiserfs_3_5(struct reiserfs_super_block *rs);
+int is_reiserfs_3_6(struct reiserfs_super_block *rs);
+int is_reiserfs_jr(struct reiserfs_super_block *rs);
+
+/* ReiserFS leaves the first 64k unused, so that partition labels have
+ enough space. If someone wants to write a fancy bootloader that
+ needs more than 64k, let us know, and this will be increased in size.
+ This number must be larger than than the largest block size on any
+ platform, or code will break. -Hans */
+#define REISERFS_DISK_OFFSET_IN_BYTES (64 * 1024)
+#define REISERFS_FIRST_BLOCK unused_define
+#define REISERFS_JOURNAL_OFFSET_IN_BYTES REISERFS_DISK_OFFSET_IN_BYTES
+
+/* the spot for the super in versions 3.5 - 3.5.10 (inclusive) */
+#define REISERFS_OLD_DISK_OFFSET_IN_BYTES (8 * 1024)
+
+/* reiserfs internal error code (used by search_by_key and fix_nodes)) */
+#define CARRY_ON 0
+#define REPEAT_SEARCH -1
+#define IO_ERROR -2
+#define NO_DISK_SPACE -3
+#define NO_BALANCING_NEEDED (-4)
+#define NO_MORE_UNUSED_CONTIGUOUS_BLOCKS (-5)
+#define QUOTA_EXCEEDED -6
+
+typedef __u32 b_blocknr_t;
+typedef __le32 unp_t;
+
+struct unfm_nodeinfo {
+ unp_t unfm_nodenum;
+ unsigned short unfm_freespace;
+};
+
+/* there are two formats of keys: 3.5 and 3.6
+ */
+#define KEY_FORMAT_3_5 0
+#define KEY_FORMAT_3_6 1
+
+/* there are two stat datas */
+#define STAT_DATA_V1 0
+#define STAT_DATA_V2 1
+
+static inline struct reiserfs_inode_info *REISERFS_I(const struct inode *inode)
+{
+ return container_of(inode, struct reiserfs_inode_info, vfs_inode);
+}
+
+static inline struct reiserfs_sb_info *REISERFS_SB(const struct super_block *sb)
+{
+ return sb->s_fs_info;
+}
+
+/* Don't trust REISERFS_SB(sb)->s_bmap_nr, it's a u16
+ * which overflows on large file systems. */
+static inline __u32 reiserfs_bmap_count(struct super_block *sb)
+{
+ return (SB_BLOCK_COUNT(sb) - 1) / (sb->s_blocksize * 8) + 1;
+}
+
+static inline int bmap_would_wrap(unsigned bmap_nr)
+{
+ return bmap_nr > ((1LL << 16) - 1);
+}
+
+/** this says about version of key of all items (but stat data) the
+ object consists of */
+#define get_inode_item_key_version( inode ) \
+ ((REISERFS_I(inode)->i_flags & i_item_key_version_mask) ? KEY_FORMAT_3_6 : KEY_FORMAT_3_5)
+
+#define set_inode_item_key_version( inode, version ) \
+ ({ if((version)==KEY_FORMAT_3_6) \
+ REISERFS_I(inode)->i_flags |= i_item_key_version_mask; \
+ else \
+ REISERFS_I(inode)->i_flags &= ~i_item_key_version_mask; })
+
+#define get_inode_sd_version(inode) \
+ ((REISERFS_I(inode)->i_flags & i_stat_data_version_mask) ? STAT_DATA_V2 : STAT_DATA_V1)
+
+#define set_inode_sd_version(inode, version) \
+ ({ if((version)==STAT_DATA_V2) \
+ REISERFS_I(inode)->i_flags |= i_stat_data_version_mask; \
+ else \
+ REISERFS_I(inode)->i_flags &= ~i_stat_data_version_mask; })
+
+/* This is an aggressive tail suppression policy, I am hoping it
+ improves our benchmarks. The principle behind it is that percentage
+ space saving is what matters, not absolute space saving. This is
+ non-intuitive, but it helps to understand it if you consider that the
+ cost to access 4 blocks is not much more than the cost to access 1
+ block, if you have to do a seek and rotate. A tail risks a
+ non-linear disk access that is significant as a percentage of total
+ time cost for a 4 block file and saves an amount of space that is
+ less significant as a percentage of space, or so goes the hypothesis.
+ -Hans */
+#define STORE_TAIL_IN_UNFM_S1(n_file_size,n_tail_size,n_block_size) \
+(\
+ (!(n_tail_size)) || \
+ (((n_tail_size) > MAX_DIRECT_ITEM_LEN(n_block_size)) || \
+ ( (n_file_size) >= (n_block_size) * 4 ) || \
+ ( ( (n_file_size) >= (n_block_size) * 3 ) && \
+ ( (n_tail_size) >= (MAX_DIRECT_ITEM_LEN(n_block_size))/4) ) || \
+ ( ( (n_file_size) >= (n_block_size) * 2 ) && \
+ ( (n_tail_size) >= (MAX_DIRECT_ITEM_LEN(n_block_size))/2) ) || \
+ ( ( (n_file_size) >= (n_block_size) ) && \
+ ( (n_tail_size) >= (MAX_DIRECT_ITEM_LEN(n_block_size) * 3)/4) ) ) \
+)
+
+/* Another strategy for tails, this one means only create a tail if all the
+ file would fit into one DIRECT item.
+ Primary intention for this one is to increase performance by decreasing
+ seeking.
+*/
+#define STORE_TAIL_IN_UNFM_S2(n_file_size,n_tail_size,n_block_size) \
+(\
+ (!(n_tail_size)) || \
+ (((n_file_size) > MAX_DIRECT_ITEM_LEN(n_block_size)) ) \
+)
+
+/*
+ * values for s_umount_state field
+ */
+#define REISERFS_VALID_FS 1
+#define REISERFS_ERROR_FS 2
+
+//
+// there are 5 item types currently
+//
+#define TYPE_STAT_DATA 0
+#define TYPE_INDIRECT 1
+#define TYPE_DIRECT 2
+#define TYPE_DIRENTRY 3
+#define TYPE_MAXTYPE 3
+#define TYPE_ANY 15 // FIXME: comment is required
+
+/***************************************************************************/
+/* KEY & ITEM HEAD */
+/***************************************************************************/
+
+//
+// directories use this key as well as old files
+//
+struct offset_v1 {
+ __le32 k_offset;
+ __le32 k_uniqueness;
+} __attribute__ ((__packed__));
+
+struct offset_v2 {
+ __le64 v;
+} __attribute__ ((__packed__));
+
+static inline __u16 offset_v2_k_type(const struct offset_v2 *v2)
+{
+ __u8 type = le64_to_cpu(v2->v) >> 60;
+ return (type <= TYPE_MAXTYPE) ? type : TYPE_ANY;
+}
+
+static inline void set_offset_v2_k_type(struct offset_v2 *v2, int type)
+{
+ v2->v =
+ (v2->v & cpu_to_le64(~0ULL >> 4)) | cpu_to_le64((__u64) type << 60);
+}
+
+static inline loff_t offset_v2_k_offset(const struct offset_v2 *v2)
+{
+ return le64_to_cpu(v2->v) & (~0ULL >> 4);
+}
+
+static inline void set_offset_v2_k_offset(struct offset_v2 *v2, loff_t offset)
+{
+ offset &= (~0ULL >> 4);
+ v2->v = (v2->v & cpu_to_le64(15ULL << 60)) | cpu_to_le64(offset);
+}
+
+/* Key of an item determines its location in the S+tree, and
+ is composed of 4 components */
+struct reiserfs_key {
+ __le32 k_dir_id; /* packing locality: by default parent
+ directory object id */
+ __le32 k_objectid; /* object identifier */
+ union {
+ struct offset_v1 k_offset_v1;
+ struct offset_v2 k_offset_v2;
+ } __attribute__ ((__packed__)) u;
+} __attribute__ ((__packed__));
+
+struct in_core_key {
+ __u32 k_dir_id; /* packing locality: by default parent
+ directory object id */
+ __u32 k_objectid; /* object identifier */
+ __u64 k_offset;
+ __u8 k_type;
+};
+
+struct cpu_key {
+ struct in_core_key on_disk_key;
+ int version;
+ int key_length; /* 3 in all cases but direct2indirect and
+ indirect2direct conversion */
+};
+
+/* Our function for comparing keys can compare keys of different
+ lengths. It takes as a parameter the length of the keys it is to
+ compare. These defines are used in determining what is to be passed
+ to it as that parameter. */
+#define REISERFS_FULL_KEY_LEN 4
+#define REISERFS_SHORT_KEY_LEN 2
+
+/* The result of the key compare */
+#define FIRST_GREATER 1
+#define SECOND_GREATER -1
+#define KEYS_IDENTICAL 0
+#define KEY_FOUND 1
+#define KEY_NOT_FOUND 0
+
+#define KEY_SIZE (sizeof(struct reiserfs_key))
+#define SHORT_KEY_SIZE (sizeof (__u32) + sizeof (__u32))
+
+/* return values for search_by_key and clones */
+#define ITEM_FOUND 1
+#define ITEM_NOT_FOUND 0
+#define ENTRY_FOUND 1
+#define ENTRY_NOT_FOUND 0
+#define DIRECTORY_NOT_FOUND -1
+#define REGULAR_FILE_FOUND -2
+#define DIRECTORY_FOUND -3
+#define BYTE_FOUND 1
+#define BYTE_NOT_FOUND 0
+#define FILE_NOT_FOUND -1
+
+#define POSITION_FOUND 1
+#define POSITION_NOT_FOUND 0
+
+// return values for reiserfs_find_entry and search_by_entry_key
+#define NAME_FOUND 1
+#define NAME_NOT_FOUND 0
+#define GOTO_PREVIOUS_ITEM 2
+#define NAME_FOUND_INVISIBLE 3
+
+/* Everything in the filesystem is stored as a set of items. The
+ item head contains the key of the item, its free space (for
+ indirect items) and specifies the location of the item itself
+ within the block. */
+
+struct item_head {
+ /* Everything in the tree is found by searching for it based on
+ * its key.*/
+ struct reiserfs_key ih_key;
+ union {
+ /* The free space in the last unformatted node of an
+ indirect item if this is an indirect item. This
+ equals 0xFFFF iff this is a direct item or stat data
+ item. Note that the key, not this field, is used to
+ determine the item type, and thus which field this
+ union contains. */
+ __le16 ih_free_space_reserved;
+ /* Iff this is a directory item, this field equals the
+ number of directory entries in the directory item. */
+ __le16 ih_entry_count;
+ } __attribute__ ((__packed__)) u;
+ __le16 ih_item_len; /* total size of the item body */
+ __le16 ih_item_location; /* an offset to the item body
+ * within the block */
+ __le16 ih_version; /* 0 for all old items, 2 for new
+ ones. Highest bit is set by fsck
+ temporary, cleaned after all
+ done */
+} __attribute__ ((__packed__));
+/* size of item header */
+#define IH_SIZE (sizeof(struct item_head))
+
+#define ih_free_space(ih) le16_to_cpu((ih)->u.ih_free_space_reserved)
+#define ih_version(ih) le16_to_cpu((ih)->ih_version)
+#define ih_entry_count(ih) le16_to_cpu((ih)->u.ih_entry_count)
+#define ih_location(ih) le16_to_cpu((ih)->ih_item_location)
+#define ih_item_len(ih) le16_to_cpu((ih)->ih_item_len)
+
+#define put_ih_free_space(ih, val) do { (ih)->u.ih_free_space_reserved = cpu_to_le16(val); } while(0)
+#define put_ih_version(ih, val) do { (ih)->ih_version = cpu_to_le16(val); } while (0)
+#define put_ih_entry_count(ih, val) do { (ih)->u.ih_entry_count = cpu_to_le16(val); } while (0)
+#define put_ih_location(ih, val) do { (ih)->ih_item_location = cpu_to_le16(val); } while (0)
+#define put_ih_item_len(ih, val) do { (ih)->ih_item_len = cpu_to_le16(val); } while (0)
+
+#define unreachable_item(ih) (ih_version(ih) & (1 << 15))
+
+#define get_ih_free_space(ih) (ih_version (ih) == KEY_FORMAT_3_6 ? 0 : ih_free_space (ih))
+#define set_ih_free_space(ih,val) put_ih_free_space((ih), ((ih_version(ih) == KEY_FORMAT_3_6) ? 0 : (val)))
+
+/* these operate on indirect items, where you've got an array of ints
+** at a possibly unaligned location. These are a noop on ia32
+**
+** p is the array of __u32, i is the index into the array, v is the value
+** to store there.
+*/
+#define get_block_num(p, i) get_unaligned_le32((p) + (i))
+#define put_block_num(p, i, v) put_unaligned_le32((v), (p) + (i))
+
+//
+// in old version uniqueness field shows key type
+//
+#define V1_SD_UNIQUENESS 0
+#define V1_INDIRECT_UNIQUENESS 0xfffffffe
+#define V1_DIRECT_UNIQUENESS 0xffffffff
+#define V1_DIRENTRY_UNIQUENESS 500
+#define V1_ANY_UNIQUENESS 555 // FIXME: comment is required
+
+//
+// here are conversion routines
+//
+static inline int uniqueness2type(__u32 uniqueness) CONSTF;
+static inline int uniqueness2type(__u32 uniqueness)
+{
+ switch ((int)uniqueness) {
+ case V1_SD_UNIQUENESS:
+ return TYPE_STAT_DATA;
+ case V1_INDIRECT_UNIQUENESS:
+ return TYPE_INDIRECT;
+ case V1_DIRECT_UNIQUENESS:
+ return TYPE_DIRECT;
+ case V1_DIRENTRY_UNIQUENESS:
+ return TYPE_DIRENTRY;
+ case V1_ANY_UNIQUENESS:
+ default:
+ return TYPE_ANY;
+ }
+}
+
+static inline __u32 type2uniqueness(int type) CONSTF;
+static inline __u32 type2uniqueness(int type)
+{
+ switch (type) {
+ case TYPE_STAT_DATA:
+ return V1_SD_UNIQUENESS;
+ case TYPE_INDIRECT:
+ return V1_INDIRECT_UNIQUENESS;
+ case TYPE_DIRECT:
+ return V1_DIRECT_UNIQUENESS;
+ case TYPE_DIRENTRY:
+ return V1_DIRENTRY_UNIQUENESS;
+ case TYPE_ANY:
+ default:
+ return V1_ANY_UNIQUENESS;
+ }
+}
+
+//
+// key is pointer to on disk key which is stored in le, result is cpu,
+// there is no way to get version of object from key, so, provide
+// version to these defines
+//
+static inline loff_t le_key_k_offset(int version,
+ const struct reiserfs_key *key)
+{
+ return (version == KEY_FORMAT_3_5) ?
+ le32_to_cpu(key->u.k_offset_v1.k_offset) :
+ offset_v2_k_offset(&(key->u.k_offset_v2));
+}
+
+static inline loff_t le_ih_k_offset(const struct item_head *ih)
+{
+ return le_key_k_offset(ih_version(ih), &(ih->ih_key));
+}
+
+static inline loff_t le_key_k_type(int version, const struct reiserfs_key *key)
+{
+ return (version == KEY_FORMAT_3_5) ?
+ uniqueness2type(le32_to_cpu(key->u.k_offset_v1.k_uniqueness)) :
+ offset_v2_k_type(&(key->u.k_offset_v2));
+}
+
+static inline loff_t le_ih_k_type(const struct item_head *ih)
+{
+ return le_key_k_type(ih_version(ih), &(ih->ih_key));
+}
+
+static inline void set_le_key_k_offset(int version, struct reiserfs_key *key,
+ loff_t offset)
+{
+ (version == KEY_FORMAT_3_5) ? (void)(key->u.k_offset_v1.k_offset = cpu_to_le32(offset)) : /* jdm check */
+ (void)(set_offset_v2_k_offset(&(key->u.k_offset_v2), offset));
+}
+
+static inline void set_le_ih_k_offset(struct item_head *ih, loff_t offset)
+{
+ set_le_key_k_offset(ih_version(ih), &(ih->ih_key), offset);
+}
+
+static inline void set_le_key_k_type(int version, struct reiserfs_key *key,
+ int type)
+{
+ (version == KEY_FORMAT_3_5) ?
+ (void)(key->u.k_offset_v1.k_uniqueness =
+ cpu_to_le32(type2uniqueness(type)))
+ : (void)(set_offset_v2_k_type(&(key->u.k_offset_v2), type));
+}
+
+static inline void set_le_ih_k_type(struct item_head *ih, int type)
+{
+ set_le_key_k_type(ih_version(ih), &(ih->ih_key), type);
+}
+
+static inline int is_direntry_le_key(int version, struct reiserfs_key *key)
+{
+ return le_key_k_type(version, key) == TYPE_DIRENTRY;
+}
+
+static inline int is_direct_le_key(int version, struct reiserfs_key *key)
+{
+ return le_key_k_type(version, key) == TYPE_DIRECT;
+}
+
+static inline int is_indirect_le_key(int version, struct reiserfs_key *key)
+{
+ return le_key_k_type(version, key) == TYPE_INDIRECT;
+}
+
+static inline int is_statdata_le_key(int version, struct reiserfs_key *key)
+{
+ return le_key_k_type(version, key) == TYPE_STAT_DATA;
+}
+
+//
+// item header has version.
+//
+static inline int is_direntry_le_ih(struct item_head *ih)
+{
+ return is_direntry_le_key(ih_version(ih), &ih->ih_key);
+}
+
+static inline int is_direct_le_ih(struct item_head *ih)
+{
+ return is_direct_le_key(ih_version(ih), &ih->ih_key);
+}
+
+static inline int is_indirect_le_ih(struct item_head *ih)
+{
+ return is_indirect_le_key(ih_version(ih), &ih->ih_key);
+}
+
+static inline int is_statdata_le_ih(struct item_head *ih)
+{
+ return is_statdata_le_key(ih_version(ih), &ih->ih_key);
+}
+
+//
+// key is pointer to cpu key, result is cpu
+//
+static inline loff_t cpu_key_k_offset(const struct cpu_key *key)
+{
+ return key->on_disk_key.k_offset;
+}
+
+static inline loff_t cpu_key_k_type(const struct cpu_key *key)
+{
+ return key->on_disk_key.k_type;
+}
+
+static inline void set_cpu_key_k_offset(struct cpu_key *key, loff_t offset)
+{
+ key->on_disk_key.k_offset = offset;
+}
+
+static inline void set_cpu_key_k_type(struct cpu_key *key, int type)
+{
+ key->on_disk_key.k_type = type;
+}
+
+static inline void cpu_key_k_offset_dec(struct cpu_key *key)
+{
+ key->on_disk_key.k_offset--;
+}
+
+#define is_direntry_cpu_key(key) (cpu_key_k_type (key) == TYPE_DIRENTRY)
+#define is_direct_cpu_key(key) (cpu_key_k_type (key) == TYPE_DIRECT)
+#define is_indirect_cpu_key(key) (cpu_key_k_type (key) == TYPE_INDIRECT)
+#define is_statdata_cpu_key(key) (cpu_key_k_type (key) == TYPE_STAT_DATA)
+
+/* are these used ? */
+#define is_direntry_cpu_ih(ih) (is_direntry_cpu_key (&((ih)->ih_key)))
+#define is_direct_cpu_ih(ih) (is_direct_cpu_key (&((ih)->ih_key)))
+#define is_indirect_cpu_ih(ih) (is_indirect_cpu_key (&((ih)->ih_key)))
+#define is_statdata_cpu_ih(ih) (is_statdata_cpu_key (&((ih)->ih_key)))
+
+#define I_K_KEY_IN_ITEM(ih, key, n_blocksize) \
+ (!COMP_SHORT_KEYS(ih, key) && \
+ I_OFF_BYTE_IN_ITEM(ih, k_offset(key), n_blocksize))
+
+/* maximal length of item */
+#define MAX_ITEM_LEN(block_size) (block_size - BLKH_SIZE - IH_SIZE)
+#define MIN_ITEM_LEN 1
+
+/* object identifier for root dir */
+#define REISERFS_ROOT_OBJECTID 2
+#define REISERFS_ROOT_PARENT_OBJECTID 1
+
+extern struct reiserfs_key root_key;
+
+/*
+ * Picture represents a leaf of the S+tree
+ * ______________________________________________________
+ * | | Array of | | |
+ * |Block | Object-Item | F r e e | Objects- |
+ * | head | Headers | S p a c e | Items |
+ * |______|_______________|___________________|___________|
+ */
+
+/* Header of a disk block. More precisely, header of a formatted leaf
+ or internal node, and not the header of an unformatted node. */
+struct block_head {
+ __le16 blk_level; /* Level of a block in the tree. */
+ __le16 blk_nr_item; /* Number of keys/items in a block. */
+ __le16 blk_free_space; /* Block free space in bytes. */
+ __le16 blk_reserved;
+ /* dump this in v4/planA */
+ struct reiserfs_key blk_right_delim_key; /* kept only for compatibility */
+};
+
+#define BLKH_SIZE (sizeof(struct block_head))
+#define blkh_level(p_blkh) (le16_to_cpu((p_blkh)->blk_level))
+#define blkh_nr_item(p_blkh) (le16_to_cpu((p_blkh)->blk_nr_item))
+#define blkh_free_space(p_blkh) (le16_to_cpu((p_blkh)->blk_free_space))
+#define blkh_reserved(p_blkh) (le16_to_cpu((p_blkh)->blk_reserved))
+#define set_blkh_level(p_blkh,val) ((p_blkh)->blk_level = cpu_to_le16(val))
+#define set_blkh_nr_item(p_blkh,val) ((p_blkh)->blk_nr_item = cpu_to_le16(val))
+#define set_blkh_free_space(p_blkh,val) ((p_blkh)->blk_free_space = cpu_to_le16(val))
+#define set_blkh_reserved(p_blkh,val) ((p_blkh)->blk_reserved = cpu_to_le16(val))
+#define blkh_right_delim_key(p_blkh) ((p_blkh)->blk_right_delim_key)
+#define set_blkh_right_delim_key(p_blkh,val) ((p_blkh)->blk_right_delim_key = val)
+
+/*
+ * values for blk_level field of the struct block_head
+ */
+
+#define FREE_LEVEL 0 /* when node gets removed from the tree its
+ blk_level is set to FREE_LEVEL. It is then
+ used to see whether the node is still in the
+ tree */
+
+#define DISK_LEAF_NODE_LEVEL 1 /* Leaf node level. */
+
+/* Given the buffer head of a formatted node, resolve to the block head of that node. */
+#define B_BLK_HEAD(bh) ((struct block_head *)((bh)->b_data))
+/* Number of items that are in buffer. */
+#define B_NR_ITEMS(bh) (blkh_nr_item(B_BLK_HEAD(bh)))
+#define B_LEVEL(bh) (blkh_level(B_BLK_HEAD(bh)))
+#define B_FREE_SPACE(bh) (blkh_free_space(B_BLK_HEAD(bh)))
+
+#define PUT_B_NR_ITEMS(bh, val) do { set_blkh_nr_item(B_BLK_HEAD(bh), val); } while (0)
+#define PUT_B_LEVEL(bh, val) do { set_blkh_level(B_BLK_HEAD(bh), val); } while (0)
+#define PUT_B_FREE_SPACE(bh, val) do { set_blkh_free_space(B_BLK_HEAD(bh), val); } while (0)
+
+/* Get right delimiting key. -- little endian */
+#define B_PRIGHT_DELIM_KEY(bh) (&(blk_right_delim_key(B_BLK_HEAD(bh))))
+
+/* Does the buffer contain a disk leaf. */
+#define B_IS_ITEMS_LEVEL(bh) (B_LEVEL(bh) == DISK_LEAF_NODE_LEVEL)
+
+/* Does the buffer contain a disk internal node */
+#define B_IS_KEYS_LEVEL(bh) (B_LEVEL(bh) > DISK_LEAF_NODE_LEVEL \
+ && B_LEVEL(bh) <= MAX_HEIGHT)
+
+/***************************************************************************/
+/* STAT DATA */
+/***************************************************************************/
+
+//
+// old stat data is 32 bytes long. We are going to distinguish new one by
+// different size
+//
+struct stat_data_v1 {
+ __le16 sd_mode; /* file type, permissions */
+ __le16 sd_nlink; /* number of hard links */
+ __le16 sd_uid; /* owner */
+ __le16 sd_gid; /* group */
+ __le32 sd_size; /* file size */
+ __le32 sd_atime; /* time of last access */
+ __le32 sd_mtime; /* time file was last modified */
+ __le32 sd_ctime; /* time inode (stat data) was last changed (except changes to sd_atime and sd_mtime) */
+ union {
+ __le32 sd_rdev;
+ __le32 sd_blocks; /* number of blocks file uses */
+ } __attribute__ ((__packed__)) u;
+ __le32 sd_first_direct_byte; /* first byte of file which is stored
+ in a direct item: except that if it
+ equals 1 it is a symlink and if it
+ equals ~(__u32)0 there is no
+ direct item. The existence of this
+ field really grates on me. Let's
+ replace it with a macro based on
+ sd_size and our tail suppression
+ policy. Someday. -Hans */
+} __attribute__ ((__packed__));
+
+#define SD_V1_SIZE (sizeof(struct stat_data_v1))
+#define stat_data_v1(ih) (ih_version (ih) == KEY_FORMAT_3_5)
+#define sd_v1_mode(sdp) (le16_to_cpu((sdp)->sd_mode))
+#define set_sd_v1_mode(sdp,v) ((sdp)->sd_mode = cpu_to_le16(v))
+#define sd_v1_nlink(sdp) (le16_to_cpu((sdp)->sd_nlink))
+#define set_sd_v1_nlink(sdp,v) ((sdp)->sd_nlink = cpu_to_le16(v))
+#define sd_v1_uid(sdp) (le16_to_cpu((sdp)->sd_uid))
+#define set_sd_v1_uid(sdp,v) ((sdp)->sd_uid = cpu_to_le16(v))
+#define sd_v1_gid(sdp) (le16_to_cpu((sdp)->sd_gid))
+#define set_sd_v1_gid(sdp,v) ((sdp)->sd_gid = cpu_to_le16(v))
+#define sd_v1_size(sdp) (le32_to_cpu((sdp)->sd_size))
+#define set_sd_v1_size(sdp,v) ((sdp)->sd_size = cpu_to_le32(v))
+#define sd_v1_atime(sdp) (le32_to_cpu((sdp)->sd_atime))
+#define set_sd_v1_atime(sdp,v) ((sdp)->sd_atime = cpu_to_le32(v))
+#define sd_v1_mtime(sdp) (le32_to_cpu((sdp)->sd_mtime))
+#define set_sd_v1_mtime(sdp,v) ((sdp)->sd_mtime = cpu_to_le32(v))
+#define sd_v1_ctime(sdp) (le32_to_cpu((sdp)->sd_ctime))
+#define set_sd_v1_ctime(sdp,v) ((sdp)->sd_ctime = cpu_to_le32(v))
+#define sd_v1_rdev(sdp) (le32_to_cpu((sdp)->u.sd_rdev))
+#define set_sd_v1_rdev(sdp,v) ((sdp)->u.sd_rdev = cpu_to_le32(v))
+#define sd_v1_blocks(sdp) (le32_to_cpu((sdp)->u.sd_blocks))
+#define set_sd_v1_blocks(sdp,v) ((sdp)->u.sd_blocks = cpu_to_le32(v))
+#define sd_v1_first_direct_byte(sdp) \
+ (le32_to_cpu((sdp)->sd_first_direct_byte))
+#define set_sd_v1_first_direct_byte(sdp,v) \
+ ((sdp)->sd_first_direct_byte = cpu_to_le32(v))
+
+/* inode flags stored in sd_attrs (nee sd_reserved) */
+
+/* we want common flags to have the same values as in ext2,
+ so chattr(1) will work without problems */
+#define REISERFS_IMMUTABLE_FL FS_IMMUTABLE_FL
+#define REISERFS_APPEND_FL FS_APPEND_FL
+#define REISERFS_SYNC_FL FS_SYNC_FL
+#define REISERFS_NOATIME_FL FS_NOATIME_FL
+#define REISERFS_NODUMP_FL FS_NODUMP_FL
+#define REISERFS_SECRM_FL FS_SECRM_FL
+#define REISERFS_UNRM_FL FS_UNRM_FL
+#define REISERFS_COMPR_FL FS_COMPR_FL
+#define REISERFS_NOTAIL_FL FS_NOTAIL_FL
+
+/* persistent flags that file inherits from the parent directory */
+#define REISERFS_INHERIT_MASK ( REISERFS_IMMUTABLE_FL | \
+ REISERFS_SYNC_FL | \
+ REISERFS_NOATIME_FL | \
+ REISERFS_NODUMP_FL | \
+ REISERFS_SECRM_FL | \
+ REISERFS_COMPR_FL | \
+ REISERFS_NOTAIL_FL )
+
+/* Stat Data on disk (reiserfs version of UFS disk inode minus the
+ address blocks) */
+struct stat_data {
+ __le16 sd_mode; /* file type, permissions */
+ __le16 sd_attrs; /* persistent inode flags */
+ __le32 sd_nlink; /* number of hard links */
+ __le64 sd_size; /* file size */
+ __le32 sd_uid; /* owner */
+ __le32 sd_gid; /* group */
+ __le32 sd_atime; /* time of last access */
+ __le32 sd_mtime; /* time file was last modified */
+ __le32 sd_ctime; /* time inode (stat data) was last changed (except changes to sd_atime and sd_mtime) */
+ __le32 sd_blocks;
+ union {
+ __le32 sd_rdev;
+ __le32 sd_generation;
+ //__le32 sd_first_direct_byte;
+ /* first byte of file which is stored in a
+ direct item: except that if it equals 1
+ it is a symlink and if it equals
+ ~(__u32)0 there is no direct item. The
+ existence of this field really grates
+ on me. Let's replace it with a macro
+ based on sd_size and our tail
+ suppression policy? */
+ } __attribute__ ((__packed__)) u;
+} __attribute__ ((__packed__));
+//
+// this is 44 bytes long
+//
+#define SD_SIZE (sizeof(struct stat_data))
+#define SD_V2_SIZE SD_SIZE
+#define stat_data_v2(ih) (ih_version (ih) == KEY_FORMAT_3_6)
+#define sd_v2_mode(sdp) (le16_to_cpu((sdp)->sd_mode))
+#define set_sd_v2_mode(sdp,v) ((sdp)->sd_mode = cpu_to_le16(v))
+/* sd_reserved */
+/* set_sd_reserved */
+#define sd_v2_nlink(sdp) (le32_to_cpu((sdp)->sd_nlink))
+#define set_sd_v2_nlink(sdp,v) ((sdp)->sd_nlink = cpu_to_le32(v))
+#define sd_v2_size(sdp) (le64_to_cpu((sdp)->sd_size))
+#define set_sd_v2_size(sdp,v) ((sdp)->sd_size = cpu_to_le64(v))
+#define sd_v2_uid(sdp) (le32_to_cpu((sdp)->sd_uid))
+#define set_sd_v2_uid(sdp,v) ((sdp)->sd_uid = cpu_to_le32(v))
+#define sd_v2_gid(sdp) (le32_to_cpu((sdp)->sd_gid))
+#define set_sd_v2_gid(sdp,v) ((sdp)->sd_gid = cpu_to_le32(v))
+#define sd_v2_atime(sdp) (le32_to_cpu((sdp)->sd_atime))
+#define set_sd_v2_atime(sdp,v) ((sdp)->sd_atime = cpu_to_le32(v))
+#define sd_v2_mtime(sdp) (le32_to_cpu((sdp)->sd_mtime))
+#define set_sd_v2_mtime(sdp,v) ((sdp)->sd_mtime = cpu_to_le32(v))
+#define sd_v2_ctime(sdp) (le32_to_cpu((sdp)->sd_ctime))
+#define set_sd_v2_ctime(sdp,v) ((sdp)->sd_ctime = cpu_to_le32(v))
+#define sd_v2_blocks(sdp) (le32_to_cpu((sdp)->sd_blocks))
+#define set_sd_v2_blocks(sdp,v) ((sdp)->sd_blocks = cpu_to_le32(v))
+#define sd_v2_rdev(sdp) (le32_to_cpu((sdp)->u.sd_rdev))
+#define set_sd_v2_rdev(sdp,v) ((sdp)->u.sd_rdev = cpu_to_le32(v))
+#define sd_v2_generation(sdp) (le32_to_cpu((sdp)->u.sd_generation))
+#define set_sd_v2_generation(sdp,v) ((sdp)->u.sd_generation = cpu_to_le32(v))
+#define sd_v2_attrs(sdp) (le16_to_cpu((sdp)->sd_attrs))
+#define set_sd_v2_attrs(sdp,v) ((sdp)->sd_attrs = cpu_to_le16(v))
+
+/***************************************************************************/
+/* DIRECTORY STRUCTURE */
+/***************************************************************************/
+/*
+ Picture represents the structure of directory items
+ ________________________________________________
+ | Array of | | | | | |
+ | directory |N-1| N-2 | .... | 1st |0th|
+ | entry headers | | | | | |
+ |_______________|___|_____|________|_______|___|
+ <---- directory entries ------>
+
+ First directory item has k_offset component 1. We store "." and ".."
+ in one item, always, we never split "." and ".." into differing
+ items. This makes, among other things, the code for removing
+ directories simpler. */
+#define SD_OFFSET 0
+#define SD_UNIQUENESS 0
+#define DOT_OFFSET 1
+#define DOT_DOT_OFFSET 2
+#define DIRENTRY_UNIQUENESS 500
+
+/* */
+#define FIRST_ITEM_OFFSET 1
+
+/*
+ Q: How to get key of object pointed to by entry from entry?
+
+ A: Each directory entry has its header. This header has deh_dir_id and deh_objectid fields, those are key
+ of object, entry points to */
+
+/* NOT IMPLEMENTED:
+ Directory will someday contain stat data of object */
+
+struct reiserfs_de_head {
+ __le32 deh_offset; /* third component of the directory entry key */
+ __le32 deh_dir_id; /* objectid of the parent directory of the object, that is referenced
+ by directory entry */
+ __le32 deh_objectid; /* objectid of the object, that is referenced by directory entry */
+ __le16 deh_location; /* offset of name in the whole item */
+ __le16 deh_state; /* whether 1) entry contains stat data (for future), and 2) whether
+ entry is hidden (unlinked) */
+} __attribute__ ((__packed__));
+#define DEH_SIZE sizeof(struct reiserfs_de_head)
+#define deh_offset(p_deh) (le32_to_cpu((p_deh)->deh_offset))
+#define deh_dir_id(p_deh) (le32_to_cpu((p_deh)->deh_dir_id))
+#define deh_objectid(p_deh) (le32_to_cpu((p_deh)->deh_objectid))
+#define deh_location(p_deh) (le16_to_cpu((p_deh)->deh_location))
+#define deh_state(p_deh) (le16_to_cpu((p_deh)->deh_state))
+
+#define put_deh_offset(p_deh,v) ((p_deh)->deh_offset = cpu_to_le32((v)))
+#define put_deh_dir_id(p_deh,v) ((p_deh)->deh_dir_id = cpu_to_le32((v)))
+#define put_deh_objectid(p_deh,v) ((p_deh)->deh_objectid = cpu_to_le32((v)))
+#define put_deh_location(p_deh,v) ((p_deh)->deh_location = cpu_to_le16((v)))
+#define put_deh_state(p_deh,v) ((p_deh)->deh_state = cpu_to_le16((v)))
+
+/* empty directory contains two entries "." and ".." and their headers */
+#define EMPTY_DIR_SIZE \
+(DEH_SIZE * 2 + ROUND_UP (strlen (".")) + ROUND_UP (strlen ("..")))
+
+/* old format directories have this size when empty */
+#define EMPTY_DIR_SIZE_V1 (DEH_SIZE * 2 + 3)
+
+#define DEH_Statdata 0 /* not used now */
+#define DEH_Visible 2
+
+/* 64 bit systems (and the S/390) need to be aligned explicitly -jdm */
+#if BITS_PER_LONG == 64 || defined(__s390__) || defined(__hppa__)
+# define ADDR_UNALIGNED_BITS (3)
+#endif
+
+/* These are only used to manipulate deh_state.
+ * Because of this, we'll use the ext2_ bit routines,
+ * since they are little endian */
+#ifdef ADDR_UNALIGNED_BITS
+
+# define aligned_address(addr) ((void *)((long)(addr) & ~((1UL << ADDR_UNALIGNED_BITS) - 1)))
+# define unaligned_offset(addr) (((int)((long)(addr) & ((1 << ADDR_UNALIGNED_BITS) - 1))) << 3)
+
+# define set_bit_unaligned(nr, addr) \
+ __test_and_set_bit_le((nr) + unaligned_offset(addr), aligned_address(addr))
+# define clear_bit_unaligned(nr, addr) \
+ __test_and_clear_bit_le((nr) + unaligned_offset(addr), aligned_address(addr))
+# define test_bit_unaligned(nr, addr) \
+ test_bit_le((nr) + unaligned_offset(addr), aligned_address(addr))
+
+#else
+
+# define set_bit_unaligned(nr, addr) __test_and_set_bit_le(nr, addr)
+# define clear_bit_unaligned(nr, addr) __test_and_clear_bit_le(nr, addr)
+# define test_bit_unaligned(nr, addr) test_bit_le(nr, addr)
+
+#endif
+
+#define mark_de_with_sd(deh) set_bit_unaligned (DEH_Statdata, &((deh)->deh_state))
+#define mark_de_without_sd(deh) clear_bit_unaligned (DEH_Statdata, &((deh)->deh_state))
+#define mark_de_visible(deh) set_bit_unaligned (DEH_Visible, &((deh)->deh_state))
+#define mark_de_hidden(deh) clear_bit_unaligned (DEH_Visible, &((deh)->deh_state))
+
+#define de_with_sd(deh) test_bit_unaligned (DEH_Statdata, &((deh)->deh_state))
+#define de_visible(deh) test_bit_unaligned (DEH_Visible, &((deh)->deh_state))
+#define de_hidden(deh) !test_bit_unaligned (DEH_Visible, &((deh)->deh_state))
+
+extern void make_empty_dir_item_v1(char *body, __le32 dirid, __le32 objid,
+ __le32 par_dirid, __le32 par_objid);
+extern void make_empty_dir_item(char *body, __le32 dirid, __le32 objid,
+ __le32 par_dirid, __le32 par_objid);
+
+/* array of the entry headers */
+ /* get item body */
+#define B_I_PITEM(bh,ih) ( (bh)->b_data + ih_location(ih) )
+#define B_I_DEH(bh,ih) ((struct reiserfs_de_head *)(B_I_PITEM(bh,ih)))
+
+/* length of the directory entry in directory item. This define
+ calculates length of i-th directory entry using directory entry
+ locations from dir entry head. When it calculates length of 0-th
+ directory entry, it uses length of whole item in place of entry
+ location of the non-existent following entry in the calculation.
+ See picture above.*/
+/*
+#define I_DEH_N_ENTRY_LENGTH(ih,deh,i) \
+((i) ? (deh_location((deh)-1) - deh_location((deh))) : (ih_item_len((ih)) - deh_location((deh))))
+*/
+static inline int entry_length(const struct buffer_head *bh,
+ const struct item_head *ih, int pos_in_item)
+{
+ struct reiserfs_de_head *deh;
+
+ deh = B_I_DEH(bh, ih) + pos_in_item;
+ if (pos_in_item)
+ return deh_location(deh - 1) - deh_location(deh);
+
+ return ih_item_len(ih) - deh_location(deh);
+}
+
+/* number of entries in the directory item, depends on ENTRY_COUNT being at the start of directory dynamic data. */
+#define I_ENTRY_COUNT(ih) (ih_entry_count((ih)))
+
+/* name by bh, ih and entry_num */
+#define B_I_E_NAME(bh,ih,entry_num) ((char *)(bh->b_data + ih_location(ih) + deh_location(B_I_DEH(bh,ih)+(entry_num))))
+
+// two entries per block (at least)
+#define REISERFS_MAX_NAME(block_size) 255
+
+/* this structure is used for operations on directory entries. It is
+ not a disk structure. */
+/* When reiserfs_find_entry or search_by_entry_key find directory
+ entry, they return filled reiserfs_dir_entry structure */
+struct reiserfs_dir_entry {
+ struct buffer_head *de_bh;
+ int de_item_num;
+ struct item_head *de_ih;
+ int de_entry_num;
+ struct reiserfs_de_head *de_deh;
+ int de_entrylen;
+ int de_namelen;
+ char *de_name;
+ unsigned long *de_gen_number_bit_string;
+
+ __u32 de_dir_id;
+ __u32 de_objectid;
+
+ struct cpu_key de_entry_key;
+};
+
+/* these defines are useful when a particular member of a reiserfs_dir_entry is needed */
+
+/* pointer to file name, stored in entry */
+#define B_I_DEH_ENTRY_FILE_NAME(bh,ih,deh) (B_I_PITEM (bh, ih) + deh_location(deh))
+
+/* length of name */
+#define I_DEH_N_ENTRY_FILE_NAME_LENGTH(ih,deh,entry_num) \
+(I_DEH_N_ENTRY_LENGTH (ih, deh, entry_num) - (de_with_sd (deh) ? SD_SIZE : 0))
+
+/* hash value occupies bits from 7 up to 30 */
+#define GET_HASH_VALUE(offset) ((offset) & 0x7fffff80LL)
+/* generation number occupies 7 bits starting from 0 up to 6 */
+#define GET_GENERATION_NUMBER(offset) ((offset) & 0x7fLL)
+#define MAX_GENERATION_NUMBER 127
+
+#define SET_GENERATION_NUMBER(offset,gen_number) (GET_HASH_VALUE(offset)|(gen_number))
+
+/*
+ * Picture represents an internal node of the reiserfs tree
+ * ______________________________________________________
+ * | | Array of | Array of | Free |
+ * |block | keys | pointers | space |
+ * | head | N | N+1 | |
+ * |______|_______________|___________________|___________|
+ */
+
+/***************************************************************************/
+/* DISK CHILD */
+/***************************************************************************/
+/* Disk child pointer: The pointer from an internal node of the tree
+ to a node that is on disk. */
+struct disk_child {
+ __le32 dc_block_number; /* Disk child's block number. */
+ __le16 dc_size; /* Disk child's used space. */
+ __le16 dc_reserved;
+};
+
+#define DC_SIZE (sizeof(struct disk_child))
+#define dc_block_number(dc_p) (le32_to_cpu((dc_p)->dc_block_number))
+#define dc_size(dc_p) (le16_to_cpu((dc_p)->dc_size))
+#define put_dc_block_number(dc_p, val) do { (dc_p)->dc_block_number = cpu_to_le32(val); } while(0)
+#define put_dc_size(dc_p, val) do { (dc_p)->dc_size = cpu_to_le16(val); } while(0)
+
+/* Get disk child by buffer header and position in the tree node. */
+#define B_N_CHILD(bh, n_pos) ((struct disk_child *)\
+((bh)->b_data + BLKH_SIZE + B_NR_ITEMS(bh) * KEY_SIZE + DC_SIZE * (n_pos)))
+
+/* Get disk child number by buffer header and position in the tree node. */
+#define B_N_CHILD_NUM(bh, n_pos) (dc_block_number(B_N_CHILD(bh, n_pos)))
+#define PUT_B_N_CHILD_NUM(bh, n_pos, val) \
+ (put_dc_block_number(B_N_CHILD(bh, n_pos), val))
+
+ /* maximal value of field child_size in structure disk_child */
+ /* child size is the combined size of all items and their headers */
+#define MAX_CHILD_SIZE(bh) ((int)( (bh)->b_size - BLKH_SIZE ))
+
+/* amount of used space in buffer (not including block head) */
+#define B_CHILD_SIZE(cur) (MAX_CHILD_SIZE(cur)-(B_FREE_SPACE(cur)))
+
+/* max and min number of keys in internal node */
+#define MAX_NR_KEY(bh) ( (MAX_CHILD_SIZE(bh)-DC_SIZE)/(KEY_SIZE+DC_SIZE) )
+#define MIN_NR_KEY(bh) (MAX_NR_KEY(bh)/2)
+
+/***************************************************************************/
+/* PATH STRUCTURES AND DEFINES */
+/***************************************************************************/
+
+/* Search_by_key fills up the path from the root to the leaf as it descends the tree looking for the
+ key. It uses reiserfs_bread to try to find buffers in the cache given their block number. If it
+ does not find them in the cache it reads them from disk. For each node search_by_key finds using
+ reiserfs_bread it then uses bin_search to look through that node. bin_search will find the
+ position of the block_number of the next node if it is looking through an internal node. If it
+ is looking through a leaf node bin_search will find the position of the item which has key either
+ equal to given key, or which is the maximal key less than the given key. */
+
+struct path_element {
+ struct buffer_head *pe_buffer; /* Pointer to the buffer at the path in the tree. */
+ int pe_position; /* Position in the tree node which is placed in the */
+ /* buffer above. */
+};
+
+#define MAX_HEIGHT 5 /* maximal height of a tree. don't change this without changing JOURNAL_PER_BALANCE_CNT */
+#define EXTENDED_MAX_HEIGHT 7 /* Must be equals MAX_HEIGHT + FIRST_PATH_ELEMENT_OFFSET */
+#define FIRST_PATH_ELEMENT_OFFSET 2 /* Must be equal to at least 2. */
+
+#define ILLEGAL_PATH_ELEMENT_OFFSET 1 /* Must be equal to FIRST_PATH_ELEMENT_OFFSET - 1 */
+#define MAX_FEB_SIZE 6 /* this MUST be MAX_HEIGHT + 1. See about FEB below */
+
+/* We need to keep track of who the ancestors of nodes are. When we
+ perform a search we record which nodes were visited while
+ descending the tree looking for the node we searched for. This list
+ of nodes is called the path. This information is used while
+ performing balancing. Note that this path information may become
+ invalid, and this means we must check it when using it to see if it
+ is still valid. You'll need to read search_by_key and the comments
+ in it, especially about decrement_counters_in_path(), to understand
+ this structure.
+
+Paths make the code so much harder to work with and debug.... An
+enormous number of bugs are due to them, and trying to write or modify
+code that uses them just makes my head hurt. They are based on an
+excessive effort to avoid disturbing the precious VFS code.:-( The
+gods only know how we are going to SMP the code that uses them.
+znodes are the way! */
+
+#define PATH_READA 0x1 /* do read ahead */
+#define PATH_READA_BACK 0x2 /* read backwards */
+
+struct treepath {
+ int path_length; /* Length of the array above. */
+ int reada;
+ struct path_element path_elements[EXTENDED_MAX_HEIGHT]; /* Array of the path elements. */
+ int pos_in_item;
+};
+
+#define pos_in_item(path) ((path)->pos_in_item)
+
+#define INITIALIZE_PATH(var) \
+struct treepath var = {.path_length = ILLEGAL_PATH_ELEMENT_OFFSET, .reada = 0,}
+
+/* Get path element by path and path position. */
+#define PATH_OFFSET_PELEMENT(path, n_offset) ((path)->path_elements + (n_offset))
+
+/* Get buffer header at the path by path and path position. */
+#define PATH_OFFSET_PBUFFER(path, n_offset) (PATH_OFFSET_PELEMENT(path, n_offset)->pe_buffer)
+
+/* Get position in the element at the path by path and path position. */
+#define PATH_OFFSET_POSITION(path, n_offset) (PATH_OFFSET_PELEMENT(path, n_offset)->pe_position)
+
+#define PATH_PLAST_BUFFER(path) (PATH_OFFSET_PBUFFER((path), (path)->path_length))
+ /* you know, to the person who didn't
+ write this the macro name does not
+ at first suggest what it does.
+ Maybe POSITION_FROM_PATH_END? Or
+ maybe we should just focus on
+ dumping paths... -Hans */
+#define PATH_LAST_POSITION(path) (PATH_OFFSET_POSITION((path), (path)->path_length))
+
+#define PATH_PITEM_HEAD(path) B_N_PITEM_HEAD(PATH_PLAST_BUFFER(path), PATH_LAST_POSITION(path))
+
+/* in do_balance leaf has h == 0 in contrast with path structure,
+ where root has level == 0. That is why we need these defines */
+#define PATH_H_PBUFFER(path, h) PATH_OFFSET_PBUFFER (path, path->path_length - (h)) /* tb->S[h] */
+#define PATH_H_PPARENT(path, h) PATH_H_PBUFFER (path, (h) + 1) /* tb->F[h] or tb->S[0]->b_parent */
+#define PATH_H_POSITION(path, h) PATH_OFFSET_POSITION (path, path->path_length - (h))
+#define PATH_H_B_ITEM_ORDER(path, h) PATH_H_POSITION(path, h + 1) /* tb->S[h]->b_item_order */
+
+#define PATH_H_PATH_OFFSET(path, n_h) ((path)->path_length - (n_h))
+
+#define get_last_bh(path) PATH_PLAST_BUFFER(path)
+#define get_ih(path) PATH_PITEM_HEAD(path)
+#define get_item_pos(path) PATH_LAST_POSITION(path)
+#define get_item(path) ((void *)B_N_PITEM(PATH_PLAST_BUFFER(path), PATH_LAST_POSITION (path)))
+#define item_moved(ih,path) comp_items(ih, path)
+#define path_changed(ih,path) comp_items (ih, path)
+
+/***************************************************************************/
+/* MISC */
+/***************************************************************************/
+
+/* Size of pointer to the unformatted node. */
+#define UNFM_P_SIZE (sizeof(unp_t))
+#define UNFM_P_SHIFT 2
+
+// in in-core inode key is stored on le form
+#define INODE_PKEY(inode) ((struct reiserfs_key *)(REISERFS_I(inode)->i_key))
+
+#define MAX_UL_INT 0xffffffff
+#define MAX_INT 0x7ffffff
+#define MAX_US_INT 0xffff
+
+// reiserfs version 2 has max offset 60 bits. Version 1 - 32 bit offset
+#define U32_MAX (~(__u32)0)
+
+static inline loff_t max_reiserfs_offset(struct inode *inode)
+{
+ if (get_inode_item_key_version(inode) == KEY_FORMAT_3_5)
+ return (loff_t) U32_MAX;
+
+ return (loff_t) ((~(__u64) 0) >> 4);
+}
+
+/*#define MAX_KEY_UNIQUENESS MAX_UL_INT*/
+#define MAX_KEY_OBJECTID MAX_UL_INT
+
+#define MAX_B_NUM MAX_UL_INT
+#define MAX_FC_NUM MAX_US_INT
+
+/* the purpose is to detect overflow of an unsigned short */
+#define REISERFS_LINK_MAX (MAX_US_INT - 1000)
+
+/* The following defines are used in reiserfs_insert_item and reiserfs_append_item */
+#define REISERFS_KERNEL_MEM 0 /* reiserfs kernel memory mode */
+#define REISERFS_USER_MEM 1 /* reiserfs user memory mode */
+
+#define fs_generation(s) (REISERFS_SB(s)->s_generation_counter)
+#define get_generation(s) atomic_read (&fs_generation(s))
+#define FILESYSTEM_CHANGED_TB(tb) (get_generation((tb)->tb_sb) != (tb)->fs_gen)
+#define __fs_changed(gen,s) (gen != get_generation (s))
+#define fs_changed(gen,s) \
+({ \
+ reiserfs_cond_resched(s); \
+ __fs_changed(gen, s); \
+})
+
+/***************************************************************************/
+/* FIXATE NODES */
+/***************************************************************************/
+
+#define VI_TYPE_LEFT_MERGEABLE 1
+#define VI_TYPE_RIGHT_MERGEABLE 2
+
+/* To make any changes in the tree we always first find node, that
+ contains item to be changed/deleted or place to insert a new
+ item. We call this node S. To do balancing we need to decide what
+ we will shift to left/right neighbor, or to a new node, where new
+ item will be etc. To make this analysis simpler we build virtual
+ node. Virtual node is an array of items, that will replace items of
+ node S. (For instance if we are going to delete an item, virtual
+ node does not contain it). Virtual node keeps information about
+ item sizes and types, mergeability of first and last items, sizes
+ of all entries in directory item. We use this array of items when
+ calculating what we can shift to neighbors and how many nodes we
+ have to have if we do not any shiftings, if we shift to left/right
+ neighbor or to both. */
+struct virtual_item {
+ int vi_index; // index in the array of item operations
+ unsigned short vi_type; // left/right mergeability
+ unsigned short vi_item_len; /* length of item that it will have after balancing */
+ struct item_head *vi_ih;
+ const char *vi_item; // body of item (old or new)
+ const void *vi_new_data; // 0 always but paste mode
+ void *vi_uarea; // item specific area
+};
+
+struct virtual_node {
+ char *vn_free_ptr; /* this is a pointer to the free space in the buffer */
+ unsigned short vn_nr_item; /* number of items in virtual node */
+ short vn_size; /* size of node , that node would have if it has unlimited size and no balancing is performed */
+ short vn_mode; /* mode of balancing (paste, insert, delete, cut) */
+ short vn_affected_item_num;
+ short vn_pos_in_item;
+ struct item_head *vn_ins_ih; /* item header of inserted item, 0 for other modes */
+ const void *vn_data;
+ struct virtual_item *vn_vi; /* array of items (including a new one, excluding item to be deleted) */
+};
+
+/* used by directory items when creating virtual nodes */
+struct direntry_uarea {
+ int flags;
+ __u16 entry_count;
+ __u16 entry_sizes[1];
+} __attribute__ ((__packed__));
+
+/***************************************************************************/
+/* TREE BALANCE */
+/***************************************************************************/
+
+/* This temporary structure is used in tree balance algorithms, and
+ constructed as we go to the extent that its various parts are
+ needed. It contains arrays of nodes that can potentially be
+ involved in the balancing of node S, and parameters that define how
+ each of the nodes must be balanced. Note that in these algorithms
+ for balancing the worst case is to need to balance the current node
+ S and the left and right neighbors and all of their parents plus
+ create a new node. We implement S1 balancing for the leaf nodes
+ and S0 balancing for the internal nodes (S1 and S0 are defined in
+ our papers.)*/
+
+#define MAX_FREE_BLOCK 7 /* size of the array of buffers to free at end of do_balance */
+
+/* maximum number of FEB blocknrs on a single level */
+#define MAX_AMOUNT_NEEDED 2
+
+/* someday somebody will prefix every field in this struct with tb_ */
+struct tree_balance {
+ int tb_mode;
+ int need_balance_dirty;
+ struct super_block *tb_sb;
+ struct reiserfs_transaction_handle *transaction_handle;
+ struct treepath *tb_path;
+ struct buffer_head *L[MAX_HEIGHT]; /* array of left neighbors of nodes in the path */
+ struct buffer_head *R[MAX_HEIGHT]; /* array of right neighbors of nodes in the path */
+ struct buffer_head *FL[MAX_HEIGHT]; /* array of fathers of the left neighbors */
+ struct buffer_head *FR[MAX_HEIGHT]; /* array of fathers of the right neighbors */
+ struct buffer_head *CFL[MAX_HEIGHT]; /* array of common parents of center node and its left neighbor */
+ struct buffer_head *CFR[MAX_HEIGHT]; /* array of common parents of center node and its right neighbor */
+
+ struct buffer_head *FEB[MAX_FEB_SIZE]; /* array of empty buffers. Number of buffers in array equals
+ cur_blknum. */
+ struct buffer_head *used[MAX_FEB_SIZE];
+ struct buffer_head *thrown[MAX_FEB_SIZE];
+ int lnum[MAX_HEIGHT]; /* array of number of items which must be
+ shifted to the left in order to balance the
+ current node; for leaves includes item that
+ will be partially shifted; for internal
+ nodes, it is the number of child pointers
+ rather than items. It includes the new item
+ being created. The code sometimes subtracts
+ one to get the number of wholly shifted
+ items for other purposes. */
+ int rnum[MAX_HEIGHT]; /* substitute right for left in comment above */
+ int lkey[MAX_HEIGHT]; /* array indexed by height h mapping the key delimiting L[h] and
+ S[h] to its item number within the node CFL[h] */
+ int rkey[MAX_HEIGHT]; /* substitute r for l in comment above */
+ int insert_size[MAX_HEIGHT]; /* the number of bytes by we are trying to add or remove from
+ S[h]. A negative value means removing. */
+ int blknum[MAX_HEIGHT]; /* number of nodes that will replace node S[h] after
+ balancing on the level h of the tree. If 0 then S is
+ being deleted, if 1 then S is remaining and no new nodes
+ are being created, if 2 or 3 then 1 or 2 new nodes is
+ being created */
+
+ /* fields that are used only for balancing leaves of the tree */
+ int cur_blknum; /* number of empty blocks having been already allocated */
+ int s0num; /* number of items that fall into left most node when S[0] splits */
+ int s1num; /* number of items that fall into first new node when S[0] splits */
+ int s2num; /* number of items that fall into second new node when S[0] splits */
+ int lbytes; /* number of bytes which can flow to the left neighbor from the left */
+ /* most liquid item that cannot be shifted from S[0] entirely */
+ /* if -1 then nothing will be partially shifted */
+ int rbytes; /* number of bytes which will flow to the right neighbor from the right */
+ /* most liquid item that cannot be shifted from S[0] entirely */
+ /* if -1 then nothing will be partially shifted */
+ int s1bytes; /* number of bytes which flow to the first new node when S[0] splits */
+ /* note: if S[0] splits into 3 nodes, then items do not need to be cut */
+ int s2bytes;
+ struct buffer_head *buf_to_free[MAX_FREE_BLOCK]; /* buffers which are to be freed after do_balance finishes by unfix_nodes */
+ char *vn_buf; /* kmalloced memory. Used to create
+ virtual node and keep map of
+ dirtied bitmap blocks */
+ int vn_buf_size; /* size of the vn_buf */
+ struct virtual_node *tb_vn; /* VN starts after bitmap of bitmap blocks */
+
+ int fs_gen; /* saved value of `reiserfs_generation' counter
+ see FILESYSTEM_CHANGED() macro in reiserfs_fs.h */
+#ifdef DISPLACE_NEW_PACKING_LOCALITIES
+ struct in_core_key key; /* key pointer, to pass to block allocator or
+ another low-level subsystem */
+#endif
+};
+
+/* These are modes of balancing */
+
+/* When inserting an item. */
+#define M_INSERT 'i'
+/* When inserting into (directories only) or appending onto an already
+ existent item. */
+#define M_PASTE 'p'
+/* When deleting an item. */
+#define M_DELETE 'd'
+/* When truncating an item or removing an entry from a (directory) item. */
+#define M_CUT 'c'
+
+/* used when balancing on leaf level skipped (in reiserfsck) */
+#define M_INTERNAL 'n'
+
+/* When further balancing is not needed, then do_balance does not need
+ to be called. */
+#define M_SKIP_BALANCING 's'
+#define M_CONVERT 'v'
+
+/* modes of leaf_move_items */
+#define LEAF_FROM_S_TO_L 0
+#define LEAF_FROM_S_TO_R 1
+#define LEAF_FROM_R_TO_L 2
+#define LEAF_FROM_L_TO_R 3
+#define LEAF_FROM_S_TO_SNEW 4
+
+#define FIRST_TO_LAST 0
+#define LAST_TO_FIRST 1
+
+/* used in do_balance for passing parent of node information that has
+ been gotten from tb struct */
+struct buffer_info {
+ struct tree_balance *tb;
+ struct buffer_head *bi_bh;
+ struct buffer_head *bi_parent;
+ int bi_position;
+};
+
+static inline struct super_block *sb_from_tb(struct tree_balance *tb)
+{
+ return tb ? tb->tb_sb : NULL;
+}
+
+static inline struct super_block *sb_from_bi(struct buffer_info *bi)
+{
+ return bi ? sb_from_tb(bi->tb) : NULL;
+}
+
+/* there are 4 types of items: stat data, directory item, indirect, direct.
++-------------------+------------+--------------+------------+
+| | k_offset | k_uniqueness | mergeable? |
++-------------------+------------+--------------+------------+
+| stat data | 0 | 0 | no |
++-------------------+------------+--------------+------------+
+| 1st directory item| DOT_OFFSET |DIRENTRY_UNIQUENESS| no |
+| non 1st directory | hash value | | yes |
+| item | | | |
++-------------------+------------+--------------+------------+
+| indirect item | offset + 1 |TYPE_INDIRECT | if this is not the first indirect item of the object
++-------------------+------------+--------------+------------+
+| direct item | offset + 1 |TYPE_DIRECT | if not this is not the first direct item of the object
++-------------------+------------+--------------+------------+
+*/
+
+struct item_operations {
+ int (*bytes_number) (struct item_head * ih, int block_size);
+ void (*decrement_key) (struct cpu_key *);
+ int (*is_left_mergeable) (struct reiserfs_key * ih,
+ unsigned long bsize);
+ void (*print_item) (struct item_head *, char *item);
+ void (*check_item) (struct item_head *, char *item);
+
+ int (*create_vi) (struct virtual_node * vn, struct virtual_item * vi,
+ int is_affected, int insert_size);
+ int (*check_left) (struct virtual_item * vi, int free,
+ int start_skip, int end_skip);
+ int (*check_right) (struct virtual_item * vi, int free);
+ int (*part_size) (struct virtual_item * vi, int from, int to);
+ int (*unit_num) (struct virtual_item * vi);
+ void (*print_vi) (struct virtual_item * vi);
+};
+
+extern struct item_operations *item_ops[TYPE_ANY + 1];
+
+#define op_bytes_number(ih,bsize) item_ops[le_ih_k_type (ih)]->bytes_number (ih, bsize)
+#define op_is_left_mergeable(key,bsize) item_ops[le_key_k_type (le_key_version (key), key)]->is_left_mergeable (key, bsize)
+#define op_print_item(ih,item) item_ops[le_ih_k_type (ih)]->print_item (ih, item)
+#define op_check_item(ih,item) item_ops[le_ih_k_type (ih)]->check_item (ih, item)
+#define op_create_vi(vn,vi,is_affected,insert_size) item_ops[le_ih_k_type ((vi)->vi_ih)]->create_vi (vn,vi,is_affected,insert_size)
+#define op_check_left(vi,free,start_skip,end_skip) item_ops[(vi)->vi_index]->check_left (vi, free, start_skip, end_skip)
+#define op_check_right(vi,free) item_ops[(vi)->vi_index]->check_right (vi, free)
+#define op_part_size(vi,from,to) item_ops[(vi)->vi_index]->part_size (vi, from, to)
+#define op_unit_num(vi) item_ops[(vi)->vi_index]->unit_num (vi)
+#define op_print_vi(vi) item_ops[(vi)->vi_index]->print_vi (vi)
+
+#define COMP_SHORT_KEYS comp_short_keys
+
+/* number of blocks pointed to by the indirect item */
+#define I_UNFM_NUM(ih) (ih_item_len(ih) / UNFM_P_SIZE)
+
+/* the used space within the unformatted node corresponding to pos within the item pointed to by ih */
+#define I_POS_UNFM_SIZE(ih,pos,size) (((pos) == I_UNFM_NUM(ih) - 1 ) ? (size) - ih_free_space(ih) : (size))
+
+/* number of bytes contained by the direct item or the unformatted nodes the indirect item points to */
+
+/* get the item header */
+#define B_N_PITEM_HEAD(bh,item_num) ( (struct item_head * )((bh)->b_data + BLKH_SIZE) + (item_num) )
+
+/* get key */
+#define B_N_PDELIM_KEY(bh,item_num) ( (struct reiserfs_key * )((bh)->b_data + BLKH_SIZE) + (item_num) )
+
+/* get the key */
+#define B_N_PKEY(bh,item_num) ( &(B_N_PITEM_HEAD(bh,item_num)->ih_key) )
+
+/* get item body */
+#define B_N_PITEM(bh,item_num) ( (bh)->b_data + ih_location(B_N_PITEM_HEAD((bh),(item_num))))
+
+/* get the stat data by the buffer header and the item order */
+#define B_N_STAT_DATA(bh,nr) \
+( (struct stat_data *)((bh)->b_data + ih_location(B_N_PITEM_HEAD((bh),(nr))) ) )
+
+ /* following defines use reiserfs buffer header and item header */
+
+/* get stat-data */
+#define B_I_STAT_DATA(bh, ih) ( (struct stat_data * )((bh)->b_data + ih_location(ih)) )
+
+// this is 3976 for size==4096
+#define MAX_DIRECT_ITEM_LEN(size) ((size) - BLKH_SIZE - 2*IH_SIZE - SD_SIZE - UNFM_P_SIZE)
+
+/* indirect items consist of entries which contain blocknrs, pos
+ indicates which entry, and B_I_POS_UNFM_POINTER resolves to the
+ blocknr contained by the entry pos points to */
+#define B_I_POS_UNFM_POINTER(bh,ih,pos) le32_to_cpu(*(((unp_t *)B_I_PITEM(bh,ih)) + (pos)))
+#define PUT_B_I_POS_UNFM_POINTER(bh,ih,pos, val) do {*(((unp_t *)B_I_PITEM(bh,ih)) + (pos)) = cpu_to_le32(val); } while (0)
+
+struct reiserfs_iget_args {
+ __u32 objectid;
+ __u32 dirid;
+};
+
+/***************************************************************************/
+/* FUNCTION DECLARATIONS */
+/***************************************************************************/
+
+#define get_journal_desc_magic(bh) (bh->b_data + bh->b_size - 12)
+
+#define journal_trans_half(blocksize) \
+ ((blocksize - sizeof (struct reiserfs_journal_desc) + sizeof (__u32) - 12) / sizeof (__u32))
+
+/* journal.c see journal.c for all the comments here */
+
+/* first block written in a commit. */
+struct reiserfs_journal_desc {
+ __le32 j_trans_id; /* id of commit */
+ __le32 j_len; /* length of commit. len +1 is the commit block */
+ __le32 j_mount_id; /* mount id of this trans */
+ __le32 j_realblock[1]; /* real locations for each block */
+};
+
+#define get_desc_trans_id(d) le32_to_cpu((d)->j_trans_id)
+#define get_desc_trans_len(d) le32_to_cpu((d)->j_len)
+#define get_desc_mount_id(d) le32_to_cpu((d)->j_mount_id)
+
+#define set_desc_trans_id(d,val) do { (d)->j_trans_id = cpu_to_le32 (val); } while (0)
+#define set_desc_trans_len(d,val) do { (d)->j_len = cpu_to_le32 (val); } while (0)
+#define set_desc_mount_id(d,val) do { (d)->j_mount_id = cpu_to_le32 (val); } while (0)
+
+/* last block written in a commit */
+struct reiserfs_journal_commit {
+ __le32 j_trans_id; /* must match j_trans_id from the desc block */
+ __le32 j_len; /* ditto */
+ __le32 j_realblock[1]; /* real locations for each block */
+};
+
+#define get_commit_trans_id(c) le32_to_cpu((c)->j_trans_id)
+#define get_commit_trans_len(c) le32_to_cpu((c)->j_len)
+#define get_commit_mount_id(c) le32_to_cpu((c)->j_mount_id)
+
+#define set_commit_trans_id(c,val) do { (c)->j_trans_id = cpu_to_le32 (val); } while (0)
+#define set_commit_trans_len(c,val) do { (c)->j_len = cpu_to_le32 (val); } while (0)
+
+/* this header block gets written whenever a transaction is considered fully flushed, and is more recent than the
+** last fully flushed transaction. fully flushed means all the log blocks and all the real blocks are on disk,
+** and this transaction does not need to be replayed.
+*/
+struct reiserfs_journal_header {
+ __le32 j_last_flush_trans_id; /* id of last fully flushed transaction */
+ __le32 j_first_unflushed_offset; /* offset in the log of where to start replay after a crash */
+ __le32 j_mount_id;
+ /* 12 */ struct journal_params jh_journal;
+};
+
+/* biggest tunable defines are right here */
+#define JOURNAL_BLOCK_COUNT 8192 /* number of blocks in the journal */
+#define JOURNAL_TRANS_MAX_DEFAULT 1024 /* biggest possible single transaction, don't change for now (8/3/99) */
+#define JOURNAL_TRANS_MIN_DEFAULT 256
+#define JOURNAL_MAX_BATCH_DEFAULT 900 /* max blocks to batch into one transaction, don't make this any bigger than 900 */
+#define JOURNAL_MIN_RATIO 2
+#define JOURNAL_MAX_COMMIT_AGE 30
+#define JOURNAL_MAX_TRANS_AGE 30
+#define JOURNAL_PER_BALANCE_CNT (3 * (MAX_HEIGHT-2) + 9)
+#define JOURNAL_BLOCKS_PER_OBJECT(sb) (JOURNAL_PER_BALANCE_CNT * 3 + \
+ 2 * (REISERFS_QUOTA_INIT_BLOCKS(sb) + \
+ REISERFS_QUOTA_TRANS_BLOCKS(sb)))
+
+#ifdef CONFIG_QUOTA
+#define REISERFS_QUOTA_OPTS ((1 << REISERFS_USRQUOTA) | (1 << REISERFS_GRPQUOTA))
+/* We need to update data and inode (atime) */
+#define REISERFS_QUOTA_TRANS_BLOCKS(s) (REISERFS_SB(s)->s_mount_opt & REISERFS_QUOTA_OPTS ? 2 : 0)
+/* 1 balancing, 1 bitmap, 1 data per write + stat data update */
+#define REISERFS_QUOTA_INIT_BLOCKS(s) (REISERFS_SB(s)->s_mount_opt & REISERFS_QUOTA_OPTS ? \
+(DQUOT_INIT_ALLOC*(JOURNAL_PER_BALANCE_CNT+2)+DQUOT_INIT_REWRITE+1) : 0)
+/* same as with INIT */
+#define REISERFS_QUOTA_DEL_BLOCKS(s) (REISERFS_SB(s)->s_mount_opt & REISERFS_QUOTA_OPTS ? \
+(DQUOT_DEL_ALLOC*(JOURNAL_PER_BALANCE_CNT+2)+DQUOT_DEL_REWRITE+1) : 0)
+#else
+#define REISERFS_QUOTA_TRANS_BLOCKS(s) 0
+#define REISERFS_QUOTA_INIT_BLOCKS(s) 0
+#define REISERFS_QUOTA_DEL_BLOCKS(s) 0
+#endif
+
+/* both of these can be as low as 1, or as high as you want. The min is the
+** number of 4k bitmap nodes preallocated on mount. New nodes are allocated
+** as needed, and released when transactions are committed. On release, if
+** the current number of nodes is > max, the node is freed, otherwise,
+** it is put on a free list for faster use later.
+*/
+#define REISERFS_MIN_BITMAP_NODES 10
+#define REISERFS_MAX_BITMAP_NODES 100
+
+#define JBH_HASH_SHIFT 13 /* these are based on journal hash size of 8192 */
+#define JBH_HASH_MASK 8191
+
+#define _jhashfn(sb,block) \
+ (((unsigned long)sb>>L1_CACHE_SHIFT) ^ \
+ (((block)<<(JBH_HASH_SHIFT - 6)) ^ ((block) >> 13) ^ ((block) << (JBH_HASH_SHIFT - 12))))
+#define journal_hash(t,sb,block) ((t)[_jhashfn((sb),(block)) & JBH_HASH_MASK])
+
+// We need these to make journal.c code more readable
+#define journal_find_get_block(s, block) __find_get_block(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize)
+#define journal_getblk(s, block) __getblk(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize)
+#define journal_bread(s, block) __bread(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize)
+
+enum reiserfs_bh_state_bits {
+ BH_JDirty = BH_PrivateStart, /* buffer is in current transaction */
+ BH_JDirty_wait,
+ BH_JNew, /* disk block was taken off free list before
+ * being in a finished transaction, or
+ * written to disk. Can be reused immed. */
+ BH_JPrepared,
+ BH_JRestore_dirty,
+ BH_JTest, // debugging only will go away
+};
+
+BUFFER_FNS(JDirty, journaled);
+TAS_BUFFER_FNS(JDirty, journaled);
+BUFFER_FNS(JDirty_wait, journal_dirty);
+TAS_BUFFER_FNS(JDirty_wait, journal_dirty);
+BUFFER_FNS(JNew, journal_new);
+TAS_BUFFER_FNS(JNew, journal_new);
+BUFFER_FNS(JPrepared, journal_prepared);
+TAS_BUFFER_FNS(JPrepared, journal_prepared);
+BUFFER_FNS(JRestore_dirty, journal_restore_dirty);
+TAS_BUFFER_FNS(JRestore_dirty, journal_restore_dirty);
+BUFFER_FNS(JTest, journal_test);
+TAS_BUFFER_FNS(JTest, journal_test);
+
+/*
+** transaction handle which is passed around for all journal calls
+*/
+struct reiserfs_transaction_handle {
+ struct super_block *t_super; /* super for this FS when journal_begin was
+ called. saves calls to reiserfs_get_super
+ also used by nested transactions to make
+ sure they are nesting on the right FS
+ _must_ be first in the handle
+ */
+ int t_refcount;
+ int t_blocks_logged; /* number of blocks this writer has logged */
+ int t_blocks_allocated; /* number of blocks this writer allocated */
+ unsigned int t_trans_id; /* sanity check, equals the current trans id */
+ void *t_handle_save; /* save existing current->journal_info */
+ unsigned displace_new_blocks:1; /* if new block allocation occurres, that block
+ should be displaced from others */
+ struct list_head t_list;
+};
+
+/* used to keep track of ordered and tail writes, attached to the buffer
+ * head through b_journal_head.
+ */
+struct reiserfs_jh {
+ struct reiserfs_journal_list *jl;
+ struct buffer_head *bh;
+ struct list_head list;
+};
+
+void reiserfs_free_jh(struct buffer_head *bh);
+int reiserfs_add_tail_list(struct inode *inode, struct buffer_head *bh);
+int reiserfs_add_ordered_list(struct inode *inode, struct buffer_head *bh);
+int journal_mark_dirty(struct reiserfs_transaction_handle *,
+ struct super_block *, struct buffer_head *bh);
+
+static inline int reiserfs_file_data_log(struct inode *inode)
+{
+ if (reiserfs_data_log(inode->i_sb) ||
+ (REISERFS_I(inode)->i_flags & i_data_log))
+ return 1;
+ return 0;
+}
+
+static inline int reiserfs_transaction_running(struct super_block *s)
+{
+ struct reiserfs_transaction_handle *th = current->journal_info;
+ if (th && th->t_super == s)
+ return 1;
+ if (th && th->t_super == NULL)
+ BUG();
+ return 0;
+}
+
+static inline int reiserfs_transaction_free_space(struct reiserfs_transaction_handle *th)
+{
+ return th->t_blocks_allocated - th->t_blocks_logged;
+}
+
+struct reiserfs_transaction_handle *reiserfs_persistent_transaction(struct
+ super_block
+ *,
+ int count);
+int reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *);
+int reiserfs_commit_page(struct inode *inode, struct page *page,
+ unsigned from, unsigned to);
+int reiserfs_flush_old_commits(struct super_block *);
+int reiserfs_commit_for_inode(struct inode *);
+int reiserfs_inode_needs_commit(struct inode *);
+void reiserfs_update_inode_transaction(struct inode *);
+void reiserfs_wait_on_write_block(struct super_block *s);
+void reiserfs_block_writes(struct reiserfs_transaction_handle *th);
+void reiserfs_allow_writes(struct super_block *s);
+void reiserfs_check_lock_depth(struct super_block *s, char *caller);
+int reiserfs_prepare_for_journal(struct super_block *, struct buffer_head *bh,
+ int wait);
+void reiserfs_restore_prepared_buffer(struct super_block *,
+ struct buffer_head *bh);
+int journal_init(struct super_block *, const char *j_dev_name, int old_format,
+ unsigned int);
+int journal_release(struct reiserfs_transaction_handle *, struct super_block *);
+int journal_release_error(struct reiserfs_transaction_handle *,
+ struct super_block *);
+int journal_end(struct reiserfs_transaction_handle *, struct super_block *,
+ unsigned long);
+int journal_end_sync(struct reiserfs_transaction_handle *, struct super_block *,
+ unsigned long);
+int journal_mark_freed(struct reiserfs_transaction_handle *,
+ struct super_block *, b_blocknr_t blocknr);
+int journal_transaction_should_end(struct reiserfs_transaction_handle *, int);
+int reiserfs_in_journal(struct super_block *sb, unsigned int bmap_nr,
+ int bit_nr, int searchall, b_blocknr_t *next);
+int journal_begin(struct reiserfs_transaction_handle *,
+ struct super_block *sb, unsigned long);
+int journal_join_abort(struct reiserfs_transaction_handle *,
+ struct super_block *sb, unsigned long);
+void reiserfs_abort_journal(struct super_block *sb, int errno);
+void reiserfs_abort(struct super_block *sb, int errno, const char *fmt, ...);
+int reiserfs_allocate_list_bitmaps(struct super_block *s,
+ struct reiserfs_list_bitmap *, unsigned int);
+
+void add_save_link(struct reiserfs_transaction_handle *th,
+ struct inode *inode, int truncate);
+int remove_save_link(struct inode *inode, int truncate);
+
+/* objectid.c */
+__u32 reiserfs_get_unused_objectid(struct reiserfs_transaction_handle *th);
+void reiserfs_release_objectid(struct reiserfs_transaction_handle *th,
+ __u32 objectid_to_release);
+int reiserfs_convert_objectid_map_v1(struct super_block *);
+
+/* stree.c */
+int B_IS_IN_TREE(const struct buffer_head *);
+extern void copy_item_head(struct item_head *to,
+ const struct item_head *from);
+
+// first key is in cpu form, second - le
+extern int comp_short_keys(const struct reiserfs_key *le_key,
+ const struct cpu_key *cpu_key);
+extern void le_key2cpu_key(struct cpu_key *to, const struct reiserfs_key *from);
+
+// both are in le form
+extern int comp_le_keys(const struct reiserfs_key *,
+ const struct reiserfs_key *);
+extern int comp_short_le_keys(const struct reiserfs_key *,
+ const struct reiserfs_key *);
+
+//
+// get key version from on disk key - kludge
+//
+static inline int le_key_version(const struct reiserfs_key *key)
+{
+ int type;
+
+ type = offset_v2_k_type(&(key->u.k_offset_v2));
+ if (type != TYPE_DIRECT && type != TYPE_INDIRECT
+ && type != TYPE_DIRENTRY)
+ return KEY_FORMAT_3_5;
+
+ return KEY_FORMAT_3_6;
+
+}
+
+static inline void copy_key(struct reiserfs_key *to,
+ const struct reiserfs_key *from)
+{
+ memcpy(to, from, KEY_SIZE);
+}
+
+int comp_items(const struct item_head *stored_ih, const struct treepath *path);
+const struct reiserfs_key *get_rkey(const struct treepath *chk_path,
+ const struct super_block *sb);
+int search_by_key(struct super_block *, const struct cpu_key *,
+ struct treepath *, int);
+#define search_item(s,key,path) search_by_key (s, key, path, DISK_LEAF_NODE_LEVEL)
+int search_for_position_by_key(struct super_block *sb,
+ const struct cpu_key *cpu_key,
+ struct treepath *search_path);
+extern void decrement_bcount(struct buffer_head *bh);
+void decrement_counters_in_path(struct treepath *search_path);
+void pathrelse(struct treepath *search_path);
+int reiserfs_check_path(struct treepath *p);
+void pathrelse_and_restore(struct super_block *s, struct treepath *search_path);
+
+int reiserfs_insert_item(struct reiserfs_transaction_handle *th,
+ struct treepath *path,
+ const struct cpu_key *key,
+ struct item_head *ih,
+ struct inode *inode, const char *body);
+
+int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th,
+ struct treepath *path,
+ const struct cpu_key *key,
+ struct inode *inode,
+ const char *body, int paste_size);
+
+int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th,
+ struct treepath *path,
+ struct cpu_key *key,
+ struct inode *inode,
+ struct page *page, loff_t new_file_size);
+
+int reiserfs_delete_item(struct reiserfs_transaction_handle *th,
+ struct treepath *path,
+ const struct cpu_key *key,
+ struct inode *inode, struct buffer_head *un_bh);
+
+void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th,
+ struct inode *inode, struct reiserfs_key *key);
+int reiserfs_delete_object(struct reiserfs_transaction_handle *th,
+ struct inode *inode);
+int reiserfs_do_truncate(struct reiserfs_transaction_handle *th,
+ struct inode *inode, struct page *,
+ int update_timestamps);
+
+#define i_block_size(inode) ((inode)->i_sb->s_blocksize)
+#define file_size(inode) ((inode)->i_size)
+#define tail_size(inode) (file_size (inode) & (i_block_size (inode) - 1))
+
+#define tail_has_to_be_packed(inode) (have_large_tails ((inode)->i_sb)?\
+!STORE_TAIL_IN_UNFM_S1(file_size (inode), tail_size(inode), inode->i_sb->s_blocksize):have_small_tails ((inode)->i_sb)?!STORE_TAIL_IN_UNFM_S2(file_size (inode), tail_size(inode), inode->i_sb->s_blocksize):0 )
+
+void padd_item(char *item, int total_length, int length);
+
+/* inode.c */
+/* args for the create parameter of reiserfs_get_block */
+#define GET_BLOCK_NO_CREATE 0 /* don't create new blocks or convert tails */
+#define GET_BLOCK_CREATE 1 /* add anything you need to find block */
+#define GET_BLOCK_NO_HOLE 2 /* return -ENOENT for file holes */
+#define GET_BLOCK_READ_DIRECT 4 /* read the tail if indirect item not found */
+#define GET_BLOCK_NO_IMUX 8 /* i_mutex is not held, don't preallocate */
+#define GET_BLOCK_NO_DANGLE 16 /* don't leave any transactions running */
+
+void reiserfs_read_locked_inode(struct inode *inode,
+ struct reiserfs_iget_args *args);
+int reiserfs_find_actor(struct inode *inode, void *p);
+int reiserfs_init_locked_inode(struct inode *inode, void *p);
+void reiserfs_evict_inode(struct inode *inode);
+int reiserfs_write_inode(struct inode *inode, struct writeback_control *wbc);
+int reiserfs_get_block(struct inode *inode, sector_t block,
+ struct buffer_head *bh_result, int create);
+struct dentry *reiserfs_fh_to_dentry(struct super_block *sb, struct fid *fid,
+ int fh_len, int fh_type);
+struct dentry *reiserfs_fh_to_parent(struct super_block *sb, struct fid *fid,
+ int fh_len, int fh_type);
+int reiserfs_encode_fh(struct dentry *dentry, __u32 * data, int *lenp,
+ int connectable);
+
+int reiserfs_truncate_file(struct inode *, int update_timestamps);
+void make_cpu_key(struct cpu_key *cpu_key, struct inode *inode, loff_t offset,
+ int type, int key_length);
+void make_le_item_head(struct item_head *ih, const struct cpu_key *key,
+ int version,
+ loff_t offset, int type, int length, int entry_count);
+struct inode *reiserfs_iget(struct super_block *s, const struct cpu_key *key);
+
+struct reiserfs_security_handle;
+int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
+ struct inode *dir, umode_t mode,
+ const char *symname, loff_t i_size,
+ struct dentry *dentry, struct inode *inode,
+ struct reiserfs_security_handle *security);
+
+void reiserfs_update_sd_size(struct reiserfs_transaction_handle *th,
+ struct inode *inode, loff_t size);
+
+static inline void reiserfs_update_sd(struct reiserfs_transaction_handle *th,
+ struct inode *inode)
+{
+ reiserfs_update_sd_size(th, inode, inode->i_size);
+}
+
+void sd_attrs_to_i_attrs(__u16 sd_attrs, struct inode *inode);
+void i_attrs_to_sd_attrs(struct inode *inode, __u16 * sd_attrs);
+int reiserfs_setattr(struct dentry *dentry, struct iattr *attr);
+
+int __reiserfs_write_begin(struct page *page, unsigned from, unsigned len);
+
+/* namei.c */
+void set_de_name_and_namelen(struct reiserfs_dir_entry *de);
+int search_by_entry_key(struct super_block *sb, const struct cpu_key *key,
+ struct treepath *path, struct reiserfs_dir_entry *de);
+struct dentry *reiserfs_get_parent(struct dentry *);
+
+#ifdef CONFIG_REISERFS_PROC_INFO
+int reiserfs_proc_info_init(struct super_block *sb);
+int reiserfs_proc_info_done(struct super_block *sb);
+int reiserfs_proc_info_global_init(void);
+int reiserfs_proc_info_global_done(void);
+
+#define PROC_EXP( e ) e
+
+#define __PINFO( sb ) REISERFS_SB(sb) -> s_proc_info_data
+#define PROC_INFO_MAX( sb, field, value ) \
+ __PINFO( sb ).field = \
+ max( REISERFS_SB( sb ) -> s_proc_info_data.field, value )
+#define PROC_INFO_INC( sb, field ) ( ++ ( __PINFO( sb ).field ) )
+#define PROC_INFO_ADD( sb, field, val ) ( __PINFO( sb ).field += ( val ) )
+#define PROC_INFO_BH_STAT( sb, bh, level ) \
+ PROC_INFO_INC( sb, sbk_read_at[ ( level ) ] ); \
+ PROC_INFO_ADD( sb, free_at[ ( level ) ], B_FREE_SPACE( bh ) ); \
+ PROC_INFO_ADD( sb, items_at[ ( level ) ], B_NR_ITEMS( bh ) )
+#else
+static inline int reiserfs_proc_info_init(struct super_block *sb)
+{
+ return 0;
+}
+
+static inline int reiserfs_proc_info_done(struct super_block *sb)
+{
+ return 0;
+}
+
+static inline int reiserfs_proc_info_global_init(void)
+{
+ return 0;
+}
+
+static inline int reiserfs_proc_info_global_done(void)
+{
+ return 0;
+}
+
+#define PROC_EXP( e )
+#define VOID_V ( ( void ) 0 )
+#define PROC_INFO_MAX( sb, field, value ) VOID_V
+#define PROC_INFO_INC( sb, field ) VOID_V
+#define PROC_INFO_ADD( sb, field, val ) VOID_V
+#define PROC_INFO_BH_STAT(sb, bh, n_node_level) VOID_V
+#endif
+
+/* dir.c */
+extern const struct inode_operations reiserfs_dir_inode_operations;
+extern const struct inode_operations reiserfs_symlink_inode_operations;
+extern const struct inode_operations reiserfs_special_inode_operations;
+extern const struct file_operations reiserfs_dir_operations;
+int reiserfs_readdir_dentry(struct dentry *, void *, filldir_t, loff_t *);
+
+/* tail_conversion.c */
+int direct2indirect(struct reiserfs_transaction_handle *, struct inode *,
+ struct treepath *, struct buffer_head *, loff_t);
+int indirect2direct(struct reiserfs_transaction_handle *, struct inode *,
+ struct page *, struct treepath *, const struct cpu_key *,
+ loff_t, char *);
+void reiserfs_unmap_buffer(struct buffer_head *);
+
+/* file.c */
+extern const struct inode_operations reiserfs_file_inode_operations;
+extern const struct file_operations reiserfs_file_operations;
+extern const struct address_space_operations reiserfs_address_space_operations;
+
+/* fix_nodes.c */
+
+int fix_nodes(int n_op_mode, struct tree_balance *tb,
+ struct item_head *ins_ih, const void *);
+void unfix_nodes(struct tree_balance *);
+
+/* prints.c */
+void __reiserfs_panic(struct super_block *s, const char *id,
+ const char *function, const char *fmt, ...)
+ __attribute__ ((noreturn));
+#define reiserfs_panic(s, id, fmt, args...) \
+ __reiserfs_panic(s, id, __func__, fmt, ##args)
+void __reiserfs_error(struct super_block *s, const char *id,
+ const char *function, const char *fmt, ...);
+#define reiserfs_error(s, id, fmt, args...) \
+ __reiserfs_error(s, id, __func__, fmt, ##args)
+void reiserfs_info(struct super_block *s, const char *fmt, ...);
+void reiserfs_debug(struct super_block *s, int level, const char *fmt, ...);
+void print_indirect_item(struct buffer_head *bh, int item_num);
+void store_print_tb(struct tree_balance *tb);
+void print_cur_tb(char *mes);
+void print_de(struct reiserfs_dir_entry *de);
+void print_bi(struct buffer_info *bi, char *mes);
+#define PRINT_LEAF_ITEMS 1 /* print all items */
+#define PRINT_DIRECTORY_ITEMS 2 /* print directory items */
+#define PRINT_DIRECT_ITEMS 4 /* print contents of direct items */
+void print_block(struct buffer_head *bh, ...);
+void print_bmap(struct super_block *s, int silent);
+void print_bmap_block(int i, char *data, int size, int silent);
+/*void print_super_block (struct super_block * s, char * mes);*/
+void print_objectid_map(struct super_block *s);
+void print_block_head(struct buffer_head *bh, char *mes);
+void check_leaf(struct buffer_head *bh);
+void check_internal(struct buffer_head *bh);
+void print_statistics(struct super_block *s);
+char *reiserfs_hashname(int code);
+
+/* lbalance.c */
+int leaf_move_items(int shift_mode, struct tree_balance *tb, int mov_num,
+ int mov_bytes, struct buffer_head *Snew);
+int leaf_shift_left(struct tree_balance *tb, int shift_num, int shift_bytes);
+int leaf_shift_right(struct tree_balance *tb, int shift_num, int shift_bytes);
+void leaf_delete_items(struct buffer_info *cur_bi, int last_first, int first,
+ int del_num, int del_bytes);
+void leaf_insert_into_buf(struct buffer_info *bi, int before,
+ struct item_head *inserted_item_ih,
+ const char *inserted_item_body, int zeros_number);
+void leaf_paste_in_buffer(struct buffer_info *bi, int pasted_item_num,
+ int pos_in_item, int paste_size, const char *body,
+ int zeros_number);
+void leaf_cut_from_buffer(struct buffer_info *bi, int cut_item_num,
+ int pos_in_item, int cut_size);
+void leaf_paste_entries(struct buffer_info *bi, int item_num, int before,
+ int new_entry_count, struct reiserfs_de_head *new_dehs,
+ const char *records, int paste_size);
+/* ibalance.c */
+int balance_internal(struct tree_balance *, int, int, struct item_head *,
+ struct buffer_head **);
+
+/* do_balance.c */
+void do_balance_mark_leaf_dirty(struct tree_balance *tb,
+ struct buffer_head *bh, int flag);
+#define do_balance_mark_internal_dirty do_balance_mark_leaf_dirty
+#define do_balance_mark_sb_dirty do_balance_mark_leaf_dirty
+
+void do_balance(struct tree_balance *tb, struct item_head *ih,
+ const char *body, int flag);
+void reiserfs_invalidate_buffer(struct tree_balance *tb,
+ struct buffer_head *bh);
+
+int get_left_neighbor_position(struct tree_balance *tb, int h);
+int get_right_neighbor_position(struct tree_balance *tb, int h);
+void replace_key(struct tree_balance *tb, struct buffer_head *, int,
+ struct buffer_head *, int);
+void make_empty_node(struct buffer_info *);
+struct buffer_head *get_FEB(struct tree_balance *);
+
+/* bitmap.c */
+
+/* structure contains hints for block allocator, and it is a container for
+ * arguments, such as node, search path, transaction_handle, etc. */
+struct __reiserfs_blocknr_hint {
+ struct inode *inode; /* inode passed to allocator, if we allocate unf. nodes */
+ sector_t block; /* file offset, in blocks */
+ struct in_core_key key;
+ struct treepath *path; /* search path, used by allocator to deternine search_start by
+ * various ways */
+ struct reiserfs_transaction_handle *th; /* transaction handle is needed to log super blocks and
+ * bitmap blocks changes */
+ b_blocknr_t beg, end;
+ b_blocknr_t search_start; /* a field used to transfer search start value (block number)
+ * between different block allocator procedures
+ * (determine_search_start() and others) */
+ int prealloc_size; /* is set in determine_prealloc_size() function, used by underlayed
+ * function that do actual allocation */
+
+ unsigned formatted_node:1; /* the allocator uses different polices for getting disk space for
+ * formatted/unformatted blocks with/without preallocation */
+ unsigned preallocate:1;
+};
+
+typedef struct __reiserfs_blocknr_hint reiserfs_blocknr_hint_t;
+
+int reiserfs_parse_alloc_options(struct super_block *, char *);
+void reiserfs_init_alloc_options(struct super_block *s);
+
+/*
+ * given a directory, this will tell you what packing locality
+ * to use for a new object underneat it. The locality is returned
+ * in disk byte order (le).
+ */
+__le32 reiserfs_choose_packing(struct inode *dir);
+
+int reiserfs_init_bitmap_cache(struct super_block *sb);
+void reiserfs_free_bitmap_cache(struct super_block *sb);
+void reiserfs_cache_bitmap_metadata(struct super_block *sb, struct buffer_head *bh, struct reiserfs_bitmap_info *info);
+struct buffer_head *reiserfs_read_bitmap_block(struct super_block *sb, unsigned int bitmap);
+int is_reusable(struct super_block *s, b_blocknr_t block, int bit_value);
+void reiserfs_free_block(struct reiserfs_transaction_handle *th, struct inode *,
+ b_blocknr_t, int for_unformatted);
+int reiserfs_allocate_blocknrs(reiserfs_blocknr_hint_t *, b_blocknr_t *, int,
+ int);
+static inline int reiserfs_new_form_blocknrs(struct tree_balance *tb,
+ b_blocknr_t * new_blocknrs,
+ int amount_needed)
+{
+ reiserfs_blocknr_hint_t hint = {
+ .th = tb->transaction_handle,
+ .path = tb->tb_path,
+ .inode = NULL,
+ .key = tb->key,
+ .block = 0,
+ .formatted_node = 1
+ };
+ return reiserfs_allocate_blocknrs(&hint, new_blocknrs, amount_needed,
+ 0);
+}
+
+static inline int reiserfs_new_unf_blocknrs(struct reiserfs_transaction_handle
+ *th, struct inode *inode,
+ b_blocknr_t * new_blocknrs,
+ struct treepath *path,
+ sector_t block)
+{
+ reiserfs_blocknr_hint_t hint = {
+ .th = th,
+ .path = path,
+ .inode = inode,
+ .block = block,
+ .formatted_node = 0,
+ .preallocate = 0
+ };
+ return reiserfs_allocate_blocknrs(&hint, new_blocknrs, 1, 0);
+}
+
+#ifdef REISERFS_PREALLOCATE
+static inline int reiserfs_new_unf_blocknrs2(struct reiserfs_transaction_handle
+ *th, struct inode *inode,
+ b_blocknr_t * new_blocknrs,
+ struct treepath *path,
+ sector_t block)
+{
+ reiserfs_blocknr_hint_t hint = {
+ .th = th,
+ .path = path,
+ .inode = inode,
+ .block = block,
+ .formatted_node = 0,
+ .preallocate = 1
+ };
+ return reiserfs_allocate_blocknrs(&hint, new_blocknrs, 1, 0);
+}
+
+void reiserfs_discard_prealloc(struct reiserfs_transaction_handle *th,
+ struct inode *inode);
+void reiserfs_discard_all_prealloc(struct reiserfs_transaction_handle *th);
+#endif
+
+/* hashes.c */
+__u32 keyed_hash(const signed char *msg, int len);
+__u32 yura_hash(const signed char *msg, int len);
+__u32 r5_hash(const signed char *msg, int len);
+
+#define reiserfs_set_le_bit __set_bit_le
+#define reiserfs_test_and_set_le_bit __test_and_set_bit_le
+#define reiserfs_clear_le_bit __clear_bit_le
+#define reiserfs_test_and_clear_le_bit __test_and_clear_bit_le
+#define reiserfs_test_le_bit test_bit_le
+#define reiserfs_find_next_zero_le_bit find_next_zero_bit_le
+
+/* sometimes reiserfs_truncate may require to allocate few new blocks
+ to perform indirect2direct conversion. People probably used to
+ think, that truncate should work without problems on a filesystem
+ without free disk space. They may complain that they can not
+ truncate due to lack of free disk space. This spare space allows us
+ to not worry about it. 500 is probably too much, but it should be
+ absolutely safe */
+#define SPARE_SPACE 500
+
+/* prototypes from ioctl.c */
+long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
+long reiserfs_compat_ioctl(struct file *filp,
+ unsigned int cmd, unsigned long arg);
+int reiserfs_unpack(struct inode *inode, struct file *filp);
diff --git a/fs/reiserfs/resize.c b/fs/reiserfs/resize.c
index 7483279b482..9a17f63c3fd 100644
--- a/fs/reiserfs/resize.c
+++ b/fs/reiserfs/resize.c
@@ -13,8 +13,7 @@
#include <linux/vmalloc.h>
#include <linux/string.h>
#include <linux/errno.h>
-#include <linux/reiserfs_fs.h>
-#include <linux/reiserfs_fs_sb.h>
+#include "reiserfs.h"
#include <linux/buffer_head.h>
int reiserfs_resize(struct super_block *s, unsigned long block_count_new)
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
index 313d39d639e..f8afa4b162b 100644
--- a/fs/reiserfs/stree.c
+++ b/fs/reiserfs/stree.c
@@ -51,7 +51,7 @@
#include <linux/time.h>
#include <linux/string.h>
#include <linux/pagemap.h>
-#include <linux/reiserfs_fs.h>
+#include "reiserfs.h"
#include <linux/buffer_head.h>
#include <linux/quotaops.h>
@@ -1284,12 +1284,12 @@ int reiserfs_delete_item(struct reiserfs_transaction_handle *th,
** -clm
*/
- data = kmap_atomic(un_bh->b_page, KM_USER0);
+ data = kmap_atomic(un_bh->b_page);
off = ((le_ih_k_offset(&s_ih) - 1) & (PAGE_CACHE_SIZE - 1));
memcpy(data + off,
B_I_PITEM(PATH_PLAST_BUFFER(path), &s_ih),
ret_value);
- kunmap_atomic(data, KM_USER0);
+ kunmap_atomic(data);
}
/* Perform balancing after all resources have been collected at once. */
do_balance(&s_del_balance, NULL, NULL, M_DELETE);
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index e12d8b97cd4..8b7616ef06d 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -16,9 +16,9 @@
#include <linux/vmalloc.h>
#include <linux/time.h>
#include <asm/uaccess.h>
-#include <linux/reiserfs_fs.h>
-#include <linux/reiserfs_acl.h>
-#include <linux/reiserfs_xattr.h>
+#include "reiserfs.h"
+#include "acl.h"
+#include "xattr.h"
#include <linux/init.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
@@ -1874,11 +1874,9 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
unlock_new_inode(root_inode);
}
- s->s_root = d_alloc_root(root_inode);
- if (!s->s_root) {
- iput(root_inode);
+ s->s_root = d_make_root(root_inode);
+ if (!s->s_root)
goto error;
- }
// define and initialize hash function
sbi->s_hash_function = hash_function(s);
if (sbi->s_hash_function == NULL) {
diff --git a/fs/reiserfs/tail_conversion.c b/fs/reiserfs/tail_conversion.c
index d7f6e51bef2..5e2624d12f7 100644
--- a/fs/reiserfs/tail_conversion.c
+++ b/fs/reiserfs/tail_conversion.c
@@ -5,7 +5,7 @@
#include <linux/time.h>
#include <linux/pagemap.h>
#include <linux/buffer_head.h>
-#include <linux/reiserfs_fs.h>
+#include "reiserfs.h"
/* access to tail : when one is going to read tail it must make sure, that is not running.
direct2indirect and indirect2direct can not run concurrently */
@@ -128,9 +128,9 @@ int direct2indirect(struct reiserfs_transaction_handle *th, struct inode *inode,
if (up_to_date_bh) {
unsigned pgoff =
(tail_offset + total_tail - 1) & (PAGE_CACHE_SIZE - 1);
- char *kaddr = kmap_atomic(up_to_date_bh->b_page, KM_USER0);
+ char *kaddr = kmap_atomic(up_to_date_bh->b_page);
memset(kaddr + pgoff, 0, blk_size - total_tail);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
}
REISERFS_I(inode)->i_first_direct_byte = U32_MAX;
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index c24deda8a8b..46fc1c20a6b 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -33,7 +33,7 @@
* The xattrs themselves are protected by the xattr_sem.
*/
-#include <linux/reiserfs_fs.h>
+#include "reiserfs.h"
#include <linux/capability.h>
#include <linux/dcache.h>
#include <linux/namei.h>
@@ -43,8 +43,8 @@
#include <linux/file.h>
#include <linux/pagemap.h>
#include <linux/xattr.h>
-#include <linux/reiserfs_xattr.h>
-#include <linux/reiserfs_acl.h>
+#include "xattr.h"
+#include "acl.h"
#include <asm/uaccess.h>
#include <net/checksum.h>
#include <linux/stat.h>
diff --git a/fs/reiserfs/xattr.h b/fs/reiserfs/xattr.h
new file mode 100644
index 00000000000..f59626c5d33
--- /dev/null
+++ b/fs/reiserfs/xattr.h
@@ -0,0 +1,122 @@
+#include <linux/reiserfs_xattr.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/rwsem.h>
+
+struct inode;
+struct dentry;
+struct iattr;
+struct super_block;
+struct nameidata;
+
+int reiserfs_xattr_register_handlers(void) __init;
+void reiserfs_xattr_unregister_handlers(void);
+int reiserfs_xattr_init(struct super_block *sb, int mount_flags);
+int reiserfs_lookup_privroot(struct super_block *sb);
+int reiserfs_delete_xattrs(struct inode *inode);
+int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs);
+int reiserfs_permission(struct inode *inode, int mask);
+
+#ifdef CONFIG_REISERFS_FS_XATTR
+#define has_xattr_dir(inode) (REISERFS_I(inode)->i_flags & i_has_xattr_dir)
+ssize_t reiserfs_getxattr(struct dentry *dentry, const char *name,
+ void *buffer, size_t size);
+int reiserfs_setxattr(struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags);
+ssize_t reiserfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
+int reiserfs_removexattr(struct dentry *dentry, const char *name);
+
+int reiserfs_xattr_get(struct inode *, const char *, void *, size_t);
+int reiserfs_xattr_set(struct inode *, const char *, const void *, size_t, int);
+int reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *,
+ struct inode *, const char *, const void *,
+ size_t, int);
+
+extern const struct xattr_handler reiserfs_xattr_user_handler;
+extern const struct xattr_handler reiserfs_xattr_trusted_handler;
+extern const struct xattr_handler reiserfs_xattr_security_handler;
+#ifdef CONFIG_REISERFS_FS_SECURITY
+int reiserfs_security_init(struct inode *dir, struct inode *inode,
+ const struct qstr *qstr,
+ struct reiserfs_security_handle *sec);
+int reiserfs_security_write(struct reiserfs_transaction_handle *th,
+ struct inode *inode,
+ struct reiserfs_security_handle *sec);
+void reiserfs_security_free(struct reiserfs_security_handle *sec);
+#endif
+
+static inline int reiserfs_xattrs_initialized(struct super_block *sb)
+{
+ return REISERFS_SB(sb)->priv_root != NULL;
+}
+
+#define xattr_size(size) ((size) + sizeof(struct reiserfs_xattr_header))
+static inline loff_t reiserfs_xattr_nblocks(struct inode *inode, loff_t size)
+{
+ loff_t ret = 0;
+ if (reiserfs_file_data_log(inode)) {
+ ret = _ROUND_UP(xattr_size(size), inode->i_sb->s_blocksize);
+ ret >>= inode->i_sb->s_blocksize_bits;
+ }
+ return ret;
+}
+
+/* We may have to create up to 3 objects: xattr root, xattr dir, xattr file.
+ * Let's try to be smart about it.
+ * xattr root: We cache it. If it's not cached, we may need to create it.
+ * xattr dir: If anything has been loaded for this inode, we can set a flag
+ * saying so.
+ * xattr file: Since we don't cache xattrs, we can't tell. We always include
+ * blocks for it.
+ *
+ * However, since root and dir can be created between calls - YOU MUST SAVE
+ * THIS VALUE.
+ */
+static inline size_t reiserfs_xattr_jcreate_nblocks(struct inode *inode)
+{
+ size_t nblocks = JOURNAL_BLOCKS_PER_OBJECT(inode->i_sb);
+
+ if ((REISERFS_I(inode)->i_flags & i_has_xattr_dir) == 0) {
+ nblocks += JOURNAL_BLOCKS_PER_OBJECT(inode->i_sb);
+ if (!REISERFS_SB(inode->i_sb)->xattr_root->d_inode)
+ nblocks += JOURNAL_BLOCKS_PER_OBJECT(inode->i_sb);
+ }
+
+ return nblocks;
+}
+
+static inline void reiserfs_init_xattr_rwsem(struct inode *inode)
+{
+ init_rwsem(&REISERFS_I(inode)->i_xattr_sem);
+}
+
+#else
+
+#define reiserfs_getxattr NULL
+#define reiserfs_setxattr NULL
+#define reiserfs_listxattr NULL
+#define reiserfs_removexattr NULL
+
+static inline void reiserfs_init_xattr_rwsem(struct inode *inode)
+{
+}
+#endif /* CONFIG_REISERFS_FS_XATTR */
+
+#ifndef CONFIG_REISERFS_FS_SECURITY
+static inline int reiserfs_security_init(struct inode *dir,
+ struct inode *inode,
+ const struct qstr *qstr,
+ struct reiserfs_security_handle *sec)
+{
+ return 0;
+}
+static inline int
+reiserfs_security_write(struct reiserfs_transaction_handle *th,
+ struct inode *inode,
+ struct reiserfs_security_handle *sec)
+{
+ return 0;
+}
+static inline void reiserfs_security_free(struct reiserfs_security_handle *sec)
+{}
+#endif
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index 6da0396e505..44474f9b990 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -1,14 +1,14 @@
#include <linux/capability.h>
#include <linux/fs.h>
#include <linux/posix_acl.h>
-#include <linux/reiserfs_fs.h>
+#include "reiserfs.h"
#include <linux/errno.h>
#include <linux/pagemap.h>
#include <linux/xattr.h>
#include <linux/slab.h>
#include <linux/posix_acl_xattr.h>
-#include <linux/reiserfs_xattr.h>
-#include <linux/reiserfs_acl.h>
+#include "xattr.h"
+#include "acl.h"
#include <asm/uaccess.h>
static int reiserfs_set_acl(struct reiserfs_transaction_handle *th,
diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c
index 534668fa41b..800a3cef6f6 100644
--- a/fs/reiserfs/xattr_security.c
+++ b/fs/reiserfs/xattr_security.c
@@ -1,10 +1,10 @@
-#include <linux/reiserfs_fs.h>
+#include "reiserfs.h"
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/xattr.h>
#include <linux/slab.h>
-#include <linux/reiserfs_xattr.h>
+#include "xattr.h"
#include <linux/security.h>
#include <asm/uaccess.h>
diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c
index 9883736ce3e..a0035719f66 100644
--- a/fs/reiserfs/xattr_trusted.c
+++ b/fs/reiserfs/xattr_trusted.c
@@ -1,10 +1,10 @@
-#include <linux/reiserfs_fs.h>
+#include "reiserfs.h"
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/xattr.h>
-#include <linux/reiserfs_xattr.h>
+#include "xattr.h"
#include <asm/uaccess.h>
static int
diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c
index 45ae1a00013..8667491ae7c 100644
--- a/fs/reiserfs/xattr_user.c
+++ b/fs/reiserfs/xattr_user.c
@@ -1,9 +1,9 @@
-#include <linux/reiserfs_fs.h>
+#include "reiserfs.h"
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/xattr.h>
-#include <linux/reiserfs_xattr.h>
+#include "xattr.h"
#include <asm/uaccess.h>
static int
diff --git a/fs/romfs/storage.c b/fs/romfs/storage.c
index 71e2b4d50a0..f86f51f99ac 100644
--- a/fs/romfs/storage.c
+++ b/fs/romfs/storage.c
@@ -19,7 +19,7 @@
#endif
#ifdef CONFIG_ROMFS_ON_MTD
-#define ROMFS_MTD_READ(sb, ...) ((sb)->s_mtd->read((sb)->s_mtd, ##__VA_ARGS__))
+#define ROMFS_MTD_READ(sb, ...) mtd_read((sb)->s_mtd, ##__VA_ARGS__)
/*
* read data from an romfs image on an MTD device
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index bb36ab74eb4..e64f6b5f7ae 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -538,14 +538,12 @@ static int romfs_fill_super(struct super_block *sb, void *data, int silent)
if (IS_ERR(root))
goto error;
- sb->s_root = d_alloc_root(root);
+ sb->s_root = d_make_root(root);
if (!sb->s_root)
- goto error_i;
+ goto error;
return 0;
-error_i:
- iput(root);
error:
return -EINVAL;
error_rsb_inval:
diff --git a/fs/select.c b/fs/select.c
index d33418fdc85..17d33d09fc1 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -17,7 +17,7 @@
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/syscalls.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/slab.h>
#include <linux/poll.h>
#include <linux/personality.h> /* for STICKY_TIMEOUTS */
@@ -223,7 +223,7 @@ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
get_file(filp);
entry->filp = filp;
entry->wait_address = wait_address;
- entry->key = p->key;
+ entry->key = p->_key;
init_waitqueue_func_entry(&entry->wait, pollwake);
entry->wait.private = pwq;
add_wait_queue(wait_address, &entry->wait);
@@ -348,7 +348,7 @@ static int max_select_fd(unsigned long n, fd_set_bits *fds)
set = ~(~0UL << (n & (__NFDBITS-1)));
n /= __NFDBITS;
fdt = files_fdtable(current->files);
- open_fds = fdt->open_fds->fds_bits+n;
+ open_fds = fdt->open_fds + n;
max = 0;
if (set) {
set &= BITS(fds, n);
@@ -386,13 +386,11 @@ get_max:
static inline void wait_key_set(poll_table *wait, unsigned long in,
unsigned long out, unsigned long bit)
{
- if (wait) {
- wait->key = POLLEX_SET;
- if (in & bit)
- wait->key |= POLLIN_SET;
- if (out & bit)
- wait->key |= POLLOUT_SET;
- }
+ wait->_key = POLLEX_SET;
+ if (in & bit)
+ wait->_key |= POLLIN_SET;
+ if (out & bit)
+ wait->_key |= POLLOUT_SET;
}
int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
@@ -414,7 +412,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
poll_initwait(&table);
wait = &table.pt;
if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
- wait = NULL;
+ wait->_qproc = NULL;
timed_out = 1;
}
@@ -459,17 +457,17 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
if ((mask & POLLIN_SET) && (in & bit)) {
res_in |= bit;
retval++;
- wait = NULL;
+ wait->_qproc = NULL;
}
if ((mask & POLLOUT_SET) && (out & bit)) {
res_out |= bit;
retval++;
- wait = NULL;
+ wait->_qproc = NULL;
}
if ((mask & POLLEX_SET) && (ex & bit)) {
res_ex |= bit;
retval++;
- wait = NULL;
+ wait->_qproc = NULL;
}
}
}
@@ -481,7 +479,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
*rexp = res_ex;
cond_resched();
}
- wait = NULL;
+ wait->_qproc = NULL;
if (retval || timed_out || signal_pending(current))
break;
if (table.error) {
@@ -720,7 +718,7 @@ struct poll_list {
* interested in events matching the pollfd->events mask, and the result
* matching that mask is both recorded in pollfd->revents and returned. The
* pwait poll_table will be used by the fd-provided poll handler for waiting,
- * if non-NULL.
+ * if pwait->_qproc is non-NULL.
*/
static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait)
{
@@ -738,9 +736,7 @@ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait)
if (file != NULL) {
mask = DEFAULT_POLLMASK;
if (file->f_op && file->f_op->poll) {
- if (pwait)
- pwait->key = pollfd->events |
- POLLERR | POLLHUP;
+ pwait->_key = pollfd->events|POLLERR|POLLHUP;
mask = file->f_op->poll(file, pwait);
}
/* Mask out unneeded events. */
@@ -763,7 +759,7 @@ static int do_poll(unsigned int nfds, struct poll_list *list,
/* Optimise the no-wait case */
if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
- pt = NULL;
+ pt->_qproc = NULL;
timed_out = 1;
}
@@ -781,22 +777,22 @@ static int do_poll(unsigned int nfds, struct poll_list *list,
for (; pfd != pfd_end; pfd++) {
/*
* Fish for events. If we found one, record it
- * and kill the poll_table, so we don't
+ * and kill poll_table->_qproc, so we don't
* needlessly register any other waiters after
* this. They'll get immediately deregistered
* when we break out and return.
*/
if (do_pollfd(pfd, pt)) {
count++;
- pt = NULL;
+ pt->_qproc = NULL;
}
}
}
/*
* All waiters have already been registered, so don't provide
- * a poll_table to them on the next loop iteration.
+ * a poll_table->_qproc to them on the next loop iteration.
*/
- pt = NULL;
+ pt->_qproc = NULL;
if (!count) {
count = wait->error;
if (signal_pending(current))
@@ -912,7 +908,7 @@ static long do_restart_poll(struct restart_block *restart_block)
}
SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
- long, timeout_msecs)
+ int, timeout_msecs)
{
struct timespec end_time, *to = NULL;
int ret;
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 4023d6be939..0cbd0494b79 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -6,13 +6,29 @@
*/
#include <linux/fs.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <asm/uaccess.h>
#include <asm/page.h>
+
+/*
+ * seq_files have a buffer which can may overflow. When this happens a larger
+ * buffer is reallocated and all the data will be printed again.
+ * The overflow state is true when m->count == m->size.
+ */
+static bool seq_overflow(struct seq_file *m)
+{
+ return m->count == m->size;
+}
+
+static void seq_set_overflow(struct seq_file *m)
+{
+ m->count = m->size;
+}
+
/**
* seq_open - initialize sequential file
* @file: file we initialize
@@ -92,7 +108,7 @@ static int traverse(struct seq_file *m, loff_t offset)
error = 0;
m->count = 0;
}
- if (m->count == m->size)
+ if (seq_overflow(m))
goto Eoverflow;
if (pos + m->count > offset) {
m->from = offset - pos;
@@ -140,9 +156,21 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
mutex_lock(&m->lock);
+ /*
+ * seq_file->op->..m_start/m_stop/m_next may do special actions
+ * or optimisations based on the file->f_version, so we want to
+ * pass the file->f_version to those methods.
+ *
+ * seq_file->version is just copy of f_version, and seq_file
+ * methods can treat it simply as file version.
+ * It is copied in first and copied out after all operations.
+ * It is convenient to have it as part of structure to avoid the
+ * need of passing another argument to all the seq_file methods.
+ */
+ m->version = file->f_version;
+
/* Don't assume *ppos is where we left it */
if (unlikely(*ppos != m->read_pos)) {
- m->read_pos = *ppos;
while ((err = traverse(m, *ppos)) == -EAGAIN)
;
if (err) {
@@ -152,21 +180,11 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
m->index = 0;
m->count = 0;
goto Done;
+ } else {
+ m->read_pos = *ppos;
}
}
- /*
- * seq_file->op->..m_start/m_stop/m_next may do special actions
- * or optimisations based on the file->f_version, so we want to
- * pass the file->f_version to those methods.
- *
- * seq_file->version is just copy of f_version, and seq_file
- * methods can treat it simply as file version.
- * It is copied in first and copied out after all operations.
- * It is convenient to have it as part of structure to avoid the
- * need of passing another argument to all the seq_file methods.
- */
- m->version = file->f_version;
/* grab buffer if we didn't have one */
if (!m->buf) {
m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL);
@@ -232,7 +250,7 @@ Fill:
break;
}
err = m->op->show(m, p);
- if (m->count == m->size || err) {
+ if (seq_overflow(m) || err) {
m->count = offs;
if (likely(err <= 0))
break;
@@ -359,7 +377,7 @@ int seq_escape(struct seq_file *m, const char *s, const char *esc)
*p++ = '0' + (c & 07);
continue;
}
- m->count = m->size;
+ seq_set_overflow(m);
return -1;
}
m->count = p - m->buf;
@@ -381,7 +399,7 @@ int seq_printf(struct seq_file *m, const char *f, ...)
return 0;
}
}
- m->count = m->size;
+ seq_set_overflow(m);
return -1;
}
EXPORT_SYMBOL(seq_printf);
@@ -510,7 +528,7 @@ int seq_bitmap(struct seq_file *m, const unsigned long *bits,
return 0;
}
}
- m->count = m->size;
+ seq_set_overflow(m);
return -1;
}
EXPORT_SYMBOL(seq_bitmap);
@@ -526,7 +544,7 @@ int seq_bitmap_list(struct seq_file *m, const unsigned long *bits,
return 0;
}
}
- m->count = m->size;
+ seq_set_overflow(m);
return -1;
}
EXPORT_SYMBOL(seq_bitmap_list);
@@ -637,11 +655,63 @@ int seq_puts(struct seq_file *m, const char *s)
m->count += len;
return 0;
}
- m->count = m->size;
+ seq_set_overflow(m);
return -1;
}
EXPORT_SYMBOL(seq_puts);
+/*
+ * A helper routine for putting decimal numbers without rich format of printf().
+ * only 'unsigned long long' is supported.
+ * This routine will put one byte delimiter + number into seq_file.
+ * This routine is very quick when you show lots of numbers.
+ * In usual cases, it will be better to use seq_printf(). It's easier to read.
+ */
+int seq_put_decimal_ull(struct seq_file *m, char delimiter,
+ unsigned long long num)
+{
+ int len;
+
+ if (m->count + 2 >= m->size) /* we'll write 2 bytes at least */
+ goto overflow;
+
+ if (delimiter)
+ m->buf[m->count++] = delimiter;
+
+ if (num < 10) {
+ m->buf[m->count++] = num + '0';
+ return 0;
+ }
+
+ len = num_to_str(m->buf + m->count, m->size - m->count, num);
+ if (!len)
+ goto overflow;
+ m->count += len;
+ return 0;
+overflow:
+ seq_set_overflow(m);
+ return -1;
+}
+EXPORT_SYMBOL(seq_put_decimal_ull);
+
+int seq_put_decimal_ll(struct seq_file *m, char delimiter,
+ long long num)
+{
+ if (num < 0) {
+ if (m->count + 3 >= m->size) {
+ seq_set_overflow(m);
+ return -1;
+ }
+ if (delimiter)
+ m->buf[m->count++] = delimiter;
+ num = -num;
+ delimiter = '-';
+ }
+ return seq_put_decimal_ull(m, delimiter, num);
+
+}
+EXPORT_SYMBOL(seq_put_decimal_ll);
+
/**
* seq_write - write arbitrary data to buffer
* @seq: seq_file identifying the buffer to which data should be written
@@ -657,7 +727,7 @@ int seq_write(struct seq_file *seq, const void *data, size_t len)
seq->count += len;
return 0;
}
- seq->count = seq->size;
+ seq_set_overflow(seq);
return -1;
}
EXPORT_SYMBOL(seq_write);
diff --git a/fs/signalfd.c b/fs/signalfd.c
index 492465b451d..7ae2a574cb2 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -30,6 +30,21 @@
#include <linux/signalfd.h>
#include <linux/syscalls.h>
+void signalfd_cleanup(struct sighand_struct *sighand)
+{
+ wait_queue_head_t *wqh = &sighand->signalfd_wqh;
+ /*
+ * The lockless check can race with remove_wait_queue() in progress,
+ * but in this case its caller should run under rcu_read_lock() and
+ * sighand_cachep is SLAB_DESTROY_BY_RCU, we can safely return.
+ */
+ if (likely(!waitqueue_active(wqh)))
+ return;
+
+ /* wait_queue_t->func(POLLFREE) should do remove_wait_queue() */
+ wake_up_poll(wqh, POLLHUP | POLLFREE);
+}
+
struct signalfd_ctx {
sigset_t sigmask;
};
diff --git a/fs/splice.c b/fs/splice.c
index 1ec0493266b..f8476841eb0 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -25,11 +25,12 @@
#include <linux/mm_inline.h>
#include <linux/swap.h>
#include <linux/writeback.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/syscalls.h>
#include <linux/uio.h>
#include <linux/security.h>
#include <linux/gfp.h>
+#include <linux/socket.h>
/*
* Attempt to steal a page from a pipe buffer. This should perhaps go into
@@ -690,7 +691,9 @@ static int pipe_to_sendpage(struct pipe_inode_info *pipe,
if (!likely(file->f_op && file->f_op->sendpage))
return -EINVAL;
- more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;
+ more = (sd->flags & SPLICE_F_MORE) ? MSG_MORE : 0;
+ if (sd->len < sd->total_len)
+ more |= MSG_SENDPAGE_NOTLAST;
return file->f_op->sendpage(file, buf->page, buf->offset,
sd->len, &pos, more);
}
@@ -737,15 +740,12 @@ int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
goto out;
if (buf->page != page) {
- /*
- * Careful, ->map() uses KM_USER0!
- */
char *src = buf->ops->map(pipe, buf, 1);
- char *dst = kmap_atomic(page, KM_USER1);
+ char *dst = kmap_atomic(page);
memcpy(dst + offset, src + buf->offset, this_len);
flush_dcache_page(page);
- kunmap_atomic(dst, KM_USER1);
+ kunmap_atomic(dst);
buf->ops->unmap(pipe, buf, src);
}
ret = pagecache_write_end(file, mapping, sd->pos, this_len, this_len,
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index ed0eb2a921f..fb50652e4e1 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -83,7 +83,8 @@ static struct buffer_head *get_block_length(struct super_block *sb,
* filesystem), otherwise the length is obtained from the first two bytes of
* the metadata block. A bit in the length field indicates if the block
* is stored uncompressed in the filesystem (usually because compression
- * generated a larger block - this does occasionally happen with zlib).
+ * generated a larger block - this does occasionally happen with compression
+ * algorithms).
*/
int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
int length, u64 *next_index, int srclength, int pages)
diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c
index f744be98cd5..af0b7380259 100644
--- a/fs/squashfs/cache.c
+++ b/fs/squashfs/cache.c
@@ -70,11 +70,15 @@ struct squashfs_cache_entry *squashfs_cache_get(struct super_block *sb,
spin_lock(&cache->lock);
while (1) {
- for (i = 0; i < cache->entries; i++)
- if (cache->entry[i].block == block)
+ for (i = cache->curr_blk, n = 0; n < cache->entries; n++) {
+ if (cache->entry[i].block == block) {
+ cache->curr_blk = i;
break;
+ }
+ i = (i + 1) % cache->entries;
+ }
- if (i == cache->entries) {
+ if (n == cache->entries) {
/*
* Block not in cache, if all cache entries are used
* go to sleep waiting for one to become available.
@@ -245,6 +249,7 @@ struct squashfs_cache *squashfs_cache_init(char *name, int entries,
goto cleanup;
}
+ cache->curr_blk = 0;
cache->next_blk = 0;
cache->unused = entries;
cache->entries = entries;
@@ -332,17 +337,20 @@ int squashfs_read_metadata(struct super_block *sb, void *buffer,
u64 *block, int *offset, int length)
{
struct squashfs_sb_info *msblk = sb->s_fs_info;
- int bytes, copied = length;
+ int bytes, res = length;
struct squashfs_cache_entry *entry;
TRACE("Entered squashfs_read_metadata [%llx:%x]\n", *block, *offset);
while (length) {
entry = squashfs_cache_get(sb, msblk->block_cache, *block, 0);
- if (entry->error)
- return entry->error;
- else if (*offset >= entry->length)
- return -EIO;
+ if (entry->error) {
+ res = entry->error;
+ goto error;
+ } else if (*offset >= entry->length) {
+ res = -EIO;
+ goto error;
+ }
bytes = squashfs_copy_data(buffer, entry, *offset, length);
if (buffer)
@@ -358,7 +366,11 @@ int squashfs_read_metadata(struct super_block *sb, void *buffer,
squashfs_cache_put(entry);
}
- return copied;
+ return res;
+
+error:
+ squashfs_cache_put(entry);
+ return res;
}
diff --git a/fs/squashfs/dir.c b/fs/squashfs/dir.c
index 9dfe2ce0fb7..b381305c9a4 100644
--- a/fs/squashfs/dir.c
+++ b/fs/squashfs/dir.c
@@ -64,7 +64,7 @@ static int get_dir_index_using_offset(struct super_block *sb,
* is offset by 3 because we invent "." and ".." entries which are
* not actually stored in the directory.
*/
- if (f_pos < 3)
+ if (f_pos <= 3)
return f_pos;
f_pos -= 3;
@@ -105,7 +105,7 @@ static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir)
struct inode *inode = file->f_dentry->d_inode;
struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
u64 block = squashfs_i(inode)->start + msblk->directory_table;
- int offset = squashfs_i(inode)->offset, length = 0, dir_count, size,
+ int offset = squashfs_i(inode)->offset, length, dir_count, size,
type, err;
unsigned int inode_number;
struct squashfs_dir_header dirh;
@@ -173,8 +173,7 @@ static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir)
dir_count = le32_to_cpu(dirh.count) + 1;
- /* dir_count should never be larger than 256 */
- if (dir_count > 256)
+ if (dir_count > SQUASHFS_DIR_COUNT)
goto failed_read;
while (dir_count--) {
diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c
index 38bb1c64055..8ca62c28fe1 100644
--- a/fs/squashfs/file.c
+++ b/fs/squashfs/file.c
@@ -464,10 +464,10 @@ static int squashfs_readpage(struct file *file, struct page *page)
if (PageUptodate(push_page))
goto skip_page;
- pageaddr = kmap_atomic(push_page, KM_USER0);
+ pageaddr = kmap_atomic(push_page);
squashfs_copy_data(pageaddr, buffer, offset, avail);
memset(pageaddr + avail, 0, PAGE_CACHE_SIZE - avail);
- kunmap_atomic(pageaddr, KM_USER0);
+ kunmap_atomic(pageaddr);
flush_dcache_page(push_page);
SetPageUptodate(push_page);
skip_page:
@@ -484,9 +484,9 @@ skip_page:
error_out:
SetPageError(page);
out:
- pageaddr = kmap_atomic(page, KM_USER0);
+ pageaddr = kmap_atomic(page);
memset(pageaddr, 0, PAGE_CACHE_SIZE);
- kunmap_atomic(pageaddr, KM_USER0);
+ kunmap_atomic(pageaddr);
flush_dcache_page(page);
if (!PageError(page))
SetPageUptodate(page);
diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c
index fd7b3b3bda1..81afbccfa84 100644
--- a/fs/squashfs/inode.c
+++ b/fs/squashfs/inode.c
@@ -208,8 +208,8 @@ int squashfs_read_inode(struct inode *inode, long long ino)
inode->i_op = &squashfs_inode_ops;
inode->i_fop = &generic_ro_fops;
inode->i_mode |= S_IFREG;
- inode->i_blocks = ((inode->i_size -
- le64_to_cpu(sqsh_ino->sparse) - 1) >> 9) + 1;
+ inode->i_blocks = (inode->i_size -
+ le64_to_cpu(sqsh_ino->sparse) + 511) >> 9;
squashfs_i(inode)->fragment_block = frag_blk;
squashfs_i(inode)->fragment_size = frag_size;
diff --git a/fs/squashfs/namei.c b/fs/squashfs/namei.c
index 0682b38d7e3..abcc58f3c15 100644
--- a/fs/squashfs/namei.c
+++ b/fs/squashfs/namei.c
@@ -144,7 +144,7 @@ static struct dentry *squashfs_lookup(struct inode *dir, struct dentry *dentry,
struct squashfs_dir_entry *dire;
u64 block = squashfs_i(dir)->start + msblk->directory_table;
int offset = squashfs_i(dir)->offset;
- int err, length = 0, dir_count, size;
+ int err, length, dir_count, size;
TRACE("Entered squashfs_lookup [%llx:%x]\n", block, offset);
@@ -177,8 +177,7 @@ static struct dentry *squashfs_lookup(struct inode *dir, struct dentry *dentry,
dir_count = le32_to_cpu(dirh.count) + 1;
- /* dir_count should never be larger than 256 */
- if (dir_count > 256)
+ if (dir_count > SQUASHFS_DIR_COUNT)
goto data_error;
while (dir_count--) {
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
index e8e14645de9..9e2349d07cb 100644
--- a/fs/squashfs/squashfs_fs.h
+++ b/fs/squashfs/squashfs_fs.h
@@ -30,11 +30,6 @@
/* size of metadata (inode and directory) blocks */
#define SQUASHFS_METADATA_SIZE 8192
-#define SQUASHFS_METADATA_LOG 13
-
-/* default size of data blocks */
-#define SQUASHFS_FILE_SIZE 131072
-#define SQUASHFS_FILE_LOG 17
/* default size of block device I/O */
#ifdef CONFIG_SQUASHFS_4K_DEVBLK_SIZE
@@ -46,12 +41,12 @@
#define SQUASHFS_FILE_MAX_SIZE 1048576
#define SQUASHFS_FILE_MAX_LOG 20
-/* Max number of uids and gids */
-#define SQUASHFS_IDS 65536
-
/* Max length of filename (not 255) */
#define SQUASHFS_NAME_LEN 256
+/* Max value for directory header count*/
+#define SQUASHFS_DIR_COUNT 256
+
#define SQUASHFS_INVALID_FRAG (0xffffffffU)
#define SQUASHFS_INVALID_XATTR (0xffffffffU)
#define SQUASHFS_INVALID_BLK (-1LL)
@@ -142,9 +137,6 @@
#define SQUASHFS_MKINODE(A, B) ((long long)(((long long) (A)\
<< 16) + (B)))
-/* Translate between VFS mode and squashfs mode */
-#define SQUASHFS_MODE(A) ((A) & 0xfff)
-
/* fragment and fragment table defines */
#define SQUASHFS_FRAGMENT_BYTES(A) \
((A) * sizeof(struct squashfs_fragment_entry))
@@ -215,11 +207,6 @@
/* cached data constants for filesystem */
#define SQUASHFS_CACHED_BLKS 8
-#define SQUASHFS_MAX_FILE_SIZE_LOG 64
-
-#define SQUASHFS_MAX_FILE_SIZE (1LL << \
- (SQUASHFS_MAX_FILE_SIZE_LOG - 2))
-
/* meta index cache */
#define SQUASHFS_META_INDEXES (SQUASHFS_METADATA_SIZE / sizeof(unsigned int))
#define SQUASHFS_META_ENTRIES 127
diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h
index 651f0b31d29..52934a22f29 100644
--- a/fs/squashfs/squashfs_fs_sb.h
+++ b/fs/squashfs/squashfs_fs_sb.h
@@ -28,6 +28,7 @@
struct squashfs_cache {
char *name;
int entries;
+ int curr_blk;
int next_blk;
int num_waiters;
int unused;
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index d0858c2d9a4..29cd014ed3a 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -158,10 +158,15 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
goto failed_mount;
}
+ /* Check block log for sanity */
msblk->block_log = le16_to_cpu(sblk->block_log);
if (msblk->block_log > SQUASHFS_FILE_MAX_LOG)
goto failed_mount;
+ /* Check that block_size and block_log match */
+ if (msblk->block_size != (1 << msblk->block_log))
+ goto failed_mount;
+
/* Check the root inode for sanity */
root_inode = le64_to_cpu(sblk->root_inode);
if (SQUASHFS_INODE_OFFSET(root_inode) > SQUASHFS_METADATA_SIZE)
@@ -290,7 +295,7 @@ handle_fragments:
check_directory_table:
/* Sanity check directory_table */
- if (msblk->directory_table >= next_table) {
+ if (msblk->directory_table > next_table) {
err = -EINVAL;
goto failed_mount;
}
@@ -316,11 +321,10 @@ check_directory_table:
}
insert_inode_hash(root);
- sb->s_root = d_alloc_root(root);
+ sb->s_root = d_make_root(root);
if (sb->s_root == NULL) {
ERROR("Root inode create failed\n");
err = -ENOMEM;
- iput(root);
goto failed_mount;
}
diff --git a/fs/squashfs/symlink.c b/fs/squashfs/symlink.c
index 1191817264c..12806dffb34 100644
--- a/fs/squashfs/symlink.c
+++ b/fs/squashfs/symlink.c
@@ -90,14 +90,14 @@ static int squashfs_symlink_readpage(struct file *file, struct page *page)
goto error_out;
}
- pageaddr = kmap_atomic(page, KM_USER0);
+ pageaddr = kmap_atomic(page);
copied = squashfs_copy_data(pageaddr + bytes, entry, offset,
length - bytes);
if (copied == length - bytes)
memset(pageaddr + length, 0, PAGE_CACHE_SIZE - length);
else
block = entry->next_index;
- kunmap_atomic(pageaddr, KM_USER0);
+ kunmap_atomic(pageaddr);
squashfs_cache_put(entry);
}
diff --git a/fs/stack.c b/fs/stack.c
index 9c11519245a..5b5388250e2 100644
--- a/fs/stack.c
+++ b/fs/stack.c
@@ -1,4 +1,4 @@
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/fs.h>
#include <linux/fs_stack.h>
diff --git a/fs/stat.c b/fs/stat.c
index 8806b8997d2..c733dc5753a 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -4,7 +4,7 @@
* Copyright (C) 1991, 1992 Linus Torvalds
*/
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/mm.h>
#include <linux/errno.h>
#include <linux/file.h>
@@ -307,7 +307,7 @@ SYSCALL_DEFINE4(readlinkat, int, dfd, const char __user *, pathname,
if (inode->i_op->readlink) {
error = security_inode_readlink(path.dentry);
if (!error) {
- touch_atime(path.mnt, path.dentry);
+ touch_atime(&path);
error = inode->i_op->readlink(path.dentry,
buf, bufsiz);
}
diff --git a/fs/statfs.c b/fs/statfs.c
index 2aa6a22e0be..43e6b6fe4e8 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -1,5 +1,5 @@
#include <linux/syscalls.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mount.h>
diff --git a/fs/super.c b/fs/super.c
index de41e1e46f0..cf001775617 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -20,7 +20,7 @@
* Heavily rewritten for 'one fs - one tree' dcache architecture. AV, Mar 2000
*/
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/slab.h>
#include <linux/acct.h>
#include <linux/blkdev.h>
@@ -32,6 +32,7 @@
#include <linux/backing-dev.h>
#include <linux/rculist_bl.h>
#include <linux/cleancache.h>
+#include <linux/fsnotify.h>
#include "internal.h"
@@ -250,7 +251,7 @@ void deactivate_locked_super(struct super_block *s)
{
struct file_system_type *fs = s->s_type;
if (atomic_dec_and_test(&s->s_active)) {
- cleancache_flush_fs(s);
+ cleancache_invalidate_fs(s);
fs->kill_sb(s);
/* caches are now gone, we can safely kill the shrinker now */
@@ -634,6 +635,28 @@ rescan:
EXPORT_SYMBOL(get_super);
/**
+ * get_super_thawed - get thawed superblock of a device
+ * @bdev: device to get the superblock for
+ *
+ * Scans the superblock list and finds the superblock of the file system
+ * mounted on the device. The superblock is returned once it is thawed
+ * (or immediately if it was not frozen). %NULL is returned if no match
+ * is found.
+ */
+struct super_block *get_super_thawed(struct block_device *bdev)
+{
+ while (1) {
+ struct super_block *s = get_super(bdev);
+ if (!s || s->s_frozen == SB_UNFROZEN)
+ return s;
+ up_read(&s->s_umount);
+ vfs_check_frozen(s, SB_FREEZE_WRITE);
+ put_super(s);
+ }
+}
+EXPORT_SYMBOL(get_super_thawed);
+
+/**
* get_active_super - get an active reference to the superblock of a device
* @bdev: device to get the superblock for
*
@@ -1186,6 +1209,8 @@ int freeze_super(struct super_block *sb)
printk(KERN_ERR
"VFS:Filesystem freeze failed\n");
sb->s_frozen = SB_UNFROZEN;
+ smp_wmb();
+ wake_up(&sb->s_wait_unfrozen);
deactivate_locked_super(sb);
return ret;
}
diff --git a/fs/sync.c b/fs/sync.c
index f3501ef3923..0e8db939d96 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -6,7 +6,7 @@
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/slab.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/namei.h>
#include <linux/sched.h>
#include <linux/writeback.h>
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 7fdf6a7b743..35a36d39fa2 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -22,76 +22,103 @@
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/security.h>
+#include <linux/hash.h>
#include "sysfs.h"
DEFINE_MUTEX(sysfs_mutex);
DEFINE_SPINLOCK(sysfs_assoc_lock);
+#define to_sysfs_dirent(X) rb_entry((X), struct sysfs_dirent, s_rb);
+
static DEFINE_SPINLOCK(sysfs_ino_lock);
static DEFINE_IDA(sysfs_ino_ida);
/**
- * sysfs_link_sibling - link sysfs_dirent into sibling list
+ * sysfs_name_hash
+ * @ns: Namespace tag to hash
+ * @name: Null terminated string to hash
+ *
+ * Returns 31 bit hash of ns + name (so it fits in an off_t )
+ */
+static unsigned int sysfs_name_hash(const void *ns, const char *name)
+{
+ unsigned long hash = init_name_hash();
+ unsigned int len = strlen(name);
+ while (len--)
+ hash = partial_name_hash(*name++, hash);
+ hash = ( end_name_hash(hash) ^ hash_ptr( (void *)ns, 31 ) );
+ hash &= 0x7fffffffU;
+ /* Reserve hash numbers 0, 1 and INT_MAX for magic directory entries */
+ if (hash < 1)
+ hash += 2;
+ if (hash >= INT_MAX)
+ hash = INT_MAX - 1;
+ return hash;
+}
+
+static int sysfs_name_compare(unsigned int hash, const void *ns,
+ const char *name, const struct sysfs_dirent *sd)
+{
+ if (hash != sd->s_hash)
+ return hash - sd->s_hash;
+ if (ns != sd->s_ns)
+ return ns - sd->s_ns;
+ return strcmp(name, sd->s_name);
+}
+
+static int sysfs_sd_compare(const struct sysfs_dirent *left,
+ const struct sysfs_dirent *right)
+{
+ return sysfs_name_compare(left->s_hash, left->s_ns, left->s_name,
+ right);
+}
+
+/**
+ * sysfs_link_subling - link sysfs_dirent into sibling rbtree
* @sd: sysfs_dirent of interest
*
- * Link @sd into its sibling list which starts from
+ * Link @sd into its sibling rbtree which starts from
* sd->s_parent->s_dir.children.
*
* Locking:
* mutex_lock(sysfs_mutex)
+ *
+ * RETURNS:
+ * 0 on susccess -EEXIST on failure.
*/
-static void sysfs_link_sibling(struct sysfs_dirent *sd)
+static int sysfs_link_sibling(struct sysfs_dirent *sd)
{
- struct sysfs_dirent *parent_sd = sd->s_parent;
-
- struct rb_node **p;
- struct rb_node *parent;
+ struct rb_node **node = &sd->s_parent->s_dir.children.rb_node;
+ struct rb_node *parent = NULL;
if (sysfs_type(sd) == SYSFS_DIR)
- parent_sd->s_dir.subdirs++;
-
- p = &parent_sd->s_dir.inode_tree.rb_node;
- parent = NULL;
- while (*p) {
- parent = *p;
-#define node rb_entry(parent, struct sysfs_dirent, inode_node)
- if (sd->s_ino < node->s_ino) {
- p = &node->inode_node.rb_left;
- } else if (sd->s_ino > node->s_ino) {
- p = &node->inode_node.rb_right;
- } else {
- printk(KERN_CRIT "sysfs: inserting duplicate inode '%lx'\n",
- (unsigned long) sd->s_ino);
- BUG();
- }
-#undef node
- }
- rb_link_node(&sd->inode_node, parent, p);
- rb_insert_color(&sd->inode_node, &parent_sd->s_dir.inode_tree);
-
- p = &parent_sd->s_dir.name_tree.rb_node;
- parent = NULL;
- while (*p) {
- int c;
- parent = *p;
-#define node rb_entry(parent, struct sysfs_dirent, name_node)
- c = strcmp(sd->s_name, node->s_name);
- if (c < 0) {
- p = &node->name_node.rb_left;
- } else {
- p = &node->name_node.rb_right;
- }
-#undef node
+ sd->s_parent->s_dir.subdirs++;
+
+ while (*node) {
+ struct sysfs_dirent *pos;
+ int result;
+
+ pos = to_sysfs_dirent(*node);
+ parent = *node;
+ result = sysfs_sd_compare(sd, pos);
+ if (result < 0)
+ node = &pos->s_rb.rb_left;
+ else if (result > 0)
+ node = &pos->s_rb.rb_right;
+ else
+ return -EEXIST;
}
- rb_link_node(&sd->name_node, parent, p);
- rb_insert_color(&sd->name_node, &parent_sd->s_dir.name_tree);
+ /* add new node and rebalance the tree */
+ rb_link_node(&sd->s_rb, parent, node);
+ rb_insert_color(&sd->s_rb, &sd->s_parent->s_dir.children);
+ return 0;
}
/**
- * sysfs_unlink_sibling - unlink sysfs_dirent from sibling list
+ * sysfs_unlink_sibling - unlink sysfs_dirent from sibling rbtree
* @sd: sysfs_dirent of interest
*
- * Unlink @sd from its sibling list which starts from
+ * Unlink @sd from its sibling rbtree which starts from
* sd->s_parent->s_dir.children.
*
* Locking:
@@ -102,8 +129,7 @@ static void sysfs_unlink_sibling(struct sysfs_dirent *sd)
if (sysfs_type(sd) == SYSFS_DIR)
sd->s_parent->s_dir.subdirs--;
- rb_erase(&sd->inode_node, &sd->s_parent->s_dir.inode_tree);
- rb_erase(&sd->name_node, &sd->s_parent->s_dir.name_tree);
+ rb_erase(&sd->s_rb, &sd->s_parent->s_dir.children);
}
/**
@@ -198,7 +224,7 @@ static void sysfs_deactivate(struct sysfs_dirent *sd)
rwsem_release(&sd->dep_map, 1, _RET_IP_);
}
-static int sysfs_alloc_ino(ino_t *pino)
+static int sysfs_alloc_ino(unsigned int *pino)
{
int ino, rc;
@@ -217,7 +243,7 @@ static int sysfs_alloc_ino(ino_t *pino)
return rc;
}
-static void sysfs_free_ino(ino_t ino)
+static void sysfs_free_ino(unsigned int ino)
{
spin_lock(&sysfs_ino_lock);
ida_remove(&sysfs_ino_ida, ino);
@@ -402,6 +428,7 @@ void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt,
int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
{
struct sysfs_inode_attrs *ps_iattr;
+ int ret;
if (!!sysfs_ns_type(acxt->parent_sd) != !!sd->s_ns) {
WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n",
@@ -410,12 +437,12 @@ int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
return -EINVAL;
}
- if (sysfs_find_dirent(acxt->parent_sd, sd->s_ns, sd->s_name))
- return -EEXIST;
-
+ sd->s_hash = sysfs_name_hash(sd->s_ns, sd->s_name);
sd->s_parent = sysfs_get(acxt->parent_sd);
- sysfs_link_sibling(sd);
+ ret = sysfs_link_sibling(sd);
+ if (ret)
+ return ret;
/* Update timestamps on the parent */
ps_iattr = acxt->parent_sd->s_iattr;
@@ -565,8 +592,8 @@ struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
const void *ns,
const unsigned char *name)
{
- struct rb_node *p = parent_sd->s_dir.name_tree.rb_node;
- struct sysfs_dirent *found = NULL;
+ struct rb_node *node = parent_sd->s_dir.children.rb_node;
+ unsigned int hash;
if (!!sysfs_ns_type(parent_sd) != !!ns) {
WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n",
@@ -575,33 +602,21 @@ struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
return NULL;
}
- while (p) {
- int c;
-#define node rb_entry(p, struct sysfs_dirent, name_node)
- c = strcmp(name, node->s_name);
- if (c < 0) {
- p = node->name_node.rb_left;
- } else if (c > 0) {
- p = node->name_node.rb_right;
- } else {
- found = node;
- p = node->name_node.rb_left;
- }
-#undef node
- }
-
- if (found) {
- while (found->s_ns != ns) {
- p = rb_next(&found->name_node);
- if (!p)
- return NULL;
- found = rb_entry(p, struct sysfs_dirent, name_node);
- if (strcmp(name, found->s_name))
- return NULL;
- }
+ hash = sysfs_name_hash(ns, name);
+ while (node) {
+ struct sysfs_dirent *sd;
+ int result;
+
+ sd = to_sysfs_dirent(node);
+ result = sysfs_name_compare(hash, ns, name, sd);
+ if (result < 0)
+ node = node->rb_left;
+ else if (result > 0)
+ node = node->rb_right;
+ else
+ return sd;
}
-
- return found;
+ return NULL;
}
/**
@@ -714,6 +729,9 @@ int sysfs_create_dir(struct kobject * kobj)
else
parent_sd = &sysfs_root;
+ if (!parent_sd)
+ return -ENOENT;
+
if (sysfs_ns_type(parent_sd))
ns = kobj->ktype->namespace(kobj);
type = sysfs_read_ns_type(kobj);
@@ -804,9 +822,9 @@ static void __sysfs_remove_dir(struct sysfs_dirent *dir_sd)
pr_debug("sysfs %s: removing dir\n", dir_sd->s_name);
sysfs_addrm_start(&acxt, dir_sd);
- pos = rb_first(&dir_sd->s_dir.inode_tree);
+ pos = rb_first(&dir_sd->s_dir.children);
while (pos) {
- struct sysfs_dirent *sd = rb_entry(pos, struct sysfs_dirent, inode_node);
+ struct sysfs_dirent *sd = to_sysfs_dirent(pos);
pos = rb_next(pos);
if (sysfs_type(sd) != SYSFS_DIR)
sysfs_remove_one(&acxt, sd);
@@ -870,6 +888,7 @@ int sysfs_rename(struct sysfs_dirent *sd,
sysfs_get(new_parent_sd);
sysfs_put(sd->s_parent);
sd->s_ns = new_ns;
+ sd->s_hash = sysfs_name_hash(sd->s_ns, sd->s_name);
sd->s_parent = new_parent_sd;
sysfs_link_sibling(sd);
@@ -919,38 +938,36 @@ static int sysfs_dir_release(struct inode *inode, struct file *filp)
}
static struct sysfs_dirent *sysfs_dir_pos(const void *ns,
- struct sysfs_dirent *parent_sd, ino_t ino, struct sysfs_dirent *pos)
+ struct sysfs_dirent *parent_sd, loff_t hash, struct sysfs_dirent *pos)
{
if (pos) {
int valid = !(pos->s_flags & SYSFS_FLAG_REMOVED) &&
pos->s_parent == parent_sd &&
- ino == pos->s_ino;
+ hash == pos->s_hash;
sysfs_put(pos);
if (!valid)
pos = NULL;
}
- if (!pos && (ino > 1) && (ino < INT_MAX)) {
- struct rb_node *p = parent_sd->s_dir.inode_tree.rb_node;
- while (p) {
-#define node rb_entry(p, struct sysfs_dirent, inode_node)
- if (ino < node->s_ino) {
- pos = node;
- p = node->inode_node.rb_left;
- } else if (ino > node->s_ino) {
- p = node->inode_node.rb_right;
- } else {
- pos = node;
+ if (!pos && (hash > 1) && (hash < INT_MAX)) {
+ struct rb_node *node = parent_sd->s_dir.children.rb_node;
+ while (node) {
+ pos = to_sysfs_dirent(node);
+
+ if (hash < pos->s_hash)
+ node = node->rb_left;
+ else if (hash > pos->s_hash)
+ node = node->rb_right;
+ else
break;
- }
-#undef node
}
}
+ /* Skip over entries in the wrong namespace */
while (pos && pos->s_ns != ns) {
- struct rb_node *p = rb_next(&pos->inode_node);
- if (!p)
+ struct rb_node *node = rb_next(&pos->s_rb);
+ if (!node)
pos = NULL;
else
- pos = rb_entry(p, struct sysfs_dirent, inode_node);
+ pos = to_sysfs_dirent(node);
}
return pos;
}
@@ -960,11 +977,11 @@ static struct sysfs_dirent *sysfs_dir_next_pos(const void *ns,
{
pos = sysfs_dir_pos(ns, parent_sd, ino, pos);
if (pos) do {
- struct rb_node *p = rb_next(&pos->inode_node);
- if (!p)
+ struct rb_node *node = rb_next(&pos->s_rb);
+ if (!node)
pos = NULL;
else
- pos = rb_entry(p, struct sysfs_dirent, inode_node);
+ pos = to_sysfs_dirent(node);
} while (pos && pos->s_ns != ns);
return pos;
}
@@ -1006,7 +1023,7 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
len = strlen(name);
ino = pos->s_ino;
type = dt_type(pos);
- filp->f_pos = ino;
+ filp->f_pos = pos->s_hash;
filp->private_data = sysfs_get(pos);
mutex_unlock(&sysfs_mutex);
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 62f4fb37789..00012e31829 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -493,6 +493,12 @@ int sysfs_attr_ns(struct kobject *kobj, const struct attribute *attr,
const void *ns = NULL;
int err;
+ if (!dir_sd) {
+ WARN(1, KERN_ERR "sysfs: kobject %s without dirent\n",
+ kobject_name(kobj));
+ return -ENOENT;
+ }
+
err = 0;
if (!sysfs_ns_type(dir_sd))
goto out;
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index dd1701caecc..2df555c66d5 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -67,7 +67,11 @@ static int internal_create_group(struct kobject *kobj, int update,
/* Updates may happen before the object has been instantiated */
if (unlikely(update && !kobj->sd))
return -EINVAL;
-
+ if (!grp->attrs) {
+ WARN(1, "sysfs: attrs not set by subsystem for group: %s/%s\n",
+ kobj->name, grp->name ? "" : grp->name);
+ return -EINVAL;
+ }
if (grp->name) {
error = sysfs_create_subdir(kobj, grp->name, &sd);
if (error)
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index 4a802b4a905..feb2d69396c 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -136,12 +136,13 @@ static int sysfs_sd_setsecdata(struct sysfs_dirent *sd, void **secdata, u32 *sec
void *old_secdata;
size_t old_secdata_len;
- iattrs = sd->s_iattr;
- if (!iattrs)
- iattrs = sysfs_init_inode_attrs(sd);
- if (!iattrs)
- return -ENOMEM;
+ if (!sd->s_iattr) {
+ sd->s_iattr = sysfs_init_inode_attrs(sd);
+ if (!sd->s_iattr)
+ return -ENOMEM;
+ }
+ iattrs = sd->s_iattr;
old_secdata = iattrs->ia_secdata;
old_secdata_len = iattrs->ia_secdata_len;
@@ -318,8 +319,11 @@ int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, const cha
struct sysfs_addrm_cxt acxt;
struct sysfs_dirent *sd;
- if (!dir_sd)
+ if (!dir_sd) {
+ WARN(1, KERN_WARNING "sysfs: can not remove '%s', no directory\n",
+ name);
return -ENOENT;
+ }
sysfs_addrm_start(&acxt, dir_sd);
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index e34f0d99ea4..52c3bdb66a8 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -36,7 +36,7 @@ struct sysfs_dirent sysfs_root = {
.s_name = "",
.s_count = ATOMIC_INIT(1),
.s_flags = SYSFS_DIR | (KOBJ_NS_TYPE_NONE << SYSFS_NS_TYPE_SHIFT),
- .s_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO,
+ .s_mode = S_IFDIR | S_IRUGO | S_IXUGO,
.s_ino = 1,
};
@@ -61,10 +61,9 @@ static int sysfs_fill_super(struct super_block *sb, void *data, int silent)
}
/* instantiate and link root dentry */
- root = d_alloc_root(inode);
+ root = d_make_root(inode);
if (!root) {
pr_debug("%s: could not get root dentry!\n",__func__);
- iput(inode);
return -ENOMEM;
}
root->d_fsdata = &sysfs_root;
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 7484a36ee67..661a9639570 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -20,9 +20,8 @@ struct sysfs_elem_dir {
struct kobject *kobj;
unsigned long subdirs;
-
- struct rb_root inode_tree;
- struct rb_root name_tree;
+ /* children rbtree starts here and goes through sd->s_rb */
+ struct rb_root children;
};
struct sysfs_elem_symlink {
@@ -62,8 +61,7 @@ struct sysfs_dirent {
struct sysfs_dirent *s_parent;
const char *s_name;
- struct rb_node inode_node;
- struct rb_node name_node;
+ struct rb_node s_rb;
union {
struct completion *completion;
@@ -71,6 +69,7 @@ struct sysfs_dirent {
} u;
const void *s_ns; /* namespace tag */
+ unsigned int s_hash; /* ns + name hash */
union {
struct sysfs_elem_dir s_dir;
struct sysfs_elem_symlink s_symlink;
@@ -78,9 +77,9 @@ struct sysfs_dirent {
struct sysfs_elem_bin_attr s_bin_attr;
};
- unsigned int s_flags;
+ unsigned short s_flags;
umode_t s_mode;
- ino_t s_ino;
+ unsigned int s_ino;
struct sysfs_inode_attrs *s_iattr;
};
@@ -95,11 +94,11 @@ struct sysfs_dirent {
#define SYSFS_ACTIVE_REF (SYSFS_KOBJ_ATTR | SYSFS_KOBJ_BIN_ATTR)
/* identify any namespace tag on sysfs_dirents */
-#define SYSFS_NS_TYPE_MASK 0xff00
+#define SYSFS_NS_TYPE_MASK 0xf00
#define SYSFS_NS_TYPE_SHIFT 8
#define SYSFS_FLAG_MASK ~(SYSFS_NS_TYPE_MASK|SYSFS_TYPE_MASK)
-#define SYSFS_FLAG_REMOVED 0x020000
+#define SYSFS_FLAG_REMOVED 0x02000
static inline unsigned int sysfs_type(struct sysfs_dirent *sd)
{
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index b217797e621..d7466e29361 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -121,9 +121,6 @@ static int sysv_link(struct dentry * old_dentry, struct inode * dir,
{
struct inode *inode = old_dentry->d_inode;
- if (inode->i_nlink >= SYSV_SB(inode->i_sb)->s_link_max)
- return -EMLINK;
-
inode->i_ctime = CURRENT_TIME_SEC;
inode_inc_link_count(inode);
ihold(inode);
@@ -134,10 +131,8 @@ static int sysv_link(struct dentry * old_dentry, struct inode * dir,
static int sysv_mkdir(struct inode * dir, struct dentry *dentry, umode_t mode)
{
struct inode * inode;
- int err = -EMLINK;
+ int err;
- if (dir->i_nlink >= SYSV_SB(dir->i_sb)->s_link_max)
- goto out;
inode_inc_link_count(dir);
inode = sysv_new_inode(dir, S_IFDIR|mode);
@@ -251,11 +246,6 @@ static int sysv_rename(struct inode * old_dir, struct dentry * old_dentry,
drop_nlink(new_inode);
inode_dec_link_count(new_inode);
} else {
- if (dir_de) {
- err = -EMLINK;
- if (new_dir->i_nlink >= SYSV_SB(new_dir->i_sb)->s_link_max)
- goto out_dir;
- }
err = sysv_add_link(new_dentry, old_inode);
if (err)
goto out_dir;
diff --git a/fs/sysv/super.c b/fs/sysv/super.c
index f60c196913e..7491c33b646 100644
--- a/fs/sysv/super.c
+++ b/fs/sysv/super.c
@@ -44,7 +44,7 @@ enum {
JAN_1_1980 = (10*365 + 2) * 24 * 60 * 60
};
-static void detected_xenix(struct sysv_sb_info *sbi)
+static void detected_xenix(struct sysv_sb_info *sbi, unsigned *max_links)
{
struct buffer_head *bh1 = sbi->s_bh1;
struct buffer_head *bh2 = sbi->s_bh2;
@@ -59,7 +59,7 @@ static void detected_xenix(struct sysv_sb_info *sbi)
sbd2 = (struct xenix_super_block *) (bh2->b_data - 512);
}
- sbi->s_link_max = XENIX_LINK_MAX;
+ *max_links = XENIX_LINK_MAX;
sbi->s_fic_size = XENIX_NICINOD;
sbi->s_flc_size = XENIX_NICFREE;
sbi->s_sbd1 = (char *)sbd1;
@@ -75,7 +75,7 @@ static void detected_xenix(struct sysv_sb_info *sbi)
sbi->s_nzones = fs32_to_cpu(sbi, sbd1->s_fsize);
}
-static void detected_sysv4(struct sysv_sb_info *sbi)
+static void detected_sysv4(struct sysv_sb_info *sbi, unsigned *max_links)
{
struct sysv4_super_block * sbd;
struct buffer_head *bh1 = sbi->s_bh1;
@@ -86,7 +86,7 @@ static void detected_sysv4(struct sysv_sb_info *sbi)
else
sbd = (struct sysv4_super_block *) bh2->b_data;
- sbi->s_link_max = SYSV_LINK_MAX;
+ *max_links = SYSV_LINK_MAX;
sbi->s_fic_size = SYSV_NICINOD;
sbi->s_flc_size = SYSV_NICFREE;
sbi->s_sbd1 = (char *)sbd;
@@ -103,7 +103,7 @@ static void detected_sysv4(struct sysv_sb_info *sbi)
sbi->s_nzones = fs32_to_cpu(sbi, sbd->s_fsize);
}
-static void detected_sysv2(struct sysv_sb_info *sbi)
+static void detected_sysv2(struct sysv_sb_info *sbi, unsigned *max_links)
{
struct sysv2_super_block *sbd;
struct buffer_head *bh1 = sbi->s_bh1;
@@ -114,7 +114,7 @@ static void detected_sysv2(struct sysv_sb_info *sbi)
else
sbd = (struct sysv2_super_block *) bh2->b_data;
- sbi->s_link_max = SYSV_LINK_MAX;
+ *max_links = SYSV_LINK_MAX;
sbi->s_fic_size = SYSV_NICINOD;
sbi->s_flc_size = SYSV_NICFREE;
sbi->s_sbd1 = (char *)sbd;
@@ -131,14 +131,14 @@ static void detected_sysv2(struct sysv_sb_info *sbi)
sbi->s_nzones = fs32_to_cpu(sbi, sbd->s_fsize);
}
-static void detected_coherent(struct sysv_sb_info *sbi)
+static void detected_coherent(struct sysv_sb_info *sbi, unsigned *max_links)
{
struct coh_super_block * sbd;
struct buffer_head *bh1 = sbi->s_bh1;
sbd = (struct coh_super_block *) bh1->b_data;
- sbi->s_link_max = COH_LINK_MAX;
+ *max_links = COH_LINK_MAX;
sbi->s_fic_size = COH_NICINOD;
sbi->s_flc_size = COH_NICFREE;
sbi->s_sbd1 = (char *)sbd;
@@ -154,12 +154,12 @@ static void detected_coherent(struct sysv_sb_info *sbi)
sbi->s_nzones = fs32_to_cpu(sbi, sbd->s_fsize);
}
-static void detected_v7(struct sysv_sb_info *sbi)
+static void detected_v7(struct sysv_sb_info *sbi, unsigned *max_links)
{
struct buffer_head *bh2 = sbi->s_bh2;
struct v7_super_block *sbd = (struct v7_super_block *)bh2->b_data;
- sbi->s_link_max = V7_LINK_MAX;
+ *max_links = V7_LINK_MAX;
sbi->s_fic_size = V7_NICINOD;
sbi->s_flc_size = V7_NICFREE;
sbi->s_sbd1 = (char *)sbd;
@@ -290,7 +290,7 @@ static char *flavour_names[] = {
[FSTYPE_AFS] = "AFS",
};
-static void (*flavour_setup[])(struct sysv_sb_info *) = {
+static void (*flavour_setup[])(struct sysv_sb_info *, unsigned *) = {
[FSTYPE_XENIX] = detected_xenix,
[FSTYPE_SYSV4] = detected_sysv4,
[FSTYPE_SYSV2] = detected_sysv2,
@@ -310,7 +310,7 @@ static int complete_read_super(struct super_block *sb, int silent, int size)
sbi->s_firstinodezone = 2;
- flavour_setup[sbi->s_type](sbi);
+ flavour_setup[sbi->s_type](sbi, &sb->s_max_links);
sbi->s_truncate = 1;
sbi->s_ndatazones = sbi->s_nzones - sbi->s_firstdatazone;
@@ -341,9 +341,8 @@ static int complete_read_super(struct super_block *sb, int silent, int size)
printk("SysV FS: get root inode failed\n");
return 0;
}
- sb->s_root = d_alloc_root(root_inode);
+ sb->s_root = d_make_root(root_inode);
if (!sb->s_root) {
- iput(root_inode);
printk("SysV FS: get root dentry failed\n");
return 0;
}
diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h
index 0e4b821c569..11b07672f6c 100644
--- a/fs/sysv/sysv.h
+++ b/fs/sysv/sysv.h
@@ -24,7 +24,6 @@ struct sysv_sb_info {
char s_bytesex; /* bytesex (le/be/pdp) */
char s_truncate; /* if 1: names > SYSV_NAMELEN chars are truncated */
/* if 0: they are disallowed (ENAMETOOLONG) */
- nlink_t s_link_max; /* max number of hard links to a file */
unsigned int s_inodes_per_block; /* number of inodes per block */
unsigned int s_inodes_per_block_1; /* inodes_per_block - 1 */
unsigned int s_inodes_per_block_bits; /* log2(inodes_per_block) */
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index b09ba2dd8b6..1934084e208 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -36,10 +36,7 @@
#ifdef CONFIG_UBIFS_FS_DEBUG
-DEFINE_SPINLOCK(dbg_lock);
-
-static char dbg_key_buf0[128];
-static char dbg_key_buf1[128];
+static DEFINE_SPINLOCK(dbg_lock);
static const char *get_key_fmt(int fmt)
{
@@ -103,8 +100,8 @@ static const char *get_dent_type(int type)
}
}
-static void sprintf_key(const struct ubifs_info *c, const union ubifs_key *key,
- char *buffer)
+const char *dbg_snprintf_key(const struct ubifs_info *c,
+ const union ubifs_key *key, char *buffer, int len)
{
char *p = buffer;
int type = key_type(c, key);
@@ -112,45 +109,34 @@ static void sprintf_key(const struct ubifs_info *c, const union ubifs_key *key,
if (c->key_fmt == UBIFS_SIMPLE_KEY_FMT) {
switch (type) {
case UBIFS_INO_KEY:
- sprintf(p, "(%lu, %s)", (unsigned long)key_inum(c, key),
- get_key_type(type));
+ len -= snprintf(p, len, "(%lu, %s)",
+ (unsigned long)key_inum(c, key),
+ get_key_type(type));
break;
case UBIFS_DENT_KEY:
case UBIFS_XENT_KEY:
- sprintf(p, "(%lu, %s, %#08x)",
- (unsigned long)key_inum(c, key),
- get_key_type(type), key_hash(c, key));
+ len -= snprintf(p, len, "(%lu, %s, %#08x)",
+ (unsigned long)key_inum(c, key),
+ get_key_type(type), key_hash(c, key));
break;
case UBIFS_DATA_KEY:
- sprintf(p, "(%lu, %s, %u)",
- (unsigned long)key_inum(c, key),
- get_key_type(type), key_block(c, key));
+ len -= snprintf(p, len, "(%lu, %s, %u)",
+ (unsigned long)key_inum(c, key),
+ get_key_type(type), key_block(c, key));
break;
case UBIFS_TRUN_KEY:
- sprintf(p, "(%lu, %s)",
- (unsigned long)key_inum(c, key),
- get_key_type(type));
+ len -= snprintf(p, len, "(%lu, %s)",
+ (unsigned long)key_inum(c, key),
+ get_key_type(type));
break;
default:
- sprintf(p, "(bad key type: %#08x, %#08x)",
- key->u32[0], key->u32[1]);
+ len -= snprintf(p, len, "(bad key type: %#08x, %#08x)",
+ key->u32[0], key->u32[1]);
}
} else
- sprintf(p, "bad key format %d", c->key_fmt);
-}
-
-const char *dbg_key_str0(const struct ubifs_info *c, const union ubifs_key *key)
-{
- /* dbg_lock must be held */
- sprintf_key(c, key, dbg_key_buf0);
- return dbg_key_buf0;
-}
-
-const char *dbg_key_str1(const struct ubifs_info *c, const union ubifs_key *key)
-{
- /* dbg_lock must be held */
- sprintf_key(c, key, dbg_key_buf1);
- return dbg_key_buf1;
+ len -= snprintf(p, len, "bad key format %d", c->key_fmt);
+ ubifs_assert(len > 0);
+ return p;
}
const char *dbg_ntype(int type)
@@ -235,15 +221,15 @@ const char *dbg_jhead(int jhead)
static void dump_ch(const struct ubifs_ch *ch)
{
- printk(KERN_DEBUG "\tmagic %#x\n", le32_to_cpu(ch->magic));
- printk(KERN_DEBUG "\tcrc %#x\n", le32_to_cpu(ch->crc));
- printk(KERN_DEBUG "\tnode_type %d (%s)\n", ch->node_type,
+ printk(KERN_ERR "\tmagic %#x\n", le32_to_cpu(ch->magic));
+ printk(KERN_ERR "\tcrc %#x\n", le32_to_cpu(ch->crc));
+ printk(KERN_ERR "\tnode_type %d (%s)\n", ch->node_type,
dbg_ntype(ch->node_type));
- printk(KERN_DEBUG "\tgroup_type %d (%s)\n", ch->group_type,
+ printk(KERN_ERR "\tgroup_type %d (%s)\n", ch->group_type,
dbg_gtype(ch->group_type));
- printk(KERN_DEBUG "\tsqnum %llu\n",
+ printk(KERN_ERR "\tsqnum %llu\n",
(unsigned long long)le64_to_cpu(ch->sqnum));
- printk(KERN_DEBUG "\tlen %u\n", le32_to_cpu(ch->len));
+ printk(KERN_ERR "\tlen %u\n", le32_to_cpu(ch->len));
}
void dbg_dump_inode(struct ubifs_info *c, const struct inode *inode)
@@ -254,43 +240,43 @@ void dbg_dump_inode(struct ubifs_info *c, const struct inode *inode)
struct ubifs_dent_node *dent, *pdent = NULL;
int count = 2;
- printk(KERN_DEBUG "Dump in-memory inode:");
- printk(KERN_DEBUG "\tinode %lu\n", inode->i_ino);
- printk(KERN_DEBUG "\tsize %llu\n",
+ printk(KERN_ERR "Dump in-memory inode:");
+ printk(KERN_ERR "\tinode %lu\n", inode->i_ino);
+ printk(KERN_ERR "\tsize %llu\n",
(unsigned long long)i_size_read(inode));
- printk(KERN_DEBUG "\tnlink %u\n", inode->i_nlink);
- printk(KERN_DEBUG "\tuid %u\n", (unsigned int)inode->i_uid);
- printk(KERN_DEBUG "\tgid %u\n", (unsigned int)inode->i_gid);
- printk(KERN_DEBUG "\tatime %u.%u\n",
+ printk(KERN_ERR "\tnlink %u\n", inode->i_nlink);
+ printk(KERN_ERR "\tuid %u\n", (unsigned int)inode->i_uid);
+ printk(KERN_ERR "\tgid %u\n", (unsigned int)inode->i_gid);
+ printk(KERN_ERR "\tatime %u.%u\n",
(unsigned int)inode->i_atime.tv_sec,
(unsigned int)inode->i_atime.tv_nsec);
- printk(KERN_DEBUG "\tmtime %u.%u\n",
+ printk(KERN_ERR "\tmtime %u.%u\n",
(unsigned int)inode->i_mtime.tv_sec,
(unsigned int)inode->i_mtime.tv_nsec);
- printk(KERN_DEBUG "\tctime %u.%u\n",
+ printk(KERN_ERR "\tctime %u.%u\n",
(unsigned int)inode->i_ctime.tv_sec,
(unsigned int)inode->i_ctime.tv_nsec);
- printk(KERN_DEBUG "\tcreat_sqnum %llu\n", ui->creat_sqnum);
- printk(KERN_DEBUG "\txattr_size %u\n", ui->xattr_size);
- printk(KERN_DEBUG "\txattr_cnt %u\n", ui->xattr_cnt);
- printk(KERN_DEBUG "\txattr_names %u\n", ui->xattr_names);
- printk(KERN_DEBUG "\tdirty %u\n", ui->dirty);
- printk(KERN_DEBUG "\txattr %u\n", ui->xattr);
- printk(KERN_DEBUG "\tbulk_read %u\n", ui->xattr);
- printk(KERN_DEBUG "\tsynced_i_size %llu\n",
+ printk(KERN_ERR "\tcreat_sqnum %llu\n", ui->creat_sqnum);
+ printk(KERN_ERR "\txattr_size %u\n", ui->xattr_size);
+ printk(KERN_ERR "\txattr_cnt %u\n", ui->xattr_cnt);
+ printk(KERN_ERR "\txattr_names %u\n", ui->xattr_names);
+ printk(KERN_ERR "\tdirty %u\n", ui->dirty);
+ printk(KERN_ERR "\txattr %u\n", ui->xattr);
+ printk(KERN_ERR "\tbulk_read %u\n", ui->xattr);
+ printk(KERN_ERR "\tsynced_i_size %llu\n",
(unsigned long long)ui->synced_i_size);
- printk(KERN_DEBUG "\tui_size %llu\n",
+ printk(KERN_ERR "\tui_size %llu\n",
(unsigned long long)ui->ui_size);
- printk(KERN_DEBUG "\tflags %d\n", ui->flags);
- printk(KERN_DEBUG "\tcompr_type %d\n", ui->compr_type);
- printk(KERN_DEBUG "\tlast_page_read %lu\n", ui->last_page_read);
- printk(KERN_DEBUG "\tread_in_a_row %lu\n", ui->read_in_a_row);
- printk(KERN_DEBUG "\tdata_len %d\n", ui->data_len);
+ printk(KERN_ERR "\tflags %d\n", ui->flags);
+ printk(KERN_ERR "\tcompr_type %d\n", ui->compr_type);
+ printk(KERN_ERR "\tlast_page_read %lu\n", ui->last_page_read);
+ printk(KERN_ERR "\tread_in_a_row %lu\n", ui->read_in_a_row);
+ printk(KERN_ERR "\tdata_len %d\n", ui->data_len);
if (!S_ISDIR(inode->i_mode))
return;
- printk(KERN_DEBUG "List of directory entries:\n");
+ printk(KERN_ERR "List of directory entries:\n");
ubifs_assert(!mutex_is_locked(&c->tnc_mutex));
lowest_dent_key(c, &key, inode->i_ino);
@@ -298,11 +284,11 @@ void dbg_dump_inode(struct ubifs_info *c, const struct inode *inode)
dent = ubifs_tnc_next_ent(c, &key, &nm);
if (IS_ERR(dent)) {
if (PTR_ERR(dent) != -ENOENT)
- printk(KERN_DEBUG "error %ld\n", PTR_ERR(dent));
+ printk(KERN_ERR "error %ld\n", PTR_ERR(dent));
break;
}
- printk(KERN_DEBUG "\t%d: %s (%s)\n",
+ printk(KERN_ERR "\t%d: %s (%s)\n",
count++, dent->name, get_dent_type(dent->type));
nm.name = dent->name;
@@ -319,14 +305,15 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
int i, n;
union ubifs_key key;
const struct ubifs_ch *ch = node;
+ char key_buf[DBG_KEY_BUF_LEN];
if (dbg_is_tst_rcvry(c))
return;
/* If the magic is incorrect, just hexdump the first bytes */
if (le32_to_cpu(ch->magic) != UBIFS_NODE_MAGIC) {
- printk(KERN_DEBUG "Not a node, first %zu bytes:", UBIFS_CH_SZ);
- print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_OFFSET, 32, 1,
+ printk(KERN_ERR "Not a node, first %zu bytes:", UBIFS_CH_SZ);
+ print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 32, 1,
(void *)node, UBIFS_CH_SZ, 1);
return;
}
@@ -339,7 +326,7 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
{
const struct ubifs_pad_node *pad = node;
- printk(KERN_DEBUG "\tpad_len %u\n",
+ printk(KERN_ERR "\tpad_len %u\n",
le32_to_cpu(pad->pad_len));
break;
}
@@ -348,50 +335,50 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
const struct ubifs_sb_node *sup = node;
unsigned int sup_flags = le32_to_cpu(sup->flags);
- printk(KERN_DEBUG "\tkey_hash %d (%s)\n",
+ printk(KERN_ERR "\tkey_hash %d (%s)\n",
(int)sup->key_hash, get_key_hash(sup->key_hash));
- printk(KERN_DEBUG "\tkey_fmt %d (%s)\n",
+ printk(KERN_ERR "\tkey_fmt %d (%s)\n",
(int)sup->key_fmt, get_key_fmt(sup->key_fmt));
- printk(KERN_DEBUG "\tflags %#x\n", sup_flags);
- printk(KERN_DEBUG "\t big_lpt %u\n",
+ printk(KERN_ERR "\tflags %#x\n", sup_flags);
+ printk(KERN_ERR "\t big_lpt %u\n",
!!(sup_flags & UBIFS_FLG_BIGLPT));
- printk(KERN_DEBUG "\t space_fixup %u\n",
+ printk(KERN_ERR "\t space_fixup %u\n",
!!(sup_flags & UBIFS_FLG_SPACE_FIXUP));
- printk(KERN_DEBUG "\tmin_io_size %u\n",
+ printk(KERN_ERR "\tmin_io_size %u\n",
le32_to_cpu(sup->min_io_size));
- printk(KERN_DEBUG "\tleb_size %u\n",
+ printk(KERN_ERR "\tleb_size %u\n",
le32_to_cpu(sup->leb_size));
- printk(KERN_DEBUG "\tleb_cnt %u\n",
+ printk(KERN_ERR "\tleb_cnt %u\n",
le32_to_cpu(sup->leb_cnt));
- printk(KERN_DEBUG "\tmax_leb_cnt %u\n",
+ printk(KERN_ERR "\tmax_leb_cnt %u\n",
le32_to_cpu(sup->max_leb_cnt));
- printk(KERN_DEBUG "\tmax_bud_bytes %llu\n",
+ printk(KERN_ERR "\tmax_bud_bytes %llu\n",
(unsigned long long)le64_to_cpu(sup->max_bud_bytes));
- printk(KERN_DEBUG "\tlog_lebs %u\n",
+ printk(KERN_ERR "\tlog_lebs %u\n",
le32_to_cpu(sup->log_lebs));
- printk(KERN_DEBUG "\tlpt_lebs %u\n",
+ printk(KERN_ERR "\tlpt_lebs %u\n",
le32_to_cpu(sup->lpt_lebs));
- printk(KERN_DEBUG "\torph_lebs %u\n",
+ printk(KERN_ERR "\torph_lebs %u\n",
le32_to_cpu(sup->orph_lebs));
- printk(KERN_DEBUG "\tjhead_cnt %u\n",
+ printk(KERN_ERR "\tjhead_cnt %u\n",
le32_to_cpu(sup->jhead_cnt));
- printk(KERN_DEBUG "\tfanout %u\n",
+ printk(KERN_ERR "\tfanout %u\n",
le32_to_cpu(sup->fanout));
- printk(KERN_DEBUG "\tlsave_cnt %u\n",
+ printk(KERN_ERR "\tlsave_cnt %u\n",
le32_to_cpu(sup->lsave_cnt));
- printk(KERN_DEBUG "\tdefault_compr %u\n",
+ printk(KERN_ERR "\tdefault_compr %u\n",
(int)le16_to_cpu(sup->default_compr));
- printk(KERN_DEBUG "\trp_size %llu\n",
+ printk(KERN_ERR "\trp_size %llu\n",
(unsigned long long)le64_to_cpu(sup->rp_size));
- printk(KERN_DEBUG "\trp_uid %u\n",
+ printk(KERN_ERR "\trp_uid %u\n",
le32_to_cpu(sup->rp_uid));
- printk(KERN_DEBUG "\trp_gid %u\n",
+ printk(KERN_ERR "\trp_gid %u\n",
le32_to_cpu(sup->rp_gid));
- printk(KERN_DEBUG "\tfmt_version %u\n",
+ printk(KERN_ERR "\tfmt_version %u\n",
le32_to_cpu(sup->fmt_version));
- printk(KERN_DEBUG "\ttime_gran %u\n",
+ printk(KERN_ERR "\ttime_gran %u\n",
le32_to_cpu(sup->time_gran));
- printk(KERN_DEBUG "\tUUID %pUB\n",
+ printk(KERN_ERR "\tUUID %pUB\n",
sup->uuid);
break;
}
@@ -399,61 +386,61 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
{
const struct ubifs_mst_node *mst = node;
- printk(KERN_DEBUG "\thighest_inum %llu\n",
+ printk(KERN_ERR "\thighest_inum %llu\n",
(unsigned long long)le64_to_cpu(mst->highest_inum));
- printk(KERN_DEBUG "\tcommit number %llu\n",
+ printk(KERN_ERR "\tcommit number %llu\n",
(unsigned long long)le64_to_cpu(mst->cmt_no));
- printk(KERN_DEBUG "\tflags %#x\n",
+ printk(KERN_ERR "\tflags %#x\n",
le32_to_cpu(mst->flags));
- printk(KERN_DEBUG "\tlog_lnum %u\n",
+ printk(KERN_ERR "\tlog_lnum %u\n",
le32_to_cpu(mst->log_lnum));
- printk(KERN_DEBUG "\troot_lnum %u\n",
+ printk(KERN_ERR "\troot_lnum %u\n",
le32_to_cpu(mst->root_lnum));
- printk(KERN_DEBUG "\troot_offs %u\n",
+ printk(KERN_ERR "\troot_offs %u\n",
le32_to_cpu(mst->root_offs));
- printk(KERN_DEBUG "\troot_len %u\n",
+ printk(KERN_ERR "\troot_len %u\n",
le32_to_cpu(mst->root_len));
- printk(KERN_DEBUG "\tgc_lnum %u\n",
+ printk(KERN_ERR "\tgc_lnum %u\n",
le32_to_cpu(mst->gc_lnum));
- printk(KERN_DEBUG "\tihead_lnum %u\n",
+ printk(KERN_ERR "\tihead_lnum %u\n",
le32_to_cpu(mst->ihead_lnum));
- printk(KERN_DEBUG "\tihead_offs %u\n",
+ printk(KERN_ERR "\tihead_offs %u\n",
le32_to_cpu(mst->ihead_offs));
- printk(KERN_DEBUG "\tindex_size %llu\n",
+ printk(KERN_ERR "\tindex_size %llu\n",
(unsigned long long)le64_to_cpu(mst->index_size));
- printk(KERN_DEBUG "\tlpt_lnum %u\n",
+ printk(KERN_ERR "\tlpt_lnum %u\n",
le32_to_cpu(mst->lpt_lnum));
- printk(KERN_DEBUG "\tlpt_offs %u\n",
+ printk(KERN_ERR "\tlpt_offs %u\n",
le32_to_cpu(mst->lpt_offs));
- printk(KERN_DEBUG "\tnhead_lnum %u\n",
+ printk(KERN_ERR "\tnhead_lnum %u\n",
le32_to_cpu(mst->nhead_lnum));
- printk(KERN_DEBUG "\tnhead_offs %u\n",
+ printk(KERN_ERR "\tnhead_offs %u\n",
le32_to_cpu(mst->nhead_offs));
- printk(KERN_DEBUG "\tltab_lnum %u\n",
+ printk(KERN_ERR "\tltab_lnum %u\n",
le32_to_cpu(mst->ltab_lnum));
- printk(KERN_DEBUG "\tltab_offs %u\n",
+ printk(KERN_ERR "\tltab_offs %u\n",
le32_to_cpu(mst->ltab_offs));
- printk(KERN_DEBUG "\tlsave_lnum %u\n",
+ printk(KERN_ERR "\tlsave_lnum %u\n",
le32_to_cpu(mst->lsave_lnum));
- printk(KERN_DEBUG "\tlsave_offs %u\n",
+ printk(KERN_ERR "\tlsave_offs %u\n",
le32_to_cpu(mst->lsave_offs));
- printk(KERN_DEBUG "\tlscan_lnum %u\n",
+ printk(KERN_ERR "\tlscan_lnum %u\n",
le32_to_cpu(mst->lscan_lnum));
- printk(KERN_DEBUG "\tleb_cnt %u\n",
+ printk(KERN_ERR "\tleb_cnt %u\n",
le32_to_cpu(mst->leb_cnt));
- printk(KERN_DEBUG "\tempty_lebs %u\n",
+ printk(KERN_ERR "\tempty_lebs %u\n",
le32_to_cpu(mst->empty_lebs));
- printk(KERN_DEBUG "\tidx_lebs %u\n",
+ printk(KERN_ERR "\tidx_lebs %u\n",
le32_to_cpu(mst->idx_lebs));
- printk(KERN_DEBUG "\ttotal_free %llu\n",
+ printk(KERN_ERR "\ttotal_free %llu\n",
(unsigned long long)le64_to_cpu(mst->total_free));
- printk(KERN_DEBUG "\ttotal_dirty %llu\n",
+ printk(KERN_ERR "\ttotal_dirty %llu\n",
(unsigned long long)le64_to_cpu(mst->total_dirty));
- printk(KERN_DEBUG "\ttotal_used %llu\n",
+ printk(KERN_ERR "\ttotal_used %llu\n",
(unsigned long long)le64_to_cpu(mst->total_used));
- printk(KERN_DEBUG "\ttotal_dead %llu\n",
+ printk(KERN_ERR "\ttotal_dead %llu\n",
(unsigned long long)le64_to_cpu(mst->total_dead));
- printk(KERN_DEBUG "\ttotal_dark %llu\n",
+ printk(KERN_ERR "\ttotal_dark %llu\n",
(unsigned long long)le64_to_cpu(mst->total_dark));
break;
}
@@ -461,11 +448,11 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
{
const struct ubifs_ref_node *ref = node;
- printk(KERN_DEBUG "\tlnum %u\n",
+ printk(KERN_ERR "\tlnum %u\n",
le32_to_cpu(ref->lnum));
- printk(KERN_DEBUG "\toffs %u\n",
+ printk(KERN_ERR "\toffs %u\n",
le32_to_cpu(ref->offs));
- printk(KERN_DEBUG "\tjhead %u\n",
+ printk(KERN_ERR "\tjhead %u\n",
le32_to_cpu(ref->jhead));
break;
}
@@ -474,39 +461,40 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
const struct ubifs_ino_node *ino = node;
key_read(c, &ino->key, &key);
- printk(KERN_DEBUG "\tkey %s\n", DBGKEY(&key));
- printk(KERN_DEBUG "\tcreat_sqnum %llu\n",
+ printk(KERN_ERR "\tkey %s\n",
+ dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN));
+ printk(KERN_ERR "\tcreat_sqnum %llu\n",
(unsigned long long)le64_to_cpu(ino->creat_sqnum));
- printk(KERN_DEBUG "\tsize %llu\n",
+ printk(KERN_ERR "\tsize %llu\n",
(unsigned long long)le64_to_cpu(ino->size));
- printk(KERN_DEBUG "\tnlink %u\n",
+ printk(KERN_ERR "\tnlink %u\n",
le32_to_cpu(ino->nlink));
- printk(KERN_DEBUG "\tatime %lld.%u\n",
+ printk(KERN_ERR "\tatime %lld.%u\n",
(long long)le64_to_cpu(ino->atime_sec),
le32_to_cpu(ino->atime_nsec));
- printk(KERN_DEBUG "\tmtime %lld.%u\n",
+ printk(KERN_ERR "\tmtime %lld.%u\n",
(long long)le64_to_cpu(ino->mtime_sec),
le32_to_cpu(ino->mtime_nsec));
- printk(KERN_DEBUG "\tctime %lld.%u\n",
+ printk(KERN_ERR "\tctime %lld.%u\n",
(long long)le64_to_cpu(ino->ctime_sec),
le32_to_cpu(ino->ctime_nsec));
- printk(KERN_DEBUG "\tuid %u\n",
+ printk(KERN_ERR "\tuid %u\n",
le32_to_cpu(ino->uid));
- printk(KERN_DEBUG "\tgid %u\n",
+ printk(KERN_ERR "\tgid %u\n",
le32_to_cpu(ino->gid));
- printk(KERN_DEBUG "\tmode %u\n",
+ printk(KERN_ERR "\tmode %u\n",
le32_to_cpu(ino->mode));
- printk(KERN_DEBUG "\tflags %#x\n",
+ printk(KERN_ERR "\tflags %#x\n",
le32_to_cpu(ino->flags));
- printk(KERN_DEBUG "\txattr_cnt %u\n",
+ printk(KERN_ERR "\txattr_cnt %u\n",
le32_to_cpu(ino->xattr_cnt));
- printk(KERN_DEBUG "\txattr_size %u\n",
+ printk(KERN_ERR "\txattr_size %u\n",
le32_to_cpu(ino->xattr_size));
- printk(KERN_DEBUG "\txattr_names %u\n",
+ printk(KERN_ERR "\txattr_names %u\n",
le32_to_cpu(ino->xattr_names));
- printk(KERN_DEBUG "\tcompr_type %#x\n",
+ printk(KERN_ERR "\tcompr_type %#x\n",
(int)le16_to_cpu(ino->compr_type));
- printk(KERN_DEBUG "\tdata len %u\n",
+ printk(KERN_ERR "\tdata len %u\n",
le32_to_cpu(ino->data_len));
break;
}
@@ -517,15 +505,16 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
int nlen = le16_to_cpu(dent->nlen);
key_read(c, &dent->key, &key);
- printk(KERN_DEBUG "\tkey %s\n", DBGKEY(&key));
- printk(KERN_DEBUG "\tinum %llu\n",
+ printk(KERN_ERR "\tkey %s\n",
+ dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN));
+ printk(KERN_ERR "\tinum %llu\n",
(unsigned long long)le64_to_cpu(dent->inum));
- printk(KERN_DEBUG "\ttype %d\n", (int)dent->type);
- printk(KERN_DEBUG "\tnlen %d\n", nlen);
- printk(KERN_DEBUG "\tname ");
+ printk(KERN_ERR "\ttype %d\n", (int)dent->type);
+ printk(KERN_ERR "\tnlen %d\n", nlen);
+ printk(KERN_ERR "\tname ");
if (nlen > UBIFS_MAX_NLEN)
- printk(KERN_DEBUG "(bad name length, not printing, "
+ printk(KERN_ERR "(bad name length, not printing, "
"bad or corrupted node)");
else {
for (i = 0; i < nlen && dent->name[i]; i++)
@@ -541,15 +530,16 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
int dlen = le32_to_cpu(ch->len) - UBIFS_DATA_NODE_SZ;
key_read(c, &dn->key, &key);
- printk(KERN_DEBUG "\tkey %s\n", DBGKEY(&key));
- printk(KERN_DEBUG "\tsize %u\n",
+ printk(KERN_ERR "\tkey %s\n",
+ dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN));
+ printk(KERN_ERR "\tsize %u\n",
le32_to_cpu(dn->size));
- printk(KERN_DEBUG "\tcompr_typ %d\n",
+ printk(KERN_ERR "\tcompr_typ %d\n",
(int)le16_to_cpu(dn->compr_type));
- printk(KERN_DEBUG "\tdata size %d\n",
+ printk(KERN_ERR "\tdata size %d\n",
dlen);
- printk(KERN_DEBUG "\tdata:\n");
- print_hex_dump(KERN_DEBUG, "\t", DUMP_PREFIX_OFFSET, 32, 1,
+ printk(KERN_ERR "\tdata:\n");
+ print_hex_dump(KERN_ERR, "\t", DUMP_PREFIX_OFFSET, 32, 1,
(void *)&dn->data, dlen, 0);
break;
}
@@ -557,11 +547,11 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
{
const struct ubifs_trun_node *trun = node;
- printk(KERN_DEBUG "\tinum %u\n",
+ printk(KERN_ERR "\tinum %u\n",
le32_to_cpu(trun->inum));
- printk(KERN_DEBUG "\told_size %llu\n",
+ printk(KERN_ERR "\told_size %llu\n",
(unsigned long long)le64_to_cpu(trun->old_size));
- printk(KERN_DEBUG "\tnew_size %llu\n",
+ printk(KERN_ERR "\tnew_size %llu\n",
(unsigned long long)le64_to_cpu(trun->new_size));
break;
}
@@ -570,19 +560,21 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
const struct ubifs_idx_node *idx = node;
n = le16_to_cpu(idx->child_cnt);
- printk(KERN_DEBUG "\tchild_cnt %d\n", n);
- printk(KERN_DEBUG "\tlevel %d\n",
+ printk(KERN_ERR "\tchild_cnt %d\n", n);
+ printk(KERN_ERR "\tlevel %d\n",
(int)le16_to_cpu(idx->level));
- printk(KERN_DEBUG "\tBranches:\n");
+ printk(KERN_ERR "\tBranches:\n");
for (i = 0; i < n && i < c->fanout - 1; i++) {
const struct ubifs_branch *br;
br = ubifs_idx_branch(c, idx, i);
key_read(c, &br->key, &key);
- printk(KERN_DEBUG "\t%d: LEB %d:%d len %d key %s\n",
+ printk(KERN_ERR "\t%d: LEB %d:%d len %d key %s\n",
i, le32_to_cpu(br->lnum), le32_to_cpu(br->offs),
- le32_to_cpu(br->len), DBGKEY(&key));
+ le32_to_cpu(br->len),
+ dbg_snprintf_key(c, &key, key_buf,
+ DBG_KEY_BUF_LEN));
}
break;
}
@@ -592,20 +584,20 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
{
const struct ubifs_orph_node *orph = node;
- printk(KERN_DEBUG "\tcommit number %llu\n",
+ printk(KERN_ERR "\tcommit number %llu\n",
(unsigned long long)
le64_to_cpu(orph->cmt_no) & LLONG_MAX);
- printk(KERN_DEBUG "\tlast node flag %llu\n",
+ printk(KERN_ERR "\tlast node flag %llu\n",
(unsigned long long)(le64_to_cpu(orph->cmt_no)) >> 63);
n = (le32_to_cpu(ch->len) - UBIFS_ORPH_NODE_SZ) >> 3;
- printk(KERN_DEBUG "\t%d orphan inode numbers:\n", n);
+ printk(KERN_ERR "\t%d orphan inode numbers:\n", n);
for (i = 0; i < n; i++)
- printk(KERN_DEBUG "\t ino %llu\n",
+ printk(KERN_ERR "\t ino %llu\n",
(unsigned long long)le64_to_cpu(orph->inos[i]));
break;
}
default:
- printk(KERN_DEBUG "node type %d was not recognized\n",
+ printk(KERN_ERR "node type %d was not recognized\n",
(int)ch->node_type);
}
spin_unlock(&dbg_lock);
@@ -614,16 +606,16 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
void dbg_dump_budget_req(const struct ubifs_budget_req *req)
{
spin_lock(&dbg_lock);
- printk(KERN_DEBUG "Budgeting request: new_ino %d, dirtied_ino %d\n",
+ printk(KERN_ERR "Budgeting request: new_ino %d, dirtied_ino %d\n",
req->new_ino, req->dirtied_ino);
- printk(KERN_DEBUG "\tnew_ino_d %d, dirtied_ino_d %d\n",
+ printk(KERN_ERR "\tnew_ino_d %d, dirtied_ino_d %d\n",
req->new_ino_d, req->dirtied_ino_d);
- printk(KERN_DEBUG "\tnew_page %d, dirtied_page %d\n",
+ printk(KERN_ERR "\tnew_page %d, dirtied_page %d\n",
req->new_page, req->dirtied_page);
- printk(KERN_DEBUG "\tnew_dent %d, mod_dent %d\n",
+ printk(KERN_ERR "\tnew_dent %d, mod_dent %d\n",
req->new_dent, req->mod_dent);
- printk(KERN_DEBUG "\tidx_growth %d\n", req->idx_growth);
- printk(KERN_DEBUG "\tdata_growth %d dd_growth %d\n",
+ printk(KERN_ERR "\tidx_growth %d\n", req->idx_growth);
+ printk(KERN_ERR "\tdata_growth %d dd_growth %d\n",
req->data_growth, req->dd_growth);
spin_unlock(&dbg_lock);
}
@@ -631,12 +623,12 @@ void dbg_dump_budget_req(const struct ubifs_budget_req *req)
void dbg_dump_lstats(const struct ubifs_lp_stats *lst)
{
spin_lock(&dbg_lock);
- printk(KERN_DEBUG "(pid %d) Lprops statistics: empty_lebs %d, "
+ printk(KERN_ERR "(pid %d) Lprops statistics: empty_lebs %d, "
"idx_lebs %d\n", current->pid, lst->empty_lebs, lst->idx_lebs);
- printk(KERN_DEBUG "\ttaken_empty_lebs %d, total_free %lld, "
+ printk(KERN_ERR "\ttaken_empty_lebs %d, total_free %lld, "
"total_dirty %lld\n", lst->taken_empty_lebs, lst->total_free,
lst->total_dirty);
- printk(KERN_DEBUG "\ttotal_used %lld, total_dark %lld, "
+ printk(KERN_ERR "\ttotal_used %lld, total_dark %lld, "
"total_dead %lld\n", lst->total_used, lst->total_dark,
lst->total_dead);
spin_unlock(&dbg_lock);
@@ -652,21 +644,21 @@ void dbg_dump_budg(struct ubifs_info *c, const struct ubifs_budg_info *bi)
spin_lock(&c->space_lock);
spin_lock(&dbg_lock);
- printk(KERN_DEBUG "(pid %d) Budgeting info: data budget sum %lld, "
+ printk(KERN_ERR "(pid %d) Budgeting info: data budget sum %lld, "
"total budget sum %lld\n", current->pid,
bi->data_growth + bi->dd_growth,
bi->data_growth + bi->dd_growth + bi->idx_growth);
- printk(KERN_DEBUG "\tbudg_data_growth %lld, budg_dd_growth %lld, "
+ printk(KERN_ERR "\tbudg_data_growth %lld, budg_dd_growth %lld, "
"budg_idx_growth %lld\n", bi->data_growth, bi->dd_growth,
bi->idx_growth);
- printk(KERN_DEBUG "\tmin_idx_lebs %d, old_idx_sz %llu, "
+ printk(KERN_ERR "\tmin_idx_lebs %d, old_idx_sz %llu, "
"uncommitted_idx %lld\n", bi->min_idx_lebs, bi->old_idx_sz,
bi->uncommitted_idx);
- printk(KERN_DEBUG "\tpage_budget %d, inode_budget %d, dent_budget %d\n",
+ printk(KERN_ERR "\tpage_budget %d, inode_budget %d, dent_budget %d\n",
bi->page_budget, bi->inode_budget, bi->dent_budget);
- printk(KERN_DEBUG "\tnospace %u, nospace_rp %u\n",
+ printk(KERN_ERR "\tnospace %u, nospace_rp %u\n",
bi->nospace, bi->nospace_rp);
- printk(KERN_DEBUG "\tdark_wm %d, dead_wm %d, max_idx_node_sz %d\n",
+ printk(KERN_ERR "\tdark_wm %d, dead_wm %d, max_idx_node_sz %d\n",
c->dark_wm, c->dead_wm, c->max_idx_node_sz);
if (bi != &c->bi)
@@ -677,38 +669,38 @@ void dbg_dump_budg(struct ubifs_info *c, const struct ubifs_budg_info *bi)
*/
goto out_unlock;
- printk(KERN_DEBUG "\tfreeable_cnt %d, calc_idx_sz %lld, idx_gc_cnt %d\n",
+ printk(KERN_ERR "\tfreeable_cnt %d, calc_idx_sz %lld, idx_gc_cnt %d\n",
c->freeable_cnt, c->calc_idx_sz, c->idx_gc_cnt);
- printk(KERN_DEBUG "\tdirty_pg_cnt %ld, dirty_zn_cnt %ld, "
+ printk(KERN_ERR "\tdirty_pg_cnt %ld, dirty_zn_cnt %ld, "
"clean_zn_cnt %ld\n", atomic_long_read(&c->dirty_pg_cnt),
atomic_long_read(&c->dirty_zn_cnt),
atomic_long_read(&c->clean_zn_cnt));
- printk(KERN_DEBUG "\tgc_lnum %d, ihead_lnum %d\n",
+ printk(KERN_ERR "\tgc_lnum %d, ihead_lnum %d\n",
c->gc_lnum, c->ihead_lnum);
/* If we are in R/O mode, journal heads do not exist */
if (c->jheads)
for (i = 0; i < c->jhead_cnt; i++)
- printk(KERN_DEBUG "\tjhead %s\t LEB %d\n",
+ printk(KERN_ERR "\tjhead %s\t LEB %d\n",
dbg_jhead(c->jheads[i].wbuf.jhead),
c->jheads[i].wbuf.lnum);
for (rb = rb_first(&c->buds); rb; rb = rb_next(rb)) {
bud = rb_entry(rb, struct ubifs_bud, rb);
- printk(KERN_DEBUG "\tbud LEB %d\n", bud->lnum);
+ printk(KERN_ERR "\tbud LEB %d\n", bud->lnum);
}
list_for_each_entry(bud, &c->old_buds, list)
- printk(KERN_DEBUG "\told bud LEB %d\n", bud->lnum);
+ printk(KERN_ERR "\told bud LEB %d\n", bud->lnum);
list_for_each_entry(idx_gc, &c->idx_gc, list)
- printk(KERN_DEBUG "\tGC'ed idx LEB %d unmap %d\n",
+ printk(KERN_ERR "\tGC'ed idx LEB %d unmap %d\n",
idx_gc->lnum, idx_gc->unmap);
- printk(KERN_DEBUG "\tcommit state %d\n", c->cmt_state);
+ printk(KERN_ERR "\tcommit state %d\n", c->cmt_state);
/* Print budgeting predictions */
available = ubifs_calc_available(c, c->bi.min_idx_lebs);
outstanding = c->bi.data_growth + c->bi.dd_growth;
free = ubifs_get_free_space_nolock(c);
- printk(KERN_DEBUG "Budgeting predictions:\n");
- printk(KERN_DEBUG "\tavailable: %lld, outstanding %lld, free %lld\n",
+ printk(KERN_ERR "Budgeting predictions:\n");
+ printk(KERN_ERR "\tavailable: %lld, outstanding %lld, free %lld\n",
available, outstanding, free);
out_unlock:
spin_unlock(&dbg_lock);
@@ -728,11 +720,11 @@ void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp)
dark = ubifs_calc_dark(c, spc);
if (lp->flags & LPROPS_INDEX)
- printk(KERN_DEBUG "LEB %-7d free %-8d dirty %-8d used %-8d "
+ printk(KERN_ERR "LEB %-7d free %-8d dirty %-8d used %-8d "
"free + dirty %-8d flags %#x (", lp->lnum, lp->free,
lp->dirty, c->leb_size - spc, spc, lp->flags);
else
- printk(KERN_DEBUG "LEB %-7d free %-8d dirty %-8d used %-8d "
+ printk(KERN_ERR "LEB %-7d free %-8d dirty %-8d used %-8d "
"free + dirty %-8d dark %-4d dead %-4d nodes fit %-3d "
"flags %#-4x (", lp->lnum, lp->free, lp->dirty,
c->leb_size - spc, spc, dark, dead,
@@ -815,7 +807,7 @@ void dbg_dump_lprops(struct ubifs_info *c)
struct ubifs_lprops lp;
struct ubifs_lp_stats lst;
- printk(KERN_DEBUG "(pid %d) start dumping LEB properties\n",
+ printk(KERN_ERR "(pid %d) start dumping LEB properties\n",
current->pid);
ubifs_get_lp_stats(c, &lst);
dbg_dump_lstats(&lst);
@@ -827,7 +819,7 @@ void dbg_dump_lprops(struct ubifs_info *c)
dbg_dump_lprop(c, &lp);
}
- printk(KERN_DEBUG "(pid %d) finish dumping LEB properties\n",
+ printk(KERN_ERR "(pid %d) finish dumping LEB properties\n",
current->pid);
}
@@ -836,35 +828,35 @@ void dbg_dump_lpt_info(struct ubifs_info *c)
int i;
spin_lock(&dbg_lock);
- printk(KERN_DEBUG "(pid %d) dumping LPT information\n", current->pid);
- printk(KERN_DEBUG "\tlpt_sz: %lld\n", c->lpt_sz);
- printk(KERN_DEBUG "\tpnode_sz: %d\n", c->pnode_sz);
- printk(KERN_DEBUG "\tnnode_sz: %d\n", c->nnode_sz);
- printk(KERN_DEBUG "\tltab_sz: %d\n", c->ltab_sz);
- printk(KERN_DEBUG "\tlsave_sz: %d\n", c->lsave_sz);
- printk(KERN_DEBUG "\tbig_lpt: %d\n", c->big_lpt);
- printk(KERN_DEBUG "\tlpt_hght: %d\n", c->lpt_hght);
- printk(KERN_DEBUG "\tpnode_cnt: %d\n", c->pnode_cnt);
- printk(KERN_DEBUG "\tnnode_cnt: %d\n", c->nnode_cnt);
- printk(KERN_DEBUG "\tdirty_pn_cnt: %d\n", c->dirty_pn_cnt);
- printk(KERN_DEBUG "\tdirty_nn_cnt: %d\n", c->dirty_nn_cnt);
- printk(KERN_DEBUG "\tlsave_cnt: %d\n", c->lsave_cnt);
- printk(KERN_DEBUG "\tspace_bits: %d\n", c->space_bits);
- printk(KERN_DEBUG "\tlpt_lnum_bits: %d\n", c->lpt_lnum_bits);
- printk(KERN_DEBUG "\tlpt_offs_bits: %d\n", c->lpt_offs_bits);
- printk(KERN_DEBUG "\tlpt_spc_bits: %d\n", c->lpt_spc_bits);
- printk(KERN_DEBUG "\tpcnt_bits: %d\n", c->pcnt_bits);
- printk(KERN_DEBUG "\tlnum_bits: %d\n", c->lnum_bits);
- printk(KERN_DEBUG "\tLPT root is at %d:%d\n", c->lpt_lnum, c->lpt_offs);
- printk(KERN_DEBUG "\tLPT head is at %d:%d\n",
+ printk(KERN_ERR "(pid %d) dumping LPT information\n", current->pid);
+ printk(KERN_ERR "\tlpt_sz: %lld\n", c->lpt_sz);
+ printk(KERN_ERR "\tpnode_sz: %d\n", c->pnode_sz);
+ printk(KERN_ERR "\tnnode_sz: %d\n", c->nnode_sz);
+ printk(KERN_ERR "\tltab_sz: %d\n", c->ltab_sz);
+ printk(KERN_ERR "\tlsave_sz: %d\n", c->lsave_sz);
+ printk(KERN_ERR "\tbig_lpt: %d\n", c->big_lpt);
+ printk(KERN_ERR "\tlpt_hght: %d\n", c->lpt_hght);
+ printk(KERN_ERR "\tpnode_cnt: %d\n", c->pnode_cnt);
+ printk(KERN_ERR "\tnnode_cnt: %d\n", c->nnode_cnt);
+ printk(KERN_ERR "\tdirty_pn_cnt: %d\n", c->dirty_pn_cnt);
+ printk(KERN_ERR "\tdirty_nn_cnt: %d\n", c->dirty_nn_cnt);
+ printk(KERN_ERR "\tlsave_cnt: %d\n", c->lsave_cnt);
+ printk(KERN_ERR "\tspace_bits: %d\n", c->space_bits);
+ printk(KERN_ERR "\tlpt_lnum_bits: %d\n", c->lpt_lnum_bits);
+ printk(KERN_ERR "\tlpt_offs_bits: %d\n", c->lpt_offs_bits);
+ printk(KERN_ERR "\tlpt_spc_bits: %d\n", c->lpt_spc_bits);
+ printk(KERN_ERR "\tpcnt_bits: %d\n", c->pcnt_bits);
+ printk(KERN_ERR "\tlnum_bits: %d\n", c->lnum_bits);
+ printk(KERN_ERR "\tLPT root is at %d:%d\n", c->lpt_lnum, c->lpt_offs);
+ printk(KERN_ERR "\tLPT head is at %d:%d\n",
c->nhead_lnum, c->nhead_offs);
- printk(KERN_DEBUG "\tLPT ltab is at %d:%d\n",
+ printk(KERN_ERR "\tLPT ltab is at %d:%d\n",
c->ltab_lnum, c->ltab_offs);
if (c->big_lpt)
- printk(KERN_DEBUG "\tLPT lsave is at %d:%d\n",
+ printk(KERN_ERR "\tLPT lsave is at %d:%d\n",
c->lsave_lnum, c->lsave_offs);
for (i = 0; i < c->lpt_lebs; i++)
- printk(KERN_DEBUG "\tLPT LEB %d free %d dirty %d tgc %d "
+ printk(KERN_ERR "\tLPT LEB %d free %d dirty %d tgc %d "
"cmt %d\n", i + c->lpt_first, c->ltab[i].free,
c->ltab[i].dirty, c->ltab[i].tgc, c->ltab[i].cmt);
spin_unlock(&dbg_lock);
@@ -875,12 +867,12 @@ void dbg_dump_sleb(const struct ubifs_info *c,
{
struct ubifs_scan_node *snod;
- printk(KERN_DEBUG "(pid %d) start dumping scanned data from LEB %d:%d\n",
+ printk(KERN_ERR "(pid %d) start dumping scanned data from LEB %d:%d\n",
current->pid, sleb->lnum, offs);
list_for_each_entry(snod, &sleb->nodes, list) {
cond_resched();
- printk(KERN_DEBUG "Dumping node at LEB %d:%d len %d\n", sleb->lnum,
+ printk(KERN_ERR "Dumping node at LEB %d:%d len %d\n", sleb->lnum,
snod->offs, snod->len);
dbg_dump_node(c, snod->node);
}
@@ -895,7 +887,7 @@ void dbg_dump_leb(const struct ubifs_info *c, int lnum)
if (dbg_is_tst_rcvry(c))
return;
- printk(KERN_DEBUG "(pid %d) start dumping LEB %d\n",
+ printk(KERN_ERR "(pid %d) start dumping LEB %d\n",
current->pid, lnum);
buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL);
@@ -910,17 +902,17 @@ void dbg_dump_leb(const struct ubifs_info *c, int lnum)
goto out;
}
- printk(KERN_DEBUG "LEB %d has %d nodes ending at %d\n", lnum,
+ printk(KERN_ERR "LEB %d has %d nodes ending at %d\n", lnum,
sleb->nodes_cnt, sleb->endpt);
list_for_each_entry(snod, &sleb->nodes, list) {
cond_resched();
- printk(KERN_DEBUG "Dumping node at LEB %d:%d len %d\n", lnum,
+ printk(KERN_ERR "Dumping node at LEB %d:%d len %d\n", lnum,
snod->offs, snod->len);
dbg_dump_node(c, snod->node);
}
- printk(KERN_DEBUG "(pid %d) finish dumping LEB %d\n",
+ printk(KERN_ERR "(pid %d) finish dumping LEB %d\n",
current->pid, lnum);
ubifs_scan_destroy(sleb);
@@ -934,6 +926,7 @@ void dbg_dump_znode(const struct ubifs_info *c,
{
int n;
const struct ubifs_zbranch *zbr;
+ char key_buf[DBG_KEY_BUF_LEN];
spin_lock(&dbg_lock);
if (znode->parent)
@@ -941,7 +934,7 @@ void dbg_dump_znode(const struct ubifs_info *c,
else
zbr = &c->zroot;
- printk(KERN_DEBUG "znode %p, LEB %d:%d len %d parent %p iip %d level %d"
+ printk(KERN_ERR "znode %p, LEB %d:%d len %d parent %p iip %d level %d"
" child_cnt %d flags %lx\n", znode, zbr->lnum, zbr->offs,
zbr->len, znode->parent, znode->iip, znode->level,
znode->child_cnt, znode->flags);
@@ -951,19 +944,23 @@ void dbg_dump_znode(const struct ubifs_info *c,
return;
}
- printk(KERN_DEBUG "zbranches:\n");
+ printk(KERN_ERR "zbranches:\n");
for (n = 0; n < znode->child_cnt; n++) {
zbr = &znode->zbranch[n];
if (znode->level > 0)
- printk(KERN_DEBUG "\t%d: znode %p LEB %d:%d len %d key "
+ printk(KERN_ERR "\t%d: znode %p LEB %d:%d len %d key "
"%s\n", n, zbr->znode, zbr->lnum,
zbr->offs, zbr->len,
- DBGKEY(&zbr->key));
+ dbg_snprintf_key(c, &zbr->key,
+ key_buf,
+ DBG_KEY_BUF_LEN));
else
- printk(KERN_DEBUG "\t%d: LNC %p LEB %d:%d len %d key "
+ printk(KERN_ERR "\t%d: LNC %p LEB %d:%d len %d key "
"%s\n", n, zbr->znode, zbr->lnum,
zbr->offs, zbr->len,
- DBGKEY(&zbr->key));
+ dbg_snprintf_key(c, &zbr->key,
+ key_buf,
+ DBG_KEY_BUF_LEN));
}
spin_unlock(&dbg_lock);
}
@@ -972,16 +969,16 @@ void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat)
{
int i;
- printk(KERN_DEBUG "(pid %d) start dumping heap cat %d (%d elements)\n",
+ printk(KERN_ERR "(pid %d) start dumping heap cat %d (%d elements)\n",
current->pid, cat, heap->cnt);
for (i = 0; i < heap->cnt; i++) {
struct ubifs_lprops *lprops = heap->arr[i];
- printk(KERN_DEBUG "\t%d. LEB %d hpos %d free %d dirty %d "
+ printk(KERN_ERR "\t%d. LEB %d hpos %d free %d dirty %d "
"flags %d\n", i, lprops->lnum, lprops->hpos,
lprops->free, lprops->dirty, lprops->flags);
}
- printk(KERN_DEBUG "(pid %d) finish dumping heap\n", current->pid);
+ printk(KERN_ERR "(pid %d) finish dumping heap\n", current->pid);
}
void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
@@ -989,15 +986,15 @@ void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
{
int i;
- printk(KERN_DEBUG "(pid %d) dumping pnode:\n", current->pid);
- printk(KERN_DEBUG "\taddress %zx parent %zx cnext %zx\n",
+ printk(KERN_ERR "(pid %d) dumping pnode:\n", current->pid);
+ printk(KERN_ERR "\taddress %zx parent %zx cnext %zx\n",
(size_t)pnode, (size_t)parent, (size_t)pnode->cnext);
- printk(KERN_DEBUG "\tflags %lu iip %d level %d num %d\n",
+ printk(KERN_ERR "\tflags %lu iip %d level %d num %d\n",
pnode->flags, iip, pnode->level, pnode->num);
for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
struct ubifs_lprops *lp = &pnode->lprops[i];
- printk(KERN_DEBUG "\t%d: free %d dirty %d flags %d lnum %d\n",
+ printk(KERN_ERR "\t%d: free %d dirty %d flags %d lnum %d\n",
i, lp->free, lp->dirty, lp->flags, lp->lnum);
}
}
@@ -1007,20 +1004,20 @@ void dbg_dump_tnc(struct ubifs_info *c)
struct ubifs_znode *znode;
int level;
- printk(KERN_DEBUG "\n");
- printk(KERN_DEBUG "(pid %d) start dumping TNC tree\n", current->pid);
+ printk(KERN_ERR "\n");
+ printk(KERN_ERR "(pid %d) start dumping TNC tree\n", current->pid);
znode = ubifs_tnc_levelorder_next(c->zroot.znode, NULL);
level = znode->level;
- printk(KERN_DEBUG "== Level %d ==\n", level);
+ printk(KERN_ERR "== Level %d ==\n", level);
while (znode) {
if (level != znode->level) {
level = znode->level;
- printk(KERN_DEBUG "== Level %d ==\n", level);
+ printk(KERN_ERR "== Level %d ==\n", level);
}
dbg_dump_znode(c, znode);
znode = ubifs_tnc_levelorder_next(c->zroot.znode, znode);
}
- printk(KERN_DEBUG "(pid %d) finish dumping TNC tree\n", current->pid);
+ printk(KERN_ERR "(pid %d) finish dumping TNC tree\n", current->pid);
}
static int dump_znode(struct ubifs_info *c, struct ubifs_znode *znode,
@@ -1260,6 +1257,7 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1,
int err, nlen1, nlen2, cmp;
struct ubifs_dent_node *dent1, *dent2;
union ubifs_key key;
+ char key_buf[DBG_KEY_BUF_LEN];
ubifs_assert(!keys_cmp(c, &zbr1->key, &zbr2->key));
dent1 = kmalloc(UBIFS_MAX_DENT_NODE_SZ, GFP_NOFS);
@@ -1290,9 +1288,11 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1,
key_read(c, &dent1->key, &key);
if (keys_cmp(c, &zbr1->key, &key)) {
dbg_err("1st entry at %d:%d has key %s", zbr1->lnum,
- zbr1->offs, DBGKEY(&key));
+ zbr1->offs, dbg_snprintf_key(c, &key, key_buf,
+ DBG_KEY_BUF_LEN));
dbg_err("but it should have key %s according to tnc",
- DBGKEY(&zbr1->key));
+ dbg_snprintf_key(c, &zbr1->key, key_buf,
+ DBG_KEY_BUF_LEN));
dbg_dump_node(c, dent1);
goto out_free;
}
@@ -1300,9 +1300,11 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1,
key_read(c, &dent2->key, &key);
if (keys_cmp(c, &zbr2->key, &key)) {
dbg_err("2nd entry at %d:%d has key %s", zbr1->lnum,
- zbr1->offs, DBGKEY(&key));
+ zbr1->offs, dbg_snprintf_key(c, &key, key_buf,
+ DBG_KEY_BUF_LEN));
dbg_err("but it should have key %s according to tnc",
- DBGKEY(&zbr2->key));
+ dbg_snprintf_key(c, &zbr2->key, key_buf,
+ DBG_KEY_BUF_LEN));
dbg_dump_node(c, dent2);
goto out_free;
}
@@ -1319,7 +1321,7 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1,
dbg_err("2 xent/dent nodes with the same name");
else
dbg_err("bad order of colliding key %s",
- DBGKEY(&key));
+ dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN));
ubifs_msg("first node at %d:%d\n", zbr1->lnum, zbr1->offs);
dbg_dump_node(c, dent1);
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index 8d9c4681018..9f717655df1 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -164,45 +164,42 @@ struct ubifs_global_debug_info {
#define dbg_dump_stack() dump_stack()
#define dbg_err(fmt, ...) do { \
- spin_lock(&dbg_lock); \
ubifs_err(fmt, ##__VA_ARGS__); \
- spin_unlock(&dbg_lock); \
} while (0)
-const char *dbg_key_str0(const struct ubifs_info *c,
- const union ubifs_key *key);
-const char *dbg_key_str1(const struct ubifs_info *c,
- const union ubifs_key *key);
+#define ubifs_dbg_msg(type, fmt, ...) \
+ pr_debug("UBIFS DBG " type ": " fmt "\n", ##__VA_ARGS__)
-/*
- * DBGKEY macros require @dbg_lock to be held, which it is in the dbg message
- * macros.
- */
-#define DBGKEY(key) dbg_key_str0(c, (key))
-#define DBGKEY1(key) dbg_key_str1(c, (key))
-
-extern spinlock_t dbg_lock;
-
-#define ubifs_dbg_msg(type, fmt, ...) do { \
- spin_lock(&dbg_lock); \
- pr_debug("UBIFS DBG " type ": " fmt "\n", ##__VA_ARGS__); \
- spin_unlock(&dbg_lock); \
+#define DBG_KEY_BUF_LEN 32
+#define ubifs_dbg_msg_key(type, key, fmt, ...) do { \
+ char __tmp_key_buf[DBG_KEY_BUF_LEN]; \
+ pr_debug("UBIFS DBG " type ": " fmt "%s\n", ##__VA_ARGS__, \
+ dbg_snprintf_key(c, key, __tmp_key_buf, DBG_KEY_BUF_LEN)); \
} while (0)
/* Just a debugging messages not related to any specific UBIFS subsystem */
-#define dbg_msg(fmt, ...) ubifs_dbg_msg("msg", fmt, ##__VA_ARGS__)
+#define dbg_msg(fmt, ...) \
+ printk(KERN_DEBUG "UBIFS DBG (pid %d): %s: " fmt "\n", current->pid, \
+ __func__, ##__VA_ARGS__)
+
/* General messages */
#define dbg_gen(fmt, ...) ubifs_dbg_msg("gen", fmt, ##__VA_ARGS__)
/* Additional journal messages */
#define dbg_jnl(fmt, ...) ubifs_dbg_msg("jnl", fmt, ##__VA_ARGS__)
+#define dbg_jnlk(key, fmt, ...) \
+ ubifs_dbg_msg_key("jnl", key, fmt, ##__VA_ARGS__)
/* Additional TNC messages */
#define dbg_tnc(fmt, ...) ubifs_dbg_msg("tnc", fmt, ##__VA_ARGS__)
+#define dbg_tnck(key, fmt, ...) \
+ ubifs_dbg_msg_key("tnc", key, fmt, ##__VA_ARGS__)
/* Additional lprops messages */
#define dbg_lp(fmt, ...) ubifs_dbg_msg("lp", fmt, ##__VA_ARGS__)
/* Additional LEB find messages */
#define dbg_find(fmt, ...) ubifs_dbg_msg("find", fmt, ##__VA_ARGS__)
/* Additional mount messages */
#define dbg_mnt(fmt, ...) ubifs_dbg_msg("mnt", fmt, ##__VA_ARGS__)
+#define dbg_mntk(key, fmt, ...) \
+ ubifs_dbg_msg_key("mnt", key, fmt, ##__VA_ARGS__)
/* Additional I/O messages */
#define dbg_io(fmt, ...) ubifs_dbg_msg("io", fmt, ##__VA_ARGS__)
/* Additional commit messages */
@@ -258,6 +255,8 @@ const char *dbg_cstate(int cmt_state);
const char *dbg_jhead(int jhead);
const char *dbg_get_key_dump(const struct ubifs_info *c,
const union ubifs_key *key);
+const char *dbg_snprintf_key(const struct ubifs_info *c,
+ const union ubifs_key *key, char *buffer, int len);
void dbg_dump_inode(struct ubifs_info *c, const struct inode *inode);
void dbg_dump_node(const struct ubifs_info *c, const void *node);
void dbg_dump_lpt_node(const struct ubifs_info *c, void *node, int lnum,
@@ -345,20 +344,23 @@ void dbg_debugfs_exit_fs(struct ubifs_info *c);
#define dbg_dump_stack()
#define ubifs_assert_cmt_locked(c)
-#define dbg_msg(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_gen(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_jnl(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_tnc(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_lp(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_find(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_mnt(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_io(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_cmt(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_budg(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_log(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_gc(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_scan(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_rcvry(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_msg(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_gen(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_jnl(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_jnlk(key, fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_tnc(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_tnck(key, fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_lp(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_find(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_mnt(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_mntk(key, fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_io(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_cmt(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_budg(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_log(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_gc(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_scan(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_rcvry(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
static inline int ubifs_debugging_init(struct ubifs_info *c) { return 0; }
static inline void ubifs_debugging_exit(struct ubifs_info *c) { return; }
@@ -368,6 +370,10 @@ static inline const char *dbg_jhead(int jhead) { return ""; }
static inline const char *
dbg_get_key_dump(const struct ubifs_info *c,
const union ubifs_key *key) { return ""; }
+static inline const char *
+dbg_snprintf_key(const struct ubifs_info *c,
+ const union ubifs_key *key, char *buffer,
+ int len) { return ""; }
static inline void dbg_dump_inode(struct ubifs_info *c,
const struct inode *inode) { return; }
static inline void dbg_dump_node(const struct ubifs_info *c,
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index d6fe1c79f18..ec9f1870ab7 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -566,6 +566,7 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry)
int sz_change = CALC_DENT_SIZE(dentry->d_name.len);
int err, budgeted = 1;
struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 };
+ unsigned int saved_nlink = inode->i_nlink;
/*
* Budget request settings: deletion direntry, deletion inode (+1 for
@@ -613,7 +614,7 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry)
out_cancel:
dir->i_size += sz_change;
dir_ui->ui_size = dir->i_size;
- inc_nlink(inode);
+ set_nlink(inode, saved_nlink);
unlock_2_inodes(dir, inode);
if (budgeted)
ubifs_release_budget(c, &req);
@@ -704,8 +705,7 @@ out_cancel:
dir->i_size += sz_change;
dir_ui->ui_size = dir->i_size;
inc_nlink(dir);
- inc_nlink(inode);
- inc_nlink(inode);
+ set_nlink(inode, 2);
unlock_2_inodes(dir, inode);
if (budgeted)
ubifs_release_budget(c, &req);
@@ -977,6 +977,7 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct ubifs_budget_req ino_req = { .dirtied_ino = 1,
.dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) };
struct timespec time;
+ unsigned int saved_nlink;
/*
* Budget request settings: deletion direntry, new direntry, removing
@@ -1059,13 +1060,14 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (unlink) {
/*
* Directories cannot have hard-links, so if this is a
- * directory, decrement its @i_nlink twice because an empty
- * directory has @i_nlink 2.
+ * directory, just clear @i_nlink.
*/
+ saved_nlink = new_inode->i_nlink;
if (is_dir)
+ clear_nlink(new_inode);
+ else
drop_nlink(new_inode);
new_inode->i_ctime = time;
- drop_nlink(new_inode);
} else {
new_dir->i_size += new_sz;
ubifs_inode(new_dir)->ui_size = new_dir->i_size;
@@ -1102,9 +1104,7 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
out_cancel:
if (unlink) {
- if (is_dir)
- inc_nlink(new_inode);
- inc_nlink(new_inode);
+ set_nlink(new_inode, saved_nlink);
} else {
new_dir->i_size -= new_sz;
ubifs_inode(new_dir)->ui_size = new_dir->i_size;
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index f9c234bf33d..5c8f6dc1d28 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1042,10 +1042,10 @@ static int ubifs_writepage(struct page *page, struct writeback_control *wbc)
* the page size, the remaining memory is zeroed when mapped, and
* writes to that region are not written out to the file."
*/
- kaddr = kmap_atomic(page, KM_USER0);
+ kaddr = kmap_atomic(page);
memset(kaddr + len, 0, PAGE_CACHE_SIZE - len);
flush_dcache_page(page);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
if (i_size > synced_i_size) {
err = inode->i_sb->s_op->write_inode(inode, NULL);
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index cef0460f4c5..2f438ab2e7a 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -697,9 +697,8 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode,
int dlen = COMPRESSED_DATA_NODE_BUF_SZ, allocated = 1;
struct ubifs_inode *ui = ubifs_inode(inode);
- dbg_jnl("ino %lu, blk %u, len %d, key %s",
- (unsigned long)key_inum(c, key), key_block(c, key), len,
- DBGKEY(key));
+ dbg_jnlk(key, "ino %lu, blk %u, len %d, key ",
+ (unsigned long)key_inum(c, key), key_block(c, key), len);
ubifs_assert(len <= UBIFS_BLOCK_SIZE);
data = kmalloc(dlen, GFP_NOFS | __GFP_NOWARN);
@@ -1177,7 +1176,7 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode,
dn = (void *)trun + UBIFS_TRUN_NODE_SZ;
blk = new_size >> UBIFS_BLOCK_SHIFT;
data_key_init(c, &key, inum, blk);
- dbg_jnl("last block key %s", DBGKEY(&key));
+ dbg_jnlk(&key, "last block key ");
err = ubifs_tnc_lookup(c, &key, dn);
if (err == -ENOENT)
dlen = 0; /* Not found (so it is a hole) */
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index ee4f43f4bb9..2a935b31723 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -679,7 +679,8 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
ret == SCANNED_GARBAGE ||
ret == SCANNED_A_BAD_PAD_NODE ||
ret == SCANNED_A_CORRUPT_NODE) {
- dbg_rcvry("found corruption - %d", ret);
+ dbg_rcvry("found corruption (%d) at %d:%d",
+ ret, lnum, offs);
break;
} else {
dbg_err("unexpected return value %d", ret);
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index ccabaf1164b..b007637f040 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -221,8 +221,8 @@ static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r)
{
int err;
- dbg_mnt("LEB %d:%d len %d deletion %d sqnum %llu %s", r->lnum,
- r->offs, r->len, r->deletion, r->sqnum, DBGKEY(&r->key));
+ dbg_mntk(&r->key, "LEB %d:%d len %d deletion %d sqnum %llu key ",
+ r->lnum, r->offs, r->len, r->deletion, r->sqnum);
/* Set c->replay_sqnum to help deal with dangling branches. */
c->replay_sqnum = r->sqnum;
@@ -361,7 +361,7 @@ static int insert_node(struct ubifs_info *c, int lnum, int offs, int len,
{
struct replay_entry *r;
- dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key));
+ dbg_mntk(key, "add LEB %d:%d, key ", lnum, offs);
if (key_inum(c, key) >= c->highest_inum)
c->highest_inum = key_inum(c, key);
@@ -409,7 +409,7 @@ static int insert_dent(struct ubifs_info *c, int lnum, int offs, int len,
struct replay_entry *r;
char *nbuf;
- dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key));
+ dbg_mntk(key, "add LEB %d:%d, key ", lnum, offs);
if (key_inum(c, key) >= c->highest_inum)
c->highest_inum = key_inum(c, key);
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index 6094c5a5d7a..771f7fb6ce9 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -410,13 +410,23 @@ static int validate_sb(struct ubifs_info *c, struct ubifs_sb_node *sup)
}
if (c->main_lebs < UBIFS_MIN_MAIN_LEBS) {
- err = 7;
+ ubifs_err("too few main LEBs count %d, must be at least %d",
+ c->main_lebs, UBIFS_MIN_MAIN_LEBS);
goto failed;
}
- if (c->max_bud_bytes < (long long)c->leb_size * UBIFS_MIN_BUD_LEBS ||
- c->max_bud_bytes > (long long)c->leb_size * c->main_lebs) {
- err = 8;
+ max_bytes = (long long)c->leb_size * UBIFS_MIN_BUD_LEBS;
+ if (c->max_bud_bytes < max_bytes) {
+ ubifs_err("too small journal (%lld bytes), must be at least "
+ "%lld bytes", c->max_bud_bytes, max_bytes);
+ goto failed;
+ }
+
+ max_bytes = (long long)c->leb_size * c->main_lebs;
+ if (c->max_bud_bytes > max_bytes) {
+ ubifs_err("too large journal size (%lld bytes), only %lld bytes"
+ "available in the main area",
+ c->max_bud_bytes, max_bytes);
goto failed;
}
@@ -450,7 +460,6 @@ static int validate_sb(struct ubifs_info *c, struct ubifs_sb_node *sup)
goto failed;
}
- max_bytes = c->main_lebs * (long long)c->leb_size;
if (c->rp_size < 0 || max_bytes < c->rp_size) {
err = 14;
goto failed;
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 63765d58445..76e4e0566ad 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -2076,15 +2076,13 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
goto out_umount;
}
- sb->s_root = d_alloc_root(root);
+ sb->s_root = d_make_root(root);
if (!sb->s_root)
- goto out_iput;
+ goto out_umount;
mutex_unlock(&c->umount_mutex);
return 0;
-out_iput:
- iput(root);
out_umount:
ubifs_umount(c);
out_unlock:
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index e14ee53159d..16ad84d8402 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -505,7 +505,7 @@ static int fallible_read_node(struct ubifs_info *c, const union ubifs_key *key,
{
int ret;
- dbg_tnc("LEB %d:%d, key %s", zbr->lnum, zbr->offs, DBGKEY(key));
+ dbg_tnck(key, "LEB %d:%d, key ", zbr->lnum, zbr->offs);
ret = try_read_node(c, node, key_type(c, key), zbr->len, zbr->lnum,
zbr->offs);
@@ -519,8 +519,8 @@ static int fallible_read_node(struct ubifs_info *c, const union ubifs_key *key,
ret = 0;
}
if (ret == 0 && c->replaying)
- dbg_mnt("dangling branch LEB %d:%d len %d, key %s",
- zbr->lnum, zbr->offs, zbr->len, DBGKEY(key));
+ dbg_mntk(key, "dangling branch LEB %d:%d len %d, key ",
+ zbr->lnum, zbr->offs, zbr->len);
return ret;
}
@@ -995,9 +995,9 @@ static int fallible_resolve_collision(struct ubifs_info *c,
if (adding || !o_znode)
return 0;
- dbg_mnt("dangling match LEB %d:%d len %d %s",
+ dbg_mntk(key, "dangling match LEB %d:%d len %d key ",
o_znode->zbranch[o_n].lnum, o_znode->zbranch[o_n].offs,
- o_znode->zbranch[o_n].len, DBGKEY(key));
+ o_znode->zbranch[o_n].len);
*zn = o_znode;
*n = o_n;
return 1;
@@ -1179,7 +1179,7 @@ int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key,
struct ubifs_znode *znode;
unsigned long time = get_seconds();
- dbg_tnc("search key %s", DBGKEY(key));
+ dbg_tnck(key, "search key ");
ubifs_assert(key_type(c, key) < UBIFS_INVALID_KEY);
znode = c->zroot.znode;
@@ -1315,7 +1315,7 @@ static int lookup_level0_dirty(struct ubifs_info *c, const union ubifs_key *key,
struct ubifs_znode *znode;
unsigned long time = get_seconds();
- dbg_tnc("search and dirty key %s", DBGKEY(key));
+ dbg_tnck(key, "search and dirty key ");
znode = c->zroot.znode;
if (unlikely(!znode)) {
@@ -1722,8 +1722,8 @@ static int validate_data_node(struct ubifs_info *c, void *buf,
if (!keys_eq(c, &zbr->key, &key1)) {
ubifs_err("bad key in node at LEB %d:%d",
zbr->lnum, zbr->offs);
- dbg_tnc("looked for key %s found node's key %s",
- DBGKEY(&zbr->key), DBGKEY1(&key1));
+ dbg_tnck(&zbr->key, "looked for key ");
+ dbg_tnck(&key1, "found node's key ");
goto out_err;
}
@@ -1776,7 +1776,7 @@ int ubifs_tnc_bulk_read(struct ubifs_info *c, struct bu_info *bu)
ubifs_err("failed to read from LEB %d:%d, error %d",
lnum, offs, err);
dbg_dump_stack();
- dbg_tnc("key %s", DBGKEY(&bu->key));
+ dbg_tnck(&bu->key, "key ");
return err;
}
@@ -1811,7 +1811,7 @@ static int do_lookup_nm(struct ubifs_info *c, const union ubifs_key *key,
int found, n, err;
struct ubifs_znode *znode;
- dbg_tnc("name '%.*s' key %s", nm->len, nm->name, DBGKEY(key));
+ dbg_tnck(key, "name '%.*s' key ", nm->len, nm->name);
mutex_lock(&c->tnc_mutex);
found = ubifs_lookup_level0(c, key, &znode, &n);
if (!found) {
@@ -1985,8 +1985,7 @@ again:
zp = znode->parent;
if (znode->child_cnt < c->fanout) {
ubifs_assert(n != c->fanout);
- dbg_tnc("inserted at %d level %d, key %s", n, znode->level,
- DBGKEY(key));
+ dbg_tnck(key, "inserted at %d level %d, key ", n, znode->level);
insert_zbranch(znode, zbr, n);
@@ -2001,7 +2000,7 @@ again:
* Unfortunately, @znode does not have more empty slots and we have to
* split it.
*/
- dbg_tnc("splitting level %d, key %s", znode->level, DBGKEY(key));
+ dbg_tnck(key, "splitting level %d, key ", znode->level);
if (znode->alt)
/*
@@ -2095,7 +2094,7 @@ do_split:
}
/* Insert new key and branch */
- dbg_tnc("inserting at %d level %d, key %s", n, zn->level, DBGKEY(key));
+ dbg_tnck(key, "inserting at %d level %d, key ", n, zn->level);
insert_zbranch(zi, zbr, n);
@@ -2171,7 +2170,7 @@ int ubifs_tnc_add(struct ubifs_info *c, const union ubifs_key *key, int lnum,
struct ubifs_znode *znode;
mutex_lock(&c->tnc_mutex);
- dbg_tnc("%d:%d, len %d, key %s", lnum, offs, len, DBGKEY(key));
+ dbg_tnck(key, "%d:%d, len %d, key ", lnum, offs, len);
found = lookup_level0_dirty(c, key, &znode, &n);
if (!found) {
struct ubifs_zbranch zbr;
@@ -2220,8 +2219,8 @@ int ubifs_tnc_replace(struct ubifs_info *c, const union ubifs_key *key,
struct ubifs_znode *znode;
mutex_lock(&c->tnc_mutex);
- dbg_tnc("old LEB %d:%d, new LEB %d:%d, len %d, key %s", old_lnum,
- old_offs, lnum, offs, len, DBGKEY(key));
+ dbg_tnck(key, "old LEB %d:%d, new LEB %d:%d, len %d, key ", old_lnum,
+ old_offs, lnum, offs, len);
found = lookup_level0_dirty(c, key, &znode, &n);
if (found < 0) {
err = found;
@@ -2303,8 +2302,8 @@ int ubifs_tnc_add_nm(struct ubifs_info *c, const union ubifs_key *key,
struct ubifs_znode *znode;
mutex_lock(&c->tnc_mutex);
- dbg_tnc("LEB %d:%d, name '%.*s', key %s", lnum, offs, nm->len, nm->name,
- DBGKEY(key));
+ dbg_tnck(key, "LEB %d:%d, name '%.*s', key ",
+ lnum, offs, nm->len, nm->name);
found = lookup_level0_dirty(c, key, &znode, &n);
if (found < 0) {
err = found;
@@ -2397,7 +2396,7 @@ static int tnc_delete(struct ubifs_info *c, struct ubifs_znode *znode, int n)
/* Delete without merge for now */
ubifs_assert(znode->level == 0);
ubifs_assert(n >= 0 && n < c->fanout);
- dbg_tnc("deleting %s", DBGKEY(&znode->zbranch[n].key));
+ dbg_tnck(&znode->zbranch[n].key, "deleting key ");
zbr = &znode->zbranch[n];
lnc_free(zbr);
@@ -2507,7 +2506,7 @@ int ubifs_tnc_remove(struct ubifs_info *c, const union ubifs_key *key)
struct ubifs_znode *znode;
mutex_lock(&c->tnc_mutex);
- dbg_tnc("key %s", DBGKEY(key));
+ dbg_tnck(key, "key ");
found = lookup_level0_dirty(c, key, &znode, &n);
if (found < 0) {
err = found;
@@ -2538,7 +2537,7 @@ int ubifs_tnc_remove_nm(struct ubifs_info *c, const union ubifs_key *key,
struct ubifs_znode *znode;
mutex_lock(&c->tnc_mutex);
- dbg_tnc("%.*s, key %s", nm->len, nm->name, DBGKEY(key));
+ dbg_tnck(key, "%.*s, key ", nm->len, nm->name);
err = lookup_level0_dirty(c, key, &znode, &n);
if (err < 0)
goto out_unlock;
@@ -2653,7 +2652,7 @@ int ubifs_tnc_remove_range(struct ubifs_info *c, union ubifs_key *from_key,
dbg_dump_znode(c, znode);
goto out_unlock;
}
- dbg_tnc("removing %s", DBGKEY(key));
+ dbg_tnck(key, "removing key ");
}
if (k) {
for (i = n + 1 + k; i < znode->child_cnt; i++)
@@ -2773,7 +2772,7 @@ struct ubifs_dent_node *ubifs_tnc_next_ent(struct ubifs_info *c,
struct ubifs_zbranch *zbr;
union ubifs_key *dkey;
- dbg_tnc("%s %s", nm->name ? (char *)nm->name : "(lowest)", DBGKEY(key));
+ dbg_tnck(key, "%s ", nm->name ? (char *)nm->name : "(lowest)");
ubifs_assert(is_hash_key(c, key));
mutex_lock(&c->tnc_mutex);
@@ -3332,9 +3331,9 @@ int dbg_check_inode_size(struct ubifs_info *c, const struct inode *inode,
out_dump:
block = key_block(c, key);
- ubifs_err("inode %lu has size %lld, but there are data at offset %lld "
- "(data key %s)", (unsigned long)inode->i_ino, size,
- ((loff_t)block) << UBIFS_BLOCK_SHIFT, DBGKEY(key));
+ ubifs_err("inode %lu has size %lld, but there are data at offset %lld",
+ (unsigned long)inode->i_ino, size,
+ ((loff_t)block) << UBIFS_BLOCK_SHIFT);
mutex_unlock(&c->tnc_mutex);
dbg_dump_inode(c, inode);
dbg_dump_stack();
diff --git a/fs/ubifs/tnc_misc.c b/fs/ubifs/tnc_misc.c
index b48db999903..dc28fe6ec07 100644
--- a/fs/ubifs/tnc_misc.c
+++ b/fs/ubifs/tnc_misc.c
@@ -328,8 +328,8 @@ static int read_znode(struct ubifs_info *c, int lnum, int offs, int len,
case UBIFS_XENT_KEY:
break;
default:
- dbg_msg("bad key type at slot %d: %s", i,
- DBGKEY(&zbr->key));
+ dbg_msg("bad key type at slot %d: %d",
+ i, key_type(c, &zbr->key));
err = 3;
goto out_dump;
}
@@ -475,7 +475,7 @@ int ubifs_tnc_read_node(struct ubifs_info *c, struct ubifs_zbranch *zbr,
zbr->offs);
if (err) {
- dbg_tnc("key %s", DBGKEY(key));
+ dbg_tnck(key, "key ");
return err;
}
@@ -484,8 +484,8 @@ int ubifs_tnc_read_node(struct ubifs_info *c, struct ubifs_zbranch *zbr,
if (!keys_eq(c, key, &key1)) {
ubifs_err("bad key in node at LEB %d:%d",
zbr->lnum, zbr->offs);
- dbg_tnc("looked for key %s found node's key %s",
- DBGKEY(key), DBGKEY1(&key1));
+ dbg_tnck(key, "looked for key ");
+ dbg_tnck(&key1, "but found node's key ");
dbg_dump_node(c, node);
return -EINVAL;
}
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 12e94774aa8..93d59aceaae 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -84,9 +84,6 @@
#define INUM_WARN_WATERMARK 0xFFF00000
#define INUM_WATERMARK 0xFFFFFF00
-/* Largest key size supported in this implementation */
-#define CUR_MAX_KEY_LEN UBIFS_SK_LEN
-
/* Maximum number of entries in each LPT (LEB category) heap */
#define LPT_HEAP_SZ 256
@@ -277,10 +274,10 @@ struct ubifs_old_idx {
/* The below union makes it easier to deal with keys */
union ubifs_key {
- uint8_t u8[CUR_MAX_KEY_LEN];
- uint32_t u32[CUR_MAX_KEY_LEN/4];
- uint64_t u64[CUR_MAX_KEY_LEN/8];
- __le32 j32[CUR_MAX_KEY_LEN/4];
+ uint8_t u8[UBIFS_SK_LEN];
+ uint32_t u32[UBIFS_SK_LEN/4];
+ uint64_t u64[UBIFS_SK_LEN/8];
+ __le32 j32[UBIFS_SK_LEN/4];
};
/**
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index 987585bb0a1..1ba2baaf436 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -105,7 +105,6 @@ static void udf_add_free_space(struct super_block *sb, u16 partition, u32 cnt)
}
static void udf_bitmap_free_blocks(struct super_block *sb,
- struct inode *inode,
struct udf_bitmap *bitmap,
struct kernel_lb_addr *bloc,
uint32_t offset,
@@ -172,7 +171,6 @@ error_return:
}
static int udf_bitmap_prealloc_blocks(struct super_block *sb,
- struct inode *inode,
struct udf_bitmap *bitmap,
uint16_t partition, uint32_t first_block,
uint32_t block_count)
@@ -223,7 +221,6 @@ out:
}
static int udf_bitmap_new_block(struct super_block *sb,
- struct inode *inode,
struct udf_bitmap *bitmap, uint16_t partition,
uint32_t goal, int *err)
{
@@ -349,7 +346,6 @@ error_return:
}
static void udf_table_free_blocks(struct super_block *sb,
- struct inode *inode,
struct inode *table,
struct kernel_lb_addr *bloc,
uint32_t offset,
@@ -581,7 +577,6 @@ error_return:
}
static int udf_table_prealloc_blocks(struct super_block *sb,
- struct inode *inode,
struct inode *table, uint16_t partition,
uint32_t first_block, uint32_t block_count)
{
@@ -643,7 +638,6 @@ static int udf_table_prealloc_blocks(struct super_block *sb,
}
static int udf_table_new_block(struct super_block *sb,
- struct inode *inode,
struct inode *table, uint16_t partition,
uint32_t goal, int *err)
{
@@ -743,18 +737,23 @@ void udf_free_blocks(struct super_block *sb, struct inode *inode,
struct udf_part_map *map = &UDF_SB(sb)->s_partmaps[partition];
if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP) {
- udf_bitmap_free_blocks(sb, inode, map->s_uspace.s_bitmap,
+ udf_bitmap_free_blocks(sb, map->s_uspace.s_bitmap,
bloc, offset, count);
} else if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE) {
- udf_table_free_blocks(sb, inode, map->s_uspace.s_table,
+ udf_table_free_blocks(sb, map->s_uspace.s_table,
bloc, offset, count);
} else if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) {
- udf_bitmap_free_blocks(sb, inode, map->s_fspace.s_bitmap,
+ udf_bitmap_free_blocks(sb, map->s_fspace.s_bitmap,
bloc, offset, count);
} else if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE) {
- udf_table_free_blocks(sb, inode, map->s_fspace.s_table,
+ udf_table_free_blocks(sb, map->s_fspace.s_table,
bloc, offset, count);
}
+
+ if (inode) {
+ inode_sub_bytes(inode,
+ ((sector_t)count) << sb->s_blocksize_bits);
+ }
}
inline int udf_prealloc_blocks(struct super_block *sb,
@@ -763,29 +762,34 @@ inline int udf_prealloc_blocks(struct super_block *sb,
uint32_t block_count)
{
struct udf_part_map *map = &UDF_SB(sb)->s_partmaps[partition];
+ sector_t allocated;
if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP)
- return udf_bitmap_prealloc_blocks(sb, inode,
- map->s_uspace.s_bitmap,
- partition, first_block,
- block_count);
+ allocated = udf_bitmap_prealloc_blocks(sb,
+ map->s_uspace.s_bitmap,
+ partition, first_block,
+ block_count);
else if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE)
- return udf_table_prealloc_blocks(sb, inode,
- map->s_uspace.s_table,
- partition, first_block,
- block_count);
+ allocated = udf_table_prealloc_blocks(sb,
+ map->s_uspace.s_table,
+ partition, first_block,
+ block_count);
else if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP)
- return udf_bitmap_prealloc_blocks(sb, inode,
- map->s_fspace.s_bitmap,
- partition, first_block,
- block_count);
+ allocated = udf_bitmap_prealloc_blocks(sb,
+ map->s_fspace.s_bitmap,
+ partition, first_block,
+ block_count);
else if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE)
- return udf_table_prealloc_blocks(sb, inode,
- map->s_fspace.s_table,
- partition, first_block,
- block_count);
+ allocated = udf_table_prealloc_blocks(sb,
+ map->s_fspace.s_table,
+ partition, first_block,
+ block_count);
else
return 0;
+
+ if (inode && allocated > 0)
+ inode_add_bytes(inode, allocated << sb->s_blocksize_bits);
+ return allocated;
}
inline int udf_new_block(struct super_block *sb,
@@ -793,25 +797,29 @@ inline int udf_new_block(struct super_block *sb,
uint16_t partition, uint32_t goal, int *err)
{
struct udf_part_map *map = &UDF_SB(sb)->s_partmaps[partition];
+ int block;
if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP)
- return udf_bitmap_new_block(sb, inode,
- map->s_uspace.s_bitmap,
- partition, goal, err);
+ block = udf_bitmap_new_block(sb,
+ map->s_uspace.s_bitmap,
+ partition, goal, err);
else if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE)
- return udf_table_new_block(sb, inode,
- map->s_uspace.s_table,
- partition, goal, err);
- else if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP)
- return udf_bitmap_new_block(sb, inode,
- map->s_fspace.s_bitmap,
+ block = udf_table_new_block(sb,
+ map->s_uspace.s_table,
partition, goal, err);
+ else if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP)
+ block = udf_bitmap_new_block(sb,
+ map->s_fspace.s_bitmap,
+ partition, goal, err);
else if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE)
- return udf_table_new_block(sb, inode,
- map->s_fspace.s_table,
- partition, goal, err);
+ block = udf_table_new_block(sb,
+ map->s_fspace.s_table,
+ partition, goal, err);
else {
*err = -EIO;
return 0;
}
+ if (inode && block)
+ inode_add_bytes(inode, sb->s_blocksize);
+ return block;
}
diff --git a/fs/udf/file.c b/fs/udf/file.c
index dca0c3881e8..7f3f7ba3df6 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -87,10 +87,10 @@ static int udf_adinicb_write_end(struct file *file,
char *kaddr;
struct udf_inode_info *iinfo = UDF_I(inode);
- kaddr = kmap_atomic(page, KM_USER0);
+ kaddr = kmap_atomic(page);
memcpy(iinfo->i_ext.i_data + iinfo->i_lenEAttr + offset,
kaddr + offset, copied);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
return simple_write_end(file, mapping, pos, len, copied, page, fsdata);
}
@@ -201,12 +201,10 @@ out:
static int udf_release_file(struct inode *inode, struct file *filp)
{
if (filp->f_mode & FMODE_WRITE) {
- mutex_lock(&inode->i_mutex);
down_write(&UDF_I(inode)->i_data_sem);
udf_discard_prealloc(inode);
udf_truncate_tail_extent(inode);
up_write(&UDF_I(inode)->i_data_sem);
- mutex_unlock(&inode->i_mutex);
}
return 0;
}
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index 05ab48195be..7e5aae4bf46 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -116,6 +116,7 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode, int *err)
iinfo->i_lenEAttr = 0;
iinfo->i_lenAlloc = 0;
iinfo->i_use = 0;
+ iinfo->i_checkpoint = 1;
if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_AD_IN_ICB))
iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB;
else if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD))
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 7699df7b319..7d752800835 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -1358,6 +1358,7 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
iinfo->i_unique = le64_to_cpu(fe->uniqueID);
iinfo->i_lenEAttr = le32_to_cpu(fe->lengthExtendedAttr);
iinfo->i_lenAlloc = le32_to_cpu(fe->lengthAllocDescs);
+ iinfo->i_checkpoint = le32_to_cpu(fe->checkpoint);
offset = sizeof(struct fileEntry) + iinfo->i_lenEAttr;
} else {
inode->i_blocks = le64_to_cpu(efe->logicalBlocksRecorded) <<
@@ -1379,6 +1380,7 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
iinfo->i_unique = le64_to_cpu(efe->uniqueID);
iinfo->i_lenEAttr = le32_to_cpu(efe->lengthExtendedAttr);
iinfo->i_lenAlloc = le32_to_cpu(efe->lengthAllocDescs);
+ iinfo->i_checkpoint = le32_to_cpu(efe->checkpoint);
offset = sizeof(struct extendedFileEntry) +
iinfo->i_lenEAttr;
}
@@ -1495,6 +1497,7 @@ static int udf_update_inode(struct inode *inode, int do_sync)
struct buffer_head *bh = NULL;
struct fileEntry *fe;
struct extendedFileEntry *efe;
+ uint64_t lb_recorded;
uint32_t udfperms;
uint16_t icbflags;
uint16_t crclen;
@@ -1589,13 +1592,18 @@ static int udf_update_inode(struct inode *inode, int do_sync)
dsea->minorDeviceIdent = cpu_to_le32(iminor(inode));
}
+ if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
+ lb_recorded = 0; /* No extents => no blocks! */
+ else
+ lb_recorded =
+ (inode->i_blocks + (1 << (blocksize_bits - 9)) - 1) >>
+ (blocksize_bits - 9);
+
if (iinfo->i_efe == 0) {
memcpy(bh->b_data + sizeof(struct fileEntry),
iinfo->i_ext.i_data,
inode->i_sb->s_blocksize - sizeof(struct fileEntry));
- fe->logicalBlocksRecorded = cpu_to_le64(
- (inode->i_blocks + (1 << (blocksize_bits - 9)) - 1) >>
- (blocksize_bits - 9));
+ fe->logicalBlocksRecorded = cpu_to_le64(lb_recorded);
udf_time_to_disk_stamp(&fe->accessTime, inode->i_atime);
udf_time_to_disk_stamp(&fe->modificationTime, inode->i_mtime);
@@ -1607,6 +1615,7 @@ static int udf_update_inode(struct inode *inode, int do_sync)
fe->uniqueID = cpu_to_le64(iinfo->i_unique);
fe->lengthExtendedAttr = cpu_to_le32(iinfo->i_lenEAttr);
fe->lengthAllocDescs = cpu_to_le32(iinfo->i_lenAlloc);
+ fe->checkpoint = cpu_to_le32(iinfo->i_checkpoint);
fe->descTag.tagIdent = cpu_to_le16(TAG_IDENT_FE);
crclen = sizeof(struct fileEntry);
} else {
@@ -1615,9 +1624,7 @@ static int udf_update_inode(struct inode *inode, int do_sync)
inode->i_sb->s_blocksize -
sizeof(struct extendedFileEntry));
efe->objectSize = cpu_to_le64(inode->i_size);
- efe->logicalBlocksRecorded = cpu_to_le64(
- (inode->i_blocks + (1 << (blocksize_bits - 9)) - 1) >>
- (blocksize_bits - 9));
+ efe->logicalBlocksRecorded = cpu_to_le64(lb_recorded);
if (iinfo->i_crtime.tv_sec > inode->i_atime.tv_sec ||
(iinfo->i_crtime.tv_sec == inode->i_atime.tv_sec &&
@@ -1646,6 +1653,7 @@ static int udf_update_inode(struct inode *inode, int do_sync)
efe->uniqueID = cpu_to_le64(iinfo->i_unique);
efe->lengthExtendedAttr = cpu_to_le32(iinfo->i_lenEAttr);
efe->lengthAllocDescs = cpu_to_le32(iinfo->i_lenAlloc);
+ efe->checkpoint = cpu_to_le32(iinfo->i_checkpoint);
efe->descTag.tagIdent = cpu_to_le16(TAG_IDENT_EFE);
crclen = sizeof(struct extendedFileEntry);
}
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 08bf46edf9c..38de8f234b9 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -32,8 +32,6 @@
#include <linux/crc-itu-t.h>
#include <linux/exportfs.h>
-enum { UDF_MAX_LINKS = 0xffff };
-
static inline int udf_match(int len1, const unsigned char *name1, int len2,
const unsigned char *name2)
{
@@ -649,10 +647,6 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
struct udf_inode_info *dinfo = UDF_I(dir);
struct udf_inode_info *iinfo;
- err = -EMLINK;
- if (dir->i_nlink >= UDF_MAX_LINKS)
- goto out;
-
err = -EIO;
inode = udf_new_inode(dir, S_IFDIR | mode, &err);
if (!inode)
@@ -1032,9 +1026,6 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir,
struct fileIdentDesc cfi, *fi;
int err;
- if (inode->i_nlink >= UDF_MAX_LINKS)
- return -EMLINK;
-
fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
if (!fi) {
return err;
@@ -1126,10 +1117,6 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
if (udf_get_lb_pblock(old_inode->i_sb, &tloc, 0) !=
old_dir->i_ino)
goto end_rename;
-
- retval = -EMLINK;
- if (!new_inode && new_dir->i_nlink >= UDF_MAX_LINKS)
- goto end_rename;
}
if (!nfi) {
nfi = udf_add_entry(new_dir, new_dentry, &nfibh, &ncfi,
diff --git a/fs/udf/super.c b/fs/udf/super.c
index c09a84daaf5..ac8a348dcb6 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -75,6 +75,8 @@
#define UDF_DEFAULT_BLOCKSIZE 2048
+enum { UDF_MAX_LINKS = 0xffff };
+
/* These are the "meat" - everything else is stuffing */
static int udf_fill_super(struct super_block *, void *, int);
static void udf_put_super(struct super_block *);
@@ -948,11 +950,8 @@ static struct udf_bitmap *udf_sb_alloc_bitmap(struct super_block *sb, u32 index)
else
bitmap = vzalloc(size); /* TODO: get rid of vzalloc */
- if (bitmap == NULL) {
- udf_err(sb, "Unable to allocate space for bitmap and %d buffer_head pointers\n",
- nr_groups);
+ if (bitmap == NULL)
return NULL;
- }
bitmap->s_block_bitmap = (struct buffer_head **)(bitmap + 1);
bitmap->s_nr_groups = nr_groups;
@@ -2035,13 +2034,13 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
}
/* Allocate a dentry for the root inode */
- sb->s_root = d_alloc_root(inode);
+ sb->s_root = d_make_root(inode);
if (!sb->s_root) {
udf_err(sb, "Couldn't allocate root dentry\n");
- iput(inode);
goto error_out;
}
sb->s_maxbytes = MAX_LFS_FILESIZE;
+ sb->s_max_links = UDF_MAX_LINKS;
return 0;
error_out:
diff --git a/fs/udf/udf_i.h b/fs/udf/udf_i.h
index d1bd31ea724..bb8309dcd5c 100644
--- a/fs/udf/udf_i.h
+++ b/fs/udf/udf_i.h
@@ -23,6 +23,7 @@ struct udf_inode_info {
__u64 i_lenExtents;
__u32 i_next_alloc_block;
__u32 i_next_alloc_goal;
+ __u32 i_checkpoint;
unsigned i_alloc_type : 3;
unsigned i_efe : 1; /* extendedFileEntry */
unsigned i_use : 1; /* unallocSpaceEntry */
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 9094e1d917b..7cdd3953d67 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -26,7 +26,6 @@
*/
#include <asm/uaccess.h>
-#include <asm/system.h>
#include <linux/errno.h>
#include <linux/fs.h>
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 38cac199edf..a2281cadefa 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -166,10 +166,6 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir,
int error;
lock_ufs(dir->i_sb);
- if (inode->i_nlink >= UFS_LINK_MAX) {
- unlock_ufs(dir->i_sb);
- return -EMLINK;
- }
inode->i_ctime = CURRENT_TIME_SEC;
inode_inc_link_count(inode);
@@ -183,10 +179,7 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir,
static int ufs_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode)
{
struct inode * inode;
- int err = -EMLINK;
-
- if (dir->i_nlink >= UFS_LINK_MAX)
- goto out;
+ int err;
lock_ufs(dir->i_sb);
inode_inc_link_count(dir);
@@ -305,11 +298,6 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
drop_nlink(new_inode);
inode_dec_link_count(new_inode);
} else {
- if (dir_de) {
- err = -EMLINK;
- if (new_dir->i_nlink >= UFS_LINK_MAX)
- goto out_dir;
- }
err = ufs_add_link(new_dentry, old_inode);
if (err)
goto out_dir;
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 5246ee3e560..ac8e279eccc 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -73,7 +73,6 @@
#include <stdarg.h>
#include <asm/uaccess.h>
-#include <asm/system.h>
#include <linux/errno.h>
#include <linux/fs.h>
@@ -1157,16 +1156,17 @@ magic_found:
"fast symlink size (%u)\n", uspi->s_maxsymlinklen);
uspi->s_maxsymlinklen = maxsymlen;
}
+ sb->s_max_links = UFS_LINK_MAX;
inode = ufs_iget(sb, UFS_ROOTINO);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
goto failed;
}
- sb->s_root = d_alloc_root(inode);
+ sb->s_root = d_make_root(inode);
if (!sb->s_root) {
ret = -ENOMEM;
- goto dalloc_failed;
+ goto failed;
}
ufs_setup_cstotal(sb);
@@ -1180,8 +1180,6 @@ magic_found:
UFSD("EXIT\n");
return 0;
-dalloc_failed:
- iput(inode);
failed:
if (ubh)
ubh_brelse_uspi (uspi);
diff --git a/fs/xattr.c b/fs/xattr.c
index 82f43376c7c..3c8c1cc333c 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -16,11 +16,12 @@
#include <linux/security.h>
#include <linux/evm.h>
#include <linux/syscalls.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/fsnotify.h>
#include <linux/audit.h>
-#include <asm/uaccess.h>
+#include <linux/vmalloc.h>
+#include <asm/uaccess.h>
/*
* Check permissions for extended attribute access. This is a bit complicated
@@ -320,6 +321,7 @@ setxattr(struct dentry *d, const char __user *name, const void __user *value,
{
int error;
void *kvalue = NULL;
+ void *vvalue = NULL; /* If non-NULL, we used vmalloc() */
char kname[XATTR_NAME_MAX + 1];
if (flags & ~(XATTR_CREATE|XATTR_REPLACE))
@@ -334,13 +336,25 @@ setxattr(struct dentry *d, const char __user *name, const void __user *value,
if (size) {
if (size > XATTR_SIZE_MAX)
return -E2BIG;
- kvalue = memdup_user(value, size);
- if (IS_ERR(kvalue))
- return PTR_ERR(kvalue);
+ kvalue = kmalloc(size, GFP_KERNEL | __GFP_NOWARN);
+ if (!kvalue) {
+ vvalue = vmalloc(size);
+ if (!vvalue)
+ return -ENOMEM;
+ kvalue = vvalue;
+ }
+ if (copy_from_user(kvalue, value, size)) {
+ error = -EFAULT;
+ goto out;
+ }
}
error = vfs_setxattr(d, kname, kvalue, size, flags);
- kfree(kvalue);
+out:
+ if (vvalue)
+ vfree(vvalue);
+ else
+ kfree(kvalue);
return error;
}
@@ -492,13 +506,18 @@ listxattr(struct dentry *d, char __user *list, size_t size)
{
ssize_t error;
char *klist = NULL;
+ char *vlist = NULL; /* If non-NULL, we used vmalloc() */
if (size) {
if (size > XATTR_LIST_MAX)
size = XATTR_LIST_MAX;
- klist = kmalloc(size, GFP_KERNEL);
- if (!klist)
- return -ENOMEM;
+ klist = kmalloc(size, __GFP_NOWARN | GFP_KERNEL);
+ if (!klist) {
+ vlist = vmalloc(size);
+ if (!vlist)
+ return -ENOMEM;
+ klist = vlist;
+ }
}
error = vfs_listxattr(d, klist, size);
@@ -510,7 +529,10 @@ listxattr(struct dentry *d, char __user *list, size_t size)
than XATTR_LIST_MAX bytes. Not possible. */
error = -E2BIG;
}
- kfree(klist);
+ if (vlist)
+ vfree(vlist);
+ else
+ kfree(klist);
return error;
}
diff --git a/fs/xattr_acl.c b/fs/xattr_acl.c
index 8d5a506c82e..69d06b07b16 100644
--- a/fs/xattr_acl.c
+++ b/fs/xattr_acl.c
@@ -5,7 +5,7 @@
* Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
*/
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/fs.h>
#include <linux/posix_acl_xattr.h>
#include <linux/gfp.h>
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 427a4e82a58..0a9977983f9 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -96,9 +96,6 @@ xfs-$(CONFIG_XFS_QUOTA) += xfs_dquot.o \
xfs_qm_bhv.o \
xfs_qm.o \
xfs_quotaops.o
-ifeq ($(CONFIG_XFS_QUOTA),y)
-xfs-$(CONFIG_PROC_FS) += xfs_qm_stats.o
-endif
xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o
xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o
xfs-$(CONFIG_PROC_FS) += xfs_stats.o
diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
index 292eff19803..ab7c53fe346 100644
--- a/fs/xfs/kmem.h
+++ b/fs/xfs/kmem.h
@@ -110,10 +110,4 @@ kmem_zone_destroy(kmem_zone_t *zone)
extern void *kmem_zone_alloc(kmem_zone_t *, unsigned int __nocast);
extern void *kmem_zone_zalloc(kmem_zone_t *, unsigned int __nocast);
-static inline int
-kmem_shake_allow(gfp_t gfp_mask)
-{
- return ((gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS));
-}
-
#endif /* __XFS_SUPPORT_KMEM_H__ */
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index ce84ffd0264..0f0df2759b0 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -35,6 +35,7 @@
#include "xfs_error.h"
#include "xfs_trace.h"
+struct workqueue_struct *xfs_alloc_wq;
#define XFS_ABSDIFF(a,b) (((a) <= (b)) ? ((b) - (a)) : ((a) - (b)))
@@ -68,7 +69,7 @@ xfs_alloc_lookup_eq(
* Lookup the first record greater than or equal to [bno, len]
* in the btree given by cur.
*/
-STATIC int /* error */
+int /* error */
xfs_alloc_lookup_ge(
struct xfs_btree_cur *cur, /* btree cursor */
xfs_agblock_t bno, /* starting block of extent */
@@ -2207,7 +2208,7 @@ xfs_alloc_read_agf(
* group or loop over the allocation groups to find the result.
*/
int /* error */
-xfs_alloc_vextent(
+__xfs_alloc_vextent(
xfs_alloc_arg_t *args) /* allocation argument structure */
{
xfs_agblock_t agsize; /* allocation group size */
@@ -2417,6 +2418,37 @@ error0:
return error;
}
+static void
+xfs_alloc_vextent_worker(
+ struct work_struct *work)
+{
+ struct xfs_alloc_arg *args = container_of(work,
+ struct xfs_alloc_arg, work);
+ unsigned long pflags;
+
+ /* we are in a transaction context here */
+ current_set_flags_nested(&pflags, PF_FSTRANS);
+
+ args->result = __xfs_alloc_vextent(args);
+ complete(args->done);
+
+ current_restore_flags_nested(&pflags, PF_FSTRANS);
+}
+
+
+int /* error */
+xfs_alloc_vextent(
+ xfs_alloc_arg_t *args) /* allocation argument structure */
+{
+ DECLARE_COMPLETION_ONSTACK(done);
+
+ args->done = &done;
+ INIT_WORK(&args->work, xfs_alloc_vextent_worker);
+ queue_work(xfs_alloc_wq, &args->work);
+ wait_for_completion(&done);
+ return args->result;
+}
+
/*
* Free an extent.
* Just break up the extent address and hand off to xfs_free_ag_extent
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 2f52b924be7..3a7e7d8f8de 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -25,6 +25,8 @@ struct xfs_perag;
struct xfs_trans;
struct xfs_busy_extent;
+extern struct workqueue_struct *xfs_alloc_wq;
+
/*
* Freespace allocation types. Argument to xfs_alloc_[v]extent.
*/
@@ -119,6 +121,9 @@ typedef struct xfs_alloc_arg {
char isfl; /* set if is freelist blocks - !acctg */
char userdata; /* set if this is user data */
xfs_fsblock_t firstblock; /* io first block allocated */
+ struct completion *done;
+ struct work_struct work;
+ int result;
} xfs_alloc_arg_t;
/*
@@ -243,6 +248,13 @@ xfs_alloc_lookup_le(
xfs_extlen_t len, /* length of extent */
int *stat); /* success/failure */
+int /* error */
+xfs_alloc_lookup_ge(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_agblock_t bno, /* starting block of extent */
+ xfs_extlen_t len, /* length of extent */
+ int *stat); /* success/failure */
+
int /* error */
xfs_alloc_get_rec(
struct xfs_btree_cur *cur, /* btree cursor */
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 574d4ee9b62..0dbb9e70fe2 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -26,6 +26,7 @@
#include "xfs_bmap_btree.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
+#include "xfs_inode_item.h"
#include "xfs_alloc.h"
#include "xfs_error.h"
#include "xfs_rw.h"
@@ -99,24 +100,6 @@ xfs_destroy_ioend(
}
/*
- * If the end of the current ioend is beyond the current EOF,
- * return the new EOF value, otherwise zero.
- */
-STATIC xfs_fsize_t
-xfs_ioend_new_eof(
- xfs_ioend_t *ioend)
-{
- xfs_inode_t *ip = XFS_I(ioend->io_inode);
- xfs_fsize_t isize;
- xfs_fsize_t bsize;
-
- bsize = ioend->io_offset + ioend->io_size;
- isize = MAX(ip->i_size, ip->i_new_size);
- isize = MIN(isize, bsize);
- return isize > ip->i_d.di_size ? isize : 0;
-}
-
-/*
* Fast and loose check if this write could update the on-disk inode size.
*/
static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend)
@@ -125,36 +108,65 @@ static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend)
XFS_I(ioend->io_inode)->i_d.di_size;
}
+STATIC int
+xfs_setfilesize_trans_alloc(
+ struct xfs_ioend *ioend)
+{
+ struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount;
+ struct xfs_trans *tp;
+ int error;
+
+ tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
+
+ error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
+ if (error) {
+ xfs_trans_cancel(tp, 0);
+ return error;
+ }
+
+ ioend->io_append_trans = tp;
+
+ /*
+ * We hand off the transaction to the completion thread now, so
+ * clear the flag here.
+ */
+ current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+ return 0;
+}
+
/*
- * Update on-disk file size now that data has been written to disk. The
- * current in-memory file size is i_size. If a write is beyond eof i_new_size
- * will be the intended file size until i_size is updated. If this write does
- * not extend all the way to the valid file size then restrict this update to
- * the end of the write.
- *
- * This function does not block as blocking on the inode lock in IO completion
- * can lead to IO completion order dependency deadlocks.. If it can't get the
- * inode ilock it will return EAGAIN. Callers must handle this.
+ * Update on-disk file size now that data has been written to disk.
*/
STATIC int
xfs_setfilesize(
- xfs_ioend_t *ioend)
+ struct xfs_ioend *ioend)
{
- xfs_inode_t *ip = XFS_I(ioend->io_inode);
+ struct xfs_inode *ip = XFS_I(ioend->io_inode);
+ struct xfs_trans *tp = ioend->io_append_trans;
xfs_fsize_t isize;
- if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
- return EAGAIN;
+ /*
+ * The transaction was allocated in the I/O submission thread,
+ * thus we need to mark ourselves as beeing in a transaction
+ * manually.
+ */
+ current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
- isize = xfs_ioend_new_eof(ioend);
- if (isize) {
- trace_xfs_setfilesize(ip, ioend->io_offset, ioend->io_size);
- ip->i_d.di_size = isize;
- xfs_mark_inode_dirty(ip);
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ isize = xfs_new_eof(ip, ioend->io_offset + ioend->io_size);
+ if (!isize) {
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_cancel(tp, 0);
+ return 0;
}
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- return 0;
+ trace_xfs_setfilesize(ip, ioend->io_offset, ioend->io_size);
+
+ ip->i_d.di_size = isize;
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ return xfs_trans_commit(tp, 0);
}
/*
@@ -168,10 +180,12 @@ xfs_finish_ioend(
struct xfs_ioend *ioend)
{
if (atomic_dec_and_test(&ioend->io_remaining)) {
+ struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount;
+
if (ioend->io_type == IO_UNWRITTEN)
- queue_work(xfsconvertd_workqueue, &ioend->io_work);
- else if (xfs_ioend_is_append(ioend))
- queue_work(xfsdatad_workqueue, &ioend->io_work);
+ queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
+ else if (ioend->io_append_trans)
+ queue_work(mp->m_data_workqueue, &ioend->io_work);
else
xfs_destroy_ioend(ioend);
}
@@ -200,35 +214,36 @@ xfs_end_io(
* range to normal written extens after the data I/O has finished.
*/
if (ioend->io_type == IO_UNWRITTEN) {
+ /*
+ * For buffered I/O we never preallocate a transaction when
+ * doing the unwritten extent conversion, but for direct I/O
+ * we do not know if we are converting an unwritten extent
+ * or not at the point where we preallocate the transaction.
+ */
+ if (ioend->io_append_trans) {
+ ASSERT(ioend->io_isdirect);
+
+ current_set_flags_nested(
+ &ioend->io_append_trans->t_pflags, PF_FSTRANS);
+ xfs_trans_cancel(ioend->io_append_trans, 0);
+ }
+
error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
ioend->io_size);
if (error) {
ioend->io_error = -error;
goto done;
}
+ } else if (ioend->io_append_trans) {
+ error = xfs_setfilesize(ioend);
+ if (error)
+ ioend->io_error = -error;
+ } else {
+ ASSERT(!xfs_ioend_is_append(ioend));
}
- /*
- * We might have to update the on-disk file size after extending
- * writes.
- */
- error = xfs_setfilesize(ioend);
- ASSERT(!error || error == EAGAIN);
-
done:
- /*
- * If we didn't complete processing of the ioend, requeue it to the
- * tail of the workqueue for another attempt later. Otherwise destroy
- * it.
- */
- if (error == EAGAIN) {
- atomic_inc(&ioend->io_remaining);
- xfs_finish_ioend(ioend);
- /* ensure we don't spin on blocked ioends */
- delay(1);
- } else {
- xfs_destroy_ioend(ioend);
- }
+ xfs_destroy_ioend(ioend);
}
/*
@@ -264,6 +279,7 @@ xfs_alloc_ioend(
*/
atomic_set(&ioend->io_remaining, 1);
ioend->io_isasync = 0;
+ ioend->io_isdirect = 0;
ioend->io_error = 0;
ioend->io_list = NULL;
ioend->io_type = type;
@@ -274,6 +290,7 @@ xfs_alloc_ioend(
ioend->io_size = 0;
ioend->io_iocb = NULL;
ioend->io_result = 0;
+ ioend->io_append_trans = NULL;
INIT_WORK(&ioend->io_work, xfs_end_io);
return ioend;
@@ -384,14 +401,6 @@ xfs_submit_ioend_bio(
atomic_inc(&ioend->io_remaining);
bio->bi_private = ioend;
bio->bi_end_io = xfs_end_bio;
-
- /*
- * If the I/O is beyond EOF we mark the inode dirty immediately
- * but don't update the inode size until I/O completion.
- */
- if (xfs_ioend_new_eof(ioend))
- xfs_mark_inode_dirty(XFS_I(ioend->io_inode));
-
submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, bio);
}
@@ -1038,8 +1047,20 @@ xfs_vm_writepage(
wbc, end_index);
}
- if (iohead)
+ if (iohead) {
+ /*
+ * Reserve log space if we might write beyond the on-disk
+ * inode size.
+ */
+ if (ioend->io_type != IO_UNWRITTEN &&
+ xfs_ioend_is_append(ioend)) {
+ err = xfs_setfilesize_trans_alloc(ioend);
+ if (err)
+ goto error;
+ }
+
xfs_submit_ioend(wbc, iohead);
+ }
return 0;
@@ -1279,6 +1300,15 @@ xfs_end_io_direct_write(
struct xfs_ioend *ioend = iocb->private;
/*
+ * While the generic direct I/O code updates the inode size, it does
+ * so only after the end_io handler is called, which means our
+ * end_io handler thinks the on-disk size is outside the in-core
+ * size. To prevent this just update it a little bit earlier here.
+ */
+ if (offset + size > i_size_read(ioend->io_inode))
+ i_size_write(ioend->io_inode, offset + size);
+
+ /*
* blockdev_direct_IO can return an error even after the I/O
* completion handler was called. Thus we need to protect
* against double-freeing.
@@ -1310,17 +1340,32 @@ xfs_vm_direct_IO(
{
struct inode *inode = iocb->ki_filp->f_mapping->host;
struct block_device *bdev = xfs_find_bdev_for_inode(inode);
+ struct xfs_ioend *ioend = NULL;
ssize_t ret;
if (rw & WRITE) {
- iocb->private = xfs_alloc_ioend(inode, IO_DIRECT);
+ size_t size = iov_length(iov, nr_segs);
+
+ /*
+ * We need to preallocate a transaction for a size update
+ * here. In the case that this write both updates the size
+ * and converts at least on unwritten extent we will cancel
+ * the still clean transaction after the I/O has finished.
+ */
+ iocb->private = ioend = xfs_alloc_ioend(inode, IO_DIRECT);
+ if (offset + size > XFS_I(inode)->i_d.di_size) {
+ ret = xfs_setfilesize_trans_alloc(ioend);
+ if (ret)
+ goto out_destroy_ioend;
+ ioend->io_isdirect = 1;
+ }
ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
offset, nr_segs,
xfs_get_blocks_direct,
xfs_end_io_direct_write, NULL, 0);
if (ret != -EIOCBQUEUED && iocb->private)
- xfs_destroy_ioend(iocb->private);
+ goto out_trans_cancel;
} else {
ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
offset, nr_segs,
@@ -1329,6 +1374,16 @@ xfs_vm_direct_IO(
}
return ret;
+
+out_trans_cancel:
+ if (ioend->io_append_trans) {
+ current_set_flags_nested(&ioend->io_append_trans->t_pflags,
+ PF_FSTRANS);
+ xfs_trans_cancel(ioend->io_append_trans, 0);
+ }
+out_destroy_ioend:
+ xfs_destroy_ioend(ioend);
+ return ret;
}
STATIC void
@@ -1340,12 +1395,11 @@ xfs_vm_write_failed(
if (to > inode->i_size) {
/*
- * punch out the delalloc blocks we have already allocated. We
- * don't call xfs_setattr() to do this as we may be in the
- * middle of a multi-iovec write and so the vfs inode->i_size
- * will not match the xfs ip->i_size and so it will zero too
- * much. Hence we jus truncate the page cache to zero what is
- * necessary and punch the delalloc blocks directly.
+ * Punch out the delalloc blocks we have already allocated.
+ *
+ * Don't bother with xfs_setattr given that nothing can have
+ * made it to disk yet as the page is still locked at this
+ * point.
*/
struct xfs_inode *ip = XFS_I(inode);
xfs_fileoff_t start_fsb;
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index 116dd5c3703..84eafbcb0d9 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -18,8 +18,6 @@
#ifndef __XFS_AOPS_H__
#define __XFS_AOPS_H__
-extern struct workqueue_struct *xfsdatad_workqueue;
-extern struct workqueue_struct *xfsconvertd_workqueue;
extern mempool_t *xfs_ioend_pool;
/*
@@ -48,12 +46,14 @@ typedef struct xfs_ioend {
int io_error; /* I/O error code */
atomic_t io_remaining; /* hold count */
unsigned int io_isasync : 1; /* needs aio_complete */
+ unsigned int io_isdirect : 1;/* direct I/O */
struct inode *io_inode; /* file being written to */
struct buffer_head *io_buffer_head;/* buffer linked list head */
struct buffer_head *io_buffer_tail;/* buffer linked list tail */
size_t io_size; /* size of the extent */
xfs_off_t io_offset; /* offset in the file */
struct work_struct io_work; /* xfsdatad work queue */
+ struct xfs_trans *io_append_trans;/* xact. for size update */
struct kiocb *io_iocb;
int io_result;
} xfs_ioend_t;
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index 1e5d97f86ea..65d61b948ea 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -827,10 +827,6 @@ xfs_attr_inactive(xfs_inode_t *dp)
if (error)
goto out;
- /*
- * Commit the last in the sequence of transactions.
- */
- xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE);
error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES);
xfs_iunlock(dp, XFS_ILOCK_EXCL);
@@ -857,6 +853,8 @@ xfs_attr_shortform_addname(xfs_da_args_t *args)
{
int newsize, forkoff, retval;
+ trace_xfs_attr_sf_addname(args);
+
retval = xfs_attr_shortform_lookup(args);
if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) {
return(retval);
@@ -900,6 +898,8 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
xfs_dabuf_t *bp;
int retval, error, committed, forkoff;
+ trace_xfs_attr_leaf_addname(args);
+
/*
* Read the (only) block in the attribute list in.
*/
@@ -924,6 +924,9 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
xfs_da_brelse(args->trans, bp);
return(retval);
}
+
+ trace_xfs_attr_leaf_replace(args);
+
args->op_flags |= XFS_DA_OP_RENAME; /* an atomic rename */
args->blkno2 = args->blkno; /* set 2nd entry info*/
args->index2 = args->index;
@@ -1094,6 +1097,8 @@ xfs_attr_leaf_removename(xfs_da_args_t *args)
xfs_dabuf_t *bp;
int error, committed, forkoff;
+ trace_xfs_attr_leaf_removename(args);
+
/*
* Remove the attribute.
*/
@@ -1227,6 +1232,8 @@ xfs_attr_node_addname(xfs_da_args_t *args)
xfs_mount_t *mp;
int committed, retval, error;
+ trace_xfs_attr_node_addname(args);
+
/*
* Fill in bucket of arguments/results/context to carry around.
*/
@@ -1253,6 +1260,9 @@ restart:
} else if (retval == EEXIST) {
if (args->flags & ATTR_CREATE)
goto out;
+
+ trace_xfs_attr_node_replace(args);
+
args->op_flags |= XFS_DA_OP_RENAME; /* atomic rename op */
args->blkno2 = args->blkno; /* set 2nd entry info*/
args->index2 = args->index;
@@ -1484,6 +1494,8 @@ xfs_attr_node_removename(xfs_da_args_t *args)
xfs_dabuf_t *bp;
int retval, error, committed, forkoff;
+ trace_xfs_attr_node_removename(args);
+
/*
* Tie a string around our finger to remind us where we are.
*/
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index c1b55e59655..76d93dc953e 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -235,6 +235,8 @@ xfs_attr_shortform_create(xfs_da_args_t *args)
xfs_inode_t *dp;
xfs_ifork_t *ifp;
+ trace_xfs_attr_sf_create(args);
+
dp = args->dp;
ASSERT(dp != NULL);
ifp = dp->i_afp;
@@ -268,13 +270,11 @@ xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff)
xfs_inode_t *dp;
xfs_ifork_t *ifp;
+ trace_xfs_attr_sf_add(args);
+
dp = args->dp;
mp = dp->i_mount;
dp->i_d.di_forkoff = forkoff;
- dp->i_df.if_ext_max =
- XFS_IFORK_DSIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t);
- dp->i_afp->if_ext_max =
- XFS_IFORK_ASIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t);
ifp = dp->i_afp;
ASSERT(ifp->if_flags & XFS_IFINLINE);
@@ -326,7 +326,6 @@ xfs_attr_fork_reset(
ASSERT(ip->i_d.di_anextents == 0);
ASSERT(ip->i_afp == NULL);
- ip->i_df.if_ext_max = XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
}
@@ -342,6 +341,8 @@ xfs_attr_shortform_remove(xfs_da_args_t *args)
xfs_mount_t *mp;
xfs_inode_t *dp;
+ trace_xfs_attr_sf_remove(args);
+
dp = args->dp;
mp = dp->i_mount;
base = sizeof(xfs_attr_sf_hdr_t);
@@ -389,10 +390,6 @@ xfs_attr_shortform_remove(xfs_da_args_t *args)
(args->op_flags & XFS_DA_OP_ADDNAME) ||
!(mp->m_flags & XFS_MOUNT_ATTR2) ||
dp->i_d.di_format == XFS_DINODE_FMT_BTREE);
- dp->i_afp->if_ext_max =
- XFS_IFORK_ASIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t);
- dp->i_df.if_ext_max =
- XFS_IFORK_DSIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t);
xfs_trans_log_inode(args->trans, dp,
XFS_ILOG_CORE | XFS_ILOG_ADATA);
}
@@ -414,6 +411,8 @@ xfs_attr_shortform_lookup(xfs_da_args_t *args)
int i;
xfs_ifork_t *ifp;
+ trace_xfs_attr_sf_lookup(args);
+
ifp = args->dp->i_afp;
ASSERT(ifp->if_flags & XFS_IFINLINE);
sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
@@ -485,6 +484,8 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
xfs_dabuf_t *bp;
xfs_ifork_t *ifp;
+ trace_xfs_attr_sf_to_leaf(args);
+
dp = args->dp;
ifp = dp->i_afp;
sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
@@ -784,6 +785,8 @@ xfs_attr_leaf_to_shortform(xfs_dabuf_t *bp, xfs_da_args_t *args, int forkoff)
char *tmpbuffer;
int error, i;
+ trace_xfs_attr_leaf_to_sf(args);
+
dp = args->dp;
tmpbuffer = kmem_alloc(XFS_LBSIZE(dp->i_mount), KM_SLEEP);
ASSERT(tmpbuffer != NULL);
@@ -857,6 +860,8 @@ xfs_attr_leaf_to_node(xfs_da_args_t *args)
xfs_dablk_t blkno;
int error;
+ trace_xfs_attr_leaf_to_node(args);
+
dp = args->dp;
bp1 = bp2 = NULL;
error = xfs_da_grow_inode(args, &blkno);
@@ -920,6 +925,8 @@ xfs_attr_leaf_create(xfs_da_args_t *args, xfs_dablk_t blkno, xfs_dabuf_t **bpp)
xfs_dabuf_t *bp;
int error;
+ trace_xfs_attr_leaf_create(args);
+
dp = args->dp;
ASSERT(dp != NULL);
error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp,
@@ -957,6 +964,8 @@ xfs_attr_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
xfs_dablk_t blkno;
int error;
+ trace_xfs_attr_leaf_split(state->args);
+
/*
* Allocate space for a new leaf node.
*/
@@ -986,10 +995,13 @@ xfs_attr_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
*
* Insert the "new" entry in the correct block.
*/
- if (state->inleaf)
+ if (state->inleaf) {
+ trace_xfs_attr_leaf_add_old(state->args);
error = xfs_attr_leaf_add(oldblk->bp, state->args);
- else
+ } else {
+ trace_xfs_attr_leaf_add_new(state->args);
error = xfs_attr_leaf_add(newblk->bp, state->args);
+ }
/*
* Update last hashval in each block since we added the name.
@@ -1010,6 +1022,8 @@ xfs_attr_leaf_add(xfs_dabuf_t *bp, xfs_da_args_t *args)
xfs_attr_leaf_map_t *map;
int tablesize, entsize, sum, tmp, i;
+ trace_xfs_attr_leaf_add(args);
+
leaf = bp->data;
ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
ASSERT((args->index >= 0)
@@ -1137,8 +1151,6 @@ xfs_attr_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int mapindex)
(be32_to_cpu(entry->hashval) <= be32_to_cpu((entry+1)->hashval)));
/*
- * Copy the attribute name and value into the new space.
- *
* For "remote" attribute values, simply note that we need to
* allocate space for the "remote" value. We can't actually
* allocate the extents in this transaction, and we can't decide
@@ -1274,6 +1286,8 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
ASSERT(leaf2->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
args = state->args;
+ trace_xfs_attr_leaf_rebalance(args);
+
/*
* Check ordering of blocks, reverse if it makes things simpler.
*
@@ -1819,6 +1833,8 @@ xfs_attr_leaf_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
xfs_mount_t *mp;
char *tmpbuffer;
+ trace_xfs_attr_leaf_unbalance(state->args);
+
/*
* Set up environment.
*/
@@ -1928,6 +1944,8 @@ xfs_attr_leaf_lookup_int(xfs_dabuf_t *bp, xfs_da_args_t *args)
int probe, span;
xfs_dahash_t hashval;
+ trace_xfs_attr_leaf_lookup(args);
+
leaf = bp->data;
ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
ASSERT(be16_to_cpu(leaf->hdr.count)
@@ -2454,6 +2472,7 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args)
char *name;
#endif /* DEBUG */
+ trace_xfs_attr_leaf_clearflag(args);
/*
* Set up the operation.
*/
@@ -2518,6 +2537,8 @@ xfs_attr_leaf_setflag(xfs_da_args_t *args)
xfs_dabuf_t *bp;
int error;
+ trace_xfs_attr_leaf_setflag(args);
+
/*
* Set up the operation.
*/
@@ -2574,6 +2595,8 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args)
char *name1, *name2;
#endif /* DEBUG */
+ trace_xfs_attr_leaf_flipflags(args);
+
/*
* Read the block containing the "old" attr
*/
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index d0ab7883705..85e7e327bcd 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -249,7 +249,27 @@ xfs_bmbt_lookup_ge(
}
/*
-* Update the record referred to by cur to the value given
+ * Check if the inode needs to be converted to btree format.
+ */
+static inline bool xfs_bmap_needs_btree(struct xfs_inode *ip, int whichfork)
+{
+ return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
+ XFS_IFORK_NEXTENTS(ip, whichfork) >
+ XFS_IFORK_MAXEXT(ip, whichfork);
+}
+
+/*
+ * Check if the inode should be converted to extent format.
+ */
+static inline bool xfs_bmap_wants_extents(struct xfs_inode *ip, int whichfork)
+{
+ return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE &&
+ XFS_IFORK_NEXTENTS(ip, whichfork) <=
+ XFS_IFORK_MAXEXT(ip, whichfork);
+}
+
+/*
+ * Update the record referred to by cur to the value given
* by [off, bno, len, state].
* This either works (return 0) or gets an EFSCORRUPTED error.
*/
@@ -683,8 +703,8 @@ xfs_bmap_add_extent_delay_real(
goto done;
XFS_WANT_CORRUPTED_GOTO(i == 1, done);
}
- if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
- bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) {
+
+ if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
bma->firstblock, bma->flist,
&bma->cur, 1, &tmp_rval, XFS_DATA_FORK);
@@ -767,8 +787,8 @@ xfs_bmap_add_extent_delay_real(
goto done;
XFS_WANT_CORRUPTED_GOTO(i == 1, done);
}
- if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
- bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) {
+
+ if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
bma->firstblock, bma->flist, &bma->cur, 1,
&tmp_rval, XFS_DATA_FORK);
@@ -836,8 +856,8 @@ xfs_bmap_add_extent_delay_real(
goto done;
XFS_WANT_CORRUPTED_GOTO(i == 1, done);
}
- if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
- bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) {
+
+ if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
bma->firstblock, bma->flist, &bma->cur,
1, &tmp_rval, XFS_DATA_FORK);
@@ -884,8 +904,7 @@ xfs_bmap_add_extent_delay_real(
}
/* convert to a btree if necessary */
- if (XFS_IFORK_FORMAT(bma->ip, XFS_DATA_FORK) == XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_NEXTENTS(bma->ip, XFS_DATA_FORK) > ifp->if_ext_max) {
+ if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
int tmp_logflags; /* partial log flag return val */
ASSERT(bma->cur == NULL);
@@ -1421,8 +1440,7 @@ xfs_bmap_add_extent_unwritten_real(
}
/* convert to a btree if necessary */
- if (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) == XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > ifp->if_ext_max) {
+ if (xfs_bmap_needs_btree(ip, XFS_DATA_FORK)) {
int tmp_logflags; /* partial log flag return val */
ASSERT(cur == NULL);
@@ -1812,8 +1830,7 @@ xfs_bmap_add_extent_hole_real(
}
/* convert to a btree if necessary */
- if (XFS_IFORK_FORMAT(bma->ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_NEXTENTS(bma->ip, whichfork) > ifp->if_ext_max) {
+ if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
int tmp_logflags; /* partial log flag return val */
ASSERT(bma->cur == NULL);
@@ -3037,8 +3054,7 @@ xfs_bmap_extents_to_btree(
ifp = XFS_IFORK_PTR(ip, whichfork);
ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS);
- ASSERT(ifp->if_ext_max ==
- XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
+
/*
* Make space in the inode incore.
*/
@@ -3184,13 +3200,8 @@ xfs_bmap_forkoff_reset(
ip->i_d.di_format != XFS_DINODE_FMT_BTREE) {
uint dfl_forkoff = xfs_default_attroffset(ip) >> 3;
- if (dfl_forkoff > ip->i_d.di_forkoff) {
+ if (dfl_forkoff > ip->i_d.di_forkoff)
ip->i_d.di_forkoff = dfl_forkoff;
- ip->i_df.if_ext_max =
- XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t);
- ip->i_afp->if_ext_max =
- XFS_IFORK_ASIZE(ip) / sizeof(xfs_bmbt_rec_t);
- }
}
}
@@ -3430,8 +3441,6 @@ xfs_bmap_add_attrfork(
int error; /* error return value */
ASSERT(XFS_IFORK_Q(ip) == 0);
- ASSERT(ip->i_df.if_ext_max ==
- XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
mp = ip->i_mount;
ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
@@ -3486,12 +3495,9 @@ xfs_bmap_add_attrfork(
error = XFS_ERROR(EINVAL);
goto error1;
}
- ip->i_df.if_ext_max =
- XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
+
ASSERT(ip->i_afp == NULL);
ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP);
- ip->i_afp->if_ext_max =
- XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
ip->i_afp->if_flags = XFS_IFEXTENTS;
logflags = 0;
xfs_bmap_init(&flist, &firstblock);
@@ -3535,20 +3541,17 @@ xfs_bmap_add_attrfork(
} else
spin_unlock(&mp->m_sb_lock);
}
- if ((error = xfs_bmap_finish(&tp, &flist, &committed)))
+
+ error = xfs_bmap_finish(&tp, &flist, &committed);
+ if (error)
goto error2;
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
- ASSERT(ip->i_df.if_ext_max ==
- XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
- return error;
+ return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
error2:
xfs_bmap_cancel(&flist);
error1:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
error0:
xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
- ASSERT(ip->i_df.if_ext_max ==
- XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
return error;
}
@@ -3994,11 +3997,8 @@ xfs_bmap_one_block(
xfs_bmbt_irec_t s; /* internal version of extent */
#ifndef DEBUG
- if (whichfork == XFS_DATA_FORK) {
- return S_ISREG(ip->i_d.di_mode) ?
- (ip->i_size == ip->i_mount->m_sb.sb_blocksize) :
- (ip->i_d.di_size == ip->i_mount->m_sb.sb_blocksize);
- }
+ if (whichfork == XFS_DATA_FORK)
+ return XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize;
#endif /* !DEBUG */
if (XFS_IFORK_NEXTENTS(ip, whichfork) != 1)
return 0;
@@ -4010,7 +4010,7 @@ xfs_bmap_one_block(
xfs_bmbt_get_all(ep, &s);
rval = s.br_startoff == 0 && s.br_blockcount == 1;
if (rval && whichfork == XFS_DATA_FORK)
- ASSERT(ip->i_size == ip->i_mount->m_sb.sb_blocksize);
+ ASSERT(XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize);
return rval;
}
@@ -4379,8 +4379,6 @@ xfs_bmapi_read(
XFS_STATS_INC(xs_blk_mapr);
ifp = XFS_IFORK_PTR(ip, whichfork);
- ASSERT(ifp->if_ext_max ==
- XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
if (!(ifp->if_flags & XFS_IFEXTENTS)) {
error = xfs_iread_extents(NULL, ip, whichfork);
@@ -4871,8 +4869,6 @@ xfs_bmapi_write(
return XFS_ERROR(EIO);
ifp = XFS_IFORK_PTR(ip, whichfork);
- ASSERT(ifp->if_ext_max ==
- XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
XFS_STATS_INC(xs_blk_mapw);
@@ -4981,8 +4977,7 @@ xfs_bmapi_write(
/*
* Transform from btree to extents, give it cur.
*/
- if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE &&
- XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) {
+ if (xfs_bmap_wants_extents(ip, whichfork)) {
int tmp_logflags = 0;
ASSERT(bma.cur);
@@ -4992,10 +4987,10 @@ xfs_bmapi_write(
if (error)
goto error0;
}
- ASSERT(ifp->if_ext_max ==
- XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
+
ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE ||
- XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max);
+ XFS_IFORK_NEXTENTS(ip, whichfork) >
+ XFS_IFORK_MAXEXT(ip, whichfork));
error = 0;
error0:
/*
@@ -5095,8 +5090,7 @@ xfs_bunmapi(
ASSERT(len > 0);
ASSERT(nexts >= 0);
- ASSERT(ifp->if_ext_max ==
- XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
+
if (!(ifp->if_flags & XFS_IFEXTENTS) &&
(error = xfs_iread_extents(tp, ip, whichfork)))
return error;
@@ -5130,6 +5124,15 @@ xfs_bunmapi(
cur->bc_private.b.flags = 0;
} else
cur = NULL;
+
+ if (isrt) {
+ /*
+ * Synchronize by locking the bitmap inode.
+ */
+ xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL);
+ }
+
extno = 0;
while (bno != (xfs_fileoff_t)-1 && bno >= start && lastx >= 0 &&
(nexts == 0 || extno < nexts)) {
@@ -5322,7 +5325,8 @@ xfs_bunmapi(
*/
if (!wasdel && xfs_trans_get_block_res(tp) == 0 &&
XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_NEXTENTS(ip, whichfork) >= ifp->if_ext_max &&
+ XFS_IFORK_NEXTENTS(ip, whichfork) >= /* Note the >= */
+ XFS_IFORK_MAXEXT(ip, whichfork) &&
del.br_startoff > got.br_startoff &&
del.br_startoff + del.br_blockcount <
got.br_startoff + got.br_blockcount) {
@@ -5353,13 +5357,11 @@ nodelete:
}
}
*done = bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0;
- ASSERT(ifp->if_ext_max ==
- XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
+
/*
* Convert to a btree if necessary.
*/
- if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max) {
+ if (xfs_bmap_needs_btree(ip, whichfork)) {
ASSERT(cur == NULL);
error = xfs_bmap_extents_to_btree(tp, ip, firstblock, flist,
&cur, 0, &tmp_logflags, whichfork);
@@ -5370,8 +5372,7 @@ nodelete:
/*
* transform from btree to extents, give it cur
*/
- else if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE &&
- XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) {
+ else if (xfs_bmap_wants_extents(ip, whichfork)) {
ASSERT(cur != NULL);
error = xfs_bmap_btree_to_extents(tp, ip, cur, &tmp_logflags,
whichfork);
@@ -5382,8 +5383,6 @@ nodelete:
/*
* transform from extents to local?
*/
- ASSERT(ifp->if_ext_max ==
- XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
error = 0;
error0:
/*
@@ -5434,7 +5433,7 @@ xfs_getbmapx_fix_eof_hole(
if (startblock == HOLESTARTBLOCK) {
mp = ip->i_mount;
out->bmv_block = -1;
- fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, ip->i_size));
+ fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, XFS_ISIZE(ip)));
fixlen -= out->bmv_offset;
if (prealloced && out->bmv_offset + out->bmv_length == end) {
/* Came to hole at EOF. Trim it. */
@@ -5522,7 +5521,7 @@ xfs_getbmap(
fixlen = XFS_MAXIOFFSET(mp);
} else {
prealloced = 0;
- fixlen = ip->i_size;
+ fixlen = XFS_ISIZE(ip);
}
}
@@ -5546,12 +5545,16 @@ xfs_getbmap(
if (bmv->bmv_count > ULONG_MAX / sizeof(struct getbmapx))
return XFS_ERROR(ENOMEM);
out = kmem_zalloc(bmv->bmv_count * sizeof(struct getbmapx), KM_MAYFAIL);
- if (!out)
- return XFS_ERROR(ENOMEM);
+ if (!out) {
+ out = kmem_zalloc_large(bmv->bmv_count *
+ sizeof(struct getbmapx));
+ if (!out)
+ return XFS_ERROR(ENOMEM);
+ }
xfs_ilock(ip, XFS_IOLOCK_SHARED);
if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) {
- if (ip->i_delayed_blks || ip->i_size > ip->i_d.di_size) {
+ if (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size) {
error = xfs_flush_pages(ip, 0, -1, 0, FI_REMAPF);
if (error)
goto out_unlock_iolock;
@@ -5671,7 +5674,10 @@ xfs_getbmap(
break;
}
- kmem_free(out);
+ if (is_vmalloc_addr(out))
+ kmem_free_large(out);
+ else
+ kmem_free(out);
return error;
}
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 4dff85c7d7e..6819b5163e3 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -45,8 +45,6 @@ static kmem_zone_t *xfs_buf_zone;
STATIC int xfsbufd(void *);
static struct workqueue_struct *xfslogd_workqueue;
-struct workqueue_struct *xfsdatad_workqueue;
-struct workqueue_struct *xfsconvertd_workqueue;
#ifdef XFS_BUF_LOCK_TRACKING
# define XB_SET_OWNER(bp) ((bp)->b_last_holder = current->pid)
@@ -1793,21 +1791,8 @@ xfs_buf_init(void)
if (!xfslogd_workqueue)
goto out_free_buf_zone;
- xfsdatad_workqueue = alloc_workqueue("xfsdatad", WQ_MEM_RECLAIM, 1);
- if (!xfsdatad_workqueue)
- goto out_destroy_xfslogd_workqueue;
-
- xfsconvertd_workqueue = alloc_workqueue("xfsconvertd",
- WQ_MEM_RECLAIM, 1);
- if (!xfsconvertd_workqueue)
- goto out_destroy_xfsdatad_workqueue;
-
return 0;
- out_destroy_xfsdatad_workqueue:
- destroy_workqueue(xfsdatad_workqueue);
- out_destroy_xfslogd_workqueue:
- destroy_workqueue(xfslogd_workqueue);
out_free_buf_zone:
kmem_zone_destroy(xfs_buf_zone);
out:
@@ -1817,8 +1802,6 @@ xfs_buf_init(void)
void
xfs_buf_terminate(void)
{
- destroy_workqueue(xfsconvertd_workqueue);
- destroy_workqueue(xfsdatad_workqueue);
destroy_workqueue(xfslogd_workqueue);
kmem_zone_destroy(xfs_buf_zone);
}
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index df7ffb0affe..5bf3be45f54 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -21,7 +21,6 @@
#include <linux/list.h>
#include <linux/types.h>
#include <linux/spinlock.h>
-#include <asm/system.h>
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/buffer_head.h>
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 77c74257c2a..7f1a6f5b05a 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -108,6 +108,8 @@ xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level,
int error;
xfs_trans_t *tp;
+ trace_xfs_da_node_create(args);
+
tp = args->trans;
error = xfs_da_get_buf(tp, args->dp, blkno, -1, &bp, whichfork);
if (error)
@@ -140,6 +142,8 @@ xfs_da_split(xfs_da_state_t *state)
xfs_dabuf_t *bp;
int max, action, error, i;
+ trace_xfs_da_split(state->args);
+
/*
* Walk back up the tree splitting/inserting/adjusting as necessary.
* If we need to insert and there isn't room, split the node, then
@@ -178,10 +182,12 @@ xfs_da_split(xfs_da_state_t *state)
state->extravalid = 1;
if (state->inleaf) {
state->extraafter = 0; /* before newblk */
+ trace_xfs_attr_leaf_split_before(state->args);
error = xfs_attr_leaf_split(state, oldblk,
&state->extrablk);
} else {
state->extraafter = 1; /* after newblk */
+ trace_xfs_attr_leaf_split_after(state->args);
error = xfs_attr_leaf_split(state, newblk,
&state->extrablk);
}
@@ -300,6 +306,8 @@ xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
xfs_mount_t *mp;
xfs_dir2_leaf_t *leaf;
+ trace_xfs_da_root_split(state->args);
+
/*
* Copy the existing (incorrect) block from the root node position
* to a free space somewhere.
@@ -380,6 +388,8 @@ xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
int newcount, error;
int useextra;
+ trace_xfs_da_node_split(state->args);
+
node = oldblk->bp->data;
ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
@@ -466,6 +476,8 @@ xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
int count, tmp;
xfs_trans_t *tp;
+ trace_xfs_da_node_rebalance(state->args);
+
node1 = blk1->bp->data;
node2 = blk2->bp->data;
/*
@@ -574,6 +586,8 @@ xfs_da_node_add(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
xfs_da_node_entry_t *btree;
int tmp;
+ trace_xfs_da_node_add(state->args);
+
node = oldblk->bp->data;
ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
ASSERT((oldblk->index >= 0) && (oldblk->index <= be16_to_cpu(node->hdr.count)));
@@ -619,6 +633,8 @@ xfs_da_join(xfs_da_state_t *state)
xfs_da_state_blk_t *drop_blk, *save_blk;
int action, error;
+ trace_xfs_da_join(state->args);
+
action = 0;
drop_blk = &state->path.blk[ state->path.active-1 ];
save_blk = &state->altpath.blk[ state->path.active-1 ];
@@ -723,6 +739,8 @@ xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk)
xfs_dabuf_t *bp;
int error;
+ trace_xfs_da_root_join(state->args);
+
args = state->args;
ASSERT(args != NULL);
ASSERT(root_blk->magic == XFS_DA_NODE_MAGIC);
@@ -941,6 +959,8 @@ xfs_da_node_remove(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk)
xfs_da_node_entry_t *btree;
int tmp;
+ trace_xfs_da_node_remove(state->args);
+
node = drop_blk->bp->data;
ASSERT(drop_blk->index < be16_to_cpu(node->hdr.count));
ASSERT(drop_blk->index >= 0);
@@ -984,6 +1004,8 @@ xfs_da_node_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
int tmp;
xfs_trans_t *tp;
+ trace_xfs_da_node_unbalance(state->args);
+
drop_node = drop_blk->bp->data;
save_node = save_blk->bp->data;
ASSERT(drop_node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
@@ -1230,6 +1252,7 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
/*
* Link new block in before existing block.
*/
+ trace_xfs_da_link_before(args);
new_info->forw = cpu_to_be32(old_blk->blkno);
new_info->back = old_info->back;
if (old_info->back) {
@@ -1251,6 +1274,7 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
/*
* Link new block in after existing block.
*/
+ trace_xfs_da_link_after(args);
new_info->forw = old_info->forw;
new_info->back = cpu_to_be32(old_blk->blkno);
if (old_info->forw) {
@@ -1348,6 +1372,7 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
* Unlink the leaf block from the doubly linked chain of leaves.
*/
if (be32_to_cpu(save_info->back) == drop_blk->blkno) {
+ trace_xfs_da_unlink_back(args);
save_info->back = drop_info->back;
if (drop_info->back) {
error = xfs_da_read_buf(args->trans, args->dp,
@@ -1365,6 +1390,7 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
xfs_da_buf_done(bp);
}
} else {
+ trace_xfs_da_unlink_forward(args);
save_info->forw = drop_info->forw;
if (drop_info->forw) {
error = xfs_da_read_buf(args->trans, args->dp,
@@ -1652,6 +1678,8 @@ xfs_da_grow_inode(
int count;
int error;
+ trace_xfs_da_grow_inode(args);
+
if (args->whichfork == XFS_DATA_FORK) {
bno = args->dp->i_mount->m_dirleafblk;
count = args->dp->i_mount->m_dirblkfsbs;
@@ -1690,6 +1718,8 @@ xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop,
xfs_dir2_leaf_t *dead_leaf2;
xfs_dahash_t dead_hash;
+ trace_xfs_da_swap_lastblock(args);
+
dead_buf = *dead_bufp;
dead_blkno = *dead_blknop;
tp = args->trans;
@@ -1878,6 +1908,8 @@ xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
xfs_trans_t *tp;
xfs_mount_t *mp;
+ trace_xfs_da_shrink_inode(args);
+
dp = args->dp;
w = args->whichfork;
tp = args->trans;
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index 654dc6f05ba..1137bbc5ecc 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -163,12 +163,14 @@ xfs_swap_extents_check_format(
/* Check temp in extent form to max in target */
if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) > ip->i_df.if_ext_max)
+ XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) >
+ XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
return EINVAL;
/* Check target in extent form to max in temp */
if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > tip->i_df.if_ext_max)
+ XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) >
+ XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
return EINVAL;
/*
@@ -180,18 +182,25 @@ xfs_swap_extents_check_format(
* (a common defrag case) which will occur when the temp inode is in
* extent format...
*/
- if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
- ((XFS_IFORK_BOFF(ip) &&
- tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip)) ||
- XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <= ip->i_df.if_ext_max))
- return EINVAL;
+ if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
+ if (XFS_IFORK_BOFF(ip) &&
+ tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip))
+ return EINVAL;
+ if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <=
+ XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
+ return EINVAL;
+ }
/* Reciprocal target->temp btree format checks */
- if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
- ((XFS_IFORK_BOFF(tip) &&
- ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip)) ||
- XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <= tip->i_df.if_ext_max))
- return EINVAL;
+ if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
+ if (XFS_IFORK_BOFF(tip) &&
+ ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip))
+ return EINVAL;
+
+ if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <=
+ XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
+ return EINVAL;
+ }
return 0;
}
@@ -206,7 +215,7 @@ xfs_swap_extents(
xfs_trans_t *tp;
xfs_bstat_t *sbp = &sxp->sx_stat;
xfs_ifork_t *tempifp, *ifp, *tifp;
- int ilf_fields, tilf_fields;
+ int src_log_flags, target_log_flags;
int error = 0;
int aforkblks = 0;
int taforkblks = 0;
@@ -349,16 +358,6 @@ xfs_swap_extents(
*tifp = *tempifp; /* struct copy */
/*
- * Fix the in-memory data fork values that are dependent on the fork
- * offset in the inode. We can't assume they remain the same as attr2
- * has dynamic fork offsets.
- */
- ifp->if_ext_max = XFS_IFORK_SIZE(ip, XFS_DATA_FORK) /
- (uint)sizeof(xfs_bmbt_rec_t);
- tifp->if_ext_max = XFS_IFORK_SIZE(tip, XFS_DATA_FORK) /
- (uint)sizeof(xfs_bmbt_rec_t);
-
- /*
* Fix the on-disk inode values
*/
tmp = (__uint64_t)ip->i_d.di_nblocks;
@@ -386,9 +385,8 @@ xfs_swap_extents(
tip->i_delayed_blks = ip->i_delayed_blks;
ip->i_delayed_blks = 0;
- ilf_fields = XFS_ILOG_CORE;
-
- switch(ip->i_d.di_format) {
+ src_log_flags = XFS_ILOG_CORE;
+ switch (ip->i_d.di_format) {
case XFS_DINODE_FMT_EXTENTS:
/* If the extents fit in the inode, fix the
* pointer. Otherwise it's already NULL or
@@ -398,16 +396,15 @@ xfs_swap_extents(
ifp->if_u1.if_extents =
ifp->if_u2.if_inline_ext;
}
- ilf_fields |= XFS_ILOG_DEXT;
+ src_log_flags |= XFS_ILOG_DEXT;
break;
case XFS_DINODE_FMT_BTREE:
- ilf_fields |= XFS_ILOG_DBROOT;
+ src_log_flags |= XFS_ILOG_DBROOT;
break;
}
- tilf_fields = XFS_ILOG_CORE;
-
- switch(tip->i_d.di_format) {
+ target_log_flags = XFS_ILOG_CORE;
+ switch (tip->i_d.di_format) {
case XFS_DINODE_FMT_EXTENTS:
/* If the extents fit in the inode, fix the
* pointer. Otherwise it's already NULL or
@@ -417,10 +414,10 @@ xfs_swap_extents(
tifp->if_u1.if_extents =
tifp->if_u2.if_inline_ext;
}
- tilf_fields |= XFS_ILOG_DEXT;
+ target_log_flags |= XFS_ILOG_DEXT;
break;
case XFS_DINODE_FMT_BTREE:
- tilf_fields |= XFS_ILOG_DBROOT;
+ target_log_flags |= XFS_ILOG_DBROOT;
break;
}
@@ -428,8 +425,8 @@ xfs_swap_extents(
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
- xfs_trans_log_inode(tp, ip, ilf_fields);
- xfs_trans_log_inode(tp, tip, tilf_fields);
+ xfs_trans_log_inode(tp, ip, src_log_flags);
+ xfs_trans_log_inode(tp, tip, target_log_flags);
/*
* If this is a synchronous mount, make sure that the
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index 9245e029b8e..d3b63aefd01 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -29,6 +29,7 @@
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_inode_item.h"
+#include "xfs_dir2.h"
#include "xfs_dir2_format.h"
#include "xfs_dir2_priv.h"
#include "xfs_error.h"
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index 286a051f12c..1ad3a4b8ca4 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -37,9 +37,9 @@ STATIC int
xfs_trim_extents(
struct xfs_mount *mp,
xfs_agnumber_t agno,
- xfs_fsblock_t start,
- xfs_fsblock_t end,
- xfs_fsblock_t minlen,
+ xfs_daddr_t start,
+ xfs_daddr_t end,
+ xfs_daddr_t minlen,
__uint64_t *blocks_trimmed)
{
struct block_device *bdev = mp->m_ddev_targp->bt_bdev;
@@ -67,7 +67,7 @@ xfs_trim_extents(
/*
* Look up the longest btree in the AGF and start with it.
*/
- error = xfs_alloc_lookup_le(cur, 0,
+ error = xfs_alloc_lookup_ge(cur, 0,
be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest), &i);
if (error)
goto out_del_cursor;
@@ -77,8 +77,10 @@ xfs_trim_extents(
* enough to be worth discarding.
*/
while (i) {
- xfs_agblock_t fbno;
- xfs_extlen_t flen;
+ xfs_agblock_t fbno;
+ xfs_extlen_t flen;
+ xfs_daddr_t dbno;
+ xfs_extlen_t dlen;
error = xfs_alloc_get_rec(cur, &fbno, &flen, &i);
if (error)
@@ -87,9 +89,17 @@ xfs_trim_extents(
ASSERT(flen <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest));
/*
+ * use daddr format for all range/len calculations as that is
+ * the format the range/len variables are supplied in by
+ * userspace.
+ */
+ dbno = XFS_AGB_TO_DADDR(mp, agno, fbno);
+ dlen = XFS_FSB_TO_BB(mp, flen);
+
+ /*
* Too small? Give up.
*/
- if (flen < minlen) {
+ if (dlen < minlen) {
trace_xfs_discard_toosmall(mp, agno, fbno, flen);
goto out_del_cursor;
}
@@ -99,8 +109,7 @@ xfs_trim_extents(
* supposed to discard skip it. Do not bother to trim
* down partially overlapping ranges for now.
*/
- if (XFS_AGB_TO_FSB(mp, agno, fbno) + flen < start ||
- XFS_AGB_TO_FSB(mp, agno, fbno) > end) {
+ if (dbno + dlen < start || dbno > end) {
trace_xfs_discard_exclude(mp, agno, fbno, flen);
goto next_extent;
}
@@ -115,10 +124,7 @@ xfs_trim_extents(
}
trace_xfs_discard_extent(mp, agno, fbno, flen);
- error = -blkdev_issue_discard(bdev,
- XFS_AGB_TO_DADDR(mp, agno, fbno),
- XFS_FSB_TO_BB(mp, flen),
- GFP_NOFS, 0);
+ error = -blkdev_issue_discard(bdev, dbno, dlen, GFP_NOFS, 0);
if (error)
goto out_del_cursor;
*blocks_trimmed += flen;
@@ -137,6 +143,15 @@ out_put_perag:
return error;
}
+/*
+ * trim a range of the filesystem.
+ *
+ * Note: the parameters passed from userspace are byte ranges into the
+ * filesystem which does not match to the format we use for filesystem block
+ * addressing. FSB addressing is sparse (AGNO|AGBNO), while the incoming format
+ * is a linear address range. Hence we need to use DADDR based conversions and
+ * comparisons for determining the correct offset and regions to trim.
+ */
int
xfs_ioc_trim(
struct xfs_mount *mp,
@@ -145,7 +160,7 @@ xfs_ioc_trim(
struct request_queue *q = mp->m_ddev_targp->bt_bdev->bd_disk->queue;
unsigned int granularity = q->limits.discard_granularity;
struct fstrim_range range;
- xfs_fsblock_t start, end, minlen;
+ xfs_daddr_t start, end, minlen;
xfs_agnumber_t start_agno, end_agno, agno;
__uint64_t blocks_trimmed = 0;
int error, last_error = 0;
@@ -159,22 +174,22 @@ xfs_ioc_trim(
/*
* Truncating down the len isn't actually quite correct, but using
- * XFS_B_TO_FSB would mean we trivially get overflows for values
+ * BBTOB would mean we trivially get overflows for values
* of ULLONG_MAX or slightly lower. And ULLONG_MAX is the default
* used by the fstrim application. In the end it really doesn't
* matter as trimming blocks is an advisory interface.
*/
- start = XFS_B_TO_FSBT(mp, range.start);
- end = start + XFS_B_TO_FSBT(mp, range.len) - 1;
- minlen = XFS_B_TO_FSB(mp, max_t(u64, granularity, range.minlen));
+ start = BTOBB(range.start);
+ end = start + BTOBBT(range.len) - 1;
+ minlen = BTOBB(max_t(u64, granularity, range.minlen));
- if (start >= mp->m_sb.sb_dblocks)
+ if (XFS_BB_TO_FSB(mp, start) >= mp->m_sb.sb_dblocks)
return -XFS_ERROR(EINVAL);
- if (end > mp->m_sb.sb_dblocks - 1)
- end = mp->m_sb.sb_dblocks - 1;
+ if (end > XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) - 1)
+ end = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)- 1;
- start_agno = XFS_FSB_TO_AGNO(mp, start);
- end_agno = XFS_FSB_TO_AGNO(mp, end);
+ start_agno = xfs_daddr_to_agno(mp, start);
+ end_agno = xfs_daddr_to_agno(mp, end);
for (agno = start_agno; agno <= end_agno; agno++) {
error = -xfs_trim_extents(mp, agno, start, end, minlen,
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index b4ff40b5f91..1155208fa83 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -43,11 +43,10 @@
* Lock order:
*
* ip->i_lock
- * qh->qh_lock
- * qi->qi_dqlist_lock
- * dquot->q_qlock (xfs_dqlock() and friends)
- * dquot->q_flush (xfs_dqflock() and friends)
- * xfs_Gqm->qm_dqfrlist_lock
+ * qi->qi_tree_lock
+ * dquot->q_qlock (xfs_dqlock() and friends)
+ * dquot->q_flush (xfs_dqflock() and friends)
+ * qi->qi_lru_lock
*
* If two dquots need to be locked the order is user before group/project,
* otherwise by the lowest id first, see xfs_dqlock2.
@@ -60,83 +59,10 @@ int xfs_dqreq_num;
int xfs_dqerror_mod = 33;
#endif
-static struct lock_class_key xfs_dquot_other_class;
-
-/*
- * Allocate and initialize a dquot. We don't always allocate fresh memory;
- * we try to reclaim a free dquot if the number of incore dquots are above
- * a threshold.
- * The only field inside the core that gets initialized at this point
- * is the d_id field. The idea is to fill in the entire q_core
- * when we read in the on disk dquot.
- */
-STATIC xfs_dquot_t *
-xfs_qm_dqinit(
- xfs_mount_t *mp,
- xfs_dqid_t id,
- uint type)
-{
- xfs_dquot_t *dqp;
- boolean_t brandnewdquot;
-
- brandnewdquot = xfs_qm_dqalloc_incore(&dqp);
- dqp->dq_flags = type;
- dqp->q_core.d_id = cpu_to_be32(id);
- dqp->q_mount = mp;
-
- /*
- * No need to re-initialize these if this is a reclaimed dquot.
- */
- if (brandnewdquot) {
- INIT_LIST_HEAD(&dqp->q_freelist);
- mutex_init(&dqp->q_qlock);
- init_waitqueue_head(&dqp->q_pinwait);
-
- /*
- * Because we want to use a counting completion, complete
- * the flush completion once to allow a single access to
- * the flush completion without blocking.
- */
- init_completion(&dqp->q_flush);
- complete(&dqp->q_flush);
-
- trace_xfs_dqinit(dqp);
- } else {
- /*
- * Only the q_core portion was zeroed in dqreclaim_one().
- * So, we need to reset others.
- */
- dqp->q_nrefs = 0;
- dqp->q_blkno = 0;
- INIT_LIST_HEAD(&dqp->q_mplist);
- INIT_LIST_HEAD(&dqp->q_hashlist);
- dqp->q_bufoffset = 0;
- dqp->q_fileoffset = 0;
- dqp->q_transp = NULL;
- dqp->q_gdquot = NULL;
- dqp->q_res_bcount = 0;
- dqp->q_res_icount = 0;
- dqp->q_res_rtbcount = 0;
- atomic_set(&dqp->q_pincount, 0);
- dqp->q_hash = NULL;
- ASSERT(list_empty(&dqp->q_freelist));
-
- trace_xfs_dqreuse(dqp);
- }
-
- /*
- * In either case we need to make sure group quotas have a different
- * lock class than user quotas, to make sure lockdep knows we can
- * locks of one of each at the same time.
- */
- if (!(type & XFS_DQ_USER))
- lockdep_set_class(&dqp->q_qlock, &xfs_dquot_other_class);
+struct kmem_zone *xfs_qm_dqtrxzone;
+static struct kmem_zone *xfs_qm_dqzone;
- /*
- * log item gets initialized later
- */
- return (dqp);
-}
+static struct lock_class_key xfs_dquot_other_class;
/*
* This is called to free all the memory associated with a dquot
@@ -145,12 +71,12 @@ void
xfs_qm_dqdestroy(
xfs_dquot_t *dqp)
{
- ASSERT(list_empty(&dqp->q_freelist));
+ ASSERT(list_empty(&dqp->q_lru));
mutex_destroy(&dqp->q_qlock);
- kmem_zone_free(xfs_Gqm->qm_dqzone, dqp);
+ kmem_zone_free(xfs_qm_dqzone, dqp);
- atomic_dec(&xfs_Gqm->qm_totaldquots);
+ XFS_STATS_DEC(xs_qm_dquot);
}
/*
@@ -215,10 +141,10 @@ xfs_qm_adjust_dqtimers(
if (!d->d_btimer) {
if ((d->d_blk_softlimit &&
- (be64_to_cpu(d->d_bcount) >=
+ (be64_to_cpu(d->d_bcount) >
be64_to_cpu(d->d_blk_softlimit))) ||
(d->d_blk_hardlimit &&
- (be64_to_cpu(d->d_bcount) >=
+ (be64_to_cpu(d->d_bcount) >
be64_to_cpu(d->d_blk_hardlimit)))) {
d->d_btimer = cpu_to_be32(get_seconds() +
mp->m_quotainfo->qi_btimelimit);
@@ -227,10 +153,10 @@ xfs_qm_adjust_dqtimers(
}
} else {
if ((!d->d_blk_softlimit ||
- (be64_to_cpu(d->d_bcount) <
+ (be64_to_cpu(d->d_bcount) <=
be64_to_cpu(d->d_blk_softlimit))) &&
(!d->d_blk_hardlimit ||
- (be64_to_cpu(d->d_bcount) <
+ (be64_to_cpu(d->d_bcount) <=
be64_to_cpu(d->d_blk_hardlimit)))) {
d->d_btimer = 0;
}
@@ -238,10 +164,10 @@ xfs_qm_adjust_dqtimers(
if (!d->d_itimer) {
if ((d->d_ino_softlimit &&
- (be64_to_cpu(d->d_icount) >=
+ (be64_to_cpu(d->d_icount) >
be64_to_cpu(d->d_ino_softlimit))) ||
(d->d_ino_hardlimit &&
- (be64_to_cpu(d->d_icount) >=
+ (be64_to_cpu(d->d_icount) >
be64_to_cpu(d->d_ino_hardlimit)))) {
d->d_itimer = cpu_to_be32(get_seconds() +
mp->m_quotainfo->qi_itimelimit);
@@ -250,10 +176,10 @@ xfs_qm_adjust_dqtimers(
}
} else {
if ((!d->d_ino_softlimit ||
- (be64_to_cpu(d->d_icount) <
+ (be64_to_cpu(d->d_icount) <=
be64_to_cpu(d->d_ino_softlimit))) &&
(!d->d_ino_hardlimit ||
- (be64_to_cpu(d->d_icount) <
+ (be64_to_cpu(d->d_icount) <=
be64_to_cpu(d->d_ino_hardlimit)))) {
d->d_itimer = 0;
}
@@ -261,10 +187,10 @@ xfs_qm_adjust_dqtimers(
if (!d->d_rtbtimer) {
if ((d->d_rtb_softlimit &&
- (be64_to_cpu(d->d_rtbcount) >=
+ (be64_to_cpu(d->d_rtbcount) >
be64_to_cpu(d->d_rtb_softlimit))) ||
(d->d_rtb_hardlimit &&
- (be64_to_cpu(d->d_rtbcount) >=
+ (be64_to_cpu(d->d_rtbcount) >
be64_to_cpu(d->d_rtb_hardlimit)))) {
d->d_rtbtimer = cpu_to_be32(get_seconds() +
mp->m_quotainfo->qi_rtbtimelimit);
@@ -273,10 +199,10 @@ xfs_qm_adjust_dqtimers(
}
} else {
if ((!d->d_rtb_softlimit ||
- (be64_to_cpu(d->d_rtbcount) <
+ (be64_to_cpu(d->d_rtbcount) <=
be64_to_cpu(d->d_rtb_softlimit))) &&
(!d->d_rtb_hardlimit ||
- (be64_to_cpu(d->d_rtbcount) <
+ (be64_to_cpu(d->d_rtbcount) <=
be64_to_cpu(d->d_rtb_hardlimit)))) {
d->d_rtbtimer = 0;
}
@@ -358,7 +284,7 @@ xfs_qm_dqalloc(
* Return if this type of quotas is turned off while we didn't
* have an inode lock
*/
- if (XFS_IS_THIS_QUOTA_OFF(dqp)) {
+ if (!xfs_this_quota_on(dqp->q_mount, dqp->dq_flags)) {
xfs_iunlock(quotip, XFS_ILOCK_EXCL);
return (ESRCH);
}
@@ -460,7 +386,7 @@ xfs_qm_dqtobp(
dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk;
xfs_ilock(quotip, XFS_ILOCK_SHARED);
- if (XFS_IS_THIS_QUOTA_OFF(dqp)) {
+ if (!xfs_this_quota_on(dqp->q_mount, dqp->dq_flags)) {
/*
* Return if this type of quotas is turned off while we
* didn't have the quota inode lock.
@@ -567,7 +493,32 @@ xfs_qm_dqread(
int error;
int cancelflags = 0;
- dqp = xfs_qm_dqinit(mp, id, type);
+
+ dqp = kmem_zone_zalloc(xfs_qm_dqzone, KM_SLEEP);
+
+ dqp->dq_flags = type;
+ dqp->q_core.d_id = cpu_to_be32(id);
+ dqp->q_mount = mp;
+ INIT_LIST_HEAD(&dqp->q_lru);
+ mutex_init(&dqp->q_qlock);
+ init_waitqueue_head(&dqp->q_pinwait);
+
+ /*
+ * Because we want to use a counting completion, complete
+ * the flush completion once to allow a single access to
+ * the flush completion without blocking.
+ */
+ init_completion(&dqp->q_flush);
+ complete(&dqp->q_flush);
+
+ /*
+ * Make sure group quotas have a different lock class than user
+ * quotas.
+ */
+ if (!(type & XFS_DQ_USER))
+ lockdep_set_class(&dqp->q_qlock, &xfs_dquot_other_class);
+
+ XFS_STATS_INC(xs_qm_dquot);
trace_xfs_dqread(dqp);
@@ -653,60 +604,6 @@ error0:
}
/*
- * Lookup a dquot in the incore dquot hashtable. We keep two separate
- * hashtables for user and group dquots; and, these are global tables
- * inside the XQM, not per-filesystem tables.
- * The hash chain must be locked by caller, and it is left locked
- * on return. Returning dquot is locked.
- */
-STATIC int
-xfs_qm_dqlookup(
- xfs_mount_t *mp,
- xfs_dqid_t id,
- xfs_dqhash_t *qh,
- xfs_dquot_t **O_dqpp)
-{
- xfs_dquot_t *dqp;
-
- ASSERT(mutex_is_locked(&qh->qh_lock));
-
- /*
- * Traverse the hashchain looking for a match
- */
- list_for_each_entry(dqp, &qh->qh_list, q_hashlist) {
- /*
- * We already have the hashlock. We don't need the
- * dqlock to look at the id field of the dquot, since the
- * id can't be modified without the hashlock anyway.
- */
- if (be32_to_cpu(dqp->q_core.d_id) != id || dqp->q_mount != mp)
- continue;
-
- trace_xfs_dqlookup_found(dqp);
-
- xfs_dqlock(dqp);
- if (dqp->dq_flags & XFS_DQ_FREEING) {
- *O_dqpp = NULL;
- xfs_dqunlock(dqp);
- return -1;
- }
-
- dqp->q_nrefs++;
-
- /*
- * move the dquot to the front of the hashchain
- */
- list_move(&dqp->q_hashlist, &qh->qh_list);
- trace_xfs_dqlookup_done(dqp);
- *O_dqpp = dqp;
- return 0;
- }
-
- *O_dqpp = NULL;
- return 1;
-}
-
-/*
* Given the file system, inode OR id, and type (UDQUOT/GDQUOT), return a
* a locked dquot, doing an allocation (if requested) as needed.
* When both an inode and an id are given, the inode's id takes precedence.
@@ -723,10 +620,10 @@ xfs_qm_dqget(
uint flags, /* DQALLOC, DQSUSER, DQREPAIR, DOWARN */
xfs_dquot_t **O_dqpp) /* OUT : locked incore dquot */
{
- xfs_dquot_t *dqp;
- xfs_dqhash_t *h;
- uint version;
- int error;
+ struct xfs_quotainfo *qi = mp->m_quotainfo;
+ struct radix_tree_root *tree = XFS_DQUOT_TREE(qi, type);
+ struct xfs_dquot *dqp;
+ int error;
ASSERT(XFS_IS_QUOTA_RUNNING(mp));
if ((! XFS_IS_UQUOTA_ON(mp) && type == XFS_DQ_USER) ||
@@ -734,7 +631,6 @@ xfs_qm_dqget(
(! XFS_IS_GQUOTA_ON(mp) && type == XFS_DQ_GROUP)) {
return (ESRCH);
}
- h = XFS_DQ_HASH(mp, id, type);
#ifdef DEBUG
if (xfs_do_dqerror) {
@@ -750,42 +646,33 @@ xfs_qm_dqget(
type == XFS_DQ_GROUP);
if (ip) {
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- if (type == XFS_DQ_USER)
- ASSERT(ip->i_udquot == NULL);
- else
- ASSERT(ip->i_gdquot == NULL);
+ ASSERT(xfs_inode_dquot(ip, type) == NULL);
}
#endif
restart:
- mutex_lock(&h->qh_lock);
+ mutex_lock(&qi->qi_tree_lock);
+ dqp = radix_tree_lookup(tree, id);
+ if (dqp) {
+ xfs_dqlock(dqp);
+ if (dqp->dq_flags & XFS_DQ_FREEING) {
+ xfs_dqunlock(dqp);
+ mutex_unlock(&qi->qi_tree_lock);
+ trace_xfs_dqget_freeing(dqp);
+ delay(1);
+ goto restart;
+ }
- /*
- * Look in the cache (hashtable).
- * The chain is kept locked during lookup.
- */
- switch (xfs_qm_dqlookup(mp, id, h, O_dqpp)) {
- case -1:
- XQM_STATS_INC(xqmstats.xs_qm_dquot_dups);
- mutex_unlock(&h->qh_lock);
- delay(1);
- goto restart;
- case 0:
- XQM_STATS_INC(xqmstats.xs_qm_dqcachehits);
- /*
- * The dquot was found, moved to the front of the chain,
- * taken off the freelist if it was on it, and locked
- * at this point. Just unlock the hashchain and return.
- */
- ASSERT(*O_dqpp);
- ASSERT(XFS_DQ_IS_LOCKED(*O_dqpp));
- mutex_unlock(&h->qh_lock);
- trace_xfs_dqget_hit(*O_dqpp);
- return 0; /* success */
- default:
- XQM_STATS_INC(xqmstats.xs_qm_dqcachemisses);
- break;
+ dqp->q_nrefs++;
+ mutex_unlock(&qi->qi_tree_lock);
+
+ trace_xfs_dqget_hit(dqp);
+ XFS_STATS_INC(xs_qm_dqcachehits);
+ *O_dqpp = dqp;
+ return 0;
}
+ mutex_unlock(&qi->qi_tree_lock);
+ XFS_STATS_INC(xs_qm_dqcachemisses);
/*
* Dquot cache miss. We don't want to keep the inode lock across
@@ -796,12 +683,6 @@ restart:
*/
if (ip)
xfs_iunlock(ip, XFS_ILOCK_EXCL);
- /*
- * Save the hashchain version stamp, and unlock the chain, so that
- * we don't keep the lock across a disk read
- */
- version = h->qh_version;
- mutex_unlock(&h->qh_lock);
error = xfs_qm_dqread(mp, id, type, flags, &dqp);
@@ -811,97 +692,53 @@ restart:
if (error)
return error;
- /*
- * Dquot lock comes after hashlock in the lock ordering
- */
if (ip) {
/*
* A dquot could be attached to this inode by now, since
* we had dropped the ilock.
*/
- if (type == XFS_DQ_USER) {
- if (!XFS_IS_UQUOTA_ON(mp)) {
- /* inode stays locked on return */
- xfs_qm_dqdestroy(dqp);
- return XFS_ERROR(ESRCH);
- }
- if (ip->i_udquot) {
+ if (xfs_this_quota_on(mp, type)) {
+ struct xfs_dquot *dqp1;
+
+ dqp1 = xfs_inode_dquot(ip, type);
+ if (dqp1) {
xfs_qm_dqdestroy(dqp);
- dqp = ip->i_udquot;
+ dqp = dqp1;
xfs_dqlock(dqp);
goto dqret;
}
} else {
- if (!XFS_IS_OQUOTA_ON(mp)) {
- /* inode stays locked on return */
- xfs_qm_dqdestroy(dqp);
- return XFS_ERROR(ESRCH);
- }
- if (ip->i_gdquot) {
- xfs_qm_dqdestroy(dqp);
- dqp = ip->i_gdquot;
- xfs_dqlock(dqp);
- goto dqret;
- }
+ /* inode stays locked on return */
+ xfs_qm_dqdestroy(dqp);
+ return XFS_ERROR(ESRCH);
}
}
- /*
- * Hashlock comes after ilock in lock order
- */
- mutex_lock(&h->qh_lock);
- if (version != h->qh_version) {
- xfs_dquot_t *tmpdqp;
+ mutex_lock(&qi->qi_tree_lock);
+ error = -radix_tree_insert(tree, id, dqp);
+ if (unlikely(error)) {
+ WARN_ON(error != EEXIST);
+
/*
- * Now, see if somebody else put the dquot in the
- * hashtable before us. This can happen because we didn't
- * keep the hashchain lock. We don't have to worry about
- * lock order between the two dquots here since dqp isn't
- * on any findable lists yet.
+ * Duplicate found. Just throw away the new dquot and start
+ * over.
*/
- switch (xfs_qm_dqlookup(mp, id, h, &tmpdqp)) {
- case 0:
- case -1:
- /*
- * Duplicate found, either in cache or on its way out.
- * Just throw away the new dquot and start over.
- */
- if (tmpdqp)
- xfs_qm_dqput(tmpdqp);
- mutex_unlock(&h->qh_lock);
- xfs_qm_dqdestroy(dqp);
- XQM_STATS_INC(xqmstats.xs_qm_dquot_dups);
- goto restart;
- default:
- break;
- }
+ mutex_unlock(&qi->qi_tree_lock);
+ trace_xfs_dqget_dup(dqp);
+ xfs_qm_dqdestroy(dqp);
+ XFS_STATS_INC(xs_qm_dquot_dups);
+ goto restart;
}
/*
- * Put the dquot at the beginning of the hash-chain and mp's list
- * LOCK ORDER: hashlock, freelistlock, mplistlock, udqlock, gdqlock ..
- */
- ASSERT(mutex_is_locked(&h->qh_lock));
- dqp->q_hash = h;
- list_add(&dqp->q_hashlist, &h->qh_list);
- h->qh_version++;
-
- /*
- * Attach this dquot to this filesystem's list of all dquots,
- * kept inside the mount structure in m_quotainfo field
- */
- mutex_lock(&mp->m_quotainfo->qi_dqlist_lock);
-
- /*
* We return a locked dquot to the caller, with a reference taken
*/
xfs_dqlock(dqp);
dqp->q_nrefs = 1;
- list_add(&dqp->q_mplist, &mp->m_quotainfo->qi_dqlist);
- mp->m_quotainfo->qi_dquots++;
- mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock);
- mutex_unlock(&h->qh_lock);
+ qi->qi_dquots++;
+ mutex_unlock(&qi->qi_tree_lock);
+
dqret:
ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL));
trace_xfs_dqget_miss(dqp);
@@ -910,37 +747,22 @@ restart:
}
-/*
- * Release a reference to the dquot (decrement ref-count)
- * and unlock it. If there is a group quota attached to this
- * dquot, carefully release that too without tripping over
- * deadlocks'n'stuff.
- */
-void
-xfs_qm_dqput(
+STATIC void
+xfs_qm_dqput_final(
struct xfs_dquot *dqp)
{
+ struct xfs_quotainfo *qi = dqp->q_mount->m_quotainfo;
struct xfs_dquot *gdqp;
- ASSERT(dqp->q_nrefs > 0);
- ASSERT(XFS_DQ_IS_LOCKED(dqp));
-
- trace_xfs_dqput(dqp);
-
-recurse:
- if (--dqp->q_nrefs > 0) {
- xfs_dqunlock(dqp);
- return;
- }
-
trace_xfs_dqput_free(dqp);
- mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
- if (list_empty(&dqp->q_freelist)) {
- list_add_tail(&dqp->q_freelist, &xfs_Gqm->qm_dqfrlist);
- xfs_Gqm->qm_dqfrlist_cnt++;
+ mutex_lock(&qi->qi_lru_lock);
+ if (list_empty(&dqp->q_lru)) {
+ list_add_tail(&dqp->q_lru, &qi->qi_lru_list);
+ qi->qi_lru_count++;
+ XFS_STATS_INC(xs_qm_dquot_unused);
}
- mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
+ mutex_unlock(&qi->qi_lru_lock);
/*
* If we just added a udquot to the freelist, then we want to release
@@ -957,10 +779,29 @@ recurse:
/*
* If we had a group quota hint, release it now.
*/
- if (gdqp) {
- dqp = gdqp;
- goto recurse;
- }
+ if (gdqp)
+ xfs_qm_dqput(gdqp);
+}
+
+/*
+ * Release a reference to the dquot (decrement ref-count) and unlock it.
+ *
+ * If there is a group quota attached to this dquot, carefully release that
+ * too without tripping over deadlocks'n'stuff.
+ */
+void
+xfs_qm_dqput(
+ struct xfs_dquot *dqp)
+{
+ ASSERT(dqp->q_nrefs > 0);
+ ASSERT(XFS_DQ_IS_LOCKED(dqp));
+
+ trace_xfs_dqput(dqp);
+
+ if (--dqp->q_nrefs > 0)
+ xfs_dqunlock(dqp);
+ else
+ xfs_qm_dqput_final(dqp);
}
/*
@@ -1142,17 +983,6 @@ xfs_qm_dqflush(
}
-void
-xfs_dqunlock(
- xfs_dquot_t *dqp)
-{
- xfs_dqunlock_nonotify(dqp);
- if (dqp->q_logitem.qli_dquot == dqp) {
- xfs_trans_unlocked_item(dqp->q_logitem.qli_item.li_ailp,
- &dqp->q_logitem.qli_item);
- }
-}
-
/*
* Lock two xfs_dquot structures.
*
@@ -1182,85 +1012,6 @@ xfs_dqlock2(
}
/*
- * Take a dquot out of the mount's dqlist as well as the hashlist. This is
- * called via unmount as well as quotaoff, and the purge will always succeed.
- */
-void
-xfs_qm_dqpurge(
- struct xfs_dquot *dqp)
-{
- struct xfs_mount *mp = dqp->q_mount;
- struct xfs_dqhash *qh = dqp->q_hash;
-
- xfs_dqlock(dqp);
-
- /*
- * If we're turning off quotas, we have to make sure that, for
- * example, we don't delete quota disk blocks while dquots are
- * in the process of getting written to those disk blocks.
- * This dquot might well be on AIL, and we can't leave it there
- * if we're turning off quotas. Basically, we need this flush
- * lock, and are willing to block on it.
- */
- if (!xfs_dqflock_nowait(dqp)) {
- /*
- * Block on the flush lock after nudging dquot buffer,
- * if it is incore.
- */
- xfs_dqflock_pushbuf_wait(dqp);
- }
-
- /*
- * If we are turning this type of quotas off, we don't care
- * about the dirty metadata sitting in this dquot. OTOH, if
- * we're unmounting, we do care, so we flush it and wait.
- */
- if (XFS_DQ_IS_DIRTY(dqp)) {
- int error;
-
- /*
- * We don't care about getting disk errors here. We need
- * to purge this dquot anyway, so we go ahead regardless.
- */
- error = xfs_qm_dqflush(dqp, SYNC_WAIT);
- if (error)
- xfs_warn(mp, "%s: dquot %p flush failed",
- __func__, dqp);
- xfs_dqflock(dqp);
- }
-
- ASSERT(atomic_read(&dqp->q_pincount) == 0);
- ASSERT(XFS_FORCED_SHUTDOWN(mp) ||
- !(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL));
-
- xfs_dqfunlock(dqp);
- xfs_dqunlock(dqp);
-
- mutex_lock(&qh->qh_lock);
- list_del_init(&dqp->q_hashlist);
- qh->qh_version++;
- mutex_unlock(&qh->qh_lock);
-
- mutex_lock(&mp->m_quotainfo->qi_dqlist_lock);
- list_del_init(&dqp->q_mplist);
- mp->m_quotainfo->qi_dqreclaims++;
- mp->m_quotainfo->qi_dquots--;
- mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock);
-
- /*
- * We move dquots to the freelist as soon as their reference count
- * hits zero, so it really should be on the freelist here.
- */
- mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
- ASSERT(!list_empty(&dqp->q_freelist));
- list_del_init(&dqp->q_freelist);
- xfs_Gqm->qm_dqfrlist_cnt--;
- mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
-
- xfs_qm_dqdestroy(dqp);
-}
-
-/*
* Give the buffer a little push if it is incore and
* wait on the flush lock.
*/
@@ -1292,3 +1043,31 @@ xfs_dqflock_pushbuf_wait(
out_lock:
xfs_dqflock(dqp);
}
+
+int __init
+xfs_qm_init(void)
+{
+ xfs_qm_dqzone =
+ kmem_zone_init(sizeof(struct xfs_dquot), "xfs_dquot");
+ if (!xfs_qm_dqzone)
+ goto out;
+
+ xfs_qm_dqtrxzone =
+ kmem_zone_init(sizeof(struct xfs_dquot_acct), "xfs_dqtrx");
+ if (!xfs_qm_dqtrxzone)
+ goto out_free_dqzone;
+
+ return 0;
+
+out_free_dqzone:
+ kmem_zone_destroy(xfs_qm_dqzone);
+out:
+ return -ENOMEM;
+}
+
+void
+xfs_qm_exit(void)
+{
+ kmem_zone_destroy(xfs_qm_dqtrxzone);
+ kmem_zone_destroy(xfs_qm_dqzone);
+}
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index a1d91d8f180..ef9190bd8b3 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -29,16 +29,6 @@
* when quotas are off.
*/
-/*
- * The hash chain headers (hash buckets)
- */
-typedef struct xfs_dqhash {
- struct list_head qh_list;
- struct mutex qh_lock;
- uint qh_version; /* ever increasing version */
- uint qh_nelems; /* number of dquots on the list */
-} xfs_dqhash_t;
-
struct xfs_mount;
struct xfs_trans;
@@ -47,10 +37,7 @@ struct xfs_trans;
*/
typedef struct xfs_dquot {
uint dq_flags; /* various flags (XFS_DQ_*) */
- struct list_head q_freelist; /* global free list of dquots */
- struct list_head q_mplist; /* mount's list of dquots */
- struct list_head q_hashlist; /* gloabl hash list of dquots */
- xfs_dqhash_t *q_hash; /* the hashchain header */
+ struct list_head q_lru; /* global free list of dquots */
struct xfs_mount*q_mount; /* filesystem this relates to */
struct xfs_trans*q_transp; /* trans this belongs to currently */
uint q_nrefs; /* # active refs from inodes */
@@ -110,11 +97,37 @@ static inline void xfs_dqlock(struct xfs_dquot *dqp)
mutex_lock(&dqp->q_qlock);
}
-static inline void xfs_dqunlock_nonotify(struct xfs_dquot *dqp)
+static inline void xfs_dqunlock(struct xfs_dquot *dqp)
{
mutex_unlock(&dqp->q_qlock);
}
+static inline int xfs_this_quota_on(struct xfs_mount *mp, int type)
+{
+ switch (type & XFS_DQ_ALLTYPES) {
+ case XFS_DQ_USER:
+ return XFS_IS_UQUOTA_ON(mp);
+ case XFS_DQ_GROUP:
+ case XFS_DQ_PROJ:
+ return XFS_IS_OQUOTA_ON(mp);
+ default:
+ return 0;
+ }
+}
+
+static inline xfs_dquot_t *xfs_inode_dquot(struct xfs_inode *ip, int type)
+{
+ switch (type & XFS_DQ_ALLTYPES) {
+ case XFS_DQ_USER:
+ return ip->i_udquot;
+ case XFS_DQ_GROUP:
+ case XFS_DQ_PROJ:
+ return ip->i_gdquot;
+ default:
+ return NULL;
+ }
+}
+
#define XFS_DQ_IS_LOCKED(dqp) (mutex_is_locked(&((dqp)->q_qlock)))
#define XFS_DQ_IS_DIRTY(dqp) ((dqp)->dq_flags & XFS_DQ_DIRTY)
#define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER)
@@ -125,15 +138,10 @@ static inline void xfs_dqunlock_nonotify(struct xfs_dquot *dqp)
XFS_DQ_TO_QINF(dqp)->qi_uquotaip : \
XFS_DQ_TO_QINF(dqp)->qi_gquotaip)
-#define XFS_IS_THIS_QUOTA_OFF(d) (! (XFS_QM_ISUDQ(d) ? \
- (XFS_IS_UQUOTA_ON((d)->q_mount)) : \
- (XFS_IS_OQUOTA_ON((d)->q_mount))))
-
extern int xfs_qm_dqread(struct xfs_mount *, xfs_dqid_t, uint,
uint, struct xfs_dquot **);
extern void xfs_qm_dqdestroy(xfs_dquot_t *);
extern int xfs_qm_dqflush(xfs_dquot_t *, uint);
-extern void xfs_qm_dqpurge(xfs_dquot_t *);
extern void xfs_qm_dqunpin_wait(xfs_dquot_t *);
extern void xfs_qm_adjust_dqtimers(xfs_mount_t *,
xfs_disk_dquot_t *);
@@ -144,7 +152,6 @@ extern int xfs_qm_dqget(xfs_mount_t *, xfs_inode_t *,
extern void xfs_qm_dqput(xfs_dquot_t *);
extern void xfs_dqlock2(struct xfs_dquot *, struct xfs_dquot *);
-extern void xfs_dqunlock(struct xfs_dquot *);
extern void xfs_dqflock_pushbuf_wait(struct xfs_dquot *dqp);
static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index f675f3d9d7b..54a67dd9ac0 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -163,7 +163,6 @@ xfs_file_fsync(
struct inode *inode = file->f_mapping->host;
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
- struct xfs_trans *tp;
int error = 0;
int log_flushed = 0;
xfs_lsn_t lsn = 0;
@@ -194,75 +193,18 @@ xfs_file_fsync(
}
/*
- * We always need to make sure that the required inode state is safe on
- * disk. The inode might be clean but we still might need to force the
- * log because of committed transactions that haven't hit the disk yet.
- * Likewise, there could be unflushed non-transactional changes to the
- * inode core that have to go to disk and this requires us to issue
- * a synchronous transaction to capture these changes correctly.
- *
- * This code relies on the assumption that if the i_update_core field
- * of the inode is clear and the inode is unpinned then it is clean
- * and no action is required.
+ * All metadata updates are logged, which means that we just have
+ * to flush the log up to the latest LSN that touched the inode.
*/
xfs_ilock(ip, XFS_ILOCK_SHARED);
-
- /*
- * First check if the VFS inode is marked dirty. All the dirtying
- * of non-transactional updates do not go through mark_inode_dirty*,
- * which allows us to distinguish between pure timestamp updates
- * and i_size updates which need to be caught for fdatasync.
- * After that also check for the dirty state in the XFS inode, which
- * might gets cleared when the inode gets written out via the AIL
- * or xfs_iflush_cluster.
- */
- if (((inode->i_state & I_DIRTY_DATASYNC) ||
- ((inode->i_state & I_DIRTY_SYNC) && !datasync)) &&
- ip->i_update_core) {
- /*
- * Kick off a transaction to log the inode core to get the
- * updates. The sync transaction will also force the log.
- */
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
- tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
- error = xfs_trans_reserve(tp, 0,
- XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
- if (error) {
- xfs_trans_cancel(tp, 0);
- return -error;
- }
- xfs_ilock(ip, XFS_ILOCK_EXCL);
-
- /*
- * Note - it's possible that we might have pushed ourselves out
- * of the way during trans_reserve which would flush the inode.
- * But there's no guarantee that the inode buffer has actually
- * gone out yet (it's delwri). Plus the buffer could be pinned
- * anyway if it's part of an inode in another recent
- * transaction. So we play it safe and fire off the
- * transaction anyway.
- */
- xfs_trans_ijoin(tp, ip, 0);
- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- error = xfs_trans_commit(tp, 0);
-
- lsn = ip->i_itemp->ili_last_lsn;
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- } else {
- /*
- * Timestamps/size haven't changed since last inode flush or
- * inode transaction commit. That means either nothing got
- * written or a transaction committed which caught the updates.
- * If the latter happened and the transaction hasn't hit the
- * disk yet, the inode will be still be pinned. If it is,
- * force the log.
- */
- if (xfs_ipincount(ip))
+ if (xfs_ipincount(ip)) {
+ if (!datasync ||
+ (ip->i_itemp->ili_fields & ~XFS_ILOG_TIMESTAMP))
lsn = ip->i_itemp->ili_last_lsn;
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
}
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
- if (!error && lsn)
+ if (lsn)
error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed);
/*
@@ -327,7 +269,7 @@ xfs_file_aio_read(
mp->m_rtdev_targp : mp->m_ddev_targp;
if ((iocb->ki_pos & target->bt_smask) ||
(size & target->bt_smask)) {
- if (iocb->ki_pos == ip->i_size)
+ if (iocb->ki_pos == i_size_read(inode))
return 0;
return -XFS_ERROR(EINVAL);
}
@@ -412,51 +354,6 @@ xfs_file_splice_read(
return ret;
}
-STATIC void
-xfs_aio_write_isize_update(
- struct inode *inode,
- loff_t *ppos,
- ssize_t bytes_written)
-{
- struct xfs_inode *ip = XFS_I(inode);
- xfs_fsize_t isize = i_size_read(inode);
-
- if (bytes_written > 0)
- XFS_STATS_ADD(xs_write_bytes, bytes_written);
-
- if (unlikely(bytes_written < 0 && bytes_written != -EFAULT &&
- *ppos > isize))
- *ppos = isize;
-
- if (*ppos > ip->i_size) {
- xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
- if (*ppos > ip->i_size)
- ip->i_size = *ppos;
- xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
- }
-}
-
-/*
- * If this was a direct or synchronous I/O that failed (such as ENOSPC) then
- * part of the I/O may have been written to disk before the error occurred. In
- * this case the on-disk file size may have been adjusted beyond the in-memory
- * file size and now needs to be truncated back.
- */
-STATIC void
-xfs_aio_write_newsize_update(
- struct xfs_inode *ip,
- xfs_fsize_t new_size)
-{
- if (new_size == ip->i_new_size) {
- xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
- if (new_size == ip->i_new_size)
- ip->i_new_size = 0;
- if (ip->i_d.di_size > ip->i_size)
- ip->i_d.di_size = ip->i_size;
- xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
- }
-}
-
/*
* xfs_file_splice_write() does not use xfs_rw_ilock() because
* generic_file_splice_write() takes the i_mutex itself. This, in theory,
@@ -475,7 +372,6 @@ xfs_file_splice_write(
{
struct inode *inode = outfilp->f_mapping->host;
struct xfs_inode *ip = XFS_I(inode);
- xfs_fsize_t new_size;
int ioflags = 0;
ssize_t ret;
@@ -489,19 +385,12 @@ xfs_file_splice_write(
xfs_ilock(ip, XFS_IOLOCK_EXCL);
- new_size = *ppos + count;
-
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- if (new_size > ip->i_size)
- ip->i_new_size = new_size;
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
-
trace_xfs_file_splice_write(ip, count, *ppos, ioflags);
ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
+ if (ret > 0)
+ XFS_STATS_ADD(xs_write_bytes, ret);
- xfs_aio_write_isize_update(inode, ppos, ret);
- xfs_aio_write_newsize_update(ip, new_size);
xfs_iunlock(ip, XFS_IOLOCK_EXCL);
return ret;
}
@@ -689,72 +578,61 @@ out_lock:
/*
* Common pre-write limit and setup checks.
*
- * Returns with iolock held according to @iolock.
+ * Called with the iolocked held either shared and exclusive according to
+ * @iolock, and returns with it held. Might upgrade the iolock to exclusive
+ * if called for a direct write beyond i_size.
*/
STATIC ssize_t
xfs_file_aio_write_checks(
struct file *file,
loff_t *pos,
size_t *count,
- xfs_fsize_t *new_sizep,
int *iolock)
{
struct inode *inode = file->f_mapping->host;
struct xfs_inode *ip = XFS_I(inode);
- xfs_fsize_t new_size;
int error = 0;
xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
- *new_sizep = 0;
restart:
error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode));
if (error) {
- xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock);
- *iolock = 0;
+ xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
return error;
}
- if (likely(!(file->f_mode & FMODE_NOCMTIME)))
- file_update_time(file);
-
/*
* If the offset is beyond the size of the file, we need to zero any
* blocks that fall between the existing EOF and the start of this
- * write. There is no need to issue zeroing if another in-flght IO ends
- * at or before this one If zeronig is needed and we are currently
- * holding the iolock shared, we need to update it to exclusive which
- * involves dropping all locks and relocking to maintain correct locking
- * order. If we do this, restart the function to ensure all checks and
- * values are still valid.
+ * write. If zeroing is needed and we are currently holding the
+ * iolock shared, we need to update it to exclusive which involves
+ * dropping all locks and relocking to maintain correct locking order.
+ * If we do this, restart the function to ensure all checks and values
+ * are still valid.
*/
- if ((ip->i_new_size && *pos > ip->i_new_size) ||
- (!ip->i_new_size && *pos > ip->i_size)) {
+ if (*pos > i_size_read(inode)) {
if (*iolock == XFS_IOLOCK_SHARED) {
xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock);
*iolock = XFS_IOLOCK_EXCL;
xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
goto restart;
}
- error = -xfs_zero_eof(ip, *pos, ip->i_size);
- }
-
- /*
- * If this IO extends beyond EOF, we may need to update ip->i_new_size.
- * We have already zeroed space beyond EOF (if necessary). Only update
- * ip->i_new_size if this IO ends beyond any other in-flight writes.
- */
- new_size = *pos + *count;
- if (new_size > ip->i_size) {
- if (new_size > ip->i_new_size)
- ip->i_new_size = new_size;
- *new_sizep = new_size;
+ error = -xfs_zero_eof(ip, *pos, i_size_read(inode));
}
-
xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
if (error)
return error;
/*
+ * Updating the timestamps will grab the ilock again from
+ * xfs_fs_dirty_inode, so we have to call it after dropping the
+ * lock above. Eventually we should look into a way to avoid
+ * the pointless lock roundtrip.
+ */
+ if (likely(!(file->f_mode & FMODE_NOCMTIME)))
+ file_update_time(file);
+
+ /*
* If we're writing the file then make sure to clear the setuid and
* setgid bits if the process is not being run by root. This keeps
* people from modifying setuid and setgid binaries.
@@ -794,9 +672,7 @@ xfs_file_dio_aio_write(
const struct iovec *iovp,
unsigned long nr_segs,
loff_t pos,
- size_t ocount,
- xfs_fsize_t *new_size,
- int *iolock)
+ size_t ocount)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
@@ -806,10 +682,10 @@ xfs_file_dio_aio_write(
ssize_t ret = 0;
size_t count = ocount;
int unaligned_io = 0;
+ int iolock;
struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ?
mp->m_rtdev_targp : mp->m_ddev_targp;
- *iolock = 0;
if ((pos & target->bt_smask) || (count & target->bt_smask))
return -XFS_ERROR(EINVAL);
@@ -824,31 +700,31 @@ xfs_file_dio_aio_write(
* EOF zeroing cases and fill out the new inode size as appropriate.
*/
if (unaligned_io || mapping->nrpages)
- *iolock = XFS_IOLOCK_EXCL;
+ iolock = XFS_IOLOCK_EXCL;
else
- *iolock = XFS_IOLOCK_SHARED;
- xfs_rw_ilock(ip, *iolock);
+ iolock = XFS_IOLOCK_SHARED;
+ xfs_rw_ilock(ip, iolock);
/*
* Recheck if there are cached pages that need invalidate after we got
* the iolock to protect against other threads adding new pages while
* we were waiting for the iolock.
*/
- if (mapping->nrpages && *iolock == XFS_IOLOCK_SHARED) {
- xfs_rw_iunlock(ip, *iolock);
- *iolock = XFS_IOLOCK_EXCL;
- xfs_rw_ilock(ip, *iolock);
+ if (mapping->nrpages && iolock == XFS_IOLOCK_SHARED) {
+ xfs_rw_iunlock(ip, iolock);
+ iolock = XFS_IOLOCK_EXCL;
+ xfs_rw_ilock(ip, iolock);
}
- ret = xfs_file_aio_write_checks(file, &pos, &count, new_size, iolock);
+ ret = xfs_file_aio_write_checks(file, &pos, &count, &iolock);
if (ret)
- return ret;
+ goto out;
if (mapping->nrpages) {
ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1,
FI_REMAPF_LOCKED);
if (ret)
- return ret;
+ goto out;
}
/*
@@ -857,15 +733,18 @@ xfs_file_dio_aio_write(
*/
if (unaligned_io)
inode_dio_wait(inode);
- else if (*iolock == XFS_IOLOCK_EXCL) {
+ else if (iolock == XFS_IOLOCK_EXCL) {
xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
- *iolock = XFS_IOLOCK_SHARED;
+ iolock = XFS_IOLOCK_SHARED;
}
trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
ret = generic_file_direct_write(iocb, iovp,
&nr_segs, pos, &iocb->ki_pos, count, ocount);
+out:
+ xfs_rw_iunlock(ip, iolock);
+
/* No fallback to buffered IO on errors for XFS. */
ASSERT(ret < 0 || ret == count);
return ret;
@@ -877,9 +756,7 @@ xfs_file_buffered_aio_write(
const struct iovec *iovp,
unsigned long nr_segs,
loff_t pos,
- size_t ocount,
- xfs_fsize_t *new_size,
- int *iolock)
+ size_t ocount)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
@@ -887,14 +764,14 @@ xfs_file_buffered_aio_write(
struct xfs_inode *ip = XFS_I(inode);
ssize_t ret;
int enospc = 0;
+ int iolock = XFS_IOLOCK_EXCL;
size_t count = ocount;
- *iolock = XFS_IOLOCK_EXCL;
- xfs_rw_ilock(ip, *iolock);
+ xfs_rw_ilock(ip, iolock);
- ret = xfs_file_aio_write_checks(file, &pos, &count, new_size, iolock);
+ ret = xfs_file_aio_write_checks(file, &pos, &count, &iolock);
if (ret)
- return ret;
+ goto out;
/* We can write back this queue in page reclaim */
current->backing_dev_info = mapping->backing_dev_info;
@@ -908,13 +785,15 @@ write_retry:
* page locks and retry *once*
*/
if (ret == -ENOSPC && !enospc) {
- ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
- if (ret)
- return ret;
enospc = 1;
- goto write_retry;
+ ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
+ if (!ret)
+ goto write_retry;
}
+
current->backing_dev_info = NULL;
+out:
+ xfs_rw_iunlock(ip, iolock);
return ret;
}
@@ -930,9 +809,7 @@ xfs_file_aio_write(
struct inode *inode = mapping->host;
struct xfs_inode *ip = XFS_I(inode);
ssize_t ret;
- int iolock;
size_t ocount = 0;
- xfs_fsize_t new_size = 0;
XFS_STATS_INC(xs_write_calls);
@@ -951,33 +828,22 @@ xfs_file_aio_write(
return -EIO;
if (unlikely(file->f_flags & O_DIRECT))
- ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos,
- ocount, &new_size, &iolock);
+ ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, ocount);
else
ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos,
- ocount, &new_size, &iolock);
+ ocount);
- xfs_aio_write_isize_update(inode, &iocb->ki_pos, ret);
+ if (ret > 0) {
+ ssize_t err;
- if (ret <= 0)
- goto out_unlock;
+ XFS_STATS_ADD(xs_write_bytes, ret);
- /* Handle various SYNC-type writes */
- if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
- loff_t end = pos + ret - 1;
- int error;
-
- xfs_rw_iunlock(ip, iolock);
- error = xfs_file_fsync(file, pos, end,
- (file->f_flags & __O_SYNC) ? 0 : 1);
- xfs_rw_ilock(ip, iolock);
- if (error)
- ret = error;
+ /* Handle various SYNC-type writes */
+ err = generic_write_sync(file, pos, ret);
+ if (err < 0)
+ ret = err;
}
-out_unlock:
- xfs_aio_write_newsize_update(ip, new_size);
- xfs_rw_iunlock(ip, iolock);
return ret;
}
diff --git a/fs/xfs/xfs_fs_subr.c b/fs/xfs/xfs_fs_subr.c
index ed88ed16811..652b875a9d4 100644
--- a/fs/xfs/xfs_fs_subr.c
+++ b/fs/xfs/xfs_fs_subr.c
@@ -90,7 +90,7 @@ xfs_wait_on_pages(
if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) {
return -filemap_fdatawait_range(mapping, first,
- last == -1 ? ip->i_size - 1 : last);
+ last == -1 ? XFS_ISIZE(ip) - 1 : last);
}
return 0;
}
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 3960a066d7f..bcc6c249b2c 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -77,7 +77,7 @@ xfs_inode_alloc(
ASSERT(atomic_read(&ip->i_pincount) == 0);
ASSERT(!spin_is_locked(&ip->i_flags_lock));
- ASSERT(completion_done(&ip->i_flush));
+ ASSERT(!xfs_isiflocked(ip));
ASSERT(ip->i_ino == 0);
mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
@@ -91,11 +91,8 @@ xfs_inode_alloc(
ip->i_afp = NULL;
memset(&ip->i_df, 0, sizeof(xfs_ifork_t));
ip->i_flags = 0;
- ip->i_update_core = 0;
ip->i_delayed_blks = 0;
memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));
- ip->i_size = 0;
- ip->i_new_size = 0;
return ip;
}
@@ -150,7 +147,7 @@ xfs_inode_free(
/* asserts to verify all state is correct here */
ASSERT(atomic_read(&ip->i_pincount) == 0);
ASSERT(!spin_is_locked(&ip->i_flags_lock));
- ASSERT(completion_done(&ip->i_flush));
+ ASSERT(!xfs_isiflocked(ip));
/*
* Because we use RCU freeing we need to ensure the inode always
@@ -292,7 +289,7 @@ xfs_iget_cache_hit(
if (lock_flags != 0)
xfs_ilock(ip, lock_flags);
- xfs_iflags_clear(ip, XFS_ISTALE);
+ xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE);
XFS_STATS_INC(xs_ig_found);
return 0;
@@ -317,6 +314,7 @@ xfs_iget_cache_miss(
struct xfs_inode *ip;
int error;
xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino);
+ int iflags;
ip = xfs_inode_alloc(mp, ino);
if (!ip)
@@ -352,9 +350,23 @@ xfs_iget_cache_miss(
BUG();
}
- spin_lock(&pag->pag_ici_lock);
+ /*
+ * These values must be set before inserting the inode into the radix
+ * tree as the moment it is inserted a concurrent lookup (allowed by the
+ * RCU locking mechanism) can find it and that lookup must see that this
+ * is an inode currently under construction (i.e. that XFS_INEW is set).
+ * The ip->i_flags_lock that protects the XFS_INEW flag forms the
+ * memory barrier that ensures this detection works correctly at lookup
+ * time.
+ */
+ iflags = XFS_INEW;
+ if (flags & XFS_IGET_DONTCACHE)
+ iflags |= XFS_IDONTCACHE;
+ ip->i_udquot = ip->i_gdquot = NULL;
+ xfs_iflags_set(ip, iflags);
/* insert the new inode */
+ spin_lock(&pag->pag_ici_lock);
error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
if (unlikely(error)) {
WARN_ON(error != -EEXIST);
@@ -362,11 +374,6 @@ xfs_iget_cache_miss(
error = EAGAIN;
goto out_preload_end;
}
-
- /* These values _must_ be set before releasing the radix tree lock! */
- ip->i_udquot = ip->i_gdquot = NULL;
- xfs_iflags_set(ip, XFS_INEW);
-
spin_unlock(&pag->pag_ici_lock);
radix_tree_preload_end();
@@ -420,6 +427,15 @@ xfs_iget(
xfs_perag_t *pag;
xfs_agino_t agino;
+ /*
+ * xfs_reclaim_inode() uses the ILOCK to ensure an inode
+ * doesn't get freed while it's being referenced during a
+ * radix tree traversal here. It assumes this function
+ * aqcuires only the ILOCK (and therefore it has no need to
+ * involve the IOLOCK in this synchronization).
+ */
+ ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0);
+
/* reject inode numbers outside existing AGs */
if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
return EINVAL;
@@ -450,8 +466,6 @@ again:
*ipp = ip;
- ASSERT(ip->i_df.if_ext_max ==
- XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t));
/*
* If we have a real type for an on-disk inode, we can set ops(&unlock)
* now. If it's a new inode being created, xfs_ialloc will handle it.
@@ -646,8 +660,7 @@ xfs_iunlock(
(XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
(XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
- ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_IUNLOCK_NONOTIFY |
- XFS_LOCK_DEP_MASK)) == 0);
+ ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
ASSERT(lock_flags != 0);
if (lock_flags & XFS_IOLOCK_EXCL)
@@ -660,16 +673,6 @@ xfs_iunlock(
else if (lock_flags & XFS_ILOCK_SHARED)
mrunlock_shared(&ip->i_lock);
- if ((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) &&
- !(lock_flags & XFS_IUNLOCK_NONOTIFY) && ip->i_itemp) {
- /*
- * Let the AIL know that this item has been unlocked in case
- * it is in the AIL and anyone is waiting on it. Don't do
- * this if the caller has asked us not to.
- */
- xfs_trans_unlocked_item(ip->i_itemp->ili_item.li_ailp,
- (xfs_log_item_t*)(ip->i_itemp));
- }
trace_xfs_iunlock(ip, lock_flags, _RET_IP_);
}
@@ -715,3 +718,19 @@ xfs_isilocked(
return 0;
}
#endif
+
+void
+__xfs_iflock(
+ struct xfs_inode *ip)
+{
+ wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT);
+ DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT);
+
+ do {
+ prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+ if (xfs_isiflocked(ip))
+ io_schedule();
+ } while (!xfs_iflock_nowait(ip));
+
+ finish_wait(wq, &wait.wait);
+}
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 9dda7cc3284..bc46c0a133d 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -299,11 +299,8 @@ xfs_iformat(
{
xfs_attr_shortform_t *atp;
int size;
- int error;
+ int error = 0;
xfs_fsize_t di_size;
- ip->i_df.if_ext_max =
- XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
- error = 0;
if (unlikely(be32_to_cpu(dip->di_nextents) +
be16_to_cpu(dip->di_anextents) >
@@ -350,7 +347,6 @@ xfs_iformat(
return XFS_ERROR(EFSCORRUPTED);
}
ip->i_d.di_size = 0;
- ip->i_size = 0;
ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip);
break;
@@ -409,10 +405,10 @@ xfs_iformat(
}
if (!XFS_DFORK_Q(dip))
return 0;
+
ASSERT(ip->i_afp == NULL);
ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS);
- ip->i_afp->if_ext_max =
- XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
+
switch (dip->di_aformat) {
case XFS_DINODE_FMT_LOCAL:
atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip);
@@ -604,10 +600,11 @@ xfs_iformat_btree(
* or the number of extents is greater than the number of
* blocks.
*/
- if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max
- || XFS_BMDR_SPACE_CALC(nrecs) >
- XFS_DFORK_SIZE(dip, ip->i_mount, whichfork)
- || XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) {
+ if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <=
+ XFS_IFORK_MAXEXT(ip, whichfork) ||
+ XFS_BMDR_SPACE_CALC(nrecs) >
+ XFS_DFORK_SIZE(dip, ip->i_mount, whichfork) ||
+ XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) {
xfs_warn(ip->i_mount, "corrupt inode %Lu (btree).",
(unsigned long long) ip->i_ino);
XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW,
@@ -835,12 +832,6 @@ xfs_iread(
* with the uninitialized part of it.
*/
ip->i_d.di_mode = 0;
- /*
- * Initialize the per-fork minima and maxima for a new
- * inode here. xfs_iformat will do it for old inodes.
- */
- ip->i_df.if_ext_max =
- XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
}
/*
@@ -861,7 +852,6 @@ xfs_iread(
}
ip->i_delayed_blks = 0;
- ip->i_size = ip->i_d.di_size;
/*
* Mark the buffer containing the inode as something to keep
@@ -1051,7 +1041,6 @@ xfs_ialloc(
}
ip->i_d.di_size = 0;
- ip->i_size = 0;
ip->i_d.di_nextents = 0;
ASSERT(ip->i_d.di_nblocks == 0);
@@ -1166,52 +1155,6 @@ xfs_ialloc(
}
/*
- * Check to make sure that there are no blocks allocated to the
- * file beyond the size of the file. We don't check this for
- * files with fixed size extents or real time extents, but we
- * at least do it for regular files.
- */
-#ifdef DEBUG
-STATIC void
-xfs_isize_check(
- struct xfs_inode *ip,
- xfs_fsize_t isize)
-{
- struct xfs_mount *mp = ip->i_mount;
- xfs_fileoff_t map_first;
- int nimaps;
- xfs_bmbt_irec_t imaps[2];
- int error;
-
- if (!S_ISREG(ip->i_d.di_mode))
- return;
-
- if (XFS_IS_REALTIME_INODE(ip))
- return;
-
- if (ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE)
- return;
-
- nimaps = 2;
- map_first = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
- /*
- * The filesystem could be shutting down, so bmapi may return
- * an error.
- */
- error = xfs_bmapi_read(ip, map_first,
- (XFS_B_TO_FSB(mp,
- (xfs_ufsize_t)XFS_MAXIOFFSET(mp)) - map_first),
- imaps, &nimaps, XFS_BMAPI_ENTIRE);
- if (error)
- return;
- ASSERT(nimaps == 1);
- ASSERT(imaps[0].br_startblock == HOLESTARTBLOCK);
-}
-#else /* DEBUG */
-#define xfs_isize_check(ip, isize)
-#endif /* DEBUG */
-
-/*
* Free up the underlying blocks past new_size. The new size must be smaller
* than the current size. This routine can be used both for the attribute and
* data fork, and does not modify the inode size, which is left to the caller.
@@ -1252,12 +1195,14 @@ xfs_itruncate_extents(
int done = 0;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
- ASSERT(new_size <= ip->i_size);
+ ASSERT(new_size <= XFS_ISIZE(ip));
ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
ASSERT(ip->i_itemp != NULL);
ASSERT(ip->i_itemp->ili_lock_flags == 0);
ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
+ trace_xfs_itruncate_extents_start(ip, new_size);
+
/*
* Since it is possible for space to become allocated beyond
* the end of the file (in a crash where the space is allocated
@@ -1325,6 +1270,14 @@ xfs_itruncate_extents(
goto out;
}
+ /*
+ * Always re-log the inode so that our permanent transaction can keep
+ * on rolling it forward in the log.
+ */
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ trace_xfs_itruncate_extents_end(ip, new_size);
+
out:
*tpp = tp;
return error;
@@ -1338,74 +1291,6 @@ out_bmap_cancel:
goto out;
}
-int
-xfs_itruncate_data(
- struct xfs_trans **tpp,
- struct xfs_inode *ip,
- xfs_fsize_t new_size)
-{
- int error;
-
- trace_xfs_itruncate_data_start(ip, new_size);
-
- /*
- * The first thing we do is set the size to new_size permanently on
- * disk. This way we don't have to worry about anyone ever being able
- * to look at the data being freed even in the face of a crash.
- * What we're getting around here is the case where we free a block, it
- * is allocated to another file, it is written to, and then we crash.
- * If the new data gets written to the file but the log buffers
- * containing the free and reallocation don't, then we'd end up with
- * garbage in the blocks being freed. As long as we make the new_size
- * permanent before actually freeing any blocks it doesn't matter if
- * they get written to.
- */
- if (ip->i_d.di_nextents > 0) {
- /*
- * If we are not changing the file size then do not update
- * the on-disk file size - we may be called from
- * xfs_inactive_free_eofblocks(). If we update the on-disk
- * file size and then the system crashes before the contents
- * of the file are flushed to disk then the files may be
- * full of holes (ie NULL files bug).
- */
- if (ip->i_size != new_size) {
- ip->i_d.di_size = new_size;
- ip->i_size = new_size;
- xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE);
- }
- }
-
- error = xfs_itruncate_extents(tpp, ip, XFS_DATA_FORK, new_size);
- if (error)
- return error;
-
- /*
- * If we are not changing the file size then do not update the on-disk
- * file size - we may be called from xfs_inactive_free_eofblocks().
- * If we update the on-disk file size and then the system crashes
- * before the contents of the file are flushed to disk then the files
- * may be full of holes (ie NULL files bug).
- */
- xfs_isize_check(ip, new_size);
- if (ip->i_size != new_size) {
- ip->i_d.di_size = new_size;
- ip->i_size = new_size;
- }
-
- ASSERT(new_size != 0 || ip->i_delayed_blks == 0);
- ASSERT(new_size != 0 || ip->i_d.di_nextents == 0);
-
- /*
- * Always re-log the inode so that our permanent transaction can keep
- * on rolling it forward in the log.
- */
- xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE);
-
- trace_xfs_itruncate_data_end(ip, new_size);
- return 0;
-}
-
/*
* This is called when the inode's link count goes to 0.
* We place the on-disk inode on a list in the AGI. It
@@ -1771,14 +1656,13 @@ retry:
iip = ip->i_itemp;
if (!iip || xfs_inode_clean(ip)) {
ASSERT(ip != free_ip);
- ip->i_update_core = 0;
xfs_ifunlock(ip);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
continue;
}
- iip->ili_last_fields = iip->ili_format.ilf_fields;
- iip->ili_format.ilf_fields = 0;
+ iip->ili_last_fields = iip->ili_fields;
+ iip->ili_fields = 0;
iip->ili_logged = 1;
xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
&iip->ili_item.li_lsn);
@@ -1824,8 +1708,7 @@ xfs_ifree(
ASSERT(ip->i_d.di_nlink == 0);
ASSERT(ip->i_d.di_nextents == 0);
ASSERT(ip->i_d.di_anextents == 0);
- ASSERT((ip->i_d.di_size == 0 && ip->i_size == 0) ||
- (!S_ISREG(ip->i_d.di_mode)));
+ ASSERT(ip->i_d.di_size == 0 || !S_ISREG(ip->i_d.di_mode));
ASSERT(ip->i_d.di_nblocks == 0);
/*
@@ -1844,8 +1727,6 @@ xfs_ifree(
ip->i_d.di_flags = 0;
ip->i_d.di_dmevmask = 0;
ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */
- ip->i_df.if_ext_max =
- XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
/*
@@ -2151,7 +2032,7 @@ xfs_idestroy_fork(
* once someone is waiting for it to be unpinned.
*/
static void
-xfs_iunpin_nowait(
+xfs_iunpin(
struct xfs_inode *ip)
{
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
@@ -2163,14 +2044,29 @@ xfs_iunpin_nowait(
}
+static void
+__xfs_iunpin_wait(
+ struct xfs_inode *ip)
+{
+ wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IPINNED_BIT);
+ DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IPINNED_BIT);
+
+ xfs_iunpin(ip);
+
+ do {
+ prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+ if (xfs_ipincount(ip))
+ io_schedule();
+ } while (xfs_ipincount(ip));
+ finish_wait(wq, &wait.wait);
+}
+
void
xfs_iunpin_wait(
struct xfs_inode *ip)
{
- if (xfs_ipincount(ip)) {
- xfs_iunpin_nowait(ip);
- wait_event(ip->i_ipin_wait, (xfs_ipincount(ip) == 0));
- }
+ if (xfs_ipincount(ip))
+ __xfs_iunpin_wait(ip);
}
/*
@@ -2280,7 +2176,7 @@ xfs_iflush_fork(
mp = ip->i_mount;
switch (XFS_IFORK_FORMAT(ip, whichfork)) {
case XFS_DINODE_FMT_LOCAL:
- if ((iip->ili_format.ilf_fields & dataflag[whichfork]) &&
+ if ((iip->ili_fields & dataflag[whichfork]) &&
(ifp->if_bytes > 0)) {
ASSERT(ifp->if_u1.if_data != NULL);
ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
@@ -2290,8 +2186,8 @@ xfs_iflush_fork(
case XFS_DINODE_FMT_EXTENTS:
ASSERT((ifp->if_flags & XFS_IFEXTENTS) ||
- !(iip->ili_format.ilf_fields & extflag[whichfork]));
- if ((iip->ili_format.ilf_fields & extflag[whichfork]) &&
+ !(iip->ili_fields & extflag[whichfork]));
+ if ((iip->ili_fields & extflag[whichfork]) &&
(ifp->if_bytes > 0)) {
ASSERT(xfs_iext_get_ext(ifp, 0));
ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0);
@@ -2301,7 +2197,7 @@ xfs_iflush_fork(
break;
case XFS_DINODE_FMT_BTREE:
- if ((iip->ili_format.ilf_fields & brootflag[whichfork]) &&
+ if ((iip->ili_fields & brootflag[whichfork]) &&
(ifp->if_broot_bytes > 0)) {
ASSERT(ifp->if_broot != NULL);
ASSERT(ifp->if_broot_bytes <=
@@ -2314,14 +2210,14 @@ xfs_iflush_fork(
break;
case XFS_DINODE_FMT_DEV:
- if (iip->ili_format.ilf_fields & XFS_ILOG_DEV) {
+ if (iip->ili_fields & XFS_ILOG_DEV) {
ASSERT(whichfork == XFS_DATA_FORK);
xfs_dinode_put_rdev(dip, ip->i_df.if_u2.if_rdev);
}
break;
case XFS_DINODE_FMT_UUID:
- if (iip->ili_format.ilf_fields & XFS_ILOG_UUID) {
+ if (iip->ili_fields & XFS_ILOG_UUID) {
ASSERT(whichfork == XFS_DATA_FORK);
memcpy(XFS_DFORK_DPTR(dip),
&ip->i_df.if_u2.if_uuid,
@@ -2510,9 +2406,9 @@ xfs_iflush(
XFS_STATS_INC(xs_iflush_count);
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
- ASSERT(!completion_done(&ip->i_flush));
+ ASSERT(xfs_isiflocked(ip));
ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
- ip->i_d.di_nextents > ip->i_df.if_ext_max);
+ ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
iip = ip->i_itemp;
mp = ip->i_mount;
@@ -2529,7 +2425,7 @@ xfs_iflush(
* out for us if they occur after the log force completes.
*/
if (!(flags & SYNC_WAIT) && xfs_ipincount(ip)) {
- xfs_iunpin_nowait(ip);
+ xfs_iunpin(ip);
xfs_ifunlock(ip);
return EAGAIN;
}
@@ -2554,9 +2450,8 @@ xfs_iflush(
* to disk, because the log record didn't make it to disk!
*/
if (XFS_FORCED_SHUTDOWN(mp)) {
- ip->i_update_core = 0;
if (iip)
- iip->ili_format.ilf_fields = 0;
+ iip->ili_fields = 0;
xfs_ifunlock(ip);
return XFS_ERROR(EIO);
}
@@ -2626,9 +2521,9 @@ xfs_iflush_int(
#endif
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
- ASSERT(!completion_done(&ip->i_flush));
+ ASSERT(xfs_isiflocked(ip));
ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
- ip->i_d.di_nextents > ip->i_df.if_ext_max);
+ ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
iip = ip->i_itemp;
mp = ip->i_mount;
@@ -2636,26 +2531,6 @@ xfs_iflush_int(
/* set *dip = inode's place in the buffer */
dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
- /*
- * Clear i_update_core before copying out the data.
- * This is for coordination with our timestamp updates
- * that don't hold the inode lock. They will always
- * update the timestamps BEFORE setting i_update_core,
- * so if we clear i_update_core after they set it we
- * are guaranteed to see their updates to the timestamps.
- * I believe that this depends on strongly ordered memory
- * semantics, but we have that. We use the SYNCHRONIZE
- * macro to make sure that the compiler does not reorder
- * the i_update_core access below the data copy below.
- */
- ip->i_update_core = 0;
- SYNCHRONIZE();
-
- /*
- * Make sure to get the latest timestamps from the Linux inode.
- */
- xfs_synchronize_times(ip);
-
if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC),
mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) {
xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
@@ -2766,36 +2641,33 @@ xfs_iflush_int(
xfs_inobp_check(mp, bp);
/*
- * We've recorded everything logged in the inode, so we'd
- * like to clear the ilf_fields bits so we don't log and
- * flush things unnecessarily. However, we can't stop
- * logging all this information until the data we've copied
- * into the disk buffer is written to disk. If we did we might
- * overwrite the copy of the inode in the log with all the
- * data after re-logging only part of it, and in the face of
- * a crash we wouldn't have all the data we need to recover.
+ * We've recorded everything logged in the inode, so we'd like to clear
+ * the ili_fields bits so we don't log and flush things unnecessarily.
+ * However, we can't stop logging all this information until the data
+ * we've copied into the disk buffer is written to disk. If we did we
+ * might overwrite the copy of the inode in the log with all the data
+ * after re-logging only part of it, and in the face of a crash we
+ * wouldn't have all the data we need to recover.
*
- * What we do is move the bits to the ili_last_fields field.
- * When logging the inode, these bits are moved back to the
- * ilf_fields field. In the xfs_iflush_done() routine we
- * clear ili_last_fields, since we know that the information
- * those bits represent is permanently on disk. As long as
- * the flush completes before the inode is logged again, then
- * both ilf_fields and ili_last_fields will be cleared.
+ * What we do is move the bits to the ili_last_fields field. When
+ * logging the inode, these bits are moved back to the ili_fields field.
+ * In the xfs_iflush_done() routine we clear ili_last_fields, since we
+ * know that the information those bits represent is permanently on
+ * disk. As long as the flush completes before the inode is logged
+ * again, then both ili_fields and ili_last_fields will be cleared.
*
- * We can play with the ilf_fields bits here, because the inode
- * lock must be held exclusively in order to set bits there
- * and the flush lock protects the ili_last_fields bits.
- * Set ili_logged so the flush done
- * routine can tell whether or not to look in the AIL.
- * Also, store the current LSN of the inode so that we can tell
- * whether the item has moved in the AIL from xfs_iflush_done().
- * In order to read the lsn we need the AIL lock, because
- * it is a 64 bit value that cannot be read atomically.
- */
- if (iip != NULL && iip->ili_format.ilf_fields != 0) {
- iip->ili_last_fields = iip->ili_format.ilf_fields;
- iip->ili_format.ilf_fields = 0;
+ * We can play with the ili_fields bits here, because the inode lock
+ * must be held exclusively in order to set bits there and the flush
+ * lock protects the ili_last_fields bits. Set ili_logged so the flush
+ * done routine can tell whether or not to look in the AIL. Also, store
+ * the current LSN of the inode so that we can tell whether the item has
+ * moved in the AIL from xfs_iflush_done(). In order to read the lsn we
+ * need the AIL lock, because it is a 64 bit value that cannot be read
+ * atomically.
+ */
+ if (iip != NULL && iip->ili_fields != 0) {
+ iip->ili_last_fields = iip->ili_fields;
+ iip->ili_fields = 0;
iip->ili_logged = 1;
xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
@@ -2814,8 +2686,7 @@ xfs_iflush_int(
} else {
/*
* We're flushing an inode which is not in the AIL and has
- * not been logged but has i_update_core set. For this
- * case we can use a B_DELWRI flush and immediately drop
+ * not been logged. For this case we can immediately drop
* the inode flush lock because we can avoid the whole
* AIL state thing. It's OK to drop the flush lock now,
* because we've already locked the buffer and to do anything
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index f0e6b151ba3..7fee3387e1c 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -66,7 +66,6 @@ typedef struct xfs_ifork {
struct xfs_btree_block *if_broot; /* file's incore btree root */
short if_broot_bytes; /* bytes allocated for root */
unsigned char if_flags; /* per-fork flags */
- unsigned char if_ext_max; /* max # of extent records */
union {
xfs_bmbt_rec_host_t *if_extents;/* linear map file exts */
xfs_ext_irec_t *if_ext_irec; /* irec map file exts */
@@ -206,12 +205,12 @@ typedef struct xfs_icdinode {
((w) == XFS_DATA_FORK ? \
((ip)->i_d.di_nextents = (n)) : \
((ip)->i_d.di_anextents = (n)))
-
+#define XFS_IFORK_MAXEXT(ip, w) \
+ (XFS_IFORK_SIZE(ip, w) / sizeof(xfs_bmbt_rec_t))
#ifdef __KERNEL__
-struct bhv_desc;
struct xfs_buf;
struct xfs_bmap_free;
struct xfs_bmbt_irec;
@@ -220,12 +219,6 @@ struct xfs_mount;
struct xfs_trans;
struct xfs_dquot;
-typedef struct dm_attrs_s {
- __uint32_t da_dmevmask; /* DMIG event mask */
- __uint16_t da_dmstate; /* DMIG state info */
- __uint16_t da_pad; /* DMIG extra padding */
-} dm_attrs_t;
-
typedef struct xfs_inode {
/* Inode linking and identification information. */
struct xfs_mount *i_mount; /* fs mount struct ptr */
@@ -244,27 +237,18 @@ typedef struct xfs_inode {
struct xfs_inode_log_item *i_itemp; /* logging information */
mrlock_t i_lock; /* inode lock */
mrlock_t i_iolock; /* inode IO lock */
- struct completion i_flush; /* inode flush completion q */
atomic_t i_pincount; /* inode pin count */
- wait_queue_head_t i_ipin_wait; /* inode pinning wait queue */
spinlock_t i_flags_lock; /* inode i_flags lock */
/* Miscellaneous state. */
- unsigned short i_flags; /* see defined flags below */
- unsigned char i_update_core; /* timestamps/size is dirty */
+ unsigned long i_flags; /* see defined flags below */
unsigned int i_delayed_blks; /* count of delay alloc blks */
xfs_icdinode_t i_d; /* most of ondisk inode */
- xfs_fsize_t i_size; /* in-memory size */
- xfs_fsize_t i_new_size; /* size when write completes */
-
/* VFS inode */
struct inode i_vnode; /* embedded VFS inode */
} xfs_inode_t;
-#define XFS_ISIZE(ip) S_ISREG((ip)->i_d.di_mode) ? \
- (ip)->i_size : (ip)->i_d.di_size;
-
/* Convert from vfs inode to xfs inode */
static inline struct xfs_inode *XFS_I(struct inode *inode)
{
@@ -278,6 +262,32 @@ static inline struct inode *VFS_I(struct xfs_inode *ip)
}
/*
+ * For regular files we only update the on-disk filesize when actually
+ * writing data back to disk. Until then only the copy in the VFS inode
+ * is uptodate.
+ */
+static inline xfs_fsize_t XFS_ISIZE(struct xfs_inode *ip)
+{
+ if (S_ISREG(ip->i_d.di_mode))
+ return i_size_read(VFS_I(ip));
+ return ip->i_d.di_size;
+}
+
+/*
+ * If this I/O goes past the on-disk inode size update it unless it would
+ * be past the current in-core inode size.
+ */
+static inline xfs_fsize_t
+xfs_new_eof(struct xfs_inode *ip, xfs_fsize_t new_size)
+{
+ xfs_fsize_t i_size = i_size_read(VFS_I(ip));
+
+ if (new_size > i_size)
+ new_size = i_size;
+ return new_size > ip->i_d.di_size ? new_size : 0;
+}
+
+/*
* i_flags helper functions
*/
static inline void
@@ -331,6 +341,19 @@ xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags)
return ret;
}
+static inline int
+xfs_iflags_test_and_set(xfs_inode_t *ip, unsigned short flags)
+{
+ int ret;
+
+ spin_lock(&ip->i_flags_lock);
+ ret = ip->i_flags & flags;
+ if (!ret)
+ ip->i_flags |= flags;
+ spin_unlock(&ip->i_flags_lock);
+ return ret;
+}
+
/*
* Project quota id helpers (previously projid was 16bit only
* and using two 16bit values to hold new 32bit projid was chosen
@@ -351,39 +374,24 @@ xfs_set_projid(struct xfs_inode *ip,
}
/*
- * Manage the i_flush queue embedded in the inode. This completion
- * queue synchronizes processes attempting to flush the in-core
- * inode back to disk.
- */
-static inline void xfs_iflock(xfs_inode_t *ip)
-{
- wait_for_completion(&ip->i_flush);
-}
-
-static inline int xfs_iflock_nowait(xfs_inode_t *ip)
-{
- return try_wait_for_completion(&ip->i_flush);
-}
-
-static inline void xfs_ifunlock(xfs_inode_t *ip)
-{
- complete(&ip->i_flush);
-}
-
-/*
* In-core inode flags.
*/
-#define XFS_IRECLAIM 0x0001 /* started reclaiming this inode */
-#define XFS_ISTALE 0x0002 /* inode has been staled */
-#define XFS_IRECLAIMABLE 0x0004 /* inode can be reclaimed */
-#define XFS_INEW 0x0008 /* inode has just been allocated */
-#define XFS_IFILESTREAM 0x0010 /* inode is in a filestream directory */
-#define XFS_ITRUNCATED 0x0020 /* truncated down so flush-on-close */
-#define XFS_IDIRTY_RELEASE 0x0040 /* dirty release already seen */
+#define XFS_IRECLAIM (1 << 0) /* started reclaiming this inode */
+#define XFS_ISTALE (1 << 1) /* inode has been staled */
+#define XFS_IRECLAIMABLE (1 << 2) /* inode can be reclaimed */
+#define XFS_INEW (1 << 3) /* inode has just been allocated */
+#define XFS_IFILESTREAM (1 << 4) /* inode is in a filestream dir. */
+#define XFS_ITRUNCATED (1 << 5) /* truncated down so flush-on-close */
+#define XFS_IDIRTY_RELEASE (1 << 6) /* dirty release already seen */
+#define __XFS_IFLOCK_BIT 7 /* inode is being flushed right now */
+#define XFS_IFLOCK (1 << __XFS_IFLOCK_BIT)
+#define __XFS_IPINNED_BIT 8 /* wakeup key for zero pin count */
+#define XFS_IPINNED (1 << __XFS_IPINNED_BIT)
+#define XFS_IDONTCACHE (1 << 9) /* don't cache the inode long term */
/*
* Per-lifetime flags need to be reset when re-using a reclaimable inode during
- * inode lookup. Thi prevents unintended behaviour on the new inode from
+ * inode lookup. This prevents unintended behaviour on the new inode from
* ocurring.
*/
#define XFS_IRECLAIM_RESET_FLAGS \
@@ -392,6 +400,34 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
XFS_IFILESTREAM);
/*
+ * Synchronize processes attempting to flush the in-core inode back to disk.
+ */
+
+extern void __xfs_iflock(struct xfs_inode *ip);
+
+static inline int xfs_iflock_nowait(struct xfs_inode *ip)
+{
+ return !xfs_iflags_test_and_set(ip, XFS_IFLOCK);
+}
+
+static inline void xfs_iflock(struct xfs_inode *ip)
+{
+ if (!xfs_iflock_nowait(ip))
+ __xfs_iflock(ip);
+}
+
+static inline void xfs_ifunlock(struct xfs_inode *ip)
+{
+ xfs_iflags_clear(ip, XFS_IFLOCK);
+ wake_up_bit(&ip->i_flags, __XFS_IFLOCK_BIT);
+}
+
+static inline int xfs_isiflocked(struct xfs_inode *ip)
+{
+ return xfs_iflags_test(ip, XFS_IFLOCK);
+}
+
+/*
* Flags for inode locking.
* Bit ranges: 1<<1 - 1<<16-1 -- iolock/ilock modes (bitfield)
* 1<<16 - 1<<32-1 -- lockdep annotation (integers)
@@ -400,7 +436,6 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
#define XFS_IOLOCK_SHARED (1<<1)
#define XFS_ILOCK_EXCL (1<<2)
#define XFS_ILOCK_SHARED (1<<3)
-#define XFS_IUNLOCK_NONOTIFY (1<<4)
#define XFS_LOCK_MASK (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED \
| XFS_ILOCK_EXCL | XFS_ILOCK_SHARED)
@@ -409,8 +444,7 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
{ XFS_IOLOCK_EXCL, "IOLOCK_EXCL" }, \
{ XFS_IOLOCK_SHARED, "IOLOCK_SHARED" }, \
{ XFS_ILOCK_EXCL, "ILOCK_EXCL" }, \
- { XFS_ILOCK_SHARED, "ILOCK_SHARED" }, \
- { XFS_IUNLOCK_NONOTIFY, "IUNLOCK_NONOTIFY" }
+ { XFS_ILOCK_SHARED, "ILOCK_SHARED" }
/*
@@ -491,8 +525,6 @@ int xfs_ifree(struct xfs_trans *, xfs_inode_t *,
struct xfs_bmap_free *);
int xfs_itruncate_extents(struct xfs_trans **, struct xfs_inode *,
int, xfs_fsize_t);
-int xfs_itruncate_data(struct xfs_trans **, struct xfs_inode *,
- xfs_fsize_t);
int xfs_iunlink(struct xfs_trans *, xfs_inode_t *);
void xfs_iext_realloc(xfs_inode_t *, int, int);
@@ -502,10 +534,6 @@ void xfs_promote_inode(struct xfs_inode *);
void xfs_lock_inodes(xfs_inode_t **, int, uint);
void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
-void xfs_synchronize_times(xfs_inode_t *);
-void xfs_mark_inode_dirty(xfs_inode_t *);
-void xfs_mark_inode_dirty_sync(xfs_inode_t *);
-
#define IHOLD(ip) \
do { \
ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \
@@ -526,6 +554,7 @@ do { \
*/
#define XFS_IGET_CREATE 0x1
#define XFS_IGET_UNTRUSTED 0x2
+#define XFS_IGET_DONTCACHE 0x4
int xfs_inotobp(struct xfs_mount *, struct xfs_trans *,
xfs_ino_t, struct xfs_dinode **,
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index cfd6c7f8cc3..05d924efcea 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -57,79 +57,28 @@ xfs_inode_item_size(
struct xfs_inode *ip = iip->ili_inode;
uint nvecs = 2;
- /*
- * Only log the data/extents/b-tree root if there is something
- * left to log.
- */
- iip->ili_format.ilf_fields |= XFS_ILOG_CORE;
-
switch (ip->i_d.di_format) {
case XFS_DINODE_FMT_EXTENTS:
- iip->ili_format.ilf_fields &=
- ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
- XFS_ILOG_DEV | XFS_ILOG_UUID);
- if ((iip->ili_format.ilf_fields & XFS_ILOG_DEXT) &&
- (ip->i_d.di_nextents > 0) &&
- (ip->i_df.if_bytes > 0)) {
- ASSERT(ip->i_df.if_u1.if_extents != NULL);
+ if ((iip->ili_fields & XFS_ILOG_DEXT) &&
+ ip->i_d.di_nextents > 0 &&
+ ip->i_df.if_bytes > 0)
nvecs++;
- } else {
- iip->ili_format.ilf_fields &= ~XFS_ILOG_DEXT;
- }
break;
case XFS_DINODE_FMT_BTREE:
- ASSERT(ip->i_df.if_ext_max ==
- XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
- iip->ili_format.ilf_fields &=
- ~(XFS_ILOG_DDATA | XFS_ILOG_DEXT |
- XFS_ILOG_DEV | XFS_ILOG_UUID);
- if ((iip->ili_format.ilf_fields & XFS_ILOG_DBROOT) &&
- (ip->i_df.if_broot_bytes > 0)) {
- ASSERT(ip->i_df.if_broot != NULL);
+ if ((iip->ili_fields & XFS_ILOG_DBROOT) &&
+ ip->i_df.if_broot_bytes > 0)
nvecs++;
- } else {
- ASSERT(!(iip->ili_format.ilf_fields &
- XFS_ILOG_DBROOT));
-#ifdef XFS_TRANS_DEBUG
- if (iip->ili_root_size > 0) {
- ASSERT(iip->ili_root_size ==
- ip->i_df.if_broot_bytes);
- ASSERT(memcmp(iip->ili_orig_root,
- ip->i_df.if_broot,
- iip->ili_root_size) == 0);
- } else {
- ASSERT(ip->i_df.if_broot_bytes == 0);
- }
-#endif
- iip->ili_format.ilf_fields &= ~XFS_ILOG_DBROOT;
- }
break;
case XFS_DINODE_FMT_LOCAL:
- iip->ili_format.ilf_fields &=
- ~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT |
- XFS_ILOG_DEV | XFS_ILOG_UUID);
- if ((iip->ili_format.ilf_fields & XFS_ILOG_DDATA) &&
- (ip->i_df.if_bytes > 0)) {
- ASSERT(ip->i_df.if_u1.if_data != NULL);
- ASSERT(ip->i_d.di_size > 0);
+ if ((iip->ili_fields & XFS_ILOG_DDATA) &&
+ ip->i_df.if_bytes > 0)
nvecs++;
- } else {
- iip->ili_format.ilf_fields &= ~XFS_ILOG_DDATA;
- }
break;
case XFS_DINODE_FMT_DEV:
- iip->ili_format.ilf_fields &=
- ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
- XFS_ILOG_DEXT | XFS_ILOG_UUID);
- break;
-
case XFS_DINODE_FMT_UUID:
- iip->ili_format.ilf_fields &=
- ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
- XFS_ILOG_DEXT | XFS_ILOG_DEV);
break;
default:
@@ -137,56 +86,31 @@ xfs_inode_item_size(
break;
}
- /*
- * If there are no attributes associated with this file,
- * then there cannot be anything more to log.
- * Clear all attribute-related log flags.
- */
- if (!XFS_IFORK_Q(ip)) {
- iip->ili_format.ilf_fields &=
- ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT);
+ if (!XFS_IFORK_Q(ip))
return nvecs;
- }
+
/*
* Log any necessary attribute data.
*/
switch (ip->i_d.di_aformat) {
case XFS_DINODE_FMT_EXTENTS:
- iip->ili_format.ilf_fields &=
- ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT);
- if ((iip->ili_format.ilf_fields & XFS_ILOG_AEXT) &&
- (ip->i_d.di_anextents > 0) &&
- (ip->i_afp->if_bytes > 0)) {
- ASSERT(ip->i_afp->if_u1.if_extents != NULL);
+ if ((iip->ili_fields & XFS_ILOG_AEXT) &&
+ ip->i_d.di_anextents > 0 &&
+ ip->i_afp->if_bytes > 0)
nvecs++;
- } else {
- iip->ili_format.ilf_fields &= ~XFS_ILOG_AEXT;
- }
break;
case XFS_DINODE_FMT_BTREE:
- iip->ili_format.ilf_fields &=
- ~(XFS_ILOG_ADATA | XFS_ILOG_AEXT);
- if ((iip->ili_format.ilf_fields & XFS_ILOG_ABROOT) &&
- (ip->i_afp->if_broot_bytes > 0)) {
- ASSERT(ip->i_afp->if_broot != NULL);
+ if ((iip->ili_fields & XFS_ILOG_ABROOT) &&
+ ip->i_afp->if_broot_bytes > 0)
nvecs++;
- } else {
- iip->ili_format.ilf_fields &= ~XFS_ILOG_ABROOT;
- }
break;
case XFS_DINODE_FMT_LOCAL:
- iip->ili_format.ilf_fields &=
- ~(XFS_ILOG_AEXT | XFS_ILOG_ABROOT);
- if ((iip->ili_format.ilf_fields & XFS_ILOG_ADATA) &&
- (ip->i_afp->if_bytes > 0)) {
- ASSERT(ip->i_afp->if_u1.if_data != NULL);
+ if ((iip->ili_fields & XFS_ILOG_ADATA) &&
+ ip->i_afp->if_bytes > 0)
nvecs++;
- } else {
- iip->ili_format.ilf_fields &= ~XFS_ILOG_ADATA;
- }
break;
default:
@@ -256,48 +180,11 @@ xfs_inode_item_format(
vecp++;
nvecs = 1;
- /*
- * Clear i_update_core if the timestamps (or any other
- * non-transactional modification) need flushing/logging
- * and we're about to log them with the rest of the core.
- *
- * This is the same logic as xfs_iflush() but this code can't
- * run at the same time as xfs_iflush because we're in commit
- * processing here and so we have the inode lock held in
- * exclusive mode. Although it doesn't really matter
- * for the timestamps if both routines were to grab the
- * timestamps or not. That would be ok.
- *
- * We clear i_update_core before copying out the data.
- * This is for coordination with our timestamp updates
- * that don't hold the inode lock. They will always
- * update the timestamps BEFORE setting i_update_core,
- * so if we clear i_update_core after they set it we
- * are guaranteed to see their updates to the timestamps
- * either here. Likewise, if they set it after we clear it
- * here, we'll see it either on the next commit of this
- * inode or the next time the inode gets flushed via
- * xfs_iflush(). This depends on strongly ordered memory
- * semantics, but we have that. We use the SYNCHRONIZE
- * macro to make sure that the compiler does not reorder
- * the i_update_core access below the data copy below.
- */
- if (ip->i_update_core) {
- ip->i_update_core = 0;
- SYNCHRONIZE();
- }
-
- /*
- * Make sure to get the latest timestamps from the Linux inode.
- */
- xfs_synchronize_times(ip);
-
vecp->i_addr = &ip->i_d;
vecp->i_len = sizeof(struct xfs_icdinode);
vecp->i_type = XLOG_REG_TYPE_ICORE;
vecp++;
nvecs++;
- iip->ili_format.ilf_fields |= XFS_ILOG_CORE;
/*
* If this is really an old format inode, then we need to
@@ -330,16 +217,17 @@ xfs_inode_item_format(
switch (ip->i_d.di_format) {
case XFS_DINODE_FMT_EXTENTS:
- ASSERT(!(iip->ili_format.ilf_fields &
- (XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
- XFS_ILOG_DEV | XFS_ILOG_UUID)));
- if (iip->ili_format.ilf_fields & XFS_ILOG_DEXT) {
- ASSERT(ip->i_df.if_bytes > 0);
+ iip->ili_fields &=
+ ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
+ XFS_ILOG_DEV | XFS_ILOG_UUID);
+
+ if ((iip->ili_fields & XFS_ILOG_DEXT) &&
+ ip->i_d.di_nextents > 0 &&
+ ip->i_df.if_bytes > 0) {
ASSERT(ip->i_df.if_u1.if_extents != NULL);
- ASSERT(ip->i_d.di_nextents > 0);
+ ASSERT(ip->i_df.if_bytes / sizeof(xfs_bmbt_rec_t) > 0);
ASSERT(iip->ili_extents_buf == NULL);
- ASSERT((ip->i_df.if_bytes /
- (uint)sizeof(xfs_bmbt_rec_t)) > 0);
+
#ifdef XFS_NATIVE_HOST
if (ip->i_d.di_nextents == ip->i_df.if_bytes /
(uint)sizeof(xfs_bmbt_rec_t)) {
@@ -361,15 +249,18 @@ xfs_inode_item_format(
iip->ili_format.ilf_dsize = vecp->i_len;
vecp++;
nvecs++;
+ } else {
+ iip->ili_fields &= ~XFS_ILOG_DEXT;
}
break;
case XFS_DINODE_FMT_BTREE:
- ASSERT(!(iip->ili_format.ilf_fields &
- (XFS_ILOG_DDATA | XFS_ILOG_DEXT |
- XFS_ILOG_DEV | XFS_ILOG_UUID)));
- if (iip->ili_format.ilf_fields & XFS_ILOG_DBROOT) {
- ASSERT(ip->i_df.if_broot_bytes > 0);
+ iip->ili_fields &=
+ ~(XFS_ILOG_DDATA | XFS_ILOG_DEXT |
+ XFS_ILOG_DEV | XFS_ILOG_UUID);
+
+ if ((iip->ili_fields & XFS_ILOG_DBROOT) &&
+ ip->i_df.if_broot_bytes > 0) {
ASSERT(ip->i_df.if_broot != NULL);
vecp->i_addr = ip->i_df.if_broot;
vecp->i_len = ip->i_df.if_broot_bytes;
@@ -377,15 +268,30 @@ xfs_inode_item_format(
vecp++;
nvecs++;
iip->ili_format.ilf_dsize = ip->i_df.if_broot_bytes;
+ } else {
+ ASSERT(!(iip->ili_fields &
+ XFS_ILOG_DBROOT));
+#ifdef XFS_TRANS_DEBUG
+ if (iip->ili_root_size > 0) {
+ ASSERT(iip->ili_root_size ==
+ ip->i_df.if_broot_bytes);
+ ASSERT(memcmp(iip->ili_orig_root,
+ ip->i_df.if_broot,
+ iip->ili_root_size) == 0);
+ } else {
+ ASSERT(ip->i_df.if_broot_bytes == 0);
+ }
+#endif
+ iip->ili_fields &= ~XFS_ILOG_DBROOT;
}
break;
case XFS_DINODE_FMT_LOCAL:
- ASSERT(!(iip->ili_format.ilf_fields &
- (XFS_ILOG_DBROOT | XFS_ILOG_DEXT |
- XFS_ILOG_DEV | XFS_ILOG_UUID)));
- if (iip->ili_format.ilf_fields & XFS_ILOG_DDATA) {
- ASSERT(ip->i_df.if_bytes > 0);
+ iip->ili_fields &=
+ ~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT |
+ XFS_ILOG_DEV | XFS_ILOG_UUID);
+ if ((iip->ili_fields & XFS_ILOG_DDATA) &&
+ ip->i_df.if_bytes > 0) {
ASSERT(ip->i_df.if_u1.if_data != NULL);
ASSERT(ip->i_d.di_size > 0);
@@ -403,24 +309,26 @@ xfs_inode_item_format(
vecp++;
nvecs++;
iip->ili_format.ilf_dsize = (unsigned)data_bytes;
+ } else {
+ iip->ili_fields &= ~XFS_ILOG_DDATA;
}
break;
case XFS_DINODE_FMT_DEV:
- ASSERT(!(iip->ili_format.ilf_fields &
- (XFS_ILOG_DBROOT | XFS_ILOG_DEXT |
- XFS_ILOG_DDATA | XFS_ILOG_UUID)));
- if (iip->ili_format.ilf_fields & XFS_ILOG_DEV) {
+ iip->ili_fields &=
+ ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
+ XFS_ILOG_DEXT | XFS_ILOG_UUID);
+ if (iip->ili_fields & XFS_ILOG_DEV) {
iip->ili_format.ilf_u.ilfu_rdev =
ip->i_df.if_u2.if_rdev;
}
break;
case XFS_DINODE_FMT_UUID:
- ASSERT(!(iip->ili_format.ilf_fields &
- (XFS_ILOG_DBROOT | XFS_ILOG_DEXT |
- XFS_ILOG_DDATA | XFS_ILOG_DEV)));
- if (iip->ili_format.ilf_fields & XFS_ILOG_UUID) {
+ iip->ili_fields &=
+ ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
+ XFS_ILOG_DEXT | XFS_ILOG_DEV);
+ if (iip->ili_fields & XFS_ILOG_UUID) {
iip->ili_format.ilf_u.ilfu_uuid =
ip->i_df.if_u2.if_uuid;
}
@@ -432,31 +340,25 @@ xfs_inode_item_format(
}
/*
- * If there are no attributes associated with the file,
- * then we're done.
- * Assert that no attribute-related log flags are set.
+ * If there are no attributes associated with the file, then we're done.
*/
if (!XFS_IFORK_Q(ip)) {
- iip->ili_format.ilf_size = nvecs;
- ASSERT(!(iip->ili_format.ilf_fields &
- (XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT)));
- return;
+ iip->ili_fields &=
+ ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT);
+ goto out;
}
switch (ip->i_d.di_aformat) {
case XFS_DINODE_FMT_EXTENTS:
- ASSERT(!(iip->ili_format.ilf_fields &
- (XFS_ILOG_ADATA | XFS_ILOG_ABROOT)));
- if (iip->ili_format.ilf_fields & XFS_ILOG_AEXT) {
-#ifdef DEBUG
- int nrecs = ip->i_afp->if_bytes /
- (uint)sizeof(xfs_bmbt_rec_t);
- ASSERT(nrecs > 0);
- ASSERT(nrecs == ip->i_d.di_anextents);
- ASSERT(ip->i_afp->if_bytes > 0);
+ iip->ili_fields &=
+ ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT);
+
+ if ((iip->ili_fields & XFS_ILOG_AEXT) &&
+ ip->i_d.di_anextents > 0 &&
+ ip->i_afp->if_bytes > 0) {
+ ASSERT(ip->i_afp->if_bytes / sizeof(xfs_bmbt_rec_t) ==
+ ip->i_d.di_anextents);
ASSERT(ip->i_afp->if_u1.if_extents != NULL);
- ASSERT(ip->i_d.di_anextents > 0);
-#endif
#ifdef XFS_NATIVE_HOST
/*
* There are not delayed allocation extents
@@ -473,29 +375,36 @@ xfs_inode_item_format(
iip->ili_format.ilf_asize = vecp->i_len;
vecp++;
nvecs++;
+ } else {
+ iip->ili_fields &= ~XFS_ILOG_AEXT;
}
break;
case XFS_DINODE_FMT_BTREE:
- ASSERT(!(iip->ili_format.ilf_fields &
- (XFS_ILOG_ADATA | XFS_ILOG_AEXT)));
- if (iip->ili_format.ilf_fields & XFS_ILOG_ABROOT) {
- ASSERT(ip->i_afp->if_broot_bytes > 0);
+ iip->ili_fields &=
+ ~(XFS_ILOG_ADATA | XFS_ILOG_AEXT);
+
+ if ((iip->ili_fields & XFS_ILOG_ABROOT) &&
+ ip->i_afp->if_broot_bytes > 0) {
ASSERT(ip->i_afp->if_broot != NULL);
+
vecp->i_addr = ip->i_afp->if_broot;
vecp->i_len = ip->i_afp->if_broot_bytes;
vecp->i_type = XLOG_REG_TYPE_IATTR_BROOT;
vecp++;
nvecs++;
iip->ili_format.ilf_asize = ip->i_afp->if_broot_bytes;
+ } else {
+ iip->ili_fields &= ~XFS_ILOG_ABROOT;
}
break;
case XFS_DINODE_FMT_LOCAL:
- ASSERT(!(iip->ili_format.ilf_fields &
- (XFS_ILOG_ABROOT | XFS_ILOG_AEXT)));
- if (iip->ili_format.ilf_fields & XFS_ILOG_ADATA) {
- ASSERT(ip->i_afp->if_bytes > 0);
+ iip->ili_fields &=
+ ~(XFS_ILOG_AEXT | XFS_ILOG_ABROOT);
+
+ if ((iip->ili_fields & XFS_ILOG_ADATA) &&
+ ip->i_afp->if_bytes > 0) {
ASSERT(ip->i_afp->if_u1.if_data != NULL);
vecp->i_addr = ip->i_afp->if_u1.if_data;
@@ -512,6 +421,8 @@ xfs_inode_item_format(
vecp++;
nvecs++;
iip->ili_format.ilf_asize = (unsigned)data_bytes;
+ } else {
+ iip->ili_fields &= ~XFS_ILOG_ADATA;
}
break;
@@ -520,6 +431,15 @@ xfs_inode_item_format(
break;
}
+out:
+ /*
+ * Now update the log format that goes out to disk from the in-core
+ * values. We always write the inode core to make the arithmetic
+ * games in recovery easier, which isn't a big deal as just about any
+ * transaction would dirty it anyway.
+ */
+ iip->ili_format.ilf_fields = XFS_ILOG_CORE |
+ (iip->ili_fields & ~XFS_ILOG_TIMESTAMP);
iip->ili_format.ilf_size = nvecs;
}
@@ -557,7 +477,7 @@ xfs_inode_item_unpin(
trace_xfs_inode_unpin(ip, _RET_IP_);
ASSERT(atomic_read(&ip->i_pincount) > 0);
if (atomic_dec_and_test(&ip->i_pincount))
- wake_up(&ip->i_ipin_wait);
+ wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT);
}
/*
@@ -598,17 +518,13 @@ xfs_inode_item_trylock(
/* Stale items should force out the iclog */
if (ip->i_flags & XFS_ISTALE) {
xfs_ifunlock(ip);
- /*
- * we hold the AIL lock - notify the unlock routine of this
- * so it doesn't try to get the lock again.
- */
- xfs_iunlock(ip, XFS_ILOCK_SHARED|XFS_IUNLOCK_NONOTIFY);
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
return XFS_ITEM_PINNED;
}
#ifdef DEBUG
if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
- ASSERT(iip->ili_format.ilf_fields != 0);
+ ASSERT(iip->ili_fields != 0);
ASSERT(iip->ili_logged == 0);
ASSERT(lip->li_flags & XFS_LI_IN_AIL);
}
@@ -640,7 +556,7 @@ xfs_inode_item_unlock(
if (iip->ili_extents_buf != NULL) {
ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS);
ASSERT(ip->i_d.di_nextents > 0);
- ASSERT(iip->ili_format.ilf_fields & XFS_ILOG_DEXT);
+ ASSERT(iip->ili_fields & XFS_ILOG_DEXT);
ASSERT(ip->i_df.if_bytes > 0);
kmem_free(iip->ili_extents_buf);
iip->ili_extents_buf = NULL;
@@ -648,7 +564,7 @@ xfs_inode_item_unlock(
if (iip->ili_aextents_buf != NULL) {
ASSERT(ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS);
ASSERT(ip->i_d.di_anextents > 0);
- ASSERT(iip->ili_format.ilf_fields & XFS_ILOG_AEXT);
+ ASSERT(iip->ili_fields & XFS_ILOG_AEXT);
ASSERT(ip->i_afp->if_bytes > 0);
kmem_free(iip->ili_aextents_buf);
iip->ili_aextents_buf = NULL;
@@ -719,7 +635,7 @@ xfs_inode_item_pushbuf(
* If a flush is not in progress anymore, chances are that the
* inode was taken off the AIL. So, just get out.
*/
- if (completion_done(&ip->i_flush) ||
+ if (!xfs_isiflocked(ip) ||
!(lip->li_flags & XFS_LI_IN_AIL)) {
xfs_iunlock(ip, XFS_ILOCK_SHARED);
return true;
@@ -752,7 +668,7 @@ xfs_inode_item_push(
struct xfs_inode *ip = iip->ili_inode;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED));
- ASSERT(!completion_done(&ip->i_flush));
+ ASSERT(xfs_isiflocked(ip));
/*
* Since we were able to lock the inode's flush lock and
@@ -763,8 +679,7 @@ xfs_inode_item_push(
* lock without sleeping, then there must not have been
* anyone in the process of flushing the inode.
*/
- ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) ||
- iip->ili_format.ilf_fields != 0);
+ ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || iip->ili_fields != 0);
/*
* Push the inode to it's backing buffer. This will not remove the
@@ -987,7 +902,7 @@ xfs_iflush_abort(
* Clear the inode logging fields so no more flushes are
* attempted.
*/
- iip->ili_format.ilf_fields = 0;
+ iip->ili_fields = 0;
}
/*
* Release the inode's flush lock since we're done with it.
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index d3dee61e6d9..41d61c3b7a3 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -86,6 +86,15 @@ typedef struct xfs_inode_log_format_64 {
#define XFS_ILOG_AEXT 0x080 /* log i_af.if_extents */
#define XFS_ILOG_ABROOT 0x100 /* log i_af.i_broot */
+
+/*
+ * The timestamps are dirty, but not necessarily anything else in the inode
+ * core. Unlike the other fields above this one must never make it to disk
+ * in the ilf_fields of the inode_log_format, but is purely store in-memory in
+ * ili_fields in the inode_log_item.
+ */
+#define XFS_ILOG_TIMESTAMP 0x4000
+
#define XFS_ILOG_NONCORE (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
XFS_ILOG_DBROOT | XFS_ILOG_DEV | \
XFS_ILOG_UUID | XFS_ILOG_ADATA | \
@@ -101,7 +110,7 @@ typedef struct xfs_inode_log_format_64 {
XFS_ILOG_DEXT | XFS_ILOG_DBROOT | \
XFS_ILOG_DEV | XFS_ILOG_UUID | \
XFS_ILOG_ADATA | XFS_ILOG_AEXT | \
- XFS_ILOG_ABROOT)
+ XFS_ILOG_ABROOT | XFS_ILOG_TIMESTAMP)
static inline int xfs_ilog_fbroot(int w)
{
@@ -134,6 +143,7 @@ typedef struct xfs_inode_log_item {
unsigned short ili_lock_flags; /* lock flags */
unsigned short ili_logged; /* flushed logged data */
unsigned int ili_last_fields; /* fields when flushed */
+ unsigned int ili_fields; /* fields to be logged */
struct xfs_bmbt_rec *ili_extents_buf; /* array of logged
data exts */
struct xfs_bmbt_rec *ili_aextents_buf; /* array of logged
@@ -148,9 +158,7 @@ typedef struct xfs_inode_log_item {
static inline int xfs_inode_clean(xfs_inode_t *ip)
{
- return (!ip->i_itemp ||
- !(ip->i_itemp->ili_format.ilf_fields & XFS_ILOG_ALL)) &&
- !ip->i_update_core;
+ return !ip->i_itemp || !(ip->i_itemp->ili_fields & XFS_ILOG_ALL);
}
extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *);
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 76f3ca5cfc3..91f8ff547ab 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -209,6 +209,7 @@ xfs_open_by_handle(
struct file *filp;
struct inode *inode;
struct dentry *dentry;
+ fmode_t fmode;
if (!capable(CAP_SYS_ADMIN))
return -XFS_ERROR(EPERM);
@@ -228,26 +229,21 @@ xfs_open_by_handle(
hreq->oflags |= O_LARGEFILE;
#endif
- /* Put open permission in namei format. */
permflag = hreq->oflags;
- if ((permflag+1) & O_ACCMODE)
- permflag++;
- if (permflag & O_TRUNC)
- permflag |= 2;
-
+ fmode = OPEN_FMODE(permflag);
if ((!(permflag & O_APPEND) || (permflag & O_TRUNC)) &&
- (permflag & FMODE_WRITE) && IS_APPEND(inode)) {
+ (fmode & FMODE_WRITE) && IS_APPEND(inode)) {
error = -XFS_ERROR(EPERM);
goto out_dput;
}
- if ((permflag & FMODE_WRITE) && IS_IMMUTABLE(inode)) {
+ if ((fmode & FMODE_WRITE) && IS_IMMUTABLE(inode)) {
error = -XFS_ERROR(EACCES);
goto out_dput;
}
/* Can't write directories. */
- if (S_ISDIR(inode->i_mode) && (permflag & FMODE_WRITE)) {
+ if (S_ISDIR(inode->i_mode) && (fmode & FMODE_WRITE)) {
error = -XFS_ERROR(EISDIR);
goto out_dput;
}
@@ -450,9 +446,12 @@ xfs_attrmulti_attr_get(
if (*len > XATTR_SIZE_MAX)
return EINVAL;
- kbuf = kmalloc(*len, GFP_KERNEL);
- if (!kbuf)
- return ENOMEM;
+ kbuf = kmem_zalloc(*len, KM_SLEEP | KM_MAYFAIL);
+ if (!kbuf) {
+ kbuf = kmem_zalloc_large(*len);
+ if (!kbuf)
+ return ENOMEM;
+ }
error = xfs_attr_get(XFS_I(inode), name, kbuf, (int *)len, flags);
if (error)
@@ -462,7 +461,10 @@ xfs_attrmulti_attr_get(
error = EFAULT;
out_kfree:
- kfree(kbuf);
+ if (is_vmalloc_addr(kbuf))
+ kmem_free_large(kbuf);
+ else
+ kmem_free(kbuf);
return error;
}
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index f9ccb7b7c04..a849a5473af 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -293,7 +293,7 @@ xfs_compat_ioc_bulkstat(
int res;
error = xfs_bulkstat_one_compat(mp, inlast, bulkreq.ubuffer,
- sizeof(compat_xfs_bstat_t), 0, &res);
+ sizeof(compat_xfs_bstat_t), NULL, &res);
} else if (cmd == XFS_IOC_FSBULKSTAT_32) {
error = xfs_bulkstat(mp, &inlast, &count,
xfs_bulkstat_one_compat, sizeof(compat_xfs_bstat_t),
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 9afa282aa93..71a464503c4 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -31,6 +31,7 @@
#include "xfs_ialloc_btree.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
+#include "xfs_inode_item.h"
#include "xfs_btree.h"
#include "xfs_bmap.h"
#include "xfs_rtalloc.h"
@@ -57,26 +58,26 @@ xfs_iomap_eof_align_last_fsb(
xfs_fileoff_t *last_fsb)
{
xfs_fileoff_t new_last_fsb = 0;
- xfs_extlen_t align;
+ xfs_extlen_t align = 0;
int eof, error;
- if (XFS_IS_REALTIME_INODE(ip))
- ;
- /*
- * If mounted with the "-o swalloc" option, roundup the allocation
- * request to a stripe width boundary if the file size is >=
- * stripe width and we are allocating past the allocation eof.
- */
- else if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC) &&
- (ip->i_size >= XFS_FSB_TO_B(mp, mp->m_swidth)))
- new_last_fsb = roundup_64(*last_fsb, mp->m_swidth);
- /*
- * Roundup the allocation request to a stripe unit (m_dalign) boundary
- * if the file size is >= stripe unit size, and we are allocating past
- * the allocation eof.
- */
- else if (mp->m_dalign && (ip->i_size >= XFS_FSB_TO_B(mp, mp->m_dalign)))
- new_last_fsb = roundup_64(*last_fsb, mp->m_dalign);
+ if (!XFS_IS_REALTIME_INODE(ip)) {
+ /*
+ * Round up the allocation request to a stripe unit
+ * (m_dalign) boundary if the file size is >= stripe unit
+ * size, and we are allocating past the allocation eof.
+ *
+ * If mounted with the "-o swalloc" option the alignment is
+ * increased from the strip unit size to the stripe width.
+ */
+ if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC))
+ align = mp->m_swidth;
+ else if (mp->m_dalign)
+ align = mp->m_dalign;
+
+ if (align && XFS_ISIZE(ip) >= XFS_FSB_TO_B(mp, align))
+ new_last_fsb = roundup_64(*last_fsb, align);
+ }
/*
* Always round up the allocation request to an extent boundary
@@ -154,7 +155,7 @@ xfs_iomap_write_direct(
offset_fsb = XFS_B_TO_FSBT(mp, offset);
last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
- if ((offset + count) > ip->i_size) {
+ if ((offset + count) > XFS_ISIZE(ip)) {
error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb);
if (error)
goto error_out;
@@ -211,7 +212,7 @@ xfs_iomap_write_direct(
xfs_trans_ijoin(tp, ip, 0);
bmapi_flag = 0;
- if (offset < ip->i_size || extsz)
+ if (offset < XFS_ISIZE(ip) || extsz)
bmapi_flag |= XFS_BMAPI_PREALLOC;
/*
@@ -286,7 +287,7 @@ xfs_iomap_eof_want_preallocate(
int found_delalloc = 0;
*prealloc = 0;
- if ((offset + count) <= ip->i_size)
+ if (offset + count <= XFS_ISIZE(ip))
return 0;
/*
@@ -340,7 +341,7 @@ xfs_iomap_prealloc_size(
* if we pass in alloc_blocks = 0. Hence the "+ 1" to
* ensure we always pass in a non-zero value.
*/
- alloc_blocks = XFS_B_TO_FSB(mp, ip->i_size) + 1;
+ alloc_blocks = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)) + 1;
alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN,
rounddown_pow_of_two(alloc_blocks));
@@ -564,7 +565,7 @@ xfs_iomap_write_allocate(
* back....
*/
nimaps = 1;
- end_fsb = XFS_B_TO_FSB(mp, ip->i_size);
+ end_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip));
error = xfs_bmap_last_offset(NULL, ip, &last_block,
XFS_DATA_FORK);
if (error)
@@ -645,6 +646,7 @@ xfs_iomap_write_unwritten(
xfs_trans_t *tp;
xfs_bmbt_irec_t imap;
xfs_bmap_free_t free_list;
+ xfs_fsize_t i_size;
uint resblks;
int committed;
int error;
@@ -705,7 +707,22 @@ xfs_iomap_write_unwritten(
if (error)
goto error_on_bmapi_transaction;
- error = xfs_bmap_finish(&(tp), &(free_list), &committed);
+ /*
+ * Log the updated inode size as we go. We have to be careful
+ * to only log it up to the actual write offset if it is
+ * halfway into a block.
+ */
+ i_size = XFS_FSB_TO_B(mp, offset_fsb + count_fsb);
+ if (i_size > offset + count)
+ i_size = offset + count;
+
+ i_size = xfs_new_eof(ip, i_size);
+ if (i_size) {
+ ip->i_d.di_size = i_size;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ }
+
+ error = xfs_bmap_finish(&tp, &free_list, &committed);
if (error)
goto error_on_bmapi_transaction;
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index f9babd17922..3011b879f85 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -50,65 +50,15 @@
#include <linux/fiemap.h>
#include <linux/slab.h>
-/*
- * Bring the timestamps in the XFS inode uptodate.
- *
- * Used before writing the inode to disk.
- */
-void
-xfs_synchronize_times(
- xfs_inode_t *ip)
-{
- struct inode *inode = VFS_I(ip);
-
- ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec;
- ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec;
- ip->i_d.di_ctime.t_sec = (__int32_t)inode->i_ctime.tv_sec;
- ip->i_d.di_ctime.t_nsec = (__int32_t)inode->i_ctime.tv_nsec;
- ip->i_d.di_mtime.t_sec = (__int32_t)inode->i_mtime.tv_sec;
- ip->i_d.di_mtime.t_nsec = (__int32_t)inode->i_mtime.tv_nsec;
-}
-
-/*
- * If the linux inode is valid, mark it dirty, else mark the dirty state
- * in the XFS inode to make sure we pick it up when reclaiming the inode.
- */
-void
-xfs_mark_inode_dirty_sync(
- xfs_inode_t *ip)
-{
- struct inode *inode = VFS_I(ip);
-
- if (!(inode->i_state & (I_WILL_FREE|I_FREEING)))
- mark_inode_dirty_sync(inode);
- else {
- barrier();
- ip->i_update_core = 1;
- }
-}
-
-void
-xfs_mark_inode_dirty(
- xfs_inode_t *ip)
-{
- struct inode *inode = VFS_I(ip);
-
- if (!(inode->i_state & (I_WILL_FREE|I_FREEING)))
- mark_inode_dirty(inode);
- else {
- barrier();
- ip->i_update_core = 1;
- }
-
-}
-
-
-int xfs_initxattrs(struct inode *inode, const struct xattr *xattr_array,
- void *fs_info)
+static int
+xfs_initxattrs(
+ struct inode *inode,
+ const struct xattr *xattr_array,
+ void *fs_info)
{
- const struct xattr *xattr;
- struct xfs_inode *ip = XFS_I(inode);
- int error = 0;
+ const struct xattr *xattr;
+ struct xfs_inode *ip = XFS_I(inode);
+ int error = 0;
for (xattr = xattr_array; xattr->name != NULL; xattr++) {
error = xfs_attr_set(ip, xattr->name, xattr->value,
@@ -678,19 +628,16 @@ xfs_setattr_nonsize(
inode->i_atime = iattr->ia_atime;
ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec;
ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec;
- ip->i_update_core = 1;
}
if (mask & ATTR_CTIME) {
inode->i_ctime = iattr->ia_ctime;
ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec;
ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec;
- ip->i_update_core = 1;
}
if (mask & ATTR_MTIME) {
inode->i_mtime = iattr->ia_mtime;
ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec;
ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec;
- ip->i_update_core = 1;
}
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
@@ -750,6 +697,7 @@ xfs_setattr_size(
struct xfs_mount *mp = ip->i_mount;
struct inode *inode = VFS_I(ip);
int mask = iattr->ia_valid;
+ xfs_off_t oldsize, newsize;
struct xfs_trans *tp;
int error;
uint lock_flags;
@@ -777,11 +725,13 @@ xfs_setattr_size(
lock_flags |= XFS_IOLOCK_EXCL;
xfs_ilock(ip, lock_flags);
+ oldsize = inode->i_size;
+ newsize = iattr->ia_size;
+
/*
* Short circuit the truncate case for zero length files.
*/
- if (iattr->ia_size == 0 &&
- ip->i_size == 0 && ip->i_d.di_nextents == 0) {
+ if (newsize == 0 && oldsize == 0 && ip->i_d.di_nextents == 0) {
if (!(mask & (ATTR_CTIME|ATTR_MTIME)))
goto out_unlock;
@@ -807,14 +757,14 @@ xfs_setattr_size(
* the inode to the transaction, because the inode cannot be unlocked
* once it is a part of the transaction.
*/
- if (iattr->ia_size > ip->i_size) {
+ if (newsize > oldsize) {
/*
* Do the first part of growing a file: zero any data in the
* last block that is beyond the old EOF. We need to do this
* before the inode is joined to the transaction to modify
* i_size.
*/
- error = xfs_zero_eof(ip, iattr->ia_size, ip->i_size);
+ error = xfs_zero_eof(ip, newsize, oldsize);
if (error)
goto out_unlock;
}
@@ -833,8 +783,8 @@ xfs_setattr_size(
* here and prevents waiting for other data not within the range we
* care about here.
*/
- if (ip->i_size != ip->i_d.di_size && iattr->ia_size > ip->i_d.di_size) {
- error = xfs_flush_pages(ip, ip->i_d.di_size, iattr->ia_size, 0,
+ if (oldsize != ip->i_d.di_size && newsize > ip->i_d.di_size) {
+ error = xfs_flush_pages(ip, ip->i_d.di_size, newsize, 0,
FI_NONE);
if (error)
goto out_unlock;
@@ -845,8 +795,7 @@ xfs_setattr_size(
*/
inode_dio_wait(inode);
- error = -block_truncate_page(inode->i_mapping, iattr->ia_size,
- xfs_get_blocks);
+ error = -block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks);
if (error)
goto out_unlock;
@@ -857,7 +806,7 @@ xfs_setattr_size(
if (error)
goto out_trans_cancel;
- truncate_setsize(inode, iattr->ia_size);
+ truncate_setsize(inode, newsize);
commit_flags = XFS_TRANS_RELEASE_LOG_RES;
lock_flags |= XFS_ILOCK_EXCL;
@@ -876,19 +825,29 @@ xfs_setattr_size(
* these flags set. For all other operations the VFS set these flags
* explicitly if it wants a timestamp update.
*/
- if (iattr->ia_size != ip->i_size &&
- (!(mask & (ATTR_CTIME | ATTR_MTIME)))) {
+ if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME)))) {
iattr->ia_ctime = iattr->ia_mtime =
current_fs_time(inode->i_sb);
mask |= ATTR_CTIME | ATTR_MTIME;
}
- if (iattr->ia_size > ip->i_size) {
- ip->i_d.di_size = iattr->ia_size;
- ip->i_size = iattr->ia_size;
- } else if (iattr->ia_size <= ip->i_size ||
- (iattr->ia_size == 0 && ip->i_d.di_nextents)) {
- error = xfs_itruncate_data(&tp, ip, iattr->ia_size);
+ /*
+ * The first thing we do is set the size to new_size permanently on
+ * disk. This way we don't have to worry about anyone ever being able
+ * to look at the data being freed even in the face of a crash.
+ * What we're getting around here is the case where we free a block, it
+ * is allocated to another file, it is written to, and then we crash.
+ * If the new data gets written to the file but the log buffers
+ * containing the free and reallocation don't, then we'd end up with
+ * garbage in the blocks being freed. As long as we make the new size
+ * permanent before actually freeing any blocks it doesn't matter if
+ * they get written to.
+ */
+ ip->i_d.di_size = newsize;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ if (newsize <= oldsize) {
+ error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, newsize);
if (error)
goto out_trans_abort;
@@ -906,13 +865,11 @@ xfs_setattr_size(
inode->i_ctime = iattr->ia_ctime;
ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec;
ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec;
- ip->i_update_core = 1;
}
if (mask & ATTR_MTIME) {
inode->i_mtime = iattr->ia_mtime;
ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec;
ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec;
- ip->i_update_core = 1;
}
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 751e94fe1f7..acc2bf264da 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -62,7 +62,6 @@ xfs_bulkstat_one_int(
{
struct xfs_icdinode *dic; /* dinode core info pointer */
struct xfs_inode *ip; /* incore inode pointer */
- struct inode *inode;
struct xfs_bstat *buf; /* return buffer */
int error = 0; /* error value */
@@ -76,7 +75,8 @@ xfs_bulkstat_one_int(
return XFS_ERROR(ENOMEM);
error = xfs_iget(mp, NULL, ino,
- XFS_IGET_UNTRUSTED, XFS_ILOCK_SHARED, &ip);
+ (XFS_IGET_DONTCACHE | XFS_IGET_UNTRUSTED),
+ XFS_ILOCK_SHARED, &ip);
if (error) {
*stat = BULKSTAT_RV_NOTHING;
goto out_free;
@@ -86,7 +86,6 @@ xfs_bulkstat_one_int(
ASSERT(ip->i_imap.im_blkno != 0);
dic = &ip->i_d;
- inode = VFS_I(ip);
/* xfs_iget returns the following without needing
* further change.
@@ -99,19 +98,12 @@ xfs_bulkstat_one_int(
buf->bs_uid = dic->di_uid;
buf->bs_gid = dic->di_gid;
buf->bs_size = dic->di_size;
-
- /*
- * We need to read the timestamps from the Linux inode because
- * the VFS keeps writing directly into the inode structure instead
- * of telling us about the updates.
- */
- buf->bs_atime.tv_sec = inode->i_atime.tv_sec;
- buf->bs_atime.tv_nsec = inode->i_atime.tv_nsec;
- buf->bs_mtime.tv_sec = inode->i_mtime.tv_sec;
- buf->bs_mtime.tv_nsec = inode->i_mtime.tv_nsec;
- buf->bs_ctime.tv_sec = inode->i_ctime.tv_sec;
- buf->bs_ctime.tv_nsec = inode->i_ctime.tv_nsec;
-
+ buf->bs_atime.tv_sec = dic->di_atime.t_sec;
+ buf->bs_atime.tv_nsec = dic->di_atime.t_nsec;
+ buf->bs_mtime.tv_sec = dic->di_mtime.t_sec;
+ buf->bs_mtime.tv_nsec = dic->di_mtime.t_nsec;
+ buf->bs_ctime.tv_sec = dic->di_ctime.t_sec;
+ buf->bs_ctime.tv_nsec = dic->di_ctime.t_nsec;
buf->bs_xflags = xfs_ip2xflags(ip);
buf->bs_extsize = dic->di_extsize << mp->m_sb.sb_blocklog;
buf->bs_extents = dic->di_nextents;
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index e2cc3568c29..6db1fef38bf 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -67,15 +67,10 @@ STATIC void xlog_state_switch_iclogs(xlog_t *log,
int eventual_size);
STATIC void xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog);
-/* local functions to manipulate grant head */
-STATIC int xlog_grant_log_space(xlog_t *log,
- xlog_ticket_t *xtic);
STATIC void xlog_grant_push_ail(struct log *log,
int need_bytes);
STATIC void xlog_regrant_reserve_log_space(xlog_t *log,
xlog_ticket_t *ticket);
-STATIC int xlog_regrant_write_log_space(xlog_t *log,
- xlog_ticket_t *ticket);
STATIC void xlog_ungrant_log_space(xlog_t *log,
xlog_ticket_t *ticket);
@@ -150,78 +145,93 @@ xlog_grant_add_space(
} while (head_val != old);
}
-STATIC bool
-xlog_reserveq_wake(
- struct log *log,
- int *free_bytes)
+STATIC void
+xlog_grant_head_init(
+ struct xlog_grant_head *head)
+{
+ xlog_assign_grant_head(&head->grant, 1, 0);
+ INIT_LIST_HEAD(&head->waiters);
+ spin_lock_init(&head->lock);
+}
+
+STATIC void
+xlog_grant_head_wake_all(
+ struct xlog_grant_head *head)
{
struct xlog_ticket *tic;
- int need_bytes;
- list_for_each_entry(tic, &log->l_reserveq, t_queue) {
+ spin_lock(&head->lock);
+ list_for_each_entry(tic, &head->waiters, t_queue)
+ wake_up_process(tic->t_task);
+ spin_unlock(&head->lock);
+}
+
+static inline int
+xlog_ticket_reservation(
+ struct log *log,
+ struct xlog_grant_head *head,
+ struct xlog_ticket *tic)
+{
+ if (head == &log->l_write_head) {
+ ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV);
+ return tic->t_unit_res;
+ } else {
if (tic->t_flags & XLOG_TIC_PERM_RESERV)
- need_bytes = tic->t_unit_res * tic->t_cnt;
+ return tic->t_unit_res * tic->t_cnt;
else
- need_bytes = tic->t_unit_res;
-
- if (*free_bytes < need_bytes)
- return false;
- *free_bytes -= need_bytes;
-
- trace_xfs_log_grant_wake_up(log, tic);
- wake_up(&tic->t_wait);
+ return tic->t_unit_res;
}
-
- return true;
}
STATIC bool
-xlog_writeq_wake(
+xlog_grant_head_wake(
struct log *log,
+ struct xlog_grant_head *head,
int *free_bytes)
{
struct xlog_ticket *tic;
int need_bytes;
- list_for_each_entry(tic, &log->l_writeq, t_queue) {
- ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV);
-
- need_bytes = tic->t_unit_res;
-
+ list_for_each_entry(tic, &head->waiters, t_queue) {
+ need_bytes = xlog_ticket_reservation(log, head, tic);
if (*free_bytes < need_bytes)
return false;
- *free_bytes -= need_bytes;
- trace_xfs_log_regrant_write_wake_up(log, tic);
- wake_up(&tic->t_wait);
+ *free_bytes -= need_bytes;
+ trace_xfs_log_grant_wake_up(log, tic);
+ wake_up_process(tic->t_task);
}
return true;
}
STATIC int
-xlog_reserveq_wait(
+xlog_grant_head_wait(
struct log *log,
+ struct xlog_grant_head *head,
struct xlog_ticket *tic,
int need_bytes)
{
- list_add_tail(&tic->t_queue, &log->l_reserveq);
+ list_add_tail(&tic->t_queue, &head->waiters);
do {
if (XLOG_FORCED_SHUTDOWN(log))
goto shutdown;
xlog_grant_push_ail(log, need_bytes);
+ __set_current_state(TASK_UNINTERRUPTIBLE);
+ spin_unlock(&head->lock);
+
XFS_STATS_INC(xs_sleep_logspace);
- trace_xfs_log_grant_sleep(log, tic);
- xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock);
+ trace_xfs_log_grant_sleep(log, tic);
+ schedule();
trace_xfs_log_grant_wake(log, tic);
- spin_lock(&log->l_grant_reserve_lock);
+ spin_lock(&head->lock);
if (XLOG_FORCED_SHUTDOWN(log))
goto shutdown;
- } while (xlog_space_left(log, &log->l_grant_reserve_head) < need_bytes);
+ } while (xlog_space_left(log, &head->grant) < need_bytes);
list_del_init(&tic->t_queue);
return 0;
@@ -230,35 +240,58 @@ shutdown:
return XFS_ERROR(EIO);
}
+/*
+ * Atomically get the log space required for a log ticket.
+ *
+ * Once a ticket gets put onto head->waiters, it will only return after the
+ * needed reservation is satisfied.
+ *
+ * This function is structured so that it has a lock free fast path. This is
+ * necessary because every new transaction reservation will come through this
+ * path. Hence any lock will be globally hot if we take it unconditionally on
+ * every pass.
+ *
+ * As tickets are only ever moved on and off head->waiters under head->lock, we
+ * only need to take that lock if we are going to add the ticket to the queue
+ * and sleep. We can avoid taking the lock if the ticket was never added to
+ * head->waiters because the t_queue list head will be empty and we hold the
+ * only reference to it so it can safely be checked unlocked.
+ */
STATIC int
-xlog_writeq_wait(
+xlog_grant_head_check(
struct log *log,
+ struct xlog_grant_head *head,
struct xlog_ticket *tic,
- int need_bytes)
+ int *need_bytes)
{
- list_add_tail(&tic->t_queue, &log->l_writeq);
-
- do {
- if (XLOG_FORCED_SHUTDOWN(log))
- goto shutdown;
- xlog_grant_push_ail(log, need_bytes);
-
- XFS_STATS_INC(xs_sleep_logspace);
- trace_xfs_log_regrant_write_sleep(log, tic);
+ int free_bytes;
+ int error = 0;
- xlog_wait(&tic->t_wait, &log->l_grant_write_lock);
- trace_xfs_log_regrant_write_wake(log, tic);
+ ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY));
- spin_lock(&log->l_grant_write_lock);
- if (XLOG_FORCED_SHUTDOWN(log))
- goto shutdown;
- } while (xlog_space_left(log, &log->l_grant_write_head) < need_bytes);
+ /*
+ * If there are other waiters on the queue then give them a chance at
+ * logspace before us. Wake up the first waiters, if we do not wake
+ * up all the waiters then go to sleep waiting for more free space,
+ * otherwise try to get some space for this transaction.
+ */
+ *need_bytes = xlog_ticket_reservation(log, head, tic);
+ free_bytes = xlog_space_left(log, &head->grant);
+ if (!list_empty_careful(&head->waiters)) {
+ spin_lock(&head->lock);
+ if (!xlog_grant_head_wake(log, head, &free_bytes) ||
+ free_bytes < *need_bytes) {
+ error = xlog_grant_head_wait(log, head, tic,
+ *need_bytes);
+ }
+ spin_unlock(&head->lock);
+ } else if (free_bytes < *need_bytes) {
+ spin_lock(&head->lock);
+ error = xlog_grant_head_wait(log, head, tic, *need_bytes);
+ spin_unlock(&head->lock);
+ }
- list_del_init(&tic->t_queue);
- return 0;
-shutdown:
- list_del_init(&tic->t_queue);
- return XFS_ERROR(EIO);
+ return error;
}
static void
@@ -286,6 +319,128 @@ xlog_tic_add_region(xlog_ticket_t *tic, uint len, uint type)
}
/*
+ * Replenish the byte reservation required by moving the grant write head.
+ */
+int
+xfs_log_regrant(
+ struct xfs_mount *mp,
+ struct xlog_ticket *tic)
+{
+ struct log *log = mp->m_log;
+ int need_bytes;
+ int error = 0;
+
+ if (XLOG_FORCED_SHUTDOWN(log))
+ return XFS_ERROR(EIO);
+
+ XFS_STATS_INC(xs_try_logspace);
+
+ /*
+ * This is a new transaction on the ticket, so we need to change the
+ * transaction ID so that the next transaction has a different TID in
+ * the log. Just add one to the existing tid so that we can see chains
+ * of rolling transactions in the log easily.
+ */
+ tic->t_tid++;
+
+ xlog_grant_push_ail(log, tic->t_unit_res);
+
+ tic->t_curr_res = tic->t_unit_res;
+ xlog_tic_reset_res(tic);
+
+ if (tic->t_cnt > 0)
+ return 0;
+
+ trace_xfs_log_regrant(log, tic);
+
+ error = xlog_grant_head_check(log, &log->l_write_head, tic,
+ &need_bytes);
+ if (error)
+ goto out_error;
+
+ xlog_grant_add_space(log, &log->l_write_head.grant, need_bytes);
+ trace_xfs_log_regrant_exit(log, tic);
+ xlog_verify_grant_tail(log);
+ return 0;
+
+out_error:
+ /*
+ * If we are failing, make sure the ticket doesn't have any current
+ * reservations. We don't want to add this back when the ticket/
+ * transaction gets cancelled.
+ */
+ tic->t_curr_res = 0;
+ tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */
+ return error;
+}
+
+/*
+ * Reserve log space and return a ticket corresponding the reservation.
+ *
+ * Each reservation is going to reserve extra space for a log record header.
+ * When writes happen to the on-disk log, we don't subtract the length of the
+ * log record header from any reservation. By wasting space in each
+ * reservation, we prevent over allocation problems.
+ */
+int
+xfs_log_reserve(
+ struct xfs_mount *mp,
+ int unit_bytes,
+ int cnt,
+ struct xlog_ticket **ticp,
+ __uint8_t client,
+ bool permanent,
+ uint t_type)
+{
+ struct log *log = mp->m_log;
+ struct xlog_ticket *tic;
+ int need_bytes;
+ int error = 0;
+
+ ASSERT(client == XFS_TRANSACTION || client == XFS_LOG);
+
+ if (XLOG_FORCED_SHUTDOWN(log))
+ return XFS_ERROR(EIO);
+
+ XFS_STATS_INC(xs_try_logspace);
+
+ ASSERT(*ticp == NULL);
+ tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent,
+ KM_SLEEP | KM_MAYFAIL);
+ if (!tic)
+ return XFS_ERROR(ENOMEM);
+
+ tic->t_trans_type = t_type;
+ *ticp = tic;
+
+ xlog_grant_push_ail(log, tic->t_unit_res * tic->t_cnt);
+
+ trace_xfs_log_reserve(log, tic);
+
+ error = xlog_grant_head_check(log, &log->l_reserve_head, tic,
+ &need_bytes);
+ if (error)
+ goto out_error;
+
+ xlog_grant_add_space(log, &log->l_reserve_head.grant, need_bytes);
+ xlog_grant_add_space(log, &log->l_write_head.grant, need_bytes);
+ trace_xfs_log_reserve_exit(log, tic);
+ xlog_verify_grant_tail(log);
+ return 0;
+
+out_error:
+ /*
+ * If we are failing, make sure the ticket doesn't have any current
+ * reservations. We don't want to add this back when the ticket/
+ * transaction gets cancelled.
+ */
+ tic->t_curr_res = 0;
+ tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */
+ return error;
+}
+
+
+/*
* NOTES:
*
* 1. currblock field gets updated at startup and after in-core logs
@@ -395,88 +550,6 @@ xfs_log_release_iclog(
}
/*
- * 1. Reserve an amount of on-disk log space and return a ticket corresponding
- * to the reservation.
- * 2. Potentially, push buffers at tail of log to disk.
- *
- * Each reservation is going to reserve extra space for a log record header.
- * When writes happen to the on-disk log, we don't subtract the length of the
- * log record header from any reservation. By wasting space in each
- * reservation, we prevent over allocation problems.
- */
-int
-xfs_log_reserve(
- struct xfs_mount *mp,
- int unit_bytes,
- int cnt,
- struct xlog_ticket **ticket,
- __uint8_t client,
- uint flags,
- uint t_type)
-{
- struct log *log = mp->m_log;
- struct xlog_ticket *internal_ticket;
- int retval = 0;
-
- ASSERT(client == XFS_TRANSACTION || client == XFS_LOG);
-
- if (XLOG_FORCED_SHUTDOWN(log))
- return XFS_ERROR(EIO);
-
- XFS_STATS_INC(xs_try_logspace);
-
-
- if (*ticket != NULL) {
- ASSERT(flags & XFS_LOG_PERM_RESERV);
- internal_ticket = *ticket;
-
- /*
- * this is a new transaction on the ticket, so we need to
- * change the transaction ID so that the next transaction has a
- * different TID in the log. Just add one to the existing tid
- * so that we can see chains of rolling transactions in the log
- * easily.
- */
- internal_ticket->t_tid++;
-
- trace_xfs_log_reserve(log, internal_ticket);
-
- xlog_grant_push_ail(log, internal_ticket->t_unit_res);
- retval = xlog_regrant_write_log_space(log, internal_ticket);
- } else {
- /* may sleep if need to allocate more tickets */
- internal_ticket = xlog_ticket_alloc(log, unit_bytes, cnt,
- client, flags,
- KM_SLEEP|KM_MAYFAIL);
- if (!internal_ticket)
- return XFS_ERROR(ENOMEM);
- internal_ticket->t_trans_type = t_type;
- *ticket = internal_ticket;
-
- trace_xfs_log_reserve(log, internal_ticket);
-
- xlog_grant_push_ail(log,
- (internal_ticket->t_unit_res *
- internal_ticket->t_cnt));
- retval = xlog_grant_log_space(log, internal_ticket);
- }
-
- if (unlikely(retval)) {
- /*
- * If we are failing, make sure the ticket doesn't have any
- * current reservations. We don't want to add this back
- * when the ticket/ transaction gets cancelled.
- */
- internal_ticket->t_curr_res = 0;
- /* ungrant will give back unit_res * t_cnt. */
- internal_ticket->t_cnt = 0;
- }
-
- return retval;
-}
-
-
-/*
* Mount a log filesystem
*
* mp - ubiquitous xfs mount point structure
@@ -653,8 +726,9 @@ xfs_log_unmount_write(xfs_mount_t *mp)
.lv_iovecp = &reg,
};
- /* remove inited flag */
+ /* remove inited flag, and account for space used */
tic->t_flags = 0;
+ tic->t_curr_res -= sizeof(magic);
error = xlog_write(log, &vec, tic, &lsn,
NULL, XLOG_UNMOUNT_TRANS);
/*
@@ -760,64 +834,35 @@ xfs_log_item_init(
INIT_LIST_HEAD(&item->li_cil);
}
+/*
+ * Wake up processes waiting for log space after we have moved the log tail.
+ */
void
-xfs_log_move_tail(xfs_mount_t *mp,
- xfs_lsn_t tail_lsn)
+xfs_log_space_wake(
+ struct xfs_mount *mp)
{
- xlog_ticket_t *tic;
- xlog_t *log = mp->m_log;
- int need_bytes, free_bytes;
+ struct log *log = mp->m_log;
+ int free_bytes;
if (XLOG_FORCED_SHUTDOWN(log))
return;
- if (tail_lsn == 0)
- tail_lsn = atomic64_read(&log->l_last_sync_lsn);
-
- /* tail_lsn == 1 implies that we weren't passed a valid value. */
- if (tail_lsn != 1)
- atomic64_set(&log->l_tail_lsn, tail_lsn);
-
- if (!list_empty_careful(&log->l_writeq)) {
-#ifdef DEBUG
- if (log->l_flags & XLOG_ACTIVE_RECOVERY)
- panic("Recovery problem");
-#endif
- spin_lock(&log->l_grant_write_lock);
- free_bytes = xlog_space_left(log, &log->l_grant_write_head);
- list_for_each_entry(tic, &log->l_writeq, t_queue) {
- ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV);
+ if (!list_empty_careful(&log->l_write_head.waiters)) {
+ ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY));
- if (free_bytes < tic->t_unit_res && tail_lsn != 1)
- break;
- tail_lsn = 0;
- free_bytes -= tic->t_unit_res;
- trace_xfs_log_regrant_write_wake_up(log, tic);
- wake_up(&tic->t_wait);
- }
- spin_unlock(&log->l_grant_write_lock);
+ spin_lock(&log->l_write_head.lock);
+ free_bytes = xlog_space_left(log, &log->l_write_head.grant);
+ xlog_grant_head_wake(log, &log->l_write_head, &free_bytes);
+ spin_unlock(&log->l_write_head.lock);
}
- if (!list_empty_careful(&log->l_reserveq)) {
-#ifdef DEBUG
- if (log->l_flags & XLOG_ACTIVE_RECOVERY)
- panic("Recovery problem");
-#endif
- spin_lock(&log->l_grant_reserve_lock);
- free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
- list_for_each_entry(tic, &log->l_reserveq, t_queue) {
- if (tic->t_flags & XLOG_TIC_PERM_RESERV)
- need_bytes = tic->t_unit_res*tic->t_cnt;
- else
- need_bytes = tic->t_unit_res;
- if (free_bytes < need_bytes && tail_lsn != 1)
- break;
- tail_lsn = 0;
- free_bytes -= need_bytes;
- trace_xfs_log_grant_wake_up(log, tic);
- wake_up(&tic->t_wait);
- }
- spin_unlock(&log->l_grant_reserve_lock);
+ if (!list_empty_careful(&log->l_reserve_head.waiters)) {
+ ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY));
+
+ spin_lock(&log->l_reserve_head.lock);
+ free_bytes = xlog_space_left(log, &log->l_reserve_head.grant);
+ xlog_grant_head_wake(log, &log->l_reserve_head, &free_bytes);
+ spin_unlock(&log->l_reserve_head.lock);
}
}
@@ -867,21 +912,7 @@ xfs_log_need_covered(xfs_mount_t *mp)
return needed;
}
-/******************************************************************************
- *
- * local routines
- *
- ******************************************************************************
- */
-
-/* xfs_trans_tail_ail returns 0 when there is nothing in the list.
- * The log manager must keep track of the last LR which was committed
- * to disk. The lsn of this LR will become the new tail_lsn whenever
- * xfs_trans_tail_ail returns 0. If we don't do this, we run into
- * the situation where stuff could be written into the log but nothing
- * was ever in the AIL when asked. Eventually, we panic since the
- * tail hits the head.
- *
+/*
* We may be holding the log iclog lock upon entering this routine.
*/
xfs_lsn_t
@@ -891,10 +922,17 @@ xlog_assign_tail_lsn(
xfs_lsn_t tail_lsn;
struct log *log = mp->m_log;
+ /*
+ * To make sure we always have a valid LSN for the log tail we keep
+ * track of the last LSN which was committed in log->l_last_sync_lsn,
+ * and use that when the AIL was empty and xfs_ail_min_lsn returns 0.
+ *
+ * If the AIL has been emptied we also need to wake any process
+ * waiting for this condition.
+ */
tail_lsn = xfs_ail_min_lsn(mp->m_ail);
if (!tail_lsn)
tail_lsn = atomic64_read(&log->l_last_sync_lsn);
-
atomic64_set(&log->l_tail_lsn, tail_lsn);
return tail_lsn;
}
@@ -1100,12 +1138,9 @@ xlog_alloc_log(xfs_mount_t *mp,
xlog_assign_atomic_lsn(&log->l_tail_lsn, 1, 0);
xlog_assign_atomic_lsn(&log->l_last_sync_lsn, 1, 0);
log->l_curr_cycle = 1; /* 0 is bad since this is initial value */
- xlog_assign_grant_head(&log->l_grant_reserve_head, 1, 0);
- xlog_assign_grant_head(&log->l_grant_write_head, 1, 0);
- INIT_LIST_HEAD(&log->l_reserveq);
- INIT_LIST_HEAD(&log->l_writeq);
- spin_lock_init(&log->l_grant_reserve_lock);
- spin_lock_init(&log->l_grant_write_lock);
+
+ xlog_grant_head_init(&log->l_reserve_head);
+ xlog_grant_head_init(&log->l_write_head);
error = EFSCORRUPTED;
if (xfs_sb_version_hassector(&mp->m_sb)) {
@@ -1280,7 +1315,7 @@ xlog_grant_push_ail(
ASSERT(BTOBB(need_bytes) < log->l_logBBsize);
- free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
+ free_bytes = xlog_space_left(log, &log->l_reserve_head.grant);
free_blocks = BTOBBT(free_bytes);
/*
@@ -1412,8 +1447,8 @@ xlog_sync(xlog_t *log,
roundoff < BBTOB(1)));
/* move grant heads by roundoff in sync */
- xlog_grant_add_space(log, &log->l_grant_reserve_head, roundoff);
- xlog_grant_add_space(log, &log->l_grant_write_head, roundoff);
+ xlog_grant_add_space(log, &log->l_reserve_head.grant, roundoff);
+ xlog_grant_add_space(log, &log->l_write_head.grant, roundoff);
/* put cycle number in every block */
xlog_pack_data(log, iclog, roundoff);
@@ -2566,119 +2601,6 @@ restart:
return 0;
} /* xlog_state_get_iclog_space */
-/*
- * Atomically get the log space required for a log ticket.
- *
- * Once a ticket gets put onto the reserveq, it will only return after the
- * needed reservation is satisfied.
- *
- * This function is structured so that it has a lock free fast path. This is
- * necessary because every new transaction reservation will come through this
- * path. Hence any lock will be globally hot if we take it unconditionally on
- * every pass.
- *
- * As tickets are only ever moved on and off the reserveq under the
- * l_grant_reserve_lock, we only need to take that lock if we are going to add
- * the ticket to the queue and sleep. We can avoid taking the lock if the ticket
- * was never added to the reserveq because the t_queue list head will be empty
- * and we hold the only reference to it so it can safely be checked unlocked.
- */
-STATIC int
-xlog_grant_log_space(
- struct log *log,
- struct xlog_ticket *tic)
-{
- int free_bytes, need_bytes;
- int error = 0;
-
- ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY));
-
- trace_xfs_log_grant_enter(log, tic);
-
- /*
- * If there are other waiters on the queue then give them a chance at
- * logspace before us. Wake up the first waiters, if we do not wake
- * up all the waiters then go to sleep waiting for more free space,
- * otherwise try to get some space for this transaction.
- */
- need_bytes = tic->t_unit_res;
- if (tic->t_flags & XFS_LOG_PERM_RESERV)
- need_bytes *= tic->t_ocnt;
- free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
- if (!list_empty_careful(&log->l_reserveq)) {
- spin_lock(&log->l_grant_reserve_lock);
- if (!xlog_reserveq_wake(log, &free_bytes) ||
- free_bytes < need_bytes)
- error = xlog_reserveq_wait(log, tic, need_bytes);
- spin_unlock(&log->l_grant_reserve_lock);
- } else if (free_bytes < need_bytes) {
- spin_lock(&log->l_grant_reserve_lock);
- error = xlog_reserveq_wait(log, tic, need_bytes);
- spin_unlock(&log->l_grant_reserve_lock);
- }
- if (error)
- return error;
-
- xlog_grant_add_space(log, &log->l_grant_reserve_head, need_bytes);
- xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes);
- trace_xfs_log_grant_exit(log, tic);
- xlog_verify_grant_tail(log);
- return 0;
-}
-
-/*
- * Replenish the byte reservation required by moving the grant write head.
- *
- * Similar to xlog_grant_log_space, the function is structured to have a lock
- * free fast path.
- */
-STATIC int
-xlog_regrant_write_log_space(
- struct log *log,
- struct xlog_ticket *tic)
-{
- int free_bytes, need_bytes;
- int error = 0;
-
- tic->t_curr_res = tic->t_unit_res;
- xlog_tic_reset_res(tic);
-
- if (tic->t_cnt > 0)
- return 0;
-
- ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY));
-
- trace_xfs_log_regrant_write_enter(log, tic);
-
- /*
- * If there are other waiters on the queue then give them a chance at
- * logspace before us. Wake up the first waiters, if we do not wake
- * up all the waiters then go to sleep waiting for more free space,
- * otherwise try to get some space for this transaction.
- */
- need_bytes = tic->t_unit_res;
- free_bytes = xlog_space_left(log, &log->l_grant_write_head);
- if (!list_empty_careful(&log->l_writeq)) {
- spin_lock(&log->l_grant_write_lock);
- if (!xlog_writeq_wake(log, &free_bytes) ||
- free_bytes < need_bytes)
- error = xlog_writeq_wait(log, tic, need_bytes);
- spin_unlock(&log->l_grant_write_lock);
- } else if (free_bytes < need_bytes) {
- spin_lock(&log->l_grant_write_lock);
- error = xlog_writeq_wait(log, tic, need_bytes);
- spin_unlock(&log->l_grant_write_lock);
- }
-
- if (error)
- return error;
-
- xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes);
- trace_xfs_log_regrant_write_exit(log, tic);
- xlog_verify_grant_tail(log);
- return 0;
-}
-
/* The first cnt-1 times through here we don't need to
* move the grant write head because the permanent
* reservation has reserved cnt times the unit amount.
@@ -2695,9 +2617,9 @@ xlog_regrant_reserve_log_space(xlog_t *log,
if (ticket->t_cnt > 0)
ticket->t_cnt--;
- xlog_grant_sub_space(log, &log->l_grant_reserve_head,
+ xlog_grant_sub_space(log, &log->l_reserve_head.grant,
ticket->t_curr_res);
- xlog_grant_sub_space(log, &log->l_grant_write_head,
+ xlog_grant_sub_space(log, &log->l_write_head.grant,
ticket->t_curr_res);
ticket->t_curr_res = ticket->t_unit_res;
xlog_tic_reset_res(ticket);
@@ -2708,7 +2630,7 @@ xlog_regrant_reserve_log_space(xlog_t *log,
if (ticket->t_cnt > 0)
return;
- xlog_grant_add_space(log, &log->l_grant_reserve_head,
+ xlog_grant_add_space(log, &log->l_reserve_head.grant,
ticket->t_unit_res);
trace_xfs_log_regrant_reserve_exit(log, ticket);
@@ -2754,14 +2676,13 @@ xlog_ungrant_log_space(xlog_t *log,
bytes += ticket->t_unit_res*ticket->t_cnt;
}
- xlog_grant_sub_space(log, &log->l_grant_reserve_head, bytes);
- xlog_grant_sub_space(log, &log->l_grant_write_head, bytes);
+ xlog_grant_sub_space(log, &log->l_reserve_head.grant, bytes);
+ xlog_grant_sub_space(log, &log->l_write_head.grant, bytes);
trace_xfs_log_ungrant_exit(log, ticket);
- xfs_log_move_tail(log->l_mp, 1);
-} /* xlog_ungrant_log_space */
-
+ xfs_log_space_wake(log->l_mp);
+}
/*
* Flush iclog to disk if this is the last reference to the given iclog and
@@ -3219,7 +3140,7 @@ xlog_ticket_alloc(
int unit_bytes,
int cnt,
char client,
- uint xflags,
+ bool permanent,
int alloc_flags)
{
struct xlog_ticket *tic;
@@ -3313,6 +3234,7 @@ xlog_ticket_alloc(
}
atomic_set(&tic->t_ref, 1);
+ tic->t_task = current;
INIT_LIST_HEAD(&tic->t_queue);
tic->t_unit_res = unit_bytes;
tic->t_curr_res = unit_bytes;
@@ -3322,9 +3244,8 @@ xlog_ticket_alloc(
tic->t_clientid = client;
tic->t_flags = XLOG_TIC_INITED;
tic->t_trans_type = 0;
- if (xflags & XFS_LOG_PERM_RESERV)
+ if (permanent)
tic->t_flags |= XLOG_TIC_PERM_RESERV;
- init_waitqueue_head(&tic->t_wait);
xlog_tic_reset_res(tic);
@@ -3380,7 +3301,7 @@ xlog_verify_grant_tail(
int tail_cycle, tail_blocks;
int cycle, space;
- xlog_crack_grant_head(&log->l_grant_write_head, &cycle, &space);
+ xlog_crack_grant_head(&log->l_write_head.grant, &cycle, &space);
xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_blocks);
if (tail_cycle != cycle) {
if (cycle - 1 != tail_cycle &&
@@ -3582,7 +3503,6 @@ xfs_log_force_umount(
struct xfs_mount *mp,
int logerror)
{
- xlog_ticket_t *tic;
xlog_t *log;
int retval;
@@ -3650,15 +3570,8 @@ xfs_log_force_umount(
* we don't enqueue anything once the SHUTDOWN flag is set, and this
* action is protected by the grant locks.
*/
- spin_lock(&log->l_grant_reserve_lock);
- list_for_each_entry(tic, &log->l_reserveq, t_queue)
- wake_up(&tic->t_wait);
- spin_unlock(&log->l_grant_reserve_lock);
-
- spin_lock(&log->l_grant_write_lock);
- list_for_each_entry(tic, &log->l_writeq, t_queue)
- wake_up(&tic->t_wait);
- spin_unlock(&log->l_grant_write_lock);
+ xlog_grant_head_wake_all(&log->l_reserve_head);
+ xlog_grant_head_wake_all(&log->l_write_head);
if (!(log->l_iclog->ic_state & XLOG_STATE_IOERROR)) {
ASSERT(!logerror);
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 2aee3b22d29..2c622bedb30 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -53,15 +53,6 @@ static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
#define XFS_LOG_REL_PERM_RESERV 0x1
/*
- * Flags to xfs_log_reserve()
- *
- * XFS_LOG_PERM_RESERV: Permanent reservation. When writes are
- * performed against this type of reservation, the reservation
- * is not decreased. Long running transactions should use this.
- */
-#define XFS_LOG_PERM_RESERV 0x2
-
-/*
* Flags to xfs_log_force()
*
* XFS_LOG_SYNC: Synchronous force in-core log to disk
@@ -160,8 +151,8 @@ int xfs_log_mount(struct xfs_mount *mp,
xfs_daddr_t start_block,
int num_bblocks);
int xfs_log_mount_finish(struct xfs_mount *mp);
-void xfs_log_move_tail(struct xfs_mount *mp,
- xfs_lsn_t tail_lsn);
+xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp);
+void xfs_log_space_wake(struct xfs_mount *mp);
int xfs_log_notify(struct xfs_mount *mp,
struct xlog_in_core *iclog,
xfs_log_callback_t *callback_entry);
@@ -172,8 +163,9 @@ int xfs_log_reserve(struct xfs_mount *mp,
int count,
struct xlog_ticket **ticket,
__uint8_t clientid,
- uint flags,
+ bool permanent,
uint t_type);
+int xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic);
int xfs_log_unmount_write(struct xfs_mount *mp);
void xfs_log_unmount(struct xfs_mount *mp);
int xfs_log_force_umount(struct xfs_mount *mp, int logerror);
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 2d3b6a498d6..2152900b79d 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -239,8 +239,8 @@ typedef struct xlog_res {
} xlog_res_t;
typedef struct xlog_ticket {
- wait_queue_head_t t_wait; /* ticket wait queue */
struct list_head t_queue; /* reserve/write queue */
+ struct task_struct *t_task; /* task that owns this ticket */
xlog_tid_t t_tid; /* transaction identifier : 4 */
atomic_t t_ref; /* ticket reference count : 4 */
int t_curr_res; /* current reservation in bytes : 4 */
@@ -470,6 +470,16 @@ struct xfs_cil {
#define XLOG_CIL_HARD_SPACE_LIMIT(log) (3 * (log->l_logsize >> 4))
/*
+ * ticket grant locks, queues and accounting have their own cachlines
+ * as these are quite hot and can be operated on concurrently.
+ */
+struct xlog_grant_head {
+ spinlock_t lock ____cacheline_aligned_in_smp;
+ struct list_head waiters;
+ atomic64_t grant;
+};
+
+/*
* The reservation head lsn is not made up of a cycle number and block number.
* Instead, it uses a cycle number and byte number. Logs don't expect to
* overflow 31 bits worth of byte offset, so using a byte number will mean
@@ -520,17 +530,8 @@ typedef struct log {
/* lsn of 1st LR with unflushed * buffers */
atomic64_t l_tail_lsn ____cacheline_aligned_in_smp;
- /*
- * ticket grant locks, queues and accounting have their own cachlines
- * as these are quite hot and can be operated on concurrently.
- */
- spinlock_t l_grant_reserve_lock ____cacheline_aligned_in_smp;
- struct list_head l_reserveq;
- atomic64_t l_grant_reserve_head;
-
- spinlock_t l_grant_write_lock ____cacheline_aligned_in_smp;
- struct list_head l_writeq;
- atomic64_t l_grant_write_head;
+ struct xlog_grant_head l_reserve_head;
+ struct xlog_grant_head l_write_head;
/* The following field are used for debugging; need to hold icloglock */
#ifdef DEBUG
@@ -545,14 +546,13 @@ typedef struct log {
#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR)
/* common routines */
-extern xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp);
extern int xlog_recover(xlog_t *log);
extern int xlog_recover_finish(xlog_t *log);
extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int);
extern kmem_zone_t *xfs_log_ticket_zone;
struct xlog_ticket *xlog_ticket_alloc(struct log *log, int unit_bytes,
- int count, char client, uint xflags,
+ int count, char client, bool permanent,
int alloc_flags);
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 541a508adea..8ecad5bad66 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -965,9 +965,9 @@ xlog_find_tail(
log->l_curr_cycle++;
atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn));
atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn));
- xlog_assign_grant_head(&log->l_grant_reserve_head, log->l_curr_cycle,
+ xlog_assign_grant_head(&log->l_reserve_head.grant, log->l_curr_cycle,
BBTOB(log->l_curr_block));
- xlog_assign_grant_head(&log->l_grant_write_head, log->l_curr_cycle,
+ xlog_assign_grant_head(&log->l_write_head.grant, log->l_curr_cycle,
BBTOB(log->l_curr_block));
/*
@@ -1489,7 +1489,7 @@ xlog_recover_add_to_cont_trans(
old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
old_len = item->ri_buf[item->ri_cnt-1].i_len;
- ptr = kmem_realloc(old_ptr, len+old_len, old_len, 0u);
+ ptr = kmem_realloc(old_ptr, len+old_len, old_len, KM_SLEEP);
memcpy(&ptr[old_len], dp, len); /* d, s, l */
item->ri_buf[item->ri_cnt-1].i_len += len;
item->ri_buf[item->ri_cnt-1].i_addr = ptr;
@@ -1981,7 +1981,7 @@ xfs_qm_dqcheck(
if (!errs && ddq->d_id) {
if (ddq->d_blk_softlimit &&
- be64_to_cpu(ddq->d_bcount) >=
+ be64_to_cpu(ddq->d_bcount) >
be64_to_cpu(ddq->d_blk_softlimit)) {
if (!ddq->d_btimer) {
if (flags & XFS_QMOPT_DOWARN)
@@ -1992,7 +1992,7 @@ xfs_qm_dqcheck(
}
}
if (ddq->d_ino_softlimit &&
- be64_to_cpu(ddq->d_icount) >=
+ be64_to_cpu(ddq->d_icount) >
be64_to_cpu(ddq->d_ino_softlimit)) {
if (!ddq->d_itimer) {
if (flags & XFS_QMOPT_DOWARN)
@@ -2003,7 +2003,7 @@ xfs_qm_dqcheck(
}
}
if (ddq->d_rtb_softlimit &&
- be64_to_cpu(ddq->d_rtbcount) >=
+ be64_to_cpu(ddq->d_rtbcount) >
be64_to_cpu(ddq->d_rtb_softlimit)) {
if (!ddq->d_rtbtimer) {
if (flags & XFS_QMOPT_DOWARN)
@@ -3161,37 +3161,26 @@ xlog_recover_process_iunlinks(
*/
continue;
}
+ /*
+ * Unlock the buffer so that it can be acquired in the normal
+ * course of the transaction to truncate and free each inode.
+ * Because we are not racing with anyone else here for the AGI
+ * buffer, we don't even need to hold it locked to read the
+ * initial unlinked bucket entries out of the buffer. We keep
+ * buffer reference though, so that it stays pinned in memory
+ * while we need the buffer.
+ */
agi = XFS_BUF_TO_AGI(agibp);
+ xfs_buf_unlock(agibp);
for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) {
agino = be32_to_cpu(agi->agi_unlinked[bucket]);
while (agino != NULLAGINO) {
- /*
- * Release the agi buffer so that it can
- * be acquired in the normal course of the
- * transaction to truncate and free the inode.
- */
- xfs_buf_relse(agibp);
-
agino = xlog_recover_process_one_iunlink(mp,
agno, agino, bucket);
-
- /*
- * Reacquire the agibuffer and continue around
- * the loop. This should never fail as we know
- * the buffer was good earlier on.
- */
- error = xfs_read_agi(mp, NULL, agno, &agibp);
- ASSERT(error == 0);
- agi = XFS_BUF_TO_AGI(agibp);
}
}
-
- /*
- * Release the buffer for the current agi so we can
- * go on to the next one.
- */
- xfs_buf_relse(agibp);
+ xfs_buf_rele(agibp);
}
mp->m_dmevmask = mp_dmevmask;
@@ -3695,7 +3684,7 @@ xlog_do_recover(
/* Convert superblock from on-disk format */
sbp = &log->l_mp->m_sb;
- xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp));
+ xfs_sb_from_disk(log->l_mp, XFS_BUF_TO_SBP(bp));
ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC);
ASSERT(xfs_sb_good_version(sbp));
xfs_buf_relse(bp);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index d06afbc3540..1ffead4b229 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -158,7 +158,7 @@ xfs_uuid_mount(
out_duplicate:
mutex_unlock(&xfs_uuid_table_mutex);
- xfs_warn(mp, "Filesystem has duplicate UUID - can't mount");
+ xfs_warn(mp, "Filesystem has duplicate UUID %pU - can't mount", uuid);
return XFS_ERROR(EINVAL);
}
@@ -553,9 +553,11 @@ out_unwind:
void
xfs_sb_from_disk(
- xfs_sb_t *to,
+ struct xfs_mount *mp,
xfs_dsb_t *from)
{
+ struct xfs_sb *to = &mp->m_sb;
+
to->sb_magicnum = be32_to_cpu(from->sb_magicnum);
to->sb_blocksize = be32_to_cpu(from->sb_blocksize);
to->sb_dblocks = be64_to_cpu(from->sb_dblocks);
@@ -693,7 +695,7 @@ reread:
* Initialize the mount structure from the superblock.
* But first do some basic consistency checking.
*/
- xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp));
+ xfs_sb_from_disk(mp, XFS_BUF_TO_SBP(bp));
error = xfs_mount_validate_sb(mp, &(mp->m_sb), flags);
if (error) {
if (loud)
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 19f69e23250..9eba7388782 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -211,6 +211,9 @@ typedef struct xfs_mount {
struct shrinker m_inode_shrink; /* inode reclaim shrinker */
int64_t m_low_space[XFS_LOWSP_MAX];
/* low free space thresholds */
+
+ struct workqueue_struct *m_data_workqueue;
+ struct workqueue_struct *m_unwritten_workqueue;
} xfs_mount_t;
/*
@@ -395,7 +398,7 @@ extern void xfs_set_low_space_thresholds(struct xfs_mount *);
extern void xfs_mod_sb(struct xfs_trans *, __int64_t);
extern int xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t,
xfs_agnumber_t *);
-extern void xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *);
+extern void xfs_sb_from_disk(struct xfs_mount *, struct xfs_dsb *);
extern void xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t);
#endif /* __XFS_MOUNT_H__ */
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 671f37eae1c..55c6afedc87 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -48,197 +48,189 @@
* quota functionality, including maintaining the freelist and hash
* tables of dquots.
*/
-struct mutex xfs_Gqm_lock;
-struct xfs_qm *xfs_Gqm;
-uint ndquot;
-
-kmem_zone_t *qm_dqzone;
-kmem_zone_t *qm_dqtrxzone;
-
-STATIC void xfs_qm_list_init(xfs_dqlist_t *, char *, int);
-STATIC void xfs_qm_list_destroy(xfs_dqlist_t *);
-
STATIC int xfs_qm_init_quotainos(xfs_mount_t *);
STATIC int xfs_qm_init_quotainfo(xfs_mount_t *);
STATIC int xfs_qm_shake(struct shrinker *, struct shrink_control *);
-static struct shrinker xfs_qm_shaker = {
- .shrink = xfs_qm_shake,
- .seeks = DEFAULT_SEEKS,
-};
-
/*
- * Initialize the XQM structure.
- * Note that there is not one quota manager per file system.
+ * We use the batch lookup interface to iterate over the dquots as it
+ * currently is the only interface into the radix tree code that allows
+ * fuzzy lookups instead of exact matches. Holding the lock over multiple
+ * operations is fine as all callers are used either during mount/umount
+ * or quotaoff.
*/
-STATIC struct xfs_qm *
-xfs_Gqm_init(void)
-{
- xfs_dqhash_t *udqhash, *gdqhash;
- xfs_qm_t *xqm;
- size_t hsize;
- uint i;
-
- /*
- * Initialize the dquot hash tables.
- */
- udqhash = kmem_zalloc_greedy(&hsize,
- XFS_QM_HASHSIZE_LOW * sizeof(xfs_dqhash_t),
- XFS_QM_HASHSIZE_HIGH * sizeof(xfs_dqhash_t));
- if (!udqhash)
- goto out;
+#define XFS_DQ_LOOKUP_BATCH 32
- gdqhash = kmem_zalloc_large(hsize);
- if (!gdqhash)
- goto out_free_udqhash;
-
- hsize /= sizeof(xfs_dqhash_t);
- ndquot = hsize << 8;
+STATIC int
+xfs_qm_dquot_walk(
+ struct xfs_mount *mp,
+ int type,
+ int (*execute)(struct xfs_dquot *dqp))
+{
+ struct xfs_quotainfo *qi = mp->m_quotainfo;
+ struct radix_tree_root *tree = XFS_DQUOT_TREE(qi, type);
+ uint32_t next_index;
+ int last_error = 0;
+ int skipped;
+ int nr_found;
- xqm = kmem_zalloc(sizeof(xfs_qm_t), KM_SLEEP);
- xqm->qm_dqhashmask = hsize - 1;
- xqm->qm_usr_dqhtable = udqhash;
- xqm->qm_grp_dqhtable = gdqhash;
- ASSERT(xqm->qm_usr_dqhtable != NULL);
- ASSERT(xqm->qm_grp_dqhtable != NULL);
+restart:
+ skipped = 0;
+ next_index = 0;
+ nr_found = 0;
+
+ while (1) {
+ struct xfs_dquot *batch[XFS_DQ_LOOKUP_BATCH];
+ int error = 0;
+ int i;
+
+ mutex_lock(&qi->qi_tree_lock);
+ nr_found = radix_tree_gang_lookup(tree, (void **)batch,
+ next_index, XFS_DQ_LOOKUP_BATCH);
+ if (!nr_found) {
+ mutex_unlock(&qi->qi_tree_lock);
+ break;
+ }
- for (i = 0; i < hsize; i++) {
- xfs_qm_list_init(&(xqm->qm_usr_dqhtable[i]), "uxdqh", i);
- xfs_qm_list_init(&(xqm->qm_grp_dqhtable[i]), "gxdqh", i);
- }
+ for (i = 0; i < nr_found; i++) {
+ struct xfs_dquot *dqp = batch[i];
- /*
- * Freelist of all dquots of all file systems
- */
- INIT_LIST_HEAD(&xqm->qm_dqfrlist);
- xqm->qm_dqfrlist_cnt = 0;
- mutex_init(&xqm->qm_dqfrlist_lock);
+ next_index = be32_to_cpu(dqp->q_core.d_id) + 1;
- /*
- * dquot zone. we register our own low-memory callback.
- */
- if (!qm_dqzone) {
- xqm->qm_dqzone = kmem_zone_init(sizeof(xfs_dquot_t),
- "xfs_dquots");
- qm_dqzone = xqm->qm_dqzone;
- } else
- xqm->qm_dqzone = qm_dqzone;
+ error = execute(batch[i]);
+ if (error == EAGAIN) {
+ skipped++;
+ continue;
+ }
+ if (error && last_error != EFSCORRUPTED)
+ last_error = error;
+ }
- register_shrinker(&xfs_qm_shaker);
+ mutex_unlock(&qi->qi_tree_lock);
- /*
- * The t_dqinfo portion of transactions.
- */
- if (!qm_dqtrxzone) {
- xqm->qm_dqtrxzone = kmem_zone_init(sizeof(xfs_dquot_acct_t),
- "xfs_dqtrx");
- qm_dqtrxzone = xqm->qm_dqtrxzone;
- } else
- xqm->qm_dqtrxzone = qm_dqtrxzone;
+ /* bail out if the filesystem is corrupted. */
+ if (last_error == EFSCORRUPTED) {
+ skipped = 0;
+ break;
+ }
+ }
- atomic_set(&xqm->qm_totaldquots, 0);
- xqm->qm_dqfree_ratio = XFS_QM_DQFREE_RATIO;
- xqm->qm_nrefs = 0;
- return xqm;
+ if (skipped) {
+ delay(1);
+ goto restart;
+ }
- out_free_udqhash:
- kmem_free_large(udqhash);
- out:
- return NULL;
+ return last_error;
}
+
/*
- * Destroy the global quota manager when its reference count goes to zero.
+ * Purge a dquot from all tracking data structures and free it.
*/
-STATIC void
-xfs_qm_destroy(
- struct xfs_qm *xqm)
+STATIC int
+xfs_qm_dqpurge(
+ struct xfs_dquot *dqp)
{
- int hsize, i;
+ struct xfs_mount *mp = dqp->q_mount;
+ struct xfs_quotainfo *qi = mp->m_quotainfo;
+ struct xfs_dquot *gdqp = NULL;
- ASSERT(xqm != NULL);
- ASSERT(xqm->qm_nrefs == 0);
+ xfs_dqlock(dqp);
+ if ((dqp->dq_flags & XFS_DQ_FREEING) || dqp->q_nrefs != 0) {
+ xfs_dqunlock(dqp);
+ return EAGAIN;
+ }
- unregister_shrinker(&xfs_qm_shaker);
+ /*
+ * If this quota has a group hint attached, prepare for releasing it
+ * now.
+ */
+ gdqp = dqp->q_gdquot;
+ if (gdqp) {
+ xfs_dqlock(gdqp);
+ dqp->q_gdquot = NULL;
+ }
- mutex_lock(&xqm->qm_dqfrlist_lock);
- ASSERT(list_empty(&xqm->qm_dqfrlist));
- mutex_unlock(&xqm->qm_dqfrlist_lock);
+ dqp->dq_flags |= XFS_DQ_FREEING;
- hsize = xqm->qm_dqhashmask + 1;
- for (i = 0; i < hsize; i++) {
- xfs_qm_list_destroy(&(xqm->qm_usr_dqhtable[i]));
- xfs_qm_list_destroy(&(xqm->qm_grp_dqhtable[i]));
+ /*
+ * If we're turning off quotas, we have to make sure that, for
+ * example, we don't delete quota disk blocks while dquots are
+ * in the process of getting written to those disk blocks.
+ * This dquot might well be on AIL, and we can't leave it there
+ * if we're turning off quotas. Basically, we need this flush
+ * lock, and are willing to block on it.
+ */
+ if (!xfs_dqflock_nowait(dqp)) {
+ /*
+ * Block on the flush lock after nudging dquot buffer,
+ * if it is incore.
+ */
+ xfs_dqflock_pushbuf_wait(dqp);
}
- kmem_free_large(xqm->qm_usr_dqhtable);
- kmem_free_large(xqm->qm_grp_dqhtable);
- xqm->qm_usr_dqhtable = NULL;
- xqm->qm_grp_dqhtable = NULL;
- xqm->qm_dqhashmask = 0;
-
- kmem_free(xqm);
-}
-/*
- * Called at mount time to let XQM know that another file system is
- * starting quotas. This isn't crucial information as the individual mount
- * structures are pretty independent, but it helps the XQM keep a
- * global view of what's going on.
- */
-/* ARGSUSED */
-STATIC int
-xfs_qm_hold_quotafs_ref(
- struct xfs_mount *mp)
-{
/*
- * Need to lock the xfs_Gqm structure for things like this. For example,
- * the structure could disappear between the entry to this routine and
- * a HOLD operation if not locked.
+ * If we are turning this type of quotas off, we don't care
+ * about the dirty metadata sitting in this dquot. OTOH, if
+ * we're unmounting, we do care, so we flush it and wait.
*/
- mutex_lock(&xfs_Gqm_lock);
+ if (XFS_DQ_IS_DIRTY(dqp)) {
+ int error;
- if (!xfs_Gqm) {
- xfs_Gqm = xfs_Gqm_init();
- if (!xfs_Gqm) {
- mutex_unlock(&xfs_Gqm_lock);
- return ENOMEM;
- }
+ /*
+ * We don't care about getting disk errors here. We need
+ * to purge this dquot anyway, so we go ahead regardless.
+ */
+ error = xfs_qm_dqflush(dqp, SYNC_WAIT);
+ if (error)
+ xfs_warn(mp, "%s: dquot %p flush failed",
+ __func__, dqp);
+ xfs_dqflock(dqp);
}
+ ASSERT(atomic_read(&dqp->q_pincount) == 0);
+ ASSERT(XFS_FORCED_SHUTDOWN(mp) ||
+ !(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL));
+
+ xfs_dqfunlock(dqp);
+ xfs_dqunlock(dqp);
+
+ radix_tree_delete(XFS_DQUOT_TREE(qi, dqp->q_core.d_flags),
+ be32_to_cpu(dqp->q_core.d_id));
+ qi->qi_dquots--;
+
/*
- * We can keep a list of all filesystems with quotas mounted for
- * debugging and statistical purposes, but ...
- * Just take a reference and get out.
+ * We move dquots to the freelist as soon as their reference count
+ * hits zero, so it really should be on the freelist here.
*/
- xfs_Gqm->qm_nrefs++;
- mutex_unlock(&xfs_Gqm_lock);
+ mutex_lock(&qi->qi_lru_lock);
+ ASSERT(!list_empty(&dqp->q_lru));
+ list_del_init(&dqp->q_lru);
+ qi->qi_lru_count--;
+ XFS_STATS_DEC(xs_qm_dquot_unused);
+ mutex_unlock(&qi->qi_lru_lock);
+ xfs_qm_dqdestroy(dqp);
+
+ if (gdqp)
+ xfs_qm_dqput(gdqp);
return 0;
}
-
/*
- * Release the reference that a filesystem took at mount time,
- * so that we know when we need to destroy the entire quota manager.
+ * Purge the dquot cache.
*/
-/* ARGSUSED */
-STATIC void
-xfs_qm_rele_quotafs_ref(
- struct xfs_mount *mp)
+void
+xfs_qm_dqpurge_all(
+ struct xfs_mount *mp,
+ uint flags)
{
- ASSERT(xfs_Gqm);
- ASSERT(xfs_Gqm->qm_nrefs > 0);
-
- /*
- * Destroy the entire XQM. If somebody mounts with quotaon, this'll
- * be restarted.
- */
- mutex_lock(&xfs_Gqm_lock);
- if (--xfs_Gqm->qm_nrefs == 0) {
- xfs_qm_destroy(xfs_Gqm);
- xfs_Gqm = NULL;
- }
- mutex_unlock(&xfs_Gqm_lock);
+ if (flags & XFS_QMOPT_UQUOTA)
+ xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_dqpurge);
+ if (flags & XFS_QMOPT_GQUOTA)
+ xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_dqpurge);
+ if (flags & XFS_QMOPT_PQUOTA)
+ xfs_qm_dquot_walk(mp, XFS_DQ_PROJ, xfs_qm_dqpurge);
}
/*
@@ -379,175 +371,6 @@ xfs_qm_unmount_quotas(
}
}
-/*
- * Flush all dquots of the given file system to disk. The dquots are
- * _not_ purged from memory here, just their data written to disk.
- */
-STATIC int
-xfs_qm_dqflush_all(
- struct xfs_mount *mp)
-{
- struct xfs_quotainfo *q = mp->m_quotainfo;
- int recl;
- struct xfs_dquot *dqp;
- int error;
-
- if (!q)
- return 0;
-again:
- mutex_lock(&q->qi_dqlist_lock);
- list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) {
- xfs_dqlock(dqp);
- if ((dqp->dq_flags & XFS_DQ_FREEING) ||
- !XFS_DQ_IS_DIRTY(dqp)) {
- xfs_dqunlock(dqp);
- continue;
- }
-
- /* XXX a sentinel would be better */
- recl = q->qi_dqreclaims;
- if (!xfs_dqflock_nowait(dqp)) {
- /*
- * If we can't grab the flush lock then check
- * to see if the dquot has been flushed delayed
- * write. If so, grab its buffer and send it
- * out immediately. We'll be able to acquire
- * the flush lock when the I/O completes.
- */
- xfs_dqflock_pushbuf_wait(dqp);
- }
- /*
- * Let go of the mplist lock. We don't want to hold it
- * across a disk write.
- */
- mutex_unlock(&q->qi_dqlist_lock);
- error = xfs_qm_dqflush(dqp, 0);
- xfs_dqunlock(dqp);
- if (error)
- return error;
-
- mutex_lock(&q->qi_dqlist_lock);
- if (recl != q->qi_dqreclaims) {
- mutex_unlock(&q->qi_dqlist_lock);
- /* XXX restart limit */
- goto again;
- }
- }
-
- mutex_unlock(&q->qi_dqlist_lock);
- /* return ! busy */
- return 0;
-}
-
-/*
- * Release the group dquot pointers the user dquots may be
- * carrying around as a hint. mplist is locked on entry and exit.
- */
-STATIC void
-xfs_qm_detach_gdquots(
- struct xfs_mount *mp)
-{
- struct xfs_quotainfo *q = mp->m_quotainfo;
- struct xfs_dquot *dqp, *gdqp;
-
- again:
- ASSERT(mutex_is_locked(&q->qi_dqlist_lock));
- list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) {
- xfs_dqlock(dqp);
- if (dqp->dq_flags & XFS_DQ_FREEING) {
- xfs_dqunlock(dqp);
- mutex_unlock(&q->qi_dqlist_lock);
- delay(1);
- mutex_lock(&q->qi_dqlist_lock);
- goto again;
- }
-
- gdqp = dqp->q_gdquot;
- if (gdqp)
- dqp->q_gdquot = NULL;
- xfs_dqunlock(dqp);
-
- if (gdqp)
- xfs_qm_dqrele(gdqp);
- }
-}
-
-/*
- * Go through all the incore dquots of this file system and take them
- * off the mplist and hashlist, if the dquot type matches the dqtype
- * parameter. This is used when turning off quota accounting for
- * users and/or groups, as well as when the filesystem is unmounting.
- */
-STATIC int
-xfs_qm_dqpurge_int(
- struct xfs_mount *mp,
- uint flags)
-{
- struct xfs_quotainfo *q = mp->m_quotainfo;
- struct xfs_dquot *dqp, *n;
- uint dqtype;
- int nmisses = 0;
- LIST_HEAD (dispose_list);
-
- if (!q)
- return 0;
-
- dqtype = (flags & XFS_QMOPT_UQUOTA) ? XFS_DQ_USER : 0;
- dqtype |= (flags & XFS_QMOPT_PQUOTA) ? XFS_DQ_PROJ : 0;
- dqtype |= (flags & XFS_QMOPT_GQUOTA) ? XFS_DQ_GROUP : 0;
-
- mutex_lock(&q->qi_dqlist_lock);
-
- /*
- * In the first pass through all incore dquots of this filesystem,
- * we release the group dquot pointers the user dquots may be
- * carrying around as a hint. We need to do this irrespective of
- * what's being turned off.
- */
- xfs_qm_detach_gdquots(mp);
-
- /*
- * Try to get rid of all of the unwanted dquots.
- */
- list_for_each_entry_safe(dqp, n, &q->qi_dqlist, q_mplist) {
- xfs_dqlock(dqp);
- if ((dqp->dq_flags & dqtype) != 0 &&
- !(dqp->dq_flags & XFS_DQ_FREEING)) {
- if (dqp->q_nrefs == 0) {
- dqp->dq_flags |= XFS_DQ_FREEING;
- list_move_tail(&dqp->q_mplist, &dispose_list);
- } else
- nmisses++;
- }
- xfs_dqunlock(dqp);
- }
- mutex_unlock(&q->qi_dqlist_lock);
-
- list_for_each_entry_safe(dqp, n, &dispose_list, q_mplist)
- xfs_qm_dqpurge(dqp);
-
- return nmisses;
-}
-
-int
-xfs_qm_dqpurge_all(
- xfs_mount_t *mp,
- uint flags)
-{
- int ndquots;
-
- /*
- * Purge the dquot cache.
- * None of the dquots should really be busy at this point.
- */
- if (mp->m_quotainfo) {
- while ((ndquots = xfs_qm_dqpurge_int(mp, flags))) {
- delay(ndquots * 10);
- }
- }
- return 0;
-}
-
STATIC int
xfs_qm_dqattach_one(
xfs_inode_t *ip,
@@ -786,14 +609,6 @@ xfs_qm_dqdetach(
}
/*
- * The hash chains and the mplist use the same xfs_dqhash structure as
- * their list head, but we can take the mplist qh_lock and one of the
- * hash qh_locks at the same time without any problem as they aren't
- * related.
- */
-static struct lock_class_key xfs_quota_mplist_class;
-
-/*
* This initializes all the quota information that's kept in the
* mount structure
*/
@@ -807,13 +622,6 @@ xfs_qm_init_quotainfo(
ASSERT(XFS_IS_QUOTA_RUNNING(mp));
- /*
- * Tell XQM that we exist as soon as possible.
- */
- if ((error = xfs_qm_hold_quotafs_ref(mp))) {
- return error;
- }
-
qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP);
/*
@@ -826,11 +634,13 @@ xfs_qm_init_quotainfo(
return error;
}
- INIT_LIST_HEAD(&qinf->qi_dqlist);
- mutex_init(&qinf->qi_dqlist_lock);
- lockdep_set_class(&qinf->qi_dqlist_lock, &xfs_quota_mplist_class);
+ INIT_RADIX_TREE(&qinf->qi_uquota_tree, GFP_NOFS);
+ INIT_RADIX_TREE(&qinf->qi_gquota_tree, GFP_NOFS);
+ mutex_init(&qinf->qi_tree_lock);
- qinf->qi_dqreclaims = 0;
+ INIT_LIST_HEAD(&qinf->qi_lru_list);
+ qinf->qi_lru_count = 0;
+ mutex_init(&qinf->qi_lru_lock);
/* mutex used to serialize quotaoffs */
mutex_init(&qinf->qi_quotaofflock);
@@ -897,6 +707,9 @@ xfs_qm_init_quotainfo(
qinf->qi_rtbwarnlimit = XFS_QM_RTBWARNLIMIT;
}
+ qinf->qi_shrinker.shrink = xfs_qm_shake;
+ qinf->qi_shrinker.seeks = DEFAULT_SEEKS;
+ register_shrinker(&qinf->qi_shrinker);
return 0;
}
@@ -914,17 +727,8 @@ xfs_qm_destroy_quotainfo(
qi = mp->m_quotainfo;
ASSERT(qi != NULL);
- ASSERT(xfs_Gqm != NULL);
- /*
- * Release the reference that XQM kept, so that we know
- * when the XQM structure should be freed. We cannot assume
- * that xfs_Gqm is non-null after this point.
- */
- xfs_qm_rele_quotafs_ref(mp);
-
- ASSERT(list_empty(&qi->qi_dqlist));
- mutex_destroy(&qi->qi_dqlist_lock);
+ unregister_shrinker(&qi->qi_shrinker);
if (qi->qi_uquotaip) {
IRELE(qi->qi_uquotaip);
@@ -939,30 +743,6 @@ xfs_qm_destroy_quotainfo(
mp->m_quotainfo = NULL;
}
-
-
-/* ------------------- PRIVATE STATIC FUNCTIONS ----------------------- */
-
-/* ARGSUSED */
-STATIC void
-xfs_qm_list_init(
- xfs_dqlist_t *list,
- char *str,
- int n)
-{
- mutex_init(&list->qh_lock);
- INIT_LIST_HEAD(&list->qh_list);
- list->qh_version = 0;
- list->qh_nelems = 0;
-}
-
-STATIC void
-xfs_qm_list_destroy(
- xfs_dqlist_t *list)
-{
- mutex_destroy(&(list->qh_lock));
-}
-
/*
* Create an inode and return with a reference already taken, but unlocked
* This is how we create quota inodes
@@ -1400,6 +1180,28 @@ error0:
return error;
}
+STATIC int
+xfs_qm_flush_one(
+ struct xfs_dquot *dqp)
+{
+ int error = 0;
+
+ xfs_dqlock(dqp);
+ if (dqp->dq_flags & XFS_DQ_FREEING)
+ goto out_unlock;
+ if (!XFS_DQ_IS_DIRTY(dqp))
+ goto out_unlock;
+
+ if (!xfs_dqflock_nowait(dqp))
+ xfs_dqflock_pushbuf_wait(dqp);
+
+ error = xfs_qm_dqflush(dqp, 0);
+
+out_unlock:
+ xfs_dqunlock(dqp);
+ return error;
+}
+
/*
* Walk thru all the filesystem inodes and construct a consistent view
* of the disk quota world. If the quotacheck fails, disable quotas.
@@ -1408,7 +1210,7 @@ int
xfs_qm_quotacheck(
xfs_mount_t *mp)
{
- int done, count, error;
+ int done, count, error, error2;
xfs_ino_t lastino;
size_t structsz;
xfs_inode_t *uip, *gip;
@@ -1422,12 +1224,6 @@ xfs_qm_quotacheck(
ASSERT(mp->m_quotainfo->qi_uquotaip || mp->m_quotainfo->qi_gquotaip);
ASSERT(XFS_IS_QUOTA_RUNNING(mp));
- /*
- * There should be no cached dquots. The (simplistic) quotacheck
- * algorithm doesn't like that.
- */
- ASSERT(list_empty(&mp->m_quotainfo->qi_dqlist));
-
xfs_notice(mp, "Quotacheck needed: Please wait.");
/*
@@ -1466,12 +1262,21 @@ xfs_qm_quotacheck(
} while (!done);
/*
- * We've made all the changes that we need to make incore.
- * Flush them down to disk buffers if everything was updated
- * successfully.
+ * We've made all the changes that we need to make incore. Flush them
+ * down to disk buffers if everything was updated successfully.
*/
- if (!error)
- error = xfs_qm_dqflush_all(mp);
+ if (XFS_IS_UQUOTA_ON(mp))
+ error = xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_flush_one);
+ if (XFS_IS_GQUOTA_ON(mp)) {
+ error2 = xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_flush_one);
+ if (!error)
+ error = error2;
+ }
+ if (XFS_IS_PQUOTA_ON(mp)) {
+ error2 = xfs_qm_dquot_walk(mp, XFS_DQ_PROJ, xfs_qm_flush_one);
+ if (!error)
+ error = error2;
+ }
/*
* We can get this error if we couldn't do a dquot allocation inside
@@ -1499,7 +1304,7 @@ xfs_qm_quotacheck(
* quotachecked status, since we won't be doing accounting for
* that type anymore.
*/
- mp->m_qflags &= ~(XFS_OQUOTA_CHKD | XFS_UQUOTA_CHKD);
+ mp->m_qflags &= ~XFS_ALL_QUOTA_CHKD;
mp->m_qflags |= flags;
error_return:
@@ -1511,7 +1316,6 @@ xfs_qm_quotacheck(
* We must turn off quotas.
*/
ASSERT(mp->m_quotainfo != NULL);
- ASSERT(xfs_Gqm != NULL);
xfs_qm_destroy_quotainfo(mp);
if (xfs_mount_reset_sbqflags(mp)) {
xfs_warn(mp,
@@ -1600,216 +1404,147 @@ xfs_qm_init_quotainos(
return 0;
}
+STATIC void
+xfs_qm_dqfree_one(
+ struct xfs_dquot *dqp)
+{
+ struct xfs_mount *mp = dqp->q_mount;
+ struct xfs_quotainfo *qi = mp->m_quotainfo;
+ mutex_lock(&qi->qi_tree_lock);
+ radix_tree_delete(XFS_DQUOT_TREE(qi, dqp->q_core.d_flags),
+ be32_to_cpu(dqp->q_core.d_id));
-/*
- * Pop the least recently used dquot off the freelist and recycle it.
- */
-STATIC struct xfs_dquot *
-xfs_qm_dqreclaim_one(void)
+ qi->qi_dquots--;
+ mutex_unlock(&qi->qi_tree_lock);
+
+ xfs_qm_dqdestroy(dqp);
+}
+
+STATIC void
+xfs_qm_dqreclaim_one(
+ struct xfs_dquot *dqp,
+ struct list_head *dispose_list)
{
- struct xfs_dquot *dqp;
- int restarts = 0;
+ struct xfs_mount *mp = dqp->q_mount;
+ struct xfs_quotainfo *qi = mp->m_quotainfo;
+ int error;
- mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
-restart:
- list_for_each_entry(dqp, &xfs_Gqm->qm_dqfrlist, q_freelist) {
- struct xfs_mount *mp = dqp->q_mount;
+ if (!xfs_dqlock_nowait(dqp))
+ goto out_busy;
- if (!xfs_dqlock_nowait(dqp))
- continue;
+ /*
+ * This dquot has acquired a reference in the meantime remove it from
+ * the freelist and try again.
+ */
+ if (dqp->q_nrefs) {
+ xfs_dqunlock(dqp);
- /*
- * This dquot has already been grabbed by dqlookup.
- * Remove it from the freelist and try again.
- */
- if (dqp->q_nrefs) {
- trace_xfs_dqreclaim_want(dqp);
- XQM_STATS_INC(xqmstats.xs_qm_dqwants);
-
- list_del_init(&dqp->q_freelist);
- xfs_Gqm->qm_dqfrlist_cnt--;
- restarts++;
- goto dqunlock;
- }
+ trace_xfs_dqreclaim_want(dqp);
+ XFS_STATS_INC(xs_qm_dqwants);
- ASSERT(dqp->q_hash);
- ASSERT(!list_empty(&dqp->q_mplist));
+ list_del_init(&dqp->q_lru);
+ qi->qi_lru_count--;
+ XFS_STATS_DEC(xs_qm_dquot_unused);
+ return;
+ }
- /*
- * Try to grab the flush lock. If this dquot is in the process
- * of getting flushed to disk, we don't want to reclaim it.
- */
- if (!xfs_dqflock_nowait(dqp))
- goto dqunlock;
+ /*
+ * Try to grab the flush lock. If this dquot is in the process of
+ * getting flushed to disk, we don't want to reclaim it.
+ */
+ if (!xfs_dqflock_nowait(dqp))
+ goto out_busy;
+
+ /*
+ * We have the flush lock so we know that this is not in the
+ * process of being flushed. So, if this is dirty, flush it
+ * DELWRI so that we don't get a freelist infested with
+ * dirty dquots.
+ */
+ if (XFS_DQ_IS_DIRTY(dqp)) {
+ trace_xfs_dqreclaim_dirty(dqp);
/*
- * We have the flush lock so we know that this is not in the
- * process of being flushed. So, if this is dirty, flush it
- * DELWRI so that we don't get a freelist infested with
- * dirty dquots.
+ * We flush it delayed write, so don't bother releasing the
+ * freelist lock.
*/
- if (XFS_DQ_IS_DIRTY(dqp)) {
- int error;
-
- trace_xfs_dqreclaim_dirty(dqp);
-
- /*
- * We flush it delayed write, so don't bother
- * releasing the freelist lock.
- */
- error = xfs_qm_dqflush(dqp, SYNC_TRYLOCK);
- if (error) {
- xfs_warn(mp, "%s: dquot %p flush failed",
- __func__, dqp);
- }
- goto dqunlock;
+ error = xfs_qm_dqflush(dqp, 0);
+ if (error) {
+ xfs_warn(mp, "%s: dquot %p flush failed",
+ __func__, dqp);
}
- xfs_dqfunlock(dqp);
/*
- * Prevent lookup now that we are going to reclaim the dquot.
- * Once XFS_DQ_FREEING is set lookup won't touch the dquot,
- * thus we can drop the lock now.
+ * Give the dquot another try on the freelist, as the
+ * flushing will take some time.
*/
- dqp->dq_flags |= XFS_DQ_FREEING;
- xfs_dqunlock(dqp);
-
- mutex_lock(&dqp->q_hash->qh_lock);
- list_del_init(&dqp->q_hashlist);
- dqp->q_hash->qh_version++;
- mutex_unlock(&dqp->q_hash->qh_lock);
-
- mutex_lock(&mp->m_quotainfo->qi_dqlist_lock);
- list_del_init(&dqp->q_mplist);
- mp->m_quotainfo->qi_dquots--;
- mp->m_quotainfo->qi_dqreclaims++;
- mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock);
+ goto out_busy;
+ }
+ xfs_dqfunlock(dqp);
- ASSERT(dqp->q_nrefs == 0);
- list_del_init(&dqp->q_freelist);
- xfs_Gqm->qm_dqfrlist_cnt--;
+ /*
+ * Prevent lookups now that we are past the point of no return.
+ */
+ dqp->dq_flags |= XFS_DQ_FREEING;
+ xfs_dqunlock(dqp);
- mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
- return dqp;
-dqunlock:
- xfs_dqunlock(dqp);
- if (restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
- break;
- goto restart;
- }
+ ASSERT(dqp->q_nrefs == 0);
+ list_move_tail(&dqp->q_lru, dispose_list);
+ qi->qi_lru_count--;
+ XFS_STATS_DEC(xs_qm_dquot_unused);
- mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
- return NULL;
-}
+ trace_xfs_dqreclaim_done(dqp);
+ XFS_STATS_INC(xs_qm_dqreclaims);
+ return;
-/*
- * Traverse the freelist of dquots and attempt to reclaim a maximum of
- * 'howmany' dquots. This operation races with dqlookup(), and attempts to
- * favor the lookup function ...
- */
-STATIC int
-xfs_qm_shake_freelist(
- int howmany)
-{
- int nreclaimed = 0;
- xfs_dquot_t *dqp;
+out_busy:
+ xfs_dqunlock(dqp);
- if (howmany <= 0)
- return 0;
+ /*
+ * Move the dquot to the tail of the list so that we don't spin on it.
+ */
+ list_move_tail(&dqp->q_lru, &qi->qi_lru_list);
- while (nreclaimed < howmany) {
- dqp = xfs_qm_dqreclaim_one();
- if (!dqp)
- return nreclaimed;
- xfs_qm_dqdestroy(dqp);
- nreclaimed++;
- }
- return nreclaimed;
+ trace_xfs_dqreclaim_busy(dqp);
+ XFS_STATS_INC(xs_qm_dqreclaim_misses);
}
-/*
- * The kmem_shake interface is invoked when memory is running low.
- */
-/* ARGSUSED */
STATIC int
xfs_qm_shake(
- struct shrinker *shrink,
- struct shrink_control *sc)
+ struct shrinker *shrink,
+ struct shrink_control *sc)
{
- int ndqused, nfree, n;
- gfp_t gfp_mask = sc->gfp_mask;
-
- if (!kmem_shake_allow(gfp_mask))
- return 0;
- if (!xfs_Gqm)
- return 0;
-
- nfree = xfs_Gqm->qm_dqfrlist_cnt; /* free dquots */
- /* incore dquots in all f/s's */
- ndqused = atomic_read(&xfs_Gqm->qm_totaldquots) - nfree;
-
- ASSERT(ndqused >= 0);
+ struct xfs_quotainfo *qi =
+ container_of(shrink, struct xfs_quotainfo, qi_shrinker);
+ int nr_to_scan = sc->nr_to_scan;
+ LIST_HEAD (dispose_list);
+ struct xfs_dquot *dqp;
- if (nfree <= ndqused && nfree < ndquot)
+ if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT))
return 0;
+ if (!nr_to_scan)
+ goto out;
- ndqused *= xfs_Gqm->qm_dqfree_ratio; /* target # of free dquots */
- n = nfree - ndqused - ndquot; /* # over target */
-
- return xfs_qm_shake_freelist(MAX(nfree, n));
-}
-
-
-/*------------------------------------------------------------------*/
-
-/*
- * Return a new incore dquot. Depending on the number of
- * dquots in the system, we either allocate a new one on the kernel heap,
- * or reclaim a free one.
- * Return value is B_TRUE if we allocated a new dquot, B_FALSE if we managed
- * to reclaim an existing one from the freelist.
- */
-boolean_t
-xfs_qm_dqalloc_incore(
- xfs_dquot_t **O_dqpp)
-{
- xfs_dquot_t *dqp;
-
- /*
- * Check against high water mark to see if we want to pop
- * a nincompoop dquot off the freelist.
- */
- if (atomic_read(&xfs_Gqm->qm_totaldquots) >= ndquot) {
- /*
- * Try to recycle a dquot from the freelist.
- */
- if ((dqp = xfs_qm_dqreclaim_one())) {
- XQM_STATS_INC(xqmstats.xs_qm_dqreclaims);
- /*
- * Just zero the core here. The rest will get
- * reinitialized by caller. XXX we shouldn't even
- * do this zero ...
- */
- memset(&dqp->q_core, 0, sizeof(dqp->q_core));
- *O_dqpp = dqp;
- return B_FALSE;
- }
- XQM_STATS_INC(xqmstats.xs_qm_dqreclaim_misses);
+ mutex_lock(&qi->qi_lru_lock);
+ while (!list_empty(&qi->qi_lru_list)) {
+ if (nr_to_scan-- <= 0)
+ break;
+ dqp = list_first_entry(&qi->qi_lru_list, struct xfs_dquot,
+ q_lru);
+ xfs_qm_dqreclaim_one(dqp, &dispose_list);
}
+ mutex_unlock(&qi->qi_lru_lock);
- /*
- * Allocate a brand new dquot on the kernel heap and return it
- * to the caller to initialize.
- */
- ASSERT(xfs_Gqm->qm_dqzone != NULL);
- *O_dqpp = kmem_zone_zalloc(xfs_Gqm->qm_dqzone, KM_SLEEP);
- atomic_inc(&xfs_Gqm->qm_totaldquots);
-
- return B_TRUE;
+ while (!list_empty(&dispose_list)) {
+ dqp = list_first_entry(&dispose_list, struct xfs_dquot, q_lru);
+ list_del_init(&dqp->q_lru);
+ xfs_qm_dqfree_one(dqp);
+ }
+out:
+ return (qi->qi_lru_count / 100) * sysctl_vfs_cache_pressure;
}
-
/*
* Start a transaction and write the incore superblock changes to
* disk. flags parameter indicates which fields have changed.
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index 9b4f3adefbc..44b858b79d7 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -21,33 +21,10 @@
#include "xfs_dquot_item.h"
#include "xfs_dquot.h"
#include "xfs_quota_priv.h"
-#include "xfs_qm_stats.h"
-struct xfs_qm;
struct xfs_inode;
-extern uint ndquot;
-extern struct mutex xfs_Gqm_lock;
-extern struct xfs_qm *xfs_Gqm;
-extern kmem_zone_t *qm_dqzone;
-extern kmem_zone_t *qm_dqtrxzone;
-
-/*
- * Ditto, for xfs_qm_dqreclaim_one.
- */
-#define XFS_QM_RECLAIM_MAX_RESTARTS 4
-
-/*
- * Ideal ratio of free to in use dquots. Quota manager makes an attempt
- * to keep this balance.
- */
-#define XFS_QM_DQFREE_RATIO 2
-
-/*
- * Dquot hashtable constants/threshold values.
- */
-#define XFS_QM_HASHSIZE_LOW (PAGE_SIZE / sizeof(xfs_dqhash_t))
-#define XFS_QM_HASHSIZE_HIGH ((PAGE_SIZE * 4) / sizeof(xfs_dqhash_t))
+extern struct kmem_zone *xfs_qm_dqtrxzone;
/*
* This defines the unit of allocation of dquots.
@@ -60,37 +37,20 @@ extern kmem_zone_t *qm_dqtrxzone;
*/
#define XFS_DQUOT_CLUSTER_SIZE_FSB (xfs_filblks_t)1
-typedef xfs_dqhash_t xfs_dqlist_t;
-
-/*
- * Quota Manager (global) structure. Lives only in core.
- */
-typedef struct xfs_qm {
- xfs_dqlist_t *qm_usr_dqhtable;/* udquot hash table */
- xfs_dqlist_t *qm_grp_dqhtable;/* gdquot hash table */
- uint qm_dqhashmask; /* # buckets in dq hashtab - 1 */
- struct list_head qm_dqfrlist; /* freelist of dquots */
- struct mutex qm_dqfrlist_lock;
- int qm_dqfrlist_cnt;
- atomic_t qm_totaldquots; /* total incore dquots */
- uint qm_nrefs; /* file systems with quota on */
- int qm_dqfree_ratio;/* ratio of free to inuse dquots */
- kmem_zone_t *qm_dqzone; /* dquot mem-alloc zone */
- kmem_zone_t *qm_dqtrxzone; /* t_dqinfo of transactions */
-} xfs_qm_t;
-
/*
* Various quota information for individual filesystems.
* The mount structure keeps a pointer to this.
*/
typedef struct xfs_quotainfo {
+ struct radix_tree_root qi_uquota_tree;
+ struct radix_tree_root qi_gquota_tree;
+ struct mutex qi_tree_lock;
xfs_inode_t *qi_uquotaip; /* user quota inode */
xfs_inode_t *qi_gquotaip; /* group quota inode */
- struct list_head qi_dqlist; /* all dquots in filesys */
- struct mutex qi_dqlist_lock;
+ struct list_head qi_lru_list;
+ struct mutex qi_lru_lock;
+ int qi_lru_count;
int qi_dquots;
- int qi_dqreclaims; /* a change here indicates
- a removal in the dqlist */
time_t qi_btimelimit; /* limit for blks timer */
time_t qi_itimelimit; /* limit for inodes timer */
time_t qi_rtbtimelimit;/* limit for rt blks timer */
@@ -106,8 +66,14 @@ typedef struct xfs_quotainfo {
xfs_qcnt_t qi_isoftlimit; /* default inode count soft limit */
xfs_qcnt_t qi_rtbhardlimit;/* default realtime blk hard limit */
xfs_qcnt_t qi_rtbsoftlimit;/* default realtime blk soft limit */
+ struct shrinker qi_shrinker;
} xfs_quotainfo_t;
+#define XFS_DQUOT_TREE(qi, type) \
+ ((type & XFS_DQ_USER) ? \
+ &((qi)->qi_uquota_tree) : \
+ &((qi)->qi_gquota_tree))
+
extern void xfs_trans_mod_dquot(xfs_trans_t *, xfs_dquot_t *, uint, long);
extern int xfs_trans_reserve_quota_bydquots(xfs_trans_t *, xfs_mount_t *,
@@ -143,8 +109,7 @@ extern int xfs_qm_quotacheck(xfs_mount_t *);
extern int xfs_qm_write_sb_changes(xfs_mount_t *, __int64_t);
/* dquot stuff */
-extern boolean_t xfs_qm_dqalloc_incore(xfs_dquot_t **);
-extern int xfs_qm_dqpurge_all(xfs_mount_t *, uint);
+extern void xfs_qm_dqpurge_all(xfs_mount_t *, uint);
extern void xfs_qm_dqrele_all_inodes(xfs_mount_t *, uint);
/* quota ops */
diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c
index a0a829addca..e6986b5d80d 100644
--- a/fs/xfs/xfs_qm_bhv.c
+++ b/fs/xfs/xfs_qm_bhv.c
@@ -40,28 +40,28 @@
STATIC void
xfs_fill_statvfs_from_dquot(
struct kstatfs *statp,
- xfs_disk_dquot_t *dp)
+ struct xfs_dquot *dqp)
{
__uint64_t limit;
- limit = dp->d_blk_softlimit ?
- be64_to_cpu(dp->d_blk_softlimit) :
- be64_to_cpu(dp->d_blk_hardlimit);
+ limit = dqp->q_core.d_blk_softlimit ?
+ be64_to_cpu(dqp->q_core.d_blk_softlimit) :
+ be64_to_cpu(dqp->q_core.d_blk_hardlimit);
if (limit && statp->f_blocks > limit) {
statp->f_blocks = limit;
statp->f_bfree = statp->f_bavail =
- (statp->f_blocks > be64_to_cpu(dp->d_bcount)) ?
- (statp->f_blocks - be64_to_cpu(dp->d_bcount)) : 0;
+ (statp->f_blocks > dqp->q_res_bcount) ?
+ (statp->f_blocks - dqp->q_res_bcount) : 0;
}
- limit = dp->d_ino_softlimit ?
- be64_to_cpu(dp->d_ino_softlimit) :
- be64_to_cpu(dp->d_ino_hardlimit);
+ limit = dqp->q_core.d_ino_softlimit ?
+ be64_to_cpu(dqp->q_core.d_ino_softlimit) :
+ be64_to_cpu(dqp->q_core.d_ino_hardlimit);
if (limit && statp->f_files > limit) {
statp->f_files = limit;
statp->f_ffree =
- (statp->f_files > be64_to_cpu(dp->d_icount)) ?
- (statp->f_ffree - be64_to_cpu(dp->d_icount)) : 0;
+ (statp->f_files > dqp->q_res_icount) ?
+ (statp->f_ffree - dqp->q_res_icount) : 0;
}
}
@@ -82,7 +82,7 @@ xfs_qm_statvfs(
xfs_dquot_t *dqp;
if (!xfs_qm_dqget(mp, NULL, xfs_get_projid(ip), XFS_DQ_PROJ, 0, &dqp)) {
- xfs_fill_statvfs_from_dquot(statp, &dqp->q_core);
+ xfs_fill_statvfs_from_dquot(statp, dqp);
xfs_qm_dqput(dqp);
}
}
@@ -156,21 +156,3 @@ xfs_qm_newmount(
return 0;
}
-
-void __init
-xfs_qm_init(void)
-{
- printk(KERN_INFO "SGI XFS Quota Management subsystem\n");
- mutex_init(&xfs_Gqm_lock);
- xfs_qm_init_procfs();
-}
-
-void __exit
-xfs_qm_exit(void)
-{
- xfs_qm_cleanup_procfs();
- if (qm_dqzone)
- kmem_zone_destroy(qm_dqzone);
- if (qm_dqtrxzone)
- kmem_zone_destroy(qm_dqtrxzone);
-}
diff --git a/fs/xfs/xfs_qm_stats.c b/fs/xfs/xfs_qm_stats.c
deleted file mode 100644
index 8671a0b3264..00000000000
--- a/fs/xfs/xfs_qm_stats.c
+++ /dev/null
@@ -1,105 +0,0 @@
-/*
- * Copyright (c) 2000-2003 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_alloc.h"
-#include "xfs_quota.h"
-#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_inode.h"
-#include "xfs_itable.h"
-#include "xfs_bmap.h"
-#include "xfs_rtalloc.h"
-#include "xfs_error.h"
-#include "xfs_attr.h"
-#include "xfs_buf_item.h"
-#include "xfs_qm.h"
-
-struct xqmstats xqmstats;
-
-static int xqm_proc_show(struct seq_file *m, void *v)
-{
- /* maximum; incore; ratio free to inuse; freelist */
- seq_printf(m, "%d\t%d\t%d\t%u\n",
- ndquot,
- xfs_Gqm? atomic_read(&xfs_Gqm->qm_totaldquots) : 0,
- xfs_Gqm? xfs_Gqm->qm_dqfree_ratio : 0,
- xfs_Gqm? xfs_Gqm->qm_dqfrlist_cnt : 0);
- return 0;
-}
-
-static int xqm_proc_open(struct inode *inode, struct file *file)
-{
- return single_open(file, xqm_proc_show, NULL);
-}
-
-static const struct file_operations xqm_proc_fops = {
- .owner = THIS_MODULE,
- .open = xqm_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
-static int xqmstat_proc_show(struct seq_file *m, void *v)
-{
- /* quota performance statistics */
- seq_printf(m, "qm %u %u %u %u %u %u %u %u\n",
- xqmstats.xs_qm_dqreclaims,
- xqmstats.xs_qm_dqreclaim_misses,
- xqmstats.xs_qm_dquot_dups,
- xqmstats.xs_qm_dqcachemisses,
- xqmstats.xs_qm_dqcachehits,
- xqmstats.xs_qm_dqwants,
- xqmstats.xs_qm_dqshake_reclaims,
- xqmstats.xs_qm_dqinact_reclaims);
- return 0;
-}
-
-static int xqmstat_proc_open(struct inode *inode, struct file *file)
-{
- return single_open(file, xqmstat_proc_show, NULL);
-}
-
-static const struct file_operations xqmstat_proc_fops = {
- .owner = THIS_MODULE,
- .open = xqmstat_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
-void
-xfs_qm_init_procfs(void)
-{
- proc_create("fs/xfs/xqmstat", 0, NULL, &xqmstat_proc_fops);
- proc_create("fs/xfs/xqm", 0, NULL, &xqm_proc_fops);
-}
-
-void
-xfs_qm_cleanup_procfs(void)
-{
- remove_proc_entry("fs/xfs/xqm", NULL);
- remove_proc_entry("fs/xfs/xqmstat", NULL);
-}
diff --git a/fs/xfs/xfs_qm_stats.h b/fs/xfs/xfs_qm_stats.h
deleted file mode 100644
index 5b964fc0dc0..00000000000
--- a/fs/xfs/xfs_qm_stats.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2002 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#ifndef __XFS_QM_STATS_H__
-#define __XFS_QM_STATS_H__
-
-#if defined(CONFIG_PROC_FS) && !defined(XFS_STATS_OFF)
-
-/*
- * XQM global statistics
- */
-struct xqmstats {
- __uint32_t xs_qm_dqreclaims;
- __uint32_t xs_qm_dqreclaim_misses;
- __uint32_t xs_qm_dquot_dups;
- __uint32_t xs_qm_dqcachemisses;
- __uint32_t xs_qm_dqcachehits;
- __uint32_t xs_qm_dqwants;
- __uint32_t xs_qm_dqshake_reclaims;
- __uint32_t xs_qm_dqinact_reclaims;
-};
-
-extern struct xqmstats xqmstats;
-
-# define XQM_STATS_INC(count) ( (count)++ )
-
-extern void xfs_qm_init_procfs(void);
-extern void xfs_qm_cleanup_procfs(void);
-
-#else
-
-# define XQM_STATS_INC(count) do { } while (0)
-
-static inline void xfs_qm_init_procfs(void) { };
-static inline void xfs_qm_cleanup_procfs(void) { };
-
-#endif
-
-#endif /* __XFS_QM_STATS_H__ */
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 5cc3dde1bc9..c4f396e437a 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -31,6 +31,7 @@
#include "xfs_mount.h"
#include "xfs_bmap_btree.h"
#include "xfs_inode.h"
+#include "xfs_inode_item.h"
#include "xfs_itable.h"
#include "xfs_bmap.h"
#include "xfs_rtalloc.h"
@@ -46,9 +47,6 @@ STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *,
uint);
STATIC uint xfs_qm_export_flags(uint);
STATIC uint xfs_qm_export_qtype_flags(uint);
-STATIC void xfs_qm_export_dquot(xfs_mount_t *, xfs_disk_dquot_t *,
- fs_disk_quota_t *);
-
/*
* Turn off quota accounting and/or enforcement for all udquots and/or
@@ -68,7 +66,6 @@ xfs_qm_scall_quotaoff(
int error;
uint inactivate_flags;
xfs_qoff_logitem_t *qoffstart;
- int nculprits;
/*
* No file system can have quotas enabled on disk but not in core.
@@ -174,18 +171,13 @@ xfs_qm_scall_quotaoff(
* This isn't protected by a particular lock directly, because we
* don't want to take a mrlock every time we depend on quotas being on.
*/
- mp->m_qflags &= ~(flags);
+ mp->m_qflags &= ~flags;
/*
* Go through all the dquots of this file system and purge them,
- * according to what was turned off. We may not be able to get rid
- * of all dquots, because dquots can have temporary references that
- * are not attached to inodes. eg. xfs_setattr, xfs_create.
- * So, if we couldn't purge all the dquots from the filesystem,
- * we can't get rid of the incore data structures.
+ * according to what was turned off.
*/
- while ((nculprits = xfs_qm_dqpurge_all(mp, dqtype)))
- delay(10 * nculprits);
+ xfs_qm_dqpurge_all(mp, dqtype);
/*
* Transactions that had started before ACTIVE state bit was cleared
@@ -263,13 +255,18 @@ xfs_qm_scall_trunc_qfile(
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);
- error = xfs_itruncate_data(&tp, ip, 0);
+ ip->i_d.di_size = 0;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
if (error) {
xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
XFS_TRANS_ABORT);
goto out_unlock;
}
+ ASSERT(ip->i_d.di_nextents == 0);
+
xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
@@ -629,42 +626,6 @@ xfs_qm_scall_setqlim(
return error;
}
-int
-xfs_qm_scall_getquota(
- xfs_mount_t *mp,
- xfs_dqid_t id,
- uint type,
- fs_disk_quota_t *out)
-{
- xfs_dquot_t *dqp;
- int error;
-
- /*
- * Try to get the dquot. We don't want it allocated on disk, so
- * we aren't passing the XFS_QMOPT_DOALLOC flag. If it doesn't
- * exist, we'll get ENOENT back.
- */
- if ((error = xfs_qm_dqget(mp, NULL, id, type, 0, &dqp))) {
- return (error);
- }
-
- /*
- * If everything's NULL, this dquot doesn't quite exist as far as
- * our utility programs are concerned.
- */
- if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) {
- xfs_qm_dqput(dqp);
- return XFS_ERROR(ENOENT);
- }
- /*
- * Convert the disk dquot to the exportable format
- */
- xfs_qm_export_dquot(mp, &dqp->q_core, out);
- xfs_qm_dqput(dqp);
- return (error ? XFS_ERROR(EFAULT) : 0);
-}
-
-
STATIC int
xfs_qm_log_quotaoff_end(
xfs_mount_t *mp,
@@ -753,50 +714,66 @@ error0:
}
-/*
- * Translate an internal style on-disk-dquot to the exportable format.
- * The main differences are that the counters/limits are all in Basic
- * Blocks (BBs) instead of the internal FSBs, and all on-disk data has
- * to be converted to the native endianness.
- */
-STATIC void
-xfs_qm_export_dquot(
- xfs_mount_t *mp,
- xfs_disk_dquot_t *src,
+int
+xfs_qm_scall_getquota(
+ struct xfs_mount *mp,
+ xfs_dqid_t id,
+ uint type,
struct fs_disk_quota *dst)
{
+ struct xfs_dquot *dqp;
+ int error;
+
+ /*
+ * Try to get the dquot. We don't want it allocated on disk, so
+ * we aren't passing the XFS_QMOPT_DOALLOC flag. If it doesn't
+ * exist, we'll get ENOENT back.
+ */
+ error = xfs_qm_dqget(mp, NULL, id, type, 0, &dqp);
+ if (error)
+ return error;
+
+ /*
+ * If everything's NULL, this dquot doesn't quite exist as far as
+ * our utility programs are concerned.
+ */
+ if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) {
+ error = XFS_ERROR(ENOENT);
+ goto out_put;
+ }
+
memset(dst, 0, sizeof(*dst));
- dst->d_version = FS_DQUOT_VERSION; /* different from src->d_version */
- dst->d_flags = xfs_qm_export_qtype_flags(src->d_flags);
- dst->d_id = be32_to_cpu(src->d_id);
+ dst->d_version = FS_DQUOT_VERSION;
+ dst->d_flags = xfs_qm_export_qtype_flags(dqp->q_core.d_flags);
+ dst->d_id = be32_to_cpu(dqp->q_core.d_id);
dst->d_blk_hardlimit =
- XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_blk_hardlimit));
+ XFS_FSB_TO_BB(mp, be64_to_cpu(dqp->q_core.d_blk_hardlimit));
dst->d_blk_softlimit =
- XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_blk_softlimit));
- dst->d_ino_hardlimit = be64_to_cpu(src->d_ino_hardlimit);
- dst->d_ino_softlimit = be64_to_cpu(src->d_ino_softlimit);
- dst->d_bcount = XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_bcount));
- dst->d_icount = be64_to_cpu(src->d_icount);
- dst->d_btimer = be32_to_cpu(src->d_btimer);
- dst->d_itimer = be32_to_cpu(src->d_itimer);
- dst->d_iwarns = be16_to_cpu(src->d_iwarns);
- dst->d_bwarns = be16_to_cpu(src->d_bwarns);
+ XFS_FSB_TO_BB(mp, be64_to_cpu(dqp->q_core.d_blk_softlimit));
+ dst->d_ino_hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit);
+ dst->d_ino_softlimit = be64_to_cpu(dqp->q_core.d_ino_softlimit);
+ dst->d_bcount = XFS_FSB_TO_BB(mp, dqp->q_res_bcount);
+ dst->d_icount = dqp->q_res_icount;
+ dst->d_btimer = be32_to_cpu(dqp->q_core.d_btimer);
+ dst->d_itimer = be32_to_cpu(dqp->q_core.d_itimer);
+ dst->d_iwarns = be16_to_cpu(dqp->q_core.d_iwarns);
+ dst->d_bwarns = be16_to_cpu(dqp->q_core.d_bwarns);
dst->d_rtb_hardlimit =
- XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_rtb_hardlimit));
+ XFS_FSB_TO_BB(mp, be64_to_cpu(dqp->q_core.d_rtb_hardlimit));
dst->d_rtb_softlimit =
- XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_rtb_softlimit));
- dst->d_rtbcount = XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_rtbcount));
- dst->d_rtbtimer = be32_to_cpu(src->d_rtbtimer);
- dst->d_rtbwarns = be16_to_cpu(src->d_rtbwarns);
+ XFS_FSB_TO_BB(mp, be64_to_cpu(dqp->q_core.d_rtb_softlimit));
+ dst->d_rtbcount = XFS_FSB_TO_BB(mp, dqp->q_res_rtbcount);
+ dst->d_rtbtimer = be32_to_cpu(dqp->q_core.d_rtbtimer);
+ dst->d_rtbwarns = be16_to_cpu(dqp->q_core.d_rtbwarns);
/*
* Internally, we don't reset all the timers when quota enforcement
* gets turned off. No need to confuse the user level code,
* so return zeroes in that case.
*/
- if ((!XFS_IS_UQUOTA_ENFORCED(mp) && src->d_flags == XFS_DQ_USER) ||
+ if ((!XFS_IS_UQUOTA_ENFORCED(mp) && dqp->q_core.d_flags == XFS_DQ_USER) ||
(!XFS_IS_OQUOTA_ENFORCED(mp) &&
- (src->d_flags & (XFS_DQ_PROJ | XFS_DQ_GROUP)))) {
+ (dqp->q_core.d_flags & (XFS_DQ_PROJ | XFS_DQ_GROUP)))) {
dst->d_btimer = 0;
dst->d_itimer = 0;
dst->d_rtbtimer = 0;
@@ -807,16 +784,19 @@ xfs_qm_export_dquot(
(XFS_IS_OQUOTA_ENFORCED(mp) &&
(dst->d_flags & (FS_PROJ_QUOTA | FS_GROUP_QUOTA)))) &&
dst->d_id != 0) {
- if (((int) dst->d_bcount >= (int) dst->d_blk_softlimit) &&
+ if (((int) dst->d_bcount > (int) dst->d_blk_softlimit) &&
(dst->d_blk_softlimit > 0)) {
ASSERT(dst->d_btimer != 0);
}
- if (((int) dst->d_icount >= (int) dst->d_ino_softlimit) &&
+ if (((int) dst->d_icount > (int) dst->d_ino_softlimit) &&
(dst->d_ino_softlimit > 0)) {
ASSERT(dst->d_itimer != 0);
}
}
#endif
+out_put:
+ xfs_qm_dqput(dqp);
+ return error;
}
STATIC uint
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index 8a0807e0f97..b50ec5b95d5 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -174,6 +174,8 @@ typedef struct xfs_qoff_logformat {
#define XFS_UQUOTA_ACTIVE 0x0100 /* uquotas are being turned off */
#define XFS_PQUOTA_ACTIVE 0x0200 /* pquotas are being turned off */
#define XFS_GQUOTA_ACTIVE 0x0400 /* gquotas are being turned off */
+#define XFS_ALL_QUOTA_ACTIVE \
+ (XFS_UQUOTA_ACTIVE | XFS_PQUOTA_ACTIVE | XFS_GQUOTA_ACTIVE)
/*
* Checking XFS_IS_*QUOTA_ON() while holding any inode lock guarantees
diff --git a/fs/xfs/xfs_quota_priv.h b/fs/xfs/xfs_quota_priv.h
index 94a3d927d71..6d86219d93d 100644
--- a/fs/xfs/xfs_quota_priv.h
+++ b/fs/xfs/xfs_quota_priv.h
@@ -24,17 +24,6 @@
*/
#define XFS_DQITER_MAP_SIZE 10
-/*
- * Hash into a bucket in the dquot hash table, based on <mp, id>.
- */
-#define XFS_DQ_HASHVAL(mp, id) (((__psunsigned_t)(mp) + \
- (__psunsigned_t)(id)) & \
- (xfs_Gqm->qm_dqhashmask - 1))
-#define XFS_DQ_HASH(mp, id, type) (type == XFS_DQ_USER ? \
- (xfs_Gqm->qm_usr_dqhtable + \
- XFS_DQ_HASHVAL(mp, id)) : \
- (xfs_Gqm->qm_grp_dqhtable + \
- XFS_DQ_HASHVAL(mp, id)))
#define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \
!dqp->q_core.d_blk_hardlimit && \
!dqp->q_core.d_blk_softlimit && \
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index 866de277079..e44ef7ee8ce 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -118,17 +118,6 @@ xfs_rename(
new_parent = (src_dp != target_dp);
src_is_directory = S_ISDIR(src_ip->i_d.di_mode);
- if (src_is_directory) {
- /*
- * Check for link count overflow on target_dp
- */
- if (target_ip == NULL && new_parent &&
- target_dp->i_d.di_nlink >= XFS_MAXLINK) {
- error = XFS_ERROR(EMLINK);
- goto std_return;
- }
- }
-
xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip,
inodes, &num_inodes);
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 87323f1ded6..ca4f31534a0 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -183,6 +183,7 @@ error_cancel:
oblocks = map.br_startoff + map.br_blockcount;
}
return 0;
+
error:
return error;
}
@@ -2139,11 +2140,9 @@ xfs_rtfree_extent(
xfs_buf_t *sumbp; /* summary file block buffer */
mp = tp->t_mountp;
- /*
- * Synchronize by locking the bitmap inode.
- */
- xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL);
+
+ ASSERT(mp->m_rbmip->i_itemp != NULL);
+ ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
#if defined(__KERNEL__) && defined(DEBUG)
/*
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h
index cb6ae715814..f429d9d5d32 100644
--- a/fs/xfs/xfs_sb.h
+++ b/fs/xfs/xfs_sb.h
@@ -529,7 +529,6 @@ static inline int xfs_sb_version_hasprojid32bit(xfs_sb_t *sbp)
#define XFS_BB_TO_FSB(mp,bb) \
(((bb) + (XFS_FSB_TO_BB(mp,1) - 1)) >> (mp)->m_blkbb_log)
#define XFS_BB_TO_FSBT(mp,bb) ((bb) >> (mp)->m_blkbb_log)
-#define XFS_BB_FSB_OFFSET(mp,bb) ((bb) & ((mp)->m_bsize - 1))
/*
* File system block to byte conversions.
diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c
index 76fdc586193..ce372b7d564 100644
--- a/fs/xfs/xfs_stats.c
+++ b/fs/xfs/xfs_stats.c
@@ -20,9 +20,18 @@
DEFINE_PER_CPU(struct xfsstats, xfsstats);
+static int counter_val(int idx)
+{
+ int val = 0, cpu;
+
+ for_each_possible_cpu(cpu)
+ val += *(((__u32 *)&per_cpu(xfsstats, cpu) + idx));
+ return val;
+}
+
static int xfs_stat_proc_show(struct seq_file *m, void *v)
{
- int c, i, j, val;
+ int i, j;
__uint64_t xs_xstrat_bytes = 0;
__uint64_t xs_write_bytes = 0;
__uint64_t xs_read_bytes = 0;
@@ -50,20 +59,16 @@ static int xfs_stat_proc_show(struct seq_file *m, void *v)
{ "abtc2", XFSSTAT_END_ABTC_V2 },
{ "bmbt2", XFSSTAT_END_BMBT_V2 },
{ "ibt2", XFSSTAT_END_IBT_V2 },
+ /* we print both series of quota information together */
+ { "qm", XFSSTAT_END_QM },
};
/* Loop over all stats groups */
- for (i=j = 0; i < ARRAY_SIZE(xstats); i++) {
+ for (i = j = 0; i < ARRAY_SIZE(xstats); i++) {
seq_printf(m, "%s", xstats[i].desc);
/* inner loop does each group */
- while (j < xstats[i].endpoint) {
- val = 0;
- /* sum over all cpus */
- for_each_possible_cpu(c)
- val += *(((__u32*)&per_cpu(xfsstats, c) + j));
- seq_printf(m, " %u", val);
- j++;
- }
+ for (; j < xstats[i].endpoint; j++)
+ seq_printf(m, " %u", counter_val(j));
seq_putc(m, '\n');
}
/* extra precision counters */
@@ -97,6 +102,58 @@ static const struct file_operations xfs_stat_proc_fops = {
.release = single_release,
};
+/* legacy quota interfaces */
+#ifdef CONFIG_XFS_QUOTA
+static int xqm_proc_show(struct seq_file *m, void *v)
+{
+ /* maximum; incore; ratio free to inuse; freelist */
+ seq_printf(m, "%d\t%d\t%d\t%u\n",
+ 0,
+ counter_val(XFSSTAT_END_XQMSTAT),
+ 0,
+ counter_val(XFSSTAT_END_XQMSTAT + 1));
+ return 0;
+}
+
+static int xqm_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, xqm_proc_show, NULL);
+}
+
+static const struct file_operations xqm_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = xqm_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+/* legacy quota stats interface no 2 */
+static int xqmstat_proc_show(struct seq_file *m, void *v)
+{
+ int j;
+
+ seq_printf(m, "qm");
+ for (j = XFSSTAT_END_IBT_V2; j < XFSSTAT_END_XQMSTAT; j++)
+ seq_printf(m, " %u", counter_val(j));
+ seq_putc(m, '\n');
+ return 0;
+}
+
+static int xqmstat_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, xqmstat_proc_show, NULL);
+}
+
+static const struct file_operations xqmstat_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = xqmstat_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+#endif /* CONFIG_XFS_QUOTA */
+
int
xfs_init_procfs(void)
{
@@ -105,10 +162,24 @@ xfs_init_procfs(void)
if (!proc_create("fs/xfs/stat", 0, NULL,
&xfs_stat_proc_fops))
- goto out_remove_entry;
+ goto out_remove_xfs_dir;
+#ifdef CONFIG_XFS_QUOTA
+ if (!proc_create("fs/xfs/xqmstat", 0, NULL,
+ &xqmstat_proc_fops))
+ goto out_remove_stat_file;
+ if (!proc_create("fs/xfs/xqm", 0, NULL,
+ &xqm_proc_fops))
+ goto out_remove_xqmstat_file;
+#endif
return 0;
- out_remove_entry:
+#ifdef CONFIG_XFS_QUOTA
+ out_remove_xqmstat_file:
+ remove_proc_entry("fs/xfs/xqmstat", NULL);
+ out_remove_stat_file:
+ remove_proc_entry("fs/xfs/stat", NULL);
+#endif
+ out_remove_xfs_dir:
remove_proc_entry("fs/xfs", NULL);
out:
return -ENOMEM;
@@ -117,6 +188,10 @@ xfs_init_procfs(void)
void
xfs_cleanup_procfs(void)
{
+#ifdef CONFIG_XFS_QUOTA
+ remove_proc_entry("fs/xfs/xqm", NULL);
+ remove_proc_entry("fs/xfs/xqmstat", NULL);
+#endif
remove_proc_entry("fs/xfs/stat", NULL);
remove_proc_entry("fs/xfs", NULL);
}
diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h
index 736854b1ca1..c03ad38ceae 100644
--- a/fs/xfs/xfs_stats.h
+++ b/fs/xfs/xfs_stats.h
@@ -183,6 +183,16 @@ struct xfsstats {
__uint32_t xs_ibt_2_alloc;
__uint32_t xs_ibt_2_free;
__uint32_t xs_ibt_2_moves;
+#define XFSSTAT_END_XQMSTAT (XFSSTAT_END_IBT_V2+6)
+ __uint32_t xs_qm_dqreclaims;
+ __uint32_t xs_qm_dqreclaim_misses;
+ __uint32_t xs_qm_dquot_dups;
+ __uint32_t xs_qm_dqcachemisses;
+ __uint32_t xs_qm_dqcachehits;
+ __uint32_t xs_qm_dqwants;
+#define XFSSTAT_END_QM (XFSSTAT_END_XQMSTAT+2)
+ __uint32_t xs_qm_dquot;
+ __uint32_t xs_qm_dquot_unused;
/* Extra precision counters */
__uint64_t xs_xstrat_bytes;
__uint64_t xs_write_bytes;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 281961c1d81..dab9a5f6dfd 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -324,10 +324,9 @@ xfs_parseargs(
} else if (!strcmp(this_char, MNTOPT_FILESTREAM)) {
mp->m_flags |= XFS_MOUNT_FILESTREAMS;
} else if (!strcmp(this_char, MNTOPT_NOQUOTA)) {
- mp->m_qflags &= ~(XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE |
- XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE |
- XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE |
- XFS_UQUOTA_ENFD | XFS_OQUOTA_ENFD);
+ mp->m_qflags &= ~XFS_ALL_QUOTA_ACCT;
+ mp->m_qflags &= ~XFS_ALL_QUOTA_ENFD;
+ mp->m_qflags &= ~XFS_ALL_QUOTA_ACTIVE;
} else if (!strcmp(this_char, MNTOPT_QUOTA) ||
!strcmp(this_char, MNTOPT_UQUOTA) ||
!strcmp(this_char, MNTOPT_USRQUOTA)) {
@@ -760,6 +759,36 @@ xfs_setup_devices(
return 0;
}
+STATIC int
+xfs_init_mount_workqueues(
+ struct xfs_mount *mp)
+{
+ mp->m_data_workqueue = alloc_workqueue("xfs-data/%s",
+ WQ_MEM_RECLAIM, 0, mp->m_fsname);
+ if (!mp->m_data_workqueue)
+ goto out;
+
+ mp->m_unwritten_workqueue = alloc_workqueue("xfs-conv/%s",
+ WQ_MEM_RECLAIM, 0, mp->m_fsname);
+ if (!mp->m_unwritten_workqueue)
+ goto out_destroy_data_iodone_queue;
+
+ return 0;
+
+out_destroy_data_iodone_queue:
+ destroy_workqueue(mp->m_data_workqueue);
+out:
+ return -ENOMEM;
+}
+
+STATIC void
+xfs_destroy_mount_workqueues(
+ struct xfs_mount *mp)
+{
+ destroy_workqueue(mp->m_data_workqueue);
+ destroy_workqueue(mp->m_unwritten_workqueue);
+}
+
/* Catch misguided souls that try to use this interface on XFS */
STATIC struct inode *
xfs_fs_alloc_inode(
@@ -828,105 +857,64 @@ xfs_fs_inode_init_once(
/* xfs inode */
atomic_set(&ip->i_pincount, 0);
spin_lock_init(&ip->i_flags_lock);
- init_waitqueue_head(&ip->i_ipin_wait);
- /*
- * Because we want to use a counting completion, complete
- * the flush completion once to allow a single access to
- * the flush completion without blocking.
- */
- init_completion(&ip->i_flush);
- complete(&ip->i_flush);
mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
"xfsino", ip->i_ino);
}
/*
- * Dirty the XFS inode when mark_inode_dirty_sync() is called so that
- * we catch unlogged VFS level updates to the inode.
+ * This is called by the VFS when dirtying inode metadata. This can happen
+ * for a few reasons, but we only care about timestamp updates, given that
+ * we handled the rest ourselves. In theory no other calls should happen,
+ * but for example generic_write_end() keeps dirtying the inode after
+ * updating i_size. Thus we check that the flags are exactly I_DIRTY_SYNC,
+ * and skip this call otherwise.
*
- * We need the barrier() to maintain correct ordering between unlogged
- * updates and the transaction commit code that clears the i_update_core
- * field. This requires all updates to be completed before marking the
- * inode dirty.
+ * We'll hopefull get a different method just for updating timestamps soon,
+ * at which point this hack can go away, and maybe we'll also get real
+ * error handling here.
*/
STATIC void
xfs_fs_dirty_inode(
- struct inode *inode,
- int flags)
-{
- barrier();
- XFS_I(inode)->i_update_core = 1;
-}
-
-STATIC int
-xfs_fs_write_inode(
struct inode *inode,
- struct writeback_control *wbc)
+ int flags)
{
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
- int error = EAGAIN;
-
- trace_xfs_write_inode(ip);
-
- if (XFS_FORCED_SHUTDOWN(mp))
- return -XFS_ERROR(EIO);
-
- if (wbc->sync_mode == WB_SYNC_ALL || wbc->for_kupdate) {
- /*
- * Make sure the inode has made it it into the log. Instead
- * of forcing it all the way to stable storage using a
- * synchronous transaction we let the log force inside the
- * ->sync_fs call do that for thus, which reduces the number
- * of synchronous log forces dramatically.
- */
- error = xfs_log_dirty_inode(ip, NULL, 0);
- if (error)
- goto out;
- return 0;
- } else {
- if (!ip->i_update_core)
- return 0;
+ struct xfs_trans *tp;
+ int error;
- /*
- * We make this non-blocking if the inode is contended, return
- * EAGAIN to indicate to the caller that they did not succeed.
- * This prevents the flush path from blocking on inodes inside
- * another operation right now, they get caught later by
- * xfs_sync.
- */
- if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
- goto out;
+ if (flags != I_DIRTY_SYNC)
+ return;
- if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip))
- goto out_unlock;
+ trace_xfs_dirty_inode(ip);
- /*
- * Now we have the flush lock and the inode is not pinned, we
- * can check if the inode is really clean as we know that
- * there are no pending transaction completions, it is not
- * waiting on the delayed write queue and there is no IO in
- * progress.
- */
- if (xfs_inode_clean(ip)) {
- xfs_ifunlock(ip);
- error = 0;
- goto out_unlock;
- }
- error = xfs_iflush(ip, SYNC_TRYLOCK);
+ tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
+ error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
+ if (error) {
+ xfs_trans_cancel(tp, 0);
+ goto trouble;
}
-
- out_unlock:
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
- out:
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
/*
- * if we failed to write out the inode then mark
- * it dirty again so we'll try again later.
+ * Grab all the latest timestamps from the Linux inode.
*/
+ ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec;
+ ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec;
+ ip->i_d.di_ctime.t_sec = (__int32_t)inode->i_ctime.tv_sec;
+ ip->i_d.di_ctime.t_nsec = (__int32_t)inode->i_ctime.tv_nsec;
+ ip->i_d.di_mtime.t_sec = (__int32_t)inode->i_mtime.tv_sec;
+ ip->i_d.di_mtime.t_nsec = (__int32_t)inode->i_mtime.tv_nsec;
+
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_TIMESTAMP);
+ error = xfs_trans_commit(tp, 0);
if (error)
- xfs_mark_inode_dirty_sync(ip);
- return -error;
+ goto trouble;
+ return;
+
+trouble:
+ xfs_warn(mp, "failed to update timestamps for inode 0x%llx", ip->i_ino);
}
STATIC void
@@ -962,6 +950,22 @@ xfs_fs_evict_inode(
xfs_inactive(ip);
}
+/*
+ * We do an unlocked check for XFS_IDONTCACHE here because we are already
+ * serialised against cache hits here via the inode->i_lock and igrab() in
+ * xfs_iget_cache_hit(). Hence a lookup that might clear this flag will not be
+ * racing with us, and it avoids needing to grab a spinlock here for every inode
+ * we drop the final reference on.
+ */
+STATIC int
+xfs_fs_drop_inode(
+ struct inode *inode)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+
+ return generic_drop_inode(inode) || (ip->i_flags & XFS_IDONTCACHE);
+}
+
STATIC void
xfs_free_fsname(
struct xfs_mount *mp)
@@ -991,6 +995,7 @@ xfs_fs_put_super(
xfs_unmountfs(mp);
xfs_freesb(mp);
xfs_icsb_destroy_counters(mp);
+ xfs_destroy_mount_workqueues(mp);
xfs_close_devices(mp);
xfs_free_fsname(mp);
kfree(mp);
@@ -1317,10 +1322,14 @@ xfs_fs_fill_super(
if (error)
goto out_free_fsname;
- error = xfs_icsb_init_counters(mp);
+ error = xfs_init_mount_workqueues(mp);
if (error)
goto out_close_devices;
+ error = xfs_icsb_init_counters(mp);
+ if (error)
+ goto out_destroy_workqueues;
+
error = xfs_readsb(mp, flags);
if (error)
goto out_destroy_counters;
@@ -1349,6 +1358,7 @@ xfs_fs_fill_super(
sb->s_blocksize = mp->m_sb.sb_blocksize;
sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1;
sb->s_maxbytes = xfs_max_file_offset(sb->s_blocksize_bits);
+ sb->s_max_links = XFS_MAXLINK;
sb->s_time_gran = 1;
set_posix_acl_flag(sb);
@@ -1369,10 +1379,10 @@ xfs_fs_fill_super(
error = EINVAL;
goto out_syncd_stop;
}
- sb->s_root = d_alloc_root(root);
+ sb->s_root = d_make_root(root);
if (!sb->s_root) {
error = ENOMEM;
- goto out_iput;
+ goto out_syncd_stop;
}
return 0;
@@ -1383,6 +1393,8 @@ xfs_fs_fill_super(
xfs_freesb(mp);
out_destroy_counters:
xfs_icsb_destroy_counters(mp);
+out_destroy_workqueues:
+ xfs_destroy_mount_workqueues(mp);
out_close_devices:
xfs_close_devices(mp);
out_free_fsname:
@@ -1391,8 +1403,6 @@ xfs_fs_fill_super(
out:
return -error;
- out_iput:
- iput(root);
out_syncd_stop:
xfs_syncd_stop(mp);
out_unmount:
@@ -1438,8 +1448,8 @@ static const struct super_operations xfs_super_operations = {
.alloc_inode = xfs_fs_alloc_inode,
.destroy_inode = xfs_fs_destroy_inode,
.dirty_inode = xfs_fs_dirty_inode,
- .write_inode = xfs_fs_write_inode,
.evict_inode = xfs_fs_evict_inode,
+ .drop_inode = xfs_fs_drop_inode,
.put_super = xfs_fs_put_super,
.sync_fs = xfs_fs_sync_fs,
.freeze_fs = xfs_fs_freeze,
@@ -1613,12 +1623,28 @@ xfs_init_workqueues(void)
xfs_syncd_wq = alloc_workqueue("xfssyncd", WQ_NON_REENTRANT, 0);
if (!xfs_syncd_wq)
return -ENOMEM;
+
+ /*
+ * The allocation workqueue can be used in memory reclaim situations
+ * (writepage path), and parallelism is only limited by the number of
+ * AGs in all the filesystems mounted. Hence use the default large
+ * max_active value for this workqueue.
+ */
+ xfs_alloc_wq = alloc_workqueue("xfsalloc", WQ_MEM_RECLAIM, 0);
+ if (!xfs_alloc_wq)
+ goto out_destroy_syncd;
+
return 0;
+
+out_destroy_syncd:
+ destroy_workqueue(xfs_syncd_wq);
+ return -ENOMEM;
}
STATIC void
xfs_destroy_workqueues(void)
{
+ destroy_workqueue(xfs_alloc_wq);
destroy_workqueue(xfs_syncd_wq);
}
@@ -1660,13 +1686,17 @@ init_xfs_fs(void)
if (error)
goto out_cleanup_procfs;
- vfs_initquota();
+ error = xfs_qm_init();
+ if (error)
+ goto out_sysctl_unregister;
error = register_filesystem(&xfs_fs_type);
if (error)
- goto out_sysctl_unregister;
+ goto out_qm_exit;
return 0;
+ out_qm_exit:
+ xfs_qm_exit();
out_sysctl_unregister:
xfs_sysctl_unregister();
out_cleanup_procfs:
@@ -1688,7 +1718,7 @@ init_xfs_fs(void)
STATIC void __exit
exit_xfs_fs(void)
{
- vfs_exitquota();
+ xfs_qm_exit();
unregister_filesystem(&xfs_fs_type);
xfs_sysctl_unregister();
xfs_cleanup_procfs();
diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h
index 50a3266c999..09b0c26b224 100644
--- a/fs/xfs/xfs_super.h
+++ b/fs/xfs/xfs_super.h
@@ -21,13 +21,11 @@
#include <linux/exportfs.h>
#ifdef CONFIG_XFS_QUOTA
-extern void xfs_qm_init(void);
+extern int xfs_qm_init(void);
extern void xfs_qm_exit(void);
-# define vfs_initquota() xfs_qm_init()
-# define vfs_exitquota() xfs_qm_exit()
#else
-# define vfs_initquota() do { } while (0)
-# define vfs_exitquota() do { } while (0)
+# define xfs_qm_init() (0)
+# define xfs_qm_exit() do { } while (0)
#endif
#ifdef CONFIG_XFS_POSIX_ACL
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c
index 72c01a1c16e..205ebcb34d9 100644
--- a/fs/xfs/xfs_sync.c
+++ b/fs/xfs/xfs_sync.c
@@ -336,32 +336,6 @@ xfs_sync_fsdata(
return error;
}
-int
-xfs_log_dirty_inode(
- struct xfs_inode *ip,
- struct xfs_perag *pag,
- int flags)
-{
- struct xfs_mount *mp = ip->i_mount;
- struct xfs_trans *tp;
- int error;
-
- if (!ip->i_update_core)
- return 0;
-
- tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
- error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
- if (error) {
- xfs_trans_cancel(tp, 0);
- return error;
- }
-
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- return xfs_trans_commit(tp, 0);
-}
-
/*
* When remounting a filesystem read-only or freezing the filesystem, we have
* two phases to execute. This first phase is syncing the data before we
@@ -385,16 +359,6 @@ xfs_quiesce_data(
{
int error, error2 = 0;
- /*
- * Log all pending size and timestamp updates. The vfs writeback
- * code is supposed to do this, but due to its overagressive
- * livelock detection it will skip inodes where appending writes
- * were written out in the first non-blocking sync phase if their
- * completion took long enough that it happened after taking the
- * timestamp for the cut-off in the blocking phase.
- */
- xfs_inode_ag_iterator(mp, xfs_log_dirty_inode, 0);
-
/* force out the log */
xfs_log_force(mp, XFS_LOG_SYNC);
@@ -707,14 +671,13 @@ xfs_reclaim_inode_grab(
return 1;
/*
- * do some unlocked checks first to avoid unnecessary lock traffic.
- * The first is a flush lock check, the second is a already in reclaim
- * check. Only do these checks if we are not going to block on locks.
+ * If we are asked for non-blocking operation, do unlocked checks to
+ * see if the inode already is being flushed or in reclaim to avoid
+ * lock traffic.
*/
if ((flags & SYNC_TRYLOCK) &&
- (!ip->i_flush.done || __xfs_iflags_test(ip, XFS_IRECLAIM))) {
+ __xfs_iflags_test(ip, XFS_IFLOCK | XFS_IRECLAIM))
return 1;
- }
/*
* The radix tree lock here protects a thread in xfs_iget from racing
@@ -914,17 +877,15 @@ reclaim:
* can reference the inodes in the cache without taking references.
*
* We make that OK here by ensuring that we wait until the inode is
- * unlocked after the lookup before we go ahead and free it. We get
- * both the ilock and the iolock because the code may need to drop the
- * ilock one but will still hold the iolock.
+ * unlocked after the lookup before we go ahead and free it.
*/
- xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_qm_dqdetach(ip);
- xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
xfs_inode_free(ip);
- return error;
+ return error;
}
/*
diff --git a/fs/xfs/xfs_sync.h b/fs/xfs/xfs_sync.h
index fa965479d78..941202e7ac6 100644
--- a/fs/xfs/xfs_sync.h
+++ b/fs/xfs/xfs_sync.h
@@ -34,8 +34,6 @@ void xfs_quiesce_attr(struct xfs_mount *mp);
void xfs_flush_inodes(struct xfs_inode *ip);
-int xfs_log_dirty_inode(struct xfs_inode *ip, struct xfs_perag *pag, int flags);
-
int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
int xfs_reclaim_inodes_count(struct xfs_mount *mp);
void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan);
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index a9d5b1e06ef..06838c42b2a 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -580,7 +580,7 @@ DEFINE_INODE_EVENT(xfs_ioctl_setattr);
DEFINE_INODE_EVENT(xfs_dir_fsync);
DEFINE_INODE_EVENT(xfs_file_fsync);
DEFINE_INODE_EVENT(xfs_destroy_inode);
-DEFINE_INODE_EVENT(xfs_write_inode);
+DEFINE_INODE_EVENT(xfs_dirty_inode);
DEFINE_INODE_EVENT(xfs_evict_inode);
DEFINE_INODE_EVENT(xfs_dquot_dqalloc);
@@ -627,16 +627,19 @@ DECLARE_EVENT_CLASS(xfs_namespace_class,
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_ino_t, dp_ino)
+ __field(int, namelen)
__dynamic_array(char, name, name->len)
),
TP_fast_assign(
__entry->dev = VFS_I(dp)->i_sb->s_dev;
__entry->dp_ino = dp->i_ino;
+ __entry->namelen = name->len;
memcpy(__get_str(name), name->name, name->len);
),
- TP_printk("dev %d:%d dp ino 0x%llx name %s",
+ TP_printk("dev %d:%d dp ino 0x%llx name %.*s",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->dp_ino,
+ __entry->namelen,
__get_str(name))
)
@@ -658,6 +661,8 @@ TRACE_EVENT(xfs_rename,
__field(dev_t, dev)
__field(xfs_ino_t, src_dp_ino)
__field(xfs_ino_t, target_dp_ino)
+ __field(int, src_namelen)
+ __field(int, target_namelen)
__dynamic_array(char, src_name, src_name->len)
__dynamic_array(char, target_name, target_name->len)
),
@@ -665,15 +670,20 @@ TRACE_EVENT(xfs_rename,
__entry->dev = VFS_I(src_dp)->i_sb->s_dev;
__entry->src_dp_ino = src_dp->i_ino;
__entry->target_dp_ino = target_dp->i_ino;
+ __entry->src_namelen = src_name->len;
+ __entry->target_namelen = target_name->len;
memcpy(__get_str(src_name), src_name->name, src_name->len);
- memcpy(__get_str(target_name), target_name->name, target_name->len);
+ memcpy(__get_str(target_name), target_name->name,
+ target_name->len);
),
TP_printk("dev %d:%d src dp ino 0x%llx target dp ino 0x%llx"
- " src name %s target name %s",
+ " src name %.*s target name %.*s",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->src_dp_ino,
__entry->target_dp_ino,
+ __entry->src_namelen,
__get_str(src_name),
+ __entry->target_namelen,
__get_str(target_name))
)
@@ -733,19 +743,18 @@ DEFINE_EVENT(xfs_dquot_class, name, \
DEFINE_DQUOT_EVENT(xfs_dqadjust);
DEFINE_DQUOT_EVENT(xfs_dqreclaim_want);
DEFINE_DQUOT_EVENT(xfs_dqreclaim_dirty);
-DEFINE_DQUOT_EVENT(xfs_dqreclaim_unlink);
+DEFINE_DQUOT_EVENT(xfs_dqreclaim_busy);
+DEFINE_DQUOT_EVENT(xfs_dqreclaim_done);
DEFINE_DQUOT_EVENT(xfs_dqattach_found);
DEFINE_DQUOT_EVENT(xfs_dqattach_get);
-DEFINE_DQUOT_EVENT(xfs_dqinit);
-DEFINE_DQUOT_EVENT(xfs_dqreuse);
DEFINE_DQUOT_EVENT(xfs_dqalloc);
DEFINE_DQUOT_EVENT(xfs_dqtobp_read);
DEFINE_DQUOT_EVENT(xfs_dqread);
DEFINE_DQUOT_EVENT(xfs_dqread_fail);
-DEFINE_DQUOT_EVENT(xfs_dqlookup_found);
-DEFINE_DQUOT_EVENT(xfs_dqlookup_done);
DEFINE_DQUOT_EVENT(xfs_dqget_hit);
DEFINE_DQUOT_EVENT(xfs_dqget_miss);
+DEFINE_DQUOT_EVENT(xfs_dqget_freeing);
+DEFINE_DQUOT_EVENT(xfs_dqget_dup);
DEFINE_DQUOT_EVENT(xfs_dqput);
DEFINE_DQUOT_EVENT(xfs_dqput_wait);
DEFINE_DQUOT_EVENT(xfs_dqput_free);
@@ -783,12 +792,12 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
__entry->curr_res = tic->t_curr_res;
__entry->unit_res = tic->t_unit_res;
__entry->flags = tic->t_flags;
- __entry->reserveq = list_empty(&log->l_reserveq);
- __entry->writeq = list_empty(&log->l_writeq);
- xlog_crack_grant_head(&log->l_grant_reserve_head,
+ __entry->reserveq = list_empty(&log->l_reserve_head.waiters);
+ __entry->writeq = list_empty(&log->l_write_head.waiters);
+ xlog_crack_grant_head(&log->l_reserve_head.grant,
&__entry->grant_reserve_cycle,
&__entry->grant_reserve_bytes);
- xlog_crack_grant_head(&log->l_grant_write_head,
+ xlog_crack_grant_head(&log->l_write_head.grant,
&__entry->grant_write_cycle,
&__entry->grant_write_bytes);
__entry->curr_cycle = log->l_curr_cycle;
@@ -827,20 +836,14 @@ DEFINE_EVENT(xfs_loggrant_class, name, \
TP_ARGS(log, tic))
DEFINE_LOGGRANT_EVENT(xfs_log_done_nonperm);
DEFINE_LOGGRANT_EVENT(xfs_log_done_perm);
-DEFINE_LOGGRANT_EVENT(xfs_log_reserve);
DEFINE_LOGGRANT_EVENT(xfs_log_umount_write);
-DEFINE_LOGGRANT_EVENT(xfs_log_grant_enter);
-DEFINE_LOGGRANT_EVENT(xfs_log_grant_exit);
-DEFINE_LOGGRANT_EVENT(xfs_log_grant_error);
DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep);
DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake);
DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake_up);
-DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_enter);
-DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_exit);
-DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_error);
-DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep);
-DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake);
-DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake_up);
+DEFINE_LOGGRANT_EVENT(xfs_log_reserve);
+DEFINE_LOGGRANT_EVENT(xfs_log_reserve_exit);
+DEFINE_LOGGRANT_EVENT(xfs_log_regrant);
+DEFINE_LOGGRANT_EVENT(xfs_log_regrant_exit);
DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter);
DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit);
DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_sub);
@@ -891,7 +894,6 @@ DECLARE_EVENT_CLASS(xfs_file_class,
__field(dev_t, dev)
__field(xfs_ino_t, ino)
__field(xfs_fsize_t, size)
- __field(xfs_fsize_t, new_size)
__field(loff_t, offset)
__field(size_t, count)
__field(int, flags)
@@ -900,17 +902,15 @@ DECLARE_EVENT_CLASS(xfs_file_class,
__entry->dev = VFS_I(ip)->i_sb->s_dev;
__entry->ino = ip->i_ino;
__entry->size = ip->i_d.di_size;
- __entry->new_size = ip->i_new_size;
__entry->offset = offset;
__entry->count = count;
__entry->flags = flags;
),
- TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
+ TP_printk("dev %d:%d ino 0x%llx size 0x%llx "
"offset 0x%llx count 0x%zx ioflags %s",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->size,
- __entry->new_size,
__entry->offset,
__entry->count,
__print_flags(__entry->flags, "|", XFS_IO_FLAGS))
@@ -978,7 +978,6 @@ DECLARE_EVENT_CLASS(xfs_imap_class,
__field(dev_t, dev)
__field(xfs_ino_t, ino)
__field(loff_t, size)
- __field(loff_t, new_size)
__field(loff_t, offset)
__field(size_t, count)
__field(int, type)
@@ -990,7 +989,6 @@ DECLARE_EVENT_CLASS(xfs_imap_class,
__entry->dev = VFS_I(ip)->i_sb->s_dev;
__entry->ino = ip->i_ino;
__entry->size = ip->i_d.di_size;
- __entry->new_size = ip->i_new_size;
__entry->offset = offset;
__entry->count = count;
__entry->type = type;
@@ -998,13 +996,11 @@ DECLARE_EVENT_CLASS(xfs_imap_class,
__entry->startblock = irec ? irec->br_startblock : 0;
__entry->blockcount = irec ? irec->br_blockcount : 0;
),
- TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
- "offset 0x%llx count %zd type %s "
- "startoff 0x%llx startblock %lld blockcount 0x%llx",
+ TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count %zd "
+ "type %s startoff 0x%llx startblock %lld blockcount 0x%llx",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->size,
- __entry->new_size,
__entry->offset,
__entry->count,
__print_symbolic(__entry->type, XFS_IO_TYPES),
@@ -1031,26 +1027,23 @@ DECLARE_EVENT_CLASS(xfs_simple_io_class,
__field(xfs_ino_t, ino)
__field(loff_t, isize)
__field(loff_t, disize)
- __field(loff_t, new_size)
__field(loff_t, offset)
__field(size_t, count)
),
TP_fast_assign(
__entry->dev = VFS_I(ip)->i_sb->s_dev;
__entry->ino = ip->i_ino;
- __entry->isize = ip->i_size;
+ __entry->isize = VFS_I(ip)->i_size;
__entry->disize = ip->i_d.di_size;
- __entry->new_size = ip->i_new_size;
__entry->offset = offset;
__entry->count = count;
),
- TP_printk("dev %d:%d ino 0x%llx isize 0x%llx disize 0x%llx new_size 0x%llx "
+ TP_printk("dev %d:%d ino 0x%llx isize 0x%llx disize 0x%llx "
"offset 0x%llx count %zd",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->isize,
__entry->disize,
- __entry->new_size,
__entry->offset,
__entry->count)
);
@@ -1090,8 +1083,8 @@ DECLARE_EVENT_CLASS(xfs_itrunc_class,
DEFINE_EVENT(xfs_itrunc_class, name, \
TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size), \
TP_ARGS(ip, new_size))
-DEFINE_ITRUNC_EVENT(xfs_itruncate_data_start);
-DEFINE_ITRUNC_EVENT(xfs_itruncate_data_end);
+DEFINE_ITRUNC_EVENT(xfs_itruncate_extents_start);
+DEFINE_ITRUNC_EVENT(xfs_itruncate_extents_end);
TRACE_EVENT(xfs_pagecache_inval,
TP_PROTO(struct xfs_inode *ip, xfs_off_t start, xfs_off_t finish),
@@ -1425,7 +1418,7 @@ DEFINE_ALLOC_EVENT(xfs_alloc_vextent_noagbp);
DEFINE_ALLOC_EVENT(xfs_alloc_vextent_loopfailed);
DEFINE_ALLOC_EVENT(xfs_alloc_vextent_allfailed);
-DECLARE_EVENT_CLASS(xfs_dir2_class,
+DECLARE_EVENT_CLASS(xfs_da_class,
TP_PROTO(struct xfs_da_args *args),
TP_ARGS(args),
TP_STRUCT__entry(
@@ -1460,7 +1453,7 @@ DECLARE_EVENT_CLASS(xfs_dir2_class,
)
#define DEFINE_DIR2_EVENT(name) \
-DEFINE_EVENT(xfs_dir2_class, name, \
+DEFINE_EVENT(xfs_da_class, name, \
TP_PROTO(struct xfs_da_args *args), \
TP_ARGS(args))
DEFINE_DIR2_EVENT(xfs_dir2_sf_addname);
@@ -1489,6 +1482,64 @@ DEFINE_DIR2_EVENT(xfs_dir2_node_replace);
DEFINE_DIR2_EVENT(xfs_dir2_node_removename);
DEFINE_DIR2_EVENT(xfs_dir2_node_to_leaf);
+#define DEFINE_ATTR_EVENT(name) \
+DEFINE_EVENT(xfs_da_class, name, \
+ TP_PROTO(struct xfs_da_args *args), \
+ TP_ARGS(args))
+DEFINE_ATTR_EVENT(xfs_attr_sf_add);
+DEFINE_ATTR_EVENT(xfs_attr_sf_addname);
+DEFINE_ATTR_EVENT(xfs_attr_sf_create);
+DEFINE_ATTR_EVENT(xfs_attr_sf_lookup);
+DEFINE_ATTR_EVENT(xfs_attr_sf_remove);
+DEFINE_ATTR_EVENT(xfs_attr_sf_removename);
+DEFINE_ATTR_EVENT(xfs_attr_sf_to_leaf);
+
+DEFINE_ATTR_EVENT(xfs_attr_leaf_add);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_add_old);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_add_new);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_addname);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_create);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_lookup);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_replace);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_removename);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_split);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_split_before);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_split_after);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_clearflag);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_setflag);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_flipflags);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_to_sf);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_to_node);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_rebalance);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_unbalance);
+
+DEFINE_ATTR_EVENT(xfs_attr_node_addname);
+DEFINE_ATTR_EVENT(xfs_attr_node_lookup);
+DEFINE_ATTR_EVENT(xfs_attr_node_replace);
+DEFINE_ATTR_EVENT(xfs_attr_node_removename);
+
+#define DEFINE_DA_EVENT(name) \
+DEFINE_EVENT(xfs_da_class, name, \
+ TP_PROTO(struct xfs_da_args *args), \
+ TP_ARGS(args))
+DEFINE_DA_EVENT(xfs_da_split);
+DEFINE_DA_EVENT(xfs_da_join);
+DEFINE_DA_EVENT(xfs_da_link_before);
+DEFINE_DA_EVENT(xfs_da_link_after);
+DEFINE_DA_EVENT(xfs_da_unlink_back);
+DEFINE_DA_EVENT(xfs_da_unlink_forward);
+DEFINE_DA_EVENT(xfs_da_root_split);
+DEFINE_DA_EVENT(xfs_da_root_join);
+DEFINE_DA_EVENT(xfs_da_node_add);
+DEFINE_DA_EVENT(xfs_da_node_create);
+DEFINE_DA_EVENT(xfs_da_node_split);
+DEFINE_DA_EVENT(xfs_da_node_remove);
+DEFINE_DA_EVENT(xfs_da_node_rebalance);
+DEFINE_DA_EVENT(xfs_da_node_unbalance);
+DEFINE_DA_EVENT(xfs_da_swap_lastblock);
+DEFINE_DA_EVENT(xfs_da_grow_inode);
+DEFINE_DA_EVENT(xfs_da_shrink_inode);
+
DECLARE_EVENT_CLASS(xfs_dir2_space_class,
TP_PROTO(struct xfs_da_args *args, int idx),
TP_ARGS(args, idx),
@@ -1568,7 +1619,6 @@ DECLARE_EVENT_CLASS(xfs_swap_extent_class,
__field(xfs_ino_t, ino)
__field(int, format)
__field(int, nex)
- __field(int, max_nex)
__field(int, broot_size)
__field(int, fork_off)
),
@@ -1578,18 +1628,16 @@ DECLARE_EVENT_CLASS(xfs_swap_extent_class,
__entry->ino = ip->i_ino;
__entry->format = ip->i_d.di_format;
__entry->nex = ip->i_d.di_nextents;
- __entry->max_nex = ip->i_df.if_ext_max;
__entry->broot_size = ip->i_df.if_broot_bytes;
__entry->fork_off = XFS_IFORK_BOFF(ip);
),
TP_printk("dev %d:%d ino 0x%llx (%s), %s format, num_extents %d, "
- "Max in-fork extents %d, broot size %d, fork offset %d",
+ "broot size %d, fork offset %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__print_symbolic(__entry->which, XFS_SWAPEXT_INODES),
__print_symbolic(__entry->format, XFS_INODE_FORMAT_STR),
__entry->nex,
- __entry->max_nex,
__entry->broot_size,
__entry->fork_off)
)
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 329b06aba1c..103b00c9000 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -681,7 +681,6 @@ xfs_trans_reserve(
uint flags,
uint logcount)
{
- int log_flags;
int error = 0;
int rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
@@ -707,24 +706,32 @@ xfs_trans_reserve(
* Reserve the log space needed for this transaction.
*/
if (logspace > 0) {
- ASSERT((tp->t_log_res == 0) || (tp->t_log_res == logspace));
- ASSERT((tp->t_log_count == 0) ||
- (tp->t_log_count == logcount));
+ bool permanent = false;
+
+ ASSERT(tp->t_log_res == 0 || tp->t_log_res == logspace);
+ ASSERT(tp->t_log_count == 0 || tp->t_log_count == logcount);
+
if (flags & XFS_TRANS_PERM_LOG_RES) {
- log_flags = XFS_LOG_PERM_RESERV;
tp->t_flags |= XFS_TRANS_PERM_LOG_RES;
+ permanent = true;
} else {
ASSERT(tp->t_ticket == NULL);
ASSERT(!(tp->t_flags & XFS_TRANS_PERM_LOG_RES));
- log_flags = 0;
}
- error = xfs_log_reserve(tp->t_mountp, logspace, logcount,
- &tp->t_ticket,
- XFS_TRANSACTION, log_flags, tp->t_type);
- if (error) {
- goto undo_blocks;
+ if (tp->t_ticket != NULL) {
+ ASSERT(flags & XFS_TRANS_PERM_LOG_RES);
+ error = xfs_log_regrant(tp->t_mountp, tp->t_ticket);
+ } else {
+ error = xfs_log_reserve(tp->t_mountp, logspace,
+ logcount, &tp->t_ticket,
+ XFS_TRANSACTION, permanent,
+ tp->t_type);
}
+
+ if (error)
+ goto undo_blocks;
+
tp->t_log_res = logspace;
tp->t_log_count = logcount;
}
@@ -752,6 +759,8 @@ xfs_trans_reserve(
*/
undo_log:
if (logspace > 0) {
+ int log_flags;
+
if (flags & XFS_TRANS_PERM_LOG_RES) {
log_flags = XFS_LOG_REL_PERM_RESERV;
} else {
@@ -1151,8 +1160,8 @@ xfs_trans_add_item(
{
struct xfs_log_item_desc *lidp;
- ASSERT(lip->li_mountp = tp->t_mountp);
- ASSERT(lip->li_ailp = tp->t_mountp->m_ail);
+ ASSERT(lip->li_mountp == tp->t_mountp);
+ ASSERT(lip->li_ailp == tp->t_mountp->m_ail);
lidp = kmem_zone_zalloc(xfs_log_item_desc_zone, KM_SLEEP | KM_NOFS);
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index ed9252bcdac..1dead07f092 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -611,50 +611,6 @@ xfs_ail_push_all(
}
/*
- * This is to be called when an item is unlocked that may have
- * been in the AIL. It will wake up the first member of the AIL
- * wait list if this item's unlocking might allow it to progress.
- * If the item is in the AIL, then we need to get the AIL lock
- * while doing our checking so we don't race with someone going
- * to sleep waiting for this event in xfs_trans_push_ail().
- */
-void
-xfs_trans_unlocked_item(
- struct xfs_ail *ailp,
- xfs_log_item_t *lip)
-{
- xfs_log_item_t *min_lip;
-
- /*
- * If we're forcibly shutting down, we may have
- * unlocked log items arbitrarily. The last thing
- * we want to do is to move the tail of the log
- * over some potentially valid data.
- */
- if (!(lip->li_flags & XFS_LI_IN_AIL) ||
- XFS_FORCED_SHUTDOWN(ailp->xa_mount)) {
- return;
- }
-
- /*
- * This is the one case where we can call into xfs_ail_min()
- * without holding the AIL lock because we only care about the
- * case where we are at the tail of the AIL. If the object isn't
- * at the tail, it doesn't matter what result we get back. This
- * is slightly racy because since we were just unlocked, we could
- * go to sleep between the call to xfs_ail_min and the call to
- * xfs_log_move_tail, have someone else lock us, commit to us disk,
- * move us out of the tail of the AIL, and then we wake up. However,
- * the call to xfs_log_move_tail() doesn't do anything if there's
- * not enough free space to wake people up so we're safe calling it.
- */
- min_lip = xfs_ail_min(ailp);
-
- if (min_lip == lip)
- xfs_log_move_tail(ailp->xa_mount, 1);
-} /* xfs_trans_unlocked_item */
-
-/*
* xfs_trans_ail_update - bulk AIL insertion operation.
*
* @xfs_trans_ail_update takes an array of log items that all need to be
@@ -685,7 +641,6 @@ xfs_trans_ail_update_bulk(
xfs_lsn_t lsn) __releases(ailp->xa_lock)
{
xfs_log_item_t *mlip;
- xfs_lsn_t tail_lsn;
int mlip_changed = 0;
int i;
LIST_HEAD(tmp);
@@ -712,22 +667,12 @@ xfs_trans_ail_update_bulk(
if (!list_empty(&tmp))
xfs_ail_splice(ailp, cur, &tmp, lsn);
+ spin_unlock(&ailp->xa_lock);
- if (!mlip_changed) {
- spin_unlock(&ailp->xa_lock);
- return;
+ if (mlip_changed && !XFS_FORCED_SHUTDOWN(ailp->xa_mount)) {
+ xlog_assign_tail_lsn(ailp->xa_mount);
+ xfs_log_space_wake(ailp->xa_mount);
}
-
- /*
- * It is not safe to access mlip after the AIL lock is dropped, so we
- * must get a copy of li_lsn before we do so. This is especially
- * important on 32-bit platforms where accessing and updating 64-bit
- * values like li_lsn is not atomic.
- */
- mlip = xfs_ail_min(ailp);
- tail_lsn = mlip->li_lsn;
- spin_unlock(&ailp->xa_lock);
- xfs_log_move_tail(ailp->xa_mount, tail_lsn);
}
/*
@@ -758,7 +703,6 @@ xfs_trans_ail_delete_bulk(
int nr_items) __releases(ailp->xa_lock)
{
xfs_log_item_t *mlip;
- xfs_lsn_t tail_lsn;
int mlip_changed = 0;
int i;
@@ -785,23 +729,12 @@ xfs_trans_ail_delete_bulk(
if (mlip == lip)
mlip_changed = 1;
}
+ spin_unlock(&ailp->xa_lock);
- if (!mlip_changed) {
- spin_unlock(&ailp->xa_lock);
- return;
+ if (mlip_changed && !XFS_FORCED_SHUTDOWN(ailp->xa_mount)) {
+ xlog_assign_tail_lsn(ailp->xa_mount);
+ xfs_log_space_wake(ailp->xa_mount);
}
-
- /*
- * It is not safe to access mlip after the AIL lock is dropped, so we
- * must get a copy of li_lsn before we do so. This is especially
- * important on 32-bit platforms where accessing and updating 64-bit
- * values like li_lsn is not atomic. It is possible we've emptied the
- * AIL here, so if that is the case, pass an LSN of 0 to the tail move.
- */
- mlip = xfs_ail_min(ailp);
- tail_lsn = mlip ? mlip->li_lsn : 0;
- spin_unlock(&ailp->xa_lock);
- xfs_log_move_tail(ailp->xa_mount, tail_lsn);
}
/*
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 475a4ded4f4..1302d1d95a5 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -463,19 +463,7 @@ xfs_trans_brelse(xfs_trans_t *tp,
* Default to a normal brelse() call if the tp is NULL.
*/
if (tp == NULL) {
- struct xfs_log_item *lip = bp->b_fspriv;
-
ASSERT(bp->b_transp == NULL);
-
- /*
- * If there's a buf log item attached to the buffer,
- * then let the AIL know that the buffer is being
- * unlocked.
- */
- if (lip != NULL && lip->li_type == XFS_LI_BUF) {
- bip = bp->b_fspriv;
- xfs_trans_unlocked_item(bip->bli_item.li_ailp, lip);
- }
xfs_buf_relse(bp);
return;
}
@@ -550,21 +538,10 @@ xfs_trans_brelse(xfs_trans_t *tp,
ASSERT(!(bip->bli_item.li_flags & XFS_LI_IN_AIL));
ASSERT(!(bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF));
xfs_buf_item_relse(bp);
- bip = NULL;
- }
- bp->b_transp = NULL;
-
- /*
- * If we've still got a buf log item on the buffer, then
- * tell the AIL that the buffer is being unlocked.
- */
- if (bip != NULL) {
- xfs_trans_unlocked_item(bip->bli_item.li_ailp,
- (xfs_log_item_t*)bip);
}
+ bp->b_transp = NULL;
xfs_buf_relse(bp);
- return;
}
/*
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index 4d00ee67792..279099717ed 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -605,7 +605,7 @@ xfs_trans_dqresv(
time_t timer;
xfs_qwarncnt_t warns;
xfs_qwarncnt_t warnlimit;
- xfs_qcnt_t count;
+ xfs_qcnt_t total_count;
xfs_qcnt_t *resbcountp;
xfs_quotainfo_t *q = mp->m_quotainfo;
@@ -648,13 +648,12 @@ xfs_trans_dqresv(
* hardlimit or exceed the timelimit if we allocate
* nblks.
*/
- if (hardlimit > 0ULL &&
- hardlimit <= nblks + *resbcountp) {
+ total_count = *resbcountp + nblks;
+ if (hardlimit && total_count > hardlimit) {
xfs_quota_warn(mp, dqp, QUOTA_NL_BHARDWARN);
goto error_return;
}
- if (softlimit > 0ULL &&
- softlimit <= nblks + *resbcountp) {
+ if (softlimit && total_count > softlimit) {
if ((timer != 0 && get_seconds() > timer) ||
(warns != 0 && warns >= warnlimit)) {
xfs_quota_warn(mp, dqp,
@@ -666,7 +665,7 @@ xfs_trans_dqresv(
}
}
if (ninos > 0) {
- count = be64_to_cpu(dqp->q_core.d_icount);
+ total_count = be64_to_cpu(dqp->q_core.d_icount) + ninos;
timer = be32_to_cpu(dqp->q_core.d_itimer);
warns = be16_to_cpu(dqp->q_core.d_iwarns);
warnlimit = dqp->q_mount->m_quotainfo->qi_iwarnlimit;
@@ -677,11 +676,11 @@ xfs_trans_dqresv(
if (!softlimit)
softlimit = q->qi_isoftlimit;
- if (hardlimit > 0ULL && count >= hardlimit) {
+ if (hardlimit && total_count > hardlimit) {
xfs_quota_warn(mp, dqp, QUOTA_NL_IHARDWARN);
goto error_return;
}
- if (softlimit > 0ULL && count >= softlimit) {
+ if (softlimit && total_count > softlimit) {
if ((timer != 0 && get_seconds() > timer) ||
(warns != 0 && warns >= warnlimit)) {
xfs_quota_warn(mp, dqp,
@@ -876,7 +875,7 @@ STATIC void
xfs_trans_alloc_dqinfo(
xfs_trans_t *tp)
{
- tp->t_dqinfo = kmem_zone_zalloc(xfs_Gqm->qm_dqtrxzone, KM_SLEEP);
+ tp->t_dqinfo = kmem_zone_zalloc(xfs_qm_dqtrxzone, KM_SLEEP);
}
void
@@ -885,6 +884,6 @@ xfs_trans_free_dqinfo(
{
if (!tp->t_dqinfo)
return;
- kmem_zone_free(xfs_Gqm->qm_dqtrxzone, tp->t_dqinfo);
+ kmem_zone_free(xfs_qm_dqtrxzone, tp->t_dqinfo);
tp->t_dqinfo = NULL;
}
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index 32f0288ae10..7a7442c03f2 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -95,10 +95,14 @@ xfs_trans_ichgtime(
if ((flags & XFS_ICHGTIME_MOD) &&
!timespec_equal(&inode->i_mtime, &tv)) {
inode->i_mtime = tv;
+ ip->i_d.di_mtime.t_sec = tv.tv_sec;
+ ip->i_d.di_mtime.t_nsec = tv.tv_nsec;
}
if ((flags & XFS_ICHGTIME_CHG) &&
!timespec_equal(&inode->i_ctime, &tv)) {
inode->i_ctime = tv;
+ ip->i_d.di_ctime.t_sec = tv.tv_sec;
+ ip->i_d.di_ctime.t_nsec = tv.tv_nsec;
}
}
@@ -126,12 +130,12 @@ xfs_trans_log_inode(
/*
* Always OR in the bits from the ili_last_fields field.
* This is to coordinate with the xfs_iflush() and xfs_iflush_done()
- * routines in the eventual clearing of the ilf_fields bits.
+ * routines in the eventual clearing of the ili_fields bits.
* See the big comment in xfs_iflush() for an explanation of
* this coordination mechanism.
*/
flags |= ip->i_itemp->ili_last_fields;
- ip->i_itemp->ili_format.ilf_fields |= flags;
+ ip->i_itemp->ili_fields |= flags;
}
#ifdef XFS_TRANS_DEBUG
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 44820b9fcb4..8ab2ced415f 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -104,9 +104,6 @@ void xfs_ail_push(struct xfs_ail *, xfs_lsn_t);
void xfs_ail_push_all(struct xfs_ail *);
xfs_lsn_t xfs_ail_min_lsn(struct xfs_ail *ailp);
-void xfs_trans_unlocked_item(struct xfs_ail *,
- xfs_log_item_t *);
-
struct xfs_log_item * xfs_trans_ail_cursor_first(struct xfs_ail *ailp,
struct xfs_ail_cursor *cur,
xfs_lsn_t lsn);
diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c
index 89dbb4a5087..79c05ac85bf 100644
--- a/fs/xfs/xfs_utils.c
+++ b/fs/xfs/xfs_utils.c
@@ -296,8 +296,6 @@ xfs_bumplink(
xfs_trans_t *tp,
xfs_inode_t *ip)
{
- if (ip->i_d.di_nlink >= XFS_MAXLINK)
- return XFS_ERROR(EMLINK);
xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
ASSERT(ip->i_d.di_nlink > 0);
diff --git a/fs/xfs/xfs_vnode.h b/fs/xfs/xfs_vnode.h
index 7c220b4227b..db14d0c0868 100644
--- a/fs/xfs/xfs_vnode.h
+++ b/fs/xfs/xfs_vnode.h
@@ -22,7 +22,6 @@
struct file;
struct xfs_inode;
-struct xfs_iomap;
struct attrlist_cursor_kern;
/*
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index f2fea868d4d..64981d7e737 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -131,7 +131,8 @@ xfs_readlink(
__func__, (unsigned long long) ip->i_ino,
(long long) pathlen);
ASSERT(0);
- return XFS_ERROR(EFSCORRUPTED);
+ error = XFS_ERROR(EFSCORRUPTED);
+ goto out;
}
@@ -175,7 +176,7 @@ xfs_free_eofblocks(
* Figure out if there are any blocks beyond the end
* of the file. If not, then there is nothing to do.
*/
- end_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)ip->i_size));
+ end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip));
last_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
if (last_fsb <= end_fsb)
return 0;
@@ -226,7 +227,14 @@ xfs_free_eofblocks(
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);
- error = xfs_itruncate_data(&tp, ip, ip->i_size);
+ /*
+ * Do not update the on-disk file size. If we update the
+ * on-disk file size and then the system crashes before the
+ * contents of the file are flushed to disk then the files
+ * may be full of holes (ie NULL files bug).
+ */
+ error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK,
+ XFS_ISIZE(ip));
if (error) {
/*
* If we get an error at this point we simply don't
@@ -540,8 +548,8 @@ xfs_release(
return 0;
if ((S_ISREG(ip->i_d.di_mode) &&
- ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 ||
- ip->i_delayed_blks > 0)) &&
+ (VFS_I(ip)->i_size > 0 ||
+ (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) &&
(ip->i_df.if_flags & XFS_IFEXTENTS)) &&
(!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
@@ -618,7 +626,7 @@ xfs_inactive(
* only one with a reference to the inode.
*/
truncate = ((ip->i_d.di_nlink == 0) &&
- ((ip->i_d.di_size != 0) || (ip->i_size != 0) ||
+ ((ip->i_d.di_size != 0) || XFS_ISIZE(ip) != 0 ||
(ip->i_d.di_nextents > 0) || (ip->i_delayed_blks > 0)) &&
S_ISREG(ip->i_d.di_mode));
@@ -632,12 +640,12 @@ xfs_inactive(
if (ip->i_d.di_nlink != 0) {
if ((S_ISREG(ip->i_d.di_mode) &&
- ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 ||
- ip->i_delayed_blks > 0)) &&
- (ip->i_df.if_flags & XFS_IFEXTENTS) &&
- (!(ip->i_d.di_flags &
+ (VFS_I(ip)->i_size > 0 ||
+ (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) &&
+ (ip->i_df.if_flags & XFS_IFEXTENTS) &&
+ (!(ip->i_d.di_flags &
(XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) ||
- (ip->i_delayed_blks != 0)))) {
+ ip->i_delayed_blks != 0))) {
error = xfs_free_eofblocks(mp, ip, 0);
if (error)
return VN_INACTIVE_CACHE;
@@ -670,13 +678,18 @@ xfs_inactive(
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);
- error = xfs_itruncate_data(&tp, ip, 0);
+ ip->i_d.di_size = 0;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
if (error) {
xfs_trans_cancel(tp,
XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
return VN_INACTIVE_CACHE;
}
+
+ ASSERT(ip->i_d.di_nextents == 0);
} else if (S_ISLNK(ip->i_d.di_mode)) {
/*
@@ -904,14 +917,6 @@ xfs_create(
xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
unlock_dp_on_error = B_TRUE;
- /*
- * Check for directory link count overflow.
- */
- if (is_dir && dp->i_d.di_nlink >= XFS_MAXLINK) {
- error = XFS_ERROR(EMLINK);
- goto out_trans_cancel;
- }
-
xfs_bmap_init(&free_list, &first_block);
/*
@@ -1416,14 +1421,6 @@ xfs_link(
xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
/*
- * If the source has too many links, we can't make any more to it.
- */
- if (sip->i_d.di_nlink >= XFS_MAXLINK) {
- error = XFS_ERROR(EMLINK);
- goto error_return;
- }
-
- /*
* If we are using project inheritance, we only allow hard link
* creation in our tree when the project IDs are the same; else
* the tree quota mechanism could be circumvented.
@@ -1961,11 +1958,11 @@ xfs_zero_remaining_bytes(
* since nothing can read beyond eof. The space will
* be zeroed when the file is extended anyway.
*/
- if (startoff >= ip->i_size)
+ if (startoff >= XFS_ISIZE(ip))
return 0;
- if (endoff > ip->i_size)
- endoff = ip->i_size;
+ if (endoff > XFS_ISIZE(ip))
+ endoff = XFS_ISIZE(ip);
bp = xfs_buf_get_uncached(XFS_IS_REALTIME_INODE(ip) ?
mp->m_rtdev_targp : mp->m_ddev_targp,
@@ -2260,7 +2257,7 @@ xfs_change_file_space(
bf->l_start += offset;
break;
case 2: /*SEEK_END*/
- bf->l_start += ip->i_size;
+ bf->l_start += XFS_ISIZE(ip);
break;
default:
return XFS_ERROR(EINVAL);
@@ -2277,7 +2274,7 @@ xfs_change_file_space(
bf->l_whence = 0;
startoffset = bf->l_start;
- fsize = ip->i_size;
+ fsize = XFS_ISIZE(ip);
/*
* XFS_IOC_RESVSP and XFS_IOC_UNRESVSP will reserve or unreserve
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index 0c877cbde14..447e146b2ba 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -10,7 +10,6 @@ struct kiocb;
struct pipe_inode_info;
struct uio;
struct xfs_inode;
-struct xfs_iomap;
int xfs_setattr_nonsize(struct xfs_inode *ip, struct iattr *vap, int flags);
@@ -49,8 +48,6 @@ int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name,
int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags);
int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize,
int flags, struct attrlist_cursor_kern *cursor);
-int xfs_bmap(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
- int flags, struct xfs_iomap *iomapp, int *niomaps);
void xfs_tosspages(struct xfs_inode *inode, xfs_off_t first,
xfs_off_t last, int fiopt);
int xfs_flushinval_pages(struct xfs_inode *ip, xfs_off_t first,