From 7ad920b504a980adcab4d3f6b85695526e6fd7bb Mon Sep 17 00:00:00 2001
From: Sage Weil
Date: Tue, 6 Oct 2009 11:31:05 -0700
Subject: ceph: documentation

Mount options, syntax.

Signed-off-by: Sage Weil
---
 Documentation/filesystems/ceph.txt | 139 +++++++++++++++++++++++++++++++++++++
 1 file changed, 139 insertions(+)
 create mode 100644 Documentation/filesystems/ceph.txt
(limited to 'Documentation/filesystems')

diff --git a/Documentation/filesystems/ceph.txt b/Documentation/filesystems/ceph.txt
new file mode 100644
index 00000000000..6e03917316b
--- /dev/null
+++ b/Documentation/filesystems/ceph.txt
@@ -0,0 +1,139 @@
+Ceph Distributed File System
+============================
+
+Ceph is a distributed network file system designed to provide good
+performance, reliability, and scalability.
+
+Basic features include:
+
+ * POSIX semantics
+ * Seamless scaling from 1 to many thousands of nodes
+ * High availability and reliability. No single points of failure.
+ * N-way replication of data across storage nodes
+ * Fast recovery from node failures
+ * Automatic rebalancing of data on node addition/removal
+ * Easy deployment: most FS components are userspace daemons
+
+Also,
+ * Flexible snapshots (on any directory)
+ * Recursive accounting (nested files, directories, bytes)
+
+In contrast to cluster filesystems like GFS, OCFS2, and GPFS that rely
+on symmetric access by all clients to shared block devices, Ceph
+separates data and metadata management into independent server
+clusters, similar to Lustre. Unlike Lustre, however, metadata and
+storage nodes run entirely as user space daemons. Storage nodes
+utilize btrfs to store data objects, leveraging its advanced features
+(checksumming, metadata replication, etc.). File data is striped
+across storage nodes in large chunks to distribute workload and
+facilitate high throughput. When storage nodes fail, data is
+re-replicated in a distributed fashion by the storage nodes themselves
+(with some minimal coordination from a cluster monitor), making the
+system extremely efficient and scalable.
+
+Metadata servers effectively form a large, consistent, distributed
+in-memory cache above the file namespace that is extremely scalable,
+dynamically redistributes metadata in response to workload changes,
+and can tolerate arbitrary (well, non-Byzantine) node failures. The
+metadata server takes a somewhat unconventional approach to metadata
+storage to significantly improve performance for common workloads. In
+particular, inodes with only a single link are embedded in
+directories, allowing entire directories of dentries and inodes to be
+loaded into its cache with a single I/O operation. The contents of
+extremely large directories can be fragmented and managed by
+independent metadata servers, allowing scalable concurrent access.
+
+The system offers automatic data rebalancing/migration when scaling
+from a small cluster of just a few nodes to many hundreds, without
+requiring an administrator to carve the data set into static volumes
+or go through the tedious process of migrating data between servers.
+When the file system approaches full capacity, new nodes can be easily
+added and things will "just work."
+
+Ceph includes a flexible snapshot mechanism that allows a user to
+create a snapshot on any subdirectory (and its nested contents) in the
+system. Snapshot creation and deletion are as simple as 'mkdir
+.snap/foo' and 'rmdir .snap/foo'.
+
+Ceph also provides some recursive accounting on directories for nested
+files and bytes.
+That is, a 'getfattr -d foo' on any directory in the
+system will reveal the total number of nested regular files and
+subdirectories, and a summation of all nested file sizes. This makes
+the identification of large disk space consumers relatively quick, as
+no 'du' or similar recursive scan of the file system is required.
+
+
+Mount Syntax
+============
+
+The basic mount syntax is:
+
+ # mount -t ceph monip[:port][,monip2[:port]...]:/[subdir] mnt
+
+You only need to specify a single monitor, as the client will get the
+full list when it connects. (However, if the monitor you specify
+happens to be down, the mount won't succeed.) The port can be left
+off if the monitor is using the default. So if the monitor is at
+1.2.3.4,
+
+ # mount -t ceph 1.2.3.4:/ /mnt/ceph
+
+is sufficient. If /sbin/mount.ceph is installed, a hostname can be
+used instead of an IP address.
+
+
+
+Mount Options
+=============
+
+  ip=A.B.C.D[:N]
+	Specify the IP and/or port the client should bind to locally.
+	There is normally not much reason to do this. If the IP is not
+	specified, the client's IP address is determined by looking at
+	the address its connection to the monitor originates from.
+
+  wsize=X
+	Specify the maximum write size in bytes. By default there is no
+	maximum. Ceph will normally size writes based on the file stripe
+	size.
+
+  rsize=X
+	Specify the maximum readahead size in bytes.
+
+  mount_timeout=X
+	Specify the timeout value for mount (in seconds), in the case
+	of a non-responsive Ceph file system. The default is 30
+	seconds.
+
+  rbytes
+	When stat() is called on a directory, set st_size to 'rbytes',
+	the summation of file sizes over all files nested beneath that
+	directory. This is the default.
+
+  norbytes
+	When stat() is called on a directory, set st_size to the
+	number of entries in that directory.
+
+  nocrc
+	Disable CRC32C calculation for data writes. If set, the OSD
+	must rely on TCP's error correction to detect data corruption
+	in the data payload.
+
+  noasyncreaddir
+	Disable the client's use of its local cache to satisfy readdir
+	requests. (This does not change correctness; the client uses
+	cached metadata only when a lease or capability ensures it is
+	valid.)
+
+
+More Information
+================
+
+For more information on Ceph, see the home page at
+	http://ceph.newdream.net/
+
+The Linux kernel client source tree is available at
+	git://ceph.newdream.net/linux-ceph-client.git
+
+and the source for the full system is at
+	git://ceph.newdream.net/ceph.git
--
cgit v1.2.3


From 5db53f3e80dee2d9dff5e534f9e9fe1db17c9936 Mon Sep 17 00:00:00 2001
From: Joern Engel
Date: Fri, 20 Nov 2009 20:13:39 +0100
Subject: [LogFS] add new flash file system

This is a new flash file system.
See Documentation/filesystems/logfs.txt Signed-off-by: Joern Engel --- Documentation/filesystems/00-INDEX | 2 + Documentation/filesystems/logfs.txt | 241 ++++ fs/Kconfig | 1 + fs/Makefile | 1 + fs/logfs/Kconfig | 17 + fs/logfs/Makefile | 13 + fs/logfs/compr.c | 95 ++ fs/logfs/dev_bdev.c | 263 ++++ fs/logfs/dev_mtd.c | 253 ++++ fs/logfs/dir.c | 818 +++++++++++++ fs/logfs/file.c | 263 ++++ fs/logfs/gc.c | 730 ++++++++++++ fs/logfs/inode.c | 417 +++++++ fs/logfs/journal.c | 879 ++++++++++++++ fs/logfs/logfs.h | 722 +++++++++++ fs/logfs/logfs_abi.h | 627 ++++++++++ fs/logfs/readwrite.c | 2246 +++++++++++++++++++++++++++++++++++ fs/logfs/segment.c | 924 ++++++++++++++ fs/logfs/super.c | 634 ++++++++++ include/linux/btree-128.h | 109 ++ include/linux/btree-type.h | 147 +++ include/linux/btree.h | 243 ++++ lib/Kconfig | 3 + lib/Makefile | 1 + lib/btree.c | 797 +++++++++++++ 25 files changed, 10446 insertions(+) create mode 100644 Documentation/filesystems/logfs.txt create mode 100644 fs/logfs/Kconfig create mode 100644 fs/logfs/Makefile create mode 100644 fs/logfs/compr.c create mode 100644 fs/logfs/dev_bdev.c create mode 100644 fs/logfs/dev_mtd.c create mode 100644 fs/logfs/dir.c create mode 100644 fs/logfs/file.c create mode 100644 fs/logfs/gc.c create mode 100644 fs/logfs/inode.c create mode 100644 fs/logfs/journal.c create mode 100644 fs/logfs/logfs.h create mode 100644 fs/logfs/logfs_abi.h create mode 100644 fs/logfs/readwrite.c create mode 100644 fs/logfs/segment.c create mode 100644 fs/logfs/super.c create mode 100644 include/linux/btree-128.h create mode 100644 include/linux/btree-type.h create mode 100644 include/linux/btree.h create mode 100644 lib/btree.c (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX index f15621ee559..d362aa543b2 100644 --- a/Documentation/filesystems/00-INDEX +++ b/Documentation/filesystems/00-INDEX @@ -62,6 +62,8 @@ jfs.txt - info and mount options for the JFS filesystem. locks.txt - info on file locking implementations, flock() vs. fcntl(), etc. +logfs.txt + - info on the LogFS flash filesystem. mandatory-locking.txt - info on the Linux implementation of Sys V mandatory file locking. ncpfs.txt diff --git a/Documentation/filesystems/logfs.txt b/Documentation/filesystems/logfs.txt new file mode 100644 index 00000000000..e64c94ba401 --- /dev/null +++ b/Documentation/filesystems/logfs.txt @@ -0,0 +1,241 @@ + +The LogFS Flash Filesystem +========================== + +Specification +============= + +Superblocks +----------- + +Two superblocks exist at the beginning and end of the filesystem. +Each superblock is 256 Bytes large, with another 3840 Bytes reserved +for future purposes, making a total of 4096 Bytes. + +Superblock locations may differ for MTD and block devices. On MTD the +first non-bad block contains a superblock in the first 4096 Bytes and +the last non-bad block contains a superblock in the last 4096 Bytes. +On block devices, the first 4096 Bytes of the device contain the first +superblock and the last aligned 4096 Byte-block contains the second +superblock. + +For the most part, the superblocks can be considered read-only. They +are written only to correct errors detected within the superblocks, +move the journal and change the filesystem parameters through tunefs. +As a result, the superblock does not contain any fields that require +constant updates, like the amount of free space, etc. + +Segments +-------- + +The space in the device is split up into equal-sized segments. 
+Segments are the primary write unit of LogFS. Within each segment,
+writes happen from front (low addresses) to back (high addresses). If
+only a partial segment has been written, the segment number, the
+current position within it and optionally a write buffer are stored in
+the journal.
+
+Segments are erased as a whole. Therefore Garbage Collection may be
+required to completely free a segment before it can be erased.
+
+Journal
+--------
+
+The journal contains all global information about the filesystem that
+is subject to frequent change. At mount time, it has to be scanned
+for the most recent commit entry, which contains a list of pointers to
+all currently valid entries.
+
+Object Store
+------------
+
+All space except for the superblocks and journal is part of the object
+store. Each segment contains a segment header and a number of
+objects, each consisting of the object header and the payload.
+Objects are either inodes, directory entries (dentries), file data
+blocks or indirect blocks.
+
+Levels
+------
+
+Garbage collection (GC) may fail if all data is written
+indiscriminately. One requirement of GC is that data is separated
+roughly according to the distance between the tree root and the data.
+Effectively that means all file data is on level 0, indirect blocks
+are on levels 1, 2, 3, 4 or 5 for 1x, 2x, 3x, 4x or 5x indirect blocks,
+respectively. Inode file data is on level 6 for the inodes and 7-11
+for indirect blocks.
+
+Each segment contains objects of a single level only. As a result,
+each level requires its own separate segment to be open for writing.
+
+Inode File
+----------
+
+All inodes are stored in a special file, the inode file. The single
+exception is the inode file's inode (master inode), which for obvious
+reasons is stored in the journal instead. Instead of data blocks, the
+leaf nodes of the inode file are inodes.
+
+Aliases
+-------
+
+Writes in LogFS are done by means of a wandering tree. A naïve
+implementation would require that for each write of a block, all
+parent blocks are written as well, since the block pointers have
+changed. Such an implementation would not be very efficient.
+
+In LogFS, the block pointer changes are cached in the journal by means
+of alias entries. Each alias consists of its logical address - inode
+number, block index, level and child number (index into block) - and
+the changed data. Any 8-byte word can be changed in this manner.
+
+Currently aliases are used for block pointers, file size, file used
+bytes and the height of an inode's indirect tree.
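As a rough sketch of what one of these alias entries carries (the field
names here are illustrative only; the real on-disk layout is the one
defined in logfs_abi.h later in this patch):

	struct alias_sketch {
		__be64	ino;		/* inode owning the patched block */
		__be64	bix;		/* block index within that inode */
		__be64	val;		/* new value of the changed 8-byte word */
		u8	level;		/* tree level of the block */
		__be16	child_no;	/* which word inside the block changed */
	};

Replaying such an entry at mount time amounts to writing val into word
child_no of the block identified by (ino, bix, level).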
+Segment Aliases
+---------------
+
+Related to regular aliases, these are used to handle bad blocks.
+Initially, bad blocks are handled by moving the affected segment
+content to a spare segment and noting this move in the journal with a
+segment alias, a simple (to, from) tuple. GC will later empty this
+segment and the alias can be removed again. This is used on MTD only.
+
+Vim
+---
+
+By cleverly predicting the life time of data, it is possible to
+separate long-living data from short-living data and thereby reduce
+the GC overhead later. Each type of distinct life expectancy (vim) can
+have a separate segment open for writing. Each (level, vim) tuple can
+be open just once. If an open segment with unknown vim is encountered
+at mount time, it is closed and ignored henceforth.
+
+Indirect Tree
+-------------
+
+Inodes in LogFS are similar to FFS-style filesystems with direct and
+indirect block pointers. One difference is that LogFS uses a single
+indirect pointer that can be either a 1x, 2x, etc. indirect pointer.
+A height field in the inode defines the height of the indirect tree
+and thereby the indirection of the pointer.
+
+Another difference is the addressing of indirect blocks. In LogFS,
+the first 16 pointers in the first indirect block are left empty,
+corresponding to the 16 direct pointers in the inode. In ext2 (maybe
+others as well) the first pointer in the first indirect block
+corresponds to logical block 12, skipping the 12 direct pointers.
+So where ext2 is using arithmetic to better utilize space, LogFS keeps
+arithmetic simple and uses compression to save space.
+
+Compression
+-----------
+
+Both file data and metadata can be compressed. Compression for file
+data can be enabled with chattr +c and disabled with chattr -c. Doing
+so has no effect on existing data, but new data will be stored
+accordingly. New inodes will inherit the compression flag of the
+parent directory.
+
+Metadata is always compressed. However, the space accounting ignores
+this and charges for the uncompressed size. Failing to do so could
+result in GC failures when, after moving some data, indirect blocks
+compress worse than previously. Even on a 100% full medium, GC may
+not consume any extra space, so the space gained by compression is
+lost to the user.
+
+However, it is not lost space to the filesystem internals. By
+cheating the user out of those bytes, the filesystem gains some slack
+space and GC will run less often and faster.
+
+Garbage Collection and Wear Leveling
+------------------------------------
+
+Garbage collection is invoked whenever the number of free segments
+falls below a threshold. The best (known) candidate is picked based
+on the least amount of valid data contained in the segment. All
+remaining valid data is copied elsewhere, thereby invalidating it.
+
+The GC code also checks for aliases and writes them back if their
+number gets too large.
+
+Wear leveling is done by occasionally picking a suboptimal segment for
+garbage collection. If a stale segment's erase count is significantly
+lower than the active segments' erase counts, it will be picked. Wear
+leveling is rate limited, so it will never monopolize the device for
+more than one segment worth at a time.
+
+The values for "occasionally" and "significantly lower" are
+compile-time constants.
+
+Hashed directories
+------------------
+
+To satisfy efficient lookup(), directory entries are hashed and
+located based on the hash. In order to both support large directories
+and not be overly inefficient for small directories, several hash
+tables of increasing size are used. For each table, the hash value
+modulo the table size gives the table index.
+
+Table sizes are chosen to limit the number of indirect blocks with a
+fully populated table to 0, 1, 2 or 3 respectively. So the first
+table contains 16 entries, the second 512-16, etc.
+
+The last table is special in several ways. First, its size depends on
+the effective 32bit limit on telldir/seekdir cookies. Since logfs
+uses the upper half of the address space for indirect blocks, the size
+is limited to 2^31. Secondly, the table contains hash buckets with 16
+entries each.
+
+Using single-entry buckets would result in birthday "attacks". At
+just 2^16 used entries, hash collisions would be likely (P >= 0.5).
+My math skills are insufficient to do the combinatorics for the 17x +collisions necessary to overflow a bucket, but testing showed that in +10,000 runs the lowest directory fill before a bucket overflow was +188,057,130 entries with an average of 315,149,915 entries. So for +directory sizes of up to a million, bucket overflows should be +virtually impossible under normal circumstances. + +With carefully chosen filenames, it is obviously possible to cause an +overflow with just 21 entries (4 higher tables + 16 entries + 1). So +there may be a security concern if a malicious user has write access +to a directory. + +Open For Discussion +=================== + +Device Address Space +-------------------- + +A device address space is used for caching. Both block devices and +MTD provide functions to either read a single page or write a segment. +Partial segments may be written for data integrity, but where possible +complete segments are written for performance on simple block device +flash media. + +Meta Inodes +----------- + +Inodes are stored in the inode file, which is just a regular file for +most purposes. At umount time, however, the inode file needs to +remain open until all dirty inodes are written. So +generic_shutdown_super() may not close this inode, but shouldn't +complain about remaining inodes due to the inode file either. Same +goes for mapping inode of the device address space. + +Currently logfs uses a hack that essentially copies part of fs/inode.c +code over. A general solution would be preferred. + +Indirect block mapping +---------------------- + +With compression, the block device (or mapping inode) cannot be used +to cache indirect blocks. Some other place is required. Currently +logfs uses the top half of each inode's address space. The low 8TB +(on 32bit) are filled with file data, the high 8TB are used for +indirect blocks. + +One problem is that 16TB files created on 64bit systems actually have +data in the top 8TB. But files >16TB would cause problems anyway, so +only the limit has changed. diff --git a/fs/Kconfig b/fs/Kconfig index 64d44efad7a..7405f071be6 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -177,6 +177,7 @@ source "fs/efs/Kconfig" source "fs/jffs2/Kconfig" # UBIFS File system configuration source "fs/ubifs/Kconfig" +source "fs/logfs/Kconfig" source "fs/cramfs/Kconfig" source "fs/squashfs/Kconfig" source "fs/freevxfs/Kconfig" diff --git a/fs/Makefile b/fs/Makefile index af6d04700d9..c3633aa4691 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -99,6 +99,7 @@ obj-$(CONFIG_NTFS_FS) += ntfs/ obj-$(CONFIG_UFS_FS) += ufs/ obj-$(CONFIG_EFS_FS) += efs/ obj-$(CONFIG_JFFS2_FS) += jffs2/ +obj-$(CONFIG_LOGFS) += logfs/ obj-$(CONFIG_UBIFS_FS) += ubifs/ obj-$(CONFIG_AFFS_FS) += affs/ obj-$(CONFIG_ROMFS_FS) += romfs/ diff --git a/fs/logfs/Kconfig b/fs/logfs/Kconfig new file mode 100644 index 00000000000..daf9a9b32dd --- /dev/null +++ b/fs/logfs/Kconfig @@ -0,0 +1,17 @@ +config LOGFS + tristate "LogFS file system (EXPERIMENTAL)" + depends on (MTD || BLOCK) && EXPERIMENTAL + select ZLIB_INFLATE + select ZLIB_DEFLATE + select CRC32 + select BTREE + help + Flash filesystem aimed to scale efficiently to large devices. + In comparison to JFFS2 it offers significantly faster mount + times and potentially less RAM usage, although the latter has + not been measured yet. + + In its current state it is still very experimental and should + not be used for other than testing purposes. + + If unsure, say N. 
diff --git a/fs/logfs/Makefile b/fs/logfs/Makefile new file mode 100644 index 00000000000..4820027787e --- /dev/null +++ b/fs/logfs/Makefile @@ -0,0 +1,13 @@ +obj-$(CONFIG_LOGFS) += logfs.o + +logfs-y += compr.o +logfs-y += dir.o +logfs-y += file.o +logfs-y += gc.o +logfs-y += inode.o +logfs-y += journal.o +logfs-y += readwrite.o +logfs-y += segment.o +logfs-y += super.o +logfs-$(CONFIG_BLOCK) += dev_bdev.o +logfs-$(CONFIG_MTD) += dev_mtd.o diff --git a/fs/logfs/compr.c b/fs/logfs/compr.c new file mode 100644 index 00000000000..44bbfd249ab --- /dev/null +++ b/fs/logfs/compr.c @@ -0,0 +1,95 @@ +/* + * fs/logfs/compr.c - compression routines + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel + */ +#include "logfs.h" +#include +#include + +#define COMPR_LEVEL 3 + +static DEFINE_MUTEX(compr_mutex); +static struct z_stream_s stream; + +int logfs_compress(void *in, void *out, size_t inlen, size_t outlen) +{ + int err, ret; + + ret = -EIO; + mutex_lock(&compr_mutex); + err = zlib_deflateInit(&stream, COMPR_LEVEL); + if (err != Z_OK) + goto error; + + stream.next_in = in; + stream.avail_in = inlen; + stream.total_in = 0; + stream.next_out = out; + stream.avail_out = outlen; + stream.total_out = 0; + + err = zlib_deflate(&stream, Z_FINISH); + if (err != Z_STREAM_END) + goto error; + + err = zlib_deflateEnd(&stream); + if (err != Z_OK) + goto error; + + if (stream.total_out >= stream.total_in) + goto error; + + ret = stream.total_out; +error: + mutex_unlock(&compr_mutex); + return ret; +} + +int logfs_uncompress(void *in, void *out, size_t inlen, size_t outlen) +{ + int err, ret; + + ret = -EIO; + mutex_lock(&compr_mutex); + err = zlib_inflateInit(&stream); + if (err != Z_OK) + goto error; + + stream.next_in = in; + stream.avail_in = inlen; + stream.total_in = 0; + stream.next_out = out; + stream.avail_out = outlen; + stream.total_out = 0; + + err = zlib_inflate(&stream, Z_FINISH); + if (err != Z_STREAM_END) + goto error; + + err = zlib_inflateEnd(&stream); + if (err != Z_OK) + goto error; + + ret = 0; +error: + mutex_unlock(&compr_mutex); + return ret; +} + +int __init logfs_compr_init(void) +{ + size_t size = max(zlib_deflate_workspacesize(), + zlib_inflate_workspacesize()); + stream.workspace = vmalloc(size); + if (!stream.workspace) + return -ENOMEM; + return 0; +} + +void logfs_compr_exit(void) +{ + vfree(stream.workspace); +} diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c new file mode 100644 index 00000000000..58a057b6e1a --- /dev/null +++ b/fs/logfs/dev_bdev.c @@ -0,0 +1,263 @@ +/* + * fs/logfs/dev_bdev.c - Device access methods for block devices + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel + */ +#include "logfs.h" +#include +#include +#include + +#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1)) + +static void request_complete(struct bio *bio, int err) +{ + complete((struct completion *)bio->bi_private); +} + +static int sync_request(struct page *page, struct block_device *bdev, int rw) +{ + struct bio bio; + struct bio_vec bio_vec; + struct completion complete; + + bio_init(&bio); + bio.bi_io_vec = &bio_vec; + bio_vec.bv_page = page; + bio_vec.bv_len = PAGE_SIZE; + bio_vec.bv_offset = 0; + bio.bi_vcnt = 1; + bio.bi_idx = 0; + bio.bi_size = PAGE_SIZE; + bio.bi_bdev = bdev; + bio.bi_sector = page->index * (PAGE_SIZE >> 9); + init_completion(&complete); + bio.bi_private = &complete; + bio.bi_end_io = request_complete; + + submit_bio(rw, &bio); + 
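+	/* Unplug the queue so the just-submitted bio is dispatched, then wait for it to complete. */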
generic_unplug_device(bdev_get_queue(bdev)); + wait_for_completion(&complete); + return test_bit(BIO_UPTODATE, &bio.bi_flags) ? 0 : -EIO; +} + +static int bdev_readpage(void *_sb, struct page *page) +{ + struct super_block *sb = _sb; + struct block_device *bdev = logfs_super(sb)->s_bdev; + int err; + + err = sync_request(page, bdev, READ); + if (err) { + ClearPageUptodate(page); + SetPageError(page); + } else { + SetPageUptodate(page); + ClearPageError(page); + } + unlock_page(page); + return err; +} + +static DECLARE_WAIT_QUEUE_HEAD(wq); + +static void writeseg_end_io(struct bio *bio, int err) +{ + const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct super_block *sb = bio->bi_private; + struct logfs_super *super = logfs_super(sb); + struct page *page; + + BUG_ON(!uptodate); /* FIXME: Retry io or write elsewhere */ + BUG_ON(err); + BUG_ON(bio->bi_vcnt == 0); + do { + page = bvec->bv_page; + if (--bvec >= bio->bi_io_vec) + prefetchw(&bvec->bv_page->flags); + + end_page_writeback(page); + } while (bvec >= bio->bi_io_vec); + bio_put(bio); + if (atomic_dec_and_test(&super->s_pending_writes)) + wake_up(&wq); +} + +static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index, + size_t nr_pages) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + struct bio *bio; + struct page *page; + struct request_queue *q = bdev_get_queue(sb->s_bdev); + unsigned int max_pages = queue_max_hw_sectors(q) >> (PAGE_SHIFT - 9); + int i; + + bio = bio_alloc(GFP_NOFS, max_pages); + BUG_ON(!bio); /* FIXME: handle this */ + + for (i = 0; i < nr_pages; i++) { + if (i >= max_pages) { + /* Block layer cannot split bios :( */ + bio->bi_vcnt = i; + bio->bi_idx = 0; + bio->bi_size = i * PAGE_SIZE; + bio->bi_bdev = super->s_bdev; + bio->bi_sector = ofs >> 9; + bio->bi_private = sb; + bio->bi_end_io = writeseg_end_io; + atomic_inc(&super->s_pending_writes); + submit_bio(WRITE, bio); + + ofs += i * PAGE_SIZE; + index += i; + nr_pages -= i; + i = 0; + + bio = bio_alloc(GFP_NOFS, max_pages); + BUG_ON(!bio); + } + page = find_lock_page(mapping, index + i); + BUG_ON(!page); + bio->bi_io_vec[i].bv_page = page; + bio->bi_io_vec[i].bv_len = PAGE_SIZE; + bio->bi_io_vec[i].bv_offset = 0; + + BUG_ON(PageWriteback(page)); + set_page_writeback(page); + unlock_page(page); + } + bio->bi_vcnt = nr_pages; + bio->bi_idx = 0; + bio->bi_size = nr_pages * PAGE_SIZE; + bio->bi_bdev = super->s_bdev; + bio->bi_sector = ofs >> 9; + bio->bi_private = sb; + bio->bi_end_io = writeseg_end_io; + atomic_inc(&super->s_pending_writes); + submit_bio(WRITE, bio); + return 0; +} + +static void bdev_writeseg(struct super_block *sb, u64 ofs, size_t len) +{ + struct logfs_super *super = logfs_super(sb); + int head; + + BUG_ON(super->s_flags & LOGFS_SB_FLAG_RO); + + if (len == 0) { + /* This can happen when the object fit perfectly into a + * segment, the segment gets written per sync and subsequently + * closed. 
+ */ + return; + } + head = ofs & (PAGE_SIZE - 1); + if (head) { + ofs -= head; + len += head; + } + len = PAGE_ALIGN(len); + __bdev_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT); + generic_unplug_device(bdev_get_queue(logfs_super(sb)->s_bdev)); +} + +static int bdev_erase(struct super_block *sb, loff_t to, size_t len) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + struct page *page; + pgoff_t index = to >> PAGE_SHIFT; + int i, nr_pages = len >> PAGE_SHIFT; + + BUG_ON(to & (PAGE_SIZE - 1)); + BUG_ON(len & (PAGE_SIZE - 1)); + + if (logfs_super(sb)->s_flags & LOGFS_SB_FLAG_RO) + return -EROFS; + + for (i = 0; i < nr_pages; i++) { + page = find_get_page(mapping, index + i); + if (page) { + memset(page_address(page), 0xFF, PAGE_SIZE); + page_cache_release(page); + } + } + return 0; +} + +static void bdev_sync(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + + wait_event(wq, atomic_read(&super->s_pending_writes) == 0); +} + +static struct page *bdev_find_first_sb(struct super_block *sb, u64 *ofs) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + filler_t *filler = bdev_readpage; + + *ofs = 0; + return read_cache_page(mapping, 0, filler, sb); +} + +static struct page *bdev_find_last_sb(struct super_block *sb, u64 *ofs) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + filler_t *filler = bdev_readpage; + u64 pos = (super->s_bdev->bd_inode->i_size & ~0xfffULL) - 0x1000; + pgoff_t index = pos >> PAGE_SHIFT; + + *ofs = pos; + return read_cache_page(mapping, index, filler, sb); +} + +static int bdev_write_sb(struct super_block *sb, struct page *page) +{ + struct block_device *bdev = logfs_super(sb)->s_bdev; + + /* Nothing special to do for block devices. 
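+	 * The superblock page is simply written back synchronously.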
*/ + return sync_request(page, bdev, WRITE); +} + +static void bdev_put_device(struct super_block *sb) +{ + close_bdev_exclusive(logfs_super(sb)->s_bdev, FMODE_READ|FMODE_WRITE); +} + +static const struct logfs_device_ops bd_devops = { + .find_first_sb = bdev_find_first_sb, + .find_last_sb = bdev_find_last_sb, + .write_sb = bdev_write_sb, + .readpage = bdev_readpage, + .writeseg = bdev_writeseg, + .erase = bdev_erase, + .sync = bdev_sync, + .put_device = bdev_put_device, +}; + +int logfs_get_sb_bdev(struct file_system_type *type, int flags, + const char *devname, struct vfsmount *mnt) +{ + struct block_device *bdev; + + bdev = open_bdev_exclusive(devname, FMODE_READ|FMODE_WRITE, type); + if (IS_ERR(bdev)) + return PTR_ERR(bdev); + + if (MAJOR(bdev->bd_dev) == MTD_BLOCK_MAJOR) { + int mtdnr = MINOR(bdev->bd_dev); + close_bdev_exclusive(bdev, FMODE_READ|FMODE_WRITE); + return logfs_get_sb_mtd(type, flags, mtdnr, mnt); + } + + return logfs_get_sb_device(type, flags, NULL, bdev, &bd_devops, mnt); +} diff --git a/fs/logfs/dev_mtd.c b/fs/logfs/dev_mtd.c new file mode 100644 index 00000000000..68e99d046c2 --- /dev/null +++ b/fs/logfs/dev_mtd.c @@ -0,0 +1,253 @@ +/* + * fs/logfs/dev_mtd.c - Device access methods for MTD + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel + */ +#include "logfs.h" +#include +#include +#include + +#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1)) + +static int mtd_read(struct super_block *sb, loff_t ofs, size_t len, void *buf) +{ + struct mtd_info *mtd = logfs_super(sb)->s_mtd; + size_t retlen; + int ret; + + ret = mtd->read(mtd, ofs, len, &retlen, buf); + BUG_ON(ret == -EINVAL); + if (ret) + return ret; + + /* Not sure if we should loop instead. */ + if (retlen != len) + return -EIO; + + return 0; +} + +static int mtd_write(struct super_block *sb, loff_t ofs, size_t len, void *buf) +{ + struct logfs_super *super = logfs_super(sb); + struct mtd_info *mtd = super->s_mtd; + size_t retlen; + loff_t page_start, page_end; + int ret; + + if (super->s_flags & LOGFS_SB_FLAG_RO) + return -EROFS; + + BUG_ON((ofs >= mtd->size) || (len > mtd->size - ofs)); + BUG_ON(ofs != (ofs >> super->s_writeshift) << super->s_writeshift); + BUG_ON(len > PAGE_CACHE_SIZE); + page_start = ofs & PAGE_CACHE_MASK; + page_end = PAGE_CACHE_ALIGN(ofs + len) - 1; + ret = mtd->write(mtd, ofs, len, &retlen, buf); + if (ret || (retlen != len)) + return -EIO; + + return 0; +} + +/* + * For as long as I can remember (since about 2001) mtd->erase has been an + * asynchronous interface lacking the first driver to actually use the + * asynchronous properties. So just to prevent the first implementor of such + * a thing from breaking logfs in 2350, we do the usual pointless dance to + * declare a completion variable and wait for completion before returning + * from mtd_erase(). What an excercise in futility! 
+ */ +static void logfs_erase_callback(struct erase_info *ei) +{ + complete((struct completion *)ei->priv); +} + +static int mtd_erase_mapping(struct super_block *sb, loff_t ofs, size_t len) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + struct page *page; + pgoff_t index = ofs >> PAGE_SHIFT; + + for (index = ofs >> PAGE_SHIFT; index < (ofs + len) >> PAGE_SHIFT; index++) { + page = find_get_page(mapping, index); + if (!page) + continue; + memset(page_address(page), 0xFF, PAGE_SIZE); + page_cache_release(page); + } + return 0; +} + +static int mtd_erase(struct super_block *sb, loff_t ofs, size_t len) +{ + struct mtd_info *mtd = logfs_super(sb)->s_mtd; + struct erase_info ei; + DECLARE_COMPLETION_ONSTACK(complete); + int ret; + + BUG_ON(len % mtd->erasesize); + if (logfs_super(sb)->s_flags & LOGFS_SB_FLAG_RO) + return -EROFS; + + memset(&ei, 0, sizeof(ei)); + ei.mtd = mtd; + ei.addr = ofs; + ei.len = len; + ei.callback = logfs_erase_callback; + ei.priv = (long)&complete; + ret = mtd->erase(mtd, &ei); + if (ret) + return -EIO; + + wait_for_completion(&complete); + if (ei.state != MTD_ERASE_DONE) + return -EIO; + return mtd_erase_mapping(sb, ofs, len); +} + +static void mtd_sync(struct super_block *sb) +{ + struct mtd_info *mtd = logfs_super(sb)->s_mtd; + + if (mtd->sync) + mtd->sync(mtd); +} + +static int mtd_readpage(void *_sb, struct page *page) +{ + struct super_block *sb = _sb; + int err; + + err = mtd_read(sb, page->index << PAGE_SHIFT, PAGE_SIZE, + page_address(page)); + if (err == -EUCLEAN) { + err = 0; + /* FIXME: force GC this segment */ + } + if (err) { + ClearPageUptodate(page); + SetPageError(page); + } else { + SetPageUptodate(page); + ClearPageError(page); + } + unlock_page(page); + return err; +} + +static struct page *mtd_find_first_sb(struct super_block *sb, u64 *ofs) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + filler_t *filler = mtd_readpage; + struct mtd_info *mtd = super->s_mtd; + + if (!mtd->block_isbad) + return NULL; + + *ofs = 0; + while (mtd->block_isbad(mtd, *ofs)) { + *ofs += mtd->erasesize; + if (*ofs >= mtd->size) + return NULL; + } + BUG_ON(*ofs & ~PAGE_MASK); + return read_cache_page(mapping, *ofs >> PAGE_SHIFT, filler, sb); +} + +static struct page *mtd_find_last_sb(struct super_block *sb, u64 *ofs) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + filler_t *filler = mtd_readpage; + struct mtd_info *mtd = super->s_mtd; + + if (!mtd->block_isbad) + return NULL; + + *ofs = mtd->size - mtd->erasesize; + while (mtd->block_isbad(mtd, *ofs)) { + *ofs -= mtd->erasesize; + if (*ofs <= 0) + return NULL; + } + *ofs = *ofs + mtd->erasesize - 0x1000; + BUG_ON(*ofs & ~PAGE_MASK); + return read_cache_page(mapping, *ofs >> PAGE_SHIFT, filler, sb); +} + +static int __mtd_writeseg(struct super_block *sb, u64 ofs, pgoff_t index, + size_t nr_pages) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + struct page *page; + int i, err; + + for (i = 0; i < nr_pages; i++) { + page = find_lock_page(mapping, index + i); + BUG_ON(!page); + + err = mtd_write(sb, page->index << PAGE_SHIFT, PAGE_SIZE, + page_address(page)); + unlock_page(page); + page_cache_release(page); + if (err) + return err; + } + return 0; +} + +static void mtd_writeseg(struct super_block *sb, u64 ofs, size_t len) +{ + struct 
logfs_super *super = logfs_super(sb); + int head; + + if (super->s_flags & LOGFS_SB_FLAG_RO) + return; + + if (len == 0) { + /* This can happen when the object fit perfectly into a + * segment, the segment gets written per sync and subsequently + * closed. + */ + return; + } + head = ofs & (PAGE_SIZE - 1); + if (head) { + ofs -= head; + len += head; + } + len = PAGE_ALIGN(len); + __mtd_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT); +} + +static void mtd_put_device(struct super_block *sb) +{ + put_mtd_device(logfs_super(sb)->s_mtd); +} + +static const struct logfs_device_ops mtd_devops = { + .find_first_sb = mtd_find_first_sb, + .find_last_sb = mtd_find_last_sb, + .readpage = mtd_readpage, + .writeseg = mtd_writeseg, + .erase = mtd_erase, + .sync = mtd_sync, + .put_device = mtd_put_device, +}; + +int logfs_get_sb_mtd(struct file_system_type *type, int flags, + int mtdnr, struct vfsmount *mnt) +{ + struct mtd_info *mtd; + const struct logfs_device_ops *devops = &mtd_devops; + + mtd = get_mtd_device(NULL, mtdnr); + return logfs_get_sb_device(type, flags, mtd, NULL, devops, mnt); +} diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c new file mode 100644 index 00000000000..89104e6f81c --- /dev/null +++ b/fs/logfs/dir.c @@ -0,0 +1,818 @@ +/* + * fs/logfs/dir.c - directory-related code + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel + */ +#include "logfs.h" + + +/* + * Atomic dir operations + * + * Directory operations are by default not atomic. Dentries and Inodes are + * created/removed/altered in seperate operations. Therefore we need to do + * a small amount of journaling. + * + * Create, link, mkdir, mknod and symlink all share the same function to do + * the work: __logfs_create. This function works in two atomic steps: + * 1. allocate inode (remember in journal) + * 2. allocate dentry (clear journal) + * + * As we can only get interrupted between the two, when the inode we just + * created is simply stored in the anchor. On next mount, if we were + * interrupted, we delete the inode. From a users point of view the + * operation never happened. + * + * Unlink and rmdir also share the same function: unlink. Again, this + * function works in two atomic steps + * 1. remove dentry (remember inode in journal) + * 2. unlink inode (clear journal) + * + * And again, on the next mount, if we were interrupted, we delete the inode. + * From a users point of view the operation succeeded. + * + * Rename is the real pain to deal with, harder than all the other methods + * combined. Depending on the circumstances we can run into three cases. + * A "target rename" where the target dentry already existed, a "local + * rename" where both parent directories are identical or a "cross-directory + * rename" in the remaining case. + * + * Local rename is atomic, as the old dentry is simply rewritten with a new + * name. + * + * Cross-directory rename works in two steps, similar to __logfs_create and + * logfs_unlink: + * 1. Write new dentry (remember old dentry in journal) + * 2. Remove old dentry (clear journal) + * + * Here we remember a dentry instead of an inode. On next mount, if we were + * interrupted, we delete the dentry. From a users point of view, the + * operation succeeded. + * + * Target rename works in three atomic steps: + * 1. Attach old inode to new dentry (remember old dentry and new inode) + * 2. Remove old dentry (still remember the new inode) + * 3. Remove victim inode + * + * Here we remember both an inode an a dentry. 
If we get interrupted + * between steps 1 and 2, we delete both the dentry and the inode. If + * we get interrupted between steps 2 and 3, we delete just the inode. + * In either case, the remaining objects are deleted on next mount. From + * a users point of view, the operation succeeded. + */ + +static int write_dir(struct inode *dir, struct logfs_disk_dentry *dd, + loff_t pos) +{ + return logfs_inode_write(dir, dd, sizeof(*dd), pos, WF_LOCK, NULL); +} + +static int write_inode(struct inode *inode) +{ + return __logfs_write_inode(inode, WF_LOCK); +} + +static s64 dir_seek_data(struct inode *inode, s64 pos) +{ + s64 new_pos = logfs_seek_data(inode, pos); + + return max(pos, new_pos - 1); +} + +static int beyond_eof(struct inode *inode, loff_t bix) +{ + loff_t pos = bix << inode->i_sb->s_blocksize_bits; + return pos >= i_size_read(inode); +} + +/* + * Prime value was chosen to be roughly 256 + 26. r5 hash uses 11, + * so short names (len <= 9) don't even occupy the complete 32bit name + * space. A prime >256 ensures short names quickly spread the 32bit + * name space. Add about 26 for the estimated amount of information + * of each character and pick a prime nearby, preferrably a bit-sparse + * one. + */ +static u32 hash_32(const char *s, int len, u32 seed) +{ + u32 hash = seed; + int i; + + for (i = 0; i < len; i++) + hash = hash * 293 + s[i]; + return hash; +} + +/* + * We have to satisfy several conflicting requirements here. Small + * directories should stay fairly compact and not require too many + * indirect blocks. The number of possible locations for a given hash + * should be small to make lookup() fast. And we should try hard not + * to overflow the 32bit name space or nfs and 32bit host systems will + * be unhappy. + * + * So we use the following scheme. First we reduce the hash to 0..15 + * and try a direct block. If that is occupied we reduce the hash to + * 16..255 and try an indirect block. Same for 2x and 3x indirect + * blocks. Lastly we reduce the hash to 0x800_0000 .. 0xffff_ffff, + * but use buckets containing eight entries instead of a single one. + * + * Using 16 entries should allow for a reasonable amount of hash + * collisions, so the 32bit name space can be packed fairly tight + * before overflowing. Oh and currently we don't overflow but return + * and error. + * + * How likely are collisions? Doing the appropriate math is beyond me + * and the Bronstein textbook. But running a test program to brute + * force collisions for a couple of days showed that on average the + * first collision occurs after 598M entries, with 290M being the + * smallest result. Obviously 21 entries could already cause a + * collision if all entries are carefully chosen. + */ +static pgoff_t hash_index(u32 hash, int round) +{ + switch (round) { + case 0: + return hash % I0_BLOCKS; + case 1: + return I0_BLOCKS + hash % (I1_BLOCKS - I0_BLOCKS); + case 2: + return I1_BLOCKS + hash % (I2_BLOCKS - I1_BLOCKS); + case 3: + return I2_BLOCKS + hash % (I3_BLOCKS - I2_BLOCKS); + case 4 ... 
19: + return I3_BLOCKS + 16 * (hash % (((1<<31) - I3_BLOCKS) / 16)) + + round - 4; + } + BUG(); +} + +static struct page *logfs_get_dd_page(struct inode *dir, struct dentry *dentry) +{ + struct qstr *name = &dentry->d_name; + struct page *page; + struct logfs_disk_dentry *dd; + u32 hash = hash_32(name->name, name->len, 0); + pgoff_t index; + int round; + + if (name->len > LOGFS_MAX_NAMELEN) + return ERR_PTR(-ENAMETOOLONG); + + for (round = 0; round < 20; round++) { + index = hash_index(hash, round); + + if (beyond_eof(dir, index)) + return NULL; + if (!logfs_exist_block(dir, index)) + continue; + page = read_cache_page(dir->i_mapping, index, + (filler_t *)logfs_readpage, NULL); + if (IS_ERR(page)) + return page; + dd = kmap_atomic(page, KM_USER0); + BUG_ON(dd->namelen == 0); + + if (name->len != be16_to_cpu(dd->namelen) || + memcmp(name->name, dd->name, name->len)) { + kunmap_atomic(dd, KM_USER0); + page_cache_release(page); + continue; + } + + kunmap_atomic(dd, KM_USER0); + return page; + } + return NULL; +} + +static int logfs_remove_inode(struct inode *inode) +{ + int ret; + + inode->i_nlink--; + ret = write_inode(inode); + LOGFS_BUG_ON(ret, inode->i_sb); + return ret; +} + +static void abort_transaction(struct inode *inode, struct logfs_transaction *ta) +{ + if (logfs_inode(inode)->li_block) + logfs_inode(inode)->li_block->ta = NULL; + kfree(ta); +} + +static int logfs_unlink(struct inode *dir, struct dentry *dentry) +{ + struct logfs_super *super = logfs_super(dir->i_sb); + struct inode *inode = dentry->d_inode; + struct logfs_transaction *ta; + struct page *page; + pgoff_t index; + int ret; + + ta = kzalloc(sizeof(*ta), GFP_KERNEL); + if (!ta) + return -ENOMEM; + + ta->state = UNLINK_1; + ta->ino = inode->i_ino; + + inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; + + page = logfs_get_dd_page(dir, dentry); + if (!page) + return -ENOENT; + if (IS_ERR(page)) + return PTR_ERR(page); + index = page->index; + page_cache_release(page); + + mutex_lock(&super->s_dirop_mutex); + logfs_add_transaction(dir, ta); + + ret = logfs_delete(dir, index, NULL); + if (!ret) + ret = write_inode(dir); + + if (ret) { + abort_transaction(dir, ta); + printk(KERN_ERR"LOGFS: unable to delete inode\n"); + goto out; + } + + ta->state = UNLINK_2; + logfs_add_transaction(inode, ta); + ret = logfs_remove_inode(inode); +out: + mutex_unlock(&super->s_dirop_mutex); + return ret; +} + +static inline int logfs_empty_dir(struct inode *dir) +{ + u64 data; + + data = logfs_seek_data(dir, 0) << dir->i_sb->s_blocksize_bits; + return data >= i_size_read(dir); +} + +static int logfs_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + + if (!logfs_empty_dir(inode)) + return -ENOTEMPTY; + + return logfs_unlink(dir, dentry); +} + +/* FIXME: readdir currently has it's own dir_walk code. 
I don't see a good + * way to combine the two copies */ +#define IMPLICIT_NODES 2 +static int __logfs_readdir(struct file *file, void *buf, filldir_t filldir) +{ + struct inode *dir = file->f_dentry->d_inode; + loff_t pos = file->f_pos - IMPLICIT_NODES; + struct page *page; + struct logfs_disk_dentry *dd; + int full; + + BUG_ON(pos < 0); + for (;; pos++) { + if (beyond_eof(dir, pos)) + break; + if (!logfs_exist_block(dir, pos)) { + /* deleted dentry */ + pos = dir_seek_data(dir, pos); + continue; + } + page = read_cache_page(dir->i_mapping, pos, + (filler_t *)logfs_readpage, NULL); + if (IS_ERR(page)) + return PTR_ERR(page); + dd = kmap_atomic(page, KM_USER0); + BUG_ON(dd->namelen == 0); + + full = filldir(buf, (char *)dd->name, be16_to_cpu(dd->namelen), + pos, be64_to_cpu(dd->ino), dd->type); + kunmap_atomic(dd, KM_USER0); + page_cache_release(page); + if (full) + break; + } + + file->f_pos = pos + IMPLICIT_NODES; + return 0; +} + +static int logfs_readdir(struct file *file, void *buf, filldir_t filldir) +{ + struct inode *inode = file->f_dentry->d_inode; + ino_t pino = parent_ino(file->f_dentry); + int err; + + if (file->f_pos < 0) + return -EINVAL; + + if (file->f_pos == 0) { + if (filldir(buf, ".", 1, 1, inode->i_ino, DT_DIR) < 0) + return 0; + file->f_pos++; + } + if (file->f_pos == 1) { + if (filldir(buf, "..", 2, 2, pino, DT_DIR) < 0) + return 0; + file->f_pos++; + } + + err = __logfs_readdir(file, buf, filldir); + return err; +} + +static void logfs_set_name(struct logfs_disk_dentry *dd, struct qstr *name) +{ + dd->namelen = cpu_to_be16(name->len); + memcpy(dd->name, name->name, name->len); +} + +static struct dentry *logfs_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + struct page *page; + struct logfs_disk_dentry *dd; + pgoff_t index; + u64 ino = 0; + struct inode *inode; + + page = logfs_get_dd_page(dir, dentry); + if (IS_ERR(page)) + return ERR_CAST(page); + if (!page) { + d_add(dentry, NULL); + return NULL; + } + index = page->index; + dd = kmap_atomic(page, KM_USER0); + ino = be64_to_cpu(dd->ino); + kunmap_atomic(dd, KM_USER0); + page_cache_release(page); + + inode = logfs_iget(dir->i_sb, ino); + if (IS_ERR(inode)) { + printk(KERN_ERR"LogFS: Cannot read inode #%llx for dentry (%lx, %lx)n", + ino, dir->i_ino, index); + return ERR_CAST(inode); + } + return d_splice_alias(inode, dentry); +} + +static void grow_dir(struct inode *dir, loff_t index) +{ + index = (index + 1) << dir->i_sb->s_blocksize_bits; + if (i_size_read(dir) < index) + i_size_write(dir, index); +} + +static int logfs_write_dir(struct inode *dir, struct dentry *dentry, + struct inode *inode) +{ + struct page *page; + struct logfs_disk_dentry *dd; + u32 hash = hash_32(dentry->d_name.name, dentry->d_name.len, 0); + pgoff_t index; + int round, err; + + for (round = 0; round < 20; round++) { + index = hash_index(hash, round); + + if (logfs_exist_block(dir, index)) + continue; + page = find_or_create_page(dir->i_mapping, index, GFP_KERNEL); + if (!page) + return -ENOMEM; + + dd = kmap_atomic(page, KM_USER0); + memset(dd, 0, sizeof(*dd)); + dd->ino = cpu_to_be64(inode->i_ino); + dd->type = logfs_type(inode); + logfs_set_name(dd, &dentry->d_name); + kunmap_atomic(dd, KM_USER0); + + err = logfs_write_buf(dir, page, WF_LOCK); + unlock_page(page); + page_cache_release(page); + if (!err) + grow_dir(dir, index); + return err; + } + /* FIXME: Is there a better return value? In most cases neither + * the filesystem nor the directory are full. 
But we have had + * too many collisions for this particular hash and no fallback. + */ + return -ENOSPC; +} + +static int __logfs_create(struct inode *dir, struct dentry *dentry, + struct inode *inode, const char *dest, long destlen) +{ + struct logfs_super *super = logfs_super(dir->i_sb); + struct logfs_inode *li = logfs_inode(inode); + struct logfs_transaction *ta; + int ret; + + ta = kzalloc(sizeof(*ta), GFP_KERNEL); + if (!ta) + return -ENOMEM; + + ta->state = CREATE_1; + ta->ino = inode->i_ino; + mutex_lock(&super->s_dirop_mutex); + logfs_add_transaction(inode, ta); + + if (dest) { + /* symlink */ + ret = logfs_inode_write(inode, dest, destlen, 0, WF_LOCK, NULL); + if (!ret) + ret = write_inode(inode); + } else { + /* creat/mkdir/mknod */ + ret = write_inode(inode); + } + if (ret) { + abort_transaction(inode, ta); + li->li_flags |= LOGFS_IF_STILLBORN; + /* FIXME: truncate symlink */ + inode->i_nlink--; + iput(inode); + goto out; + } + + ta->state = CREATE_2; + logfs_add_transaction(dir, ta); + ret = logfs_write_dir(dir, dentry, inode); + /* sync directory */ + if (!ret) + ret = write_inode(dir); + + if (ret) { + logfs_del_transaction(dir, ta); + ta->state = CREATE_2; + logfs_add_transaction(inode, ta); + logfs_remove_inode(inode); + iput(inode); + goto out; + } + d_instantiate(dentry, inode); +out: + mutex_unlock(&super->s_dirop_mutex); + return ret; +} + +static int logfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + struct inode *inode; + + /* + * FIXME: why do we have to fill in S_IFDIR, while the mode is + * correct for mknod, creat, etc.? Smells like the vfs *should* + * do it for us but for some reason fails to do so. + */ + inode = logfs_new_inode(dir, S_IFDIR | mode); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + inode->i_op = &logfs_dir_iops; + inode->i_fop = &logfs_dir_fops; + + return __logfs_create(dir, dentry, inode, NULL, 0); +} + +static int logfs_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + struct inode *inode; + + inode = logfs_new_inode(dir, mode); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + inode->i_op = &logfs_reg_iops; + inode->i_fop = &logfs_reg_fops; + inode->i_mapping->a_ops = &logfs_reg_aops; + + return __logfs_create(dir, dentry, inode, NULL, 0); +} + +static int logfs_mknod(struct inode *dir, struct dentry *dentry, int mode, + dev_t rdev) +{ + struct inode *inode; + + if (dentry->d_name.len > LOGFS_MAX_NAMELEN) + return -ENAMETOOLONG; + + inode = logfs_new_inode(dir, mode); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + init_special_inode(inode, mode, rdev); + + return __logfs_create(dir, dentry, inode, NULL, 0); +} + +static int logfs_symlink(struct inode *dir, struct dentry *dentry, + const char *target) +{ + struct inode *inode; + size_t destlen = strlen(target) + 1; + + if (destlen > dir->i_sb->s_blocksize) + return -ENAMETOOLONG; + + inode = logfs_new_inode(dir, S_IFLNK | 0777); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + inode->i_op = &logfs_symlink_iops; + inode->i_mapping->a_ops = &logfs_reg_aops; + + return __logfs_create(dir, dentry, inode, target, destlen); +} + +static int logfs_permission(struct inode *inode, int mask) +{ + return generic_permission(inode, mask, NULL); +} + +static int logfs_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + struct inode *inode = old_dentry->d_inode; + + if (inode->i_nlink >= LOGFS_LINK_MAX) + return -EMLINK; + + inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; + atomic_inc(&inode->i_count); + 
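+	/* The extra reference taken above is handed to __logfs_create(); bump the link count for the new name. */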
inode->i_nlink++; + mark_inode_dirty_sync(inode); + + return __logfs_create(dir, dentry, inode, NULL, 0); +} + +static int logfs_get_dd(struct inode *dir, struct dentry *dentry, + struct logfs_disk_dentry *dd, loff_t *pos) +{ + struct page *page; + void *map; + + page = logfs_get_dd_page(dir, dentry); + if (IS_ERR(page)) + return PTR_ERR(page); + *pos = page->index; + map = kmap_atomic(page, KM_USER0); + memcpy(dd, map, sizeof(*dd)); + kunmap_atomic(map, KM_USER0); + page_cache_release(page); + return 0; +} + +static int logfs_delete_dd(struct inode *dir, loff_t pos) +{ + /* + * Getting called with pos somewhere beyond eof is either a goofup + * within this file or means someone maliciously edited the + * (crc-protected) journal. + */ + BUG_ON(beyond_eof(dir, pos)); + dir->i_ctime = dir->i_mtime = CURRENT_TIME; + log_dir(" Delete dentry (%lx, %llx)\n", dir->i_ino, pos); + return logfs_delete(dir, pos, NULL); +} + +/* + * Cross-directory rename, target does not exist. Just a little nasty. + * Create a new dentry in the target dir, then remove the old dentry, + * all the while taking care to remember our operation in the journal. + */ +static int logfs_rename_cross(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct logfs_super *super = logfs_super(old_dir->i_sb); + struct logfs_disk_dentry dd; + struct logfs_transaction *ta; + loff_t pos; + int err; + + /* 1. locate source dd */ + err = logfs_get_dd(old_dir, old_dentry, &dd, &pos); + if (err) + return err; + + ta = kzalloc(sizeof(*ta), GFP_KERNEL); + if (!ta) + return -ENOMEM; + + ta->state = CROSS_RENAME_1; + ta->dir = old_dir->i_ino; + ta->pos = pos; + + /* 2. write target dd */ + mutex_lock(&super->s_dirop_mutex); + logfs_add_transaction(new_dir, ta); + err = logfs_write_dir(new_dir, new_dentry, old_dentry->d_inode); + if (!err) + err = write_inode(new_dir); + + if (err) { + super->s_rename_dir = 0; + super->s_rename_pos = 0; + abort_transaction(new_dir, ta); + goto out; + } + + /* 3. remove source dd */ + ta->state = CROSS_RENAME_2; + logfs_add_transaction(old_dir, ta); + err = logfs_delete_dd(old_dir, pos); + if (!err) + err = write_inode(old_dir); + LOGFS_BUG_ON(err, old_dir->i_sb); +out: + mutex_unlock(&super->s_dirop_mutex); + return err; +} + +static int logfs_replace_inode(struct inode *dir, struct dentry *dentry, + struct logfs_disk_dentry *dd, struct inode *inode) +{ + loff_t pos; + int err; + + err = logfs_get_dd(dir, dentry, dd, &pos); + if (err) + return err; + dd->ino = cpu_to_be64(inode->i_ino); + dd->type = logfs_type(inode); + + err = write_dir(dir, dd, pos); + if (err) + return err; + log_dir("Replace dentry (%lx, %llx) %s -> %llx\n", dir->i_ino, pos, + dd->name, be64_to_cpu(dd->ino)); + return write_inode(dir); +} + +/* Target dentry exists - the worst case. We need to attach the source + * inode to the target dentry, then remove the orphaned target inode and + * source dentry. + */ +static int logfs_rename_target(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct logfs_super *super = logfs_super(old_dir->i_sb); + struct inode *old_inode = old_dentry->d_inode; + struct inode *new_inode = new_dentry->d_inode; + int isdir = S_ISDIR(old_inode->i_mode); + struct logfs_disk_dentry dd; + struct logfs_transaction *ta; + loff_t pos; + int err; + + BUG_ON(isdir != S_ISDIR(new_inode->i_mode)); + if (isdir) { + if (!logfs_empty_dir(new_inode)) + return -ENOTEMPTY; + } + + /* 1. 
locate source dd */ + err = logfs_get_dd(old_dir, old_dentry, &dd, &pos); + if (err) + return err; + + ta = kzalloc(sizeof(*ta), GFP_KERNEL); + if (!ta) + return -ENOMEM; + + ta->state = TARGET_RENAME_1; + ta->dir = old_dir->i_ino; + ta->pos = pos; + ta->ino = new_inode->i_ino; + + /* 2. attach source inode to target dd */ + mutex_lock(&super->s_dirop_mutex); + logfs_add_transaction(new_dir, ta); + err = logfs_replace_inode(new_dir, new_dentry, &dd, old_inode); + if (err) { + super->s_rename_dir = 0; + super->s_rename_pos = 0; + super->s_victim_ino = 0; + abort_transaction(new_dir, ta); + goto out; + } + + /* 3. remove source dd */ + ta->state = TARGET_RENAME_2; + logfs_add_transaction(old_dir, ta); + err = logfs_delete_dd(old_dir, pos); + if (!err) + err = write_inode(old_dir); + LOGFS_BUG_ON(err, old_dir->i_sb); + + /* 4. remove target inode */ + ta->state = TARGET_RENAME_3; + logfs_add_transaction(new_inode, ta); + err = logfs_remove_inode(new_inode); + +out: + mutex_unlock(&super->s_dirop_mutex); + return err; +} + +static int logfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + if (new_dentry->d_inode) + return logfs_rename_target(old_dir, old_dentry, + new_dir, new_dentry); + return logfs_rename_cross(old_dir, old_dentry, new_dir, new_dentry); +} + +/* No locking done here, as this is called before .get_sb() returns. */ +int logfs_replay_journal(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct inode *inode; + u64 ino, pos; + int err; + + if (super->s_victim_ino) { + /* delete victim inode */ + ino = super->s_victim_ino; + printk(KERN_INFO"LogFS: delete unmapped inode #%llx\n", ino); + inode = logfs_iget(sb, ino); + if (IS_ERR(inode)) + goto fail; + + LOGFS_BUG_ON(i_size_read(inode) > 0, sb); + super->s_victim_ino = 0; + err = logfs_remove_inode(inode); + iput(inode); + if (err) { + super->s_victim_ino = ino; + goto fail; + } + } + if (super->s_rename_dir) { + /* delete old dd from rename */ + ino = super->s_rename_dir; + pos = super->s_rename_pos; + printk(KERN_INFO"LogFS: delete unbacked dentry (%llx, %llx)\n", + ino, pos); + inode = logfs_iget(sb, ino); + if (IS_ERR(inode)) + goto fail; + + super->s_rename_dir = 0; + super->s_rename_pos = 0; + err = logfs_delete_dd(inode, pos); + iput(inode); + if (err) { + super->s_rename_dir = ino; + super->s_rename_pos = pos; + goto fail; + } + } + return 0; +fail: + LOGFS_BUG(sb); + return -EIO; +} + +const struct inode_operations logfs_symlink_iops = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, +}; + +const struct inode_operations logfs_dir_iops = { + .create = logfs_create, + .link = logfs_link, + .lookup = logfs_lookup, + .mkdir = logfs_mkdir, + .mknod = logfs_mknod, + .rename = logfs_rename, + .rmdir = logfs_rmdir, + .permission = logfs_permission, + .symlink = logfs_symlink, + .unlink = logfs_unlink, +}; +const struct file_operations logfs_dir_fops = { + .fsync = logfs_fsync, + .ioctl = logfs_ioctl, + .readdir = logfs_readdir, + .read = generic_read_dir, +}; diff --git a/fs/logfs/file.c b/fs/logfs/file.c new file mode 100644 index 00000000000..370f367a933 --- /dev/null +++ b/fs/logfs/file.c @@ -0,0 +1,263 @@ +/* + * fs/logfs/file.c - prepare_write, commit_write and friends + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel + */ +#include "logfs.h" +#include +#include + +static int logfs_write_begin(struct file *file, struct address_space *mapping, + 
loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + struct inode *inode = mapping->host; + struct page *page; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) + return -ENOMEM; + *pagep = page; + + if ((len == PAGE_CACHE_SIZE) || PageUptodate(page)) + return 0; + if ((pos & PAGE_CACHE_MASK) >= i_size_read(inode)) { + unsigned start = pos & (PAGE_CACHE_SIZE - 1); + unsigned end = start + len; + + /* Reading beyond i_size is simple: memset to zero */ + zero_user_segments(page, 0, start, end, PAGE_CACHE_SIZE); + return 0; + } + return logfs_readpage_nolock(page); +} + +static int logfs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, struct page *page, + void *fsdata) +{ + struct inode *inode = mapping->host; + pgoff_t index = page->index; + unsigned start = pos & (PAGE_CACHE_SIZE - 1); + unsigned end = start + copied; + int ret = 0; + + BUG_ON(PAGE_CACHE_SIZE != inode->i_sb->s_blocksize); + BUG_ON(page->index > I3_BLOCKS); + + if (copied < len) { + /* + * Short write of a non-initialized paged. Just tell userspace + * to retry the entire page. + */ + if (!PageUptodate(page)) { + copied = 0; + goto out; + } + } + if (copied == 0) + goto out; /* FIXME: do we need to update inode? */ + + if (i_size_read(inode) < (index << PAGE_CACHE_SHIFT) + end) { + i_size_write(inode, (index << PAGE_CACHE_SHIFT) + end); + mark_inode_dirty_sync(inode); + } + + SetPageUptodate(page); + if (!PageDirty(page)) { + if (!get_page_reserve(inode, page)) + __set_page_dirty_nobuffers(page); + else + ret = logfs_write_buf(inode, page, WF_LOCK); + } +out: + unlock_page(page); + page_cache_release(page); + return ret ? ret : copied; +} + +int logfs_readpage(struct file *file, struct page *page) +{ + int ret; + + ret = logfs_readpage_nolock(page); + unlock_page(page); + return ret; +} + +/* Clear the page's dirty flag in the radix tree. */ +/* TODO: mucking with PageWriteback is silly. Add a generic function to clear + * the dirty bit from the radix tree for filesystems that don't have to wait + * for page writeback to finish (i.e. any compressing filesystem). + */ +static void clear_radix_tree_dirty(struct page *page) +{ + BUG_ON(PagePrivate(page) || page->private); + set_page_writeback(page); + end_page_writeback(page); +} + +static int __logfs_writepage(struct page *page) +{ + struct inode *inode = page->mapping->host; + int err; + + err = logfs_write_buf(inode, page, WF_LOCK); + if (err) + set_page_dirty(page); + else + clear_radix_tree_dirty(page); + unlock_page(page); + return err; +} + +static int logfs_writepage(struct page *page, struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + loff_t i_size = i_size_read(inode); + pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; + unsigned offset; + u64 bix; + level_t level; + + log_file("logfs_writepage(%lx, %lx, %p)\n", inode->i_ino, page->index, + page); + + logfs_unpack_index(page->index, &bix, &level); + + /* Indirect blocks are never truncated */ + if (level != 0) + return __logfs_writepage(page); + + /* + * TODO: everything below is a near-verbatim copy of nobh_writepage(). + * The relevant bits should be factored out after logfs is merged. + */ + + /* Is the page fully inside i_size? */ + if (bix < end_index) + return __logfs_writepage(page); + + /* Is the page fully outside i_size? 
(truncate in progress) */ + offset = i_size & (PAGE_CACHE_SIZE-1); + if (bix > end_index || offset == 0) { + unlock_page(page); + return 0; /* don't care */ + } + + /* + * The page straddles i_size. It must be zeroed out on each and every + * writepage invokation because it may be mmapped. "A file is mapped + * in multiples of the page size. For a file that is not a multiple of + * the page size, the remaining memory is zeroed when mapped, and + * writes to that region are not written out to the file." + */ + zero_user_segment(page, offset, PAGE_CACHE_SIZE); + return __logfs_writepage(page); +} + +static void logfs_invalidatepage(struct page *page, unsigned long offset) +{ + move_page_to_btree(page); + BUG_ON(PagePrivate(page) || page->private); +} + +static int logfs_releasepage(struct page *page, gfp_t only_xfs_uses_this) +{ + return 0; /* None of these are easy to release */ +} + + +int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct logfs_inode *li = logfs_inode(inode); + unsigned int oldflags, flags; + int err; + + switch (cmd) { + case FS_IOC_GETFLAGS: + flags = li->li_flags & LOGFS_FL_USER_VISIBLE; + return put_user(flags, (int __user *)arg); + case FS_IOC_SETFLAGS: + if (IS_RDONLY(inode)) + return -EROFS; + + if (!is_owner_or_cap(inode)) + return -EACCES; + + err = get_user(flags, (int __user *)arg); + if (err) + return err; + + mutex_lock(&inode->i_mutex); + oldflags = li->li_flags; + flags &= LOGFS_FL_USER_MODIFIABLE; + flags |= oldflags & ~LOGFS_FL_USER_MODIFIABLE; + li->li_flags = flags; + mutex_unlock(&inode->i_mutex); + + inode->i_ctime = CURRENT_TIME; + mark_inode_dirty_sync(inode); + return 0; + + default: + return -ENOTTY; + } +} + +int logfs_fsync(struct file *file, struct dentry *dentry, int datasync) +{ + struct super_block *sb = dentry->d_inode->i_sb; + struct logfs_super *super = logfs_super(sb); + + /* FIXME: write anchor */ + super->s_devops->sync(sb); + return 0; +} + +static int logfs_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + int err = 0; + + if (attr->ia_valid & ATTR_SIZE) + err = logfs_truncate(inode, attr->ia_size); + attr->ia_valid &= ~ATTR_SIZE; + + if (!err) + err = inode_change_ok(inode, attr); + if (!err) + err = inode_setattr(inode, attr); + return err; +} + +const struct inode_operations logfs_reg_iops = { + .setattr = logfs_setattr, +}; + +const struct file_operations logfs_reg_fops = { + .aio_read = generic_file_aio_read, + .aio_write = generic_file_aio_write, + .fsync = logfs_fsync, + .ioctl = logfs_ioctl, + .llseek = generic_file_llseek, + .mmap = generic_file_readonly_mmap, + .open = generic_file_open, + .read = do_sync_read, + .write = do_sync_write, +}; + +const struct address_space_operations logfs_reg_aops = { + .invalidatepage = logfs_invalidatepage, + .readpage = logfs_readpage, + .releasepage = logfs_releasepage, + .set_page_dirty = __set_page_dirty_nobuffers, + .writepage = logfs_writepage, + .writepages = generic_writepages, + .write_begin = logfs_write_begin, + .write_end = logfs_write_end, +}; diff --git a/fs/logfs/gc.c b/fs/logfs/gc.c new file mode 100644 index 00000000000..b3656c44190 --- /dev/null +++ b/fs/logfs/gc.c @@ -0,0 +1,730 @@ +/* + * fs/logfs/gc.c - garbage collection code + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel + */ +#include "logfs.h" +#include + +/* + * Wear leveling needs to kick in when the difference between low erase + * counts and high 
erase counts gets too big. A good value for "too big" + * may be somewhat below 10% of maximum erase count for the device. + * Why not 397, to pick a nice round number with no specific meaning? :) + * + * WL_RATELIMIT is the minimum time between two wear level events. A huge + * number of segments may fulfil the requirements for wear leveling at the + * same time. If that happens we don't want to cause a latency from hell, + * but just gently pick one segment every so often and minimize overhead. + */ +#define WL_DELTA 397 +#define WL_RATELIMIT 100 +#define MAX_OBJ_ALIASES 2600 +#define SCAN_RATIO 512 /* number of scanned segments per gc'd segment */ +#define LIST_SIZE 64 /* base size of candidate lists */ +#define SCAN_ROUNDS 128 /* maximum number of complete medium scans */ +#define SCAN_ROUNDS_HIGH 4 /* maximum number of higher-level scans */ + +static int no_free_segments(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + + return super->s_free_list.count; +} + +/* journal has distance -1, top-most ifile layer distance 0 */ +static u8 root_distance(struct super_block *sb, gc_level_t __gc_level) +{ + struct logfs_super *super = logfs_super(sb); + u8 gc_level = (__force u8)__gc_level; + + switch (gc_level) { + case 0: /* fall through */ + case 1: /* fall through */ + case 2: /* fall through */ + case 3: + /* file data or indirect blocks */ + return super->s_ifile_levels + super->s_iblock_levels - gc_level; + case 6: /* fall through */ + case 7: /* fall through */ + case 8: /* fall through */ + case 9: + /* inode file data or indirect blocks */ + return super->s_ifile_levels - (gc_level - 6); + default: + printk(KERN_ERR"LOGFS: segment of unknown level %x found\n", + gc_level); + WARN_ON(1); + return super->s_ifile_levels + super->s_iblock_levels; + } +} + +static int segment_is_reserved(struct super_block *sb, u32 segno) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_area *area; + void *reserved; + int i; + + /* Some segments are reserved. Just pretend they were all valid */ + reserved = btree_lookup32(&super->s_reserved_segments, segno); + if (reserved) + return 1; + + /* Currently open segments */ + for_each_area(i) { + area = super->s_area[i]; + if (area->a_is_open && area->a_segno == segno) + return 1; + } + + return 0; +} + +static void logfs_mark_segment_bad(struct super_block *sb, u32 segno) +{ + BUG(); +} + +/* + * Returns the bytes consumed by valid objects in this segment. Object headers + * are counted, the segment header is not. 
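+ * Bad or reserved segments are reported as the special value RESERVED so
+ * that callers (e.g. scan_segment) can skip them entirely.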
+ */ +static u32 logfs_valid_bytes(struct super_block *sb, u32 segno, u32 *ec, + gc_level_t *gc_level) +{ + struct logfs_segment_entry se; + u32 ec_level; + + logfs_get_segment_entry(sb, segno, &se); + if (se.ec_level == cpu_to_be32(BADSEG) || + se.valid == cpu_to_be32(RESERVED)) + return RESERVED; + + ec_level = be32_to_cpu(se.ec_level); + *ec = ec_level >> 4; + *gc_level = GC_LEVEL(ec_level & 0xf); + return be32_to_cpu(se.valid); +} + +static void logfs_cleanse_block(struct super_block *sb, u64 ofs, u64 ino, + u64 bix, gc_level_t gc_level) +{ + struct inode *inode; + int err, cookie; + + inode = logfs_safe_iget(sb, ino, &cookie); + err = logfs_rewrite_block(inode, bix, ofs, gc_level, 0); + BUG_ON(err); + logfs_safe_iput(inode, cookie); +} + +static u32 logfs_gc_segment(struct super_block *sb, u32 segno, u8 dist) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_segment_header sh; + struct logfs_object_header oh; + u64 ofs, ino, bix; + u32 seg_ofs, logical_segno, cleaned = 0; + int err, len, valid; + gc_level_t gc_level; + + LOGFS_BUG_ON(segment_is_reserved(sb, segno), sb); + + btree_insert32(&super->s_reserved_segments, segno, (void *)1, GFP_NOFS); + err = wbuf_read(sb, dev_ofs(sb, segno, 0), sizeof(sh), &sh); + BUG_ON(err); + gc_level = GC_LEVEL(sh.level); + logical_segno = be32_to_cpu(sh.segno); + if (sh.crc != logfs_crc32(&sh, sizeof(sh), 4)) { + logfs_mark_segment_bad(sb, segno); + cleaned = -1; + goto out; + } + + for (seg_ofs = LOGFS_SEGMENT_HEADERSIZE; + seg_ofs + sizeof(oh) < super->s_segsize; ) { + ofs = dev_ofs(sb, logical_segno, seg_ofs); + err = wbuf_read(sb, dev_ofs(sb, segno, seg_ofs), sizeof(oh), + &oh); + BUG_ON(err); + + if (!memchr_inv(&oh, 0xff, sizeof(oh))) + break; + + if (oh.crc != logfs_crc32(&oh, sizeof(oh) - 4, 4)) { + logfs_mark_segment_bad(sb, segno); + cleaned = super->s_segsize - 1; + goto out; + } + + ino = be64_to_cpu(oh.ino); + bix = be64_to_cpu(oh.bix); + len = sizeof(oh) + be16_to_cpu(oh.len); + valid = logfs_is_valid_block(sb, ofs, ino, bix, gc_level); + if (valid == 1) { + logfs_cleanse_block(sb, ofs, ino, bix, gc_level); + cleaned += len; + } else if (valid == 2) { + /* Will be invalid upon journal commit */ + cleaned += len; + } + seg_ofs += len; + } +out: + btree_remove32(&super->s_reserved_segments, segno); + return cleaned; +} + +static struct gc_candidate *add_list(struct gc_candidate *cand, + struct candidate_list *list) +{ + struct rb_node **p = &list->rb_tree.rb_node; + struct rb_node *parent = NULL; + struct gc_candidate *cur; + int comp; + + cand->list = list; + while (*p) { + parent = *p; + cur = rb_entry(parent, struct gc_candidate, rb_node); + + if (list->sort_by_ec) + comp = cand->erase_count < cur->erase_count; + else + comp = cand->valid < cur->valid; + + if (comp) + p = &parent->rb_left; + else + p = &parent->rb_right; + } + rb_link_node(&cand->rb_node, parent, p); + rb_insert_color(&cand->rb_node, &list->rb_tree); + + if (list->count <= list->maxcount) { + list->count++; + return NULL; + } + cand = rb_entry(rb_last(&list->rb_tree), struct gc_candidate, rb_node); + rb_erase(&cand->rb_node, &list->rb_tree); + cand->list = NULL; + return cand; +} + +static void remove_from_list(struct gc_candidate *cand) +{ + struct candidate_list *list = cand->list; + + rb_erase(&cand->rb_node, &list->rb_tree); + list->count--; +} + +static void free_candidate(struct super_block *sb, struct gc_candidate *cand) +{ + struct logfs_super *super = logfs_super(sb); + + btree_remove32(&super->s_cand_tree, cand->segno); + kfree(cand); +} + +u32 
get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec) +{ + struct gc_candidate *cand; + u32 segno; + + BUG_ON(list->count == 0); + + cand = rb_entry(rb_first(&list->rb_tree), struct gc_candidate, rb_node); + remove_from_list(cand); + segno = cand->segno; + if (ec) + *ec = cand->erase_count; + free_candidate(sb, cand); + return segno; +} + +/* + * We have several lists to manage segments with. The reserve_list is used to + * deal with bad blocks. We try to keep the best (lowest ec) segments on this + * list. + * The free_list contains free segments for normal usage. It usually gets the + * second pick after the reserve_list. But when the free_list is running short + * it is more important to keep the free_list full than to keep a reserve. + * + * Segments that are not free are put onto a per-level low_list. If we have + * to run garbage collection, we pick a candidate from there. All segments on + * those lists should have at least some free space so GC will make progress. + * + * And last we have the ec_list, which is used to pick segments for wear + * leveling. + * + * If all appropriate lists are full, we simply free the candidate and forget + * about that segment for a while. We have better candidates for each purpose. + */ +static void __add_candidate(struct super_block *sb, struct gc_candidate *cand) +{ + struct logfs_super *super = logfs_super(sb); + u32 full = super->s_segsize - LOGFS_SEGMENT_RESERVE; + + if (cand->valid == 0) { + /* 100% free segments */ + log_gc_noisy("add reserve segment %x (ec %x) at %llx\n", + cand->segno, cand->erase_count, + dev_ofs(sb, cand->segno, 0)); + cand = add_list(cand, &super->s_reserve_list); + if (cand) { + log_gc_noisy("add free segment %x (ec %x) at %llx\n", + cand->segno, cand->erase_count, + dev_ofs(sb, cand->segno, 0)); + cand = add_list(cand, &super->s_free_list); + } + } else { + /* good candidates for Garbage Collection */ + if (cand->valid < full) + cand = add_list(cand, &super->s_low_list[cand->dist]); + /* good candidates for wear leveling, + * segments that were recently written get ignored */ + if (cand) + cand = add_list(cand, &super->s_ec_list); + } + if (cand) + free_candidate(sb, cand); +} + +static int add_candidate(struct super_block *sb, u32 segno, u32 valid, u32 ec, + u8 dist) +{ + struct logfs_super *super = logfs_super(sb); + struct gc_candidate *cand; + + cand = kmalloc(sizeof(*cand), GFP_NOFS); + if (!cand) + return -ENOMEM; + + cand->segno = segno; + cand->valid = valid; + cand->erase_count = ec; + cand->dist = dist; + + btree_insert32(&super->s_cand_tree, segno, cand, GFP_NOFS); + __add_candidate(sb, cand); + return 0; +} + +static void remove_segment_from_lists(struct super_block *sb, u32 segno) +{ + struct logfs_super *super = logfs_super(sb); + struct gc_candidate *cand; + + cand = btree_lookup32(&super->s_cand_tree, segno); + if (cand) { + remove_from_list(cand); + free_candidate(sb, cand); + } +} + +static void scan_segment(struct super_block *sb, u32 segno) +{ + u32 valid, ec = 0; + gc_level_t gc_level = 0; + u8 dist; + + if (segment_is_reserved(sb, segno)) + return; + + remove_segment_from_lists(sb, segno); + valid = logfs_valid_bytes(sb, segno, &ec, &gc_level); + if (valid == RESERVED) + return; + + dist = root_distance(sb, gc_level); + add_candidate(sb, segno, valid, ec, dist); +} + +static struct gc_candidate *first_in_list(struct candidate_list *list) +{ + if (list->count == 0) + return NULL; + return rb_entry(rb_first(&list->rb_tree), struct gc_candidate, rb_node); +} + +/* + * Find the 
best segment for garbage collection. Main criterion is + * the segment requiring the least effort to clean. Secondary + * criterion is to GC on the lowest level available. + * + * So we search the least effort segment on the lowest level first, + * then move up and pick another segment iff is requires significantly + * less effort. Hence the LOGFS_MAX_OBJECTSIZE in the comparison. + */ +static struct gc_candidate *get_candidate(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + int i, max_dist; + struct gc_candidate *cand = NULL, *this; + + max_dist = min(no_free_segments(sb), LOGFS_NO_AREAS); + + for (i = max_dist; i >= 0; i--) { + this = first_in_list(&super->s_low_list[i]); + if (!this) + continue; + if (!cand) + cand = this; + if (this->valid + LOGFS_MAX_OBJECTSIZE <= cand->valid) + cand = this; + } + return cand; +} + +static int __logfs_gc_once(struct super_block *sb, struct gc_candidate *cand) +{ + struct logfs_super *super = logfs_super(sb); + gc_level_t gc_level; + u32 cleaned, valid, segno, ec; + u8 dist; + + if (!cand) { + log_gc("GC attempted, but no candidate found\n"); + return 0; + } + + segno = cand->segno; + dist = cand->dist; + valid = logfs_valid_bytes(sb, segno, &ec, &gc_level); + free_candidate(sb, cand); + log_gc("GC segment #%02x at %llx, %x required, %x free, %x valid, %llx free\n", + segno, (u64)segno << super->s_segshift, + dist, no_free_segments(sb), valid, + super->s_free_bytes); + cleaned = logfs_gc_segment(sb, segno, dist); + log_gc("GC segment #%02x complete - now %x valid\n", segno, + valid - cleaned); + BUG_ON(cleaned != valid); + return 1; +} + +static int logfs_gc_once(struct super_block *sb) +{ + struct gc_candidate *cand; + + cand = get_candidate(sb); + if (cand) + remove_from_list(cand); + return __logfs_gc_once(sb, cand); +} + +/* returns 1 if a wrap occurs, 0 otherwise */ +static int logfs_scan_some(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + u32 segno; + int i, ret = 0; + + segno = super->s_sweeper; + for (i = SCAN_RATIO; i > 0; i--) { + segno++; + if (segno >= super->s_no_segs) { + segno = 0; + ret = 1; + /* Break out of the loop. We want to read a single + * block from the segment size on next invocation if + * SCAN_RATIO is set to match block size + */ + break; + } + + scan_segment(sb, segno); + } + super->s_sweeper = segno; + return ret; +} + +/* + * In principle, this function should loop forever, looking for GC candidates + * and moving data. LogFS is designed in such a way that this loop is + * guaranteed to terminate. + * + * Limiting the loop to some iterations serves purely to catch cases when + * these guarantees have failed. An actual endless loop is an obvious bug + * and should be reported as such. 
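+ * SCAN_ROUNDS bounds the number of complete medium scans; once the loop
+ * gives up, the LOGFS_BUG() below it fires.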
+ */ +static void __logfs_gc_pass(struct super_block *sb, int target) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_block *block; + int round, progress, last_progress = 0; + + if (no_free_segments(sb) >= target && + super->s_no_object_aliases < MAX_OBJ_ALIASES) + return; + + log_gc("__logfs_gc_pass(%x)\n", target); + for (round = 0; round < SCAN_ROUNDS; ) { + if (no_free_segments(sb) >= target) + goto write_alias; + + /* Sync in-memory state with on-medium state in case they + * diverged */ + logfs_write_anchor(super->s_master_inode); + round += logfs_scan_some(sb); + if (no_free_segments(sb) >= target) + goto write_alias; + progress = logfs_gc_once(sb); + if (progress) + last_progress = round; + else if (round - last_progress > 2) + break; + continue; + + /* + * The goto logic is nasty, I just don't know a better way to + * code it. GC is supposed to ensure two things: + * 1. Enough free segments are available. + * 2. The number of aliases is bounded. + * When 1. is achieved, we take a look at 2. and write back + * some alias-containing blocks, if necessary. However, after + * each such write we need to go back to 1., as writes can + * consume free segments. + */ +write_alias: + if (super->s_no_object_aliases < MAX_OBJ_ALIASES) + return; + if (list_empty(&super->s_object_alias)) { + /* All aliases are still in btree */ + return; + } + log_gc("Write back one alias\n"); + block = list_entry(super->s_object_alias.next, + struct logfs_block, alias_list); + block->ops->write_block(block); + /* + * To round off the nasty goto logic, we reset round here. It + * is a safety-net for GC not making any progress and limited + * to something reasonably small. If incremented it for every + * single alias, the loop could terminate rather quickly. + */ + round = 0; + } + LOGFS_BUG(sb); +} + +static int wl_ratelimit(struct super_block *sb, u64 *next_event) +{ + struct logfs_super *super = logfs_super(sb); + + if (*next_event < super->s_gec) { + *next_event = super->s_gec + WL_RATELIMIT; + return 0; + } + return 1; +} + +static void logfs_wl_pass(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct gc_candidate *wl_cand, *free_cand; + + if (wl_ratelimit(sb, &super->s_wl_gec_ostore)) + return; + + wl_cand = first_in_list(&super->s_ec_list); + if (!wl_cand) + return; + free_cand = first_in_list(&super->s_free_list); + if (!free_cand) + return; + + if (wl_cand->erase_count < free_cand->erase_count + WL_DELTA) { + remove_from_list(wl_cand); + __logfs_gc_once(sb, wl_cand); + } +} + +/* + * The journal needs wear leveling as well. But moving the journal is an + * expensive operation so we try to avoid it as much as possible. And if we + * have to do it, we move the whole journal, not individual segments. + * + * Ratelimiting is not strictly necessary here, it mainly serves to avoid the + * calculations. First we check whether moving the journal would be a + * significant improvement. That means that a) the current journal segments + * have more wear than the future journal segments and b) the current journal + * segments have more wear than normal ostore segments. + * Rationale for b) is that we don't have to move the journal if it is aging + * less than the ostore, even if the reserve segments age even less (they are + * excluded from wear leveling, after all). + * Next we check that the superblocks have less wear than the journal. Since + * moving the journal requires writing the superblocks, we have to protect the + * superblocks even more than the journal. 
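+ * The erase counts of both superblock segments are therefore folded into
+ * max_reserve_ec before the comparison.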
+ * + * Also we double the acceptable wear difference, compared to ostore wear + * leveling. Journal data is read and rewritten rapidly, comparatively. So + * soft errors have much less time to accumulate and we allow the journal to + * be a bit worse than the ostore. + */ +static void logfs_journal_wl_pass(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct gc_candidate *cand; + u32 min_journal_ec = -1, max_reserve_ec = 0; + int i; + + if (wl_ratelimit(sb, &super->s_wl_gec_journal)) + return; + + if (super->s_reserve_list.count < super->s_no_journal_segs) { + /* Reserve is not full enough to move complete journal */ + return; + } + + journal_for_each(i) + if (super->s_journal_seg[i]) + min_journal_ec = min(min_journal_ec, + super->s_journal_ec[i]); + cand = rb_entry(rb_first(&super->s_free_list.rb_tree), + struct gc_candidate, rb_node); + max_reserve_ec = cand->erase_count; + for (i = 0; i < 2; i++) { + struct logfs_segment_entry se; + u32 segno = seg_no(sb, super->s_sb_ofs[i]); + u32 ec; + + logfs_get_segment_entry(sb, segno, &se); + ec = be32_to_cpu(se.ec_level) >> 4; + max_reserve_ec = max(max_reserve_ec, ec); + } + + if (min_journal_ec > max_reserve_ec + 2 * WL_DELTA) { + do_logfs_journal_wl_pass(sb); + } +} + +void logfs_gc_pass(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + + //BUG_ON(mutex_trylock(&logfs_super(sb)->s_w_mutex)); + /* Write journal before free space is getting saturated with dirty + * objects. + */ + if (super->s_dirty_used_bytes + super->s_dirty_free_bytes + + LOGFS_MAX_OBJECTSIZE >= super->s_free_bytes) + logfs_write_anchor(super->s_master_inode); + __logfs_gc_pass(sb, logfs_super(sb)->s_total_levels); + logfs_wl_pass(sb); + logfs_journal_wl_pass(sb); +} + +static int check_area(struct super_block *sb, int i) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_area *area = super->s_area[i]; + struct logfs_object_header oh; + u32 segno = area->a_segno; + u32 ofs = area->a_used_bytes; + __be32 crc; + int err; + + if (!area->a_is_open) + return 0; + + for (ofs = area->a_used_bytes; + ofs <= super->s_segsize - sizeof(oh); + ofs += (u32)be16_to_cpu(oh.len) + sizeof(oh)) { + err = wbuf_read(sb, dev_ofs(sb, segno, ofs), sizeof(oh), &oh); + if (err) + return err; + + if (!memchr_inv(&oh, 0xff, sizeof(oh))) + break; + + crc = logfs_crc32(&oh, sizeof(oh) - 4, 4); + if (crc != oh.crc) { + printk(KERN_INFO "interrupted header at %llx\n", + dev_ofs(sb, segno, ofs)); + return 0; + } + } + if (ofs != area->a_used_bytes) { + printk(KERN_INFO "%x bytes unaccounted data found at %llx\n", + ofs - area->a_used_bytes, + dev_ofs(sb, segno, area->a_used_bytes)); + area->a_used_bytes = ofs; + } + return 0; +} + +int logfs_check_areas(struct super_block *sb) +{ + int i, err; + + for_each_area(i) { + err = check_area(sb, i); + if (err) + return err; + } + return 0; +} + +static void logfs_init_candlist(struct candidate_list *list, int maxcount, + int sort_by_ec) +{ + list->count = 0; + list->maxcount = maxcount; + list->sort_by_ec = sort_by_ec; + list->rb_tree = RB_ROOT; +} + +int logfs_init_gc(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + int i; + + btree_init_mempool32(&super->s_cand_tree, super->s_btree_pool); + logfs_init_candlist(&super->s_free_list, LIST_SIZE + SCAN_RATIO, 1); + logfs_init_candlist(&super->s_reserve_list, + super->s_bad_seg_reserve, 1); + for_each_area(i) + logfs_init_candlist(&super->s_low_list[i], LIST_SIZE, 0); + logfs_init_candlist(&super->s_ec_list, LIST_SIZE, 
1); + return 0; +} + +static void logfs_cleanup_list(struct super_block *sb, + struct candidate_list *list) +{ + struct gc_candidate *cand; + + while (list->count) { + cand = rb_entry(list->rb_tree.rb_node, struct gc_candidate, + rb_node); + remove_from_list(cand); + free_candidate(sb, cand); + } + BUG_ON(list->rb_tree.rb_node); +} + +void logfs_cleanup_gc(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + int i; + + if (!super->s_free_list.count) + return; + + /* + * FIXME: The btree may still contain a single empty node. So we + * call the grim visitor to clean up that mess. Btree code should + * do it for us, really. + */ + btree_grim_visitor32(&super->s_cand_tree, 0, NULL); + logfs_cleanup_list(sb, &super->s_free_list); + logfs_cleanup_list(sb, &super->s_reserve_list); + for_each_area(i) + logfs_cleanup_list(sb, &super->s_low_list[i]); + logfs_cleanup_list(sb, &super->s_ec_list); +} diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c new file mode 100644 index 00000000000..6d08b376264 --- /dev/null +++ b/fs/logfs/inode.c @@ -0,0 +1,417 @@ +/* + * fs/logfs/inode.c - inode handling code + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel + */ +#include "logfs.h" +#include +#include + +/* + * How soon to reuse old inode numbers? LogFS doesn't store deleted inodes + * on the medium. It therefore also lacks a method to store the previous + * generation number for deleted inodes. Instead a single generation number + * is stored which will be used for new inodes. Being just a 32bit counter, + * this can obvious wrap relatively quickly. So we only reuse inodes if we + * know that a fair number of inodes can be created before we have to increment + * the generation again - effectively adding some bits to the counter. + * But being too aggressive here means we keep a very large and very sparse + * inode file, wasting space on indirect blocks. + * So what is a good value? Beats me. 64k seems moderately bad on both + * fronts, so let's use that for now... + * + * NFS sucks, as everyone already knows. + */ +#define INOS_PER_WRAP (0x10000) + +/* + * Logfs' requirement to read inodes for garbage collection makes life a bit + * harder. GC may have to read inodes that are in I_FREEING state, when they + * are being written out - and waiting for GC to make progress, naturally. + * + * So we cannot just call iget() or some variant of it, but first have to check + * wether the inode in question might be in I_FREEING state. Therefore we + * maintain our own per-sb list of "almost deleted" inodes and check against + * that list first. Normally this should be at most 1-2 entries long. + * + * Also, inodes have logfs-specific reference counting on top of what the vfs + * does. When .destroy_inode is called, normally the reference count will drop + * to zero and the inode gets deleted. But if GC accessed the inode, its + * refcount will remain nonzero and final deletion will have to wait. 
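+ * (The extra reference is taken in logfs_safe_iget() and dropped again via
+ * logfs_safe_iput() -> logfs_destroy_inode().)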
+ * + * As a result we have two sets of functions to get/put inodes: + * logfs_safe_iget/logfs_safe_iput - safe to call from GC context + * logfs_iget/iput - normal version + */ +static struct kmem_cache *logfs_inode_cache; + +static DEFINE_SPINLOCK(logfs_inode_lock); + +static void logfs_inode_setops(struct inode *inode) +{ + switch (inode->i_mode & S_IFMT) { + case S_IFDIR: + inode->i_op = &logfs_dir_iops; + inode->i_fop = &logfs_dir_fops; + inode->i_mapping->a_ops = &logfs_reg_aops; + break; + case S_IFREG: + inode->i_op = &logfs_reg_iops; + inode->i_fop = &logfs_reg_fops; + inode->i_mapping->a_ops = &logfs_reg_aops; + break; + case S_IFLNK: + inode->i_op = &logfs_symlink_iops; + inode->i_mapping->a_ops = &logfs_reg_aops; + break; + case S_IFSOCK: /* fall through */ + case S_IFBLK: /* fall through */ + case S_IFCHR: /* fall through */ + case S_IFIFO: + init_special_inode(inode, inode->i_mode, inode->i_rdev); + break; + default: + BUG(); + } +} + +static struct inode *__logfs_iget(struct super_block *sb, ino_t ino) +{ + struct inode *inode = iget_locked(sb, ino); + int err; + + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + err = logfs_read_inode(inode); + if (err || inode->i_nlink == 0) { + /* inode->i_nlink == 0 can be true when called from + * block validator */ + /* set i_nlink to 0 to prevent caching */ + inode->i_nlink = 0; + logfs_inode(inode)->li_flags |= LOGFS_IF_ZOMBIE; + iget_failed(inode); + if (!err) + err = -ENOENT; + return ERR_PTR(err); + } + + logfs_inode_setops(inode); + unlock_new_inode(inode); + return inode; +} + +struct inode *logfs_iget(struct super_block *sb, ino_t ino) +{ + BUG_ON(ino == LOGFS_INO_MASTER); + BUG_ON(ino == LOGFS_INO_SEGFILE); + return __logfs_iget(sb, ino); +} + +/* + * is_cached is set to 1 if we hand out a cached inode, 0 otherwise. 
+ * this allows logfs_iput to do the right thing later + */ +struct inode *logfs_safe_iget(struct super_block *sb, ino_t ino, int *is_cached) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_inode *li; + + if (ino == LOGFS_INO_MASTER) + return super->s_master_inode; + if (ino == LOGFS_INO_SEGFILE) + return super->s_segfile_inode; + + spin_lock(&logfs_inode_lock); + list_for_each_entry(li, &super->s_freeing_list, li_freeing_list) + if (li->vfs_inode.i_ino == ino) { + li->li_refcount++; + spin_unlock(&logfs_inode_lock); + *is_cached = 1; + return &li->vfs_inode; + } + spin_unlock(&logfs_inode_lock); + + *is_cached = 0; + return __logfs_iget(sb, ino); +} + +static void __logfs_destroy_inode(struct inode *inode) +{ + struct logfs_inode *li = logfs_inode(inode); + + BUG_ON(li->li_block); + list_del(&li->li_freeing_list); + kmem_cache_free(logfs_inode_cache, li); +} + +static void logfs_destroy_inode(struct inode *inode) +{ + struct logfs_inode *li = logfs_inode(inode); + + BUG_ON(list_empty(&li->li_freeing_list)); + spin_lock(&logfs_inode_lock); + li->li_refcount--; + if (li->li_refcount == 0) + __logfs_destroy_inode(inode); + spin_unlock(&logfs_inode_lock); +} + +void logfs_safe_iput(struct inode *inode, int is_cached) +{ + if (inode->i_ino == LOGFS_INO_MASTER) + return; + if (inode->i_ino == LOGFS_INO_SEGFILE) + return; + + if (is_cached) { + logfs_destroy_inode(inode); + return; + } + + iput(inode); +} + +static void logfs_init_inode(struct super_block *sb, struct inode *inode) +{ + struct logfs_inode *li = logfs_inode(inode); + int i; + + li->li_flags = 0; + li->li_height = 0; + li->li_used_bytes = 0; + li->li_block = NULL; + inode->i_uid = 0; + inode->i_gid = 0; + inode->i_size = 0; + inode->i_blocks = 0; + inode->i_ctime = CURRENT_TIME; + inode->i_mtime = CURRENT_TIME; + inode->i_nlink = 1; + INIT_LIST_HEAD(&li->li_freeing_list); + + for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++) + li->li_data[i] = 0; + + return; +} + +static struct inode *logfs_alloc_inode(struct super_block *sb) +{ + struct logfs_inode *li; + + li = kmem_cache_alloc(logfs_inode_cache, GFP_NOFS); + if (!li) + return NULL; + logfs_init_inode(sb, &li->vfs_inode); + return &li->vfs_inode; +} + +/* + * In logfs inodes are written to an inode file. The inode file, like any + * other file, is managed with a inode. The inode file's inode, aka master + * inode, requires special handling in several respects. First, it cannot be + * written to the inode file, so it is stored in the journal instead. + * + * Secondly, this inode cannot be written back and destroyed before all other + * inodes have been written. The ordering is important. Linux' VFS is happily + * unaware of the ordering constraint and would ordinarily destroy the master + * inode at umount time while other inodes are still in use and dirty. Not + * good. + * + * So logfs makes sure the master inode is not written until all other inodes + * have been destroyed. Sadly, this method has another side-effect. The VFS + * will notice one remaining inode and print a frightening warning message. + * Worse, it is impossible to judge whether such a warning was caused by the + * master inode or any other inodes have leaked as well. + * + * Our attempt of solving this is with logfs_new_meta_inode() below. Its + * purpose is to create a new inode that will not trigger the warning if such + * an inode is still in use. An ugly hack, no doubt. Suggections for + * improvement are welcome. 
+ */ +struct inode *logfs_new_meta_inode(struct super_block *sb, u64 ino) +{ + struct inode *inode; + + inode = logfs_alloc_inode(sb); + if (!inode) + return ERR_PTR(-ENOMEM); + + inode->i_mode = S_IFREG; + inode->i_ino = ino; + inode->i_sb = sb; + + /* This is a blatant copy of alloc_inode code. We'd need alloc_inode + * to be nonstatic, alas. */ + { + struct address_space * const mapping = &inode->i_data; + + mapping->a_ops = &logfs_reg_aops; + mapping->host = inode; + mapping->flags = 0; + mapping_set_gfp_mask(mapping, GFP_NOFS); + mapping->assoc_mapping = NULL; + mapping->backing_dev_info = &default_backing_dev_info; + inode->i_mapping = mapping; + inode->i_nlink = 1; + } + + return inode; +} + +struct inode *logfs_read_meta_inode(struct super_block *sb, u64 ino) +{ + struct inode *inode; + int err; + + inode = logfs_new_meta_inode(sb, ino); + if (IS_ERR(inode)) + return inode; + + err = logfs_read_inode(inode); + if (err) { + destroy_meta_inode(inode); + return ERR_PTR(err); + } + logfs_inode_setops(inode); + return inode; +} + +static int logfs_write_inode(struct inode *inode, int do_sync) +{ + int ret; + long flags = WF_LOCK; + + /* Can only happen if creat() failed. Safe to skip. */ + if (logfs_inode(inode)->li_flags & LOGFS_IF_STILLBORN) + return 0; + + ret = __logfs_write_inode(inode, flags); + LOGFS_BUG_ON(ret, inode->i_sb); + return ret; +} + +void destroy_meta_inode(struct inode *inode) +{ + if (inode) { + if (inode->i_data.nrpages) + truncate_inode_pages(&inode->i_data, 0); + logfs_clear_inode(inode); + kmem_cache_free(logfs_inode_cache, logfs_inode(inode)); + } +} + +/* called with inode_lock held */ +static void logfs_drop_inode(struct inode *inode) +{ + struct logfs_super *super = logfs_super(inode->i_sb); + struct logfs_inode *li = logfs_inode(inode); + + spin_lock(&logfs_inode_lock); + list_move(&li->li_freeing_list, &super->s_freeing_list); + spin_unlock(&logfs_inode_lock); + generic_drop_inode(inode); +} + +static void logfs_set_ino_generation(struct super_block *sb, + struct inode *inode) +{ + struct logfs_super *super = logfs_super(sb); + u64 ino; + + mutex_lock(&super->s_journal_mutex); + ino = logfs_seek_hole(super->s_master_inode, super->s_last_ino); + super->s_last_ino = ino; + super->s_inos_till_wrap--; + if (super->s_inos_till_wrap < 0) { + super->s_last_ino = LOGFS_RESERVED_INOS; + super->s_generation++; + super->s_inos_till_wrap = INOS_PER_WRAP; + } + inode->i_ino = ino; + inode->i_generation = super->s_generation; + mutex_unlock(&super->s_journal_mutex); +} + +struct inode *logfs_new_inode(struct inode *dir, int mode) +{ + struct super_block *sb = dir->i_sb; + struct inode *inode; + + inode = new_inode(sb); + if (!inode) + return ERR_PTR(-ENOMEM); + + logfs_init_inode(sb, inode); + + /* inherit parent flags */ + logfs_inode(inode)->li_flags |= + logfs_inode(dir)->li_flags & LOGFS_FL_INHERITED; + + inode->i_mode = mode; + logfs_set_ino_generation(sb, inode); + + inode->i_uid = current_fsuid(); + inode->i_gid = current_fsgid(); + if (dir->i_mode & S_ISGID) { + inode->i_gid = dir->i_gid; + if (S_ISDIR(mode)) + inode->i_mode |= S_ISGID; + } + + logfs_inode_setops(inode); + insert_inode_hash(inode); + + return inode; +} + +static void logfs_init_once(void *_li) +{ + struct logfs_inode *li = _li; + int i; + + li->li_flags = 0; + li->li_used_bytes = 0; + li->li_refcount = 1; + for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++) + li->li_data[i] = 0; + inode_init_once(&li->vfs_inode); +} + +static int logfs_sync_fs(struct super_block *sb, int wait) +{ + /* FIXME: write 
anchor */ + logfs_super(sb)->s_devops->sync(sb); + return 0; +} + +const struct super_operations logfs_super_operations = { + .alloc_inode = logfs_alloc_inode, + .clear_inode = logfs_clear_inode, + .delete_inode = logfs_delete_inode, + .destroy_inode = logfs_destroy_inode, + .drop_inode = logfs_drop_inode, + .write_inode = logfs_write_inode, + .statfs = logfs_statfs, + .sync_fs = logfs_sync_fs, +}; + +int logfs_init_inode_cache(void) +{ + logfs_inode_cache = kmem_cache_create("logfs_inode_cache", + sizeof(struct logfs_inode), 0, SLAB_RECLAIM_ACCOUNT, + logfs_init_once); + if (!logfs_inode_cache) + return -ENOMEM; + return 0; +} + +void logfs_destroy_inode_cache(void) +{ + kmem_cache_destroy(logfs_inode_cache); +} diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c new file mode 100644 index 00000000000..7a023dbba9f --- /dev/null +++ b/fs/logfs/journal.c @@ -0,0 +1,879 @@ +/* + * fs/logfs/journal.c - journal handling code + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel + */ +#include "logfs.h" + +static void logfs_calc_free(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + u64 reserve, no_segs = super->s_no_segs; + s64 free; + int i; + + /* superblock segments */ + no_segs -= 2; + super->s_no_journal_segs = 0; + /* journal */ + journal_for_each(i) + if (super->s_journal_seg[i]) { + no_segs--; + super->s_no_journal_segs++; + } + + /* open segments plus one extra per level for GC */ + no_segs -= 2 * super->s_total_levels; + + free = no_segs * (super->s_segsize - LOGFS_SEGMENT_RESERVE); + free -= super->s_used_bytes; + /* just a bit extra */ + free -= super->s_total_levels * 4096; + + /* Bad blocks are 'paid' for with speed reserve - the filesystem + * simply gets slower as bad blocks accumulate. Until the bad blocks + * exceed the speed reserve - then the filesystem gets smaller. 
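+ * Hence the max() below: only the larger of the bad-block reserve and
+ * s_speed_reserve is subtracted from the free byte count.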
+ */ + reserve = super->s_bad_segments + super->s_bad_seg_reserve; + reserve *= super->s_segsize - LOGFS_SEGMENT_RESERVE; + reserve = max(reserve, super->s_speed_reserve); + free -= reserve; + if (free < 0) + free = 0; + + super->s_free_bytes = free; +} + +static void reserve_sb_and_journal(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct btree_head32 *head = &super->s_reserved_segments; + int i, err; + + err = btree_insert32(head, seg_no(sb, super->s_sb_ofs[0]), (void *)1, + GFP_KERNEL); + BUG_ON(err); + + err = btree_insert32(head, seg_no(sb, super->s_sb_ofs[1]), (void *)1, + GFP_KERNEL); + BUG_ON(err); + + journal_for_each(i) { + if (!super->s_journal_seg[i]) + continue; + err = btree_insert32(head, super->s_journal_seg[i], (void *)1, + GFP_KERNEL); + BUG_ON(err); + } +} + +static void read_dynsb(struct super_block *sb, + struct logfs_je_dynsb *dynsb) +{ + struct logfs_super *super = logfs_super(sb); + + super->s_gec = be64_to_cpu(dynsb->ds_gec); + super->s_sweeper = be64_to_cpu(dynsb->ds_sweeper); + super->s_victim_ino = be64_to_cpu(dynsb->ds_victim_ino); + super->s_rename_dir = be64_to_cpu(dynsb->ds_rename_dir); + super->s_rename_pos = be64_to_cpu(dynsb->ds_rename_pos); + super->s_used_bytes = be64_to_cpu(dynsb->ds_used_bytes); + super->s_generation = be32_to_cpu(dynsb->ds_generation); +} + +static void read_anchor(struct super_block *sb, + struct logfs_je_anchor *da) +{ + struct logfs_super *super = logfs_super(sb); + struct inode *inode = super->s_master_inode; + struct logfs_inode *li = logfs_inode(inode); + int i; + + super->s_last_ino = be64_to_cpu(da->da_last_ino); + li->li_flags = 0; + li->li_height = da->da_height; + i_size_write(inode, be64_to_cpu(da->da_size)); + li->li_used_bytes = be64_to_cpu(da->da_used_bytes); + + for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++) + li->li_data[i] = be64_to_cpu(da->da_data[i]); +} + +static void read_erasecount(struct super_block *sb, + struct logfs_je_journal_ec *ec) +{ + struct logfs_super *super = logfs_super(sb); + int i; + + journal_for_each(i) + super->s_journal_ec[i] = be32_to_cpu(ec->ec[i]); +} + +static int read_area(struct super_block *sb, struct logfs_je_area *a) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_area *area = super->s_area[a->gc_level]; + u64 ofs; + u32 writemask = ~(super->s_writesize - 1); + + if (a->gc_level >= LOGFS_NO_AREAS) + return -EIO; + if (a->vim != VIM_DEFAULT) + return -EIO; /* TODO: close area and continue */ + + area->a_used_bytes = be32_to_cpu(a->used_bytes); + area->a_written_bytes = area->a_used_bytes & writemask; + area->a_segno = be32_to_cpu(a->segno); + if (area->a_segno) + area->a_is_open = 1; + + ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes); + if (super->s_writesize > 1) + logfs_buf_recover(area, ofs, a + 1, super->s_writesize); + else + logfs_buf_recover(area, ofs, NULL, 0); + return 0; +} + +static void *unpack(void *from, void *to) +{ + struct logfs_journal_header *jh = from; + void *data = from + sizeof(struct logfs_journal_header); + int err; + size_t inlen, outlen; + + inlen = be16_to_cpu(jh->h_len); + outlen = be16_to_cpu(jh->h_datalen); + + if (jh->h_compr == COMPR_NONE) + memcpy(to, data, inlen); + else { + err = logfs_uncompress(data, to, inlen, outlen); + BUG_ON(err); + } + return to; +} + +static int __read_je_header(struct super_block *sb, u64 ofs, + struct logfs_journal_header *jh) +{ + struct logfs_super *super = logfs_super(sb); + size_t bufsize = max_t(size_t, sb->s_blocksize, super->s_writesize) + + MAX_JOURNAL_HEADER; 
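+ /* bufsize matches the scratch buffers allocated in logfs_init_journal() */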
+ u16 type, len, datalen; + int err; + + /* read header only */ + err = wbuf_read(sb, ofs, sizeof(*jh), jh); + if (err) + return err; + type = be16_to_cpu(jh->h_type); + len = be16_to_cpu(jh->h_len); + datalen = be16_to_cpu(jh->h_datalen); + if (len > sb->s_blocksize) + return -EIO; + if ((type < JE_FIRST) || (type > JE_LAST)) + return -EIO; + if (datalen > bufsize) + return -EIO; + return 0; +} + +static int __read_je_payload(struct super_block *sb, u64 ofs, + struct logfs_journal_header *jh) +{ + u16 len; + int err; + + len = be16_to_cpu(jh->h_len); + err = wbuf_read(sb, ofs + sizeof(*jh), len, jh + 1); + if (err) + return err; + if (jh->h_crc != logfs_crc32(jh, len + sizeof(*jh), 4)) { + /* Old code was confused. It forgot about the header length + * and stopped calculating the crc 16 bytes before the end + * of data - ick! + * FIXME: Remove this hack once the old code is fixed. + */ + if (jh->h_crc == logfs_crc32(jh, len, 4)) + WARN_ON_ONCE(1); + else + return -EIO; + } + return 0; +} + +/* + * jh needs to be large enough to hold the complete entry, not just the header + */ +static int __read_je(struct super_block *sb, u64 ofs, + struct logfs_journal_header *jh) +{ + int err; + + err = __read_je_header(sb, ofs, jh); + if (err) + return err; + return __read_je_payload(sb, ofs, jh); +} + +static int read_je(struct super_block *sb, u64 ofs) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_journal_header *jh = super->s_compressed_je; + void *scratch = super->s_je; + u16 type, datalen; + int err; + + err = __read_je(sb, ofs, jh); + if (err) + return err; + type = be16_to_cpu(jh->h_type); + datalen = be16_to_cpu(jh->h_datalen); + + switch (type) { + case JE_DYNSB: + read_dynsb(sb, unpack(jh, scratch)); + break; + case JE_ANCHOR: + read_anchor(sb, unpack(jh, scratch)); + break; + case JE_ERASECOUNT: + read_erasecount(sb, unpack(jh, scratch)); + break; + case JE_AREA: + read_area(sb, unpack(jh, scratch)); + break; + case JE_OBJ_ALIAS: + err = logfs_load_object_aliases(sb, unpack(jh, scratch), + datalen); + break; + default: + WARN_ON_ONCE(1); + return -EIO; + } + return err; +} + +static int logfs_read_segment(struct super_block *sb, u32 segno) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_journal_header *jh = super->s_compressed_je; + u64 ofs, seg_ofs = dev_ofs(sb, segno, 0); + u32 h_ofs, last_ofs = 0; + u16 len, datalen, last_len; + int i, err; + + /* search for most recent commit */ + for (h_ofs = 0; h_ofs < super->s_segsize; h_ofs += sizeof(*jh)) { + ofs = seg_ofs + h_ofs; + err = __read_je_header(sb, ofs, jh); + if (err) + continue; + if (jh->h_type != cpu_to_be16(JE_COMMIT)) + continue; + err = __read_je_payload(sb, ofs, jh); + if (err) + continue; + len = be16_to_cpu(jh->h_len); + datalen = be16_to_cpu(jh->h_datalen); + if ((datalen > sizeof(super->s_je_array)) || + (datalen % sizeof(__be64))) + continue; + last_ofs = h_ofs; + last_len = datalen; + h_ofs += ALIGN(len, sizeof(*jh)) - sizeof(*jh); + } + /* read commit */ + if (last_ofs == 0) + return -ENOENT; + ofs = seg_ofs + last_ofs; + log_journal("Read commit from %llx\n", ofs); + err = __read_je(sb, ofs, jh); + BUG_ON(err); /* We should have caught it in the scan loop already */ + if (err) + return err; + /* uncompress */ + unpack(jh, super->s_je_array); + super->s_no_je = last_len / sizeof(__be64); + /* iterate over array */ + for (i = 0; i < super->s_no_je; i++) { + err = read_je(sb, be64_to_cpu(super->s_je_array[i])); + if (err) + return err; + } + super->s_journal_area->a_segno = segno; + return 
0; +} + +static u64 read_gec(struct super_block *sb, u32 segno) +{ + struct logfs_segment_header sh; + __be32 crc; + int err; + + if (!segno) + return 0; + err = wbuf_read(sb, dev_ofs(sb, segno, 0), sizeof(sh), &sh); + if (err) + return 0; + crc = logfs_crc32(&sh, sizeof(sh), 4); + if (crc != sh.crc) { + WARN_ON(sh.gec != cpu_to_be64(0xffffffffffffffffull)); + /* Most likely it was just erased */ + return 0; + } + return be64_to_cpu(sh.gec); +} + +static int logfs_read_journal(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + u64 gec[LOGFS_JOURNAL_SEGS], max; + u32 segno; + int i, max_i; + + max = 0; + max_i = -1; + journal_for_each(i) { + segno = super->s_journal_seg[i]; + gec[i] = read_gec(sb, super->s_journal_seg[i]); + if (gec[i] > max) { + max = gec[i]; + max_i = i; + } + } + if (max_i == -1) + return -EIO; + /* FIXME: Try older segments in case of error */ + return logfs_read_segment(sb, super->s_journal_seg[max_i]); +} + +/* + * First search the current segment (outer loop), then pick the next segment + * in the array, skipping any zero entries (inner loop). + */ +static void journal_get_free_segment(struct logfs_area *area) +{ + struct logfs_super *super = logfs_super(area->a_sb); + int i; + + journal_for_each(i) { + if (area->a_segno != super->s_journal_seg[i]) + continue; + + do { + i++; + if (i == LOGFS_JOURNAL_SEGS) + i = 0; + } while (!super->s_journal_seg[i]); + + area->a_segno = super->s_journal_seg[i]; + area->a_erase_count = ++(super->s_journal_ec[i]); + log_journal("Journal now at %x (ec %x)\n", area->a_segno, + area->a_erase_count); + return; + } + BUG(); +} + +static void journal_get_erase_count(struct logfs_area *area) +{ + /* erase count is stored globally and incremented in + * journal_get_free_segment() - nothing to do here */ +} + +static int journal_erase_segment(struct logfs_area *area) +{ + struct super_block *sb = area->a_sb; + struct logfs_segment_header sh; + u64 ofs; + int err; + + err = logfs_erase_segment(sb, area->a_segno); + if (err) + return err; + + sh.pad = 0; + sh.type = SEG_JOURNAL; + sh.level = 0; + sh.segno = cpu_to_be32(area->a_segno); + sh.ec = cpu_to_be32(area->a_erase_count); + sh.gec = cpu_to_be64(logfs_super(sb)->s_gec); + sh.crc = logfs_crc32(&sh, sizeof(sh), 4); + + /* This causes a bug in segment.c. Not yet. 
*/ + //logfs_set_segment_erased(sb, area->a_segno, area->a_erase_count, 0); + + ofs = dev_ofs(sb, area->a_segno, 0); + area->a_used_bytes = ALIGN(sizeof(sh), 16); + logfs_buf_write(area, ofs, &sh, sizeof(sh)); + return 0; +} + +static size_t __logfs_write_header(struct logfs_super *super, + struct logfs_journal_header *jh, size_t len, size_t datalen, + u16 type, u8 compr) +{ + jh->h_len = cpu_to_be16(len); + jh->h_type = cpu_to_be16(type); + jh->h_version = cpu_to_be16(++super->s_last_version); + jh->h_datalen = cpu_to_be16(datalen); + jh->h_compr = compr; + jh->h_pad[0] = 'H'; + jh->h_pad[1] = 'A'; + jh->h_pad[2] = 'T'; + jh->h_crc = logfs_crc32(jh, len + sizeof(*jh), 4); + return ALIGN(len, 16) + sizeof(*jh); +} + +static size_t logfs_write_header(struct logfs_super *super, + struct logfs_journal_header *jh, size_t datalen, u16 type) +{ + size_t len = datalen; + + return __logfs_write_header(super, jh, len, datalen, type, COMPR_NONE); +} + +static inline size_t logfs_journal_erasecount_size(struct logfs_super *super) +{ + return LOGFS_JOURNAL_SEGS * sizeof(__be32); +} + +static void *logfs_write_erasecount(struct super_block *sb, void *_ec, + u16 *type, size_t *len) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_je_journal_ec *ec = _ec; + int i; + + journal_for_each(i) + ec->ec[i] = cpu_to_be32(super->s_journal_ec[i]); + *type = JE_ERASECOUNT; + *len = logfs_journal_erasecount_size(super); + return ec; +} + +static void account_shadow(void *_shadow, unsigned long _sb, u64 ignore, + size_t ignore2) +{ + struct logfs_shadow *shadow = _shadow; + struct super_block *sb = (void *)_sb; + struct logfs_super *super = logfs_super(sb); + + /* consume new space */ + super->s_free_bytes -= shadow->new_len; + super->s_used_bytes += shadow->new_len; + super->s_dirty_used_bytes -= shadow->new_len; + + /* free up old space */ + super->s_free_bytes += shadow->old_len; + super->s_used_bytes -= shadow->old_len; + super->s_dirty_free_bytes -= shadow->old_len; + + logfs_set_segment_used(sb, shadow->old_ofs, -shadow->old_len); + logfs_set_segment_used(sb, shadow->new_ofs, shadow->new_len); + + log_journal("account_shadow(%llx, %llx, %x) %llx->%llx %x->%x\n", + shadow->ino, shadow->bix, shadow->gc_level, + shadow->old_ofs, shadow->new_ofs, + shadow->old_len, shadow->new_len); + mempool_free(shadow, super->s_shadow_pool); +} + +static void account_shadows(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct inode *inode = super->s_master_inode; + struct logfs_inode *li = logfs_inode(inode); + struct shadow_tree *tree = &super->s_shadow_tree; + + btree_grim_visitor64(&tree->new, (unsigned long)sb, account_shadow); + btree_grim_visitor64(&tree->old, (unsigned long)sb, account_shadow); + + if (li->li_block) { + /* + * We never actually use the structure, when attached to the + * master inode. But it is easier to always free it here than + * to have checks in several places elsewhere when allocating + * it. 
+ */ + li->li_block->ops->free_block(sb, li->li_block); + } + BUG_ON((s64)li->li_used_bytes < 0); +} + +static void *__logfs_write_anchor(struct super_block *sb, void *_da, + u16 *type, size_t *len) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_je_anchor *da = _da; + struct inode *inode = super->s_master_inode; + struct logfs_inode *li = logfs_inode(inode); + int i; + + da->da_height = li->li_height; + da->da_last_ino = cpu_to_be64(super->s_last_ino); + da->da_size = cpu_to_be64(i_size_read(inode)); + da->da_used_bytes = cpu_to_be64(li->li_used_bytes); + for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++) + da->da_data[i] = cpu_to_be64(li->li_data[i]); + *type = JE_ANCHOR; + *len = sizeof(*da); + return da; +} + +static void *logfs_write_dynsb(struct super_block *sb, void *_dynsb, + u16 *type, size_t *len) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_je_dynsb *dynsb = _dynsb; + + dynsb->ds_gec = cpu_to_be64(super->s_gec); + dynsb->ds_sweeper = cpu_to_be64(super->s_sweeper); + dynsb->ds_victim_ino = cpu_to_be64(super->s_victim_ino); + dynsb->ds_rename_dir = cpu_to_be64(super->s_rename_dir); + dynsb->ds_rename_pos = cpu_to_be64(super->s_rename_pos); + dynsb->ds_used_bytes = cpu_to_be64(super->s_used_bytes); + dynsb->ds_generation = cpu_to_be32(super->s_generation); + *type = JE_DYNSB; + *len = sizeof(*dynsb); + return dynsb; +} + +static void write_wbuf(struct super_block *sb, struct logfs_area *area, + void *wbuf) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + u64 ofs; + pgoff_t index; + int page_ofs; + struct page *page; + + ofs = dev_ofs(sb, area->a_segno, + area->a_used_bytes & ~(super->s_writesize - 1)); + index = ofs >> PAGE_SHIFT; + page_ofs = ofs & (PAGE_SIZE - 1); + + page = find_lock_page(mapping, index); + BUG_ON(!page); + memcpy(wbuf, page_address(page) + page_ofs, super->s_writesize); + unlock_page(page); +} + +static void *logfs_write_area(struct super_block *sb, void *_a, + u16 *type, size_t *len) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_area *area = super->s_area[super->s_sum_index]; + struct logfs_je_area *a = _a; + + a->vim = VIM_DEFAULT; + a->gc_level = super->s_sum_index; + a->used_bytes = cpu_to_be32(area->a_used_bytes); + a->segno = cpu_to_be32(area->a_segno); + if (super->s_writesize > 1) + write_wbuf(sb, area, a + 1); + + *type = JE_AREA; + *len = sizeof(*a) + super->s_writesize; + return a; +} + +static void *logfs_write_commit(struct super_block *sb, void *h, + u16 *type, size_t *len) +{ + struct logfs_super *super = logfs_super(sb); + + *type = JE_COMMIT; + *len = super->s_no_je * sizeof(__be64); + return super->s_je_array; +} + +static size_t __logfs_write_je(struct super_block *sb, void *buf, u16 type, + size_t len) +{ + struct logfs_super *super = logfs_super(sb); + void *header = super->s_compressed_je; + void *data = header + sizeof(struct logfs_journal_header); + ssize_t compr_len, pad_len; + u8 compr = COMPR_ZLIB; + + if (len == 0) + return logfs_write_header(super, header, 0, type); + + compr_len = logfs_compress(buf, data, len, sb->s_blocksize); + if (compr_len < 0 || type == JE_ANCHOR) { + BUG_ON(len > sb->s_blocksize); + memcpy(data, buf, len); + compr_len = len; + compr = COMPR_NONE; + } + + pad_len = ALIGN(compr_len, 16); + memset(data + compr_len, 0, pad_len - compr_len); + + return __logfs_write_header(super, header, compr_len, len, type, compr); +} + +static s64 logfs_get_free_bytes(struct logfs_area *area, size_t *bytes, 
+ int must_pad) +{ + u32 writesize = logfs_super(area->a_sb)->s_writesize; + s32 ofs; + int ret; + + ret = logfs_open_area(area, *bytes); + if (ret) + return -EAGAIN; + + ofs = area->a_used_bytes; + area->a_used_bytes += *bytes; + + if (must_pad) { + area->a_used_bytes = ALIGN(area->a_used_bytes, writesize); + *bytes = area->a_used_bytes - ofs; + } + + return dev_ofs(area->a_sb, area->a_segno, ofs); +} + +static int logfs_write_je_buf(struct super_block *sb, void *buf, u16 type, + size_t buf_len) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_area *area = super->s_journal_area; + struct logfs_journal_header *jh = super->s_compressed_je; + size_t len; + int must_pad = 0; + s64 ofs; + + len = __logfs_write_je(sb, buf, type, buf_len); + if (jh->h_type == cpu_to_be16(JE_COMMIT)) + must_pad = 1; + + ofs = logfs_get_free_bytes(area, &len, must_pad); + if (ofs < 0) + return ofs; + logfs_buf_write(area, ofs, super->s_compressed_je, len); + super->s_je_array[super->s_no_je++] = cpu_to_be64(ofs); + return 0; +} + +static int logfs_write_je(struct super_block *sb, + void* (*write)(struct super_block *sb, void *scratch, + u16 *type, size_t *len)) +{ + void *buf; + size_t len; + u16 type; + + buf = write(sb, logfs_super(sb)->s_je, &type, &len); + return logfs_write_je_buf(sb, buf, type, len); +} + +int write_alias_journal(struct super_block *sb, u64 ino, u64 bix, + level_t level, int child_no, __be64 val) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_obj_alias *oa = super->s_je; + int err = 0, fill = super->s_je_fill; + + log_aliases("logfs_write_obj_aliases #%x(%llx, %llx, %x, %x) %llx\n", + fill, ino, bix, level, child_no, be64_to_cpu(val)); + oa[fill].ino = cpu_to_be64(ino); + oa[fill].bix = cpu_to_be64(bix); + oa[fill].val = val; + oa[fill].level = (__force u8)level; + oa[fill].child_no = cpu_to_be16(child_no); + fill++; + if (fill >= sb->s_blocksize / sizeof(*oa)) { + err = logfs_write_je_buf(sb, oa, JE_OBJ_ALIAS, sb->s_blocksize); + fill = 0; + } + + super->s_je_fill = fill; + return err; +} + +static int logfs_write_obj_aliases(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + int err; + + log_journal("logfs_write_obj_aliases: %d aliases to write\n", + super->s_no_object_aliases); + super->s_je_fill = 0; + err = logfs_write_obj_aliases_pagecache(sb); + if (err) + return err; + + if (super->s_je_fill) + err = logfs_write_je_buf(sb, super->s_je, JE_OBJ_ALIAS, + super->s_je_fill + * sizeof(struct logfs_obj_alias)); + return err; +} + +/* + * Write all journal entries. The goto logic ensures that all journal entries + * are written whenever a new segment is used. It is ugly and potentially a + * bit wasteful, but robustness is more important. With this we can *always* + * erase all journal segments except the one containing the most recent commit. 
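+ * Any error from the logfs_write_je() calls below simply restarts the commit
+ * from the "again" label.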
+ */ +void logfs_write_anchor(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct logfs_super *super = logfs_super(sb); + struct logfs_area *area = super->s_journal_area; + int i, err; + + BUG_ON(logfs_super(sb)->s_flags & LOGFS_SB_FLAG_SHUTDOWN); + mutex_lock(&super->s_journal_mutex); + + /* Do this first or suffer corruption */ + logfs_sync_segments(sb); + account_shadows(sb); + +again: + super->s_no_je = 0; + for_each_area(i) { + if (!super->s_area[i]->a_is_open) + continue; + super->s_sum_index = i; + err = logfs_write_je(sb, logfs_write_area); + if (err) + goto again; + } + err = logfs_write_obj_aliases(sb); + if (err) + goto again; + err = logfs_write_je(sb, logfs_write_erasecount); + if (err) + goto again; + err = logfs_write_je(sb, __logfs_write_anchor); + if (err) + goto again; + err = logfs_write_je(sb, logfs_write_dynsb); + if (err) + goto again; + /* + * Order is imperative. First we sync all writes, including the + * non-committed journal writes. Then we write the final commit and + * sync the current journal segment. + * There is a theoretical bug here. Syncing the journal segment will + * write a number of journal entries and the final commit. All these + * are written in a single operation. If the device layer writes the + * data back-to-front, the commit will precede the other journal + * entries, leaving a race window. + * Two fixes are possible. Preferred is to fix the device layer to + * ensure writes happen front-to-back. Alternatively we can insert + * another logfs_sync_area() super->s_devops->sync() combo before + * writing the commit. + */ + /* + * On another subject, super->s_devops->sync is usually not necessary. + * Unless called from sys_sync or friends, a barrier would suffice. + */ + super->s_devops->sync(sb); + err = logfs_write_je(sb, logfs_write_commit); + if (err) + goto again; + log_journal("Write commit to %llx\n", + be64_to_cpu(super->s_je_array[super->s_no_je - 1])); + logfs_sync_area(area); + BUG_ON(area->a_used_bytes != area->a_written_bytes); + super->s_devops->sync(sb); + + mutex_unlock(&super->s_journal_mutex); + return; +} + +void do_logfs_journal_wl_pass(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_area *area = super->s_journal_area; + u32 segno, ec; + int i, err; + + log_journal("Journal requires wear-leveling.\n"); + /* Drop old segments */ + journal_for_each(i) + if (super->s_journal_seg[i]) { + logfs_set_segment_unreserved(sb, + super->s_journal_seg[i], + super->s_journal_ec[i]); + super->s_journal_seg[i] = 0; + super->s_journal_ec[i] = 0; + } + /* Get new segments */ + for (i = 0; i < super->s_no_journal_segs; i++) { + segno = get_best_cand(sb, &super->s_reserve_list, &ec); + super->s_journal_seg[i] = segno; + super->s_journal_ec[i] = ec; + logfs_set_segment_reserved(sb, segno); + } + /* Manually move journal_area */ + area->a_segno = super->s_journal_seg[0]; + area->a_is_open = 0; + area->a_used_bytes = 0; + /* Write journal */ + logfs_write_anchor(super->s_master_inode); + /* Write superblocks */ + err = logfs_write_sb(sb); + BUG_ON(err); +} + +static const struct logfs_area_ops journal_area_ops = { + .get_free_segment = journal_get_free_segment, + .get_erase_count = journal_get_erase_count, + .erase_segment = journal_erase_segment, +}; + +int logfs_init_journal(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + size_t bufsize = max_t(size_t, sb->s_blocksize, super->s_writesize) + + MAX_JOURNAL_HEADER; + int ret = -ENOMEM; + + 
mutex_init(&super->s_journal_mutex); + btree_init_mempool32(&super->s_reserved_segments, super->s_btree_pool); + + super->s_je = kzalloc(bufsize, GFP_KERNEL); + if (!super->s_je) + return ret; + + super->s_compressed_je = kzalloc(bufsize, GFP_KERNEL); + if (!super->s_compressed_je) + return ret; + + super->s_master_inode = logfs_new_meta_inode(sb, LOGFS_INO_MASTER); + if (IS_ERR(super->s_master_inode)) + return PTR_ERR(super->s_master_inode); + + ret = logfs_read_journal(sb); + if (ret) + return -EIO; + + reserve_sb_and_journal(sb); + logfs_calc_free(sb); + + super->s_journal_area->a_ops = &journal_area_ops; + return 0; +} + +void logfs_cleanup_journal(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + + btree_grim_visitor32(&super->s_reserved_segments, 0, NULL); + destroy_meta_inode(super->s_master_inode); + super->s_master_inode = NULL; + + kfree(super->s_compressed_je); + kfree(super->s_je); +} diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h new file mode 100644 index 00000000000..e3082abe9e3 --- /dev/null +++ b/fs/logfs/logfs.h @@ -0,0 +1,722 @@ +/* + * fs/logfs/logfs.h + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel + * + * Private header for logfs. + */ +#ifndef FS_LOGFS_LOGFS_H +#define FS_LOGFS_LOGFS_H + +#undef __CHECK_ENDIAN__ +#define __CHECK_ENDIAN__ + +#include +#include +#include +#include +#include +#include +#include +#include "logfs_abi.h" + +#define LOGFS_DEBUG_SUPER (0x0001) +#define LOGFS_DEBUG_SEGMENT (0x0002) +#define LOGFS_DEBUG_JOURNAL (0x0004) +#define LOGFS_DEBUG_DIR (0x0008) +#define LOGFS_DEBUG_FILE (0x0010) +#define LOGFS_DEBUG_INODE (0x0020) +#define LOGFS_DEBUG_READWRITE (0x0040) +#define LOGFS_DEBUG_GC (0x0080) +#define LOGFS_DEBUG_GC_NOISY (0x0100) +#define LOGFS_DEBUG_ALIASES (0x0200) +#define LOGFS_DEBUG_BLOCKMOVE (0x0400) +#define LOGFS_DEBUG_ALL (0xffffffff) + +#define LOGFS_DEBUG (0x01) +/* + * To enable specific log messages, simply define LOGFS_DEBUG to match any + * or all of the above. + */ +#ifndef LOGFS_DEBUG +#define LOGFS_DEBUG (0) +#endif + +#define log_cond(cond, fmt, arg...) do { \ + if (cond) \ + printk(KERN_DEBUG fmt, ##arg); \ +} while (0) + +#define log_super(fmt, arg...) \ + log_cond(LOGFS_DEBUG & LOGFS_DEBUG_SUPER, fmt, ##arg) +#define log_segment(fmt, arg...) \ + log_cond(LOGFS_DEBUG & LOGFS_DEBUG_SEGMENT, fmt, ##arg) +#define log_journal(fmt, arg...) \ + log_cond(LOGFS_DEBUG & LOGFS_DEBUG_JOURNAL, fmt, ##arg) +#define log_dir(fmt, arg...) \ + log_cond(LOGFS_DEBUG & LOGFS_DEBUG_DIR, fmt, ##arg) +#define log_file(fmt, arg...) \ + log_cond(LOGFS_DEBUG & LOGFS_DEBUG_FILE, fmt, ##arg) +#define log_inode(fmt, arg...) \ + log_cond(LOGFS_DEBUG & LOGFS_DEBUG_INODE, fmt, ##arg) +#define log_readwrite(fmt, arg...) \ + log_cond(LOGFS_DEBUG & LOGFS_DEBUG_READWRITE, fmt, ##arg) +#define log_gc(fmt, arg...) \ + log_cond(LOGFS_DEBUG & LOGFS_DEBUG_GC, fmt, ##arg) +#define log_gc_noisy(fmt, arg...) \ + log_cond(LOGFS_DEBUG & LOGFS_DEBUG_GC_NOISY, fmt, ##arg) +#define log_aliases(fmt, arg...) \ + log_cond(LOGFS_DEBUG & LOGFS_DEBUG_ALIASES, fmt, ##arg) +#define log_blockmove(fmt, arg...) \ + log_cond(LOGFS_DEBUG & LOGFS_DEBUG_BLOCKMOVE, fmt, ##arg) + +#define PG_pre_locked PG_owner_priv_1 +#define PagePreLocked(page) test_bit(PG_pre_locked, &(page)->flags) +#define SetPagePreLocked(page) set_bit(PG_pre_locked, &(page)->flags) +#define ClearPagePreLocked(page) clear_bit(PG_pre_locked, &(page)->flags) + +/* FIXME: This should really be somewhere in the 64bit area. 
*/ +#define LOGFS_LINK_MAX (1<<30) + +/* Read-only filesystem */ +#define LOGFS_SB_FLAG_RO 0x0001 +#define LOGFS_SB_FLAG_SEG_ALIAS 0x0002 +#define LOGFS_SB_FLAG_OBJ_ALIAS 0x0004 +#define LOGFS_SB_FLAG_SHUTDOWN 0x0008 + +/* Write Control Flags */ +#define WF_LOCK 0x01 /* take write lock */ +#define WF_WRITE 0x02 /* write block */ +#define WF_DELETE 0x04 /* delete old block */ + +typedef u8 __bitwise level_t; +typedef u8 __bitwise gc_level_t; + +#define LEVEL(level) ((__force level_t)(level)) +#define GC_LEVEL(gc_level) ((__force gc_level_t)(gc_level)) + +#define SUBLEVEL(level) ( (void)((level) == LEVEL(1)), \ + (__force level_t)((__force u8)(level) - 1) ) + +/** + * struct logfs_area - area management information + * + * @a_sb: the superblock this area belongs to + * @a_is_open: 1 if the area is currently open, else 0 + * @a_segno: segment number of area + * @a_written_bytes: number of bytes already written back + * @a_used_bytes: number of used bytes + * @a_ops: area operations (either journal or ostore) + * @a_erase_count: erase count + * @a_level: GC level + */ +struct logfs_area { /* a segment open for writing */ + struct super_block *a_sb; + int a_is_open; + u32 a_segno; + u32 a_written_bytes; + u32 a_used_bytes; + const struct logfs_area_ops *a_ops; + u32 a_erase_count; + gc_level_t a_level; +}; + +/** + * struct logfs_area_ops - area operations + * + * @get_free_segment: fill area->ofs with the offset of a free segment + * @get_erase_count: fill area->erase_count (needs area->ofs) + * @erase_segment: erase and setup segment + */ +struct logfs_area_ops { + void (*get_free_segment)(struct logfs_area *area); + void (*get_erase_count)(struct logfs_area *area); + int (*erase_segment)(struct logfs_area *area); +}; + +/** + * struct logfs_device_ops - device access operations + * + * @readpage: read one page (mm page) + * @writeseg: write one segment. may be a partial segment + * @erase: erase one segment + * @read: read from the device + * @erase: erase part of the device + */ +struct logfs_device_ops { + struct page *(*find_first_sb)(struct super_block *sb, u64 *ofs); + struct page *(*find_last_sb)(struct super_block *sb, u64 *ofs); + int (*write_sb)(struct super_block *sb, struct page *page); + int (*readpage)(void *_sb, struct page *page); + void (*writeseg)(struct super_block *sb, u64 ofs, size_t len); + int (*erase)(struct super_block *sb, loff_t ofs, size_t len); + void (*sync)(struct super_block *sb); + void (*put_device)(struct super_block *sb); +}; + +/** + * struct candidate_list - list of similar candidates + */ +struct candidate_list { + struct rb_root rb_tree; + int count; + int maxcount; + int sort_by_ec; +}; + +/** + * struct gc_candidate - "candidate" segment to be garbage collected next + * + * @list: list (either free of low) + * @segno: segment number + * @valid: number of valid bytes + * @erase_count: erase count of segment + * @dist: distance from tree root + * + * Candidates can be on two lists. The free list contains electees rather + * than candidates - segments that no longer contain any valid data. The + * low list contains candidates to be picked for GC. It should be kept + * short. It is not required to always pick a perfect candidate. In the + * worst case GC will have to move more data than absolutely necessary. 
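+ *
+ * Each candidate_list keeps its members in an rb_tree, ordered either by
+ * valid bytes or by erase count depending on its sort_by_ec flag, with
+ * maxcount bounding the number of entries kept.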
+ */ +struct gc_candidate { + struct rb_node rb_node; + struct candidate_list *list; + u32 segno; + u32 valid; + u32 erase_count; + u8 dist; +}; + +/** + * struct logfs_journal_entry - temporary structure used during journal scan + * + * @used: + * @version: normalized version + * @len: length + * @offset: offset + */ +struct logfs_journal_entry { + int used; + s16 version; + u16 len; + u16 datalen; + u64 offset; +}; + +enum transaction_state { + CREATE_1 = 1, + CREATE_2, + UNLINK_1, + UNLINK_2, + CROSS_RENAME_1, + CROSS_RENAME_2, + TARGET_RENAME_1, + TARGET_RENAME_2, + TARGET_RENAME_3 +}; + +/** + * struct logfs_transaction - essential fields to support atomic dirops + * + * @ino: target inode + * @dir: inode of directory containing dentry + * @pos: pos of dentry in directory + */ +struct logfs_transaction { + enum transaction_state state; + u64 ino; + u64 dir; + u64 pos; +}; + +/** + * struct logfs_shadow - old block in the shadow of a not-yet-committed new one + * @old_ofs: offset of old block on medium + * @new_ofs: offset of new block on medium + * @ino: inode number + * @bix: block index + * @old_len: size of old block, including header + * @new_len: size of new block, including header + * @level: block level + */ +struct logfs_shadow { + u64 old_ofs; + u64 new_ofs; + u64 ino; + u64 bix; + int old_len; + int new_len; + gc_level_t gc_level; +}; + +/** + * struct shadow_tree + * @new: shadows where old_ofs==0, indexed by new_ofs + * @old: shadows where old_ofs!=0, indexed by old_ofs + */ +struct shadow_tree { + struct btree_head64 new; + struct btree_head64 old; +}; + +struct object_alias_item { + struct list_head list; + __be64 val; + int child_no; +}; + +/** + * struct logfs_block - contains any block state + * @type: indirect block or inode + * @full: number of fully populated children + * @partial: number of partially populated children + * + * Most blocks are directly represented by page cache pages. But when a block + * becomes dirty, is part of a transaction, contains aliases or is otherwise + * special, a struct logfs_block is allocated to track the additional state. + * Inodes are very similar to indirect blocks, so they can also get one of + * these structures added when appropriate. 
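+ *
+ * The alias_map bitmap marks which pointer slots of the block currently
+ * have aliases outstanding; the write_alias() operation walks exactly
+ * these bits when flushing aliases to the journal.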
+ */ +#define BLOCK_INDIRECT 1 /* Indirect block */ +#define BLOCK_INODE 2 /* Inode */ +struct logfs_block_ops; +struct logfs_block { + struct list_head alias_list; + struct list_head item_list; + struct super_block *sb; + u64 ino; + u64 bix; + level_t level; + struct page *page; + struct inode *inode; + struct logfs_transaction *ta; + unsigned long alias_map[LOGFS_BLOCK_FACTOR / BITS_PER_LONG]; + struct logfs_block_ops *ops; + int full; + int partial; + int reserved_bytes; +}; + +typedef int write_alias_t(struct super_block *sb, u64 ino, u64 bix, + level_t level, int child_no, __be64 val); +struct logfs_block_ops { + void (*write_block)(struct logfs_block *block); + gc_level_t (*block_level)(struct logfs_block *block); + void (*free_block)(struct super_block *sb, struct logfs_block*block); + int (*write_alias)(struct super_block *sb, + struct logfs_block *block, + write_alias_t *write_one_alias); +}; + +struct logfs_super { + struct mtd_info *s_mtd; /* underlying device */ + struct block_device *s_bdev; /* underlying device */ + const struct logfs_device_ops *s_devops;/* device access */ + struct inode *s_master_inode; /* inode file */ + struct inode *s_segfile_inode; /* segment file */ + struct inode *s_mapping_inode; /* device mapping */ + atomic_t s_pending_writes; /* outstanting bios */ + long s_flags; + mempool_t *s_btree_pool; /* for btree nodes */ + mempool_t *s_alias_pool; /* aliases in segment.c */ + u64 s_feature_incompat; + u64 s_feature_ro_compat; + u64 s_feature_compat; + u64 s_feature_flags; + u64 s_sb_ofs[2]; + /* alias.c fields */ + struct btree_head32 s_segment_alias; /* remapped segments */ + int s_no_object_aliases; + struct list_head s_object_alias; /* remapped objects */ + struct btree_head128 s_object_alias_tree; /* remapped objects */ + struct mutex s_object_alias_mutex; + /* dir.c fields */ + struct mutex s_dirop_mutex; /* for creat/unlink/rename */ + u64 s_victim_ino; /* used for atomic dir-ops */ + u64 s_rename_dir; /* source directory ino */ + u64 s_rename_pos; /* position of source dd */ + /* gc.c fields */ + long s_segsize; /* size of a segment */ + int s_segshift; /* log2 of segment size */ + long s_segmask; /* 1 << s_segshift - 1 */ + long s_no_segs; /* segments on device */ + long s_no_journal_segs; /* segments used for journal */ + long s_no_blocks; /* blocks per segment */ + long s_writesize; /* minimum write size */ + int s_writeshift; /* log2 of write size */ + u64 s_size; /* filesystem size */ + struct logfs_area *s_area[LOGFS_NO_AREAS]; /* open segment array */ + u64 s_gec; /* global erase count */ + u64 s_wl_gec_ostore; /* time of last wl event */ + u64 s_wl_gec_journal; /* time of last wl event */ + u64 s_sweeper; /* current sweeper pos */ + u8 s_ifile_levels; /* max level of ifile */ + u8 s_iblock_levels; /* max level of regular files */ + u8 s_data_levels; /* # of segments to leaf block*/ + u8 s_total_levels; /* sum of above three */ + struct btree_head32 s_cand_tree; /* all candidates */ + struct candidate_list s_free_list; /* 100% free segments */ + struct candidate_list s_reserve_list; /* Bad segment reserve */ + struct candidate_list s_low_list[LOGFS_NO_AREAS];/* good candidates */ + struct candidate_list s_ec_list; /* wear level candidates */ + struct btree_head32 s_reserved_segments;/* sb, journal, bad, etc. 
*/ + /* inode.c fields */ + u64 s_last_ino; /* highest ino used */ + long s_inos_till_wrap; + u32 s_generation; /* i_generation for new files */ + struct list_head s_freeing_list; /* inodes being freed */ + /* journal.c fields */ + struct mutex s_journal_mutex; + void *s_je; /* journal entry to compress */ + void *s_compressed_je; /* block to write to journal */ + u32 s_journal_seg[LOGFS_JOURNAL_SEGS]; /* journal segments */ + u32 s_journal_ec[LOGFS_JOURNAL_SEGS]; /* journal erasecounts */ + u64 s_last_version; + struct logfs_area *s_journal_area; /* open journal segment */ + __be64 s_je_array[64]; + int s_no_je; + + int s_sum_index; /* for the 12 summaries */ + struct shadow_tree s_shadow_tree; + int s_je_fill; /* index of current je */ + /* readwrite.c fields */ + struct mutex s_write_mutex; + int s_lock_count; + mempool_t *s_block_pool; /* struct logfs_block pool */ + mempool_t *s_shadow_pool; /* struct logfs_shadow pool */ + /* + * Space accounting: + * - s_used_bytes specifies space used to store valid data objects. + * - s_dirty_used_bytes is space used to store non-committed data + * objects. Those objects have already been written themselves, + * but they don't become valid until all indirect blocks up to the + * journal have been written as well. + * - s_dirty_free_bytes is space used to store the old copy of a + * replaced object, as long as the replacement is non-committed. + * In other words, it is the amount of space freed when all dirty + * blocks are written back. + * - s_free_bytes is the amount of free space available for any + * purpose. + * - s_root_reserve is the amount of free space available only to + * the root user. Non-privileged users can no longer write once + * this watermark has been reached. + * - s_speed_reserve is space which remains unused to speed up + * garbage collection performance. + * - s_dirty_pages is the space reserved for currently dirty pages. + * It is a pessimistic estimate, so some/most will get freed on + * page writeback. + * + * s_used_bytes + s_free_bytes + s_speed_reserve = total usable size + */ + u64 s_free_bytes; + u64 s_used_bytes; + u64 s_dirty_free_bytes; + u64 s_dirty_used_bytes; + u64 s_root_reserve; + u64 s_speed_reserve; + u64 s_dirty_pages; + /* Bad block handling: + * - s_bad_seg_reserve is a number of segments usually kept + * free. When encountering bad blocks, the affected segment's data + * is _temporarily_ moved to a reserved segment. + * - s_bad_segments is the number of known bad segments. 
+ */ + u32 s_bad_seg_reserve; + u32 s_bad_segments; +}; + +/** + * struct logfs_inode - in-memory inode + * + * @vfs_inode: struct inode + * @li_data: data pointers + * @li_used_bytes: number of used bytes + * @li_freeing_list: used to track inodes currently being freed + * @li_flags: inode flags + * @li_refcount: number of internal (GC-induced) references + */ +struct logfs_inode { + struct inode vfs_inode; + u64 li_data[LOGFS_EMBEDDED_FIELDS]; + u64 li_used_bytes; + struct list_head li_freeing_list; + struct logfs_block *li_block; + u32 li_flags; + u8 li_height; + int li_refcount; +}; + +#define journal_for_each(__i) for (__i = 0; __i < LOGFS_JOURNAL_SEGS; __i++) +#define for_each_area(__i) for (__i = 0; __i < LOGFS_NO_AREAS; __i++) +#define for_each_area_down(__i) for (__i = LOGFS_NO_AREAS - 1; __i >= 0; __i--) + +/* compr.c */ +int logfs_compress(void *in, void *out, size_t inlen, size_t outlen); +int logfs_uncompress(void *in, void *out, size_t inlen, size_t outlen); +int __init logfs_compr_init(void); +void logfs_compr_exit(void); + +/* dev_bdev.c */ +#ifdef CONFIG_BLOCK +int logfs_get_sb_bdev(struct file_system_type *type, int flags, + const char *devname, struct vfsmount *mnt); +#else +static inline int logfs_get_sb_bdev(struct file_system_type *type, int flags, + const char *devname, struct vfsmount *mnt) +{ + return -ENODEV; +} +#endif + +/* dev_mtd.c */ +#ifdef CONFIG_MTD +int logfs_get_sb_mtd(struct file_system_type *type, int flags, + int mtdnr, struct vfsmount *mnt); +#else +static inline int logfs_get_sb_mtd(struct file_system_type *type, int flags, + int mtdnr, struct vfsmount *mnt) +{ + return -ENODEV; +} +#endif + +/* dir.c */ +extern const struct inode_operations logfs_symlink_iops; +extern const struct inode_operations logfs_dir_iops; +extern const struct file_operations logfs_dir_fops; +int logfs_replay_journal(struct super_block *sb); + +/* file.c */ +extern const struct inode_operations logfs_reg_iops; +extern const struct file_operations logfs_reg_fops; +extern const struct address_space_operations logfs_reg_aops; +int logfs_readpage(struct file *file, struct page *page); +int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd, + unsigned long arg); +int logfs_fsync(struct file *file, struct dentry *dentry, int datasync); + +/* gc.c */ +u32 get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec); +void logfs_gc_pass(struct super_block *sb); +int logfs_check_areas(struct super_block *sb); +int logfs_init_gc(struct super_block *sb); +void logfs_cleanup_gc(struct super_block *sb); + +/* inode.c */ +extern const struct super_operations logfs_super_operations; +struct inode *logfs_iget(struct super_block *sb, ino_t ino); +struct inode *logfs_safe_iget(struct super_block *sb, ino_t ino, int *cookie); +void logfs_safe_iput(struct inode *inode, int cookie); +struct inode *logfs_new_inode(struct inode *dir, int mode); +struct inode *logfs_new_meta_inode(struct super_block *sb, u64 ino); +struct inode *logfs_read_meta_inode(struct super_block *sb, u64 ino); +int logfs_init_inode_cache(void); +void logfs_destroy_inode_cache(void); +void destroy_meta_inode(struct inode *inode); +void logfs_set_blocks(struct inode *inode, u64 no); +/* these logically belong into inode.c but actually reside in readwrite.c */ +int logfs_read_inode(struct inode *inode); +int __logfs_write_inode(struct inode *inode, long flags); +void logfs_delete_inode(struct inode *inode); +void logfs_clear_inode(struct inode *inode); + +/* journal.c */ +void 
logfs_write_anchor(struct inode *inode); +int logfs_init_journal(struct super_block *sb); +void logfs_cleanup_journal(struct super_block *sb); +int write_alias_journal(struct super_block *sb, u64 ino, u64 bix, + level_t level, int child_no, __be64 val); +void do_logfs_journal_wl_pass(struct super_block *sb); + +/* readwrite.c */ +pgoff_t logfs_pack_index(u64 bix, level_t level); +void logfs_unpack_index(pgoff_t index, u64 *bix, level_t *level); +int logfs_inode_write(struct inode *inode, const void *buf, size_t count, + loff_t bix, long flags, struct shadow_tree *shadow_tree); +int logfs_readpage_nolock(struct page *page); +int logfs_write_buf(struct inode *inode, struct page *page, long flags); +int logfs_delete(struct inode *inode, pgoff_t index, + struct shadow_tree *shadow_tree); +int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs, + gc_level_t gc_level, long flags); +int logfs_is_valid_block(struct super_block *sb, u64 ofs, u64 ino, u64 bix, + gc_level_t gc_level); +int logfs_truncate(struct inode *inode, u64 size); +u64 logfs_seek_hole(struct inode *inode, u64 bix); +u64 logfs_seek_data(struct inode *inode, u64 bix); +int logfs_open_segfile(struct super_block *sb); +int logfs_init_rw(struct super_block *sb); +void logfs_cleanup_rw(struct super_block *sb); +void logfs_add_transaction(struct inode *inode, struct logfs_transaction *ta); +void logfs_del_transaction(struct inode *inode, struct logfs_transaction *ta); +void logfs_write_block(struct logfs_block *block, long flags); +int logfs_write_obj_aliases_pagecache(struct super_block *sb); +void logfs_get_segment_entry(struct super_block *sb, u32 segno, + struct logfs_segment_entry *se); +void logfs_set_segment_used(struct super_block *sb, u64 ofs, int increment); +void logfs_set_segment_erased(struct super_block *sb, u32 segno, u32 ec, + gc_level_t gc_level); +void logfs_set_segment_reserved(struct super_block *sb, u32 segno); +void logfs_set_segment_unreserved(struct super_block *sb, u32 segno, u32 ec); +struct logfs_block *__alloc_block(struct super_block *sb, + u64 ino, u64 bix, level_t level); +void __free_block(struct super_block *sb, struct logfs_block *block); +void btree_write_block(struct logfs_block *block); +void initialize_block_counters(struct page *page, struct logfs_block *block, + __be64 *array, int page_is_empty); +int logfs_exist_block(struct inode *inode, u64 bix); +int get_page_reserve(struct inode *inode, struct page *page); +extern struct logfs_block_ops indirect_block_ops; + +/* segment.c */ +int logfs_erase_segment(struct super_block *sb, u32 ofs); +int wbuf_read(struct super_block *sb, u64 ofs, size_t len, void *buf); +int logfs_segment_read(struct inode *inode, struct page *page, u64 ofs, u64 bix, + level_t level); +int logfs_segment_write(struct inode *inode, struct page *page, + struct logfs_shadow *shadow); +int logfs_segment_delete(struct inode *inode, struct logfs_shadow *shadow); +int logfs_load_object_aliases(struct super_block *sb, + struct logfs_obj_alias *oa, int count); +void move_page_to_btree(struct page *page); +int logfs_init_mapping(struct super_block *sb); +void logfs_sync_area(struct logfs_area *area); +void logfs_sync_segments(struct super_block *sb); + +/* area handling */ +int logfs_init_areas(struct super_block *sb); +void logfs_cleanup_areas(struct super_block *sb); +int logfs_open_area(struct logfs_area *area, size_t bytes); +void __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len, + int use_filler); + +static inline void logfs_buf_write(struct 
logfs_area *area, u64 ofs, + void *buf, size_t len) +{ + __logfs_buf_write(area, ofs, buf, len, 0); +} + +static inline void logfs_buf_recover(struct logfs_area *area, u64 ofs, + void *buf, size_t len) +{ + __logfs_buf_write(area, ofs, buf, len, 1); +} + +/* super.c */ +struct page *emergency_read_begin(struct address_space *mapping, pgoff_t index); +void emergency_read_end(struct page *page); +void logfs_crash_dump(struct super_block *sb); +void *memchr_inv(const void *s, int c, size_t n); +int logfs_statfs(struct dentry *dentry, struct kstatfs *stats); +int logfs_get_sb_device(struct file_system_type *type, int flags, + struct mtd_info *mtd, struct block_device *bdev, + const struct logfs_device_ops *devops, struct vfsmount *mnt); +int logfs_check_ds(struct logfs_disk_super *ds); +int logfs_write_sb(struct super_block *sb); + +static inline struct logfs_super *logfs_super(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline struct logfs_inode *logfs_inode(struct inode *inode) +{ + return container_of(inode, struct logfs_inode, vfs_inode); +} + +static inline void logfs_set_ro(struct super_block *sb) +{ + logfs_super(sb)->s_flags |= LOGFS_SB_FLAG_RO; +} + +#define LOGFS_BUG(sb) do { \ + struct super_block *__sb = sb; \ + logfs_crash_dump(__sb); \ + logfs_super(__sb)->s_flags |= LOGFS_SB_FLAG_RO; \ + BUG(); \ +} while (0) + +#define LOGFS_BUG_ON(condition, sb) \ + do { if (unlikely(condition)) LOGFS_BUG((sb)); } while (0) + +static inline __be32 logfs_crc32(void *data, size_t len, size_t skip) +{ + return cpu_to_be32(crc32(~0, data+skip, len-skip)); +} + +static inline u8 logfs_type(struct inode *inode) +{ + return (inode->i_mode >> 12) & 15; +} + +static inline pgoff_t logfs_index(struct super_block *sb, u64 pos) +{ + return pos >> sb->s_blocksize_bits; +} + +static inline u64 dev_ofs(struct super_block *sb, u32 segno, u32 ofs) +{ + return ((u64)segno << logfs_super(sb)->s_segshift) + ofs; +} + +static inline u32 seg_no(struct super_block *sb, u64 ofs) +{ + return ofs >> logfs_super(sb)->s_segshift; +} + +static inline u32 seg_ofs(struct super_block *sb, u64 ofs) +{ + return ofs & logfs_super(sb)->s_segmask; +} + +static inline u64 seg_align(struct super_block *sb, u64 ofs) +{ + return ofs & ~logfs_super(sb)->s_segmask; +} + +static inline struct logfs_block *logfs_block(struct page *page) +{ + return (void *)page->private; +} + +static inline level_t shrink_level(gc_level_t __level) +{ + u8 level = (__force u8)__level; + + if (level >= LOGFS_MAX_LEVELS) + level -= LOGFS_MAX_LEVELS; + return (__force level_t)level; +} + +static inline gc_level_t expand_level(u64 ino, level_t __level) +{ + u8 level = (__force u8)__level; + + if (ino == LOGFS_INO_MASTER) { + /* ifile has seperate areas */ + level += LOGFS_MAX_LEVELS; + } + return (__force gc_level_t)level; +} + +static inline int logfs_block_shift(struct super_block *sb, level_t level) +{ + level = shrink_level((__force gc_level_t)level); + return (__force int)level * (sb->s_blocksize_bits - 3); +} + +static inline u64 logfs_block_mask(struct super_block *sb, level_t level) +{ + return ~0ull << logfs_block_shift(sb, level); +} + +static inline struct logfs_area *get_area(struct super_block *sb, + gc_level_t gc_level) +{ + return logfs_super(sb)->s_area[(__force u8)gc_level]; +} + +#endif diff --git a/fs/logfs/logfs_abi.h b/fs/logfs/logfs_abi.h new file mode 100644 index 00000000000..5d3782ddecc --- /dev/null +++ b/fs/logfs/logfs_abi.h @@ -0,0 +1,627 @@ +/* + * fs/logfs/logfs_abi.h + * + * As should be obvious for Linux 
kernel code, license is GPLv2
+ *
+ * Copyright (c) 2005-2008 Joern Engel
+ *
+ * Public header for logfs.
+ */
+#ifndef FS_LOGFS_LOGFS_ABI_H
+#define FS_LOGFS_LOGFS_ABI_H
+
+/* For out-of-kernel compiles */
+#ifndef BUILD_BUG_ON
+#define BUILD_BUG_ON(condition) /**/
+#endif
+
+#define SIZE_CHECK(type, size) \
+static inline void check_##type(void) \
+{ \
+ BUILD_BUG_ON(sizeof(struct type) != (size)); \
+}
+
+/*
+ * Throughout the logfs code, we're constantly dealing with blocks at
+ * various positions or offsets. To remove confusion, we strictly
+ * distinguish between a "position" - the logical position within a
+ * file and an "offset" - the physical location within the device.
+ *
+ * Any usage of the term offset for a logical location or position for
+ * a physical one is a bug and should get fixed.
+ */
+
+/*
+ * Blocks are allocated in one of several segments depending on their
+ * level. The following levels are used:
+ * 0 - regular data block
+ * 1 - i1 indirect blocks
+ * 2 - i2 indirect blocks
+ * 3 - i3 indirect blocks
+ * 4 - i4 indirect blocks
+ * 5 - i5 indirect blocks
+ * 6 - ifile data blocks
+ * 7 - ifile i1 indirect blocks
+ * 8 - ifile i2 indirect blocks
+ * 9 - ifile i3 indirect blocks
+ * 10 - ifile i4 indirect blocks
+ * 11 - ifile i5 indirect blocks
+ * Potential levels to be used in the future:
+ * 12 - gc recycled blocks, long-lived data
+ * 13 - replacement blocks, short-lived data
+ *
+ * Levels 1-11 are necessary for robust gc operations and help separate
+ * short-lived metadata from longer-lived file data. In the future,
+ * file data should get separated into several segments based on simple
+ * heuristics. Old data recycled during gc operation is expected to be
+ * long-lived. New data is of uncertain life expectancy. New data
+ * used to replace older blocks in existing files is expected to be
+ * short-lived.
+ */
+
+
+/* Magic numbers. 64bit for superblock, 32bit for statfs f_type */
+#define LOGFS_MAGIC 0xb21f205ac97e8168ull
+#define LOGFS_MAGIC_U32 0xc97e8168u
+
+/*
+ * Various blocksize related macros. Blocksize is currently fixed at 4KiB.
+ * Sooner or later that should become configurable and the macros replaced
+ * by something superblock-dependent. Pointers in indirect blocks are and
+ * will remain 64bit.
+ *
+ * LOGFS_BLOCKSIZE - self-explaining
+ * LOGFS_BLOCK_FACTOR - number of pointers per indirect block
+ * LOGFS_BLOCK_BITS - log2 of LOGFS_BLOCK_FACTOR, used for shifts
+ */
+#define LOGFS_BLOCKSIZE (4096ull)
+#define LOGFS_BLOCK_FACTOR (LOGFS_BLOCKSIZE / sizeof(u64))
+#define LOGFS_BLOCK_BITS (9)
+
+/*
+ * Number of blocks at various levels of indirection. There are 16 direct
+ * block pointers plus a single indirect pointer.
+ */
+#define I0_BLOCKS (16)
+#define I1_BLOCKS LOGFS_BLOCK_FACTOR
+#define I2_BLOCKS (LOGFS_BLOCK_FACTOR * I1_BLOCKS)
+#define I3_BLOCKS (LOGFS_BLOCK_FACTOR * I2_BLOCKS)
+#define I4_BLOCKS (LOGFS_BLOCK_FACTOR * I3_BLOCKS)
+#define I5_BLOCKS (LOGFS_BLOCK_FACTOR * I4_BLOCKS)
+
+#define INDIRECT_INDEX I0_BLOCKS
+#define LOGFS_EMBEDDED_FIELDS (I0_BLOCKS + 1)
+
+/*
+ * Sizes at which files require another level of indirection. Files smaller
+ * than LOGFS_EMBEDDED_SIZE can be completely stored in the inode itself,
+ * similar to ext2 fast symlinks.
+ *
+ * Data at a position smaller than LOGFS_I0_SIZE is accessed through the
+ * direct pointers, else through the 1x indirect pointer and so forth.
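+ *
+ * With the current 4KiB block size and 512 pointers per indirect block,
+ * these limits work out to 136 bytes embedded in the inode, 64KiB via the
+ * direct pointers, 2MiB with 1x, 1GiB with 2x, 512GiB with 3x, 256TiB
+ * with 4x and 128PiB with 5x indirection.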
+ */ +#define LOGFS_EMBEDDED_SIZE (LOGFS_EMBEDDED_FIELDS * sizeof(u64)) +#define LOGFS_I0_SIZE (I0_BLOCKS * LOGFS_BLOCKSIZE) +#define LOGFS_I1_SIZE (I1_BLOCKS * LOGFS_BLOCKSIZE) +#define LOGFS_I2_SIZE (I2_BLOCKS * LOGFS_BLOCKSIZE) +#define LOGFS_I3_SIZE (I3_BLOCKS * LOGFS_BLOCKSIZE) +#define LOGFS_I4_SIZE (I4_BLOCKS * LOGFS_BLOCKSIZE) +#define LOGFS_I5_SIZE (I5_BLOCKS * LOGFS_BLOCKSIZE) + +/* + * Each indirect block pointer must have this flag set, if all block pointers + * behind it are set, i.e. there is no hole hidden in the shadow of this + * indirect block pointer. + */ +#define LOGFS_FULLY_POPULATED (1ULL << 63) +#define pure_ofs(ofs) (ofs & ~LOGFS_FULLY_POPULATED) + +/* + * LogFS needs to seperate data into levels. Each level is defined as the + * maximal possible distance from the master inode (inode of the inode file). + * Data blocks reside on level 0, 1x indirect block on level 1, etc. + * Inodes reside on level 6, indirect blocks for the inode file on levels 7-11. + * This effort is necessary to guarantee garbage collection to always make + * progress. + * + * LOGFS_MAX_INDIRECT is the maximal indirection through indirect blocks, + * LOGFS_MAX_LEVELS is one more for the actual data level of a file. It is + * the maximal number of levels for one file. + * LOGFS_NO_AREAS is twice that, as the inode file and regular files are + * effectively stacked on top of each other. + */ +#define LOGFS_MAX_INDIRECT (5) +#define LOGFS_MAX_LEVELS (LOGFS_MAX_INDIRECT + 1) +#define LOGFS_NO_AREAS (2 * LOGFS_MAX_LEVELS) + +/* Maximum size of filenames */ +#define LOGFS_MAX_NAMELEN (255) + +/* Number of segments in the primary journal. */ +#define LOGFS_JOURNAL_SEGS (16) + +/* Maximum number of free/erased/etc. segments in journal entries */ +#define MAX_CACHED_SEGS (64) + + +/* + * LOGFS_OBJECT_HEADERSIZE is the size of a single header in the object store, + * LOGFS_MAX_OBJECTSIZE the size of the largest possible object, including + * its header, + * LOGFS_SEGMENT_RESERVE is the amount of space reserved for each segment for + * its segment header and the padded space at the end when no further objects + * fit. 
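+ *
+ * With the constants below this amounts to 0x18 + (0x1c + 4096) - 1,
+ * i.e. 4147 bytes reserved per segment.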
+ */
+#define LOGFS_OBJECT_HEADERSIZE (0x1c)
+#define LOGFS_SEGMENT_HEADERSIZE (0x18)
+#define LOGFS_MAX_OBJECTSIZE (LOGFS_OBJECT_HEADERSIZE + LOGFS_BLOCKSIZE)
+#define LOGFS_SEGMENT_RESERVE \
+ (LOGFS_SEGMENT_HEADERSIZE + LOGFS_MAX_OBJECTSIZE - 1)
+
+/*
+ * Segment types:
+ * SEG_SUPER - segment used for the superblock
+ * SEG_JOURNAL - segment used for the journal
+ * SEG_OSTORE - segment used for the object store
+ */
+enum {
+ SEG_SUPER = 0x01,
+ SEG_JOURNAL = 0x02,
+ SEG_OSTORE = 0x03,
+};
+
+/**
+ * struct logfs_segment_header - per-segment header in the ostore
+ *
+ * @crc: crc32 of header (there is no data)
+ * @pad: unused, must be 0
+ * @type: segment type, see above
+ * @level: GC level for all objects in this segment
+ * @segno: segment number
+ * @ec: erase count for this segment
+ * @gec: global erase count at time of writing
+ */
+struct logfs_segment_header {
+ __be32 crc;
+ __be16 pad;
+ __u8 type;
+ __u8 level;
+ __be32 segno;
+ __be32 ec;
+ __be64 gec;
+};
+
+SIZE_CHECK(logfs_segment_header, LOGFS_SEGMENT_HEADERSIZE);
+
+/**
+ * struct logfs_disk_super - on-medium superblock
+ *
+ * @ds_magic: magic number, must equal LOGFS_MAGIC
+ * @ds_crc: crc32 of structure starting with the next field
+ * @ds_ifile_levels: maximum number of levels for ifile
+ * @ds_iblock_levels: maximum number of levels for regular files
+ * @ds_data_levels: number of separate levels for data
+ * @pad0: reserved, must be 0
+ * @ds_feature_incompat: incompatible filesystem features
+ * @ds_feature_ro_compat: read-only compatible filesystem features
+ * @ds_feature_compat: compatible filesystem features
+ * @ds_flags: flags
+ * @ds_segment_shift: log2 of segment size
+ * @ds_block_shift: log2 of block size
+ * @ds_write_shift: log2 of write size
+ * @pad1: reserved, must be 0
+ * @ds_journal_seg: segments used by primary journal
+ * @ds_root_reserve: bytes reserved for the superuser
+ * @ds_speed_reserve: bytes reserved to speed up GC
+ * @ds_bad_seg_reserve: number of segments reserved to handle bad blocks
+ * @pad2: reserved, must be 0
+ * @pad3: reserved, must be 0
+ *
+ * Contains only read-only fields. Read-write fields like the amount of used
+ * space are tracked in the dynamic superblock, which is stored in the journal.
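+ *
+ * Two copies of the superblock are kept on the medium, at the offsets
+ * recorded in ds_super_ofs[] below; they are located at mount time via
+ * the find_first_sb/find_last_sb device operations.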
+ */ +struct logfs_disk_super { + struct logfs_segment_header ds_sh; + __be64 ds_magic; + + __be32 ds_crc; + __u8 ds_ifile_levels; + __u8 ds_iblock_levels; + __u8 ds_data_levels; + __u8 ds_segment_shift; + __u8 ds_block_shift; + __u8 ds_write_shift; + __u8 pad0[6]; + + __be64 ds_filesystem_size; + __be32 ds_segment_size; + __be32 ds_bad_seg_reserve; + + __be64 ds_feature_incompat; + __be64 ds_feature_ro_compat; + + __be64 ds_feature_compat; + __be64 ds_feature_flags; + + __be64 ds_root_reserve; + __be64 ds_speed_reserve; + + __be32 ds_journal_seg[LOGFS_JOURNAL_SEGS]; + + __be64 ds_super_ofs[2]; + __be64 pad3[8]; +}; + +SIZE_CHECK(logfs_disk_super, 256); + +/* + * Object types: + * OBJ_BLOCK - Data or indirect block + * OBJ_INODE - Inode + * OBJ_DENTRY - Dentry + */ +enum { + OBJ_BLOCK = 0x04, + OBJ_INODE = 0x05, + OBJ_DENTRY = 0x06, +}; + +/** + * struct logfs_object_header - per-object header in the ostore + * + * @crc: crc32 of header, excluding data_crc + * @len: length of data + * @type: object type, see above + * @compr: compression type + * @ino: inode number + * @bix: block index + * @data_crc: crc32 of payload + */ +struct logfs_object_header { + __be32 crc; + __be16 len; + __u8 type; + __u8 compr; + __be64 ino; + __be64 bix; + __be32 data_crc; +} __attribute__((packed)); + +SIZE_CHECK(logfs_object_header, LOGFS_OBJECT_HEADERSIZE); + +/* + * Reserved inode numbers: + * LOGFS_INO_MASTER - master inode (for inode file) + * LOGFS_INO_ROOT - root directory + * LOGFS_INO_SEGFILE - per-segment used bytes and erase count + */ +enum { + LOGFS_INO_MAPPING = 0x00, + LOGFS_INO_MASTER = 0x01, + LOGFS_INO_ROOT = 0x02, + LOGFS_INO_SEGFILE = 0x03, + LOGFS_RESERVED_INOS = 0x10, +}; + +/* + * Inode flags. High bits should never be written to the medium. They are + * reserved for in-memory usage. + * Low bits should either remain in sync with the corresponding FS_*_FL or + * reuse slots that obviously don't make sense for logfs. 
+ * + * LOGFS_IF_DIRTY Inode must be written back + * LOGFS_IF_ZOMBIE Inode has been deleted + * LOGFS_IF_STILLBORN -ENOSPC happened when creating inode + */ +#define LOGFS_IF_COMPRESSED 0x00000004 /* == FS_COMPR_FL */ +#define LOGFS_IF_DIRTY 0x20000000 +#define LOGFS_IF_ZOMBIE 0x40000000 +#define LOGFS_IF_STILLBORN 0x80000000 + +/* Flags available to chattr */ +#define LOGFS_FL_USER_VISIBLE (LOGFS_IF_COMPRESSED) +#define LOGFS_FL_USER_MODIFIABLE (LOGFS_IF_COMPRESSED) +/* Flags inherited from parent directory on file/directory creation */ +#define LOGFS_FL_INHERITED (LOGFS_IF_COMPRESSED) + +/** + * struct logfs_disk_inode - on-medium inode + * + * @di_mode: file mode + * @di_pad: reserved, must be 0 + * @di_flags: inode flags, see above + * @di_uid: user id + * @di_gid: group id + * @di_ctime: change time + * @di_mtime: modify time + * @di_refcount: reference count (aka nlink or link count) + * @di_generation: inode generation, for nfs + * @di_used_bytes: number of bytes used + * @di_size: file size + * @di_data: data pointers + */ +struct logfs_disk_inode { + __be16 di_mode; + __u8 di_height; + __u8 di_pad; + __be32 di_flags; + __be32 di_uid; + __be32 di_gid; + + __be64 di_ctime; + __be64 di_mtime; + + __be64 di_atime; + __be32 di_refcount; + __be32 di_generation; + + __be64 di_used_bytes; + __be64 di_size; + + __be64 di_data[LOGFS_EMBEDDED_FIELDS]; +}; + +SIZE_CHECK(logfs_disk_inode, 200); + +#define INODE_POINTER_OFS \ + (offsetof(struct logfs_disk_inode, di_data) / sizeof(__be64)) +#define INODE_USED_OFS \ + (offsetof(struct logfs_disk_inode, di_used_bytes) / sizeof(__be64)) +#define INODE_SIZE_OFS \ + (offsetof(struct logfs_disk_inode, di_size) / sizeof(__be64)) +#define INODE_HEIGHT_OFS (0) + +/** + * struct logfs_disk_dentry - on-medium dentry structure + * + * @ino: inode number + * @namelen: length of file name + * @type: file type, identical to bits 12..15 of mode + * @name: file name + */ +/* FIXME: add 6 bytes of padding to remove the __packed */ +struct logfs_disk_dentry { + __be64 ino; + __be16 namelen; + __u8 type; + __u8 name[LOGFS_MAX_NAMELEN]; +} __attribute__((packed)); + +SIZE_CHECK(logfs_disk_dentry, 266); + +#define RESERVED 0xffffffff +#define BADSEG 0xffffffff +/** + * struct logfs_segment_entry - segment file entry + * + * @ec_level: erase count and level + * @valid: number of valid bytes + * + * Segment file contains one entry for every segment. ec_level contains the + * erasecount in the upper 28 bits and the level in the lower 4 bits. An + * ec_level of BADSEG (-1) identifies bad segments. valid contains the number + * of valid bytes or RESERVED (-1 again) if the segment is used for either the + * superblock or the journal, or when the segment is bad. + */ +struct logfs_segment_entry { + __be32 ec_level; + __be32 valid; +}; + +SIZE_CHECK(logfs_segment_entry, 8); + +/** + * struct logfs_journal_header - header for journal entries (JEs) + * + * @h_crc: crc32 of journal entry + * @h_len: length of compressed journal entry, + * not including header + * @h_datalen: length of uncompressed data + * @h_type: JE type + * @h_version: unnormalized version of journal entry + * @h_compr: compression type + * @h_pad: reserved + */ +struct logfs_journal_header { + __be32 h_crc; + __be16 h_len; + __be16 h_datalen; + __be16 h_type; + __be16 h_version; + __u8 h_compr; + __u8 h_pad[3]; +}; + +SIZE_CHECK(logfs_journal_header, 16); + +/* + * Life expectency of data. 
+ * VIM_DEFAULT - default vim + * VIM_SEGFILE - for segment file only - very short-living + * VIM_GC - GC'd data - likely long-living + */ +enum logfs_vim { + VIM_DEFAULT = 0, + VIM_SEGFILE = 1, +}; + +/** + * struct logfs_je_area - wbuf header + * + * @segno: segment number of area + * @used_bytes: number of bytes already used + * @gc_level: GC level + * @vim: life expectancy of data + * + * "Areas" are segments currently being used for writing. There is at least + * one area per GC level. Several may be used to seperate long-living from + * short-living data. If an area with unknown vim is encountered, it can + * simply be closed. + * The write buffer immediately follow this header. + */ +struct logfs_je_area { + __be32 segno; + __be32 used_bytes; + __u8 gc_level; + __u8 vim; +} __attribute__((packed)); + +SIZE_CHECK(logfs_je_area, 10); + +#define MAX_JOURNAL_HEADER \ + (sizeof(struct logfs_journal_header) + sizeof(struct logfs_je_area)) + +/** + * struct logfs_je_dynsb - dynamic superblock + * + * @ds_gec: global erase count + * @ds_sweeper: current position of GC "sweeper" + * @ds_rename_dir: source directory ino (see dir.c documentation) + * @ds_rename_pos: position of source dd (see dir.c documentation) + * @ds_victim_ino: victims of incomplete dir operation (see dir.c) + * @ds_victim_ino: parent inode of victim (see dir.c) + * @ds_used_bytes: number of used bytes + */ +struct logfs_je_dynsb { + __be64 ds_gec; + __be64 ds_sweeper; + + __be64 ds_rename_dir; + __be64 ds_rename_pos; + + __be64 ds_victim_ino; + __be64 ds_victim_parent; /* XXX */ + + __be64 ds_used_bytes; + __be32 ds_generation; + __be32 pad; +}; + +SIZE_CHECK(logfs_je_dynsb, 64); + +/** + * struct logfs_je_anchor - anchor of filesystem tree, aka master inode + * + * @da_size: size of inode file + * @da_last_ino: last created inode + * @da_used_bytes: number of bytes used + * @da_data: data pointers + */ +struct logfs_je_anchor { + __be64 da_size; + __be64 da_last_ino; + + __be64 da_used_bytes; + u8 da_height; + u8 pad[7]; + + __be64 da_data[LOGFS_EMBEDDED_FIELDS]; +}; + +SIZE_CHECK(logfs_je_anchor, 168); + +/** + * struct logfs_je_spillout - spillout entry (from 1st to 2nd journal) + * + * @so_segment: segments used for 2nd journal + * + * Length of the array is given by h_len field in the header. + */ +struct logfs_je_spillout { + __be64 so_segment[0]; +}; + +SIZE_CHECK(logfs_je_spillout, 0); + +/** + * struct logfs_je_journal_ec - erase counts for all journal segments + * + * @ec: erase count + * + * Length of the array is given by h_len field in the header. + */ +struct logfs_je_journal_ec { + __be32 ec[0]; +}; + +SIZE_CHECK(logfs_je_journal_ec, 0); + +/** + * struct logfs_je_free_segments - list of free segmetns with erase count + */ +struct logfs_je_free_segments { + __be32 segno; + __be32 ec; +}; + +SIZE_CHECK(logfs_je_free_segments, 8); + +/** + * struct logfs_seg_alias - list of segment aliases + */ +struct logfs_seg_alias { + __be32 old_segno; + __be32 new_segno; +}; + +SIZE_CHECK(logfs_seg_alias, 8); + +/** + * struct logfs_obj_alias - list of object aliases + */ +struct logfs_obj_alias { + __be64 ino; + __be64 bix; + __be64 val; + u8 level; + u8 pad[5]; + __be16 child_no; +}; + +SIZE_CHECK(logfs_obj_alias, 32); + +/** + * Compression types. + * + * COMPR_NONE - uncompressed + * COMPR_ZLIB - compressed with zlib + */ +enum { + COMPR_NONE = 0, + COMPR_ZLIB = 1, +}; + +/* + * Journal entries come in groups of 16. 
First group contains unique + * entries, next groups contain one entry per level + * + * JE_FIRST - smallest possible journal entry number + * + * JEG_BASE - base group, containing unique entries + * JE_COMMIT - commit entry, validates all previous entries + * JE_DYNSB - dynamic superblock, anything that ought to be in the + * superblock but cannot because it is read-write data + * JE_ANCHOR - anchor aka master inode aka inode file's inode + * JE_ERASECOUNT erasecounts for all journal segments + * JE_SPILLOUT - unused + * JE_SEG_ALIAS - aliases segments + * JE_AREA - area description + * + * JE_LAST - largest possible journal entry number + */ +enum { + JE_FIRST = 0x01, + + JEG_BASE = 0x00, + JE_COMMIT = 0x02, + JE_DYNSB = 0x03, + JE_ANCHOR = 0x04, + JE_ERASECOUNT = 0x05, + JE_SPILLOUT = 0x06, + JE_OBJ_ALIAS = 0x0d, + JE_AREA = 0x0e, + + JE_LAST = 0x0e, +}; + +#endif diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c new file mode 100644 index 00000000000..1dbe6e8ccce --- /dev/null +++ b/fs/logfs/readwrite.c @@ -0,0 +1,2246 @@ +/* + * fs/logfs/readwrite.c + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel + * + * + * Actually contains five sets of very similar functions: + * read read blocks from a file + * seek_hole find next hole + * seek_data find next data block + * valid check whether a block still belongs to a file + * write write blocks to a file + * delete delete a block (for directories and ifile) + * rewrite move existing blocks of a file to a new location (gc helper) + * truncate truncate a file + */ +#include "logfs.h" +#include + +static u64 adjust_bix(u64 bix, level_t level) +{ + switch (level) { + case 0: + return bix; + case LEVEL(1): + return max_t(u64, bix, I0_BLOCKS); + case LEVEL(2): + return max_t(u64, bix, I1_BLOCKS); + case LEVEL(3): + return max_t(u64, bix, I2_BLOCKS); + case LEVEL(4): + return max_t(u64, bix, I3_BLOCKS); + case LEVEL(5): + return max_t(u64, bix, I4_BLOCKS); + default: + WARN_ON(1); + return bix; + } +} + +static inline u64 maxbix(u8 height) +{ + return 1ULL << (LOGFS_BLOCK_BITS * height); +} + +/** + * The inode address space is cut in two halves. Lower half belongs to data + * pages, upper half to indirect blocks. If the high bit (INDIRECT_BIT) is + * set, the actual block index (bix) and level can be derived from the page + * index. + * + * The lowest three bits of the block index are set to 0 after packing and + * unpacking. Since the lowest n bits (9 for 4KiB blocksize) are ignored + * anyway this is harmless. 
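+ *
+ * For example, packing bix 0x400 at level 1 yields the page index
+ * INDIRECT_BIT | (1 << LEVEL_SHIFT) | (0x400 >> LOGFS_BLOCK_BITS);
+ * unpacking shifts the low bits back up and recovers bix 0x400, level 1.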
+ */ +#define ARCH_SHIFT (BITS_PER_LONG - 32) +#define INDIRECT_BIT (0x80000000UL << ARCH_SHIFT) +#define LEVEL_SHIFT (28 + ARCH_SHIFT) +static inline pgoff_t first_indirect_block(void) +{ + return INDIRECT_BIT | (1ULL << LEVEL_SHIFT); +} + +pgoff_t logfs_pack_index(u64 bix, level_t level) +{ + pgoff_t index; + + BUG_ON(bix >= INDIRECT_BIT); + if (level == 0) + return bix; + + index = INDIRECT_BIT; + index |= (__force long)level << LEVEL_SHIFT; + index |= bix >> ((__force u8)level * LOGFS_BLOCK_BITS); + return index; +} + +void logfs_unpack_index(pgoff_t index, u64 *bix, level_t *level) +{ + u8 __level; + + if (!(index & INDIRECT_BIT)) { + *bix = index; + *level = 0; + return; + } + + __level = (index & ~INDIRECT_BIT) >> LEVEL_SHIFT; + *level = LEVEL(__level); + *bix = (index << (__level * LOGFS_BLOCK_BITS)) & ~INDIRECT_BIT; + *bix = adjust_bix(*bix, *level); + return; +} +#undef ARCH_SHIFT +#undef INDIRECT_BIT +#undef LEVEL_SHIFT + +/* + * Time is stored as nanoseconds since the epoch. + */ +static struct timespec be64_to_timespec(__be64 betime) +{ + return ns_to_timespec(be64_to_cpu(betime)); +} + +static __be64 timespec_to_be64(struct timespec tsp) +{ + return cpu_to_be64((u64)tsp.tv_sec * NSEC_PER_SEC + tsp.tv_nsec); +} + +static void logfs_disk_to_inode(struct logfs_disk_inode *di, struct inode*inode) +{ + struct logfs_inode *li = logfs_inode(inode); + int i; + + inode->i_mode = be16_to_cpu(di->di_mode); + li->li_height = di->di_height; + li->li_flags = be32_to_cpu(di->di_flags); + inode->i_uid = be32_to_cpu(di->di_uid); + inode->i_gid = be32_to_cpu(di->di_gid); + inode->i_size = be64_to_cpu(di->di_size); + logfs_set_blocks(inode, be64_to_cpu(di->di_used_bytes)); + inode->i_atime = be64_to_timespec(di->di_atime); + inode->i_ctime = be64_to_timespec(di->di_ctime); + inode->i_mtime = be64_to_timespec(di->di_mtime); + inode->i_nlink = be32_to_cpu(di->di_refcount); + inode->i_generation = be32_to_cpu(di->di_generation); + + switch (inode->i_mode & S_IFMT) { + case S_IFSOCK: /* fall through */ + case S_IFBLK: /* fall through */ + case S_IFCHR: /* fall through */ + case S_IFIFO: + inode->i_rdev = be64_to_cpu(di->di_data[0]); + break; + case S_IFDIR: /* fall through */ + case S_IFREG: /* fall through */ + case S_IFLNK: + for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++) + li->li_data[i] = be64_to_cpu(di->di_data[i]); + break; + default: + BUG(); + } +} + +static void logfs_inode_to_disk(struct inode *inode, struct logfs_disk_inode*di) +{ + struct logfs_inode *li = logfs_inode(inode); + int i; + + di->di_mode = cpu_to_be16(inode->i_mode); + di->di_height = li->li_height; + di->di_pad = 0; + di->di_flags = cpu_to_be32(li->li_flags); + di->di_uid = cpu_to_be32(inode->i_uid); + di->di_gid = cpu_to_be32(inode->i_gid); + di->di_size = cpu_to_be64(i_size_read(inode)); + di->di_used_bytes = cpu_to_be64(li->li_used_bytes); + di->di_atime = timespec_to_be64(inode->i_atime); + di->di_ctime = timespec_to_be64(inode->i_ctime); + di->di_mtime = timespec_to_be64(inode->i_mtime); + di->di_refcount = cpu_to_be32(inode->i_nlink); + di->di_generation = cpu_to_be32(inode->i_generation); + + switch (inode->i_mode & S_IFMT) { + case S_IFSOCK: /* fall through */ + case S_IFBLK: /* fall through */ + case S_IFCHR: /* fall through */ + case S_IFIFO: + di->di_data[0] = cpu_to_be64(inode->i_rdev); + break; + case S_IFDIR: /* fall through */ + case S_IFREG: /* fall through */ + case S_IFLNK: + for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++) + di->di_data[i] = cpu_to_be64(li->li_data[i]); + break; + default: + BUG(); + } +} + 
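+/*
+ * i_blocks is reported in 512-byte units.  The value is computed from
+ * li_used_bytes, but saturates at ULONG_MAX when the byte count is too
+ * large to be represented that way.
+ */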
+static void __logfs_set_blocks(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct logfs_inode *li = logfs_inode(inode); + + inode->i_blocks = ULONG_MAX; + if (li->li_used_bytes >> sb->s_blocksize_bits < ULONG_MAX) + inode->i_blocks = ALIGN(li->li_used_bytes, 512) >> 9; +} + +void logfs_set_blocks(struct inode *inode, u64 bytes) +{ + struct logfs_inode *li = logfs_inode(inode); + + li->li_used_bytes = bytes; + __logfs_set_blocks(inode); +} + +static void prelock_page(struct super_block *sb, struct page *page, int lock) +{ + struct logfs_super *super = logfs_super(sb); + + BUG_ON(!PageLocked(page)); + if (lock) { + BUG_ON(PagePreLocked(page)); + SetPagePreLocked(page); + } else { + /* We are in GC path. */ + if (PagePreLocked(page)) + super->s_lock_count++; + else + SetPagePreLocked(page); + } +} + +static void preunlock_page(struct super_block *sb, struct page *page, int lock) +{ + struct logfs_super *super = logfs_super(sb); + + BUG_ON(!PageLocked(page)); + if (lock) + ClearPagePreLocked(page); + else { + /* We are in GC path. */ + BUG_ON(!PagePreLocked(page)); + if (super->s_lock_count) + super->s_lock_count--; + else + ClearPagePreLocked(page); + } +} + +/* + * Logfs is prone to an AB-BA deadlock where one task tries to acquire + * s_write_mutex with a locked page and GC tries to get that page while holding + * s_write_mutex. + * To solve this issue logfs will ignore the page lock iff the page in question + * is waiting for s_write_mutex. We annotate this fact by setting PG_pre_locked + * in addition to PG_locked. + */ +static void logfs_get_wblocks(struct super_block *sb, struct page *page, + int lock) +{ + struct logfs_super *super = logfs_super(sb); + + if (page) + prelock_page(sb, page, lock); + + if (lock) { + mutex_lock(&super->s_write_mutex); + logfs_gc_pass(sb); + /* FIXME: We also have to check for shadowed space + * and mempool fill grade */ + } +} + +static void logfs_put_wblocks(struct super_block *sb, struct page *page, + int lock) +{ + struct logfs_super *super = logfs_super(sb); + + if (page) + preunlock_page(sb, page, lock); + /* Order matters - we must clear PG_pre_locked before releasing + * s_write_mutex or we could race against another task. */ + if (lock) + mutex_unlock(&super->s_write_mutex); +} + +static struct page *logfs_get_read_page(struct inode *inode, u64 bix, + level_t level) +{ + return find_or_create_page(inode->i_mapping, + logfs_pack_index(bix, level), GFP_NOFS); +} + +static void logfs_put_read_page(struct page *page) +{ + unlock_page(page); + page_cache_release(page); +} + +static void logfs_lock_write_page(struct page *page) +{ + int loop = 0; + + while (unlikely(!trylock_page(page))) { + if (loop++ > 0x1000) { + /* Has been observed once so far... */ + printk(KERN_ERR "stack at %p\n", &loop); + BUG(); + } + if (PagePreLocked(page)) { + /* Holder of page lock is waiting for us, it + * is safe to use this page. */ + break; + } + /* Some other process has this page locked and has + * nothing to do with us. Wait for it to finish. 
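+ *
+ * Note that this loop yields via schedule() rather than sleeping on the
+ * page's waitqueue, and it gives up (BUG) after 0x1000 iterations.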
+ */ + schedule(); + } + BUG_ON(!PageLocked(page)); +} + +static struct page *logfs_get_write_page(struct inode *inode, u64 bix, + level_t level) +{ + struct address_space *mapping = inode->i_mapping; + pgoff_t index = logfs_pack_index(bix, level); + struct page *page; + int err; + +repeat: + page = find_get_page(mapping, index); + if (!page) { + page = __page_cache_alloc(GFP_NOFS); + if (!page) + return NULL; + err = add_to_page_cache_lru(page, mapping, index, GFP_NOFS); + if (unlikely(err)) { + page_cache_release(page); + if (err == -EEXIST) + goto repeat; + return NULL; + } + } else logfs_lock_write_page(page); + BUG_ON(!PageLocked(page)); + return page; +} + +static void logfs_unlock_write_page(struct page *page) +{ + if (!PagePreLocked(page)) + unlock_page(page); +} + +static void logfs_put_write_page(struct page *page) +{ + logfs_unlock_write_page(page); + page_cache_release(page); +} + +static struct page *logfs_get_page(struct inode *inode, u64 bix, level_t level, + int rw) +{ + if (rw == READ) + return logfs_get_read_page(inode, bix, level); + else + return logfs_get_write_page(inode, bix, level); +} + +static void logfs_put_page(struct page *page, int rw) +{ + if (rw == READ) + logfs_put_read_page(page); + else + logfs_put_write_page(page); +} + +static unsigned long __get_bits(u64 val, int skip, int no) +{ + u64 ret = val; + + ret >>= skip * no; + ret <<= 64 - no; + ret >>= 64 - no; + return ret; +} + +static unsigned long get_bits(u64 val, level_t skip) +{ + return __get_bits(val, (__force int)skip, LOGFS_BLOCK_BITS); +} + +static inline void init_shadow_tree(struct super_block *sb, + struct shadow_tree *tree) +{ + struct logfs_super *super = logfs_super(sb); + + btree_init_mempool64(&tree->new, super->s_btree_pool); + btree_init_mempool64(&tree->old, super->s_btree_pool); +} + +static void indirect_write_block(struct logfs_block *block) +{ + struct page *page; + struct inode *inode; + int ret; + + page = block->page; + inode = page->mapping->host; + logfs_lock_write_page(page); + ret = logfs_write_buf(inode, page, 0); + logfs_unlock_write_page(page); + /* + * This needs some rework. Unless you want your filesystem to run + * completely synchronously (you don't), the filesystem will always + * report writes as 'successful' before the actual work has been + * done. The actual work gets done here and this is where any errors + * will show up. And there isn't much we can do about it, really. + * + * Some attempts to fix the errors (move from bad blocks, retry io,...) + * have already been done, so anything left should be either a broken + * device or a bug somewhere in logfs itself. Being relatively new, + * the odds currently favor a bug, so for now the line below isn't + * entirely tasteles. 
+ */ + BUG_ON(ret); +} + +static void inode_write_block(struct logfs_block *block) +{ + struct inode *inode; + int ret; + + inode = block->inode; + if (inode->i_ino == LOGFS_INO_MASTER) + logfs_write_anchor(inode); + else { + ret = __logfs_write_inode(inode, 0); + /* see indirect_write_block comment */ + BUG_ON(ret); + } +} + +static gc_level_t inode_block_level(struct logfs_block *block) +{ + BUG_ON(block->inode->i_ino == LOGFS_INO_MASTER); + return GC_LEVEL(LOGFS_MAX_LEVELS); +} + +static gc_level_t indirect_block_level(struct logfs_block *block) +{ + struct page *page; + struct inode *inode; + u64 bix; + level_t level; + + page = block->page; + inode = page->mapping->host; + logfs_unpack_index(page->index, &bix, &level); + return expand_level(inode->i_ino, level); +} + +/* + * This silences a false, yet annoying gcc warning. I hate it when my editor + * jumps into bitops.h each time I recompile this file. + * TODO: Complain to gcc folks about this and upgrade compiler. + */ +static unsigned long fnb(const unsigned long *addr, + unsigned long size, unsigned long offset) +{ + return find_next_bit(addr, size, offset); +} + +static __be64 inode_val0(struct inode *inode) +{ + struct logfs_inode *li = logfs_inode(inode); + u64 val; + + /* + * Explicit shifting generates good code, but must match the format + * of the structure. Add some paranoia just in case. + */ + BUILD_BUG_ON(offsetof(struct logfs_disk_inode, di_mode) != 0); + BUILD_BUG_ON(offsetof(struct logfs_disk_inode, di_height) != 2); + BUILD_BUG_ON(offsetof(struct logfs_disk_inode, di_flags) != 4); + + val = (u64)inode->i_mode << 48 | + (u64)li->li_height << 40 | + (u64)li->li_flags; + return cpu_to_be64(val); +} + +static int inode_write_alias(struct super_block *sb, + struct logfs_block *block, write_alias_t *write_one_alias) +{ + struct inode *inode = block->inode; + struct logfs_inode *li = logfs_inode(inode); + unsigned long pos; + u64 ino , bix; + __be64 val; + level_t level; + int err; + + for (pos = 0; ; pos++) { + pos = fnb(block->alias_map, LOGFS_BLOCK_FACTOR, pos); + if (pos >= LOGFS_EMBEDDED_FIELDS + INODE_POINTER_OFS) + return 0; + + switch (pos) { + case INODE_HEIGHT_OFS: + val = inode_val0(inode); + break; + case INODE_USED_OFS: + val = cpu_to_be64(li->li_used_bytes);; + break; + case INODE_SIZE_OFS: + val = cpu_to_be64(i_size_read(inode)); + break; + case INODE_POINTER_OFS ... 
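
/*
 * Userspace model of inode_val0() above: mode, height and flags are packed
 * into one big-endian word so the first bytes of the on-disk inode can be
 * journalled as a single alias value.  Field widths follow the shifts in
 * inode_val0(); everything else here is illustration only.
 */
#include <inttypes.h>
#include <stdio.h>

static uint64_t model_val0(uint16_t mode, uint8_t height, uint32_t flags)
{
	return (uint64_t)mode << 48 |    /* bytes 0-1: di_mode   */
	       (uint64_t)height << 40 |  /* byte  2  : di_height */
	       (uint64_t)flags;          /* bytes 4-7: di_flags  */
}

int main(void)
{
	/* S_IFREG | 0644 is 0x81a4 */
	printf("val0 = %#018" PRIx64 "\n", model_val0(0x81a4, 3, 0x1));
	return 0;
}
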
INODE_POINTER_OFS + LOGFS_EMBEDDED_FIELDS - 1: + val = cpu_to_be64(li->li_data[pos - INODE_POINTER_OFS]); + break; + default: + BUG(); + } + + ino = LOGFS_INO_MASTER; + bix = inode->i_ino; + level = LEVEL(0); + err = write_one_alias(sb, ino, bix, level, pos, val); + if (err) + return err; + } +} + +static int indirect_write_alias(struct super_block *sb, + struct logfs_block *block, write_alias_t *write_one_alias) +{ + unsigned long pos; + struct page *page = block->page; + u64 ino , bix; + __be64 *child, val; + level_t level; + int err; + + for (pos = 0; ; pos++) { + pos = fnb(block->alias_map, LOGFS_BLOCK_FACTOR, pos); + if (pos >= LOGFS_BLOCK_FACTOR) + return 0; + + ino = page->mapping->host->i_ino; + logfs_unpack_index(page->index, &bix, &level); + child = kmap_atomic(page, KM_USER0); + val = child[pos]; + kunmap_atomic(child, KM_USER0); + err = write_one_alias(sb, ino, bix, level, pos, val); + if (err) + return err; + } +} + +int logfs_write_obj_aliases_pagecache(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_block *block; + int err; + + list_for_each_entry(block, &super->s_object_alias, alias_list) { + err = block->ops->write_alias(sb, block, write_alias_journal); + if (err) + return err; + } + return 0; +} + +void __free_block(struct super_block *sb, struct logfs_block *block) +{ + BUG_ON(!list_empty(&block->item_list)); + list_del(&block->alias_list); + mempool_free(block, logfs_super(sb)->s_block_pool); +} + +static void inode_free_block(struct super_block *sb, struct logfs_block *block) +{ + struct inode *inode = block->inode; + + logfs_inode(inode)->li_block = NULL; + __free_block(sb, block); +} + +static void indirect_free_block(struct super_block *sb, + struct logfs_block *block) +{ + ClearPagePrivate(block->page); + block->page->private = 0; + __free_block(sb, block); +} + + +static struct logfs_block_ops inode_block_ops = { + .write_block = inode_write_block, + .block_level = inode_block_level, + .free_block = inode_free_block, + .write_alias = inode_write_alias, +}; + +struct logfs_block_ops indirect_block_ops = { + .write_block = indirect_write_block, + .block_level = indirect_block_level, + .free_block = indirect_free_block, + .write_alias = indirect_write_alias, +}; + +struct logfs_block *__alloc_block(struct super_block *sb, + u64 ino, u64 bix, level_t level) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_block *block; + + block = mempool_alloc(super->s_block_pool, GFP_NOFS); + memset(block, 0, sizeof(*block)); + INIT_LIST_HEAD(&block->alias_list); + INIT_LIST_HEAD(&block->item_list); + block->sb = sb; + block->ino = ino; + block->bix = bix; + block->level = level; + return block; +} + +static void alloc_inode_block(struct inode *inode) +{ + struct logfs_inode *li = logfs_inode(inode); + struct logfs_block *block; + + if (li->li_block) + return; + + block = __alloc_block(inode->i_sb, LOGFS_INO_MASTER, inode->i_ino, 0); + block->inode = inode; + li->li_block = block; + block->ops = &inode_block_ops; +} + +void initialize_block_counters(struct page *page, struct logfs_block *block, + __be64 *array, int page_is_empty) +{ + u64 ptr; + int i, start; + + block->partial = 0; + block->full = 0; + start = 0; + if (page->index < first_indirect_block()) { + /* Counters are pointless on level 0 */ + return; + } + if (page->index == first_indirect_block()) { + /* Skip unused pointers */ + start = I0_BLOCKS; + block->full = I0_BLOCKS; + } + if (!page_is_empty) { + for (i = start; i < LOGFS_BLOCK_FACTOR; i++) { + ptr = 
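
/*
 * Model of the alias walk in indirect_write_alias() above: alias_map records
 * which slots of a block were changed in memory only, and each set bit is
 * replayed as one (ino, bix, level, pos, val) tuple to the journal.  The
 * bitmap helpers below are simplified stand-ins for find_next_bit().
 */
#include <stdint.h>
#include <stdio.h>

#define MODEL_BLOCK_FACTOR 512
#define BITS_PER_LONG (8 * (int)sizeof(unsigned long))

static int next_set_bit(const unsigned long *map, int size, int from)
{
	for (; from < size; from++)
		if (map[from / BITS_PER_LONG] & (1UL << (from % BITS_PER_LONG)))
			return from;
	return size;
}

int main(void)
{
	unsigned long alias_map[MODEL_BLOCK_FACTOR / BITS_PER_LONG] = { 0 };
	int pos;

	/* pretend slots 7 and 130 were modified in the page cache only */
	alias_map[7 / BITS_PER_LONG]   |= 1UL << (7 % BITS_PER_LONG);
	alias_map[130 / BITS_PER_LONG] |= 1UL << (130 % BITS_PER_LONG);

	for (pos = 0; ; pos++) {
		pos = next_set_bit(alias_map, MODEL_BLOCK_FACTOR, pos);
		if (pos >= MODEL_BLOCK_FACTOR)
			break;
		printf("journal alias for slot %d\n", pos);
	}
	return 0;
}
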
be64_to_cpu(array[i]); + if (ptr) + block->partial++; + if (ptr & LOGFS_FULLY_POPULATED) + block->full++; + } + } +} + +static void alloc_data_block(struct inode *inode, struct page *page) +{ + struct logfs_block *block; + u64 bix; + level_t level; + + if (PagePrivate(page)) + return; + + logfs_unpack_index(page->index, &bix, &level); + block = __alloc_block(inode->i_sb, inode->i_ino, bix, level); + block->page = page; + SetPagePrivate(page); + page->private = (unsigned long)block; + block->ops = &indirect_block_ops; +} + +static void alloc_indirect_block(struct inode *inode, struct page *page, + int page_is_empty) +{ + struct logfs_block *block; + __be64 *array; + + if (PagePrivate(page)) + return; + + alloc_data_block(inode, page); + + block = logfs_block(page); + array = kmap_atomic(page, KM_USER0); + initialize_block_counters(page, block, array, page_is_empty); + kunmap_atomic(array, KM_USER0); +} + +static void block_set_pointer(struct page *page, int index, u64 ptr) +{ + struct logfs_block *block = logfs_block(page); + __be64 *array; + u64 oldptr; + + BUG_ON(!block); + array = kmap_atomic(page, KM_USER0); + oldptr = be64_to_cpu(array[index]); + array[index] = cpu_to_be64(ptr); + kunmap_atomic(array, KM_USER0); + SetPageUptodate(page); + + block->full += !!(ptr & LOGFS_FULLY_POPULATED) + - !!(oldptr & LOGFS_FULLY_POPULATED); + block->partial += !!ptr - !!oldptr; +} + +static u64 block_get_pointer(struct page *page, int index) +{ + __be64 *block; + u64 ptr; + + block = kmap_atomic(page, KM_USER0); + ptr = be64_to_cpu(block[index]); + kunmap_atomic(block, KM_USER0); + return ptr; +} + +static int logfs_read_empty(struct page *page) +{ + zero_user_segment(page, 0, PAGE_CACHE_SIZE); + return 0; +} + +static int logfs_read_direct(struct inode *inode, struct page *page) +{ + struct logfs_inode *li = logfs_inode(inode); + pgoff_t index = page->index; + u64 block; + + block = li->li_data[index]; + if (!block) + return logfs_read_empty(page); + + return logfs_segment_read(inode, page, block, index, 0); +} + +static int logfs_read_loop(struct inode *inode, struct page *page, + int rw_context) +{ + struct logfs_inode *li = logfs_inode(inode); + u64 bix, bofs = li->li_data[INDIRECT_INDEX]; + level_t level, target_level; + int ret; + struct page *ipage; + + logfs_unpack_index(page->index, &bix, &target_level); + if (!bofs) + return logfs_read_empty(page); + + if (bix >= maxbix(li->li_height)) + return logfs_read_empty(page); + + for (level = LEVEL(li->li_height); + (__force u8)level > (__force u8)target_level; + level = SUBLEVEL(level)){ + ipage = logfs_get_page(inode, bix, level, rw_context); + if (!ipage) + return -ENOMEM; + + ret = logfs_segment_read(inode, ipage, bofs, bix, level); + if (ret) { + logfs_put_read_page(ipage); + return ret; + } + + bofs = block_get_pointer(ipage, get_bits(bix, SUBLEVEL(level))); + logfs_put_page(ipage, rw_context); + if (!bofs) + return logfs_read_empty(page); + } + + return logfs_segment_read(inode, page, bofs, bix, 0); +} + +static int logfs_read_block(struct inode *inode, struct page *page, + int rw_context) +{ + pgoff_t index = page->index; + + if (index < I0_BLOCKS) + return logfs_read_direct(inode, page); + return logfs_read_loop(inode, page, rw_context); +} + +static int logfs_exist_loop(struct inode *inode, u64 bix) +{ + struct logfs_inode *li = logfs_inode(inode); + u64 bofs = li->li_data[INDIRECT_INDEX]; + level_t level; + int ret; + struct page *ipage; + + if (!bofs) + return 0; + if (bix >= maxbix(li->li_height)) + return 0; + + for (level = 
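
/*
 * Model of the per-block counters maintained by initialize_block_counters()
 * and block_set_pointer() above.  A pointer of 0 means "hole"; the low bit
 * (LOGFS_FULLY_POPULATED, assumed to be 1 here) marks a child whose subtree
 * is completely populated.  Constants are illustrative stand-ins.
 */
#include <stdint.h>
#include <stdio.h>

#define MODEL_FULLY_POPULATED 1ULL
#define MODEL_BLOCK_FACTOR    512

struct model_block {
	int partial; /* slots with any data below them */
	int full;    /* slots whose subtree is complete */
};

static void model_set_pointer(struct model_block *b, uint64_t *slot, uint64_t ptr)
{
	uint64_t old = *slot;

	*slot = ptr;
	b->full    += !!(ptr & MODEL_FULLY_POPULATED) - !!(old & MODEL_FULLY_POPULATED);
	b->partial += !!ptr - !!old;
}

int main(void)
{
	uint64_t slots[MODEL_BLOCK_FACTOR] = { 0 };
	struct model_block b = { 0 };

	model_set_pointer(&b, &slots[0], 0x8000);                         /* sparse child */
	model_set_pointer(&b, &slots[1], 0x9000 | MODEL_FULLY_POPULATED); /* full child */
	model_set_pointer(&b, &slots[0], 0);                              /* punch a hole */
	printf("partial=%d full=%d\n", b.partial, b.full);                /* 1 and 1 */
	return 0;
}
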
LEVEL(li->li_height); level != 0; level = SUBLEVEL(level)) { + ipage = logfs_get_read_page(inode, bix, level); + if (!ipage) + return -ENOMEM; + + ret = logfs_segment_read(inode, ipage, bofs, bix, level); + if (ret) { + logfs_put_read_page(ipage); + return ret; + } + + bofs = block_get_pointer(ipage, get_bits(bix, SUBLEVEL(level))); + logfs_put_read_page(ipage); + if (!bofs) + return 0; + } + + return 1; +} + +int logfs_exist_block(struct inode *inode, u64 bix) +{ + struct logfs_inode *li = logfs_inode(inode); + + if (bix < I0_BLOCKS) + return !!li->li_data[bix]; + return logfs_exist_loop(inode, bix); +} + +static u64 seek_holedata_direct(struct inode *inode, u64 bix, int data) +{ + struct logfs_inode *li = logfs_inode(inode); + + for (; bix < I0_BLOCKS; bix++) + if (data ^ (li->li_data[bix] == 0)) + return bix; + return I0_BLOCKS; +} + +static u64 seek_holedata_loop(struct inode *inode, u64 bix, int data) +{ + struct logfs_inode *li = logfs_inode(inode); + __be64 *rblock; + u64 increment, bofs = li->li_data[INDIRECT_INDEX]; + level_t level; + int ret, slot; + struct page *page; + + BUG_ON(!bofs); + + for (level = LEVEL(li->li_height); level != 0; level = SUBLEVEL(level)) { + increment = 1 << (LOGFS_BLOCK_BITS * ((__force u8)level-1)); + page = logfs_get_read_page(inode, bix, level); + if (!page) + return bix; + + ret = logfs_segment_read(inode, page, bofs, bix, level); + if (ret) { + logfs_put_read_page(page); + return bix; + } + + slot = get_bits(bix, SUBLEVEL(level)); + rblock = kmap_atomic(page, KM_USER0); + while (slot < LOGFS_BLOCK_FACTOR) { + if (data && (rblock[slot] != 0)) + break; + if (!data && !(be64_to_cpu(rblock[slot]) & LOGFS_FULLY_POPULATED)) + break; + slot++; + bix += increment; + bix &= ~(increment - 1); + } + if (slot >= LOGFS_BLOCK_FACTOR) { + kunmap_atomic(rblock, KM_USER0); + logfs_put_read_page(page); + return bix; + } + bofs = be64_to_cpu(rblock[slot]); + kunmap_atomic(rblock, KM_USER0); + logfs_put_read_page(page); + if (!bofs) { + BUG_ON(data); + return bix; + } + } + return bix; +} + +/** + * logfs_seek_hole - find next hole starting at a given block index + * @inode: inode to search in + * @bix: block index to start searching + * + * Returns next hole. If the file doesn't contain any further holes, the + * block address next to eof is returned instead. + */ +u64 logfs_seek_hole(struct inode *inode, u64 bix) +{ + struct logfs_inode *li = logfs_inode(inode); + + if (bix < I0_BLOCKS) { + bix = seek_holedata_direct(inode, bix, 0); + if (bix < I0_BLOCKS) + return bix; + } + + if (!li->li_data[INDIRECT_INDEX]) + return bix; + else if (li->li_data[INDIRECT_INDEX] & LOGFS_FULLY_POPULATED) + bix = maxbix(li->li_height); + else { + bix = seek_holedata_loop(inode, bix, 0); + if (bix < maxbix(li->li_height)) + return bix; + /* Should not happen anymore. But if some port writes semi- + * corrupt images (as this one used to) we might run into it. 
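
/*
 * Model of the index arithmetic in seek_holedata_loop() above: when the
 * child slot covering bix holds nothing of interest, the search jumps to
 * the first block index of the next child subtree at that level.
 * LOGFS_BLOCK_BITS is assumed to be 9, as in the other sketches.
 */
#include <inttypes.h>
#include <stdio.h>

#define MODEL_BLOCK_BITS 9

static uint64_t next_subtree(uint64_t bix, int level)
{
	uint64_t increment = 1ULL << (MODEL_BLOCK_BITS * (level - 1));

	bix += increment;
	bix &= ~(increment - 1);   /* align to the subtree boundary */
	return bix;
}

int main(void)
{
	/* At level 2 each slot covers 512 blocks; 1000 lies in the subtree
	 * covering blocks 512..1023, so skipping it continues at 1024. */
	printf("%" PRIu64 "\n", next_subtree(1000, 2));
	return 0;
}
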
+ */ + WARN_ON_ONCE(bix == maxbix(li->li_height)); + } + + return bix; +} + +static u64 __logfs_seek_data(struct inode *inode, u64 bix) +{ + struct logfs_inode *li = logfs_inode(inode); + + if (bix < I0_BLOCKS) { + bix = seek_holedata_direct(inode, bix, 1); + if (bix < I0_BLOCKS) + return bix; + } + + if (bix < maxbix(li->li_height)) { + if (!li->li_data[INDIRECT_INDEX]) + bix = maxbix(li->li_height); + else + return seek_holedata_loop(inode, bix, 1); + } + + return bix; +} + +/** + * logfs_seek_data - find next data block after a given block index + * @inode: inode to search in + * @bix: block index to start searching + * + * Returns next data block. If the file doesn't contain any further data + * blocks, the last block in the file is returned instead. + */ +u64 logfs_seek_data(struct inode *inode, u64 bix) +{ + struct super_block *sb = inode->i_sb; + u64 ret, end; + + ret = __logfs_seek_data(inode, bix); + end = i_size_read(inode) >> sb->s_blocksize_bits; + if (ret >= end) + ret = max(bix, end); + return ret; +} + +static int logfs_is_valid_direct(struct logfs_inode *li, u64 bix, u64 ofs) +{ + return pure_ofs(li->li_data[bix]) == ofs; +} + +static int __logfs_is_valid_loop(struct inode *inode, u64 bix, + u64 ofs, u64 bofs) +{ + struct logfs_inode *li = logfs_inode(inode); + level_t level; + int ret; + struct page *page; + + for (level = LEVEL(li->li_height); level != 0; level = SUBLEVEL(level)){ + page = logfs_get_write_page(inode, bix, level); + BUG_ON(!page); + + ret = logfs_segment_read(inode, page, bofs, bix, level); + if (ret) { + logfs_put_write_page(page); + return 0; + } + + bofs = block_get_pointer(page, get_bits(bix, SUBLEVEL(level))); + logfs_put_write_page(page); + if (!bofs) + return 0; + + if (pure_ofs(bofs) == ofs) + return 1; + } + return 0; +} + +static int logfs_is_valid_loop(struct inode *inode, u64 bix, u64 ofs) +{ + struct logfs_inode *li = logfs_inode(inode); + u64 bofs = li->li_data[INDIRECT_INDEX]; + + if (!bofs) + return 0; + + if (bix >= maxbix(li->li_height)) + return 0; + + if (pure_ofs(bofs) == ofs) + return 1; + + return __logfs_is_valid_loop(inode, bix, ofs, bofs); +} + +static int __logfs_is_valid_block(struct inode *inode, u64 bix, u64 ofs) +{ + struct logfs_inode *li = logfs_inode(inode); + + if ((inode->i_nlink == 0) && atomic_read(&inode->i_count) == 1) + return 0; + + if (bix < I0_BLOCKS) + return logfs_is_valid_direct(li, bix, ofs); + return logfs_is_valid_loop(inode, bix, ofs); +} + +/** + * logfs_is_valid_block - check whether this block is still valid + * + * @sb - superblock + * @ofs - block physical offset + * @ino - block inode number + * @bix - block index + * @level - block level + * + * Returns 0 if the block is invalid, 1 if it is valid and 2 if it will + * become invalid once the journal is written. + */ +int logfs_is_valid_block(struct super_block *sb, u64 ofs, u64 ino, u64 bix, + gc_level_t gc_level) +{ + struct logfs_super *super = logfs_super(sb); + struct inode *inode; + int ret, cookie; + + /* Umount closes a segment with free blocks remaining. Those + * blocks are by definition invalid. */ + if (ino == -1) + return 0; + + LOGFS_BUG_ON((u64)(u_long)ino != ino, sb); + + inode = logfs_safe_iget(sb, ino, &cookie); + if (IS_ERR(inode)) + goto invalid; + + ret = __logfs_is_valid_block(inode, bix, ofs); + logfs_safe_iput(inode, cookie); + if (ret) + return ret; + +invalid: + /* Block is nominally invalid, but may still sit in the shadow tree, + * waiting for a journal commit. 
+ */ + if (btree_lookup64(&super->s_shadow_tree.old, ofs)) + return 2; + return 0; +} + +int logfs_readpage_nolock(struct page *page) +{ + struct inode *inode = page->mapping->host; + int ret = -EIO; + + ret = logfs_read_block(inode, page, READ); + + if (ret) { + ClearPageUptodate(page); + SetPageError(page); + } else { + SetPageUptodate(page); + ClearPageError(page); + } + flush_dcache_page(page); + + return ret; +} + +static int logfs_reserve_bytes(struct inode *inode, int bytes) +{ + struct logfs_super *super = logfs_super(inode->i_sb); + u64 available = super->s_free_bytes + super->s_dirty_free_bytes + - super->s_dirty_used_bytes - super->s_dirty_pages; + + if (!bytes) + return 0; + + if (available < bytes) + return -ENOSPC; + + if (available < bytes + super->s_root_reserve && + !capable(CAP_SYS_RESOURCE)) + return -ENOSPC; + + return 0; +} + +int get_page_reserve(struct inode *inode, struct page *page) +{ + struct logfs_super *super = logfs_super(inode->i_sb); + int ret; + + if (logfs_block(page) && logfs_block(page)->reserved_bytes) + return 0; + + logfs_get_wblocks(inode->i_sb, page, WF_LOCK); + ret = logfs_reserve_bytes(inode, 6 * LOGFS_MAX_OBJECTSIZE); + if (!ret) { + alloc_data_block(inode, page); + logfs_block(page)->reserved_bytes += 6 * LOGFS_MAX_OBJECTSIZE; + super->s_dirty_pages += 6 * LOGFS_MAX_OBJECTSIZE; + } + logfs_put_wblocks(inode->i_sb, page, WF_LOCK); + return ret; +} + +/* + * We are protected by write lock. Push victims up to superblock level + * and release transaction when appropriate. + */ +/* FIXME: This is currently called from the wrong spots. */ +static void logfs_handle_transaction(struct inode *inode, + struct logfs_transaction *ta) +{ + struct logfs_super *super = logfs_super(inode->i_sb); + + if (!ta) + return; + logfs_inode(inode)->li_block->ta = NULL; + + if (inode->i_ino != LOGFS_INO_MASTER) { + BUG(); /* FIXME: Yes, this needs more thought */ + /* just remember the transaction until inode is written */ + //BUG_ON(logfs_inode(inode)->li_transaction); + //logfs_inode(inode)->li_transaction = ta; + return; + } + + switch (ta->state) { + case CREATE_1: /* fall through */ + case UNLINK_1: + BUG_ON(super->s_victim_ino); + super->s_victim_ino = ta->ino; + break; + case CREATE_2: /* fall through */ + case UNLINK_2: + BUG_ON(super->s_victim_ino != ta->ino); + super->s_victim_ino = 0; + /* transaction ends here - free it */ + kfree(ta); + break; + case CROSS_RENAME_1: + BUG_ON(super->s_rename_dir); + BUG_ON(super->s_rename_pos); + super->s_rename_dir = ta->dir; + super->s_rename_pos = ta->pos; + break; + case CROSS_RENAME_2: + BUG_ON(super->s_rename_dir != ta->dir); + BUG_ON(super->s_rename_pos != ta->pos); + super->s_rename_dir = 0; + super->s_rename_pos = 0; + kfree(ta); + break; + case TARGET_RENAME_1: + BUG_ON(super->s_rename_dir); + BUG_ON(super->s_rename_pos); + BUG_ON(super->s_victim_ino); + super->s_rename_dir = ta->dir; + super->s_rename_pos = ta->pos; + super->s_victim_ino = ta->ino; + break; + case TARGET_RENAME_2: + BUG_ON(super->s_rename_dir != ta->dir); + BUG_ON(super->s_rename_pos != ta->pos); + BUG_ON(super->s_victim_ino != ta->ino); + super->s_rename_dir = 0; + super->s_rename_pos = 0; + break; + case TARGET_RENAME_3: + BUG_ON(super->s_rename_dir); + BUG_ON(super->s_rename_pos); + BUG_ON(super->s_victim_ino != ta->ino); + super->s_victim_ino = 0; + kfree(ta); + break; + default: + BUG(); + } +} + +/* + * Not strictly a reservation, but rather a check that we still have enough + * space to satisfy the write. 
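
/*
 * Model of the space check in logfs_reserve_bytes() above: writes are
 * admitted against free space minus everything already promised to dirty
 * data, and ordinary users must additionally leave the root reserve
 * untouched.  All numbers below are made up for illustration.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct model_super {
	int64_t free_bytes;
	int64_t dirty_free_bytes;
	int64_t dirty_used_bytes;
	int64_t dirty_pages;
	int64_t root_reserve;
};

static int model_reserve(const struct model_super *s, int64_t bytes, bool privileged)
{
	int64_t available = s->free_bytes + s->dirty_free_bytes
			  - s->dirty_used_bytes - s->dirty_pages;

	if (!bytes)
		return 0;
	if (available < bytes)
		return -1;                           /* -ENOSPC */
	if (available < bytes + s->root_reserve && !privileged)
		return -1;                           /* reserve is root-only */
	return 0;
}

int main(void)
{
	struct model_super s = {
		.free_bytes = 1 << 20, .dirty_free_bytes = 4096,
		.dirty_used_bytes = 8192, .dirty_pages = 4096,
		.root_reserve = 1 << 19,
	};

	printf("user 512KiB: %d\n", model_reserve(&s, 512 << 10, false));
	printf("root 512KiB: %d\n", model_reserve(&s, 512 << 10, true));
	return 0;
}
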
+ */ +static int logfs_reserve_blocks(struct inode *inode, int blocks) +{ + return logfs_reserve_bytes(inode, blocks * LOGFS_MAX_OBJECTSIZE); +} + +struct write_control { + u64 ofs; + long flags; +}; + +static struct logfs_shadow *alloc_shadow(struct inode *inode, u64 bix, + level_t level, u64 old_ofs) +{ + struct logfs_super *super = logfs_super(inode->i_sb); + struct logfs_shadow *shadow; + + shadow = mempool_alloc(super->s_shadow_pool, GFP_NOFS); + memset(shadow, 0, sizeof(*shadow)); + shadow->ino = inode->i_ino; + shadow->bix = bix; + shadow->gc_level = expand_level(inode->i_ino, level); + shadow->old_ofs = old_ofs & ~LOGFS_FULLY_POPULATED; + return shadow; +} + +static void free_shadow(struct inode *inode, struct logfs_shadow *shadow) +{ + struct logfs_super *super = logfs_super(inode->i_sb); + + mempool_free(shadow, super->s_shadow_pool); +} + +/** + * fill_shadow_tree - Propagate shadow tree changes due to a write + * @inode: Inode owning the page + * @page: Struct page that was written + * @shadow: Shadow for the current write + * + * Writes in logfs can result in two semi-valid objects. The old object + * is still valid as long as it can be reached by following pointers on + * the medium. Only when writes propagate all the way up to the journal + * has the new object safely replaced the old one. + * + * To handle this problem, a struct logfs_shadow is used to represent + * every single write. It is attached to the indirect block, which is + * marked dirty. When the indirect block is written, its shadows are + * handed up to the next indirect block (or inode). Untimately they + * will reach the master inode and be freed upon journal commit. + * + * This function handles a single step in the propagation. It adds the + * shadow for the current write to the tree, along with any shadows in + * the page's tree, in case it was an indirect block. If a page is + * written, the inode parameter is left NULL, if an inode is written, + * the page parameter is left NULL. + */ +static void fill_shadow_tree(struct inode *inode, struct page *page, + struct logfs_shadow *shadow) +{ + struct logfs_super *super = logfs_super(inode->i_sb); + struct logfs_block *block = logfs_block(page); + struct shadow_tree *tree = &super->s_shadow_tree; + + if (PagePrivate(page)) { + if (block->alias_map) + super->s_no_object_aliases -= bitmap_weight( + block->alias_map, LOGFS_BLOCK_FACTOR); + logfs_handle_transaction(inode, block->ta); + block->ops->free_block(inode->i_sb, block); + } + if (shadow) { + if (shadow->old_ofs) + btree_insert64(&tree->old, shadow->old_ofs, shadow, + GFP_NOFS); + else + btree_insert64(&tree->new, shadow->new_ofs, shadow, + GFP_NOFS); + + super->s_dirty_used_bytes += shadow->new_len; + super->s_dirty_free_bytes += shadow->old_len; + } +} + +static void logfs_set_alias(struct super_block *sb, struct logfs_block *block, + long child_no) +{ + struct logfs_super *super = logfs_super(sb); + + if (block->inode && block->inode->i_ino == LOGFS_INO_MASTER) { + /* Aliases in the master inode are pointless. */ + return; + } + + if (!test_bit(child_no, block->alias_map)) { + set_bit(child_no, block->alias_map); + super->s_no_object_aliases++; + } + list_move_tail(&block->alias_list, &super->s_object_alias); +} + +/* + * Object aliases can and often do change the size and occupied space of a + * file. So not only do we have to change the pointers, we also have to + * change inode->i_size and li->li_used_bytes. Which is done by setting + * another two object aliases for the inode itself. 
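
/*
 * A tiny model of the accounting in fill_shadow_tree() above: every write
 * produces a shadow; the newly written space stays "dirty used" and the
 * space of the overwritten object stays "dirty free" until the journal
 * commits.  The structures are simplified stand-ins; in logfs the shadow is
 * also inserted into the old/new btrees keyed by its offsets, and the
 * lengths include the object header.
 */
#include <stdint.h>
#include <stdio.h>

struct model_shadow {
	uint64_t old_ofs, new_ofs;
	uint32_t old_len, new_len;
};

struct model_accounting {
	uint64_t dirty_used_bytes;  /* new data not yet covered by the journal */
	uint64_t dirty_free_bytes;  /* old data reclaimable after journal commit */
};

static void model_fill_shadow(struct model_accounting *acc,
			      const struct model_shadow *sh)
{
	acc->dirty_used_bytes += sh->new_len;
	acc->dirty_free_bytes += sh->old_len;
}

int main(void)
{
	struct model_accounting acc = { 0 };
	struct model_shadow rewrite = {
		.old_ofs = 0x4000, .new_ofs = 0x9000,
		.old_len = 4096, .new_len = 4096,
	};

	model_fill_shadow(&acc, &rewrite);
	printf("dirty_used=%llu dirty_free=%llu\n",
	       (unsigned long long)acc.dirty_used_bytes,
	       (unsigned long long)acc.dirty_free_bytes);
	return 0;
}
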
+ */ +static void set_iused(struct inode *inode, struct logfs_shadow *shadow) +{ + struct logfs_inode *li = logfs_inode(inode); + + if (shadow->new_len == shadow->old_len) + return; + + alloc_inode_block(inode); + li->li_used_bytes += shadow->new_len - shadow->old_len; + __logfs_set_blocks(inode); + logfs_set_alias(inode->i_sb, li->li_block, INODE_USED_OFS); + logfs_set_alias(inode->i_sb, li->li_block, INODE_SIZE_OFS); +} + +static int logfs_write_i0(struct inode *inode, struct page *page, + struct write_control *wc) +{ + struct logfs_shadow *shadow; + u64 bix; + level_t level; + int full, err = 0; + + logfs_unpack_index(page->index, &bix, &level); + if (wc->ofs == 0) + if (logfs_reserve_blocks(inode, 1)) + return -ENOSPC; + + shadow = alloc_shadow(inode, bix, level, wc->ofs); + if (wc->flags & WF_WRITE) + err = logfs_segment_write(inode, page, shadow); + if (wc->flags & WF_DELETE) + logfs_segment_delete(inode, shadow); + if (err) { + free_shadow(inode, shadow); + return err; + } + + set_iused(inode, shadow); + full = 1; + if (level != 0) { + alloc_indirect_block(inode, page, 0); + full = logfs_block(page)->full == LOGFS_BLOCK_FACTOR; + } + fill_shadow_tree(inode, page, shadow); + wc->ofs = shadow->new_ofs; + if (wc->ofs && full) + wc->ofs |= LOGFS_FULLY_POPULATED; + return 0; +} + +static int logfs_write_direct(struct inode *inode, struct page *page, + long flags) +{ + struct logfs_inode *li = logfs_inode(inode); + struct write_control wc = { + .ofs = li->li_data[page->index], + .flags = flags, + }; + int err; + + alloc_inode_block(inode); + + err = logfs_write_i0(inode, page, &wc); + if (err) + return err; + + li->li_data[page->index] = wc.ofs; + logfs_set_alias(inode->i_sb, li->li_block, + page->index + INODE_POINTER_OFS); + return 0; +} + +static int ptr_change(u64 ofs, struct page *page) +{ + struct logfs_block *block = logfs_block(page); + int empty0, empty1, full0, full1; + + empty0 = ofs == 0; + empty1 = block->partial == 0; + if (empty0 != empty1) + return 1; + + /* The !! 
is necessary to shrink result to int */ + full0 = !!(ofs & LOGFS_FULLY_POPULATED); + full1 = block->full == LOGFS_BLOCK_FACTOR; + if (full0 != full1) + return 1; + return 0; +} + +static int __logfs_write_rec(struct inode *inode, struct page *page, + struct write_control *this_wc, + pgoff_t bix, level_t target_level, level_t level) +{ + int ret, page_empty = 0; + int child_no = get_bits(bix, SUBLEVEL(level)); + struct page *ipage; + struct write_control child_wc = { + .flags = this_wc->flags, + }; + + ipage = logfs_get_write_page(inode, bix, level); + if (!ipage) + return -ENOMEM; + + if (this_wc->ofs) { + ret = logfs_segment_read(inode, ipage, this_wc->ofs, bix, level); + if (ret) + goto out; + } else if (!PageUptodate(ipage)) { + page_empty = 1; + logfs_read_empty(ipage); + } + + child_wc.ofs = block_get_pointer(ipage, child_no); + + if ((__force u8)level-1 > (__force u8)target_level) + ret = __logfs_write_rec(inode, page, &child_wc, bix, + target_level, SUBLEVEL(level)); + else + ret = logfs_write_i0(inode, page, &child_wc); + + if (ret) + goto out; + + alloc_indirect_block(inode, ipage, page_empty); + block_set_pointer(ipage, child_no, child_wc.ofs); + /* FIXME: first condition seems superfluous */ + if (child_wc.ofs || logfs_block(ipage)->partial) + this_wc->flags |= WF_WRITE; + /* the condition on this_wc->ofs ensures that we won't consume extra + * space for indirect blocks in the future, which we cannot reserve */ + if (!this_wc->ofs || ptr_change(this_wc->ofs, ipage)) + ret = logfs_write_i0(inode, ipage, this_wc); + else + logfs_set_alias(inode->i_sb, logfs_block(ipage), child_no); +out: + logfs_put_write_page(ipage); + return ret; +} + +static int logfs_write_rec(struct inode *inode, struct page *page, + pgoff_t bix, level_t target_level, long flags) +{ + struct logfs_inode *li = logfs_inode(inode); + struct write_control wc = { + .ofs = li->li_data[INDIRECT_INDEX], + .flags = flags, + }; + int ret; + + alloc_inode_block(inode); + + if (li->li_height > (__force u8)target_level) + ret = __logfs_write_rec(inode, page, &wc, bix, target_level, + LEVEL(li->li_height)); + else + ret = logfs_write_i0(inode, page, &wc); + if (ret) + return ret; + + if (li->li_data[INDIRECT_INDEX] != wc.ofs) { + li->li_data[INDIRECT_INDEX] = wc.ofs; + logfs_set_alias(inode->i_sb, li->li_block, + INDIRECT_INDEX + INODE_POINTER_OFS); + } + return ret; +} + +void logfs_add_transaction(struct inode *inode, struct logfs_transaction *ta) +{ + alloc_inode_block(inode); + logfs_inode(inode)->li_block->ta = ta; +} + +void logfs_del_transaction(struct inode *inode, struct logfs_transaction *ta) +{ + struct logfs_block *block = logfs_inode(inode)->li_block; + + if (block && block->ta) + block->ta = NULL; +} + +static int grow_inode(struct inode *inode, u64 bix, level_t level) +{ + struct logfs_inode *li = logfs_inode(inode); + u8 height = (__force u8)level; + struct page *page; + struct write_control wc = { + .flags = WF_WRITE, + }; + int err; + + BUG_ON(height > 5 || li->li_height > 5); + while (height > li->li_height || bix >= maxbix(li->li_height)) { + page = logfs_get_write_page(inode, I0_BLOCKS + 1, + LEVEL(li->li_height + 1)); + if (!page) + return -ENOMEM; + logfs_read_empty(page); + alloc_indirect_block(inode, page, 1); + block_set_pointer(page, 0, li->li_data[INDIRECT_INDEX]); + err = logfs_write_i0(inode, page, &wc); + logfs_put_write_page(page); + if (err) + return err; + li->li_data[INDIRECT_INDEX] = wc.ofs; + wc.ofs = 0; + li->li_height++; + logfs_set_alias(inode->i_sb, li->li_block, INODE_HEIGHT_OFS); 
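
/*
 * Model of ptr_change() above: the parent indirect block only needs to be
 * rewritten itself when the child pointer flips between empty/non-empty or
 * between partially and fully populated; otherwise recording an object
 * alias is enough.  Constants are assumed stand-ins for the logfs values.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MODEL_FULLY_POPULATED 1ULL
#define MODEL_BLOCK_FACTOR    512

struct model_block {
	int partial, full;
};

static bool model_ptr_change(uint64_t old_ofs, const struct model_block *b)
{
	bool empty0 = old_ofs == 0;
	bool empty1 = b->partial == 0;
	bool full0 = old_ofs & MODEL_FULLY_POPULATED;
	bool full1 = b->full == MODEL_BLOCK_FACTOR;

	return empty0 != empty1 || full0 != full1;
}

int main(void)
{
	struct model_block still_sparse = { .partial = 3, .full = 0 };
	struct model_block now_empty    = { .partial = 0, .full = 0 };

	printf("%d %d\n",
	       model_ptr_change(0x8000, &still_sparse), /* 0: alias is enough */
	       model_ptr_change(0x8000, &now_empty));   /* 1: parent rewritten */
	return 0;
}
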
+ } + return 0; +} + +static int __logfs_write_buf(struct inode *inode, struct page *page, long flags) +{ + struct logfs_super *super = logfs_super(inode->i_sb); + pgoff_t index = page->index; + u64 bix; + level_t level; + int err; + + flags |= WF_WRITE | WF_DELETE; + inode->i_ctime = inode->i_mtime = CURRENT_TIME; + + logfs_unpack_index(index, &bix, &level); + if (logfs_block(page) && logfs_block(page)->reserved_bytes) + super->s_dirty_pages -= logfs_block(page)->reserved_bytes; + + if (index < I0_BLOCKS) + return logfs_write_direct(inode, page, flags); + + bix = adjust_bix(bix, level); + err = grow_inode(inode, bix, level); + if (err) + return err; + return logfs_write_rec(inode, page, bix, level, flags); +} + +int logfs_write_buf(struct inode *inode, struct page *page, long flags) +{ + struct super_block *sb = inode->i_sb; + int ret; + + logfs_get_wblocks(sb, page, flags & WF_LOCK); + ret = __logfs_write_buf(inode, page, flags); + logfs_put_wblocks(sb, page, flags & WF_LOCK); + return ret; +} + +static int __logfs_delete(struct inode *inode, struct page *page) +{ + long flags = WF_DELETE; + + inode->i_ctime = inode->i_mtime = CURRENT_TIME; + + if (page->index < I0_BLOCKS) + return logfs_write_direct(inode, page, flags); + return logfs_write_rec(inode, page, page->index, 0, flags); +} + +int logfs_delete(struct inode *inode, pgoff_t index, + struct shadow_tree *shadow_tree) +{ + struct super_block *sb = inode->i_sb; + struct page *page; + int ret; + + page = logfs_get_read_page(inode, index, 0); + if (!page) + return -ENOMEM; + + logfs_get_wblocks(sb, page, 1); + ret = __logfs_delete(inode, page); + logfs_put_wblocks(sb, page, 1); + + logfs_put_read_page(page); + + return ret; +} + +/* Rewrite cannot mark the inode dirty but has to write it immediatly. */ +int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs, + gc_level_t gc_level, long flags) +{ + level_t level = shrink_level(gc_level); + struct page *page; + int err; + + page = logfs_get_write_page(inode, bix, level); + if (!page) + return -ENOMEM; + + err = logfs_segment_read(inode, page, ofs, bix, level); + if (!err) { + if (level != 0) + alloc_indirect_block(inode, page, 0); + err = logfs_write_buf(inode, page, flags); + } + logfs_put_write_page(page); + return err; +} + +static int truncate_data_block(struct inode *inode, struct page *page, + u64 ofs, struct logfs_shadow *shadow, u64 size) +{ + loff_t pageofs = page->index << inode->i_sb->s_blocksize_bits; + u64 bix; + level_t level; + int err; + + /* Does truncation happen within this page? 
*/ + if (size <= pageofs || size - pageofs >= PAGE_SIZE) + return 0; + + logfs_unpack_index(page->index, &bix, &level); + BUG_ON(level != 0); + + err = logfs_segment_read(inode, page, ofs, bix, level); + if (err) + return err; + + zero_user_segment(page, size - pageofs, PAGE_CACHE_SIZE); + return logfs_segment_write(inode, page, shadow); +} + +static int logfs_truncate_i0(struct inode *inode, struct page *page, + struct write_control *wc, u64 size) +{ + struct logfs_shadow *shadow; + u64 bix; + level_t level; + int err = 0; + + logfs_unpack_index(page->index, &bix, &level); + BUG_ON(level != 0); + shadow = alloc_shadow(inode, bix, level, wc->ofs); + + err = truncate_data_block(inode, page, wc->ofs, shadow, size); + if (err) { + free_shadow(inode, shadow); + return err; + } + + logfs_segment_delete(inode, shadow); + set_iused(inode, shadow); + fill_shadow_tree(inode, page, shadow); + wc->ofs = shadow->new_ofs; + return 0; +} + +static int logfs_truncate_direct(struct inode *inode, u64 size) +{ + struct logfs_inode *li = logfs_inode(inode); + struct write_control wc; + struct page *page; + int e; + int err; + + alloc_inode_block(inode); + + for (e = I0_BLOCKS - 1; e >= 0; e--) { + if (size > (e+1) * LOGFS_BLOCKSIZE) + break; + + wc.ofs = li->li_data[e]; + if (!wc.ofs) + continue; + + page = logfs_get_write_page(inode, e, 0); + if (!page) + return -ENOMEM; + err = logfs_segment_read(inode, page, wc.ofs, e, 0); + if (err) { + logfs_put_write_page(page); + return err; + } + err = logfs_truncate_i0(inode, page, &wc, size); + logfs_put_write_page(page); + if (err) + return err; + + li->li_data[e] = wc.ofs; + } + return 0; +} + +/* FIXME: these need to become per-sb once we support different blocksizes */ +static u64 __logfs_step[] = { + 1, + I1_BLOCKS, + I2_BLOCKS, + I3_BLOCKS, +}; + +static u64 __logfs_start_index[] = { + I0_BLOCKS, + I1_BLOCKS, + I2_BLOCKS, + I3_BLOCKS +}; + +static inline u64 logfs_step(level_t level) +{ + return __logfs_step[(__force u8)level]; +} + +static inline u64 logfs_factor(u8 level) +{ + return __logfs_step[level] * LOGFS_BLOCKSIZE; +} + +static inline u64 logfs_start_index(level_t level) +{ + return __logfs_start_index[(__force u8)level]; +} + +static void logfs_unpack_raw_index(pgoff_t index, u64 *bix, level_t *level) +{ + logfs_unpack_index(index, bix, level); + if (*bix <= logfs_start_index(SUBLEVEL(*level))) + *bix = 0; +} + +static int __logfs_truncate_rec(struct inode *inode, struct page *ipage, + struct write_control *this_wc, u64 size) +{ + int truncate_happened = 0; + int e, err = 0; + u64 bix, child_bix, next_bix; + level_t level; + struct page *page; + struct write_control child_wc = { /* FIXME: flags */ }; + + logfs_unpack_raw_index(ipage->index, &bix, &level); + err = logfs_segment_read(inode, ipage, this_wc->ofs, bix, level); + if (err) + return err; + + for (e = LOGFS_BLOCK_FACTOR - 1; e >= 0; e--) { + child_bix = bix + e * logfs_step(SUBLEVEL(level)); + next_bix = child_bix + logfs_step(SUBLEVEL(level)); + if (size > next_bix * LOGFS_BLOCKSIZE) + break; + + child_wc.ofs = pure_ofs(block_get_pointer(ipage, e)); + if (!child_wc.ofs) + continue; + + page = logfs_get_write_page(inode, child_bix, SUBLEVEL(level)); + if (!page) + return -ENOMEM; + + if ((__force u8)level > 1) + err = __logfs_truncate_rec(inode, page, &child_wc, size); + else + err = logfs_truncate_i0(inode, page, &child_wc, size); + logfs_put_write_page(page); + if (err) + return err; + + truncate_happened = 1; + alloc_indirect_block(inode, ipage, 0); + block_set_pointer(ipage, e, 
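
/*
 * Model of the check at the top of truncate_data_block() above: only the
 * block containing the new EOF needs its tail zeroed and rewritten; blocks
 * entirely before or after the cut are handled purely by pointer updates.
 * The 4KiB block/page size is an assumption for illustration.
 */
#include <stdint.h>
#include <stdio.h>

#define MODEL_BLOCKSIZE 4096u

static int tail_to_zero(uint64_t page_index, uint64_t new_size,
			unsigned *zero_from)
{
	uint64_t pageofs = page_index * MODEL_BLOCKSIZE;

	if (new_size <= pageofs || new_size - pageofs >= MODEL_BLOCKSIZE)
		return 0;                       /* page unaffected */
	*zero_from = (unsigned)(new_size - pageofs);
	return 1;                               /* zero [zero_from, blocksize) */
}

int main(void)
{
	unsigned from;

	if (tail_to_zero(2, 10000, &from))      /* EOF at 10000 lands in block 2 */
		printf("zero bytes %u..%u of block 2\n", from, MODEL_BLOCKSIZE - 1);
	return 0;
}
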
child_wc.ofs); + } + + if (!truncate_happened) { + printk("ineffectual truncate (%lx, %lx, %llx)\n", inode->i_ino, ipage->index, size); + return 0; + } + + this_wc->flags = WF_DELETE; + if (logfs_block(ipage)->partial) + this_wc->flags |= WF_WRITE; + + return logfs_write_i0(inode, ipage, this_wc); +} + +static int logfs_truncate_rec(struct inode *inode, u64 size) +{ + struct logfs_inode *li = logfs_inode(inode); + struct write_control wc = { + .ofs = li->li_data[INDIRECT_INDEX], + }; + struct page *page; + int err; + + alloc_inode_block(inode); + + if (!wc.ofs) + return 0; + + page = logfs_get_write_page(inode, 0, LEVEL(li->li_height)); + if (!page) + return -ENOMEM; + + err = __logfs_truncate_rec(inode, page, &wc, size); + logfs_put_write_page(page); + if (err) + return err; + + if (li->li_data[INDIRECT_INDEX] != wc.ofs) + li->li_data[INDIRECT_INDEX] = wc.ofs; + return 0; +} + +static int __logfs_truncate(struct inode *inode, u64 size) +{ + int ret; + + if (size >= logfs_factor(logfs_inode(inode)->li_height)) + return 0; + + ret = logfs_truncate_rec(inode, size); + if (ret) + return ret; + + return logfs_truncate_direct(inode, size); +} + +int logfs_truncate(struct inode *inode, u64 size) +{ + struct super_block *sb = inode->i_sb; + int err; + + logfs_get_wblocks(sb, NULL, 1); + err = __logfs_truncate(inode, size); + if (!err) + err = __logfs_write_inode(inode, 0); + logfs_put_wblocks(sb, NULL, 1); + + if (!err) + err = vmtruncate(inode, size); + + /* I don't trust error recovery yet. */ + WARN_ON(err); + return err; +} + +static void move_page_to_inode(struct inode *inode, struct page *page) +{ + struct logfs_inode *li = logfs_inode(inode); + struct logfs_block *block = logfs_block(page); + + if (!block) + return; + + log_blockmove("move_page_to_inode(%llx, %llx, %x)\n", + block->ino, block->bix, block->level); + BUG_ON(li->li_block); + block->ops = &inode_block_ops; + block->inode = inode; + li->li_block = block; + + block->page = NULL; + page->private = 0; + ClearPagePrivate(page); +} + +static void move_inode_to_page(struct page *page, struct inode *inode) +{ + struct logfs_inode *li = logfs_inode(inode); + struct logfs_block *block = li->li_block; + + if (!block) + return; + + log_blockmove("move_inode_to_page(%llx, %llx, %x)\n", + block->ino, block->bix, block->level); + BUG_ON(PagePrivate(page)); + block->ops = &indirect_block_ops; + block->page = page; + page->private = (unsigned long)block; + SetPagePrivate(page); + + block->inode = NULL; + li->li_block = NULL; +} + +int logfs_read_inode(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct logfs_super *super = logfs_super(sb); + struct inode *master_inode = super->s_master_inode; + struct page *page; + struct logfs_disk_inode *di; + u64 ino = inode->i_ino; + + if (ino << sb->s_blocksize_bits > i_size_read(master_inode)) + return -ENODATA; + if (!logfs_exist_block(master_inode, ino)) + return -ENODATA; + + page = read_cache_page(master_inode->i_mapping, ino, + (filler_t *)logfs_readpage, NULL); + if (IS_ERR(page)) + return PTR_ERR(page); + + di = kmap_atomic(page, KM_USER0); + logfs_disk_to_inode(di, inode); + kunmap_atomic(di, KM_USER0); + move_page_to_inode(inode, page); + page_cache_release(page); + return 0; +} + +/* Caller must logfs_put_write_page(page); */ +static struct page *inode_to_page(struct inode *inode) +{ + struct inode *master_inode = logfs_super(inode->i_sb)->s_master_inode; + struct logfs_disk_inode *di; + struct page *page; + + BUG_ON(inode->i_ino == LOGFS_INO_MASTER); + + page = 
logfs_get_write_page(master_inode, inode->i_ino, 0); + if (!page) + return NULL; + + di = kmap_atomic(page, KM_USER0); + logfs_inode_to_disk(inode, di); + kunmap_atomic(di, KM_USER0); + move_inode_to_page(page, inode); + return page; +} + +/* Cheaper version of write_inode. All changes are concealed in + * aliases, which are moved back. No write to the medium happens. + */ +void logfs_clear_inode(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct logfs_inode *li = logfs_inode(inode); + struct logfs_block *block = li->li_block; + struct page *page; + + /* Only deleted files may be dirty at this point */ + BUG_ON(inode->i_state & I_DIRTY && inode->i_nlink); + if (!block) + return; + if ((logfs_super(sb)->s_flags & LOGFS_SB_FLAG_SHUTDOWN)) { + block->ops->free_block(inode->i_sb, block); + return; + } + + BUG_ON(inode->i_ino < LOGFS_RESERVED_INOS); + page = inode_to_page(inode); + BUG_ON(!page); /* FIXME: Use emergency page */ + logfs_put_write_page(page); +} + +static int do_write_inode(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct inode *master_inode = logfs_super(sb)->s_master_inode; + loff_t size = (inode->i_ino + 1) << inode->i_sb->s_blocksize_bits; + struct page *page; + int err; + + BUG_ON(inode->i_ino == LOGFS_INO_MASTER); + /* FIXME: lock inode */ + + if (i_size_read(master_inode) < size) + i_size_write(master_inode, size); + + /* TODO: Tell vfs this inode is clean now */ + + page = inode_to_page(inode); + if (!page) + return -ENOMEM; + + /* FIXME: transaction is part of logfs_block now. Is that enough? */ + err = logfs_write_buf(master_inode, page, 0); + logfs_put_write_page(page); + return err; +} + +static void logfs_mod_segment_entry(struct super_block *sb, u32 segno, + int write, + void (*change_se)(struct logfs_segment_entry *, long), + long arg) +{ + struct logfs_super *super = logfs_super(sb); + struct inode *inode; + struct page *page; + struct logfs_segment_entry *se; + pgoff_t page_no; + int child_no; + + page_no = segno >> (sb->s_blocksize_bits - 3); + child_no = segno & ((sb->s_blocksize >> 3) - 1); + + inode = super->s_segfile_inode; + page = logfs_get_write_page(inode, page_no, 0); + BUG_ON(!page); /* FIXME: We need some reserve page for this case */ + if (!PageUptodate(page)) + logfs_read_block(inode, page, WRITE); + + if (write) + alloc_indirect_block(inode, page, 0); + se = kmap_atomic(page, KM_USER0); + change_se(se + child_no, arg); + if (write) { + logfs_set_alias(sb, logfs_block(page), child_no); + BUG_ON((int)be32_to_cpu(se[child_no].valid) > super->s_segsize); + } + kunmap_atomic(se, KM_USER0); + + logfs_put_write_page(page); +} + +static void __get_segment_entry(struct logfs_segment_entry *se, long _target) +{ + struct logfs_segment_entry *target = (void *)_target; + + *target = *se; +} + +void logfs_get_segment_entry(struct super_block *sb, u32 segno, + struct logfs_segment_entry *se) +{ + logfs_mod_segment_entry(sb, segno, 0, __get_segment_entry, (long)se); +} + +static void __set_segment_used(struct logfs_segment_entry *se, long increment) +{ + u32 valid; + + valid = be32_to_cpu(se->valid); + valid += increment; + se->valid = cpu_to_be32(valid); +} + +void logfs_set_segment_used(struct super_block *sb, u64 ofs, int increment) +{ + struct logfs_super *super = logfs_super(sb); + u32 segno = ofs >> super->s_segshift; + + if (!increment) + return; + + logfs_mod_segment_entry(sb, segno, 1, __set_segment_used, increment); +} + +static void __set_segment_erased(struct logfs_segment_entry *se, long ec_level) +{ + 
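
/*
 * Model of the segment-file indexing in logfs_mod_segment_entry() above:
 * each on-disk segment entry is 8 bytes (valid + ec_level), so one block of
 * the segment file holds blocksize/8 entries.  A 4KiB blocksize is assumed
 * here for the worked example.
 */
#include <stdint.h>
#include <stdio.h>

#define MODEL_BLOCKSIZE_BITS 12
#define MODEL_BLOCKSIZE (1u << MODEL_BLOCKSIZE_BITS)

static void locate_segment_entry(uint32_t segno, uint64_t *page_no,
				 unsigned *child_no)
{
	*page_no  = segno >> (MODEL_BLOCKSIZE_BITS - 3);
	*child_no = segno & ((MODEL_BLOCKSIZE >> 3) - 1);
}

int main(void)
{
	uint64_t page_no;
	unsigned child_no;

	locate_segment_entry(1300, &page_no, &child_no);
	printf("segment 1300 -> block %llu, slot %u\n",
	       (unsigned long long)page_no, child_no);
	return 0;
}
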
se->ec_level = cpu_to_be32(ec_level); +} + +void logfs_set_segment_erased(struct super_block *sb, u32 segno, u32 ec, + gc_level_t gc_level) +{ + u32 ec_level = ec << 4 | (__force u8)gc_level; + + logfs_mod_segment_entry(sb, segno, 1, __set_segment_erased, ec_level); +} + +static void __set_segment_reserved(struct logfs_segment_entry *se, long ignore) +{ + se->valid = cpu_to_be32(RESERVED); +} + +void logfs_set_segment_reserved(struct super_block *sb, u32 segno) +{ + logfs_mod_segment_entry(sb, segno, 1, __set_segment_reserved, 0); +} + +static void __set_segment_unreserved(struct logfs_segment_entry *se, + long ec_level) +{ + se->valid = 0; + se->ec_level = cpu_to_be32(ec_level); +} + +void logfs_set_segment_unreserved(struct super_block *sb, u32 segno, u32 ec) +{ + u32 ec_level = ec << 4; + + logfs_mod_segment_entry(sb, segno, 1, __set_segment_unreserved, + ec_level); +} + +int __logfs_write_inode(struct inode *inode, long flags) +{ + struct super_block *sb = inode->i_sb; + int ret; + + logfs_get_wblocks(sb, NULL, flags & WF_LOCK); + ret = do_write_inode(inode); + logfs_put_wblocks(sb, NULL, flags & WF_LOCK); + return ret; +} + +static int do_delete_inode(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct inode *master_inode = logfs_super(sb)->s_master_inode; + struct page *page; + int ret; + + page = logfs_get_write_page(master_inode, inode->i_ino, 0); + if (!page) + return -ENOMEM; + + move_inode_to_page(page, inode); + + logfs_get_wblocks(sb, page, 1); + ret = __logfs_delete(master_inode, page); + logfs_put_wblocks(sb, page, 1); + + logfs_put_write_page(page); + return ret; +} + +/* + * ZOMBIE inodes have already been deleted before and should remain dead, + * if it weren't for valid checking. No need to kill them again here. + */ +void logfs_delete_inode(struct inode *inode) +{ + struct logfs_inode *li = logfs_inode(inode); + + if (!(li->li_flags & LOGFS_IF_ZOMBIE)) { + li->li_flags |= LOGFS_IF_ZOMBIE; + if (i_size_read(inode) > 0) + logfs_truncate(inode, 0); + do_delete_inode(inode); + } + truncate_inode_pages(&inode->i_data, 0); + clear_inode(inode); +} + +void btree_write_block(struct logfs_block *block) +{ + struct inode *inode; + struct page *page; + int err, cookie; + + inode = logfs_safe_iget(block->sb, block->ino, &cookie); + page = logfs_get_write_page(inode, block->bix, block->level); + + err = logfs_readpage_nolock(page); + BUG_ON(err); + BUG_ON(!PagePrivate(page)); + BUG_ON(logfs_block(page) != block); + err = __logfs_write_buf(inode, page, 0); + BUG_ON(err); + BUG_ON(PagePrivate(page) || page->private); + + logfs_put_write_page(page); + logfs_safe_iput(inode, cookie); +} + +/** + * logfs_inode_write - write inode or dentry objects + * + * @inode: parent inode (ifile or directory) + * @buf: object to write (inode or dentry) + * @n: object size + * @_pos: object number (file position in blocks/objects) + * @flags: write flags + * @lock: 0 if write lock is already taken, 1 otherwise + * @shadow_tree: shadow below this inode + * + * FIXME: All caller of this put a 200-300 byte variable on the stack, + * only to call here and do a memcpy from that stack variable. A good + * example of wasted performance and stack space. 
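
/*
 * Model of the ec_level encoding used by logfs_set_segment_erased() above:
 * the erase count and the GC level share one 32-bit field, with the level
 * in the low four bits.
 */
#include <stdint.h>
#include <stdio.h>

static uint32_t pack_ec_level(uint32_t ec, uint8_t gc_level)
{
	return ec << 4 | gc_level;
}

static void unpack_ec_level(uint32_t ec_level, uint32_t *ec, uint8_t *gc_level)
{
	*ec = ec_level >> 4;
	*gc_level = ec_level & 0xf;
}

int main(void)
{
	uint32_t ec;
	uint8_t level;

	unpack_ec_level(pack_ec_level(1000, 3), &ec, &level);
	printf("ec=%u level=%u\n", ec, level);   /* 1000 and 3 */
	return 0;
}
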
+ */ +int logfs_inode_write(struct inode *inode, const void *buf, size_t count, + loff_t bix, long flags, struct shadow_tree *shadow_tree) +{ + loff_t pos = bix << inode->i_sb->s_blocksize_bits; + int err; + struct page *page; + void *pagebuf; + + BUG_ON(pos & (LOGFS_BLOCKSIZE-1)); + BUG_ON(count > LOGFS_BLOCKSIZE); + page = logfs_get_write_page(inode, bix, 0); + if (!page) + return -ENOMEM; + + pagebuf = kmap_atomic(page, KM_USER0); + memcpy(pagebuf, buf, count); + flush_dcache_page(page); + kunmap_atomic(pagebuf, KM_USER0); + + if (i_size_read(inode) < pos + LOGFS_BLOCKSIZE) + i_size_write(inode, pos + LOGFS_BLOCKSIZE); + + err = logfs_write_buf(inode, page, flags); + logfs_put_write_page(page); + return err; +} + +int logfs_open_segfile(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct inode *inode; + + inode = logfs_read_meta_inode(sb, LOGFS_INO_SEGFILE); + if (IS_ERR(inode)) + return PTR_ERR(inode); + super->s_segfile_inode = inode; + return 0; +} + +int logfs_init_rw(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + int min_fill = 3 * super->s_no_blocks; + + INIT_LIST_HEAD(&super->s_object_alias); + mutex_init(&super->s_write_mutex); + super->s_block_pool = mempool_create_kmalloc_pool(min_fill, + sizeof(struct logfs_block)); + super->s_shadow_pool = mempool_create_kmalloc_pool(min_fill, + sizeof(struct logfs_shadow)); + return 0; +} + +void logfs_cleanup_rw(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + + destroy_meta_inode(super->s_segfile_inode); + if (super->s_block_pool) + mempool_destroy(super->s_block_pool); + if (super->s_shadow_pool) + mempool_destroy(super->s_shadow_pool); +} diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c new file mode 100644 index 00000000000..5f58b74516c --- /dev/null +++ b/fs/logfs/segment.c @@ -0,0 +1,924 @@ +/* + * fs/logfs/segment.c - Handling the Object Store + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel + * + * Object store or ostore makes up the complete device with exception of + * the superblock and journal areas. Apart from its own metadata it stores + * three kinds of objects: inodes, dentries and blocks, both data and indirect. 
+ */ +#include "logfs.h" + +static int logfs_mark_segment_bad(struct super_block *sb, u32 segno) +{ + struct logfs_super *super = logfs_super(sb); + struct btree_head32 *head = &super->s_reserved_segments; + int err; + + err = btree_insert32(head, segno, (void *)1, GFP_NOFS); + if (err) + return err; + logfs_super(sb)->s_bad_segments++; + /* FIXME: write to journal */ + return 0; +} + +int logfs_erase_segment(struct super_block *sb, u32 segno) +{ + struct logfs_super *super = logfs_super(sb); + + super->s_gec++; + + return super->s_devops->erase(sb, (u64)segno << super->s_segshift, + super->s_segsize); +} + +static s64 logfs_get_free_bytes(struct logfs_area *area, size_t bytes) +{ + s32 ofs; + + logfs_open_area(area, bytes); + + ofs = area->a_used_bytes; + area->a_used_bytes += bytes; + BUG_ON(area->a_used_bytes >= logfs_super(area->a_sb)->s_segsize); + + return dev_ofs(area->a_sb, area->a_segno, ofs); +} + +static struct page *get_mapping_page(struct super_block *sb, pgoff_t index, + int use_filler) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + filler_t *filler = super->s_devops->readpage; + struct page *page; + + BUG_ON(mapping_gfp_mask(mapping) & __GFP_FS); + if (use_filler) + page = read_cache_page(mapping, index, filler, sb); + else { + page = find_or_create_page(mapping, index, GFP_NOFS); + unlock_page(page); + } + return page; +} + +void __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len, + int use_filler) +{ + pgoff_t index = ofs >> PAGE_SHIFT; + struct page *page; + long offset = ofs & (PAGE_SIZE-1); + long copylen; + + /* Only logfs_wbuf_recover may use len==0 */ + BUG_ON(!len && !use_filler); + do { + copylen = min((ulong)len, PAGE_SIZE - offset); + + page = get_mapping_page(area->a_sb, index, use_filler); + SetPageUptodate(page); + BUG_ON(!page); /* FIXME: reserve a pool */ + memcpy(page_address(page) + offset, buf, copylen); + SetPagePrivate(page); + page_cache_release(page); + + buf += copylen; + len -= copylen; + offset = 0; + index++; + } while (len); +} + +/* + * bdev_writeseg will write full pages. Memset the tail to prevent data leaks. + */ +static void pad_wbuf(struct logfs_area *area, int final) +{ + struct super_block *sb = area->a_sb; + struct logfs_super *super = logfs_super(sb); + struct page *page; + u64 ofs = dev_ofs(sb, area->a_segno, area->a_used_bytes); + pgoff_t index = ofs >> PAGE_SHIFT; + long offset = ofs & (PAGE_SIZE-1); + u32 len = PAGE_SIZE - offset; + + if (len == PAGE_SIZE) { + /* The math in this function can surely use some love */ + len = 0; + } + if (len) { + BUG_ON(area->a_used_bytes >= super->s_segsize); + + page = get_mapping_page(area->a_sb, index, 0); + BUG_ON(!page); /* FIXME: reserve a pool */ + memset(page_address(page) + offset, 0xff, len); + SetPagePrivate(page); + page_cache_release(page); + } + + if (!final) + return; + + area->a_used_bytes += len; + for ( ; area->a_used_bytes < super->s_segsize; + area->a_used_bytes += PAGE_SIZE) { + /* Memset another page */ + index++; + page = get_mapping_page(area->a_sb, index, 0); + BUG_ON(!page); /* FIXME: reserve a pool */ + memset(page_address(page), 0xff, PAGE_SIZE); + SetPagePrivate(page); + page_cache_release(page); + } +} + +/* + * We have to be careful with the alias tree. Since lookup is done by bix, + * it needs to be normalized, so 14, 15, 16, etc. all match when dealing with + * indirect blocks. So always use it through accessor functions. 
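
/*
 * Model of the copy loop in __logfs_buf_write() above (wbuf_read(), later
 * in this file, uses the same pattern for reads): a transfer at an
 * arbitrary device offset is split into chunks that never cross a page
 * boundary.  The 4KiB page size is assumed for the example.
 */
#include <stdint.h>
#include <stdio.h>

#define MODEL_PAGE_SIZE 4096u

static void split_write(uint64_t ofs, unsigned long len)
{
	uint64_t index = ofs / MODEL_PAGE_SIZE;
	unsigned long offset = ofs & (MODEL_PAGE_SIZE - 1);
	unsigned long copylen;

	while (len) {
		copylen = len < MODEL_PAGE_SIZE - offset ?
			  len : MODEL_PAGE_SIZE - offset;
		printf("page %llu: copy %lu bytes at offset %lu\n",
		       (unsigned long long)index, copylen, offset);
		len -= copylen;
		offset = 0;
		index++;
	}
}

int main(void)
{
	split_write(4000, 5000);  /* straddles three pages */
	return 0;
}
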
+ */ +static void *alias_tree_lookup(struct super_block *sb, u64 ino, u64 bix, + level_t level) +{ + struct btree_head128 *head = &logfs_super(sb)->s_object_alias_tree; + pgoff_t index = logfs_pack_index(bix, level); + + return btree_lookup128(head, ino, index); +} + +static int alias_tree_insert(struct super_block *sb, u64 ino, u64 bix, + level_t level, void *val) +{ + struct btree_head128 *head = &logfs_super(sb)->s_object_alias_tree; + pgoff_t index = logfs_pack_index(bix, level); + + return btree_insert128(head, ino, index, val, GFP_NOFS); +} + +static int btree_write_alias(struct super_block *sb, struct logfs_block *block, + write_alias_t *write_one_alias) +{ + struct object_alias_item *item; + int err; + + list_for_each_entry(item, &block->item_list, list) { + err = write_alias_journal(sb, block->ino, block->bix, + block->level, item->child_no, item->val); + if (err) + return err; + } + return 0; +} + +static gc_level_t btree_block_level(struct logfs_block *block) +{ + return expand_level(block->ino, block->level); +} + +static struct logfs_block_ops btree_block_ops = { + .write_block = btree_write_block, + .block_level = btree_block_level, + .free_block = __free_block, + .write_alias = btree_write_alias, +}; + +int logfs_load_object_aliases(struct super_block *sb, + struct logfs_obj_alias *oa, int count) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_block *block; + struct object_alias_item *item; + u64 ino, bix; + level_t level; + int i, err; + + super->s_flags |= LOGFS_SB_FLAG_OBJ_ALIAS; + count /= sizeof(*oa); + for (i = 0; i < count; i++) { + item = mempool_alloc(super->s_alias_pool, GFP_NOFS); + if (!item) + return -ENOMEM; + memset(item, 0, sizeof(*item)); + + super->s_no_object_aliases++; + item->val = oa[i].val; + item->child_no = be16_to_cpu(oa[i].child_no); + + ino = be64_to_cpu(oa[i].ino); + bix = be64_to_cpu(oa[i].bix); + level = LEVEL(oa[i].level); + + log_aliases("logfs_load_object_aliases(%llx, %llx, %x, %x) %llx\n", + ino, bix, level, item->child_no, + be64_to_cpu(item->val)); + block = alias_tree_lookup(sb, ino, bix, level); + if (!block) { + block = __alloc_block(sb, ino, bix, level); + block->ops = &btree_block_ops; + err = alias_tree_insert(sb, ino, bix, level, block); + BUG_ON(err); /* mempool empty */ + } + if (test_and_set_bit(item->child_no, block->alias_map)) { + printk(KERN_ERR"LogFS: Alias collision detected\n"); + return -EIO; + } + list_move_tail(&block->alias_list, &super->s_object_alias); + list_add(&item->list, &block->item_list); + } + return 0; +} + +static void kill_alias(void *_block, unsigned long ignore0, + u64 ignore1, u64 ignore2, size_t ignore3) +{ + struct logfs_block *block = _block; + struct super_block *sb = block->sb; + struct logfs_super *super = logfs_super(sb); + struct object_alias_item *item; + + while (!list_empty(&block->item_list)) { + item = list_entry(block->item_list.next, typeof(*item), list); + list_del(&item->list); + mempool_free(item, super->s_alias_pool); + } + block->ops->free_block(sb, block); +} + +static int obj_type(struct inode *inode, level_t level) +{ + if (level == 0) { + if (S_ISDIR(inode->i_mode)) + return OBJ_DENTRY; + if (inode->i_ino == LOGFS_INO_MASTER) + return OBJ_INODE; + } + return OBJ_BLOCK; +} + +static int obj_len(struct super_block *sb, int obj_type) +{ + switch (obj_type) { + case OBJ_DENTRY: + return sizeof(struct logfs_disk_dentry); + case OBJ_INODE: + return sizeof(struct logfs_disk_inode); + case OBJ_BLOCK: + return sb->s_blocksize; + default: + BUG(); + } +} + +static int 
__logfs_segment_write(struct inode *inode, void *buf, + struct logfs_shadow *shadow, int type, int len, int compr) +{ + struct logfs_area *area; + struct super_block *sb = inode->i_sb; + s64 ofs; + struct logfs_object_header h; + int acc_len; + + if (shadow->gc_level == 0) + acc_len = len; + else + acc_len = obj_len(sb, type); + + area = get_area(sb, shadow->gc_level); + ofs = logfs_get_free_bytes(area, len + LOGFS_OBJECT_HEADERSIZE); + LOGFS_BUG_ON(ofs <= 0, sb); + /* + * Order is important. logfs_get_free_bytes(), by modifying the + * segment file, may modify the content of the very page we're about + * to write now. Which is fine, as long as the calculated crc and + * written data still match. So do the modifications _before_ + * calculating the crc. + */ + + h.len = cpu_to_be16(len); + h.type = type; + h.compr = compr; + h.ino = cpu_to_be64(inode->i_ino); + h.bix = cpu_to_be64(shadow->bix); + h.crc = logfs_crc32(&h, sizeof(h) - 4, 4); + h.data_crc = logfs_crc32(buf, len, 0); + + logfs_buf_write(area, ofs, &h, sizeof(h)); + logfs_buf_write(area, ofs + LOGFS_OBJECT_HEADERSIZE, buf, len); + + shadow->new_ofs = ofs; + shadow->new_len = acc_len + LOGFS_OBJECT_HEADERSIZE; + + return 0; +} + +static s64 logfs_segment_write_compress(struct inode *inode, void *buf, + struct logfs_shadow *shadow, int type, int len) +{ + struct super_block *sb = inode->i_sb; + void *compressor_buf = logfs_super(sb)->s_compressed_je; + ssize_t compr_len; + int ret; + + mutex_lock(&logfs_super(sb)->s_journal_mutex); + compr_len = logfs_compress(buf, compressor_buf, len, len); + + if (compr_len >= 0) { + ret = __logfs_segment_write(inode, compressor_buf, shadow, + type, compr_len, COMPR_ZLIB); + } else { + ret = __logfs_segment_write(inode, buf, shadow, type, len, + COMPR_NONE); + } + mutex_unlock(&logfs_super(sb)->s_journal_mutex); + return ret; +} + +/** + * logfs_segment_write - write data block to object store + * @inode: inode containing data + * + * Returns an errno or zero. + */ +int logfs_segment_write(struct inode *inode, struct page *page, + struct logfs_shadow *shadow) +{ + struct super_block *sb = inode->i_sb; + struct logfs_super *super = logfs_super(sb); + int do_compress, type, len; + int ret; + void *buf; + + BUG_ON(logfs_super(sb)->s_flags & LOGFS_SB_FLAG_SHUTDOWN); + do_compress = logfs_inode(inode)->li_flags & LOGFS_IF_COMPRESSED; + if (shadow->gc_level != 0) { + /* temporarily disable compression for indirect blocks */ + do_compress = 0; + } + + type = obj_type(inode, shrink_level(shadow->gc_level)); + len = obj_len(sb, type); + buf = kmap(page); + if (do_compress) + ret = logfs_segment_write_compress(inode, buf, shadow, type, + len); + else + ret = __logfs_segment_write(inode, buf, shadow, type, len, + COMPR_NONE); + kunmap(page); + + log_segment("logfs_segment_write(%llx, %llx, %x) %llx->%llx %x->%x\n", + shadow->ino, shadow->bix, shadow->gc_level, + shadow->old_ofs, shadow->new_ofs, + shadow->old_len, shadow->new_len); + /* this BUG_ON did catch a locking bug. 
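
/*
 * Model of the policy in logfs_segment_write_compress() above: a block is
 * stored compressed only when the compressor actually managed to shrink it,
 * otherwise the raw payload is written with COMPR_NONE.  model_compress()
 * is a stand-in for logfs_compress(), which returns a negative value when
 * the data does not compress into the given length.
 */
#include <stdio.h>

enum { MODEL_COMPR_NONE, MODEL_COMPR_ZLIB };

static long model_compress(long in_len, long out_len_if_compressible)
{
	return out_len_if_compressible < in_len ? out_len_if_compressible : -1;
}

int main(void)
{
	long compr_len = model_compress(4096, 1200);
	int type = compr_len >= 0 ? MODEL_COMPR_ZLIB : MODEL_COMPR_NONE;

	printf("stored as %s, %ld bytes\n",
	       type == MODEL_COMPR_ZLIB ? "zlib" : "raw",
	       type == MODEL_COMPR_ZLIB ? compr_len : 4096L);
	return 0;
}
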
useful */ + BUG_ON(!(shadow->new_ofs & (super->s_segsize - 1))); + return ret; +} + +int wbuf_read(struct super_block *sb, u64 ofs, size_t len, void *buf) +{ + pgoff_t index = ofs >> PAGE_SHIFT; + struct page *page; + long offset = ofs & (PAGE_SIZE-1); + long copylen; + + while (len) { + copylen = min((ulong)len, PAGE_SIZE - offset); + + page = get_mapping_page(sb, index, 1); + if (IS_ERR(page)) + return PTR_ERR(page); + memcpy(buf, page_address(page) + offset, copylen); + page_cache_release(page); + + buf += copylen; + len -= copylen; + offset = 0; + index++; + } + return 0; +} + +/* + * The "position" of indirect blocks is ambiguous. It can be the position + * of any data block somewhere behind this indirect block. So we need to + * normalize the positions through logfs_block_mask() before comparing. + */ +static int check_pos(struct super_block *sb, u64 pos1, u64 pos2, level_t level) +{ + return (pos1 & logfs_block_mask(sb, level)) != + (pos2 & logfs_block_mask(sb, level)); +} + +#if 0 +static int read_seg_header(struct super_block *sb, u64 ofs, + struct logfs_segment_header *sh) +{ + __be32 crc; + int err; + + err = wbuf_read(sb, ofs, sizeof(*sh), sh); + if (err) + return err; + crc = logfs_crc32(sh, sizeof(*sh), 4); + if (crc != sh->crc) { + printk(KERN_ERR"LOGFS: header crc error at %llx: expected %x, " + "got %x\n", ofs, be32_to_cpu(sh->crc), + be32_to_cpu(crc)); + return -EIO; + } + return 0; +} +#endif + +static int read_obj_header(struct super_block *sb, u64 ofs, + struct logfs_object_header *oh) +{ + __be32 crc; + int err; + + err = wbuf_read(sb, ofs, sizeof(*oh), oh); + if (err) + return err; + crc = logfs_crc32(oh, sizeof(*oh) - 4, 4); + if (crc != oh->crc) { + printk(KERN_ERR"LOGFS: header crc error at %llx: expected %x, " + "got %x\n", ofs, be32_to_cpu(oh->crc), + be32_to_cpu(crc)); + return -EIO; + } + return 0; +} + +static void move_btree_to_page(struct inode *inode, struct page *page, + __be64 *data) +{ + struct super_block *sb = inode->i_sb; + struct logfs_super *super = logfs_super(sb); + struct btree_head128 *head = &super->s_object_alias_tree; + struct logfs_block *block; + struct object_alias_item *item, *next; + + if (!(super->s_flags & LOGFS_SB_FLAG_OBJ_ALIAS)) + return; + + block = btree_remove128(head, inode->i_ino, page->index); + if (!block) + return; + + log_blockmove("move_btree_to_page(%llx, %llx, %x)\n", + block->ino, block->bix, block->level); + list_for_each_entry_safe(item, next, &block->item_list, list) { + data[item->child_no] = item->val; + list_del(&item->list); + mempool_free(item, super->s_alias_pool); + } + block->page = page; + SetPagePrivate(page); + page->private = (unsigned long)block; + block->ops = &indirect_block_ops; + initialize_block_counters(page, block, data, 0); +} + +/* + * This silences a false, yet annoying gcc warning. I hate it when my editor + * jumps into bitops.h each time I recompile this file. + * TODO: Complain to gcc folks about this and upgrade compiler. 
+ */ +static unsigned long fnb(const unsigned long *addr, + unsigned long size, unsigned long offset) +{ + return find_next_bit(addr, size, offset); +} + +void move_page_to_btree(struct page *page) +{ + struct logfs_block *block = logfs_block(page); + struct super_block *sb = block->sb; + struct logfs_super *super = logfs_super(sb); + struct object_alias_item *item; + unsigned long pos; + __be64 *child; + int err; + + if (super->s_flags & LOGFS_SB_FLAG_SHUTDOWN) { + block->ops->free_block(sb, block); + return; + } + log_blockmove("move_page_to_btree(%llx, %llx, %x)\n", + block->ino, block->bix, block->level); + super->s_flags |= LOGFS_SB_FLAG_OBJ_ALIAS; + + for (pos = 0; ; pos++) { + pos = fnb(block->alias_map, LOGFS_BLOCK_FACTOR, pos); + if (pos >= LOGFS_BLOCK_FACTOR) + break; + + item = mempool_alloc(super->s_alias_pool, GFP_NOFS); + BUG_ON(!item); /* mempool empty */ + memset(item, 0, sizeof(*item)); + + child = kmap_atomic(page, KM_USER0); + item->val = child[pos]; + kunmap_atomic(child, KM_USER0); + item->child_no = pos; + list_add(&item->list, &block->item_list); + } + block->page = NULL; + ClearPagePrivate(page); + page->private = 0; + block->ops = &btree_block_ops; + err = alias_tree_insert(block->sb, block->ino, block->bix, block->level, + block); + BUG_ON(err); /* mempool empty */ + ClearPageUptodate(page); +} + +static int __logfs_segment_read(struct inode *inode, void *buf, + u64 ofs, u64 bix, level_t level) +{ + struct super_block *sb = inode->i_sb; + void *compressor_buf = logfs_super(sb)->s_compressed_je; + struct logfs_object_header oh; + __be32 crc; + u16 len; + int err, block_len; + + block_len = obj_len(sb, obj_type(inode, level)); + err = read_obj_header(sb, ofs, &oh); + if (err) + goto out_err; + + err = -EIO; + if (be64_to_cpu(oh.ino) != inode->i_ino + || check_pos(sb, be64_to_cpu(oh.bix), bix, level)) { + printk(KERN_ERR"LOGFS: (ino, bix) don't match at %llx: " + "expected (%lx, %llx), got (%llx, %llx)\n", + ofs, inode->i_ino, bix, + be64_to_cpu(oh.ino), be64_to_cpu(oh.bix)); + goto out_err; + } + + len = be16_to_cpu(oh.len); + + switch (oh.compr) { + case COMPR_NONE: + err = wbuf_read(sb, ofs + LOGFS_OBJECT_HEADERSIZE, len, buf); + if (err) + goto out_err; + crc = logfs_crc32(buf, len, 0); + if (crc != oh.data_crc) { + printk(KERN_ERR"LOGFS: uncompressed data crc error at " + "%llx: expected %x, got %x\n", ofs, + be32_to_cpu(oh.data_crc), + be32_to_cpu(crc)); + goto out_err; + } + break; + case COMPR_ZLIB: + mutex_lock(&logfs_super(sb)->s_journal_mutex); + err = wbuf_read(sb, ofs + LOGFS_OBJECT_HEADERSIZE, len, + compressor_buf); + if (err) { + mutex_unlock(&logfs_super(sb)->s_journal_mutex); + goto out_err; + } + crc = logfs_crc32(compressor_buf, len, 0); + if (crc != oh.data_crc) { + printk(KERN_ERR"LOGFS: compressed data crc error at " + "%llx: expected %x, got %x\n", ofs, + be32_to_cpu(oh.data_crc), + be32_to_cpu(crc)); + mutex_unlock(&logfs_super(sb)->s_journal_mutex); + goto out_err; + } + err = logfs_uncompress(compressor_buf, buf, len, block_len); + mutex_unlock(&logfs_super(sb)->s_journal_mutex); + if (err) { + printk(KERN_ERR"LOGFS: uncompress error at %llx\n", ofs); + goto out_err; + } + break; + default: + LOGFS_BUG(sb); + err = -EIO; + goto out_err; + } + return 0; + +out_err: + logfs_set_ro(sb); + printk(KERN_ERR"LOGFS: device is read-only now\n"); + LOGFS_BUG(sb); + return err; +} + +/** + * logfs_segment_read - read data block from object store + * @inode: inode containing data + * @buf: data buffer + * @ofs: physical data offset + * @bix: block 
index + * @level: block level + * + * Returns 0 on success or a negative errno. + */ +int logfs_segment_read(struct inode *inode, struct page *page, + u64 ofs, u64 bix, level_t level) +{ + int err; + void *buf; + + if (PageUptodate(page)) + return 0; + + ofs &= ~LOGFS_FULLY_POPULATED; + + buf = kmap(page); + err = __logfs_segment_read(inode, buf, ofs, bix, level); + if (!err) { + move_btree_to_page(inode, page, buf); + SetPageUptodate(page); + } + kunmap(page); + log_segment("logfs_segment_read(%lx, %llx, %x) %llx (%d)\n", + inode->i_ino, bix, level, ofs, err); + return err; +} + +int logfs_segment_delete(struct inode *inode, struct logfs_shadow *shadow) +{ + struct super_block *sb = inode->i_sb; + struct logfs_object_header h; + u16 len; + int err; + + BUG_ON(logfs_super(sb)->s_flags & LOGFS_SB_FLAG_SHUTDOWN); + BUG_ON(shadow->old_ofs & LOGFS_FULLY_POPULATED); + if (!shadow->old_ofs) + return 0; + + log_segment("logfs_segment_delete(%llx, %llx, %x) %llx->%llx %x->%x\n", + shadow->ino, shadow->bix, shadow->gc_level, + shadow->old_ofs, shadow->new_ofs, + shadow->old_len, shadow->new_len); + err = read_obj_header(sb, shadow->old_ofs, &h); + LOGFS_BUG_ON(err, sb); + LOGFS_BUG_ON(be64_to_cpu(h.ino) != inode->i_ino, sb); + LOGFS_BUG_ON(check_pos(sb, shadow->bix, be64_to_cpu(h.bix), + shrink_level(shadow->gc_level)), sb); + + if (shadow->gc_level == 0) + len = be16_to_cpu(h.len); + else + len = obj_len(sb, h.type); + shadow->old_len = len + sizeof(h); + return 0; +} + +static void freeseg(struct super_block *sb, u32 segno) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + struct page *page; + u64 ofs, start, end; + + start = dev_ofs(sb, segno, 0); + end = dev_ofs(sb, segno + 1, 0); + for (ofs = start; ofs < end; ofs += PAGE_SIZE) { + page = find_get_page(mapping, ofs >> PAGE_SHIFT); + if (!page) + continue; + ClearPagePrivate(page); + page_cache_release(page); + } +} + +int logfs_open_area(struct logfs_area *area, size_t bytes) +{ + struct super_block *sb = area->a_sb; + struct logfs_super *super = logfs_super(sb); + int err, closed = 0; + + if (area->a_is_open && area->a_used_bytes + bytes <= super->s_segsize) + return 0; + + if (area->a_is_open) { + u64 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes); + u32 len = super->s_segsize - area->a_written_bytes; + + log_gc("logfs_close_area(%x)\n", area->a_segno); + pad_wbuf(area, 1); + super->s_devops->writeseg(area->a_sb, ofs, len); + freeseg(sb, area->a_segno); + closed = 1; + } + + area->a_used_bytes = 0; + area->a_written_bytes = 0; +again: + area->a_ops->get_free_segment(area); + area->a_ops->get_erase_count(area); + + log_gc("logfs_open_area(%x, %x)\n", area->a_segno, area->a_level); + err = area->a_ops->erase_segment(area); + if (err) { + printk(KERN_WARNING "LogFS: Error erasing segment %x\n", + area->a_segno); + logfs_mark_segment_bad(sb, area->a_segno); + goto again; + } + area->a_is_open = 1; + return closed; +} + +void logfs_sync_area(struct logfs_area *area) +{ + struct super_block *sb = area->a_sb; + struct logfs_super *super = logfs_super(sb); + u64 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes); + u32 len = (area->a_used_bytes - area->a_written_bytes); + + if (super->s_writesize) + len &= ~(super->s_writesize - 1); + if (len == 0) + return; + pad_wbuf(area, 0); + super->s_devops->writeseg(sb, ofs, len); + area->a_written_bytes += len; +} + +void logfs_sync_segments(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + int i; + 
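+	/*
+	 * Flush the buffered tail of every open object-store area out to the
+	 * device.  logfs_sync_area() only writes multiples of the device
+	 * write size (when one is set), so a final partial chunk may stay
+	 * buffered until more data arrives or the area is closed.
+	 */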
+ for_each_area(i) + logfs_sync_area(super->s_area[i]); +} + +/* + * Pick a free segment to be used for this area. Effectively takes a + * candidate from the free list (not really a candidate anymore). + */ +static void ostore_get_free_segment(struct logfs_area *area) +{ + struct super_block *sb = area->a_sb; + struct logfs_super *super = logfs_super(sb); + + if (super->s_free_list.count == 0) { + printk(KERN_ERR"LOGFS: ran out of free segments\n"); + LOGFS_BUG(sb); + } + + area->a_segno = get_best_cand(sb, &super->s_free_list, NULL); +} + +static void ostore_get_erase_count(struct logfs_area *area) +{ + struct logfs_segment_entry se; + u32 ec_level; + + logfs_get_segment_entry(area->a_sb, area->a_segno, &se); + BUG_ON(se.ec_level == cpu_to_be32(BADSEG) || + se.valid == cpu_to_be32(RESERVED)); + + ec_level = be32_to_cpu(se.ec_level); + area->a_erase_count = (ec_level >> 4) + 1; +} + +static int ostore_erase_segment(struct logfs_area *area) +{ + struct super_block *sb = area->a_sb; + struct logfs_segment_header sh; + u64 ofs; + int err; + + err = logfs_erase_segment(sb, area->a_segno); + if (err) + return err; + + sh.pad = 0; + sh.type = SEG_OSTORE; + sh.level = (__force u8)area->a_level; + sh.segno = cpu_to_be32(area->a_segno); + sh.ec = cpu_to_be32(area->a_erase_count); + sh.gec = cpu_to_be64(logfs_super(sb)->s_gec); + sh.crc = logfs_crc32(&sh, sizeof(sh), 4); + + logfs_set_segment_erased(sb, area->a_segno, area->a_erase_count, + area->a_level); + + ofs = dev_ofs(sb, area->a_segno, 0); + area->a_used_bytes = sizeof(sh); + logfs_buf_write(area, ofs, &sh, sizeof(sh)); + return 0; +} + +static const struct logfs_area_ops ostore_area_ops = { + .get_free_segment = ostore_get_free_segment, + .get_erase_count = ostore_get_erase_count, + .erase_segment = ostore_erase_segment, +}; + +static void free_area(struct logfs_area *area) +{ + if (area) + freeseg(area->a_sb, area->a_segno); + kfree(area); +} + +static struct logfs_area *alloc_area(struct super_block *sb) +{ + struct logfs_area *area; + + area = kzalloc(sizeof(*area), GFP_KERNEL); + if (!area) + return NULL; + + area->a_sb = sb; + return area; +} + +static void map_invalidatepage(struct page *page, unsigned long l) +{ + BUG(); +} + +static int map_releasepage(struct page *page, gfp_t g) +{ + /* Don't release these pages */ + return 0; +} + +static const struct address_space_operations mapping_aops = { + .invalidatepage = map_invalidatepage, + .releasepage = map_releasepage, + .set_page_dirty = __set_page_dirty_nobuffers, +}; + +int logfs_init_mapping(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping; + struct inode *inode; + + inode = logfs_new_meta_inode(sb, LOGFS_INO_MAPPING); + if (IS_ERR(inode)) + return PTR_ERR(inode); + super->s_mapping_inode = inode; + mapping = inode->i_mapping; + mapping->a_ops = &mapping_aops; + /* Would it be possible to use __GFP_HIGHMEM as well? 
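+ *
+ * (GFP_NOFS is used below so that page cache allocations for this mapping
+ * can never recurse back into the filesystem during memory reclaim.)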
*/ + mapping_set_gfp_mask(mapping, GFP_NOFS); + return 0; +} + +int logfs_init_areas(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + int i = -1; + + super->s_alias_pool = mempool_create_kmalloc_pool(600, + sizeof(struct object_alias_item)); + if (!super->s_alias_pool) + return -ENOMEM; + + super->s_journal_area = alloc_area(sb); + if (!super->s_journal_area) + goto err; + + for_each_area(i) { + super->s_area[i] = alloc_area(sb); + if (!super->s_area[i]) + goto err; + super->s_area[i]->a_level = GC_LEVEL(i); + super->s_area[i]->a_ops = &ostore_area_ops; + } + btree_init_mempool128(&super->s_object_alias_tree, + super->s_btree_pool); + return 0; + +err: + for (i--; i >= 0; i--) + free_area(super->s_area[i]); + free_area(super->s_journal_area); + mempool_destroy(super->s_alias_pool); + return -ENOMEM; +} + +void logfs_cleanup_areas(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + int i; + + btree_grim_visitor128(&super->s_object_alias_tree, 0, kill_alias); + for_each_area(i) + free_area(super->s_area[i]); + free_area(super->s_journal_area); + destroy_meta_inode(super->s_mapping_inode); +} diff --git a/fs/logfs/super.c b/fs/logfs/super.c new file mode 100644 index 00000000000..d128a2c1c8d --- /dev/null +++ b/fs/logfs/super.c @@ -0,0 +1,634 @@ +/* + * fs/logfs/super.c + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel + * + * Generally contains mount/umount code and also serves as a dump area for + * any functions that don't fit elsewhere and neither justify a file of their + * own. + */ +#include "logfs.h" +#include +#include +#include +#include + +static DEFINE_MUTEX(emergency_mutex); +static struct page *emergency_page; + +struct page *emergency_read_begin(struct address_space *mapping, pgoff_t index) +{ + filler_t *filler = (filler_t *)mapping->a_ops->readpage; + struct page *page; + int err; + + page = read_cache_page(mapping, index, filler, NULL); + if (page) + return page; + + /* No more pages available, switch to emergency page */ + printk(KERN_INFO"Logfs: Using emergency page\n"); + mutex_lock(&emergency_mutex); + err = filler(NULL, emergency_page); + if (err) { + mutex_unlock(&emergency_mutex); + printk(KERN_EMERG"Logfs: Error reading emergency page\n"); + return ERR_PTR(err); + } + return emergency_page; +} + +void emergency_read_end(struct page *page) +{ + if (page == emergency_page) + mutex_unlock(&emergency_mutex); + else + page_cache_release(page); +} + +static void dump_segfile(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_segment_entry se; + u32 segno; + + for (segno = 0; segno < super->s_no_segs; segno++) { + logfs_get_segment_entry(sb, segno, &se); + printk("%3x: %6x %8x", segno, be32_to_cpu(se.ec_level), + be32_to_cpu(se.valid)); + if (++segno < super->s_no_segs) { + logfs_get_segment_entry(sb, segno, &se); + printk(" %6x %8x", be32_to_cpu(se.ec_level), + be32_to_cpu(se.valid)); + } + if (++segno < super->s_no_segs) { + logfs_get_segment_entry(sb, segno, &se); + printk(" %6x %8x", be32_to_cpu(se.ec_level), + be32_to_cpu(se.valid)); + } + if (++segno < super->s_no_segs) { + logfs_get_segment_entry(sb, segno, &se); + printk(" %6x %8x", be32_to_cpu(se.ec_level), + be32_to_cpu(se.valid)); + } + printk("\n"); + } +} + +/* + * logfs_crash_dump - dump debug information to device + * + * The LogFS superblock only occupies part of a segment. 
This function will + * write as much debug information as it can gather into the spare space. + */ +void logfs_crash_dump(struct super_block *sb) +{ + dump_segfile(sb); +} + +/* + * TODO: move to lib/string.c + */ +/** + * memchr_inv - Find a character in an area of memory. + * @s: The memory area + * @c: The byte to search for + * @n: The size of the area. + * + * returns the address of the first character other than @c, or %NULL + * if the whole buffer contains just @c. + */ +void *memchr_inv(const void *s, int c, size_t n) +{ + const unsigned char *p = s; + while (n-- != 0) + if ((unsigned char)c != *p++) + return (void *)(p - 1); + + return NULL; +} + +/* + * FIXME: There should be a reserve for root, similar to ext2. + */ +int logfs_statfs(struct dentry *dentry, struct kstatfs *stats) +{ + struct super_block *sb = dentry->d_sb; + struct logfs_super *super = logfs_super(sb); + + stats->f_type = LOGFS_MAGIC_U32; + stats->f_bsize = sb->s_blocksize; + stats->f_blocks = super->s_size >> LOGFS_BLOCK_BITS >> 3; + stats->f_bfree = super->s_free_bytes >> sb->s_blocksize_bits; + stats->f_bavail = super->s_free_bytes >> sb->s_blocksize_bits; + stats->f_files = 0; + stats->f_ffree = 0; + stats->f_namelen = LOGFS_MAX_NAMELEN; + return 0; +} + +static int logfs_sb_set(struct super_block *sb, void *_super) +{ + struct logfs_super *super = _super; + + sb->s_fs_info = super; + sb->s_mtd = super->s_mtd; + sb->s_bdev = super->s_bdev; + return 0; +} + +static int logfs_sb_test(struct super_block *sb, void *_super) +{ + struct logfs_super *super = _super; + struct mtd_info *mtd = super->s_mtd; + + if (mtd && sb->s_mtd == mtd) + return 1; + if (super->s_bdev && sb->s_bdev == super->s_bdev) + return 1; + return 0; +} + +static void set_segment_header(struct logfs_segment_header *sh, u8 type, + u8 level, u32 segno, u32 ec) +{ + sh->pad = 0; + sh->type = type; + sh->level = level; + sh->segno = cpu_to_be32(segno); + sh->ec = cpu_to_be32(ec); + sh->gec = cpu_to_be64(segno); + sh->crc = logfs_crc32(sh, LOGFS_SEGMENT_HEADERSIZE, 4); +} + +static void logfs_write_ds(struct super_block *sb, struct logfs_disk_super *ds, + u32 segno, u32 ec) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_segment_header *sh = &ds->ds_sh; + int i; + + memset(ds, 0, sizeof(*ds)); + set_segment_header(sh, SEG_SUPER, 0, segno, ec); + + ds->ds_ifile_levels = super->s_ifile_levels; + ds->ds_iblock_levels = super->s_iblock_levels; + ds->ds_data_levels = super->s_data_levels; /* XXX: Remove */ + ds->ds_segment_shift = super->s_segshift; + ds->ds_block_shift = sb->s_blocksize_bits; + ds->ds_write_shift = super->s_writeshift; + ds->ds_filesystem_size = cpu_to_be64(super->s_size); + ds->ds_segment_size = cpu_to_be32(super->s_segsize); + ds->ds_bad_seg_reserve = cpu_to_be32(super->s_bad_seg_reserve); + ds->ds_feature_incompat = cpu_to_be64(super->s_feature_incompat); + ds->ds_feature_ro_compat= cpu_to_be64(super->s_feature_ro_compat); + ds->ds_feature_compat = cpu_to_be64(super->s_feature_compat); + ds->ds_feature_flags = cpu_to_be64(super->s_feature_flags); + ds->ds_root_reserve = cpu_to_be64(super->s_root_reserve); + ds->ds_speed_reserve = cpu_to_be64(super->s_speed_reserve); + journal_for_each(i) + ds->ds_journal_seg[i] = cpu_to_be32(super->s_journal_seg[i]); + ds->ds_magic = cpu_to_be64(LOGFS_MAGIC); + ds->ds_crc = logfs_crc32(ds, sizeof(*ds), + LOGFS_SEGMENT_HEADERSIZE + 12); +} + +static int write_one_sb(struct super_block *sb, + struct page *(*find_sb)(struct super_block *sb, u64 *ofs)) +{ + struct logfs_super *super = 
logfs_super(sb); + struct logfs_disk_super *ds; + struct logfs_segment_entry se; + struct page *page; + u64 ofs; + u32 ec, segno; + int err; + + page = find_sb(sb, &ofs); + if (!page) + return -EIO; + ds = page_address(page); + segno = seg_no(sb, ofs); + logfs_get_segment_entry(sb, segno, &se); + ec = be32_to_cpu(se.ec_level) >> 4; + ec++; + logfs_set_segment_erased(sb, segno, ec, 0); + logfs_write_ds(sb, ds, segno, ec); + err = super->s_devops->write_sb(sb, page); + page_cache_release(page); + return err; +} + +int logfs_write_sb(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + int err; + + /* First superblock */ + err = write_one_sb(sb, super->s_devops->find_first_sb); + if (err) + return err; + + /* Last superblock */ + err = write_one_sb(sb, super->s_devops->find_last_sb); + if (err) + return err; + return 0; +} + +static int ds_cmp(const void *ds0, const void *ds1) +{ + size_t len = sizeof(struct logfs_disk_super); + + /* We know the segment headers differ, so ignore them */ + len -= LOGFS_SEGMENT_HEADERSIZE; + ds0 += LOGFS_SEGMENT_HEADERSIZE; + ds1 += LOGFS_SEGMENT_HEADERSIZE; + return memcmp(ds0, ds1, len); +} + +static int logfs_recover_sb(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_disk_super _ds0, *ds0 = &_ds0; + struct logfs_disk_super _ds1, *ds1 = &_ds1; + int err, valid0, valid1; + + /* read first superblock */ + err = wbuf_read(sb, super->s_sb_ofs[0], sizeof(*ds0), ds0); + if (err) + return err; + /* read last superblock */ + err = wbuf_read(sb, super->s_sb_ofs[1], sizeof(*ds1), ds1); + if (err) + return err; + valid0 = logfs_check_ds(ds0) == 0; + valid1 = logfs_check_ds(ds1) == 0; + + if (!valid0 && valid1) { + printk(KERN_INFO"First superblock is invalid - fixing.\n"); + return write_one_sb(sb, super->s_devops->find_first_sb); + } + if (valid0 && !valid1) { + printk(KERN_INFO"Last superblock is invalid - fixing.\n"); + return write_one_sb(sb, super->s_devops->find_last_sb); + } + if (valid0 && valid1 && ds_cmp(ds0, ds1)) { + printk(KERN_INFO"Superblocks don't match - fixing.\n"); + return write_one_sb(sb, super->s_devops->find_last_sb); + } + /* If neither is valid now, something's wrong. Didn't we properly + * check them before?!? 
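+ * (find_super_block() only let the mount proceed because at least one of
+ * the two copies passed logfs_check_ds() at that point, so reaching this
+ * BUG_ON indicates a logic error, or the medium changing underneath us.)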
*/ + BUG_ON(!valid0 && !valid1); + return 0; +} + +static int logfs_make_writeable(struct super_block *sb) +{ + int err; + + /* Repair any broken superblock copies */ + err = logfs_recover_sb(sb); + if (err) + return err; + + /* Check areas for trailing unaccounted data */ + err = logfs_check_areas(sb); + if (err) + return err; + + err = logfs_open_segfile(sb); + if (err) + return err; + + /* Do one GC pass before any data gets dirtied */ + logfs_gc_pass(sb); + + /* after all initializations are done, replay the journal + * for rw-mounts, if necessary */ + err = logfs_replay_journal(sb); + if (err) + return err; + + return 0; +} + +static int logfs_get_sb_final(struct super_block *sb, struct vfsmount *mnt) +{ + struct inode *rootdir; + int err; + + /* root dir */ + rootdir = logfs_iget(sb, LOGFS_INO_ROOT); + if (IS_ERR(rootdir)) + goto fail; + + sb->s_root = d_alloc_root(rootdir); + if (!sb->s_root) + goto fail; + + /* FIXME: check for read-only mounts */ + err = logfs_make_writeable(sb); + if (err) + goto fail2; + + log_super("LogFS: Finished mounting\n"); + simple_set_mnt(mnt, sb); + return 0; + +fail2: + iput(rootdir); +fail: + iput(logfs_super(sb)->s_master_inode); + return -EIO; +} + +int logfs_check_ds(struct logfs_disk_super *ds) +{ + struct logfs_segment_header *sh = &ds->ds_sh; + + if (ds->ds_magic != cpu_to_be64(LOGFS_MAGIC)) + return -EINVAL; + if (sh->crc != logfs_crc32(sh, LOGFS_SEGMENT_HEADERSIZE, 4)) + return -EINVAL; + if (ds->ds_crc != logfs_crc32(ds, sizeof(*ds), + LOGFS_SEGMENT_HEADERSIZE + 12)) + return -EINVAL; + return 0; +} + +static struct page *find_super_block(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct page *first, *last; + + first = super->s_devops->find_first_sb(sb, &super->s_sb_ofs[0]); + if (!first || IS_ERR(first)) + return NULL; + last = super->s_devops->find_last_sb(sb, &super->s_sb_ofs[1]); + if (!last || IS_ERR(first)) { + page_cache_release(first); + return NULL; + } + + if (!logfs_check_ds(page_address(first))) { + page_cache_release(last); + return first; + } + + /* First one didn't work, try the second superblock */ + if (!logfs_check_ds(page_address(last))) { + page_cache_release(first); + return last; + } + + /* Neither worked, sorry folks */ + page_cache_release(first); + page_cache_release(last); + return NULL; +} + +static int __logfs_read_sb(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct page *page; + struct logfs_disk_super *ds; + int i; + + page = find_super_block(sb); + if (!page) + return -EIO; + + ds = page_address(page); + super->s_size = be64_to_cpu(ds->ds_filesystem_size); + super->s_root_reserve = be64_to_cpu(ds->ds_root_reserve); + super->s_speed_reserve = be64_to_cpu(ds->ds_speed_reserve); + super->s_bad_seg_reserve = be32_to_cpu(ds->ds_bad_seg_reserve); + super->s_segsize = 1 << ds->ds_segment_shift; + super->s_segmask = (1 << ds->ds_segment_shift) - 1; + super->s_segshift = ds->ds_segment_shift; + sb->s_blocksize = 1 << ds->ds_block_shift; + sb->s_blocksize_bits = ds->ds_block_shift; + super->s_writesize = 1 << ds->ds_write_shift; + super->s_writeshift = ds->ds_write_shift; + super->s_no_segs = super->s_size >> super->s_segshift; + super->s_no_blocks = super->s_segsize >> sb->s_blocksize_bits; + super->s_feature_incompat = be64_to_cpu(ds->ds_feature_incompat); + super->s_feature_ro_compat = be64_to_cpu(ds->ds_feature_ro_compat); + super->s_feature_compat = be64_to_cpu(ds->ds_feature_compat); + super->s_feature_flags = be64_to_cpu(ds->ds_feature_flags); + + 
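+	/*
+	 * Illustration of the geometry decoded above (the numbers are only
+	 * an example, not dictated by this code): ds_segment_shift == 17 and
+	 * ds_block_shift == 12 would mean 128KiB segments holding 32 blocks
+	 * of 4KiB each, with s_no_segs = device size / segment size.
+	 */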
journal_for_each(i) + super->s_journal_seg[i] = be32_to_cpu(ds->ds_journal_seg[i]); + + super->s_ifile_levels = ds->ds_ifile_levels; + super->s_iblock_levels = ds->ds_iblock_levels; + super->s_data_levels = ds->ds_data_levels; + super->s_total_levels = super->s_ifile_levels + super->s_iblock_levels + + super->s_data_levels; + page_cache_release(page); + return 0; +} + +static int logfs_read_sb(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + int ret; + + super->s_btree_pool = mempool_create(32, btree_alloc, btree_free, NULL); + if (!super->s_btree_pool) + return -ENOMEM; + + btree_init_mempool64(&super->s_shadow_tree.new, super->s_btree_pool); + btree_init_mempool64(&super->s_shadow_tree.old, super->s_btree_pool); + + ret = logfs_init_mapping(sb); + if (ret) + return ret; + + ret = __logfs_read_sb(sb); + if (ret) + return ret; + + mutex_init(&super->s_dirop_mutex); + mutex_init(&super->s_object_alias_mutex); + INIT_LIST_HEAD(&super->s_freeing_list); + + ret = logfs_init_rw(sb); + if (ret) + return ret; + + ret = logfs_init_areas(sb); + if (ret) + return ret; + + ret = logfs_init_gc(sb); + if (ret) + return ret; + + ret = logfs_init_journal(sb); + if (ret) + return ret; + + return 0; +} + +static void logfs_kill_sb(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + + log_super("LogFS: Start unmounting\n"); + /* Alias entries slow down mount, so evict as many as possible */ + sync_filesystem(sb); + logfs_write_anchor(super->s_master_inode); + + /* + * From this point on alias entries are simply dropped - and any + * writes to the object store are considered bugs. + */ + super->s_flags |= LOGFS_SB_FLAG_SHUTDOWN; + log_super("LogFS: Now in shutdown\n"); + generic_shutdown_super(sb); + + BUG_ON(super->s_dirty_used_bytes || super->s_dirty_free_bytes); + + logfs_cleanup_gc(sb); + logfs_cleanup_journal(sb); + logfs_cleanup_areas(sb); + logfs_cleanup_rw(sb); + super->s_devops->put_device(sb); + mempool_destroy(super->s_btree_pool); + mempool_destroy(super->s_alias_pool); + kfree(super); + log_super("LogFS: Finished unmounting\n"); +} + +int logfs_get_sb_device(struct file_system_type *type, int flags, + struct mtd_info *mtd, struct block_device *bdev, + const struct logfs_device_ops *devops, struct vfsmount *mnt) +{ + struct logfs_super *super; + struct super_block *sb; + int err = -ENOMEM; + static int mount_count; + + log_super("LogFS: Start mount %x\n", mount_count++); + super = kzalloc(sizeof(*super), GFP_KERNEL); + if (!super) + goto err0; + + super->s_mtd = mtd; + super->s_bdev = bdev; + err = -EINVAL; + sb = sget(type, logfs_sb_test, logfs_sb_set, super); + if (IS_ERR(sb)) + goto err0; + + if (sb->s_root) { + /* Device is already in use */ + err = 0; + simple_set_mnt(mnt, sb); + goto err0; + } + + super->s_devops = devops; + + /* + * sb->s_maxbytes is limited to 8TB. On 32bit systems, the page cache + * only covers 16TB and the upper 8TB are used for indirect blocks. + * On 64bit system we could bump up the limit, but that would make + * the filesystem incompatible with 32bit systems. 
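+ *
+ * In numbers: s_maxbytes below is 2^43 - 1 bytes, i.e. just under 8TB.
+ * A 32bit pgoff_t addresses 2^32 pages of 4KB, or 16TB, through the page
+ * cache; the lower 8TB are used for file data and the upper 8TB for
+ * indirect blocks, hence the limit.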
+ */ + sb->s_maxbytes = (1ull << 43) - 1; + sb->s_op = &logfs_super_operations; + sb->s_flags = flags | MS_NOATIME; + + err = logfs_read_sb(sb); + if (err) + goto err1; + + sb->s_flags |= MS_ACTIVE; + err = logfs_get_sb_final(sb, mnt); + if (err) + goto err1; + return 0; + +err1: + up_write(&sb->s_umount); + deactivate_super(sb); + return err; +err0: + kfree(super); + //devops->put_device(sb); + return err; +} + +static int logfs_get_sb(struct file_system_type *type, int flags, + const char *devname, void *data, struct vfsmount *mnt) +{ + ulong mtdnr; + + if (!devname) + return logfs_get_sb_bdev(type, flags, devname, mnt); + if (strncmp(devname, "mtd", 3)) + return logfs_get_sb_bdev(type, flags, devname, mnt); + + { + char *garbage; + mtdnr = simple_strtoul(devname+3, &garbage, 0); + if (*garbage) + return -EINVAL; + } + + return logfs_get_sb_mtd(type, flags, mtdnr, mnt); +} + +static struct file_system_type logfs_fs_type = { + .owner = THIS_MODULE, + .name = "logfs", + .get_sb = logfs_get_sb, + .kill_sb = logfs_kill_sb, + .fs_flags = FS_REQUIRES_DEV, + +}; + +static int __init logfs_init(void) +{ + int ret; + + emergency_page = alloc_pages(GFP_KERNEL, 0); + if (!emergency_page) + return -ENOMEM; + + ret = logfs_compr_init(); + if (ret) + goto out1; + + ret = logfs_init_inode_cache(); + if (ret) + goto out2; + + return register_filesystem(&logfs_fs_type); +out2: + logfs_compr_exit(); +out1: + __free_pages(emergency_page, 0); + return ret; +} + +static void __exit logfs_exit(void) +{ + unregister_filesystem(&logfs_fs_type); + logfs_destroy_inode_cache(); + logfs_compr_exit(); + __free_pages(emergency_page, 0); +} + +module_init(logfs_init); +module_exit(logfs_exit); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Joern Engel "); +MODULE_DESCRIPTION("scalable flash filesystem"); diff --git a/include/linux/btree-128.h b/include/linux/btree-128.h new file mode 100644 index 00000000000..0b3414c4c92 --- /dev/null +++ b/include/linux/btree-128.h @@ -0,0 +1,109 @@ +extern struct btree_geo btree_geo128; + +struct btree_head128 { struct btree_head h; }; + +static inline void btree_init_mempool128(struct btree_head128 *head, + mempool_t *mempool) +{ + btree_init_mempool(&head->h, mempool); +} + +static inline int btree_init128(struct btree_head128 *head) +{ + return btree_init(&head->h); +} + +static inline void btree_destroy128(struct btree_head128 *head) +{ + btree_destroy(&head->h); +} + +static inline void *btree_lookup128(struct btree_head128 *head, u64 k1, u64 k2) +{ + u64 key[2] = {k1, k2}; + return btree_lookup(&head->h, &btree_geo128, (unsigned long *)&key); +} + +static inline void *btree_get_prev128(struct btree_head128 *head, + u64 *k1, u64 *k2) +{ + u64 key[2] = {*k1, *k2}; + void *val; + + val = btree_get_prev(&head->h, &btree_geo128, + (unsigned long *)&key); + *k1 = key[0]; + *k2 = key[1]; + return val; +} + +static inline int btree_insert128(struct btree_head128 *head, u64 k1, u64 k2, + void *val, gfp_t gfp) +{ + u64 key[2] = {k1, k2}; + return btree_insert(&head->h, &btree_geo128, + (unsigned long *)&key, val, gfp); +} + +static inline int btree_update128(struct btree_head128 *head, u64 k1, u64 k2, + void *val) +{ + u64 key[2] = {k1, k2}; + return btree_update(&head->h, &btree_geo128, + (unsigned long *)&key, val); +} + +static inline void *btree_remove128(struct btree_head128 *head, u64 k1, u64 k2) +{ + u64 key[2] = {k1, k2}; + return btree_remove(&head->h, &btree_geo128, (unsigned long *)&key); +} + +static inline void *btree_last128(struct btree_head128 *head, u64 *k1, u64 *k2) +{ + 
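+	/*
+	 * Like the other 128bit wrappers in this header, the two u64 halves
+	 * of the key are packed into a small array (k1 first, then k2) and
+	 * handed to the generic code as an array of unsigned longs;
+	 * btree_geo128 describes that layout with keylen == 2 * LONG_PER_U64.
+	 */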
u64 key[2]; + void *val; + + val = btree_last(&head->h, &btree_geo128, (unsigned long *)&key[0]); + if (val) { + *k1 = key[0]; + *k2 = key[1]; + } + + return val; +} + +static inline int btree_merge128(struct btree_head128 *target, + struct btree_head128 *victim, + gfp_t gfp) +{ + return btree_merge(&target->h, &victim->h, &btree_geo128, gfp); +} + +void visitor128(void *elem, unsigned long opaque, unsigned long *__key, + size_t index, void *__func); + +typedef void (*visitor128_t)(void *elem, unsigned long opaque, + u64 key1, u64 key2, size_t index); + +static inline size_t btree_visitor128(struct btree_head128 *head, + unsigned long opaque, + visitor128_t func2) +{ + return btree_visitor(&head->h, &btree_geo128, opaque, + visitor128, func2); +} + +static inline size_t btree_grim_visitor128(struct btree_head128 *head, + unsigned long opaque, + visitor128_t func2) +{ + return btree_grim_visitor(&head->h, &btree_geo128, opaque, + visitor128, func2); +} + +#define btree_for_each_safe128(head, k1, k2, val) \ + for (val = btree_last128(head, &k1, &k2); \ + val; \ + val = btree_get_prev128(head, &k1, &k2)) + diff --git a/include/linux/btree-type.h b/include/linux/btree-type.h new file mode 100644 index 00000000000..9a1147ef856 --- /dev/null +++ b/include/linux/btree-type.h @@ -0,0 +1,147 @@ +#define __BTREE_TP(pfx, type, sfx) pfx ## type ## sfx +#define _BTREE_TP(pfx, type, sfx) __BTREE_TP(pfx, type, sfx) +#define BTREE_TP(pfx) _BTREE_TP(pfx, BTREE_TYPE_SUFFIX,) +#define BTREE_FN(name) BTREE_TP(btree_ ## name) +#define BTREE_TYPE_HEAD BTREE_TP(struct btree_head) +#define VISITOR_FN BTREE_TP(visitor) +#define VISITOR_FN_T _BTREE_TP(visitor, BTREE_TYPE_SUFFIX, _t) + +BTREE_TYPE_HEAD { + struct btree_head h; +}; + +static inline void BTREE_FN(init_mempool)(BTREE_TYPE_HEAD *head, + mempool_t *mempool) +{ + btree_init_mempool(&head->h, mempool); +} + +static inline int BTREE_FN(init)(BTREE_TYPE_HEAD *head) +{ + return btree_init(&head->h); +} + +static inline void BTREE_FN(destroy)(BTREE_TYPE_HEAD *head) +{ + btree_destroy(&head->h); +} + +static inline int BTREE_FN(merge)(BTREE_TYPE_HEAD *target, + BTREE_TYPE_HEAD *victim, + gfp_t gfp) +{ + return btree_merge(&target->h, &victim->h, BTREE_TYPE_GEO, gfp); +} + +#if (BITS_PER_LONG > BTREE_TYPE_BITS) +static inline void *BTREE_FN(lookup)(BTREE_TYPE_HEAD *head, BTREE_KEYTYPE key) +{ + unsigned long _key = key; + return btree_lookup(&head->h, BTREE_TYPE_GEO, &_key); +} + +static inline int BTREE_FN(insert)(BTREE_TYPE_HEAD *head, BTREE_KEYTYPE key, + void *val, gfp_t gfp) +{ + unsigned long _key = key; + return btree_insert(&head->h, BTREE_TYPE_GEO, &_key, val, gfp); +} + +static inline int BTREE_FN(update)(BTREE_TYPE_HEAD *head, BTREE_KEYTYPE key, + void *val) +{ + unsigned long _key = key; + return btree_update(&head->h, BTREE_TYPE_GEO, &_key, val); +} + +static inline void *BTREE_FN(remove)(BTREE_TYPE_HEAD *head, BTREE_KEYTYPE key) +{ + unsigned long _key = key; + return btree_remove(&head->h, BTREE_TYPE_GEO, &_key); +} + +static inline void *BTREE_FN(last)(BTREE_TYPE_HEAD *head, BTREE_KEYTYPE *key) +{ + unsigned long _key; + void *val = btree_last(&head->h, BTREE_TYPE_GEO, &_key); + if (val) + *key = _key; + return val; +} + +static inline void *BTREE_FN(get_prev)(BTREE_TYPE_HEAD *head, BTREE_KEYTYPE *key) +{ + unsigned long _key = *key; + void *val = btree_get_prev(&head->h, BTREE_TYPE_GEO, &_key); + if (val) + *key = _key; + return val; +} +#else +static inline void *BTREE_FN(lookup)(BTREE_TYPE_HEAD *head, BTREE_KEYTYPE key) +{ + return 
btree_lookup(&head->h, BTREE_TYPE_GEO, (unsigned long *)&key); +} + +static inline int BTREE_FN(insert)(BTREE_TYPE_HEAD *head, BTREE_KEYTYPE key, + void *val, gfp_t gfp) +{ + return btree_insert(&head->h, BTREE_TYPE_GEO, (unsigned long *)&key, + val, gfp); +} + +static inline int BTREE_FN(update)(BTREE_TYPE_HEAD *head, BTREE_KEYTYPE key, + void *val) +{ + return btree_update(&head->h, BTREE_TYPE_GEO, (unsigned long *)&key, val); +} + +static inline void *BTREE_FN(remove)(BTREE_TYPE_HEAD *head, BTREE_KEYTYPE key) +{ + return btree_remove(&head->h, BTREE_TYPE_GEO, (unsigned long *)&key); +} + +static inline void *BTREE_FN(last)(BTREE_TYPE_HEAD *head, BTREE_KEYTYPE *key) +{ + return btree_last(&head->h, BTREE_TYPE_GEO, (unsigned long *)key); +} + +static inline void *BTREE_FN(get_prev)(BTREE_TYPE_HEAD *head, BTREE_KEYTYPE *key) +{ + return btree_get_prev(&head->h, BTREE_TYPE_GEO, (unsigned long *)key); +} +#endif + +void VISITOR_FN(void *elem, unsigned long opaque, unsigned long *key, + size_t index, void *__func); + +typedef void (*VISITOR_FN_T)(void *elem, unsigned long opaque, + BTREE_KEYTYPE key, size_t index); + +static inline size_t BTREE_FN(visitor)(BTREE_TYPE_HEAD *head, + unsigned long opaque, + VISITOR_FN_T func2) +{ + return btree_visitor(&head->h, BTREE_TYPE_GEO, opaque, + visitorl, func2); +} + +static inline size_t BTREE_FN(grim_visitor)(BTREE_TYPE_HEAD *head, + unsigned long opaque, + VISITOR_FN_T func2) +{ + return btree_grim_visitor(&head->h, BTREE_TYPE_GEO, opaque, + visitorl, func2); +} + +#undef VISITOR_FN +#undef VISITOR_FN_T +#undef __BTREE_TP +#undef _BTREE_TP +#undef BTREE_TP +#undef BTREE_FN +#undef BTREE_TYPE_HEAD +#undef BTREE_TYPE_SUFFIX +#undef BTREE_TYPE_GEO +#undef BTREE_KEYTYPE +#undef BTREE_TYPE_BITS diff --git a/include/linux/btree.h b/include/linux/btree.h new file mode 100644 index 00000000000..65b5bb05832 --- /dev/null +++ b/include/linux/btree.h @@ -0,0 +1,243 @@ +#ifndef BTREE_H +#define BTREE_H + +#include +#include + +/** + * DOC: B+Tree basics + * + * A B+Tree is a data structure for looking up arbitrary (currently allowing + * unsigned long, u32, u64 and 2 * u64) keys into pointers. The data structure + * is described at http://en.wikipedia.org/wiki/B-tree, we currently do not + * use binary search to find the key on lookups. + * + * Each B+Tree consists of a head, that contains bookkeeping information and + * a variable number (starting with zero) nodes. Each node contains the keys + * and pointers to sub-nodes, or, for leaf nodes, the keys and values for the + * tree entries. + * + * Each node in this implementation has the following layout: + * [key1, key2, ..., keyN] [val1, val2, ..., valN] + * + * Each key here is an array of unsigned longs, geo->no_longs in total. The + * number of keys and values (N) is geo->no_pairs. 
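+ *
+ * A rough usage sketch, for illustration only (it uses the u64-keyed
+ * wrappers generated at the bottom of this header; "val" stands for any
+ * non-NULL pointer the caller wants to file under the key):
+ *
+ *	struct btree_head64 tree;
+ *	int err = btree_init64(&tree);
+ *
+ *	if (!err) {
+ *		err = btree_insert64(&tree, 42, val, GFP_KERNEL);
+ *		if (!err)
+ *			val = btree_lookup64(&tree, 42);
+ *		btree_remove64(&tree, 42);
+ *		btree_destroy64(&tree);
+ *	}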
+ */ + +/** + * struct btree_head - btree head + * + * @node: the first node in the tree + * @mempool: mempool used for node allocations + * @height: current of the tree + */ +struct btree_head { + unsigned long *node; + mempool_t *mempool; + int height; +}; + +/* btree geometry */ +struct btree_geo; + +/** + * btree_alloc - allocate function for the mempool + * @gfp_mask: gfp mask for the allocation + * @pool_data: unused + */ +void *btree_alloc(gfp_t gfp_mask, void *pool_data); + +/** + * btree_free - free function for the mempool + * @element: the element to free + * @pool_data: unused + */ +void btree_free(void *element, void *pool_data); + +/** + * btree_init_mempool - initialise a btree with given mempool + * + * @head: the btree head to initialise + * @mempool: the mempool to use + * + * When this function is used, there is no need to destroy + * the mempool. + */ +void btree_init_mempool(struct btree_head *head, mempool_t *mempool); + +/** + * btree_init - initialise a btree + * + * @head: the btree head to initialise + * + * This function allocates the memory pool that the + * btree needs. Returns zero or a negative error code + * (-%ENOMEM) when memory allocation fails. + * + */ +int __must_check btree_init(struct btree_head *head); + +/** + * btree_destroy - destroy mempool + * + * @head: the btree head to destroy + * + * This function destroys the internal memory pool, use only + * when using btree_init(), not with btree_init_mempool(). + */ +void btree_destroy(struct btree_head *head); + +/** + * btree_lookup - look up a key in the btree + * + * @head: the btree to look in + * @geo: the btree geometry + * @key: the key to look up + * + * This function returns the value for the given key, or %NULL. + */ +void *btree_lookup(struct btree_head *head, struct btree_geo *geo, + unsigned long *key); + +/** + * btree_insert - insert an entry into the btree + * + * @head: the btree to add to + * @geo: the btree geometry + * @key: the key to add (must not already be present) + * @val: the value to add (must not be %NULL) + * @gfp: allocation flags for node allocations + * + * This function returns 0 if the item could be added, or an + * error code if it failed (may fail due to memory pressure). + */ +int __must_check btree_insert(struct btree_head *head, struct btree_geo *geo, + unsigned long *key, void *val, gfp_t gfp); +/** + * btree_update - update an entry in the btree + * + * @head: the btree to update + * @geo: the btree geometry + * @key: the key to update + * @val: the value to change it to (must not be %NULL) + * + * This function returns 0 if the update was successful, or + * -%ENOENT if the key could not be found. + */ +int btree_update(struct btree_head *head, struct btree_geo *geo, + unsigned long *key, void *val); +/** + * btree_remove - remove an entry from the btree + * + * @head: the btree to update + * @geo: the btree geometry + * @key: the key to remove + * + * This function returns the removed entry, or %NULL if the key + * could not be found. + */ +void *btree_remove(struct btree_head *head, struct btree_geo *geo, + unsigned long *key); + +/** + * btree_merge - merge two btrees + * + * @target: the tree that gets all the entries + * @victim: the tree that gets merged into @target + * @geo: the btree geometry + * @gfp: allocation flags + * + * The two trees @target and @victim may not contain the same keys, + * that is a bug and triggers a BUG(). 
This function returns zero + * if the trees were merged successfully, and may return a failure + * when memory allocation fails, in which case both trees might have + * been partially merged, i.e. some entries have been moved from + * @victim to @target. + */ +int btree_merge(struct btree_head *target, struct btree_head *victim, + struct btree_geo *geo, gfp_t gfp); + +/** + * btree_last - get last entry in btree + * + * @head: btree head + * @geo: btree geometry + * @key: last key + * + * Returns the last entry in the btree, and sets @key to the key + * of that entry; returns NULL if the tree is empty, in that case + * key is not changed. + */ +void *btree_last(struct btree_head *head, struct btree_geo *geo, + unsigned long *key); + +/** + * btree_get_prev - get previous entry + * + * @head: btree head + * @geo: btree geometry + * @key: pointer to key + * + * The function returns the next item right before the value pointed to by + * @key, and updates @key with its key, or returns %NULL when there is no + * entry with a key smaller than the given key. + */ +void *btree_get_prev(struct btree_head *head, struct btree_geo *geo, + unsigned long *key); + + +/* internal use, use btree_visitor{l,32,64,128} */ +size_t btree_visitor(struct btree_head *head, struct btree_geo *geo, + unsigned long opaque, + void (*func)(void *elem, unsigned long opaque, + unsigned long *key, size_t index, + void *func2), + void *func2); + +/* internal use, use btree_grim_visitor{l,32,64,128} */ +size_t btree_grim_visitor(struct btree_head *head, struct btree_geo *geo, + unsigned long opaque, + void (*func)(void *elem, unsigned long opaque, + unsigned long *key, + size_t index, void *func2), + void *func2); + + +#include + +extern struct btree_geo btree_geo32; +#define BTREE_TYPE_SUFFIX l +#define BTREE_TYPE_BITS BITS_PER_LONG +#define BTREE_TYPE_GEO &btree_geo32 +#define BTREE_KEYTYPE unsigned long +#include + +#define btree_for_each_safel(head, key, val) \ + for (val = btree_lastl(head, &key); \ + val; \ + val = btree_get_prevl(head, &key)) + +#define BTREE_TYPE_SUFFIX 32 +#define BTREE_TYPE_BITS 32 +#define BTREE_TYPE_GEO &btree_geo32 +#define BTREE_KEYTYPE u32 +#include + +#define btree_for_each_safe32(head, key, val) \ + for (val = btree_last32(head, &key); \ + val; \ + val = btree_get_prev32(head, &key)) + +extern struct btree_geo btree_geo64; +#define BTREE_TYPE_SUFFIX 64 +#define BTREE_TYPE_BITS 64 +#define BTREE_TYPE_GEO &btree_geo64 +#define BTREE_KEYTYPE u64 +#include + +#define btree_for_each_safe64(head, key, val) \ + for (val = btree_last64(head, &key); \ + val; \ + val = btree_get_prev64(head, &key)) + +#endif diff --git a/lib/Kconfig b/lib/Kconfig index bb1326d3839..277fbfb233b 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -156,6 +156,9 @@ config TEXTSEARCH_BM config TEXTSEARCH_FSM tristate +config BTREE + boolean + config HAS_IOMEM boolean depends on !NO_IOMEM diff --git a/lib/Makefile b/lib/Makefile index 2e78277eff9..cff82612e98 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -41,6 +41,7 @@ lib-$(CONFIG_GENERIC_FIND_NEXT_BIT) += find_next_bit.o obj-$(CONFIG_GENERIC_FIND_LAST_BIT) += find_last_bit.o obj-$(CONFIG_GENERIC_HWEIGHT) += hweight.o obj-$(CONFIG_LOCK_KERNEL) += kernel_lock.o +obj-$(CONFIG_BTREE) += btree.o obj-$(CONFIG_DEBUG_PREEMPT) += smp_processor_id.o obj-$(CONFIG_DEBUG_LIST) += list_debug.o obj-$(CONFIG_DEBUG_OBJECTS) += debugobjects.o diff --git a/lib/btree.c b/lib/btree.c new file mode 100644 index 00000000000..41859a82021 --- /dev/null +++ b/lib/btree.c @@ -0,0 +1,797 @@ +/* + * 
lib/btree.c - Simple In-memory B+Tree
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2007-2008 Joern Engel
+ * Bits and pieces stolen from Peter Zijlstra's code, which is
+ * Copyright 2007, Red Hat Inc. Peter Zijlstra
+ * GPLv2
+ *
+ * see http://programming.kicks-ass.net/kernel-patches/vma_lookup/btree.patch
+ *
+ * A relatively simple B+Tree implementation. I have written it as a learning
+ * exercise to understand how B+Trees work. Turned out to be useful as well.
+ *
+ * B+Trees can be used similar to Linux radix trees (which don't have anything
+ * in common with textbook radix trees, beware). Prerequisite for them working
+ * well is that access to a random tree node is much faster than a large number
+ * of operations within each node.
+ *
+ * Disks have fulfilled the prerequisite for a long time. More recently DRAM
+ * has gained similar properties, as memory access times, when measured in cpu
+ * cycles, have increased. Cacheline sizes have increased as well, which also
+ * helps B+Trees.
+ *
+ * Compared to radix trees, B+Trees are more efficient when dealing with a
+ * sparsely populated address space. Between 25% and 50% of the memory is
+ * occupied with valid pointers. When densely populated, radix trees contain
+ * ~98% pointers - hard to beat. Very sparse radix trees contain only ~2%
+ * pointers.
+ *
+ * This particular implementation stores pointers identified by a long value.
+ * Storing NULL pointers is illegal, lookup will return NULL when no entry
+ * was found.
+ *
+ * A trick was used that is not commonly found in textbooks. The lowest
+ * values are to the right, not to the left. All used slots within a node
+ * are on the left, all unused slots contain NUL values. Most operations
+ * simply loop once over all slots and terminate on the first NUL.
+ */
+
+#include
+#include
+#include
+#include
+#include
+
+#define MAX(a, b) ((a) > (b) ?
(a) : (b)) +#define NODESIZE MAX(L1_CACHE_BYTES, 128) + +struct btree_geo { + int keylen; + int no_pairs; + int no_longs; +}; + +struct btree_geo btree_geo32 = { + .keylen = 1, + .no_pairs = NODESIZE / sizeof(long) / 2, + .no_longs = NODESIZE / sizeof(long) / 2, +}; +EXPORT_SYMBOL_GPL(btree_geo32); + +#define LONG_PER_U64 (64 / BITS_PER_LONG) +struct btree_geo btree_geo64 = { + .keylen = LONG_PER_U64, + .no_pairs = NODESIZE / sizeof(long) / (1 + LONG_PER_U64), + .no_longs = LONG_PER_U64 * (NODESIZE / sizeof(long) / (1 + LONG_PER_U64)), +}; +EXPORT_SYMBOL_GPL(btree_geo64); + +struct btree_geo btree_geo128 = { + .keylen = 2 * LONG_PER_U64, + .no_pairs = NODESIZE / sizeof(long) / (1 + 2 * LONG_PER_U64), + .no_longs = 2 * LONG_PER_U64 * (NODESIZE / sizeof(long) / (1 + 2 * LONG_PER_U64)), +}; +EXPORT_SYMBOL_GPL(btree_geo128); + +static struct kmem_cache *btree_cachep; + +void *btree_alloc(gfp_t gfp_mask, void *pool_data) +{ + return kmem_cache_alloc(btree_cachep, gfp_mask); +} +EXPORT_SYMBOL_GPL(btree_alloc); + +void btree_free(void *element, void *pool_data) +{ + kmem_cache_free(btree_cachep, element); +} +EXPORT_SYMBOL_GPL(btree_free); + +static unsigned long *btree_node_alloc(struct btree_head *head, gfp_t gfp) +{ + unsigned long *node; + + node = mempool_alloc(head->mempool, gfp); + memset(node, 0, NODESIZE); + return node; +} + +static int longcmp(const unsigned long *l1, const unsigned long *l2, size_t n) +{ + size_t i; + + for (i = 0; i < n; i++) { + if (l1[i] < l2[i]) + return -1; + if (l1[i] > l2[i]) + return 1; + } + return 0; +} + +static unsigned long *longcpy(unsigned long *dest, const unsigned long *src, + size_t n) +{ + size_t i; + + for (i = 0; i < n; i++) + dest[i] = src[i]; + return dest; +} + +static unsigned long *longset(unsigned long *s, unsigned long c, size_t n) +{ + size_t i; + + for (i = 0; i < n; i++) + s[i] = c; + return s; +} + +static void dec_key(struct btree_geo *geo, unsigned long *key) +{ + unsigned long val; + int i; + + for (i = geo->keylen - 1; i >= 0; i--) { + val = key[i]; + key[i] = val - 1; + if (val) + break; + } +} + +static unsigned long *bkey(struct btree_geo *geo, unsigned long *node, int n) +{ + return &node[n * geo->keylen]; +} + +static void *bval(struct btree_geo *geo, unsigned long *node, int n) +{ + return (void *)node[geo->no_longs + n]; +} + +static void setkey(struct btree_geo *geo, unsigned long *node, int n, + unsigned long *key) +{ + longcpy(bkey(geo, node, n), key, geo->keylen); +} + +static void setval(struct btree_geo *geo, unsigned long *node, int n, + void *val) +{ + node[geo->no_longs + n] = (unsigned long) val; +} + +static void clearpair(struct btree_geo *geo, unsigned long *node, int n) +{ + longset(bkey(geo, node, n), 0, geo->keylen); + node[geo->no_longs + n] = 0; +} + +static inline void __btree_init(struct btree_head *head) +{ + head->node = NULL; + head->height = 0; +} + +void btree_init_mempool(struct btree_head *head, mempool_t *mempool) +{ + __btree_init(head); + head->mempool = mempool; +} +EXPORT_SYMBOL_GPL(btree_init_mempool); + +int btree_init(struct btree_head *head) +{ + __btree_init(head); + head->mempool = mempool_create(0, btree_alloc, btree_free, NULL); + if (!head->mempool) + return -ENOMEM; + return 0; +} +EXPORT_SYMBOL_GPL(btree_init); + +void btree_destroy(struct btree_head *head) +{ + mempool_destroy(head->mempool); + head->mempool = NULL; +} +EXPORT_SYMBOL_GPL(btree_destroy); + +void *btree_last(struct btree_head *head, struct btree_geo *geo, + unsigned long *key) +{ + int height = head->height; + 
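+	/*
+	 * Keys inside a node are stored with the largest key in slot 0 (see
+	 * the "lowest values are to the right" note at the top of this
+	 * file), so following slot 0 downwards ends at the leaf entry with
+	 * the largest key in the tree - which is what "last" means here.
+	 */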
unsigned long *node = head->node; + + if (height == 0) + return NULL; + + for ( ; height > 1; height--) + node = bval(geo, node, 0); + + longcpy(key, bkey(geo, node, 0), geo->keylen); + return bval(geo, node, 0); +} +EXPORT_SYMBOL_GPL(btree_last); + +static int keycmp(struct btree_geo *geo, unsigned long *node, int pos, + unsigned long *key) +{ + return longcmp(bkey(geo, node, pos), key, geo->keylen); +} + +static int keyzero(struct btree_geo *geo, unsigned long *key) +{ + int i; + + for (i = 0; i < geo->keylen; i++) + if (key[i]) + return 0; + + return 1; +} + +void *btree_lookup(struct btree_head *head, struct btree_geo *geo, + unsigned long *key) +{ + int i, height = head->height; + unsigned long *node = head->node; + + if (height == 0) + return NULL; + + for ( ; height > 1; height--) { + for (i = 0; i < geo->no_pairs; i++) + if (keycmp(geo, node, i, key) <= 0) + break; + if (i == geo->no_pairs) + return NULL; + node = bval(geo, node, i); + if (!node) + return NULL; + } + + if (!node) + return NULL; + + for (i = 0; i < geo->no_pairs; i++) + if (keycmp(geo, node, i, key) == 0) + return bval(geo, node, i); + return NULL; +} +EXPORT_SYMBOL_GPL(btree_lookup); + +int btree_update(struct btree_head *head, struct btree_geo *geo, + unsigned long *key, void *val) +{ + int i, height = head->height; + unsigned long *node = head->node; + + if (height == 0) + return -ENOENT; + + for ( ; height > 1; height--) { + for (i = 0; i < geo->no_pairs; i++) + if (keycmp(geo, node, i, key) <= 0) + break; + if (i == geo->no_pairs) + return -ENOENT; + node = bval(geo, node, i); + if (!node) + return -ENOENT; + } + + if (!node) + return -ENOENT; + + for (i = 0; i < geo->no_pairs; i++) + if (keycmp(geo, node, i, key) == 0) { + setval(geo, node, i, val); + return 0; + } + return -ENOENT; +} +EXPORT_SYMBOL_GPL(btree_update); + +/* + * Usually this function is quite similar to normal lookup. But the key of + * a parent node may be smaller than the smallest key of all its siblings. + * In such a case we cannot just return NULL, as we have only proven that no + * key smaller than __key, but larger than this parent key exists. + * So we set __key to the parent key and retry. We have to use the smallest + * such parent key, which is the last parent key we encountered. 
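+ *
+ * Put differently: the search key is decremented once and then looked up
+ * as usual; if that walk runs off the end of a node or into a NULL child,
+ * the remembered parent key (retry_key) is taken as the new search key and
+ * the whole descent is restarted from the root instead of returning a
+ * false NULL.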
+ */ +void *btree_get_prev(struct btree_head *head, struct btree_geo *geo, + unsigned long *__key) +{ + int i, height; + unsigned long *node, *oldnode; + unsigned long *retry_key = NULL, key[geo->keylen]; + + if (keyzero(geo, __key)) + return NULL; + + if (head->height == 0) + return NULL; +retry: + longcpy(key, __key, geo->keylen); + dec_key(geo, key); + + node = head->node; + for (height = head->height ; height > 1; height--) { + for (i = 0; i < geo->no_pairs; i++) + if (keycmp(geo, node, i, key) <= 0) + break; + if (i == geo->no_pairs) + goto miss; + oldnode = node; + node = bval(geo, node, i); + if (!node) + goto miss; + retry_key = bkey(geo, oldnode, i); + } + + if (!node) + goto miss; + + for (i = 0; i < geo->no_pairs; i++) { + if (keycmp(geo, node, i, key) <= 0) { + if (bval(geo, node, i)) { + longcpy(__key, bkey(geo, node, i), geo->keylen); + return bval(geo, node, i); + } else + goto miss; + } + } +miss: + if (retry_key) { + __key = retry_key; + retry_key = NULL; + goto retry; + } + return NULL; +} + +static int getpos(struct btree_geo *geo, unsigned long *node, + unsigned long *key) +{ + int i; + + for (i = 0; i < geo->no_pairs; i++) { + if (keycmp(geo, node, i, key) <= 0) + break; + } + return i; +} + +static int getfill(struct btree_geo *geo, unsigned long *node, int start) +{ + int i; + + for (i = start; i < geo->no_pairs; i++) + if (!bval(geo, node, i)) + break; + return i; +} + +/* + * locate the correct leaf node in the btree + */ +static unsigned long *find_level(struct btree_head *head, struct btree_geo *geo, + unsigned long *key, int level) +{ + unsigned long *node = head->node; + int i, height; + + for (height = head->height; height > level; height--) { + for (i = 0; i < geo->no_pairs; i++) + if (keycmp(geo, node, i, key) <= 0) + break; + + if ((i == geo->no_pairs) || !bval(geo, node, i)) { + /* right-most key is too large, update it */ + /* FIXME: If the right-most key on higher levels is + * always zero, this wouldn't be necessary. 
*/ + i--; + setkey(geo, node, i, key); + } + BUG_ON(i < 0); + node = bval(geo, node, i); + } + BUG_ON(!node); + return node; +} + +static int btree_grow(struct btree_head *head, struct btree_geo *geo, + gfp_t gfp) +{ + unsigned long *node; + int fill; + + node = btree_node_alloc(head, gfp); + if (!node) + return -ENOMEM; + if (head->node) { + fill = getfill(geo, head->node, 0); + setkey(geo, node, 0, bkey(geo, head->node, fill - 1)); + setval(geo, node, 0, head->node); + } + head->node = node; + head->height++; + return 0; +} + +static void btree_shrink(struct btree_head *head, struct btree_geo *geo) +{ + unsigned long *node; + int fill; + + if (head->height <= 1) + return; + + node = head->node; + fill = getfill(geo, node, 0); + BUG_ON(fill > 1); + head->node = bval(geo, node, 0); + head->height--; + mempool_free(node, head->mempool); +} + +static int btree_insert_level(struct btree_head *head, struct btree_geo *geo, + unsigned long *key, void *val, int level, + gfp_t gfp) +{ + unsigned long *node; + int i, pos, fill, err; + + BUG_ON(!val); + if (head->height < level) { + err = btree_grow(head, geo, gfp); + if (err) + return err; + } + +retry: + node = find_level(head, geo, key, level); + pos = getpos(geo, node, key); + fill = getfill(geo, node, pos); + /* two identical keys are not allowed */ + BUG_ON(pos < fill && keycmp(geo, node, pos, key) == 0); + + if (fill == geo->no_pairs) { + /* need to split node */ + unsigned long *new; + + new = btree_node_alloc(head, gfp); + if (!new) + return -ENOMEM; + err = btree_insert_level(head, geo, + bkey(geo, node, fill / 2 - 1), + new, level + 1, gfp); + if (err) { + mempool_free(new, head->mempool); + return err; + } + for (i = 0; i < fill / 2; i++) { + setkey(geo, new, i, bkey(geo, node, i)); + setval(geo, new, i, bval(geo, node, i)); + setkey(geo, node, i, bkey(geo, node, i + fill / 2)); + setval(geo, node, i, bval(geo, node, i + fill / 2)); + clearpair(geo, node, i + fill / 2); + } + if (fill & 1) { + setkey(geo, node, i, bkey(geo, node, fill - 1)); + setval(geo, node, i, bval(geo, node, fill - 1)); + clearpair(geo, node, fill - 1); + } + goto retry; + } + BUG_ON(fill >= geo->no_pairs); + + /* shift and insert */ + for (i = fill; i > pos; i--) { + setkey(geo, node, i, bkey(geo, node, i - 1)); + setval(geo, node, i, bval(geo, node, i - 1)); + } + setkey(geo, node, pos, key); + setval(geo, node, pos, val); + + return 0; +} + +int btree_insert(struct btree_head *head, struct btree_geo *geo, + unsigned long *key, void *val, gfp_t gfp) +{ + return btree_insert_level(head, geo, key, val, 1, gfp); +} +EXPORT_SYMBOL_GPL(btree_insert); + +static void *btree_remove_level(struct btree_head *head, struct btree_geo *geo, + unsigned long *key, int level); +static void merge(struct btree_head *head, struct btree_geo *geo, int level, + unsigned long *left, int lfill, + unsigned long *right, int rfill, + unsigned long *parent, int lpos) +{ + int i; + + for (i = 0; i < rfill; i++) { + /* Move all keys to the left */ + setkey(geo, left, lfill + i, bkey(geo, right, i)); + setval(geo, left, lfill + i, bval(geo, right, i)); + } + /* Exchange left and right child in parent */ + setval(geo, parent, lpos, right); + setval(geo, parent, lpos + 1, left); + /* Remove left (formerly right) child from parent */ + btree_remove_level(head, geo, bkey(geo, parent, lpos), level + 1); + mempool_free(right, head->mempool); +} + +static void rebalance(struct btree_head *head, struct btree_geo *geo, + unsigned long *key, int level, unsigned long *child, int fill) +{ + unsigned long 
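+	/*
+	 * rebalance() runs when a node has dropped below half full after a
+	 * removal: if its entries and those of the left or right sibling
+	 * together fit into one node, the two are merged and the stale
+	 * parent slot is removed; otherwise the node is simply left
+	 * under-full (see the comment at the bottom of this function for
+	 * why entries are never stolen from a neighbour).
+	 */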
*parent, *left = NULL, *right = NULL; + int i, no_left, no_right; + + if (fill == 0) { + /* Because we don't steal entries from a neighbour, this case + * can happen. Parent node contains a single child, this + * node, so merging with a sibling never happens. + */ + btree_remove_level(head, geo, key, level + 1); + mempool_free(child, head->mempool); + return; + } + + parent = find_level(head, geo, key, level + 1); + i = getpos(geo, parent, key); + BUG_ON(bval(geo, parent, i) != child); + + if (i > 0) { + left = bval(geo, parent, i - 1); + no_left = getfill(geo, left, 0); + if (fill + no_left <= geo->no_pairs) { + merge(head, geo, level, + left, no_left, + child, fill, + parent, i - 1); + return; + } + } + if (i + 1 < getfill(geo, parent, i)) { + right = bval(geo, parent, i + 1); + no_right = getfill(geo, right, 0); + if (fill + no_right <= geo->no_pairs) { + merge(head, geo, level, + child, fill, + right, no_right, + parent, i); + return; + } + } + /* + * We could also try to steal one entry from the left or right + * neighbor. By not doing so we changed the invariant from + * "all nodes are at least half full" to "no two neighboring + * nodes can be merged". Which means that the average fill of + * all nodes is still half or better. + */ +} + +static void *btree_remove_level(struct btree_head *head, struct btree_geo *geo, + unsigned long *key, int level) +{ + unsigned long *node; + int i, pos, fill; + void *ret; + + if (level > head->height) { + /* we recursed all the way up */ + head->height = 0; + head->node = NULL; + return NULL; + } + + node = find_level(head, geo, key, level); + pos = getpos(geo, node, key); + fill = getfill(geo, node, pos); + if ((level == 1) && (keycmp(geo, node, pos, key) != 0)) + return NULL; + ret = bval(geo, node, pos); + + /* remove and shift */ + for (i = pos; i < fill - 1; i++) { + setkey(geo, node, i, bkey(geo, node, i + 1)); + setval(geo, node, i, bval(geo, node, i + 1)); + } + clearpair(geo, node, fill - 1); + + if (fill - 1 < geo->no_pairs / 2) { + if (level < head->height) + rebalance(head, geo, key, level, node, fill - 1); + else if (fill - 1 == 1) + btree_shrink(head, geo); + } + + return ret; +} + +void *btree_remove(struct btree_head *head, struct btree_geo *geo, + unsigned long *key) +{ + if (head->height == 0) + return NULL; + + return btree_remove_level(head, geo, key, 1); +} +EXPORT_SYMBOL_GPL(btree_remove); + +int btree_merge(struct btree_head *target, struct btree_head *victim, + struct btree_geo *geo, gfp_t gfp) +{ + unsigned long key[geo->keylen]; + unsigned long dup[geo->keylen]; + void *val; + int err; + + BUG_ON(target == victim); + + if (!(target->node)) { + /* target is empty, just copy fields over */ + target->node = victim->node; + target->height = victim->height; + __btree_init(victim); + return 0; + } + + /* TODO: This needs some optimizations. Currently we do three tree + * walks to remove a single object from the victim. + */ + for (;;) { + if (!btree_last(victim, geo, key)) + break; + val = btree_lookup(victim, geo, key); + err = btree_insert(target, geo, key, val, gfp); + if (err) + return err; + /* We must make a copy of the key, as the original will get + * mangled inside btree_remove. 
*/ + longcpy(dup, key, geo->keylen); + btree_remove(victim, geo, dup); + } + return 0; +} +EXPORT_SYMBOL_GPL(btree_merge); + +static size_t __btree_for_each(struct btree_head *head, struct btree_geo *geo, + unsigned long *node, unsigned long opaque, + void (*func)(void *elem, unsigned long opaque, + unsigned long *key, size_t index, + void *func2), + void *func2, int reap, int height, size_t count) +{ + int i; + unsigned long *child; + + for (i = 0; i < geo->no_pairs; i++) { + child = bval(geo, node, i); + if (!child) + break; + if (height > 1) + count = __btree_for_each(head, geo, child, opaque, + func, func2, reap, height - 1, count); + else + func(child, opaque, bkey(geo, node, i), count++, + func2); + } + if (reap) + mempool_free(node, head->mempool); + return count; +} + +static void empty(void *elem, unsigned long opaque, unsigned long *key, + size_t index, void *func2) +{ +} + +void visitorl(void *elem, unsigned long opaque, unsigned long *key, + size_t index, void *__func) +{ + visitorl_t func = __func; + + func(elem, opaque, *key, index); +} +EXPORT_SYMBOL_GPL(visitorl); + +void visitor32(void *elem, unsigned long opaque, unsigned long *__key, + size_t index, void *__func) +{ + visitor32_t func = __func; + u32 *key = (void *)__key; + + func(elem, opaque, *key, index); +} +EXPORT_SYMBOL_GPL(visitor32); + +void visitor64(void *elem, unsigned long opaque, unsigned long *__key, + size_t index, void *__func) +{ + visitor64_t func = __func; + u64 *key = (void *)__key; + + func(elem, opaque, *key, index); +} +EXPORT_SYMBOL_GPL(visitor64); + +void visitor128(void *elem, unsigned long opaque, unsigned long *__key, + size_t index, void *__func) +{ + visitor128_t func = __func; + u64 *key = (void *)__key; + + func(elem, opaque, key[0], key[1], index); +} +EXPORT_SYMBOL_GPL(visitor128); + +size_t btree_visitor(struct btree_head *head, struct btree_geo *geo, + unsigned long opaque, + void (*func)(void *elem, unsigned long opaque, + unsigned long *key, + size_t index, void *func2), + void *func2) +{ + size_t count = 0; + + if (!func2) + func = empty; + if (head->node) + count = __btree_for_each(head, geo, head->node, opaque, func, + func2, 0, head->height, 0); + return count; +} +EXPORT_SYMBOL_GPL(btree_visitor); + +size_t btree_grim_visitor(struct btree_head *head, struct btree_geo *geo, + unsigned long opaque, + void (*func)(void *elem, unsigned long opaque, + unsigned long *key, + size_t index, void *func2), + void *func2) +{ + size_t count = 0; + + if (!func2) + func = empty; + if (head->node) + count = __btree_for_each(head, geo, head->node, opaque, func, + func2, 1, head->height, 0); + __btree_init(head); + return count; +} +EXPORT_SYMBOL_GPL(btree_grim_visitor); + +static int __init btree_module_init(void) +{ + btree_cachep = kmem_cache_create("btree_node", NODESIZE, 0, + SLAB_HWCACHE_ALIGN, NULL); + return 0; +} + +static void __exit btree_module_exit(void) +{ + kmem_cache_destroy(btree_cachep); +} + +/* If core code starts using btree, initialization should happen even earlier */ +module_init(btree_module_init); +module_exit(btree_module_exit); + +MODULE_AUTHOR("Joern Engel "); +MODULE_AUTHOR("Johannes Berg "); +MODULE_LICENSE("GPL"); -- cgit v1.2.3 From 4c54005ca438a8b46dd542b497d4f0dc2ca375e8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 14 Jan 2010 16:10:57 -0800 Subject: rcu: 1Q2010 update for RCU documentation Add expedited functions. Review documentation and update obsolete verbiage. 
Also fix the advice for the RCU CPU-stall kernel configuration parameter, and document RCU CPU-stall warnings. Signed-off-by: Paul E. McKenney Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com LKML-Reference: <12635142581866-git-send-email-> Signed-off-by: Ingo Molnar --- Documentation/RCU/00-INDEX | 8 +- Documentation/RCU/RTFP.txt | 58 +++++++- Documentation/RCU/checklist.txt | 200 ++++++++++++++++----------- Documentation/RCU/rcu.txt | 48 +------ Documentation/RCU/stallwarn.txt | 58 ++++++++ Documentation/RCU/torture.txt | 12 ++ Documentation/RCU/whatisRCU.txt | 3 +- Documentation/filesystems/dentry-locking.txt | 3 +- lib/Kconfig.debug | 4 +- 9 files changed, 258 insertions(+), 136 deletions(-) create mode 100644 Documentation/RCU/stallwarn.txt (limited to 'Documentation/filesystems') diff --git a/Documentation/RCU/00-INDEX b/Documentation/RCU/00-INDEX index 9bb62f7b89c..0a27ea9621f 100644 --- a/Documentation/RCU/00-INDEX +++ b/Documentation/RCU/00-INDEX @@ -8,14 +8,18 @@ listRCU.txt - Using RCU to Protect Read-Mostly Linked Lists NMI-RCU.txt - Using RCU to Protect Dynamic NMI Handlers +rcubarrier.txt + - RCU and Unloadable Modules +rculist_nulls.txt + - RCU list primitives for use with SLAB_DESTROY_BY_RCU rcuref.txt - Reference-count design for elements of lists/arrays protected by RCU rcu.txt - RCU Concepts -rcubarrier.txt - - Unloading modules that use RCU callbacks RTFP.txt - List of RCU papers (bibliography) going back to 1980. +stallwarn.txt + - RCU CPU stall warnings (CONFIG_RCU_CPU_STALL_DETECTOR) torture.txt - RCU Torture Test Operation (CONFIG_RCU_TORTURE_TEST) trace.txt diff --git a/Documentation/RCU/RTFP.txt b/Documentation/RCU/RTFP.txt index d2b85237c76..5051209e683 100644 --- a/Documentation/RCU/RTFP.txt +++ b/Documentation/RCU/RTFP.txt @@ -25,10 +25,10 @@ to be referencing the data structure. However, this mechanism was not optimized for modern computer systems, which is not surprising given that these overheads were not so expensive in the mid-80s. Nonetheless, passive serialization appears to be the first deferred-destruction -mechanism to be used in production. Furthermore, the relevant patent has -lapsed, so this approach may be used in non-GPL software, if desired. -(In contrast, use of RCU is permitted only in software licensed under -GPL. Sorry!!!) +mechanism to be used in production. Furthermore, the relevant patent +has lapsed, so this approach may be used in non-GPL software, if desired. +(In contrast, implementation of RCU is permitted only in software licensed +under either GPL or LGPL. Sorry!!!) In 1990, Pugh [Pugh90] noted that explicitly tracking which threads were reading a given data structure permitted deferred free to operate @@ -150,6 +150,18 @@ preemptible RCU [PaulEMcKenney2007PreemptibleRCU], and the three-part LWN "What is RCU?" series [PaulEMcKenney2007WhatIsRCUFundamentally, PaulEMcKenney2008WhatIsRCUUsage, and PaulEMcKenney2008WhatIsRCUAPI]. +2008 saw a journal paper on real-time RCU [DinakarGuniguntala2008IBMSysJ], +a history of how Linux changed RCU more than RCU changed Linux +[PaulEMcKenney2008RCUOSR], and a design overview of hierarchical RCU +[PaulEMcKenney2008HierarchicalRCU]. 
+ +2009 introduced user-level RCU algorithms [PaulEMcKenney2009MaliciousURCU], +which Mathieu Desnoyers is now maintaining [MathieuDesnoyers2009URCU] +[MathieuDesnoyersPhD]. TINY_RCU [PaulEMcKenney2009BloatWatchRCU] made +its appearance, as did expedited RCU [PaulEMcKenney2009expeditedRCU]. +The problem of resizeable RCU-protected hash tables may now be on a path +to a solution [JoshTriplett2009RPHash]. + Bibtex Entries @article{Kung80 @@ -730,6 +742,11 @@ Revised: " } +# +# "What is RCU?" LWN series. +# +######################################################################## + @article{DinakarGuniguntala2008IBMSysJ ,author="D. Guniguntala and P. E. McKenney and J. Triplett and J. Walpole" ,title="The read-copy-update mechanism for supporting real-time applications on shared-memory multiprocessor systems with {Linux}" @@ -820,3 +837,36 @@ Revised: Uniprocessor assumptions allow simplified RCU implementation. " } + +@unpublished{PaulEMcKenney2009expeditedRCU +,Author="Paul E. McKenney" +,Title="[{PATCH} -tip 0/3] expedited 'big hammer' {RCU} grace periods" +,month="June" +,day="25" +,year="2009" +,note="Available: +\url{http://lkml.org/lkml/2009/6/25/306} +[Viewed August 16, 2009]" +,annotation=" + First posting of expedited RCU to be accepted into -tip. +" +} + +@unpublished{JoshTriplett2009RPHash +,Author="Josh Triplett" +,Title="Scalable concurrent hash tables via relativistic programming" +,month="September" +,year="2009" +,note="Linux Plumbers Conference presentation" +,annotation=" + RP fun with hash tables. +" +} + +@phdthesis{MathieuDesnoyersPhD +, title = "Low-impact Operating System Tracing" +, author = "Mathieu Desnoyers" +, school = "Ecole Polytechnique de Montr\'{e}al" +, month = "December" +, year = 2009 +} diff --git a/Documentation/RCU/checklist.txt b/Documentation/RCU/checklist.txt index 51525a30e8b..767cf06a427 100644 --- a/Documentation/RCU/checklist.txt +++ b/Documentation/RCU/checklist.txt @@ -8,13 +8,12 @@ would cause. This list is based on experiences reviewing such patches over a rather long period of time, but improvements are always welcome! 0. Is RCU being applied to a read-mostly situation? If the data - structure is updated more than about 10% of the time, then - you should strongly consider some other approach, unless - detailed performance measurements show that RCU is nonetheless - the right tool for the job. Yes, you might think of RCU - as simply cutting overhead off of the readers and imposing it - on the writers. That is exactly why normal uses of RCU will - do much more reading than updating. + structure is updated more than about 10% of the time, then you + should strongly consider some other approach, unless detailed + performance measurements show that RCU is nonetheless the right + tool for the job. Yes, RCU does reduce read-side overhead by + increasing write-side overhead, which is exactly why normal uses + of RCU will do much more reading than updating. Another exception is where performance is not an issue, and RCU provides a simpler implementation. An example of this situation @@ -35,13 +34,13 @@ over a rather long period of time, but improvements are always welcome! If you choose #b, be prepared to describe how you have handled memory barriers on weakly ordered machines (pretty much all of - them -- even x86 allows reads to be reordered), and be prepared - to explain why this added complexity is worthwhile. 
If you - choose #c, be prepared to explain how this single task does not - become a major bottleneck on big multiprocessor machines (for - example, if the task is updating information relating to itself - that other tasks can read, there by definition can be no - bottleneck). + them -- even x86 allows later loads to be reordered to precede + earlier stores), and be prepared to explain why this added + complexity is worthwhile. If you choose #c, be prepared to + explain how this single task does not become a major bottleneck on + big multiprocessor machines (for example, if the task is updating + information relating to itself that other tasks can read, there + by definition can be no bottleneck). 2. Do the RCU read-side critical sections make proper use of rcu_read_lock() and friends? These primitives are needed @@ -51,8 +50,10 @@ over a rather long period of time, but improvements are always welcome! actuarial risk of your kernel. As a rough rule of thumb, any dereference of an RCU-protected - pointer must be covered by rcu_read_lock() or rcu_read_lock_bh() - or by the appropriate update-side lock. + pointer must be covered by rcu_read_lock(), rcu_read_lock_bh(), + rcu_read_lock_sched(), or by the appropriate update-side lock. + Disabling of preemption can serve as rcu_read_lock_sched(), but + is less readable. 3. Does the update code tolerate concurrent accesses? @@ -62,25 +63,27 @@ over a rather long period of time, but improvements are always welcome! of ways to handle this concurrency, depending on the situation: a. Use the RCU variants of the list and hlist update - primitives to add, remove, and replace elements on an - RCU-protected list. Alternatively, use the RCU-protected - trees that have been added to the Linux kernel. + primitives to add, remove, and replace elements on + an RCU-protected list. Alternatively, use the other + RCU-protected data structures that have been added to + the Linux kernel. This is almost always the best approach. b. Proceed as in (a) above, but also maintain per-element locks (that are acquired by both readers and writers) that guard per-element state. Of course, fields that - the readers refrain from accessing can be guarded by the - update-side lock. + the readers refrain from accessing can be guarded by + some other lock acquired only by updaters, if desired. This works quite well, also. c. Make updates appear atomic to readers. For example, - pointer updates to properly aligned fields will appear - atomic, as will individual atomic primitives. Operations - performed under a lock and sequences of multiple atomic - primitives will -not- appear to be atomic. + pointer updates to properly aligned fields will + appear atomic, as will individual atomic primitives. + Sequences of operations performed under a lock will -not- + appear to be atomic to RCU readers, nor will sequences + of multiple atomic primitives. This can work, but is starting to get a bit tricky. @@ -98,9 +101,9 @@ over a rather long period of time, but improvements are always welcome! a new structure containing updated values. 4. Weakly ordered CPUs pose special challenges. Almost all CPUs - are weakly ordered -- even i386 CPUs allow reads to be reordered. - RCU code must take all of the following measures to prevent - memory-corruption problems: + are weakly ordered -- even x86 CPUs allow later loads to be + reordered to precede earlier stores. RCU code must take all of + the following measures to prevent memory-corruption problems: + a. 
Readers must maintain proper ordering of their memory accesses. The rcu_dereference() primitive ensures that @@ -113,14 +116,21 @@ over a rather long period of time, but improvements are always welcome! The rcu_dereference() primitive is also an excellent documentation aid, letting the person reading the code know exactly which pointers are protected by RCU. - - The rcu_dereference() primitive is used by the various - "_rcu()" list-traversal primitives, such as the - list_for_each_entry_rcu(). Note that it is perfectly - legal (if redundant) for update-side code to use - rcu_dereference() and the "_rcu()" list-traversal - primitives. This is particularly useful in code - that is common to readers and updaters. + Please note that compilers can also reorder code, and + they are becoming increasingly aggressive about doing + just that. The rcu_dereference() primitive therefore + also prevents destructive compiler optimizations. + + The rcu_dereference() primitive is used by the + various "_rcu()" list-traversal primitives, such + as the list_for_each_entry_rcu(). Note that it is + perfectly legal (if redundant) for update-side code to + use rcu_dereference() and the "_rcu()" list-traversal + primitives. This is particularly useful in code that + is common to readers and updaters. However, neither + rcu_dereference() nor the "_rcu()" list-traversal + primitives can substitute for a good concurrency design + coordinating among multiple updaters. b. If the list macros are being used, the list_add_tail_rcu() and list_add_rcu() primitives must be used in order @@ -135,11 +145,14 @@ over a rather long period of time, but improvements are always welcome! readers. Similarly, if the hlist macros are being used, the hlist_del_rcu() primitive is required. - The list_replace_rcu() primitive may be used to - replace an old structure with a new one in an - RCU-protected list. + The list_replace_rcu() and hlist_replace_rcu() primitives + may be used to replace an old structure with a new one + in their respective types of RCU-protected lists. + + d. Rules similar to (4b) and (4c) apply to the "hlist_nulls" + type of RCU-protected linked lists. - d. Updates must ensure that initialization of a given + e. Updates must ensure that initialization of a given structure happens before pointers to that structure are publicized. Use the rcu_assign_pointer() primitive when publicizing a pointer to a structure that can @@ -151,16 +164,31 @@ over a rather long period of time, but improvements are always welcome! it cannot block. 6. Since synchronize_rcu() can block, it cannot be called from - any sort of irq context. Ditto for synchronize_sched() and - synchronize_srcu(). - -7. If the updater uses call_rcu(), then the corresponding readers - must use rcu_read_lock() and rcu_read_unlock(). If the updater - uses call_rcu_bh(), then the corresponding readers must use - rcu_read_lock_bh() and rcu_read_unlock_bh(). If the updater - uses call_rcu_sched(), then the corresponding readers must - disable preemption. Mixing things up will result in confusion - and broken kernels. + any sort of irq context. The same rule applies for + synchronize_rcu_bh(), synchronize_sched(), synchronize_srcu(), + synchronize_rcu_expedited(), synchronize_rcu_bh_expedited(), + synchronize_sched_expedited(), and synchronize_srcu_expedited(). + + The expedited forms of these primitives have the same semantics + as the non-expedited forms, but expediting is both expensive + and unfriendly to real-time workloads. 
Use of the expedited + primitives should be restricted to rare configuration-change + operations that would not normally be undertaken while a real-time + workload is running. + +7. If the updater uses call_rcu() or synchronize_rcu(), then the + corresponding readers must use rcu_read_lock() and + rcu_read_unlock(). If the updater uses call_rcu_bh() or + synchronize_rcu_bh(), then the corresponding readers must + use rcu_read_lock_bh() and rcu_read_unlock_bh(). If the + updater uses call_rcu_sched() or synchronize_sched(), then + the corresponding readers must disable preemption, possibly + by calling rcu_read_lock_sched() and rcu_read_unlock_sched(). + If the updater uses synchronize_srcu(), the corresponding + readers must use srcu_read_lock() and srcu_read_unlock(), + and with the same srcu_struct. The rules for the expedited + primitives are the same as for their non-expedited counterparts. + Mixing things up will result in confusion and broken kernels. One exception to this rule: rcu_read_lock() and rcu_read_unlock() may be substituted for rcu_read_lock_bh() and rcu_read_unlock_bh() @@ -212,6 +240,8 @@ over a rather long period of time, but improvements are always welcome! e. Periodically invoke synchronize_rcu(), permitting a limited number of updates per grace period. + The same cautions apply to call_rcu_bh() and call_rcu_sched(). + 9. All RCU list-traversal primitives, which include rcu_dereference(), list_for_each_entry_rcu(), list_for_each_continue_rcu(), and list_for_each_safe_rcu(), @@ -229,7 +259,8 @@ over a rather long period of time, but improvements are always welcome! 10. Conversely, if you are in an RCU read-side critical section, and you don't hold the appropriate update-side lock, you -must- use the "_rcu()" variants of the list macros. Failing to do so - will break Alpha and confuse people reading your code. + will break Alpha, cause aggressive compilers to generate bad code, + and confuse people trying to read your code. 11. Note that synchronize_rcu() -only- guarantees to wait until all currently executing rcu_read_lock()-protected RCU read-side @@ -239,15 +270,21 @@ over a rather long period of time, but improvements are always welcome! rcu_read_lock()-protected read-side critical sections, do -not- use synchronize_rcu(). - If you want to wait for some of these other things, you might - instead need to use synchronize_irq() or synchronize_sched(). + Similarly, disabling preemption is not an acceptable substitute + for rcu_read_lock(). Code that attempts to use preemption + disabling where it should be using rcu_read_lock() will break + in real-time kernel builds. + + If you want to wait for interrupt handlers, NMI handlers, and + code under the influence of preempt_disable(), you instead + need to use synchronize_irq() or synchronize_sched(). 12. Any lock acquired by an RCU callback must be acquired elsewhere with softirq disabled, e.g., via spin_lock_irqsave(), spin_lock_bh(), etc. Failing to disable irq on a given - acquisition of that lock will result in deadlock as soon as the - RCU callback happens to interrupt that acquisition's critical - section. + acquisition of that lock will result in deadlock as soon as + the RCU softirq handler happens to run your RCU callback while + interrupting that acquisition's critical section. 13. RCU callbacks can be and are executed in parallel. In many cases, the callback code simply wrappers around kfree(), so that this 
not the case, a self-spawning RCU callback would prevent the victim CPU from ever going offline.) -14. SRCU (srcu_read_lock(), srcu_read_unlock(), and synchronize_srcu()) - may only be invoked from process context. Unlike other forms of - RCU, it -is- permissible to block in an SRCU read-side critical - section (demarked by srcu_read_lock() and srcu_read_unlock()), - hence the "SRCU": "sleepable RCU". Please note that if you - don't need to sleep in read-side critical sections, you should - be using RCU rather than SRCU, because RCU is almost always - faster and easier to use than is SRCU. +14. SRCU (srcu_read_lock(), srcu_read_unlock(), synchronize_srcu(), + and synchronize_srcu_expedited()) may only be invoked from + process context. Unlike other forms of RCU, it -is- permissible + to block in an SRCU read-side critical section (demarked by + srcu_read_lock() and srcu_read_unlock()), hence the "SRCU": + "sleepable RCU". Please note that if you don't need to sleep + in read-side critical sections, you should be using RCU rather + than SRCU, because RCU is almost always faster and easier to + use than is SRCU. Also unlike other forms of RCU, explicit initialization and cleanup is required via init_srcu_struct() and cleanup_srcu_struct(). These are passed a "struct srcu_struct" that defines the scope of a given SRCU domain. Once initialized, the srcu_struct is passed to srcu_read_lock(), srcu_read_unlock() - and synchronize_srcu(). A given synchronize_srcu() waits only - for SRCU read-side critical sections governed by srcu_read_lock() - and srcu_read_unlock() calls that have been passd the same - srcu_struct. This property is what makes sleeping read-side - critical sections tolerable -- a given subsystem delays only - its own updates, not those of other subsystems using SRCU. - Therefore, SRCU is less prone to OOM the system than RCU would - be if RCU's read-side critical sections were permitted to - sleep. + synchronize_srcu(), and synchronize_srcu_expedited(). A given + synchronize_srcu() waits only for SRCU read-side critical + sections governed by srcu_read_lock() and srcu_read_unlock() + calls that have been passed the same srcu_struct. This property + is what makes sleeping read-side critical sections tolerable -- + a given subsystem delays only its own updates, not those of other + subsystems using SRCU. Therefore, SRCU is less prone to OOM the + system than RCU would be if RCU's read-side critical sections + were permitted to sleep. The ability to sleep in read-side critical sections does not come for free. First, corresponding srcu_read_lock() and @@ -311,12 +349,12 @@ over a rather long period of time, but improvements are always welcome! destructive operation, and -only- -then- invoke call_rcu(), synchronize_rcu(), or friends. - Because these primitives only wait for pre-existing readers, - it is the caller's responsibility to guarantee safety to - any subsequent readers. + Because these primitives only wait for pre-existing readers, it + is the caller's responsibility to guarantee that any subsequent + readers will execute safely. -16. The various RCU read-side primitives do -not- contain memory - barriers. The CPU (and in some cases, the compiler) is free - to reorder code into and out of RCU read-side critical sections. - It is the responsibility of the RCU update-side primitives to - deal with this. +16. The various RCU read-side primitives do -not- necessarily contain + memory barriers. 
You should therefore plan for the CPU + and the compiler to freely reorder code into and out of RCU + read-side critical sections. It is the responsibility of the + RCU update-side primitives to deal with this. diff --git a/Documentation/RCU/rcu.txt b/Documentation/RCU/rcu.txt index 2a23523ce47..31852705b58 100644 --- a/Documentation/RCU/rcu.txt +++ b/Documentation/RCU/rcu.txt @@ -75,6 +75,8 @@ o I hear that RCU is patented? What is with that? search for the string "Patent" in RTFP.txt to find them. Of these, one was allowed to lapse by the assignee, and the others have been contributed to the Linux kernel under GPL. + There are now also LGPL implementations of user-level RCU + available (http://lttng.org/?q=node/18). o I hear that RCU needs work in order to support realtime kernels? @@ -91,48 +93,4 @@ o Where can I find more information on RCU? o What are all these files in this directory? - - NMI-RCU.txt - - Describes how to use RCU to implement dynamic - NMI handlers, which can be revectored on the fly, - without rebooting. - - RTFP.txt - - List of RCU-related publications and web sites. - - UP.txt - - Discussion of RCU usage in UP kernels. - - arrayRCU.txt - - Describes how to use RCU to protect arrays, with - resizeable arrays whose elements reference other - data structures being of the most interest. - - checklist.txt - - Lists things to check for when inspecting code that - uses RCU. - - listRCU.txt - - Describes how to use RCU to protect linked lists. - This is the simplest and most common use of RCU - in the Linux kernel. - - rcu.txt - - You are reading it! - - rcuref.txt - - Describes how to combine use of reference counts - with RCU. - - whatisRCU.txt - - Overview of how the RCU implementation works. Along - the way, presents a conceptual view of RCU. + See 00-INDEX for the list. diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt new file mode 100644 index 00000000000..1423d2570d7 --- /dev/null +++ b/Documentation/RCU/stallwarn.txt @@ -0,0 +1,58 @@ +Using RCU's CPU Stall Detector + +The CONFIG_RCU_CPU_STALL_DETECTOR kernel config parameter enables +RCU's CPU stall detector, which detects conditions that unduly delay +RCU grace periods. The stall detector's idea of what constitutes +"unduly delayed" is controlled by a pair of C preprocessor macros: + +RCU_SECONDS_TILL_STALL_CHECK + + This macro defines the period of time that RCU will wait from + the beginning of a grace period until it issues an RCU CPU + stall warning. It is normally ten seconds. + +RCU_SECONDS_TILL_STALL_RECHECK + + This macro defines the period of time that RCU will wait after + issuing a stall warning until it issues another stall warning. + It is normally set to thirty seconds. + +RCU_STALL_RAT_DELAY + + The CPU stall detector tries to make the offending CPU rat on itself, + as this often gives better-quality stack traces. However, if + the offending CPU does not detect its own stall in the number + of jiffies specified by RCU_STALL_RAT_DELAY, then other CPUs will + complain. This is normally set to two jiffies. + +The following problems can result in an RCU CPU stall warning: + +o A CPU looping in an RCU read-side critical section. + +o A CPU looping with interrupts disabled. + +o A CPU looping with preemption disabled. + +o For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the kernel + without invoking schedule(). + +o A bug in the RCU implementation. + +o A hardware failure. This is quite unlikely, but has occurred + at least once in a former life. 
A CPU failed in a running system, + becoming unresponsive, but not causing an immediate crash. + This resulted in a series of RCU CPU stall warnings, eventually + leading to the realization that the CPU had failed. + +The RCU, RCU-sched, and RCU-bh implementations have CPU stall warning capability. +SRCU does not do so directly, but its calls to synchronize_sched() will +result in RCU-sched detecting any CPU stalls that might be occurring. + +To diagnose the cause of the stall, inspect the stack traces. The offending +function will usually be near the top of the stack. If you have a series +of stall warnings from a single extended stall, comparing the stack traces +can often help determine where the stall is occurring, which will usually +be in the function nearest the top of the stack that stays the same from +trace to trace. + +RCU bugs can often be debugged with the help of CONFIG_RCU_TRACE. diff --git a/Documentation/RCU/torture.txt b/Documentation/RCU/torture.txt index 9dba3bb90e6..0e50bc2aa1e 100644 --- a/Documentation/RCU/torture.txt +++ b/Documentation/RCU/torture.txt @@ -30,6 +30,18 @@ MODULE PARAMETERS This module has the following parameters: +fqs_duration Duration (in microseconds) of artificially induced bursts + of force_quiescent_state() invocations. In RCU + implementations having force_quiescent_state(), these + bursts help force races between forcing a given grace + period and that grace period ending on its own. + +fqs_holdoff Holdoff time (in microseconds) between consecutive calls + to force_quiescent_state() within a burst. + +fqs_stutter Wait time (in seconds) between consecutive bursts + of calls to force_quiescent_state(). + irqreaders Says to invoke RCU readers from irq level. This is currently done via timers. Defaults to "1" for variants of RCU that permit this. (Or, more accurately, variants of RCU that do diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt index d542ca243b8..469a58b2e67 100644 --- a/Documentation/RCU/whatisRCU.txt +++ b/Documentation/RCU/whatisRCU.txt @@ -327,7 +327,8 @@ a. synchronize_rcu() rcu_read_lock() / rcu_read_unlock() b. call_rcu_bh() rcu_read_lock_bh() / rcu_read_unlock_bh() -c. synchronize_sched() preempt_disable() / preempt_enable() +c. synchronize_sched() rcu_read_lock_sched() / rcu_read_unlock_sched() + preempt_disable() / preempt_enable() local_irq_save() / local_irq_restore() hardirq enter / hardirq exit NMI enter / NMI exit diff --git a/Documentation/filesystems/dentry-locking.txt b/Documentation/filesystems/dentry-locking.txt index 4c0c575a401..79334ed5daa 100644 --- a/Documentation/filesystems/dentry-locking.txt +++ b/Documentation/filesystems/dentry-locking.txt @@ -62,7 +62,8 @@ changes are : 2. Insertion of a dentry into the hash table is done using hlist_add_head_rcu() which take care of ordering the writes - the writes to the dentry must be visible before the dentry is - inserted. This works in conjunction with hlist_for_each_rcu() while + inserted. This works in conjunction with hlist_for_each_rcu(), + which has since been replaced by hlist_for_each_entry_rcu(), while walking the hash chain. 
The only requirement is that all initialization to the dentry must be done before hlist_add_head_rcu() since we don't have dcache_lock protection diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 25c3ed594c5..6bf97d17632 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -765,9 +765,9 @@ config RCU_CPU_STALL_DETECTOR CPUs are delaying the current grace period, but only when the grace period extends for excessive time periods. - Say Y if you want RCU to perform such checks. + Say N if you want to disable such checks. - Say N if you are unsure. + Say Y if you are unsure. config KPROBES_SANITY_TEST bool "Kprobes sanity tests" -- cgit v1.2.3 From 73834d6f90f6833663f9effd4cf9b79b63bc36e1 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 20 Jan 2010 17:17:04 -0500 Subject: nfsd: 4.1 has an rfc number No need to refer to an internet draft; there's an RFC now. Signed-off-by: J. Bruce Fields --- Documentation/filesystems/nfs/nfs41-server.txt | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/nfs/nfs41-server.txt b/Documentation/filesystems/nfs/nfs41-server.txt index 1bd0d0c0517..6a53a84afc7 100644 --- a/Documentation/filesystems/nfs/nfs41-server.txt +++ b/Documentation/filesystems/nfs/nfs41-server.txt @@ -17,8 +17,7 @@ kernels must turn 4.1 on or off *before* turning support for version 4 on or off; rpc.nfsd does this correctly.) The NFSv4 minorversion 1 (NFSv4.1) implementation in nfsd is based -on the latest NFSv4.1 Internet Draft: -http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion1-29 +on RFC 5661. From the many new features in NFSv4.1 the current implementation focuses on the mandatory-to-implement NFSv4.1 Sessions, providing @@ -44,7 +43,7 @@ interoperability problems with future clients. Known issues: trunking, but this is a mandatory feature, and its use is recommended to clients in a number of places. (E.g. to ensure timely renewal in case an existing connection's retry timeouts - have gotten too long; see section 8.3 of the draft.) + have gotten too long; see section 8.3 of the RFC.) Therefore, lack of this feature may cause future clients to fail. - Incomplete backchannel support: incomplete backchannel gss -- cgit v1.2.3 From e902ec9906e844f4613fa6190c6fa65f162dc86e Mon Sep 17 00:00:00 2001 From: Jiro SEKIBA Date: Sat, 30 Jan 2010 18:06:35 +0900 Subject: nilfs2: issue discard request after cleaning segments This adds a function to send discard requests for given array of segment numbers, and calls the function when garbage collection succeeded. Signed-off-by: Jiro SEKIBA Signed-off-by: Ryusuke Konishi --- Documentation/filesystems/nilfs2.txt | 3 +++ fs/nilfs2/segment.c | 10 ++++++++++ fs/nilfs2/super.c | 8 +++++++- fs/nilfs2/the_nilfs.c | 38 ++++++++++++++++++++++++++++++++++++ fs/nilfs2/the_nilfs.h | 1 + include/linux/nilfs2_fs.h | 1 + 6 files changed, 60 insertions(+), 1 deletion(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/nilfs2.txt b/Documentation/filesystems/nilfs2.txt index 839efd8a8a8..cf6d0d85ca8 100644 --- a/Documentation/filesystems/nilfs2.txt +++ b/Documentation/filesystems/nilfs2.txt @@ -74,6 +74,9 @@ norecovery Disable recovery of the filesystem on mount. This disables every write access on the device for read-only mounts or snapshots. This option will fail for r/w mounts on an unclean volume. +discard Issue discard/TRIM commands to the underlying block + device when blocks are freed. 
This is useful for SSD + devices and sparse/thinly-provisioned LUNs. NILFS2 usage ============ diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 105b508b47a..9280b0f1079 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -2560,6 +2560,16 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv, set_current_state(TASK_INTERRUPTIBLE); schedule_timeout(sci->sc_interval); } + if (nilfs_test_opt(sbi, DISCARD)) { + int ret = nilfs_discard_segments(nilfs, sci->sc_freesegs, + sci->sc_nfreesegs); + if (ret) { + printk(KERN_WARNING + "NILFS warning: error %d on discard request, " + "turning discards off for the device\n", ret); + nilfs_clear_opt(sbi, DISCARD); + } + } out_unlock: sci->sc_freesegs = NULL; diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 8173faee31e..3f88401a375 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -481,6 +481,8 @@ static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_printf(seq, ",order=strict"); if (nilfs_test_opt(sbi, NORECOVERY)) seq_printf(seq, ",norecovery"); + if (nilfs_test_opt(sbi, DISCARD)) + seq_printf(seq, ",discard"); return 0; } @@ -550,7 +552,7 @@ static const struct export_operations nilfs_export_ops = { enum { Opt_err_cont, Opt_err_panic, Opt_err_ro, Opt_nobarrier, Opt_snapshot, Opt_order, Opt_norecovery, - Opt_err, + Opt_discard, Opt_err, }; static match_table_t tokens = { @@ -561,6 +563,7 @@ static match_table_t tokens = { {Opt_snapshot, "cp=%u"}, {Opt_order, "order=%s"}, {Opt_norecovery, "norecovery"}, + {Opt_discard, "discard"}, {Opt_err, NULL} }; @@ -614,6 +617,9 @@ static int parse_options(char *options, struct super_block *sb) case Opt_norecovery: nilfs_set_opt(sbi, NORECOVERY); break; + case Opt_discard: + nilfs_set_opt(sbi, DISCARD); + break; default: printk(KERN_ERR "NILFS: Unrecognized mount option \"%s\"\n", p); diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 6241e1722ef..92733d5651d 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -646,6 +646,44 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data) goto out; } +int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump, + size_t nsegs) +{ + sector_t seg_start, seg_end; + sector_t start = 0, nblocks = 0; + unsigned int sects_per_block; + __u64 *sn; + int ret = 0; + + sects_per_block = (1 << nilfs->ns_blocksize_bits) / + bdev_logical_block_size(nilfs->ns_bdev); + for (sn = segnump; sn < segnump + nsegs; sn++) { + nilfs_get_segment_range(nilfs, *sn, &seg_start, &seg_end); + + if (!nblocks) { + start = seg_start; + nblocks = seg_end - seg_start + 1; + } else if (start + nblocks == seg_start) { + nblocks += seg_end - seg_start + 1; + } else { + ret = blkdev_issue_discard(nilfs->ns_bdev, + start * sects_per_block, + nblocks * sects_per_block, + GFP_NOFS, + DISCARD_FL_BARRIER); + if (ret < 0) + return ret; + nblocks = 0; + } + } + if (nblocks) + ret = blkdev_issue_discard(nilfs->ns_bdev, + start * sects_per_block, + nblocks * sects_per_block, + GFP_NOFS, DISCARD_FL_BARRIER); + return ret; +} + int nilfs_count_free_blocks(struct the_nilfs *nilfs, sector_t *nblocks) { struct inode *dat = nilfs_dat_inode(nilfs); diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h index 589786e3346..fd057f8ad43 100644 --- a/fs/nilfs2/the_nilfs.h +++ b/fs/nilfs2/the_nilfs.h @@ -221,6 +221,7 @@ struct the_nilfs *find_or_create_nilfs(struct block_device *); void put_nilfs(struct the_nilfs *); int init_nilfs(struct the_nilfs *, struct nilfs_sb_info *, char *); int 
load_nilfs(struct the_nilfs *, struct nilfs_sb_info *); +int nilfs_discard_segments(struct the_nilfs *, __u64 *, size_t); int nilfs_count_free_blocks(struct the_nilfs *, sector_t *); struct nilfs_sb_info *nilfs_find_sbinfo(struct the_nilfs *, int, __u64); int nilfs_checkpoint_is_mounted(struct the_nilfs *, __u64, int); diff --git a/include/linux/nilfs2_fs.h b/include/linux/nilfs2_fs.h index 3fe02cf8b65..640702e9745 100644 --- a/include/linux/nilfs2_fs.h +++ b/include/linux/nilfs2_fs.h @@ -153,6 +153,7 @@ struct nilfs_super_root { semantics also for data */ #define NILFS_MOUNT_NORECOVERY 0x4000 /* Disable write access during mount-time recovery */ +#define NILFS_MOUNT_DISCARD 0x8000 /* Issue DISCARD requests */ /** -- cgit v1.2.3 From cb2992a60b7e73fbabe9ffe54056eed0022f2ed2 Mon Sep 17 00:00:00 2001 From: Mulyadi Santosa Date: Thu, 18 Feb 2010 01:22:40 +0700 Subject: doc: typo - Table 1-2 should refer to "status", not "statm" Fixes a typo in proc.txt documentation. Table 1-2 should refer to "status", not "statm" Signed-off-by: Mulyadi Santosa Signed-off-by: Jiri Kosina --- Documentation/filesystems/proc.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 0d07513a67a..bb314f60a76 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -188,7 +188,7 @@ memory usage. Its seven fields are explained in Table 1-3. The stat file contains details information about the process itself. Its fields are explained in Table 1-4. -Table 1-2: Contents of the statm files (as of 2.6.30-rc7) +Table 1-2: Contents of the status files (as of 2.6.30-rc7) .............................................................................. Field Content Name filename of the executable -- cgit v1.2.3 From 2f99cc6e4688faf5f03f9afe5e1c6097278c75b2 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 16 Jan 2010 14:10:21 -0500 Subject: add several pieces to shared subtree documentation * document locking * add the missing part of data structure invariants (relationship between mnt_share and mnt_slave lists in case of a peer group among slaves). Signed-off-by: Al Viro --- Documentation/filesystems/sharedsubtree.txt | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/sharedsubtree.txt b/Documentation/filesystems/sharedsubtree.txt index 23a181074f9..fc0e39af43c 100644 --- a/Documentation/filesystems/sharedsubtree.txt +++ b/Documentation/filesystems/sharedsubtree.txt @@ -837,6 +837,9 @@ replicas continue to be exactly same. individual lists does not affect propagation or the way propagation tree is modified by operations. + All vfsmounts in a peer group have the same ->mnt_master. If it is + non-NULL, they form a contiguous (ordered) segment of slave list. + A example propagation tree looks as shown in the figure below. [ NOTE: Though it looks like a forest, if we consider all the shared mounts as a conceptual entity called 'pnode', it becomes a tree] @@ -874,8 +877,19 @@ replicas continue to be exactly same. NOTE: The propagation tree is orthogonal to the mount tree. +8B Locking: + + ->mnt_share, ->mnt_slave, ->mnt_slave_list, ->mnt_master are protected + by namespace_sem (exclusive for modifications, shared for reading). + + Normally we have ->mnt_flags modifications serialized by vfsmount_lock. + There are two exceptions: do_add_mount() and clone_mnt(). 
+ The former modifies a vfsmount that has not been visible in any shared + data structures yet. + The latter holds namespace_sem and the only references to vfsmount + are in lists that can't be traversed without namespace_sem. -8B Algorithm: +8C Algorithm: The crux of the implementation resides in rbind/move operation. -- cgit v1.2.3 From 5dd4056db84387975140ff2568eaa0406f07985e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 3 Mar 2010 09:05:00 -0500 Subject: dquot: cleanup space allocation / freeing routines Get rid of the alloc_space, free_space, reserve_space, claim_space and release_rsv dquot operations - they are always called from the filesystem and if a filesystem really needs their own (which none currently does) it can just call into it's own routine directly. Move shared logic into the common __dquot_alloc_space, dquot_claim_space_nodirty and __dquot_free_space low-level methods, and rationalize the wrappers around it to move as much as possible code into the common block for CONFIG_QUOTA vs not. Also rename all these helpers to be named dquot_* instead of vfs_dq_*. Signed-off-by: Christoph Hellwig Signed-off-by: Jan Kara --- Documentation/filesystems/Locking | 6 +- fs/ext2/balloc.c | 12 ++- fs/ext2/xattr.c | 10 +- fs/ext3/balloc.c | 11 +- fs/ext3/inode.c | 2 +- fs/ext3/super.c | 2 - fs/ext3/xattr.c | 8 +- fs/ext4/inode.c | 20 ++-- fs/ext4/mballoc.c | 6 +- fs/ext4/super.c | 5 - fs/ext4/xattr.c | 8 +- fs/jfs/jfs_dtree.c | 28 ++--- fs/jfs/jfs_extent.c | 16 +-- fs/jfs/jfs_xtree.c | 21 ++-- fs/jfs/xattr.c | 17 ++-- fs/ocfs2/alloc.c | 13 ++- fs/ocfs2/aops.c | 11 +- fs/ocfs2/dir.c | 37 +++---- fs/ocfs2/file.c | 11 +- fs/ocfs2/namei.c | 9 +- fs/ocfs2/quota_global.c | 2 - fs/quota/dquot.c | 79 +++++---------- fs/reiserfs/bitmap.c | 10 +- fs/reiserfs/stree.c | 20 ++-- fs/reiserfs/super.c | 2 - fs/udf/balloc.c | 35 ++++--- fs/ufs/balloc.c | 24 +++-- include/linux/quota.h | 8 -- include/linux/quotaops.h | 208 +++++++++++--------------------------- 29 files changed, 258 insertions(+), 383 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 18b9d0ca063..1192fde1163 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -462,9 +462,7 @@ in sys_read() and friends. prototypes: int (*initialize) (struct inode *, int); int (*drop) (struct inode *); - int (*alloc_space) (struct inode *, qsize_t, int); int (*alloc_inode) (const struct inode *, unsigned long); - int (*free_space) (struct inode *, qsize_t); int (*free_inode) (const struct inode *, unsigned long); int (*transfer) (struct inode *, struct iattr *); int (*write_dquot) (struct dquot *); @@ -481,9 +479,7 @@ What filesystem should expect from the generic quota functions: FS recursion Held locks when called initialize: yes maybe dqonoff_sem drop: yes - -alloc_space: ->mark_dirty() - alloc_inode: ->mark_dirty() - -free_space: ->mark_dirty() - free_inode: ->mark_dirty() - transfer: yes - write_dquot: yes dqonoff_sem or dqptr_sem @@ -495,7 +491,7 @@ write_info: yes dqonoff_sem FS recursion means calling ->quota_read() and ->quota_write() from superblock operations. -->alloc_space(), ->alloc_inode(), ->free_space(), ->free_inode() are called +->alloc_inode(), ->free_inode() are called only directly by the filesystem and do not call any fs functions only the ->mark_dirty() operation. 
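
For illustration of the calling convention this cleanup establishes, here is a minimal sketch (not part of the patch): the filesystem charges quota directly with dquot_alloc_block(), propagates whatever error the helper returns instead of assuming -EDQUOT, and rolls the charge back with dquot_free_block() if its own allocation then fails, as the ext2/ext3/ext4 hunks below do. Only the dquot_* helpers come from the patch; the example_* names are hypothetical.

	#include <linux/fs.h>
	#include <linux/quotaops.h>

	/* Hypothetical filesystem block allocator, declared only for the sketch. */
	static int example_do_alloc(struct inode *inode, unsigned int count);

	static int example_alloc_blocks(struct inode *inode, unsigned int count)
	{
		int err;

		/* Charge the blocks to quota; the helper returns 0 or an errno. */
		err = dquot_alloc_block(inode, count);
		if (err)
			return err;	/* no hard-coded -EDQUOT */

		err = example_do_alloc(inode, count);
		if (err)
			dquot_free_block(inode, count);	/* undo the quota charge */
		return err;
	}
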
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c index 7f8d2e5a7ea..1d081f0cfec 100644 --- a/fs/ext2/balloc.c +++ b/fs/ext2/balloc.c @@ -570,7 +570,7 @@ do_more: error_return: brelse(bitmap_bh); release_blocks(sb, freed); - vfs_dq_free_block(inode, freed); + dquot_free_block(inode, freed); } /** @@ -1236,6 +1236,7 @@ ext2_fsblk_t ext2_new_blocks(struct inode *inode, ext2_fsblk_t goal, unsigned short windowsz = 0; unsigned long ngroups; unsigned long num = *count; + int ret; *errp = -ENOSPC; sb = inode->i_sb; @@ -1247,8 +1248,9 @@ ext2_fsblk_t ext2_new_blocks(struct inode *inode, ext2_fsblk_t goal, /* * Check quota for allocation of this block. */ - if (vfs_dq_alloc_block(inode, num)) { - *errp = -EDQUOT; + ret = dquot_alloc_block(inode, num); + if (ret) { + *errp = ret; return 0; } @@ -1409,7 +1411,7 @@ allocated: *errp = 0; brelse(bitmap_bh); - vfs_dq_free_block(inode, *count-num); + dquot_free_block(inode, *count-num); *count = num; return ret_block; @@ -1420,7 +1422,7 @@ out: * Undo the block allocation */ if (!performed_allocation) - vfs_dq_free_block(inode, *count); + dquot_free_block(inode, *count); brelse(bitmap_bh); return 0; } diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index 904f00642f8..e44dc92609b 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -644,8 +644,8 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, the inode. */ ea_bdebug(new_bh, "reusing block"); - error = -EDQUOT; - if (vfs_dq_alloc_block(inode, 1)) { + error = dquot_alloc_block(inode, 1); + if (error) { unlock_buffer(new_bh); goto cleanup; } @@ -702,7 +702,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, * as if nothing happened and cleanup the unused block */ if (error && error != -ENOSPC) { if (new_bh && new_bh != old_bh) - vfs_dq_free_block(inode, 1); + dquot_free_block(inode, 1); goto cleanup; } } else @@ -734,7 +734,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, le32_add_cpu(&HDR(old_bh)->h_refcount, -1); if (ce) mb_cache_entry_release(ce); - vfs_dq_free_block(inode, 1); + dquot_free_block(inode, 1); mark_buffer_dirty(old_bh); ea_bdebug(old_bh, "refcount now=%d", le32_to_cpu(HDR(old_bh)->h_refcount)); @@ -797,7 +797,7 @@ ext2_xattr_delete_inode(struct inode *inode) mark_buffer_dirty(bh); if (IS_SYNC(inode)) sync_dirty_buffer(bh); - vfs_dq_free_block(inode, 1); + dquot_free_block(inode, 1); } EXT2_I(inode)->i_file_acl = 0; diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c index 27967f92e82..161da2d3f89 100644 --- a/fs/ext3/balloc.c +++ b/fs/ext3/balloc.c @@ -676,7 +676,7 @@ void ext3_free_blocks(handle_t *handle, struct inode *inode, } ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks); if (dquot_freed_blocks) - vfs_dq_free_block(inode, dquot_freed_blocks); + dquot_free_block(inode, dquot_freed_blocks); return; } @@ -1502,8 +1502,9 @@ ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode, /* * Check quota for allocation of this block. 
*/ - if (vfs_dq_alloc_block(inode, num)) { - *errp = -EDQUOT; + err = dquot_alloc_block(inode, num); + if (err) { + *errp = err; return 0; } @@ -1713,7 +1714,7 @@ allocated: *errp = 0; brelse(bitmap_bh); - vfs_dq_free_block(inode, *count-num); + dquot_free_block(inode, *count-num); *count = num; return ret_block; @@ -1728,7 +1729,7 @@ out: * Undo the block allocation */ if (!performed_allocation) - vfs_dq_free_block(inode, *count); + dquot_free_block(inode, *count); brelse(bitmap_bh); return 0; } diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index eda9121d7d5..20f02d69365 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -3336,7 +3336,7 @@ int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode) * i_size has been changed by generic_commit_write() and we thus need * to include the updated inode in the current transaction. * - * Also, vfs_dq_alloc_space() will always dirty the inode when blocks + * Also, dquot_alloc_space() will always dirty the inode when blocks * are allocated to the file. * * If the inode is marked synchronous, we don't honour that here - doing diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 5c54e5f685d..8c13910a378 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -752,9 +752,7 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type, static const struct dquot_operations ext3_quota_operations = { .initialize = dquot_initialize, .drop = dquot_drop, - .alloc_space = dquot_alloc_space, .alloc_inode = dquot_alloc_inode, - .free_space = dquot_free_space, .free_inode = dquot_free_inode, .transfer = dquot_transfer, .write_dquot = ext3_write_dquot, diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c index 2d2fb2a8596..534a94c3a93 100644 --- a/fs/ext3/xattr.c +++ b/fs/ext3/xattr.c @@ -500,7 +500,7 @@ ext3_xattr_release_block(handle_t *handle, struct inode *inode, error = ext3_journal_dirty_metadata(handle, bh); if (IS_SYNC(inode)) handle->h_sync = 1; - vfs_dq_free_block(inode, 1); + dquot_free_block(inode, 1); ea_bdebug(bh, "refcount now=%d; releasing", le32_to_cpu(BHDR(bh)->h_refcount)); if (ce) @@ -775,8 +775,8 @@ inserted: else { /* The old block is released after updating the inode. 
*/ - error = -EDQUOT; - if (vfs_dq_alloc_block(inode, 1)) + error = dquot_alloc_block(inode, 1); + if (error) goto cleanup; error = ext3_journal_get_write_access(handle, new_bh); @@ -850,7 +850,7 @@ cleanup: return error; cleanup_dquot: - vfs_dq_free_block(inode, 1); + dquot_free_block(inode, 1); goto cleanup; bad_block: diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index e11952404e0..9f607ea411c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1093,9 +1093,9 @@ void ext4_da_update_reserve_space(struct inode *inode, /* Update quota subsystem */ if (quota_claim) { - vfs_dq_claim_block(inode, used); + dquot_claim_block(inode, used); if (mdb_free) - vfs_dq_release_reservation_block(inode, mdb_free); + dquot_release_reservation_block(inode, mdb_free); } else { /* * We did fallocate with an offset that is already delayed @@ -1106,8 +1106,8 @@ void ext4_da_update_reserve_space(struct inode *inode, * that */ if (allocated_meta_blocks) - vfs_dq_claim_block(inode, allocated_meta_blocks); - vfs_dq_release_reservation_block(inode, mdb_free + used); + dquot_claim_block(inode, allocated_meta_blocks); + dquot_release_reservation_block(inode, mdb_free + used); } /* @@ -1836,6 +1836,7 @@ static int ext4_da_reserve_space(struct inode *inode, sector_t lblock) struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); unsigned long md_needed, md_reserved; + int ret; /* * recalculate the amount of metadata blocks to reserve @@ -1853,11 +1854,12 @@ repeat: * later. Real quota accounting is done at pages writeout * time. */ - if (vfs_dq_reserve_block(inode, md_needed + 1)) - return -EDQUOT; + ret = dquot_reserve_block(inode, md_needed + 1); + if (ret) + return ret; if (ext4_claim_free_blocks(sbi, md_needed + 1)) { - vfs_dq_release_reservation_block(inode, md_needed + 1); + dquot_release_reservation_block(inode, md_needed + 1); if (ext4_should_retry_alloc(inode->i_sb, &retries)) { yield(); goto repeat; @@ -1914,7 +1916,7 @@ static void ext4_da_release_space(struct inode *inode, int to_free) spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); - vfs_dq_release_reservation_block(inode, to_free); + dquot_release_reservation_block(inode, to_free); } static void ext4_da_page_release_reservation(struct page *page, @@ -5641,7 +5643,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) * i_size has been changed by generic_commit_write() and we thus need * to include the updated inode in the current transaction. * - * Also, vfs_dq_alloc_block() will always dirty the inode when blocks + * Also, dquot_alloc_block() will always dirty the inode when blocks * are allocated to the file. 
* * If the inode is marked synchronous, we don't honour that here - doing diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index d34afad3e13..0b905781e8e 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4254,7 +4254,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, return 0; } reserv_blks = ar->len; - while (ar->len && vfs_dq_alloc_block(ar->inode, ar->len)) { + while (ar->len && dquot_alloc_block(ar->inode, ar->len)) { ar->flags |= EXT4_MB_HINT_NOPREALLOC; ar->len--; } @@ -4331,7 +4331,7 @@ out2: kmem_cache_free(ext4_ac_cachep, ac); out1: if (inquota && ar->len < inquota) - vfs_dq_free_block(ar->inode, inquota - ar->len); + dquot_free_block(ar->inode, inquota - ar->len); out3: if (!ar->len) { if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) @@ -4646,7 +4646,7 @@ do_more: sb->s_dirt = 1; error_return: if (freed) - vfs_dq_free_block(inode, freed); + dquot_free_block(inode, freed); brelse(bitmap_bh); ext4_std_error(sb, err); if (ac) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 735c20d5fd5..fa8f4deda65 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1014,15 +1014,10 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, static const struct dquot_operations ext4_quota_operations = { .initialize = dquot_initialize, .drop = dquot_drop, - .alloc_space = dquot_alloc_space, - .reserve_space = dquot_reserve_space, - .claim_space = dquot_claim_space, - .release_rsv = dquot_release_reserved_space, #ifdef CONFIG_QUOTA .get_reserved_space = ext4_get_reserved_space, #endif .alloc_inode = dquot_alloc_inode, - .free_space = dquot_free_space, .free_inode = dquot_free_inode, .transfer = dquot_transfer, .write_dquot = ext4_write_dquot, diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index f3a2f7ed45a..ab3a95ee5e7 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -494,7 +494,7 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, error = ext4_handle_dirty_metadata(handle, inode, bh); if (IS_SYNC(inode)) ext4_handle_sync(handle); - vfs_dq_free_block(inode, 1); + dquot_free_block(inode, 1); ea_bdebug(bh, "refcount now=%d; releasing", le32_to_cpu(BHDR(bh)->h_refcount)); if (ce) @@ -787,8 +787,8 @@ inserted: else { /* The old block is released after updating the inode. */ - error = -EDQUOT; - if (vfs_dq_alloc_block(inode, 1)) + error = dquot_alloc_block(inode, 1); + if (error) goto cleanup; error = ext4_journal_get_write_access(handle, new_bh); @@ -876,7 +876,7 @@ cleanup: return error; cleanup_dquot: - vfs_dq_free_block(inode, 1); + dquot_free_block(inode, 1); goto cleanup; bad_block: diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c index 925871e9887..0e4623be70c 100644 --- a/fs/jfs/jfs_dtree.c +++ b/fs/jfs/jfs_dtree.c @@ -381,10 +381,10 @@ static u32 add_index(tid_t tid, struct inode *ip, s64 bn, int slot) * It's time to move the inline table to an external * page and begin to build the xtree */ - if (vfs_dq_alloc_block(ip, sbi->nbperpage)) + if (dquot_alloc_block(ip, sbi->nbperpage)) goto clean_up; if (dbAlloc(ip, 0, sbi->nbperpage, &xaddr)) { - vfs_dq_free_block(ip, sbi->nbperpage); + dquot_free_block(ip, sbi->nbperpage); goto clean_up; } @@ -408,7 +408,7 @@ static u32 add_index(tid_t tid, struct inode *ip, s64 bn, int slot) memcpy(&jfs_ip->i_dirtable, temp_table, sizeof (temp_table)); dbFree(ip, xaddr, sbi->nbperpage); - vfs_dq_free_block(ip, sbi->nbperpage); + dquot_free_block(ip, sbi->nbperpage); goto clean_up; } ip->i_size = PSIZE; @@ -1027,10 +1027,9 @@ static int dtSplitUp(tid_t tid, n = xlen; /* Allocate blocks to quota. 
*/ - if (vfs_dq_alloc_block(ip, n)) { - rc = -EDQUOT; + rc = dquot_alloc_block(ip, n); + if (rc) goto extendOut; - } quota_allocation += n; if ((rc = dbReAlloc(sbi->ipbmap, xaddr, (s64) xlen, @@ -1308,7 +1307,7 @@ static int dtSplitUp(tid_t tid, /* Rollback quota allocation */ if (rc && quota_allocation) - vfs_dq_free_block(ip, quota_allocation); + dquot_free_block(ip, quota_allocation); dtSplitUp_Exit: @@ -1369,9 +1368,10 @@ static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split, return -EIO; /* Allocate blocks to quota. */ - if (vfs_dq_alloc_block(ip, lengthPXD(pxd))) { + rc = dquot_alloc_block(ip, lengthPXD(pxd)); + if (rc) { release_metapage(rmp); - return -EDQUOT; + return rc; } jfs_info("dtSplitPage: ip:0x%p smp:0x%p rmp:0x%p", ip, smp, rmp); @@ -1892,6 +1892,7 @@ static int dtSplitRoot(tid_t tid, struct dt_lock *dtlck; struct tlock *tlck; struct lv *lv; + int rc; /* get split root page */ smp = split->mp; @@ -1916,9 +1917,10 @@ static int dtSplitRoot(tid_t tid, rp = rmp->data; /* Allocate blocks to quota. */ - if (vfs_dq_alloc_block(ip, lengthPXD(pxd))) { + rc = dquot_alloc_block(ip, lengthPXD(pxd)); + if (rc) { release_metapage(rmp); - return -EDQUOT; + return rc; } BT_MARK_DIRTY(rmp, ip); @@ -2287,7 +2289,7 @@ static int dtDeleteUp(tid_t tid, struct inode *ip, xlen = lengthPXD(&fp->header.self); /* Free quota allocation. */ - vfs_dq_free_block(ip, xlen); + dquot_free_block(ip, xlen); /* free/invalidate its buffer page */ discard_metapage(fmp); @@ -2363,7 +2365,7 @@ static int dtDeleteUp(tid_t tid, struct inode *ip, xlen = lengthPXD(&p->header.self); /* Free quota allocation */ - vfs_dq_free_block(ip, xlen); + dquot_free_block(ip, xlen); /* free/invalidate its buffer page */ discard_metapage(mp); diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c index 41d6045dbeb..5d3bbd10f8d 100644 --- a/fs/jfs/jfs_extent.c +++ b/fs/jfs/jfs_extent.c @@ -141,10 +141,11 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr) } /* Allocate blocks to quota. */ - if (vfs_dq_alloc_block(ip, nxlen)) { + rc = dquot_alloc_block(ip, nxlen); + if (rc) { dbFree(ip, nxaddr, (s64) nxlen); mutex_unlock(&JFS_IP(ip)->commit_mutex); - return -EDQUOT; + return rc; } /* determine the value of the extent flag */ @@ -164,7 +165,7 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr) */ if (rc) { dbFree(ip, nxaddr, nxlen); - vfs_dq_free_block(ip, nxlen); + dquot_free_block(ip, nxlen); mutex_unlock(&JFS_IP(ip)->commit_mutex); return (rc); } @@ -256,10 +257,11 @@ int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, bool abnr) goto exit; /* Allocat blocks to quota. 
*/ - if (vfs_dq_alloc_block(ip, nxlen)) { + rc = dquot_alloc_block(ip, nxlen); + if (rc) { dbFree(ip, nxaddr, (s64) nxlen); mutex_unlock(&JFS_IP(ip)->commit_mutex); - return -EDQUOT; + return rc; } delta = nxlen - xlen; @@ -297,7 +299,7 @@ int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, bool abnr) /* extend the extent */ if ((rc = xtExtend(0, ip, xoff + xlen, (int) nextend, 0))) { dbFree(ip, xaddr + xlen, delta); - vfs_dq_free_block(ip, nxlen); + dquot_free_block(ip, nxlen); goto exit; } } else { @@ -308,7 +310,7 @@ int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, bool abnr) */ if ((rc = xtTailgate(0, ip, xoff, (int) ntail, nxaddr, 0))) { dbFree(ip, nxaddr, nxlen); - vfs_dq_free_block(ip, nxlen); + dquot_free_block(ip, nxlen); goto exit; } } diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c index d654a645864..6c50871e622 100644 --- a/fs/jfs/jfs_xtree.c +++ b/fs/jfs/jfs_xtree.c @@ -585,10 +585,10 @@ int xtInsert(tid_t tid, /* transaction id */ hint = addressXAD(xad) + lengthXAD(xad) - 1; } else hint = 0; - if ((rc = vfs_dq_alloc_block(ip, xlen))) + if ((rc = dquot_alloc_block(ip, xlen))) goto out; if ((rc = dbAlloc(ip, hint, (s64) xlen, &xaddr))) { - vfs_dq_free_block(ip, xlen); + dquot_free_block(ip, xlen); goto out; } } @@ -617,7 +617,7 @@ int xtInsert(tid_t tid, /* transaction id */ /* undo data extent allocation */ if (*xaddrp == 0) { dbFree(ip, xaddr, (s64) xlen); - vfs_dq_free_block(ip, xlen); + dquot_free_block(ip, xlen); } return rc; } @@ -985,10 +985,9 @@ xtSplitPage(tid_t tid, struct inode *ip, rbn = addressPXD(pxd); /* Allocate blocks to quota. */ - if (vfs_dq_alloc_block(ip, lengthPXD(pxd))) { - rc = -EDQUOT; + rc = dquot_alloc_block(ip, lengthPXD(pxd)); + if (rc) goto clean_up; - } quota_allocation += lengthPXD(pxd); @@ -1195,7 +1194,7 @@ xtSplitPage(tid_t tid, struct inode *ip, /* Rollback quota allocation. */ if (quota_allocation) - vfs_dq_free_block(ip, quota_allocation); + dquot_free_block(ip, quota_allocation); return (rc); } @@ -1235,6 +1234,7 @@ xtSplitRoot(tid_t tid, struct pxdlist *pxdlist; struct tlock *tlck; struct xtlock *xtlck; + int rc; sp = &JFS_IP(ip)->i_xtroot; @@ -1252,9 +1252,10 @@ xtSplitRoot(tid_t tid, return -EIO; /* Allocate blocks to quota. */ - if (vfs_dq_alloc_block(ip, lengthPXD(pxd))) { + rc = dquot_alloc_block(ip, lengthPXD(pxd)); + if (rc) { release_metapage(rmp); - return -EDQUOT; + return rc; } jfs_info("xtSplitRoot: ip:0x%p rmp:0x%p", ip, rmp); @@ -3680,7 +3681,7 @@ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag) ip->i_size = newsize; /* update quota allocation to reflect freed blocks */ - vfs_dq_free_block(ip, nfreed); + dquot_free_block(ip, nfreed); /* * free tlock of invalidated pages diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index fad364548bc..1f594ab2189 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c @@ -260,14 +260,14 @@ static int ea_write(struct inode *ip, struct jfs_ea_list *ealist, int size, nblocks = (size + (sb->s_blocksize - 1)) >> sb->s_blocksize_bits; /* Allocate new blocks to quota. */ - if (vfs_dq_alloc_block(ip, nblocks)) { - return -EDQUOT; - } + rc = dquot_alloc_block(ip, nblocks); + if (rc) + return rc; rc = dbAlloc(ip, INOHINT(ip), nblocks, &blkno); if (rc) { /*Rollback quota allocation. */ - vfs_dq_free_block(ip, nblocks); + dquot_free_block(ip, nblocks); return rc; } @@ -332,7 +332,7 @@ static int ea_write(struct inode *ip, struct jfs_ea_list *ealist, int size, failed: /* Rollback quota allocation. 
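Across the jfs hunks the conversion is the same mechanical change as in ext3/ext4: the caller stops assuming -EDQUOT and propagates whatever the quota layer returned, and every error path that used to call vfs_dq_free_block() now rolls back with dquot_free_block(). A minimal sketch of the converted shape, with myfs_alloc_blocks() standing in for the filesystem's own allocator (dbAlloc() and friends in jfs):

static int myfs_grow_file(struct inode *inode, int nblocks)
{
	int rc;

	/* Charge quota first; 0 on success, a negative errno (typically -EDQUOT). */
	rc = dquot_alloc_block(inode, nblocks);
	if (rc)
		return rc;

	rc = myfs_alloc_blocks(inode, nblocks);		/* placeholder for the real allocation */
	if (rc) {
		dquot_free_block(inode, nblocks);	/* undo the quota charge on failure */
		return rc;
	}
	return 0;
}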
*/ - vfs_dq_free_block(ip, nblocks); + dquot_free_block(ip, nblocks); dbFree(ip, blkno, nblocks); return rc; @@ -538,7 +538,8 @@ static int ea_get(struct inode *inode, struct ea_buffer *ea_buf, int min_size) if (blocks_needed > current_blocks) { /* Allocate new blocks to quota. */ - if (vfs_dq_alloc_block(inode, blocks_needed)) + rc = dquot_alloc_block(inode, blocks_needed); + if (rc) return -EDQUOT; quota_allocation = blocks_needed; @@ -602,7 +603,7 @@ static int ea_get(struct inode *inode, struct ea_buffer *ea_buf, int min_size) clean_up: /* Rollback quota allocation */ if (quota_allocation) - vfs_dq_free_block(inode, quota_allocation); + dquot_free_block(inode, quota_allocation); return (rc); } @@ -677,7 +678,7 @@ static int ea_put(tid_t tid, struct inode *inode, struct ea_buffer *ea_buf, /* If old blocks exist, they must be removed from quota allocation. */ if (old_blocks) - vfs_dq_free_block(inode, old_blocks); + dquot_free_block(inode, old_blocks); inode->i_ctime = CURRENT_TIME; diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index d17bdc718f7..20538dd832a 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -5712,7 +5712,7 @@ int ocfs2_remove_btree_range(struct inode *inode, goto out; } - vfs_dq_free_space_nodirty(inode, + dquot_free_space_nodirty(inode, ocfs2_clusters_to_bytes(inode->i_sb, len)); ret = ocfs2_remove_extent(handle, et, cpos, len, meta_ac, dealloc); @@ -6935,7 +6935,7 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb, goto bail; } - vfs_dq_free_space_nodirty(inode, + dquot_free_space_nodirty(inode, ocfs2_clusters_to_bytes(osb->sb, clusters_to_del)); spin_lock(&OCFS2_I(inode)->ip_lock); OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) - @@ -7300,11 +7300,10 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, unsigned int page_end; u64 phys; - if (vfs_dq_alloc_space_nodirty(inode, - ocfs2_clusters_to_bytes(osb->sb, 1))) { - ret = -EDQUOT; + ret = dquot_alloc_space_nodirty(inode, + ocfs2_clusters_to_bytes(osb->sb, 1)); + if (ret) goto out_commit; - } did_quota = 1; ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, @@ -7380,7 +7379,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, out_commit: if (ret < 0 && did_quota) - vfs_dq_free_space_nodirty(inode, + dquot_free_space_nodirty(inode, ocfs2_clusters_to_bytes(osb->sb, 1)); ocfs2_commit_trans(osb, handle); diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 7e9df11260f..7d04c171567 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -1763,10 +1763,11 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, wc->w_handle = handle; - if (clusters_to_alloc && vfs_dq_alloc_space_nodirty(inode, - ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc))) { - ret = -EDQUOT; - goto out_commit; + if (clusters_to_alloc) { + ret = dquot_alloc_space_nodirty(inode, + ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc)); + if (ret) + goto out_commit; } /* * We don't want this to fail in ocfs2_write_end(), so do it @@ -1809,7 +1810,7 @@ success: return 0; out_quota: if (clusters_to_alloc) - vfs_dq_free_space(inode, + dquot_free_space(inode, ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc)); out_commit: ocfs2_commit_trans(osb, handle); diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 28c3ec23879..a63ea4d74e6 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -2964,12 +2964,10 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, goto out; } - if (vfs_dq_alloc_space_nodirty(dir, - ocfs2_clusters_to_bytes(osb->sb, - alloc + dx_alloc))) { - 
ret = -EDQUOT; + ret = dquot_alloc_space_nodirty(dir, + ocfs2_clusters_to_bytes(osb->sb, alloc + dx_alloc)); + if (ret) goto out_commit; - } did_quota = 1; if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) { @@ -3178,7 +3176,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, out_commit: if (ret < 0 && did_quota) - vfs_dq_free_space_nodirty(dir, bytes_allocated); + dquot_free_space_nodirty(dir, bytes_allocated); ocfs2_commit_trans(osb, handle); @@ -3221,11 +3219,10 @@ static int ocfs2_do_extend_dir(struct super_block *sb, if (extend) { u32 offset = OCFS2_I(dir)->ip_clusters; - if (vfs_dq_alloc_space_nodirty(dir, - ocfs2_clusters_to_bytes(sb, 1))) { - status = -EDQUOT; + status = dquot_alloc_space_nodirty(dir, + ocfs2_clusters_to_bytes(sb, 1)); + if (status) goto bail; - } did_quota = 1; status = ocfs2_add_inode_data(OCFS2_SB(sb), dir, &offset, @@ -3254,7 +3251,7 @@ static int ocfs2_do_extend_dir(struct super_block *sb, status = 0; bail: if (did_quota && status < 0) - vfs_dq_free_space_nodirty(dir, ocfs2_clusters_to_bytes(sb, 1)); + dquot_free_space_nodirty(dir, ocfs2_clusters_to_bytes(sb, 1)); mlog_exit(status); return status; } @@ -3889,11 +3886,10 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir, goto out; } - if (vfs_dq_alloc_space_nodirty(dir, - ocfs2_clusters_to_bytes(dir->i_sb, 1))) { - ret = -EDQUOT; + ret = dquot_alloc_space_nodirty(dir, + ocfs2_clusters_to_bytes(dir->i_sb, 1)); + if (ret) goto out_commit; - } did_quota = 1; ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), dx_leaf_bh, @@ -3983,7 +3979,7 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir, out_commit: if (ret < 0 && did_quota) - vfs_dq_free_space_nodirty(dir, + dquot_free_space_nodirty(dir, ocfs2_clusters_to_bytes(dir->i_sb, 1)); ocfs2_commit_trans(osb, handle); @@ -4165,11 +4161,10 @@ static int ocfs2_expand_inline_dx_root(struct inode *dir, goto out; } - if (vfs_dq_alloc_space_nodirty(dir, - ocfs2_clusters_to_bytes(osb->sb, 1))) { - ret = -EDQUOT; + ret = dquot_alloc_space_nodirty(dir, + ocfs2_clusters_to_bytes(osb->sb, 1)); + if (ret) goto out_commit; - } did_quota = 1; /* @@ -4229,7 +4224,7 @@ static int ocfs2_expand_inline_dx_root(struct inode *dir, out_commit: if (ret < 0 && did_quota) - vfs_dq_free_space_nodirty(dir, + dquot_free_space_nodirty(dir, ocfs2_clusters_to_bytes(dir->i_sb, 1)); ocfs2_commit_trans(osb, handle); diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 558ce031242..6cf3d8d1836 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -629,11 +629,10 @@ restart_all: } restarted_transaction: - if (vfs_dq_alloc_space_nodirty(inode, ocfs2_clusters_to_bytes(osb->sb, - clusters_to_add))) { - status = -EDQUOT; + status = dquot_alloc_space_nodirty(inode, + ocfs2_clusters_to_bytes(osb->sb, clusters_to_add)); + if (status) goto leave; - } did_quota = 1; /* reserve a write to the file entry early on - that we if we @@ -674,7 +673,7 @@ restarted_transaction: clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters); spin_unlock(&OCFS2_I(inode)->ip_lock); /* Release unused quota reservation */ - vfs_dq_free_space(inode, + dquot_free_space(inode, ocfs2_clusters_to_bytes(osb->sb, clusters_to_add)); did_quota = 0; @@ -710,7 +709,7 @@ restarted_transaction: leave: if (status < 0 && did_quota) - vfs_dq_free_space(inode, + dquot_free_space(inode, ocfs2_clusters_to_bytes(osb->sb, clusters_to_add)); if (handle) { ocfs2_commit_trans(osb, handle); diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 
50fb26a6a5f..13adaa1f40c 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -1716,11 +1716,10 @@ static int ocfs2_symlink(struct inode *dir, u32 offset = 0; inode->i_op = &ocfs2_symlink_inode_operations; - if (vfs_dq_alloc_space_nodirty(inode, - ocfs2_clusters_to_bytes(osb->sb, 1))) { - status = -EDQUOT; + status = dquot_alloc_space_nodirty(inode, + ocfs2_clusters_to_bytes(osb->sb, 1)); + if (status) goto bail; - } did_quota = 1; status = ocfs2_add_inode_data(osb, inode, &offset, 1, 0, new_fe_bh, @@ -1788,7 +1787,7 @@ static int ocfs2_symlink(struct inode *dir, d_instantiate(dentry, inode); bail: if (status < 0 && did_quota) - vfs_dq_free_space_nodirty(inode, + dquot_free_space_nodirty(inode, ocfs2_clusters_to_bytes(osb->sb, 1)); if (status < 0 && did_quota_inode) vfs_dq_free_inode(inode); diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index b437dc0c4ca..aa66fb27722 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -853,9 +853,7 @@ static void ocfs2_destroy_dquot(struct dquot *dquot) const struct dquot_operations ocfs2_quota_operations = { .initialize = dquot_initialize, .drop = dquot_drop, - .alloc_space = dquot_alloc_space, .alloc_inode = dquot_alloc_inode, - .free_space = dquot_free_space, .free_inode = dquot_free_inode, .transfer = dquot_transfer, .write_dquot = ocfs2_write_dquot, diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 10d021dd37c..baf202c012c 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -1464,28 +1464,29 @@ static void inode_decr_space(struct inode *inode, qsize_t number, int reserve) } /* - * Following four functions update i_blocks+i_bytes fields and - * quota information (together with appropriate checks) - * NOTE: We absolutely rely on the fact that caller dirties - * the inode (usually macros in quotaops.h care about this) and - * holds a handle for the current transaction so that dquot write and - * inode write go into the same transaction. + * This functions updates i_blocks+i_bytes fields and quota information + * (together with appropriate checks). + * + * NOTE: We absolutely rely on the fact that caller dirties the inode + * (usually helpers in quotaops.h care about this) and holds a handle for + * the current transaction so that dquot write and inode write go into the + * same transaction. 
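The two low-level entry points are what the rest of the series is built on: warn chooses whether a quota warning is generated when the check fails, and reserve chooses an in-memory reservation (for delayed allocation) instead of an immediate charge. The named helpers the filesystems call are just a block-size shift plus a flag choice; condensed from the quotaops.h hunks later in this patch:

/* warn=1, reserve=0: charge nr bytes immediately (the common case). */
static inline int dquot_alloc_space_nodirty(struct inode *inode, qsize_t nr)
{
	return __dquot_alloc_space(inode, nr, 1, 0);
}

/* warn=0: preallocation variant that fails quietly instead of warning. */
static inline int dquot_prealloc_block_nodirty(struct inode *inode, qsize_t nr)
{
	return __dquot_alloc_space(inode, nr << inode->i_blkbits, 0, 0);
}

/* reserve=1: only reserve quota in memory, to be claimed or released later. */
static inline int dquot_reserve_block(struct inode *inode, qsize_t nr)
{
	return __dquot_alloc_space(inode, nr << inode->i_blkbits, 1, 1);
}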
*/ /* * This operation can block, but only after everything is updated */ int __dquot_alloc_space(struct inode *inode, qsize_t number, - int warn, int reserve) + int warn, int reserve) { - int cnt, ret = QUOTA_OK; + int cnt, ret = 0; char warntype[MAXQUOTAS]; /* * First test before acquiring mutex - solves deadlocks when we * re-enter the quota code and are already holding the mutex */ - if (IS_NOQUOTA(inode)) { + if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) { inode_incr_space(inode, number, reserve); goto out; } @@ -1498,9 +1499,9 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (!inode->i_dquot[cnt]) continue; - if (check_bdq(inode->i_dquot[cnt], number, warn, warntype+cnt) - == NO_QUOTA) { - ret = NO_QUOTA; + if (check_bdq(inode->i_dquot[cnt], number, !warn, warntype+cnt) + == NO_QUOTA) { + ret = -EDQUOT; spin_unlock(&dq_data_lock); goto out_flush_warn; } @@ -1525,18 +1526,7 @@ out_flush_warn: out: return ret; } - -int dquot_alloc_space(struct inode *inode, qsize_t number, int warn) -{ - return __dquot_alloc_space(inode, number, warn, 0); -} -EXPORT_SYMBOL(dquot_alloc_space); - -int dquot_reserve_space(struct inode *inode, qsize_t number, int warn) -{ - return __dquot_alloc_space(inode, number, warn, 1); -} -EXPORT_SYMBOL(dquot_reserve_space); +EXPORT_SYMBOL(__dquot_alloc_space); /* * This operation can block, but only after everything is updated @@ -1578,14 +1568,16 @@ warn_put_all: } EXPORT_SYMBOL(dquot_alloc_inode); -int dquot_claim_space(struct inode *inode, qsize_t number) +/* + * Convert in-memory reserved quotas to real consumed quotas + */ +int dquot_claim_space_nodirty(struct inode *inode, qsize_t number) { int cnt; - int ret = QUOTA_OK; - if (IS_NOQUOTA(inode)) { + if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) { inode_claim_rsv_space(inode, number); - goto out; + return 0; } down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); @@ -1601,24 +1593,23 @@ int dquot_claim_space(struct inode *inode, qsize_t number) spin_unlock(&dq_data_lock); mark_all_dquot_dirty(inode->i_dquot); up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); -out: - return ret; + return 0; } -EXPORT_SYMBOL(dquot_claim_space); +EXPORT_SYMBOL(dquot_claim_space_nodirty); /* * This operation can block, but only after everything is updated */ -int __dquot_free_space(struct inode *inode, qsize_t number, int reserve) +void __dquot_free_space(struct inode *inode, qsize_t number, int reserve) { unsigned int cnt; char warntype[MAXQUOTAS]; /* First test before acquiring mutex - solves deadlocks when we * re-enter the quota code and are already holding the mutex */ - if (IS_NOQUOTA(inode)) { + if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) { inode_decr_space(inode, number, reserve); - return QUOTA_OK; + return; } down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); @@ -1641,24 +1632,8 @@ int __dquot_free_space(struct inode *inode, qsize_t number, int reserve) out_unlock: flush_warnings(inode->i_dquot, warntype); up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); - return QUOTA_OK; -} - -int dquot_free_space(struct inode *inode, qsize_t number) -{ - return __dquot_free_space(inode, number, 0); -} -EXPORT_SYMBOL(dquot_free_space); - -/* - * Release reserved quota space - */ -void dquot_release_reserved_space(struct inode *inode, qsize_t number) -{ - __dquot_free_space(inode, number, 1); - } -EXPORT_SYMBOL(dquot_release_reserved_space); +EXPORT_SYMBOL(__dquot_free_space); /* * This operation can block, but only after everything is updated @@ -1840,9 
+1815,7 @@ EXPORT_SYMBOL(dquot_commit_info); const struct dquot_operations dquot_operations = { .initialize = dquot_initialize, .drop = dquot_drop, - .alloc_space = dquot_alloc_space, .alloc_inode = dquot_alloc_inode, - .free_space = dquot_free_space, .free_inode = dquot_free_inode, .transfer = dquot_transfer, .write_dquot = dquot_commit, diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c index 65c87276117..dc014f7def0 100644 --- a/fs/reiserfs/bitmap.c +++ b/fs/reiserfs/bitmap.c @@ -425,7 +425,7 @@ static void _reiserfs_free_block(struct reiserfs_transaction_handle *th, journal_mark_dirty(th, s, sbh); if (for_unformatted) - vfs_dq_free_block_nodirty(inode, 1); + dquot_free_block_nodirty(inode, 1); } void reiserfs_free_block(struct reiserfs_transaction_handle *th, @@ -1049,7 +1049,7 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start amount_needed, hint->inode->i_uid); #endif quota_ret = - vfs_dq_alloc_block_nodirty(hint->inode, amount_needed); + dquot_alloc_block_nodirty(hint->inode, amount_needed); if (quota_ret) /* Quota exceeded? */ return QUOTA_EXCEEDED; if (hint->preallocate && hint->prealloc_size) { @@ -1058,7 +1058,7 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start "reiserquota: allocating (prealloc) %d blocks id=%u", hint->prealloc_size, hint->inode->i_uid); #endif - quota_ret = vfs_dq_prealloc_block_nodirty(hint->inode, + quota_ret = dquot_prealloc_block_nodirty(hint->inode, hint->prealloc_size); if (quota_ret) hint->preallocate = hint->prealloc_size = 0; @@ -1092,7 +1092,7 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start hint->inode->i_uid); #endif /* Free not allocated blocks */ - vfs_dq_free_block_nodirty(hint->inode, + dquot_free_block_nodirty(hint->inode, amount_needed + hint->prealloc_size - nr_allocated); } @@ -1125,7 +1125,7 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start REISERFS_I(hint->inode)->i_prealloc_count, hint->inode->i_uid); #endif - vfs_dq_free_block_nodirty(hint->inode, amount_needed + + dquot_free_block_nodirty(hint->inode, amount_needed + hint->prealloc_size - nr_allocated - REISERFS_I(hint->inode)-> i_prealloc_count); diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c index 5fa7118f04e..313d39d639e 100644 --- a/fs/reiserfs/stree.c +++ b/fs/reiserfs/stree.c @@ -1299,7 +1299,7 @@ int reiserfs_delete_item(struct reiserfs_transaction_handle *th, "reiserquota delete_item(): freeing %u, id=%u type=%c", quota_cut_bytes, inode->i_uid, head2type(&s_ih)); #endif - vfs_dq_free_space_nodirty(inode, quota_cut_bytes); + dquot_free_space_nodirty(inode, quota_cut_bytes); /* Return deleted body length */ return ret_value; @@ -1383,7 +1383,7 @@ void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th, quota_cut_bytes, inode->i_uid, key2type(key)); #endif - vfs_dq_free_space_nodirty(inode, + dquot_free_space_nodirty(inode, quota_cut_bytes); } break; @@ -1733,7 +1733,7 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th, "reiserquota cut_from_item(): freeing %u id=%u type=%c", quota_cut_bytes, inode->i_uid, '?'); #endif - vfs_dq_free_space_nodirty(inode, quota_cut_bytes); + dquot_free_space_nodirty(inode, quota_cut_bytes); return ret_value; } @@ -1968,9 +1968,10 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree key2type(&(key->on_disk_key))); #endif - if (vfs_dq_alloc_space_nodirty(inode, pasted_size)) { + retval = dquot_alloc_space_nodirty(inode, pasted_size); + if (retval) { pathrelse(search_path); - return -EDQUOT; + return 
retval; } init_tb_struct(th, &s_paste_balance, th->t_super, search_path, pasted_size); @@ -2024,7 +2025,7 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree pasted_size, inode->i_uid, key2type(&(key->on_disk_key))); #endif - vfs_dq_free_space_nodirty(inode, pasted_size); + dquot_free_space_nodirty(inode, pasted_size); return retval; } @@ -2062,9 +2063,10 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th, #endif /* We can't dirty inode here. It would be immediately written but * appropriate stat item isn't inserted yet... */ - if (vfs_dq_alloc_space_nodirty(inode, quota_bytes)) { + retval = dquot_alloc_space_nodirty(inode, quota_bytes); + if (retval) { pathrelse(path); - return -EDQUOT; + return retval; } } init_tb_struct(th, &s_ins_balance, th->t_super, path, @@ -2113,6 +2115,6 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th, quota_bytes, inode->i_uid, head2type(ih)); #endif if (inode) - vfs_dq_free_space_nodirty(inode, quota_bytes); + dquot_free_space_nodirty(inode, quota_bytes); return retval; } diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index b4a7dd03bdb..ea4a77e9d7f 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -618,9 +618,7 @@ static int reiserfs_quota_on(struct super_block *, int, int, char *, int); static const struct dquot_operations reiserfs_quota_operations = { .initialize = dquot_initialize, .drop = dquot_drop, - .alloc_space = dquot_alloc_space, .alloc_inode = dquot_alloc_inode, - .free_space = dquot_free_space, .free_inode = dquot_free_inode, .transfer = dquot_transfer, .write_dquot = reiserfs_write_dquot, diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c index 82372e332f0..e2ff180173a 100644 --- a/fs/udf/balloc.c +++ b/fs/udf/balloc.c @@ -208,7 +208,7 @@ static void udf_bitmap_free_blocks(struct super_block *sb, ((char *)bh->b_data)[(bit + i) >> 3]); } else { if (inode) - vfs_dq_free_block(inode, 1); + dquot_free_block(inode, 1); udf_add_free_space(sb, sbi->s_partition, 1); } } @@ -260,11 +260,11 @@ static int udf_bitmap_prealloc_blocks(struct super_block *sb, while (bit < (sb->s_blocksize << 3) && block_count > 0) { if (!udf_test_bit(bit, bh->b_data)) goto out; - else if (vfs_dq_prealloc_block(inode, 1)) + else if (dquot_prealloc_block(inode, 1)) goto out; else if (!udf_clear_bit(bit, bh->b_data)) { udf_debug("bit already cleared for block %d\n", bit); - vfs_dq_free_block(inode, 1); + dquot_free_block(inode, 1); goto out; } block_count--; @@ -390,10 +390,14 @@ got_block: /* * Check quota for allocation of this block. */ - if (inode && vfs_dq_alloc_block(inode, 1)) { - mutex_unlock(&sbi->s_alloc_mutex); - *err = -EDQUOT; - return 0; + if (inode) { + int ret = dquot_alloc_block(inode, 1); + + if (ret) { + mutex_unlock(&sbi->s_alloc_mutex); + *err = ret; + return 0; + } } newblock = bit + (block_group << (sb->s_blocksize_bits + 3)) - @@ -449,7 +453,7 @@ static void udf_table_free_blocks(struct super_block *sb, /* We do this up front - There are some error conditions that could occure, but.. oh well */ if (inode) - vfs_dq_free_block(inode, count); + dquot_free_block(inode, count); udf_add_free_space(sb, sbi->s_partition, count); start = bloc->logicalBlockNum + offset; @@ -694,7 +698,7 @@ static int udf_table_prealloc_blocks(struct super_block *sb, epos.offset -= adsize; alloc_count = (elen >> sb->s_blocksize_bits); - if (inode && vfs_dq_prealloc_block(inode, + if (inode && dquot_prealloc_block(inode, alloc_count > block_count ? 
block_count : alloc_count)) alloc_count = 0; else if (alloc_count > block_count) { @@ -797,12 +801,13 @@ static int udf_table_new_block(struct super_block *sb, newblock = goal_eloc.logicalBlockNum; goal_eloc.logicalBlockNum++; goal_elen -= sb->s_blocksize; - - if (inode && vfs_dq_alloc_block(inode, 1)) { - brelse(goal_epos.bh); - mutex_unlock(&sbi->s_alloc_mutex); - *err = -EDQUOT; - return 0; + if (inode) { + *err = dquot_alloc_block(inode, 1); + if (*err) { + brelse(goal_epos.bh); + mutex_unlock(&sbi->s_alloc_mutex); + return 0; + } } if (goal_elen) diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c index 54c16ec95df..5cfa4d85ccf 100644 --- a/fs/ufs/balloc.c +++ b/fs/ufs/balloc.c @@ -85,7 +85,7 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count) "bit already cleared for fragment %u", i); } - vfs_dq_free_block(inode, count); + dquot_free_block(inode, count); fs32_add(sb, &ucg->cg_cs.cs_nffree, count); @@ -195,7 +195,7 @@ do_more: ubh_setblock(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno); if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) ufs_clusteracct (sb, ucpi, blkno, 1); - vfs_dq_free_block(inode, uspi->s_fpb); + dquot_free_block(inode, uspi->s_fpb); fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1); uspi->cs_total.cs_nbfree++; @@ -511,6 +511,7 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment, struct ufs_cg_private_info * ucpi; struct ufs_cylinder_group * ucg; unsigned cgno, fragno, fragoff, count, fragsize, i; + int ret; UFSD("ENTER, fragment %llu, oldcount %u, newcount %u\n", (unsigned long long)fragment, oldcount, newcount); @@ -556,8 +557,9 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment, fs32_add(sb, &ucg->cg_frsum[fragsize - count], 1); for (i = oldcount; i < newcount; i++) ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i); - if (vfs_dq_alloc_block(inode, count)) { - *err = -EDQUOT; + ret = dquot_alloc_block(inode, count); + if (ret) { + *err = ret; return 0; } @@ -596,6 +598,7 @@ static u64 ufs_alloc_fragments(struct inode *inode, unsigned cgno, struct ufs_cylinder_group * ucg; unsigned oldcg, i, j, k, allocsize; u64 result; + int ret; UFSD("ENTER, ino %lu, cgno %u, goal %llu, count %u\n", inode->i_ino, cgno, (unsigned long long)goal, count); @@ -664,7 +667,7 @@ cg_found: for (i = count; i < uspi->s_fpb; i++) ubh_setbit (UCPI_UBH(ucpi), ucpi->c_freeoff, goal + i); i = uspi->s_fpb - count; - vfs_dq_free_block(inode, i); + dquot_free_block(inode, i); fs32_add(sb, &ucg->cg_cs.cs_nffree, i); uspi->cs_total.cs_nffree += i; @@ -676,8 +679,9 @@ cg_found: result = ufs_bitmap_search (sb, ucpi, goal, allocsize); if (result == INVBLOCK) return 0; - if (vfs_dq_alloc_block(inode, count)) { - *err = -EDQUOT; + ret = dquot_alloc_block(inode, count); + if (ret) { + *err = ret; return 0; } for (i = 0; i < count; i++) @@ -714,6 +718,7 @@ static u64 ufs_alloccg_block(struct inode *inode, struct ufs_super_block_first * usb1; struct ufs_cylinder_group * ucg; u64 result, blkno; + int ret; UFSD("ENTER, goal %llu\n", (unsigned long long)goal); @@ -747,8 +752,9 @@ gotit: ubh_clrblock (UCPI_UBH(ucpi), ucpi->c_freeoff, blkno); if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) ufs_clusteracct (sb, ucpi, blkno, -1); - if (vfs_dq_alloc_block(inode, uspi->s_fpb)) { - *err = -EDQUOT; + ret = dquot_alloc_block(inode, uspi->s_fpb); + if (ret) { + *err = ret; return INVBLOCK; } diff --git a/include/linux/quota.h b/include/linux/quota.h index edf34f2fe87..1b14ad287fe 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -297,9 +297,7 @@ 
struct quota_format_ops { struct dquot_operations { int (*initialize) (struct inode *, int); int (*drop) (struct inode *); - int (*alloc_space) (struct inode *, qsize_t, int); int (*alloc_inode) (const struct inode *, qsize_t); - int (*free_space) (struct inode *, qsize_t); int (*free_inode) (const struct inode *, qsize_t); int (*transfer) (struct inode *, qid_t *, unsigned long); int (*write_dquot) (struct dquot *); /* Ordinary dquot write */ @@ -309,12 +307,6 @@ struct dquot_operations { int (*release_dquot) (struct dquot *); /* Quota is going to be deleted from disk */ int (*mark_dirty) (struct dquot *); /* Dquot is marked dirty */ int (*write_info) (struct super_block *, int); /* Write of quota "superblock" */ - /* reserve quota for delayed block allocation */ - int (*reserve_space) (struct inode *, qsize_t, int); - /* claim reserved quota for delayed alloc */ - int (*claim_space) (struct inode *, qsize_t); - /* release rsved quota for delayed alloc */ - void (*release_rsv) (struct inode *, qsize_t); /* get reserved quota for delayed alloc, value returned is managed by * quota code only */ qsize_t *(*get_reserved_space) (struct inode *); diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index e1cae204b5d..47e85682e11 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -33,14 +33,13 @@ int dquot_scan_active(struct super_block *sb, struct dquot *dquot_alloc(struct super_block *sb, int type); void dquot_destroy(struct dquot *dquot); -int dquot_alloc_space(struct inode *inode, qsize_t number, int prealloc); -int dquot_alloc_inode(const struct inode *inode, qsize_t number); +int __dquot_alloc_space(struct inode *inode, qsize_t number, + int warn, int reserve); +void __dquot_free_space(struct inode *inode, qsize_t number, int reserve); -int dquot_reserve_space(struct inode *inode, qsize_t number, int prealloc); -int dquot_claim_space(struct inode *inode, qsize_t number); -void dquot_release_reserved_space(struct inode *inode, qsize_t number); +int dquot_alloc_inode(const struct inode *inode, qsize_t number); -int dquot_free_space(struct inode *inode, qsize_t number); +int dquot_claim_space_nodirty(struct inode *inode, qsize_t number); int dquot_free_inode(const struct inode *inode, qsize_t number); int dquot_transfer(struct inode *inode, qid_t *chid, unsigned long mask); @@ -149,60 +148,6 @@ static inline void vfs_dq_init(struct inode *inode) inode->i_sb->dq_op->initialize(inode, -1); } -/* The following allocation/freeing/transfer functions *must* be called inside - * a transaction (deadlocks possible otherwise) */ -static inline int vfs_dq_prealloc_space_nodirty(struct inode *inode, qsize_t nr) -{ - if (sb_any_quota_active(inode->i_sb)) { - /* Used space is updated in alloc_space() */ - if (inode->i_sb->dq_op->alloc_space(inode, nr, 1) == NO_QUOTA) - return 1; - } - else - inode_add_bytes(inode, nr); - return 0; -} - -static inline int vfs_dq_prealloc_space(struct inode *inode, qsize_t nr) -{ - int ret; - if (!(ret = vfs_dq_prealloc_space_nodirty(inode, nr))) - mark_inode_dirty(inode); - return ret; -} - -static inline int vfs_dq_alloc_space_nodirty(struct inode *inode, qsize_t nr) -{ - if (sb_any_quota_active(inode->i_sb)) { - /* Used space is updated in alloc_space() */ - if (inode->i_sb->dq_op->alloc_space(inode, nr, 0) == NO_QUOTA) - return 1; - } - else - inode_add_bytes(inode, nr); - return 0; -} - -static inline int vfs_dq_alloc_space(struct inode *inode, qsize_t nr) -{ - int ret; - if (!(ret = vfs_dq_alloc_space_nodirty(inode, nr))) - 
mark_inode_dirty(inode); - return ret; -} - -static inline int vfs_dq_reserve_space(struct inode *inode, qsize_t nr) -{ - if (sb_any_quota_active(inode->i_sb)) { - /* Used space is updated in alloc_space() */ - if (inode->i_sb->dq_op->reserve_space(inode, nr, 0) == NO_QUOTA) - return 1; - } - else - inode_add_rsv_space(inode, nr); - return 0; -} - static inline int vfs_dq_alloc_inode(struct inode *inode) { if (sb_any_quota_active(inode->i_sb)) { @@ -213,47 +158,6 @@ static inline int vfs_dq_alloc_inode(struct inode *inode) return 0; } -/* - * Convert in-memory reserved quotas to real consumed quotas - */ -static inline int vfs_dq_claim_space(struct inode *inode, qsize_t nr) -{ - if (sb_any_quota_active(inode->i_sb)) { - if (inode->i_sb->dq_op->claim_space(inode, nr) == NO_QUOTA) - return 1; - } else - inode_claim_rsv_space(inode, nr); - - mark_inode_dirty(inode); - return 0; -} - -/* - * Release reserved (in-memory) quotas - */ -static inline -void vfs_dq_release_reservation_space(struct inode *inode, qsize_t nr) -{ - if (sb_any_quota_active(inode->i_sb)) - inode->i_sb->dq_op->release_rsv(inode, nr); - else - inode_sub_rsv_space(inode, nr); -} - -static inline void vfs_dq_free_space_nodirty(struct inode *inode, qsize_t nr) -{ - if (sb_any_quota_active(inode->i_sb)) - inode->i_sb->dq_op->free_space(inode, nr); - else - inode_sub_bytes(inode, nr); -} - -static inline void vfs_dq_free_space(struct inode *inode, qsize_t nr) -{ - vfs_dq_free_space_nodirty(inode, nr); - mark_inode_dirty(inode); -} - static inline void vfs_dq_free_inode(struct inode *inode) { if (sb_any_quota_active(inode->i_sb)) @@ -351,105 +255,109 @@ static inline int vfs_dq_transfer(struct inode *inode, struct iattr *iattr) return 0; } -static inline int vfs_dq_prealloc_space_nodirty(struct inode *inode, qsize_t nr) +static inline int __dquot_alloc_space(struct inode *inode, qsize_t number, + int warn, int reserve) { - inode_add_bytes(inode, nr); + if (!reserve) + inode_add_bytes(inode, number); return 0; } -static inline int vfs_dq_prealloc_space(struct inode *inode, qsize_t nr) +static inline void __dquot_free_space(struct inode *inode, qsize_t number, + int reserve) { - vfs_dq_prealloc_space_nodirty(inode, nr); - mark_inode_dirty(inode); - return 0; + if (!reserve) + inode_sub_bytes(inode, number); } -static inline int vfs_dq_alloc_space_nodirty(struct inode *inode, qsize_t nr) +static inline int dquot_claim_space_nodirty(struct inode *inode, qsize_t number) { - inode_add_bytes(inode, nr); + inode_add_bytes(inode, number); return 0; } -static inline int vfs_dq_alloc_space(struct inode *inode, qsize_t nr) +#endif /* CONFIG_QUOTA */ + +static inline int dquot_alloc_space_nodirty(struct inode *inode, qsize_t nr) { - vfs_dq_alloc_space_nodirty(inode, nr); - mark_inode_dirty(inode); - return 0; + return __dquot_alloc_space(inode, nr, 1, 0); } -static inline int vfs_dq_reserve_space(struct inode *inode, qsize_t nr) +static inline int dquot_alloc_space(struct inode *inode, qsize_t nr) { - return 0; + int ret; + + ret = dquot_alloc_space_nodirty(inode, nr); + if (!ret) + mark_inode_dirty(inode); + return ret; } -static inline int vfs_dq_claim_space(struct inode *inode, qsize_t nr) +static inline int dquot_alloc_block_nodirty(struct inode *inode, qsize_t nr) { - return vfs_dq_alloc_space(inode, nr); + return dquot_alloc_space_nodirty(inode, nr << inode->i_blkbits); } -static inline -int vfs_dq_release_reservation_space(struct inode *inode, qsize_t nr) +static inline int dquot_alloc_block(struct inode *inode, qsize_t nr) { - return 
0; + return dquot_alloc_space(inode, nr << inode->i_blkbits); } -static inline void vfs_dq_free_space_nodirty(struct inode *inode, qsize_t nr) +static inline int dquot_prealloc_block_nodirty(struct inode *inode, qsize_t nr) { - inode_sub_bytes(inode, nr); + return __dquot_alloc_space(inode, nr << inode->i_blkbits, 0, 0); } -static inline void vfs_dq_free_space(struct inode *inode, qsize_t nr) +static inline int dquot_prealloc_block(struct inode *inode, qsize_t nr) { - vfs_dq_free_space_nodirty(inode, nr); - mark_inode_dirty(inode); -} - -#endif /* CONFIG_QUOTA */ + int ret; -static inline int vfs_dq_prealloc_block_nodirty(struct inode *inode, qsize_t nr) -{ - return vfs_dq_prealloc_space_nodirty(inode, nr << inode->i_blkbits); + ret = dquot_prealloc_block_nodirty(inode, nr); + if (!ret) + mark_inode_dirty(inode); + return ret; } -static inline int vfs_dq_prealloc_block(struct inode *inode, qsize_t nr) +static inline int dquot_reserve_block(struct inode *inode, qsize_t nr) { - return vfs_dq_prealloc_space(inode, nr << inode->i_blkbits); + return __dquot_alloc_space(inode, nr << inode->i_blkbits, 1, 1); } -static inline int vfs_dq_alloc_block_nodirty(struct inode *inode, qsize_t nr) +static inline int dquot_claim_block(struct inode *inode, qsize_t nr) { - return vfs_dq_alloc_space_nodirty(inode, nr << inode->i_blkbits); -} + int ret; -static inline int vfs_dq_alloc_block(struct inode *inode, qsize_t nr) -{ - return vfs_dq_alloc_space(inode, nr << inode->i_blkbits); + ret = dquot_claim_space_nodirty(inode, nr << inode->i_blkbits); + if (!ret) + mark_inode_dirty(inode); + return ret; } -static inline int vfs_dq_reserve_block(struct inode *inode, qsize_t nr) +static inline void dquot_free_space_nodirty(struct inode *inode, qsize_t nr) { - return vfs_dq_reserve_space(inode, nr << inode->i_blkbits); + __dquot_free_space(inode, nr, 0); } -static inline int vfs_dq_claim_block(struct inode *inode, qsize_t nr) +static inline void dquot_free_space(struct inode *inode, qsize_t nr) { - return vfs_dq_claim_space(inode, nr << inode->i_blkbits); + dquot_free_space_nodirty(inode, nr); + mark_inode_dirty(inode); } -static inline -void vfs_dq_release_reservation_block(struct inode *inode, qsize_t nr) +static inline void dquot_free_block_nodirty(struct inode *inode, qsize_t nr) { - vfs_dq_release_reservation_space(inode, nr << inode->i_blkbits); + dquot_free_space_nodirty(inode, nr << inode->i_blkbits); } -static inline void vfs_dq_free_block_nodirty(struct inode *inode, qsize_t nr) +static inline void dquot_free_block(struct inode *inode, qsize_t nr) { - vfs_dq_free_space_nodirty(inode, nr << inode->i_blkbits); + dquot_free_space(inode, nr << inode->i_blkbits); } -static inline void vfs_dq_free_block(struct inode *inode, qsize_t nr) +static inline void dquot_release_reservation_block(struct inode *inode, + qsize_t nr) { - vfs_dq_free_space(inode, nr << inode->i_blkbits); + __dquot_free_space(inode, nr << inode->i_blkbits, 1); } #endif /* _LINUX_QUOTAOPS_ */ -- cgit v1.2.3 From 63936ddaa16b9486e2d426ed7b09f559a5c60f87 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 3 Mar 2010 09:05:01 -0500 Subject: dquot: cleanup inode allocation / freeing routines Get rid of the alloc_inode and free_inode dquot operations - they are always called from the filesystem and if a filesystem really needs their own (which none currently does) it can just call into it's own routine directly. 
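In caller terms this is the shape every ialloc conversion below follows: initialize quota explicitly, let dquot_alloc_inode() report its own errno, and unwind with dquot_free_inode() only after a successful charge. A sketch with hypothetical myfs_* names, not taken from any one filesystem:

struct inode *myfs_new_inode(struct inode *dir, int mode, int *errp)
{
	struct inode *inode = new_inode(dir->i_sb);
	int err;

	if (!inode) {
		*errp = -ENOMEM;
		return NULL;
	}
	inode->i_mode = mode;

	vfs_dq_init(inode);		/* quota initialization is now the caller's explicit step */
	err = dquot_alloc_inode(inode);	/* 0, or a negative errno such as -EDQUOT */
	if (err)
		goto fail_drop;

	err = myfs_init_on_disk(inode, dir);	/* placeholder for fs-specific allocation */
	if (err)
		goto fail_free;

	*errp = 0;
	return inode;

fail_free:
	dquot_free_inode(inode);	/* undo the inode charge; no return value to check */
fail_drop:
	vfs_dq_drop(inode);
	iput(inode);
	*errp = err;
	return NULL;
}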
Also get rid of the vfs_dq_alloc/vfs_dq_free wrappers and always call the lowlevel dquot_alloc_inode / dqout_free_inode routines directly, which now lose the number argument which is always 1. Signed-off-by: Christoph Hellwig Signed-off-by: Jan Kara --- Documentation/filesystems/Locking | 8 -------- fs/ext2/ialloc.c | 10 +++++----- fs/ext3/ialloc.c | 10 +++++----- fs/ext3/super.c | 2 -- fs/ext4/ialloc.c | 10 +++++----- fs/ext4/super.c | 2 -- fs/jfs/inode.c | 2 +- fs/jfs/jfs_inode.c | 6 +++--- fs/ocfs2/inode.c | 2 +- fs/ocfs2/namei.c | 30 +++++++++--------------------- fs/ocfs2/quota_global.c | 2 -- fs/quota/dquot.c | 29 +++++++++++++---------------- fs/reiserfs/inode.c | 10 +++++----- fs/reiserfs/super.c | 2 -- fs/udf/ialloc.c | 10 ++++++---- fs/ufs/ialloc.c | 7 ++++--- include/linux/quota.h | 2 -- include/linux/quotaops.h | 24 ++++-------------------- 18 files changed, 61 insertions(+), 107 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 1192fde1163..4428f55f213 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -462,8 +462,6 @@ in sys_read() and friends. prototypes: int (*initialize) (struct inode *, int); int (*drop) (struct inode *); - int (*alloc_inode) (const struct inode *, unsigned long); - int (*free_inode) (const struct inode *, unsigned long); int (*transfer) (struct inode *, struct iattr *); int (*write_dquot) (struct dquot *); int (*acquire_dquot) (struct dquot *); @@ -479,8 +477,6 @@ What filesystem should expect from the generic quota functions: FS recursion Held locks when called initialize: yes maybe dqonoff_sem drop: yes - -alloc_inode: ->mark_dirty() - -free_inode: ->mark_dirty() - transfer: yes - write_dquot: yes dqonoff_sem or dqptr_sem acquire_dquot: yes dqonoff_sem or dqptr_sem @@ -491,10 +487,6 @@ write_info: yes dqonoff_sem FS recursion means calling ->quota_read() and ->quota_write() from superblock operations. -->alloc_inode(), ->free_inode() are called -only directly by the filesystem and do not call any fs functions only -the ->mark_dirty() operation. - More details about quota locking can be found in fs/dquot.c. 
--------------------------- vm_operations_struct ----------------------------- diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index 15387c9c17d..d12f9809559 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -121,7 +121,7 @@ void ext2_free_inode (struct inode * inode) if (!is_bad_inode(inode)) { /* Quota is already initialized in iput() */ ext2_xattr_delete_inode(inode); - vfs_dq_free_inode(inode); + dquot_free_inode(inode); vfs_dq_drop(inode); } @@ -586,10 +586,10 @@ got: goto fail_drop; } - if (vfs_dq_alloc_inode(inode)) { - err = -EDQUOT; + vfs_dq_init(inode); + err = dquot_alloc_inode(inode); + if (err) goto fail_drop; - } err = ext2_init_acl(inode, dir); if (err) @@ -605,7 +605,7 @@ got: return inode; fail_free_drop: - vfs_dq_free_inode(inode); + dquot_free_inode(inode); fail_drop: vfs_dq_drop(inode); diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c index b3999128513..8bf00e997c3 100644 --- a/fs/ext3/ialloc.c +++ b/fs/ext3/ialloc.c @@ -125,7 +125,7 @@ void ext3_free_inode (handle_t *handle, struct inode * inode) */ vfs_dq_init(inode); ext3_xattr_delete_inode(handle, inode); - vfs_dq_free_inode(inode); + dquot_free_inode(inode); vfs_dq_drop(inode); is_directory = S_ISDIR(inode->i_mode); @@ -588,10 +588,10 @@ got: sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE : 0; ret = inode; - if (vfs_dq_alloc_inode(inode)) { - err = -EDQUOT; + vfs_dq_init(inode); + err = dquot_alloc_inode(inode); + if (err) goto fail_drop; - } err = ext3_init_acl(handle, inode, dir); if (err) @@ -619,7 +619,7 @@ really_out: return ret; fail_free_drop: - vfs_dq_free_inode(inode); + dquot_free_inode(inode); fail_drop: vfs_dq_drop(inode); diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 8c13910a378..8b8bc4f9cb1 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -752,8 +752,6 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type, static const struct dquot_operations ext3_quota_operations = { .initialize = dquot_initialize, .drop = dquot_drop, - .alloc_inode = dquot_alloc_inode, - .free_inode = dquot_free_inode, .transfer = dquot_transfer, .write_dquot = ext3_write_dquot, .acquire_dquot = ext3_acquire_dquot, diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index f3624ead4f6..b0d744cf8b9 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -219,7 +219,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) */ vfs_dq_init(inode); ext4_xattr_delete_inode(handle, inode); - vfs_dq_free_inode(inode); + dquot_free_inode(inode); vfs_dq_drop(inode); is_directory = S_ISDIR(inode->i_mode); @@ -1034,10 +1034,10 @@ got: ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize; ret = inode; - if (vfs_dq_alloc_inode(inode)) { - err = -EDQUOT; + vfs_dq_init(inode); + err = dquot_alloc_inode(inode); + if (err) goto fail_drop; - } err = ext4_init_acl(handle, inode, dir); if (err) @@ -1074,7 +1074,7 @@ really_out: return ret; fail_free_drop: - vfs_dq_free_inode(inode); + dquot_free_inode(inode); fail_drop: vfs_dq_drop(inode); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index fa8f4deda65..d231da8798e 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1017,8 +1017,6 @@ static const struct dquot_operations ext4_quota_operations = { #ifdef CONFIG_QUOTA .get_reserved_space = ext4_get_reserved_space, #endif - .alloc_inode = dquot_alloc_inode, - .free_inode = dquot_free_inode, .transfer = dquot_transfer, .write_dquot = ext4_write_dquot, .acquire_dquot = ext4_acquire_dquot, diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index b2ae190a77b..2562d18988f 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ 
-159,7 +159,7 @@ void jfs_delete_inode(struct inode *inode) * Free the inode from the quota allocation. */ vfs_dq_init(inode); - vfs_dq_free_inode(inode); + dquot_free_inode(inode); vfs_dq_drop(inode); } diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c index dc0e02159ac..7762f33e062 100644 --- a/fs/jfs/jfs_inode.c +++ b/fs/jfs/jfs_inode.c @@ -116,10 +116,10 @@ struct inode *ialloc(struct inode *parent, umode_t mode) /* * Allocate inode to quota. */ - if (vfs_dq_alloc_inode(inode)) { - rc = -EDQUOT; + vfs_dq_init(inode); + rc = dquot_alloc_inode(inode); + if (rc) goto fail_drop; - } inode->i_mode = mode; /* inherit flags from parent */ diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 88459bdd1ff..cb7f67d8441 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -665,7 +665,7 @@ static int ocfs2_remove_inode(struct inode *inode, } ocfs2_remove_from_cache(INODE_CACHE(inode), di_bh); - vfs_dq_free_inode(inode); + dquot_free_inode(inode); status = ocfs2_free_dinode(handle, inode_alloc_inode, inode_alloc_bh, di); diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 13adaa1f40c..99766b6418e 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -348,13 +348,9 @@ static int ocfs2_mknod(struct inode *dir, goto leave; } - /* We don't use standard VFS wrapper because we don't want vfs_dq_init - * to be called. */ - if (sb_any_quota_active(osb->sb) && - osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) { - status = -EDQUOT; + status = dquot_alloc_inode(inode); + if (status) goto leave; - } did_quota_inode = 1; mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, @@ -431,7 +427,7 @@ static int ocfs2_mknod(struct inode *dir, status = 0; leave: if (status < 0 && did_quota_inode) - vfs_dq_free_inode(inode); + dquot_free_inode(inode); if (handle) ocfs2_commit_trans(osb, handle); @@ -1688,13 +1684,9 @@ static int ocfs2_symlink(struct inode *dir, goto bail; } - /* We don't use standard VFS wrapper because we don't want vfs_dq_init - * to be called. */ - if (sb_any_quota_active(osb->sb) && - osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) { - status = -EDQUOT; + status = dquot_alloc_inode(inode); + if (status) goto bail; - } did_quota_inode = 1; mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry, @@ -1790,7 +1782,7 @@ bail: dquot_free_space_nodirty(inode, ocfs2_clusters_to_bytes(osb->sb, 1)); if (status < 0 && did_quota_inode) - vfs_dq_free_inode(inode); + dquot_free_inode(inode); if (handle) ocfs2_commit_trans(osb, handle); @@ -2098,13 +2090,9 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, goto leave; } - /* We don't use standard VFS wrapper because we don't want vfs_dq_init - * to be called. 
*/ - if (sb_any_quota_active(osb->sb) && - osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) { - status = -EDQUOT; + status = dquot_alloc_inode(inode); + if (status) goto leave; - } did_quota_inode = 1; inode->i_nlink = 0; @@ -2139,7 +2127,7 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, insert_inode_hash(inode); leave: if (status < 0 && did_quota_inode) - vfs_dq_free_inode(inode); + dquot_free_inode(inode); if (handle) ocfs2_commit_trans(osb, handle); diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index aa66fb27722..ed96b3eeb13 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -853,8 +853,6 @@ static void ocfs2_destroy_dquot(struct dquot *dquot) const struct dquot_operations ocfs2_quota_operations = { .initialize = dquot_initialize, .drop = dquot_drop, - .alloc_inode = dquot_alloc_inode, - .free_inode = dquot_free_inode, .transfer = dquot_transfer, .write_dquot = ocfs2_write_dquot, .acquire_dquot = ocfs2_acquire_dquot, diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index baf202c012c..ed131318b84 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -1531,15 +1531,15 @@ EXPORT_SYMBOL(__dquot_alloc_space); /* * This operation can block, but only after everything is updated */ -int dquot_alloc_inode(const struct inode *inode, qsize_t number) +int dquot_alloc_inode(const struct inode *inode) { - int cnt, ret = NO_QUOTA; + int cnt, ret = -EDQUOT; char warntype[MAXQUOTAS]; /* First test before acquiring mutex - solves deadlocks when we * re-enter the quota code and are already holding the mutex */ - if (IS_NOQUOTA(inode)) - return QUOTA_OK; + if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) + return 0; for (cnt = 0; cnt < MAXQUOTAS; cnt++) warntype[cnt] = QUOTA_NL_NOWARN; down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); @@ -1547,7 +1547,7 @@ int dquot_alloc_inode(const struct inode *inode, qsize_t number) for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (!inode->i_dquot[cnt]) continue; - if (check_idq(inode->i_dquot[cnt], number, warntype+cnt) + if (check_idq(inode->i_dquot[cnt], 1, warntype+cnt) == NO_QUOTA) goto warn_put_all; } @@ -1555,12 +1555,12 @@ int dquot_alloc_inode(const struct inode *inode, qsize_t number) for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (!inode->i_dquot[cnt]) continue; - dquot_incr_inodes(inode->i_dquot[cnt], number); + dquot_incr_inodes(inode->i_dquot[cnt], 1); } - ret = QUOTA_OK; + ret = 0; warn_put_all: spin_unlock(&dq_data_lock); - if (ret == QUOTA_OK) + if (ret == 0) mark_all_dquot_dirty(inode->i_dquot); flush_warnings(inode->i_dquot, warntype); up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); @@ -1638,29 +1638,28 @@ EXPORT_SYMBOL(__dquot_free_space); /* * This operation can block, but only after everything is updated */ -int dquot_free_inode(const struct inode *inode, qsize_t number) +void dquot_free_inode(const struct inode *inode) { unsigned int cnt; char warntype[MAXQUOTAS]; /* First test before acquiring mutex - solves deadlocks when we * re-enter the quota code and are already holding the mutex */ - if (IS_NOQUOTA(inode)) - return QUOTA_OK; + if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) + return; down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); spin_lock(&dq_data_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (!inode->i_dquot[cnt]) continue; - warntype[cnt] = info_idq_free(inode->i_dquot[cnt], number); - dquot_decr_inodes(inode->i_dquot[cnt], number); + warntype[cnt] = info_idq_free(inode->i_dquot[cnt], 1); + dquot_decr_inodes(inode->i_dquot[cnt], 1); } spin_unlock(&dq_data_lock); 
mark_all_dquot_dirty(inode->i_dquot); flush_warnings(inode->i_dquot, warntype); up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); - return QUOTA_OK; } EXPORT_SYMBOL(dquot_free_inode); @@ -1815,8 +1814,6 @@ EXPORT_SYMBOL(dquot_commit_info); const struct dquot_operations dquot_operations = { .initialize = dquot_initialize, .drop = dquot_drop, - .alloc_inode = dquot_alloc_inode, - .free_inode = dquot_free_inode, .transfer = dquot_transfer, .write_dquot = dquot_commit, .acquire_dquot = dquot_acquire, diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 2df0f5c7c60..f56a3d2e649 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -54,7 +54,7 @@ void reiserfs_delete_inode(struct inode *inode) * after delete_object so that quota updates go into the same transaction as * stat data deletion */ if (!err) - vfs_dq_free_inode(inode); + dquot_free_inode(inode); if (journal_end(&th, inode->i_sb, jbegin_count)) goto out; @@ -1765,10 +1765,10 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, BUG_ON(!th->t_trans_id); - if (vfs_dq_alloc_inode(inode)) { - err = -EDQUOT; + vfs_dq_init(inode); + err = dquot_alloc_inode(inode); + if (err) goto out_end_trans; - } if (!dir->i_nlink) { err = -EPERM; goto out_bad_inode; @@ -1959,7 +1959,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, INODE_PKEY(inode)->k_objectid = 0; /* Quota change must be inside a transaction for journaling */ - vfs_dq_free_inode(inode); + dquot_free_inode(inode); out_end_trans: journal_end(th, th->t_super, th->t_blocks_allocated); diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index ea4a77e9d7f..e942ceecf2b 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -618,8 +618,6 @@ static int reiserfs_quota_on(struct super_block *, int, int, char *, int); static const struct dquot_operations reiserfs_quota_operations = { .initialize = dquot_initialize, .drop = dquot_drop, - .alloc_inode = dquot_alloc_inode, - .free_inode = dquot_free_inode, .transfer = dquot_transfer, .write_dquot = reiserfs_write_dquot, .acquire_dquot = reiserfs_acquire_dquot, diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c index c10fa39f97e..e1856b89c9c 100644 --- a/fs/udf/ialloc.c +++ b/fs/udf/ialloc.c @@ -36,7 +36,7 @@ void udf_free_inode(struct inode *inode) * Note: we must free any quota before locking the superblock, * as writing the quota to disk may need the lock as well. 
*/ - vfs_dq_free_inode(inode); + dquot_free_inode(inode); vfs_dq_drop(inode); clear_inode(inode); @@ -61,7 +61,7 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err) struct super_block *sb = dir->i_sb; struct udf_sb_info *sbi = UDF_SB(sb); struct inode *inode; - int block; + int block, ret; uint32_t start = UDF_I(dir)->i_location.logicalBlockNum; struct udf_inode_info *iinfo; struct udf_inode_info *dinfo = UDF_I(dir); @@ -153,12 +153,14 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err) insert_inode_hash(inode); mark_inode_dirty(inode); - if (vfs_dq_alloc_inode(inode)) { + vfs_dq_init(inode); + ret = dquot_alloc_inode(inode); + if (ret) { vfs_dq_drop(inode); inode->i_flags |= S_NOQUOTA; inode->i_nlink = 0; iput(inode); - *err = -EDQUOT; + *err = ret; return NULL; } diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c index 3527c00fef0..02f77882c57 100644 --- a/fs/ufs/ialloc.c +++ b/fs/ufs/ialloc.c @@ -95,7 +95,7 @@ void ufs_free_inode (struct inode * inode) is_directory = S_ISDIR(inode->i_mode); - vfs_dq_free_inode(inode); + dquot_free_inode(inode); vfs_dq_drop(inode); clear_inode (inode); @@ -355,9 +355,10 @@ cg_found: unlock_super (sb); - if (vfs_dq_alloc_inode(inode)) { + vfs_dq_init(inode); + err = dquot_alloc_inode(inode); + if (err) { vfs_dq_drop(inode); - err = -EDQUOT; goto fail_without_unlock; } diff --git a/include/linux/quota.h b/include/linux/quota.h index 1b14ad287fe..e3b07895d32 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -297,8 +297,6 @@ struct quota_format_ops { struct dquot_operations { int (*initialize) (struct inode *, int); int (*drop) (struct inode *); - int (*alloc_inode) (const struct inode *, qsize_t); - int (*free_inode) (const struct inode *, qsize_t); int (*transfer) (struct inode *, qid_t *, unsigned long); int (*write_dquot) (struct dquot *); /* Ordinary dquot write */ struct dquot *(*alloc_dquot)(struct super_block *, int); /* Allocate memory for new dquot */ diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index 47e85682e11..9ce7f051a4b 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -37,10 +37,10 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int warn, int reserve); void __dquot_free_space(struct inode *inode, qsize_t number, int reserve); -int dquot_alloc_inode(const struct inode *inode, qsize_t number); +int dquot_alloc_inode(const struct inode *inode); int dquot_claim_space_nodirty(struct inode *inode, qsize_t number); -int dquot_free_inode(const struct inode *inode, qsize_t number); +void dquot_free_inode(const struct inode *inode); int dquot_transfer(struct inode *inode, qid_t *chid, unsigned long mask); int dquot_commit(struct dquot *dquot); @@ -148,22 +148,6 @@ static inline void vfs_dq_init(struct inode *inode) inode->i_sb->dq_op->initialize(inode, -1); } -static inline int vfs_dq_alloc_inode(struct inode *inode) -{ - if (sb_any_quota_active(inode->i_sb)) { - vfs_dq_init(inode); - if (inode->i_sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) - return 1; - } - return 0; -} - -static inline void vfs_dq_free_inode(struct inode *inode) -{ - if (sb_any_quota_active(inode->i_sb)) - inode->i_sb->dq_op->free_inode(inode, 1); -} - /* Cannot be called inside a transaction */ static inline int vfs_dq_off(struct super_block *sb, int remount) { @@ -231,12 +215,12 @@ static inline void vfs_dq_drop(struct inode *inode) { } -static inline int vfs_dq_alloc_inode(struct inode *inode) +static inline int dquot_alloc_inode(const struct inode *inode) { return 0; } 
-static inline void vfs_dq_free_inode(struct inode *inode) +static inline void dquot_free_inode(const struct inode *inode) { } -- cgit v1.2.3 From b43fa8284d7790d9cca32c9c55e24f29be2fa33b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 3 Mar 2010 09:05:03 -0500 Subject: dquot: cleanup dquot transfer routine Get rid of the transfer dquot operation - it is now always called from the filesystem and if a filesystem really needs it's own (which none currently does) it can just call into it's own routine directly. Rename the now static low-level dquot_transfer helper to __dquot_transfer and vfs_dq_transfer to dquot_transfer to have a consistent namespace, and make the new dquot_transfer return a normal negative errno value which all callers expect. Signed-off-by: Christoph Hellwig Signed-off-by: Jan Kara --- Documentation/filesystems/Locking | 2 -- drivers/staging/pohmelfs/inode.c | 2 +- fs/ext2/inode.c | 2 +- fs/ext3/inode.c | 2 +- fs/ext3/super.c | 1 - fs/ext4/inode.c | 2 +- fs/ext4/super.c | 1 - fs/jfs/file.c | 5 +++-- fs/ocfs2/file.c | 4 ++-- fs/ocfs2/quota_global.c | 1 - fs/quota/dquot.c | 12 +++++------- fs/reiserfs/inode.c | 3 +-- fs/reiserfs/super.c | 1 - fs/udf/file.c | 2 +- fs/ufs/truncate.c | 2 +- include/linux/quota.h | 1 - include/linux/quotaops.h | 5 ++--- 17 files changed, 19 insertions(+), 29 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 4428f55f213..4574e0272bd 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -462,7 +462,6 @@ in sys_read() and friends. prototypes: int (*initialize) (struct inode *, int); int (*drop) (struct inode *); - int (*transfer) (struct inode *, struct iattr *); int (*write_dquot) (struct dquot *); int (*acquire_dquot) (struct dquot *); int (*release_dquot) (struct dquot *); @@ -477,7 +476,6 @@ What filesystem should expect from the generic quota functions: FS recursion Held locks when called initialize: yes maybe dqonoff_sem drop: yes - -transfer: yes - write_dquot: yes dqonoff_sem or dqptr_sem acquire_dquot: yes dqonoff_sem or dqptr_sem release_dquot: yes dqonoff_sem or dqptr_sem diff --git a/drivers/staging/pohmelfs/inode.c b/drivers/staging/pohmelfs/inode.c index f69b7783027..11fc4d5c43e 100644 --- a/drivers/staging/pohmelfs/inode.c +++ b/drivers/staging/pohmelfs/inode.c @@ -969,7 +969,7 @@ int pohmelfs_setattr_raw(struct inode *inode, struct iattr *attr) if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { - err = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0; + err = dquot_transfer(inode, attr); if (err) goto err_out_exit; } diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 71b032c65a0..3cfcfd9a131 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -1459,7 +1459,7 @@ int ext2_setattr(struct dentry *dentry, struct iattr *iattr) return error; if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) || (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) { - error = vfs_dq_transfer(inode, iattr) ? -EDQUOT : 0; + error = dquot_transfer(inode, iattr); if (error) return error; } diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 20f02d69365..14d40a4dd6f 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -3160,7 +3160,7 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr) error = PTR_ERR(handle); goto err_out; } - error = vfs_dq_transfer(inode, attr) ? 
-EDQUOT : 0; + error = dquot_transfer(inode, attr); if (error) { ext3_journal_stop(handle); return error; diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 8b8bc4f9cb1..f7d4a2c19de 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -752,7 +752,6 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type, static const struct dquot_operations ext3_quota_operations = { .initialize = dquot_initialize, .drop = dquot_drop, - .transfer = dquot_transfer, .write_dquot = ext3_write_dquot, .acquire_dquot = ext3_acquire_dquot, .release_dquot = ext3_release_dquot, diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 9f607ea411c..6a002a6d062 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5263,7 +5263,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) error = PTR_ERR(handle); goto err_out; } - error = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0; + error = dquot_transfer(inode, attr); if (error) { ext4_journal_stop(handle); return error; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index d231da8798e..b4253fb7bab 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1017,7 +1017,6 @@ static const struct dquot_operations ext4_quota_operations = { #ifdef CONFIG_QUOTA .get_reserved_space = ext4_get_reserved_space, #endif - .transfer = dquot_transfer, .write_dquot = ext4_write_dquot, .acquire_dquot = ext4_acquire_dquot, .release_dquot = ext4_release_dquot, diff --git a/fs/jfs/file.c b/fs/jfs/file.c index a4229e49330..2c201783836 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -100,8 +100,9 @@ int jfs_setattr(struct dentry *dentry, struct iattr *iattr) if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) || (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) { - if (vfs_dq_transfer(inode, iattr)) - return -EDQUOT; + rc = dquot_transfer(inode, iattr); + if (rc) + return rc; } rc = inode_setattr(inode, iattr); diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 6cf3d8d1836..472e8f8bc89 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1020,7 +1020,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) /* * Gather pointers to quota structures so that allocation / * freeing of quota structures happens here and not inside - * vfs_dq_transfer() where we have problems with lock ordering + * dquot_transfer() where we have problems with lock ordering */ if (attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid && OCFS2_HAS_RO_COMPAT_FEATURE(sb, @@ -1053,7 +1053,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) mlog_errno(status); goto bail_unlock; } - status = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0; + status = dquot_transfer(inode, attr); if (status < 0) goto bail_commit; } else { diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index ed96b3eeb13..b654bd103b6 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -853,7 +853,6 @@ static void ocfs2_destroy_dquot(struct dquot *dquot) const struct dquot_operations ocfs2_quota_operations = { .initialize = dquot_initialize, .drop = dquot_drop, - .transfer = dquot_transfer, .write_dquot = ocfs2_write_dquot, .acquire_dquot = ocfs2_acquire_dquot, .release_dquot = ocfs2_release_dquot, diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index ed131318b84..78ce4c48ad7 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -1669,7 +1669,7 @@ EXPORT_SYMBOL(dquot_free_inode); * This operation can block, but only after everything is updated * A transaction must be started when entering this function. 
*/ -int dquot_transfer(struct inode *inode, qid_t *chid, unsigned long mask) +static int __dquot_transfer(struct inode *inode, qid_t *chid, unsigned long mask) { qsize_t space, cur_space; qsize_t rsv_space = 0; @@ -1766,12 +1766,11 @@ over_quota: ret = NO_QUOTA; goto warn_put_all; } -EXPORT_SYMBOL(dquot_transfer); /* Wrapper for transferring ownership of an inode for uid/gid only * Called from FSXXX_setattr() */ -int vfs_dq_transfer(struct inode *inode, struct iattr *iattr) +int dquot_transfer(struct inode *inode, struct iattr *iattr) { qid_t chid[MAXQUOTAS]; unsigned long mask = 0; @@ -1786,12 +1785,12 @@ int vfs_dq_transfer(struct inode *inode, struct iattr *iattr) } if (sb_any_quota_active(inode->i_sb) && !IS_NOQUOTA(inode)) { vfs_dq_init(inode); - if (inode->i_sb->dq_op->transfer(inode, chid, mask) == NO_QUOTA) - return 1; + if (__dquot_transfer(inode, chid, mask) == NO_QUOTA) + return -EDQUOT; } return 0; } -EXPORT_SYMBOL(vfs_dq_transfer); +EXPORT_SYMBOL(dquot_transfer); /* * Write info of quota file to disk @@ -1814,7 +1813,6 @@ EXPORT_SYMBOL(dquot_commit_info); const struct dquot_operations dquot_operations = { .initialize = dquot_initialize, .drop = dquot_drop, - .transfer = dquot_transfer, .write_dquot = dquot_commit, .acquire_dquot = dquot_acquire, .release_dquot = dquot_release, diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index f56a3d2e649..99a5e5a8ab5 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -3134,8 +3134,7 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) jbegin_count); if (error) goto out; - error = - vfs_dq_transfer(inode, attr) ? -EDQUOT : 0; + error = dquot_transfer(inode, attr); if (error) { journal_end(&th, inode->i_sb, jbegin_count); diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index e942ceecf2b..97c3e8ed7db 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -618,7 +618,6 @@ static int reiserfs_quota_on(struct super_block *, int, int, char *, int); static const struct dquot_operations reiserfs_quota_operations = { .initialize = dquot_initialize, .drop = dquot_drop, - .transfer = dquot_transfer, .write_dquot = reiserfs_write_dquot, .acquire_dquot = reiserfs_acquire_dquot, .release_dquot = reiserfs_release_dquot, diff --git a/fs/udf/file.c b/fs/udf/file.c index 35ca47281fa..2df7fcb677b 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -229,7 +229,7 @@ static int udf_setattr(struct dentry *dentry, struct iattr *iattr) if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) || (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) { - error = vfs_dq_transfer(inode, iattr) ? -EDQUOT : 0; + error = dquot_transfer(inode, iattr); if (error) return error; } diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c index 56ab31f00bd..87bbab68590 100644 --- a/fs/ufs/truncate.c +++ b/fs/ufs/truncate.c @@ -520,7 +520,7 @@ static int ufs_setattr(struct dentry *dentry, struct iattr *attr) if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { - error = vfs_dq_transfer(inode, attr) ? 
-EDQUOT : 0; + error = dquot_transfer(inode, attr); if (error) return error; } diff --git a/include/linux/quota.h b/include/linux/quota.h index e3b07895d32..422e6aa78ed 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -297,7 +297,6 @@ struct quota_format_ops { struct dquot_operations { int (*initialize) (struct inode *, int); int (*drop) (struct inode *); - int (*transfer) (struct inode *, qid_t *, unsigned long); int (*write_dquot) (struct dquot *); /* Ordinary dquot write */ struct dquot *(*alloc_dquot)(struct super_block *, int); /* Allocate memory for new dquot */ void (*destroy_dquot)(struct dquot *); /* Free memory for dquot */ diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index 9ce7f051a4b..fa27b7218c8 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -42,7 +42,6 @@ int dquot_alloc_inode(const struct inode *inode); int dquot_claim_space_nodirty(struct inode *inode, qsize_t number); void dquot_free_inode(const struct inode *inode); -int dquot_transfer(struct inode *inode, qid_t *chid, unsigned long mask); int dquot_commit(struct dquot *dquot); int dquot_acquire(struct dquot *dquot); int dquot_release(struct dquot *dquot); @@ -66,7 +65,7 @@ int vfs_get_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *d int vfs_set_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *di); void vfs_dq_drop(struct inode *inode); -int vfs_dq_transfer(struct inode *inode, struct iattr *iattr); +int dquot_transfer(struct inode *inode, struct iattr *iattr); int vfs_dq_quota_on_remount(struct super_block *sb); static inline struct mem_dqinfo *sb_dqinfo(struct super_block *sb, int type) @@ -234,7 +233,7 @@ static inline int vfs_dq_quota_on_remount(struct super_block *sb) return 0; } -static inline int vfs_dq_transfer(struct inode *inode, struct iattr *iattr) +static inline int dquot_transfer(struct inode *inode, struct iattr *iattr) { return 0; } -- cgit v1.2.3 From 9f7547580263d4a55efe06ce5cfd567f568be6e8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 3 Mar 2010 09:05:05 -0500 Subject: dquot: cleanup dquot drop routine Get rid of the drop dquot operation - it is now always called from the filesystem and if a filesystem really needs it's own (which none currently does) it can just call into it's own routine directly. Rename the now static low-level dquot_drop helper to __dquot_drop and vfs_dq_drop to dquot_drop to have a consistent namespace. Signed-off-by: Christoph Hellwig Signed-off-by: Jan Kara --- Documentation/filesystems/Locking | 2 -- fs/ext2/ialloc.c | 4 +-- fs/ext2/super.c | 2 +- fs/ext3/ialloc.c | 4 +-- fs/ext3/super.c | 3 +-- fs/ext4/ialloc.c | 4 +-- fs/ext4/super.c | 3 +-- fs/jfs/inode.c | 2 +- fs/jfs/jfs_inode.c | 2 +- fs/jfs/super.c | 2 +- fs/ocfs2/inode.c | 2 +- fs/ocfs2/quota_global.c | 1 - fs/quota/dquot.c | 52 +++++++++++++++++++-------------------- fs/reiserfs/inode.c | 2 +- fs/reiserfs/namei.c | 2 +- fs/reiserfs/super.c | 3 +-- fs/udf/ialloc.c | 4 +-- fs/udf/inode.c | 2 +- fs/ufs/ialloc.c | 4 +-- fs/ufs/super.c | 2 +- include/linux/quota.h | 1 - include/linux/quotaops.h | 5 ++-- 22 files changed, 49 insertions(+), 59 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 4574e0272bd..fa10e4bf8e5 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -461,7 +461,6 @@ in sys_read() and friends. 
--------------------------- dquot_operations ------------------------------- prototypes: int (*initialize) (struct inode *, int); - int (*drop) (struct inode *); int (*write_dquot) (struct dquot *); int (*acquire_dquot) (struct dquot *); int (*release_dquot) (struct dquot *); @@ -475,7 +474,6 @@ What filesystem should expect from the generic quota functions: FS recursion Held locks when called initialize: yes maybe dqonoff_sem -drop: yes - write_dquot: yes dqonoff_sem or dqptr_sem acquire_dquot: yes dqonoff_sem or dqptr_sem release_dquot: yes dqonoff_sem or dqptr_sem diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index d12f9809559..88b71972c62 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -122,7 +122,7 @@ void ext2_free_inode (struct inode * inode) /* Quota is already initialized in iput() */ ext2_xattr_delete_inode(inode); dquot_free_inode(inode); - vfs_dq_drop(inode); + dquot_drop(inode); } es = EXT2_SB(sb)->s_es; @@ -608,7 +608,7 @@ fail_free_drop: dquot_free_inode(inode); fail_drop: - vfs_dq_drop(inode); + dquot_drop(inode); inode->i_flags |= S_NOQUOTA; inode->i_nlink = 0; unlock_new_inode(inode); diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 98815d2a566..42e4a303b67 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -195,7 +195,7 @@ static void ext2_clear_inode(struct inode *inode) { struct ext2_block_alloc_info *rsv = EXT2_I(inode)->i_block_alloc_info; - vfs_dq_drop(inode); + dquot_drop(inode); ext2_discard_reservation(inode); EXT2_I(inode)->i_block_alloc_info = NULL; if (unlikely(rsv)) diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c index 8bf00e997c3..7d7238f9f6f 100644 --- a/fs/ext3/ialloc.c +++ b/fs/ext3/ialloc.c @@ -126,7 +126,7 @@ void ext3_free_inode (handle_t *handle, struct inode * inode) vfs_dq_init(inode); ext3_xattr_delete_inode(handle, inode); dquot_free_inode(inode); - vfs_dq_drop(inode); + dquot_drop(inode); is_directory = S_ISDIR(inode->i_mode); @@ -622,7 +622,7 @@ fail_free_drop: dquot_free_inode(inode); fail_drop: - vfs_dq_drop(inode); + dquot_drop(inode); inode->i_flags |= S_NOQUOTA; inode->i_nlink = 0; unlock_new_inode(inode); diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 2277b1a98e6..0163d0dae12 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -529,7 +529,7 @@ static void ext3_clear_inode(struct inode *inode) { struct ext3_block_alloc_info *rsv = EXT3_I(inode)->i_block_alloc_info; - vfs_dq_drop(inode); + dquot_drop(inode); ext3_discard_reservation(inode); EXT3_I(inode)->i_block_alloc_info = NULL; if (unlikely(rsv)) @@ -753,7 +753,6 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type, static const struct dquot_operations ext3_quota_operations = { .initialize = dquot_initialize, - .drop = dquot_drop, .write_dquot = ext3_write_dquot, .acquire_dquot = ext3_acquire_dquot, .release_dquot = ext3_release_dquot, diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index b0d744cf8b9..ca8986e4b52 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -220,7 +220,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) vfs_dq_init(inode); ext4_xattr_delete_inode(handle, inode); dquot_free_inode(inode); - vfs_dq_drop(inode); + dquot_drop(inode); is_directory = S_ISDIR(inode->i_mode); @@ -1077,7 +1077,7 @@ fail_free_drop: dquot_free_inode(inode); fail_drop: - vfs_dq_drop(inode); + dquot_drop(inode); inode->i_flags |= S_NOQUOTA; inode->i_nlink = 0; unlock_new_inode(inode); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 56554c8850e..035516c80df 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -761,7 +761,7 @@ 
static void destroy_inodecache(void) static void ext4_clear_inode(struct inode *inode) { - vfs_dq_drop(inode); + dquot_drop(inode); ext4_discard_preallocations(inode); if (EXT4_JOURNAL(inode)) jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal, @@ -1014,7 +1014,6 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, static const struct dquot_operations ext4_quota_operations = { .initialize = dquot_initialize, - .drop = dquot_drop, #ifdef CONFIG_QUOTA .get_reserved_space = ext4_get_reserved_space, #endif diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 2562d18988f..22fa412c528 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -160,7 +160,7 @@ void jfs_delete_inode(struct inode *inode) */ vfs_dq_init(inode); dquot_free_inode(inode); - vfs_dq_drop(inode); + dquot_drop(inode); } clear_inode(inode); diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c index 7762f33e062..72b30895422 100644 --- a/fs/jfs/jfs_inode.c +++ b/fs/jfs/jfs_inode.c @@ -162,7 +162,7 @@ struct inode *ialloc(struct inode *parent, umode_t mode) return inode; fail_drop: - vfs_dq_drop(inode); + dquot_drop(inode); inode->i_flags |= S_NOQUOTA; fail_unlock: inode->i_nlink = 0; diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 4086fa59341..266699deb1c 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -133,7 +133,7 @@ static void jfs_destroy_inode(struct inode *inode) static void jfs_clear_inode(struct inode *inode) { - vfs_dq_drop(inode); + dquot_drop(inode); } static int jfs_statfs(struct dentry *dentry, struct kstatfs *buf) diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 13eb5d467c4..00eb6a095e6 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -1087,7 +1087,7 @@ void ocfs2_clear_inode(struct inode *inode) mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL, "Inode=%lu\n", inode->i_ino); - vfs_dq_drop(inode); + dquot_drop(inode); /* To preven remote deletes we hold open lock before, now it * is time to unlock PR and EX open locks. */ diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index b654bd103b6..4dca38f487c 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -852,7 +852,6 @@ static void ocfs2_destroy_dquot(struct dquot *dquot) const struct dquot_operations ocfs2_quota_operations = { .initialize = dquot_initialize, - .drop = dquot_drop, .write_dquot = ocfs2_write_dquot, .acquire_dquot = ocfs2_acquire_dquot, .release_dquot = ocfs2_release_dquot, diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 78ce4c48ad7..cd83c5b871b 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -1358,7 +1358,7 @@ EXPORT_SYMBOL(dquot_initialize); /* * Release all quotas referenced by inode */ -int dquot_drop(struct inode *inode) +static void __dquot_drop(struct inode *inode) { int cnt; struct dquot *put[MAXQUOTAS]; @@ -1370,32 +1370,31 @@ int dquot_drop(struct inode *inode) } up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); dqput_all(put); - return 0; } -EXPORT_SYMBOL(dquot_drop); -/* Wrapper to remove references to quota structures from inode */ -void vfs_dq_drop(struct inode *inode) -{ - /* Here we can get arbitrary inode from clear_inode() so we have - * to be careful. OTOH we don't need locking as quota operations - * are allowed to change only at mount time */ - if (!IS_NOQUOTA(inode) && inode->i_sb && inode->i_sb->dq_op - && inode->i_sb->dq_op->drop) { - int cnt; - /* Test before calling to rule out calls from proc and such - * where we are not allowed to block. 
Note that this is - * actually reliable test even without the lock - the caller - * must assure that nobody can come after the DQUOT_DROP and - * add quota pointers back anyway */ - for (cnt = 0; cnt < MAXQUOTAS; cnt++) - if (inode->i_dquot[cnt]) - break; - if (cnt < MAXQUOTAS) - inode->i_sb->dq_op->drop(inode); - } -} -EXPORT_SYMBOL(vfs_dq_drop); +void dquot_drop(struct inode *inode) +{ + int cnt; + + if (IS_NOQUOTA(inode)) + return; + + /* + * Test before calling to rule out calls from proc and such + * where we are not allowed to block. Note that this is + * actually reliable test even without the lock - the caller + * must assure that nobody can come after the DQUOT_DROP and + * add quota pointers back anyway. + */ + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (inode->i_dquot[cnt]) + break; + } + + if (cnt < MAXQUOTAS) + __dquot_drop(inode); +} +EXPORT_SYMBOL(dquot_drop); /* * inode_reserved_space is managed internally by quota, and protected by @@ -1812,7 +1811,6 @@ EXPORT_SYMBOL(dquot_commit_info); */ const struct dquot_operations dquot_operations = { .initialize = dquot_initialize, - .drop = dquot_drop, .write_dquot = dquot_commit, .acquire_dquot = dquot_acquire, .release_dquot = dquot_release, @@ -2029,7 +2027,7 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, * When S_NOQUOTA is set, remove dquot references as no more * references can be added */ - sb->dq_op->drop(inode); + __dquot_drop(inode); } error = -EIO; diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 99a5e5a8ab5..f07c3b69247 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -1964,7 +1964,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, out_end_trans: journal_end(th, th->t_super, th->t_blocks_allocated); /* Drop can be outside and it needs more credits so it's better to have it outside */ - vfs_dq_drop(inode); + dquot_drop(inode); inode->i_flags |= S_NOQUOTA; make_bad_inode(inode); diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 9d4dcf0b07c..9dea84e8a79 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -546,7 +546,7 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th, */ static int drop_new_inode(struct inode *inode) { - vfs_dq_drop(inode); + dquot_drop(inode); make_bad_inode(inode); inode->i_flags |= S_NOQUOTA; iput(inode); diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 6b24e70e329..34f7cd0cb02 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -580,7 +580,7 @@ out: static void reiserfs_clear_inode(struct inode *inode) { - vfs_dq_drop(inode); + dquot_drop(inode); } #ifdef CONFIG_QUOTA @@ -623,7 +623,6 @@ static int reiserfs_quota_on(struct super_block *, int, int, char *, int); static const struct dquot_operations reiserfs_quota_operations = { .initialize = dquot_initialize, - .drop = dquot_drop, .write_dquot = reiserfs_write_dquot, .acquire_dquot = reiserfs_acquire_dquot, .release_dquot = reiserfs_release_dquot, diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c index e1856b89c9c..15c6e992e58 100644 --- a/fs/udf/ialloc.c +++ b/fs/udf/ialloc.c @@ -37,7 +37,7 @@ void udf_free_inode(struct inode *inode) * as writing the quota to disk may need the lock as well. 
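/*
 * Illustrative sketch, not part of the patch: with the ->drop operation
 * removed, a filesystem's clear_inode hook calls dquot_drop() directly.
 * The quota core now performs the cheap IS_NOQUOTA and i_dquot[] checks
 * itself before taking dqptr_sem in __dquot_drop().  "examplefs" is a
 * hypothetical name; compare the ext2/ext4/jfs/reiserfs/ufs hunks in
 * this patch.
 */
static void examplefs_clear_inode(struct inode *inode)
{
	dquot_drop(inode);	/* was: vfs_dq_drop(inode) */
	/* ... release filesystem-private per-inode state ... */
}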
*/ dquot_free_inode(inode); - vfs_dq_drop(inode); + dquot_drop(inode); clear_inode(inode); @@ -156,7 +156,7 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err) vfs_dq_init(inode); ret = dquot_alloc_inode(inode); if (ret) { - vfs_dq_drop(inode); + dquot_drop(inode); inode->i_flags |= S_NOQUOTA; inode->i_nlink = 0; iput(inode); diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 859389a3832..1199e8e21ee 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -109,7 +109,7 @@ void udf_clear_inode(struct inode *inode) (unsigned long long)iinfo->i_lenExtents); } - vfs_dq_drop(inode); + dquot_drop(inode); kfree(iinfo->i_ext.i_data); iinfo->i_ext.i_data = NULL; } diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c index 02f77882c57..67b4bdb056f 100644 --- a/fs/ufs/ialloc.c +++ b/fs/ufs/ialloc.c @@ -96,7 +96,7 @@ void ufs_free_inode (struct inode * inode) is_directory = S_ISDIR(inode->i_mode); dquot_free_inode(inode); - vfs_dq_drop(inode); + dquot_drop(inode); clear_inode (inode); @@ -358,7 +358,7 @@ cg_found: vfs_dq_init(inode); err = dquot_alloc_inode(inode); if (err) { - vfs_dq_drop(inode); + dquot_drop(inode); goto fail_without_unlock; } diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 95d61cb3a5b..66b63a75161 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -1434,7 +1434,7 @@ static void destroy_inodecache(void) static void ufs_clear_inode(struct inode *inode) { - vfs_dq_drop(inode); + dquot_drop(inode); } #ifdef CONFIG_QUOTA diff --git a/include/linux/quota.h b/include/linux/quota.h index 422e6aa78ed..aec2e9dac2d 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -296,7 +296,6 @@ struct quota_format_ops { /* Operations working with dquots */ struct dquot_operations { int (*initialize) (struct inode *, int); - int (*drop) (struct inode *); int (*write_dquot) (struct dquot *); /* Ordinary dquot write */ struct dquot *(*alloc_dquot)(struct super_block *, int); /* Allocate memory for new dquot */ void (*destroy_dquot)(struct dquot *); /* Free memory for dquot */ diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index fa27b7218c8..a5ebd1abccd 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -24,7 +24,7 @@ void inode_claim_rsv_space(struct inode *inode, qsize_t number); void inode_sub_rsv_space(struct inode *inode, qsize_t number); int dquot_initialize(struct inode *inode, int type); -int dquot_drop(struct inode *inode); +void dquot_drop(struct inode *inode); struct dquot *dqget(struct super_block *sb, unsigned int id, int type); void dqput(struct dquot *dquot); int dquot_scan_active(struct super_block *sb, @@ -64,7 +64,6 @@ int vfs_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii); int vfs_get_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *di); int vfs_set_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *di); -void vfs_dq_drop(struct inode *inode); int dquot_transfer(struct inode *inode, struct iattr *iattr); int vfs_dq_quota_on_remount(struct super_block *sb); @@ -210,7 +209,7 @@ static inline void vfs_dq_init(struct inode *inode) { } -static inline void vfs_dq_drop(struct inode *inode) +static inline void dquot_drop(struct inode *inode) { } -- cgit v1.2.3 From 871a293155a24554e153538d36e3a80fa169aefb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 3 Mar 2010 09:05:07 -0500 Subject: dquot: cleanup dquot initialize routine Get rid of the initialize dquot operation - it is now always called from the filesystem and if a filesystem really needs it's own (which none 
currently does) it can just call into it's own routine directly. Rename the now static low-level dquot_initialize helper to __dquot_initialize and vfs_dq_init to dquot_initialize to have a consistent namespace. Signed-off-by: Christoph Hellwig Signed-off-by: Jan Kara --- Documentation/filesystems/Locking | 2 -- fs/ext2/file.c | 1 + fs/ext2/ialloc.c | 2 +- fs/ext2/inode.c | 4 ++-- fs/ext2/namei.c | 16 ++++++++-------- fs/ext3/file.c | 1 + fs/ext3/ialloc.c | 4 ++-- fs/ext3/inode.c | 6 +++--- fs/ext3/namei.c | 24 ++++++++++++------------ fs/ext3/super.c | 5 ++--- fs/ext4/file.c | 1 + fs/ext4/ialloc.c | 4 ++-- fs/ext4/inode.c | 4 ++-- fs/ext4/namei.c | 24 ++++++++++++------------ fs/ext4/super.c | 5 ++--- fs/jfs/file.c | 2 +- fs/jfs/inode.c | 4 ++-- fs/jfs/jfs_inode.c | 2 +- fs/jfs/namei.c | 24 ++++++++++++------------ fs/ocfs2/file.c | 4 ++-- fs/ocfs2/inode.c | 2 +- fs/ocfs2/namei.c | 14 +++++++------- fs/ocfs2/quota_global.c | 1 - fs/ocfs2/refcounttree.c | 2 +- fs/quota/dquot.c | 32 ++++++++++++++++++++------------ fs/reiserfs/inode.c | 6 +++--- fs/reiserfs/namei.c | 22 +++++++++++----------- fs/reiserfs/super.c | 3 +-- fs/udf/file.c | 2 +- fs/udf/ialloc.c | 2 +- fs/udf/inode.c | 2 +- fs/udf/namei.c | 18 +++++++++--------- fs/ufs/file.c | 1 + fs/ufs/ialloc.c | 2 +- fs/ufs/inode.c | 2 +- fs/ufs/namei.c | 16 ++++++++-------- fs/ufs/truncate.c | 2 +- include/linux/quota.h | 1 - include/linux/quotaops.h | 17 ++++------------- 39 files changed, 141 insertions(+), 145 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index fa10e4bf8e5..06bbbed7120 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -460,7 +460,6 @@ in sys_read() and friends. --------------------------- dquot_operations ------------------------------- prototypes: - int (*initialize) (struct inode *, int); int (*write_dquot) (struct dquot *); int (*acquire_dquot) (struct dquot *); int (*release_dquot) (struct dquot *); @@ -473,7 +472,6 @@ a proper locking wrt the filesystem and call the generic quota operations. 
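/*
 * Illustrative sketch, not part of the patch: the setattr pattern the
 * per-filesystem hunks below converge on once ->initialize and ->transfer
 * are gone.  "examplefs" is a hypothetical name; the quota calls mirror
 * the ext2/ext3/ext4/jfs/udf/ufs conversions in this series.
 */
static int examplefs_setattr(struct dentry *dentry, struct iattr *iattr)
{
	struct inode *inode = dentry->d_inode;
	int error;

	/* ... generic attribute validation elided ... */

	if (iattr->ia_valid & ATTR_SIZE)
		dquot_initialize(inode);	/* was vfs_dq_init(inode) */

	if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) ||
	    (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) {
		error = dquot_transfer(inode, iattr);	/* negative errno, not NO_QUOTA */
		if (error)
			return error;
	}

	return inode_setattr(inode, iattr);
}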
What filesystem should expect from the generic quota functions: FS recursion Held locks when called -initialize: yes maybe dqonoff_sem write_dquot: yes dqonoff_sem or dqptr_sem acquire_dquot: yes dqonoff_sem or dqptr_sem release_dquot: yes dqonoff_sem or dqptr_sem diff --git a/fs/ext2/file.c b/fs/ext2/file.c index d11f6e48451..5d198d0697f 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -20,6 +20,7 @@ #include #include +#include #include "ext2.h" #include "xattr.h" #include "acl.h" diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index 88b71972c62..ad7d572ee8d 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -586,7 +586,7 @@ got: goto fail_drop; } - vfs_dq_init(inode); + dquot_initialize(inode); err = dquot_alloc_inode(inode); if (err) goto fail_drop; diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index c87840c33e1..45ff49f0a4b 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -59,7 +59,7 @@ static inline int ext2_inode_is_fast_symlink(struct inode *inode) void ext2_delete_inode (struct inode * inode) { if (!is_bad_inode(inode)) - vfs_dq_init(inode); + dquot_initialize(inode); truncate_inode_pages(&inode->i_data, 0); if (is_bad_inode(inode)) @@ -1461,7 +1461,7 @@ int ext2_setattr(struct dentry *dentry, struct iattr *iattr) return error; if (iattr->ia_valid & ATTR_SIZE) - vfs_dq_init(inode); + dquot_initialize(inode); if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) || (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) { error = dquot_transfer(inode, iattr); diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index 5923df7b22a..71efb0e9a3f 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -102,7 +102,7 @@ static int ext2_create (struct inode * dir, struct dentry * dentry, int mode, st { struct inode *inode; - vfs_dq_init(dir); + dquot_initialize(dir); inode = ext2_new_inode(dir, mode); if (IS_ERR(inode)) @@ -131,7 +131,7 @@ static int ext2_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_ if (!new_valid_dev(rdev)) return -EINVAL; - vfs_dq_init(dir); + dquot_initialize(dir); inode = ext2_new_inode (dir, mode); err = PTR_ERR(inode); @@ -157,7 +157,7 @@ static int ext2_symlink (struct inode * dir, struct dentry * dentry, if (l > sb->s_blocksize) goto out; - vfs_dq_init(dir); + dquot_initialize(dir); inode = ext2_new_inode (dir, S_IFLNK | S_IRWXUGO); err = PTR_ERR(inode); @@ -202,7 +202,7 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir, if (inode->i_nlink >= EXT2_LINK_MAX) return -EMLINK; - vfs_dq_init(dir); + dquot_initialize(dir); inode->i_ctime = CURRENT_TIME_SEC; inode_inc_link_count(inode); @@ -226,7 +226,7 @@ static int ext2_mkdir(struct inode * dir, struct dentry * dentry, int mode) if (dir->i_nlink >= EXT2_LINK_MAX) goto out; - vfs_dq_init(dir); + dquot_initialize(dir); inode_inc_link_count(dir); @@ -274,7 +274,7 @@ static int ext2_unlink(struct inode * dir, struct dentry *dentry) struct page * page; int err = -ENOENT; - vfs_dq_init(dir); + dquot_initialize(dir); de = ext2_find_entry (dir, &dentry->d_name, &page); if (!de) @@ -318,8 +318,8 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry, struct ext2_dir_entry_2 * old_de; int err = -ENOENT; - vfs_dq_init(old_dir); - vfs_dq_init(new_dir); + dquot_initialize(old_dir); + dquot_initialize(new_dir); old_de = ext2_find_entry (old_dir, &old_dentry->d_name, &old_page); if (!old_de) diff --git a/fs/ext3/file.c b/fs/ext3/file.c index 3c7fb11a3b2..f55df0e61cb 100644 --- a/fs/ext3/file.c +++ b/fs/ext3/file.c @@ -21,6 +21,7 @@ #include 
#include #include +#include #include #include #include "xattr.h" diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c index 7d7238f9f6f..ef9008b885b 100644 --- a/fs/ext3/ialloc.c +++ b/fs/ext3/ialloc.c @@ -123,7 +123,7 @@ void ext3_free_inode (handle_t *handle, struct inode * inode) * Note: we must free any quota before locking the superblock, * as writing the quota to disk may need the lock as well. */ - vfs_dq_init(inode); + dquot_initialize(inode); ext3_xattr_delete_inode(handle, inode); dquot_free_inode(inode); dquot_drop(inode); @@ -588,7 +588,7 @@ got: sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE : 0; ret = inode; - vfs_dq_init(inode); + dquot_initialize(inode); err = dquot_alloc_inode(inode); if (err) goto fail_drop; diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index d7962b0c57b..ffbbc65e3f6 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -197,7 +197,7 @@ void ext3_delete_inode (struct inode * inode) handle_t *handle; if (!is_bad_inode(inode)) - vfs_dq_init(inode); + dquot_initialize(inode); truncate_inode_pages(&inode->i_data, 0); @@ -3152,7 +3152,7 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr) return error; if (ia_valid & ATTR_SIZE) - vfs_dq_init(inode); + dquot_initialize(inode); if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { handle_t *handle; @@ -3250,7 +3250,7 @@ static int ext3_writepage_trans_blocks(struct inode *inode) ret = 2 * (bpp + indirects) + 2; #ifdef CONFIG_QUOTA - /* We know that structure was already allocated during vfs_dq_init so + /* We know that structure was already allocated during dquot_initialize so * we will be updating only the data blocks + inodes */ ret += EXT3_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); #endif diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index a492b371b13..ee184084ca4 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -1696,7 +1696,7 @@ static int ext3_create (struct inode * dir, struct dentry * dentry, int mode, struct inode * inode; int err, retries = 0; - vfs_dq_init(dir); + dquot_initialize(dir); retry: handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + @@ -1732,7 +1732,7 @@ static int ext3_mknod (struct inode * dir, struct dentry *dentry, if (!new_valid_dev(rdev)) return -EINVAL; - vfs_dq_init(dir); + dquot_initialize(dir); retry: handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + @@ -1770,7 +1770,7 @@ static int ext3_mkdir(struct inode * dir, struct dentry * dentry, int mode) if (dir->i_nlink >= EXT3_LINK_MAX) return -EMLINK; - vfs_dq_init(dir); + dquot_initialize(dir); retry: handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + @@ -2066,8 +2066,8 @@ static int ext3_rmdir (struct inode * dir, struct dentry *dentry) /* Initialize quotas before so that eventual writes go in * separate transaction */ - vfs_dq_init(dir); - vfs_dq_init(dentry->d_inode); + dquot_initialize(dir); + dquot_initialize(dentry->d_inode); handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) @@ -2127,8 +2127,8 @@ static int ext3_unlink(struct inode * dir, struct dentry *dentry) /* Initialize quotas before so that eventual writes go * in separate transaction */ - vfs_dq_init(dir); - vfs_dq_init(dentry->d_inode); + dquot_initialize(dir); + dquot_initialize(dentry->d_inode); handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) @@ -2184,7 +2184,7 @@ static int ext3_symlink (struct inode * dir, if (l > dir->i_sb->s_blocksize) return 
-ENAMETOOLONG; - vfs_dq_init(dir); + dquot_initialize(dir); retry: handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + @@ -2241,7 +2241,7 @@ static int ext3_link (struct dentry * old_dentry, if (inode->i_nlink >= EXT3_LINK_MAX) return -EMLINK; - vfs_dq_init(dir); + dquot_initialize(dir); /* * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing @@ -2293,15 +2293,15 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry, struct ext3_dir_entry_2 * old_de, * new_de; int retval, flush_file = 0; - vfs_dq_init(old_dir); - vfs_dq_init(new_dir); + dquot_initialize(old_dir); + dquot_initialize(new_dir); old_bh = new_bh = dir_bh = NULL; /* Initialize quotas before so that eventual writes go * in separate transaction */ if (new_dentry->d_inode) - vfs_dq_init(new_dentry->d_inode); + dquot_initialize(new_dentry->d_inode); handle = ext3_journal_start(old_dir, 2 * EXT3_DATA_TRANS_BLOCKS(old_dir->i_sb) + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 2); diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 0163d0dae12..e844accbf55 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -752,7 +752,6 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type, const char *data, size_t len, loff_t off); static const struct dquot_operations ext3_quota_operations = { - .initialize = dquot_initialize, .write_dquot = ext3_write_dquot, .acquire_dquot = ext3_acquire_dquot, .release_dquot = ext3_release_dquot, @@ -1480,7 +1479,7 @@ static void ext3_orphan_cleanup (struct super_block * sb, } list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan); - vfs_dq_init(inode); + dquot_initialize(inode); if (inode->i_nlink) { printk(KERN_DEBUG "%s: truncating inode %lu to %Ld bytes\n", @@ -2736,7 +2735,7 @@ static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf) * Process 1 Process 2 * ext3_create() quota_sync() * journal_start() write_dquot() - * vfs_dq_init() down(dqio_mutex) + * dquot_initialize() down(dqio_mutex) * down(dqio_mutex) journal_start() * */ diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 85fa464a24a..a08a12998c4 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -23,6 +23,7 @@ #include #include #include +#include #include "ext4.h" #include "ext4_jbd2.h" #include "xattr.h" diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index ca8986e4b52..9bb2bb9f67a 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -217,7 +217,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) * Note: we must free any quota before locking the superblock, * as writing the quota to disk may need the lock as well. 
*/ - vfs_dq_init(inode); + dquot_initialize(inode); ext4_xattr_delete_inode(handle, inode); dquot_free_inode(inode); dquot_drop(inode); @@ -1034,7 +1034,7 @@ got: ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize; ret = inode; - vfs_dq_init(inode); + dquot_initialize(inode); err = dquot_alloc_inode(inode); if (err) goto fail_drop; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index eaa22ae9f1f..bec222ca9ba 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -171,7 +171,7 @@ void ext4_delete_inode(struct inode *inode) int err; if (!is_bad_inode(inode)) - vfs_dq_init(inode); + dquot_initialize(inode); if (ext4_should_order_data(inode)) ext4_begin_ordered_truncate(inode, 0); @@ -5255,7 +5255,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) return error; if (ia_valid & ATTR_SIZE) - vfs_dq_init(inode); + dquot_initialize(inode); if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { handle_t *handle; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 20f55c2e757..7f3d2d75a0d 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1766,7 +1766,7 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, int mode, struct inode *inode; int err, retries = 0; - vfs_dq_init(dir); + dquot_initialize(dir); retry: handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + @@ -1802,7 +1802,7 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry, if (!new_valid_dev(rdev)) return -EINVAL; - vfs_dq_init(dir); + dquot_initialize(dir); retry: handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + @@ -1841,7 +1841,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode) if (EXT4_DIR_LINK_MAX(dir)) return -EMLINK; - vfs_dq_init(dir); + dquot_initialize(dir); retry: handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + @@ -2142,8 +2142,8 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) /* Initialize quotas before so that eventual writes go in * separate transaction */ - vfs_dq_init(dir); - vfs_dq_init(dentry->d_inode); + dquot_initialize(dir); + dquot_initialize(dentry->d_inode); handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) @@ -2203,8 +2203,8 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) /* Initialize quotas before so that eventual writes go * in separate transaction */ - vfs_dq_init(dir); - vfs_dq_init(dentry->d_inode); + dquot_initialize(dir); + dquot_initialize(dentry->d_inode); handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) @@ -2260,7 +2260,7 @@ static int ext4_symlink(struct inode *dir, if (l > dir->i_sb->s_blocksize) return -ENAMETOOLONG; - vfs_dq_init(dir); + dquot_initialize(dir); retry: handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + @@ -2320,7 +2320,7 @@ static int ext4_link(struct dentry *old_dentry, if (inode->i_nlink >= EXT4_LINK_MAX) return -EMLINK; - vfs_dq_init(dir); + dquot_initialize(dir); /* * Return -ENOENT if we've raced with unlink and i_nlink is 0. 
Doing @@ -2372,15 +2372,15 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, struct ext4_dir_entry_2 *old_de, *new_de; int retval, force_da_alloc = 0; - vfs_dq_init(old_dir); - vfs_dq_init(new_dir); + dquot_initialize(old_dir); + dquot_initialize(new_dir); old_bh = new_bh = dir_bh = NULL; /* Initialize quotas before so that eventual writes go * in separate transaction */ if (new_dentry->d_inode) - vfs_dq_init(new_dentry->d_inode); + dquot_initialize(new_dentry->d_inode); handle = ext4_journal_start(old_dir, 2 * EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 035516c80df..edcf3b0239d 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1013,7 +1013,6 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, const char *data, size_t len, loff_t off); static const struct dquot_operations ext4_quota_operations = { - .initialize = dquot_initialize, #ifdef CONFIG_QUOTA .get_reserved_space = ext4_get_reserved_space, #endif @@ -1931,7 +1930,7 @@ static void ext4_orphan_cleanup(struct super_block *sb, } list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); - vfs_dq_init(inode); + dquot_initialize(inode); if (inode->i_nlink) { ext4_msg(sb, KERN_DEBUG, "%s: truncating inode %lu to %lld bytes", @@ -3700,7 +3699,7 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) * Process 1 Process 2 * ext4_create() quota_sync() * jbd2_journal_start() write_dquot() - * vfs_dq_init() down(dqio_mutex) + * dquot_initialize() down(dqio_mutex) * down(dqio_mutex) jbd2_journal_start() * */ diff --git a/fs/jfs/file.c b/fs/jfs/file.c index f19bb33eb1e..14ba982b3f2 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -99,7 +99,7 @@ int jfs_setattr(struct dentry *dentry, struct iattr *iattr) return rc; if (iattr->ia_valid & ATTR_SIZE) - vfs_dq_init(inode); + dquot_initialize(inode); if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) || (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) { rc = dquot_transfer(inode, iattr); diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 1aa2dda1659..c694a5f1538 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -147,7 +147,7 @@ void jfs_delete_inode(struct inode *inode) jfs_info("In jfs_delete_inode, inode = 0x%p", inode); if (!is_bad_inode(inode)) - vfs_dq_init(inode); + dquot_initialize(inode); if (!is_bad_inode(inode) && (JFS_IP(inode)->fileset == FILESYSTEM_I)) { @@ -161,7 +161,7 @@ void jfs_delete_inode(struct inode *inode) /* * Free the inode from the quota allocation. */ - vfs_dq_init(inode); + dquot_initialize(inode); dquot_free_inode(inode); dquot_drop(inode); } diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c index 72b30895422..829921b6776 100644 --- a/fs/jfs/jfs_inode.c +++ b/fs/jfs/jfs_inode.c @@ -116,7 +116,7 @@ struct inode *ialloc(struct inode *parent, umode_t mode) /* * Allocate inode to quota. 
*/ - vfs_dq_init(inode); + dquot_initialize(inode); rc = dquot_alloc_inode(inode); if (rc) goto fail_drop; diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index b7cc29da50b..4a3e9f39c21 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -85,7 +85,7 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, int mode, jfs_info("jfs_create: dip:0x%p name:%s", dip, dentry->d_name.name); - vfs_dq_init(dip); + dquot_initialize(dip); /* * search parent directory for entry/freespace @@ -217,7 +217,7 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode) jfs_info("jfs_mkdir: dip:0x%p name:%s", dip, dentry->d_name.name); - vfs_dq_init(dip); + dquot_initialize(dip); /* link count overflow on parent directory ? */ if (dip->i_nlink == JFS_LINK_MAX) { @@ -360,8 +360,8 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry) jfs_info("jfs_rmdir: dip:0x%p name:%s", dip, dentry->d_name.name); /* Init inode for quota operations. */ - vfs_dq_init(dip); - vfs_dq_init(ip); + dquot_initialize(dip); + dquot_initialize(ip); /* directory must be empty to be removed */ if (!dtEmpty(ip)) { @@ -488,8 +488,8 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry) jfs_info("jfs_unlink: dip:0x%p name:%s", dip, dentry->d_name.name); /* Init inode for quota operations. */ - vfs_dq_init(dip); - vfs_dq_init(ip); + dquot_initialize(dip); + dquot_initialize(ip); if ((rc = get_UCSname(&dname, dentry))) goto out; @@ -811,7 +811,7 @@ static int jfs_link(struct dentry *old_dentry, if (ip->i_nlink == 0) return -ENOENT; - vfs_dq_init(dir); + dquot_initialize(dir); tid = txBegin(ip->i_sb, 0); @@ -904,7 +904,7 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry, jfs_info("jfs_symlink: dip:0x%p name:%s", dip, name); - vfs_dq_init(dip); + dquot_initialize(dip); ssize = strlen(name) + 1; @@ -1097,8 +1097,8 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, jfs_info("jfs_rename: %s %s", old_dentry->d_name.name, new_dentry->d_name.name); - vfs_dq_init(old_dir); - vfs_dq_init(new_dir); + dquot_initialize(old_dir); + dquot_initialize(new_dir); old_ip = old_dentry->d_inode; new_ip = new_dentry->d_inode; @@ -1149,7 +1149,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, } else if (new_ip) { IWRITE_LOCK(new_ip, RDWRLOCK_NORMAL); /* Init inode for quota operations. 
*/ - vfs_dq_init(new_ip); + dquot_initialize(new_ip); } /* @@ -1373,7 +1373,7 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry, jfs_info("jfs_mknod: %s", dentry->d_name.name); - vfs_dq_init(dir); + dquot_initialize(dir); if ((rc = get_UCSname(&dname, dentry))) goto out; diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 126198f5a67..36410529128 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -108,7 +108,7 @@ static int ocfs2_file_open(struct inode *inode, struct file *file) file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name); if (file->f_mode & FMODE_WRITE) - vfs_dq_init(inode); + dquot_initialize(inode); spin_lock(&oi->ip_lock); @@ -980,7 +980,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE; if (size_change) { - vfs_dq_init(inode); + dquot_initialize(inode); status = ocfs2_rw_lock(inode, 1); if (status < 0) { diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 77681a690d1..278a223aae1 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -971,7 +971,7 @@ void ocfs2_delete_inode(struct inode *inode) goto bail; } - vfs_dq_init(inode); + dquot_initialize(inode); if (!ocfs2_inode_is_valid_to_delete(inode)) { /* It's probably not necessary to truncate_inode_pages diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 8b5b142eb63..d9cd4e373a5 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -212,7 +212,7 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, int mode) } else inode->i_gid = current_fsgid(); inode->i_mode = mode; - vfs_dq_init(inode); + dquot_initialize(inode); return inode; } @@ -244,7 +244,7 @@ static int ocfs2_mknod(struct inode *dir, (unsigned long)dev, dentry->d_name.len, dentry->d_name.name); - vfs_dq_init(dir); + dquot_initialize(dir); /* get our super block */ osb = OCFS2_SB(dir->i_sb); @@ -634,7 +634,7 @@ static int ocfs2_link(struct dentry *old_dentry, if (S_ISDIR(inode->i_mode)) return -EPERM; - vfs_dq_init(dir); + dquot_initialize(dir); err = ocfs2_inode_lock_nested(dir, &parent_fe_bh, 1, OI_LS_PARENT); if (err < 0) { @@ -791,7 +791,7 @@ static int ocfs2_unlink(struct inode *dir, mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry, dentry->d_name.len, dentry->d_name.name); - vfs_dq_init(dir); + dquot_initialize(dir); BUG_ON(dentry->d_parent->d_inode != dir); @@ -1053,8 +1053,8 @@ static int ocfs2_rename(struct inode *old_dir, old_dentry->d_name.len, old_dentry->d_name.name, new_dentry->d_name.len, new_dentry->d_name.name); - vfs_dq_init(old_dir); - vfs_dq_init(new_dir); + dquot_initialize(old_dir); + dquot_initialize(new_dir); osb = OCFS2_SB(old_dir->i_sb); @@ -1604,7 +1604,7 @@ static int ocfs2_symlink(struct inode *dir, mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir, dentry, symname, dentry->d_name.len, dentry->d_name.name); - vfs_dq_init(dir); + dquot_initialize(dir); sb = dir->i_sb; osb = OCFS2_SB(sb); diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index 4dca38f487c..355f41d1d52 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -851,7 +851,6 @@ static void ocfs2_destroy_dquot(struct dquot *dquot) } const struct dquot_operations ocfs2_quota_operations = { - .initialize = dquot_initialize, .write_dquot = ocfs2_write_dquot, .acquire_dquot = ocfs2_acquire_dquot, .release_dquot = ocfs2_release_dquot, diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 8ae65c9c020..f3ae10cde84 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4390,7 +4390,7 @@ 
static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir, } mutex_lock(&inode->i_mutex); - vfs_dq_init(dir); + dquot_initialize(dir); error = ocfs2_reflink(old_dentry, dir, new_dentry, preserve); mutex_unlock(&inode->i_mutex); if (!error) diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 6244bca45c9..3c0a7e0dff7 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -230,6 +230,7 @@ struct dqstats dqstats; EXPORT_SYMBOL(dqstats); static qsize_t inode_get_rsv_space(struct inode *inode); +static void __dquot_initialize(struct inode *inode, int type); static inline unsigned int hashfn(const struct super_block *sb, unsigned int id, int type) @@ -890,7 +891,7 @@ static void add_dquot_ref(struct super_block *sb, int type) spin_unlock(&inode_lock); iput(old_inode); - sb->dq_op->initialize(inode, type); + __dquot_initialize(inode, type); /* We hold a reference to 'inode' so it couldn't have been * removed from s_inodes list while we dropped the inode_lock. * We cannot iput the inode now as we can be holding the last @@ -1293,22 +1294,26 @@ static int info_bdq_free(struct dquot *dquot, qsize_t space) } /* - * Initialize quota pointers in inode - * We do things in a bit complicated way but by that we avoid calling - * dqget() and thus filesystem callbacks under dqptr_sem. + * Initialize quota pointers in inode + * + * We do things in a bit complicated way but by that we avoid calling + * dqget() and thus filesystem callbacks under dqptr_sem. + * + * It is better to call this function outside of any transaction as it + * might need a lot of space in journal for dquot structure allocation. */ -int dquot_initialize(struct inode *inode, int type) +static void __dquot_initialize(struct inode *inode, int type) { unsigned int id = 0; - int cnt, ret = 0; + int cnt; struct dquot *got[MAXQUOTAS]; struct super_block *sb = inode->i_sb; qsize_t rsv; /* First test before acquiring mutex - solves deadlocks when we * re-enter the quota code and are already holding the mutex */ - if (IS_NOQUOTA(inode)) - return 0; + if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) + return; /* First get references to structures we might need. */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { @@ -1351,7 +1356,11 @@ out_err: up_write(&sb_dqopt(sb)->dqptr_sem); /* Drop unused references */ dqput_all(got); - return ret; +} + +void dquot_initialize(struct inode *inode) +{ + __dquot_initialize(inode, -1); } EXPORT_SYMBOL(dquot_initialize); @@ -1783,7 +1792,7 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) chid[GRPQUOTA] = iattr->ia_gid; } if (sb_any_quota_active(inode->i_sb) && !IS_NOQUOTA(inode)) { - vfs_dq_init(inode); + dquot_initialize(inode); if (__dquot_transfer(inode, chid, mask) == NO_QUOTA) return -EDQUOT; } @@ -1810,7 +1819,6 @@ EXPORT_SYMBOL(dquot_commit_info); * Definitions of diskquota operations. 
*/ const struct dquot_operations dquot_operations = { - .initialize = dquot_initialize, .write_dquot = dquot_commit, .acquire_dquot = dquot_acquire, .release_dquot = dquot_release, @@ -1829,7 +1837,7 @@ int dquot_file_open(struct inode *inode, struct file *file) error = generic_file_open(inode, file); if (!error && (file->f_mode & FMODE_WRITE)) - vfs_dq_init(inode); + dquot_initialize(inode); return error; } EXPORT_SYMBOL(dquot_file_open); diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 06995cb48e3..b8671a54e8e 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -35,7 +35,7 @@ void reiserfs_delete_inode(struct inode *inode) int err; if (!is_bad_inode(inode)) - vfs_dq_init(inode); + dquot_initialize(inode); truncate_inode_pages(&inode->i_data, 0); @@ -1768,7 +1768,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, BUG_ON(!th->t_trans_id); - vfs_dq_init(inode); + dquot_initialize(inode); err = dquot_alloc_inode(inode); if (err) goto out_end_trans; @@ -3076,7 +3076,7 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) depth = reiserfs_write_lock_once(inode->i_sb); if (attr->ia_valid & ATTR_SIZE) { - vfs_dq_init(inode); + dquot_initialize(inode); /* version 2 items will be caught by the s_maxbytes check ** done for us in vmtruncate diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index c55e1b9fee5..96e4cbbfaa1 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -554,7 +554,7 @@ static int drop_new_inode(struct inode *inode) } /* utility function that does setup for reiserfs_new_inode. -** vfs_dq_init needs lots of credits so it's better to have it +** dquot_initialize needs lots of credits so it's better to have it ** outside of a transaction, so we had to pull some bits of ** reiserfs_new_inode out into this func. 
*/ @@ -577,7 +577,7 @@ static int new_inode_init(struct inode *inode, struct inode *dir, int mode) } else { inode->i_gid = current_fsgid(); } - vfs_dq_init(inode); + dquot_initialize(inode); return 0; } @@ -594,7 +594,7 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, int mode, struct reiserfs_transaction_handle th; struct reiserfs_security_handle security; - vfs_dq_init(dir); + dquot_initialize(dir); if (!(inode = new_inode(dir->i_sb))) { return -ENOMEM; @@ -668,7 +668,7 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, int mode, if (!new_valid_dev(rdev)) return -EINVAL; - vfs_dq_init(dir); + dquot_initialize(dir); if (!(inode = new_inode(dir->i_sb))) { return -ENOMEM; @@ -743,7 +743,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) 2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) + REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb)); - vfs_dq_init(dir); + dquot_initialize(dir); #ifdef DISPLACE_NEW_PACKING_LOCALITIES /* set flag that new packing locality created and new blocks for the content * of that directory are not displaced yet */ @@ -848,7 +848,7 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry) JOURNAL_PER_BALANCE_CNT * 2 + 2 + 4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb); - vfs_dq_init(dir); + dquot_initialize(dir); reiserfs_write_lock(dir->i_sb); retval = journal_begin(&th, dir->i_sb, jbegin_count); @@ -931,7 +931,7 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry) unsigned long savelink; int depth; - vfs_dq_init(dir); + dquot_initialize(dir); inode = dentry->d_inode; @@ -1034,7 +1034,7 @@ static int reiserfs_symlink(struct inode *parent_dir, 2 * (REISERFS_QUOTA_INIT_BLOCKS(parent_dir->i_sb) + REISERFS_QUOTA_TRANS_BLOCKS(parent_dir->i_sb)); - vfs_dq_init(parent_dir); + dquot_initialize(parent_dir); if (!(inode = new_inode(parent_dir->i_sb))) { return -ENOMEM; @@ -1123,7 +1123,7 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir, JOURNAL_PER_BALANCE_CNT * 3 + 2 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb); - vfs_dq_init(dir); + dquot_initialize(dir); reiserfs_write_lock(dir->i_sb); if (inode->i_nlink >= REISERFS_LINK_MAX) { @@ -1249,8 +1249,8 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry, JOURNAL_PER_BALANCE_CNT * 3 + 5 + 4 * REISERFS_QUOTA_TRANS_BLOCKS(old_dir->i_sb); - vfs_dq_init(old_dir); - vfs_dq_init(new_dir); + dquot_initialize(old_dir); + dquot_initialize(new_dir); old_inode = old_dentry->d_inode; new_dentry_inode = new_dentry->d_inode; diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 34f7cd0cb02..04bf5d791bd 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -246,7 +246,7 @@ static int finish_unfinished(struct super_block *s) retval = remove_save_link_only(s, &save_link_key, 0); continue; } - vfs_dq_init(inode); + dquot_initialize(inode); if (truncate && S_ISDIR(inode->i_mode)) { /* We got a truncate request for a dir which is impossible. 
@@ -622,7 +622,6 @@ static int reiserfs_write_info(struct super_block *, int); static int reiserfs_quota_on(struct super_block *, int, int, char *, int); static const struct dquot_operations reiserfs_quota_operations = { - .initialize = dquot_initialize, .write_dquot = reiserfs_write_dquot, .acquire_dquot = reiserfs_acquire_dquot, .release_dquot = reiserfs_release_dquot, diff --git a/fs/udf/file.c b/fs/udf/file.c index 013fa44d9a5..1eb06774ed9 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -228,7 +228,7 @@ static int udf_setattr(struct dentry *dentry, struct iattr *iattr) return error; if (iattr->ia_valid & ATTR_SIZE) - vfs_dq_init(inode); + dquot_initialize(inode); if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) || (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) { diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c index 15c6e992e58..fb68c9cd0c3 100644 --- a/fs/udf/ialloc.c +++ b/fs/udf/ialloc.c @@ -153,7 +153,7 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err) insert_inode_hash(inode); mark_inode_dirty(inode); - vfs_dq_init(inode); + dquot_initialize(inode); ret = dquot_alloc_inode(inode); if (ret) { dquot_drop(inode); diff --git a/fs/udf/inode.c b/fs/udf/inode.c index f1952026840..c7da1a32b36 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -72,7 +72,7 @@ static int udf_get_block(struct inode *, sector_t, struct buffer_head *, int); void udf_delete_inode(struct inode *inode) { if (!is_bad_inode(inode)) - vfs_dq_init(inode); + dquot_initialize(inode); truncate_inode_pages(&inode->i_data, 0); diff --git a/fs/udf/namei.c b/fs/udf/namei.c index e360c3fc4ae..96757e3e3e0 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -563,7 +563,7 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode, int err; struct udf_inode_info *iinfo; - vfs_dq_init(dir); + dquot_initialize(dir); lock_kernel(); inode = udf_new_inode(dir, mode, &err); @@ -618,7 +618,7 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode, if (!old_valid_dev(rdev)) return -EINVAL; - vfs_dq_init(dir); + dquot_initialize(dir); lock_kernel(); err = -EIO; @@ -666,7 +666,7 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode) struct udf_inode_info *dinfo = UDF_I(dir); struct udf_inode_info *iinfo; - vfs_dq_init(dir); + dquot_initialize(dir); lock_kernel(); err = -EMLINK; @@ -805,7 +805,7 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry) struct fileIdentDesc *fi, cfi; struct kernel_lb_addr tloc; - vfs_dq_init(dir); + dquot_initialize(dir); retval = -ENOENT; lock_kernel(); @@ -853,7 +853,7 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry) struct fileIdentDesc cfi; struct kernel_lb_addr tloc; - vfs_dq_init(dir); + dquot_initialize(dir); retval = -ENOENT; lock_kernel(); @@ -909,7 +909,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, struct buffer_head *bh; struct udf_inode_info *iinfo; - vfs_dq_init(dir); + dquot_initialize(dir); lock_kernel(); inode = udf_new_inode(dir, S_IFLNK, &err); @@ -1081,7 +1081,7 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir, int err; struct buffer_head *bh; - vfs_dq_init(dir); + dquot_initialize(dir); lock_kernel(); if (inode->i_nlink >= (256 << sizeof(inode->i_nlink)) - 1) { @@ -1145,8 +1145,8 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry, struct kernel_lb_addr tloc; struct udf_inode_info *old_iinfo = UDF_I(old_inode); - vfs_dq_init(old_dir); - vfs_dq_init(new_dir); + dquot_initialize(old_dir); + 
dquot_initialize(new_dir); lock_kernel(); ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi); diff --git a/fs/ufs/file.c b/fs/ufs/file.c index d84762f3028..a8962cecde5 100644 --- a/fs/ufs/file.c +++ b/fs/ufs/file.c @@ -24,6 +24,7 @@ */ #include +#include #include "ufs_fs.h" #include "ufs.h" diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c index 67b4bdb056f..230ecf60802 100644 --- a/fs/ufs/ialloc.c +++ b/fs/ufs/ialloc.c @@ -355,7 +355,7 @@ cg_found: unlock_super (sb); - vfs_dq_init(inode); + dquot_initialize(inode); err = dquot_alloc_inode(inode); if (err) { dquot_drop(inode); diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index fff8edab382..09aef49beed 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -910,7 +910,7 @@ void ufs_delete_inode (struct inode * inode) loff_t old_i_size; if (!is_bad_inode(inode)) - vfs_dq_init(inode); + dquot_initialize(inode); truncate_inode_pages(&inode->i_data, 0); if (is_bad_inode(inode)) diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index c33cb90c516..118556243e7 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -86,7 +86,7 @@ static int ufs_create (struct inode * dir, struct dentry * dentry, int mode, UFSD("BEGIN\n"); - vfs_dq_init(dir); + dquot_initialize(dir); inode = ufs_new_inode(dir, mode); err = PTR_ERR(inode); @@ -112,7 +112,7 @@ static int ufs_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_t if (!old_valid_dev(rdev)) return -EINVAL; - vfs_dq_init(dir); + dquot_initialize(dir); inode = ufs_new_inode(dir, mode); err = PTR_ERR(inode); @@ -138,7 +138,7 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry, if (l > sb->s_blocksize) goto out_notlocked; - vfs_dq_init(dir); + dquot_initialize(dir); lock_kernel(); inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO); @@ -185,7 +185,7 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir, return -EMLINK; } - vfs_dq_init(dir); + dquot_initialize(dir); inode->i_ctime = CURRENT_TIME_SEC; inode_inc_link_count(inode); @@ -204,7 +204,7 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, int mode) if (dir->i_nlink >= UFS_LINK_MAX) goto out; - vfs_dq_init(dir); + dquot_initialize(dir); lock_kernel(); inode_inc_link_count(dir); @@ -250,7 +250,7 @@ static int ufs_unlink(struct inode *dir, struct dentry *dentry) struct page *page; int err = -ENOENT; - vfs_dq_init(dir); + dquot_initialize(dir); de = ufs_find_entry(dir, &dentry->d_name, &page); if (!de) @@ -296,8 +296,8 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry, struct ufs_dir_entry *old_de; int err = -ENOENT; - vfs_dq_init(old_dir); - vfs_dq_init(new_dir); + dquot_initialize(old_dir); + dquot_initialize(new_dir); old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_page); if (!old_de) diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c index e5ef8a3ec23..d3b6270cb37 100644 --- a/fs/ufs/truncate.c +++ b/fs/ufs/truncate.c @@ -528,7 +528,7 @@ static int ufs_setattr(struct dentry *dentry, struct iattr *attr) attr->ia_size != i_size_read(inode)) { loff_t old_i_size = inode->i_size; - vfs_dq_init(inode); + dquot_initialize(inode); error = vmtruncate(inode, attr->ia_size); if (error) diff --git a/include/linux/quota.h b/include/linux/quota.h index aec2e9dac2d..4aa93554f0e 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -295,7 +295,6 @@ struct quota_format_ops { /* Operations working with dquots */ struct dquot_operations { - int (*initialize) (struct inode *, int); int (*write_dquot) (struct dquot *); /* Ordinary dquot write */ struct dquot 
*(*alloc_dquot)(struct super_block *, int); /* Allocate memory for new dquot */ void (*destroy_dquot)(struct dquot *); /* Free memory for dquot */ diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index 93ac788345e..e6fa7acce29 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -23,7 +23,7 @@ void inode_add_rsv_space(struct inode *inode, qsize_t number); void inode_claim_rsv_space(struct inode *inode, qsize_t number); void inode_sub_rsv_space(struct inode *inode, qsize_t number); -int dquot_initialize(struct inode *inode, int type); +void dquot_initialize(struct inode *inode); void dquot_drop(struct inode *inode); struct dquot *dqget(struct super_block *sb, unsigned int id, int type); void dqput(struct dquot *dquot); @@ -139,15 +139,6 @@ extern const struct quotactl_ops vfs_quotactl_ops; #define sb_dquot_ops (&dquot_operations) #define sb_quotactl_ops (&vfs_quotactl_ops) -/* It is better to call this function outside of any transaction as it might - * need a lot of space in journal for dquot structure allocation. */ -static inline void vfs_dq_init(struct inode *inode) -{ - BUG_ON(!inode->i_sb); - if (sb_any_quota_active(inode->i_sb) && !IS_NOQUOTA(inode)) - inode->i_sb->dq_op->initialize(inode, -1); -} - /* Cannot be called inside a transaction */ static inline int vfs_dq_off(struct super_block *sb, int remount) { @@ -207,7 +198,7 @@ static inline int sb_any_quota_active(struct super_block *sb) #define sb_dquot_ops (NULL) #define sb_quotactl_ops (NULL) -static inline void vfs_dq_init(struct inode *inode) +static inline void dquot_initialize(struct inode *inode) { } @@ -260,6 +251,8 @@ static inline int dquot_claim_space_nodirty(struct inode *inode, qsize_t number) return 0; } +#define dquot_file_open generic_file_open + #endif /* CONFIG_QUOTA */ static inline int dquot_alloc_space_nodirty(struct inode *inode, qsize_t nr) @@ -344,6 +337,4 @@ static inline void dquot_release_reservation_block(struct inode *inode, __dquot_free_space(inode, nr << inode->i_blkbits, 1); } -#define dquot_file_open generic_file_open - #endif /* _LINUX_QUOTAOPS_ */ -- cgit v1.2.3 From 34e55232e59f7b19050267a05ff1226e5cd122a5 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Fri, 5 Mar 2010 13:41:40 -0800 Subject: mm: avoid false sharing of mm_counter Considering the nature of per mm stats, it's the shared object among threads and can be a cache-miss point in the page fault path. This patch adds per-thread cache for mm_counter. RSS value will be counted into a struct in task_struct and synchronized with mm's one at events. Now, in this patch, the event is the number of calls to handle_mm_fault. Per-thread value is added to mm at each 64 calls. rough estimation with small benchmark on parallel thread (2threads) shows [before] 4.5 cache-miss/faults [after] 4.0 cache-miss/faults Anyway, the most contended object is mmap_sem if the number of threads grows. 
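The batching scheme described above is a generic pattern: each thread accumulates deltas privately and publishes them to the shared counter only after a threshold of events. The user-space C sketch below is purely illustrative of that pattern under simplified assumptions (a single counter, invented names, a threshold of 64); it is not the kernel code, which follows in the patch itself.

    /*
     * Illustrative user-space sketch of split counting (not the kernel code).
     * Each thread batches updates locally and publishes them to the shared
     * atomic counter only every SYNC_THRESHOLD events, so the shared cache
     * line is touched far less often.  Readers may lag by one unflushed batch.
     */
    #include <stdatomic.h>
    #include <stdio.h>

    #define SYNC_THRESHOLD 64

    static atomic_long shared_counter;          /* analogous to mm->rss_stat */
    static _Thread_local long local_delta;      /* analogous to task->rss_stat */
    static _Thread_local int local_events;

    static void counter_add_fast(long delta)
    {
            local_delta += delta;               /* fast path: thread-local only */
            if (++local_events > SYNC_THRESHOLD) {
                    /* slow path: one atomic RMW publishes the whole batch */
                    atomic_fetch_add(&shared_counter, local_delta);
                    local_delta = 0;
                    local_events = 0;
            }
    }

    int main(void)
    {
            for (int i = 0; i < 1000; i++)
                    counter_add_fast(1);
            printf("published %ld, still cached locally %ld\n",
                   (long)atomic_load(&shared_counter), local_delta);
            return 0;
    }

The unflushed local batch is exactly why a reader of the shared value may see a slightly stale number, as the /proc documentation added below points out.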
[akpm@linux-foundation.org: coding-style fixes] Signed-off-by: KAMEZAWA Hiroyuki Cc: Minchan Kim Cc: Christoph Lameter Cc: Lee Schermerhorn Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 6 +++ fs/exec.c | 1 + include/linux/mm.h | 8 ++-- include/linux/mm_types.h | 6 +++ include/linux/sched.h | 4 +- kernel/exit.c | 3 +- mm/memory.c | 94 ++++++++++++++++++++++++++++++++++---- 7 files changed, 107 insertions(+), 15 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 0d07513a67a..e418f3d8f42 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -188,6 +188,12 @@ memory usage. Its seven fields are explained in Table 1-3. The stat file contains details information about the process itself. Its fields are explained in Table 1-4. +(for SMP CONFIG users) +For making accounting scalable, RSS related information are handled in +asynchronous manner and the vaule may not be very precise. To see a precise +snapshot of a moment, you can see /proc//smaps file and scan page table. +It's slow but very precise. + Table 1-2: Contents of the statm files (as of 2.6.30-rc7) .............................................................................. Field Content diff --git a/fs/exec.c b/fs/exec.c index cce6bbdbdbb..ea7861727ef 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -718,6 +718,7 @@ static int exec_mmap(struct mm_struct *mm) /* Notify parent that we're no longer interested in the old VM */ tsk = current; old_mm = current->mm; + sync_mm_rss(tsk, old_mm); mm_release(tsk, old_mm); if (old_mm) { diff --git a/include/linux/mm.h b/include/linux/mm.h index 2124cdb2d1d..8e580c07d17 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -873,7 +873,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, /* * per-process(per-mm_struct) statistics. */ -#if USE_SPLIT_PTLOCKS +#if defined(SPLIT_RSS_COUNTING) /* * The mm counters are not protected by its page_table_lock, * so must be incremented atomically. @@ -883,10 +883,7 @@ static inline void set_mm_counter(struct mm_struct *mm, int member, long value) atomic_long_set(&mm->rss_stat.count[member], value); } -static inline unsigned long get_mm_counter(struct mm_struct *mm, int member) -{ - return (unsigned long)atomic_long_read(&mm->rss_stat.count[member]); -} +unsigned long get_mm_counter(struct mm_struct *mm, int member); static inline void add_mm_counter(struct mm_struct *mm, int member, long value) { @@ -974,6 +971,7 @@ static inline void setmax_mm_hiwater_rss(unsigned long *maxrss, *maxrss = hiwater_rss; } +void sync_mm_rss(struct task_struct *task, struct mm_struct *mm); /* * A callback you can register to apply pressure to ageable caches. 
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index e1ca64be667..21861239ab0 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -202,9 +202,15 @@ enum { }; #if USE_SPLIT_PTLOCKS +#define SPLIT_RSS_COUNTING struct mm_rss_stat { atomic_long_t count[NR_MM_COUNTERS]; }; +/* per-thread cached information, */ +struct task_rss_stat { + int events; /* for synchronization threshold */ + int count[NR_MM_COUNTERS]; +}; #else /* !USE_SPLIT_PTLOCKS */ struct mm_rss_stat { unsigned long count[NR_MM_COUNTERS]; diff --git a/include/linux/sched.h b/include/linux/sched.h index cbeafa49a53..46c6f8d5dc0 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1220,7 +1220,9 @@ struct task_struct { struct plist_node pushable_tasks; struct mm_struct *mm, *active_mm; - +#if defined(SPLIT_RSS_COUNTING) + struct task_rss_stat rss_stat; +#endif /* task state */ int exit_state; int exit_code, exit_signal; diff --git a/kernel/exit.c b/kernel/exit.c index 45ed043b8bf..10d3c5d5ae4 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -952,7 +952,8 @@ NORET_TYPE void do_exit(long code) preempt_count()); acct_update_integrals(tsk); - + /* sync mm's RSS info before statistics gathering */ + sync_mm_rss(tsk, tsk->mm); group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { hrtimer_cancel(&tsk->signal->real_timer); diff --git a/mm/memory.c b/mm/memory.c index c5767847880..a4597614f18 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -122,6 +122,79 @@ static int __init init_zero_pfn(void) core_initcall(init_zero_pfn); +#if defined(SPLIT_RSS_COUNTING) + +void __sync_task_rss_stat(struct task_struct *task, struct mm_struct *mm) +{ + int i; + + for (i = 0; i < NR_MM_COUNTERS; i++) { + if (task->rss_stat.count[i]) { + add_mm_counter(mm, i, task->rss_stat.count[i]); + task->rss_stat.count[i] = 0; + } + } + task->rss_stat.events = 0; +} + +static void add_mm_counter_fast(struct mm_struct *mm, int member, int val) +{ + struct task_struct *task = current; + + if (likely(task->mm == mm)) + task->rss_stat.count[member] += val; + else + add_mm_counter(mm, member, val); +} +#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1) +#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1) + +/* sync counter once per 64 page faults */ +#define TASK_RSS_EVENTS_THRESH (64) +static void check_sync_rss_stat(struct task_struct *task) +{ + if (unlikely(task != current)) + return; + if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH)) + __sync_task_rss_stat(task, task->mm); +} + +unsigned long get_mm_counter(struct mm_struct *mm, int member) +{ + long val = 0; + + /* + * Don't use task->mm here...for avoiding to use task_get_mm().. + * The caller must guarantee task->mm is not invalid. + */ + val = atomic_long_read(&mm->rss_stat.count[member]); + /* + * counter is updated in asynchronous manner and may go to minus. + * But it's never be expected number for users. + */ + if (val < 0) + return 0; + return (unsigned long)val; +} + +void sync_mm_rss(struct task_struct *task, struct mm_struct *mm) +{ + __sync_task_rss_stat(task, mm); +} +#else + +#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member) +#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member) + +static void check_sync_rss_stat(struct task_struct *task) +{ +} + +void sync_mm_rss(struct task_struct *task, struct mm_struct *mm) +{ +} +#endif + /* * If a p?d_bad entry is found while walking page tables, report * the error, before resetting entry to p?d_none. 
Usually (but @@ -386,6 +459,8 @@ static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss) { int i; + if (current->mm == mm) + sync_mm_rss(current, mm); for (i = 0; i < NR_MM_COUNTERS; i++) if (rss[i]) add_mm_counter(mm, i, rss[i]); @@ -1539,7 +1614,7 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr, /* Ok, finally just insert the thing.. */ get_page(page); - inc_mm_counter(mm, MM_FILEPAGES); + inc_mm_counter_fast(mm, MM_FILEPAGES); page_add_file_rmap(page); set_pte_at(mm, addr, pte, mk_pte(page, prot)); @@ -2175,11 +2250,11 @@ gotten: if (likely(pte_same(*page_table, orig_pte))) { if (old_page) { if (!PageAnon(old_page)) { - dec_mm_counter(mm, MM_FILEPAGES); - inc_mm_counter(mm, MM_ANONPAGES); + dec_mm_counter_fast(mm, MM_FILEPAGES); + inc_mm_counter_fast(mm, MM_ANONPAGES); } } else - inc_mm_counter(mm, MM_ANONPAGES); + inc_mm_counter_fast(mm, MM_ANONPAGES); flush_cache_page(vma, address, pte_pfn(orig_pte)); entry = mk_pte(new_page, vma->vm_page_prot); entry = maybe_mkwrite(pte_mkdirty(entry), vma); @@ -2616,7 +2691,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, * discarded at swap_free(). */ - inc_mm_counter(mm, MM_ANONPAGES); + inc_mm_counter_fast(mm, MM_ANONPAGES); pte = mk_pte(page, vma->vm_page_prot); if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); @@ -2700,7 +2775,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, if (!pte_none(*page_table)) goto release; - inc_mm_counter(mm, MM_ANONPAGES); + inc_mm_counter_fast(mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, address); setpte: set_pte_at(mm, address, page_table, entry); @@ -2854,10 +2929,10 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (flags & FAULT_FLAG_WRITE) entry = maybe_mkwrite(pte_mkdirty(entry), vma); if (anon) { - inc_mm_counter(mm, MM_ANONPAGES); + inc_mm_counter_fast(mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, address); } else { - inc_mm_counter(mm, MM_FILEPAGES); + inc_mm_counter_fast(mm, MM_FILEPAGES); page_add_file_rmap(page); if (flags & FAULT_FLAG_WRITE) { dirty_page = page; @@ -3035,6 +3110,9 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, count_vm_event(PGFAULT); + /* do counter updates before entering really critical section. */ + check_sync_rss_stat(current); + if (unlikely(is_vm_hugetlb_page(vma))) return hugetlb_fault(mm, vma, address, flags); -- cgit v1.2.3 From b084d4353ff99d824d3bc5a5c2c22c70b1fba722 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Fri, 5 Mar 2010 13:41:42 -0800 Subject: mm: count swap usage A frequent questions from users about memory management is what numbers of swap ents are user for processes. And this information will give some hints to oom-killer. Besides we can count the number of swapents per a process by scanning /proc//smaps, this is very slow and not good for usual process information handler which works like 'ps' or 'top'. (ps or top is now enough slow..) This patch adds a counter of swapents to mm_counter and update is at each swap events. Information is exported via /proc//status file as [kamezawa@bluextal memory]$ cat /proc/self/status Name: cat State: R (running) Tgid: 2910 Pid: 2910 PPid: 2823 TracerPid: 0 Uid: 500 500 500 500 Gid: 500 500 500 500 FDSize: 256 Groups: 500 VmPeak: 82696 kB VmSize: 82696 kB VmLck: 0 kB VmHWM: 432 kB VmRSS: 432 kB VmData: 172 kB VmStk: 84 kB VmExe: 48 kB VmLib: 1568 kB VmPTE: 40 kB VmSwap: 0 kB <=============== this. 
[akpm@linux-foundation.org: coding-style fixes] Signed-off-by: KAMEZAWA Hiroyuki Reviewed-by: Minchan Kim Reviewed-by: Christoph Lameter Cc: Lee Schermerhorn Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 2 ++ fs/proc/task_mmu.c | 9 ++++++--- include/linux/mm_types.h | 1 + mm/memory.c | 16 ++++++++++++---- mm/rmap.c | 1 + mm/swapfile.c | 1 + 6 files changed, 23 insertions(+), 7 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index e418f3d8f42..b5c5fc657a8 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -164,6 +164,7 @@ read the file /proc/PID/status: VmExe: 68 kB VmLib: 1412 kB VmPTE: 20 kb + VmSwap: 0 kB Threads: 1 SigQ: 0/28578 SigPnd: 0000000000000000 @@ -219,6 +220,7 @@ Table 1-2: Contents of the statm files (as of 2.6.30-rc7) VmExe size of text segment VmLib size of shared library code VmPTE size of page table entries + VmSwap size of swap usage (the number of referred swapents) Threads number of threads SigQ number of signals queued/max. number for queue SigPnd bitmap of pending signals for the thread diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 37558127601..183f8ff5f40 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -16,7 +16,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) { - unsigned long data, text, lib; + unsigned long data, text, lib, swap; unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; /* @@ -36,6 +36,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) data = mm->total_vm - mm->shared_vm - mm->stack_vm; text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10; lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text; + swap = get_mm_counter(mm, MM_SWAPENTS); seq_printf(m, "VmPeak:\t%8lu kB\n" "VmSize:\t%8lu kB\n" @@ -46,7 +47,8 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) "VmStk:\t%8lu kB\n" "VmExe:\t%8lu kB\n" "VmLib:\t%8lu kB\n" - "VmPTE:\t%8lu kB\n", + "VmPTE:\t%8lu kB\n" + "VmSwap:\t%8lu kB\n", hiwater_vm << (PAGE_SHIFT-10), (total_vm - mm->reserved_vm) << (PAGE_SHIFT-10), mm->locked_vm << (PAGE_SHIFT-10), @@ -54,7 +56,8 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) total_rss << (PAGE_SHIFT-10), data << (PAGE_SHIFT-10), mm->stack_vm << (PAGE_SHIFT-10), text, lib, - (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10); + (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10, + swap << (PAGE_SHIFT-10)); } unsigned long task_vsize(struct mm_struct *mm) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 21861239ab0..19549d7275a 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -198,6 +198,7 @@ struct core_state { enum { MM_FILEPAGES, MM_ANONPAGES, + MM_SWAPENTS, NR_MM_COUNTERS }; diff --git a/mm/memory.c b/mm/memory.c index a4597614f18..77d9f840936 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -679,7 +679,9 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, &src_mm->mmlist); spin_unlock(&mmlist_lock); } - if (is_write_migration_entry(entry) && + if (likely(!non_swap_entry(entry))) + rss[MM_SWAPENTS]++; + else if (is_write_migration_entry(entry) && is_cow_mapping(vm_flags)) { /* * COW mappings require pages in both parent @@ -974,9 +976,14 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, if (pte_file(ptent)) { if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) print_bad_pte(vma, addr, ptent, NULL); - } else if - 
(unlikely(!free_swap_and_cache(pte_to_swp_entry(ptent)))) - print_bad_pte(vma, addr, ptent, NULL); + } else { + swp_entry_t entry = pte_to_swp_entry(ptent); + + if (!non_swap_entry(entry)) + rss[MM_SWAPENTS]--; + if (unlikely(!free_swap_and_cache(entry))) + print_bad_pte(vma, addr, ptent, NULL); + } pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); @@ -2692,6 +2699,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, */ inc_mm_counter_fast(mm, MM_ANONPAGES); + dec_mm_counter_fast(mm, MM_SWAPENTS); pte = mk_pte(page, vma->vm_page_prot); if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); diff --git a/mm/rmap.c b/mm/rmap.c index 73d0472884c..5cb47111f79 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -840,6 +840,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, spin_unlock(&mmlist_lock); } dec_mm_counter(mm, MM_ANONPAGES); + inc_mm_counter(mm, MM_SWAPENTS); } else if (PAGE_MIGRATION) { /* * Store the pfn of the page in a special migration diff --git a/mm/swapfile.c b/mm/swapfile.c index 893984946a2..187a21f8b7b 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -840,6 +840,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, goto out; } + dec_mm_counter(vma->vm_mm, MM_SWAPENTS); inc_mm_counter(vma->vm_mm, MM_ANONPAGES); get_page(page); set_pte_at(vma->vm_mm, addr, pte, -- cgit v1.2.3 From a1b57ac061b0e348c9a878f8fa3a93a67fe6af42 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 5 Mar 2010 13:42:15 -0800 Subject: mm: document /proc/pagetypeinfo Add documentation for /proc/pagetypeinfo. Signed-off-by: Mel Gorman Reviewed-by: Christoph Lameter Reviewed-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 45 +++++++++++++++++++++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index b5c5fc657a8..96a44dd95e0 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -438,6 +438,7 @@ Table 1-5: Kernel info in /proc modules List of loaded modules mounts Mounted filesystems net Networking info (see text) + pagetypeinfo Additional page allocator information (see text) (2.5) partitions Table of partitions known to the system pci Deprecated info of PCI bus (new way -> /proc/bus/pci/, decoupled by lspci (2.4) @@ -592,7 +593,7 @@ Node 0, zone DMA 0 4 5 4 4 3 ... Node 0, zone Normal 1 0 0 1 101 8 ... Node 0, zone HighMem 2 0 0 1 1 0 ... -Memory fragmentation is a problem under some workloads, and buddyinfo is a +External fragmentation is a problem under some workloads, and buddyinfo is a useful tool for helping diagnose these problems. Buddyinfo will give you a clue as to how big an area you can safely allocate, or why a previous allocation failed. @@ -602,6 +603,48 @@ available. In this case, there are 0 chunks of 2^0*PAGE_SIZE available in ZONE_DMA, 4 chunks of 2^1*PAGE_SIZE in ZONE_DMA, 101 chunks of 2^4*PAGE_SIZE available in ZONE_NORMAL, etc... +More information relevant to external fragmentation can be found in +pagetypeinfo. 
+ +> cat /proc/pagetypeinfo +Page block order: 9 +Pages per block: 512 + +Free pages count per migrate type at order 0 1 2 3 4 5 6 7 8 9 10 +Node 0, zone DMA, type Unmovable 0 0 0 1 1 1 1 1 1 1 0 +Node 0, zone DMA, type Reclaimable 0 0 0 0 0 0 0 0 0 0 0 +Node 0, zone DMA, type Movable 1 1 2 1 2 1 1 0 1 0 2 +Node 0, zone DMA, type Reserve 0 0 0 0 0 0 0 0 0 1 0 +Node 0, zone DMA, type Isolate 0 0 0 0 0 0 0 0 0 0 0 +Node 0, zone DMA32, type Unmovable 103 54 77 1 1 1 11 8 7 1 9 +Node 0, zone DMA32, type Reclaimable 0 0 2 1 0 0 0 0 1 0 0 +Node 0, zone DMA32, type Movable 169 152 113 91 77 54 39 13 6 1 452 +Node 0, zone DMA32, type Reserve 1 2 2 2 2 0 1 1 1 1 0 +Node 0, zone DMA32, type Isolate 0 0 0 0 0 0 0 0 0 0 0 + +Number of blocks type Unmovable Reclaimable Movable Reserve Isolate +Node 0, zone DMA 2 0 5 1 0 +Node 0, zone DMA32 41 6 967 2 0 + +Fragmentation avoidance in the kernel works by grouping pages of different +migrate types into the same contiguous regions of memory called page blocks. +A page block is typically the size of the default hugepage size e.g. 2MB on +X86-64. By keeping pages grouped based on their ability to move, the kernel +can reclaim pages within a page block to satisfy a high-order allocation. + +The pagetypinfo begins with information on the size of a page block. It +then gives the same type of information as buddyinfo except broken down +by migrate-type and finishes with details on how many page blocks of each +type exist. + +If min_free_kbytes has been tuned correctly (recommendations made by hugeadm +from libhugetlbfs http://sourceforge.net/projects/libhugetlbfs/), one can +make an estimate of the likely number of huge pages that can be allocated +at a given point in time. All the "Movable" blocks should be allocatable +unless memory has been mlock()'d. Some of the Reclaimable blocks should +also be allocatable although a lot of filesystem metadata may have to be +reclaimed to achieve this. + .............................................................................. meminfo: -- cgit v1.2.3 From 1e0051ae48a253685e4309256f9c1ec2bdb74b5d Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 10 Mar 2010 15:21:57 -0800 Subject: Documentation/fs/: split txt and source files Make dnotify_test.c source file and add it to Makefile so that bitrot can be prevented. 
Signed-off-by: Randy Dunlap Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/Makefile | 2 +- Documentation/filesystems/00-INDEX | 2 ++ Documentation/filesystems/Makefile | 8 +++++++ Documentation/filesystems/dnotify.txt | 39 ++++---------------------------- Documentation/filesystems/dnotify_test.c | 34 ++++++++++++++++++++++++++++ 5 files changed, 50 insertions(+), 35 deletions(-) create mode 100644 Documentation/filesystems/Makefile create mode 100644 Documentation/filesystems/dnotify_test.c (limited to 'Documentation/filesystems') diff --git a/Documentation/Makefile b/Documentation/Makefile index bde1ca1fbd2..6fc7ea1d1f9 100644 --- a/Documentation/Makefile +++ b/Documentation/Makefile @@ -1,3 +1,3 @@ obj-m := DocBook/ accounting/ auxdisplay/ connector/ \ - filesystems/configfs/ ia64/ laptops/ networking/ \ + filesystems/ filesystems/configfs/ ia64/ laptops/ networking/ \ pcmcia/ spi/ timers/ video4linux/ vm/ watchdog/src/ diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX index 5139b8c9d5a..3bae418c6ad 100644 --- a/Documentation/filesystems/00-INDEX +++ b/Documentation/filesystems/00-INDEX @@ -32,6 +32,8 @@ dlmfs.txt - info on the userspace interface to the OCFS2 DLM. dnotify.txt - info about directory notification in Linux. +dnotify_test.c + - example program for dnotify ecryptfs.txt - docs on eCryptfs: stacked cryptographic filesystem for Linux. exofs.txt diff --git a/Documentation/filesystems/Makefile b/Documentation/filesystems/Makefile new file mode 100644 index 00000000000..a5dd114da14 --- /dev/null +++ b/Documentation/filesystems/Makefile @@ -0,0 +1,8 @@ +# kbuild trick to avoid linker error. Can be omitted if a module is built. +obj- := dummy.o + +# List of programs to build +hostprogs-y := dnotify_test + +# Tell kbuild to always build the programs +always := $(hostprogs-y) diff --git a/Documentation/filesystems/dnotify.txt b/Documentation/filesystems/dnotify.txt index 9f5d338ddbb..6baf88f4685 100644 --- a/Documentation/filesystems/dnotify.txt +++ b/Documentation/filesystems/dnotify.txt @@ -62,38 +62,9 @@ disabled, fcntl(fd, F_NOTIFY, ...) will return -EINVAL. Example ------- +See Documentation/filesystems/dnotify_test.c for an example. - #define _GNU_SOURCE /* needed to get the defines */ - #include /* in glibc 2.2 this has the needed - values defined */ - #include - #include - #include - - static volatile int event_fd; - - static void handler(int sig, siginfo_t *si, void *data) - { - event_fd = si->si_fd; - } - - int main(void) - { - struct sigaction act; - int fd; - - act.sa_sigaction = handler; - sigemptyset(&act.sa_mask); - act.sa_flags = SA_SIGINFO; - sigaction(SIGRTMIN + 1, &act, NULL); - - fd = open(".", O_RDONLY); - fcntl(fd, F_SETSIG, SIGRTMIN + 1); - fcntl(fd, F_NOTIFY, DN_MODIFY|DN_CREATE|DN_MULTISHOT); - /* we will now be notified if any of the files - in "." is modified or new files are created */ - while (1) { - pause(); - printf("Got event on fd=%d\n", event_fd); - } - } +NOTE +---- +Beginning with Linux 2.6.13, dnotify has been replaced by inotify. +See Documentation/filesystems/inotify.txt for more information on it. 
diff --git a/Documentation/filesystems/dnotify_test.c b/Documentation/filesystems/dnotify_test.c new file mode 100644 index 00000000000..8b37b4a1e18 --- /dev/null +++ b/Documentation/filesystems/dnotify_test.c @@ -0,0 +1,34 @@ +#define _GNU_SOURCE /* needed to get the defines */ +#include /* in glibc 2.2 this has the needed + values defined */ +#include +#include +#include + +static volatile int event_fd; + +static void handler(int sig, siginfo_t *si, void *data) +{ + event_fd = si->si_fd; +} + +int main(void) +{ + struct sigaction act; + int fd; + + act.sa_sigaction = handler; + sigemptyset(&act.sa_mask); + act.sa_flags = SA_SIGINFO; + sigaction(SIGRTMIN + 1, &act, NULL); + + fd = open(".", O_RDONLY); + fcntl(fd, F_SETSIG, SIGRTMIN + 1); + fcntl(fd, F_NOTIFY, DN_MODIFY|DN_CREATE|DN_MULTISHOT); + /* we will now be notified if any of the files + in "." is modified or new files are created */ + while (1) { + pause(); + printf("Got event on fd=%d\n", event_fd); + } +} -- cgit v1.2.3 From 32e688b8c1afafa389223a4813b97e8c128a1636 Mon Sep 17 00:00:00 2001 From: Rob Landley Date: Mon, 15 Mar 2010 15:21:31 +0100 Subject: Documentation/filesystems/proc.txt typo fix. Typo fix for a filename in procfs documentation. Signed-off-by: Rob Landley Signed-off-by: Jiri Kosina --- Documentation/filesystems/proc.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index a4f30faa4f1..770700317c2 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -305,7 +305,7 @@ Table 1-4: Contents of the stat files (as of 2.6.30-rc7) cgtime guest time of the task children in jiffies .............................................................................. -The /proc/PID/map file containing the currently mapped memory regions and +The /proc/PID/maps file containing the currently mapped memory regions and their access permissions. The format is: -- cgit v1.2.3 From 23ab15ad7a9d042afa7303b735b6e24faa607241 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 22 Mar 2010 09:37:14 -0700 Subject: ceph: avoid loaded term 'OSD' in documention 'OSD' means different things to different people; avoid it here to avoid confusion. Signed-off-by: Sage Weil --- Documentation/filesystems/ceph.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/ceph.txt b/Documentation/filesystems/ceph.txt index 6e03917316b..523fdf0828d 100644 --- a/Documentation/filesystems/ceph.txt +++ b/Documentation/filesystems/ceph.txt @@ -115,7 +115,7 @@ Mount Options number of entries in that directory. nocrc - Disable CRC32C calculation for data writes. If set, the OSD + Disable CRC32C calculation for data writes. If set, the storage node must rely on TCP's error correction to detect data corruption in the data payload. -- cgit v1.2.3 From 92d6b71ab906be706f3679353b30a8d2c3831144 Mon Sep 17 00:00:00 2001 From: Dimitri Sivanich Date: Thu, 11 Mar 2010 14:08:56 -0800 Subject: genirq: Expose irq_desc->node in proc/irq Expose irq_desc->node as /proc/irq/*/node. This file provides device hardware locality information for apps desiring to include hardware locality in irq mapping decisions. 
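For an application that wants to factor this locality information into its IRQ affinity choices, reading the new file could look roughly like the sketch below. The /proc/irq/<N>/node path comes from the patch; everything else, including the helper name, is illustrative.

    /* Illustrative reader for /proc/irq/<N>/node. */
    #include <stdio.h>
    #include <stdlib.h>

    static int irq_node(unsigned int irq)
    {
            char path[64];
            int node = -1;
            FILE *f;

            snprintf(path, sizeof(path), "/proc/irq/%u/node", irq);
            f = fopen(path, "r");
            if (!f)
                    return -1;
            if (fscanf(f, "%d", &node) != 1)
                    node = -1;
            fclose(f);
            return node;
    }

    int main(int argc, char **argv)
    {
            unsigned int irq = (argc > 1) ? (unsigned int)strtoul(argv[1], NULL, 10) : 0;

            /* A placement tool could then pick CPUs on this node when
             * writing /proc/irq/<N>/smp_affinity. */
            printf("IRQ %u reports node %d\n", irq, irq_node(irq));
            return 0;
    }
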
Signed-off-by: Dimitri Sivanich Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- Documentation/filesystems/proc.txt | 4 ++++ kernel/irq/proc.c | 23 +++++++++++++++++++++++ 2 files changed, 27 insertions(+) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index a4f30faa4f1..6507d2ae523 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -566,6 +566,10 @@ The default_smp_affinity mask applies to all non-active IRQs, which are the IRQs which have not yet been allocated/activated, and hence which lack a /proc/irq/[0-9]* directory. +The node file on an SMP system shows the node to which the device using the IRQ +reports itself as being attached. This hardware locality information does not +include information about any possible driver locality preference. + prof_cpu_mask specifies which CPUs are to be profiled by the system wide profiler. Default value is ffffffff (all cpus). diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 6f50eccc79c..e346e08f5c3 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -146,6 +146,26 @@ static const struct file_operations default_affinity_proc_fops = { .release = single_release, .write = default_affinity_write, }; + +static int irq_node_proc_show(struct seq_file *m, void *v) +{ + struct irq_desc *desc = irq_to_desc((long) m->private); + + seq_printf(m, "%d\n", desc->node); + return 0; +} + +static int irq_node_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, irq_node_proc_show, PDE(inode)->data); +} + +static const struct file_operations irq_node_proc_fops = { + .open = irq_node_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; #endif static int irq_spurious_proc_show(struct seq_file *m, void *v) @@ -230,6 +250,9 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) /* create /proc/irq//smp_affinity */ proc_create_data("smp_affinity", 0600, desc->dir, &irq_affinity_proc_fops, (void *)(long)irq); + + proc_create_data("node", 0444, desc->dir, + &irq_node_proc_fops, (void *)(long)irq); #endif proc_create_data("spurious", 0444, desc->dir, -- cgit v1.2.3 From 5574169613b40b85d6f4c67208fa4846b897a0a1 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 23 Mar 2010 13:35:33 -0700 Subject: doc: add the documentation for mpol=local commit 3f226aa1c (mempolicy: support mpol=local tmpfs mount option) added new mpol=local mount option. but it didn't add a documentation. This patch does it. Signed-off-by: KOSAKI Motohiro Cc: Ravikiran Thirumalai Cc: Christoph Lameter Cc: Mel Gorman Acked-by: Lee Schermerhorn Cc: Hugh Dickins Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/tmpfs.txt | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/tmpfs.txt b/Documentation/filesystems/tmpfs.txt index 3015da0c6b2..fe09a2cb185 100644 --- a/Documentation/filesystems/tmpfs.txt +++ b/Documentation/filesystems/tmpfs.txt @@ -82,11 +82,13 @@ tmpfs has a mount option to set the NUMA memory allocation policy for all files in that instance (if CONFIG_NUMA is enabled) - which can be adjusted on the fly via 'mount -o remount ...' 
-mpol=default prefers to allocate memory from the local node +mpol=default use the process allocation policy + (see set_mempolicy(2)) mpol=prefer:Node prefers to allocate memory from the given Node mpol=bind:NodeList allocates memory only from nodes in NodeList mpol=interleave prefers to allocate from each node in turn mpol=interleave:NodeList allocates from each node of NodeList in turn +mpol=local prefers to allocate memory from the local node NodeList format is a comma-separated list of decimal numbers and ranges, a range being two hyphen-separated decimal numbers, the smallest and @@ -134,3 +136,5 @@ Author: Christoph Rohland , 1.12.01 Updated: Hugh Dickins, 4 June 2007 +Updated: + KOSAKI Motohiro, 16 Mar 2010 -- cgit v1.2.3 From 4cb947b59c5835783fb96aad2f7d92b1e4250aff Mon Sep 17 00:00:00 2001 From: Andrea Gelmini Date: Thu, 25 Mar 2010 11:04:49 +0100 Subject: GFS2: docs update Now http://sources.redhat.com/cluster/ is redirected to http://sources.redhat.com/cluster/wiki/ Also fixed tabs in the end. Signed-off-by: Andrea Gelmini Signed-off-by: Steven Whitehouse --- Documentation/filesystems/gfs2.txt | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/gfs2.txt b/Documentation/filesystems/gfs2.txt index 5e3ab8f3bef..0b59c020091 100644 --- a/Documentation/filesystems/gfs2.txt +++ b/Documentation/filesystems/gfs2.txt @@ -1,7 +1,7 @@ Global File System ------------------ -http://sources.redhat.com/cluster/ +http://sources.redhat.com/cluster/wiki/ GFS is a cluster file system. It allows a cluster of computers to simultaneously use a block device that is shared between them (with FC, @@ -36,11 +36,11 @@ GFS2 is not on-disk compatible with previous versions of GFS, but it is pretty close. The following man pages can be found at the URL above: - fsck.gfs2 to repair a filesystem - gfs2_grow to expand a filesystem online - gfs2_jadd to add journals to a filesystem online - gfs2_tool to manipulate, examine and tune a filesystem + fsck.gfs2 to repair a filesystem + gfs2_grow to expand a filesystem online + gfs2_jadd to add journals to a filesystem online + gfs2_tool to manipulate, examine and tune a filesystem gfs2_quota to examine and change quota values in a filesystem gfs2_convert to convert a gfs filesystem to gfs2 in-place mount.gfs2 to help mount(8) mount a filesystem - mkfs.gfs2 to make a filesystem + mkfs.gfs2 to make a filesystem -- cgit v1.2.3 From 8136b58dd0fce0b4cb649ac690e0493fb6fdacdb Mon Sep 17 00:00:00 2001 From: Cheng Renquan Date: Mon, 29 Mar 2010 19:05:57 +0800 Subject: ceph: some documentations fixes New documentation should have an entry in the 00-INDEX. Correct git urls. Signed-off-by: Cheng Renquan Signed-off-by: Sage Weil --- Documentation/filesystems/00-INDEX | 2 ++ Documentation/filesystems/ceph.txt | 9 +++++---- 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX index 3bae418c6ad..4303614b5ad 100644 --- a/Documentation/filesystems/00-INDEX +++ b/Documentation/filesystems/00-INDEX @@ -16,6 +16,8 @@ befs.txt - information about the BeOS filesystem for Linux. bfs.txt - info for the SCO UnixWare Boot Filesystem (BFS). +ceph.txt + - info for the Ceph Distributed File System cifs.txt - description of the CIFS filesystem. 
coda.txt diff --git a/Documentation/filesystems/ceph.txt b/Documentation/filesystems/ceph.txt index 523fdf0828d..0660c9f5dee 100644 --- a/Documentation/filesystems/ceph.txt +++ b/Documentation/filesystems/ceph.txt @@ -8,7 +8,7 @@ Basic features include: * POSIX semantics * Seamless scaling from 1 to many thousands of nodes - * High availability and reliability. No single points of failure. + * High availability and reliability. No single point of failure. * N-way replication of data across storage nodes * Fast recovery from node failures * Automatic rebalancing of data on node addition/removal @@ -94,7 +94,7 @@ Mount Options wsize=X Specify the maximum write size in bytes. By default there is no - maximu. Ceph will normally size writes based on the file stripe + maximum. Ceph will normally size writes based on the file stripe size. rsize=X @@ -133,7 +133,8 @@ For more information on Ceph, see the home page at http://ceph.newdream.net/ The Linux kernel client source tree is available at - git://ceph.newdream.net/linux-ceph-client.git + git://ceph.newdream.net/git/ceph-client.git + git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client.git and the source for the full system is at - git://ceph.newdream.net/ceph.git + git://ceph.newdream.net/git/ceph.git -- cgit v1.2.3 From 9208d24253e5e644f8cb1b87b69de44897668303 Mon Sep 17 00:00:00 2001 From: Sripathi Kodi Date: Thu, 18 Mar 2010 08:01:33 +0000 Subject: 9p: documentation update This patch adds documentation for new 9P options introduced in 2.6.34. Signed-off-by: Sripathi Kodi Signed-off-by: Eric Van Hensbergen --- Documentation/filesystems/9p.txt | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/9p.txt b/Documentation/filesystems/9p.txt index 57e0b80a527..c0236e753bc 100644 --- a/Documentation/filesystems/9p.txt +++ b/Documentation/filesystems/9p.txt @@ -37,6 +37,15 @@ For Plan 9 From User Space applications (http://swtch.com/plan9) mount -t 9p `namespace`/acme /mnt/9 -o trans=unix,uname=$USER +For server running on QEMU host with virtio transport: + + mount -t 9p -o trans=virtio /mnt/9 + +where mount_tag is the tag associated by the server to each of the exported +mount points. Each 9P export is seen by the client as a virtio device with an +associated "mount_tag" property. Available mount tags can be +seen by reading /sys/bus/virtio/drivers/9pnet_virtio/virtio/mount_tag files. + OPTIONS ======= @@ -47,7 +56,7 @@ OPTIONS fd - used passed file descriptors for connection (see rfdno and wfdno) virtio - connect to the next virtio channel available - (from lguest or KVM with trans_virtio module) + (from QEMU with trans_virtio module) rdma - connect to a specified RDMA channel uname=name user name to attempt mount as on the remote server. The @@ -85,7 +94,12 @@ OPTIONS port=n port to connect to on the remote server - noextend force legacy mode (no 9p2000.u semantics) + noextend force legacy mode (no 9p2000.u or 9p2000.L semantics) + + version=name Select 9P protocol version. Valid options are: + 9p2000 - Legacy mode (same as noextend) + 9p2000.u - Use 9P2000.u protocol + 9p2000.L - Use 9P2000.L protocol dfltuid attempt to mount as a particular uid -- cgit v1.2.3 From a33f32244d8550da8b4a26e277ce07d5c6d158b5 Mon Sep 17 00:00:00 2001 From: Francis Galiegue Date: Fri, 23 Apr 2010 00:08:02 +0200 Subject: Documentation/: it's -> its where appropriate Fix obvious cases of "it's" being used when "its" was meant. 
Signed-off-by: Francis Galiegue Acked-by: Randy Dunlap Signed-off-by: Jiri Kosina --- Documentation/ABI/testing/sysfs-devices-memory | 2 +- Documentation/DMA-API-HOWTO.txt | 2 +- Documentation/DocBook/libata.tmpl | 2 +- Documentation/PCI/pci-error-recovery.txt | 4 ++-- Documentation/Smack.txt | 2 +- Documentation/arm/SA1100/ADSBitsy | 2 +- Documentation/arm/Sharp-LH/ADC-LH7-Touchscreen | 2 +- Documentation/atomic_ops.txt | 2 +- Documentation/blackfin/bfin-gpio-notes.txt | 2 +- Documentation/cachetlb.txt | 6 +++--- Documentation/cgroups/memcg_test.txt | 2 +- Documentation/cgroups/memory.txt | 2 +- Documentation/connector/connector.txt | 2 +- Documentation/dvb/ci.txt | 2 +- Documentation/dvb/contributors.txt | 2 +- Documentation/filesystems/autofs4-mount-control.txt | 2 +- Documentation/filesystems/ceph.txt | 2 +- Documentation/filesystems/dlmfs.txt | 2 +- Documentation/filesystems/fiemap.txt | 12 ++++++------ Documentation/filesystems/fuse.txt | 4 ++-- Documentation/filesystems/hpfs.txt | 2 +- Documentation/filesystems/nfs/rpc-cache.txt | 2 +- Documentation/filesystems/proc.txt | 4 ++-- Documentation/filesystems/smbfs.txt | 2 +- Documentation/filesystems/vfs.txt | 2 +- Documentation/hwmon/lm85 | 2 +- Documentation/input/joystick.txt | 2 +- Documentation/intel_txt.txt | 2 +- Documentation/kbuild/kconfig-language.txt | 2 +- Documentation/kernel-docs.txt | 10 +++++----- Documentation/kprobes.txt | 2 +- Documentation/laptops/laptop-mode.txt | 2 +- Documentation/lguest/lguest.c | 2 +- Documentation/md.txt | 2 +- Documentation/netlabel/lsm_interface.txt | 2 +- Documentation/networking/ifenslave.c | 2 +- Documentation/networking/packet_mmap.txt | 4 ++-- Documentation/power/regulator/consumer.txt | 10 +++++----- Documentation/power/regulator/machine.txt | 2 +- Documentation/power/regulator/overview.txt | 6 +++--- Documentation/powerpc/booting-without-of.txt | 2 +- Documentation/powerpc/phyp-assisted-dump.txt | 2 +- Documentation/rt-mutex-design.txt | 2 +- Documentation/scsi/ChangeLog.lpfc | 4 ++-- Documentation/scsi/FlashPoint.txt | 2 +- Documentation/scsi/dtc3x80.txt | 2 +- Documentation/scsi/ncr53c8xx.txt | 2 +- Documentation/scsi/osst.txt | 2 +- Documentation/scsi/scsi_fc_transport.txt | 4 ++-- Documentation/scsi/sym53c8xx_2.txt | 2 +- Documentation/sound/alsa/soc/dapm.txt | 4 ++-- Documentation/sound/alsa/soc/machine.txt | 2 +- Documentation/sound/alsa/soc/overview.txt | 2 +- Documentation/usb/WUSB-Design-overview.txt | 2 +- Documentation/vm/numa_memory_policy.txt | 4 ++-- Documentation/w1/w1.generic | 2 +- 56 files changed, 81 insertions(+), 81 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/ABI/testing/sysfs-devices-memory b/Documentation/ABI/testing/sysfs-devices-memory index bf1627b02a0..aba7d989208 100644 --- a/Documentation/ABI/testing/sysfs-devices-memory +++ b/Documentation/ABI/testing/sysfs-devices-memory @@ -43,7 +43,7 @@ Date: September 2008 Contact: Badari Pulavarty Description: The file /sys/devices/system/memory/memoryX/state - is read-write. When read, it's contents show the + is read-write. When read, its contents show the online/offline state of the memory section. 
When written, root can toggle the the online/offline state of a removable memory section (see removable file description above) diff --git a/Documentation/DMA-API-HOWTO.txt b/Documentation/DMA-API-HOWTO.txt index 52618ab069a..2e435adfbd6 100644 --- a/Documentation/DMA-API-HOWTO.txt +++ b/Documentation/DMA-API-HOWTO.txt @@ -742,7 +742,7 @@ failure can be determined by: Closing -This document, and the API itself, would not be in it's current +This document, and the API itself, would not be in its current form without the feedback and suggestions from numerous individuals. We would like to specifically mention, in no particular order, the following people: diff --git a/Documentation/DocBook/libata.tmpl b/Documentation/DocBook/libata.tmpl index ba997577150..261b57bc6f0 100644 --- a/Documentation/DocBook/libata.tmpl +++ b/Documentation/DocBook/libata.tmpl @@ -490,7 +490,7 @@ void (*host_stop) (struct ata_host_set *host_set); allocates space for a legacy IDE PRD table and returns. - ->port_stop() is called after ->host_stop(). It's sole function + ->port_stop() is called after ->host_stop(). Its sole function is to release DMA/memory resources, now that they are no longer actively being used. Many drivers also free driver-private data from port at this time. diff --git a/Documentation/PCI/pci-error-recovery.txt b/Documentation/PCI/pci-error-recovery.txt index e83f2ea7641..898ded24510 100644 --- a/Documentation/PCI/pci-error-recovery.txt +++ b/Documentation/PCI/pci-error-recovery.txt @@ -216,7 +216,7 @@ The driver should return one of the following result codes: - PCI_ERS_RESULT_NEED_RESET Driver returns this if it thinks the device is not - recoverable in it's current state and it needs a slot + recoverable in its current state and it needs a slot reset to proceed. - PCI_ERS_RESULT_DISCONNECT @@ -241,7 +241,7 @@ in working condition. The driver is not supposed to restart normal driver I/O operations at this point. It should limit itself to "probing" the device to -check it's recoverability status. If all is right, then the platform +check its recoverability status. If all is right, then the platform will call resume() once all drivers have ack'd link_reset(). Result codes: diff --git a/Documentation/Smack.txt b/Documentation/Smack.txt index 34614b4c708..e9dab41c0fe 100644 --- a/Documentation/Smack.txt +++ b/Documentation/Smack.txt @@ -73,7 +73,7 @@ NOTE: Smack labels are limited to 23 characters. The attr command If you don't do anything special all users will get the floor ("_") label when they log in. If you do want to log in via the hacked ssh at other labels use the attr command to set the smack value on the -home directory and it's contents. +home directory and its contents. You can add access rules in /etc/smack/accesses. They take the form: diff --git a/Documentation/arm/SA1100/ADSBitsy b/Documentation/arm/SA1100/ADSBitsy index 7197a9e958e..f9f62e8c071 100644 --- a/Documentation/arm/SA1100/ADSBitsy +++ b/Documentation/arm/SA1100/ADSBitsy @@ -32,7 +32,7 @@ Notes: - The flash on board is divided into 3 partitions. You should be careful to use flash on board. - It's partition is different from GraphicsClient Plus and GraphicsMaster + Its partition is different from GraphicsClient Plus and GraphicsMaster - 16bpp mode requires a different cable than what ships with the board. Contact ADS or look through the manual to wire your own. 
Currently, diff --git a/Documentation/arm/Sharp-LH/ADC-LH7-Touchscreen b/Documentation/arm/Sharp-LH/ADC-LH7-Touchscreen index 1e6a23fdf2f..dc460f05564 100644 --- a/Documentation/arm/Sharp-LH/ADC-LH7-Touchscreen +++ b/Documentation/arm/Sharp-LH/ADC-LH7-Touchscreen @@ -7,7 +7,7 @@ The driver only implements a four-wire touch panel protocol. The touchscreen driver is maintenance free except for the pen-down or touch threshold. Some resistive displays and board combinations may -require tuning of this threshold. The driver exposes some of it's +require tuning of this threshold. The driver exposes some of its internal state in the sys filesystem. If the kernel is configured with it, CONFIG_SYSFS, and sysfs is mounted at /sys, there will be a directory diff --git a/Documentation/atomic_ops.txt b/Documentation/atomic_ops.txt index 396bec3b74e..ac4d4718712 100644 --- a/Documentation/atomic_ops.txt +++ b/Documentation/atomic_ops.txt @@ -320,7 +320,7 @@ counter decrement would not become globally visible until the obj->active update does. As a historical note, 32-bit Sparc used to only allow usage of -24-bits of it's atomic_t type. This was because it used 8 bits +24-bits of its atomic_t type. This was because it used 8 bits as a spinlock for SMP safety. Sparc32 lacked a "compare and swap" type instruction. However, 32-bit Sparc has since been moved over to a "hash table of spinlocks" scheme, that allows the full 32-bit diff --git a/Documentation/blackfin/bfin-gpio-notes.txt b/Documentation/blackfin/bfin-gpio-notes.txt index 9898c7ded7d..f731c1e5647 100644 --- a/Documentation/blackfin/bfin-gpio-notes.txt +++ b/Documentation/blackfin/bfin-gpio-notes.txt @@ -43,7 +43,7 @@ void bfin_gpio_irq_free(unsigned gpio); The request functions will record the function state for a certain pin, - the free functions will clear it's function state. + the free functions will clear its function state. Once a pin is requested, it can't be requested again before it is freed by previous caller, otherwise kernel will dump stacks, and the request function fail. diff --git a/Documentation/cachetlb.txt b/Documentation/cachetlb.txt index 2b5f823abd0..9164ae3b83b 100644 --- a/Documentation/cachetlb.txt +++ b/Documentation/cachetlb.txt @@ -5,7 +5,7 @@ This document describes the cache/tlb flushing interfaces called by the Linux VM subsystem. It enumerates over each interface, -describes it's intended purpose, and what side effect is expected +describes its intended purpose, and what side effect is expected after the interface is invoked. The side effects described below are stated for a uniprocessor @@ -231,7 +231,7 @@ require a whole different set of interfaces to handle properly. The biggest problem is that of virtual aliasing in the data cache of a processor. -Is your port susceptible to virtual aliasing in it's D-cache? +Is your port susceptible to virtual aliasing in its D-cache? Well, if your D-cache is virtually indexed, is larger in size than PAGE_SIZE, and does not prevent multiple cache lines for the same physical address from existing at once, you have this problem. @@ -249,7 +249,7 @@ one way to solve this (in particular SPARC_FLAG_MMAPSHARED). Next, you have to solve the D-cache aliasing issue for all other cases. Please keep in mind that fact that, for a given page mapped into some user address space, there is always at least one more -mapping, that of the kernel in it's linear mapping starting at +mapping, that of the kernel in its linear mapping starting at PAGE_OFFSET. 
So immediately, once the first user maps a given physical page into its address space, by implication the D-cache aliasing problem has the potential to exist since the kernel already diff --git a/Documentation/cgroups/memcg_test.txt b/Documentation/cgroups/memcg_test.txt index f7f68b2ac19..b7eececfb19 100644 --- a/Documentation/cgroups/memcg_test.txt +++ b/Documentation/cgroups/memcg_test.txt @@ -244,7 +244,7 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y. we have to check if OLDPAGE/NEWPAGE is a valid page after commit(). 8. LRU - Each memcg has its own private LRU. Now, it's handling is under global + Each memcg has its own private LRU. Now, its handling is under global VM's control (means that it's handled under global zone->lru_lock). Almost all routines around memcg's LRU is called by global LRU's list management functions under zone->lru_lock(). diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt index 3a6aecd078b..6cab1f29da4 100644 --- a/Documentation/cgroups/memory.txt +++ b/Documentation/cgroups/memory.txt @@ -263,7 +263,7 @@ some of the pages cached in the cgroup (page cache pages). 4.2 Task migration -When a task migrates from one cgroup to another, it's charge is not +When a task migrates from one cgroup to another, its charge is not carried forward by default. The pages allocated from the original cgroup still remain charged to it, the charge is dropped when the page is freed or reclaimed. diff --git a/Documentation/connector/connector.txt b/Documentation/connector/connector.txt index 78c9466a9aa..e5c5f5e6ab7 100644 --- a/Documentation/connector/connector.txt +++ b/Documentation/connector/connector.txt @@ -88,7 +88,7 @@ int cn_netlink_send(struct cn_msg *msg, u32 __groups, int gfp_mask); int gfp_mask - GFP mask. Note: When registering new callback user, connector core assigns - netlink group to the user which is equal to it's id.idx. + netlink group to the user which is equal to its id.idx. /*****************************************/ Protocol description. diff --git a/Documentation/dvb/ci.txt b/Documentation/dvb/ci.txt index 2ecd834585e..4a0c2b56e69 100644 --- a/Documentation/dvb/ci.txt +++ b/Documentation/dvb/ci.txt @@ -41,7 +41,7 @@ This application requires the following to function properly as of now. * Cards that fall in this category ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -At present the cards that fall in this category are the Twinhan and it's +At present the cards that fall in this category are the Twinhan and its clones, these cards are available as VVMER, Tomato, Hercules, Orange and so on. diff --git a/Documentation/dvb/contributors.txt b/Documentation/dvb/contributors.txt index 4865addebe1..47c30098dab 100644 --- a/Documentation/dvb/contributors.txt +++ b/Documentation/dvb/contributors.txt @@ -1,7 +1,7 @@ Thanks go to the following people for patches and contributions: Michael Hunold - for the initial saa7146 driver and it's recent overhaul + for the initial saa7146 driver and its recent overhaul Christian Theiss for his work on the initial Linux DVB driver diff --git a/Documentation/filesystems/autofs4-mount-control.txt b/Documentation/filesystems/autofs4-mount-control.txt index 8f78ded4b64..51986bf08a4 100644 --- a/Documentation/filesystems/autofs4-mount-control.txt +++ b/Documentation/filesystems/autofs4-mount-control.txt @@ -146,7 +146,7 @@ found to be inadequate, in this case. The Generic Netlink system was used for this as raw Netlink would lead to a significant increase in complexity. 
There's no question that the Generic Netlink system is an elegant solution for common case ioctl functions but it's not a complete -replacement probably because it's primary purpose in life is to be a +replacement probably because its primary purpose in life is to be a message bus implementation rather than specifically an ioctl replacement. While it would be possible to work around this there is one concern that lead to the decision to not use it. This is that the autofs diff --git a/Documentation/filesystems/ceph.txt b/Documentation/filesystems/ceph.txt index 0660c9f5dee..763d8ebbbeb 100644 --- a/Documentation/filesystems/ceph.txt +++ b/Documentation/filesystems/ceph.txt @@ -90,7 +90,7 @@ Mount Options Specify the IP and/or port the client should bind to locally. There is normally not much reason to do this. If the IP is not specified, the client's IP address is determined by looking at the - address it's connection to the monitor originates from. + address its connection to the monitor originates from. wsize=X Specify the maximum write size in bytes. By default there is no diff --git a/Documentation/filesystems/dlmfs.txt b/Documentation/filesystems/dlmfs.txt index c50bbb2d52b..1b528b2ad80 100644 --- a/Documentation/filesystems/dlmfs.txt +++ b/Documentation/filesystems/dlmfs.txt @@ -47,7 +47,7 @@ You'll want to start heartbeating on a volume which all the nodes in your lockspace can access. The easiest way to do this is via ocfs2_hb_ctl (distributed with ocfs2-tools). Right now it requires that an OCFS2 file system be in place so that it can automatically -find it's heartbeat area, though it will eventually support heartbeat +find its heartbeat area, though it will eventually support heartbeat against raw disks. Please see the ocfs2_hb_ctl and mkfs.ocfs2 manual pages distributed diff --git a/Documentation/filesystems/fiemap.txt b/Documentation/filesystems/fiemap.txt index 606233cd461..1b805a0efbb 100644 --- a/Documentation/filesystems/fiemap.txt +++ b/Documentation/filesystems/fiemap.txt @@ -38,7 +38,7 @@ flags, it will return EBADR and the contents of fm_flags will contain the set of flags which caused the error. If the kernel is compatible with all flags passed, the contents of fm_flags will be unmodified. It is up to userspace to determine whether rejection of a particular -flag is fatal to it's operation. This scheme is intended to allow the +flag is fatal to its operation. This scheme is intended to allow the fiemap interface to grow in the future but without losing compatibility with old software. @@ -56,7 +56,7 @@ If this flag is set, the kernel will sync the file before mapping extents. * FIEMAP_FLAG_XATTR If this flag is set, the extents returned will describe the inodes -extended attribute lookup tree, instead of it's data tree. +extended attribute lookup tree, instead of its data tree. Extent Mapping @@ -89,7 +89,7 @@ struct fiemap_extent { }; All offsets and lengths are in bytes and mirror those on disk. It is valid -for an extents logical offset to start before the request or it's logical +for an extents logical offset to start before the request or its logical length to extend past the request. Unless FIEMAP_EXTENT_NOT_ALIGNED is returned, fe_logical, fe_physical, and fe_length will be aligned to the block size of the file system. With the exception of extents flagged as @@ -125,7 +125,7 @@ been allocated for the file yet. * FIEMAP_EXTENT_DELALLOC - This will also set FIEMAP_EXTENT_UNKNOWN. 
-Delayed allocation - while there is data for this extent, it's +Delayed allocation - while there is data for this extent, its physical location has not been allocated yet. * FIEMAP_EXTENT_ENCODED @@ -159,7 +159,7 @@ Data is located within a meta data block. Data is packed into a block with data from other files. * FIEMAP_EXTENT_UNWRITTEN -Unwritten extent - the extent is allocated but it's data has not been +Unwritten extent - the extent is allocated but its data has not been initialized. This indicates the extent's data will be all zero if read through the filesystem but the contents are undefined if read directly from the device. @@ -176,7 +176,7 @@ VFS -> File System Implementation File systems wishing to support fiemap must implement a ->fiemap callback on their inode_operations structure. The fs ->fiemap call is responsible for -defining it's set of supported fiemap flags, and calling a helper function on +defining its set of supported fiemap flags, and calling a helper function on each discovered extent: struct inode_operations { diff --git a/Documentation/filesystems/fuse.txt b/Documentation/filesystems/fuse.txt index 397a41adb4c..13af4a49e7d 100644 --- a/Documentation/filesystems/fuse.txt +++ b/Documentation/filesystems/fuse.txt @@ -91,7 +91,7 @@ Mount options 'default_permissions' By default FUSE doesn't check file access permissions, the - filesystem is free to implement it's access policy or leave it to + filesystem is free to implement its access policy or leave it to the underlying file access mechanism (e.g. in case of network filesystems). This option enables permission checking, restricting access based on file mode. It is usually useful together with the @@ -171,7 +171,7 @@ or may honor them by sending a reply to the _original_ request, with the error set to EINTR. It is also possible that there's a race between processing the -original request and it's INTERRUPT request. There are two possibilities: +original request and its INTERRUPT request. There are two possibilities: 1) The INTERRUPT request is processed before the original request is processed diff --git a/Documentation/filesystems/hpfs.txt b/Documentation/filesystems/hpfs.txt index fa45c3baed9..74630bd504f 100644 --- a/Documentation/filesystems/hpfs.txt +++ b/Documentation/filesystems/hpfs.txt @@ -103,7 +103,7 @@ to analyze or change OS2SYS.INI. Codepages HPFS can contain several uppercasing tables for several codepages and each -file has a pointer to codepage it's name is in. However OS/2 was created in +file has a pointer to codepage its name is in. However OS/2 was created in America where people don't care much about codepages and so multiple codepages support is quite buggy. I have Czech OS/2 working in codepage 852 on my disk. Once I booted English OS/2 working in cp 850 and I created a file on my 852 diff --git a/Documentation/filesystems/nfs/rpc-cache.txt b/Documentation/filesystems/nfs/rpc-cache.txt index 8a382bea680..ebcaaee2161 100644 --- a/Documentation/filesystems/nfs/rpc-cache.txt +++ b/Documentation/filesystems/nfs/rpc-cache.txt @@ -185,7 +185,7 @@ failed lookup meant a definite 'no'. 
request/response format ----------------------- -While each cache is free to use it's own format for requests +While each cache is free to use its own format for requests and responses over channel, the following is recommended as appropriate and support routines are available to help: Each request or response record should be printable ASCII diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 770700317c2..f6b1b5fca1d 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -965,7 +965,7 @@ your system and how much traffic was routed over those devices: ...] 1375103 17405 0 0 0 0 0 0 ...] 1703981 5535 0 0 0 3 0 0 -In addition, each Channel Bond interface has it's own directory. For +In addition, each Channel Bond interface has its own directory. For example, the bond0 device will have a directory called /proc/net/bond0/. It will contain information that is specific to that bond, such as the current slaves of the bond, the link status of the slaves, and how @@ -1362,7 +1362,7 @@ been accounted as having caused 1MB of write. In other words: The number of bytes which this process caused to not happen, by truncating pagecache. A task can cause "negative" IO too. If this task truncates some dirty pagecache, some IO which another task has been accounted -for (in it's write_bytes) will not be happening. We _could_ just subtract that +for (in its write_bytes) will not be happening. We _could_ just subtract that from the truncating task's write_bytes, but there is information loss in doing that. diff --git a/Documentation/filesystems/smbfs.txt b/Documentation/filesystems/smbfs.txt index f673ef0de0f..194fb0decd2 100644 --- a/Documentation/filesystems/smbfs.txt +++ b/Documentation/filesystems/smbfs.txt @@ -3,6 +3,6 @@ protocol used by Windows for Workgroups, Windows 95 and Windows NT. Smbfs was inspired by Samba, the program written by Andrew Tridgell that turns any Unix host into a file server for DOS or Windows clients. -Smbfs is a SMB client, but uses parts of samba for it's operation. For +Smbfs is a SMB client, but uses parts of samba for its operation. For more info on samba, including documentation, please go to http://www.samba.org/ and then on to your nearest mirror. diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 3de2f32edd9..b66858538df 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -72,7 +72,7 @@ structure (this is the kernel-side implementation of file descriptors). The freshly allocated file structure is initialized with a pointer to the dentry and a set of file operation member functions. These are taken from the inode data. The open() file method is then -called so the specific filesystem implementation can do it's work. You +called so the specific filesystem implementation can do its work. You can see that this is another switch performed by the VFS. The file structure is placed into the file descriptor table for the process. diff --git a/Documentation/hwmon/lm85 b/Documentation/hwmon/lm85 index a13680871bc..a76aefeeb68 100644 --- a/Documentation/hwmon/lm85 +++ b/Documentation/hwmon/lm85 @@ -157,7 +157,7 @@ temperature configuration points: There are three PWM outputs. The LM85 datasheet suggests that the pwm3 output control both fan3 and fan4. Each PWM can be individually -configured and assigned to a zone for it's control value. Each PWM can be +configured and assigned to a zone for its control value. 
Each PWM can be configured individually according to the following options. * pwm#_auto_pwm_min - this specifies the PWM value for temp#_auto_temp_off diff --git a/Documentation/input/joystick.txt b/Documentation/input/joystick.txt index 154d767b2ac..8007b7ca87b 100644 --- a/Documentation/input/joystick.txt +++ b/Documentation/input/joystick.txt @@ -402,7 +402,7 @@ for the port of the SoundFusion is supported by the cs461x.c module. ~~~~~~~~~~~~~~~~~~~~~~~~ The Live! has a special PCI gameport, which, although it doesn't provide any "Enhanced" stuff like 4DWave and friends, is quite a bit faster than -it's ISA counterparts. It also requires special support, hence the +its ISA counterparts. It also requires special support, hence the emu10k1-gp.c module for it instead of the normal ns558.c one. 3.15 SoundBlaster 64 and 128 - ES1370 and ES1371, ESS Solo1 and S3 SonicVibes diff --git a/Documentation/intel_txt.txt b/Documentation/intel_txt.txt index f40a1f03001..1423bcc7c50 100644 --- a/Documentation/intel_txt.txt +++ b/Documentation/intel_txt.txt @@ -126,7 +126,7 @@ o Tboot then applies an (optional) user-defined launch policy to o Tboot adjusts the e820 table provided by the bootloader to reserve its own location in memory as well as to reserve certain other TXT-related regions. -o As part of it's launch, tboot DMA protects all of RAM (using the +o As part of its launch, tboot DMA protects all of RAM (using the VT-d PMRs). Thus, the kernel must be booted with 'intel_iommu=on' in order to remove this blanket protection and use VT-d's page-level protection. diff --git a/Documentation/kbuild/kconfig-language.txt b/Documentation/kbuild/kconfig-language.txt index c412c245848..b472e4e0ba6 100644 --- a/Documentation/kbuild/kconfig-language.txt +++ b/Documentation/kbuild/kconfig-language.txt @@ -181,7 +181,7 @@ Expressions are listed in decreasing order of precedence. (7) Returns the result of max(/expr/, /expr/). An expression can have a value of 'n', 'm' or 'y' (or 0, 1, 2 -respectively for calculations). A menu entry becomes visible when it's +respectively for calculations). A menu entry becomes visible when its expression evaluates to 'm' or 'y'. There are two types of symbols: constant and non-constant symbols. diff --git a/Documentation/kernel-docs.txt b/Documentation/kernel-docs.txt index 28cdc2af213..ec8d31ee12e 100644 --- a/Documentation/kernel-docs.txt +++ b/Documentation/kernel-docs.txt @@ -116,7 +116,7 @@ Author: Ingo Molnar, Gadi Oxman and Miguel de Icaza. URL: http://www.linuxjournal.com/article.php?sid=2391 Keywords: RAID, MD driver. - Description: Linux Journal Kernel Korner article. Here is it's + Description: Linux Journal Kernel Korner article. Here is its abstract: "A description of the implementation of the RAID-1, RAID-4 and RAID-5 personalities of the MD device driver in the Linux kernel, providing users with high performance and reliable, @@ -127,7 +127,7 @@ URL: http://www.linuxjournal.com/article.php?sid=1219 Keywords: device driver, module, loading/unloading modules, allocating resources. - Description: Linux Journal Kernel Korner article. Here is it's + Description: Linux Journal Kernel Korner article. Here is its abstract: "This is the first of a series of four articles co-authored by Alessandro Rubini and Georg Zezchwitz which present a practical approach to writing Linux device drivers as kernel @@ -141,7 +141,7 @@ Keywords: character driver, init_module, clean_up module, autodetection, mayor number, minor number, file operations, open(), close(). 
- Description: Linux Journal Kernel Korner article. Here is it's + Description: Linux Journal Kernel Korner article. Here is its abstract: "This article, the second of four, introduces part of the actual code to create custom module implementing a character device driver. It describes the code for module initialization and @@ -152,7 +152,7 @@ URL: http://www.linuxjournal.com/article.php?sid=1221 Keywords: read(), write(), select(), ioctl(), blocking/non blocking mode, interrupt handler. - Description: Linux Journal Kernel Korner article. Here is it's + Description: Linux Journal Kernel Korner article. Here is its abstract: "This article, the third of four on writing character device drivers, introduces concepts of reading, writing, and using ioctl-calls". @@ -161,7 +161,7 @@ Author: Alessandro Rubini and Georg v. Zezschwitz. URL: http://www.linuxjournal.com/article.php?sid=1222 Keywords: interrupts, irqs, DMA, bottom halves, task queues. - Description: Linux Journal Kernel Korner article. Here is it's + Description: Linux Journal Kernel Korner article. Here is its abstract: "This is the fourth in a series of articles about writing character device drivers as loadable kernel modules. This month, we further investigate the field of interrupt handling. diff --git a/Documentation/kprobes.txt b/Documentation/kprobes.txt index 2f9115c0ae6..51ec634ac04 100644 --- a/Documentation/kprobes.txt +++ b/Documentation/kprobes.txt @@ -332,7 +332,7 @@ occurs during execution of kp->pre_handler or kp->post_handler, or during single-stepping of the probed instruction, Kprobes calls kp->fault_handler. Any or all handlers can be NULL. If kp->flags is set KPROBE_FLAG_DISABLED, that kp will be registered but disabled, -so, it's handlers aren't hit until calling enable_kprobe(kp). +so, its handlers aren't hit until calling enable_kprobe(kp). NOTE: 1. With the introduction of the "symbol_name" field to struct kprobe, diff --git a/Documentation/laptops/laptop-mode.txt b/Documentation/laptops/laptop-mode.txt index 2c3c3509302..0bf25eebce9 100644 --- a/Documentation/laptops/laptop-mode.txt +++ b/Documentation/laptops/laptop-mode.txt @@ -207,7 +207,7 @@ Tips & Tricks * Drew Scott Daniels observed: "I don't know why, but when I decrease the number of colours that my display uses it consumes less battery power. I've seen this on powerbooks too. I hope that this is a piece of information that - might be useful to the Laptop Mode patch or it's users." + might be useful to the Laptop Mode patch or its users." * In syslog.conf, you can prefix entries with a dash ``-'' to omit syncing the file after every logging. When you're using laptop-mode and your disk doesn't diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c index 3119f5db75b..e9ce3c55451 100644 --- a/Documentation/lguest/lguest.c +++ b/Documentation/lguest/lguest.c @@ -263,7 +263,7 @@ static u8 *get_feature_bits(struct device *dev) * Launcher virtual with an offset. * * This can be tough to get your head around, but usually it just means that we - * use these trivial conversion functions when the Guest gives us it's + * use these trivial conversion functions when the Guest gives us its * "physical" addresses: */ static void *from_guest_phys(unsigned long addr) diff --git a/Documentation/md.txt b/Documentation/md.txt index 188f4768f1d..e4e893ef3e0 100644 --- a/Documentation/md.txt +++ b/Documentation/md.txt @@ -136,7 +136,7 @@ raid_disks != 0. Then uninitialized devices can be added with ADD_NEW_DISK. 
The structure passed to ADD_NEW_DISK must specify the state of the device -and it's role in the array. +and its role in the array. Once started with RUN_ARRAY, uninitialized spares can be added with HOT_ADD_DISK. diff --git a/Documentation/netlabel/lsm_interface.txt b/Documentation/netlabel/lsm_interface.txt index 98dd9f7430f..638c74f7de7 100644 --- a/Documentation/netlabel/lsm_interface.txt +++ b/Documentation/netlabel/lsm_interface.txt @@ -38,7 +38,7 @@ Depending on the exact configuration, translation between the network packet label and the internal LSM security identifier can be time consuming. The NetLabel label mapping cache is a caching mechanism which can be used to sidestep much of this overhead once a mapping has been established. Once the -LSM has received a packet, used NetLabel to decode it's security attributes, +LSM has received a packet, used NetLabel to decode its security attributes, and translated the security attributes into a LSM internal identifier the LSM can use the NetLabel caching functions to associate the LSM internal identifier with the network packet's label. This means that in the future diff --git a/Documentation/networking/ifenslave.c b/Documentation/networking/ifenslave.c index 1b96ccda383..2bac9618c34 100644 --- a/Documentation/networking/ifenslave.c +++ b/Documentation/networking/ifenslave.c @@ -756,7 +756,7 @@ static int enslave(char *master_ifname, char *slave_ifname) */ if (abi_ver < 1) { /* For old ABI, the master needs to be - * down before setting it's hwaddr + * down before setting its hwaddr */ res = set_if_down(master_ifname, master_flags.ifr_flags); if (res) { diff --git a/Documentation/networking/packet_mmap.txt b/Documentation/networking/packet_mmap.txt index 09ab0d29032..98f71a5cef0 100644 --- a/Documentation/networking/packet_mmap.txt +++ b/Documentation/networking/packet_mmap.txt @@ -100,7 +100,7 @@ by the kernel. The destruction of the socket and all associated resources is done by a simple call to close(fd). -Next I will describe PACKET_MMAP settings and it's constraints, +Next I will describe PACKET_MMAP settings and its constraints, also the mapping of the circular buffer in the user process and the use of this buffer. @@ -432,7 +432,7 @@ TP_STATUS_LOSING : indicates there were packet drops from last time the PACKET_STATISTICS option. TP_STATUS_CSUMNOTREADY: currently it's used for outgoing IP packets which - it's checksum will be done in hardware. So while + its checksum will be done in hardware. So while reading the packet we should not try to check the checksum. diff --git a/Documentation/power/regulator/consumer.txt b/Documentation/power/regulator/consumer.txt index cdebb5145c2..55c4175d809 100644 --- a/Documentation/power/regulator/consumer.txt +++ b/Documentation/power/regulator/consumer.txt @@ -8,11 +8,11 @@ Please see overview.txt for a description of the terms used in this text. 1. Consumer Regulator Access (static & dynamic drivers) ======================================================= -A consumer driver can get access to it's supply regulator by calling :- +A consumer driver can get access to its supply regulator by calling :- regulator = regulator_get(dev, "Vcc"); -The consumer passes in it's struct device pointer and power supply ID. The core +The consumer passes in its struct device pointer and power supply ID. The core then finds the correct regulator by consulting a machine specific lookup table. If the lookup is successful then this call will return a pointer to the struct regulator that supplies this consumer. 
@@ -34,7 +34,7 @@ usually be called in your device drivers probe() and remove() respectively. 2. Regulator Output Enable & Disable (static & dynamic drivers) ==================================================================== -A consumer can enable it's power supply by calling:- +A consumer can enable its power supply by calling:- int regulator_enable(regulator); @@ -49,7 +49,7 @@ int regulator_is_enabled(regulator); This will return > zero when the regulator is enabled. -A consumer can disable it's supply when no longer needed by calling :- +A consumer can disable its supply when no longer needed by calling :- int regulator_disable(regulator); @@ -140,7 +140,7 @@ by calling :- int regulator_set_optimum_mode(struct regulator *regulator, int load_uA); This will cause the core to recalculate the total load on the regulator (based -on all it's consumers) and change operating mode (if necessary and permitted) +on all its consumers) and change operating mode (if necessary and permitted) to best match the current operating load. The load_uA value can be determined from the consumers datasheet. e.g.most diff --git a/Documentation/power/regulator/machine.txt b/Documentation/power/regulator/machine.txt index 63728fed620..bdec39b9bd7 100644 --- a/Documentation/power/regulator/machine.txt +++ b/Documentation/power/regulator/machine.txt @@ -52,7 +52,7 @@ static struct regulator_init_data regulator1_data = { }; Regulator-1 supplies power to Regulator-2. This relationship must be registered -with the core so that Regulator-1 is also enabled when Consumer A enables it's +with the core so that Regulator-1 is also enabled when Consumer A enables its supply (Regulator-2). The supply regulator is set by the supply_regulator_dev field below:- diff --git a/Documentation/power/regulator/overview.txt b/Documentation/power/regulator/overview.txt index ffd185bb605..9363e056188 100644 --- a/Documentation/power/regulator/overview.txt +++ b/Documentation/power/regulator/overview.txt @@ -35,16 +35,16 @@ Some terms used in this document:- o Consumer - Electronic device that is supplied power by a regulator. Consumers can be classified into two types:- - Static: consumer does not change it's supply voltage or + Static: consumer does not change its supply voltage or current limit. It only needs to enable or disable it's - power supply. It's supply voltage is set by the hardware, + power supply. Its supply voltage is set by the hardware, bootloader, firmware or kernel board initialisation code. Dynamic: consumer needs to change it's supply voltage or current limit to meet operation demands. - o Power Domain - Electronic circuit that is supplied it's input power by the + o Power Domain - Electronic circuit that is supplied its input power by the output power of a regulator, switch or by another power domain. diff --git a/Documentation/powerpc/booting-without-of.txt b/Documentation/powerpc/booting-without-of.txt index 79f533f38c6..46d22105aa0 100644 --- a/Documentation/powerpc/booting-without-of.txt +++ b/Documentation/powerpc/booting-without-of.txt @@ -1289,7 +1289,7 @@ link between a device node and its interrupt parent in the interrupt tree. The value of interrupt-parent is the phandle of the parent node. -If the interrupt-parent property is not defined for a node, it's +If the interrupt-parent property is not defined for a node, its interrupt parent is assumed to be an ancestor in the node's _device tree_ hierarchy. 
diff --git a/Documentation/powerpc/phyp-assisted-dump.txt b/Documentation/powerpc/phyp-assisted-dump.txt index c4682b982a2..ad340205d96 100644 --- a/Documentation/powerpc/phyp-assisted-dump.txt +++ b/Documentation/powerpc/phyp-assisted-dump.txt @@ -19,7 +19,7 @@ dump offers several strong, practical advantages: immediately available to the system for normal use. -- After the dump is completed, no further reboots are required; the system will be fully usable, and running - in it's normal, production mode on it normal kernel. + in its normal, production mode on its normal kernel. The above can only be accomplished by coordination with, and assistance from the hypervisor. The procedure is diff --git a/Documentation/rt-mutex-design.txt b/Documentation/rt-mutex-design.txt index 4b736d24da7..8df0b782c4d 100644 --- a/Documentation/rt-mutex-design.txt +++ b/Documentation/rt-mutex-design.txt @@ -657,7 +657,7 @@ here. The waiter structure has a "task" field that points to the task that is blocked on the mutex. This field can be NULL the first time it goes through the loop -or if the task is a pending owner and had it's mutex stolen. If the "task" +or if the task is a pending owner and had its mutex stolen. If the "task" field is NULL then we need to set up the accounting for it. Task blocks on mutex diff --git a/Documentation/scsi/ChangeLog.lpfc b/Documentation/scsi/ChangeLog.lpfc index 2ffc1148eb9..e759e92e286 100644 --- a/Documentation/scsi/ChangeLog.lpfc +++ b/Documentation/scsi/ChangeLog.lpfc @@ -707,7 +707,7 @@ Changes from 20040920 to 20041018 * Integrate patches from Christoph Hellwig: two new helpers common to lpfc_sli_resume_iocb and lpfc_sli_issue_iocb - singificant cleanup of those two functions - the unused SLI_IOCB_USE_TXQ is - gone - lpfc_sli_issue_iocb_wait loses it's flags argument + gone - lpfc_sli_issue_iocb_wait loses its flags argument totally. * Fix in lpfc_sli.c: we can not store a 5 bit value in a 4-bit field. @@ -1028,7 +1028,7 @@ Changes from 20040614 to 20040709 * Remove the need for buf_tmo. * Changed ULP_BDE64 to struct ulp_bde64. * Changed ULP_BDE to struct ulp_bde. - * Cleanup lpfc_os_return_scsi_cmd() and it's call path. + * Cleanup lpfc_os_return_scsi_cmd() and its call path. * Removed lpfc_no_device_delay. * Consolidating lpfc_hba_put_event() into lpfc_put_event(). * Removed following attributes and their functionality: diff --git a/Documentation/scsi/FlashPoint.txt b/Documentation/scsi/FlashPoint.txt index d5acaa300a4..1540a92f6d2 100644 --- a/Documentation/scsi/FlashPoint.txt +++ b/Documentation/scsi/FlashPoint.txt @@ -71,7 +71,7 @@ peters@mylex.com Ever since its introduction last October, the BusLogic FlashPoint LT has been problematic for members of the Linux community, in that no Linux -drivers have been available for this new Ultra SCSI product. Despite it's +drivers have been available for this new Ultra SCSI product. Despite its officially being positioned as a desktop workstation product, and not being particularly well suited for a high performance multitasking operating system like Linux, the FlashPoint LT has been touted by computer system diff --git a/Documentation/scsi/dtc3x80.txt b/Documentation/scsi/dtc3x80.txt index e8ae6230ab3..1d7af9f9a8e 100644 --- a/Documentation/scsi/dtc3x80.txt +++ b/Documentation/scsi/dtc3x80.txt @@ -12,7 +12,7 @@ The 3180 does not. Otherwise, they are identical. The DTC3x80 does not support DMA but it does have Pseudo-DMA which is supported by the driver. -It's DTC406 scsi chip is supposedly compatible with the NCR 53C400. 
+Its DTC406 scsi chip is supposedly compatible with the NCR 53C400. It is memory mapped, uses an IRQ, but no dma or io-port. There is internal DMA, between SCSI bus and an on-chip 128-byte buffer. Double buffering is done automagically by the chip. Data is transferred diff --git a/Documentation/scsi/ncr53c8xx.txt b/Documentation/scsi/ncr53c8xx.txt index 08e2b4d04aa..cda5f8fa2c6 100644 --- a/Documentation/scsi/ncr53c8xx.txt +++ b/Documentation/scsi/ncr53c8xx.txt @@ -1479,7 +1479,7 @@ Wide16 SCSI. Enabling serial NVRAM support enables detection of the serial NVRAM included on Symbios and some Symbios compatible host adaptors, and Tekram boards. The serial NVRAM is used by Symbios and Tekram to hold set up parameters for the -host adaptor and it's attached drives. +host adaptor and its attached drives. The Symbios NVRAM also holds data on the boot order of host adaptors in a system with more than one host adaptor. This enables the order of scanning diff --git a/Documentation/scsi/osst.txt b/Documentation/scsi/osst.txt index f536907e241..2b21890bc98 100644 --- a/Documentation/scsi/osst.txt +++ b/Documentation/scsi/osst.txt @@ -40,7 +40,7 @@ behavior looks very much the same as st to the userspace applications. History ------- -In the first place, osst shared it's identity very much with st. That meant +In the first place, osst shared its identity very much with st. That meant that it used the same kernel structures and the same device node as st. So you could only have either of them being present in the kernel. This has been fixed by registering an own device, now. diff --git a/Documentation/scsi/scsi_fc_transport.txt b/Documentation/scsi/scsi_fc_transport.txt index aec6549ab09..e00192de4d1 100644 --- a/Documentation/scsi/scsi_fc_transport.txt +++ b/Documentation/scsi/scsi_fc_transport.txt @@ -70,7 +70,7 @@ Overview: up to an administrative entity controlling the vport. For example, if vports are to be associated with virtual machines, a XEN mgmt utility would be responsible for creating wwpn/wwnn's for the vport, - using it's own naming authority and OUI. (Note: it already does this + using its own naming authority and OUI. (Note: it already does this for virtual MAC addresses). @@ -81,7 +81,7 @@ Device Trees and Vport Objects: with rports and scsi target objects underneath it. Currently the FC transport creates the vport object and places it under the scsi_host object corresponding to the physical adapter. The LLDD will allocate - a new scsi_host for the vport and link it's object under the vport. + a new scsi_host for the vport and link its object under the vport. The remainder of the tree under the vports scsi_host is the same as the non-NPIV case. The transport is written currently to easily allow the parent of the vport to be something other than the scsi_host. diff --git a/Documentation/scsi/sym53c8xx_2.txt b/Documentation/scsi/sym53c8xx_2.txt index eb9a7b905b6..6f63b798967 100644 --- a/Documentation/scsi/sym53c8xx_2.txt +++ b/Documentation/scsi/sym53c8xx_2.txt @@ -687,7 +687,7 @@ maintain the driver code. Enabling serial NVRAM support enables detection of the serial NVRAM included on Symbios and some Symbios compatible host adaptors, and Tekram boards. The serial NVRAM is used by Symbios and Tekram to hold set up parameters for the -host adaptor and it's attached drives. +host adaptor and its attached drives. The Symbios NVRAM also holds data on the boot order of host adaptors in a system with more than one host adaptor. 
This information is no longer used diff --git a/Documentation/sound/alsa/soc/dapm.txt b/Documentation/sound/alsa/soc/dapm.txt index 9ac842be9b4..05bf5a0eee4 100644 --- a/Documentation/sound/alsa/soc/dapm.txt +++ b/Documentation/sound/alsa/soc/dapm.txt @@ -188,8 +188,8 @@ The WM8731 output mixer has 3 inputs (sources) 3. Mic Sidetone Input Each input in this example has a kcontrol associated with it (defined in example -above) and is connected to the output mixer via it's kcontrol name. We can now -connect the destination widget (wrt audio signal) with it's source widgets. +above) and is connected to the output mixer via its kcontrol name. We can now +connect the destination widget (wrt audio signal) with its source widgets. /* output mixer */ {"Output Mixer", "Line Bypass Switch", "Line Input"}, diff --git a/Documentation/sound/alsa/soc/machine.txt b/Documentation/sound/alsa/soc/machine.txt index bab7711ce96..2524c75557d 100644 --- a/Documentation/sound/alsa/soc/machine.txt +++ b/Documentation/sound/alsa/soc/machine.txt @@ -67,7 +67,7 @@ static struct snd_soc_dai_link corgi_dai = { .ops = &corgi_ops, }; -struct snd_soc_card then sets up the machine with it's DAIs. e.g. +struct snd_soc_card then sets up the machine with its DAIs. e.g. /* corgi audio machine driver */ static struct snd_soc_card snd_soc_corgi = { diff --git a/Documentation/sound/alsa/soc/overview.txt b/Documentation/sound/alsa/soc/overview.txt index 1e4c6d3655f..138ac88c146 100644 --- a/Documentation/sound/alsa/soc/overview.txt +++ b/Documentation/sound/alsa/soc/overview.txt @@ -33,7 +33,7 @@ features :- and machines. * Easy I2S/PCM audio interface setup between codec and SoC. Each SoC - interface and codec registers it's audio interface capabilities with the + interface and codec registers its audio interface capabilities with the core and are subsequently matched and configured when the application hardware parameters are known. diff --git a/Documentation/usb/WUSB-Design-overview.txt b/Documentation/usb/WUSB-Design-overview.txt index c480e9c32db..4c5e3793934 100644 --- a/Documentation/usb/WUSB-Design-overview.txt +++ b/Documentation/usb/WUSB-Design-overview.txt @@ -381,7 +381,7 @@ descriptor that gives us the status of the transfer, its identification we issue another URB to read into the destination buffer the chunk of data coming out of the remote endpoint. Done, wait for the next guy. The callbacks for the URBs issued from here are the ones that will declare -the xfer complete at some point and call it's callback. +the xfer complete at some point and call its callback. Seems simple, but the implementation is not trivial. diff --git a/Documentation/vm/numa_memory_policy.txt b/Documentation/vm/numa_memory_policy.txt index be45dbb9d7f..6690fc34ef6 100644 --- a/Documentation/vm/numa_memory_policy.txt +++ b/Documentation/vm/numa_memory_policy.txt @@ -45,7 +45,7 @@ most general to most specific: to establish the task policy for a child task exec()'d from an executable image that has no awareness of memory policy. See the MEMORY POLICY APIS section, below, for an overview of the system call - that a task may use to set/change it's task/process policy. + that a task may use to set/change its task/process policy. In a multi-threaded task, task policies apply only to the thread [Linux kernel task] that installs the policy and any threads @@ -301,7 +301,7 @@ decrement this reference count, respectively. mpol_put() will only free the structure back to the mempolicy kmem cache when the reference count goes to zero. 
-When a new memory policy is allocated, it's reference count is initialized +When a new memory policy is allocated, its reference count is initialized to '1', representing the reference held by the task that is installing the new policy. When a pointer to a memory policy structure is stored in another structure, another reference is added, as the task's reference will be dropped diff --git a/Documentation/w1/w1.generic b/Documentation/w1/w1.generic index e3333eec432..212f4ac31c0 100644 --- a/Documentation/w1/w1.generic +++ b/Documentation/w1/w1.generic @@ -25,7 +25,7 @@ When a w1 master driver registers with the w1 subsystem, the following occurs: - sysfs entries for that w1 master are created - the w1 bus is periodically searched for new slave devices -When a device is found on the bus, w1 core checks if driver for it's family is +When a device is found on the bus, w1 core checks if driver for its family is loaded. If so, the family driver is attached to the slave. If there is no driver for the family, default one is assigned, which allows to perform almost any kind of operations. Each logical operation is a transaction -- cgit v1.2.3 From d02f00cc057809d96c044cc72d5b9809d59f7d49 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Mon, 7 Dec 2009 13:10:48 -0800 Subject: ocfs2: allocation reservations This patch improves Ocfs2 allocation policy by allowing an inode to reserve a portion of the local alloc bitmap for itself. The reserved portion (allocation window) is advisory in that other allocation windows might steal it if the local alloc bitmap becomes full. Otherwise, the reservations are honored and guaranteed to be free. When the local alloc window is moved to a different portion of the bitmap, existing reservations are discarded. Reservation windows are represented internally by a red-black tree. Within that tree, each node represents the reservation window of one inode. An LRU of active reservations is also maintained. When new data is written, we allocate it from the inodes window. When all bits in a window are exhausted, we allocate a new one as close to the previous one as possible. Should we not find free space, an existing reservation is pulled off the LRU and cannibalized. Signed-off-by: Mark Fasheh --- Documentation/filesystems/ocfs2.txt | 3 + fs/ocfs2/Makefile | 1 + fs/ocfs2/cluster/masklog.c | 1 + fs/ocfs2/cluster/masklog.h | 1 + fs/ocfs2/localalloc.c | 64 ++- fs/ocfs2/ocfs2.h | 5 + fs/ocfs2/reservations.c | 849 ++++++++++++++++++++++++++++++++++++ fs/ocfs2/reservations.h | 154 +++++++ fs/ocfs2/suballoc.h | 2 + fs/ocfs2/super.c | 25 ++ 10 files changed, 1094 insertions(+), 11 deletions(-) create mode 100644 fs/ocfs2/reservations.c create mode 100644 fs/ocfs2/reservations.h (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/ocfs2.txt b/Documentation/filesystems/ocfs2.txt index c58b9f5ba00..412df909593 100644 --- a/Documentation/filesystems/ocfs2.txt +++ b/Documentation/filesystems/ocfs2.txt @@ -80,3 +80,6 @@ user_xattr (*) Enables Extended User Attributes. nouser_xattr Disables Extended User Attributes. acl Enables POSIX Access Control Lists support. noacl (*) Disables POSIX Access Control Lists support. +resv_level=4 (*) Set how agressive allocation reservations will be. + Valid values are between 0 (reservations off) to 8 + (maximum space for reservations). 
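For illustration, the new resv_level knob is passed like any other ocfs2
mount option; assuming a volume on /dev/sda1 and a mount point of
/mnt/ocfs2 (both placeholders, not taken from the patch), a smaller
reservation window than the default could be requested with:

  # mount -t ocfs2 -o resv_level=2 /dev/sda1 /mnt/ocfs2

Here resv_level=0 turns reservations off entirely, while larger values
set aside progressively larger per-inode windows from the local alloc
bitmap.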
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile index 791c0886c06..07d9fd85435 100644 --- a/fs/ocfs2/Makefile +++ b/fs/ocfs2/Makefile @@ -29,6 +29,7 @@ ocfs2-objs := \ mmap.o \ namei.o \ refcounttree.o \ + reservations.o \ resize.o \ slot_map.o \ suballoc.o \ diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c index 3bb928a2bf7..c7fba396392 100644 --- a/fs/ocfs2/cluster/masklog.c +++ b/fs/ocfs2/cluster/masklog.c @@ -116,6 +116,7 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = { define_mask(ERROR), define_mask(NOTICE), define_mask(KTHREAD), + define_mask(RESERVATIONS), }; static struct attribute *mlog_attr_ptrs[MLOG_MAX_BITS] = {NULL, }; diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h index 3dfddbec32f..fd96e2a2fa5 100644 --- a/fs/ocfs2/cluster/masklog.h +++ b/fs/ocfs2/cluster/masklog.h @@ -119,6 +119,7 @@ #define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */ #define ML_NOTICE 0x0000000200000000ULL /* setn to KERN_NOTICE */ #define ML_KTHREAD 0x0000000400000000ULL /* kernel thread activity */ +#define ML_RESERVATIONS 0x0000000800000000ULL /* ocfs2 alloc reservations */ #define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE) #define MLOG_INITIAL_NOT_MASK (ML_ENTRY|ML_EXIT) diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index 7e7dd65d97e..7fe8149a000 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -52,7 +52,8 @@ static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc); static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb, struct ocfs2_dinode *alloc, - u32 numbits); + u32 *numbits, + struct ocfs2_alloc_reservation *resv); static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc); @@ -262,6 +263,8 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb) osb->local_alloc_state = OCFS2_LA_DISABLED; + ocfs2_resmap_uninit(&osb->osb_la_resmap); + main_bm_inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE, OCFS2_INVALID_SLOT); @@ -493,7 +496,7 @@ static int ocfs2_local_alloc_in_range(struct inode *inode, alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; la = OCFS2_LOCAL_ALLOC(alloc); - start = ocfs2_local_alloc_find_clear_bits(osb, alloc, bits_wanted); + start = ocfs2_local_alloc_find_clear_bits(osb, alloc, &bits_wanted, NULL); if (start == -1) { mlog_errno(-ENOSPC); return 0; @@ -659,7 +662,8 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb, alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; la = OCFS2_LOCAL_ALLOC(alloc); - start = ocfs2_local_alloc_find_clear_bits(osb, alloc, bits_wanted); + start = ocfs2_local_alloc_find_clear_bits(osb, alloc, &bits_wanted, + ac->ac_resv); if (start == -1) { /* TODO: Shouldn't we just BUG here? */ status = -ENOSPC; @@ -669,8 +673,6 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb, bitmap = la->la_bitmap; *bit_off = le32_to_cpu(la->la_bm_off) + start; - /* local alloc is always contiguous by nature -- we never - * delete bits from it! 
*/ *num_bits = bits_wanted; status = ocfs2_journal_access_di(handle, @@ -682,6 +684,9 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb, goto bail; } + ocfs2_resmap_claimed_bits(&osb->osb_la_resmap, ac->ac_resv, start, + bits_wanted); + while(bits_wanted--) ocfs2_set_bit(start++, bitmap); @@ -711,13 +716,17 @@ static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc) } static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb, - struct ocfs2_dinode *alloc, - u32 numbits) + struct ocfs2_dinode *alloc, + u32 *numbits, + struct ocfs2_alloc_reservation *resv) { int numfound, bitoff, left, startoff, lastzero; + int local_resv = 0; + struct ocfs2_alloc_reservation r; void *bitmap = NULL; + struct ocfs2_reservation_map *resmap = &osb->osb_la_resmap; - mlog_entry("(numbits wanted = %u)\n", numbits); + mlog_entry("(numbits wanted = %u)\n", *numbits); if (!alloc->id1.bitmap1.i_total) { mlog(0, "No bits in my window!\n"); @@ -725,6 +734,30 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb, goto bail; } + if (!resv) { + local_resv = 1; + ocfs2_resv_init_once(&r); + ocfs2_resv_set_type(&r, OCFS2_RESV_FLAG_TMP); + resv = &r; + } + + numfound = *numbits; + if (ocfs2_resmap_resv_bits(resmap, resv, &bitoff, &numfound) == 0) { + if (numfound < *numbits) + *numbits = numfound; + goto bail; + } + + /* + * Code error. While reservations are enabled, local + * allocation should _always_ go through them. + */ + BUG_ON(osb->osb_resv_level != 0); + + /* + * Reservations are disabled. Handle this the old way. + */ + bitmap = OCFS2_LOCAL_ALLOC(alloc)->la_bitmap; numfound = bitoff = startoff = 0; @@ -750,7 +783,7 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb, startoff = bitoff+1; } /* we got everything we needed */ - if (numfound == numbits) { + if (numfound == *numbits) { /* mlog(0, "Found it all!\n"); */ break; } @@ -759,12 +792,18 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb, mlog(0, "Exiting loop, bitoff = %d, numfound = %d\n", bitoff, numfound); - if (numfound == numbits) + if (numfound == *numbits) { bitoff = startoff - numfound; - else + *numbits = numfound; + } else { + numfound = 0; bitoff = -1; + } bail: + if (local_resv) + ocfs2_resv_discard(resmap, resv); + mlog_exit(bitoff); return bitoff; } @@ -1087,6 +1126,9 @@ retry_enospc: memset(OCFS2_LOCAL_ALLOC(alloc)->la_bitmap, 0, le16_to_cpu(la->la_size)); + ocfs2_resmap_restart(&osb->osb_la_resmap, cluster_count, + OCFS2_LOCAL_ALLOC(alloc)->la_bitmap); + mlog(0, "New window allocated:\n"); mlog(0, "window la_bm_off = %u\n", OCFS2_LOCAL_ALLOC(alloc)->la_bm_off); diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index adf5e2ebc2c..9552560df6c 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -47,6 +47,7 @@ /* For struct ocfs2_blockcheck_stats */ #include "blockcheck.h" +#include "reservations.h" /* Caching of metadata buffers */ @@ -349,6 +350,10 @@ struct ocfs2_super u64 la_last_gd; + struct ocfs2_reservation_map osb_la_resmap; + + unsigned int osb_resv_level; + /* Next three fields are for local node slot recovery during * mount. 
*/ int dirty; diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c new file mode 100644 index 00000000000..79642d60821 --- /dev/null +++ b/fs/ocfs2/reservations.c @@ -0,0 +1,849 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * reservations.c + * + * Allocation reservations implementation + * + * Some code borrowed from fs/ext3/balloc.c and is: + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * The rest is copyright (C) 2010 Novell. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include +#include +#include +#include +#include +#include + +#define MLOG_MASK_PREFIX ML_RESERVATIONS +#include + +#include "ocfs2.h" + +#ifdef CONFIG_OCFS2_DEBUG_FS +#define OCFS2_CHECK_RESERVATIONS +#endif + +DEFINE_SPINLOCK(resv_lock); + +#define OCFS2_MIN_RESV_WINDOW_BITS 8 +#define OCFS2_MAX_RESV_WINDOW_BITS 1024 + +static unsigned int ocfs2_resv_window_bits(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv) +{ + struct ocfs2_super *osb = resmap->m_osb; + unsigned int bits; + + /* 8, 16, 32, 64, 128, 256, 512, 1024 */ + bits = 4 << osb->osb_resv_level; + + return bits; +} + +static inline unsigned int ocfs2_resv_end(struct ocfs2_alloc_reservation *resv) +{ + if (resv->r_len) + return resv->r_start + resv->r_len - 1; + return resv->r_start; +} + +static inline int ocfs2_resv_empty(struct ocfs2_alloc_reservation *resv) +{ + return !!(resv->r_len == 0); +} + +static inline int ocfs2_resmap_disabled(struct ocfs2_reservation_map *resmap) +{ + if (resmap->m_osb->osb_resv_level == 0) + return 1; + return 0; +} + +static void ocfs2_dump_resv(struct ocfs2_reservation_map *resmap) +{ + struct ocfs2_super *osb = resmap->m_osb; + struct rb_node *node; + struct ocfs2_alloc_reservation *resv; + int i = 0; + + mlog(ML_NOTICE, "Dumping resmap for device %s. Bitmap length: %u\n", + osb->dev_str, resmap->m_bitmap_len); + + node = rb_first(&resmap->m_reservations); + while (node) { + resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node); + + mlog(ML_NOTICE, "start: %u\tend: %u\tlen: %u\tlast_start: %u" + "\tlast_len: %u\n", resv->r_start, + ocfs2_resv_end(resv), resv->r_len, resv->r_last_start, + resv->r_last_len); + + node = rb_next(node); + i++; + } + + mlog(ML_NOTICE, "%d reservations found. 
LRU follows\n", i); + + i = 0; + list_for_each_entry(resv, &resmap->m_lru, r_lru) { + mlog(ML_NOTICE, "LRU(%d) start: %u\tend: %u\tlen: %u\t" + "last_start: %u\tlast_len: %u\n", i, resv->r_start, + ocfs2_resv_end(resv), resv->r_len, resv->r_last_start, + resv->r_last_len); + + i++; + } +} + +#ifdef OCFS2_CHECK_RESERVATIONS +static int ocfs2_validate_resmap_bits(struct ocfs2_reservation_map *resmap, + int i, + struct ocfs2_alloc_reservation *resv) +{ + char *disk_bitmap = resmap->m_disk_bitmap; + unsigned int start = resv->r_start; + unsigned int end = ocfs2_resv_end(resv); + + while (start <= end) { + if (ocfs2_test_bit(start, disk_bitmap)) { + mlog(ML_ERROR, + "reservation %d covers an allocated area " + "starting at bit %u!\n", i, start); + return 1; + } + + start++; + } + return 0; +} + +static void ocfs2_check_resmap(struct ocfs2_reservation_map *resmap) +{ + unsigned int off = 0; + int i = 0; + struct rb_node *node; + struct ocfs2_alloc_reservation *resv; + + node = rb_first(&resmap->m_reservations); + while (node) { + resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node); + + if (i > 0 && resv->r_start <= off) { + mlog(ML_ERROR, "reservation %d has bad start off!\n", + i); + goto bad; + } + + if (resv->r_len == 0) { + mlog(ML_ERROR, "reservation %d has no length!\n", + i); + goto bad; + } + + if (resv->r_start > ocfs2_resv_end(resv)) { + mlog(ML_ERROR, "reservation %d has invalid range!\n", + i); + goto bad; + } + + if (ocfs2_resv_end(resv) >= resmap->m_bitmap_len) { + mlog(ML_ERROR, "reservation %d extends past bitmap!\n", + i); + goto bad; + } + + if (ocfs2_validate_resmap_bits(resmap, i, resv)) + goto bad; + + off = ocfs2_resv_end(resv); + node = rb_next(node); + + i++; + } + return; + +bad: + ocfs2_dump_resv(resmap); + BUG(); +} +#else +static inline void ocfs2_check_resmap(struct ocfs2_reservation_map *resmap) +{ + +} +#endif + +void ocfs2_resv_init_once(struct ocfs2_alloc_reservation *resv) +{ + memset(resv, 0, sizeof(*resv)); + INIT_LIST_HEAD(&resv->r_lru); +} + +void ocfs2_resv_set_type(struct ocfs2_alloc_reservation *resv, + unsigned int flags) +{ + BUG_ON(flags & ~OCFS2_RESV_TYPES); + + resv->r_flags |= flags; +} + +int ocfs2_resmap_init(struct ocfs2_super *osb, + struct ocfs2_reservation_map *resmap) +{ + memset(resmap, 0, sizeof(*resmap)); + + resmap->m_osb = osb; + resmap->m_reservations = RB_ROOT; + /* m_bitmap_len is initialized to zero by the above memset. */ + INIT_LIST_HEAD(&resmap->m_lru); + + return 0; +} + +static void ocfs2_resv_mark_lru(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv) +{ + assert_spin_locked(&resv_lock); + + if (!list_empty(&resv->r_lru)) + list_del_init(&resv->r_lru); + + list_add_tail(&resv->r_lru, &resmap->m_lru); +} + +static void __ocfs2_resv_trunc(struct ocfs2_alloc_reservation *resv) +{ + resv->r_len = 0; + resv->r_start = 0; +} + +static void ocfs2_resv_remove(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv) +{ + if (resv->r_flags & OCFS2_RESV_FLAG_INUSE) { + list_del_init(&resv->r_lru); + rb_erase(&resv->r_node, &resmap->m_reservations); + resv->r_flags &= ~OCFS2_RESV_FLAG_INUSE; + } +} + +static void __ocfs2_resv_discard(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv) +{ + assert_spin_locked(&resv_lock); + + __ocfs2_resv_trunc(resv); + /* + * last_len and last_start no longer make sense if + * we're changing the range of our allocations. 
+ */ + resv->r_last_len = resv->r_last_start = 0; + + ocfs2_resv_remove(resmap, resv); +} + +/* does nothing if 'resv' is null */ +void ocfs2_resv_discard(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv) +{ + if (resv) { + spin_lock(&resv_lock); + __ocfs2_resv_discard(resmap, resv); + spin_unlock(&resv_lock); + } +} + +static void ocfs2_resmap_clear_all_resv(struct ocfs2_reservation_map *resmap) +{ + struct rb_node *node; + struct ocfs2_alloc_reservation *resv; + + assert_spin_locked(&resv_lock); + + while ((node = rb_last(&resmap->m_reservations)) != NULL) { + resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node); + + __ocfs2_resv_discard(resmap, resv); + } +} + +void ocfs2_resmap_restart(struct ocfs2_reservation_map *resmap, + unsigned int clen, char *disk_bitmap) +{ + if (ocfs2_resmap_disabled(resmap)) + return; + + spin_lock(&resv_lock); + + ocfs2_resmap_clear_all_resv(resmap); + resmap->m_bitmap_len = clen; + resmap->m_disk_bitmap = disk_bitmap; + + spin_unlock(&resv_lock); +} + +void ocfs2_resmap_uninit(struct ocfs2_reservation_map *resmap) +{ + /* Does nothing for now. Keep this around for API symmetry */ +} + +static void ocfs2_resv_insert(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *new) +{ + struct rb_root *root = &resmap->m_reservations; + struct rb_node *parent = NULL; + struct rb_node **p = &root->rb_node; + struct ocfs2_alloc_reservation *tmp; + + assert_spin_locked(&resv_lock); + + mlog(0, "Insert reservation start: %u len: %u\n", new->r_start, + new->r_len); + + while (*p) { + parent = *p; + + tmp = rb_entry(parent, struct ocfs2_alloc_reservation, r_node); + + if (new->r_start < tmp->r_start) { + p = &(*p)->rb_left; + + /* + * This is a good place to check for + * overlapping reservations. + */ + BUG_ON(ocfs2_resv_end(new) >= tmp->r_start); + } else if (new->r_start > ocfs2_resv_end(tmp)) { + p = &(*p)->rb_right; + } else { + /* This should never happen! */ + mlog(ML_ERROR, "Duplicate reservation window!\n"); + BUG(); + } + } + + rb_link_node(&new->r_node, parent, p); + rb_insert_color(&new->r_node, root); + new->r_flags |= OCFS2_RESV_FLAG_INUSE; + + ocfs2_resv_mark_lru(resmap, new); + + ocfs2_check_resmap(resmap); +} + +/** + * ocfs2_find_resv_lhs() - find the window which contains goal + * @resmap: reservation map to search + * @goal: which bit to search for + * + * If a window containing that goal is not found, we return the window + * which comes before goal. Returns NULL on empty rbtree or no window + * before goal. + */ +static struct ocfs2_alloc_reservation * +ocfs2_find_resv_lhs(struct ocfs2_reservation_map *resmap, unsigned int goal) +{ + struct ocfs2_alloc_reservation *resv = NULL; + struct ocfs2_alloc_reservation *prev_resv = NULL; + struct rb_node *node = resmap->m_reservations.rb_node; + struct rb_node *prev = NULL; + + assert_spin_locked(&resv_lock); + + if (!node) + return NULL; + + node = rb_first(&resmap->m_reservations); + while (node) { + resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node); + + if (resv->r_start <= goal && ocfs2_resv_end(resv) >= goal) + break; + + /* Check if we overshot the reservation just before goal? */ + if (resv->r_start > goal) { + resv = prev_resv; + break; + } + + prev_resv = resv; + prev = node; + node = rb_next(node); + } + + return resv; +} + +/* + * We are given a range within the bitmap, which corresponds to a gap + * inside the reservations tree (search_start, search_len). 
The range + * can be anything from the whole bitmap, to a gap between + * reservations. + * + * The start value of *rstart is insignificant. + * + * This function searches the bitmap range starting at search_start + * with length csearch_len for a set of contiguous free bits. We try + * to find up to 'wanted' bits, but can sometimes return less. + * + * Returns the length of allocation, 0 if no free bits are found. + * + * *cstart and *clen will also be populated with the result. + */ +static int ocfs2_resmap_find_free_bits(struct ocfs2_reservation_map *resmap, + unsigned int wanted, + unsigned int search_start, + unsigned int search_len, + unsigned int *rstart, + unsigned int *rlen) +{ + void *bitmap = resmap->m_disk_bitmap; + unsigned int best_start, best_len = 0; + int offset, start, found; + + mlog(0, "Find %u bits within range (%u, len %u) resmap len: %u\n", + wanted, search_start, search_len, resmap->m_bitmap_len); + + found = best_start = best_len = 0; + + start = search_start; + while ((offset = ocfs2_find_next_zero_bit(bitmap, resmap->m_bitmap_len, + start)) != -1) { + /* Search reached end of the region */ + if (offset >= (search_start + search_len)) + break; + + if (offset == start) { + /* we found a zero */ + found++; + /* move start to the next bit to test */ + start++; + } else { + /* got a zero after some ones */ + found = 1; + start = offset + 1; + } + if (found > best_len) { + best_len = found; + best_start = start - found; + } + + if (found >= wanted) + break; + } + + if (best_len == 0) + return 0; + + if (best_len >= wanted) + best_len = wanted; + + *rlen = best_len; + *rstart = best_start; + + mlog(0, "Found start: %u len: %u\n", best_start, best_len); + + return *rlen; +} + +static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv, + unsigned int goal, unsigned int wanted) +{ + struct rb_root *root = &resmap->m_reservations; + unsigned int gap_start, gap_end, gap_len; + struct ocfs2_alloc_reservation *prev_resv, *next_resv; + struct rb_node *prev, *next; + unsigned int cstart, clen; + unsigned int best_start = 0, best_len = 0; + + /* + * Nasty cases to consider: + * + * - rbtree is empty + * - our window should be first in all reservations + * - our window should be last in all reservations + * - need to make sure we don't go past end of bitmap + */ + + mlog(0, "resv start: %u resv end: %u goal: %u wanted: %u\n", + resv->r_start, ocfs2_resv_end(resv), goal, wanted); + + assert_spin_locked(&resv_lock); + + if (RB_EMPTY_ROOT(root)) { + /* + * Easiest case - empty tree. We can just take + * whatever window of free bits we want. + */ + + mlog(0, "Empty root\n"); + + clen = ocfs2_resmap_find_free_bits(resmap, wanted, goal, + resmap->m_bitmap_len - goal, + &cstart, &clen); + + /* + * This should never happen - the local alloc window + * will always have free bits when we're called. + */ + BUG_ON(goal == 0 && clen == 0); + + if (clen == 0) + return; + + resv->r_start = cstart; + resv->r_len = clen; + + ocfs2_resv_insert(resmap, resv); + return; + } + + prev_resv = ocfs2_find_resv_lhs(resmap, goal); + + if (prev_resv == NULL) { + mlog(0, "Goal on LHS of leftmost window\n"); + + /* + * A NULL here means that the search code couldn't + * find a window that starts before goal. + * + * However, we can take the first window after goal, + * which is also by definition, the leftmost window in + * the entire tree. 
If we can find free bits in the + * gap between goal and the LHS window, then the + * reservation can safely be placed there. + * + * Otherwise we fall back to a linear search, checking + * the gaps in between windows for a place to + * allocate. + */ + + next = rb_first(root); + next_resv = rb_entry(next, struct ocfs2_alloc_reservation, + r_node); + + /* + * The search should never return such a window. (see + * comment above + */ + if (next_resv->r_start <= goal) { + mlog(ML_ERROR, "goal: %u next_resv: start %u len %u\n", + goal, next_resv->r_start, next_resv->r_len); + ocfs2_dump_resv(resmap); + BUG(); + } + + clen = ocfs2_resmap_find_free_bits(resmap, wanted, goal, + next_resv->r_start - goal, + &cstart, &clen); + if (clen) { + best_len = clen; + best_start = cstart; + if (best_len == wanted) + goto out_insert; + } + + prev_resv = next_resv; + next_resv = NULL; + } + + prev = &prev_resv->r_node; + + /* Now we do a linear search for a window, starting at 'prev_rsv' */ + while (1) { + next = rb_next(prev); + if (next) { + mlog(0, "One more resv found in linear search\n"); + next_resv = rb_entry(next, + struct ocfs2_alloc_reservation, + r_node); + + gap_start = ocfs2_resv_end(prev_resv) + 1; + gap_end = next_resv->r_start - 1; + gap_len = gap_end - gap_start + 1; + } else { + mlog(0, "No next node\n"); + /* + * We're at the rightmost edge of the + * tree. See if a reservation between this + * window and the end of the bitmap will work. + */ + gap_start = ocfs2_resv_end(prev_resv) + 1; + gap_len = resmap->m_bitmap_len - gap_start; + gap_end = resmap->m_bitmap_len - 1; + } + + /* + * No need to check this gap if we have already found + * a larger region of free bits. + */ + if (gap_len <= best_len) + goto next_resv; + + clen = ocfs2_resmap_find_free_bits(resmap, wanted, gap_start, + gap_len, &cstart, &clen); + if (clen == wanted) { + best_len = clen; + best_start = cstart; + goto out_insert; + } else if (clen > best_len) { + best_len = clen; + best_start = cstart; + } + +next_resv: + if (!next) + break; + + prev = next; + prev_resv = rb_entry(prev, struct ocfs2_alloc_reservation, + r_node); + } + +out_insert: + if (best_len) { + resv->r_start = best_start; + resv->r_len = best_len; + ocfs2_resv_insert(resmap, resv); + } +} + +static void ocfs2_cannibalize_resv(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv, + unsigned int wanted) +{ + struct ocfs2_alloc_reservation *lru_resv; + int tmpwindow = !!(resv->r_flags & OCFS2_RESV_FLAG_TMP); + unsigned int min_bits; + + if (!tmpwindow) + min_bits = ocfs2_resv_window_bits(resmap, resv) >> 1; + else + min_bits = wanted; /* We at know the temp window will use all + * of these bits */ + + /* + * Take the first reservation off the LRU as our 'target'. We + * don't try to be smart about it. There might be a case for + * searching based on size but I don't have enough data to be + * sure. --Mark (3/16/2010) + */ + lru_resv = list_first_entry(&resmap->m_lru, + struct ocfs2_alloc_reservation, r_lru); + + mlog(0, "lru resv: start: %u len: %u end: %u\n", lru_resv->r_start, + lru_resv->r_len, ocfs2_resv_end(lru_resv)); + + /* + * Cannibalize (some or all) of the target reservation and + * feed it to the current window. + */ + if (lru_resv->r_len <= min_bits) { + /* + * Discard completely if size is less than or equal to a + * reasonable threshold - 50% of window bits for non temporary + * windows. 
+ */ + resv->r_start = lru_resv->r_start; + resv->r_len = lru_resv->r_len; + + __ocfs2_resv_discard(resmap, lru_resv); + } else { + unsigned int shrink; + if (tmpwindow) + shrink = min_bits; + else + shrink = lru_resv->r_len / 2; + + lru_resv->r_len -= shrink; + + resv->r_start = ocfs2_resv_end(lru_resv) + 1; + resv->r_len = shrink; + } + + mlog(0, "Reservation now looks like: r_start: %u r_end: %u " + "r_len: %u r_last_start: %u r_last_len: %u\n", + resv->r_start, ocfs2_resv_end(resv), resv->r_len, + resv->r_last_start, resv->r_last_len); + + ocfs2_resv_insert(resmap, resv); +} + +static void ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv, + unsigned int wanted) +{ + unsigned int goal = 0; + + BUG_ON(!ocfs2_resv_empty(resv)); + + /* + * Begin by trying to get a window as close to the previous + * one as possible. Using the most recent allocation as a + * start goal makes sense. + */ + if (resv->r_last_len) { + goal = resv->r_last_start + resv->r_last_len; + if (goal >= resmap->m_bitmap_len) + goal = 0; + } + + __ocfs2_resv_find_window(resmap, resv, goal, wanted); + + /* Search from last alloc didn't work, try once more from beginning. */ + if (ocfs2_resv_empty(resv) && goal != 0) + __ocfs2_resv_find_window(resmap, resv, 0, wanted); + + if (ocfs2_resv_empty(resv)) { + /* + * Still empty? Pull oldest one off the LRU, remove it from + * tree, put this one in it's place. + */ + ocfs2_cannibalize_resv(resmap, resv, wanted); + } + + BUG_ON(ocfs2_resv_empty(resv)); +} + +int ocfs2_resmap_resv_bits(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv, + int *cstart, int *clen) +{ + unsigned int wanted = *clen; + + if (resv == NULL || ocfs2_resmap_disabled(resmap)) + return -ENOSPC; + + spin_lock(&resv_lock); + + /* + * We don't want to over-allocate for temporary + * windows. Otherwise, we run the risk of fragmenting the + * allocation space. + */ + wanted = ocfs2_resv_window_bits(resmap, resv); + if ((resv->r_flags & OCFS2_RESV_FLAG_TMP) || wanted < *clen) + wanted = *clen; + + if (ocfs2_resv_empty(resv)) { + mlog(0, "empty reservation, find new window\n"); + + /* + * Try to get a window here. If it works, we must fall + * through and test the bitmap . This avoids some + * ping-ponging of windows due to non-reserved space + * being allocation before we initialize a window for + * that inode. + */ + ocfs2_resv_find_window(resmap, resv, wanted); + } + + BUG_ON(ocfs2_resv_empty(resv)); + + *cstart = resv->r_start; + *clen = resv->r_len; + + spin_unlock(&resv_lock); + return 0; +} + +static void + ocfs2_adjust_resv_from_alloc(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv, + unsigned int start, unsigned int end) +{ + unsigned int lhs = 0, rhs = 0; + + BUG_ON(start < resv->r_start); + + /* + * Completely used? We can remove it then. + */ + if (ocfs2_resv_end(resv) <= end && resv->r_start >= start) { + __ocfs2_resv_discard(resmap, resv); + return; + } + + if (end < ocfs2_resv_end(resv)) + rhs = end - ocfs2_resv_end(resv); + + if (start > resv->r_start) + lhs = start - resv->r_start; + + /* + * This should have been trapped above. At the very least, rhs + * should be non zero. 
+ */ + BUG_ON(rhs == 0 && lhs == 0); + + if (rhs >= lhs) { + unsigned int old_end = ocfs2_resv_end(resv); + + resv->r_start = end + 1; + resv->r_len = old_end - resv->r_start + 1; + } else { + resv->r_len = start - resv->r_start; + } +} + +void ocfs2_resmap_claimed_bits(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv, + u32 cstart, u32 clen) +{ + unsigned int cend = cstart + clen - 1; + + if (resmap == NULL || ocfs2_resmap_disabled(resmap)) + return; + + if (resv == NULL) + return; + + spin_lock(&resv_lock); + + mlog(0, "claim bits: cstart: %u cend: %u clen: %u r_start: %u " + "r_end: %u r_len: %u, r_last_start: %u r_last_len: %u\n", + cstart, cend, clen, resv->r_start, ocfs2_resv_end(resv), + resv->r_len, resv->r_last_start, resv->r_last_len); + + BUG_ON(cstart < resv->r_start); + BUG_ON(cstart > ocfs2_resv_end(resv)); + BUG_ON(cend > ocfs2_resv_end(resv)); + + ocfs2_adjust_resv_from_alloc(resmap, resv, cstart, cend); + resv->r_last_start = cstart; + resv->r_last_len = clen; + + /* + * May have been discarded above from + * ocfs2_adjust_resv_from_alloc(). + */ + if (!ocfs2_resv_empty(resv)) + ocfs2_resv_mark_lru(resmap, resv); + + mlog(0, "Reservation now looks like: r_start: %u r_end: %u " + "r_len: %u r_last_start: %u r_last_len: %u\n", + resv->r_start, ocfs2_resv_end(resv), resv->r_len, + resv->r_last_start, resv->r_last_len); + + ocfs2_check_resmap(resmap); + + spin_unlock(&resv_lock); +} diff --git a/fs/ocfs2/reservations.h b/fs/ocfs2/reservations.h new file mode 100644 index 00000000000..8341cd0ef85 --- /dev/null +++ b/fs/ocfs2/reservations.h @@ -0,0 +1,154 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * reservations.h + * + * Allocation reservations function prototypes and structures. + * + * Copyright (C) 2010 Novell. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#ifndef OCFS2_RESERVATIONS_H +#define OCFS2_RESERVATIONS_H + +#include + +#define OCFS2_DEFAULT_RESV_LEVEL 4 +#define OCFS2_MAX_RESV_LEVEL 9 +#define OCFS2_MIN_RESV_LEVEL 0 + +struct ocfs2_alloc_reservation { + struct rb_node r_node; + + unsigned int r_start; /* Begining of current window */ + unsigned int r_len; /* Length of the window */ + + unsigned int r_last_len; /* Length of most recent alloc */ + unsigned int r_last_start; /* Start of most recent alloc */ + struct list_head r_lru; /* LRU list head */ + + unsigned int r_flags; +}; + +#define OCFS2_RESV_FLAG_INUSE 0x01 /* Set when r_node is part of a btree */ +#define OCFS2_RESV_FLAG_TMP 0x02 /* Temporary reservation, will be + * destroyed immedately after use */ + +struct ocfs2_reservation_map { + struct rb_root m_reservations; + char *m_disk_bitmap; + + struct ocfs2_super *m_osb; + + /* The following are not initialized to meaningful values until a disk + * bitmap is provided. */ + u32 m_bitmap_len; /* Number of valid + * bits available */ + + struct list_head m_lru; /* LRU of reservations + * structures. 
*/ + +}; + +void ocfs2_resv_init_once(struct ocfs2_alloc_reservation *resv); + +#define OCFS2_RESV_TYPES (OCFS2_RESV_FLAG_TMP) +void ocfs2_resv_set_type(struct ocfs2_alloc_reservation *resv, + unsigned int flags); + +/** + * ocfs2_resv_discard() - truncate a reservation + * @resmap: + * @resv: the reservation to truncate. + * + * After this function is called, the reservation will be empty, and + * unlinked from the rbtree. + */ +void ocfs2_resv_discard(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv); + + +/** + * ocfs2_resmap_init() - Initialize fields of a reservations bitmap + * @resmap: struct ocfs2_reservation_map to initialize + * @obj: unused for now + * @ops: unused for now + * @max_bitmap_bytes: Maximum size of the bitmap (typically blocksize) + * + * Only possible return value other than '0' is -ENOMEM for failure to + * allocation mirror bitmap. + */ +int ocfs2_resmap_init(struct ocfs2_super *osb, + struct ocfs2_reservation_map *resmap); + +/** + * ocfs2_resmap_restart() - "restart" a reservation bitmap + * @resmap: reservations bitmap + * @clen: Number of valid bits in the bitmap + * @disk_bitmap: the disk bitmap this resmap should refer to. + * + * Re-initialize the parameters of a reservation bitmap. This is + * useful for local alloc window slides. + * + * This function will call ocfs2_trunc_resv against all existing + * reservations. A future version will recalculate existing + * reservations based on the new bitmap. + */ +void ocfs2_resmap_restart(struct ocfs2_reservation_map *resmap, + unsigned int clen, char *disk_bitmap); + +/** + * ocfs2_resmap_uninit() - uninitialize a reservation bitmap structure + * @resmap: the struct ocfs2_reservation_map to uninitialize + */ +void ocfs2_resmap_uninit(struct ocfs2_reservation_map *resmap); + +/** + * ocfs2_resmap_resv_bits() - Return still-valid reservation bits + * @resmap: reservations bitmap + * @resv: reservation to base search from + * @cstart: start of proposed allocation + * @clen: length (in clusters) of proposed allocation + * + * Using the reservation data from resv, this function will compare + * resmap and resmap->m_disk_bitmap to determine what part (if any) of + * the reservation window is still clear to use. If resv is empty, + * this function will try to allocate a window for it. + * + * On success, zero is returned and the valid allocation area is set in cstart + * and clen. + * + * Returns -ENOSPC if reservations are disabled. + */ +int ocfs2_resmap_resv_bits(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv, + int *cstart, int *clen); + +/** + * ocfs2_resmap_claimed_bits() - Tell the reservation code that bits were used. + * @resmap: reservations bitmap + * @resv: optional reservation to recalulate based on new bitmap + * @cstart: start of allocation in clusters + * @clen: end of allocation in clusters. + * + * Tell the reservation code that bits were used to fulfill allocation in + * resmap. The bits don't have to have been part of any existing + * reservation. But we must always call this function when bits are claimed. + * Internally, the reservations code will use this information to mark the + * reservations bitmap. If resv is passed, it's next allocation window will be + * calculated. 
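+ *
+ * A rough sketch of the intended call sequence (illustrative only, not
+ * lifted from the ocfs2 allocator; 'wanted', 'status', 'bit_off',
+ * 'num_bits' and 'used' are hypothetical locals and error handling is
+ * omitted):
+ *
+ *	int bit_off, num_bits = wanted;
+ *
+ *	status = ocfs2_resmap_resv_bits(resmap, resv, &bit_off, &num_bits);
+ *	if (!status) {
+ *		(search the disk bitmap within [bit_off, bit_off + num_bits),
+ *		 claim 'used' bits starting at 'bit_off', then report it)
+ *		ocfs2_resmap_claimed_bits(resmap, resv, bit_off, used);
+ *	}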
+ */ +void ocfs2_resmap_claimed_bits(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv, + u32 cstart, u32 clen); + +#endif /* OCFS2_RESERVATIONS_H */ diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h index e0f46df357e..da2f29a55ec 100644 --- a/fs/ocfs2/suballoc.h +++ b/fs/ocfs2/suballoc.h @@ -54,6 +54,8 @@ struct ocfs2_alloc_context { u64 ac_last_group; u64 ac_max_block; /* Highest block number to allocate. 0 is is the same as ~0 - unlimited */ + + struct ocfs2_alloc_reservation *ac_resv; }; void ocfs2_init_steal_slots(struct ocfs2_super *osb); diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index dee03197a49..cfe672e72b2 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -95,6 +95,7 @@ struct mount_options unsigned int atime_quantum; signed short slot; unsigned int localalloc_opt; + unsigned int resv_level; char cluster_stack[OCFS2_STACK_LABEL_LEN + 1]; }; @@ -176,6 +177,7 @@ enum { Opt_noacl, Opt_usrquota, Opt_grpquota, + Opt_resv_level, Opt_err, }; @@ -202,6 +204,7 @@ static const match_table_t tokens = { {Opt_noacl, "noacl"}, {Opt_usrquota, "usrquota"}, {Opt_grpquota, "grpquota"}, + {Opt_resv_level, "resv_level=%u"}, {Opt_err, NULL} }; @@ -1030,6 +1033,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) osb->osb_commit_interval = parsed_options.commit_interval; osb->local_alloc_default_bits = ocfs2_megabytes_to_clusters(sb, parsed_options.localalloc_opt); osb->local_alloc_bits = osb->local_alloc_default_bits; + osb->osb_resv_level = parsed_options.resv_level; status = ocfs2_verify_userspace_stack(osb, &parsed_options); if (status) @@ -1290,6 +1294,7 @@ static int ocfs2_parse_options(struct super_block *sb, mopt->slot = OCFS2_INVALID_SLOT; mopt->localalloc_opt = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE; mopt->cluster_stack[0] = '\0'; + mopt->resv_level = OCFS2_DEFAULT_RESV_LEVEL; if (!options) { status = 1; @@ -1433,6 +1438,17 @@ static int ocfs2_parse_options(struct super_block *sb, mopt->mount_opt |= OCFS2_MOUNT_NO_POSIX_ACL; mopt->mount_opt &= ~OCFS2_MOUNT_POSIX_ACL; break; + case Opt_resv_level: + if (is_remount) + break; + if (match_int(&args[0], &option)) { + status = 0; + goto bail; + } + if (option >= OCFS2_MIN_RESV_LEVEL && + option < OCFS2_MAX_RESV_LEVEL) + mopt->resv_level = option; + break; default: mlog(ML_ERROR, "Unrecognized mount option \"%s\" " @@ -1514,6 +1530,9 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt) else seq_printf(s, ",noacl"); + if (osb->osb_resv_level != OCFS2_DEFAULT_RESV_LEVEL) + seq_printf(s, ",resv_level=%d", osb->osb_resv_level); + return 0; } @@ -2042,6 +2061,12 @@ static int ocfs2_initialize_super(struct super_block *sb, init_waitqueue_head(&osb->osb_mount_event); + status = ocfs2_resmap_init(osb, &osb->osb_la_resmap); + if (status) { + mlog_errno(status); + goto bail; + } + osb->vol_label = kmalloc(OCFS2_MAX_VOL_LABEL_LEN, GFP_KERNEL); if (!osb->vol_label) { mlog(ML_ERROR, "unable to alloc vol label\n"); -- cgit v1.2.3 From b07f8f24dfe54da0f074b78949044842e8df881f Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Mon, 5 Apr 2010 18:17:15 -0700 Subject: ocfs2: change default reservation window sizes The default reservation size of 4 (32-bit windows) is a bit too ambitious. Scale it back to 16 bits (resv_level=2). I have been testing various sizes on a 4-node cluster which runs a mixed workload that is heavily threaded. 
With a 256MB local alloc, I get *roughly* the following levels of average file fragmentation: resv_level=0 70% resv_level=1 21% resv_level=2 23% resv_level=3 24% resv_level=4 60% resv_level=5 did not test resv_level=6 60% resv_level=2 seemed like a good compromise between not letting windows be too small, but not so big that heavier workloads will immediately suffer without tuning. This patch also change the behavior of directory reservations - they now track file reservations. The previous compromise of giving directory windows only 8 bits wound up fragmenting more at some window sizes because file allocations had smaller unused windows to poach from. Signed-off-by: Mark Fasheh Signed-off-by: Joel Becker --- Documentation/filesystems/ocfs2.txt | 2 +- fs/ocfs2/reservations.c | 7 ++++--- fs/ocfs2/reservations.h | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/ocfs2.txt b/Documentation/filesystems/ocfs2.txt index 412df909593..32339e584a9 100644 --- a/Documentation/filesystems/ocfs2.txt +++ b/Documentation/filesystems/ocfs2.txt @@ -80,6 +80,6 @@ user_xattr (*) Enables Extended User Attributes. nouser_xattr Disables Extended User Attributes. acl Enables POSIX Access Control Lists support. noacl (*) Disables POSIX Access Control Lists support. -resv_level=4 (*) Set how agressive allocation reservations will be. +resv_level=2 (*) Set how agressive allocation reservations will be. Valid values are between 0 (reservations off) to 8 (maximum space for reservations). diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c index 7fc6cfee95f..87fa35791f1 100644 --- a/fs/ocfs2/reservations.c +++ b/fs/ocfs2/reservations.c @@ -55,9 +55,10 @@ static unsigned int ocfs2_resv_window_bits(struct ocfs2_reservation_map *resmap, if (!(resv->r_flags & OCFS2_RESV_FLAG_DIR)) { /* 8, 16, 32, 64, 128, 256, 512, 1024 */ bits = 4 << osb->osb_resv_level; - } else - bits = OCFS2_RESV_DIR_WINDOW_BITS; - + } else { + /* For now, treat directories the same as files. */ + bits = 4 << osb->osb_resv_level; + } return bits; } diff --git a/fs/ocfs2/reservations.h b/fs/ocfs2/reservations.h index 34bb308375c..022aff601e1 100644 --- a/fs/ocfs2/reservations.h +++ b/fs/ocfs2/reservations.h @@ -22,7 +22,7 @@ #include -#define OCFS2_DEFAULT_RESV_LEVEL 4 +#define OCFS2_DEFAULT_RESV_LEVEL 2 #define OCFS2_MAX_RESV_LEVEL 9 #define OCFS2_MIN_RESV_LEVEL 0 -- cgit v1.2.3 From 83f92318fa33cc084e14e64dc903e605f75884c1 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Mon, 5 Apr 2010 18:17:16 -0700 Subject: ocfs2: Add dir_resv_level mount option The default behavior for directory reservations stays the same, but we add a mount option so people can tweak the size of directory reservations according to their workloads. Signed-off-by: Mark Fasheh Signed-off-by: Joel Becker --- Documentation/filesystems/ocfs2.txt | 4 ++++ fs/ocfs2/dir.c | 6 ++++-- fs/ocfs2/ocfs2.h | 1 + fs/ocfs2/reservations.c | 9 ++++++--- fs/ocfs2/reservations.h | 2 ++ fs/ocfs2/super.c | 23 +++++++++++++++++++++++ 6 files changed, 40 insertions(+), 5 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/ocfs2.txt b/Documentation/filesystems/ocfs2.txt index 32339e584a9..1f7ae144f6d 100644 --- a/Documentation/filesystems/ocfs2.txt +++ b/Documentation/filesystems/ocfs2.txt @@ -83,3 +83,7 @@ noacl (*) Disables POSIX Access Control Lists support. resv_level=2 (*) Set how agressive allocation reservations will be. 
Valid values are between 0 (reservations off) to 8 (maximum space for reservations). +dir_resv_level= (*) By default, directory reservations will scale with file + reservations - users should rarely need to change this + value. If allocation reservations are turned off, this + option will have no effect. diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 8563f97c58a..6c9a28a2d3a 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -2977,7 +2977,8 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, * if we only get one now, that's enough to continue. The rest * will be claimed after the conversion to extents. */ - data_ac->ac_resv = &oi->ip_la_data_resv; + if (ocfs2_dir_resv_allowed(osb)) + data_ac->ac_resv = &oi->ip_la_data_resv; ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, &len); if (ret) { mlog_errno(ret); @@ -3348,7 +3349,8 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb, goto bail; } - data_ac->ac_resv = &OCFS2_I(dir)->ip_la_data_resv; + if (ocfs2_dir_resv_allowed(osb)) + data_ac->ac_resv = &OCFS2_I(dir)->ip_la_data_resv; credits = ocfs2_calc_extend_credits(sb, el, 1); } else { diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 09d7aee3dab..a388528f485 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -356,6 +356,7 @@ struct ocfs2_super struct ocfs2_reservation_map osb_la_resmap; unsigned int osb_resv_level; + unsigned int osb_dir_resv_level; /* Next three fields are for local node slot recovery during * mount. */ diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c index 87fa35791f1..6497bcc00fa 100644 --- a/fs/ocfs2/reservations.c +++ b/fs/ocfs2/reservations.c @@ -44,7 +44,11 @@ DEFINE_SPINLOCK(resv_lock); #define OCFS2_MIN_RESV_WINDOW_BITS 8 #define OCFS2_MAX_RESV_WINDOW_BITS 1024 -#define OCFS2_RESV_DIR_WINDOW_BITS OCFS2_MIN_RESV_WINDOW_BITS + +int ocfs2_dir_resv_allowed(struct ocfs2_super *osb) +{ + return (osb->osb_resv_level && osb->osb_dir_resv_level); +} static unsigned int ocfs2_resv_window_bits(struct ocfs2_reservation_map *resmap, struct ocfs2_alloc_reservation *resv) @@ -56,8 +60,7 @@ static unsigned int ocfs2_resv_window_bits(struct ocfs2_reservation_map *resmap, /* 8, 16, 32, 64, 128, 256, 512, 1024 */ bits = 4 << osb->osb_resv_level; } else { - /* For now, treat directories the same as files. 
*/ - bits = 4 << osb->osb_resv_level; + bits = 4 << osb->osb_dir_resv_level; } return bits; } diff --git a/fs/ocfs2/reservations.h b/fs/ocfs2/reservations.h index 022aff601e1..25b0c0e31e9 100644 --- a/fs/ocfs2/reservations.h +++ b/fs/ocfs2/reservations.h @@ -67,6 +67,8 @@ void ocfs2_resv_init_once(struct ocfs2_alloc_reservation *resv); void ocfs2_resv_set_type(struct ocfs2_alloc_reservation *resv, unsigned int flags); +int ocfs2_dir_resv_allowed(struct ocfs2_super *osb); + /** * ocfs2_resv_discard() - truncate a reservation * @resmap: diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 5745682eb1c..79d7d4cf45b 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -96,6 +96,7 @@ struct mount_options signed short slot; int localalloc_opt; unsigned int resv_level; + int dir_resv_level; char cluster_stack[OCFS2_STACK_LABEL_LEN + 1]; }; @@ -178,6 +179,7 @@ enum { Opt_usrquota, Opt_grpquota, Opt_resv_level, + Opt_dir_resv_level, Opt_err, }; @@ -205,6 +207,7 @@ static const match_table_t tokens = { {Opt_usrquota, "usrquota"}, {Opt_grpquota, "grpquota"}, {Opt_resv_level, "resv_level=%u"}, + {Opt_dir_resv_level, "dir_resv_level=%u"}, {Opt_err, NULL} }; @@ -1034,6 +1037,11 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) ocfs2_la_set_sizes(osb, parsed_options.localalloc_opt); osb->osb_resv_level = parsed_options.resv_level; + osb->osb_dir_resv_level = parsed_options.resv_level; + if (parsed_options.dir_resv_level == -1) + osb->osb_dir_resv_level = parsed_options.resv_level; + else + osb->osb_dir_resv_level = parsed_options.dir_resv_level; status = ocfs2_verify_userspace_stack(osb, &parsed_options); if (status) @@ -1295,6 +1303,7 @@ static int ocfs2_parse_options(struct super_block *sb, mopt->localalloc_opt = -1; mopt->cluster_stack[0] = '\0'; mopt->resv_level = OCFS2_DEFAULT_RESV_LEVEL; + mopt->dir_resv_level = -1; if (!options) { status = 1; @@ -1449,6 +1458,17 @@ static int ocfs2_parse_options(struct super_block *sb, option < OCFS2_MAX_RESV_LEVEL) mopt->resv_level = option; break; + case Opt_dir_resv_level: + if (is_remount) + break; + if (match_int(&args[0], &option)) { + status = 0; + goto bail; + } + if (option >= OCFS2_MIN_RESV_LEVEL && + option < OCFS2_MAX_RESV_LEVEL) + mopt->dir_resv_level = option; + break; default: mlog(ML_ERROR, "Unrecognized mount option \"%s\" " @@ -1533,6 +1553,9 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt) if (osb->osb_resv_level != OCFS2_DEFAULT_RESV_LEVEL) seq_printf(s, ",resv_level=%d", osb->osb_resv_level); + if (osb->osb_dir_resv_level != osb->osb_resv_level) + seq_printf(s, ",dir_resv_level=%d", osb->osb_resv_level); + return 0; } -- cgit v1.2.3 From 277a6a34175dcb0ee98dceee619e0e3190347a25 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Fri, 2 Apr 2010 18:02:33 +0900 Subject: nilfs2: change default of 'errors' mount option to 'remount-ro' mode Like ext3, nilfs has 'errors' mount option to allow specifying desired behavior on severe errors. Currently, the default action is 'errors=continue' and has potential to advance filesystem corruption for severe errors. This will change the action to 'errors=remount-ro' to avoid the issue. 
Signed-off-by: Ryusuke Konishi --- Documentation/filesystems/nilfs2.txt | 4 ++-- fs/nilfs2/super.c | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/nilfs2.txt b/Documentation/filesystems/nilfs2.txt index cf6d0d85ca8..d3e7673995e 100644 --- a/Documentation/filesystems/nilfs2.txt +++ b/Documentation/filesystems/nilfs2.txt @@ -50,8 +50,8 @@ NILFS2 supports the following mount options: (*) == default nobarrier Disables barriers. -errors=continue(*) Keep going on a filesystem error. -errors=remount-ro Remount the filesystem read-only on an error. +errors=continue Keep going on a filesystem error. +errors=remount-ro(*) Remount the filesystem read-only on an error. errors=panic Panic and halt the machine if an error occurs. cp=n Specify the checkpoint-number of the snapshot to be mounted. Checkpoints and snapshots are listed by lscp diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 48145f505a6..0b1758bf072 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -470,10 +470,10 @@ static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs) if (nilfs_test_opt(sbi, SNAPSHOT)) seq_printf(seq, ",cp=%llu", (unsigned long long int)sbi->s_snapshot_cno); - if (nilfs_test_opt(sbi, ERRORS_RO)) - seq_printf(seq, ",errors=remount-ro"); if (nilfs_test_opt(sbi, ERRORS_PANIC)) seq_printf(seq, ",errors=panic"); + if (nilfs_test_opt(sbi, ERRORS_CONT)) + seq_printf(seq, ",errors=continue"); if (nilfs_test_opt(sbi, STRICT_ORDER)) seq_printf(seq, ",order=strict"); if (nilfs_test_opt(sbi, NORECOVERY)) @@ -631,7 +631,7 @@ nilfs_set_default_options(struct nilfs_sb_info *sbi, struct nilfs_super_block *sbp) { sbi->s_mount_opt = - NILFS_MOUNT_ERRORS_CONT | NILFS_MOUNT_BARRIER; + NILFS_MOUNT_ERRORS_RO | NILFS_MOUNT_BARRIER; } static int nilfs_setup_super(struct nilfs_sb_info *sbi) -- cgit v1.2.3 From a8cd4561ea176f51e9f4707873ca4eff8fd5ee70 Mon Sep 17 00:00:00 2001 From: Anand Gadiyar Date: Mon, 10 May 2010 14:51:19 +0530 Subject: fix "seperate" typos in comments s/seperate/separate Signed-off-by: Anand Gadiyar Signed-off-by: Jiri Kosina --- Documentation/filesystems/logfs.txt | 8 ++++---- arch/arm/mach-s3c2443/clock.c | 2 +- arch/arm/plat-samsung/include/plat/gpio-core.h | 2 +- drivers/usb/host/ehci-omap.c | 2 +- fs/logfs/dir.c | 2 +- fs/logfs/logfs.h | 2 +- fs/logfs/logfs_abi.h | 10 +++++----- 7 files changed, 14 insertions(+), 14 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/logfs.txt b/Documentation/filesystems/logfs.txt index e64c94ba401..bca42c22a14 100644 --- a/Documentation/filesystems/logfs.txt +++ b/Documentation/filesystems/logfs.txt @@ -59,7 +59,7 @@ Levels ------ Garbage collection (GC) may fail if all data is written -indiscriminately. One requirement of GC is that data is seperated +indiscriminately. One requirement of GC is that data is separated roughly according to the distance between the tree root and the data. Effectively that means all file data is on level 0, indirect blocks are on levels 1, 2, 3 4 or 5 for 1x, 2x, 3x, 4x or 5x indirect blocks, @@ -67,7 +67,7 @@ respectively. Inode file data is on level 6 for the inodes and 7-11 for indirect blocks. Each segment contains objects of a single level only. As a result, -each level requires its own seperate segment to be open for writing. +each level requires its own separate segment to be open for writing. 
Inode File ---------- @@ -106,9 +106,9 @@ Vim --- By cleverly predicting the life time of data, it is possible to -seperate long-living data from short-living data and thereby reduce +separate long-living data from short-living data and thereby reduce the GC overhead later. Each type of distinc life expectency (vim) can -have a seperate segment open for writing. Each (level, vim) tupel can +have a separate segment open for writing. Each (level, vim) tupel can be open just once. If an open segment with unknown vim is encountered at mount time, it is closed and ignored henceforth. diff --git a/arch/arm/mach-s3c2443/clock.c b/arch/arm/mach-s3c2443/clock.c index 62cd4eaee01..8a55e78f709 100644 --- a/arch/arm/mach-s3c2443/clock.c +++ b/arch/arm/mach-s3c2443/clock.c @@ -391,7 +391,7 @@ static struct clk clk_hsmmc = { /* i2s_eplldiv * - * This clock is the output from the I2S divisor of ESYSCLK, and is seperate + * This clock is the output from the I2S divisor of ESYSCLK, and is separate * from the mux that comes after it (cannot merge into one single clock) */ diff --git a/arch/arm/plat-samsung/include/plat/gpio-core.h b/arch/arm/plat-samsung/include/plat/gpio-core.h index 49ff406a706..16643e3029f 100644 --- a/arch/arm/plat-samsung/include/plat/gpio-core.h +++ b/arch/arm/plat-samsung/include/plat/gpio-core.h @@ -97,7 +97,7 @@ extern void s3c_gpiolib_add(struct s3c_gpio_chip *chip); * others = Special functions (dependant on bank) * * Note, since the code to deal with the case where there are two control - * registers instead of one, we do not have a seperate set of function + * registers instead of one, we do not have a separate set of function * (samsung_gpiolib_add_4bit2_chips)for each case. */ extern void samsung_gpiolib_add_4bit_chips(struct s3c_gpio_chip *chip, diff --git a/drivers/usb/host/ehci-omap.c b/drivers/usb/host/ehci-omap.c index a67a0030dd5..bed6de342ec 100644 --- a/drivers/usb/host/ehci-omap.c +++ b/drivers/usb/host/ehci-omap.c @@ -181,7 +181,7 @@ struct ehci_hcd_omap { void __iomem *ehci_base; /* Regulators for USB PHYs. - * Each PHY can have a seperate regulator. + * Each PHY can have a separate regulator. */ struct regulator *regulator[OMAP3_HS_USB_PORTS]; }; diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c index 2396a85c0f5..72d1893ddd3 100644 --- a/fs/logfs/dir.c +++ b/fs/logfs/dir.c @@ -12,7 +12,7 @@ * Atomic dir operations * * Directory operations are by default not atomic. Dentries and Inodes are - * created/removed/altered in seperate operations. Therefore we need to do + * created/removed/altered in separate operations. Therefore we need to do * a small amount of journaling. 
* * Create, link, mkdir, mknod and symlink all share the same function to do diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h index 0a3df1a0c93..9028d7f83bb 100644 --- a/fs/logfs/logfs.h +++ b/fs/logfs/logfs.h @@ -704,7 +704,7 @@ static inline gc_level_t expand_level(u64 ino, level_t __level) u8 level = (__force u8)__level; if (ino == LOGFS_INO_MASTER) { - /* ifile has seperate areas */ + /* ifile has separate areas */ level += LOGFS_MAX_LEVELS; } return (__force gc_level_t)level; diff --git a/fs/logfs/logfs_abi.h b/fs/logfs/logfs_abi.h index f674725663f..ae960519c54 100644 --- a/fs/logfs/logfs_abi.h +++ b/fs/logfs/logfs_abi.h @@ -50,9 +50,9 @@ static inline void check_##type(void) \ * 12 - gc recycled blocks, long-lived data * 13 - replacement blocks, short-lived data * - * Levels 1-11 are necessary for robust gc operations and help seperate + * Levels 1-11 are necessary for robust gc operations and help separate * short-lived metadata from longer-lived file data. In the future, - * file data should get seperated into several segments based on simple + * file data should get separated into several segments based on simple * heuristics. Old data recycled during gc operation is expected to be * long-lived. New data is of uncertain life expectancy. New data * used to replace older blocks in existing files is expected to be @@ -117,7 +117,7 @@ static inline void check_##type(void) \ #define pure_ofs(ofs) (ofs & ~LOGFS_FULLY_POPULATED) /* - * LogFS needs to seperate data into levels. Each level is defined as the + * LogFS needs to separate data into levels. Each level is defined as the * maximal possible distance from the master inode (inode of the inode file). * Data blocks reside on level 0, 1x indirect block on level 1, etc. * Inodes reside on level 6, indirect blocks for the inode file on levels 7-11. @@ -204,7 +204,7 @@ SIZE_CHECK(logfs_segment_header, LOGFS_SEGMENT_HEADERSIZE); * @ds_crc: crc32 of structure starting with the next field * @ds_ifile_levels: maximum number of levels for ifile * @ds_iblock_levels: maximum number of levels for regular files - * @ds_data_levels: number of seperate levels for data + * @ds_data_levels: number of separate levels for data * @pad0: reserved, must be 0 * @ds_feature_incompat: incompatible filesystem features * @ds_feature_ro_compat: read-only compatible filesystem features @@ -456,7 +456,7 @@ enum logfs_vim { * @vim: life expectancy of data * * "Areas" are segments currently being used for writing. There is at least - * one area per GC level. Several may be used to seperate long-living from + * one area per GC level. Several may be used to separate long-living from * short-living data. If an area with unknown vim is encountered, it can * simply be closed. * The write buffer immediately follow this header. -- cgit v1.2.3 From ca0dbd86b12be9af7cda230890eb741d5cb8b624 Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Fri, 7 May 2010 16:52:26 -0300 Subject: doc: inode uses a mutex instead of a semaphore. Replace the introduced i_sem by an i_mutex in the filesystem locking documentation. This was introduced [1] after all occurrences were already replaced in the same text [2]. However, the term "inode semaphore" has not been replaced then, and it's replaced now. 
[1] afddba49d18f346e5cc2938b6ed7c512db18ca68 [2] a7bc02f4f47fd0e7860c6589f0ad000d1476f7a3 Signed-off-by: Thadeu Lima de Souza Cascardo Cc: Nick Piggin Cc: Artem Bityutskiy Cc: Randy Dunlap Cc: Christoph Hellwig Cc: Al Viro Signed-off-by: Jiri Kosina --- Documentation/filesystems/Locking | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 06bbbed7120..af1608070cd 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -178,7 +178,7 @@ prototypes: locking rules: All except set_page_dirty may block - BKL PageLocked(page) i_sem + BKL PageLocked(page) i_mutex writepage: no yes, unlocks (see below) readpage: no yes, unlocks sync_page: no maybe @@ -429,7 +429,7 @@ check_flags: no implementations. If your fs is not using generic_file_llseek, you need to acquire and release the appropriate locks in your ->llseek(). For many filesystems, it is probably safe to acquire the inode -semaphore. Note some filesystems (i.e. remote ones) provide no +mutex. Note some filesystems (i.e. remote ones) provide no protection for i_size so you will need to use the BKL. Note: ext2_release() was *the* source of contention on fs-intensive -- cgit v1.2.3 From 34441427aab4bdb3069a4ffcda69a99357abcb2e Mon Sep 17 00:00:00 2001 From: Robin Holt Date: Tue, 11 May 2010 14:06:46 -0700 Subject: revert "procfs: provide stack information for threads" and its fixup commits Originally, commit d899bf7b ("procfs: provide stack information for threads") attempted to introduce a new feature for showing where the threadstack was located and how many pages are being utilized by the stack. Commit c44972f1 ("procfs: disable per-task stack usage on NOMMU") was applied to fix the NO_MMU case. Commit 89240ba0 ("x86, fs: Fix x86 procfs stack information for threads on 64-bit") was applied to fix a bug in ia32 executables being loaded. Commit 9ebd4eba7 ("procfs: fix /proc//stat stack pointer for kernel threads") was applied to fix a bug which had kernel threads printing a userland stack address. Commit 1306d603f ('proc: partially revert "procfs: provide stack information for threads"') was then applied to revert the stack pages being used to solve a significant performance regression. This patch nearly undoes the effect of all these patches. The reason for reverting these is it provides an unusable value in field 28. For x86_64, a fork will result in the task->stack_start value being updated to the current user top of stack and not the stack start address. This unpredictability of the stack_start value makes it worthless. That includes the intended use of showing how much stack space a thread has. Other architectures will get different values. As an example, ia64 gets 0. The do_fork() and copy_process() functions appear to treat the stack_start and stack_size parameters as architecture specific. I only partially reverted c44972f1 ("procfs: disable per-task stack usage on NOMMU") . If I had completely reverted it, I would have had to change mm/Makefile only build pagewalk.o when CONFIG_PROC_PAGE_MONITOR is configured. Since I could not test the builds without significant effort, I decided to not change mm/Makefile. I only partially reverted 89240ba0 ("x86, fs: Fix x86 procfs stack information for threads on 64-bit") . I left the KSTK_ESP() change in place as that seemed worthwhile. 
Signed-off-by: Robin Holt Cc: Stefani Seibold Cc: KOSAKI Motohiro Cc: Michal Simek Cc: Ingo Molnar Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 3 +-- fs/compat.c | 2 -- fs/exec.c | 2 -- fs/proc/array.c | 3 +-- fs/proc/task_mmu.c | 19 ------------------- include/linux/sched.h | 1 - kernel/fork.c | 2 -- 7 files changed, 2 insertions(+), 30 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index a4f30faa4f1..1e359b62c40 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -316,7 +316,7 @@ address perms offset dev inode pathname 08049000-0804a000 rw-p 00001000 03:00 8312 /opt/test 0804a000-0806b000 rw-p 00000000 00:00 0 [heap] a7cb1000-a7cb2000 ---p 00000000 00:00 0 -a7cb2000-a7eb2000 rw-p 00000000 00:00 0 [threadstack:001ff4b4] +a7cb2000-a7eb2000 rw-p 00000000 00:00 0 a7eb2000-a7eb3000 ---p 00000000 00:00 0 a7eb3000-a7ed5000 rw-p 00000000 00:00 0 a7ed5000-a8008000 r-xp 00000000 03:00 4222 /lib/libc.so.6 @@ -352,7 +352,6 @@ is not associated with a file: [stack] = the stack of the main process [vdso] = the "virtual dynamic shared object", the kernel system call handler - [threadstack:xxxxxxxx] = the stack of the thread, xxxxxxxx is the stack size or if empty, the mapping is anonymous. diff --git a/fs/compat.c b/fs/compat.c index 4b6ed03cc47..05448730f84 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1531,8 +1531,6 @@ int compat_do_execve(char * filename, if (retval < 0) goto out; - current->stack_start = current->mm->start_stack; - /* execve succeeded */ current->fs->in_exec = 0; current->in_execve = 0; diff --git a/fs/exec.c b/fs/exec.c index 49cdaa19e5b..e6e94c626c2 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1387,8 +1387,6 @@ int do_execve(char * filename, if (retval < 0) goto out; - current->stack_start = current->mm->start_stack; - /* execve succeeded */ current->fs->in_exec = 0; current->in_execve = 0; diff --git a/fs/proc/array.c b/fs/proc/array.c index e51f2ec2c5e..885ab5513ac 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -81,7 +81,6 @@ #include #include #include -#include #include #include @@ -495,7 +494,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, rsslim, mm ? mm->start_code : 0, mm ? mm->end_code : 0, - (permitted && mm) ? task->stack_start : 0, + (permitted && mm) ? mm->start_stack : 0, esp, eip, /* The signal information here is obsolete. 
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 070553427dd..47f5b145f56 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -247,25 +247,6 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma) } else if (vma->vm_start <= mm->start_stack && vma->vm_end >= mm->start_stack) { name = "[stack]"; - } else { - unsigned long stack_start; - struct proc_maps_private *pmp; - - pmp = m->private; - stack_start = pmp->task->stack_start; - - if (vma->vm_start <= stack_start && - vma->vm_end >= stack_start) { - pad_len_spaces(m, len); - seq_printf(m, - "[threadstack:%08lx]", -#ifdef CONFIG_STACK_GROWSUP - vma->vm_end - stack_start -#else - stack_start - vma->vm_start -#endif - ); - } } } else { name = "[vdso]"; diff --git a/include/linux/sched.h b/include/linux/sched.h index dad7f668ebf..2b7b81df78b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1497,7 +1497,6 @@ struct task_struct { /* bitmask of trace recursion */ unsigned long trace_recursion; #endif /* CONFIG_TRACING */ - unsigned long stack_start; #ifdef CONFIG_CGROUP_MEM_RES_CTLR /* memcg uses this to do batch job */ struct memcg_batch_info { int do_batch; /* incremented when batch uncharge started */ diff --git a/kernel/fork.c b/kernel/fork.c index 44b0791b0a2..4c14942a0ee 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1114,8 +1114,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->bts = NULL; - p->stack_start = stack_start; - /* Perform scheduler related setup. Assign this task to a CPU. */ sched_fork(p, clone_flags); -- cgit v1.2.3 From 4dc6ec00f6347b72312fa41dfc587d5302b05544 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 19 Apr 2010 15:11:28 -0400 Subject: nfsd4: implement reclaim_complete This is a mandatory operation. Also, here (not in open) is where we should be committing the reboot recovery information. Signed-off-by: J. 
Bruce Fields --- Documentation/filesystems/nfs/nfs41-server.txt | 2 +- fs/nfsd/nfs4proc.c | 5 ++++ fs/nfsd/nfs4state.c | 33 +++++++++++++++++++++++--- fs/nfsd/nfs4xdr.c | 12 +++++++++- fs/nfsd/xdr4.h | 6 +++++ 5 files changed, 53 insertions(+), 5 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/nfs/nfs41-server.txt b/Documentation/filesystems/nfs/nfs41-server.txt index 6a53a84afc7..04884914a1c 100644 --- a/Documentation/filesystems/nfs/nfs41-server.txt +++ b/Documentation/filesystems/nfs/nfs41-server.txt @@ -137,7 +137,7 @@ NS*| OPENATTR | OPT | | Section 18.17 | | READ | REQ | | Section 18.22 | | READDIR | REQ | | Section 18.23 | | READLINK | OPT | | Section 18.24 | -NS | RECLAIM_COMPLETE | REQ | | Section 18.51 | + | RECLAIM_COMPLETE | REQ | | Section 18.51 | | RELEASE_LOCKOWNER | MNI | | N/A | | REMOVE | REQ | | Section 18.25 | | RENAME | REQ | | Section 18.26 | diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index e2dc9608281..59ec449b0c7 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1312,6 +1312,11 @@ static struct nfsd4_operation nfsd4_ops[] = { .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, .op_name = "OP_SEQUENCE", }, + [OP_RECLAIM_COMPLETE] = { + .op_func = (nfsd4op_func)nfsd4_reclaim_complete, + .op_flags = ALLOWED_WITHOUT_FH, + .op_name = "OP_RECLAIM_COMPLETE", + }, }; static const char *nfsd4_op_name(unsigned opnum) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index ede9dde52fe..84b0fe9a262 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1501,6 +1501,35 @@ out: return status; } +__be32 +nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_reclaim_complete *rc) +{ + if (rc->rca_one_fs) { + if (!cstate->current_fh.fh_dentry) + return nfserr_nofilehandle; + /* + * We don't take advantage of the rca_one_fs case. + * That's OK, it's optional, we can safely ignore it. + */ + return nfs_ok; + } + nfs4_lock_state(); + if (is_client_expired(cstate->session->se_client)) { + nfs4_unlock_state(); + /* + * The following error isn't really legal. + * But we only get here if the client just explicitly + * destroyed the client. Surely it no longer cares what + * error it gets back on an operation for the dead + * client. + */ + return nfserr_stale_clientid; + } + nfsd4_create_clid_dir(cstate->session->se_client); + nfs4_unlock_state(); + return nfs_ok; +} + __be32 nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_setclientid *setclid) @@ -2510,10 +2539,8 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf } memcpy(&open->op_stateid, &stp->st_stateid, sizeof(stateid_t)); - if (nfsd4_has_session(&resp->cstate)) { + if (nfsd4_has_session(&resp->cstate)) open->op_stateowner->so_confirmed = 1; - nfsd4_create_clid_dir(open->op_stateowner->so_client); - } /* * Attempt to hand out a delegation. 
No error return, because the diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 126d0caabb3..ac17a708023 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1234,6 +1234,16 @@ nfsd4_decode_sequence(struct nfsd4_compoundargs *argp, DECODE_TAIL; } +static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, struct nfsd4_reclaim_complete *rc) +{ + DECODE_HEAD; + + READ_BUF(4); + READ32(rc->rca_one_fs); + + DECODE_TAIL; +} + static __be32 nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p) { @@ -1346,7 +1356,7 @@ static nfsd4_dec nfsd41_dec_ops[] = { [OP_TEST_STATEID] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_WANT_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_DESTROY_CLIENTID] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_RECLAIM_COMPLETE] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_RECLAIM_COMPLETE] = (nfsd4_dec)nfsd4_decode_reclaim_complete, }; struct nfsd4_minorversion_ops { diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index c28958ec216..4d476ff08ae 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -381,6 +381,10 @@ struct nfsd4_destroy_session { struct nfs4_sessionid sessionid; }; +struct nfsd4_reclaim_complete { + u32 rca_one_fs; +}; + struct nfsd4_op { int opnum; __be32 status; @@ -421,6 +425,7 @@ struct nfsd4_op { struct nfsd4_create_session create_session; struct nfsd4_destroy_session destroy_session; struct nfsd4_sequence sequence; + struct nfsd4_reclaim_complete reclaim_complete; } u; struct nfs4_replay * replay; }; @@ -523,6 +528,7 @@ extern __be32 nfsd4_sequence(struct svc_rqst *, extern __be32 nfsd4_destroy_session(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_destroy_session *); +__be32 nfsd4_reclaim_complete(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_reclaim_complete *); extern __be32 nfsd4_process_open1(struct nfsd4_compound_state *, struct nfsd4_open *open); extern __be32 nfsd4_process_open2(struct svc_rqst *rqstp, -- cgit v1.2.3 From b9d8b45ee3c5f62cdbd34ee006f3dd2ac51f7018 Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Tue, 4 May 2010 21:45:38 -0500 Subject: sysfs-namespaces: add a high-level Documentation file The first three paragraphs are almost verbatim taken from Eric's commit message on the patch introducing network ns tags. The next two paragraphs I wrote to be a brief high level overview. The last section is taken from the commit message on "Implement sysfs tagged directory support", but updated. Hopefully correctly. Signed-off-by: Serge E. Hallyn Cc: Eric W. Biederman Cc: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- Documentation/filesystems/sysfs-tagging.txt | 42 +++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 Documentation/filesystems/sysfs-tagging.txt (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/sysfs-tagging.txt b/Documentation/filesystems/sysfs-tagging.txt new file mode 100644 index 00000000000..caaaf1266d8 --- /dev/null +++ b/Documentation/filesystems/sysfs-tagging.txt @@ -0,0 +1,42 @@ +Sysfs tagging +------------- + +(Taken almost verbatim from Eric Biederman's netns tagging patch +commit msg) + +The problem. Network devices show up in sysfs and with the network +namespace active multiple devices with the same name can show up in +the same directory, ouch! + +To avoid that problem and allow existing applications in network +namespaces to see the same interface that is currently presented in +sysfs, sysfs now has tagging directory support. 
+ +By using the network namespace pointers as tags to separate out the +the sysfs directory entries we ensure that we don't have conflicts +in the directories and applications only see a limited set of +the network devices. + +Each sysfs directory entry may be tagged with zero or one +namespaces. A sysfs_dirent is augmented with a void *s_ns. If a +directory entry is tagged, then sysfs_dirent->s_flags will have a +flag between KOBJ_NS_TYPE_NONE and KOBJ_NS_TYPES, and s_ns will +point to the namespace to which it belongs. + +Each sysfs superblock's sysfs_super_info contains an array void +*ns[KOBJ_NS_TYPES]. When a a task in a tagging namespace +kobj_nstype first mounts sysfs, a new superblock is created. It +will be differentiated from other sysfs mounts by having its +s_fs_info->ns[kobj_nstype] set to the new namespace. Note that +through bind mounting and mounts propagation, a task can easily view +the contents of other namespaces' sysfs mounts. Therefore, when a +namespace exits, it will call kobj_ns_exit() to invalidate any +sysfs_dirent->s_ns pointers pointing to it. + +Users of this interface: +- define a type in the kobj_ns_type enumeration. +- call kobj_ns_type_register() with its kobj_ns_type_operations which has + - current_ns() which returns current's namespace + - netlink_ns() which returns a socket's namespace + - initial_ns() which returns the initial namesapce +- call kobj_ns_exit() when an individual tag is no longer valid -- cgit v1.2.3 From 0636c73ee7b129f77f577aaaefc8dde057be6d18 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Fri, 30 Apr 2010 11:09:34 -0500 Subject: ext3: make barrier options consistent with ext4 ext4 was updated to accept barrier/nobarrier mount options in addition to the older barrier=0/1. The barrier story is complex enough, we should help people by making the options the same at least, even if the defaults are different. This patch allows the barrier/nobarrier mount options for ext3, while keeping nobarrier the default. It also unconditionally displays barrier status in show_options, and prints a message at mount time if barriers are not enabled, just as ext4 does. Signed-off-by: Eric Sandeen Signed-off-by: Jan Kara --- Documentation/filesystems/ext3.txt | 15 +++++++++++++-- fs/ext3/super.c | 33 ++++++++++++++++++++++++++------- 2 files changed, 39 insertions(+), 9 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/ext3.txt b/Documentation/filesystems/ext3.txt index 867c5b50cb4..272f80d5f96 100644 --- a/Documentation/filesystems/ext3.txt +++ b/Documentation/filesystems/ext3.txt @@ -59,8 +59,19 @@ commit=nrsec (*) Ext3 can be told to sync all its data and metadata Setting it to very large values will improve performance. -barrier=1 This enables/disables barriers. barrier=0 disables - it, barrier=1 enables it. +barrier=<0(*)|1> This enables/disables the use of write barriers in +barrier the jbd code. barrier=0 disables, barrier=1 enables. +nobarrier (*) This also requires an IO stack which can support + barriers, and if jbd gets an error on a barrier + write, it will disable again with a warning. + Write barriers enforce proper on-disk ordering + of journal commits, making volatile disk write caches + safe to use, at some performance penalty. If + your disks are battery-backed in one way or another, + disabling barriers may safely improve performance. + The mount options "barrier" and "nobarrier" can + also be used to enable or disable barriers, for + consistency with other ext3 mount options. 
orlov (*) This enables the new Orlov block allocator. It is enabled by default. diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 6b6e49de091..0fc1293d0e9 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -653,8 +653,12 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_printf(seq, ",commit=%u", (unsigned) (sbi->s_commit_interval / HZ)); } - if (test_opt(sb, BARRIER)) - seq_puts(seq, ",barrier=1"); + + /* + * Always display barrier state so it's clear what the status is. + */ + seq_puts(seq, ",barrier="); + seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0"); if (test_opt(sb, NOBH)) seq_puts(seq, ",nobh"); @@ -810,8 +814,8 @@ enum { Opt_data_err_abort, Opt_data_err_ignore, Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, - Opt_noquota, Opt_ignore, Opt_barrier, Opt_err, Opt_resize, - Opt_usrquota, Opt_grpquota + Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, + Opt_resize, Opt_usrquota, Opt_grpquota }; static const match_table_t tokens = { @@ -865,6 +869,8 @@ static const match_table_t tokens = { {Opt_quota, "quota"}, {Opt_usrquota, "usrquota"}, {Opt_barrier, "barrier=%u"}, + {Opt_barrier, "barrier"}, + {Opt_nobarrier, "nobarrier"}, {Opt_resize, "resize"}, {Opt_err, NULL}, }; @@ -967,7 +973,11 @@ static int parse_options (char *options, struct super_block *sb, int token; if (!*p) continue; - + /* + * Initialize args struct so we know whether arg was + * found; some options take optional arguments. + */ + args[0].to = args[0].from = 0; token = match_token(p, tokens, args); switch (token) { case Opt_bsd_df: @@ -1215,9 +1225,15 @@ set_qf_format: case Opt_abort: set_opt(sbi->s_mount_opt, ABORT); break; + case Opt_nobarrier: + clear_opt(sbi->s_mount_opt, BARRIER); + break; case Opt_barrier: - if (match_int(&args[0], &option)) - return 0; + if (args[0].from) { + if (match_int(&args[0], &option)) + return 0; + } else + option = 1; /* No argument, default to 1 */ if (option) set_opt(sbi->s_mount_opt, BARRIER); else @@ -2276,6 +2292,9 @@ static int ext3_load_journal(struct super_block *sb, return -EINVAL; } + if (!(journal->j_flags & JFS_BARRIER)) + printk(KERN_INFO "EXT3-fs: barriers not enabled\n"); + if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) { err = journal_update_format(journal); if (err) { -- cgit v1.2.3 From a9a745daadab26f13884ff26a50fa38247c11ce9 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 14 May 2010 21:43:11 +1000 Subject: xfs: Delayed logging design documentation Document the design of the delayed logging implementation. This includes assumptions made, dead ends followed, the reasoning behind the structuring of the code, the layout of various structures, how things fit together, traps and pit-falls avoided, etc. This is all too much to document in the code itself, so do it in a separate file. 
Signed-off-by: Dave Chinner Signed-off-by: Alex Elder --- .../filesystems/xfs-delayed-logging-design.txt | 816 +++++++++++++++++++++ 1 file changed, 816 insertions(+) create mode 100644 Documentation/filesystems/xfs-delayed-logging-design.txt (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/xfs-delayed-logging-design.txt b/Documentation/filesystems/xfs-delayed-logging-design.txt new file mode 100644 index 00000000000..d8119e9d2d6 --- /dev/null +++ b/Documentation/filesystems/xfs-delayed-logging-design.txt @@ -0,0 +1,816 @@ +XFS Delayed Logging Design +-------------------------- + +Introduction to Re-logging in XFS +--------------------------------- + +XFS logging is a combination of logical and physical logging. Some objects, +such as inodes and dquots, are logged in logical format where the details +logged are made up of the changes to in-core structures rather than on-disk +structures. Other objects - typically buffers - have their physical changes +logged. The reason for these differences is to reduce the amount of log space +required for objects that are frequently logged. Some parts of inodes are more +frequently logged than others, and inodes are typically more frequently logged +than any other object (except maybe the superblock buffer) so keeping the +amount of metadata logged low is of prime importance. + +The reason that this is such a concern is that XFS allows multiple separate +modifications to a single object to be carried in the log at any given time. +This allows the log to avoid needing to flush each change to disk before +recording a new change to the object. XFS does this via a method called +"re-logging". Conceptually, this is quite simple - all it requires is that any +new change to the object is recorded with a *new copy* of all the existing +changes in the new transaction that is written to the log. + +That is, if we have a sequence of changes A through to F, and the object was +written to disk after change D, we would see in the log the following series +of transactions, their contents and the log sequence number (LSN) of the +transaction: + + Transaction Contents LSN + A A X + B A+B X+n + C A+B+C X+n+m + D A+B+C+D X+n+m+o + + E E Y (> X+n+m+o) + F E+F Yٍ+p + +In other words, each time an object is relogged, the new transaction contains +the aggregation of all the previous changes currently held only in the log. + +This relogging technique also allows objects to be moved forward in the log so +that an object being relogged does not prevent the tail of the log from ever +moving forward. This can be seen in the table above by the changing +(increasing) LSN of each subsquent transaction - the LSN is effectively a +direct encoding of the location in the log of the transaction. + +This relogging is also used to implement long-running, multiple-commit +transactions. These transaction are known as rolling transactions, and require +a special log reservation known as a permanent transaction reservation. A +typical example of a rolling transaction is the removal of extents from an +inode which can only be done at a rate of two extents per transaction because +of reservation size limitations. Hence a rolling extent removal transaction +keeps relogging the inode and btree buffers as they get modified in each +removal operation. This keeps them moving forward in the log as the operation +progresses, ensuring that current operation never gets blocked by itself if the +log wraps around. 
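
The log traffic this behaviour generates is easy to see with a toy model.
The fragment below is purely illustrative (none of its names are XFS
functions or structures), assumes a fixed 128 bytes per change, and simply
shows how the amount written to the log grows while a single object keeps
being relogged without ever being written back:

	#include <stdio.h>

	#define NCHANGES 6	/* changes A through F from the table above */

	int main(void)
	{
		unsigned int change_size = 128;	/* bytes dirtied per change */
		unsigned int accumulated = 0;	/* dirty bytes carried by the object */
		unsigned int log_traffic = 0;	/* total bytes written to the log */
		int i;

		for (i = 0; i < NCHANGES; i++) {
			accumulated += change_size;	/* a new change is made */
			log_traffic += accumulated;	/* relogging rewrites everything so far */
			printf("change %c: transaction size %u, log traffic %u\n",
			       'A' + i, accumulated, log_traffic);
		}
		return 0;
	}

In this model the traffic grows quadratically with the number of commits to
the same object, even though only the final copy is ever needed for recovery.
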
+
+Hence it can be seen that the relogging operation is fundamental to the correct
+working of the XFS journalling subsystem. From the above description, most
+people should be able to see why the XFS metadata operations write so much to
+the log - repeated operations to the same objects write the same changes to
+the log over and over again. Worse is the fact that objects tend to get
+dirtier as they get relogged, so each subsequent transaction is writing more
+metadata into the log.
+
+Another feature of the XFS transaction subsystem is that most transactions are
+asynchronous. That is, they don't commit to disk until either a log buffer is
+filled (a log buffer can hold multiple transactions) or a synchronous operation
+forces the log buffers holding the transactions to disk. This means that XFS is
+doing aggregation of transactions in memory - batching them, if you like - to
+minimise the impact of the log IO on transaction throughput.
+
+The limitation on asynchronous transaction throughput is the number and size of
+log buffers made available by the log manager. By default there are 8 log
+buffers available and the size of each is 32kB - the size can be increased up
+to 256kB by use of a mount option.
+
+Effectively, this gives us the maximum bound of outstanding metadata changes
+that can be made to the filesystem at any point in time - if all the log
+buffers are full and under IO, then no more transactions can be committed until
+the current batch completes. It is now common for a single current CPU core to
+be able to issue enough transactions to keep the log buffers full and under
+IO permanently. Hence the XFS journalling subsystem can be considered to be IO
+bound.
+
+Delayed Logging: Concepts
+-------------------------
+
+The key thing to note about the asynchronous logging combined with the
+relogging technique XFS uses is that we can be relogging changed objects
+multiple times before they are committed to disk in the log buffers. If we
+return to the previous relogging example, it is entirely possible that
+transactions A through D are committed to disk in the same log buffer.
+
+That is, a single log buffer may contain multiple copies of the same object,
+but only one of those copies needs to be there - the last one "D", as it
+contains all the changes from the previous transactions. In other words, we
+have one necessary copy in the log buffer, and three stale copies that are
+simply wasting space. When we are doing repeated operations on the same set of
+objects, these "stale objects" can be over 90% of the space used in the log
+buffers. It is clear that reducing the number of stale objects written to the
+log would greatly reduce the amount of metadata we write to the log, and this
+is the fundamental goal of delayed logging.
+
+From a conceptual point of view, XFS is already doing relogging in memory (where
+memory == log buffer), only it is doing it extremely inefficiently. It is using
+logical to physical formatting to do the relogging because there is no
+infrastructure to keep track of logical changes in memory prior to physically
+formatting the changes in a transaction to the log buffer. Hence we cannot avoid
+accumulating stale objects in the log buffers.
+
+Delayed logging is the name we've given to keeping and tracking transactional
+changes to objects in memory outside the log buffer infrastructure.
Because of +the relogging concept fundamental to the XFS journalling subsystem, this is +actually relatively easy to do - all the changes to logged items are already +tracked in the current infrastructure. The big problem is how to accumulate +them and get them to the log in a consistent, recoverable manner. +Describing the problems and how they have been solved is the focus of this +document. + +One of the key changes that delayed logging makes to the operation of the +journalling subsystem is that it disassociates the amount of outstanding +metadata changes from the size and number of log buffers available. In other +words, instead of there only being a maximum of 2MB of transaction changes not +written to the log at any point in time, there may be a much greater amount +being accumulated in memory. Hence the potential for loss of metadata on a +crash is much greater than for the existing logging mechanism. + +It should be noted that this does not change the guarantee that log recovery +will result in a consistent filesystem. What it does mean is that as far as the +recovered filesystem is concerned, there may be many thousands of transactions +that simply did not occur as a result of the crash. This makes it even more +important that applications that care about their data use fsync() where they +need to ensure application level data integrity is maintained. + +It should be noted that delayed logging is not an innovative new concept that +warrants rigorous proofs to determine whether it is correct or not. The method +of accumulating changes in memory for some period before writing them to the +log is used effectively in many filesystems including ext3 and ext4. Hence +no time is spent in this document trying to convince the reader that the +concept is sound. Instead it is simply considered a "solved problem" and as +such implementing it in XFS is purely an exercise in software engineering. + +The fundamental requirements for delayed logging in XFS are simple: + + 1. Reduce the amount of metadata written to the log by at least + an order of magnitude. + 2. Supply sufficient statistics to validate Requirement #1. + 3. Supply sufficient new tracing infrastructure to be able to debug + problems with the new code. + 4. No on-disk format change (metadata or log format). + 5. Enable and disable with a mount option. + 6. No performance regressions for synchronous transaction workloads. + +Delayed Logging: Design +----------------------- + +Storing Changes + +The problem with accumulating changes at a logical level (i.e. just using the +existing log item dirty region tracking) is that when it comes to writing the +changes to the log buffers, we need to ensure that the object we are formatting +is not changing while we do this. This requires locking the object to prevent +concurrent modification. Hence flushing the logical changes to the log would +require us to lock every object, format them, and then unlock them again. + +This introduces lots of scope for deadlocks with transactions that are already +running. For example, a transaction has object A locked and modified, but needs +the delayed logging tracking lock to commit the transaction. However, the +flushing thread has the delayed logging tracking lock already held, and is +trying to get the lock on object A to flush it to the log buffer. This appears +to be an unsolvable deadlock condition, and it was solving this problem that +was the barrier to implementing delayed logging for so long. 
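+
+The deadlock can be made concrete with a small sketch of the two code paths.
+The lock and object names here are invented purely for illustration:
+
+    /* Transaction commit path (hypothetical): */
+    lock(&object_A->lock);           /* modify object A                */
+    ...
+    lock(&tracking_lock);            /* blocks: the flusher holds it   */
+
+    /* Flush path (hypothetical): */
+    lock(&tracking_lock);            /* walk the tracked dirty items   */
+    lock(&object_A->lock);           /* blocks: the committer holds it */
+
+This is a classic ABBA lock ordering inversion - each side holds the lock the
+other needs, so neither can make progress. Any scheme that requires the flush
+code to lock items that may already be locked by a running transaction runs
+straight into it.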
+
+The solution is relatively simple - it just took a long time to recognise it.
+Put simply, the current logging code formats the changes to each item into a
+vector array that points to the changed regions in the item. The log write code
+simply copies the memory these vectors point to into the log buffer during
+transaction commit while the item is locked in the transaction. Instead of
+using the log buffer as the destination of the formatting code, we can use an
+allocated memory buffer big enough to fit the formatted vector.
+
+If we then copy the vector into the memory buffer and rewrite the vector to
+point to the memory buffer rather than the object itself, we now have a copy of
+the changes in a format that is compatible with the log buffer writing code and
+that does not require us to lock the item to access it. This formatting and
+rewriting can all be done while the object is locked during transaction commit,
+resulting in a vector that is transactionally consistent and can be accessed
+without needing to lock the owning item.
+
+Hence we avoid the need to lock items when we need to flush outstanding
+asynchronous transactions to the log. The differences between the existing
+formatting method and the delayed logging formatting can be seen in the
+diagram below.
+
+Current format log vector:
+
+Object    +---------------------------------------------+
+Vector 1      +----+
+Vector 2                    +----+
+Vector 3                                   +----------+
+
+After formatting:
+
+Log Buffer +-V1-+-V2-+----V3----+
+
+Delayed logging vector:
+
+Object    +---------------------------------------------+
+Vector 1      +----+
+Vector 2                    +----+
+Vector 3                                   +----------+
+
+After formatting:
+
+Memory Buffer +-V1-+-V2-+----V3----+
+Vector 1      +----+
+Vector 2           +----+
+Vector 3                +----------+
+
+The memory buffer and associated vector need to be passed as a single object,
+but still need to be associated with the parent object so that if the object is
+relogged we can replace the current memory buffer with a new memory buffer that
+contains the latest changes.
+
+The reason for keeping the vector around after we've formatted the memory
+buffer is to support splitting vectors across log buffer boundaries correctly.
+If we don't keep the vector around, we do not know where the region boundaries
+are in the item, so we'd need a new encapsulation method for regions in the log
+buffer writing (i.e. double encapsulation). This would be an on-disk format
+change and as such is not desirable. It also means we'd have to write the log
+region headers in the formatting stage, which is problematic as there is per
+region state that needs to be placed into the headers during the log write.
+
+Hence we need to keep the vector, but by attaching the memory buffer to it and
+rewriting the vector addresses to point at the memory buffer we end up with a
+self-describing object that can be passed to the log buffer write code to be
+handled in exactly the same manner as the existing log vectors are handled.
+Hence we avoid needing a new on-disk format to handle items that have been
+relogged in memory.
+
+
+Tracking Changes
+
+Now that we can record transactional changes in memory in a form that allows
+them to be used without limitations, we need to be able to track and accumulate
+them so that they can be written to the log at some later point in time. The
+log item is the natural place to store this vector and buffer, and it also
+makes sense to use it as the object that tracks committed objects, as it will
+always exist once the object has been included in a transaction.
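+
+In outline, the formatted changes described above can be thought of as a
+structure along the following lines. This is an illustrative sketch only; the
+field and type names are invented and do not correspond to the actual XFS
+code:
+
+    struct log_iovec {
+            void    *data;               /* points into the copy buffer */
+            int     len;                 /* length of this region */
+    };
+
+    struct log_vector {
+            struct log_vector *next;     /* chaining at checkpoint time */
+            int               niovecs;   /* number of regions */
+            struct log_iovec  *iovecs;   /* region array */
+            void              *buf;      /* formatted copy of the item */
+    };
+
+The region array still records where each changed region lies, but the data
+pointers now reference the private copy buffer rather than the live object, so
+the formatted changes can be written to the log without taking the item lock
+again.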
+
+The log item is already used to track the log items that have been written to
+the log but not yet written to disk. Such log items are considered "active"
+and as such are stored in the Active Item List (AIL), which is an LSN-ordered,
+doubly linked list. Items are inserted into this list during log buffer IO
+completion, after which they are unpinned and can be written to disk. An object
+that is in the AIL can be relogged, which causes the object to be pinned again
+and then moved forward in the AIL when the log buffer IO completes for that
+transaction.
+
+Essentially, this shows that an item that is in the AIL can still be modified
+and relogged, so any tracking must be separate to the AIL infrastructure. As
+such, we cannot reuse the AIL list pointers for tracking committed items, nor
+can we store state in any field that is protected by the AIL lock. Hence the
+committed item tracking needs its own locks, lists and state fields in the log
+item.
+
+Similar to the AIL, tracking of committed items is done through a new list
+called the Committed Item List (CIL). The list tracks log items that have been
+committed and have formatted memory buffers attached to them. It tracks objects
+in transaction commit order, so when an object is relogged it is removed from
+its place in the list and re-inserted at the tail. This is entirely arbitrary
+and done to make debugging easier - the last items in the list are the
+ones that are most recently modified. Ordering of the CIL is not necessary for
+transactional integrity (as discussed in the next section) so the ordering is
+done for convenience/sanity of the developers.
+
+
+Delayed Logging: Checkpoints
+
+When we have a log synchronisation event, commonly known as a "log force",
+all the items in the CIL must be written into the log via the log buffers.
+We need to write these items in the order that they exist in the CIL, and they
+need to be written as an atomic transaction. The need for all the objects to be
+written as an atomic transaction comes from the requirements of relogging and
+log replay - all the changes in all the objects in a given transaction must
+either be completely replayed during log recovery, or not replayed at all. If
+a transaction is not replayed because it is not complete in the log, then
+no later transactions should be replayed, either.
+
+To fulfill this requirement, we need to write the entire CIL in a single log
+transaction. Fortunately, the XFS log code has no fixed limit on the size of a
+transaction, nor does the log replay code. The only fundamental limit is that
+the transaction cannot be larger than just under half the size of the log. The
+reason for this limit is that to find the head and tail of the log, there must
+be at least one complete transaction in the log at any given time. If a
+transaction is larger than half the log, then there is the possibility that a
+crash during the write of such a transaction could partially overwrite the
+only complete previous transaction in the log. This will result in a recovery
+failure and an inconsistent filesystem and hence we must enforce the maximum
+size of a checkpoint to be slightly less than half the log.
+
+Apart from this size requirement, a checkpoint transaction looks no different
+to any other transaction - it contains a transaction header, a series of
+formatted log items and a commit record at the tail. From a recovery
+perspective, the checkpoint transaction is also no different - just a lot
+bigger with a lot more items in it.
The worst case effect of this is that we +might need to tune the recovery transaction object hash size. + +Because the checkpoint is just another transaction and all the changes to log +items are stored as log vectors, we can use the existing log buffer writing +code to write the changes into the log. To do this efficiently, we need to +minimise the time we hold the CIL locked while writing the checkpoint +transaction. The current log write code enables us to do this easily with the +way it separates the writing of the transaction contents (the log vectors) from +the transaction commit record, but tracking this requires us to have a +per-checkpoint context that travels through the log write process through to +checkpoint completion. + +Hence a checkpoint has a context that tracks the state of the current +checkpoint from initiation to checkpoint completion. A new context is initiated +at the same time a checkpoint transaction is started. That is, when we remove +all the current items from the CIL during a checkpoint operation, we move all +those changes into the current checkpoint context. We then initialise a new +context and attach that to the CIL for aggregation of new transactions. + +This allows us to unlock the CIL immediately after transfer of all the +committed items and effectively allow new transactions to be issued while we +are formatting the checkpoint into the log. It also allows concurrent +checkpoints to be written into the log buffers in the case of log force heavy +workloads, just like the existing transaction commit code does. This, however, +requires that we strictly order the commit records in the log so that +checkpoint sequence order is maintained during log replay. + +To ensure that we can be writing an item into a checkpoint transaction at +the same time another transaction modifies the item and inserts the log item +into the new CIL, then checkpoint transaction commit code cannot use log items +to store the list of log vectors that need to be written into the transaction. +Hence log vectors need to be able to be chained together to allow them to be +detatched from the log items. That is, when the CIL is flushed the memory +buffer and log vector attached to each log item needs to be attached to the +checkpoint context so that the log item can be released. In diagrammatic form, +the CIL would look like this before the flush: + + CIL Head + | + V + Log Item <-> log vector 1 -> memory buffer + | -> vector array + V + Log Item <-> log vector 2 -> memory buffer + | -> vector array + V + ...... + | + V + Log Item <-> log vector N-1 -> memory buffer + | -> vector array + V + Log Item <-> log vector N -> memory buffer + -> vector array + +And after the flush the CIL head is empty, and the checkpoint context log +vector list would look like: + + Checkpoint Context + | + V + log vector 1 -> memory buffer + | -> vector array + | -> Log Item + V + log vector 2 -> memory buffer + | -> vector array + | -> Log Item + V + ...... + | + V + log vector N-1 -> memory buffer + | -> vector array + | -> Log Item + V + log vector N -> memory buffer + -> vector array + -> Log Item + +Once this transfer is done, the CIL can be unlocked and new transactions can +start, while the checkpoint flush code works over the log vector chain to +commit the checkpoint. + +Once the checkpoint is written into the log buffers, the checkpoint context is +attached to the log buffer that the commit record was written to along with a +completion callback. 
Log IO completion will call that callback, which can then +run transaction committed processing for the log items (i.e. insert into AIL +and unpin) in the log vector chain and then free the log vector chain and +checkpoint context. + +Discussion Point: I am uncertain as to whether the log item is the most +efficient way to track vectors, even though it seems like the natural way to do +it. The fact that we walk the log items (in the CIL) just to chain the log +vectors and break the link between the log item and the log vector means that +we take a cache line hit for the log item list modification, then another for +the log vector chaining. If we track by the log vectors, then we only need to +break the link between the log item and the log vector, which means we should +dirty only the log item cachelines. Normally I wouldn't be concerned about one +vs two dirty cachelines except for the fact I've seen upwards of 80,000 log +vectors in one checkpoint transaction. I'd guess this is a "measure and +compare" situation that can be done after a working and reviewed implementation +is in the dev tree.... + +Delayed Logging: Checkpoint Sequencing + +One of the key aspects of the XFS transaction subsystem is that it tags +committed transactions with the log sequence number of the transaction commit. +This allows transactions to be issued asynchronously even though there may be +future operations that cannot be completed until that transaction is fully +committed to the log. In the rare case that a dependent operation occurs (e.g. +re-using a freed metadata extent for a data extent), a special, optimised log +force can be issued to force the dependent transaction to disk immediately. + +To do this, transactions need to record the LSN of the commit record of the +transaction. This LSN comes directly from the log buffer the transaction is +written into. While this works just fine for the existing transaction +mechanism, it does not work for delayed logging because transactions are not +written directly into the log buffers. Hence some other method of sequencing +transactions is required. + +As discussed in the checkpoint section, delayed logging uses per-checkpoint +contexts, and as such it is simple to assign a sequence number to each +checkpoint. Because the switching of checkpoint contexts must be done +atomically, it is simple to ensure that each new context has a monotonically +increasing sequence number assigned to it without the need for an external +atomic counter - we can just take the current context sequence number and add +one to it for the new context. + +Then, instead of assigning a log buffer LSN to the transaction commit LSN +during the commit, we can assign the current checkpoint sequence. This allows +operations that track transactions that have not yet completed know what +checkpoint sequence needs to be committed before they can continue. As a +result, the code that forces the log to a specific LSN now needs to ensure that +the log forces to a specific checkpoint. + +To ensure that we can do this, we need to track all the checkpoint contexts +that are currently committing to the log. When we flush a checkpoint, the +context gets added to a "committing" list which can be searched. When a +checkpoint commit completes, it is removed from the committing list. 
Because +the checkpoint context records the LSN of the commit record for the checkpoint, +we can also wait on the log buffer that contains the commit record, thereby +using the existing log force mechanisms to execute synchronous forces. + +It should be noted that the synchronous forces may need to be extended with +mitigation algorithms similar to the current log buffer code to allow +aggregation of multiple synchronous transactions if there are already +synchronous transactions being flushed. Investigation of the performance of the +current design is needed before making any decisions here. + +The main concern with log forces is to ensure that all the previous checkpoints +are also committed to disk before the one we need to wait for. Therefore we +need to check that all the prior contexts in the committing list are also +complete before waiting on the one we need to complete. We do this +synchronisation in the log force code so that we don't need to wait anywhere +else for such serialisation - it only matters when we do a log force. + +The only remaining complexity is that a log force now also has to handle the +case where the forcing sequence number is the same as the current context. That +is, we need to flush the CIL and potentially wait for it to complete. This is a +simple addition to the existing log forcing code to check the sequence numbers +and push if required. Indeed, placing the current sequence checkpoint flush in +the log force code enables the current mechanism for issuing synchronous +transactions to remain untouched (i.e. commit an asynchronous transaction, then +force the log at the LSN of that transaction) and so the higher level code +behaves the same regardless of whether delayed logging is being used or not. + +Delayed Logging: Checkpoint Log Space Accounting + +The big issue for a checkpoint transaction is the log space reservation for the +transaction. We don't know how big a checkpoint transaction is going to be +ahead of time, nor how many log buffers it will take to write out, nor the +number of split log vector regions are going to be used. We can track the +amount of log space required as we add items to the commit item list, but we +still need to reserve the space in the log for the checkpoint. + +A typical transaction reserves enough space in the log for the worst case space +usage of the transaction. The reservation accounts for log record headers, +transaction and region headers, headers for split regions, buffer tail padding, +etc. as well as the actual space for all the changed metadata in the +transaction. While some of this is fixed overhead, much of it is dependent on +the size of the transaction and the number of regions being logged (the number +of log vectors in the transaction). + +An example of the differences would be logging directory changes versus logging +inode changes. If you modify lots of inode cores (e.g. chmod -R g+w *), then +there are lots of transactions that only contain an inode core and an inode log +format structure. That is, two vectors totaling roughly 150 bytes. If we modify +10,000 inodes, we have about 1.5MB of metadata to write in 20,000 vectors. Each +vector is 12 bytes, so the total to be logged is approximately 1.75MB. In +comparison, if we are logging full directory buffers, they are typically 4KB +each, so we in 1.5MB of directory buffers we'd have roughly 400 buffers and a +buffer format structure for each buffer - roughly 800 vectors or 1.51MB total +space. 
From this, it should be obvious that a static log space reservation is
+not particularly flexible, making it difficult to select the "optimal value"
+for all workloads.
+
+Further, if we are going to use a static reservation, which bit of the entire
+reservation does it cover? We account for space used by the transaction
+reservation by tracking the space currently used by the object in the CIL and
+then calculating the increase or decrease in space used as the object is
+relogged. This allows for a checkpoint reservation to only have to account for
+log buffer metadata used such as log header records.
+
+However, even using a static reservation for just the log metadata is
+problematic. Typically log record headers use at least 16KB of log space per
+1MB of log space consumed (512 bytes per 32KB) and the reservation needs to be
+large enough to handle arbitrary sized checkpoint transactions. This
+reservation needs to be made before the checkpoint is started, and we need to
+be able to reserve the space without sleeping. For an 8MB checkpoint, we need a
+reservation of around 150KB, which is a non-trivial amount of space.
+
+A static reservation needs to manipulate the log grant counters - we can take a
+permanent reservation on the space, but we still need to make sure we refresh
+the write reservation (the actual space available to the transaction) after
+every checkpoint transaction completion. Unfortunately, if this space is not
+available when required, then the regrant code will sleep waiting for it.
+
+The problem with this is that it can lead to deadlocks as we may need to commit
+checkpoints to be able to free up log space (refer back to the description of
+rolling transactions for an example of this). Hence we *must* always have
+space available in the log if we are to use static reservations, and that is
+very difficult and complex to arrange. It is possible to do, but there is a
+simpler way.
+
+The simpler way of doing this is tracking the entire log space used by the
+items in the CIL and using this to dynamically calculate the amount of log
+space required by the log metadata. If this log metadata space changes as a
+result of a transaction commit inserting a new memory buffer into the CIL, then
+the difference in space required is removed from the transaction that causes
+the change. Transactions at this level will *always* have enough space
+available in their reservation for this as they have already reserved the
+maximal amount of log metadata space they require, and such a delta reservation
+will always be less than or equal to the maximal amount in the reservation.
+
+Hence we can grow the checkpoint transaction reservation dynamically as items
+are added to the CIL and avoid the need for reserving and regranting log space
+up front. This avoids deadlocks and removes a blocking point from the
+checkpoint flush code.
+
+As mentioned earlier, transactions can't grow to more than half the size of the
+log. Hence as part of the reservation growing, we also need to check the size
+of the reservation against the maximum allowed transaction size. If we reach
+the maximum threshold, we need to push the CIL to the log. This is effectively
+a "background flush" and is done on demand. This is identical to
+a CIL push triggered by a log force, except that there is no waiting for the
+checkpoint commit to complete. This background push is checked and executed by
+transaction commit code.
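+
+A minimal sketch of the accounting and background push decision described
+above might look like the following; the names are assumptions made for
+illustration and are not the real XFS identifiers:
+
+    /* At transaction commit, after formatting the items into the CIL: */
+    spin_lock(&cil->space_lock);
+    cil->space_used += delta;            /* extra bytes this commit added  */
+    tp->log_reservation -= delta;        /* donated from the committer's   */
+                                         /* already-reserved worst case    */
+    push_needed = (cil->space_used > background_push_limit);
+    spin_unlock(&cil->space_lock);
+
+    if (push_needed)
+            cil_push_background(cil);    /* flush the CIL, don't wait */
+
+Because every committing transaction has already reserved worst-case space for
+the items it is logging, the delta it donates to the checkpoint metadata can
+never exceed its own reservation.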
+
+If the transaction subsystem goes idle while we still have items in the CIL,
+they will be flushed by the periodic log force issued by the xfssyncd. This log
+force will push the CIL to disk, and if the transaction subsystem stays idle,
+allow the idle log to be covered (effectively marked clean) in exactly the same
+manner that is done for the existing logging method. A discussion point is
+whether this log force needs to be done more frequently than the current rate,
+which is once every 30s.
+
+
+Delayed Logging: Log Item Pinning
+
+Currently log items are pinned during transaction commit while the items are
+still locked. This happens just after the items are formatted, though it could
+be done any time before the items are unlocked. The result of this mechanism is
+that items get pinned once for every transaction that is committed to the log
+buffers. Hence items that are relogged in the log buffers will have a pin count
+for every outstanding transaction they were dirtied in. When each of these
+transactions is completed, they will unpin the item once. As a result, the item
+only becomes unpinned when all the transactions complete and there are no
+pending transactions. Thus the pinning and unpinning of a log item is symmetric
+as there is a 1:1 relationship between transaction commit and log item
+completion.
+
+For delayed logging, however, we have an asymmetric transaction commit to
+completion relationship. Every time an object is relogged in the CIL it goes
+through the commit process without a corresponding completion being registered.
+That is, we now have a many-to-one relationship between transaction commit and
+log item completion. The result of this is that pinning and unpinning of the
+log items becomes unbalanced if we retain the "pin on transaction commit, unpin
+on transaction completion" model.
+
+To keep pin/unpin symmetry, the algorithm needs to change to a "pin on
+insertion into the CIL, unpin on checkpoint completion". In other words, the
+pinning and unpinning becomes symmetric around a checkpoint context. We have to
+pin the object the first time it is inserted into the CIL - if it is already in
+the CIL during a transaction commit, then we do not pin it again. Because there
+can be multiple outstanding checkpoint contexts, we can still see elevated pin
+counts, but as each checkpoint completes the pin count will retain the correct
+value according to its context.
+
+Just to make matters slightly more complex, this checkpoint level context
+for the pin count means that the pinning of an item must take place under the
+CIL commit/flush lock. If we pin the object outside this lock, we cannot
+guarantee which context the pin count is associated with. This is because
+pinning the item is dependent on whether the item is present in the
+current CIL or not. If we don't lock the CIL first before we check and pin the
+object, we have a race with the CIL being flushed between the check and the pin
+(or not pinning, as the case may be). Hence we must hold the CIL flush/commit
+lock to guarantee that we pin the items correctly.
+
+Delayed Logging: Concurrent Scalability
+
+A fundamental requirement for the CIL is that accesses through transaction
+commits must scale to many concurrent commits. The current transaction commit
+code does not break down even when there are transactions coming from 2048
+processors at once. The current transaction code does not go any faster than if
+there was only one CPU using it, but it does not slow down either.
+ +As a result, the delayed logging transaction commit code needs to be designed +for concurrency from the ground up. It is obvious that there are serialisation +points in the design - the three important ones are: + + 1. Locking out new transaction commits while flushing the CIL + 2. Adding items to the CIL and updating item space accounting + 3. Checkpoint commit ordering + +Looking at the transaction commit and CIL flushing interactions, it is clear +that we have a many-to-one interaction here. That is, the only restriction on +the number of concurrent transactions that can be trying to commit at once is +the amount of space available in the log for their reservations. The practical +limit here is in the order of several hundred concurrent transactions for a +128MB log, which means that it is generally one per CPU in a machine. + +The amount of time a transaction commit needs to hold out a flush is a +relatively long period of time - the pinning of log items needs to be done +while we are holding out a CIL flush, so at the moment that means it is held +across the formatting of the objects into memory buffers (i.e. while memcpy()s +are in progress). Ultimately a two pass algorithm where the formatting is done +separately to the pinning of objects could be used to reduce the hold time of +the transaction commit side. + +Because of the number of potential transaction commit side holders, the lock +really needs to be a sleeping lock - if the CIL flush takes the lock, we do not +want every other CPU in the machine spinning on the CIL lock. Given that +flushing the CIL could involve walking a list of tens of thousands of log +items, it will get held for a significant time and so spin contention is a +significant concern. Preventing lots of CPUs spinning doing nothing is the +main reason for choosing a sleeping lock even though nothing in either the +transaction commit or CIL flush side sleeps with the lock held. + +It should also be noted that CIL flushing is also a relatively rare operation +compared to transaction commit for asynchronous transaction workloads - only +time will tell if using a read-write semaphore for exclusion will limit +transaction commit concurrency due to cache line bouncing of the lock on the +read side. + +The second serialisation point is on the transaction commit side where items +are inserted into the CIL. Because transactions can enter this code +concurrently, the CIL needs to be protected separately from the above +commit/flush exclusion. It also needs to be an exclusive lock but it is only +held for a very short time and so a spin lock is appropriate here. It is +possible that this lock will become a contention point, but given the short +hold time once per transaction I think that contention is unlikely. + +The final serialisation point is the checkpoint commit record ordering code +that is run as part of the checkpoint commit and log force sequencing. The code +path that triggers a CIL flush (i.e. whatever triggers the log force) will enter +an ordering loop after writing all the log vectors into the log buffers but +before writing the commit record. This loop walks the list of committing +checkpoints and needs to block waiting for checkpoints to complete their commit +record write. As a result it needs a lock and a wait variable. Log force +sequencing also requires the same lock, list walk, and blocking mechanism to +ensure completion of checkpoints. 
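+
+The commit record ordering loop can be sketched roughly as follows; once more
+the names are illustrative rather than the actual implementation:
+
+    /* Before writing our commit record, wait for all earlier checkpoints. */
+restart:
+    spin_lock(&cil->committing_lock);
+    list_for_each_entry(ctx, &cil->committing, list) {
+            if (ctx->sequence < my_ctx->sequence && !ctx->commit_lsn) {
+                    /*
+                     * An earlier checkpoint has not written its commit
+                     * record yet - sleep until woken (dropping the lock),
+                     * then re-check the whole list.
+                     */
+                    sleep_on_commit_wait(cil);
+                    goto restart;
+            }
+    }
+    spin_unlock(&cil->committing_lock);
+    write_commit_record(my_ctx);
+
+Log force sequencing performs the same walk, but waits until the earlier
+contexts have been removed from the committing list (i.e. have completed)
+rather than until they have a commit LSN.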
+ +These two sequencing operations can use the mechanism even though the +events they are waiting for are different. The checkpoint commit record +sequencing needs to wait until checkpoint contexts contain a commit LSN +(obtained through completion of a commit record write) while log force +sequencing needs to wait until previous checkpoint contexts are removed from +the committing list (i.e. they've completed). A simple wait variable and +broadcast wakeups (thundering herds) has been used to implement these two +serialisation queues. They use the same lock as the CIL, too. If we see too +much contention on the CIL lock, or too many context switches as a result of +the broadcast wakeups these operations can be put under a new spinlock and +given separate wait lists to reduce lock contention and the number of processes +woken by the wrong event. + + +Lifecycle Changes + +The existing log item life cycle is as follows: + + 1. Transaction allocate + 2. Transaction reserve + 3. Lock item + 4. Join item to transaction + If not already attached, + Allocate log item + Attach log item to owner item + Attach log item to transaction + 5. Modify item + Record modifications in log item + 6. Transaction commit + Pin item in memory + Format item into log buffer + Write commit LSN into transaction + Unlock item + Attach transaction to log buffer + + + + + 7. Transaction completion + Mark log item committed + Insert log item into AIL + Write commit LSN into log item + Unpin log item + 8. AIL traversal + Lock item + Mark log item clean + Flush item to disk + + + + 9. Log item removed from AIL + Moves log tail + Item unlocked + +Essentially, steps 1-6 operate independently from step 7, which is also +independent of steps 8-9. An item can be locked in steps 1-6 or steps 8-9 +at the same time step 7 is occurring, but only steps 1-6 or 8-9 can occur +at the same time. If the log item is in the AIL or between steps 6 and 7 +and steps 1-6 are re-entered, then the item is relogged. Only when steps 8-9 +are entered and completed is the object considered clean. + +With delayed logging, there are new steps inserted into the life cycle: + + 1. Transaction allocate + 2. Transaction reserve + 3. Lock item + 4. Join item to transaction + If not already attached, + Allocate log item + Attach log item to owner item + Attach log item to transaction + 5. Modify item + Record modifications in log item + 6. Transaction commit + Pin item in memory if not pinned in CIL + Format item into log vector + buffer + Attach log vector and buffer to log item + Insert log item into CIL + Write CIL context sequence into transaction + Unlock item + + + + 7. CIL push + lock CIL flush + Chain log vectors and buffers together + Remove items from CIL + unlock CIL flush + write log vectors into log + sequence commit records + attach checkpoint context to log buffer + + + + + 8. Checkpoint completion + Mark log item committed + Insert item into AIL + Write commit LSN into log item + Unpin log item + 9. AIL traversal + Lock item + Mark log item clean + Flush item to disk + + 10. Log item removed from AIL + Moves log tail + Item unlocked + +From this, it can be seen that the only life cycle differences between the two +logging methods are in the middle of the life cycle - they still have the same +beginning and end and execution constraints. The only differences are in the +commiting of the log items to the log itself and the completion processing. 
+Hence delayed logging should not introduce any constraints on log item +behaviour, allocation or freeing that don't already exist. + +As a result of this zero-impact "insertion" of delayed logging infrastructure +and the design of the internal structures to avoid on disk format changes, we +can basically switch between delayed logging and the existing mechanism with a +mount option. Fundamentally, there is no reason why the log manager would not +be able to swap methods automatically and transparently depending on load +characteristics, but this should not be necessary if delayed logging works as +designed. + +Roadmap: + +2.6.35 Inclusion in mainline as an experimental mount option + => approximately 2-3 months to merge window + => needs to be in xfs-dev tree in 4-6 weeks + => code is nearing readiness for review + +2.6.37 Remove experimental tag from mount option + => should be roughly 6 months after initial merge + => enough time to: + => gain confidence and fix problems reported by early + adopters (a.k.a. guinea pigs) + => address worst performance regressions and undesired + behaviours + => start tuning/optimising code for parallelism + => start tuning/optimising algorithms consuming + excessive CPU time + +2.6.39 Switch default mount option to use delayed logging + => should be roughly 12 months after initial merge + => enough time to shake out remaining problems before next round of + enterprise distro kernel rebases -- cgit v1.2.3 From 971ada0f6659488c3f36aed4c6f7670ff5ce4368 Mon Sep 17 00:00:00 2001 From: Lee Schermerhorn Date: Mon, 24 May 2010 14:32:05 -0700 Subject: mempolicy: document cpuset interaction with tmpfs mpol mount option Update Documentation/filesystems/tmpfs.txt to describe the interaction of tmpfs mount option memory policy with tasks' cpuset mems_allowed. Note: the mount(8) man page [in the util-linux-ng package] requires similiar updates. Signed-off-by: Lee Schermerhorn Cc: Hugh Dickins Cc: Ravikiran Thirumalai Cc: KOSAKI Motohiro Cc: Christoph Lameter Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/tmpfs.txt | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/tmpfs.txt b/Documentation/filesystems/tmpfs.txt index fe09a2cb185..98ef5512415 100644 --- a/Documentation/filesystems/tmpfs.txt +++ b/Documentation/filesystems/tmpfs.txt @@ -94,11 +94,19 @@ NodeList format is a comma-separated list of decimal numbers and ranges, a range being two hyphen-separated decimal numbers, the smallest and largest node numbers in the range. For example, mpol=bind:0-3,5,7,9-15 +A memory policy with a valid NodeList will be saved, as specified, for +use at file creation time. When a task allocates a file in the file +system, the mount option memory policy will be applied with a NodeList, +if any, modified by the calling task's cpuset constraints +[See Documentation/cgroups/cpusets.txt] and any optional flags, listed +below. If the resulting NodeLists is the empty set, the effective memory +policy for the file will revert to "default" policy. + NUMA memory allocation policies have optional flags that can be used in conjunction with their modes. These optional flags can be specified when tmpfs is mounted by appending them to the mode before the NodeList. See Documentation/vm/numa_memory_policy.txt for a list of all available -memory allocation policy mode flags. +memory allocation policy mode flags and their effect on memory policy. 
=static is equivalent to MPOL_F_STATIC_NODES =relative is equivalent to MPOL_F_RELATIVE_NODES -- cgit v1.2.3 From 899f4530334da9292556e1f8f5791468e0136ff1 Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Tue, 25 May 2010 02:47:00 +0100 Subject: squashfs: update documentation to include description of xattr layout Signed-off-by: Phillip Lougher --- Documentation/filesystems/squashfs.txt | 32 +++++++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/squashfs.txt b/Documentation/filesystems/squashfs.txt index b324c033035..203f7202cc9 100644 --- a/Documentation/filesystems/squashfs.txt +++ b/Documentation/filesystems/squashfs.txt @@ -38,7 +38,8 @@ Hard link support: yes no Real inode numbers: yes no 32-bit uids/gids: yes no File creation time: yes no -Xattr and ACL support: no no +Xattr support: yes no +ACL support: no no Squashfs compresses data, inodes and directories. In addition, inode and directory data are highly compacted, and packed on byte boundaries. Each @@ -58,7 +59,7 @@ obtained from this site also. 3. SQUASHFS FILESYSTEM DESIGN ----------------------------- -A squashfs filesystem consists of seven parts, packed together on a byte +A squashfs filesystem consists of a maximum of eight parts, packed together on a byte alignment: --------------- @@ -80,6 +81,9 @@ alignment: |---------------| | uid/gid | | lookup table | + |---------------| + | xattr | + | table | --------------- Compressed data blocks are written to the filesystem as files are read from @@ -192,6 +196,26 @@ This table is stored compressed into metadata blocks. A second index table is used to locate these. This second index table for speed of access (and because it is small) is read at mount time and cached in memory. +3.7 Xattr table +--------------- + +The xattr table contains extended attributes for each inode. The xattrs +for each inode are stored in a list, each list entry containing a type, +name and value field. The type field encodes the xattr prefix +("user.", "trusted." etc) and it also encodes how the name/value fields +should be interpreted. Currently the type indicates whether the value +is stored inline (in which case the value field contains the xattr value), +or if it is stored out of line (in which case the value field stores a +reference to where the actual value is stored). This allows large values +to be stored out of line improving scanning and lookup performance and it +also allows values to be de-duplicated, the value being stored once, and +all other occurences holding an out of line reference to that value. + +The xattr lists are packed into compressed 8K metadata blocks. +To reduce overhead in inodes, rather than storing the on-disk +location of the xattr list inside each inode, a 32-bit xattr id +is stored. This xattr id is mapped into the location of the xattr +list using a second xattr id lookup table. 4. TODOS AND OUTSTANDING ISSUES ------------------------------- @@ -199,9 +223,7 @@ it is small) is read at mount time and cached in memory. 4.1 Todo list ------------- -Implement Xattr and ACL support. The Squashfs 4.0 filesystem layout has hooks -for these but the code has not been written. Once the code has been written -the existing layout should not require modification. +Implement ACL support. 
4.2 Squashfs internal cache --------------------------- -- cgit v1.2.3 From 866707fc2721df8fee637fcf0239628b9231f9ea Mon Sep 17 00:00:00 2001 From: Jan Blunck Date: Wed, 26 May 2010 14:44:54 -0700 Subject: Documentation/filesystems/Locking: update documentation on llseek() wrt BKL The inode's i_size is not protected by the big kernel lock. Therefore it does not make sense to recommend taking the BKL in filesystems llseek operations. Instead it should use the inode's mutex or use just use i_size_read() instead. Add a note that this is not protecting file->f_pos. Signed-off-by: Jan Blunck Acked-by: Alan Cox Cc: Arnd Bergmann Cc: Christoph Hellwig Cc: Frederic Weisbecker Cc: John Kacur Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/Locking | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index af1608070cd..61c98f03baa 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -429,8 +429,9 @@ check_flags: no implementations. If your fs is not using generic_file_llseek, you need to acquire and release the appropriate locks in your ->llseek(). For many filesystems, it is probably safe to acquire the inode -mutex. Note some filesystems (i.e. remote ones) provide no -protection for i_size so you will need to use the BKL. +mutex or just to use i_size_read() instead. +Note: this does not protect the file->f_pos against concurrent modifications +since this is something the userspace has to take care about. Note: ext2_release() was *the* source of contention on fs-intensive loads and dropping BKL on ->release() helps to get rid of that (we still -- cgit v1.2.3 From 7ea8085910ef3dd4f3cad6845aaa2b580d39b115 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 26 May 2010 17:53:25 +0200 Subject: drop unused dentry argument to ->fsync Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- Documentation/filesystems/Locking | 2 +- Documentation/filesystems/vfs.txt | 2 +- arch/powerpc/platforms/cell/spufs/file.c | 3 +-- drivers/char/ps3flash.c | 3 +-- drivers/mtd/ubi/cdev.c | 3 +-- drivers/staging/pohmelfs/inode.c | 2 +- drivers/usb/gadget/printer.c | 2 +- drivers/video/fb_defio.c | 2 +- fs/9p/vfs_file.c | 6 ++---- fs/affs/affs.h | 2 +- fs/affs/file.c | 4 ++-- fs/afs/internal.h | 2 +- fs/afs/write.c | 3 ++- fs/bad_inode.c | 3 +-- fs/block_dev.c | 7 +------ fs/btrfs/ctree.h | 2 +- fs/btrfs/file.c | 3 ++- fs/ceph/caps.c | 4 ++-- fs/ceph/dir.c | 5 ++--- fs/ceph/super.h | 2 +- fs/cifs/cifsfs.h | 2 +- fs/cifs/file.c | 4 ++-- fs/coda/coda_int.h | 3 +-- fs/coda/file.c | 4 ++-- fs/ecryptfs/file.c | 2 +- fs/exofs/file.c | 7 +++---- fs/ext2/ext2.h | 2 +- fs/ext2/file.c | 6 +++--- fs/ext3/fsync.c | 4 ++-- fs/ext4/ext4.h | 2 +- fs/ext4/fsync.c | 8 ++++---- fs/fat/fat.h | 3 +-- fs/fat/file.c | 6 +++--- fs/fuse/dir.c | 5 ++--- fs/fuse/file.c | 9 ++++----- fs/fuse/fuse_i.h | 3 +-- fs/gfs2/file.c | 4 ++-- fs/hostfs/hostfs_kern.c | 4 ++-- fs/hpfs/file.c | 4 ++-- fs/hpfs/hpfs_fn.h | 2 +- fs/hppfs/hppfs.c | 2 +- fs/jffs2/file.c | 4 ++-- fs/jffs2/os-linux.h | 2 +- fs/jfs/file.c | 4 ++-- fs/jfs/jfs_inode.h | 2 +- fs/libfs.c | 6 +++--- fs/logfs/file.c | 4 ++-- fs/logfs/logfs.h | 2 +- fs/ncpfs/file.c | 2 +- fs/nfs/dir.c | 6 ++++-- fs/nfs/file.c | 5 +++-- fs/nilfs2/file.c | 4 ++-- fs/nilfs2/nilfs.h | 2 +- fs/ntfs/dir.c | 5 ++--- fs/ntfs/file.c | 9 ++------- fs/ocfs2/file.c | 7 +++---- fs/reiserfs/dir.c | 8 +++----- 
fs/reiserfs/file.c | 5 ++--- fs/smbfs/file.c | 3 ++- fs/sync.c | 8 +++----- fs/ubifs/file.c | 4 ++-- fs/ubifs/ubifs.h | 2 +- fs/xfs/linux-2.6/xfs_file.c | 10 +++++----- include/linux/buffer_head.h | 2 +- include/linux/ext3_fs.h | 2 +- include/linux/fb.h | 5 +---- include/linux/fs.h | 8 ++++---- include/trace/events/ext4.h | 6 ++++-- ipc/shm.c | 11 ++++------- 69 files changed, 129 insertions(+), 157 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 61c98f03baa..96d4293607e 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -380,7 +380,7 @@ prototypes: int (*open) (struct inode *, struct file *); int (*flush) (struct file *); int (*release) (struct inode *, struct file *); - int (*fsync) (struct file *, struct dentry *, int datasync); + int (*fsync) (struct file *, int datasync); int (*aio_fsync) (struct kiocb *, int datasync); int (*fasync) (int, struct file *, int); int (*lock) (struct file *, int, struct file_lock *); diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index b66858538df..d4f5731dcbb 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -729,7 +729,7 @@ struct file_operations { int (*open) (struct inode *, struct file *); int (*flush) (struct file *); int (*release) (struct inode *, struct file *); - int (*fsync) (struct file *, struct dentry *, int datasync); + int (*fsync) (struct file *, int datasync); int (*aio_fsync) (struct kiocb *, int datasync); int (*fasync) (int, struct file *, int); int (*lock) (struct file *, int, struct file_lock *); diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c index 5c280825251..1a40da92154 100644 --- a/arch/powerpc/platforms/cell/spufs/file.c +++ b/arch/powerpc/platforms/cell/spufs/file.c @@ -1849,8 +1849,7 @@ out: return ret; } -static int spufs_mfc_fsync(struct file *file, struct dentry *dentry, - int datasync) +static int spufs_mfc_fsync(struct file *file, int datasync) { return spufs_mfc_flush(file, NULL); } diff --git a/drivers/char/ps3flash.c b/drivers/char/ps3flash.c index 606048b72bc..85c004a518e 100644 --- a/drivers/char/ps3flash.c +++ b/drivers/char/ps3flash.c @@ -305,8 +305,7 @@ static int ps3flash_flush(struct file *file, fl_owner_t id) return ps3flash_writeback(ps3flash_dev); } -static int ps3flash_fsync(struct file *file, struct dentry *dentry, - int datasync) +static int ps3flash_fsync(struct file *file, int datasync) { return ps3flash_writeback(ps3flash_dev); } diff --git a/drivers/mtd/ubi/cdev.c b/drivers/mtd/ubi/cdev.c index 72ebb3f06b8..4dfa6b90c21 100644 --- a/drivers/mtd/ubi/cdev.c +++ b/drivers/mtd/ubi/cdev.c @@ -189,8 +189,7 @@ static loff_t vol_cdev_llseek(struct file *file, loff_t offset, int origin) return new_offset; } -static int vol_cdev_fsync(struct file *file, struct dentry *dentry, - int datasync) +static int vol_cdev_fsync(struct file *file, int datasync) { struct ubi_volume_desc *desc = file->private_data; struct ubi_device *ubi = desc->vol->ubi; diff --git a/drivers/staging/pohmelfs/inode.c b/drivers/staging/pohmelfs/inode.c index 9286e863b0e..e92595eff1b 100644 --- a/drivers/staging/pohmelfs/inode.c +++ b/drivers/staging/pohmelfs/inode.c @@ -880,7 +880,7 @@ static struct inode *pohmelfs_alloc_inode(struct super_block *sb) /* * We want fsync() to work on POHMELFS. 
*/ -static int pohmelfs_fsync(struct file *file, struct dentry *dentry, int datasync) +static int pohmelfs_fsync(struct file *file, int datasync) { struct inode *inode = file->f_mapping->host; struct writeback_control wbc = { diff --git a/drivers/usb/gadget/printer.c b/drivers/usb/gadget/printer.c index 6b8bf8c781c..43abf55d8c6 100644 --- a/drivers/usb/gadget/printer.c +++ b/drivers/usb/gadget/printer.c @@ -794,7 +794,7 @@ printer_write(struct file *fd, const char __user *buf, size_t len, loff_t *ptr) } static int -printer_fsync(struct file *fd, struct dentry *dentry, int datasync) +printer_fsync(struct file *fd, int datasync) { struct printer_dev *dev = fd->private_data; unsigned long flags; diff --git a/drivers/video/fb_defio.c b/drivers/video/fb_defio.c index 1105a591dcc..073c9b408cf 100644 --- a/drivers/video/fb_defio.c +++ b/drivers/video/fb_defio.c @@ -66,7 +66,7 @@ static int fb_deferred_io_fault(struct vm_area_struct *vma, return 0; } -int fb_deferred_io_fsync(struct file *file, struct dentry *dentry, int datasync) +int fb_deferred_io_fsync(struct file *file, int datasync) { struct fb_info *info = file->private_data; diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 25b300e1c9d..2bedc6c94fc 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -257,15 +257,13 @@ v9fs_file_write(struct file *filp, const char __user * data, return total; } -static int v9fs_file_fsync(struct file *filp, struct dentry *dentry, - int datasync) +static int v9fs_file_fsync(struct file *filp, int datasync) { struct p9_fid *fid; struct p9_wstat wstat; int retval; - P9_DPRINTK(P9_DEBUG_VFS, "filp %p dentry %p datasync %x\n", filp, - dentry, datasync); + P9_DPRINTK(P9_DEBUG_VFS, "filp %p datasync %x\n", filp, datasync); fid = filp->private_data; v9fs_blank_wstat(&wstat); diff --git a/fs/affs/affs.h b/fs/affs/affs.h index 861dae68ac1..f05b6155ccc 100644 --- a/fs/affs/affs.h +++ b/fs/affs/affs.h @@ -183,7 +183,7 @@ extern int affs_add_entry(struct inode *dir, struct inode *inode, struct dent void affs_free_prealloc(struct inode *inode); extern void affs_truncate(struct inode *); -int affs_file_fsync(struct file *, struct dentry *, int); +int affs_file_fsync(struct file *, int); /* dir.c */ diff --git a/fs/affs/file.c b/fs/affs/file.c index 184e55c1c9b..322710c3eed 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -916,9 +916,9 @@ affs_truncate(struct inode *inode) affs_free_prealloc(inode); } -int affs_file_fsync(struct file *filp, struct dentry *dentry, int datasync) +int affs_file_fsync(struct file *filp, int datasync) { - struct inode * inode = dentry->d_inode; + struct inode *inode = filp->f_mapping->host; int ret, err; ret = write_inode_now(inode, 0); diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 807f284cc75..5f679b77ce2 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -740,7 +740,7 @@ extern void afs_pages_written_back(struct afs_vnode *, struct afs_call *); extern ssize_t afs_file_write(struct kiocb *, const struct iovec *, unsigned long, loff_t); extern int afs_writeback_all(struct afs_vnode *); -extern int afs_fsync(struct file *, struct dentry *, int); +extern int afs_fsync(struct file *, int); /*****************************************************************************/ diff --git a/fs/afs/write.c b/fs/afs/write.c index 3bed54a294d..3dab9e9948d 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -701,8 +701,9 @@ int afs_writeback_all(struct afs_vnode *vnode) * - the return status from this call provides a reliable indication of * whether any write errors occurred for 
this process. */ -int afs_fsync(struct file *file, struct dentry *dentry, int datasync) +int afs_fsync(struct file *file, int datasync) { + struct dentry *dentry = file->f_path.dentry; struct afs_writeback *wb, *xwb; struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode); int ret; diff --git a/fs/bad_inode.c b/fs/bad_inode.c index a05287a23f6..52e59bf4aa5 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -93,8 +93,7 @@ static int bad_file_release(struct inode *inode, struct file *filp) return -EIO; } -static int bad_file_fsync(struct file *file, struct dentry *dentry, - int datasync) +static int bad_file_fsync(struct file *file, int datasync) { return -EIO; } diff --git a/fs/block_dev.c b/fs/block_dev.c index 26e5f502662..d0b37e626a1 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -358,12 +358,7 @@ static loff_t block_llseek(struct file *file, loff_t offset, int origin) return retval; } -/* - * Filp is never NULL; the only case when ->fsync() is called with - * NULL first argument is nfsd_sync_dir() and that's not a directory. - */ - -int blkdev_fsync(struct file *filp, struct dentry *dentry, int datasync) +int blkdev_fsync(struct file *filp, int datasync) { struct inode *bd_inode = filp->f_mapping->host; struct block_device *bdev = I_BDEV(bd_inode); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index e9bf86415e8..29c20092847 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2434,7 +2434,7 @@ void btrfs_update_iflags(struct inode *inode); void btrfs_inherit_iflags(struct inode *inode, struct inode *dir); /* file.c */ -int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync); +int btrfs_sync_file(struct file *file, int datasync); int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, int skip_pinned); int btrfs_check_file(struct btrfs_root *root, struct inode *inode); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 79437c5eeb1..787b50a16a1 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1101,8 +1101,9 @@ int btrfs_release_file(struct inode *inode, struct file *filp) * important optimization for directories because holding the mutex prevents * new operations on the dir while we write to disk. */ -int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) +int btrfs_sync_file(struct file *file, int datasync) { + struct dentry *dentry = file->f_path.dentry; struct inode *inode = dentry->d_inode; struct btrfs_root *root = BTRFS_I(inode)->root; int ret = 0; diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 0dd0b81e64f..ae3e3a30644 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1776,9 +1776,9 @@ out: spin_unlock(&ci->i_unsafe_lock); } -int ceph_fsync(struct file *file, struct dentry *dentry, int datasync) +int ceph_fsync(struct file *file, int datasync) { - struct inode *inode = dentry->d_inode; + struct inode *inode = file->f_mapping->host; struct ceph_inode_info *ci = ceph_inode(inode); unsigned flush_tid; int ret; diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 4fd30900eff..0057f4a0734 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -1107,10 +1107,9 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size, * an fsync() on a dir will wait for any uncommitted directory * operations to commit. 
*/ -static int ceph_dir_fsync(struct file *file, struct dentry *dentry, - int datasync) +static int ceph_dir_fsync(struct file *file, int datasync) { - struct inode *inode = dentry->d_inode; + struct inode *inode = file->f_path.dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); struct list_head *head = &ci->i_unsafe_dirops; struct ceph_mds_request *req; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 3725c9ee9d0..dd1e7add656 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -811,7 +811,7 @@ extern void ceph_put_cap(struct ceph_cap *cap); extern void ceph_queue_caps_release(struct inode *inode); extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc); -extern int ceph_fsync(struct file *file, struct dentry *dentry, int datasync); +extern int ceph_fsync(struct file *file, int datasync); extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session); extern int ceph_get_cap_mds(struct inode *inode); diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 0242ff9cbf4..a7eb65c84b1 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -84,7 +84,7 @@ extern ssize_t cifs_user_read(struct file *file, char __user *read_data, extern ssize_t cifs_user_write(struct file *file, const char __user *write_data, size_t write_size, loff_t *poffset); extern int cifs_lock(struct file *, int, struct file_lock *); -extern int cifs_fsync(struct file *, struct dentry *, int); +extern int cifs_fsync(struct file *, int); extern int cifs_flush(struct file *, fl_owner_t id); extern int cifs_file_mmap(struct file * , struct vm_area_struct *); extern const struct file_operations cifs_dir_ops; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index a83541ec971..f1ff785b229 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1676,7 +1676,7 @@ static int cifs_write_end(struct file *file, struct address_space *mapping, return rc; } -int cifs_fsync(struct file *file, struct dentry *dentry, int datasync) +int cifs_fsync(struct file *file, int datasync) { int xid; int rc = 0; @@ -1688,7 +1688,7 @@ int cifs_fsync(struct file *file, struct dentry *dentry, int datasync) xid = GetXid(); cFYI(1, "Sync file - name: %s datasync: 0x%x", - dentry->d_name.name, datasync); + file->f_path.dentry->d_name.name, datasync); rc = filemap_write_and_wait(inode->i_mapping); if (rc == 0) { diff --git a/fs/coda/coda_int.h b/fs/coda/coda_int.h index d99860a3389..6b443ff43a1 100644 --- a/fs/coda/coda_int.h +++ b/fs/coda/coda_int.h @@ -11,8 +11,7 @@ extern int coda_fake_statfs; void coda_destroy_inodecache(void); int coda_init_inodecache(void); -int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, - int datasync); +int coda_fsync(struct file *coda_file, int datasync); void coda_sysctl_init(void); void coda_sysctl_clean(void); diff --git a/fs/coda/file.c b/fs/coda/file.c index 7196077b168..ad3cd2abeeb 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -202,10 +202,10 @@ int coda_release(struct inode *coda_inode, struct file *coda_file) return 0; } -int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, int datasync) +int coda_fsync(struct file *coda_file, int datasync) { struct file *host_file; - struct inode *coda_inode = coda_dentry->d_inode; + struct inode *coda_inode = coda_file->f_path.dentry->d_inode; struct coda_file_info *cfi; int err = 0; diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 3bdddbcc785..e8fcf4e2ed7 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -274,7 +274,7 @@ static int ecryptfs_release(struct inode 
*inode, struct file *file) } static int -ecryptfs_fsync(struct file *file, struct dentry *dentry, int datasync) +ecryptfs_fsync(struct file *file, int datasync) { return vfs_fsync(ecryptfs_file_to_lower(file), datasync); } diff --git a/fs/exofs/file.c b/fs/exofs/file.c index 839b9dc1e70..fef6899be39 100644 --- a/fs/exofs/file.c +++ b/fs/exofs/file.c @@ -40,12 +40,11 @@ static int exofs_release_file(struct inode *inode, struct file *filp) return 0; } -static int exofs_file_fsync(struct file *filp, struct dentry *dentry, - int datasync) +static int exofs_file_fsync(struct file *filp, int datasync) { int ret; struct address_space *mapping = filp->f_mapping; - struct inode *inode = dentry->d_inode; + struct inode *inode = mapping->host; struct super_block *sb; ret = filemap_write_and_wait(mapping); @@ -66,7 +65,7 @@ static int exofs_file_fsync(struct file *filp, struct dentry *dentry, static int exofs_flush(struct file *file, fl_owner_t id) { - exofs_file_fsync(file, file->f_path.dentry, 1); + exofs_file_fsync(file, 1); /* TODO: Flush the OSD target */ return 0; } diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 0b038e47ad2..8e917b68651 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -155,7 +155,7 @@ extern void ext2_write_super (struct super_block *); extern const struct file_operations ext2_dir_operations; /* file.c */ -extern int ext2_fsync(struct file *file, struct dentry *dentry, int datasync); +extern int ext2_fsync(struct file *file, int datasync); extern const struct inode_operations ext2_file_inode_operations; extern const struct file_operations ext2_file_operations; extern const struct file_operations ext2_xip_file_operations; diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 5d198d0697f..48bcfc32701 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -40,13 +40,13 @@ static int ext2_release_file (struct inode * inode, struct file * filp) return 0; } -int ext2_fsync(struct file *file, struct dentry *dentry, int datasync) +int ext2_fsync(struct file *file, int datasync) { int ret; - struct super_block *sb = dentry->d_inode->i_sb; + struct super_block *sb = file->f_mapping->host->i_sb; struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping; - ret = simple_fsync(file, dentry, datasync); + ret = simple_fsync(file, datasync); if (ret == -EIO || test_and_clear_bit(AS_EIO, &mapping->flags)) { /* We don't really know where the IO error happened... */ ext2_error(sb, __func__, diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c index fcf7487734b..d7e9f74dc3a 100644 --- a/fs/ext3/fsync.c +++ b/fs/ext3/fsync.c @@ -43,9 +43,9 @@ * inode to disk. 
*/ -int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync) +int ext3_sync_file(struct file *file, int datasync) { - struct inode *inode = dentry->d_inode; + struct inode *inode = file->f_mapping->host; struct ext3_inode_info *ei = EXT3_I(inode); journal_t *journal = EXT3_SB(inode->i_sb)->s_journal; int ret, needs_barrier = 0; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 60bd31026e7..19a4de57128 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1519,7 +1519,7 @@ extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, extern void ext4_htree_free_dir_info(struct dir_private_info *p); /* fsync.c */ -extern int ext4_sync_file(struct file *, struct dentry *, int); +extern int ext4_sync_file(struct file *, int); /* hash.c */ extern int ext4fs_dirhash(const char *name, int len, struct diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index b6a74f991bf..40f34520173 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -71,9 +71,9 @@ static void ext4_sync_parent(struct inode *inode) * i_mutex lock is held when entering and exiting this function */ -int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) +int ext4_sync_file(struct file *file, int datasync) { - struct inode *inode = dentry->d_inode; + struct inode *inode = file->f_mapping->host; struct ext4_inode_info *ei = EXT4_I(inode); journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; int ret; @@ -81,7 +81,7 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) J_ASSERT(ext4_journal_current_handle() == NULL); - trace_ext4_sync_file(file, dentry, datasync); + trace_ext4_sync_file(file, datasync); if (inode->i_sb->s_flags & MS_RDONLY) return 0; @@ -91,7 +91,7 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) return ret; if (!journal) { - ret = simple_fsync(file, dentry, datasync); + ret = simple_fsync(file, datasync); if (!ret && !list_empty(&inode->i_dentry)) ext4_sync_parent(inode); return ret; diff --git a/fs/fat/fat.h b/fs/fat/fat.h index 53dba57b49a..a1c8c4b44fd 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -309,8 +309,7 @@ extern int fat_setattr(struct dentry * dentry, struct iattr * attr); extern void fat_truncate(struct inode *inode); extern int fat_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); -extern int fat_file_fsync(struct file *file, struct dentry *dentry, - int datasync); +extern int fat_file_fsync(struct file *file, int datasync); /* fat/inode.c */ extern void fat_attach(struct inode *inode, loff_t i_pos); diff --git a/fs/fat/file.c b/fs/fat/file.c index a14c2f6a489..29a57694437 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -149,12 +149,12 @@ static int fat_file_release(struct inode *inode, struct file *filp) return 0; } -int fat_file_fsync(struct file *filp, struct dentry *dentry, int datasync) +int fat_file_fsync(struct file *filp, int datasync) { - struct inode *inode = dentry->d_inode; + struct inode *inode = filp->f_mapping->host; int res, err; - res = simple_fsync(filp, dentry, datasync); + res = simple_fsync(filp, datasync); err = sync_mapping_buffers(MSDOS_SB(inode->i_sb)->fat_inode->i_mapping); return res ? 
res : err; diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 4787ae6c5c1..3cdc5f78a40 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1156,10 +1156,9 @@ static int fuse_dir_release(struct inode *inode, struct file *file) return 0; } -static int fuse_dir_fsync(struct file *file, struct dentry *de, int datasync) +static int fuse_dir_fsync(struct file *file, int datasync) { - /* nfsd can call this with no file */ - return file ? fuse_fsync_common(file, de, datasync, 1) : 0; + return fuse_fsync_common(file, datasync, 1); } static bool update_mtime(unsigned ivalid) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index a9f5e137f1d..b5fd6f9905e 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -351,10 +351,9 @@ static void fuse_sync_writes(struct inode *inode) fuse_release_nowrite(inode); } -int fuse_fsync_common(struct file *file, struct dentry *de, int datasync, - int isdir) +int fuse_fsync_common(struct file *file, int datasync, int isdir) { - struct inode *inode = de->d_inode; + struct inode *inode = file->f_mapping->host; struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_file *ff = file->private_data; struct fuse_req *req; @@ -403,9 +402,9 @@ int fuse_fsync_common(struct file *file, struct dentry *de, int datasync, return err; } -static int fuse_fsync(struct file *file, struct dentry *de, int datasync) +static int fuse_fsync(struct file *file, int datasync) { - return fuse_fsync_common(file, de, datasync, 0); + return fuse_fsync_common(file, datasync, 0); } void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos, diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 01cc462ff45..2c0d14a8677 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -568,8 +568,7 @@ void fuse_release_common(struct file *file, int opcode); /** * Send FSYNC or FSYNCDIR request */ -int fuse_fsync_common(struct file *file, struct dentry *de, int datasync, - int isdir); +int fuse_fsync_common(struct file *file, int datasync, int isdir); /** * Notify poll wakeup diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index b20bfcc9fa2..ed9a94f0ef1 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -554,9 +554,9 @@ static int gfs2_close(struct inode *inode, struct file *file) * Returns: errno */ -static int gfs2_fsync(struct file *file, struct dentry *dentry, int datasync) +static int gfs2_fsync(struct file *file, int datasync) { - struct inode *inode = dentry->d_inode; + struct inode *inode = file->f_mapping->host; int sync_state = inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC); int ret = 0; diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 3a029d8f4cf..87ac1891a18 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -411,9 +411,9 @@ int hostfs_file_open(struct inode *ino, struct file *file) return 0; } -int hostfs_fsync(struct file *file, struct dentry *dentry, int datasync) +int hostfs_fsync(struct file *file, int datasync) { - return fsync_file(HOSTFS_I(dentry->d_inode)->fd, datasync); + return fsync_file(HOSTFS_I(file->f_mapping->host)->fd, datasync); } static const struct file_operations hostfs_file_fops = { diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index 3efabff0036..a9ae9bfa752 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -19,9 +19,9 @@ static int hpfs_file_release(struct inode *inode, struct file *file) return 0; } -int hpfs_file_fsync(struct file *file, struct dentry *dentry, int datasync) +int hpfs_file_fsync(struct file *file, int datasync) { - /*return file_fsync(file, dentry);*/ + /*return file_fsync(file, datasync);*/ return 0; /* Don't fsync 
:-) */ } diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h index 97bf738cd5d..75f9d432485 100644 --- a/fs/hpfs/hpfs_fn.h +++ b/fs/hpfs/hpfs_fn.h @@ -268,7 +268,7 @@ void hpfs_set_ea(struct inode *, struct fnode *, const char *, /* file.c */ -int hpfs_file_fsync(struct file *, struct dentry *, int); +int hpfs_file_fsync(struct file *, int); extern const struct file_operations hpfs_file_ops; extern const struct inode_operations hpfs_file_iops; extern const struct address_space_operations hpfs_aops; diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c index 2e4dfa8593d..826c3f9d29a 100644 --- a/fs/hppfs/hppfs.c +++ b/fs/hppfs/hppfs.c @@ -587,7 +587,7 @@ static int hppfs_readdir(struct file *file, void *ent, filldir_t filldir) return err; } -static int hppfs_fsync(struct file *file, struct dentry *dentry, int datasync) +static int hppfs_fsync(struct file *file, int datasync) { return 0; } diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index e7291c161a1..81349702443 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -26,9 +26,9 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, struct page **pagep, void **fsdata); static int jffs2_readpage (struct file *filp, struct page *pg); -int jffs2_fsync(struct file *filp, struct dentry *dentry, int datasync) +int jffs2_fsync(struct file *filp, int datasync) { - struct inode *inode = dentry->d_inode; + struct inode *inode = filp->f_mapping->host; struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); /* Trigger GC to flush any pending writes for this inode */ diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h index 035a767f958..4791aacf308 100644 --- a/fs/jffs2/os-linux.h +++ b/fs/jffs2/os-linux.h @@ -158,7 +158,7 @@ extern const struct inode_operations jffs2_dir_inode_operations; extern const struct file_operations jffs2_file_operations; extern const struct inode_operations jffs2_file_inode_operations; extern const struct address_space_operations jffs2_file_address_operations; -int jffs2_fsync(struct file *, struct dentry *, int); +int jffs2_fsync(struct file *, int); int jffs2_do_readpage_unlock (struct inode *inode, struct page *pg); /* ioctl.c */ diff --git a/fs/jfs/file.c b/fs/jfs/file.c index 85d9ec65922..127263cc865 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -27,9 +27,9 @@ #include "jfs_acl.h" #include "jfs_debug.h" -int jfs_fsync(struct file *file, struct dentry *dentry, int datasync) +int jfs_fsync(struct file *file, int datasync) { - struct inode *inode = dentry->d_inode; + struct inode *inode = file->f_mapping->host; int rc = 0; if (!(inode->i_state & I_DIRTY) || diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h index 9e6bda30a6e..11042b1f44b 100644 --- a/fs/jfs/jfs_inode.h +++ b/fs/jfs/jfs_inode.h @@ -21,7 +21,7 @@ struct fid; extern struct inode *ialloc(struct inode *, umode_t); -extern int jfs_fsync(struct file *, struct dentry *, int); +extern int jfs_fsync(struct file *, int); extern long jfs_ioctl(struct file *, unsigned int, unsigned long); extern long jfs_compat_ioctl(struct file *, unsigned int, unsigned long); extern struct inode *jfs_iget(struct super_block *, unsigned long); diff --git a/fs/libfs.c b/fs/libfs.c index 232bea425b0..e57ea58bda6 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -58,7 +58,7 @@ struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, struct na return NULL; } -int simple_sync_file(struct file * file, struct dentry *dentry, int datasync) +int simple_sync_file(struct file *file, int datasync) { return 0; } @@ -851,13 +851,13 @@ struct dentry 
*generic_fh_to_parent(struct super_block *sb, struct fid *fid, } EXPORT_SYMBOL_GPL(generic_fh_to_parent); -int simple_fsync(struct file *file, struct dentry *dentry, int datasync) +int simple_fsync(struct file *file, int datasync) { struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, .nr_to_write = 0, /* metadata-only; caller takes care of data */ }; - struct inode *inode = dentry->d_inode; + struct inode *inode = file->f_mapping->host; int err; int ret; diff --git a/fs/logfs/file.c b/fs/logfs/file.c index 0de52407187..abe1cafbd4c 100644 --- a/fs/logfs/file.c +++ b/fs/logfs/file.c @@ -219,9 +219,9 @@ int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd, } } -int logfs_fsync(struct file *file, struct dentry *dentry, int datasync) +int logfs_fsync(struct file *file, int datasync) { - struct super_block *sb = dentry->d_inode->i_sb; + struct super_block *sb = file->f_mapping->host->i_sb; logfs_write_anchor(sb); return 0; diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h index 1a9db84f8d8..c838c4d7211 100644 --- a/fs/logfs/logfs.h +++ b/fs/logfs/logfs.h @@ -506,7 +506,7 @@ extern const struct address_space_operations logfs_reg_aops; int logfs_readpage(struct file *file, struct page *page); int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg); -int logfs_fsync(struct file *file, struct dentry *dentry, int datasync); +int logfs_fsync(struct file *file, int datasync); /* gc.c */ u32 get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec); diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c index b9387089289..3639cc5cbda 100644 --- a/fs/ncpfs/file.c +++ b/fs/ncpfs/file.c @@ -22,7 +22,7 @@ #include #include "ncplib_kernel.h" -static int ncp_fsync(struct file *file, struct dentry *dentry, int datasync) +static int ncp_fsync(struct file *file, int datasync) { return 0; } diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index db64854b7b0..782b431ef91 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -53,7 +53,7 @@ static int nfs_link(struct dentry *, struct inode *, struct dentry *); static int nfs_mknod(struct inode *, struct dentry *, int, dev_t); static int nfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); -static int nfs_fsync_dir(struct file *, struct dentry *, int); +static int nfs_fsync_dir(struct file *, int); static loff_t nfs_llseek_dir(struct file *, loff_t, int); const struct file_operations nfs_dir_operations = { @@ -641,8 +641,10 @@ out: * All directory operations under NFS are synchronous, so fsync() * is a dummy operation. 
*/ -static int nfs_fsync_dir(struct file *filp, struct dentry *dentry, int datasync) +static int nfs_fsync_dir(struct file *filp, int datasync) { + struct dentry *dentry = filp->f_path.dentry; + dfprintk(FILE, "NFS: fsync dir(%s/%s) datasync %d\n", dentry->d_parent->d_name.name, dentry->d_name.name, datasync); diff --git a/fs/nfs/file.c b/fs/nfs/file.c index cac96bcc91e..36a5e74f51b 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -53,7 +53,7 @@ static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe, static ssize_t nfs_file_write(struct kiocb *, const struct iovec *iov, unsigned long nr_segs, loff_t pos); static int nfs_file_flush(struct file *, fl_owner_t id); -static int nfs_file_fsync(struct file *, struct dentry *dentry, int datasync); +static int nfs_file_fsync(struct file *, int datasync); static int nfs_check_flags(int flags); static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl); static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl); @@ -322,8 +322,9 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma) * whether any write errors occurred for this process. */ static int -nfs_file_fsync(struct file *file, struct dentry *dentry, int datasync) +nfs_file_fsync(struct file *file, int datasync) { + struct dentry *dentry = file->f_path.dentry; struct nfs_open_context *ctx = nfs_file_open_context(file); struct inode *inode = dentry->d_inode; diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index 30292df443c..c9a30d7ff6f 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c @@ -27,7 +27,7 @@ #include "nilfs.h" #include "segment.h" -int nilfs_sync_file(struct file *file, struct dentry *dentry, int datasync) +int nilfs_sync_file(struct file *file, int datasync) { /* * Called from fsync() system call @@ -37,7 +37,7 @@ int nilfs_sync_file(struct file *file, struct dentry *dentry, int datasync) * This function should be implemented when the writeback function * will be implemented. */ - struct inode *inode = dentry->d_inode; + struct inode *inode = file->f_mapping->host; int err; if (!nilfs_inode_dirty(inode)) diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index 8723e5bfd07..47d6d792812 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -228,7 +228,7 @@ extern void nilfs_set_link(struct inode *, struct nilfs_dir_entry *, struct page *, struct inode *); /* file.c */ -extern int nilfs_sync_file(struct file *, struct dentry *, int); +extern int nilfs_sync_file(struct file *, int); /* ioctl.c */ long nilfs_ioctl(struct file *, unsigned int, unsigned long); diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c index fe44d3feee4..0f48e7c5d9e 100644 --- a/fs/ntfs/dir.c +++ b/fs/ntfs/dir.c @@ -1527,10 +1527,9 @@ static int ntfs_dir_open(struct inode *vi, struct file *filp) * this problem for now. We do write the $BITMAP attribute if it is present * which is the important one for a directory so things are not too bad. 
*/ -static int ntfs_dir_fsync(struct file *filp, struct dentry *dentry, - int datasync) +static int ntfs_dir_fsync(struct file *filp, int datasync) { - struct inode *bmp_vi, *vi = dentry->d_inode; + struct inode *bmp_vi, *vi = filp->f_mapping->host; int err, ret; ntfs_attr na; diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index a1924a0d2ab..113ebd9f25a 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -2133,7 +2133,6 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, /** * ntfs_file_fsync - sync a file to disk * @filp: file to be synced - * @dentry: dentry describing the file to sync * @datasync: if non-zero only flush user data and not metadata * * Data integrity sync of a file to disk. Used for fsync, fdatasync, and msync @@ -2149,19 +2148,15 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, * Also, if @datasync is true, we do not wait on the inode to be written out * but we always wait on the page cache pages to be written out. * - * Note: In the past @filp could be NULL so we ignore it as we don't need it - * anyway. - * * Locking: Caller must hold i_mutex on the inode. * * TODO: We should probably also write all attribute/index inodes associated * with this inode but since we have no simple way of getting to them we ignore * this problem for now. */ -static int ntfs_file_fsync(struct file *filp, struct dentry *dentry, - int datasync) +static int ntfs_file_fsync(struct file *filp, int datasync) { - struct inode *vi = dentry->d_inode; + struct inode *vi = filp->f_mapping->host; int err, ret = 0; ntfs_debug("Entering for inode 0x%lx.", vi->i_ino); diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 97e54b9e654..1c6220a8e07 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -175,13 +175,12 @@ static int ocfs2_dir_release(struct inode *inode, struct file *file) return 0; } -static int ocfs2_sync_file(struct file *file, - struct dentry *dentry, - int datasync) +static int ocfs2_sync_file(struct file *file, int datasync) { int err = 0; journal_t *journal; - struct inode *inode = dentry->d_inode; + struct dentry *dentry = file->f_path.dentry; + struct inode *inode = file->f_mapping->host; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync, diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c index 4455fbe269a..198dabf1b2b 100644 --- a/fs/reiserfs/dir.c +++ b/fs/reiserfs/dir.c @@ -14,8 +14,7 @@ extern const struct reiserfs_key MIN_KEY; static int reiserfs_readdir(struct file *, void *, filldir_t); -static int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry, - int datasync); +static int reiserfs_dir_fsync(struct file *filp, int datasync); const struct file_operations reiserfs_dir_operations = { .llseek = generic_file_llseek, @@ -28,10 +27,9 @@ const struct file_operations reiserfs_dir_operations = { #endif }; -static int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry, - int datasync) +static int reiserfs_dir_fsync(struct file *filp, int datasync) { - struct inode *inode = dentry->d_inode; + struct inode *inode = filp->f_mapping->host; int err; reiserfs_write_lock(inode->i_sb); err = reiserfs_commit_for_inode(inode); diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index 9977df9f3a5..b82cdd8a45d 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -134,10 +134,9 @@ static void reiserfs_vfs_truncate_file(struct inode *inode) * be removed... 
*/ -static int reiserfs_sync_file(struct file *filp, - struct dentry *dentry, int datasync) +static int reiserfs_sync_file(struct file *filp, int datasync) { - struct inode *inode = dentry->d_inode; + struct inode *inode = filp->f_mapping->host; int err; int barrier_done; diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c index 84ecf0e43f9..8e187a0f94b 100644 --- a/fs/smbfs/file.c +++ b/fs/smbfs/file.c @@ -28,8 +28,9 @@ #include "proto.h" static int -smb_fsync(struct file *file, struct dentry * dentry, int datasync) +smb_fsync(struct file *file, int datasync) { + struct dentry *dentry = file->f_path.dentry; struct smb_sb_info *server = server_from_dentry(dentry); int result; diff --git a/fs/sync.c b/fs/sync.c index e8cbd415e50..c9f83f480ec 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -130,12 +130,10 @@ void emergency_sync(void) /* * Generic function to fsync a file. - * - * filp may be NULL if called via the msync of a vma. */ -int file_fsync(struct file *filp, struct dentry *dentry, int datasync) +int file_fsync(struct file *filp, int datasync) { - struct inode * inode = dentry->d_inode; + struct inode *inode = filp->f_mapping->host; struct super_block * sb; int ret, err; @@ -183,7 +181,7 @@ int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync) * livelocks in fsync_buffers_list(). */ mutex_lock(&mapping->host->i_mutex); - err = file->f_op->fsync(file, file->f_path.dentry, datasync); + err = file->f_op->fsync(file, datasync); if (!ret) ret = err; mutex_unlock(&mapping->host->i_mutex); diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 5692cf72b80..c726da68e6b 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1304,9 +1304,9 @@ static void *ubifs_follow_link(struct dentry *dentry, struct nameidata *nd) return NULL; } -int ubifs_fsync(struct file *file, struct dentry *dentry, int datasync) +int ubifs_fsync(struct file *file, int datasync) { - struct inode *inode = dentry->d_inode; + struct inode *inode = file->f_mapping->host; struct ubifs_info *c = inode->i_sb->s_fs_info; int err; diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index bd2542dad01..b0904536cc1 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -1678,7 +1678,7 @@ const struct ubifs_lprops *ubifs_fast_find_frdi_idx(struct ubifs_info *c); int ubifs_calc_dark(const struct ubifs_info *c, int spc); /* file.c */ -int ubifs_fsync(struct file *file, struct dentry *dentry, int datasync); +int ubifs_fsync(struct file *file, int datasync); int ubifs_setattr(struct dentry *dentry, struct iattr *attr); /* dir.c */ diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index d8fb1b5d6cb..257a56b127c 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -100,10 +100,10 @@ xfs_iozero( STATIC int xfs_file_fsync( struct file *file, - struct dentry *dentry, int datasync) { - struct xfs_inode *ip = XFS_I(dentry->d_inode); + struct inode *inode = file->f_mapping->host; + struct xfs_inode *ip = XFS_I(inode); struct xfs_trans *tp; int error = 0; int log_flushed = 0; @@ -140,8 +140,8 @@ xfs_file_fsync( * might gets cleared when the inode gets written out via the AIL * or xfs_iflush_cluster. 
*/ - if (((dentry->d_inode->i_state & I_DIRTY_DATASYNC) || - ((dentry->d_inode->i_state & I_DIRTY_SYNC) && !datasync)) && + if (((inode->i_state & I_DIRTY_DATASYNC) || + ((inode->i_state & I_DIRTY_SYNC) && !datasync)) && ip->i_update_core) { /* * Kick off a transaction to log the inode core to get the @@ -868,7 +868,7 @@ write_retry: mutex_lock(&inode->i_mutex); xfs_ilock(ip, iolock); - error2 = -xfs_file_fsync(file, file->f_path.dentry, + error2 = -xfs_file_fsync(file, (file->f_flags & __O_SYNC) ? 0 : 1); if (!error) error = error2; diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 16ed0284d78..05e5f599621 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -224,7 +224,7 @@ int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, void block_sync_page(struct page *); sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *); int block_truncate_page(struct address_space *, loff_t, get_block_t *); -int file_fsync(struct file *, struct dentry *, int); +int file_fsync(struct file *, int); int nobh_write_begin(struct file *, struct address_space *, loff_t, unsigned, unsigned, struct page **, void **, get_block_t*); diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h index 5f494b46509..7fc62d4550b 100644 --- a/include/linux/ext3_fs.h +++ b/include/linux/ext3_fs.h @@ -868,7 +868,7 @@ extern int ext3_htree_store_dirent(struct file *dir_file, __u32 hash, extern void ext3_htree_free_dir_info(struct dir_private_info *p); /* fsync.c */ -extern int ext3_sync_file (struct file *, struct dentry *, int); +extern int ext3_sync_file(struct file *, int); /* hash.c */ extern int ext3fs_dirhash(const char *name, int len, struct diff --git a/include/linux/fb.h b/include/linux/fb.h index f3793ebc241..907ace3a64c 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -4,8 +4,6 @@ #include #include -struct dentry; - /* Definitions of frame buffers */ #define FB_MAX 32 /* sufficient for now */ @@ -1017,8 +1015,7 @@ extern void fb_deferred_io_open(struct fb_info *info, struct inode *inode, struct file *file); extern void fb_deferred_io_cleanup(struct fb_info *info); -extern int fb_deferred_io_fsync(struct file *file, struct dentry *dentry, - int datasync); +extern int fb_deferred_io_fsync(struct file *file, int datasync); static inline bool fb_be_math(struct fb_info *info) { diff --git a/include/linux/fs.h b/include/linux/fs.h index 3d9ed830240..eb39e5eb77f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1498,7 +1498,7 @@ struct file_operations { int (*open) (struct inode *, struct file *); int (*flush) (struct file *, fl_owner_t id); int (*release) (struct inode *, struct file *); - int (*fsync) (struct file *, struct dentry *, int datasync); + int (*fsync) (struct file *, int datasync); int (*aio_fsync) (struct kiocb *, int datasync); int (*fasync) (int, struct file *, int); int (*lock) (struct file *, int, struct file_lock *); @@ -2213,7 +2213,7 @@ extern int generic_segment_checks(const struct iovec *iov, /* fs/block_dev.c */ extern ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos); -extern int blkdev_fsync(struct file *filp, struct dentry *dentry, int datasync); +extern int blkdev_fsync(struct file *filp, int datasync); /* fs/splice.c */ extern ssize_t generic_file_splice_read(struct file *, loff_t *, @@ -2348,7 +2348,7 @@ extern int simple_link(struct dentry *, struct inode *, struct dentry *); extern int simple_unlink(struct inode *, struct 
dentry *); extern int simple_rmdir(struct inode *, struct dentry *); extern int simple_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); -extern int simple_sync_file(struct file *, struct dentry *, int); +extern int simple_sync_file(struct file *, int); extern int simple_empty(struct dentry *); extern int simple_readpage(struct file *file, struct page *page); extern int simple_write_begin(struct file *file, struct address_space *mapping, @@ -2373,7 +2373,7 @@ extern ssize_t simple_read_from_buffer(void __user *to, size_t count, extern ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos, const void __user *from, size_t count); -extern int simple_fsync(struct file *, struct dentry *, int); +extern int simple_fsync(struct file *, int); #ifdef CONFIG_MIGRATION extern int buffer_migrate_page(struct address_space *, diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 5d60ad4ebf7..f5b1ba90e95 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -606,9 +606,9 @@ TRACE_EVENT(ext4_free_blocks, ); TRACE_EVENT(ext4_sync_file, - TP_PROTO(struct file *file, struct dentry *dentry, int datasync), + TP_PROTO(struct file *file, int datasync), - TP_ARGS(file, dentry, datasync), + TP_ARGS(file, datasync), TP_STRUCT__entry( __field( dev_t, dev ) @@ -618,6 +618,8 @@ TRACE_EVENT(ext4_sync_file, ), TP_fast_assign( + struct dentry *dentry = file->f_path.dentry; + __entry->dev = dentry->d_inode->i_sb->s_dev; __entry->ino = dentry->d_inode->i_ino; __entry->datasync = datasync; diff --git a/ipc/shm.c b/ipc/shm.c index 1a314c89f93..52ed77eb971 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -273,16 +273,13 @@ static int shm_release(struct inode *ino, struct file *file) return 0; } -static int shm_fsync(struct file *file, struct dentry *dentry, int datasync) +static int shm_fsync(struct file *file, int datasync) { - int (*fsync) (struct file *, struct dentry *, int datasync); struct shm_file_data *sfd = shm_file_data(file); - int ret = -EINVAL; - fsync = sfd->file->f_op->fsync; - if (fsync) - ret = fsync(sfd->file, sfd->file->f_path.dentry, datasync); - return ret; + if (!sfd->file->f_op->fsync) + return -EINVAL; + return sfd->file->f_op->fsync(sfd->file, datasync); } static unsigned long shm_get_unmapped_area(struct file *file, -- cgit v1.2.3 From 7bb46a6734a7e1ad4beaecc11cae7ed3ff81d30f Mon Sep 17 00:00:00 2001 From: "npiggin@suse.de" Date: Thu, 27 May 2010 01:05:33 +1000 Subject: fs: introduce new truncate sequence Introduce a new truncate calling sequence into fs/mm subsystems. Rather than setattr > vmtruncate > truncate, have filesystems call their truncate sequence from ->setattr if filesystem specific operations are required. vmtruncate is deprecated, and truncate_pagecache and inode_newsize_ok helpers introduced previously should be used. simple_setattr is introduced for simple in-ram filesystems to implement the new truncate sequence. Eventually all filesystems should be converted to implement a setattr, and the default code in notify_change should go away. simple_setsize is also introduced to perform just the ATTR_SIZE portion of simple_setattr (ie. changing i_size and trimming pagecache). To implement the new truncate sequence: - filesystem specific manipulations (eg freeing blocks) must be done in the setattr method rather than ->truncate. - vmtruncate can not be used by core code to trim blocks past i_size in the event of write failure after allocation, so this must be performed in the fs code. 
- convert usage of helpers block_write_begin, nobh_write_begin, cont_write_begin, and *blockdev_direct_IO* to use _newtrunc postfixed variants. These avoid calling vmtruncate to trim blocks (see previous). - inode_setattr should not be used. generic_setattr is a new function to be used to copy simple attributes into the generic inode. - make use of the better opportunity to handle errors with the new sequence. Big problem with the previous calling sequence: the filesystem is not called until i_size has already changed. This means it is not allowed to fail the call, and also it does not know what the previous i_size was. Also, generic code calling vmtruncate to truncate allocated blocks in case of error had no good way to return a meaningful error (or, for example, atomically handle block deallocation). Cc: Christoph Hellwig Acked-by: Jan Kara Signed-off-by: Nick Piggin Signed-off-by: Al Viro --- Documentation/filesystems/vfs.txt | 7 ++- fs/attr.c | 50 ++++++++++++---- fs/buffer.c | 123 ++++++++++++++++++++++++++++++-------- fs/direct-io.c | 61 ++++++++++++------- fs/libfs.c | 76 +++++++++++++++++++++++ include/linux/buffer_head.h | 9 +++ include/linux/fs.h | 27 ++++++++- mm/truncate.c | 10 ++-- 8 files changed, 300 insertions(+), 63 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index d4f5731dcbb..94677e7dcb1 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -401,11 +401,16 @@ otherwise noted. started might not be in the page cache at the end of the walk). - truncate: called by the VFS to change the size of a file. The + truncate: Deprecated. This will not be called if ->setsize is defined. + Called by the VFS to change the size of a file. The i_size field of the inode is set to the desired size by the VFS before this method is called. This method is called by the truncate(2) system call and related functionality. + Note: ->truncate and vmtruncate are deprecated. Do not add new + instances/calls of these. Filesystems should be converted to do their + truncate sequence via ->setattr(). + permission: called by the VFS to check for access rights on a POSIX-like filesystem. diff --git a/fs/attr.c b/fs/attr.c index 0815e93bb48..b4fa3b0aa59 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -67,14 +67,14 @@ EXPORT_SYMBOL(inode_change_ok); * @offset: the new size to assign to the inode * @Returns: 0 on success, -ve errno on failure * + * inode_newsize_ok must be called with i_mutex held. + * * inode_newsize_ok will check filesystem limits and ulimits to check that the * new inode size is within limits. inode_newsize_ok will also send SIGXFSZ * when necessary. Caller must not proceed with inode size change if failure is * returned. @inode must be a file (not directory), with appropriate * permissions to allow truncate (inode_newsize_ok does NOT check these * conditions). - * - * inode_newsize_ok must be called with i_mutex held. */ int inode_newsize_ok(const struct inode *inode, loff_t offset) { @@ -104,17 +104,25 @@ out_big: } EXPORT_SYMBOL(inode_newsize_ok); -int inode_setattr(struct inode * inode, struct iattr * attr) +/** + * generic_setattr - copy simple metadata updates into the generic inode + * @inode: the inode to be updated + * @attr: the new attributes + * + * generic_setattr must be called with i_mutex held. + * + * generic_setattr updates the inode's metadata with that specified + * in attr. 
Noticably missing is inode size update, which is more complex + * as it requires pagecache updates. See simple_setsize. + * + * The inode is not marked as dirty after this operation. The rationale is + * that for "simple" filesystems, the struct inode is the inode storage. + * The caller is free to mark the inode dirty afterwards if needed. + */ +void generic_setattr(struct inode *inode, const struct iattr *attr) { unsigned int ia_valid = attr->ia_valid; - if (ia_valid & ATTR_SIZE && - attr->ia_size != i_size_read(inode)) { - int error = vmtruncate(inode, attr->ia_size); - if (error) - return error; - } - if (ia_valid & ATTR_UID) inode->i_uid = attr->ia_uid; if (ia_valid & ATTR_GID) @@ -135,6 +143,28 @@ int inode_setattr(struct inode * inode, struct iattr * attr) mode &= ~S_ISGID; inode->i_mode = mode; } +} +EXPORT_SYMBOL(generic_setattr); + +/* + * note this function is deprecated, the new truncate sequence should be + * used instead -- see eg. simple_setsize, generic_setattr. + */ +int inode_setattr(struct inode *inode, const struct iattr *attr) +{ + unsigned int ia_valid = attr->ia_valid; + + if (ia_valid & ATTR_SIZE && + attr->ia_size != i_size_read(inode)) { + int error; + + error = vmtruncate(inode, attr->ia_size); + if (error) + return error; + } + + generic_setattr(inode, attr); + mark_inode_dirty(inode); return 0; diff --git a/fs/buffer.c b/fs/buffer.c index e8aa7081d25..d54812b198e 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1949,14 +1949,11 @@ static int __block_commit_write(struct inode *inode, struct page *page, } /* - * block_write_begin takes care of the basic task of block allocation and - * bringing partial write blocks uptodate first. - * - * If *pagep is not NULL, then block_write_begin uses the locked page - * at *pagep rather than allocating its own. In this case, the page will - * not be unlocked or deallocated on failure. + * Filesystems implementing the new truncate sequence should use the + * _newtrunc postfix variant which won't incorrectly call vmtruncate. + * The filesystem needs to handle block truncation upon failure. */ -int block_write_begin(struct file *file, struct address_space *mapping, +int block_write_begin_newtrunc(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata, get_block_t *get_block) @@ -1992,20 +1989,50 @@ int block_write_begin(struct file *file, struct address_space *mapping, unlock_page(page); page_cache_release(page); *pagep = NULL; - - /* - * prepare_write() may have instantiated a few blocks - * outside i_size. Trim these off again. Don't need - * i_size_read because we hold i_mutex. - */ - if (pos + len > inode->i_size) - vmtruncate(inode, inode->i_size); } } out: return status; } +EXPORT_SYMBOL(block_write_begin_newtrunc); + +/* + * block_write_begin takes care of the basic task of block allocation and + * bringing partial write blocks uptodate first. + * + * If *pagep is not NULL, then block_write_begin uses the locked page + * at *pagep rather than allocating its own. In this case, the page will + * not be unlocked or deallocated on failure. + */ +int block_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata, + get_block_t *get_block) +{ + int ret; + + ret = block_write_begin_newtrunc(file, mapping, pos, len, flags, + pagep, fsdata, get_block); + + /* + * prepare_write() may have instantiated a few blocks + * outside i_size. Trim these off again. 
Don't need + * i_size_read because we hold i_mutex. + * + * Filesystems which pass down their own page also cannot + * call into vmtruncate here because it would lead to lock + * inversion problems (*pagep is locked). This is a further + * example of where the old truncate sequence is inadequate. + */ + if (unlikely(ret) && *pagep == NULL) { + loff_t isize = mapping->host->i_size; + if (pos + len > isize) + vmtruncate(mapping->host, isize); + } + + return ret; +} EXPORT_SYMBOL(block_write_begin); int block_write_end(struct file *file, struct address_space *mapping, @@ -2324,7 +2351,7 @@ out: * For moronic filesystems that do not allow holes in file. * We may have to extend the file. */ -int cont_write_begin(struct file *file, struct address_space *mapping, +int cont_write_begin_newtrunc(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata, get_block_t *get_block, loff_t *bytes) @@ -2345,11 +2372,30 @@ int cont_write_begin(struct file *file, struct address_space *mapping, } *pagep = NULL; - err = block_write_begin(file, mapping, pos, len, + err = block_write_begin_newtrunc(file, mapping, pos, len, flags, pagep, fsdata, get_block); out: return err; } +EXPORT_SYMBOL(cont_write_begin_newtrunc); + +int cont_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata, + get_block_t *get_block, loff_t *bytes) +{ + int ret; + + ret = cont_write_begin_newtrunc(file, mapping, pos, len, flags, + pagep, fsdata, get_block, bytes); + if (unlikely(ret)) { + loff_t isize = mapping->host->i_size; + if (pos + len > isize) + vmtruncate(mapping->host, isize); + } + + return ret; +} EXPORT_SYMBOL(cont_write_begin); int block_prepare_write(struct page *page, unsigned from, unsigned to, @@ -2381,7 +2427,7 @@ EXPORT_SYMBOL(block_commit_write); * * We are not allowed to take the i_mutex here so we have to play games to * protect against truncate races as the page could now be beyond EOF. Because - * vmtruncate() writes the inode size before removing pages, once we have the + * truncate writes the inode size before removing pages, once we have the * page lock we can determine safely if the page is beyond EOF. If it is not * beyond EOF, then the page is guaranteed safe against truncation until we * unlock the page. @@ -2464,10 +2510,11 @@ static void attach_nobh_buffers(struct page *page, struct buffer_head *head) } /* - * On entry, the page is fully not uptodate. - * On exit the page is fully uptodate in the areas outside (from,to) + * Filesystems implementing the new truncate sequence should use the + * _newtrunc postfix variant which won't incorrectly call vmtruncate. + * The filesystem needs to handle block truncation upon failure. 
*/ -int nobh_write_begin(struct file *file, struct address_space *mapping, +int nobh_write_begin_newtrunc(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata, get_block_t *get_block) @@ -2500,8 +2547,8 @@ int nobh_write_begin(struct file *file, struct address_space *mapping, unlock_page(page); page_cache_release(page); *pagep = NULL; - return block_write_begin(file, mapping, pos, len, flags, pagep, - fsdata, get_block); + return block_write_begin_newtrunc(file, mapping, pos, len, + flags, pagep, fsdata, get_block); } if (PageMappedToDisk(page)) @@ -2605,8 +2652,34 @@ out_release: page_cache_release(page); *pagep = NULL; - if (pos + len > inode->i_size) - vmtruncate(inode, inode->i_size); + return ret; +} +EXPORT_SYMBOL(nobh_write_begin_newtrunc); + +/* + * On entry, the page is fully not uptodate. + * On exit the page is fully uptodate in the areas outside (from,to) + */ +int nobh_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata, + get_block_t *get_block) +{ + int ret; + + ret = nobh_write_begin_newtrunc(file, mapping, pos, len, flags, + pagep, fsdata, get_block); + + /* + * prepare_write() may have instantiated a few blocks + * outside i_size. Trim these off again. Don't need + * i_size_read because we hold i_mutex. + */ + if (unlikely(ret)) { + loff_t isize = mapping->host->i_size; + if (pos + len > isize) + vmtruncate(mapping->host, isize); + } return ret; } diff --git a/fs/direct-io.c b/fs/direct-io.c index da111aacb46..7600aacf531 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -1134,27 +1134,8 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, return ret; } -/* - * This is a library function for use by filesystem drivers. - * - * The locking rules are governed by the flags parameter: - * - if the flags value contains DIO_LOCKING we use a fancy locking - * scheme for dumb filesystems. - * For writes this function is called under i_mutex and returns with - * i_mutex held, for reads, i_mutex is not held on entry, but it is - * taken and dropped again before returning. - * For reads and writes i_alloc_sem is taken in shared mode and released - * on I/O completion (which may happen asynchronously after returning to - * the caller). - * - * - if the flags value does NOT contain DIO_LOCKING we don't use any - * internal locking but rather rely on the filesystem to synchronize - * direct I/O reads/writes versus each other and truncate. - * For reads and writes both i_mutex and i_alloc_sem are not held on - * entry and are never taken. - */ ssize_t -__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, +__blockdev_direct_IO_newtrunc(int rw, struct kiocb *iocb, struct inode *inode, struct block_device *bdev, const struct iovec *iov, loff_t offset, unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, dio_submit_t submit_io, int flags) @@ -1247,9 +1228,46 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, nr_segs, blkbits, get_block, end_io, submit_io, dio); +out: + return retval; +} +EXPORT_SYMBOL(__blockdev_direct_IO_newtrunc); + +/* + * This is a library function for use by filesystem drivers. + * + * The locking rules are governed by the flags parameter: + * - if the flags value contains DIO_LOCKING we use a fancy locking + * scheme for dumb filesystems. 
+ * For writes this function is called under i_mutex and returns with + * i_mutex held, for reads, i_mutex is not held on entry, but it is + * taken and dropped again before returning. + * For reads and writes i_alloc_sem is taken in shared mode and released + * on I/O completion (which may happen asynchronously after returning to + * the caller). + * + * - if the flags value does NOT contain DIO_LOCKING we don't use any + * internal locking but rather rely on the filesystem to synchronize + * direct I/O reads/writes versus each other and truncate. + * For reads and writes both i_mutex and i_alloc_sem are not held on + * entry and are never taken. + */ +ssize_t +__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, + struct block_device *bdev, const struct iovec *iov, loff_t offset, + unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, + dio_submit_t submit_io, int flags) +{ + ssize_t retval; + + retval = __blockdev_direct_IO_newtrunc(rw, iocb, inode, bdev, iov, + offset, nr_segs, get_block, end_io, submit_io, flags); /* * In case of error extending write may have instantiated a few * blocks outside i_size. Trim these off again for DIO_LOCKING. + * NOTE: DIO_NO_LOCK/DIO_OWN_LOCK callers have to handle this in + * their own manner. This is a further example of where the old + * truncate sequence is inadequate. * * NOTE: filesystems with their own locking have to handle this * on their own. @@ -1257,12 +1275,13 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, if (flags & DIO_LOCKING) { if (unlikely((rw & WRITE) && retval < 0)) { loff_t isize = i_size_read(inode); + loff_t end = offset + iov_length(iov, nr_segs); + if (end > isize) vmtruncate(inode, isize); } } -out: return retval; } EXPORT_SYMBOL(__blockdev_direct_IO); diff --git a/fs/libfs.c b/fs/libfs.c index b84d0a7a220..09e1016eb77 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -325,6 +326,81 @@ int simple_rename(struct inode *old_dir, struct dentry *old_dentry, return 0; } +/** + * simple_setsize - handle core mm and vfs requirements for file size change + * @inode: inode + * @newsize: new file size + * + * Returns 0 on success, -error on failure. + * + * simple_setsize must be called with inode_mutex held. + * + * simple_setsize will check that the requested new size is OK (see + * inode_newsize_ok), and then will perform the necessary i_size update + * and pagecache truncation (if necessary). It will be typically be called + * from the filesystem's setattr function when ATTR_SIZE is passed in. + * + * The inode itself must have correct permissions and attributes to allow + * i_size to be changed, this function then just checks that the new size + * requested is valid. + * + * In the case of simple in-memory filesystems with inodes stored solely + * in the inode cache, and file data in the pagecache, nothing more needs + * to be done to satisfy a truncate request. Filesystems with on-disk + * blocks for example will need to free them in the case of truncate, in + * that case it may be easier not to use simple_setsize (but each of its + * components will likely be required at some point to update pagecache + * and inode etc). 
+ */ +int simple_setsize(struct inode *inode, loff_t newsize) +{ + loff_t oldsize; + int error; + + error = inode_newsize_ok(inode, newsize); + if (error) + return error; + + oldsize = inode->i_size; + i_size_write(inode, newsize); + truncate_pagecache(inode, oldsize, newsize); + + return error; +} +EXPORT_SYMBOL(simple_setsize); + +/** + * simple_setattr - setattr for simple in-memory filesystem + * @dentry: dentry + * @iattr: iattr structure + * + * Returns 0 on success, -error on failure. + * + * simple_setattr implements setattr for an in-memory filesystem which + * does not store its own file data or metadata (eg. uses the page cache + * and inode cache as its data store). + */ +int simple_setattr(struct dentry *dentry, struct iattr *iattr) +{ + struct inode *inode = dentry->d_inode; + int error; + + error = inode_change_ok(inode, iattr); + if (error) + return error; + + if (iattr->ia_valid & ATTR_SIZE) { + error = simple_setsize(inode, iattr->ia_size); + if (error) + return error; + } + + generic_setattr(inode, iattr); + + return error; +} +EXPORT_SYMBOL(simple_setattr); + int simple_readpage(struct file *file, struct page *page) { clear_highpage(page); diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 05e5f599621..1b9ba193b78 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -203,6 +203,9 @@ int block_write_full_page_endio(struct page *page, get_block_t *get_block, int block_read_full_page(struct page*, get_block_t*); int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc, unsigned long from); +int block_write_begin_newtrunc(struct file *, struct address_space *, + loff_t, unsigned, unsigned, + struct page **, void **, get_block_t*); int block_write_begin(struct file *, struct address_space *, loff_t, unsigned, unsigned, struct page **, void **, get_block_t*); @@ -214,6 +217,9 @@ int generic_write_end(struct file *, struct address_space *, struct page *, void *); void page_zero_new_buffers(struct page *page, unsigned from, unsigned to); int block_prepare_write(struct page*, unsigned, unsigned, get_block_t*); +int cont_write_begin_newtrunc(struct file *, struct address_space *, loff_t, + unsigned, unsigned, struct page **, void **, + get_block_t *, loff_t *); int cont_write_begin(struct file *, struct address_space *, loff_t, unsigned, unsigned, struct page **, void **, get_block_t *, loff_t *); @@ -225,6 +231,9 @@ void block_sync_page(struct page *); sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *); int block_truncate_page(struct address_space *, loff_t, get_block_t *); int file_fsync(struct file *, int); +int nobh_write_begin_newtrunc(struct file *, struct address_space *, + loff_t, unsigned, unsigned, + struct page **, void **, get_block_t*); int nobh_write_begin(struct file *, struct address_space *, loff_t, unsigned, unsigned, struct page **, void **, get_block_t*); diff --git a/include/linux/fs.h b/include/linux/fs.h index acf6c52a50d..3428393942a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2257,6 +2257,10 @@ typedef void (dio_submit_t)(int rw, struct bio *bio, struct inode *inode, loff_t file_offset); void dio_end_io(struct bio *bio, int error); +ssize_t __blockdev_direct_IO_newtrunc(int rw, struct kiocb *iocb, struct inode *inode, + struct block_device *bdev, const struct iovec *iov, loff_t offset, + unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, + dio_submit_t submit_io, int lock_type); ssize_t __blockdev_direct_IO(int rw, struct kiocb 
*iocb, struct inode *inode, struct block_device *bdev, const struct iovec *iov, loff_t offset, unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, @@ -2270,6 +2274,24 @@ enum { DIO_SKIP_HOLES = 0x02, }; +static inline ssize_t blockdev_direct_IO_newtrunc(int rw, struct kiocb *iocb, + struct inode *inode, struct block_device *bdev, const struct iovec *iov, + loff_t offset, unsigned long nr_segs, get_block_t get_block, + dio_iodone_t end_io) +{ + return __blockdev_direct_IO_newtrunc(rw, iocb, inode, bdev, iov, offset, + nr_segs, get_block, end_io, NULL, + DIO_LOCKING | DIO_SKIP_HOLES); +} + +static inline ssize_t blockdev_direct_IO_no_locking_newtrunc(int rw, struct kiocb *iocb, + struct inode *inode, struct block_device *bdev, const struct iovec *iov, + loff_t offset, unsigned long nr_segs, get_block_t get_block, + dio_iodone_t end_io) +{ + return __blockdev_direct_IO_newtrunc(rw, iocb, inode, bdev, iov, offset, + nr_segs, get_block, end_io, NULL, 0); +} static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, struct block_device *bdev, const struct iovec *iov, loff_t offset, unsigned long nr_segs, get_block_t get_block, @@ -2342,12 +2364,14 @@ extern int dcache_dir_open(struct inode *, struct file *); extern int dcache_dir_close(struct inode *, struct file *); extern loff_t dcache_dir_lseek(struct file *, loff_t, int); extern int dcache_readdir(struct file *, void *, filldir_t); +extern int simple_setattr(struct dentry *, struct iattr *); extern int simple_getattr(struct vfsmount *, struct dentry *, struct kstat *); extern int simple_statfs(struct dentry *, struct kstatfs *); extern int simple_link(struct dentry *, struct inode *, struct dentry *); extern int simple_unlink(struct inode *, struct dentry *); extern int simple_rmdir(struct inode *, struct dentry *); extern int simple_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); +extern int simple_setsize(struct inode *, loff_t); extern int noop_fsync(struct file *, int); extern int simple_empty(struct dentry *); extern int simple_readpage(struct file *file, struct page *page); @@ -2384,7 +2408,8 @@ extern int buffer_migrate_page(struct address_space *, extern int inode_change_ok(const struct inode *, struct iattr *); extern int inode_newsize_ok(const struct inode *, loff_t offset); -extern int __must_check inode_setattr(struct inode *, struct iattr *); +extern int __must_check inode_setattr(struct inode *, const struct iattr *); +extern void generic_setattr(struct inode *inode, const struct iattr *attr); extern void file_update_time(struct file *file); diff --git a/mm/truncate.c b/mm/truncate.c index f42675a3615..937571b8b23 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -548,18 +548,18 @@ EXPORT_SYMBOL(truncate_pagecache); * NOTE! We have to be ready to update the memory sharing * between the file and the memory map for a potential last * incomplete page. Ugly, but necessary. + * + * This function is deprecated and simple_setsize or truncate_pagecache + * should be used instead. 
*/ int vmtruncate(struct inode *inode, loff_t offset) { - loff_t oldsize; int error; - error = inode_newsize_ok(inode, offset); + error = simple_setsize(inode, offset); if (error) return error; - oldsize = inode->i_size; - i_size_write(inode, offset); - truncate_pagecache(inode, oldsize, offset); + if (inode->i_op->truncate) inode->i_op->truncate(inode); -- cgit v1.2.3 From 99a4d54620264a614c89597bc5aaab22ec83f89c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 3 Jun 2010 16:22:29 +1000 Subject: xfs: remove done roadmap item from xfs-delayed-logging-design.txt Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner --- Documentation/filesystems/xfs-delayed-logging-design.txt | 5 ----- 1 file changed, 5 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/xfs-delayed-logging-design.txt b/Documentation/filesystems/xfs-delayed-logging-design.txt index d8119e9d2d6..96d0df28bed 100644 --- a/Documentation/filesystems/xfs-delayed-logging-design.txt +++ b/Documentation/filesystems/xfs-delayed-logging-design.txt @@ -794,11 +794,6 @@ designed. Roadmap: -2.6.35 Inclusion in mainline as an experimental mount option - => approximately 2-3 months to merge window - => needs to be in xfs-dev tree in 4-6 weeks - => code is nearing readiness for review - 2.6.37 Remove experimental tag from mount option => should be roughly 6 months after initial merge => enough time to: -- cgit v1.2.3 From 8cbccbe76168a0c627d2274e4a322116804db30f Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 2 Jun 2010 16:02:44 +0000 Subject: ipconfig: document DHCP hostname and DNS record Now it's possible to update the DNS record for $HOST_NAME with ip=::::$HOST_NAME::dhcp CC: Andi Kleen Signed-off-by: Wu Fengguang Signed-off-by: David S. Miller --- Documentation/filesystems/nfs/nfsroot.txt | 2 ++ 1 file changed, 2 insertions(+) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/nfs/nfsroot.txt b/Documentation/filesystems/nfs/nfsroot.txt index 3ba0b945aaf..f2430a7974e 100644 --- a/Documentation/filesystems/nfs/nfsroot.txt +++ b/Documentation/filesystems/nfs/nfsroot.txt @@ -124,6 +124,8 @@ ip=:::::: Name of the client. May be supplied by autoconfiguration, but its absence will not trigger autoconfiguration. + If specified and DHCP is used, the user provided hostname will + be carried in the DHCP request to hopefully update DNS record. Default: Client IP address is used in ASCII notation. -- cgit v1.2.3 From 8b8edefa2fffbff97f9eec8b70e78ae23abad1a0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 20 Jul 2010 22:09:01 +0200 Subject: fscache: convert object to use workqueue instead of slow-work Make fscache object state transition callbacks use workqueue instead of slow-work. New dedicated unbound CPU workqueue fscache_object_wq is created. get/put callbacks are renamed and modified to take @object and called directly from the enqueue wrapper and the work function. While at it, make all open coded instances of get/put to use fscache_get/put_object(). * Unbound workqueue is used. * work_busy() output is printed instead of slow-work flags in object debugging outputs. They mean basically the same thing bit-for-bit. * sysctl fscache.object_max_active added to control concurrency. The default value is nr_cpus clamped between 4 and WQ_UNBOUND_MAX_ACTIVE. * slow_work_sleep_till_thread_needed() is replaced with fscache private implementation fscache_object_sleep_till_congested() which waits on fscache_object_wq congestion. 
* debugfs support is dropped for now. Tracing API based debug facility is planned to be added. Signed-off-by: Tejun Heo Acked-by: David Howells --- Documentation/filesystems/caching/fscache.txt | 10 +-- fs/cachefiles/namei.c | 13 ++-- fs/fscache/internal.h | 7 ++ fs/fscache/main.c | 76 ++++++++++++++++++ fs/fscache/object-list.c | 11 ++- fs/fscache/object.c | 106 +++++++++++++------------- include/linux/fscache-cache.h | 9 ++- 7 files changed, 158 insertions(+), 74 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/caching/fscache.txt b/Documentation/filesystems/caching/fscache.txt index a91e2e2095b..770267af5b3 100644 --- a/Documentation/filesystems/caching/fscache.txt +++ b/Documentation/filesystems/caching/fscache.txt @@ -343,8 +343,8 @@ This will look something like: [root@andromeda ~]# head /proc/fs/fscache/objects OBJECT PARENT STAT CHLDN OPS OOP IPR EX READS EM EV F S | NETFS_COOKIE_DEF TY FL NETFS_DATA OBJECT_KEY, AUX_DATA ======== ======== ==== ===== === === === == ===== == == = = | ================ == == ================ ================ - 17e4b 2 ACTV 0 0 0 0 0 0 7b 4 0 8 | NFS.fh DT 0 ffff88001dd82820 010006017edcf8bbc93b43298fdfbe71e50b57b13a172c0117f38472, e567634700000000000000000000000063f2404a000000000000000000000000c9030000000000000000000063f2404a - 1693a 2 ACTV 0 0 0 0 0 0 7b 4 0 8 | NFS.fh DT 0 ffff88002db23380 010006017edcf8bbc93b43298fdfbe71e50b57b1e0162c01a2df0ea6, 420ebc4a000000000000000000000000420ebc4a0000000000000000000000000e1801000000000000000000420ebc4a + 17e4b 2 ACTV 0 0 0 0 0 0 7b 4 0 0 | NFS.fh DT 0 ffff88001dd82820 010006017edcf8bbc93b43298fdfbe71e50b57b13a172c0117f38472, e567634700000000000000000000000063f2404a000000000000000000000000c9030000000000000000000063f2404a + 1693a 2 ACTV 0 0 0 0 0 0 7b 4 0 0 | NFS.fh DT 0 ffff88002db23380 010006017edcf8bbc93b43298fdfbe71e50b57b1e0162c01a2df0ea6, 420ebc4a000000000000000000000000420ebc4a0000000000000000000000000e1801000000000000000000420ebc4a where the first set of columns before the '|' describe the object: @@ -362,7 +362,7 @@ where the first set of columns before the '|' describe the object: EM Object's event mask EV Events raised on this object F Object flags - S Object slow-work work item flags + S Object work item busy state mask (1:pending 2:running) and the second set of columns describe the object's cookie, if present: @@ -395,8 +395,8 @@ and the following paired letters: w Show objects that don't have pending writes R Show objects that have outstanding reads r Show objects that don't have outstanding reads - S Show objects that have slow work queued - s Show objects that don't have slow work queued + S Show objects that have work queued + s Show objects that don't have work queued If neither side of a letter pair is given, then both are implied. 
For example: diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index f4a7840bf42..42c7fafc8bf 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -37,9 +37,9 @@ void __cachefiles_printk_object(struct cachefiles_object *object, printk(KERN_ERR "%sobject: OBJ%x\n", prefix, object->fscache.debug_id); - printk(KERN_ERR "%sobjstate=%s fl=%lx swfl=%lx ev=%lx[%lx]\n", + printk(KERN_ERR "%sobjstate=%s fl=%lx wbusy=%x ev=%lx[%lx]\n", prefix, fscache_object_states[object->fscache.state], - object->fscache.flags, object->fscache.work.flags, + object->fscache.flags, work_busy(&object->fscache.work), object->fscache.events, object->fscache.event_mask & FSCACHE_OBJECT_EVENTS_MASK); printk(KERN_ERR "%sops=%u inp=%u exc=%u\n", @@ -212,7 +212,7 @@ wait_for_old_object: /* if the object we're waiting for is queued for processing, * then just put ourselves on the queue behind it */ - if (slow_work_is_queued(&xobject->fscache.work)) { + if (work_pending(&xobject->fscache.work)) { _debug("queue OBJ%x behind OBJ%x immediately", object->fscache.debug_id, xobject->fscache.debug_id); @@ -220,8 +220,7 @@ wait_for_old_object: } /* otherwise we sleep until either the object we're waiting for - * is done, or the slow-work facility wants the thread back to - * do other work */ + * is done, or the fscache_object is congested */ wq = bit_waitqueue(&xobject->flags, CACHEFILES_OBJECT_ACTIVE); init_wait(&wait); requeue = false; @@ -229,8 +228,8 @@ wait_for_old_object: prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE); if (!test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)) break; - requeue = slow_work_sleep_till_thread_needed( - &object->fscache.work, &timeout); + + requeue = fscache_object_sleep_till_congested(&timeout); } while (timeout > 0 && !requeue); finish_wait(wq, &wait); diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index edd7434ab6e..6e0b5fb2523 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -82,6 +82,13 @@ extern unsigned fscache_defer_lookup; extern unsigned fscache_defer_create; extern unsigned fscache_debug; extern struct kobject *fscache_root; +extern struct workqueue_struct *fscache_object_wq; +DECLARE_PER_CPU(wait_queue_head_t, fscache_object_cong_wait); + +static inline bool fscache_object_congested(void) +{ + return workqueue_congested(WORK_CPU_UNBOUND, fscache_object_wq); +} extern int fscache_wait_bit(void *); extern int fscache_wait_bit_interruptible(void *); diff --git a/fs/fscache/main.c b/fs/fscache/main.c index add6bdb53f0..bb8d4c35c7a 100644 --- a/fs/fscache/main.c +++ b/fs/fscache/main.c @@ -15,6 +15,7 @@ #include #include #include +#include #include "internal.h" MODULE_DESCRIPTION("FS Cache Manager"); @@ -40,22 +41,89 @@ MODULE_PARM_DESC(fscache_debug, "FS-Cache debugging mask"); struct kobject *fscache_root; +struct workqueue_struct *fscache_object_wq; + +DEFINE_PER_CPU(wait_queue_head_t, fscache_object_cong_wait); + +/* these values serve as lower bounds, will be adjusted in fscache_init() */ +static unsigned fscache_object_max_active = 4; + +#ifdef CONFIG_SYSCTL +static struct ctl_table_header *fscache_sysctl_header; + +static int fscache_max_active_sysctl(struct ctl_table *table, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + struct workqueue_struct **wqp = table->extra1; + unsigned int *datap = table->data; + int ret; + + ret = proc_dointvec(table, write, buffer, lenp, ppos); + if (ret == 0) + workqueue_set_max_active(*wqp, *datap); + return ret; +} + +ctl_table fscache_sysctls[] = { + { + .procname = 
"object_max_active", + .data = &fscache_object_max_active, + .maxlen = sizeof(unsigned), + .mode = 0644, + .proc_handler = fscache_max_active_sysctl, + .extra1 = &fscache_object_wq, + }, + {} +}; + +ctl_table fscache_sysctls_root[] = { + { + .procname = "fscache", + .mode = 0555, + .child = fscache_sysctls, + }, + {} +}; +#endif /* * initialise the fs caching module */ static int __init fscache_init(void) { + unsigned int nr_cpus = num_possible_cpus(); + unsigned int cpu; int ret; ret = slow_work_register_user(THIS_MODULE); if (ret < 0) goto error_slow_work; + fscache_object_max_active = + clamp_val(nr_cpus, + fscache_object_max_active, WQ_UNBOUND_MAX_ACTIVE); + + ret = -ENOMEM; + fscache_object_wq = alloc_workqueue("fscache_object", WQ_UNBOUND, + fscache_object_max_active); + if (!fscache_object_wq) + goto error_object_wq; + + for_each_possible_cpu(cpu) + init_waitqueue_head(&per_cpu(fscache_object_cong_wait, cpu)); + ret = fscache_proc_init(); if (ret < 0) goto error_proc; +#ifdef CONFIG_SYSCTL + ret = -ENOMEM; + fscache_sysctl_header = register_sysctl_table(fscache_sysctls_root); + if (!fscache_sysctl_header) + goto error_sysctl; +#endif + fscache_cookie_jar = kmem_cache_create("fscache_cookie_jar", sizeof(struct fscache_cookie), 0, @@ -78,8 +146,14 @@ static int __init fscache_init(void) error_kobj: kmem_cache_destroy(fscache_cookie_jar); error_cookie_jar: +#ifdef CONFIG_SYSCTL + unregister_sysctl_table(fscache_sysctl_header); +error_sysctl: +#endif fscache_proc_cleanup(); error_proc: + destroy_workqueue(fscache_object_wq); +error_object_wq: slow_work_unregister_user(THIS_MODULE); error_slow_work: return ret; @@ -96,7 +170,9 @@ static void __exit fscache_exit(void) kobject_put(fscache_root); kmem_cache_destroy(fscache_cookie_jar); + unregister_sysctl_table(fscache_sysctl_header); fscache_proc_cleanup(); + destroy_workqueue(fscache_object_wq); slow_work_unregister_user(THIS_MODULE); printk(KERN_NOTICE "FS-Cache: Unloaded\n"); } diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c index 4a8eb31c533..ebe29c58138 100644 --- a/fs/fscache/object-list.c +++ b/fs/fscache/object-list.c @@ -34,8 +34,8 @@ struct fscache_objlist_data { #define FSCACHE_OBJLIST_CONFIG_NOREADS 0x00000200 /* show objects without active reads */ #define FSCACHE_OBJLIST_CONFIG_EVENTS 0x00000400 /* show objects with events */ #define FSCACHE_OBJLIST_CONFIG_NOEVENTS 0x00000800 /* show objects without no events */ -#define FSCACHE_OBJLIST_CONFIG_WORK 0x00001000 /* show objects with slow work */ -#define FSCACHE_OBJLIST_CONFIG_NOWORK 0x00002000 /* show objects without slow work */ +#define FSCACHE_OBJLIST_CONFIG_WORK 0x00001000 /* show objects with work */ +#define FSCACHE_OBJLIST_CONFIG_NOWORK 0x00002000 /* show objects without work */ u8 buf[512]; /* key and aux data buffer */ }; @@ -231,12 +231,11 @@ static int fscache_objlist_show(struct seq_file *m, void *v) READS, NOREADS); FILTER(obj->events & obj->event_mask, EVENTS, NOEVENTS); - FILTER(obj->work.flags & ~(1UL << SLOW_WORK_VERY_SLOW), - WORK, NOWORK); + FILTER(work_busy(&obj->work), WORK, NOWORK); } seq_printf(m, - "%8x %8x %s %5u %3u %3u %3u %2u %5u %2lx %2lx %1lx %1lx | ", + "%8x %8x %s %5u %3u %3u %3u %2u %5u %2lx %2lx %1lx %1x | ", obj->debug_id, obj->parent ? 
obj->parent->debug_id : -1, fscache_object_states_short[obj->state], @@ -249,7 +248,7 @@ static int fscache_objlist_show(struct seq_file *m, void *v) obj->event_mask & FSCACHE_OBJECT_EVENTS_MASK, obj->events, obj->flags, - obj->work.flags); + work_busy(&obj->work)); no_cookie = true; keylen = auxlen = 0; diff --git a/fs/fscache/object.c b/fs/fscache/object.c index 0b589a9b4ff..b6b897c550a 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -14,7 +14,6 @@ #define FSCACHE_DEBUG_LEVEL COOKIE #include -#include #include "internal.h" const char *fscache_object_states[FSCACHE_OBJECT__NSTATES] = { @@ -50,12 +49,8 @@ const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5] = { [FSCACHE_OBJECT_DEAD] = "DEAD", }; -static void fscache_object_slow_work_put_ref(struct slow_work *); -static int fscache_object_slow_work_get_ref(struct slow_work *); -static void fscache_object_slow_work_execute(struct slow_work *); -#ifdef CONFIG_SLOW_WORK_DEBUG -static void fscache_object_slow_work_desc(struct slow_work *, struct seq_file *); -#endif +static int fscache_get_object(struct fscache_object *); +static void fscache_put_object(struct fscache_object *); static void fscache_initialise_object(struct fscache_object *); static void fscache_lookup_object(struct fscache_object *); static void fscache_object_available(struct fscache_object *); @@ -64,17 +59,6 @@ static void fscache_withdraw_object(struct fscache_object *); static void fscache_enqueue_dependents(struct fscache_object *); static void fscache_dequeue_object(struct fscache_object *); -const struct slow_work_ops fscache_object_slow_work_ops = { - .owner = THIS_MODULE, - .get_ref = fscache_object_slow_work_get_ref, - .put_ref = fscache_object_slow_work_put_ref, - .execute = fscache_object_slow_work_execute, -#ifdef CONFIG_SLOW_WORK_DEBUG - .desc = fscache_object_slow_work_desc, -#endif -}; -EXPORT_SYMBOL(fscache_object_slow_work_ops); - /* * we need to notify the parent when an op completes that we had outstanding * upon it @@ -345,7 +329,7 @@ unsupported_event: /* * execute an object */ -static void fscache_object_slow_work_execute(struct slow_work *work) +void fscache_object_work_func(struct work_struct *work) { struct fscache_object *object = container_of(work, struct fscache_object, work); @@ -359,23 +343,9 @@ static void fscache_object_slow_work_execute(struct slow_work *work) if (object->events & object->event_mask) fscache_enqueue_object(object); clear_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events); + fscache_put_object(object); } - -/* - * describe an object for slow-work debugging - */ -#ifdef CONFIG_SLOW_WORK_DEBUG -static void fscache_object_slow_work_desc(struct slow_work *work, - struct seq_file *m) -{ - struct fscache_object *object = - container_of(work, struct fscache_object, work); - - seq_printf(m, "FSC: OBJ%x: %s", - object->debug_id, - fscache_object_states_short[object->state]); -} -#endif +EXPORT_SYMBOL(fscache_object_work_func); /* * initialise an object @@ -393,7 +363,6 @@ static void fscache_initialise_object(struct fscache_object *object) _enter(""); ASSERT(object->cookie != NULL); ASSERT(object->cookie->parent != NULL); - ASSERT(list_empty(&object->work.link)); if (object->events & ((1 << FSCACHE_OBJECT_EV_ERROR) | (1 << FSCACHE_OBJECT_EV_RELEASE) | @@ -671,10 +640,8 @@ static void fscache_drop_object(struct fscache_object *object) object->parent = NULL; } - /* this just shifts the object release to the slow work processor */ - fscache_stat(&fscache_n_cop_put_object); - object->cache->ops->put_object(object); 
- fscache_stat_d(&fscache_n_cop_put_object); + /* this just shifts the object release to the work processor */ + fscache_put_object(object); _leave(""); } @@ -758,12 +725,10 @@ void fscache_withdrawing_object(struct fscache_cache *cache, } /* - * allow the slow work item processor to get a ref on an object + * get a ref on an object */ -static int fscache_object_slow_work_get_ref(struct slow_work *work) +static int fscache_get_object(struct fscache_object *object) { - struct fscache_object *object = - container_of(work, struct fscache_object, work); int ret; fscache_stat(&fscache_n_cop_grab_object); @@ -773,13 +738,10 @@ static int fscache_object_slow_work_get_ref(struct slow_work *work) } /* - * allow the slow work item processor to discard a ref on a work item + * discard a ref on a work item */ -static void fscache_object_slow_work_put_ref(struct slow_work *work) +static void fscache_put_object(struct fscache_object *object) { - struct fscache_object *object = - container_of(work, struct fscache_object, work); - fscache_stat(&fscache_n_cop_put_object); object->cache->ops->put_object(object); fscache_stat_d(&fscache_n_cop_put_object); @@ -792,8 +754,48 @@ void fscache_enqueue_object(struct fscache_object *object) { _enter("{OBJ%x}", object->debug_id); - slow_work_enqueue(&object->work); + if (fscache_get_object(object) >= 0) { + wait_queue_head_t *cong_wq = + &get_cpu_var(fscache_object_cong_wait); + + if (queue_work(fscache_object_wq, &object->work)) { + if (fscache_object_congested()) + wake_up(cong_wq); + } else + fscache_put_object(object); + + put_cpu_var(fscache_object_cong_wait); + } +} + +/** + * fscache_object_sleep_till_congested - Sleep until object wq is congested + * @timoutp: Scheduler sleep timeout + * + * Allow an object handler to sleep until the object workqueue is congested. + * + * The caller must set up a wake up event before calling this and must have set + * the appropriate sleep mode (such as TASK_UNINTERRUPTIBLE) and tested its own + * condition before calling this function as no test is made here. + * + * %true is returned if the object wq is congested, %false otherwise. 
+ */ +bool fscache_object_sleep_till_congested(signed long *timeoutp) +{ + wait_queue_head_t *cong_wq = &__get_cpu_var(fscache_object_cong_wait); + DEFINE_WAIT(wait); + + if (fscache_object_congested()) + return true; + + add_wait_queue_exclusive(cong_wq, &wait); + if (!fscache_object_congested()) + *timeoutp = schedule_timeout(*timeoutp); + finish_wait(cong_wq, &wait); + + return fscache_object_congested(); } +EXPORT_SYMBOL_GPL(fscache_object_sleep_till_congested); /* * enqueue the dependents of an object for metadata-type processing @@ -819,9 +821,7 @@ static void fscache_enqueue_dependents(struct fscache_object *object) /* sort onto appropriate lists */ fscache_enqueue_object(dep); - fscache_stat(&fscache_n_cop_put_object); - dep->cache->ops->put_object(dep); - fscache_stat_d(&fscache_n_cop_put_object); + fscache_put_object(dep); if (!list_empty(&object->dependents)) cond_resched_lock(&object->lock); diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h index c57db27ac86..27c8df50315 100644 --- a/include/linux/fscache-cache.h +++ b/include/linux/fscache-cache.h @@ -21,6 +21,7 @@ #include #include #include +#include #define NR_MAXCACHES BITS_PER_LONG @@ -389,7 +390,7 @@ struct fscache_object { struct fscache_cache *cache; /* cache that supplied this object */ struct fscache_cookie *cookie; /* netfs's file/index object */ struct fscache_object *parent; /* parent object */ - struct slow_work work; /* attention scheduling record */ + struct work_struct work; /* attention scheduling record */ struct list_head dependents; /* FIFO of dependent objects */ struct list_head dep_link; /* link in parent's dependents list */ struct list_head pending_ops; /* unstarted operations on this object */ @@ -411,7 +412,7 @@ extern const char *fscache_object_states[]; (test_bit(FSCACHE_IOERROR, &(obj)->cache->flags) && \ (obj)->state >= FSCACHE_OBJECT_DYING) -extern const struct slow_work_ops fscache_object_slow_work_ops; +extern void fscache_object_work_func(struct work_struct *work); /** * fscache_object_init - Initialise a cache object description @@ -433,7 +434,7 @@ void fscache_object_init(struct fscache_object *object, spin_lock_init(&object->lock); INIT_LIST_HEAD(&object->cache_link); INIT_HLIST_NODE(&object->cookie_link); - vslow_work_init(&object->work, &fscache_object_slow_work_ops); + INIT_WORK(&object->work, fscache_object_work_func); INIT_LIST_HEAD(&object->dependents); INIT_LIST_HEAD(&object->dep_link); INIT_LIST_HEAD(&object->pending_ops); @@ -534,6 +535,8 @@ extern void fscache_io_error(struct fscache_cache *cache); extern void fscache_mark_pages_cached(struct fscache_retrieval *op, struct pagevec *pagevec); +extern bool fscache_object_sleep_till_congested(signed long *timeoutp); + extern enum fscache_checkaux fscache_check_aux(struct fscache_object *object, const void *data, uint16_t datalen); -- cgit v1.2.3 From 773bc4f3b6898634a80a41c72a1f34cb89992dcd Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Mon, 5 Jul 2010 13:00:08 +0900 Subject: nilfs2: add barrier mount option Nilfs enables write barriers by default and has "nobarrier" mount option to disable this feature. But it lacks the complementary option and has no way to re-enable the feature on remount. This adds "barrier" option to resolve this imbalance. 
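For context, a hypothetical usage sketch of the complementary option pair (the device and mount point names below are only illustrative):

    # mount -t nilfs2 -o nobarrier /dev/sdb1 /mnt/nilfs
    # mount -o remount,barrier /mnt/nilfs

With the new "barrier" token, the second command re-enables write barriers on an already mounted filesystem, which the remount path could not do before.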
Signed-off-by: Ryusuke Konishi --- Documentation/filesystems/nilfs2.txt | 5 ++++- fs/nilfs2/super.c | 6 +++++- 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/nilfs2.txt b/Documentation/filesystems/nilfs2.txt index d3e7673995e..54f61c0ff44 100644 --- a/Documentation/filesystems/nilfs2.txt +++ b/Documentation/filesystems/nilfs2.txt @@ -49,7 +49,10 @@ Mount options NILFS2 supports the following mount options: (*) == default -nobarrier Disables barriers. +barrier(*) This enables/disables the use of write barriers. This +nobarrier requires an IO stack which can support barriers, and + if nilfs gets an error on a barrier write, it will + disable again with a warning. errors=continue Keep going on a filesystem error. errors=remount-ro(*) Remount the filesystem read-only on an error. errors=panic Panic and halt the machine if an error occurs. diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index f2cfbbab234..13b0e955c02 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -603,7 +603,7 @@ static const struct export_operations nilfs_export_ops = { enum { Opt_err_cont, Opt_err_panic, Opt_err_ro, - Opt_nobarrier, Opt_snapshot, Opt_order, Opt_norecovery, + Opt_barrier, Opt_nobarrier, Opt_snapshot, Opt_order, Opt_norecovery, Opt_discard, Opt_err, }; @@ -611,6 +611,7 @@ static match_table_t tokens = { {Opt_err_cont, "errors=continue"}, {Opt_err_panic, "errors=panic"}, {Opt_err_ro, "errors=remount-ro"}, + {Opt_barrier, "barrier"}, {Opt_nobarrier, "nobarrier"}, {Opt_snapshot, "cp=%u"}, {Opt_order, "order=%s"}, @@ -636,6 +637,9 @@ static int parse_options(char *options, struct super_block *sb) token = match_token(p, tokens, args); switch (token) { + case Opt_barrier: + nilfs_set_opt(sbi, BARRIER); + break; case Opt_nobarrier: nilfs_clear_opt(sbi, BARRIER); break; -- cgit v1.2.3 From 802d31775404ee335ca1e97a82e1e706a4c843be Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Mon, 5 Jul 2010 14:27:04 +0900 Subject: nilfs2: add nodiscard mount option Nilfs has "discard" mount option which issues discard/TRIM commands to underlying block device, but it lacks a complementary option and has no way to disable the feature through remount. This adds "nodiscard" option to resolve this imbalance. Signed-off-by: Ryusuke Konishi --- Documentation/filesystems/nilfs2.txt | 7 ++++--- fs/nilfs2/super.c | 6 +++++- 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/nilfs2.txt b/Documentation/filesystems/nilfs2.txt index 54f61c0ff44..d5c0cef38a7 100644 --- a/Documentation/filesystems/nilfs2.txt +++ b/Documentation/filesystems/nilfs2.txt @@ -77,9 +77,10 @@ norecovery Disable recovery of the filesystem on mount. This disables every write access on the device for read-only mounts or snapshots. This option will fail for r/w mounts on an unclean volume. -discard Issue discard/TRIM commands to the underlying block - device when blocks are freed. This is useful for SSD - devices and sparse/thinly-provisioned LUNs. +discard This enables/disables the use of discard/TRIM commands. +nodiscard(*) The discard/TRIM commands are sent to the underlying + block device when blocks are freed. This is useful + for SSD devices and sparse/thinly-provisioned LUNs. 
NILFS2 usage ============ diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 13b0e955c02..9da12211aac 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -604,7 +604,7 @@ static const struct export_operations nilfs_export_ops = { enum { Opt_err_cont, Opt_err_panic, Opt_err_ro, Opt_barrier, Opt_nobarrier, Opt_snapshot, Opt_order, Opt_norecovery, - Opt_discard, Opt_err, + Opt_discard, Opt_nodiscard, Opt_err, }; static match_table_t tokens = { @@ -617,6 +617,7 @@ static match_table_t tokens = { {Opt_order, "order=%s"}, {Opt_norecovery, "norecovery"}, {Opt_discard, "discard"}, + {Opt_nodiscard, "nodiscard"}, {Opt_err, NULL} }; @@ -676,6 +677,9 @@ static int parse_options(char *options, struct super_block *sb) case Opt_discard: nilfs_set_opt(sbi, DISCARD); break; + case Opt_nodiscard: + nilfs_clear_opt(sbi, DISCARD); + break; default: printk(KERN_ERR "NILFS: Unrecognized mount option \"%s\"\n", p); -- cgit v1.2.3 From a64afb057b607c04383ab5fb53c51421ba18c434 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 20 Jul 2010 17:50:52 +1000 Subject: xfs: remove obsolete osyncisosync mount option Since Linux 2.6.33 the kernel has support for real O_SYNC, which made the osyncisosync option a no-op. Warn the users about this and remove the mount flag for it. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- Documentation/filesystems/xfs.txt | 11 ----------- fs/xfs/linux-2.6/xfs_super.c | 10 ++++------ fs/xfs/xfs_mount.h | 2 -- 3 files changed, 4 insertions(+), 19 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/xfs.txt b/Documentation/filesystems/xfs.txt index 9878f50d6ed..7bff3e4f35d 100644 --- a/Documentation/filesystems/xfs.txt +++ b/Documentation/filesystems/xfs.txt @@ -131,17 +131,6 @@ When mounting an XFS filesystem, the following options are accepted. Don't check for double mounted file systems using the file system uuid. This is useful to mount LVM snapshot volumes. - osyncisosync - Make O_SYNC writes implement true O_SYNC. WITHOUT this option, - Linux XFS behaves as if an "osyncisdsync" option is used, - which will make writes to files opened with the O_SYNC flag set - behave as if the O_DSYNC flag had been used instead. - This can result in better performance without compromising - data safety. - However if this option is not in effect, timestamp updates from - O_SYNC writes can be lost if the system crashes. - If timestamp updates are critical, use the osyncisosync option. - uquota/usrquota/uqnoenforce/quota User disk quota accounting enabled, and limits (optionally) enforced. Refer to xfs_quota(8) for further details. diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 8c4f4476e5c..758df94690e 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -90,7 +90,6 @@ mempool_t *xfs_ioend_pool; #define MNTOPT_BARRIER "barrier" /* use writer barriers for log write and * unwritten extent conversion */ #define MNTOPT_NOBARRIER "nobarrier" /* .. 
disable */ -#define MNTOPT_OSYNCISOSYNC "osyncisosync" /* o_sync is REALLY o_sync */ #define MNTOPT_64BITINODE "inode64" /* inodes can be allocated anywhere */ #define MNTOPT_IKEEP "ikeep" /* do not free empty inode clusters */ #define MNTOPT_NOIKEEP "noikeep" /* free empty inode clusters */ @@ -274,8 +273,6 @@ xfs_parseargs( mp->m_flags &= ~XFS_MOUNT_GRPID; } else if (!strcmp(this_char, MNTOPT_WSYNC)) { mp->m_flags |= XFS_MOUNT_WSYNC; - } else if (!strcmp(this_char, MNTOPT_OSYNCISOSYNC)) { - mp->m_flags |= XFS_MOUNT_OSYNCISOSYNC; } else if (!strcmp(this_char, MNTOPT_NORECOVERY)) { mp->m_flags |= XFS_MOUNT_NORECOVERY; } else if (!strcmp(this_char, MNTOPT_NOALIGN)) { @@ -366,9 +363,11 @@ xfs_parseargs( cmn_err(CE_WARN, "XFS: ihashsize no longer used, option is deprecated."); } else if (!strcmp(this_char, "osyncisdsync")) { - /* no-op, this is now the default */ cmn_err(CE_WARN, - "XFS: osyncisdsync is now the default, option is deprecated."); + "XFS: osyncisdsync has no effect, option is deprecated."); + } else if (!strcmp(this_char, "osyncisosync")) { + cmn_err(CE_WARN, + "XFS: osyncisosync has no effect, option is deprecated."); } else if (!strcmp(this_char, "irixsgid")) { cmn_err(CE_WARN, "XFS: irixsgid is now a sysctl(2) variable, option is deprecated."); @@ -500,7 +499,6 @@ xfs_showargs( { XFS_MOUNT_SWALLOC, "," MNTOPT_SWALLOC }, { XFS_MOUNT_NOUUID, "," MNTOPT_NOUUID }, { XFS_MOUNT_NORECOVERY, "," MNTOPT_NORECOVERY }, - { XFS_MOUNT_OSYNCISOSYNC, "," MNTOPT_OSYNCISOSYNC }, { XFS_MOUNT_ATTR2, "," MNTOPT_ATTR2 }, { XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM }, { XFS_MOUNT_GRPID, "," MNTOPT_GRPID }, diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index e70dc39394a..622da2179a5 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -220,8 +220,6 @@ typedef struct xfs_mount { #define XFS_MOUNT_GRPID (1ULL << 9) /* group-ID assigned from directory */ #define XFS_MOUNT_NORECOVERY (1ULL << 10) /* no recovery - dirty fs */ #define XFS_MOUNT_DFLT_IOSIZE (1ULL << 12) /* set default i/o size */ -#define XFS_MOUNT_OSYNCISOSYNC (1ULL << 13) /* o_sync is REALLY o_sync */ - /* osyncisdsync is now default*/ #define XFS_MOUNT_32BITINODES (1ULL << 14) /* do not create inodes above * 32 bits in size */ #define XFS_MOUNT_SMALL_INUMS (1ULL << 15) /* users wants 32bit inodes */ -- cgit v1.2.3 From 8633328be242677fdedc42052838dd0608e7f342 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Mon, 19 Jul 2010 09:45:34 -0600 Subject: PCI: Allow read/write access to sysfs I/O port resources PCI sysfs resource files currently only allow mmap'ing. On x86 this works fine for memory backed BARs, but doesn't work at all for I/O port backed BARs. Add read/write to I/O port PCI sysfs resource files to allow userspace access to these device regions. Acked-by: Chris Wright Signed-off-by: Alex Williamson Signed-off-by: Jesse Barnes --- Documentation/filesystems/sysfs-pci.txt | 7 +++- drivers/pci/pci-sysfs.c | 68 +++++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+), 2 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/sysfs-pci.txt b/Documentation/filesystems/sysfs-pci.txt index 85354b32d73..74eaac26f8b 100644 --- a/Documentation/filesystems/sysfs-pci.txt +++ b/Documentation/filesystems/sysfs-pci.txt @@ -39,7 +39,7 @@ files, each with their own function. 
local_cpus nearby CPU mask (cpumask, ro) remove remove device from kernel's list (ascii, wo) resource PCI resource host addresses (ascii, ro) - resource0..N PCI resource N, if present (binary, mmap) + resource0..N PCI resource N, if present (binary, mmap, rw[1]) resource0_wc..N_wc PCI WC map resource N, if prefetchable (binary, mmap) rom PCI ROM resource, if present (binary, ro) subsystem_device PCI subsystem device (ascii, ro) @@ -54,13 +54,16 @@ files, each with their own function. binary - file contains binary data cpumask - file contains a cpumask type +[1] rw for RESOURCE_IO (I/O port) regions only + The read only files are informational, writes to them will be ignored, with the exception of the 'rom' file. Writable files can be used to perform actions on the device (e.g. changing config space, detaching a device). mmapable files are available via an mmap of the file at offset 0 and can be used to do actual device programming from userspace. Note that some platforms don't support mmapping of certain resources, so be sure to check the return -value from any attempted mmap. +value from any attempted mmap. The most notable of these are I/O port +resources, which also provide read/write access. The 'enable' file provides a counter that indicates how many times the device has been enabled. If the 'enable' file currently returns '4', and a '1' is diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index 5935b854917..f7692dc531e 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -778,6 +778,70 @@ pci_mmap_resource_wc(struct file *filp, struct kobject *kobj, return pci_mmap_resource(kobj, attr, vma, 1); } +static ssize_t +pci_resource_io(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, char *buf, + loff_t off, size_t count, bool write) +{ + struct pci_dev *pdev = to_pci_dev(container_of(kobj, + struct device, kobj)); + struct resource *res = attr->private; + unsigned long port = off; + int i; + + for (i = 0; i < PCI_ROM_RESOURCE; i++) + if (res == &pdev->resource[i]) + break; + if (i >= PCI_ROM_RESOURCE) + return -ENODEV; + + port += pci_resource_start(pdev, i); + + if (port > pci_resource_end(pdev, i)) + return 0; + + if (port + count - 1 > pci_resource_end(pdev, i)) + return -EINVAL; + + switch (count) { + case 1: + if (write) + outb(*(u8 *)buf, port); + else + *(u8 *)buf = inb(port); + return 1; + case 2: + if (write) + outw(*(u16 *)buf, port); + else + *(u16 *)buf = inw(port); + return 2; + case 4: + if (write) + outl(*(u32 *)buf, port); + else + *(u32 *)buf = inl(port); + return 4; + } + return -EINVAL; +} + +static ssize_t +pci_read_resource_io(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, char *buf, + loff_t off, size_t count) +{ + return pci_resource_io(filp, kobj, attr, buf, off, count, false); +} + +static ssize_t +pci_write_resource_io(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, char *buf, + loff_t off, size_t count) +{ + return pci_resource_io(filp, kobj, attr, buf, off, count, true); +} + /** * pci_remove_resource_files - cleanup resource files * @pdev: dev to cleanup @@ -828,6 +892,10 @@ static int pci_create_attr(struct pci_dev *pdev, int num, int write_combine) sprintf(res_attr_name, "resource%d", num); res_attr->mmap = pci_mmap_resource_uc; } + if (pci_resource_flags(pdev, num) & IORESOURCE_IO) { + res_attr->read = pci_read_resource_io; + res_attr->write = pci_write_resource_io; + } res_attr->attr.name = res_attr_name; res_attr->attr.mode = S_IRUSR | S_IWUSR; res_attr->size 
= pci_resource_len(pdev, num); -- cgit v1.2.3 From 0ea6e61122196509af82cc4f36cbdaacbefb8227 Mon Sep 17 00:00:00 2001 From: "Justin P. Mattock" Date: Fri, 23 Jul 2010 20:51:24 -0700 Subject: Documentation: update broken web addresses. Below you will find an updated version from the original series bunching all patches into one big patch updating broken web addresses that are located in Documentation/* Some of the addresses date as far far back as 1995 etc... so searching became a bit difficult, the best way to deal with these is to use web.archive.org to locate these addresses that are outdated. Now there are also some addresses pointing to .spec files some are located, but some(after searching on the companies site)where still no where to be found. In this case I just changed the address to the company site this way the users can contact the company and they can locate them for the users. Signed-off-by: Justin P. Mattock Signed-off-by: Thomas Weber Signed-off-by: Mike Frysinger Cc: Paulo Marques Cc: Randy Dunlap Cc: Michael Neuling Signed-off-by: Jiri Kosina --- Documentation/ABI/testing/sysfs-devices-memory | 6 +- Documentation/ABI/testing/sysfs-devices-system-cpu | 2 +- Documentation/DocBook/scsi.tmpl | 2 +- Documentation/DocBook/v4l/compat.xml | 5 +- Documentation/DocBook/v4l/fdl-appendix.xml | 2 +- Documentation/HOWTO | 6 +- Documentation/RCU/RTFP.txt | 2 +- Documentation/SubmittingDrivers | 2 +- Documentation/aoe/aoe.txt | 2 +- Documentation/arm/IXP2000 | 2 +- Documentation/arm/IXP4xx | 14 ++--- Documentation/arm/README | 4 +- Documentation/arm/SA1100/Assabet | 5 +- Documentation/arm/SA1100/Brutus | 2 +- Documentation/arm/SA1100/FreeBird | 2 +- Documentation/arm/SA1100/GraphicsClient | 2 +- Documentation/arm/SA1100/GraphicsMaster | 2 +- Documentation/arm/SA1100/Itsy | 2 +- Documentation/arm/SA1100/PLEB | 2 +- Documentation/arm/SA1100/Victor | 2 +- Documentation/arm/SA1100/nanoEngine | 2 +- Documentation/binfmt_misc.txt | 2 +- Documentation/blockdev/paride.txt | 2 +- Documentation/cdrom/packet-writing.txt | 2 +- Documentation/cgroups/cpusets.txt | 2 +- Documentation/development-process/4.Coding | 2 +- Documentation/device-mapper/dm-crypt.txt | 2 +- Documentation/devices.txt | 9 +-- Documentation/dvb/faq.txt | 2 +- Documentation/fb/framebuffer.txt | 2 +- Documentation/filesystems/9p.txt | 2 +- Documentation/filesystems/affs.txt | 2 +- Documentation/filesystems/befs.txt | 4 +- Documentation/filesystems/isofs.txt | 2 +- Documentation/filesystems/proc.txt | 4 +- Documentation/filesystems/vfat.txt | 3 +- Documentation/hwmon/adm1026 | 2 +- Documentation/hwmon/g760a | 2 +- Documentation/hwmon/gl518sm | 3 +- Documentation/hwmon/k8temp | 2 +- Documentation/hwmon/lm85 | 6 +- Documentation/hwmon/smsc47m1 | 11 ++-- Documentation/hwmon/thmc50 | 2 +- Documentation/hwmon/via686a | 2 +- Documentation/hwmon/w83627hf | 6 +- Documentation/hwmon/w83781d | 2 +- Documentation/hwmon/w83792d | 2 +- Documentation/i2c/busses/i2c-ali1535 | 2 +- Documentation/i2c/busses/i2c-ali1563 | 2 +- Documentation/i2c/busses/i2c-ali15x3 | 2 +- Documentation/i2c/busses/i2c-piix4 | 2 +- Documentation/i2c/busses/i2c-sis630 | 2 +- Documentation/ia64/aliasing.txt | 2 - Documentation/ia64/serial.txt | 2 +- Documentation/infiniband/user_verbs.txt | 2 +- Documentation/input/appletouch.txt | 2 +- Documentation/input/bcm5974.txt | 2 +- Documentation/input/iforce-protocol.txt | 2 +- Documentation/input/sentelic.txt | 2 +- Documentation/input/xpad.txt | 2 +- Documentation/intel_txt.txt | 12 ++-- Documentation/ioctl/ioctl-number.txt | 10 
+-- Documentation/isdn/README | 2 +- Documentation/isdn/README.HiSax | 2 +- Documentation/ja_JP/HOWTO | 2 +- Documentation/ja_JP/SubmittingPatches | 6 +- Documentation/kernel-docs.txt | 39 ++++++------ Documentation/ko_KR/HOWTO | 8 +-- Documentation/laptops/acer-wmi.txt | 2 +- Documentation/ldm.txt | 2 +- Documentation/md.txt | 2 +- Documentation/misc-devices/c2port.txt | 4 +- Documentation/mtd/nand_ecc.txt | 2 +- Documentation/networking/3c509.txt | 2 +- Documentation/networking/README.ipw2100 | 3 +- Documentation/networking/README.ipw2200 | 2 +- Documentation/networking/README.sb1000 | 4 +- Documentation/networking/arcnet.txt | 4 +- Documentation/networking/bonding.txt | 2 +- Documentation/networking/decnet.txt | 2 +- Documentation/networking/fore200e.txt | 2 +- Documentation/networking/ipddp.txt | 5 -- Documentation/networking/iphase.txt | 2 +- Documentation/networking/packet_mmap.txt | 4 +- Documentation/networking/ray_cs.txt | 4 +- Documentation/networking/s2io.txt | 3 +- Documentation/networking/tlan.txt | 4 +- Documentation/networking/udplite.txt | 13 ++-- Documentation/networking/wavelan.txt | 3 +- Documentation/power/apm-acpi.txt | 2 +- Documentation/power/basic-pm-debugging.txt | 3 +- Documentation/power/video.txt | 12 ++-- Documentation/powerpc/booting-without-of.txt | 2 +- Documentation/s390/Debugging390.txt | 2 +- Documentation/scsi/BusLogic.txt | 2 +- Documentation/scsi/ChangeLog.megaraid | 2 +- Documentation/scsi/FlashPoint.txt | 4 +- Documentation/scsi/Mylex.txt | 2 +- Documentation/scsi/NinjaSCSI.txt | 2 - Documentation/scsi/aic79xx.txt | 2 +- Documentation/scsi/aic7xxx.txt | 2 +- Documentation/scsi/ibmmca.txt | 2 +- Documentation/scsi/osst.txt | 2 +- Documentation/scsi/ppa.txt | 6 +- Documentation/scsi/scsi-generic.txt | 8 +-- Documentation/scsi/scsi.txt | 4 +- Documentation/scsi/scsi_mid_low_api.txt | 2 +- Documentation/serial/moxa-smartio | 2 +- Documentation/sound/alsa/ALSA-Configuration.txt | 2 +- Documentation/sound/alsa/HD-Audio.txt | 2 +- Documentation/sound/alsa/soc/DAI.txt | 2 +- Documentation/sound/alsa/soc/codec.txt | 2 +- Documentation/sound/alsa/soc/platform.txt | 2 +- Documentation/sound/oss/README.OSS | 5 +- Documentation/telephony/ixj.txt | 9 +-- Documentation/uml/UserModeLinux-HOWTO.txt | 73 +++------------------- Documentation/usb/linux.inf | 2 +- Documentation/usb/mtouchusb.txt | 6 +- Documentation/usb/usb-serial.txt | 4 +- Documentation/video4linux/API.html | 2 +- Documentation/video4linux/CQcam.txt | 4 +- Documentation/video4linux/README.cpia | 2 +- Documentation/video4linux/README.ivtv | 2 +- Documentation/video4linux/Zoran | 6 +- Documentation/video4linux/bttv/Cards | 6 +- Documentation/video4linux/bttv/MAKEDEV | 2 +- Documentation/video4linux/bttv/Specs | 4 +- .../video4linux/cx88/hauppauge-wintv-cx88-ir.txt | 2 +- .../video4linux/hauppauge-wintv-cx88-ir.txt | 2 +- Documentation/video4linux/ibmcam.txt | 5 +- Documentation/video4linux/se401.txt | 2 +- Documentation/video4linux/w9966.txt | 2 +- Documentation/w1/masters/ds2482 | 4 +- Documentation/w1/masters/mxc-w1 | 3 +- Documentation/w1/masters/omap-hdq | 2 +- Documentation/zh_CN/HOWTO | 8 +-- Documentation/zh_CN/SubmittingDrivers | 2 +- Documentation/zh_CN/SubmittingPatches | 6 +- 138 files changed, 238 insertions(+), 330 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/ABI/testing/sysfs-devices-memory b/Documentation/ABI/testing/sysfs-devices-memory index aba7d989208..7405de26ee6 100644 --- a/Documentation/ABI/testing/sysfs-devices-memory +++ 
b/Documentation/ABI/testing/sysfs-devices-memory @@ -7,7 +7,7 @@ Description: added or removed dynamically to represent hot-add/remove operations. Users: hotplug memory add/remove tools - https://w3.opensource.ibm.com/projects/powerpc-utils/ + http://www.ibm.com/developerworks/wikis/display/LinuxP/powerpc-utils What: /sys/devices/system/memory/memoryX/removable Date: June 2008 @@ -19,7 +19,7 @@ Description: identify removable sections of the memory before attempting potentially expensive hot-remove memory operation Users: hotplug memory remove tools - https://w3.opensource.ibm.com/projects/powerpc-utils/ + http://www.ibm.com/developerworks/wikis/display/LinuxP/powerpc-utils What: /sys/devices/system/memory/memoryX/phys_device Date: September 2008 @@ -58,7 +58,7 @@ Description: by root to offline that section. # echo offline > /sys/devices/system/memory/memory22/state Users: hotplug memory remove tools - https://w3.opensource.ibm.com/projects/powerpc-utils/ + http://www.ibm.com/developerworks/wikis/display/LinuxP/powerpc-utils What: /sys/devices/system/memoryX/nodeY diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index 84a710f87c6..7564e88bfa4 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -197,7 +197,7 @@ Description: These files exist in every cpu's cache index directories. Currently, only AMD Family 10h Processors support cache index disable, and only for their L3 caches. See the BIOS and Kernel Developer's Guide at - http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/31116-Public-GH-BKDG_3.20_2-4-09.pdf + http://support.amd.com/us/Embedded_TechDocs/31116-Public-GH-BKDG_3-28_5-28-09.pdf for formatting information and other details on the cache index disable. Users: joachim.deguara@amd.com diff --git a/Documentation/DocBook/scsi.tmpl b/Documentation/DocBook/scsi.tmpl index d87f4569e76..324b53494f0 100644 --- a/Documentation/DocBook/scsi.tmpl +++ b/Documentation/DocBook/scsi.tmpl @@ -393,7 +393,7 @@ For documentation see - http://www.torque.net/sg/sdebug26.html + http://sg.danny.cz/sg/sdebug26.html diff --git a/Documentation/DocBook/v4l/compat.xml b/Documentation/DocBook/v4l/compat.xml index b42b935913c..54447f0d078 100644 --- a/Documentation/DocBook/v4l/compat.xml +++ b/Documentation/DocBook/v4l/compat.xml @@ -1091,8 +1091,9 @@ signed 64-bit integer. Output devices should not send a buffer out until the time in the timestamp field has arrived. I would like to follow SGI's lead, and adopt a multimedia timestamping system like their UST (Unadjusted System Time). See -http://reality.sgi.com/cpirazzi_engr/lg/time/intro.html. [This link is -no longer valid.] UST uses timestamps that are 64-bit signed integers +http://web.archive.org/web/*/http://reality.sgi.com +/cpirazzi_engr/lg/time/intro.html. +UST uses timestamps that are 64-bit signed integers (not struct timeval's) and given in nanosecond units. The UST clock starts at zero when the system is booted and runs continuously and uniformly. It takes a little over 292 years for UST to overflow. 
There diff --git a/Documentation/DocBook/v4l/fdl-appendix.xml b/Documentation/DocBook/v4l/fdl-appendix.xml index b6ce50dbe49..ae22394ba99 100644 --- a/Documentation/DocBook/v4l/fdl-appendix.xml +++ b/Documentation/DocBook/v4l/fdl-appendix.xml @@ -2,7 +2,7 @@ The GNU Free Documentation License 1.1 in DocBook Markup by Eric Baudais Maintained by the GNOME Documentation Project - http://developer.gnome.org/projects/gdp + http://live.gnome.org/DocumentationProject Version: 1.0.1 Last Modified: Nov 16, 2000 --> diff --git a/Documentation/HOWTO b/Documentation/HOWTO index 40ada93b820..365bda9a0d9 100644 --- a/Documentation/HOWTO +++ b/Documentation/HOWTO @@ -187,7 +187,7 @@ apply a patch. If you do not know where you want to start, but you want to look for some task to start doing to join into the kernel development community, go to the Linux Kernel Janitor's project: - http://janitor.kernelnewbies.org/ + http://kernelnewbies.org/KernelJanitors It is a great place to start. It describes a list of relatively simple problems that need to be cleaned up and fixed within the Linux kernel source tree. Working with the developers in charge of this project, you @@ -315,7 +315,7 @@ process is tracked with the tool patchwork. Patchwork offers a web interface which shows patch postings, any comments on a patch or revisions to it, and maintainers can mark patches as under review, accepted, or rejected. Most of these patchwork sites are listed at -http://patchwork.kernel.org/ or http://patchwork.ozlabs.org/. +http://patchwork.kernel.org/. 2.6.x -next kernel tree for integration tests --------------------------------------------- @@ -595,7 +595,7 @@ start exactly where you are now. ---------- Thanks to Paolo Ciarrocchi who allowed the "Development Process" -(http://linux.tar.bz/articles/2.6-development_process) section +(http://lwn.net/Articles/94386/) section to be based on text he had written, and to Randy Dunlap and Gerrit Huizenga for some of the list of things you should and should not say. Also thanks to Pat Mochel, Hanna Linder, Randy Dunlap, Kay Sievers, diff --git a/Documentation/RCU/RTFP.txt b/Documentation/RCU/RTFP.txt index 5aea459e3dd..c43460dade0 100644 --- a/Documentation/RCU/RTFP.txt +++ b/Documentation/RCU/RTFP.txt @@ -606,7 +606,7 @@ Suparna Bhattacharya" ,Year="2006" ,pages="v2 123-138" ,note="Available: -\url{http://www.linuxsymposium.org/2006/view_abstract.php?content_key=184} +\url{http://www.linuxsymposium.org/2006/index_2006.php} \url{http://www.rdrop.com/users/paulmck/RCU/OLSrtRCU.2006.08.11a.pdf} [Viewed January 1, 2007]" ,annotation=" diff --git a/Documentation/SubmittingDrivers b/Documentation/SubmittingDrivers index 4947fd8fb18..38d2aab59ca 100644 --- a/Documentation/SubmittingDrivers +++ b/Documentation/SubmittingDrivers @@ -161,7 +161,7 @@ How to NOT write kernel driver by Arjan van de Ven: http://www.fenrus.org/how-to-not-write-a-device-driver-paper.pdf Kernel Janitor: - http://janitor.kernelnewbies.org/ + http://kernelnewbies.org/KernelJanitors GIT, Fast Version Control System: http://git-scm.com/ diff --git a/Documentation/aoe/aoe.txt b/Documentation/aoe/aoe.txt index 3a4dbe4663c..b5aada9f20c 100644 --- a/Documentation/aoe/aoe.txt +++ b/Documentation/aoe/aoe.txt @@ -1,6 +1,6 @@ The EtherDrive (R) HOWTO for users of 2.6 kernels is found at ... - http://www.coraid.com/support/linux/EtherDrive-2.6-HOWTO.html + http://www.coraid.com/SUPPORT/EtherDrive-HBA It has many tips and hints! 
diff --git a/Documentation/arm/IXP2000 b/Documentation/arm/IXP2000 index e0148b6b2c4..68d21d92a30 100644 --- a/Documentation/arm/IXP2000 +++ b/Documentation/arm/IXP2000 @@ -14,7 +14,7 @@ telecom systems. In addition to an XScale core, it contains up to 8 interfaces (UTOPIA, SPI, etc), a PCI host bridge, one serial port, flash interface, and some other odds and ends. For more information, see: -http://developer.intel.com/design/network/products/npfamily/ixp2xxx.htm +http://developer.intel.com 2. Linux Support diff --git a/Documentation/arm/IXP4xx b/Documentation/arm/IXP4xx index 72fbcc4fcab..133c5fa6c7a 100644 --- a/Documentation/arm/IXP4xx +++ b/Documentation/arm/IXP4xx @@ -45,7 +45,7 @@ require the use of Intel's propietary CSR softare: If you need to use any of the above, you need to download Intel's software from: - http://developer.intel.com/design/network/products/npfamily/ixp425swr1.htm + http://developer.intel.com/design/network/products/npfamily/ixp425.htm DO NOT POST QUESTIONS TO THE LINUX MAILING LISTS REGARDING THE PROPIETARY SOFTWARE. @@ -53,7 +53,7 @@ SOFTWARE. There are several websites that provide directions/pointers on using Intel's software: -http://ixp4xx-osdg.sourceforge.net/ + http://sourceforge.net/projects/ixp4xx-osdg/ Open Source Developer's Guide for using uClinux and the Intel libraries http://gatewaymaker.sourceforge.net/ @@ -112,21 +112,21 @@ http://www.adiengineering.com/productsCoyote.html Finally, there is an IDE port hanging off the expansion bus. Gateworks Avila Network Platform -http://www.gateworks.com/avila_sbc.htm +http://www.gateworks.com/support/overview.php The Avila platform is basically and IXDP425 with the 4 PCI slots replaced with mini-PCI slots and a CF IDE interface hanging off the expansion bus. Intel IXDP425 Development Platform -http://developer.intel.com/design/network/products/npfamily/ixdp425.htm +http://www.intel.com/design/network/products/npfamily/ixdpg425.htm This is Intel's standard reference platform for the IXDP425 and is also known as the Richfield board. It contains 4 PCI slots, 16MB of flash, two 10/100 ports and one ADSL port. Intel IXDP465 Development Platform -http://developer.intel.com/design/network/products/npfamily/ixdp465.htm +http://www.intel.com/design/network/products/npfamily/ixdp465.htm This is basically an IXDP425 with an IXP465 and 32M of flash instead of just 16. @@ -141,15 +141,13 @@ Intel IXDPG425 Development Platform a pivot_root to NFS. Motorola PrPMC1100 Processor Mezanine Card -http://www.fountainsys.com/datasheet/PrPMC1100.pdf +http://www.fountainsys.com The PrPMC1100 is based on the IXCP1100 and is meant to plug into and IXP2400/2800 system to act as the system controller. It simply contains a CPU and 16MB of flash on the board and needs to be plugged into a carrier board to function. Currently Linux only supports the Motorola PrPMC carrier board for this platform. - See https://mcg.motorola.com/us/ds/pdf/ds0144.pdf for info - on the carrier board. 5. TODO LIST diff --git a/Documentation/arm/README b/Documentation/arm/README index d98783fbe0c..aea34095cdc 100644 --- a/Documentation/arm/README +++ b/Documentation/arm/README @@ -41,12 +41,12 @@ Bug reports etc --------------- Please send patches to the patch system. For more information, see - http://www.arm.linux.org.uk/patches/info.html Always include some + http://www.arm.linux.org.uk/developer/patches/info.php Always include some explanation as to what the patch does and why it is needed. 
Bug reports should be sent to linux-arm-kernel@lists.arm.linux.org.uk, or submitted through the web form at - http://www.arm.linux.org.uk/forms/solution.shtml + http://www.arm.linux.org.uk/developer/ When sending bug reports, please ensure that they contain all relevant information, eg. the kernel messages that were printed before/during diff --git a/Documentation/arm/SA1100/Assabet b/Documentation/arm/SA1100/Assabet index 91f7ce7ba42..08b885d3567 100644 --- a/Documentation/arm/SA1100/Assabet +++ b/Documentation/arm/SA1100/Assabet @@ -2,8 +2,7 @@ The Intel Assabet (SA-1110 evaluation) board ============================================ Please see: -http://developer.intel.com/design/strong/quicklist/eval-plat/sa-1110.htm -http://developer.intel.com/design/strong/guides/278278.htm +http://developer.intel.com Also some notes from John G Dorsey : http://www.cs.cmu.edu/~wearable/software/assabet.html @@ -64,7 +63,7 @@ Initial RedBoot configuration ----------------------------- The commands used here are explained in The RedBoot User's Guide available -on-line at http://sources.redhat.com/ecos/docs-latest/redboot/redboot.html. +on-line at http://sources.redhat.com/ecos/docs.html. Please refer to it for explanations. If you have a CF network card (my Assabet kit contained a CF+ LP-E from diff --git a/Documentation/arm/SA1100/Brutus b/Documentation/arm/SA1100/Brutus index b1cfd405dcc..6a3aa95e9bf 100644 --- a/Documentation/arm/SA1100/Brutus +++ b/Documentation/arm/SA1100/Brutus @@ -1,7 +1,7 @@ Brutus is an evaluation platform for the SA1100 manufactured by Intel. For more details, see: -http://developer.intel.com/design/strong/applnots/sa1100lx/getstart.htm +http://developer.intel.com To compile for Brutus, you must issue the following commands: diff --git a/Documentation/arm/SA1100/FreeBird b/Documentation/arm/SA1100/FreeBird index eda28b3232e..fb23b770aaf 100644 --- a/Documentation/arm/SA1100/FreeBird +++ b/Documentation/arm/SA1100/FreeBird @@ -1,5 +1,5 @@ Freebird-1.1 is produced by Legned(C) ,Inc. -(http://www.legend.com.cn) +http://web.archive.org/web/*/http://www.legend.com.cn and software/linux mainatined by Coventive(C),Inc. (http://www.coventive.com) diff --git a/Documentation/arm/SA1100/GraphicsClient b/Documentation/arm/SA1100/GraphicsClient index 6c9c4f5a36e..867bb35943a 100644 --- a/Documentation/arm/SA1100/GraphicsClient +++ b/Documentation/arm/SA1100/GraphicsClient @@ -71,7 +71,7 @@ Supported peripherals: - serial ports (ttyS[0-2]) - ttyS0 is default for serial console - Smart I/O (ADC, keypad, digital inputs, etc) - See http://www.applieddata.com/developers/linux for IOCTL documentation + See http://www.eurotech-inc.com/linux-sbc.asp for IOCTL documentation and example user space code. ps/2 keybd is multiplexed through this driver To do: diff --git a/Documentation/arm/SA1100/GraphicsMaster b/Documentation/arm/SA1100/GraphicsMaster index ee7c6595f23..9145088a0ba 100644 --- a/Documentation/arm/SA1100/GraphicsMaster +++ b/Documentation/arm/SA1100/GraphicsMaster @@ -28,7 +28,7 @@ Supported peripherals: - serial ports (ttyS[0-2]) - ttyS0 is default for serial console - Smart I/O (ADC, keypad, digital inputs, etc) - See http://www.applieddata.com/developers/linux for IOCTL documentation + See http://www.eurotech-inc.com/linux-sbc.asp for IOCTL documentation and example user space code. 
ps/2 keybd is multiplexed through this driver To do: diff --git a/Documentation/arm/SA1100/Itsy b/Documentation/arm/SA1100/Itsy index 3b594534323..44b94997fa0 100644 --- a/Documentation/arm/SA1100/Itsy +++ b/Documentation/arm/SA1100/Itsy @@ -4,7 +4,7 @@ research projects at Compaq that are related to pocket computing. For more information, see: - http://www.research.digital.com/wrl/itsy/index.html + http://www.hpl.hp.com/downloads/crl/itsy/ Notes on initial 2.4 Itsy support (8/27/2000) : The port was done on an Itsy version 1.5 machine with a daughtercard with diff --git a/Documentation/arm/SA1100/PLEB b/Documentation/arm/SA1100/PLEB index 92cae066908..b9c8a631a35 100644 --- a/Documentation/arm/SA1100/PLEB +++ b/Documentation/arm/SA1100/PLEB @@ -6,6 +6,6 @@ PLEB support has yet to be fully integrated. For more information, see: - http://www.cse.unsw.edu.au/~pleb/ + http://www.cse.unsw.edu.au diff --git a/Documentation/arm/SA1100/Victor b/Documentation/arm/SA1100/Victor index f938a29fdc2..9cff415da5a 100644 --- a/Documentation/arm/SA1100/Victor +++ b/Documentation/arm/SA1100/Victor @@ -3,7 +3,7 @@ VisuAide, Inc. to be used by blind people. For more information related to Victor, see: - http://www.visuaide.com/victor + http://www.humanware.com/en-usa/products Of course Victor is using Linux as its main operating system. The Victor implementation for Linux is maintained by Nicolas Pitre: diff --git a/Documentation/arm/SA1100/nanoEngine b/Documentation/arm/SA1100/nanoEngine index fc431cbfefc..48a7934f95f 100644 --- a/Documentation/arm/SA1100/nanoEngine +++ b/Documentation/arm/SA1100/nanoEngine @@ -7,5 +7,5 @@ for more info. (Ref: Stuart Adams ) Also visit Larry Doolittle's "Linux for the nanoEngine" site: -http://recycle.lbl.gov/~ldoolitt/bse/ +http://www.brightstareng.com/arm/nanoeng.htm diff --git a/Documentation/binfmt_misc.txt b/Documentation/binfmt_misc.txt index f609ebf9c78..c1ed6948ba8 100644 --- a/Documentation/binfmt_misc.txt +++ b/Documentation/binfmt_misc.txt @@ -111,6 +111,6 @@ cause unexpected behaviour and can be a security hazard. There is a web page about binfmt_misc at -http://www.tat.physik.uni-tuebingen.de/~rguenth/linux/binfmt_misc.html +http://www.tat.physik.uni-tuebingen.de Richard Günther diff --git a/Documentation/blockdev/paride.txt b/Documentation/blockdev/paride.txt index e4312676bdd..ee6717e3771 100644 --- a/Documentation/blockdev/paride.txt +++ b/Documentation/blockdev/paride.txt @@ -412,6 +412,6 @@ have in your mail headers, when sending mail to the list server. You might also find some useful information on the linux-parport web pages (although they are not always up to date) at - http://www.torque.net/parport/ + http://web.archive.org/web/*/http://www.torque.net/parport/ diff --git a/Documentation/cdrom/packet-writing.txt b/Documentation/cdrom/packet-writing.txt index 1c407778c8b..13c251d5add 100644 --- a/Documentation/cdrom/packet-writing.txt +++ b/Documentation/cdrom/packet-writing.txt @@ -95,7 +95,7 @@ Using the pktcdvd sysfs interface Since Linux 2.6.20, the pktcdvd module has a sysfs interface and can be controlled by it. For example the "pktcdvd" tool uses -this interface. (see http://people.freenet.de/BalaGi#pktcdvd ) +this interface. 
(see http://tom.ist-im-web.de/download/pktcdvd ) "pktcdvd" works similar to "pktsetup", e.g.: diff --git a/Documentation/cgroups/cpusets.txt b/Documentation/cgroups/cpusets.txt index 51682ab2dd1..5d0d5692a36 100644 --- a/Documentation/cgroups/cpusets.txt +++ b/Documentation/cgroups/cpusets.txt @@ -691,7 +691,7 @@ There are ways to query or modify cpusets: cat, rmdir commands from the shell, or their equivalent from C. - via the C library libcpuset. - via the C library libcgroup. - (http://sourceforge.net/proects/libcg/) + (http://sourceforge.net/projects/libcg/) - via the python application cset. (http://developer.novell.com/wiki/index.php/Cpuset) diff --git a/Documentation/development-process/4.Coding b/Documentation/development-process/4.Coding index a5a3450faaa..2278693c8ff 100644 --- a/Documentation/development-process/4.Coding +++ b/Documentation/development-process/4.Coding @@ -281,7 +281,7 @@ With sparse, the programmer can be warned about confusion between user-space and kernel-space addresses, mixture of big-endian and small-endian quantities, the passing of integer values where a set of bit flags is expected, and so on. Sparse must be installed separately (it can -be found at http://www.kernel.org/pub/software/devel/sparse/ if your +be found at https://sparse.wiki.kernel.org/index.php/Main_Page if your distributor does not package it); it can then be run on the code by adding "C=1" to your make command. diff --git a/Documentation/device-mapper/dm-crypt.txt b/Documentation/device-mapper/dm-crypt.txt index 6680cab2c70..524de926290 100644 --- a/Documentation/device-mapper/dm-crypt.txt +++ b/Documentation/device-mapper/dm-crypt.txt @@ -36,7 +36,7 @@ Example scripts =============== LUKS (Linux Unified Key Setup) is now the preferred way to set up disk encryption with dm-crypt using the 'cryptsetup' utility, see -http://luks.endorphin.org/ +http://clemens.endorphin.org/cryptography [[ #!/bin/sh diff --git a/Documentation/devices.txt b/Documentation/devices.txt index 1d83d124056..f2da781705b 100644 --- a/Documentation/devices.txt +++ b/Documentation/devices.txt @@ -1517,7 +1517,7 @@ Your cooperation is appreciated. ... The driver and documentation may be obtained from - http://www.proximity.com.au/~brian/winradio/ + http://www.winradio.com/ 82 block I2O hard disk 0 = /dev/i2o/hdag 33rd I2O hard disk, whole disk @@ -1723,7 +1723,7 @@ Your cooperation is appreciated. 1 = /dev/comedi1 Second comedi device ... - See http://stm.lbl.gov/comedi or http://www.llp.fu-berlin.de/. + See http://stm.lbl.gov/comedi. 98 block User-mode virtual block device 0 = /dev/ubda First user-mode block device @@ -1984,7 +1984,7 @@ Your cooperation is appreciated. 256 NetWare volumes can be supported in a single machine. - http://www.kernel.org/pub/linux/kernel/people/jmerkey/nwfs + http://cgfa.telepac.pt/ftp2/kernel.org/linux/kernel/people/jmerkey/nwfs/ 0 = /dev/nwfs/v0 First NetWare (NWFS) Logical Volume 1 = /dev/nwfs/v1 Second NetWare (NWFS) Logical Volume @@ -2591,7 +2591,8 @@ Your cooperation is appreciated. 1 = /dev/intermezzo1 Second cache manager ... - See http://www.inter-mezzo.org/ for more information. 
+ See http://web.archive.org/web/20080115195241/ + http://inter-mezzo.org/index.html 186 char Object-based storage control device 0 = /dev/obd0 First obd control device diff --git a/Documentation/dvb/faq.txt b/Documentation/dvb/faq.txt index 2511a335abd..121832e5d89 100644 --- a/Documentation/dvb/faq.txt +++ b/Documentation/dvb/faq.txt @@ -76,7 +76,7 @@ Some very frequently asked questions about linuxtv-dvb the TuxBox CVS many interesting DVB applications and the dBox2 DVB source - http://sourceforge.net/projects/dvbsak/ + http://www.linuxtv.org/downloads/ DVB Swiss Army Knife library and utilities http://www.nenie.org/misc/mpsys/ diff --git a/Documentation/fb/framebuffer.txt b/Documentation/fb/framebuffer.txt index fe79e3c8847..58c5ae2e9f5 100644 --- a/Documentation/fb/framebuffer.txt +++ b/Documentation/fb/framebuffer.txt @@ -330,7 +330,7 @@ and on its mirrors. The latest version of fbset can be found at - http://home.tvd.be/cr26864/Linux/fbdev/ + http://www.linux-fbdev.org/ 10. Credits diff --git a/Documentation/filesystems/9p.txt b/Documentation/filesystems/9p.txt index c0236e753bc..f9765e8cf08 100644 --- a/Documentation/filesystems/9p.txt +++ b/Documentation/filesystems/9p.txt @@ -128,7 +128,7 @@ OPTIONS RESOURCES ========= -Our current recommendation is to use Inferno (http://www.vitanuova.com/inferno) +Our current recommendation is to use Inferno (http://www.vitanuova.com/nferno/index.html) as the 9p server. You can start a 9p server under Inferno by issuing the following command: ; styxlisten -A tcp!*!564 export '#U*' diff --git a/Documentation/filesystems/affs.txt b/Documentation/filesystems/affs.txt index 2d1524469c2..81ac488e375 100644 --- a/Documentation/filesystems/affs.txt +++ b/Documentation/filesystems/affs.txt @@ -216,4 +216,4 @@ due to an incompatibility with the Amiga floppy controller. If you are interested in an Amiga Emulator for Linux, look at -http://www.freiburg.linux.de/~uae/ +http://web.archive.org/web/*/http://www.freiburg.linux.de/~uae/ diff --git a/Documentation/filesystems/befs.txt b/Documentation/filesystems/befs.txt index 67391a15949..6e49c363938 100644 --- a/Documentation/filesystems/befs.txt +++ b/Documentation/filesystems/befs.txt @@ -31,7 +31,7 @@ Current maintainer: Sergey S. Kostyliov WHAT IS THIS DRIVER? ================== -This module implements the native filesystem of BeOS +This module implements the native filesystem of BeOS http://www.beincorporated.com/ for the linux 2.4.1 and later kernels. Currently it is a read-only implementation. @@ -61,7 +61,7 @@ step 2. Configuration & make kernel The linux kernel has many compile-time options. Most of them are beyond the scope of this document. I suggest the Kernel-HOWTO document as a good general -reference on this topic. +reference on this topic. http://www.linuxdocs.org/HOWTOs/Kernel-HOWTO-4.html However, to use the BeFS module, you must enable it at configure time. diff --git a/Documentation/filesystems/isofs.txt b/Documentation/filesystems/isofs.txt index 3c367c3b360..ba0a93384de 100644 --- a/Documentation/filesystems/isofs.txt +++ b/Documentation/filesystems/isofs.txt @@ -41,7 +41,7 @@ Mount options unique to the isofs filesystem. 
sbsector=xxx Session begins from sector xxx Recommended documents about ISO 9660 standard are located at: -http://www.y-adagio.com/public/standards/iso_cdromr/tocont.htm +http://www.y-adagio.com/ ftp://ftp.ecma.ch/ecma-st/Ecma-119.pdf Quoting from the PDF "This 2nd Edition of Standard ECMA-119 is technically identical with ISO 9660.", so it is a valid and gratis substitute of the diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 9fb6cbe70bd..8fe8895894d 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -73,9 +73,9 @@ contact Bodo Bauer at bb@ricochet.net. We'll be happy to add them to this document. The latest version of this document is available online at -http://skaro.nightcrawler.com/~bb/Docs/Proc as HTML version. +http://tldp.org/LDP/Linux-Filesystem-Hierarchy/html/proc.html -If the above direction does not works for you, ypu could try the kernel +If the above direction does not work for you, you could try the kernel mailing list at linux-kernel@vger.kernel.org and/or try to reach me at comandante@zaralinux.com. diff --git a/Documentation/filesystems/vfat.txt b/Documentation/filesystems/vfat.txt index eed520fd0c8..ead764b2728 100644 --- a/Documentation/filesystems/vfat.txt +++ b/Documentation/filesystems/vfat.txt @@ -165,7 +165,8 @@ TEST SUITE If you plan to make any modifications to the vfat filesystem, please get the test suite that comes with the vfat distribution at - http://bmrc.berkeley.edu/people/chaffee/vfat.html + http://web.archive.org/web/*/http://bmrc.berkeley.edu/ + people/chaffee/vfat.html This tests quite a few parts of the vfat filesystem and additional tests for new features or untested features would be appreciated. diff --git a/Documentation/hwmon/adm1026 b/Documentation/hwmon/adm1026 index f4327db2307..d8fabe0c23a 100644 --- a/Documentation/hwmon/adm1026 +++ b/Documentation/hwmon/adm1026 @@ -6,7 +6,7 @@ Supported chips: Prefix: 'adm1026' Addresses scanned: I2C 0x2c, 0x2d, 0x2e Datasheet: Publicly available at the Analog Devices website - http://www.analog.com/en/prod/0,,766_825_ADM1026,00.html + http://www.onsemi.com/PowerSolutions/product.do?id=ADM1026 Authors: Philip Pokorny for Penguin Computing diff --git a/Documentation/hwmon/g760a b/Documentation/hwmon/g760a index e032eeb7562..cfc89453706 100644 --- a/Documentation/hwmon/g760a +++ b/Documentation/hwmon/g760a @@ -5,7 +5,7 @@ Supported chips: * Global Mixed-mode Technology Inc.
G760A Prefix: 'g760a' Datasheet: Publicly available at the GMT website - http://www.gmt.com.tw/datasheet/g760a.pdf + http://www.gmt.com.tw/product/datasheet/EDS-760A.pdf Author: Herbert Valerio Riedel diff --git a/Documentation/hwmon/gl518sm b/Documentation/hwmon/gl518sm index 229f8b78918..26f9f3c02dc 100644 --- a/Documentation/hwmon/gl518sm +++ b/Documentation/hwmon/gl518sm @@ -5,11 +5,10 @@ Supported chips: * Genesys Logic GL518SM release 0x00 Prefix: 'gl518sm' Addresses scanned: I2C 0x2c and 0x2d - Datasheet: http://www.genesyslogic.com/pdf * Genesys Logic GL518SM release 0x80 Prefix: 'gl518sm' Addresses scanned: I2C 0x2c and 0x2d - Datasheet: http://www.genesyslogic.com/pdf + Datasheet: http://www.genesyslogic.com/ Authors: Frodo Looijaard , diff --git a/Documentation/hwmon/k8temp b/Documentation/hwmon/k8temp index 0005c716614..716dc24c723 100644 --- a/Documentation/hwmon/k8temp +++ b/Documentation/hwmon/k8temp @@ -5,7 +5,7 @@ Supported chips: * AMD Athlon64/FX or Opteron CPUs Prefix: 'k8temp' Addresses scanned: PCI space - Datasheet: http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/32559.pdf + Datasheet: http://support.amd.com/us/Processor_TechDocs/32559.pdf Author: Rudolf Marek Contact: Rudolf Marek diff --git a/Documentation/hwmon/lm85 b/Documentation/hwmon/lm85 index a76aefeeb68..b98e0e0d191 100644 --- a/Documentation/hwmon/lm85 +++ b/Documentation/hwmon/lm85 @@ -9,15 +9,15 @@ Supported chips: * Analog Devices ADM1027 Prefix: 'adm1027' Addresses scanned: I2C 0x2c, 0x2d, 0x2e - Datasheet: http://www.analog.com/en/prod/0,,766_825_ADM1027,00.html + Datasheet: http://www.onsemi.com/PowerSolutions/product.do?id=ADM1027 * Analog Devices ADT7463 Prefix: 'adt7463' Addresses scanned: I2C 0x2c, 0x2d, 0x2e - Datasheet: http://www.analog.com/en/prod/0,,766_825_ADT7463,00.html + Datasheet: http://www.onsemi.com/PowerSolutions/product.do?id=ADT7463 * SMSC EMC6D100, SMSC EMC6D101 Prefix: 'emc6d100' Addresses scanned: I2C 0x2c, 0x2d, 0x2e - Datasheet: http://www.smsc.com/main/tools/discontinued/6d100.pdf + Datasheet: http://www.smsc.com/media/Downloads_Public/discontinued/6d100.pdf * SMSC EMC6D102 Prefix: 'emc6d102' Addresses scanned: I2C 0x2c, 0x2d, 0x2e diff --git a/Documentation/hwmon/smsc47m1 b/Documentation/hwmon/smsc47m1 index 42c8431b3c9..2a13378dcf2 100644 --- a/Documentation/hwmon/smsc47m1 +++ b/Documentation/hwmon/smsc47m1 @@ -7,13 +7,10 @@ Supported chips: Addresses scanned: none, address read from Super I/O config space Prefix: 'smsc47m1' Datasheets: - http://www.smsc.com/main/datasheets/47b27x.pdf - http://www.smsc.com/main/datasheets/47m10x.pdf - http://www.smsc.com/main/datasheets/47m112.pdf - http://www.smsc.com/main/tools/discontinued/47m13x.pdf - http://www.smsc.com/main/datasheets/47m14x.pdf - http://www.smsc.com/main/tools/discontinued/47m15x.pdf - http://www.smsc.com/main/datasheets/47m192.pdf + http://www.smsc.com/media/Downloads_Public/Data_Sheets/47b272.pdf + http://www.smsc.com/media/Downloads_Public/Data_Sheets/47m10x.pdf + http://www.smsc.com/media/Downloads_Public/Data_Sheets/47m112.pdf + http://www.smsc.com/ * SMSC LPC47M292 Addresses scanned: none, address read from Super I/O config space Prefix: 'smsc47m2' diff --git a/Documentation/hwmon/thmc50 b/Documentation/hwmon/thmc50 index 9639ca93d55..8a7772ade8d 100644 --- a/Documentation/hwmon/thmc50 +++ b/Documentation/hwmon/thmc50 @@ -9,7 +9,7 @@ Supported chips: * Texas Instruments THMC50 Prefix: 'thmc50' Addresses scanned: I2C 0x2c - 0x2e - Datasheet: 
http://focus.ti.com/docs/prod/folders/print/thmc50.html + Datasheet: http://www.ti.com/ Author: Krzysztof Helt diff --git a/Documentation/hwmon/via686a b/Documentation/hwmon/via686a index d651b25f751..e5f90ab5c48 100644 --- a/Documentation/hwmon/via686a +++ b/Documentation/hwmon/via686a @@ -5,7 +5,7 @@ Supported chips: * Via VT82C686A, VT82C686B Southbridge Integrated Hardware Monitor Prefix: 'via686a' Addresses scanned: ISA in PCI-space encoded address - Datasheet: On request through web form (http://www.via.com.tw/en/support/datasheets/) + Datasheet: On request through web form (http://www.via.com.tw/en/resources/download-center/) Authors: Kyösti Mälkki , diff --git a/Documentation/hwmon/w83627hf b/Documentation/hwmon/w83627hf index 44dd2bcc72b..fb145e5e722 100644 --- a/Documentation/hwmon/w83627hf +++ b/Documentation/hwmon/w83627hf @@ -5,23 +5,19 @@ Supported chips: * Winbond W83627HF (ISA accesses ONLY) Prefix: 'w83627hf' Addresses scanned: ISA address retrieved from Super I/O registers - Datasheet: http://www.winbond.com/PDF/sheet/w83627hf.pdf * Winbond W83627THF Prefix: 'w83627thf' Addresses scanned: ISA address retrieved from Super I/O registers - Datasheet: http://www.winbond.com/PDF/sheet/w83627thf.pdf * Winbond W83697HF Prefix: 'w83697hf' Addresses scanned: ISA address retrieved from Super I/O registers - Datasheet: http://www.winbond.com/PDF/sheet/697hf.pdf * Winbond W83637HF Prefix: 'w83637hf' Addresses scanned: ISA address retrieved from Super I/O registers - Datasheet: http://www.winbond.com/PDF/sheet/w83637hf.pdf * Winbond W83687THF Prefix: 'w83687thf' Addresses scanned: ISA address retrieved from Super I/O registers - Datasheet: Provided by Winbond on request + Datasheet: Provided by Winbond on request(http://www.winbond.com/hq/enu) Authors: Frodo Looijaard , diff --git a/Documentation/hwmon/w83781d b/Documentation/hwmon/w83781d index c91e0b63ea1..ecbc1e4574b 100644 --- a/Documentation/hwmon/w83781d +++ b/Documentation/hwmon/w83781d @@ -9,7 +9,7 @@ Supported chips: * Winbond W83782D Prefix: 'w83782d' Addresses scanned: I2C 0x28 - 0x2f, ISA 0x290 (8 I/O ports) - Datasheet: http://www.winbond.com/PDF/sheet/w83782d.pdf + Datasheet: http://www.winbond.com * Winbond W83783S Prefix: 'w83783s' Addresses scanned: I2C 0x2d diff --git a/Documentation/hwmon/w83792d b/Documentation/hwmon/w83792d index 14a668ed8aa..8a023ce0b72 100644 --- a/Documentation/hwmon/w83792d +++ b/Documentation/hwmon/w83792d @@ -5,7 +5,7 @@ Supported chips: * Winbond W83792D Prefix: 'w83792d' Addresses scanned: I2C 0x2c - 0x2f - Datasheet: http://www.winbond.com.tw/E-WINBONDHTM/partner/PDFresult.asp?Pname=1035 + Datasheet: http://www.winbond.com.tw Author: Chunhao Huang Contact: DZShen diff --git a/Documentation/i2c/busses/i2c-ali1535 b/Documentation/i2c/busses/i2c-ali1535 index acbc65a0809..5d46342e486 100644 --- a/Documentation/i2c/busses/i2c-ali1535 +++ b/Documentation/i2c/busses/i2c-ali1535 @@ -3,7 +3,7 @@ Kernel driver i2c-ali1535 Supported adapters: * Acer Labs, Inc. ALI 1535 (south bridge) Datasheet: Now under NDA - http://www.ali.com.tw/eng/support/datasheet_request.php + http://www.ali.com.tw/ Authors: Frodo Looijaard , diff --git a/Documentation/i2c/busses/i2c-ali1563 b/Documentation/i2c/busses/i2c-ali1563 index 54691698d2d..41b1a077e4c 100644 --- a/Documentation/i2c/busses/i2c-ali1563 +++ b/Documentation/i2c/busses/i2c-ali1563 @@ -3,7 +3,7 @@ Kernel driver i2c-ali1563 Supported adapters: * Acer Labs, Inc. 
ALI 1563 (south bridge) Datasheet: Now under NDA - http://www.ali.com.tw/eng/support/datasheet_request.php + http://www.ali.com.tw/ Author: Patrick Mochel diff --git a/Documentation/i2c/busses/i2c-ali15x3 b/Documentation/i2c/busses/i2c-ali15x3 index 600da90b8f1..42888d8ac12 100644 --- a/Documentation/i2c/busses/i2c-ali15x3 +++ b/Documentation/i2c/busses/i2c-ali15x3 @@ -3,7 +3,7 @@ Kernel driver i2c-ali15x3 Supported adapters: * Acer Labs, Inc. ALI 1533 and 1543C (south bridge) Datasheet: Now under NDA - http://www.ali.com.tw/eng/support/datasheet_request.php + http://www.ali.com.tw/ Authors: Frodo Looijaard , diff --git a/Documentation/i2c/busses/i2c-piix4 b/Documentation/i2c/busses/i2c-piix4 index ac540c71c7e..475bb4ae072 100644 --- a/Documentation/i2c/busses/i2c-piix4 +++ b/Documentation/i2c/busses/i2c-piix4 @@ -97,4 +97,4 @@ of all affected systems, so the only safe solution was to prevent access to the SMBus on all IBM systems (detected using DMI data.) For additional information, read: -http://www.lm-sensors.org/browser/lm-sensors/trunk/README.thinkpad +http://www.lm-sensors.org/browser/lm-sensors/trunk/README diff --git a/Documentation/i2c/busses/i2c-sis630 b/Documentation/i2c/busses/i2c-sis630 index 629ea2c356f..0b969736693 100644 --- a/Documentation/i2c/busses/i2c-sis630 +++ b/Documentation/i2c/busses/i2c-sis630 @@ -2,7 +2,7 @@ Kernel driver i2c-sis630 Supported adapters: * Silicon Integrated Systems Corp (SiS) - 630 chipset (Datasheet: available at http://amalysh.bei.t-online.de/docs/SIS/) + 630 chipset (Datasheet: available at http://www.sfr-fresh.com/linux) 730 chipset * Possible other SiS chipsets ? diff --git a/Documentation/ia64/aliasing.txt b/Documentation/ia64/aliasing.txt index aa3e953f0f7..5a4dea6abeb 100644 --- a/Documentation/ia64/aliasing.txt +++ b/Documentation/ia64/aliasing.txt @@ -168,8 +168,6 @@ PAST PROBLEM CASES mmap of 0x0-0x9FFFF /dev/mem by "hwinfo" on HP sx1000 with VGA enabled - See https://bugzilla.novell.com/show_bug.cgi?id=140858. - The EFI memory map reports the following attributes: 0x00000-0x9FFFF WB only 0xA0000-0xBFFFF UC only (VGA frame buffer) diff --git a/Documentation/ia64/serial.txt b/Documentation/ia64/serial.txt index 040b9773209..6869c73de4e 100644 --- a/Documentation/ia64/serial.txt +++ b/Documentation/ia64/serial.txt @@ -133,7 +133,7 @@ TROUBLESHOOTING SERIAL CONSOLE PROBLEMS -[1] http://www.dig64.org/specifications/DIG64_PCDPv20.pdf +[1] http://www.dig64.org/specifications/agreement The table was originally defined as the "HCDP" for "Headless Console/Debug Port." The current version is the "PCDP" for "Primary Console and Debug Port Devices." diff --git a/Documentation/infiniband/user_verbs.txt b/Documentation/infiniband/user_verbs.txt index afe3f8da901..e5092d696da 100644 --- a/Documentation/infiniband/user_verbs.txt +++ b/Documentation/infiniband/user_verbs.txt @@ -5,7 +5,7 @@ USERSPACE VERBS ACCESS described in chapter 11 of the InfiniBand Architecture Specification. To use the verbs, the libibverbs library, available from - , is required. libibverbs contains a + http://www.openfabrics.org/, is required. libibverbs contains a device-independent API for using the ib_uverbs interface. libibverbs also requires appropriate device-dependent kernel and userspace driver for your InfiniBand hardware. 
For example, to use diff --git a/Documentation/input/appletouch.txt b/Documentation/input/appletouch.txt index 4f7c633a76d..b13de3f8910 100644 --- a/Documentation/input/appletouch.txt +++ b/Documentation/input/appletouch.txt @@ -82,4 +82,4 @@ Links: ------ [1]: http://johannes.sipsolutions.net/PowerBook/touchpad/ -[2]: http://web.telia.com/~u89404340/touchpad/index.html +[2]: http://web.archive.org/web/*/http://web.telia.com/~u89404340/touchpad/index.html diff --git a/Documentation/input/bcm5974.txt b/Documentation/input/bcm5974.txt index 5e22dcf6d48..74d3876d6f3 100644 --- a/Documentation/input/bcm5974.txt +++ b/Documentation/input/bcm5974.txt @@ -62,4 +62,4 @@ Links ----- [1] http://ubuntuforums.org/showthread.php?t=840040 -[2] http://http://bitmath.org/code/ +[2] http://bitmath.org/code/ diff --git a/Documentation/input/iforce-protocol.txt b/Documentation/input/iforce-protocol.txt index 3ac92413c87..2d5fbfd6023 100644 --- a/Documentation/input/iforce-protocol.txt +++ b/Documentation/input/iforce-protocol.txt @@ -251,7 +251,7 @@ Check www.immerse.com for Immersion Studio, and www.fcoder.com for ComPortSpy. ** Author of this document ** Johann Deneux -Home page at http://www.esil.univ-mrs.fr/~jdeneux/projects/ff/ +Home page at http://web.archive.org/web/*/http://www.esil.univ-mrs.fr Additions by Vojtech Pavlik. diff --git a/Documentation/input/sentelic.txt b/Documentation/input/sentelic.txt index b35affd5c64..b2ef125b71f 100644 --- a/Documentation/input/sentelic.txt +++ b/Documentation/input/sentelic.txt @@ -341,7 +341,7 @@ Byte 5~8: Don't care (Absolute packet) FSP supports basic PS/2 commanding set and modes, refer to following URL for details about PS/2 commands: -http://www.computer-engineering.org/index.php?title=PS/2_Mouse_Interface +http://www.computer-engineering.org/ps2mouse/ ============================================================================== * Programming Sequence for Determining Packet Parsing Flow diff --git a/Documentation/input/xpad.txt b/Documentation/input/xpad.txt index aae0d404c56..7cc9a436e6a 100644 --- a/Documentation/input/xpad.txt +++ b/Documentation/input/xpad.txt @@ -150,7 +150,7 @@ the basic functionality. 1. http://euc.jp/periphs/xbox-controller.ja.html (ITO Takayuki) 2. http://xpad.xbox-scene.com/ -3. http://www.xboxhackz.com/Hackz-Reference.htm +3. http://www.markosweb.com/www/xboxhackz.com/ 4. /proc/bus/usb/devices - dump from InterAct PowerPad Pro (Germany): diff --git a/Documentation/intel_txt.txt b/Documentation/intel_txt.txt index 5dc59b04a71..849de1a78e7 100644 --- a/Documentation/intel_txt.txt +++ b/Documentation/intel_txt.txt @@ -25,20 +25,18 @@ which has been updated for the new released platforms. 
Intel TXT has been presented at various events over the past few years, some of which are: LinuxTAG 2008: - http://www.linuxtag.org/2008/en/conf/events/vp-donnerstag/ - details.html?talkid=110 + http://www.linuxtag.org/2008/en/conf/events/vp-donnerstag.html TRUST2008: - http://www.trust2008.eu/downloads/Keynote-Speakers/ + http://www.trust-conference.eu/downloads/Keynote-Speakers/ 3_David-Grawrock_The-Front-Door-of-Trusted-Computing.pdf - IDF 2008, Shanghai: - http://inteldeveloperforum.com.edgesuite.net/shanghai_2008/ - aep/PROS003/index.html + IDF, Shanghai: + http://www.prcidf.com.cn/index_en.html IDFs 2006, 2007 (I'm not sure if/where they are online) Trusted Boot Project Overview: ============================= -Trusted Boot (tboot) is an open source, pre- kernel/VMM module that +Trusted Boot (tboot) is an open source, pre-kernel/VMM module that uses Intel TXT to perform a measured and verified launch of an OS kernel/VMM. diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt index dd5806f4fcc..2ec3d7d8998 100644 --- a/Documentation/ioctl/ioctl-number.txt +++ b/Documentation/ioctl/ioctl-number.txt @@ -74,7 +74,7 @@ Code Seq#(hex) Include File Comments 0x10 00-0F drivers/char/s390/vmcp.h 0x12 all linux/fs.h linux/blkpg.h -0x1b all InfiniBand Subsystem +0x1b all InfiniBand Subsystem 0x20 all drivers/cdrom/cm206.h 0x22 all scsi/sg.h '#' 00-3F IEEE 1394 Subsystem Block for the entire subsystem @@ -190,7 +190,7 @@ Code Seq#(hex) Include File Comments '[' 00-07 linux/usb/tmc.h USB Test and Measurement Devices 'a' all linux/atm*.h, linux/sonet.h ATM on linux - + 'b' 00-FF conflict! bit3 vme host bridge 'b' 00-0F media/bt819.h conflict! @@ -225,7 +225,7 @@ Code Seq#(hex) Include File Comments 'k' 00-0F linux/spi/spidev.h conflict! 'k' 00-05 video/kyro.h conflict! 'l' 00-3F linux/tcfs_fs.h transparent cryptographic file system - + 'l' 40-7F linux/udf_fs_i.h in development: 'm' 00-09 linux/mmtimer.h conflict! @@ -252,7 +252,7 @@ Code Seq#(hex) Include File Comments 'q' 00-1F linux/serio.h 'q' 80-FF linux/telephony.h Internet PhoneJACK, Internet LineJACK - linux/ixjuser.h + linux/ixjuser.h 'r' 00-1F linux/msdos_fs.h and fs/fat/dir.c 's' all linux/cdk.h 't' 00-7F linux/if_ppp.h @@ -286,7 +286,7 @@ Code Seq#(hex) Include File Comments 0x89 F0-FF linux/sockios.h SIOCDEVPRIVATE range 0x8B all linux/wireless.h 0x8C 00-3F WiNRADiO driver - + 0x90 00 drivers/cdrom/sbpcd.h 0x92 00-0F drivers/usb/mon/mon_bin.c 0x93 60-7F linux/auto_fs.h diff --git a/Documentation/isdn/README b/Documentation/isdn/README index 6783437f21c..cfb1884342e 100644 --- a/Documentation/isdn/README +++ b/Documentation/isdn/README @@ -36,7 +36,7 @@ README for the ISDN-subsystem http://www.mhessler.de/i4lfaq/ It can be viewed online, or downloaded in sgml/text/html format. The FAQ can also be viewed online at - http://www.isdn4inux.de/faq/ + http://www.isdn4linux.de/faq/ or downloaded from ftp://ftp.isdn4linux.de/pub/isdn4linux/FAQ/ diff --git a/Documentation/isdn/README.HiSax b/Documentation/isdn/README.HiSax index 031c8d81433..99e87a61897 100644 --- a/Documentation/isdn/README.HiSax +++ b/Documentation/isdn/README.HiSax @@ -486,7 +486,7 @@ Appendix: Teles PCMCIA driver ----------------------------- See - http://www.stud.uni-wuppertal.de/~ea0141/pcmcia.html + http://www.linux.no/teles_cs.txt for instructions. 
Appendix: Linux and ISDN-leased lines diff --git a/Documentation/ja_JP/HOWTO b/Documentation/ja_JP/HOWTO index 55476982b5c..b63301a0381 100644 --- a/Documentation/ja_JP/HOWTO +++ b/Documentation/ja_JP/HOWTO @@ -223,7 +223,7 @@ web サイトには、コードの構成、サブシステム、現在存在す あなたがどこからスタートして良いかわからないが、Linux カーネル開発コミュ ニティに参加して何かすることをさがしている場合には、Linux kernel Janitor's プロジェクトにいけば良いでしょう - - http://janitor.kernelnewbies.org/ + http://kernelnewbies.org/KernelJanitors ここはそのようなスタートをするのにうってつけの場所です。ここには、 Linux カーネルソースツリーの中に含まれる、きれいにし、修正しなければな らない、単純な問題のリストが記述されています。このプロジェクトに関わる diff --git a/Documentation/ja_JP/SubmittingPatches b/Documentation/ja_JP/SubmittingPatches index a9dc1243e85..f107c834d24 100644 --- a/Documentation/ja_JP/SubmittingPatches +++ b/Documentation/ja_JP/SubmittingPatches @@ -97,7 +97,7 @@ Quilt: http://savannah.nongnu.org/projects/quilt Andrew Morton's patch scripts: -http://www.zip.com.au/~akpm/linux/patches/ +http://userweb.kernel.org/~akpm/stuff/tpp.txt このリンクの先のスクリプトの代わりとして、quilt がパッチマネジメント ツールとして推奨されています(上のリンクを見てください)。 @@ -210,7 +210,7 @@ VGER.KERNEL.ORG でホスティングされているメーリングリストの ・移植性のないコードから移植性のあるコードへの置き換え(小さい範囲で あればアーキテクチャ特有のことでも他の人がコピーできます) ・作者やメンテナによる修正(すなわち patch monkey の再転送モード) -URL: +EMAIL: 7) MIME やリンクや圧縮ファイルや添付ファイルではなくプレインテキストのみ @@ -534,7 +534,7 @@ gcc においては、マクロと同じくらい軽いです。 ---------------------- Andrew Morton, "The perfect patch" (tpp). - + Jeff Garzik, "Linux kernel patch submission format". diff --git a/Documentation/kernel-docs.txt b/Documentation/kernel-docs.txt index ec8d31ee12e..715eaaf1519 100644 --- a/Documentation/kernel-docs.txt +++ b/Documentation/kernel-docs.txt @@ -75,7 +75,7 @@ * Title: "Conceptual Architecture of the Linux Kernel" Author: Ivan T. Bowman. - URL: http://plg.uwaterloo.ca/~itbowman/papers/CS746G-a1.html + URL: http://plg.uwaterloo.ca/ Keywords: conceptual software architecture, extracted design, reverse engineering, system structure. Description: Conceptual software architecture of the Linux kernel, @@ -84,7 +84,7 @@ * Title: "Concrete Architecture of the Linux Kernel" Author: Ivan T. Bowman, Saheem Siddiqi, and Meyer C. Tanuan. - URL: http://plg.uwaterloo.ca/~itbowman/papers/CS746G-a2.html + URL: http://plg.uwaterloo.ca/ Keywords: concrete architecture, extracted design, reverse engineering, system structure, dependencies. Description: Concrete architecture of the Linux kernel, @@ -95,7 +95,7 @@ * Title: "Linux as a Case Study: Its Extracted Software Architecture" Author: Ivan T. Bowman, Richard C. Holt and Neil V. Brewster. - URL: http://plg.uwaterloo.ca/~itbowman/papers/linuxcase.html + URL: http://plg.uwaterloo.ca/ Keywords: software architecture, architecture recovery, redocumentation. Description: Paper appeared at ICSE'99, Los Angeles, May 16-22, @@ -104,7 +104,7 @@ * Title: "Overview of the Virtual File System" Author: Richard Gooch. - URL: http://www.atnf.csiro.au/~rgooch/linux/vfs.txt + URL: http://www.mjmwired.net/kernel/Documentation/filesystems/vfs.txt Keywords: VFS, File System, mounting filesystems, opening files, dentries, dcache. Description: Brief introduction to the Linux Virtual File System. @@ -267,15 +267,13 @@ * Title: "Kernel API changes from 2.0 to 2.2" Author: Richard Gooch. URL: - http://www.atnf.csiro.au/~rgooch/linux/docs/porting-to-2.2.html + http://www.linuxhq.com/guides/LKMPG/node28.html Keywords: 2.2, changes. Description: Kernel functions/structures/variables which changed from 2.0.x to 2.2.x. * Title: "Kernel API changes from 2.2 to 2.4" Author: Richard Gooch. 
- URL: - http://www.atnf.csiro.au/~rgooch/linux/docs/porting-to-2.4.html Keywords: 2.4, changes. Description: Kernel functions/structures/variables which changed from 2.2.x to 2.4.x. @@ -290,7 +288,6 @@ * Title: "I/O Event Handling Under Linux" Author: Richard Gooch. - URL: http://www.atnf.csiro.au/~rgooch/linux/docs/io-events.html Keywords: IO, I/O, select(2), poll(2), FDs, aio_read(2), readiness event queues. Description: From the Introduction: "I/O Event handling is about @@ -386,64 +383,64 @@ * Title: "Porting Device Drivers To Linux 2.2: part II" Author: Alan Cox. - URL: http://www.linux-mag.com/1999-06/gear_01.html + URL: http://www.linux-mag.com/id/238 Keywords: ports, porting. Description: Second part on porting from 2.0 to 2.2 kernels. * Title: "How To Make Sure Your Driver Will Work On The Power Macintosh" Author: Paul Mackerras. - URL: http://www.linux-mag.com/1999-07/gear_01.html + URL: http://www.linux-mag.com/id/261 Keywords: Mac, Power Macintosh, porting, drivers, compatibility. Description: The title says it all. * Title: "An Introduction to SCSI Drivers" Author: Alan Cox. - URL: http://www.linux-mag.com/1999-08/gear_01.html + URL: http://www.linux-mag.com/id/284 Keywords: SCSI, device, driver. Description: The title says it all. * Title: "Advanced SCSI Drivers And Other Tales" Author: Alan Cox. - URL: http://www.linux-mag.com/1999-09/gear_01.html + URL: http://www.linux-mag.com/id/307 Keywords: SCSI, device, driver, advanced. Description: The title says it all. * Title: "Writing Linux Mouse Drivers" Author: Alan Cox. - URL: http://www.linux-mag.com/1999-10/gear_01.html + URL: http://www.linux-mag.com/id/330 Keywords: mouse, driver, gpm. Description: The title says it all. * Title: "More on Mouse Drivers" Author: Alan Cox. - URL: http://www.linux-mag.com/1999-11/gear_01.html + URL: http://www.linux-mag.com/id/356 Keywords: mouse, driver, gpm, races, asynchronous I/O. Description: The title still says it all. * Title: "Writing Video4linux Radio Driver" Author: Alan Cox. - URL: http://www.linux-mag.com/1999-12/gear_01.html + URL: http://www.linux-mag.com/id/381 Keywords: video4linux, driver, radio, radio devices. Description: The title says it all. * Title: "Video4linux Drivers, Part 1: Video-Capture Device" Author: Alan Cox. - URL: http://www.linux-mag.com/2000-01/gear_01.html + URL: http://www.linux-mag.com/id/406 Keywords: video4linux, driver, video capture, capture devices, camera driver. Description: The title says it all. * Title: "Video4linux Drivers, Part 2: Video-capture Devices" Author: Alan Cox. - URL: http://www.linux-mag.com/2000-02/gear_01.html + URL: http://www.linux-mag.com/id/429 Keywords: video4linux, driver, video capture, capture devices, camera driver, control, query capabilities, capability, facility. Description: The title says it all. * Title: "PCI Management in Linux 2.2" Author: Alan Cox. - URL: http://www.linux-mag.com/2000-03/gear_01.html + URL: http://www.linux-mag.com/id/452 Keywords: PCI, bus, bus-mastering. Description: The title says it all. @@ -502,7 +499,7 @@ * Title: "A Linux vm README" Author: Kanoj Sarcar. - URL: http://reality.sgi.com/kanoj_engr/vm229.html + URL: http://kos.enix.org/pub/linux-vmm.html Keywords: virtual memory, mm, pgd, vma, page, page flags, page cache, swap cache, kswapd. Description: Telegraphic, short descriptions and definitions @@ -659,7 +656,7 @@ * Name: "Linux Kernel Source Reference" Author: Thomas Graichen. 
- URL: http://innominate.org/~graichen/projects/lksr/ + URL: http://marc.info/?l=linux-kernel&m=96446640102205&w=4 Keywords: CVS, web, cvsweb, browsing source code. Description: Web interface to a CVS server with the kernel sources. "Here you can have a look at any file of the Linux kernel @@ -682,7 +679,7 @@ produced during the week. Published every Thursday. * Name: "Kernel Traffic" - URL: http://kt.zork.net/kernel-traffic/ + URL: http://kt.earth.li/kernel-traffic/index.html Keywords: linux-kernel mailing list, weekly kernel news. Description: Weekly newsletter covering the most relevant discussions of the linux-kernel mailing list. diff --git a/Documentation/ko_KR/HOWTO b/Documentation/ko_KR/HOWTO index 029fca914c0..e3a55b6091e 100644 --- a/Documentation/ko_KR/HOWTO +++ b/Documentation/ko_KR/HOWTO @@ -122,7 +122,7 @@ mtk.manpages@gmail.com의 메인테이너에게 보낼 것을 권장한다. 올바른 패치들을 만드는 법에 관한 훌륭한 다른 문서들이 있다. "The Perfect Patch" - http://www.zip.com.au/~akpm/linux/patches/stuff/tpp.txt + http://userweb.kernel.org/~akpm/stuff/tpp.txt "Linux kernel patch submission format" http://linux.yyz.us/patch-format.html @@ -192,7 +192,7 @@ Documentation/DocBook/ 디렉토리 내에서 만들어지며 PDF, Postscript, H 여러분이 어디서 시작해야 할진 모르지만 커널 개발 커뮤니티에 참여할 수 있는 일들을 찾길 원한다면 리눅스 커널 Janitor 프로젝트를 살펴봐라. - http://janitor.kernelnewbies.org/ + http://kernelnewbies.org/KernelJanitors 그곳은 시작하기에 훌륭한 장소이다. 그곳은 리눅스 커널 소스 트리내에 간단히 정리되고 수정될 수 있는 문제들에 관하여 설명한다. 여러분은 이 프로젝트를 대표하는 개발자들과 일하면서 자신의 패치를 리눅스 커널 트리에 @@ -596,7 +596,7 @@ Pat이라는 이름을 가진 여자가 있을 수도 있는 것이다. 리눅 이것이 무엇인지 더 자세한 것을 알고 싶다면 다음 문서의 ChageLog 항을 봐라. "The Perfect Patch" - http://www.zip.com.au/~akpm/linux/patches/stuff/tpp.txt + http://userweb.kernel.org/~akpm/stuff/tpp.txt @@ -610,7 +610,7 @@ Pat이라는 이름을 가진 여자가 있을 수도 있는 것이다. 리눅 ---------- -"개발 프로세스"(http://linux.tar.gz/articles/2.6-development_process) 섹션을 +"개발 프로세스"(http://lwn.net/Articles/94386/) 섹션을 작성하는데 있어 참고할 문서를 사용하도록 허락해준 Paolo Ciarrocchi에게 감사한다. 여러분들이 말해야 할 것과 말해서는 안되는 것의 목록 중 일부를 제공해준 Randy Dunlap과 Gerrit Huizenga에게 감사한다. 또한 검토와 의견 그리고 diff --git a/Documentation/laptops/acer-wmi.txt b/Documentation/laptops/acer-wmi.txt index 0768fcc3ba3..4beafa663dd 100644 --- a/Documentation/laptops/acer-wmi.txt +++ b/Documentation/laptops/acer-wmi.txt @@ -173,7 +173,7 @@ Credits ******* Olaf Tauber, who did the real hard work when he developed acerhk -http://www.informatik.hu-berlin.de/~tauber/acerhk +http://www.cakey.de/acerhk/ All the authors of laptop ACPI modules in the kernel, whose work was an inspiration in the early days of acer_acpi Mathieu Segaud, who solved the problem with having to modprobe the driver diff --git a/Documentation/ldm.txt b/Documentation/ldm.txt index 718085bc9f1..4f80edd14d0 100644 --- a/Documentation/ldm.txt +++ b/Documentation/ldm.txt @@ -98,7 +98,7 @@ More Documentation There is an Overview of the LDM together with complete Technical Documentation. It is available for download. - http://www.linux-ntfs.org/content/view/19/37/ + http://www.linux-ntfs.org/ If you have any LDM questions that aren't answered in the documentation, email me. diff --git a/Documentation/md.txt b/Documentation/md.txt index e4e893ef3e0..a81c7b4790f 100644 --- a/Documentation/md.txt +++ b/Documentation/md.txt @@ -1,5 +1,5 @@ Tools that manage md devices can be found at - http://www..kernel.org/pub/linux/utils/raid/.... 
+ http://www.kernel.org/pub/linux/utils/raid/ Boot time assembly of RAID arrays diff --git a/Documentation/misc-devices/c2port.txt b/Documentation/misc-devices/c2port.txt index d9bf93ea439..ea734446561 100644 --- a/Documentation/misc-devices/c2port.txt +++ b/Documentation/misc-devices/c2port.txt @@ -32,10 +32,10 @@ The C2 Interface main references are at (http://www.silabs.com) Silicon Laboratories site], see: - AN127: FLASH Programming via the C2 Interface at -http://www.silabs.com/public/documents/tpub_doc/anote/Microcontrollers/Small_Form_Factor/en/an127.pdf, and +http://www.silabs.com/Support Documents/TechnicalDocs/an127.pdf - C2 Specification at -http://www.silabs.com/public/documents/tpub_doc/spec/Microcontrollers/en/C2spec.pdf, +http://www.silabs.com/pages/DownloadDoc.aspx?FILEURL=Support%20Documents/TechnicalDocs/an127.pdf&src=SearchResults however it implements a two wire serial communication protocol (bit banging) designed to enable in-system programming, debugging, and diff --git a/Documentation/mtd/nand_ecc.txt b/Documentation/mtd/nand_ecc.txt index 274821b35a7..990efd7a981 100644 --- a/Documentation/mtd/nand_ecc.txt +++ b/Documentation/mtd/nand_ecc.txt @@ -91,7 +91,7 @@ ECC 1 rp15 rp14 rp13 rp12 rp11 rp10 rp09 rp08 ECC 2 cp5 cp4 cp3 cp2 cp1 cp0 1 1 I detected after writing this that ST application note AN1823 -(http://www.st.com/stonline/books/pdf/docs/10123.pdf) gives a much +(http://www.st.com/stonline/) gives a much nicer picture.(but they use line parity as term where I use row parity) Oh well, I'm graphically challenged, so suffer with me for a moment :-) And I could not reuse the ST picture anyway for copyright reasons. diff --git a/Documentation/networking/3c509.txt b/Documentation/networking/3c509.txt index 3c45d5dcd63..dcc9eaf5939 100644 --- a/Documentation/networking/3c509.txt +++ b/Documentation/networking/3c509.txt @@ -31,7 +31,7 @@ models: Large portions of this documentation were heavily borrowed from the guide written the original author of the 3c509 driver, Donald Becker. The master copy of that document, which contains notes on older versions of the driver, -currently resides on Scyld web server: http://www.scyld.com/network/3c509.html. +currently resides on Scyld web server: http://www.scyld.com/. (1) Special Driver Features diff --git a/Documentation/networking/README.ipw2100 b/Documentation/networking/README.ipw2100 index f3fcaa41f77..6f85e1d0603 100644 --- a/Documentation/networking/README.ipw2100 +++ b/Documentation/networking/README.ipw2100 @@ -72,8 +72,7 @@ such, if you are interested in deploying or shipping a driver as part of solution intended to be used for purposes other than development, please obtain a tested driver from Intel Customer Support at: -http://support.intel.com/support/notebook/sb/CS-006408.htm - +http://www.intel.com/support/wireless/sb/CS-006408.htm 1. Introduction ----------------------------------------------- diff --git a/Documentation/networking/README.ipw2200 b/Documentation/networking/README.ipw2200 index 80c728522c4..c276490165e 100644 --- a/Documentation/networking/README.ipw2200 +++ b/Documentation/networking/README.ipw2200 @@ -85,7 +85,7 @@ such, if you are interested in deploying or shipping a driver as part of solution intended to be used for purposes other than development, please obtain a tested driver from Intel Customer Support at: -http://support.intel.com/support/notebook/sb/CS-006408.htm +http://support.intel.com 1. 
Introduction diff --git a/Documentation/networking/README.sb1000 b/Documentation/networking/README.sb1000 index f82d42584e9..f92c2aac56a 100644 --- a/Documentation/networking/README.sb1000 +++ b/Documentation/networking/README.sb1000 @@ -27,8 +27,8 @@ cable modem easy. in Franco's original source code distribution .tar.gz file. Support for the sb1000 driver can be found at: - http://home.adelphia.net/~siglercm/sb1000.html - http://linuxpower.cx/~cable/ + http://web.archive.org/web/*/http://home.adelphia.net/~siglercm/sb1000.html + http://web.archive.org/web/*/http://linuxpower.cx/~cable/ along with these utilities. diff --git a/Documentation/networking/arcnet.txt b/Documentation/networking/arcnet.txt index 79601254038..9ff57950215 100644 --- a/Documentation/networking/arcnet.txt +++ b/Documentation/networking/arcnet.txt @@ -68,7 +68,7 @@ REAL NAME" to listserv@tichy.ch.uj.edu.pl. Then, to submit messages to the list, mail to linux-arcnet@tichy.ch.uj.edu.pl. There are archives of the mailing list at: - http://tichy.ch.uj.edu.pl/lists/linux-arcnet + http://epistolary.org/mailman/listinfo.cgi/arcnet The people on linux-net@vger.kernel.org have also been known to be very helpful, especially when we're talking about ALPHA Linux kernels that may or @@ -79,7 +79,7 @@ Other Drivers and Info ---------------------- You can try my ARCNET page on the World Wide Web at: - http://www.worldvisions.ca/~apenwarr/arcnet/ + http://www.qis.net/~jschmitz/arcnet/ Also, SMC (one of the companies that makes ARCnet cards) has a WWW site you might be interested in, which includes several drivers for various cards diff --git a/Documentation/networking/bonding.txt b/Documentation/networking/bonding.txt index 61f516b135b..e0ec7cb3f67 100644 --- a/Documentation/networking/bonding.txt +++ b/Documentation/networking/bonding.txt @@ -2432,7 +2432,7 @@ be found at: https://lists.sourceforge.net/lists/listinfo/bonding-devel Donald Becker's Ethernet Drivers and diag programs may be found at : - - http://www.scyld.com/network/ + - http://web.archive.org/web/*/http://www.scyld.com/network/ You will also find a lot of information regarding Ethernet, NWay, MII, etc. at www.scyld.com. diff --git a/Documentation/networking/decnet.txt b/Documentation/networking/decnet.txt index d8968958d83..e12a4900cf7 100644 --- a/Documentation/networking/decnet.txt +++ b/Documentation/networking/decnet.txt @@ -4,7 +4,7 @@ 1) Other documentation.... o Project Home Pages - http://www.chygwyn.com/DECnet/ - Kernel info + http://www.chygwyn.com/ - Kernel info http://linux-decnet.sourceforge.net/ - Userland tools http://www.sourceforge.net/projects/linux-decnet/ - Status page diff --git a/Documentation/networking/fore200e.txt b/Documentation/networking/fore200e.txt index b1f337f0f4c..6e0d2a9613e 100644 --- a/Documentation/networking/fore200e.txt +++ b/Documentation/networking/fore200e.txt @@ -39,7 +39,7 @@ version. Alternative binary firmware images can be found somewhere on the ForeThought CD-ROM supplied with your adapter by FORE Systems. You can also get the latest firmware images from FORE Systems at -http://www.fore.com. Register TACTics Online and go to +http://en.wikipedia.org/wiki/FORE_Systems. Register TACTics Online and go to the 'software updates' pages. The firmware binaries are part of the various ForeThought software distributions. 
diff --git a/Documentation/networking/ipddp.txt b/Documentation/networking/ipddp.txt index 661a5558dd8..ba5c217fffe 100644 --- a/Documentation/networking/ipddp.txt +++ b/Documentation/networking/ipddp.txt @@ -36,11 +36,6 @@ AppleTalk-IP to IP decapsulation. Basic instructions for user space tools ======================================= -To enable AppleTalk-IP decapsulation/encapsulation you will need the -proper tools. You can get the tools for decapsulation from -http://spacs1.spacs.k12.wi.us/~jschlst/index.html and for encapsulation -from http://www.maths.unm.edu/~bradford/ltpc.html - I will briefly describe the operation of the tools, but you will need to consult the supporting documentation for each set of tools. diff --git a/Documentation/networking/iphase.txt b/Documentation/networking/iphase.txt index 55eac4a784e..670b72f1658 100644 --- a/Documentation/networking/iphase.txt +++ b/Documentation/networking/iphase.txt @@ -22,7 +22,7 @@ The features and limitations of this driver are as follows: - All variants of Interphase ATM PCI (i)Chip adapter cards are supported, including x575 (OC3, control memory 128K , 512K and packet memory 128K, 512K and 1M), x525 (UTP25) and x531 (DS3 and E3). See - http://www.iphase.com/site/iphase-web/?epi_menuItemID=e196f04b4b3b40502f150882e21046a0 + http://www.iphase.com/ for details. - Only x86 platforms are supported. - SMP is supported. diff --git a/Documentation/networking/packet_mmap.txt b/Documentation/networking/packet_mmap.txt index 98f71a5cef0..2a72262fbad 100644 --- a/Documentation/networking/packet_mmap.txt +++ b/Documentation/networking/packet_mmap.txt @@ -8,7 +8,7 @@ capture network traffic with utilities like tcpdump or any other that needs raw access to network interface. You can find the latest version of this document at: - http://pusa.uv.es/~ulisses/packet_mmap/ + http://wiki.ipxwarzone.com/index.php5?title=Linux_packet_mmap Howto can be found at: http://wiki.gnu-log.net (packet_mmap) @@ -56,7 +56,7 @@ support for PACKET_MMAP, and also probably the libpcap included in your distribu I'm aware of two implementations of PACKET_MMAP in libpcap: - http://pusa.uv.es/~ulisses/packet_mmap/ (by Simon Patarin, based on libpcap 0.6.2) + http://wiki.ipxwarzone.com/ (by Simon Patarin, based on libpcap 0.6.2) http://public.lanl.gov/cpw/ (by Phil Wood, based on lastest libpcap) The rest of this document is intended for people who want to understand diff --git a/Documentation/networking/ray_cs.txt b/Documentation/networking/ray_cs.txt index 145d27a5239..c0c12307ed9 100644 --- a/Documentation/networking/ray_cs.txt +++ b/Documentation/networking/ray_cs.txt @@ -13,8 +13,8 @@ wireless LAN cards. As of kernel 2.3.18, the ray_cs driver is part of the Linux kernel source. My web page for the development of ray_cs is at -http://world.std.com/~corey/raylink.html and I can be emailed at -corey@world.std.com +http://web.ralinktech.com/ralink/Home/Support/Linux.html +and I can be emailed at corey@world.std.com The kernel driver is based on ray_cs-1.62.tgz diff --git a/Documentation/networking/s2io.txt b/Documentation/networking/s2io.txt index c3d6b4d5d01..9d4e0f4df5a 100644 --- a/Documentation/networking/s2io.txt +++ b/Documentation/networking/s2io.txt @@ -133,7 +133,8 @@ bring down CPU utilization. ** For AMD opteron platforms with 8131 chipset, MMRBC=1 and MOST=1 are recommended as safe parameters. 
For more information, please review the AMD8131 errata at -http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/26310.pdf +http://vip.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/ +26310_AMD-8131_HyperTransport_PCI-X_Tunnel_Revision_Guide_rev_3_18.pdf 6. Available Downloads Neterion "s2io" driver in Red Hat and Suse 2.6-based distributions is kept up diff --git a/Documentation/networking/tlan.txt b/Documentation/networking/tlan.txt index 7e6aa5b20c3..34550dfcef7 100644 --- a/Documentation/networking/tlan.txt +++ b/Documentation/networking/tlan.txt @@ -2,7 +2,7 @@ (C) 1998 James Banks (C) 1999-2001 Torben Mathiasen -For driver information/updates visit http://opensource.compaq.com +For driver information/updates visit http://www.compaq.com TLAN driver for Linux, version 1.14a @@ -113,5 +113,5 @@ III. Things to try if you have problems. There is also a tlan mailing list which you can join by sending "subscribe tlan" in the body of an email to majordomo@vuser.vu.union.edu. -There is also a tlan website at http://opensource.compaq.com +There is also a tlan website at http://www.compaq.com diff --git a/Documentation/networking/udplite.txt b/Documentation/networking/udplite.txt index 855d8da57a2..d727a382910 100644 --- a/Documentation/networking/udplite.txt +++ b/Documentation/networking/udplite.txt @@ -11,11 +11,13 @@ This file briefly describes the existing kernel support and the socket API. For in-depth information, you can consult: - o The UDP-Lite Homepage: http://www.erg.abdn.ac.uk/users/gerrit/udp-lite/ + o The UDP-Lite Homepage: + http://web.archive.org/web/*/http://www.erg.abdn.ac.uk/users/gerrit/udp-lite/ From here you can also download some example application source code. o The UDP-Lite HOWTO on - http://www.erg.abdn.ac.uk/users/gerrit/udp-lite/files/UDP-Lite-HOWTO.txt + http://web.archive.org/web/*/http://www.erg.abdn.ac.uk/users/gerrit/udp-lite/ + files/UDP-Lite-HOWTO.txt o The Wireshark UDP-Lite WiKi (with capture files): http://wiki.wireshark.org/Lightweight_User_Datagram_Protocol @@ -26,12 +28,7 @@ I) APPLICATIONS Several applications have been ported successfully to UDP-Lite. Ethereal - (now called wireshark) has UDP-Litev4/v6 support by default. The tarball on - - http://www.erg.abdn.ac.uk/users/gerrit/udp-lite/files/udplite_linux.tar.gz - - has source code for several v4/v6 client-server and network testing examples. - + (now called wireshark) has UDP-Litev4/v6 support by default. Porting applications to UDP-Lite is straightforward: only socket level and IPPROTO need to be changed; senders additionally set the checksum coverage length (default = header length = 8). Details are in the next section. diff --git a/Documentation/networking/wavelan.txt b/Documentation/networking/wavelan.txt index afa6e521c68..90e0ac4e15d 100644 --- a/Documentation/networking/wavelan.txt +++ b/Documentation/networking/wavelan.txt @@ -50,7 +50,8 @@ and a Lucent Modem, and NOT 802.11 compatible. ----------------- o Config : Not yet in kernel o Location : Pcmcia package 3.1.10+ - o on-line doc : http://www.fasta.fh-dortmund.de/users/andy/wvlan/ + o on-line doc : + http://web.archive.org/web/*/http://www.fasta.fh-dortmund.de/users/andy/wvlan/ This is the driver for the current generation of Wavelan IEEE, which is 802.11 compatible. 
Depending on version, it is 2 Mb/s or 11 diff --git a/Documentation/power/apm-acpi.txt b/Documentation/power/apm-acpi.txt index 1bd799dc17e..6cc423d3662 100644 --- a/Documentation/power/apm-acpi.txt +++ b/Documentation/power/apm-acpi.txt @@ -28,5 +28,5 @@ and be sure that they are started sometime in the system boot process. Go ahead and start both. If ACPI or APM is not available on your system the associated daemon will exit gracefully. - apmd: http://worldvisions.ca/~apenwarr/apmd/ + apmd: http://ftp.debian.org/pool/main/a/apmd/ acpid: http://acpid.sf.net/ diff --git a/Documentation/power/basic-pm-debugging.txt b/Documentation/power/basic-pm-debugging.txt index 1555001bc73..ddd78172ef7 100644 --- a/Documentation/power/basic-pm-debugging.txt +++ b/Documentation/power/basic-pm-debugging.txt @@ -179,8 +179,7 @@ use the PM_TRACE mechanism documented in Documentation/s2ram.txt . To verify that the STR works, it is generally more convenient to use the s2ram tool available from http://suspend.sf.net and documented at -http://en.opensuse.org/s2ram . However, before doing that it is recommended to -carry out STR testing using the facility described in section 1. +http://en.opensuse.org/SDB:Suspend_to_RAM. Namely, after writing "freezer", "devices", "platform", "processors", or "core" into /sys/power/pm_test (available if the kernel is compiled with diff --git a/Documentation/power/video.txt b/Documentation/power/video.txt index 2b358498d09..3e6272bc447 100644 --- a/Documentation/power/video.txt +++ b/Documentation/power/video.txt @@ -67,11 +67,11 @@ There are a few types of systems where video works after S3 resume: POSTing bios works. Ole Rohne has patch to do just that at http://dev.gentoo.org/~marineam/patch-radeonfb-2.6.11-rc2-mm2. -(8) on some systems, you can use the video_post utility mentioned here: - http://bugzilla.kernel.org/show_bug.cgi?id=3670. Do echo 3 > /sys/power/state - && /usr/sbin/video_post - which will initialize the display in console mode. - If you are in X, you can switch to a virtual terminal and back to X using - CTRL+ALT+F1 - CTRL+ALT+F7 to get the display working in graphical mode again. +(8) on some systems, you can use the video_post utility and/or + do echo 3 > /sys/power/state && /usr/sbin/video_post - which will + initialize the display in console mode. If you are in X, you can switch + to a virtual terminal and back to X using CTRL+ALT+F1 - CTRL+ALT+F7 to get + the display working in graphical mode again. Now, if you pass acpi_sleep=something, and it does not work with your bios, you'll get a hard crash during resume. Be careful. Also it is @@ -177,7 +177,7 @@ Mainboard Graphics card hack (or "how to do it") Asus A7V8X nVidia RIVA TNT2 model 64 s3_bios,s3_mode (4) -(*) from http://www.ubuntulinux.org/wiki/HoaryPMResults, not sure +(*) from https://wiki.ubuntu.com/HoaryPMResults, not sure which options to use. If you know, please tell me. (***) To be tested with a newer kernel.
diff --git a/Documentation/powerpc/booting-without-of.txt b/Documentation/powerpc/booting-without-of.txt index 46d22105aa0..568fa08e82e 100644 --- a/Documentation/powerpc/booting-without-of.txt +++ b/Documentation/powerpc/booting-without-of.txt @@ -1048,7 +1048,7 @@ IV - "dtc", the device tree compiler dtc source code can be found at - + WARNING: This version is still in early development stage; the resulting device-tree "blobs" have not yet been validated with the diff --git a/Documentation/s390/Debugging390.txt b/Documentation/s390/Debugging390.txt index 1eb576a023b..86f9f74b2b3 100644 --- a/Documentation/s390/Debugging390.txt +++ b/Documentation/s390/Debugging390.txt @@ -2531,5 +2531,5 @@ Special Thanks ============== Special thanks to Neale Ferguson who maintains a much prettier HTML version of this page at -http://penguinvm.princeton.edu/notes.html#Debug390 +http://linuxvm.org/penguinvm/ Bob Grainger Stefan Bader & others for reporting bugs diff --git a/Documentation/scsi/BusLogic.txt b/Documentation/scsi/BusLogic.txt index 98023baa0f0..d7fbc9488b9 100644 --- a/Documentation/scsi/BusLogic.txt +++ b/Documentation/scsi/BusLogic.txt @@ -47,7 +47,7 @@ tune driver performance and error recovery to their particular needs. The latest information on Linux support for BusLogic SCSI Host Adapters, as well as the most recent release of this driver and the latest firmware for the BT-948/958/958D, will always be available from my Linux Home Page at URL -"http://www.dandelion.com/Linux/". +"http://sourceforge.net/projects/dandelion/". Bug reports should be sent via electronic mail to "lnz@dandelion.com". Please include with the bug report the complete configuration messages reported by the diff --git a/Documentation/scsi/ChangeLog.megaraid b/Documentation/scsi/ChangeLog.megaraid index 38e9e7cadc9..5e07d320817 100644 --- a/Documentation/scsi/ChangeLog.megaraid +++ b/Documentation/scsi/ChangeLog.megaraid @@ -215,7 +215,7 @@ Older Version : 2.20.4.5 (scsi module), 2.20.2.5 (cmm module) 3. Convert pci_module_init to pci_register_driver Convert from pci_module_init to pci_register_driver - (from:http://kerneljanitors.org/TODO) + (from:http://kernelnewbies.org/KernelJanitors/TODO) - Signed-off-by: Domen Puncer 4. Use the pre defined DMA mask constants from dma-mapping.h diff --git a/Documentation/scsi/FlashPoint.txt b/Documentation/scsi/FlashPoint.txt index 1540a92f6d2..5b5f29cb9f8 100644 --- a/Documentation/scsi/FlashPoint.txt +++ b/Documentation/scsi/FlashPoint.txt @@ -13,7 +13,7 @@ operating system support to its BusLogic brand of FlashPoint Ultra SCSI host adapters. All of BusLogic's other SCSI host adapters, including the MultiMaster line, currently support the Linux operating system. Linux drivers and information will be available on October 15th at -http://www.dandelion.com/Linux/. +http://sourceforge.net/projects/dandelion/. "Mylex is committed to supporting the Linux community," says Peter Shambora, vice president of marketing for Mylex. "We have supported Linux driver @@ -27,7 +27,7 @@ Linux is a freely-distributed implementation of UNIX for Intel x86, Sun SPARC, SGI MIPS, Motorola 68k, Digital Alpha AXP and Motorola PowerPC machines. It supports a wide range of software, including the X Window System, Emacs, and TCP/IP networking. Further information is available at -http://www.linux.org and http://www.ssc.com/linux. +http://www.linux.org and http://www.ssc.com/. 
FlashPoint Host Adapters diff --git a/Documentation/scsi/Mylex.txt b/Documentation/scsi/Mylex.txt index cdf69293f7d..3797f3e6c2b 100644 --- a/Documentation/scsi/Mylex.txt +++ b/Documentation/scsi/Mylex.txt @@ -2,4 +2,4 @@ Please see the file README.BusLogic for information about Linux support for Mylex (formerly BusLogic) MultiMaster and FlashPoint SCSI Host Adapters. The Mylex DAC960 PCI RAID Controllers are now supported. Please consult -http://www.dandelion.com/Linux/ for further information on the DAC960 driver. +http://sourceforge.net/projects/dandelion for further information on the DAC960 driver. diff --git a/Documentation/scsi/NinjaSCSI.txt b/Documentation/scsi/NinjaSCSI.txt index 3229b64cf24..ac8db8ceec7 100644 --- a/Documentation/scsi/NinjaSCSI.txt +++ b/Documentation/scsi/NinjaSCSI.txt @@ -3,8 +3,6 @@ 1. Comment This is Workbit corp.'s(http://www.workbit.co.jp/) NinjaSCSI-3 -(http://www.workbit.co.jp/ts/z_nj3r.html) and NinjaSCSI-32Bi -(http://www.workbit.co.jp/ts/z_njsc32bi.html) PCMCIA card driver module for Linux. 2. My Linux environment diff --git a/Documentation/scsi/aic79xx.txt b/Documentation/scsi/aic79xx.txt index c014eccaf19..16e054c9c70 100644 --- a/Documentation/scsi/aic79xx.txt +++ b/Documentation/scsi/aic79xx.txt @@ -447,7 +447,7 @@ The following information is available in this file: http://www.adaptec.com/buy-cables/. Europe - - Visit our Web site at http://www.adaptec-europe.com/. + - Visit our Web site at http://www.adaptec.com/en-US/_common/world_index. - To speak with a Technical Support Specialist, call, or email, * German: +49 89 4366 5522, Monday-Friday, 9:00-17:00 CET, http://ask-de.adaptec.com/. diff --git a/Documentation/scsi/aic7xxx.txt b/Documentation/scsi/aic7xxx.txt index b7e238cbb5a..18f8d1905e6 100644 --- a/Documentation/scsi/aic7xxx.txt +++ b/Documentation/scsi/aic7xxx.txt @@ -344,7 +344,7 @@ The following information is available in this file: http://www.adaptec.com/buy-cables/. Europe - - Visit our Web site at http://www.adaptec-europe.com/. + - Visit our Web site at http://www.adaptec.com/en-US/_common/world_index. - To speak with a Technical Support Specialist, call, or email, * German: +49 89 4366 5522, Monday-Friday, 9:00-17:00 CET, http://ask-de.adaptec.com/. diff --git a/Documentation/scsi/ibmmca.txt b/Documentation/scsi/ibmmca.txt index 3920f28710c..45d61ad8c6f 100644 --- a/Documentation/scsi/ibmmca.txt +++ b/Documentation/scsi/ibmmca.txt @@ -1109,7 +1109,7 @@ Q: Where can I find the latest info about this driver? A: See the file MAINTAINERS for the current WWW-address, which offers updates, info and Q/A lists. At this file's origin, the webaddress - was: http://www.uni-mainz.de/~langm000/linux.html + was: http://www.staff.uni-mainz.de/mlang/linux.html Q: My SCSI-adapter is not recognized by the driver, what can I do? A: Just force it to be recognized by kernel parameters. See section 5.1. If this really happens, do also send e-mail to the maintainer, as diff --git a/Documentation/scsi/osst.txt b/Documentation/scsi/osst.txt index 2b21890bc98..ad86c6d1e89 100644 --- a/Documentation/scsi/osst.txt +++ b/Documentation/scsi/osst.txt @@ -135,7 +135,7 @@ The driver development is coordinated through a mailing list a CVS repository and some web pages. The tester's pages which contain recent news and updated drivers to download can be found on -http://linux1.onstream.nl/test/ +http://sourceforge.net/projects/osst/ If you find any problems, please have a look at the tester's page in order to see whether the problem is already known and solved. 
Otherwise, please diff --git a/Documentation/scsi/ppa.txt b/Documentation/scsi/ppa.txt index 067ac394e0b..05ff47dbe8d 100644 --- a/Documentation/scsi/ppa.txt +++ b/Documentation/scsi/ppa.txt @@ -1,13 +1,13 @@ -------- Terse where to get ZIP Drive help info -------- General Iomega ZIP drive page for Linux: -http://www.torque.net/~campbell/ +http://web.archive.org/web/*/http://www.torque.net/~campbell/ Driver archive for old drivers: -http://www.torque.net/~campbell/ppa/ +http://web.archive.org/web/*/http://www.torque.net/~campbell/ppa Linux Parport page (parallel port) -http://www.torque.net/parport/ +http://web.archive.org/web/*/http://www.torque.net/parport/ Email list for Linux Parport linux-parport@torque.net diff --git a/Documentation/scsi/scsi-generic.txt b/Documentation/scsi/scsi-generic.txt index c38e2b3648e..0a22ab8ea0c 100644 --- a/Documentation/scsi/scsi-generic.txt +++ b/Documentation/scsi/scsi-generic.txt @@ -34,11 +34,11 @@ http://www.tldp.org/HOWTO/SCSI-Generic-HOWTO This describes the sg version 3 driver found in the lk 2.4 series. The LDP renders documents in single and multiple page HTML, postscript and pdf. This document can also be found at: -http://www.torque.net/sg/p/sg_v3_ho.html +http://sg.danny.cz/sg/p/sg_v3_ho.html Documentation for the version 2 sg driver found in the lk 2.2 series can -be found at http://www.torque.net/sg/p/scsi-generic.txt . A larger version -is at: http://www.torque.net/sg/p/scsi-generic_long.txt . +be found at http://sg.danny.cz/sg/. A larger version +is at: http://sg.danny.cz/sg/p/scsi-generic_long.txt. The original documentation for the sg driver (prior to lk 2.2.6) can be found at http://www.torque.net/sg/p/original/SCSI-Programming-HOWTO.txt @@ -61,7 +61,7 @@ There are two packages of sg utilities: - sg_utils for the sg version 2 (and original) driver found in lk 2.2 and earlier Both packages will work in the lk 2.4 series however sg3_utils offers more -capabilities. They can be found at: http://www.torque.net/sg and +capabilities. They can be found at: http://sg.danny.cz/sg/sg3_utils.html and freshmeat.net Another approach is to look at the applications that use the sg driver. diff --git a/Documentation/scsi/scsi.txt b/Documentation/scsi/scsi.txt index dd1bbf4e98e..3d99d38cb62 100644 --- a/Documentation/scsi/scsi.txt +++ b/Documentation/scsi/scsi.txt @@ -4,8 +4,8 @@ The Linux Documentation Project (LDP) maintains a document describing the SCSI subsystem in the Linux kernel (lk) 2.4 series. See: http://www.tldp.org/HOWTO/SCSI-2.4-HOWTO . The LDP has single and multiple page HTML renderings as well as postscript and pdf. -It can also be found at http://www.torque.net/scsi/SCSI-2.4-HOWTO . - +It can also be found at: +http://web.archive.org/web/*/http://www.torque.net/scsi/SCSI-2.4-HOWTO Notes on using modules in the SCSI subsystem ============================================ diff --git a/Documentation/scsi/scsi_mid_low_api.txt b/Documentation/scsi/scsi_mid_low_api.txt index de67229251d..570ef2b3d79 100644 --- a/Documentation/scsi/scsi_mid_low_api.txt +++ b/Documentation/scsi/scsi_mid_low_api.txt @@ -43,7 +43,7 @@ There is a SCSI documentation directory within the kernel source tree, typically Documentation/scsi . Most documents are in plain (i.e. ASCII) text. This file is named scsi_mid_low_api.txt and can be found in that directory. A more recent copy of this document may be found -at http://www.torque.net/scsi/scsi_mid_low_api.txt.gz . +at http://web.archive.org/web/20070107183357rn_1/sg.torque.net/scsi/. 
Many LLDs are documented there (e.g. aic7xxx.txt). The SCSI mid-level is briefly described in scsi.txt which contains a url to a document describing the SCSI subsystem in the lk 2.4 series. Two upper level diff --git a/Documentation/serial/moxa-smartio b/Documentation/serial/moxa-smartio index 5337e80a5b9..d1044391868 100644 --- a/Documentation/serial/moxa-smartio +++ b/Documentation/serial/moxa-smartio @@ -76,7 +76,7 @@ Content GNU General Public License in this version. Please refer to GNU General Public License announcement in each source code file for more detail. - In Moxa's Web sites, you may always find latest driver at http://web.moxa.com. + In Moxa's Web sites, you may always find latest driver at http://www.moxa.com/. This version of driver can be installed as Loadable Module (Module driver) or built-in into kernel (Static driver). You may refer to following diff --git a/Documentation/sound/alsa/ALSA-Configuration.txt b/Documentation/sound/alsa/ALSA-Configuration.txt index 2075bbb8b3e..7f4dcebda9c 100644 --- a/Documentation/sound/alsa/ALSA-Configuration.txt +++ b/Documentation/sound/alsa/ALSA-Configuration.txt @@ -1285,7 +1285,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed. about this driver. Note that it has been discontinued, but the Voyetra Turtle Beach knowledge base entry for it is still available at - http://www.turtlebeach.com/site/kb_ftp/790.asp + http://www.turtlebeach.com Module snd-msnd-pinnacle ------------------------ diff --git a/Documentation/sound/alsa/HD-Audio.txt b/Documentation/sound/alsa/HD-Audio.txt index bdafdbd3256..278cc2122ea 100644 --- a/Documentation/sound/alsa/HD-Audio.txt +++ b/Documentation/sound/alsa/HD-Audio.txt @@ -562,7 +562,7 @@ compare the codec registers directly. Send a bug report either the followings: kernel-bugzilla:: - http://bugme.linux-foundation.org/ + https://bugzilla.kernel.org/ alsa-devel ML:: alsa-devel@alsa-project.org diff --git a/Documentation/sound/alsa/soc/DAI.txt b/Documentation/sound/alsa/soc/DAI.txt index 0ebd7ea9706..c9679264c55 100644 --- a/Documentation/sound/alsa/soc/DAI.txt +++ b/Documentation/sound/alsa/soc/DAI.txt @@ -13,7 +13,7 @@ frame (FRAME) (usually 48kHz) is always driven by the controller. Each AC97 frame is 21uS long and is divided into 13 time slots. The AC97 specification can be found at :- -http://www.intel.com/design/chipsets/audio/ac97_r23.pdf +http://www.intel.com/p/en_US/business/design I2S diff --git a/Documentation/sound/alsa/soc/codec.txt b/Documentation/sound/alsa/soc/codec.txt index 1e95342ed72..37ba3a72cb7 100644 --- a/Documentation/sound/alsa/soc/codec.txt +++ b/Documentation/sound/alsa/soc/codec.txt @@ -143,7 +143,7 @@ struct snd_soc_ops { }; Please refer to the ALSA driver PCM documentation for details. -http://www.alsa-project.org/~iwai/writing-an-alsa-driver/c436.htm +http://www.alsa-project.org/~iwai/writing-an-alsa-driver/ 5 - DAPM description. diff --git a/Documentation/sound/alsa/soc/platform.txt b/Documentation/sound/alsa/soc/platform.txt index b681d17fc38..06d835987c6 100644 --- a/Documentation/sound/alsa/soc/platform.txt +++ b/Documentation/sound/alsa/soc/platform.txt @@ -39,7 +39,7 @@ struct snd_soc_platform { }; Please refer to the ALSA driver documentation for details of audio DMA. 
-http://www.alsa-project.org/~iwai/writing-an-alsa-driver/c436.htm +http://www.alsa-project.org/~iwai/writing-an-alsa-driver/ An example DMA driver is soc/pxa/pxa2xx-pcm.c diff --git a/Documentation/sound/oss/README.OSS b/Documentation/sound/oss/README.OSS index fd42b05b2f5..c615debbf08 100644 --- a/Documentation/sound/oss/README.OSS +++ b/Documentation/sound/oss/README.OSS @@ -36,7 +36,7 @@ with OSS API. Packages "snd-util-3.8.tar.gz" and "snd-data-0.1.tar.Z" contain useful utilities to be used with this driver. -See http://www.opensound.com/ossfree/getting.html for +See http://www.opensound.com/ossfree/ for download instructions. If you are looking for the installation instructions, please @@ -1438,7 +1438,7 @@ of this driver (see http://www.4Front-tech.com/oss.html for more info). There are some common audio chipsets that are not supported yet. For example Sierra Aria and IBM Mwave. It's possible that these architectures get some support in future but I can't make any promises. Just look -at the home page (http://www.opensound.com/ossfree/new_cards.html) +at the home page (http://www.opensound.com/ossfree/) for latest info. Information about unsupported sound cards and chipsets is welcome as well @@ -1449,7 +1449,6 @@ If you have any corrections and/or comments, please contact me. Hannu Savolainen hannu@opensound.com -Personal home page: http://www.compusonic.fi/~hannu home page of OSS/Free: http://www.opensound.com/ossfree home page of commercial OSS diff --git a/Documentation/telephony/ixj.txt b/Documentation/telephony/ixj.txt index 44d124005ba..4fb314d5170 100644 --- a/Documentation/telephony/ixj.txt +++ b/Documentation/telephony/ixj.txt @@ -108,14 +108,9 @@ applications. 1.4 Where to get things -You can download the latest versions of the driver from: - -http://www.quicknet.net/develop.htm - -You can download the latest version of this document from: - -http://www.quicknet.net/develop.htm +Info on latest versions of the driver are here: +http://web.archive.org/web/*/http://www.quicknet.net/develop.htm 1.5 Mailing List diff --git a/Documentation/uml/UserModeLinux-HOWTO.txt b/Documentation/uml/UserModeLinux-HOWTO.txt index 628013f944c..9b7e1904db1 100644 --- a/Documentation/uml/UserModeLinux-HOWTO.txt +++ b/Documentation/uml/UserModeLinux-HOWTO.txt @@ -8,62 +8,6 @@ Table of Contents - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 1. Introduction 1.1 How is User Mode Linux Different? @@ -277,8 +221,7 @@ 1. Download the latest UML patch from - the download page + the download page know, and + know, and they'll be "taken care of". @@ -498,8 +441,8 @@ You will need a filesystem to boot UML from. There are a number available for download from here . There are also several tools - which can be + linux.sourceforge.net/> . There are also several tools + which can be used to generate UML-compatible filesystem images from media. The kernel will boot up and present you with a login prompt. @@ -1236,7 +1179,7 @@ Harald's original README is here and explains these in detail, as well as + forge.net/> and explains these in detail, as well as some other issues. @@ -1311,7 +1254,7 @@ kernel. These were pointed out by Tim Robinson in - name="this uml- + name="this uml- user post"> . @@ -2038,7 +1981,7 @@ uml_moo is installed with the UML deb and RPM. If you didn't install UML from one of those packages, you can also get it from the UML - utilities tar file in tools/moo. 
@@ -4599,7 +4542,7 @@ Michael Jennings sent in some material which is now gracing the top of the index page of this site. + linux.sourceforge.net/> of this site. SGI (and more specifically Ralf Baechle ) gave me an account on oss.sgi.com diff --git a/Documentation/usb/linux.inf b/Documentation/usb/linux.inf index 2f7217d124f..af71d87d9e9 100644 --- a/Documentation/usb/linux.inf +++ b/Documentation/usb/linux.inf @@ -9,7 +9,7 @@ ; ; Microsoft only directly supports RNDIS drivers, and bundled them into XP. ; The Microsoft "Remote NDIS USB Driver Kit" is currently found at: -; http://www.microsoft.com/whdc/hwdev/resources/HWservices/rndis.mspx +; http://www.microsoft.com/whdc/device/network/ndis/rmndis.mspx [Version] diff --git a/Documentation/usb/mtouchusb.txt b/Documentation/usb/mtouchusb.txt index e43cfffaa10..86302cd53ed 100644 --- a/Documentation/usb/mtouchusb.txt +++ b/Documentation/usb/mtouchusb.txt @@ -54,10 +54,6 @@ generic functions like calibrations, resets, and vendor information can be requested from the userspace (And the drivers would handle the vendor specific tasks). -ADDITIONAL INFORMATION/UPDATES/X CONFIGURATION EXAMPLE: - -http://groomlakelabs.com/grandamp/code/microtouch/ - TODO: Implement a control urb again to handle requests to and from the device @@ -68,7 +64,7 @@ DISCLAIMER: I am not a MicroTouch/3M employee, nor have I ever been. 3M does not support this driver! If you want touch drivers only supported within X, please go to: -http://www.3m.com/3MTouchSystems/downloads/ +http://www.3m.com/3MTouchSystems/ THANKS: diff --git a/Documentation/usb/usb-serial.txt b/Documentation/usb/usb-serial.txt index f4d21451025..5bd7926185e 100644 --- a/Documentation/usb/usb-serial.txt +++ b/Documentation/usb/usb-serial.txt @@ -83,7 +83,7 @@ HandSpring Visor, Palm USB, and Clié USB driver parameters. e.g. modprobe visor vendor=0x54c product=0x66 There is a webpage and mailing lists for this portion of the driver at: - http://usbvisor.sourceforge.net/ + http://sourceforge.net/projects/usbvisor/ For any questions or problems with this driver, please contact Greg Kroah-Hartman at greg@kroah.com @@ -184,7 +184,7 @@ Keyspan USA-series Serial Adapters functionality. 
More information is available at: - http://misc.nu/hugh/keyspan.html + http://www.carnationsoftware.com/carnation/Keyspan.html For any questions or problems with this driver, please contact Hugh Blemings at hugh@misc.nu diff --git a/Documentation/video4linux/API.html b/Documentation/video4linux/API.html index d749d41f647..d72fd2aa915 100644 --- a/Documentation/video4linux/API.html +++ b/Documentation/video4linux/API.html @@ -17,7 +17,7 @@ - V4L2 API + V4L2 API Should be used for new projects diff --git a/Documentation/video4linux/CQcam.txt b/Documentation/video4linux/CQcam.txt index d230878e473..8977e7ce4da 100644 --- a/Documentation/video4linux/CQcam.txt +++ b/Documentation/video4linux/CQcam.txt @@ -203,11 +203,11 @@ The V4L2 API spec: http://v4l2spec.bytesex.org/ Some web pages about the quickcams: - http://www.dkfz-heidelberg.de/Macromol/wedemann/mini-HOWTO-cqcam.html + http://www.pingouin-land.com/howto/QuickCam-HOWTO.html http://www.crynwr.com/qcpc/ QuickCam Third-Party Drivers http://www.crynwr.com/qcpc/re.html Some Reverse Engineering - http://cse.unl.edu/~cluening/gqcam/ v4l client + http://www.wirelesscouch.net/software/gqcam/ v4l client http://phobos.illtel.denver.co.us/pub/qcread/ doesn't use v4l ftp://ftp.cs.unm.edu/pub/chris/quickcam/ Has lots of drivers http://www.cs.duke.edu/~reynolds/quickcam/ Has lots of information diff --git a/Documentation/video4linux/README.cpia b/Documentation/video4linux/README.cpia index 19cd3bf2498..8a747fee661 100644 --- a/Documentation/video4linux/README.cpia +++ b/Documentation/video4linux/README.cpia @@ -185,7 +185,7 @@ THANKS (in no particular order): --------------------------------------------------------------------------- REFERENCES - 1. http://www.risc.uni-linz.ac.at/people/ppregler + 1. http://www.risc.uni-linz.ac.at/ mailto:Peter_Pregler@email.com 2. see the file COPYING in the top directory of the kernel tree 3. http://webcam.sourceforge.net/ diff --git a/Documentation/video4linux/README.ivtv b/Documentation/video4linux/README.ivtv index 73df22c40bf..42b06686eb7 100644 --- a/Documentation/video4linux/README.ivtv +++ b/Documentation/video4linux/README.ivtv @@ -10,7 +10,7 @@ Hauppauge PVR-350. NOTE: this driver requires the latest encoder firmware (version 2.06.039, size 376836 bytes). Get the firmware from here: -http://dl.ivtvdriver.org/ivtv/firmware/firmware.tar.gz +http://dl.ivtvdriver.org/ivtv/firmware/ NOTE: 'normal' TV applications do not work with this driver, you need an application that can handle MPEG input such as mplayer, xine, MythTV, diff --git a/Documentation/video4linux/Zoran b/Documentation/video4linux/Zoran index 0e89e767629..00e3f926781 100644 --- a/Documentation/video4linux/Zoran +++ b/Documentation/video4linux/Zoran @@ -174,7 +174,7 @@ and is used in Argentinia, Uruguay, an a few others We do not talk about how the audio is broadcast ! A rather good sites about the TV standards are: -http://www.sony.jp/ServiceArea/Voltage_map/ +http://www.sony.jp/support/ http://info.electronicwerkstatt.de/bereiche/fernsehtechnik/frequenzen_und_normen/Fernsehnormen/ and http://www.cabl.com/restaurant/channel.html @@ -330,7 +330,7 @@ These extensions are known as the v4l/mjpeg extensions. See zoran.h for details (structs/ioctls). Information - video4linux: -http://roadrunner.swansea.linux.org.uk/v4lapi.shtml +http://linux.bytesex.org/v4l2/API.html Documentation/video4linux/API.html /usr/include/linux/videodev.h @@ -390,7 +390,7 @@ BUZIOC_G_STATUS Get the status of the input lines (video source connected/norm). 
For programming example, please, look at lavrec.c and lavplay.c code in -lavtools-1.2p2 package (URL: http://www.cicese.mx/~mirsev/DC10plus/) +lavtools-1.2p2 package (URL: http://www.cicese.mx/) and the 'examples' directory in the original Buz driver distribution. Additional notes for software developers: diff --git a/Documentation/video4linux/bttv/Cards b/Documentation/video4linux/bttv/Cards index d3389655ad9..12217fc4972 100644 --- a/Documentation/video4linux/bttv/Cards +++ b/Documentation/video4linux/bttv/Cards @@ -802,7 +802,7 @@ Kworld (www.kworld.com.tw) -JTT/ Justy Corp.http://www.justy.co.jp/ (www.jtt.com.jp website down) +JTT/ Justy Corp.(http://www.jtt.ne.jp/) --------------------------------------------------------------------- JTT-02 (JTT TV) "TV watchmate pro" (bt848) @@ -828,7 +828,7 @@ Eline www.eline-net.com/ Eline Vision TVMaster / TVMaster FM (ELV-TVM/ ELV-TVM-FM) = LR26 (bt878) Eline Vision TVMaster-2000 (ELV-TVM-2000, ELV-TVM-2000-FM)= LR138 (saa713x) -Spirit http://www.spiritmodems.com.au/ +Spirit ------ Spirit TV Tuner/Video Capture Card (bt848) @@ -959,6 +959,6 @@ Asus www.asuscom.com Hoontech -------- -http://www.hoontech.com/korean/download/down_driver_list03.html +http://www.hoontech.de/ HART Vision 848 (H-ART Vision 848) HART Vision 878 (H-Art Vision 878) diff --git a/Documentation/video4linux/bttv/MAKEDEV b/Documentation/video4linux/bttv/MAKEDEV index 6c29ba43b6c..9d112f7fd5f 100644 --- a/Documentation/video4linux/bttv/MAKEDEV +++ b/Documentation/video4linux/bttv/MAKEDEV @@ -14,7 +14,7 @@ function makedev () { ln -s /dev/${1}0 /dev/$1 } -# see http://roadrunner.swansea.uk.linux.org/v4lapi.shtml +# see http://linux.bytesex.org/v4l2/API.html echo "*** new device names ***" makedev video 0 diff --git a/Documentation/video4linux/bttv/Specs b/Documentation/video4linux/bttv/Specs index 79b9e576fe7..f32466cdae0 100644 --- a/Documentation/video4linux/bttv/Specs +++ b/Documentation/video4linux/bttv/Specs @@ -1,3 +1,3 @@ Philips http://www.Semiconductors.COM/pip/ -Conexant http://www.conexant.com/techinfo/default.asp -Micronas http://www.micronas.de/pages/product_documentation/index.html +Conexant http://www.conexant.com/ +Micronas http://www.micronas.com/en/home/index.html diff --git a/Documentation/video4linux/cx88/hauppauge-wintv-cx88-ir.txt b/Documentation/video4linux/cx88/hauppauge-wintv-cx88-ir.txt index faccee68f60..f4329a38878 100644 --- a/Documentation/video4linux/cx88/hauppauge-wintv-cx88-ir.txt +++ b/Documentation/video4linux/cx88/hauppauge-wintv-cx88-ir.txt @@ -44,7 +44,7 @@ http://www.atmel.com/dyn/resources/prod_documents/doc2817.pdf This data sheet (google search) seems to have a lovely description of the RC5 basics -http://users.pandora.be/nenya/electronics/rc5/ and more data +http://www.nenya.be/beor/electronics/rc5.htm and more data http://www.ee.washington.edu/circuit_archive/text/ir_decode.txt and even a reference to how to decode a bi-phase data stream. 
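The Atmel and University of Washington references above cover the RC5 basics and how to decode a bi-phase (Manchester) data stream. As a rough illustration only, the sketch below decodes such a stream once it has been reduced to half-bit levels (two samples per bit); the function name and the sampling model are invented for this example and are not taken from any of the drivers touched here.

/*
 * Illustrative sketch: decode RC5-style bi-phase data from half-bit samples.
 * In RC5 a logical '1' is a low-to-high transition in the middle of the bit
 * period and a '0' is high-to-low; real drivers work from edge timestamps.
 */
#include <stdio.h>

static int rc5_decode_halfbits(const unsigned char *half, int nbits,
			       unsigned int *out)
{
	unsigned int code = 0;
	int i;

	for (i = 0; i < nbits; i++) {
		unsigned char first  = half[2 * i];
		unsigned char second = half[2 * i + 1];

		if (first == 0 && second == 1)
			code = (code << 1) | 1;	/* rising edge mid-bit: '1' */
		else if (first == 1 && second == 0)
			code = code << 1;	/* falling edge mid-bit: '0' */
		else
			return -1;		/* missing mid-bit transition */
	}
	*out = code;
	return 0;
}

int main(void)
{
	unsigned char half[] = { 0, 1, 1, 0, 0, 1, 0, 1 };	/* bits 1,0,1,1 */
	unsigned int code;

	if (!rc5_decode_halfbits(half, 4, &code))
		printf("decoded 0x%x\n", code);			/* 0xb */
	return 0;
}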
diff --git a/Documentation/video4linux/hauppauge-wintv-cx88-ir.txt b/Documentation/video4linux/hauppauge-wintv-cx88-ir.txt index faccee68f60..a2fd363c40c 100644 --- a/Documentation/video4linux/hauppauge-wintv-cx88-ir.txt +++ b/Documentation/video4linux/hauppauge-wintv-cx88-ir.txt @@ -44,7 +44,7 @@ http://www.atmel.com/dyn/resources/prod_documents/doc2817.pdf This data sheet (google search) seems to have a lovely description of the RC5 basics -http://users.pandora.be/nenya/electronics/rc5/ and more data +http://www.nenya.be/beor/electronics/rc5.htm and more data http://www.ee.washington.edu/circuit_archive/text/ir_decode.txt and even a reference to how to decode a bi-phase data stream. diff --git a/Documentation/video4linux/ibmcam.txt b/Documentation/video4linux/ibmcam.txt index 397a94eb77b..a51055211e6 100644 --- a/Documentation/video4linux/ibmcam.txt +++ b/Documentation/video4linux/ibmcam.txt @@ -27,9 +27,8 @@ SUPPORTED CAMERAS: Xirlink "C-It" camera, also known as "IBM PC Camera". The device uses proprietary ASIC (and compression method); -it is manufactured by Xirlink. See http://www.xirlink.com/ -(renamed to http://www.veo.com), http://www.ibmpccamera.com, -or http://www.c-itnow.com/ for details and pictures. +it is manufactured by Xirlink. See http://xirlinkwebcam.sourceforge.net, +http://www.ibmpccamera.com, or http://www.c-itnow.com/ for details and pictures. This very chipset ("X Chip", as marked at the factory) is used in several other cameras, and they are supported diff --git a/Documentation/video4linux/se401.txt b/Documentation/video4linux/se401.txt index 7b9d1c960a1..bd6526ec8dd 100644 --- a/Documentation/video4linux/se401.txt +++ b/Documentation/video4linux/se401.txt @@ -49,6 +49,6 @@ order to increase the throughput (and thus framerate). HELP: The latest info on this driver can be found at: -http://www.chello.nl/~j.vreeken/se401/ +http://members.chello.nl/~j.vreeken/se401/ And questions to me can be send to: pe1rxq@amsat.org diff --git a/Documentation/video4linux/w9966.txt b/Documentation/video4linux/w9966.txt index 78a651254b8..855024525fd 100644 --- a/Documentation/video4linux/w9966.txt +++ b/Documentation/video4linux/w9966.txt @@ -24,7 +24,7 @@ where every two pixels take 4 bytes. In SDL (www.libsdl.org) this format is called VIDEO_PALETTE_YUV422 (16 bpp). A minimal test application (with source) is available from: - http://hem.fyristorg.com/mogul/w9966.html + http://www.slackwaresupport.com/howtos/Webcam-HOWTO The slow framerate is due to missing DMA ECP read support in the parport drivers. I might add working EPP support later. 
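The w9966 text above describes the packed 16 bpp YUV 4:2:2 layout in which every two pixels take 4 bytes. The fragment below sketches how such a buffer could be unpacked in user space; the exact byte order (YUYV: Y0 U Y1 V) is an assumption for illustration, since packed 4:2:2 variants differ in where the luma and chroma bytes sit.

/*
 * Sketch: unpack a packed YUYV (YUV 4:2:2) frame. Each 4-byte group holds
 * two horizontally adjacent pixels that share one U/V pair:
 *   byte 0: Y0, byte 1: U, byte 2: Y1, byte 3: V   (assumed ordering)
 */
#include <stdint.h>
#include <stddef.h>

struct yuv_pixel {
	uint8_t y, u, v;
};

static void unpack_yuyv(const uint8_t *src, struct yuv_pixel *dst,
			size_t width, size_t height)
{
	size_t npairs = width * height / 2;
	size_t i;

	for (i = 0; i < npairs; i++) {
		uint8_t y0 = src[4 * i + 0];
		uint8_t u  = src[4 * i + 1];
		uint8_t y1 = src[4 * i + 2];
		uint8_t v  = src[4 * i + 3];

		dst[2 * i]     = (struct yuv_pixel){ y0, u, v };
		dst[2 * i + 1] = (struct yuv_pixel){ y1, u, v };
	}
}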
diff --git a/Documentation/w1/masters/ds2482 b/Documentation/w1/masters/ds2482 index 299b91c7609..56f8edace6a 100644 --- a/Documentation/w1/masters/ds2482 +++ b/Documentation/w1/masters/ds2482 @@ -6,8 +6,8 @@ Supported chips: Prefix: 'ds2482' Addresses scanned: None Datasheets: - http://pdfserv.maxim-ic.com/en/ds/DS2482-100-DS2482S-100.pdf - http://pdfserv.maxim-ic.com/en/ds/DS2482-800-DS2482S-800.pdf + http://datasheets.maxim-ic.com/en/ds/DS2482-100.pdf + http://datasheets.maxim-ic.com/en/ds/DS2482-800.pdf Author: Ben Gardner diff --git a/Documentation/w1/masters/mxc-w1 b/Documentation/w1/masters/mxc-w1 index 97f6199a7f3..38be1ad6553 100644 --- a/Documentation/w1/masters/mxc-w1 +++ b/Documentation/w1/masters/mxc-w1 @@ -5,7 +5,8 @@ Supported chips: * Freescale MX27, MX31 and probably other i.MX SoCs Datasheets: http://www.freescale.com/files/32bit/doc/data_sheet/MCIMX31.pdf?fpsp=1 - http://www.freescale.com/files/dsp/MCIMX27.pdf?fpsp=1 + http://cache.freescale.com/files/dsp/doc/archive/MCIMX27.pdf?fsrch=1&WT_TYPE= + Data%20Sheets&WT_VENDOR=FREESCALE&WT_FILE_FORMAT=pdf&WT_ASSET=Documentation Author: Originally based on Freescale code, prepared for mainline by Sascha Hauer diff --git a/Documentation/w1/masters/omap-hdq b/Documentation/w1/masters/omap-hdq index ca722e09b6a..884dc284b21 100644 --- a/Documentation/w1/masters/omap-hdq +++ b/Documentation/w1/masters/omap-hdq @@ -7,7 +7,7 @@ Supported chips: A useful link about HDQ basics: =============================== -http://focus.ti.com/lit/an/slua408/slua408.pdf +http://focus.ti.com/lit/an/slua408a/slua408a.pdf Description: ============ diff --git a/Documentation/zh_CN/HOWTO b/Documentation/zh_CN/HOWTO index 3d80e8af36e..69160779e43 100644 --- a/Documentation/zh_CN/HOWTO +++ b/Documentation/zh_CN/HOWTO @@ -112,7 +112,7 @@ Linux内核代码中包含有大量的文档。这些文档对于学习如何与 其他关于如何正确地生成补丁的优秀文档包括: "The Perfect Patch" - http://www.zip.com.au/~akpm/linux/patches/stuff/tpp.txt + http://userweb.kernel.org/~akpm/stuff/tpp.txt "Linux kernel patch submission format" http://linux.yyz.us/patch-format.html @@ -168,7 +168,7 @@ Linux内核代码中包含有大量的文档。这些文档对于学习如何与 如果你想加入内核开发社区并协助完成一些任务,却找不到从哪里开始,可以访问 “Linux内核房管员”计划: - http://janitor.kernelnewbies.org/ + http://kernelnewbies.org/KernelJanitors 这是极佳的起点。它提供一个相对简单的任务列表,列出内核代码中需要被重新 整理或者改正的地方。通过和负责这个计划的开发者们一同工作,你会学到将补丁 集成进内核的基本原理。如果还没有决定下一步要做什么的话,你还可能会得到方 @@ -515,7 +515,7 @@ Linux内核社区并不喜欢一下接收大段的代码。修改需要被恰当 想了解它具体应该看起来像什么,请查阅以下文档中的“ChangeLog”章节: “The Perfect Patch” - http://www.zip.com.au/~akpm/linux/patches/stuff/tpp.txt + http://userweb.kernel.org/~akpm/stuff/tpp.txt 这些事情有时候做起来很难。要在任何方面都做到完美可能需要好几年时间。这是 @@ -525,7 +525,7 @@ Linux内核社区并不喜欢一下接收大段的代码。修改需要被恰当 --------------- 感谢Paolo Ciarrocchi允许“开发流程”部分基于他所写的文章 -(http://linux.tar.bz/articles/2.6-development_process),感谢Randy +(http://www.kerneltravel.net/newbie/2.6-development_process),感谢Randy Dunlap和Gerrit Huizenga完善了应该说和不该说的列表。感谢Pat Mochel, Hanna Linder, Randy Dunlap, Kay Sievers, Vojtech Pavlik, Jan Kara, Josh Boyer, Kees Cook, Andrew Morton, Andi Kleen, Vadim Lobanov, Jesper Juhl, Adrian diff --git a/Documentation/zh_CN/SubmittingDrivers b/Documentation/zh_CN/SubmittingDrivers index 5f4815c63ec..c27b0f6cdd3 100644 --- a/Documentation/zh_CN/SubmittingDrivers +++ b/Documentation/zh_CN/SubmittingDrivers @@ -165,4 +165,4 @@ Linux USB项目: http://www.fenrus.org/how-to-not-write-a-device-driver-paper.pdf 内核清洁工 (Kernel Janitor): - http://janitor.kernelnewbies.org/ + http://kernelnewbies.org/KernelJanitors diff --git a/Documentation/zh_CN/SubmittingPatches b/Documentation/zh_CN/SubmittingPatches 
index 985c92e20b7..9a1a6e1ed09 100644 --- a/Documentation/zh_CN/SubmittingPatches +++ b/Documentation/zh_CN/SubmittingPatches @@ -83,7 +83,7 @@ Quilt: http://savannah.nongnu.org/projects/quilt Andrew Morton 的补丁脚本: -http://www.zip.com.au/~akpm/linux/patches/ +http://userweb.kernel.org/~akpm/stuff/patch-scripts.tar.gz 作为这些脚本的替代,quilt 是值得推荐的补丁管理工具(看上面的链接)。 2)描述你的改动。 @@ -166,7 +166,7 @@ MAITAINERS 文件里的)发送一个手册页(man-pages)补丁,或者至 人拷贝,只要它是琐碎的) 任何文件的作者/维护者对该文件的改动(例如 patch monkey 在重传模式下) -URL: +EMAIL: trivial@kernel.org (译注,关于“琐碎补丁”的一些说明:因为原文的这一部分写得比较简单,所以不得不 违例写一下译注。"trivial"这个英文单词的本意是“琐碎的,不重要的。”但是在这里 @@ -394,7 +394,7 @@ Static inline 函数相比宏来说,是好得多的选择。Static inline 函 ---------------- Andrew Morton, "The perfect patch" (tpp). - + Jeff Garzik, "Linux kernel patch submission format". -- cgit v1.2.3 From 30a69000a4ba9cf49e8b826431847cc80881b59b Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 20 Jul 2010 15:22:05 -0700 Subject: sysfs: fix discrepancies between implementation and documentation Fix all discrepancies I know of between the sysfs implementation and its documentation. Signed-off-by: Bart Van Assche Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- Documentation/filesystems/sysfs.txt | 44 +++++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 19 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/sysfs.txt b/Documentation/filesystems/sysfs.txt index 931c806642c..d78ed0bb275 100644 --- a/Documentation/filesystems/sysfs.txt +++ b/Documentation/filesystems/sysfs.txt @@ -4,7 +4,7 @@ sysfs - _The_ filesystem for exporting kernel objects. Patrick Mochel Mike Murphy -Revised: 22 February 2009 +Revised: 10 July 2010 Original: 10 January 2003 @@ -124,7 +124,7 @@ show and store methods of the attribute owners. struct sysfs_ops { ssize_t (*show)(struct kobject *, struct attribute *, char *); - ssize_t (*store)(struct kobject *, struct attribute *, const char *); + ssize_t (*store)(struct kobject *, struct attribute *, const char *, size_t); }; [ Subsystems should have already defined a struct kobj_type as a @@ -139,18 +139,22 @@ calls the associated methods. To illustrate: +#define to_dev(obj) container_of(obj, struct device, kobj) #define to_dev_attr(_attr) container_of(_attr, struct device_attribute, attr) -#define to_dev(d) container_of(d, struct device, kobj) -static ssize_t -dev_attr_show(struct kobject * kobj, struct attribute * attr, char * buf) +static ssize_t dev_attr_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct device_attribute * dev_attr = to_dev_attr(attr); - struct device * dev = to_dev(kobj); - ssize_t ret = 0; + struct device_attribute *dev_attr = to_dev_attr(attr); + struct device *dev = to_dev(kobj); + ssize_t ret = -EIO; if (dev_attr->show) - ret = dev_attr->show(dev, buf); + ret = dev_attr->show(dev, dev_attr, buf); + if (ret >= (ssize_t)PAGE_SIZE) { + print_symbol("dev_attr_show: %s returned bad count\n", + (unsigned long)dev_attr->show); + } return ret; } @@ -163,10 +167,9 @@ To read or write attributes, show() or store() methods must be specified when declaring the attribute. 
The method types should be as simple as those defined for device attributes: -ssize_t (*show)(struct device * dev, struct device_attribute * attr, - char * buf); -ssize_t (*store)(struct device * dev, struct device_attribute * attr, - const char * buf); +ssize_t (*show)(struct device *dev, struct device_attribute *attr, char *buf); +ssize_t (*store)(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count); IOW, they should take only an object, an attribute, and a buffer as parameters. @@ -209,8 +212,8 @@ Other notes: - show() should always use snprintf(). -- store() should return the number of bytes used from the buffer. This - can be done using strlen(). +- store() should return the number of bytes used from the buffer. If the + entire buffer has been used, just return the count argument. - show() or store() can always return errors. If a bad value comes through, be sure to return an error. @@ -223,15 +226,18 @@ Other notes: A very simple (and naive) implementation of a device attribute is: -static ssize_t show_name(struct device *dev, struct device_attribute *attr, char *buf) +static ssize_t show_name(struct device *dev, struct device_attribute *attr, + char *buf) { return snprintf(buf, PAGE_SIZE, "%s\n", dev->name); } -static ssize_t store_name(struct device * dev, const char * buf) +static ssize_t store_name(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) { - sscanf(buf, "%20s", dev->name); - return strnlen(buf, PAGE_SIZE); + snprintf(dev->name, sizeof(dev->name), "%.*s", + (int)min(count, sizeof(dev->name) - 1), buf); + return count; } static DEVICE_ATTR(name, S_IRUGO, show_name, store_name); -- cgit v1.2.3 From a5307032718b1e90e6d07008d7fd44d1446db7d7 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 15 Jul 2010 11:34:44 -0700 Subject: sysfs: Fix one more signature discrepancy between sysfs implementation and docs. Signed-off-by: Ira Weiny Signed-off-by: Greg Kroah-Hartman --- Documentation/filesystems/sysfs.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/sysfs.txt b/Documentation/filesystems/sysfs.txt index d78ed0bb275..5d1335faec2 100644 --- a/Documentation/filesystems/sysfs.txt +++ b/Documentation/filesystems/sysfs.txt @@ -4,7 +4,7 @@ sysfs - _The_ filesystem for exporting kernel objects. Patrick Mochel Mike Murphy -Revised: 10 July 2010 +Revised: 15 July 2010 Original: 10 January 2003 @@ -333,7 +333,7 @@ Structure: struct bus_attribute { struct attribute attr; ssize_t (*show)(struct bus_type *, char * buf); - ssize_t (*store)(struct bus_type *, const char * buf); + ssize_t (*store)(struct bus_type *, const char * buf, size_t count); }; Declaring: -- cgit v1.2.3 From 4b676d2dbed3dadc6ef913d58f85360547fa071e Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Thu, 5 Aug 2010 23:42:54 +0100 Subject: Squashfs: update Kconfig and documentation for LZO Update compression types supported and add some help text for the LZO Kconfig option. Also add missing "default n" line and make some trivial whitespace cleanups too. 
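Before the Squashfs changes that follow, it may help to see the corrected sysfs signatures from the two patches above gathered into one sketch. The attribute name ("label"), struct my_device and its drvdata use are invented for illustration; this mirrors the documented interface and is not code from either patch.

/*
 * Minimal sketch of a read/write device attribute using the show()/store()
 * prototypes documented above. "label" and struct my_device are hypothetical.
 */
#include <linux/device.h>

struct my_device {
	char label[32];
};

static ssize_t label_show(struct device *dev, struct device_attribute *attr,
			  char *buf)
{
	struct my_device *mydev = dev_get_drvdata(dev);

	return snprintf(buf, PAGE_SIZE, "%s\n", mydev->label);
}

static ssize_t label_store(struct device *dev, struct device_attribute *attr,
			   const char *buf, size_t count)
{
	struct my_device *mydev = dev_get_drvdata(dev);

	snprintf(mydev->label, sizeof(mydev->label), "%.*s",
		 (int)min(count, sizeof(mydev->label) - 1), buf);
	return count;	/* the whole buffer was consumed */
}
static DEVICE_ATTR(label, S_IRUGO | S_IWUSR, label_show, label_store);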
Signed-off-by: Phillip Lougher --- Documentation/filesystems/squashfs.txt | 2 +- fs/squashfs/Kconfig | 18 ++++++++++++++---- 2 files changed, 15 insertions(+), 5 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/squashfs.txt b/Documentation/filesystems/squashfs.txt index 203f7202cc9..66699afd66c 100644 --- a/Documentation/filesystems/squashfs.txt +++ b/Documentation/filesystems/squashfs.txt @@ -2,7 +2,7 @@ SQUASHFS 4.0 FILESYSTEM ======================= Squashfs is a compressed read-only filesystem for Linux. -It uses zlib compression to compress files, inodes and directories. +It uses zlib/lzo compression to compress files, inodes and directories. Inodes in the system are very small and all blocks are packed to minimise data overhead. Block sizes greater than 4K are supported up to a maximum of 1Mbytes (default block size 128K). diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig index 3da01108c44..e5f63da64d0 100644 --- a/fs/squashfs/Kconfig +++ b/fs/squashfs/Kconfig @@ -5,13 +5,13 @@ config SQUASHFS help Saying Y here includes support for SquashFS 4.0 (a Compressed Read-Only File System). Squashfs is a highly compressed read-only - filesystem for Linux. It uses zlib compression to compress both + filesystem for Linux. It uses zlib/lzo compression to compress both files, inodes and directories. Inodes in the system are very small and all blocks are packed to minimise data overhead. Block sizes greater than 4K are supported up to a maximum of 1 Mbytes (default block size 128K). SquashFS 4.0 supports 64 bit filesystems and files (larger than 4GB), full uid/gid information, hard links and - timestamps. + timestamps. Squashfs is intended for general read-only filesystem use, for archival use (i.e. in cases where a .tar.gz file may be used), and in @@ -40,11 +40,21 @@ config SQUASHFS_XATTR config SQUASHFS_LZO bool "Include support for LZO compressed file systems" depends on SQUASHFS + default n select LZO_DECOMPRESS + help + Saying Y here includes support for reading Squashfs file systems + compressed with LZO compresssion. LZO compression is mainly + aimed at embedded systems with slower CPUs where the overheads + of zlib are too high. -config SQUASHFS_EMBEDDED + LZO is not the standard compression used in Squashfs and so most + file systems will be readable without selecting this option. - bool "Additional option for memory-constrained systems" + If unsure, say N. + +config SQUASHFS_EMBEDDED + bool "Additional option for memory-constrained systems" depends on SQUASHFS default n help -- cgit v1.2.3 From 1e2317350971c8b01e6adddc798a00e9bcc1a440 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 7 Jun 2010 09:29:20 +0200 Subject: update documentation for the new truncate sequence Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- Documentation/filesystems/porting | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting index a7e9746ee7e..f9547a5c187 100644 --- a/Documentation/filesystems/porting +++ b/Documentation/filesystems/porting @@ -273,3 +273,21 @@ it's safe to remove it. If you don't need it, remove it. deliberate; as soon as struct block_device * is propagated in a reasonable way by that code fixing will become trivial; until then nothing can be done. 
+ +[mandatory] + + block truncatation on error exit from ->write_begin, and ->direct_IO +moved from generic methods (block_write_begin, cont_write_begin, +nobh_write_begin, blockdev_direct_IO*) to callers. Take a look at +ext2_write_failed and callers for an example. + +[mandatory] + + ->truncate is going away. The whole truncate sequence needs to be +implemented in ->setattr, which is now mandatory for filesystems +implementing on-disk size changes. Start with a copy of the old inode_setattr +and vmtruncate, and the reorder the vmtruncate + foofs_vmtruncate sequence to +be in order of zeroing blocks using block_truncate_page or similar helpers, +size update and on finally on-disk truncation which should not fail. +inode_change_ok now includes the size checks for ATTR_SIZE and must be called +in the beginning of ->setattr unconditionally. -- cgit v1.2.3 From 336fb3b97b78edc65bae0b223b83bf676cfe29e2 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 8 Jun 2010 00:37:12 -0400 Subject: update VFS documentation for method changes. Signed-off-by: Al Viro --- Documentation/filesystems/Locking | 22 ++++++++++++---------- Documentation/filesystems/porting | 27 +++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 10 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 96d4293607e..bbcc15651a2 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -92,8 +92,8 @@ prototypes: void (*destroy_inode)(struct inode *); void (*dirty_inode) (struct inode *); int (*write_inode) (struct inode *, int); - void (*drop_inode) (struct inode *); - void (*delete_inode) (struct inode *); + int (*drop_inode) (struct inode *); + void (*evict_inode) (struct inode *); void (*put_super) (struct super_block *); void (*write_super) (struct super_block *); int (*sync_fs)(struct super_block *sb, int wait); @@ -101,14 +101,13 @@ prototypes: int (*unfreeze_fs) (struct super_block *); int (*statfs) (struct dentry *, struct kstatfs *); int (*remount_fs) (struct super_block *, int *, char *); - void (*clear_inode) (struct inode *); void (*umount_begin) (struct super_block *); int (*show_options)(struct seq_file *, struct vfsmount *); ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); locking rules: - All may block. + All may block [not true, see below] None have BKL s_umount alloc_inode: @@ -116,22 +115,25 @@ destroy_inode: dirty_inode: (must not sleep) write_inode: drop_inode: !!!inode_lock!!! -delete_inode: +evict_inode: put_super: write write_super: read sync_fs: read freeze_fs: read unfreeze_fs: read -statfs: no -remount_fs: maybe (see below) -clear_inode: +statfs: maybe(read) (see below) +remount_fs: write umount_begin: no show_options: no (namespace_sem) quota_read: no (see below) quota_write: no (see below) -->remount_fs() will have the s_umount exclusive lock if it's already mounted. -When called from get_sb_single, it does NOT have the s_umount lock. +->statfs() has s_umount (shared) when called by ustat(2) (native or +compat), but that's an accident of bad API; s_umount is used to pin +the superblock down when we only have dev_t given us by userland to +identify the superblock. Everything else (statfs(), fstatfs(), etc.) +doesn't hold it when calling ->statfs() - superblock is pinned down +by resolving the pathname passed to syscall. 
->quota_read() and ->quota_write() functions are both guaranteed to be the only ones operating on the quota file by the quota code (via dqio_sem) (unless an admin really wants to screw up something and diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting index f9547a5c187..b12c8953868 100644 --- a/Documentation/filesystems/porting +++ b/Documentation/filesystems/porting @@ -291,3 +291,30 @@ be in order of zeroing blocks using block_truncate_page or similar helpers, size update and on finally on-disk truncation which should not fail. inode_change_ok now includes the size checks for ATTR_SIZE and must be called in the beginning of ->setattr unconditionally. + +[mandatory] + + ->clear_inode() and ->delete_inode() are gone; ->evict_inode() should +be used instead. It gets called whenever the inode is evicted, whether it has +remaining links or not. Caller does *not* evict the pagecache or inode-associated +metadata buffers; getting rid of those is responsibility of method, as it had +been for ->delete_inode(). + ->drop_inode() returns int now; it's called on final iput() with inode_lock +held and it returns true if filesystems wants the inode to be dropped. As before, +generic_drop_inode() is still the default and it's been updated appropriately. +generic_delete_inode() is also alive and it consists simply of return 1. Note that +all actual eviction work is done by caller after ->drop_inode() returns. + clear_inode() is gone; use end_writeback() instead. As before, it must +be called exactly once on each call of ->evict_inode() (as it used to be for +each call of ->delete_inode()). Unlike before, if you are using inode-associated +metadata buffers (i.e. mark_buffer_dirty_inode()), it's your responsibility to +call invalidate_inode_buffers() before end_writeback(). + No async writeback (and thus no calls of ->write_inode()) will happen +after end_writeback() returns, so actions that should not overlap with ->write_inode() +(e.g. freeing on-disk inode if i_nlink is 0) ought to be done after that call. + + NOTE: checking i_nlink in the beginning of ->write_inode() and bailing out +if it's zero is not *and* *never* *had* *been* enough. Final unlink() and iput() +may happen while the inode is in the middle of ->write_inode(); e.g. if you blindly +free the on-disk inode, you may end up doing that while ->write_inode() is writing +to it. -- cgit v1.2.3 From a63d83f427fbce97a6cea0db2e64b0eb8435cd10 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 9 Aug 2010 17:19:46 -0700 Subject: oom: badness heuristic rewrite This a complete rewrite of the oom killer's badness() heuristic which is used to determine which task to kill in oom conditions. The goal is to make it as simple and predictable as possible so the results are better understood and we end up killing the task which will lead to the most memory freeing while still respecting the fine-tuning from userspace. Instead of basing the heuristic on mm->total_vm for each task, the task's rss and swap space is used instead. This is a better indication of the amount of memory that will be freeable if the oom killed task is chosen and subsequently exits. This helps specifically in cases where KDE or GNOME is chosen for oom kill on desktop systems instead of a memory hogging task. The baseline for the heuristic is a proportion of memory that each task is currently using in memory plus swap compared to the amount of "allowable" memory. 
"Allowable," in this sense, means the system-wide resources for unconstrained oom conditions, the set of mempolicy nodes, the mems attached to current's cpuset, or a memory controller's limit. The proportion is given on a scale of 0 (never kill) to 1000 (always kill), roughly meaning that if a task has a badness() score of 500 that the task consumes approximately 50% of allowable memory resident in RAM or in swap space. The proportion is always relative to the amount of "allowable" memory and not the total amount of RAM systemwide so that mempolicies and cpusets may operate in isolation; they shall not need to know the true size of the machine on which they are running if they are bound to a specific set of nodes or mems, respectively. Root tasks are given 3% extra memory just like __vm_enough_memory() provides in LSMs. In the event of two tasks consuming similar amounts of memory, it is generally better to save root's task. Because of the change in the badness() heuristic's baseline, it is also necessary to introduce a new user interface to tune it. It's not possible to redefine the meaning of /proc/pid/oom_adj with a new scale since the ABI cannot be changed for backward compatability. Instead, a new tunable, /proc/pid/oom_score_adj, is added that ranges from -1000 to +1000. It may be used to polarize the heuristic such that certain tasks are never considered for oom kill while others may always be considered. The value is added directly into the badness() score so a value of -500, for example, means to discount 50% of its memory consumption in comparison to other tasks either on the system, bound to the mempolicy, in the cpuset, or sharing the same memory controller. /proc/pid/oom_adj is changed so that its meaning is rescaled into the units used by /proc/pid/oom_score_adj, and vice versa. Changing one of these per-task tunables will rescale the value of the other to an equivalent meaning. Although /proc/pid/oom_adj was originally defined as a bitshift on the badness score, it now shares the same linear growth as /proc/pid/oom_score_adj but with different granularity. This is required so the ABI is not broken with userspace applications and allows oom_adj to be deprecated for future removal. Signed-off-by: David Rientjes Cc: Nick Piggin Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Oleg Nesterov Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 94 ++++++++------ fs/proc/base.c | 94 +++++++++++++- include/linux/memcontrol.h | 8 ++ include/linux/oom.h | 14 +- include/linux/sched.h | 3 +- kernel/fork.c | 1 + mm/memcontrol.c | 18 +++ mm/oom_kill.c | 259 ++++++++++++++++--------------------- 8 files changed, 300 insertions(+), 191 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 8fe8895894d..cf1295c2bb6 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -33,7 +33,8 @@ Table of Contents 2 Modifying System Parameters 3 Per-Process Parameters - 3.1 /proc//oom_adj - Adjust the oom-killer score + 3.1 /proc//oom_adj & /proc//oom_score_adj - Adjust the oom-killer + score 3.2 /proc//oom_score - Display current oom-killer score 3.3 /proc//io - Display the IO accounting fields 3.4 /proc//coredump_filter - Core dump filtering settings @@ -1234,42 +1235,61 @@ of the kernel. 
CHAPTER 3: PER-PROCESS PARAMETERS ------------------------------------------------------------------------------ -3.1 /proc//oom_adj - Adjust the oom-killer score ------------------------------------------------------- - -This file can be used to adjust the score used to select which processes -should be killed in an out-of-memory situation. Giving it a high score will -increase the likelihood of this process being killed by the oom-killer. Valid -values are in the range -16 to +15, plus the special value -17, which disables -oom-killing altogether for this process. - -The process to be killed in an out-of-memory situation is selected among all others -based on its badness score. This value equals the original memory size of the process -and is then updated according to its CPU time (utime + stime) and the -run time (uptime - start time). The longer it runs the smaller is the score. -Badness score is divided by the square root of the CPU time and then by -the double square root of the run time. - -Swapped out tasks are killed first. Half of each child's memory size is added to -the parent's score if they do not share the same memory. Thus forking servers -are the prime candidates to be killed. Having only one 'hungry' child will make -parent less preferable than the child. - -/proc//oom_score shows process' current badness score. - -The following heuristics are then applied: - * if the task was reniced, its score doubles - * superuser or direct hardware access tasks (CAP_SYS_ADMIN, CAP_SYS_RESOURCE - or CAP_SYS_RAWIO) have their score divided by 4 - * if oom condition happened in one cpuset and checked process does not belong - to it, its score is divided by 8 - * the resulting score is multiplied by two to the power of oom_adj, i.e. - points <<= oom_adj when it is positive and - points >>= -(oom_adj) otherwise - -The task with the highest badness score is then selected and its children -are killed, process itself will be killed in an OOM situation when it does -not have children or some of them disabled oom like described above. +3.1 /proc//oom_adj & /proc//oom_score_adj- Adjust the oom-killer score +-------------------------------------------------------------------------------- + +These file can be used to adjust the badness heuristic used to select which +process gets killed in out of memory conditions. + +The badness heuristic assigns a value to each candidate task ranging from 0 +(never kill) to 1000 (always kill) to determine which process is targeted. The +units are roughly a proportion along that range of allowed memory the process +may allocate from based on an estimation of its current memory and swap use. +For example, if a task is using all allowed memory, its badness score will be +1000. If it is using half of its allowed memory, its score will be 500. + +There is an additional factor included in the badness score: root +processes are given 3% extra memory over other tasks. + +The amount of "allowed" memory depends on the context in which the oom killer +was called. If it is due to the memory assigned to the allocating task's cpuset +being exhausted, the allowed memory represents the set of mems assigned to that +cpuset. If it is due to a mempolicy's node(s) being exhausted, the allowed +memory represents the set of mempolicy nodes. If it is due to a memory +limit (or swap limit) being reached, the allowed memory is that configured +limit. Finally, if it is due to the entire system being out of memory, the +allowed memory represents all allocatable resources. 
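To make the proportions described above concrete: on a machine with 4 GiB of allowable memory, a task with 1 GiB of rss and no swap use scores roughly 250 out of 1000, and an oom_score_adj of +500 lifts that to about 750. The numbers and the user-space helper below are illustrative approximations of the heuristic, not the kernel implementation (the 3% root bonus, for instance, is omitted).

/*
 * User-space approximation of the badness proportion described above:
 * points = (rss + swap) * 1000 / allowed, plus oom_score_adj, clamped to
 * [0, 1000]. "allowed" is whatever total the kernel picks for the context
 * (system, cpuset, mempolicy or memcg).
 */
#include <stdio.h>

static long approx_badness(unsigned long rss_pages, unsigned long swap_pages,
			   unsigned long allowed_pages, int oom_score_adj)
{
	long points;

	if (!allowed_pages)
		allowed_pages = 1;		/* avoid divide by zero */

	points = (long)((rss_pages + swap_pages) * 1000 / allowed_pages);
	points += oom_score_adj;

	if (points < 0)
		return 0;
	return points > 1000 ? 1000 : points;
}

int main(void)
{
	/* 1 GiB resident out of 4 GiB allowed, 4 KiB pages */
	printf("%ld\n", approx_badness(262144, 0, 1048576, 0));	/* 250 */
	printf("%ld\n", approx_badness(262144, 0, 1048576, 500));	/* 750 */
	return 0;
}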
+ +The value of /proc//oom_score_adj is added to the badness score before it +is used to determine which task to kill. Acceptable values range from -1000 +(OOM_SCORE_ADJ_MIN) to +1000 (OOM_SCORE_ADJ_MAX). This allows userspace to +polarize the preference for oom killing either by always preferring a certain +task or completely disabling it. The lowest possible value, -1000, is +equivalent to disabling oom killing entirely for that task since it will always +report a badness score of 0. + +Consequently, it is very simple for userspace to define the amount of memory to +consider for each task. Setting a /proc//oom_score_adj value of +500, for +example, is roughly equivalent to allowing the remainder of tasks sharing the +same system, cpuset, mempolicy, or memory controller resources to use at least +50% more memory. A value of -500, on the other hand, would be roughly +equivalent to discounting 50% of the task's allowed memory from being considered +as scoring against the task. + +For backwards compatibility with previous kernels, /proc//oom_adj may also +be used to tune the badness score. Its acceptable values range from -16 +(OOM_ADJUST_MIN) to +15 (OOM_ADJUST_MAX) and a special value of -17 +(OOM_DISABLE) to disable oom killing entirely for that task. Its value is +scaled linearly with /proc//oom_score_adj. + +Writing to /proc//oom_score_adj or /proc//oom_adj will change the +other with its scaled value. + +Caveat: when a parent task is selected, the oom killer will sacrifice any first +generation children with seperate address spaces instead, if possible. This +avoids servers and important system daemons from being killed and loses the +minimal amount of work. + 3.2 /proc//oom_score - Display current oom-killer score ------------------------------------------------------------- diff --git a/fs/proc/base.c b/fs/proc/base.c index 5949d0ac30f..f923b728388 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -63,6 +63,7 @@ #include #include #include +#include #include #include #include @@ -430,12 +431,11 @@ static const struct file_operations proc_lstats_operations = { static int proc_oom_score(struct task_struct *task, char *buffer) { unsigned long points = 0; - struct timespec uptime; - do_posix_clock_monotonic_gettime(&uptime); read_lock(&tasklist_lock); if (pid_alive(task)) - points = badness(task, NULL, NULL, uptime.tv_sec); + points = oom_badness(task, NULL, NULL, + totalram_pages + total_swap_pages); read_unlock(&tasklist_lock); return sprintf(buffer, "%lu\n", points); } @@ -1038,7 +1038,15 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, } task->signal->oom_adj = oom_adjust; - + /* + * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum + * value is always attainable. 
+ */ + if (task->signal->oom_adj == OOM_ADJUST_MAX) + task->signal->oom_score_adj = OOM_SCORE_ADJ_MAX; + else + task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) / + -OOM_DISABLE; unlock_task_sighand(task, &flags); put_task_struct(task); @@ -1051,6 +1059,82 @@ static const struct file_operations proc_oom_adjust_operations = { .llseek = generic_file_llseek, }; +static ssize_t oom_score_adj_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); + char buffer[PROC_NUMBUF]; + int oom_score_adj = OOM_SCORE_ADJ_MIN; + unsigned long flags; + size_t len; + + if (!task) + return -ESRCH; + if (lock_task_sighand(task, &flags)) { + oom_score_adj = task->signal->oom_score_adj; + unlock_task_sighand(task, &flags); + } + put_task_struct(task); + len = snprintf(buffer, sizeof(buffer), "%d\n", oom_score_adj); + return simple_read_from_buffer(buf, count, ppos, buffer, len); +} + +static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task; + char buffer[PROC_NUMBUF]; + unsigned long flags; + long oom_score_adj; + int err; + + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + if (copy_from_user(buffer, buf, count)) + return -EFAULT; + + err = strict_strtol(strstrip(buffer), 0, &oom_score_adj); + if (err) + return -EINVAL; + if (oom_score_adj < OOM_SCORE_ADJ_MIN || + oom_score_adj > OOM_SCORE_ADJ_MAX) + return -EINVAL; + + task = get_proc_task(file->f_path.dentry->d_inode); + if (!task) + return -ESRCH; + if (!lock_task_sighand(task, &flags)) { + put_task_struct(task); + return -ESRCH; + } + if (oom_score_adj < task->signal->oom_score_adj && + !capable(CAP_SYS_RESOURCE)) { + unlock_task_sighand(task, &flags); + put_task_struct(task); + return -EACCES; + } + + task->signal->oom_score_adj = oom_score_adj; + /* + * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is + * always attainable. 
+ */ + if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) + task->signal->oom_adj = OOM_DISABLE; + else + task->signal->oom_adj = (oom_score_adj * OOM_ADJUST_MAX) / + OOM_SCORE_ADJ_MAX; + unlock_task_sighand(task, &flags); + put_task_struct(task); + return count; +} + +static const struct file_operations proc_oom_score_adj_operations = { + .read = oom_score_adj_read, + .write = oom_score_adj_write, +}; + #ifdef CONFIG_AUDITSYSCALL #define TMPBUFLEN 21 static ssize_t proc_loginuid_read(struct file * file, char __user * buf, @@ -2623,6 +2707,7 @@ static const struct pid_entry tgid_base_stuff[] = { #endif INF("oom_score", S_IRUGO, proc_oom_score), REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adjust_operations), + REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), #ifdef CONFIG_AUDITSYSCALL REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), REG("sessionid", S_IRUGO, proc_sessionid_operations), @@ -2957,6 +3042,7 @@ static const struct pid_entry tid_base_stuff[] = { #endif INF("oom_score", S_IRUGO, proc_oom_score), REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adjust_operations), + REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), #ifdef CONFIG_AUDITSYSCALL REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), REG("sessionid", S_IRUSR, proc_sessionid_operations), diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 9f1afd36158..73564cac38c 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -125,6 +125,8 @@ void mem_cgroup_update_file_mapped(struct page *page, int val); unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, gfp_t gfp_mask, int nid, int zid); +u64 mem_cgroup_get_limit(struct mem_cgroup *mem); + #else /* CONFIG_CGROUP_MEM_RES_CTLR */ struct mem_cgroup; @@ -304,6 +306,12 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, return 0; } +static inline +u64 mem_cgroup_get_limit(struct mem_cgroup *mem) +{ + return 0; +} + #endif /* CONFIG_CGROUP_MEM_CONT */ #endif /* _LINUX_MEMCONTROL_H */ diff --git a/include/linux/oom.h b/include/linux/oom.h index 40e5e3a6bc2..73b8d7b6dd1 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -1,14 +1,24 @@ #ifndef __INCLUDE_LINUX_OOM_H #define __INCLUDE_LINUX_OOM_H -/* /proc//oom_adj set to -17 protects from the oom-killer */ +/* + * /proc//oom_adj set to -17 protects from the oom-killer + */ #define OOM_DISABLE (-17) /* inclusive */ #define OOM_ADJUST_MIN (-16) #define OOM_ADJUST_MAX 15 +/* + * /proc//oom_score_adj set to OOM_SCORE_ADJ_MIN disables oom killing for + * pid. 
+ */ +#define OOM_SCORE_ADJ_MIN (-1000) +#define OOM_SCORE_ADJ_MAX 1000 + #ifdef __KERNEL__ +#include #include #include @@ -27,6 +37,8 @@ enum oom_constraint { CONSTRAINT_MEMCG, }; +extern unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem, + const nodemask_t *nodemask, unsigned long totalpages); extern int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags); extern void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags); diff --git a/include/linux/sched.h b/include/linux/sched.h index 9591907c4f7..ce160d68f5e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -621,7 +621,8 @@ struct signal_struct { struct tty_audit_buf *tty_audit_buf; #endif - int oom_adj; /* OOM kill score adjustment (bit shift) */ + int oom_adj; /* OOM kill score adjustment (bit shift) */ + int oom_score_adj; /* OOM kill score adjustment */ }; /* Context switch must be unlocked if interrupts are to be enabled */ diff --git a/kernel/fork.c b/kernel/fork.c index a82a65cef74..98b450876f9 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -899,6 +899,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) tty_audit_fork(sig); sig->oom_adj = current->signal->oom_adj; + sig->oom_score_adj = current->signal->oom_score_adj; return 0; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 31abd1c2c0c..de54ea0094a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1126,6 +1126,24 @@ static int mem_cgroup_count_children(struct mem_cgroup *mem) return num; } +/* + * Return the memory (and swap, if configured) limit for a memcg. + */ +u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) +{ + u64 limit; + u64 memsw; + + limit = res_counter_read_u64(&memcg->res, RES_LIMIT) + + total_swap_pages; + memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT); + /* + * If memsw is finite and limits the amount of swap space available + * to this memcg, return that limit. + */ + return min(limit, memsw); +} + /* * Visit the first child (need not be the first child as per the ordering * of the cgroup list, since we track last_scanned_child) of @mem and use diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 0a4ca8a0234..d3def05a33d 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -4,6 +4,8 @@ * Copyright (C) 1998,2000 Rik van Riel * Thanks go out to Claus Fischer for some serious inspiration and * for goading me into coding this file... + * Copyright (C) 2010 Google, Inc. + * Rewritten by David Rientjes * * The routines in this file are used to kill a process when * we're seriously out of memory. This gets called from __alloc_pages() @@ -34,7 +36,6 @@ int sysctl_panic_on_oom; int sysctl_oom_kill_allocating_task; int sysctl_oom_dump_tasks = 1; static DEFINE_SPINLOCK(zone_scan_lock); -/* #define DEBUG */ #ifdef CONFIG_NUMA /** @@ -140,137 +141,76 @@ static bool oom_unkillable_task(struct task_struct *p, struct mem_cgroup *mem, } /** - * badness - calculate a numeric value for how bad this task has been + * oom_badness - heuristic function to determine which candidate task to kill * @p: task struct of which task we should calculate - * @uptime: current uptime in seconds + * @totalpages: total present RAM allowed for page allocation * - * The formula used is relatively simple and documented inline in the - * function. The main rationale is that we want to select a good task - * to kill when we run out of memory. 
- * - * Good in this context means that: - * 1) we lose the minimum amount of work done - * 2) we recover a large amount of memory - * 3) we don't kill anything innocent of eating tons of memory - * 4) we want to kill the minimum amount of processes (one) - * 5) we try to kill the process the user expects us to kill, this - * algorithm has been meticulously tuned to meet the principle - * of least surprise ... (be careful when you change it) + * The heuristic for determining which task to kill is made to be as simple and + * predictable as possible. The goal is to return the highest value for the + * task consuming the most memory to avoid subsequent oom failures. */ -unsigned long badness(struct task_struct *p, struct mem_cgroup *mem, - const nodemask_t *nodemask, unsigned long uptime) +unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem, + const nodemask_t *nodemask, unsigned long totalpages) { - unsigned long points, cpu_time, run_time; - struct task_struct *child; - struct task_struct *c, *t; - int oom_adj = p->signal->oom_adj; - struct task_cputime task_time; - unsigned long utime; - unsigned long stime; + int points; if (oom_unkillable_task(p, mem, nodemask)) return 0; - if (oom_adj == OOM_DISABLE) - return 0; p = find_lock_task_mm(p); if (!p) return 0; /* - * The memory size of the process is the basis for the badness. - */ - points = p->mm->total_vm; - task_unlock(p); - - /* - * swapoff can easily use up all memory, so kill those first. - */ - if (p->flags & PF_OOM_ORIGIN) - return ULONG_MAX; - - /* - * Processes which fork a lot of child processes are likely - * a good choice. We add half the vmsize of the children if they - * have an own mm. This prevents forking servers to flood the - * machine with an endless amount of children. In case a single - * child is eating the vast majority of memory, adding only half - * to the parents will make the child our kill candidate of choice. + * Shortcut check for OOM_SCORE_ADJ_MIN so the entire heuristic doesn't + * need to be executed for something that cannot be killed. */ - t = p; - do { - list_for_each_entry(c, &t->children, sibling) { - child = find_lock_task_mm(c); - if (child) { - if (child->mm != p->mm) - points += child->mm->total_vm/2 + 1; - task_unlock(child); - } - } - } while_each_thread(p, t); + if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { + task_unlock(p); + return 0; + } /* - * CPU time is in tens of seconds and run time is in thousands - * of seconds. There is no particular reason for this other than - * that it turned out to work very well in practice. + * When the PF_OOM_ORIGIN bit is set, it indicates the task should have + * priority for oom killing. */ - thread_group_cputime(p, &task_time); - utime = cputime_to_jiffies(task_time.utime); - stime = cputime_to_jiffies(task_time.stime); - cpu_time = (utime + stime) >> (SHIFT_HZ + 3); - - - if (uptime >= p->start_time.tv_sec) - run_time = (uptime - p->start_time.tv_sec) >> 10; - else - run_time = 0; - - if (cpu_time) - points /= int_sqrt(cpu_time); - if (run_time) - points /= int_sqrt(int_sqrt(run_time)); + if (p->flags & PF_OOM_ORIGIN) { + task_unlock(p); + return 1000; + } /* - * Niced processes are most likely less important, so double - * their badness points. + * The memory controller may have a limit of 0 bytes, so avoid a divide + * by zero, if necessary. */ - if (task_nice(p) > 0) - points *= 2; + if (!totalpages) + totalpages = 1; /* - * Superuser processes are usually more important, so we make it - * less likely that we kill those. 
+ * The baseline for the badness score is the proportion of RAM that each + * task's rss and swap space use. */ - if (has_capability_noaudit(p, CAP_SYS_ADMIN) || - has_capability_noaudit(p, CAP_SYS_RESOURCE)) - points /= 4; + points = (get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS)) * 1000 / + totalpages; + task_unlock(p); /* - * We don't want to kill a process with direct hardware access. - * Not only could that mess up the hardware, but usually users - * tend to only have this flag set on applications they think - * of as important. + * Root processes get 3% bonus, just like the __vm_enough_memory() + * implementation used by LSMs. */ - if (has_capability_noaudit(p, CAP_SYS_RAWIO)) - points /= 4; + if (has_capability_noaudit(p, CAP_SYS_ADMIN)) + points -= 30; /* - * Adjust the score by oom_adj. + * /proc/pid/oom_score_adj ranges from -1000 to +1000 such that it may + * either completely disable oom killing or always prefer a certain + * task. */ - if (oom_adj) { - if (oom_adj > 0) { - if (!points) - points = 1; - points <<= oom_adj; - } else - points >>= -(oom_adj); - } + points += p->signal->oom_score_adj; -#ifdef DEBUG - printk(KERN_DEBUG "OOMkill: task %d (%s) got %lu points\n", - p->pid, p->comm, points); -#endif - return points; + if (points < 0) + return 0; + return (points < 1000) ? points : 1000; } /* @@ -278,12 +218,20 @@ unsigned long badness(struct task_struct *p, struct mem_cgroup *mem, */ #ifdef CONFIG_NUMA static enum oom_constraint constrained_alloc(struct zonelist *zonelist, - gfp_t gfp_mask, nodemask_t *nodemask) + gfp_t gfp_mask, nodemask_t *nodemask, + unsigned long *totalpages) { struct zone *zone; struct zoneref *z; enum zone_type high_zoneidx = gfp_zone(gfp_mask); + bool cpuset_limited = false; + int nid; + /* Default to all available memory */ + *totalpages = totalram_pages + total_swap_pages; + + if (!zonelist) + return CONSTRAINT_NONE; /* * Reach here only when __GFP_NOFAIL is used. So, we should avoid * to kill current.We have to random task kill in this case. @@ -293,26 +241,37 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist, return CONSTRAINT_NONE; /* - * The nodemask here is a nodemask passed to alloc_pages(). Now, - * cpuset doesn't use this nodemask for its hardwall/softwall/hierarchy - * feature. mempolicy is an only user of nodemask here. - * check mempolicy's nodemask contains all N_HIGH_MEMORY + * This is not a __GFP_THISNODE allocation, so a truncated nodemask in + * the page allocator means a mempolicy is in effect. Cpuset policy + * is enforced in get_page_from_freelist(). 
*/ - if (nodemask && !nodes_subset(node_states[N_HIGH_MEMORY], *nodemask)) + if (nodemask && !nodes_subset(node_states[N_HIGH_MEMORY], *nodemask)) { + *totalpages = total_swap_pages; + for_each_node_mask(nid, *nodemask) + *totalpages += node_spanned_pages(nid); return CONSTRAINT_MEMORY_POLICY; + } /* Check this allocation failure is caused by cpuset's wall function */ for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, nodemask) if (!cpuset_zone_allowed_softwall(zone, gfp_mask)) - return CONSTRAINT_CPUSET; + cpuset_limited = true; + if (cpuset_limited) { + *totalpages = total_swap_pages; + for_each_node_mask(nid, cpuset_current_mems_allowed) + *totalpages += node_spanned_pages(nid); + return CONSTRAINT_CPUSET; + } return CONSTRAINT_NONE; } #else static enum oom_constraint constrained_alloc(struct zonelist *zonelist, - gfp_t gfp_mask, nodemask_t *nodemask) + gfp_t gfp_mask, nodemask_t *nodemask, + unsigned long *totalpages) { + *totalpages = totalram_pages + total_swap_pages; return CONSTRAINT_NONE; } #endif @@ -323,17 +282,16 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist, * * (not docbooked, we don't want this one cluttering up the manual) */ -static struct task_struct *select_bad_process(unsigned long *ppoints, - struct mem_cgroup *mem, const nodemask_t *nodemask) +static struct task_struct *select_bad_process(unsigned int *ppoints, + unsigned long totalpages, struct mem_cgroup *mem, + const nodemask_t *nodemask) { struct task_struct *p; struct task_struct *chosen = NULL; - struct timespec uptime; *ppoints = 0; - do_posix_clock_monotonic_gettime(&uptime); for_each_process(p) { - unsigned long points; + unsigned int points; if (oom_unkillable_task(p, mem, nodemask)) continue; @@ -365,11 +323,11 @@ static struct task_struct *select_bad_process(unsigned long *ppoints, return ERR_PTR(-1UL); chosen = p; - *ppoints = ULONG_MAX; + *ppoints = 1000; } - points = badness(p, mem, nodemask, uptime.tv_sec); - if (points > *ppoints || !chosen) { + points = oom_badness(p, mem, nodemask, totalpages); + if (points > *ppoints) { chosen = p; *ppoints = points; } @@ -384,7 +342,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints, * * Dumps the current memory state of all system tasks, excluding kernel threads. * State information includes task's pid, uid, tgid, vm size, rss, cpu, oom_adj - * score, and name. + * value, oom_score_adj value, and name. * * If the actual is non-NULL, only tasks that are a member of the mem_cgroup are * shown. 
@@ -396,8 +354,7 @@ static void dump_tasks(const struct mem_cgroup *mem) struct task_struct *p; struct task_struct *task; - printk(KERN_INFO "[ pid ] uid tgid total_vm rss cpu oom_adj " - "name\n"); + pr_info("[ pid ] uid tgid total_vm rss cpu oom_adj oom_score_adj name\n"); for_each_process(p) { if (p->flags & PF_KTHREAD) continue; @@ -414,10 +371,11 @@ static void dump_tasks(const struct mem_cgroup *mem) continue; } - printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3u %3d %s\n", - task->pid, __task_cred(task)->uid, task->tgid, - task->mm->total_vm, get_mm_rss(task->mm), - task_cpu(task), task->signal->oom_adj, task->comm); + pr_info("[%5d] %5d %5d %8lu %8lu %3u %3d %5d %s\n", + task->pid, __task_cred(task)->uid, task->tgid, + task->mm->total_vm, get_mm_rss(task->mm), + task_cpu(task), task->signal->oom_adj, + task->signal->oom_score_adj, task->comm); task_unlock(task); } } @@ -427,8 +385,9 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, { task_lock(current); pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " - "oom_adj=%d\n", - current->comm, gfp_mask, order, current->signal->oom_adj); + "oom_adj=%d, oom_score_adj=%d\n", + current->comm, gfp_mask, order, current->signal->oom_adj, + current->signal->oom_score_adj); cpuset_print_task_mems_allowed(current); task_unlock(current); dump_stack(); @@ -468,14 +427,14 @@ static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem) #undef K static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, - unsigned long points, struct mem_cgroup *mem, - nodemask_t *nodemask, const char *message) + unsigned int points, unsigned long totalpages, + struct mem_cgroup *mem, nodemask_t *nodemask, + const char *message) { struct task_struct *victim = p; struct task_struct *child; struct task_struct *t = p; - unsigned long victim_points = 0; - struct timespec uptime; + unsigned int victim_points = 0; if (printk_ratelimit()) dump_header(p, gfp_mask, order, mem); @@ -491,7 +450,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, } task_lock(p); - pr_err("%s: Kill process %d (%s) score %lu or sacrifice child\n", + pr_err("%s: Kill process %d (%s) score %d or sacrifice child\n", message, task_pid_nr(p), p->comm, points); task_unlock(p); @@ -501,14 +460,15 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, * parent. This attempts to lose the minimal amount of work done while * still freeing memory. 
*/ - do_posix_clock_monotonic_gettime(&uptime); do { list_for_each_entry(child, &t->children, sibling) { - unsigned long child_points; + unsigned int child_points; - /* badness() returns 0 if the thread is unkillable */ - child_points = badness(child, mem, nodemask, - uptime.tv_sec); + /* + * oom_badness() returns 0 if the thread is unkillable + */ + child_points = oom_badness(child, mem, nodemask, + totalpages); if (child_points > victim_points) { victim = child; victim_points = child_points; @@ -546,17 +506,19 @@ static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, #ifdef CONFIG_CGROUP_MEM_RES_CTLR void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask) { - unsigned long points = 0; + unsigned long limit; + unsigned int points = 0; struct task_struct *p; check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0); + limit = mem_cgroup_get_limit(mem) >> PAGE_SHIFT; read_lock(&tasklist_lock); retry: - p = select_bad_process(&points, mem, NULL); + p = select_bad_process(&points, limit, mem, NULL); if (!p || PTR_ERR(p) == -1UL) goto out; - if (oom_kill_process(p, gfp_mask, 0, points, mem, NULL, + if (oom_kill_process(p, gfp_mask, 0, points, limit, mem, NULL, "Memory cgroup out of memory")) goto retry; out: @@ -681,8 +643,9 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order, nodemask_t *nodemask) { struct task_struct *p; + unsigned long totalpages; unsigned long freed = 0; - unsigned long points; + unsigned int points; enum oom_constraint constraint = CONSTRAINT_NONE; blocking_notifier_call_chain(&oom_notify_list, 0, &freed); @@ -705,8 +668,8 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, * Check if there were limitations on the allocation (only relevant for * NUMA) that may require different handling. */ - if (zonelist) - constraint = constrained_alloc(zonelist, gfp_mask, nodemask); + constraint = constrained_alloc(zonelist, gfp_mask, nodemask, + &totalpages); check_panic_on_oom(constraint, gfp_mask, order); read_lock(&tasklist_lock); @@ -718,14 +681,14 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, * non-zero, current could not be killed so we must fallback to * the tasklist scan. */ - if (!oom_kill_process(current, gfp_mask, order, 0, NULL, - nodemask, + if (!oom_kill_process(current, gfp_mask, order, 0, totalpages, + NULL, nodemask, "Out of memory (oom_kill_allocating_task)")) return; } retry: - p = select_bad_process(&points, NULL, + p = select_bad_process(&points, totalpages, NULL, constraint == CONSTRAINT_MEMORY_POLICY ? nodemask : NULL); if (PTR_ERR(p) == -1UL) @@ -738,8 +701,8 @@ retry: panic("Out of memory and no killable processes...\n"); } - if (oom_kill_process(p, gfp_mask, order, points, NULL, nodemask, - "Out of memory")) + if (oom_kill_process(p, gfp_mask, order, points, totalpages, NULL, + nodemask, "Out of memory")) goto retry; read_unlock(&tasklist_lock); -- cgit v1.2.3 From 51b1bd2ace1595b72956224deda349efa880b693 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 9 Aug 2010 17:19:47 -0700 Subject: oom: deprecate oom_adj tunable /proc/pid/oom_adj is now deprecated so that that it may eventually be removed. The target date for removal is August 2012. A warning will be printed to the kernel log if a task attempts to use this interface. Future warning will be suppressed until the kernel is rebooted to prevent spamming the kernel log. 
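As a rough sketch of how the two tunables relate (the helper below is purely illustrative and not part of this patch; the scaling mirrors the oom_score_adj_write() hunk earlier in this series, and the OOM_* constants are the ones defined in include/linux/oom.h):

#include <linux/oom.h>	/* OOM_DISABLE, OOM_ADJUST_MAX, OOM_SCORE_ADJ_* */

/* Hypothetical helper, shown only to illustrate the linear scaling. */
static int oom_score_adj_to_oom_adj(int oom_score_adj)
{
	if (oom_score_adj == OOM_SCORE_ADJ_MIN)		/* -1000 */
		return OOM_DISABLE;			/* -17 */
	/* linear scale: +1000 -> +15, -500 -> -7, 0 -> 0 */
	return (oom_score_adj * OOM_ADJUST_MAX) / OOM_SCORE_ADJ_MAX;
}

In other words, writing either proc file keeps the other roughly in sync with its scaled value, but only oom_score_adj offers the full -1000..1000 granularity; the legacy bitshift-style oom_adj values cannot express it.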
Signed-off-by: David Rientjes Cc: Nick Piggin Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Oleg Nesterov Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/feature-removal-schedule.txt | 25 +++++++++++++++++++++++++ Documentation/filesystems/proc.txt | 3 +++ fs/proc/base.c | 8 ++++++++ include/linux/oom.h | 3 +++ 4 files changed, 39 insertions(+) (limited to 'Documentation/filesystems') diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index 71f0fea1058..56cee4727b1 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -151,6 +151,31 @@ Who: Eric Biederman --------------------------- +What: /proc//oom_adj +When: August 2012 +Why: /proc//oom_adj allows userspace to influence the oom killer's + badness heuristic used to determine which task to kill when the kernel + is out of memory. + + The badness heuristic has since been rewritten since the introduction of + this tunable such that its meaning is deprecated. The value was + implemented as a bitshift on a score generated by the badness() + function that did not have any precise units of measure. With the + rewrite, the score is given as a proportion of available memory to the + task allocating pages, so using a bitshift which grows the score + exponentially is, thus, impossible to tune with fine granularity. + + A much more powerful interface, /proc//oom_score_adj, was + introduced with the oom killer rewrite that allows users to increase or + decrease the badness() score linearly. This interface will replace + /proc//oom_adj. + + A warning will be emitted to the kernel log if an application uses this + deprecated interface. After it is printed once, future warnings will be + suppressed until the kernel is rebooted. + +--------------------------- + What: remove EXPORT_SYMBOL(kernel_thread) When: August 2006 Files: arch/*/kernel/*_ksyms.c diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index cf1295c2bb6..a6aca874088 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -1285,6 +1285,9 @@ scaled linearly with /proc//oom_score_adj. Writing to /proc//oom_score_adj or /proc//oom_adj will change the other with its scaled value. +NOTICE: /proc//oom_adj is deprecated and will be removed, please see +Documentation/feature-removal-schedule.txt. + Caveat: when a parent task is selected, the oom killer will sacrifice any first generation children with seperate address spaces instead, if possible. This avoids servers and important system daemons from being killed and loses the diff --git a/fs/proc/base.c b/fs/proc/base.c index f923b728388..69254a365ce 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1037,6 +1037,14 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, return -EACCES; } + /* + * Warn that /proc/pid/oom_adj is deprecated, see + * Documentation/feature-removal-schedule.txt. 
+ */ + printk_once(KERN_WARNING "%s (%d): /proc/%d/oom_adj is deprecated, " + "please use /proc/%d/oom_score_adj instead.\n", + current->comm, task_pid_nr(current), + task_pid_nr(task), task_pid_nr(task)); task->signal->oom_adj = oom_adjust; /* * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum diff --git a/include/linux/oom.h b/include/linux/oom.h index 73b8d7b6dd1..f209b683e11 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -2,6 +2,9 @@ #define __INCLUDE_LINUX_OOM_H /* + * /proc//oom_adj is deprecated, see + * Documentation/feature-removal-schedule.txt. + * * /proc//oom_adj set to -17 protects from the oom-killer */ #define OOM_DISABLE (-17) -- cgit v1.2.3 From b19dd42faf413b4705d4adb38521e82d73fa4249 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Sun, 4 Jul 2010 00:15:10 +0200 Subject: bkl: Remove locked .ioctl file operation The last user is gone, so we can safely remove this Signed-off-by: Arnd Bergmann Cc: John Kacur Cc: Al Viro Cc: Thomas Gleixner Signed-off-by: Frederic Weisbecker --- Documentation/filesystems/Locking | 8 +------- Documentation/filesystems/vfs.txt | 6 +----- fs/bad_inode.c | 7 ------- fs/compat_ioctl.c | 3 +-- fs/ioctl.c | 18 ++++-------------- fs/proc/inode.c | 17 ++++------------- include/linux/fs.h | 5 ++--- 7 files changed, 13 insertions(+), 51 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index bbcc15651a2..2db4283efa8 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -374,8 +374,6 @@ prototypes: ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t); int (*readdir) (struct file *, void *, filldir_t); unsigned int (*poll) (struct file *, struct poll_table_struct *); - int (*ioctl) (struct inode *, struct file *, unsigned int, - unsigned long); long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); long (*compat_ioctl) (struct file *, unsigned int, unsigned long); int (*mmap) (struct file *, struct vm_area_struct *); @@ -409,8 +407,7 @@ write: no aio_write: no readdir: no poll: no -ioctl: yes (see below) -unlocked_ioctl: no (see below) +unlocked_ioctl: no compat_ioctl: no mmap: no open: no @@ -453,9 +450,6 @@ move ->readdir() to inode_operations and use a separate method for directory anything that resembles union-mount we won't have a struct file for all components. And there are other reasons why the current interface is a mess... -->ioctl() on regular files is superceded by the ->unlocked_ioctl() that -doesn't take the BKL. - ->read on directories probably must go away - we should just enforce -EISDIR in sys_read() and friends. diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 94677e7dcb1..ed7e5efc06d 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -727,7 +727,6 @@ struct file_operations { ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t); int (*readdir) (struct file *, void *, filldir_t); unsigned int (*poll) (struct file *, struct poll_table_struct *); - int (*ioctl) (struct inode *, struct file *, unsigned int, unsigned long); long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); long (*compat_ioctl) (struct file *, unsigned int, unsigned long); int (*mmap) (struct file *, struct vm_area_struct *); @@ -768,10 +767,7 @@ otherwise noted. activity on this file and (optionally) go to sleep until there is activity. 
Called by the select(2) and poll(2) system calls - ioctl: called by the ioctl(2) system call - - unlocked_ioctl: called by the ioctl(2) system call. Filesystems that do not - require the BKL should use this method instead of the ioctl() above. + unlocked_ioctl: called by the ioctl(2) system call. compat_ioctl: called by the ioctl(2) system call when 32 bit system calls are used on 64 bit kernels. diff --git a/fs/bad_inode.c b/fs/bad_inode.c index 52e59bf4aa5..f024d8aadde 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -55,12 +55,6 @@ static unsigned int bad_file_poll(struct file *filp, poll_table *wait) return POLLERR; } -static int bad_file_ioctl (struct inode *inode, struct file *filp, - unsigned int cmd, unsigned long arg) -{ - return -EIO; -} - static long bad_file_unlocked_ioctl(struct file *file, unsigned cmd, unsigned long arg) { @@ -159,7 +153,6 @@ static const struct file_operations bad_file_ops = .aio_write = bad_file_aio_write, .readdir = bad_file_readdir, .poll = bad_file_poll, - .ioctl = bad_file_ioctl, .unlocked_ioctl = bad_file_unlocked_ioctl, .compat_ioctl = bad_file_compat_ioctl, .mmap = bad_file_mmap, diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 70227e0dc01..03e59aa318e 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -1699,8 +1699,7 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd, goto out_fput; } - if (!filp->f_op || - (!filp->f_op->ioctl && !filp->f_op->unlocked_ioctl)) + if (!filp->f_op || !filp->f_op->unlocked_ioctl) goto do_ioctl; break; } diff --git a/fs/ioctl.c b/fs/ioctl.c index 2d140a71386..f855ea4fc88 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -29,7 +29,6 @@ * @arg: command-specific argument for ioctl * * Invokes filesystem specific ->unlocked_ioctl, if one exists; otherwise - * invokes filesystem specific ->ioctl method. If neither method exists, * returns -ENOTTY. * * Returns 0 on success, -errno on error. 
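For a driver that still provides the old locked ->ioctl method, the conversion generally looks like the minimal sketch below (the driver name, command number and mutex are invented for illustration; only the unlocked_ioctl prototype comes from the interface described above). Because ->unlocked_ioctl runs without the big kernel lock, any serialization the old handler silently relied on must now be supplied by the driver itself, for example with a private mutex:

#include <linux/fs.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/uaccess.h>

static DEFINE_MUTEX(foo_ioctl_lock);		/* replaces the implicit BKL */

static long foo_unlocked_ioctl(struct file *filp, unsigned int cmd,
			       unsigned long arg)
{
	long ret = -ENOTTY;			/* unknown commands */

	mutex_lock(&foo_ioctl_lock);
	switch (cmd) {
	case 0x4600:				/* hypothetical command number */
		ret = put_user(42, (int __user *)arg);
		break;
	}
	mutex_unlock(&foo_ioctl_lock);
	return ret;
}

static const struct file_operations foo_fops = {
	.owner		= THIS_MODULE,
	.unlocked_ioctl	= foo_unlocked_ioctl,
};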
@@ -39,21 +38,12 @@ static long vfs_ioctl(struct file *filp, unsigned int cmd, { int error = -ENOTTY; - if (!filp->f_op) + if (!filp->f_op || !filp->f_op->unlocked_ioctl) goto out; - if (filp->f_op->unlocked_ioctl) { - error = filp->f_op->unlocked_ioctl(filp, cmd, arg); - if (error == -ENOIOCTLCMD) - error = -EINVAL; - goto out; - } else if (filp->f_op->ioctl) { - lock_kernel(); - error = filp->f_op->ioctl(filp->f_path.dentry->d_inode, - filp, cmd, arg); - unlock_kernel(); - } - + error = filp->f_op->unlocked_ioctl(filp, cmd, arg); + if (error == -ENOIOCTLCMD) + error = -EINVAL; out: return error; } diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 23561cda724..9c2b5f48487 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -214,8 +214,7 @@ static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigne { struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); long rv = -ENOTTY; - long (*unlocked_ioctl)(struct file *, unsigned int, unsigned long); - int (*ioctl)(struct inode *, struct file *, unsigned int, unsigned long); + long (*ioctl)(struct file *, unsigned int, unsigned long); spin_lock(&pde->pde_unload_lock); if (!pde->proc_fops) { @@ -223,19 +222,11 @@ static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigne return rv; } pde->pde_users++; - unlocked_ioctl = pde->proc_fops->unlocked_ioctl; - ioctl = pde->proc_fops->ioctl; + ioctl = pde->proc_fops->unlocked_ioctl; spin_unlock(&pde->pde_unload_lock); - if (unlocked_ioctl) { - rv = unlocked_ioctl(file, cmd, arg); - if (rv == -ENOIOCTLCMD) - rv = -EINVAL; - } else if (ioctl) { - WARN_ONCE(1, "Procfs ioctl handlers must use unlocked_ioctl, " - "%pf will be called without the Bkl held\n", ioctl); - rv = ioctl(file->f_path.dentry->d_inode, file, cmd, arg); - } + if (ioctl) + rv = ioctl(file, cmd, arg); pde_users_dec(pde); return rv; diff --git a/include/linux/fs.h b/include/linux/fs.h index 7a0625e26a3..3c786fdaeab 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1483,8 +1483,8 @@ struct block_device_operations; /* * NOTE: - * read, write, poll, fsync, readv, writev, unlocked_ioctl and compat_ioctl - * can be called without the big kernel lock held in all filesystems. + * all file operations except setlease can be called without + * the big kernel lock held in all filesystems. */ struct file_operations { struct module *owner; @@ -1495,7 +1495,6 @@ struct file_operations { ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t); int (*readdir) (struct file *, void *, filldir_t); unsigned int (*poll) (struct file *, struct poll_table_struct *); - int (*ioctl) (struct inode *, struct file *, unsigned int, unsigned long); long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); long (*compat_ioctl) (struct file *, unsigned int, unsigned long); int (*mmap) (struct file *, struct vm_area_struct *); -- cgit v1.2.3 From 306a075362a288683f6346185f97dd0e06df19da Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 17 Sep 2010 10:54:37 -0400 Subject: NFS: Allow NFSROOT debugging messages to be enabled dynamically As a convenience, introduce a kernel command line option to enable NFSROOT debugging messages. 
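For example, with a typical diskless boot the new option is simply appended to the other NFS root parameters on the kernel command line (the server address and export path below are made up; only the nfsrootdebug token itself is introduced by this patch):

    root=/dev/nfs ip=dhcp nfsroot=192.168.1.1:/export/rootfs nfsrootdebug

This sets the same NFSDBG_ROOT and NFSDBG_MOUNT debug flags that the old compile-time NFSROOT_DEBUG define used to enable, so the parsed server address, root path, and mount options appear in the kernel log during boot.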
Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- Documentation/filesystems/nfs/nfsroot.txt | 22 ++++++++++++++++++++++ Documentation/kernel-parameters.txt | 5 ++++- fs/nfs/nfsroot.c | 19 +++++++++++++------ 3 files changed, 39 insertions(+), 7 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/nfs/nfsroot.txt b/Documentation/filesystems/nfs/nfsroot.txt index f2430a7974e..90c71c6f0d0 100644 --- a/Documentation/filesystems/nfs/nfsroot.txt +++ b/Documentation/filesystems/nfs/nfsroot.txt @@ -159,6 +159,28 @@ ip=:::::: Default: any +nfsrootdebug + + This parameter enables debugging messages to appear in the kernel + log at boot time so that administrators can verify that the correct + NFS mount options, server address, and root path are passed to the + NFS client. + + +rdinit= + + To specify which file contains the program that starts system + initialization, administrators can use this command line parameter. + The default value of this parameter is "/init". If the specified + file exists and the kernel can execute it, root filesystem related + kernel command line parameters, including `nfsroot=', are ignored. + + A description of the process of mounting the root file system can be + found in: + + Documentation/early-userspace/README + + 3.) Boot Loader diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index f084af0cb8e..0fe70b2c419 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1532,12 +1532,15 @@ and is between 256 and 4096 characters. It is defined in the file 1 to enable accounting Default value is 0. - nfsaddrs= [NFS] + nfsaddrs= [NFS] Deprecated. Use ip= instead. See Documentation/filesystems/nfs/nfsroot.txt. nfsroot= [NFS] nfs root filesystem for disk-less boxes. See Documentation/filesystems/nfs/nfsroot.txt. + nfsrootdebug [NFS] enable nfsroot debugging messages. + See Documentation/filesystems/nfs/nfsroot.txt. + nfs.callback_tcpport= [NFS] set the TCP port on which the NFSv4 callback channel should listen. diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c index 8e7d623173a..460df365288 100644 --- a/fs/nfs/nfsroot.c +++ b/fs/nfs/nfsroot.c @@ -67,6 +67,7 @@ * NFS over TCP. * Fabian Frederick: Option parser rebuilt (using parser lib) * Chuck Lever : Use super.c's text-based mount option parsing + * Chuck Lever : Add "nfsrootdebug". */ #include @@ -80,8 +81,6 @@ #include "internal.h" -/* Define this to allow debugging output */ -#undef NFSROOT_DEBUG #define NFSDBG_FACILITY NFSDBG_ROOT /* Default path we try to mount. "%s" gets replaced by our IP address */ @@ -102,6 +101,18 @@ static char nfs_export_path[NFS_MAXPATHLEN + 1] __initdata = ""; /* server:export path string passed to super.c */ static char nfs_root_device[NFS_MAXPATHLEN + 1] __initdata = ""; +/* + * When the "nfsrootdebug" kernel command line option is specified, + * enable debugging messages for NFSROOT. + */ +static int __init nfs_root_debug(char *__unused) +{ + nfs_debug |= NFSDBG_ROOT | NFSDBG_MOUNT; + return 1; +} + +__setup("nfsrootdebug", nfs_root_debug); + /* * Parse NFS server and directory information passed on the kernel * command line. 
@@ -282,10 +293,6 @@ out_devnametoolong: */ int __init nfs_root_data(char **root_device, char **root_data) { -#ifdef NFSROOT_DEBUG - nfs_debug |= NFSDBG_ROOT | NFSDBG_MOUNT; -#endif /* NFSROOT_DEBUG */ - servaddr = root_server_addr; if (servaddr == htonl(INADDR_NONE)) { printk(KERN_ERR "Root-NFS: no NFS server address\n"); -- cgit v1.2.3 From 2116b7a473bf1c8d26998b477c294e7fe294921f Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 4 Oct 2010 22:55:57 +0200 Subject: smbfs: move to drivers/staging smbfs has been scheduled for removal in 2.6.27, so maybe we can now move it to drivers/staging on the way out. smbfs still uses the big kernel lock and nobody is going to fix that, so we should be getting rid of it soon. This removes the 32 bit compat mount and ioctl handling code, which is implemented in common fs code, and moves all smbfs related files into drivers/staging/smbfs. Signed-off-by: Arnd Bergmann Acked-by: Jeff Layton Signed-off-by: Greg Kroah-Hartman --- Documentation/filesystems/00-INDEX | 2 - Documentation/filesystems/smbfs.txt | 8 - Documentation/ioctl/ioctl-number.txt | 2 +- drivers/staging/Kconfig | 2 + drivers/staging/Makefile | 1 + drivers/staging/smbfs/Kconfig | 55 + drivers/staging/smbfs/Makefile | 18 + drivers/staging/smbfs/TODO | 8 + drivers/staging/smbfs/cache.c | 208 ++ drivers/staging/smbfs/dir.c | 702 +++++++ drivers/staging/smbfs/file.c | 453 +++++ drivers/staging/smbfs/getopt.c | 64 + drivers/staging/smbfs/getopt.h | 14 + drivers/staging/smbfs/inode.c | 839 ++++++++ drivers/staging/smbfs/ioctl.c | 68 + drivers/staging/smbfs/proc.c | 3506 +++++++++++++++++++++++++++++++++ drivers/staging/smbfs/proto.h | 87 + drivers/staging/smbfs/request.c | 817 ++++++++ drivers/staging/smbfs/request.h | 70 + drivers/staging/smbfs/smb.h | 118 ++ drivers/staging/smbfs/smb_debug.h | 34 + drivers/staging/smbfs/smb_fs.h | 153 ++ drivers/staging/smbfs/smb_fs_i.h | 37 + drivers/staging/smbfs/smb_fs_sb.h | 100 + drivers/staging/smbfs/smb_mount.h | 65 + drivers/staging/smbfs/smbfs.txt | 8 + drivers/staging/smbfs/smbiod.c | 343 ++++ drivers/staging/smbfs/smbno.h | 363 ++++ drivers/staging/smbfs/sock.c | 385 ++++ drivers/staging/smbfs/symlink.c | 67 + fs/Kconfig | 1 - fs/Makefile | 1 - fs/compat.c | 31 +- fs/compat_ioctl.c | 26 - fs/smbfs/Kconfig | 55 - fs/smbfs/Makefile | 18 - fs/smbfs/cache.c | 208 -- fs/smbfs/dir.c | 702 ------- fs/smbfs/file.c | 454 ----- fs/smbfs/getopt.c | 64 - fs/smbfs/getopt.h | 14 - fs/smbfs/inode.c | 839 -------- fs/smbfs/ioctl.c | 69 - fs/smbfs/proc.c | 3507 ---------------------------------- fs/smbfs/proto.h | 87 - fs/smbfs/request.c | 818 -------- fs/smbfs/request.h | 70 - fs/smbfs/smb_debug.h | 34 - fs/smbfs/smbiod.c | 344 ---- fs/smbfs/sock.c | 386 ---- fs/smbfs/symlink.c | 68 - include/linux/Kbuild | 4 - include/linux/smb.h | 118 -- include/linux/smb_fs.h | 153 -- include/linux/smb_fs_i.h | 37 - include/linux/smb_fs_sb.h | 100 - include/linux/smb_mount.h | 65 - include/linux/smbno.h | 363 ---- 58 files changed, 8587 insertions(+), 8646 deletions(-) delete mode 100644 Documentation/filesystems/smbfs.txt create mode 100644 drivers/staging/smbfs/Kconfig create mode 100644 drivers/staging/smbfs/Makefile create mode 100644 drivers/staging/smbfs/TODO create mode 100644 drivers/staging/smbfs/cache.c create mode 100644 drivers/staging/smbfs/dir.c create mode 100644 drivers/staging/smbfs/file.c create mode 100644 drivers/staging/smbfs/getopt.c create mode 100644 drivers/staging/smbfs/getopt.h create mode 100644 drivers/staging/smbfs/inode.c create mode 100644 
drivers/staging/smbfs/ioctl.c create mode 100644 drivers/staging/smbfs/proc.c create mode 100644 drivers/staging/smbfs/proto.h create mode 100644 drivers/staging/smbfs/request.c create mode 100644 drivers/staging/smbfs/request.h create mode 100644 drivers/staging/smbfs/smb.h create mode 100644 drivers/staging/smbfs/smb_debug.h create mode 100644 drivers/staging/smbfs/smb_fs.h create mode 100644 drivers/staging/smbfs/smb_fs_i.h create mode 100644 drivers/staging/smbfs/smb_fs_sb.h create mode 100644 drivers/staging/smbfs/smb_mount.h create mode 100644 drivers/staging/smbfs/smbfs.txt create mode 100644 drivers/staging/smbfs/smbiod.c create mode 100644 drivers/staging/smbfs/smbno.h create mode 100644 drivers/staging/smbfs/sock.c create mode 100644 drivers/staging/smbfs/symlink.c delete mode 100644 fs/smbfs/Kconfig delete mode 100644 fs/smbfs/Makefile delete mode 100644 fs/smbfs/cache.c delete mode 100644 fs/smbfs/dir.c delete mode 100644 fs/smbfs/file.c delete mode 100644 fs/smbfs/getopt.c delete mode 100644 fs/smbfs/getopt.h delete mode 100644 fs/smbfs/inode.c delete mode 100644 fs/smbfs/ioctl.c delete mode 100644 fs/smbfs/proc.c delete mode 100644 fs/smbfs/proto.h delete mode 100644 fs/smbfs/request.c delete mode 100644 fs/smbfs/request.h delete mode 100644 fs/smbfs/smb_debug.h delete mode 100644 fs/smbfs/smbiod.c delete mode 100644 fs/smbfs/sock.c delete mode 100644 fs/smbfs/symlink.c delete mode 100644 include/linux/smb.h delete mode 100644 include/linux/smb_fs.h delete mode 100644 include/linux/smb_fs_i.h delete mode 100644 include/linux/smb_fs_sb.h delete mode 100644 include/linux/smb_mount.h delete mode 100644 include/linux/smbno.h (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX index 4303614b5ad..8c624a18f67 100644 --- a/Documentation/filesystems/00-INDEX +++ b/Documentation/filesystems/00-INDEX @@ -96,8 +96,6 @@ seq_file.txt - how to use the seq_file API sharedsubtree.txt - a description of shared subtrees for namespaces. -smbfs.txt - - info on using filesystems with the SMB protocol (Win 3.11 and NT). spufs.txt - info and mount options for the SPU filesystem used on Cell. sysfs-pci.txt diff --git a/Documentation/filesystems/smbfs.txt b/Documentation/filesystems/smbfs.txt deleted file mode 100644 index 194fb0decd2..00000000000 --- a/Documentation/filesystems/smbfs.txt +++ /dev/null @@ -1,8 +0,0 @@ -Smbfs is a filesystem that implements the SMB protocol, which is the -protocol used by Windows for Workgroups, Windows 95 and Windows NT. -Smbfs was inspired by Samba, the program written by Andrew Tridgell -that turns any Unix host into a file server for DOS or Windows clients. - -Smbfs is a SMB client, but uses parts of samba for its operation. For -more info on samba, including documentation, please go to -http://www.samba.org/ and then on to your nearest mirror. diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt index 33223ff121d..d15834a6e46 100644 --- a/Documentation/ioctl/ioctl-number.txt +++ b/Documentation/ioctl/ioctl-number.txt @@ -259,7 +259,7 @@ Code Seq#(hex) Include File Comments 't' 00-7F linux/if_ppp.h 't' 80-8F linux/isdn_ppp.h 't' 90 linux/toshiba.h -'u' 00-1F linux/smb_fs.h +'u' 00-1F linux/smb_fs.h gone 'v' all linux/videodev.h conflict! 'v' 00-1F linux/ext2_fs.h conflict! 'v' 00-1F linux/fs.h conflict! 
diff --git a/drivers/staging/Kconfig b/drivers/staging/Kconfig index 01503536e45..4a9190808b7 100644 --- a/drivers/staging/Kconfig +++ b/drivers/staging/Kconfig @@ -149,6 +149,8 @@ source "drivers/staging/msm/Kconfig" source "drivers/staging/lirc/Kconfig" +source "drivers/staging/smbfs/Kconfig" + source "drivers/staging/easycap/Kconfig" source "drivers/staging/solo6x10/Kconfig" diff --git a/drivers/staging/Makefile b/drivers/staging/Makefile index de5c0e5a99e..7cb02a8e9f7 100644 --- a/drivers/staging/Makefile +++ b/drivers/staging/Makefile @@ -32,6 +32,7 @@ obj-$(CONFIG_AUTOFS_FS) += autofs/ obj-$(CONFIG_IDE_PHISON) += phison/ obj-$(CONFIG_LINE6_USB) += line6/ obj-$(CONFIG_USB_SERIAL_QUATECH2) += serqt_usb2/ +obj-$(CONFIG_SMB_FS) += smbfs/ obj-$(CONFIG_USB_SERIAL_QUATECH_USB2) += quatech_usb2/ obj-$(CONFIG_OCTEON_ETHERNET) += octeon/ obj-$(CONFIG_VT6655) += vt6655/ diff --git a/drivers/staging/smbfs/Kconfig b/drivers/staging/smbfs/Kconfig new file mode 100644 index 00000000000..e668127c8b2 --- /dev/null +++ b/drivers/staging/smbfs/Kconfig @@ -0,0 +1,55 @@ +config SMB_FS + tristate "SMB file system support (OBSOLETE, please use CIFS)" + depends on INET + select NLS + help + SMB (Server Message Block) is the protocol Windows for Workgroups + (WfW), Windows 95/98, Windows NT and OS/2 Lan Manager use to share + files and printers over local networks. Saying Y here allows you to + mount their file systems (often called "shares" in this context) and + access them just like any other Unix directory. Currently, this + works only if the Windows machines use TCP/IP as the underlying + transport protocol, and not NetBEUI. For details, read + and the SMB-HOWTO, + available from . + + Note: if you just want your box to act as an SMB *server* and make + files and printing services available to Windows clients (which need + to have a TCP/IP stack), you don't need to say Y here; you can use + the program SAMBA (available from ) + for that. + + General information about how to connect Linux, Windows machines and + Macs is on the WWW at . + + To compile the SMB support as a module, choose M here: + the module will be called smbfs. Most people say N, however. + +config SMB_NLS_DEFAULT + bool "Use a default NLS" + depends on SMB_FS + help + Enabling this will make smbfs use nls translations by default. You + need to specify the local charset (CONFIG_NLS_DEFAULT) in the nls + settings and you need to give the default nls for the SMB server as + CONFIG_SMB_NLS_REMOTE. + + The nls settings can be changed at mount time, if your smbmount + supports that, using the codepage and iocharset parameters. + + smbmount from samba 2.2.0 or later supports this. + +config SMB_NLS_REMOTE + string "Default Remote NLS Option" + depends on SMB_NLS_DEFAULT + default "cp437" + help + This setting allows you to specify a default value for which + codepage the server uses. If this field is left blank no + translations will be done by default. The local codepage/charset + default to CONFIG_NLS_DEFAULT. + + The nls settings can be changed at mount time, if your smbmount + supports that, using the codepage and iocharset parameters. + + smbmount from samba 2.2.0 or later supports this. diff --git a/drivers/staging/smbfs/Makefile b/drivers/staging/smbfs/Makefile new file mode 100644 index 00000000000..4faf8c4722c --- /dev/null +++ b/drivers/staging/smbfs/Makefile @@ -0,0 +1,18 @@ +# +# Makefile for the linux smb-filesystem routines. 
+# + +obj-$(CONFIG_SMB_FS) += smbfs.o + +smbfs-objs := proc.o dir.o cache.o sock.o inode.o file.o ioctl.o getopt.o \ + symlink.o smbiod.o request.o + +# If you want debugging output, you may add these flags to the EXTRA_CFLAGS +# SMBFS_PARANOIA should normally be enabled. + +EXTRA_CFLAGS += -DSMBFS_PARANOIA +#EXTRA_CFLAGS += -DSMBFS_DEBUG +#EXTRA_CFLAGS += -DSMBFS_DEBUG_VERBOSE +#EXTRA_CFLAGS += -DDEBUG_SMB_TIMESTAMP +#EXTRA_CFLAGS += -Werror + diff --git a/drivers/staging/smbfs/TODO b/drivers/staging/smbfs/TODO new file mode 100644 index 00000000000..24f4d29d53a --- /dev/null +++ b/drivers/staging/smbfs/TODO @@ -0,0 +1,8 @@ +smbfs is on its way out of the kernel, it has been replaced +by cifs several years ago. + +The smbfs code uses the big kernel lock which +is getting deprecated. + +Users that find smbfs to work but not cifs should contact +the CIFS developers on linux-cifs@vger.kernel.org. diff --git a/drivers/staging/smbfs/cache.c b/drivers/staging/smbfs/cache.c new file mode 100644 index 00000000000..dbb98658148 --- /dev/null +++ b/drivers/staging/smbfs/cache.c @@ -0,0 +1,208 @@ +/* + * cache.c + * + * Copyright (C) 1997 by Bill Hawes + * + * Routines to support directory cacheing using the page cache. + * This cache code is almost directly taken from ncpfs. + * + * Please add a note about your changes to smbfs in the ChangeLog file. + */ + +#include +#include +#include +#include +#include +#include + +#include + +#include "smb_fs.h" +#include "smb_debug.h" +#include "proto.h" + +/* + * Force the next attempt to use the cache to be a timeout. + * If we can't find the page that's fine, it will cause a refresh. + */ +void +smb_invalid_dir_cache(struct inode * dir) +{ + struct smb_sb_info *server = server_from_inode(dir); + union smb_dir_cache *cache = NULL; + struct page *page = NULL; + + page = grab_cache_page(&dir->i_data, 0); + if (!page) + goto out; + + if (!PageUptodate(page)) + goto out_unlock; + + cache = kmap(page); + cache->head.time = jiffies - SMB_MAX_AGE(server); + + kunmap(page); + SetPageUptodate(page); +out_unlock: + unlock_page(page); + page_cache_release(page); +out: + return; +} + +/* + * Mark all dentries for 'parent' as invalid, forcing them to be re-read + */ +void +smb_invalidate_dircache_entries(struct dentry *parent) +{ + struct smb_sb_info *server = server_from_dentry(parent); + struct list_head *next; + struct dentry *dentry; + + spin_lock(&dcache_lock); + next = parent->d_subdirs.next; + while (next != &parent->d_subdirs) { + dentry = list_entry(next, struct dentry, d_u.d_child); + dentry->d_fsdata = NULL; + smb_age_dentry(server, dentry); + next = next->next; + } + spin_unlock(&dcache_lock); +} + +/* + * dget, but require that fpos and parent matches what the dentry contains. + * dentry is not known to be a valid pointer at entry. + */ +struct dentry * +smb_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos) +{ + struct dentry *dent = dentry; + struct list_head *next; + + if (d_validate(dent, parent)) { + if (dent->d_name.len <= SMB_MAXNAMELEN && + (unsigned long)dent->d_fsdata == fpos) { + if (!dent->d_inode) { + dput(dent); + dent = NULL; + } + return dent; + } + dput(dent); + } + + /* If a pointer is invalid, we search the dentry. 
*/ + spin_lock(&dcache_lock); + next = parent->d_subdirs.next; + while (next != &parent->d_subdirs) { + dent = list_entry(next, struct dentry, d_u.d_child); + if ((unsigned long)dent->d_fsdata == fpos) { + if (dent->d_inode) + dget_locked(dent); + else + dent = NULL; + goto out_unlock; + } + next = next->next; + } + dent = NULL; +out_unlock: + spin_unlock(&dcache_lock); + return dent; +} + + +/* + * Create dentry/inode for this file and add it to the dircache. + */ +int +smb_fill_cache(struct file *filp, void *dirent, filldir_t filldir, + struct smb_cache_control *ctrl, struct qstr *qname, + struct smb_fattr *entry) +{ + struct dentry *newdent, *dentry = filp->f_path.dentry; + struct inode *newino, *inode = dentry->d_inode; + struct smb_cache_control ctl = *ctrl; + int valid = 0; + int hashed = 0; + ino_t ino = 0; + + qname->hash = full_name_hash(qname->name, qname->len); + + if (dentry->d_op && dentry->d_op->d_hash) + if (dentry->d_op->d_hash(dentry, qname) != 0) + goto end_advance; + + newdent = d_lookup(dentry, qname); + + if (!newdent) { + newdent = d_alloc(dentry, qname); + if (!newdent) + goto end_advance; + } else { + hashed = 1; + memcpy((char *) newdent->d_name.name, qname->name, + newdent->d_name.len); + } + + if (!newdent->d_inode) { + smb_renew_times(newdent); + entry->f_ino = iunique(inode->i_sb, 2); + newino = smb_iget(inode->i_sb, entry); + if (newino) { + smb_new_dentry(newdent); + d_instantiate(newdent, newino); + if (!hashed) + d_rehash(newdent); + } + } else + smb_set_inode_attr(newdent->d_inode, entry); + + if (newdent->d_inode) { + ino = newdent->d_inode->i_ino; + newdent->d_fsdata = (void *) ctl.fpos; + smb_new_dentry(newdent); + } + + if (ctl.idx >= SMB_DIRCACHE_SIZE) { + if (ctl.page) { + kunmap(ctl.page); + SetPageUptodate(ctl.page); + unlock_page(ctl.page); + page_cache_release(ctl.page); + } + ctl.cache = NULL; + ctl.idx -= SMB_DIRCACHE_SIZE; + ctl.ofs += 1; + ctl.page = grab_cache_page(&inode->i_data, ctl.ofs); + if (ctl.page) + ctl.cache = kmap(ctl.page); + } + if (ctl.cache) { + ctl.cache->dentry[ctl.idx] = newdent; + valid = 1; + } + dput(newdent); + +end_advance: + if (!valid) + ctl.valid = 0; + if (!ctl.filled && (ctl.fpos == filp->f_pos)) { + if (!ino) + ino = find_inode_number(dentry, qname); + if (!ino) + ino = iunique(inode->i_sb, 2); + ctl.filled = filldir(dirent, qname->name, qname->len, + filp->f_pos, ino, DT_UNKNOWN); + if (!ctl.filled) + filp->f_pos += 1; + } + ctl.fpos += 1; + ctl.idx += 1; + *ctrl = ctl; + return (ctl.valid || !ctl.filled); +} diff --git a/drivers/staging/smbfs/dir.c b/drivers/staging/smbfs/dir.c new file mode 100644 index 00000000000..936b3bb2099 --- /dev/null +++ b/drivers/staging/smbfs/dir.c @@ -0,0 +1,702 @@ +/* + * dir.c + * + * Copyright (C) 1995, 1996 by Paal-Kr. Engstad and Volker Lendecke + * Copyright (C) 1997 by Volker Lendecke + * + * Please add a note about your changes to smbfs in the ChangeLog file. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include "smb_fs.h" +#include "smb_mount.h" +#include "smbno.h" + +#include "smb_debug.h" +#include "proto.h" + +static int smb_readdir(struct file *, void *, filldir_t); +static int smb_dir_open(struct inode *, struct file *); + +static struct dentry *smb_lookup(struct inode *, struct dentry *, struct nameidata *); +static int smb_create(struct inode *, struct dentry *, int, struct nameidata *); +static int smb_mkdir(struct inode *, struct dentry *, int); +static int smb_rmdir(struct inode *, struct dentry *); +static int smb_unlink(struct inode *, struct dentry *); +static int smb_rename(struct inode *, struct dentry *, + struct inode *, struct dentry *); +static int smb_make_node(struct inode *,struct dentry *,int,dev_t); +static int smb_link(struct dentry *, struct inode *, struct dentry *); + +const struct file_operations smb_dir_operations = +{ + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = smb_readdir, + .unlocked_ioctl = smb_ioctl, + .open = smb_dir_open, +}; + +const struct inode_operations smb_dir_inode_operations = +{ + .create = smb_create, + .lookup = smb_lookup, + .unlink = smb_unlink, + .mkdir = smb_mkdir, + .rmdir = smb_rmdir, + .rename = smb_rename, + .getattr = smb_getattr, + .setattr = smb_notify_change, +}; + +const struct inode_operations smb_dir_inode_operations_unix = +{ + .create = smb_create, + .lookup = smb_lookup, + .unlink = smb_unlink, + .mkdir = smb_mkdir, + .rmdir = smb_rmdir, + .rename = smb_rename, + .getattr = smb_getattr, + .setattr = smb_notify_change, + .symlink = smb_symlink, + .mknod = smb_make_node, + .link = smb_link, +}; + +/* + * Read a directory, using filldir to fill the dirent memory. + * smb_proc_readdir does the actual reading from the smb server. + * + * The cache code is almost directly taken from ncpfs + */ +static int +smb_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + struct dentry *dentry = filp->f_path.dentry; + struct inode *dir = dentry->d_inode; + struct smb_sb_info *server = server_from_dentry(dentry); + union smb_dir_cache *cache = NULL; + struct smb_cache_control ctl; + struct page *page = NULL; + int result; + + ctl.page = NULL; + ctl.cache = NULL; + + VERBOSE("reading %s/%s, f_pos=%d\n", + DENTRY_PATH(dentry), (int) filp->f_pos); + + result = 0; + + lock_kernel(); + + switch ((unsigned int) filp->f_pos) { + case 0: + if (filldir(dirent, ".", 1, 0, dir->i_ino, DT_DIR) < 0) + goto out; + filp->f_pos = 1; + /* fallthrough */ + case 1: + if (filldir(dirent, "..", 2, 1, parent_ino(dentry), DT_DIR) < 0) + goto out; + filp->f_pos = 2; + } + + /* + * Make sure our inode is up-to-date. + */ + result = smb_revalidate_inode(dentry); + if (result) + goto out; + + + page = grab_cache_page(&dir->i_data, 0); + if (!page) + goto read_really; + + ctl.cache = cache = kmap(page); + ctl.head = cache->head; + + if (!PageUptodate(page) || !ctl.head.eof) { + VERBOSE("%s/%s, page uptodate=%d, eof=%d\n", + DENTRY_PATH(dentry), PageUptodate(page),ctl.head.eof); + goto init_cache; + } + + if (filp->f_pos == 2) { + if (jiffies - ctl.head.time >= SMB_MAX_AGE(server)) + goto init_cache; + + /* + * N.B. ncpfs checks mtime of dentry too here, we don't. + * 1. common smb servers do not update mtime on dir changes + * 2. it requires an extra smb request + * (revalidate has the same timeout as ctl.head.time) + * + * Instead smbfs invalidates its own cache on local changes + * and remote changes are not seen until timeout. 
+ */ + } + + if (filp->f_pos > ctl.head.end) + goto finished; + + ctl.fpos = filp->f_pos + (SMB_DIRCACHE_START - 2); + ctl.ofs = ctl.fpos / SMB_DIRCACHE_SIZE; + ctl.idx = ctl.fpos % SMB_DIRCACHE_SIZE; + + for (;;) { + if (ctl.ofs != 0) { + ctl.page = find_lock_page(&dir->i_data, ctl.ofs); + if (!ctl.page) + goto invalid_cache; + ctl.cache = kmap(ctl.page); + if (!PageUptodate(ctl.page)) + goto invalid_cache; + } + while (ctl.idx < SMB_DIRCACHE_SIZE) { + struct dentry *dent; + int res; + + dent = smb_dget_fpos(ctl.cache->dentry[ctl.idx], + dentry, filp->f_pos); + if (!dent) + goto invalid_cache; + + res = filldir(dirent, dent->d_name.name, + dent->d_name.len, filp->f_pos, + dent->d_inode->i_ino, DT_UNKNOWN); + dput(dent); + if (res) + goto finished; + filp->f_pos += 1; + ctl.idx += 1; + if (filp->f_pos > ctl.head.end) + goto finished; + } + if (ctl.page) { + kunmap(ctl.page); + SetPageUptodate(ctl.page); + unlock_page(ctl.page); + page_cache_release(ctl.page); + ctl.page = NULL; + } + ctl.idx = 0; + ctl.ofs += 1; + } +invalid_cache: + if (ctl.page) { + kunmap(ctl.page); + unlock_page(ctl.page); + page_cache_release(ctl.page); + ctl.page = NULL; + } + ctl.cache = cache; +init_cache: + smb_invalidate_dircache_entries(dentry); + ctl.head.time = jiffies; + ctl.head.eof = 0; + ctl.fpos = 2; + ctl.ofs = 0; + ctl.idx = SMB_DIRCACHE_START; + ctl.filled = 0; + ctl.valid = 1; +read_really: + result = server->ops->readdir(filp, dirent, filldir, &ctl); + if (result == -ERESTARTSYS && page) + ClearPageUptodate(page); + if (ctl.idx == -1) + goto invalid_cache; /* retry */ + ctl.head.end = ctl.fpos - 1; + ctl.head.eof = ctl.valid; +finished: + if (page) { + cache->head = ctl.head; + kunmap(page); + if (result != -ERESTARTSYS) + SetPageUptodate(page); + unlock_page(page); + page_cache_release(page); + } + if (ctl.page) { + kunmap(ctl.page); + SetPageUptodate(ctl.page); + unlock_page(ctl.page); + page_cache_release(ctl.page); + } +out: + unlock_kernel(); + return result; +} + +static int +smb_dir_open(struct inode *dir, struct file *file) +{ + struct dentry *dentry = file->f_path.dentry; + struct smb_sb_info *server; + int error = 0; + + VERBOSE("(%s/%s)\n", dentry->d_parent->d_name.name, + file->f_path.dentry->d_name.name); + + /* + * Directory timestamps in the core protocol aren't updated + * when a file is added, so we give them a very short TTL. + */ + lock_kernel(); + server = server_from_dentry(dentry); + if (server->opt.protocol < SMB_PROTOCOL_LANMAN2) { + unsigned long age = jiffies - SMB_I(dir)->oldmtime; + if (age > 2*HZ) + smb_invalid_dir_cache(dir); + } + + /* + * Note: in order to allow the smbmount process to open the + * mount point, we only revalidate if the connection is valid or + * if the process is trying to access something other than the root. 
+ */ + if (server->state == CONN_VALID || !IS_ROOT(dentry)) + error = smb_revalidate_inode(dentry); + unlock_kernel(); + return error; +} + +/* + * Dentry operations routines + */ +static int smb_lookup_validate(struct dentry *, struct nameidata *); +static int smb_hash_dentry(struct dentry *, struct qstr *); +static int smb_compare_dentry(struct dentry *, struct qstr *, struct qstr *); +static int smb_delete_dentry(struct dentry *); + +static const struct dentry_operations smbfs_dentry_operations = +{ + .d_revalidate = smb_lookup_validate, + .d_hash = smb_hash_dentry, + .d_compare = smb_compare_dentry, + .d_delete = smb_delete_dentry, +}; + +static const struct dentry_operations smbfs_dentry_operations_case = +{ + .d_revalidate = smb_lookup_validate, + .d_delete = smb_delete_dentry, +}; + + +/* + * This is the callback when the dcache has a lookup hit. + */ +static int +smb_lookup_validate(struct dentry * dentry, struct nameidata *nd) +{ + struct smb_sb_info *server = server_from_dentry(dentry); + struct inode * inode = dentry->d_inode; + unsigned long age = jiffies - dentry->d_time; + int valid; + + /* + * The default validation is based on dentry age: + * we believe in dentries for a few seconds. (But each + * successful server lookup renews the timestamp.) + */ + valid = (age <= SMB_MAX_AGE(server)); +#ifdef SMBFS_DEBUG_VERBOSE + if (!valid) + VERBOSE("%s/%s not valid, age=%lu\n", + DENTRY_PATH(dentry), age); +#endif + + if (inode) { + lock_kernel(); + if (is_bad_inode(inode)) { + PARANOIA("%s/%s has dud inode\n", DENTRY_PATH(dentry)); + valid = 0; + } else if (!valid) + valid = (smb_revalidate_inode(dentry) == 0); + unlock_kernel(); + } else { + /* + * What should we do for negative dentries? + */ + } + return valid; +} + +static int +smb_hash_dentry(struct dentry *dir, struct qstr *this) +{ + unsigned long hash; + int i; + + hash = init_name_hash(); + for (i=0; i < this->len ; i++) + hash = partial_name_hash(tolower(this->name[i]), hash); + this->hash = end_name_hash(hash); + + return 0; +} + +static int +smb_compare_dentry(struct dentry *dir, struct qstr *a, struct qstr *b) +{ + int i, result = 1; + + if (a->len != b->len) + goto out; + for (i=0; i < a->len; i++) { + if (tolower(a->name[i]) != tolower(b->name[i])) + goto out; + } + result = 0; +out: + return result; +} + +/* + * This is the callback from dput() when d_count is going to 0. + * We use this to unhash dentries with bad inodes. + */ +static int +smb_delete_dentry(struct dentry * dentry) +{ + if (dentry->d_inode) { + if (is_bad_inode(dentry->d_inode)) { + PARANOIA("bad inode, unhashing %s/%s\n", + DENTRY_PATH(dentry)); + return 1; + } + } else { + /* N.B. Unhash negative dentries? */ + } + return 0; +} + +/* + * Initialize a new dentry + */ +void +smb_new_dentry(struct dentry *dentry) +{ + struct smb_sb_info *server = server_from_dentry(dentry); + + if (server->mnt->flags & SMB_MOUNT_CASE) + dentry->d_op = &smbfs_dentry_operations_case; + else + dentry->d_op = &smbfs_dentry_operations; + dentry->d_time = jiffies; +} + + +/* + * Whenever a lookup succeeds, we know the parent directories + * are all valid, so we want to update the dentry timestamps. + * N.B. Move this to dcache? 
+ */ +void +smb_renew_times(struct dentry * dentry) +{ + dget(dentry); + spin_lock(&dentry->d_lock); + for (;;) { + struct dentry *parent; + + dentry->d_time = jiffies; + if (IS_ROOT(dentry)) + break; + parent = dentry->d_parent; + dget(parent); + spin_unlock(&dentry->d_lock); + dput(dentry); + dentry = parent; + spin_lock(&dentry->d_lock); + } + spin_unlock(&dentry->d_lock); + dput(dentry); +} + +static struct dentry * +smb_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +{ + struct smb_fattr finfo; + struct inode *inode; + int error; + struct smb_sb_info *server; + + error = -ENAMETOOLONG; + if (dentry->d_name.len > SMB_MAXNAMELEN) + goto out; + + /* Do not allow lookup of names with backslashes in */ + error = -EINVAL; + if (memchr(dentry->d_name.name, '\\', dentry->d_name.len)) + goto out; + + lock_kernel(); + error = smb_proc_getattr(dentry, &finfo); +#ifdef SMBFS_PARANOIA + if (error && error != -ENOENT) + PARANOIA("find %s/%s failed, error=%d\n", + DENTRY_PATH(dentry), error); +#endif + + inode = NULL; + if (error == -ENOENT) + goto add_entry; + if (!error) { + error = -EACCES; + finfo.f_ino = iunique(dentry->d_sb, 2); + inode = smb_iget(dir->i_sb, &finfo); + if (inode) { + add_entry: + server = server_from_dentry(dentry); + if (server->mnt->flags & SMB_MOUNT_CASE) + dentry->d_op = &smbfs_dentry_operations_case; + else + dentry->d_op = &smbfs_dentry_operations; + + d_add(dentry, inode); + smb_renew_times(dentry); + error = 0; + } + } + unlock_kernel(); +out: + return ERR_PTR(error); +} + +/* + * This code is common to all routines creating a new inode. + */ +static int +smb_instantiate(struct dentry *dentry, __u16 fileid, int have_id) +{ + struct smb_sb_info *server = server_from_dentry(dentry); + struct inode *inode; + int error; + struct smb_fattr fattr; + + VERBOSE("file %s/%s, fileid=%u\n", DENTRY_PATH(dentry), fileid); + + error = smb_proc_getattr(dentry, &fattr); + if (error) + goto out_close; + + smb_renew_times(dentry); + fattr.f_ino = iunique(dentry->d_sb, 2); + inode = smb_iget(dentry->d_sb, &fattr); + if (!inode) + goto out_no_inode; + + if (have_id) { + struct smb_inode_info *ei = SMB_I(inode); + ei->fileid = fileid; + ei->access = SMB_O_RDWR; + ei->open = server->generation; + } + d_instantiate(dentry, inode); +out: + return error; + +out_no_inode: + error = -EACCES; +out_close: + if (have_id) { + PARANOIA("%s/%s failed, error=%d, closing %u\n", + DENTRY_PATH(dentry), error, fileid); + smb_close_fileid(dentry, fileid); + } + goto out; +} + +/* N.B. How should the mode argument be used? */ +static int +smb_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + struct smb_sb_info *server = server_from_dentry(dentry); + __u16 fileid; + int error; + struct iattr attr; + + VERBOSE("creating %s/%s, mode=%d\n", DENTRY_PATH(dentry), mode); + + lock_kernel(); + smb_invalid_dir_cache(dir); + error = smb_proc_create(dentry, 0, get_seconds(), &fileid); + if (!error) { + if (server->opt.capabilities & SMB_CAP_UNIX) { + /* Set attributes for new file */ + attr.ia_valid = ATTR_MODE; + attr.ia_mode = mode; + error = smb_proc_setattr_unix(dentry, &attr, 0, 0); + } + error = smb_instantiate(dentry, fileid, 1); + } else { + PARANOIA("%s/%s failed, error=%d\n", + DENTRY_PATH(dentry), error); + } + unlock_kernel(); + return error; +} + +/* N.B. How should the mode argument be used? 
*/ +static int +smb_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + struct smb_sb_info *server = server_from_dentry(dentry); + int error; + struct iattr attr; + + lock_kernel(); + smb_invalid_dir_cache(dir); + error = smb_proc_mkdir(dentry); + if (!error) { + if (server->opt.capabilities & SMB_CAP_UNIX) { + /* Set attributes for new directory */ + attr.ia_valid = ATTR_MODE; + attr.ia_mode = mode; + error = smb_proc_setattr_unix(dentry, &attr, 0, 0); + } + error = smb_instantiate(dentry, 0, 0); + } + unlock_kernel(); + return error; +} + +static int +smb_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + int error; + + /* + * Close the directory if it's open. + */ + lock_kernel(); + smb_close(inode); + + /* + * Check that nobody else is using the directory.. + */ + error = -EBUSY; + if (!d_unhashed(dentry)) + goto out; + + smb_invalid_dir_cache(dir); + error = smb_proc_rmdir(dentry); + +out: + unlock_kernel(); + return error; +} + +static int +smb_unlink(struct inode *dir, struct dentry *dentry) +{ + int error; + + /* + * Close the file if it's open. + */ + lock_kernel(); + smb_close(dentry->d_inode); + + smb_invalid_dir_cache(dir); + error = smb_proc_unlink(dentry); + if (!error) + smb_renew_times(dentry); + unlock_kernel(); + return error; +} + +static int +smb_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + int error; + + /* + * Close any open files, and check whether to delete the + * target before attempting the rename. + */ + lock_kernel(); + if (old_dentry->d_inode) + smb_close(old_dentry->d_inode); + if (new_dentry->d_inode) { + smb_close(new_dentry->d_inode); + error = smb_proc_unlink(new_dentry); + if (error) { + VERBOSE("unlink %s/%s, error=%d\n", + DENTRY_PATH(new_dentry), error); + goto out; + } + /* FIXME */ + d_delete(new_dentry); + } + + smb_invalid_dir_cache(old_dir); + smb_invalid_dir_cache(new_dir); + error = smb_proc_mv(old_dentry, new_dentry); + if (!error) { + smb_renew_times(old_dentry); + smb_renew_times(new_dentry); + } +out: + unlock_kernel(); + return error; +} + +/* + * FIXME: samba servers won't let you create device nodes unless uid/gid + * matches the connection credentials (and we don't know which those are ...) + */ +static int +smb_make_node(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) +{ + int error; + struct iattr attr; + + attr.ia_valid = ATTR_MODE | ATTR_UID | ATTR_GID; + attr.ia_mode = mode; + current_euid_egid(&attr.ia_uid, &attr.ia_gid); + + if (!new_valid_dev(dev)) + return -EINVAL; + + smb_invalid_dir_cache(dir); + error = smb_proc_setattr_unix(dentry, &attr, MAJOR(dev), MINOR(dev)); + if (!error) { + error = smb_instantiate(dentry, 0, 0); + } + return error; +} + +/* + * dentry = existing file + * new_dentry = new file + */ +static int +smb_link(struct dentry *dentry, struct inode *dir, struct dentry *new_dentry) +{ + int error; + + DEBUG1("smb_link old=%s/%s new=%s/%s\n", + DENTRY_PATH(dentry), DENTRY_PATH(new_dentry)); + smb_invalid_dir_cache(dir); + error = smb_proc_link(server_from_dentry(dentry), dentry, new_dentry); + if (!error) { + smb_renew_times(dentry); + error = smb_instantiate(new_dentry, 0, 0); + } + return error; +} diff --git a/drivers/staging/smbfs/file.c b/drivers/staging/smbfs/file.c new file mode 100644 index 00000000000..5dcd19c60eb --- /dev/null +++ b/drivers/staging/smbfs/file.c @@ -0,0 +1,453 @@ +/* + * file.c + * + * Copyright (C) 1995, 1996, 1997 by Paal-Kr. 
Engstad and Volker Lendecke + * Copyright (C) 1997 by Volker Lendecke + * + * Please add a note about your changes to smbfs in the ChangeLog file. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "smbno.h" +#include "smb_fs.h" +#include "smb_debug.h" +#include "proto.h" + +static int +smb_fsync(struct file *file, int datasync) +{ + struct dentry *dentry = file->f_path.dentry; + struct smb_sb_info *server = server_from_dentry(dentry); + int result; + + VERBOSE("sync file %s/%s\n", DENTRY_PATH(dentry)); + + /* + * The VFS will writepage() all dirty pages for us, but we + * should send a SMBflush to the server, letting it know that + * we want things synchronized with actual storage. + * + * Note: this function requires all pages to have been written already + * (should be ok with writepage_sync) + */ + result = smb_proc_flush(server, SMB_I(dentry->d_inode)->fileid); + return result; +} + +/* + * Read a page synchronously. + */ +static int +smb_readpage_sync(struct dentry *dentry, struct page *page) +{ + char *buffer = kmap(page); + loff_t offset = (loff_t)page->index << PAGE_CACHE_SHIFT; + struct smb_sb_info *server = server_from_dentry(dentry); + unsigned int rsize = smb_get_rsize(server); + int count = PAGE_SIZE; + int result; + + VERBOSE("file %s/%s, count=%d@%Ld, rsize=%d\n", + DENTRY_PATH(dentry), count, offset, rsize); + + result = smb_open(dentry, SMB_O_RDONLY); + if (result < 0) + goto io_error; + + do { + if (count < rsize) + rsize = count; + + result = server->ops->read(dentry->d_inode,offset,rsize,buffer); + if (result < 0) + goto io_error; + + count -= result; + offset += result; + buffer += result; + dentry->d_inode->i_atime = + current_fs_time(dentry->d_inode->i_sb); + if (result < rsize) + break; + } while (count); + + memset(buffer, 0, count); + flush_dcache_page(page); + SetPageUptodate(page); + result = 0; + +io_error: + kunmap(page); + unlock_page(page); + return result; +} + +/* + * We are called with the page locked and we unlock it when done. + */ +static int +smb_readpage(struct file *file, struct page *page) +{ + int error; + struct dentry *dentry = file->f_path.dentry; + + page_cache_get(page); + error = smb_readpage_sync(dentry, page); + page_cache_release(page); + return error; +} + +/* + * Write a page synchronously. + * Offset is the data offset within the page. + */ +static int +smb_writepage_sync(struct inode *inode, struct page *page, + unsigned long pageoffset, unsigned int count) +{ + loff_t offset; + char *buffer = kmap(page) + pageoffset; + struct smb_sb_info *server = server_from_inode(inode); + unsigned int wsize = smb_get_wsize(server); + int ret = 0; + + offset = ((loff_t)page->index << PAGE_CACHE_SHIFT) + pageoffset; + VERBOSE("file ino=%ld, fileid=%d, count=%d@%Ld, wsize=%d\n", + inode->i_ino, SMB_I(inode)->fileid, count, offset, wsize); + + do { + int write_ret; + + if (count < wsize) + wsize = count; + + write_ret = server->ops->write(inode, offset, wsize, buffer); + if (write_ret < 0) { + PARANOIA("failed write, wsize=%d, write_ret=%d\n", + wsize, write_ret); + ret = write_ret; + break; + } + /* N.B. what if result < wsize?? */ +#ifdef SMBFS_PARANOIA + if (write_ret < wsize) + PARANOIA("short write, wsize=%d, write_ret=%d\n", + wsize, write_ret); +#endif + buffer += wsize; + offset += wsize; + count -= wsize; + /* + * Update the inode now rather than waiting for a refresh. 
+ */ + inode->i_mtime = inode->i_atime = current_fs_time(inode->i_sb); + SMB_I(inode)->flags |= SMB_F_LOCALWRITE; + if (offset > inode->i_size) + inode->i_size = offset; + } while (count); + + kunmap(page); + return ret; +} + +/* + * Write a page to the server. This will be used for NFS swapping only + * (for now), and we currently do this synchronously only. + * + * We are called with the page locked and we unlock it when done. + */ +static int +smb_writepage(struct page *page, struct writeback_control *wbc) +{ + struct address_space *mapping = page->mapping; + struct inode *inode; + unsigned long end_index; + unsigned offset = PAGE_CACHE_SIZE; + int err; + + BUG_ON(!mapping); + inode = mapping->host; + BUG_ON(!inode); + + end_index = inode->i_size >> PAGE_CACHE_SHIFT; + + /* easy case */ + if (page->index < end_index) + goto do_it; + /* things got complicated... */ + offset = inode->i_size & (PAGE_CACHE_SIZE-1); + /* OK, are we completely out? */ + if (page->index >= end_index+1 || !offset) + return 0; /* truncated - don't care */ +do_it: + page_cache_get(page); + err = smb_writepage_sync(inode, page, 0, offset); + SetPageUptodate(page); + unlock_page(page); + page_cache_release(page); + return err; +} + +static int +smb_updatepage(struct file *file, struct page *page, unsigned long offset, + unsigned int count) +{ + struct dentry *dentry = file->f_path.dentry; + + DEBUG1("(%s/%s %d@%lld)\n", DENTRY_PATH(dentry), count, + ((unsigned long long)page->index << PAGE_CACHE_SHIFT) + offset); + + return smb_writepage_sync(dentry->d_inode, page, offset, count); +} + +static ssize_t +smb_file_aio_read(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct file * file = iocb->ki_filp; + struct dentry * dentry = file->f_path.dentry; + ssize_t status; + + VERBOSE("file %s/%s, count=%lu@%lu\n", DENTRY_PATH(dentry), + (unsigned long) iocb->ki_left, (unsigned long) pos); + + status = smb_revalidate_inode(dentry); + if (status) { + PARANOIA("%s/%s validation failed, error=%Zd\n", + DENTRY_PATH(dentry), status); + goto out; + } + + VERBOSE("before read, size=%ld, flags=%x, atime=%ld\n", + (long)dentry->d_inode->i_size, + dentry->d_inode->i_flags, dentry->d_inode->i_atime.tv_sec); + + status = generic_file_aio_read(iocb, iov, nr_segs, pos); +out: + return status; +} + +static int +smb_file_mmap(struct file * file, struct vm_area_struct * vma) +{ + struct dentry * dentry = file->f_path.dentry; + int status; + + VERBOSE("file %s/%s, address %lu - %lu\n", + DENTRY_PATH(dentry), vma->vm_start, vma->vm_end); + + status = smb_revalidate_inode(dentry); + if (status) { + PARANOIA("%s/%s validation failed, error=%d\n", + DENTRY_PATH(dentry), status); + goto out; + } + status = generic_file_mmap(file, vma); +out: + return status; +} + +static ssize_t +smb_file_splice_read(struct file *file, loff_t *ppos, + struct pipe_inode_info *pipe, size_t count, + unsigned int flags) +{ + struct dentry *dentry = file->f_path.dentry; + ssize_t status; + + VERBOSE("file %s/%s, pos=%Ld, count=%lu\n", + DENTRY_PATH(dentry), *ppos, count); + + status = smb_revalidate_inode(dentry); + if (status) { + PARANOIA("%s/%s validation failed, error=%Zd\n", + DENTRY_PATH(dentry), status); + goto out; + } + status = generic_file_splice_read(file, ppos, pipe, count, flags); +out: + return status; +} + +/* + * This does the "real" work of the write. The generic routine has + * allocated the page, locked it, done all the page alignment stuff + * calculations etc. 
Now we should just copy the data from user + * space and write it back to the real medium.. + * + * If the writer ends up delaying the write, the writer needs to + * increment the page use counts until he is done with the page. + */ +static int smb_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + *pagep = grab_cache_page_write_begin(mapping, index, flags); + if (!*pagep) + return -ENOMEM; + return 0; +} + +static int smb_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + int status; + unsigned offset = pos & (PAGE_CACHE_SIZE - 1); + + lock_kernel(); + status = smb_updatepage(file, page, offset, copied); + unlock_kernel(); + + if (!status) { + if (!PageUptodate(page) && copied == PAGE_CACHE_SIZE) + SetPageUptodate(page); + status = copied; + } + + unlock_page(page); + page_cache_release(page); + + return status; +} + +const struct address_space_operations smb_file_aops = { + .readpage = smb_readpage, + .writepage = smb_writepage, + .write_begin = smb_write_begin, + .write_end = smb_write_end, +}; + +/* + * Write to a file (through the page cache). + */ +static ssize_t +smb_file_aio_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct file * file = iocb->ki_filp; + struct dentry * dentry = file->f_path.dentry; + ssize_t result; + + VERBOSE("file %s/%s, count=%lu@%lu\n", + DENTRY_PATH(dentry), + (unsigned long) iocb->ki_left, (unsigned long) pos); + + result = smb_revalidate_inode(dentry); + if (result) { + PARANOIA("%s/%s validation failed, error=%Zd\n", + DENTRY_PATH(dentry), result); + goto out; + } + + result = smb_open(dentry, SMB_O_WRONLY); + if (result) + goto out; + + if (iocb->ki_left > 0) { + result = generic_file_aio_write(iocb, iov, nr_segs, pos); + VERBOSE("pos=%ld, size=%ld, mtime=%ld, atime=%ld\n", + (long) file->f_pos, (long) dentry->d_inode->i_size, + dentry->d_inode->i_mtime.tv_sec, + dentry->d_inode->i_atime.tv_sec); + } +out: + return result; +} + +static int +smb_file_open(struct inode *inode, struct file * file) +{ + int result; + struct dentry *dentry = file->f_path.dentry; + int smb_mode = (file->f_mode & O_ACCMODE) - 1; + + lock_kernel(); + result = smb_open(dentry, smb_mode); + if (result) + goto out; + SMB_I(inode)->openers++; +out: + unlock_kernel(); + return result; +} + +static int +smb_file_release(struct inode *inode, struct file * file) +{ + lock_kernel(); + if (!--SMB_I(inode)->openers) { + /* We must flush any dirty pages now as we won't be able to + write anything after close. mmap can trigger this. + "openers" should perhaps include mmap'ers ... */ + filemap_write_and_wait(inode->i_mapping); + smb_close(inode); + } + unlock_kernel(); + return 0; +} + +/* + * Check whether the required access is compatible with + * an inode's permission. SMB doesn't recognize superuser + * privileges, so we need our own check for this. 
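+ *
+ * Only the "user" permission bits are consulted below, so for example
+ * a 0444 file refuses MAY_WRITE even when the caller is root.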
+ */ +static int +smb_file_permission(struct inode *inode, int mask) +{ + int mode = inode->i_mode; + int error = 0; + + VERBOSE("mode=%x, mask=%x\n", mode, mask); + + /* Look at user permissions */ + mode >>= 6; + if (mask & ~mode & (MAY_READ | MAY_WRITE | MAY_EXEC)) + error = -EACCES; + return error; +} + +static loff_t smb_remote_llseek(struct file *file, loff_t offset, int origin) +{ + loff_t ret; + lock_kernel(); + ret = generic_file_llseek_unlocked(file, offset, origin); + unlock_kernel(); + return ret; +} + +const struct file_operations smb_file_operations = +{ + .llseek = smb_remote_llseek, + .read = do_sync_read, + .aio_read = smb_file_aio_read, + .write = do_sync_write, + .aio_write = smb_file_aio_write, + .unlocked_ioctl = smb_ioctl, + .mmap = smb_file_mmap, + .open = smb_file_open, + .release = smb_file_release, + .fsync = smb_fsync, + .splice_read = smb_file_splice_read, +}; + +const struct inode_operations smb_file_inode_operations = +{ + .permission = smb_file_permission, + .getattr = smb_getattr, + .setattr = smb_notify_change, +}; diff --git a/drivers/staging/smbfs/getopt.c b/drivers/staging/smbfs/getopt.c new file mode 100644 index 00000000000..7ae0f5273ab --- /dev/null +++ b/drivers/staging/smbfs/getopt.c @@ -0,0 +1,64 @@ +/* + * getopt.c + */ + +#include +#include +#include + +#include "getopt.h" + +/** + * smb_getopt - option parser + * @caller: name of the caller, for error messages + * @options: the options string + * @opts: an array of &struct option entries controlling parser operations + * @optopt: output; will contain the current option + * @optarg: output; will contain the value (if one exists) + * @flag: output; may be NULL; should point to a long for or'ing flags + * @value: output; may be NULL; will be overwritten with the integer value + * of the current argument. + * + * Helper to parse options on the format used by mount ("a=b,c=d,e,f"). + * Returns opts->val if a matching entry in the 'opts' array is found, + * 0 when no more tokens are found, -1 if an error is encountered. 
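+ *
+ * For example, with the smbfs option table, parsing "uid=500,case"
+ * returns 'u' (with *value == 500 and *optarg pointing at "500") on
+ * the first call, then 1 with SMB_MOUNT_CASE or'ed into *flag, and
+ * finally 0 once the string is exhausted. strsep() advances *options,
+ * so the caller simply passes the same pointer back in on every call.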
+ */ +int smb_getopt(char *caller, char **options, struct option *opts, + char **optopt, char **optarg, unsigned long *flag, + unsigned long *value) +{ + char *token; + char *val; + int i; + + do { + if ((token = strsep(options, ",")) == NULL) + return 0; + } while (*token == '\0'); + *optopt = token; + + *optarg = NULL; + if ((val = strchr (token, '=')) != NULL) { + *val++ = 0; + if (value) + *value = simple_strtoul(val, NULL, 0); + *optarg = val; + } + + for (i = 0; opts[i].name != NULL; i++) { + if (!strcmp(opts[i].name, token)) { + if (!opts[i].flag && (!val || !*val)) { + printk("%s: the %s option requires an argument\n", + caller, token); + return -1; + } + + if (flag && opts[i].flag) + *flag |= opts[i].flag; + + return opts[i].val; + } + } + printk("%s: Unrecognized mount option %s\n", caller, token); + return -1; +} diff --git a/drivers/staging/smbfs/getopt.h b/drivers/staging/smbfs/getopt.h new file mode 100644 index 00000000000..146219ac7c4 --- /dev/null +++ b/drivers/staging/smbfs/getopt.h @@ -0,0 +1,14 @@ +#ifndef _LINUX_GETOPT_H +#define _LINUX_GETOPT_H + +struct option { + const char *name; + unsigned long flag; + int val; +}; + +extern int smb_getopt(char *caller, char **options, struct option *opts, + char **optopt, char **optarg, unsigned long *flag, + unsigned long *value); + +#endif /* _LINUX_GETOPT_H */ diff --git a/drivers/staging/smbfs/inode.c b/drivers/staging/smbfs/inode.c new file mode 100644 index 00000000000..9287599ddb5 --- /dev/null +++ b/drivers/staging/smbfs/inode.c @@ -0,0 +1,839 @@ +/* + * inode.c + * + * Copyright (C) 1995, 1996 by Paal-Kr. Engstad and Volker Lendecke + * Copyright (C) 1997 by Volker Lendecke + * + * Please add a note about your changes to smbfs in the ChangeLog file. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "smb_fs.h" +#include "smbno.h" +#include "smb_mount.h" +#include "smb_debug.h" +#include "getopt.h" +#include "proto.h" + +/* Always pick a default string */ +#ifdef CONFIG_SMB_NLS_REMOTE +#define SMB_NLS_REMOTE CONFIG_SMB_NLS_REMOTE +#else +#define SMB_NLS_REMOTE "" +#endif + +#define SMB_TTL_DEFAULT 1000 + +static void smb_evict_inode(struct inode *); +static void smb_put_super(struct super_block *); +static int smb_statfs(struct dentry *, struct kstatfs *); +static int smb_show_options(struct seq_file *, struct vfsmount *); + +static struct kmem_cache *smb_inode_cachep; + +static struct inode *smb_alloc_inode(struct super_block *sb) +{ + struct smb_inode_info *ei; + ei = (struct smb_inode_info *)kmem_cache_alloc(smb_inode_cachep, GFP_KERNEL); + if (!ei) + return NULL; + return &ei->vfs_inode; +} + +static void smb_destroy_inode(struct inode *inode) +{ + kmem_cache_free(smb_inode_cachep, SMB_I(inode)); +} + +static void init_once(void *foo) +{ + struct smb_inode_info *ei = (struct smb_inode_info *) foo; + + inode_init_once(&ei->vfs_inode); +} + +static int init_inodecache(void) +{ + smb_inode_cachep = kmem_cache_create("smb_inode_cache", + sizeof(struct smb_inode_info), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + init_once); + if (smb_inode_cachep == NULL) + return -ENOMEM; + return 0; +} + +static void destroy_inodecache(void) +{ + kmem_cache_destroy(smb_inode_cachep); +} + +static int smb_remount(struct super_block *sb, int *flags, char *data) +{ + *flags |= MS_NODIRATIME; + return 0; +} + +static const struct super_operations smb_sops = 
+{ + .alloc_inode = smb_alloc_inode, + .destroy_inode = smb_destroy_inode, + .drop_inode = generic_delete_inode, + .evict_inode = smb_evict_inode, + .put_super = smb_put_super, + .statfs = smb_statfs, + .show_options = smb_show_options, + .remount_fs = smb_remount, +}; + + +/* We are always generating a new inode here */ +struct inode * +smb_iget(struct super_block *sb, struct smb_fattr *fattr) +{ + struct smb_sb_info *server = SMB_SB(sb); + struct inode *result; + + DEBUG1("smb_iget: %p\n", fattr); + + result = new_inode(sb); + if (!result) + return result; + result->i_ino = fattr->f_ino; + SMB_I(result)->open = 0; + SMB_I(result)->fileid = 0; + SMB_I(result)->access = 0; + SMB_I(result)->flags = 0; + SMB_I(result)->closed = 0; + SMB_I(result)->openers = 0; + smb_set_inode_attr(result, fattr); + if (S_ISREG(result->i_mode)) { + result->i_op = &smb_file_inode_operations; + result->i_fop = &smb_file_operations; + result->i_data.a_ops = &smb_file_aops; + } else if (S_ISDIR(result->i_mode)) { + if (server->opt.capabilities & SMB_CAP_UNIX) + result->i_op = &smb_dir_inode_operations_unix; + else + result->i_op = &smb_dir_inode_operations; + result->i_fop = &smb_dir_operations; + } else if (S_ISLNK(result->i_mode)) { + result->i_op = &smb_link_inode_operations; + } else { + init_special_inode(result, result->i_mode, fattr->f_rdev); + } + insert_inode_hash(result); + return result; +} + +/* + * Copy the inode data to a smb_fattr structure. + */ +void +smb_get_inode_attr(struct inode *inode, struct smb_fattr *fattr) +{ + memset(fattr, 0, sizeof(struct smb_fattr)); + fattr->f_mode = inode->i_mode; + fattr->f_nlink = inode->i_nlink; + fattr->f_ino = inode->i_ino; + fattr->f_uid = inode->i_uid; + fattr->f_gid = inode->i_gid; + fattr->f_size = inode->i_size; + fattr->f_mtime = inode->i_mtime; + fattr->f_ctime = inode->i_ctime; + fattr->f_atime = inode->i_atime; + fattr->f_blocks = inode->i_blocks; + + fattr->attr = SMB_I(inode)->attr; + /* + * Keep the attributes in sync with the inode permissions. + */ + if (fattr->f_mode & S_IWUSR) + fattr->attr &= ~aRONLY; + else + fattr->attr |= aRONLY; +} + +/* + * Update the inode, possibly causing it to invalidate its pages if mtime/size + * is different from last time. + */ +void +smb_set_inode_attr(struct inode *inode, struct smb_fattr *fattr) +{ + struct smb_inode_info *ei = SMB_I(inode); + + /* + * A size change should have a different mtime, or same mtime + * but different size. + */ + time_t last_time = inode->i_mtime.tv_sec; + loff_t last_sz = inode->i_size; + + inode->i_mode = fattr->f_mode; + inode->i_nlink = fattr->f_nlink; + inode->i_uid = fattr->f_uid; + inode->i_gid = fattr->f_gid; + inode->i_ctime = fattr->f_ctime; + inode->i_blocks = fattr->f_blocks; + inode->i_size = fattr->f_size; + inode->i_mtime = fattr->f_mtime; + inode->i_atime = fattr->f_atime; + ei->attr = fattr->attr; + + /* + * Update the "last time refreshed" field for revalidation. + */ + ei->oldmtime = jiffies; + + if (inode->i_mtime.tv_sec != last_time || inode->i_size != last_sz) { + VERBOSE("%ld changed, old=%ld, new=%ld, oz=%ld, nz=%ld\n", + inode->i_ino, + (long) last_time, (long) inode->i_mtime.tv_sec, + (long) last_sz, (long) inode->i_size); + + if (!S_ISDIR(inode->i_mode)) + invalidate_remote_inode(inode); + } +} + +/* + * This is called if the connection has gone bad ... + * try to kill off all the current inodes. 
+ */ +void +smb_invalidate_inodes(struct smb_sb_info *server) +{ + VERBOSE("\n"); + shrink_dcache_sb(SB_of(server)); + invalidate_inodes(SB_of(server)); +} + +/* + * This is called to update the inode attributes after + * we've made changes to a file or directory. + */ +static int +smb_refresh_inode(struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + int error; + struct smb_fattr fattr; + + error = smb_proc_getattr(dentry, &fattr); + if (!error) { + smb_renew_times(dentry); + /* + * Check whether the type part of the mode changed, + * and don't update the attributes if it did. + * + * And don't dick with the root inode + */ + if (inode->i_ino == 2) + return error; + if (S_ISLNK(inode->i_mode)) + return error; /* VFS will deal with it */ + + if ((inode->i_mode & S_IFMT) == (fattr.f_mode & S_IFMT)) { + smb_set_inode_attr(inode, &fattr); + } else { + /* + * Big trouble! The inode has become a new object, + * so any operations attempted on it are invalid. + * + * To limit damage, mark the inode as bad so that + * subsequent lookup validations will fail. + */ + PARANOIA("%s/%s changed mode, %07o to %07o\n", + DENTRY_PATH(dentry), + inode->i_mode, fattr.f_mode); + + fattr.f_mode = inode->i_mode; /* save mode */ + make_bad_inode(inode); + inode->i_mode = fattr.f_mode; /* restore mode */ + /* + * No need to worry about unhashing the dentry: the + * lookup validation will see that the inode is bad. + * But we do want to invalidate the caches ... + */ + if (!S_ISDIR(inode->i_mode)) + invalidate_remote_inode(inode); + else + smb_invalid_dir_cache(inode); + error = -EIO; + } + } + return error; +} + +/* + * This is called when we want to check whether the inode + * has changed on the server. If it has changed, we must + * invalidate our local caches. + */ +int +smb_revalidate_inode(struct dentry *dentry) +{ + struct smb_sb_info *s = server_from_dentry(dentry); + struct inode *inode = dentry->d_inode; + int error = 0; + + DEBUG1("smb_revalidate_inode\n"); + lock_kernel(); + + /* + * Check whether we've recently refreshed the inode. + */ + if (time_before(jiffies, SMB_I(inode)->oldmtime + SMB_MAX_AGE(s))) { + VERBOSE("up-to-date, ino=%ld, jiffies=%lu, oldtime=%lu\n", + inode->i_ino, jiffies, SMB_I(inode)->oldmtime); + goto out; + } + + error = smb_refresh_inode(dentry); +out: + unlock_kernel(); + return error; +} + +/* + * This routine is called when i_nlink == 0 and i_count goes to 0. + * All blocking cleanup operations need to go here to avoid races. + */ +static void +smb_evict_inode(struct inode *ino) +{ + DEBUG1("ino=%ld\n", ino->i_ino); + truncate_inode_pages(&ino->i_data, 0); + end_writeback(ino); + lock_kernel(); + if (smb_close(ino)) + PARANOIA("could not close inode %ld\n", ino->i_ino); + unlock_kernel(); +} + +static struct option opts[] = { + { "version", 0, 'v' }, + { "win95", SMB_MOUNT_WIN95, 1 }, + { "oldattr", SMB_MOUNT_OLDATTR, 1 }, + { "dirattr", SMB_MOUNT_DIRATTR, 1 }, + { "case", SMB_MOUNT_CASE, 1 }, + { "uid", 0, 'u' }, + { "gid", 0, 'g' }, + { "file_mode", 0, 'f' }, + { "dir_mode", 0, 'd' }, + { "iocharset", 0, 'i' }, + { "codepage", 0, 'c' }, + { "ttl", 0, 't' }, + { NULL, 0, 0} +}; + +static int +parse_options(struct smb_mount_data_kernel *mnt, char *options) +{ + int c; + unsigned long flags; + unsigned long value; + char *optarg; + char *optopt; + + flags = 0; + while ( (c = smb_getopt("smbfs", &options, opts, + &optopt, &optarg, &flags, &value)) > 0) { + + VERBOSE("'%s' -> '%s'\n", optopt, optarg ? 
optarg : ""); + switch (c) { + case 1: + /* got a "flag" option */ + break; + case 'v': + if (value != SMB_MOUNT_VERSION) { + printk ("smbfs: Bad mount version %ld, expected %d\n", + value, SMB_MOUNT_VERSION); + return 0; + } + mnt->version = value; + break; + case 'u': + mnt->uid = value; + flags |= SMB_MOUNT_UID; + break; + case 'g': + mnt->gid = value; + flags |= SMB_MOUNT_GID; + break; + case 'f': + mnt->file_mode = (value & S_IRWXUGO) | S_IFREG; + flags |= SMB_MOUNT_FMODE; + break; + case 'd': + mnt->dir_mode = (value & S_IRWXUGO) | S_IFDIR; + flags |= SMB_MOUNT_DMODE; + break; + case 'i': + strlcpy(mnt->codepage.local_name, optarg, + SMB_NLS_MAXNAMELEN); + break; + case 'c': + strlcpy(mnt->codepage.remote_name, optarg, + SMB_NLS_MAXNAMELEN); + break; + case 't': + mnt->ttl = value; + break; + default: + printk ("smbfs: Unrecognized mount option %s\n", + optopt); + return -1; + } + } + mnt->flags = flags; + return c; +} + +/* + * smb_show_options() is for displaying mount options in /proc/mounts. + * It tries to avoid showing settings that were not changed from their + * defaults. + */ +static int +smb_show_options(struct seq_file *s, struct vfsmount *m) +{ + struct smb_mount_data_kernel *mnt = SMB_SB(m->mnt_sb)->mnt; + int i; + + for (i = 0; opts[i].name != NULL; i++) + if (mnt->flags & opts[i].flag) + seq_printf(s, ",%s", opts[i].name); + + if (mnt->flags & SMB_MOUNT_UID) + seq_printf(s, ",uid=%d", mnt->uid); + if (mnt->flags & SMB_MOUNT_GID) + seq_printf(s, ",gid=%d", mnt->gid); + if (mnt->mounted_uid != 0) + seq_printf(s, ",mounted_uid=%d", mnt->mounted_uid); + + /* + * Defaults for file_mode and dir_mode are unknown to us; they + * depend on the current umask of the user doing the mount. + */ + if (mnt->flags & SMB_MOUNT_FMODE) + seq_printf(s, ",file_mode=%04o", mnt->file_mode & S_IRWXUGO); + if (mnt->flags & SMB_MOUNT_DMODE) + seq_printf(s, ",dir_mode=%04o", mnt->dir_mode & S_IRWXUGO); + + if (strcmp(mnt->codepage.local_name, CONFIG_NLS_DEFAULT)) + seq_printf(s, ",iocharset=%s", mnt->codepage.local_name); + if (strcmp(mnt->codepage.remote_name, SMB_NLS_REMOTE)) + seq_printf(s, ",codepage=%s", mnt->codepage.remote_name); + + if (mnt->ttl != SMB_TTL_DEFAULT) + seq_printf(s, ",ttl=%d", mnt->ttl); + + return 0; +} + +static void +smb_unload_nls(struct smb_sb_info *server) +{ + unload_nls(server->remote_nls); + unload_nls(server->local_nls); +} + +static void +smb_put_super(struct super_block *sb) +{ + struct smb_sb_info *server = SMB_SB(sb); + + lock_kernel(); + + smb_lock_server(server); + server->state = CONN_INVALID; + smbiod_unregister_server(server); + + smb_close_socket(server); + + if (server->conn_pid) + kill_pid(server->conn_pid, SIGTERM, 1); + + bdi_destroy(&server->bdi); + kfree(server->ops); + smb_unload_nls(server); + sb->s_fs_info = NULL; + smb_unlock_server(server); + put_pid(server->conn_pid); + kfree(server); + + unlock_kernel(); +} + +static int smb_fill_super(struct super_block *sb, void *raw_data, int silent) +{ + struct smb_sb_info *server; + struct smb_mount_data_kernel *mnt; + struct smb_mount_data *oldmnt; + struct inode *root_inode; + struct smb_fattr root; + int ver; + void *mem; + static int warn_count; + + if (warn_count < 5) { + warn_count++; + printk(KERN_EMERG "smbfs is deprecated and will be removed" + " from the 2.6.37 kernel. 
Please migrate to cifs\n"); + } + + if (!raw_data) + goto out_no_data; + + oldmnt = (struct smb_mount_data *) raw_data; + ver = oldmnt->version; + if (ver != SMB_MOUNT_OLDVERSION && cpu_to_be32(ver) != SMB_MOUNT_ASCII) + goto out_wrong_data; + + sb->s_flags |= MS_NODIRATIME; + sb->s_blocksize = 1024; /* Eh... Is this correct? */ + sb->s_blocksize_bits = 10; + sb->s_magic = SMB_SUPER_MAGIC; + sb->s_op = &smb_sops; + sb->s_time_gran = 100; + + server = kzalloc(sizeof(struct smb_sb_info), GFP_KERNEL); + if (!server) + goto out_no_server; + sb->s_fs_info = server; + + if (bdi_setup_and_register(&server->bdi, "smbfs", BDI_CAP_MAP_COPY)) + goto out_bdi; + + sb->s_bdi = &server->bdi; + + server->super_block = sb; + server->mnt = NULL; + server->sock_file = NULL; + init_waitqueue_head(&server->conn_wq); + init_MUTEX(&server->sem); + INIT_LIST_HEAD(&server->entry); + INIT_LIST_HEAD(&server->xmitq); + INIT_LIST_HEAD(&server->recvq); + server->conn_error = 0; + server->conn_pid = NULL; + server->state = CONN_INVALID; /* no connection yet */ + server->generation = 0; + + /* Allocate the global temp buffer and some superblock helper structs */ + /* FIXME: move these to the smb_sb_info struct */ + VERBOSE("alloc chunk = %lu\n", sizeof(struct smb_ops) + + sizeof(struct smb_mount_data_kernel)); + mem = kmalloc(sizeof(struct smb_ops) + + sizeof(struct smb_mount_data_kernel), GFP_KERNEL); + if (!mem) + goto out_no_mem; + + server->ops = mem; + smb_install_null_ops(server->ops); + server->mnt = mem + sizeof(struct smb_ops); + + /* Setup NLS stuff */ + server->remote_nls = NULL; + server->local_nls = NULL; + + mnt = server->mnt; + + memset(mnt, 0, sizeof(struct smb_mount_data_kernel)); + strlcpy(mnt->codepage.local_name, CONFIG_NLS_DEFAULT, + SMB_NLS_MAXNAMELEN); + strlcpy(mnt->codepage.remote_name, SMB_NLS_REMOTE, + SMB_NLS_MAXNAMELEN); + + mnt->ttl = SMB_TTL_DEFAULT; + if (ver == SMB_MOUNT_OLDVERSION) { + mnt->version = oldmnt->version; + + SET_UID(mnt->uid, oldmnt->uid); + SET_GID(mnt->gid, oldmnt->gid); + + mnt->file_mode = (oldmnt->file_mode & S_IRWXUGO) | S_IFREG; + mnt->dir_mode = (oldmnt->dir_mode & S_IRWXUGO) | S_IFDIR; + + mnt->flags = (oldmnt->file_mode >> 9) | SMB_MOUNT_UID | + SMB_MOUNT_GID | SMB_MOUNT_FMODE | SMB_MOUNT_DMODE; + } else { + mnt->file_mode = S_IRWXU | S_IRGRP | S_IXGRP | + S_IROTH | S_IXOTH | S_IFREG; + mnt->dir_mode = S_IRWXU | S_IRGRP | S_IXGRP | + S_IROTH | S_IXOTH | S_IFDIR; + if (parse_options(mnt, raw_data)) + goto out_bad_option; + } + mnt->mounted_uid = current_uid(); + smb_setcodepage(server, &mnt->codepage); + + /* + * Display the enabled options + * Note: smb_proc_getattr uses these in 2.4 (but was changed in 2.2) + */ + if (mnt->flags & SMB_MOUNT_OLDATTR) + printk("SMBFS: Using core getattr (Win 95 speedup)\n"); + else if (mnt->flags & SMB_MOUNT_DIRATTR) + printk("SMBFS: Using dir ff getattr\n"); + + if (smbiod_register_server(server) < 0) { + printk(KERN_ERR "smbfs: failed to start smbiod\n"); + goto out_no_smbiod; + } + + /* + * Keep the super block locked while we get the root inode. 
+ */ + smb_init_root_dirent(server, &root, sb); + root_inode = smb_iget(sb, &root); + if (!root_inode) + goto out_no_root; + + sb->s_root = d_alloc_root(root_inode); + if (!sb->s_root) + goto out_no_root; + + smb_new_dentry(sb->s_root); + + return 0; + +out_no_root: + iput(root_inode); +out_no_smbiod: + smb_unload_nls(server); +out_bad_option: + kfree(mem); +out_no_mem: + bdi_destroy(&server->bdi); +out_bdi: + if (!server->mnt) + printk(KERN_ERR "smb_fill_super: allocation failure\n"); + sb->s_fs_info = NULL; + kfree(server); + goto out_fail; +out_wrong_data: + printk(KERN_ERR "smbfs: mount_data version %d is not supported\n", ver); + goto out_fail; +out_no_data: + printk(KERN_ERR "smb_fill_super: missing data argument\n"); +out_fail: + return -EINVAL; +out_no_server: + printk(KERN_ERR "smb_fill_super: cannot allocate struct smb_sb_info\n"); + return -ENOMEM; +} + +static int +smb_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + int result; + + lock_kernel(); + + result = smb_proc_dskattr(dentry, buf); + + unlock_kernel(); + + buf->f_type = SMB_SUPER_MAGIC; + buf->f_namelen = SMB_MAXPATHLEN; + return result; +} + +int smb_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +{ + int err = smb_revalidate_inode(dentry); + if (!err) + generic_fillattr(dentry->d_inode, stat); + return err; +} + +int +smb_notify_change(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + struct smb_sb_info *server = server_from_dentry(dentry); + unsigned int mask = (S_IFREG | S_IFDIR | S_IRWXUGO); + int error, changed, refresh = 0; + struct smb_fattr fattr; + + lock_kernel(); + + error = smb_revalidate_inode(dentry); + if (error) + goto out; + + if ((error = inode_change_ok(inode, attr)) < 0) + goto out; + + error = -EPERM; + if ((attr->ia_valid & ATTR_UID) && (attr->ia_uid != server->mnt->uid)) + goto out; + + if ((attr->ia_valid & ATTR_GID) && (attr->ia_uid != server->mnt->gid)) + goto out; + + if ((attr->ia_valid & ATTR_MODE) && (attr->ia_mode & ~mask)) + goto out; + + if ((attr->ia_valid & ATTR_SIZE) != 0) { + VERBOSE("changing %s/%s, old size=%ld, new size=%ld\n", + DENTRY_PATH(dentry), + (long) inode->i_size, (long) attr->ia_size); + + filemap_write_and_wait(inode->i_mapping); + + error = smb_open(dentry, O_WRONLY); + if (error) + goto out; + error = server->ops->truncate(inode, attr->ia_size); + if (error) + goto out; + truncate_setsize(inode, attr->ia_size); + refresh = 1; + } + + if (server->opt.capabilities & SMB_CAP_UNIX) { + /* For now we don't want to set the size with setattr_unix */ + attr->ia_valid &= ~ATTR_SIZE; + /* FIXME: only call if we actually want to set something? */ + error = smb_proc_setattr_unix(dentry, attr, 0, 0); + if (!error) + refresh = 1; + + goto out; + } + + /* + * Initialize the fattr and check for changed fields. + * Note: CTIME under SMB is creation time rather than + * change time, so we don't attempt to change it. + */ + smb_get_inode_attr(inode, &fattr); + + changed = 0; + if ((attr->ia_valid & ATTR_MTIME) != 0) { + fattr.f_mtime = attr->ia_mtime; + changed = 1; + } + if ((attr->ia_valid & ATTR_ATIME) != 0) { + fattr.f_atime = attr->ia_atime; + /* Earlier protocols don't have an access time */ + if (server->opt.protocol >= SMB_PROTOCOL_LANMAN2) + changed = 1; + } + if (changed) { + error = smb_proc_settime(dentry, &fattr); + if (error) + goto out; + refresh = 1; + } + + /* + * Check for mode changes ... we're extremely limited in + * what can be set for SMB servers: just the read-only bit. 
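+ * A chmod that drops S_IWUSR sets the DOS read-only attribute
+ * (aRONLY) on the server, and one that restores S_IWUSR clears it
+ * again; the remaining permission bits have no server-side effect.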
+ */ + if ((attr->ia_valid & ATTR_MODE) != 0) { + VERBOSE("%s/%s mode change, old=%x, new=%x\n", + DENTRY_PATH(dentry), fattr.f_mode, attr->ia_mode); + changed = 0; + if (attr->ia_mode & S_IWUSR) { + if (fattr.attr & aRONLY) { + fattr.attr &= ~aRONLY; + changed = 1; + } + } else { + if (!(fattr.attr & aRONLY)) { + fattr.attr |= aRONLY; + changed = 1; + } + } + if (changed) { + error = smb_proc_setattr(dentry, &fattr); + if (error) + goto out; + refresh = 1; + } + } + error = 0; + +out: + if (refresh) + smb_refresh_inode(dentry); + unlock_kernel(); + return error; +} + +static int smb_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) +{ + return get_sb_nodev(fs_type, flags, data, smb_fill_super, mnt); +} + +static struct file_system_type smb_fs_type = { + .owner = THIS_MODULE, + .name = "smbfs", + .get_sb = smb_get_sb, + .kill_sb = kill_anon_super, + .fs_flags = FS_BINARY_MOUNTDATA, +}; + +static int __init init_smb_fs(void) +{ + int err; + DEBUG1("registering ...\n"); + + err = init_inodecache(); + if (err) + goto out_inode; + err = smb_init_request_cache(); + if (err) + goto out_request; + err = register_filesystem(&smb_fs_type); + if (err) + goto out; + return 0; +out: + smb_destroy_request_cache(); +out_request: + destroy_inodecache(); +out_inode: + return err; +} + +static void __exit exit_smb_fs(void) +{ + DEBUG1("unregistering ...\n"); + unregister_filesystem(&smb_fs_type); + smb_destroy_request_cache(); + destroy_inodecache(); +} + +module_init(init_smb_fs) +module_exit(exit_smb_fs) +MODULE_LICENSE("GPL"); diff --git a/drivers/staging/smbfs/ioctl.c b/drivers/staging/smbfs/ioctl.c new file mode 100644 index 00000000000..2da16926747 --- /dev/null +++ b/drivers/staging/smbfs/ioctl.c @@ -0,0 +1,68 @@ +/* + * ioctl.c + * + * Copyright (C) 1995, 1996 by Volker Lendecke + * Copyright (C) 1997 by Volker Lendecke + * + * Please add a note about your changes to smbfs in the ChangeLog file. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "smb_fs.h" +#include "smb_mount.h" +#include "proto.h" + +long +smb_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct smb_sb_info *server = server_from_inode(filp->f_path.dentry->d_inode); + struct smb_conn_opt opt; + int result = -EINVAL; + + lock_kernel(); + switch (cmd) { + uid16_t uid16; + uid_t uid32; + case SMB_IOC_GETMOUNTUID: + SET_UID(uid16, server->mnt->mounted_uid); + result = put_user(uid16, (uid16_t __user *) arg); + break; + case SMB_IOC_GETMOUNTUID32: + SET_UID(uid32, server->mnt->mounted_uid); + result = put_user(uid32, (uid_t __user *) arg); + break; + + case SMB_IOC_NEWCONN: + /* arg is smb_conn_opt, or NULL if no connection was made */ + if (!arg) { + result = 0; + smb_lock_server(server); + server->state = CONN_RETRIED; + printk(KERN_ERR "Connection attempt failed! [%d]\n", + server->conn_error); + smbiod_flush(server); + smb_unlock_server(server); + break; + } + + result = -EFAULT; + if (!copy_from_user(&opt, (void __user *)arg, sizeof(opt))) + result = smb_newconn(server, &opt); + break; + default: + break; + } + unlock_kernel(); + + return result; +} diff --git a/drivers/staging/smbfs/proc.c b/drivers/staging/smbfs/proc.c new file mode 100644 index 00000000000..2fb079c37f2 --- /dev/null +++ b/drivers/staging/smbfs/proc.c @@ -0,0 +1,3506 @@ +/* + * proc.c + * + * Copyright (C) 1995, 1996 by Paal-Kr. 
Engstad and Volker Lendecke + * Copyright (C) 1997 by Volker Lendecke + * + * Please add a note about your changes to smbfs in the ChangeLog file. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "smb_fs.h" +#include "smbno.h" +#include "smb_mount.h" +#include "smb_debug.h" +#include "proto.h" +#include "request.h" + + +/* Features. Undefine if they cause problems, this should perhaps be a + config option. */ +#define SMBFS_POSIX_UNLINK 1 + +/* Allow smb_retry to be interrupted. */ +#define SMB_RETRY_INTR + +#define SMB_VWV(packet) ((packet) + SMB_HEADER_LEN) +#define SMB_CMD(packet) (*(packet+8)) +#define SMB_WCT(packet) (*(packet+SMB_HEADER_LEN - 1)) + +#define SMB_DIRINFO_SIZE 43 +#define SMB_STATUS_SIZE 21 + +#define SMB_ST_BLKSIZE (PAGE_SIZE) +#define SMB_ST_BLKSHIFT (PAGE_SHIFT) + +static struct smb_ops smb_ops_core; +static struct smb_ops smb_ops_os2; +static struct smb_ops smb_ops_win95; +static struct smb_ops smb_ops_winNT; +static struct smb_ops smb_ops_unix; +static struct smb_ops smb_ops_null; + +static void +smb_init_dirent(struct smb_sb_info *server, struct smb_fattr *fattr); +static void +smb_finish_dirent(struct smb_sb_info *server, struct smb_fattr *fattr); +static int +smb_proc_getattr_core(struct smb_sb_info *server, struct dentry *dir, + struct smb_fattr *fattr); +static int +smb_proc_getattr_ff(struct smb_sb_info *server, struct dentry *dentry, + struct smb_fattr *fattr); +static int +smb_proc_setattr_core(struct smb_sb_info *server, struct dentry *dentry, + u16 attr); +static int +smb_proc_setattr_ext(struct smb_sb_info *server, + struct inode *inode, struct smb_fattr *fattr); +static int +smb_proc_query_cifsunix(struct smb_sb_info *server); +static void +install_ops(struct smb_ops *dst, struct smb_ops *src); + + +static void +str_upper(char *name, int len) +{ + while (len--) + { + if (*name >= 'a' && *name <= 'z') + *name -= ('a' - 'A'); + name++; + } +} + +#if 0 +static void +str_lower(char *name, int len) +{ + while (len--) + { + if (*name >= 'A' && *name <= 'Z') + *name += ('a' - 'A'); + name++; + } +} +#endif + +/* reverse a string inline. This is used by the dircache walking routines */ +static void reverse_string(char *buf, int len) +{ + char c; + char *end = buf+len-1; + + while(buf < end) { + c = *buf; + *(buf++) = *end; + *(end--) = c; + } +} + +/* no conversion, just a wrapper for memcpy. */ +static int convert_memcpy(unsigned char *output, int olen, + const unsigned char *input, int ilen, + struct nls_table *nls_from, + struct nls_table *nls_to) +{ + if (olen < ilen) + return -ENAMETOOLONG; + memcpy(output, input, ilen); + return ilen; +} + +static inline int write_char(unsigned char ch, char *output, int olen) +{ + if (olen < 4) + return -ENAMETOOLONG; + sprintf(output, ":x%02x", ch); + return 4; +} + +static inline int write_unichar(wchar_t ch, char *output, int olen) +{ + if (olen < 5) + return -ENAMETOOLONG; + sprintf(output, ":%04x", ch); + return 5; +} + +/* convert from one "codepage" to another (possibly being utf8). 
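+ * Bytes that the source table cannot decode are escaped as ":x" plus
+ * two hex digits, and characters the target table cannot encode as ":"
+ * plus four hex digits [see write_char() and write_unichar() above].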
*/ +static int convert_cp(unsigned char *output, int olen, + const unsigned char *input, int ilen, + struct nls_table *nls_from, + struct nls_table *nls_to) +{ + int len = 0; + int n; + wchar_t ch; + + while (ilen > 0) { + /* convert by changing to unicode and back to the new cp */ + n = nls_from->char2uni(input, ilen, &ch); + if (n == -EINVAL) { + ilen--; + n = write_char(*input++, output, olen); + if (n < 0) + goto fail; + output += n; + olen -= n; + len += n; + continue; + } else if (n < 0) + goto fail; + input += n; + ilen -= n; + + n = nls_to->uni2char(ch, output, olen); + if (n == -EINVAL) + n = write_unichar(ch, output, olen); + if (n < 0) + goto fail; + output += n; + olen -= n; + + len += n; + } + return len; +fail: + return n; +} + +/* ----------------------------------------------------------- */ + +/* + * nls_unicode + * + * This encodes/decodes little endian unicode format + */ + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + if (boundlen < 2) + return -EINVAL; + *out++ = uni & 0xff; + *out++ = uni >> 8; + return 2; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + if (boundlen < 2) + return -EINVAL; + *uni = (rawstring[1] << 8) | rawstring[0]; + return 2; +} + +static struct nls_table unicode_table = { + .charset = "unicode", + .uni2char = uni2char, + .char2uni = char2uni, +}; + +/* ----------------------------------------------------------- */ + +static int setcodepage(struct nls_table **p, char *name) +{ + struct nls_table *nls; + + if (!name || !*name) { + nls = NULL; + } else if ( (nls = load_nls(name)) == NULL) { + printk (KERN_ERR "smbfs: failed to load nls '%s'\n", name); + return -EINVAL; + } + + /* if already set, unload the previous one. */ + if (*p && *p != &unicode_table) + unload_nls(*p); + *p = nls; + + return 0; +} + +/* Handles all changes to codepage settings. */ +int smb_setcodepage(struct smb_sb_info *server, struct smb_nls_codepage *cp) +{ + int n = 0; + + smb_lock_server(server); + + /* Don't load any nls_* at all, if no remote is requested */ + if (!*cp->remote_name) + goto out; + + /* local */ + n = setcodepage(&server->local_nls, cp->local_name); + if (n != 0) + goto out; + + /* remote */ + if (!strcmp(cp->remote_name, "unicode")) { + server->remote_nls = &unicode_table; + } else { + n = setcodepage(&server->remote_nls, cp->remote_name); + if (n != 0) + setcodepage(&server->local_nls, NULL); + } + +out: + if (server->local_nls != NULL && server->remote_nls != NULL) + server->ops->convert = convert_cp; + else + server->ops->convert = convert_memcpy; + + smb_unlock_server(server); + return n; +} + + +/*****************************************************************************/ +/* */ +/* Encoding/Decoding section */ +/* */ +/*****************************************************************************/ + +static __u8 * +smb_encode_smb_length(__u8 * p, __u32 len) +{ + *p = 0; + *(p+1) = 0; + *(p+2) = (len & 0xFF00) >> 8; + *(p+3) = (len & 0xFF); + if (len > 0xFFFF) + { + *(p+1) = 1; + } + return p + 4; +} + +/* + * smb_build_path: build the path to entry and name storing it in buf. + * The path returned will have the trailing '\0'. 
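+ *
+ * The components are gathered by walking from the dentry up to the
+ * root and reversing the buffer afterwards, so a dentry for /dir/file
+ * comes out as "\dir\file" plus the terminating '\0'; separators and
+ * the terminator are widened to 16-bit values on UNICODE mounts. Each
+ * component is converted with server->ops->convert() on the way.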
+ */
+static int smb_build_path(struct smb_sb_info *server, unsigned char *buf,
+ int maxlen,
+ struct dentry *entry, struct qstr *name)
+{
+ unsigned char *path = buf;
+ int len;
+ int unicode = (server->mnt->flags & SMB_MOUNT_UNICODE) != 0;
+
+ if (maxlen < (2<<unicode) + 2)
+ return -ENAMETOOLONG;
+
+ if (maxlen > SMB_MAXPATHLEN + 1)
+ maxlen = SMB_MAXPATHLEN + 1;
+
+ if (entry == NULL)
+ goto test_name_and_out;
+
+ /*
+ * If IS_ROOT, we have to do no walking at all.
+ */
+ if (IS_ROOT(entry) && !name) {
+ *path++ = '\\';
+ if (unicode) *path++ = '\0';
+ *path++ = '\0';
+ if (unicode) *path++ = '\0';
+ return path-buf;
+ }
+
+ /*
+ * Build the path string walking the tree backward from end to ROOT
+ * and store it in reversed order [see reverse_string()]
+ */
+ dget(entry);
+ spin_lock(&entry->d_lock);
+ while (!IS_ROOT(entry)) {
+ struct dentry *parent;
+
+ if (maxlen < (3<<unicode) + 1) {
+ spin_unlock(&entry->d_lock);
+ dput(entry);
+ return -ENAMETOOLONG;
+ }
+
+ len = server->ops->convert(path, maxlen-2,
+ entry->d_name.name, entry->d_name.len,
+ server->local_nls, server->remote_nls);
+ if (len < 0) {
+ spin_unlock(&entry->d_lock);
+ dput(entry);
+ return len;
+ }
+ reverse_string(path, len);
+ path += len;
+ if (unicode) {
+ /* Note: reverse order */
+ *path++ = '\0';
+ maxlen--;
+ }
+ *path++ = '\\';
+ maxlen -= len+1;
+
+ parent = entry->d_parent;
+ dget(parent);
+ spin_unlock(&entry->d_lock);
+ dput(entry);
+ entry = parent;
+ spin_lock(&entry->d_lock);
+ }
+ spin_unlock(&entry->d_lock);
+ dput(entry);
+ reverse_string(buf, path-buf);
+
+ /* maxlen has space for at least one char */
+test_name_and_out:
+ if (name) {
+ if (maxlen < (3<<unicode) + 1)
+ return -ENAMETOOLONG;
+
+ len = server->ops->convert(path, maxlen-2,
+ name->name, name->len,
+ server->local_nls, server->remote_nls);
+ if (len < 0)
+ return len;
+ path += len;
+ maxlen -= len+1;
+ }
+ /* maxlen has space for at least one char */
+ *path++ = '\0';
+ if (unicode) *path++ = '\0';
+ return path-buf;
+}
+
+static int smb_encode_path(struct smb_sb_info *server, char *buf, int maxlen,
+ struct dentry *dir, struct qstr *name)
+{
+ int result;
+
+ result = smb_build_path(server, buf, maxlen, dir, name);
+ if (result < 0)
+ goto out;
+ if (server->opt.protocol <= SMB_PROTOCOL_COREPLUS)
+ str_upper(buf, result);
+out:
+ return result;
+}
+
+/* encode_path for non-trans2 request SMBs */
+static int smb_simple_encode_path(struct smb_request *req, char **p,
+ struct dentry * entry, struct qstr * name)
+{
+ struct smb_sb_info *server = req->rq_server;
+ char *s = *p;
+ int res;
+ int maxlen = ((char *)req->rq_buffer + req->rq_bufsize) - s;
+ int unicode = (server->mnt->flags & SMB_MOUNT_UNICODE);
+
+ if (!maxlen)
+ return -ENAMETOOLONG;
+ *s++ = 4; /* ASCII data format */
+
+ /*
+ * SMB Unicode strings must be 16bit aligned relative the start of the
+ * packet. If they are not they must be padded with 0.
+ */
+ if (unicode) {
+ int align = s - (char *)req->rq_buffer;
+ if (!(align & 1)) {
+ *s++ = '\0';
+ maxlen--;
+ }
+ }
+
+ res = smb_encode_path(server, s, maxlen-1, entry, name);
+ if (res < 0)
+ return res;
+ *p = s + res;
+ return 0;
+}
+
+/* The following are taken directly from msdos-fs */
+
+/* Linear day numbers of the respective 1sts in non-leap years. 
*/ + +static int day_n[] = +{0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 0, 0, 0, 0}; + /* JanFebMarApr May Jun Jul Aug Sep Oct Nov Dec */ + + +static time_t +utc2local(struct smb_sb_info *server, time_t time) +{ + return time - server->opt.serverzone*60; +} + +static time_t +local2utc(struct smb_sb_info *server, time_t time) +{ + return time + server->opt.serverzone*60; +} + +/* Convert a MS-DOS time/date pair to a UNIX date (seconds since 1 1 70). */ + +static time_t +date_dos2unix(struct smb_sb_info *server, __u16 date, __u16 time) +{ + int month, year; + time_t secs; + + /* first subtract and mask after that... Otherwise, if + date == 0, bad things happen */ + month = ((date >> 5) - 1) & 15; + year = date >> 9; + secs = (time & 31) * 2 + 60 * ((time >> 5) & 63) + (time >> 11) * 3600 + 86400 * + ((date & 31) - 1 + day_n[month] + (year / 4) + year * 365 - ((year & 3) == 0 && + month < 2 ? 1 : 0) + 3653); + /* days since 1.1.70 plus 80's leap day */ + return local2utc(server, secs); +} + + +/* Convert linear UNIX date to a MS-DOS time/date pair. */ + +static void +date_unix2dos(struct smb_sb_info *server, + int unix_date, __u16 *date, __u16 *time) +{ + int day, year, nl_day, month; + + unix_date = utc2local(server, unix_date); + if (unix_date < 315532800) + unix_date = 315532800; + + *time = (unix_date % 60) / 2 + + (((unix_date / 60) % 60) << 5) + + (((unix_date / 3600) % 24) << 11); + + day = unix_date / 86400 - 3652; + year = day / 365; + if ((year + 3) / 4 + 365 * year > day) + year--; + day -= (year + 3) / 4 + 365 * year; + if (day == 59 && !(year & 3)) { + nl_day = day; + month = 2; + } else { + nl_day = (year & 3) || day <= 59 ? day : day - 1; + for (month = 1; month < 12; month++) + if (day_n[month] > nl_day) + break; + } + *date = nl_day - day_n[month - 1] + 1 + (month << 5) + (year << 9); +} + +/* The following are taken from fs/ntfs/util.c */ + +#define NTFS_TIME_OFFSET ((u64)(369*365 + 89) * 24 * 3600 * 10000000) + +/* + * Convert the NT UTC (based 1601-01-01, in hundred nanosecond units) + * into Unix UTC (based 1970-01-01, in seconds). + */ +static struct timespec +smb_ntutc2unixutc(u64 ntutc) +{ + struct timespec ts; + /* FIXME: what about the timezone difference? */ + /* Subtract the NTFS time offset, then convert to 1s intervals. */ + u64 t = ntutc - NTFS_TIME_OFFSET; + ts.tv_nsec = do_div(t, 10000000) * 100; + ts.tv_sec = t; + return ts; +} + +/* Convert the Unix UTC into NT time */ +static u64 +smb_unixutc2ntutc(struct timespec ts) +{ + /* Note: timezone conversion is probably wrong. 
*/ + /* return ((u64)utc2local(server, t)) * 10000000 + NTFS_TIME_OFFSET; */ + return ((u64)ts.tv_sec) * 10000000 + ts.tv_nsec/100 + NTFS_TIME_OFFSET; +} + +#define MAX_FILE_MODE 6 +static mode_t file_mode[] = { + S_IFREG, S_IFDIR, S_IFLNK, S_IFCHR, S_IFBLK, S_IFIFO, S_IFSOCK +}; + +static int smb_filetype_to_mode(u32 filetype) +{ + if (filetype > MAX_FILE_MODE) { + PARANOIA("Filetype out of range: %d\n", filetype); + return S_IFREG; + } + return file_mode[filetype]; +} + +static u32 smb_filetype_from_mode(int mode) +{ + if (S_ISREG(mode)) + return UNIX_TYPE_FILE; + if (S_ISDIR(mode)) + return UNIX_TYPE_DIR; + if (S_ISLNK(mode)) + return UNIX_TYPE_SYMLINK; + if (S_ISCHR(mode)) + return UNIX_TYPE_CHARDEV; + if (S_ISBLK(mode)) + return UNIX_TYPE_BLKDEV; + if (S_ISFIFO(mode)) + return UNIX_TYPE_FIFO; + if (S_ISSOCK(mode)) + return UNIX_TYPE_SOCKET; + return UNIX_TYPE_UNKNOWN; +} + + +/*****************************************************************************/ +/* */ +/* Support section. */ +/* */ +/*****************************************************************************/ + +__u32 +smb_len(__u8 * p) +{ + return ((*(p+1) & 0x1) << 16L) | (*(p+2) << 8L) | *(p+3); +} + +static __u16 +smb_bcc(__u8 * packet) +{ + int pos = SMB_HEADER_LEN + SMB_WCT(packet) * sizeof(__u16); + return WVAL(packet, pos); +} + +/* smb_valid_packet: We check if packet fulfills the basic + requirements of a smb packet */ + +static int +smb_valid_packet(__u8 * packet) +{ + return (packet[4] == 0xff + && packet[5] == 'S' + && packet[6] == 'M' + && packet[7] == 'B' + && (smb_len(packet) + 4 == SMB_HEADER_LEN + + SMB_WCT(packet) * 2 + smb_bcc(packet))); +} + +/* smb_verify: We check if we got the answer we expected, and if we + got enough data. If bcc == -1, we don't care. */ + +static int +smb_verify(__u8 * packet, int command, int wct, int bcc) +{ + if (SMB_CMD(packet) != command) + goto bad_command; + if (SMB_WCT(packet) < wct) + goto bad_wct; + if (bcc != -1 && smb_bcc(packet) < bcc) + goto bad_bcc; + return 0; + +bad_command: + printk(KERN_ERR "smb_verify: command=%x, SMB_CMD=%x??\n", + command, SMB_CMD(packet)); + goto fail; +bad_wct: + printk(KERN_ERR "smb_verify: command=%x, wct=%d, SMB_WCT=%d??\n", + command, wct, SMB_WCT(packet)); + goto fail; +bad_bcc: + printk(KERN_ERR "smb_verify: command=%x, bcc=%d, SMB_BCC=%d??\n", + command, bcc, smb_bcc(packet)); +fail: + return -EIO; +} + +/* + * Returns the maximum read or write size for the "payload". Making all of the + * packet fit within the negotiated max_xmit size. + * + * N.B. Since this value is usually computed before locking the server, + * the server's packet size must never be decreased! + */ +static inline int +smb_get_xmitsize(struct smb_sb_info *server, int overhead) +{ + return server->opt.max_xmit - overhead; +} + +/* + * Calculate the maximum read size + */ +int +smb_get_rsize(struct smb_sb_info *server) +{ + /* readX has 12 parameters, read has 5 */ + int overhead = SMB_HEADER_LEN + 12 * sizeof(__u16) + 2 + 1 + 2; + int size = smb_get_xmitsize(server, overhead); + + VERBOSE("xmit=%d, size=%d\n", server->opt.max_xmit, size); + + return size; +} + +/* + * Calculate the maximum write size + */ +int +smb_get_wsize(struct smb_sb_info *server) +{ + /* writeX has 14 parameters, write has 5 */ + int overhead = SMB_HEADER_LEN + 14 * sizeof(__u16) + 2 + 1 + 2; + int size = smb_get_xmitsize(server, overhead); + + VERBOSE("xmit=%d, size=%d\n", server->opt.max_xmit, size); + + return size; +} + +/* + * Convert SMB error codes to -E... errno values. 
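+ *
+ * For example, ERRDOS/ERRbadfile and ERRDOS/ERRbadpath both map to
+ * -ENOENT, while ERRSRV/ERRinvnid becomes the special -EBADSLT so that
+ * a new connection (and a fresh tree ID) is negotiated.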
+ */ +int +smb_errno(struct smb_request *req) +{ + int errcls = req->rq_rcls; + int error = req->rq_err; + char *class = "Unknown"; + + VERBOSE("errcls %d code %d from command 0x%x\n", + errcls, error, SMB_CMD(req->rq_header)); + + if (errcls == ERRDOS) { + switch (error) { + case ERRbadfunc: + return -EINVAL; + case ERRbadfile: + case ERRbadpath: + return -ENOENT; + case ERRnofids: + return -EMFILE; + case ERRnoaccess: + return -EACCES; + case ERRbadfid: + return -EBADF; + case ERRbadmcb: + return -EREMOTEIO; + case ERRnomem: + return -ENOMEM; + case ERRbadmem: + return -EFAULT; + case ERRbadenv: + case ERRbadformat: + return -EREMOTEIO; + case ERRbadaccess: + return -EACCES; + case ERRbaddata: + return -E2BIG; + case ERRbaddrive: + return -ENXIO; + case ERRremcd: + return -EREMOTEIO; + case ERRdiffdevice: + return -EXDEV; + case ERRnofiles: + return -ENOENT; + case ERRbadshare: + return -ETXTBSY; + case ERRlock: + return -EDEADLK; + case ERRfilexists: + return -EEXIST; + case ERROR_INVALID_PARAMETER: + return -EINVAL; + case ERROR_DISK_FULL: + return -ENOSPC; + case ERROR_INVALID_NAME: + return -ENOENT; + case ERROR_DIR_NOT_EMPTY: + return -ENOTEMPTY; + case ERROR_NOT_LOCKED: + return -ENOLCK; + case ERROR_ALREADY_EXISTS: + return -EEXIST; + default: + class = "ERRDOS"; + goto err_unknown; + } + } else if (errcls == ERRSRV) { + switch (error) { + /* N.B. This is wrong ... EIO ? */ + case ERRerror: + return -ENFILE; + case ERRbadpw: + return -EINVAL; + case ERRbadtype: + case ERRtimeout: + return -EIO; + case ERRaccess: + return -EACCES; + /* + * This is a fatal error, as it means the "tree ID" + * for this connection is no longer valid. We map + * to a special error code and get a new connection. + */ + case ERRinvnid: + return -EBADSLT; + default: + class = "ERRSRV"; + goto err_unknown; + } + } else if (errcls == ERRHRD) { + switch (error) { + case ERRnowrite: + return -EROFS; + case ERRbadunit: + return -ENODEV; + case ERRnotready: + return -EUCLEAN; + case ERRbadcmd: + case ERRdata: + return -EIO; + case ERRbadreq: + return -ERANGE; + case ERRbadshare: + return -ETXTBSY; + case ERRlock: + return -EDEADLK; + case ERRdiskfull: + return -ENOSPC; + default: + class = "ERRHRD"; + goto err_unknown; + } + } else if (errcls == ERRCMD) { + class = "ERRCMD"; + } else if (errcls == SUCCESS) { + return 0; /* This is the only valid 0 return */ + } + +err_unknown: + printk(KERN_ERR "smb_errno: class %s, code %d from command 0x%x\n", + class, error, SMB_CMD(req->rq_header)); + return -EIO; +} + +/* smb_request_ok: We expect the server to be locked. Then we do the + request and check the answer completely. When smb_request_ok + returns 0, you can be quite sure that everything went well. When + the answer is <=0, the returned number is a valid unix errno. */ + +static int +smb_request_ok(struct smb_request *req, int command, int wct, int bcc) +{ + int result; + + req->rq_resp_wct = wct; + req->rq_resp_bcc = bcc; + + result = smb_add_request(req); + if (result != 0) { + DEBUG1("smb_request failed\n"); + goto out; + } + + if (smb_valid_packet(req->rq_header) != 0) { + PARANOIA("invalid packet!\n"); + goto out; + } + + result = smb_verify(req->rq_header, command, wct, bcc); + +out: + return result; +} + +/* + * This implements the NEWCONN ioctl. It installs the server pid, + * sets server->state to CONN_VALID, and wakes up the waiting process. 
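+ *
+ * It is reached from SMB_IOC_NEWCONN in ioctl.c: the user-space helper
+ * (smbmount/smbiod) hands in a struct smb_conn_opt plus the descriptor
+ * of an already connected socket, which is checked with
+ * smb_valid_socket() before smbiod is woken up to use it.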
+ */ +int +smb_newconn(struct smb_sb_info *server, struct smb_conn_opt *opt) +{ + struct file *filp; + struct sock *sk; + int error; + + VERBOSE("fd=%d, pid=%d\n", opt->fd, current->pid); + + smb_lock_server(server); + + /* + * Make sure we don't already have a valid connection ... + */ + error = -EINVAL; + if (server->state == CONN_VALID) + goto out; + + error = -EACCES; + if (current_uid() != server->mnt->mounted_uid && + !capable(CAP_SYS_ADMIN)) + goto out; + + error = -EBADF; + filp = fget(opt->fd); + if (!filp) + goto out; + if (!smb_valid_socket(filp->f_path.dentry->d_inode)) + goto out_putf; + + server->sock_file = filp; + server->conn_pid = get_pid(task_pid(current)); + server->opt = *opt; + server->generation += 1; + server->state = CONN_VALID; + error = 0; + + if (server->conn_error) { + /* + * conn_error is the returncode we originally decided to + * drop the old connection on. This message should be positive + * and not make people ask questions on why smbfs is printing + * error messages ... + */ + printk(KERN_INFO "SMB connection re-established (%d)\n", + server->conn_error); + server->conn_error = 0; + } + + /* + * Store the server in sock user_data (Only used by sunrpc) + */ + sk = SOCKET_I(filp->f_path.dentry->d_inode)->sk; + sk->sk_user_data = server; + + /* chain into the data_ready callback */ + server->data_ready = xchg(&sk->sk_data_ready, smb_data_ready); + + /* check if we have an old smbmount that uses seconds for the + serverzone */ + if (server->opt.serverzone > 12*60 || server->opt.serverzone < -12*60) + server->opt.serverzone /= 60; + + /* now that we have an established connection we can detect the server + type and enable bug workarounds */ + if (server->opt.protocol < SMB_PROTOCOL_LANMAN2) + install_ops(server->ops, &smb_ops_core); + else if (server->opt.protocol == SMB_PROTOCOL_LANMAN2) + install_ops(server->ops, &smb_ops_os2); + else if (server->opt.protocol == SMB_PROTOCOL_NT1 && + (server->opt.max_xmit < 0x1000) && + !(server->opt.capabilities & SMB_CAP_NT_SMBS)) { + /* FIXME: can we kill the WIN95 flag now? */ + server->mnt->flags |= SMB_MOUNT_WIN95; + VERBOSE("detected WIN95 server\n"); + install_ops(server->ops, &smb_ops_win95); + } else { + /* + * Samba has max_xmit 65535 + * NT4spX has max_xmit 4536 (or something like that) + * win2k has ... + */ + VERBOSE("detected NT1 (Samba, NT4/5) server\n"); + install_ops(server->ops, &smb_ops_winNT); + } + + /* FIXME: the win9x code wants to modify these ... (seek/trunc bug) */ + if (server->mnt->flags & SMB_MOUNT_OLDATTR) { + server->ops->getattr = smb_proc_getattr_core; + } else if (server->mnt->flags & SMB_MOUNT_DIRATTR) { + server->ops->getattr = smb_proc_getattr_ff; + } + + /* Decode server capabilities */ + if (server->opt.capabilities & SMB_CAP_LARGE_FILES) { + /* Should be ok to set this now, as no one can access the + mount until the connection has been established. */ + SB_of(server)->s_maxbytes = ~0ULL >> 1; + VERBOSE("LFS enabled\n"); + } + if (server->opt.capabilities & SMB_CAP_UNICODE) { + server->mnt->flags |= SMB_MOUNT_UNICODE; + VERBOSE("Unicode enabled\n"); + } else { + server->mnt->flags &= ~SMB_MOUNT_UNICODE; + } +#if 0 + /* flags we may test for other patches ... 
*/ + if (server->opt.capabilities & SMB_CAP_LARGE_READX) { + VERBOSE("Large reads enabled\n"); + } + if (server->opt.capabilities & SMB_CAP_LARGE_WRITEX) { + VERBOSE("Large writes enabled\n"); + } +#endif + if (server->opt.capabilities & SMB_CAP_UNIX) { + struct inode *inode; + VERBOSE("Using UNIX CIFS extensions\n"); + install_ops(server->ops, &smb_ops_unix); + inode = SB_of(server)->s_root->d_inode; + if (inode) + inode->i_op = &smb_dir_inode_operations_unix; + } + + VERBOSE("protocol=%d, max_xmit=%d, pid=%d capabilities=0x%x\n", + server->opt.protocol, server->opt.max_xmit, + pid_nr(server->conn_pid), server->opt.capabilities); + + /* FIXME: this really should be done by smbmount. */ + if (server->opt.max_xmit > SMB_MAX_PACKET_SIZE) { + server->opt.max_xmit = SMB_MAX_PACKET_SIZE; + } + + smb_unlock_server(server); + smbiod_wake_up(); + if (server->opt.capabilities & SMB_CAP_UNIX) + smb_proc_query_cifsunix(server); + + server->conn_complete++; + wake_up_interruptible_all(&server->conn_wq); + return error; + +out: + smb_unlock_server(server); + smbiod_wake_up(); + return error; + +out_putf: + fput(filp); + goto out; +} + +/* smb_setup_header: We completely set up the packet. You only have to + insert the command-specific fields */ + +__u8 * +smb_setup_header(struct smb_request *req, __u8 command, __u16 wct, __u16 bcc) +{ + __u32 xmit_len = SMB_HEADER_LEN + wct * sizeof(__u16) + bcc + 2; + __u8 *p = req->rq_header; + struct smb_sb_info *server = req->rq_server; + + p = smb_encode_smb_length(p, xmit_len - 4); + + *p++ = 0xff; + *p++ = 'S'; + *p++ = 'M'; + *p++ = 'B'; + *p++ = command; + + memset(p, '\0', 19); + p += 19; + p += 8; + + if (server->opt.protocol > SMB_PROTOCOL_CORE) { + int flags = SMB_FLAGS_CASELESS_PATHNAMES; + int flags2 = SMB_FLAGS2_LONG_PATH_COMPONENTS | + SMB_FLAGS2_EXTENDED_ATTRIBUTES; /* EA? not really ... */ + + *(req->rq_header + smb_flg) = flags; + if (server->mnt->flags & SMB_MOUNT_UNICODE) + flags2 |= SMB_FLAGS2_UNICODE_STRINGS; + WSET(req->rq_header, smb_flg2, flags2); + } + *p++ = wct; /* wct */ + p += 2 * wct; + WSET(p, 0, bcc); + + /* Include the header in the data to send */ + req->rq_iovlen = 1; + req->rq_iov[0].iov_base = req->rq_header; + req->rq_iov[0].iov_len = xmit_len - bcc; + + return req->rq_buffer; +} + +static void +smb_setup_bcc(struct smb_request *req, __u8 *p) +{ + u16 bcc = p - req->rq_buffer; + u8 *pbcc = req->rq_header + SMB_HEADER_LEN + 2*SMB_WCT(req->rq_header); + + WSET(pbcc, 0, bcc); + + smb_encode_smb_length(req->rq_header, SMB_HEADER_LEN + + 2*SMB_WCT(req->rq_header) - 2 + bcc); + + /* Include the "bytes" in the data to send */ + req->rq_iovlen = 2; + req->rq_iov[1].iov_base = req->rq_buffer; + req->rq_iov[1].iov_len = bcc; +} + +static int +smb_proc_seek(struct smb_sb_info *server, __u16 fileid, + __u16 mode, off_t offset) +{ + int result; + struct smb_request *req; + + result = -ENOMEM; + if (! 
(req = smb_alloc_request(server, 0))) + goto out; + + smb_setup_header(req, SMBlseek, 4, 0); + WSET(req->rq_header, smb_vwv0, fileid); + WSET(req->rq_header, smb_vwv1, mode); + DSET(req->rq_header, smb_vwv2, offset); + req->rq_flags |= SMB_REQ_NORETRY; + + result = smb_request_ok(req, SMBlseek, 2, 0); + if (result < 0) { + result = 0; + goto out_free; + } + + result = DVAL(req->rq_header, smb_vwv0); +out_free: + smb_rput(req); +out: + return result; +} + +static int +smb_proc_open(struct smb_sb_info *server, struct dentry *dentry, int wish) +{ + struct inode *ino = dentry->d_inode; + struct smb_inode_info *ei = SMB_I(ino); + int mode, read_write = 0x42, read_only = 0x40; + int res; + char *p; + struct smb_request *req; + + /* + * Attempt to open r/w, unless there are no write privileges. + */ + mode = read_write; + if (!(ino->i_mode & (S_IWUSR | S_IWGRP | S_IWOTH))) + mode = read_only; +#if 0 + /* FIXME: why is this code not in? below we fix it so that a caller + wanting RO doesn't get RW. smb_revalidate_inode does some + optimization based on access mode. tail -f needs it to be correct. + + We must open rw since we don't do the open if called a second time + with different 'wish'. Is that not supported by smb servers? */ + if (!(wish & (O_WRONLY | O_RDWR))) + mode = read_only; +#endif + + res = -ENOMEM; + if (! (req = smb_alloc_request(server, PAGE_SIZE))) + goto out; + + retry: + p = smb_setup_header(req, SMBopen, 2, 0); + WSET(req->rq_header, smb_vwv0, mode); + WSET(req->rq_header, smb_vwv1, aSYSTEM | aHIDDEN | aDIR); + res = smb_simple_encode_path(req, &p, dentry, NULL); + if (res < 0) + goto out_free; + smb_setup_bcc(req, p); + + res = smb_request_ok(req, SMBopen, 7, 0); + if (res != 0) { + if (mode == read_write && + (res == -EACCES || res == -ETXTBSY || res == -EROFS)) + { + VERBOSE("%s/%s R/W failed, error=%d, retrying R/O\n", + DENTRY_PATH(dentry), res); + mode = read_only; + req->rq_flags = 0; + goto retry; + } + goto out_free; + } + /* We should now have data in vwv[0..6]. */ + + ei->fileid = WVAL(req->rq_header, smb_vwv0); + ei->attr = WVAL(req->rq_header, smb_vwv1); + /* smb_vwv2 has mtime */ + /* smb_vwv4 has size */ + ei->access = (WVAL(req->rq_header, smb_vwv6) & SMB_ACCMASK); + ei->open = server->generation; + +out_free: + smb_rput(req); +out: + return res; +} + +/* + * Make sure the file is open, and check that the access + * is compatible with the desired access. + */ +int +smb_open(struct dentry *dentry, int wish) +{ + struct inode *inode = dentry->d_inode; + int result; + __u16 access; + + result = -ENOENT; + if (!inode) { + printk(KERN_ERR "smb_open: no inode for dentry %s/%s\n", + DENTRY_PATH(dentry)); + goto out; + } + + if (!smb_is_open(inode)) { + struct smb_sb_info *server = server_from_inode(inode); + result = 0; + if (!smb_is_open(inode)) + result = smb_proc_open(server, dentry, wish); + if (result) + goto out; + /* + * A successful open means the path is still valid ... + */ + smb_renew_times(dentry); + } + + /* + * Check whether the access is compatible with the desired mode. + */ + result = 0; + access = SMB_I(inode)->access; + if (access != wish && access != SMB_O_RDWR) { + PARANOIA("%s/%s access denied, access=%x, wish=%x\n", + DENTRY_PATH(dentry), access, wish); + result = -EACCES; + } +out: + return result; +} + +static int +smb_proc_close(struct smb_sb_info *server, __u16 fileid, __u32 mtime) +{ + struct smb_request *req; + int result = -ENOMEM; + + if (! 
(req = smb_alloc_request(server, 0))) + goto out; + + smb_setup_header(req, SMBclose, 3, 0); + WSET(req->rq_header, smb_vwv0, fileid); + DSET(req->rq_header, smb_vwv1, utc2local(server, mtime)); + req->rq_flags |= SMB_REQ_NORETRY; + result = smb_request_ok(req, SMBclose, 0, 0); + + smb_rput(req); +out: + return result; +} + +/* + * Win NT 4.0 has an apparent bug in that it fails to update the + * modify time when writing to a file. As a workaround, we update + * both modify and access time locally, and post the times to the + * server when closing the file. + */ +static int +smb_proc_close_inode(struct smb_sb_info *server, struct inode * ino) +{ + struct smb_inode_info *ei = SMB_I(ino); + int result = 0; + if (smb_is_open(ino)) + { + /* + * We clear the open flag in advance, in case another + * process observes the value while we block below. + */ + ei->open = 0; + + /* + * Kludge alert: SMB timestamps are accurate only to + * two seconds ... round the times to avoid needless + * cache invalidations! + */ + if (ino->i_mtime.tv_sec & 1) { + ino->i_mtime.tv_sec--; + ino->i_mtime.tv_nsec = 0; + } + if (ino->i_atime.tv_sec & 1) { + ino->i_atime.tv_sec--; + ino->i_atime.tv_nsec = 0; + } + /* + * If the file is open with write permissions, + * update the time stamps to sync mtime and atime. + */ + if ((server->opt.capabilities & SMB_CAP_UNIX) == 0 && + (server->opt.protocol >= SMB_PROTOCOL_LANMAN2) && + !(ei->access == SMB_O_RDONLY)) + { + struct smb_fattr fattr; + smb_get_inode_attr(ino, &fattr); + smb_proc_setattr_ext(server, ino, &fattr); + } + + result = smb_proc_close(server, ei->fileid, ino->i_mtime.tv_sec); + /* + * Force a revalidation after closing ... some servers + * don't post the size until the file has been closed. + */ + if (server->opt.protocol < SMB_PROTOCOL_NT1) + ei->oldmtime = 0; + ei->closed = jiffies; + } + return result; +} + +int +smb_close(struct inode *ino) +{ + int result = 0; + + if (smb_is_open(ino)) { + struct smb_sb_info *server = server_from_inode(ino); + result = smb_proc_close_inode(server, ino); + } + return result; +} + +/* + * This is used to close a file following a failed instantiate. + * Since we don't have an inode, we can't use any of the above. + */ +int +smb_close_fileid(struct dentry *dentry, __u16 fileid) +{ + struct smb_sb_info *server = server_from_dentry(dentry); + int result; + + result = smb_proc_close(server, fileid, get_seconds()); + return result; +} + +/* In smb_proc_read and smb_proc_write we do not retry, because the + file-id would not be valid after a reconnection. */ + +static void +smb_proc_read_data(struct smb_request *req) +{ + req->rq_iov[0].iov_base = req->rq_buffer; + req->rq_iov[0].iov_len = 3; + + req->rq_iov[1].iov_base = req->rq_page; + req->rq_iov[1].iov_len = req->rq_rsize; + req->rq_iovlen = 2; + + req->rq_rlen = smb_len(req->rq_header) + 4 - req->rq_bytes_recvd; +} + +static int +smb_proc_read(struct inode *inode, loff_t offset, int count, char *data) +{ + struct smb_sb_info *server = server_from_inode(inode); + __u16 returned_count, data_len; + unsigned char *buf; + int result; + struct smb_request *req; + u8 rbuf[4]; + + result = -ENOMEM; + if (! 
(req = smb_alloc_request(server, 0))) + goto out; + + smb_setup_header(req, SMBread, 5, 0); + buf = req->rq_header; + WSET(buf, smb_vwv0, SMB_I(inode)->fileid); + WSET(buf, smb_vwv1, count); + DSET(buf, smb_vwv2, offset); + WSET(buf, smb_vwv4, 0); + + req->rq_page = data; + req->rq_rsize = count; + req->rq_callback = smb_proc_read_data; + req->rq_buffer = rbuf; + req->rq_flags |= SMB_REQ_NORETRY | SMB_REQ_STATIC; + + result = smb_request_ok(req, SMBread, 5, -1); + if (result < 0) + goto out_free; + returned_count = WVAL(req->rq_header, smb_vwv0); + + data_len = WVAL(rbuf, 1); + + if (returned_count != data_len) { + printk(KERN_NOTICE "smb_proc_read: returned != data_len\n"); + printk(KERN_NOTICE "smb_proc_read: ret_c=%d, data_len=%d\n", + returned_count, data_len); + } + result = data_len; + +out_free: + smb_rput(req); +out: + VERBOSE("ino=%ld, fileid=%d, count=%d, result=%d\n", + inode->i_ino, SMB_I(inode)->fileid, count, result); + return result; +} + +static int +smb_proc_write(struct inode *inode, loff_t offset, int count, const char *data) +{ + struct smb_sb_info *server = server_from_inode(inode); + int result; + u16 fileid = SMB_I(inode)->fileid; + u8 buf[4]; + struct smb_request *req; + + result = -ENOMEM; + if (! (req = smb_alloc_request(server, 0))) + goto out; + + VERBOSE("ino=%ld, fileid=%d, count=%d@%Ld\n", + inode->i_ino, fileid, count, offset); + + smb_setup_header(req, SMBwrite, 5, count + 3); + WSET(req->rq_header, smb_vwv0, fileid); + WSET(req->rq_header, smb_vwv1, count); + DSET(req->rq_header, smb_vwv2, offset); + WSET(req->rq_header, smb_vwv4, 0); + + buf[0] = 1; + WSET(buf, 1, count); /* yes, again ... */ + req->rq_iov[1].iov_base = buf; + req->rq_iov[1].iov_len = 3; + req->rq_iov[2].iov_base = (char *) data; + req->rq_iov[2].iov_len = count; + req->rq_iovlen = 3; + req->rq_flags |= SMB_REQ_NORETRY; + + result = smb_request_ok(req, SMBwrite, 1, 0); + if (result >= 0) + result = WVAL(req->rq_header, smb_vwv0); + + smb_rput(req); +out: + return result; +} + +/* + * In smb_proc_readX and smb_proc_writeX we do not retry, because the + * file-id would not be valid after a reconnection. + */ + +#define SMB_READX_MAX_PAD 64 +static void +smb_proc_readX_data(struct smb_request *req) +{ + /* header length, excluding the netbios length (-4) */ + int hdrlen = SMB_HEADER_LEN + req->rq_resp_wct*2 - 2; + int data_off = WVAL(req->rq_header, smb_vwv6); + + /* + * Some genius made the padding to the data bytes arbitrary. + * So we must first calculate the amount of padding used by the server. + */ + data_off -= hdrlen; + if (data_off > SMB_READX_MAX_PAD || data_off < 0) { + PARANOIA("offset is larger than SMB_READX_MAX_PAD or negative!\n"); + PARANOIA("%d > %d || %d < 0\n", data_off, SMB_READX_MAX_PAD, data_off); + req->rq_rlen = req->rq_bufsize + 1; + return; + } + req->rq_iov[0].iov_base = req->rq_buffer; + req->rq_iov[0].iov_len = data_off; + + req->rq_iov[1].iov_base = req->rq_page; + req->rq_iov[1].iov_len = req->rq_rsize; + req->rq_iovlen = 2; + + req->rq_rlen = smb_len(req->rq_header) + 4 - req->rq_bytes_recvd; +} + +static int +smb_proc_readX(struct inode *inode, loff_t offset, int count, char *data) +{ + struct smb_sb_info *server = server_from_inode(inode); + unsigned char *buf; + int result; + struct smb_request *req; + static char pad[SMB_READX_MAX_PAD]; + + result = -ENOMEM; + if (! 
(req = smb_alloc_request(server, 0))) + goto out; + + smb_setup_header(req, SMBreadX, 12, 0); + buf = req->rq_header; + WSET(buf, smb_vwv0, 0x00ff); + WSET(buf, smb_vwv1, 0); + WSET(buf, smb_vwv2, SMB_I(inode)->fileid); + DSET(buf, smb_vwv3, (u32)offset); /* low 32 bits */ + WSET(buf, smb_vwv5, count); + WSET(buf, smb_vwv6, 0); + DSET(buf, smb_vwv7, 0); + WSET(buf, smb_vwv9, 0); + DSET(buf, smb_vwv10, (u32)(offset >> 32)); /* high 32 bits */ + WSET(buf, smb_vwv11, 0); + + req->rq_page = data; + req->rq_rsize = count; + req->rq_callback = smb_proc_readX_data; + req->rq_buffer = pad; + req->rq_bufsize = SMB_READX_MAX_PAD; + req->rq_flags |= SMB_REQ_STATIC | SMB_REQ_NORETRY; + + result = smb_request_ok(req, SMBreadX, 12, -1); + if (result < 0) + goto out_free; + result = WVAL(req->rq_header, smb_vwv5); + +out_free: + smb_rput(req); +out: + VERBOSE("ino=%ld, fileid=%d, count=%d, result=%d\n", + inode->i_ino, SMB_I(inode)->fileid, count, result); + return result; +} + +static int +smb_proc_writeX(struct inode *inode, loff_t offset, int count, const char *data) +{ + struct smb_sb_info *server = server_from_inode(inode); + int result; + u8 *p; + static u8 pad[4]; + struct smb_request *req; + + result = -ENOMEM; + if (! (req = smb_alloc_request(server, 0))) + goto out; + + VERBOSE("ino=%ld, fileid=%d, count=%d@%Ld\n", + inode->i_ino, SMB_I(inode)->fileid, count, offset); + + p = smb_setup_header(req, SMBwriteX, 14, count + 1); + WSET(req->rq_header, smb_vwv0, 0x00ff); + WSET(req->rq_header, smb_vwv1, 0); + WSET(req->rq_header, smb_vwv2, SMB_I(inode)->fileid); + DSET(req->rq_header, smb_vwv3, (u32)offset); /* low 32 bits */ + DSET(req->rq_header, smb_vwv5, 0); + WSET(req->rq_header, smb_vwv7, 0); /* write mode */ + WSET(req->rq_header, smb_vwv8, 0); + WSET(req->rq_header, smb_vwv9, 0); + WSET(req->rq_header, smb_vwv10, count); /* data length */ + WSET(req->rq_header, smb_vwv11, smb_vwv12 + 2 + 1); + DSET(req->rq_header, smb_vwv12, (u32)(offset >> 32)); + + req->rq_iov[1].iov_base = pad; + req->rq_iov[1].iov_len = 1; + req->rq_iov[2].iov_base = (char *) data; + req->rq_iov[2].iov_len = count; + req->rq_iovlen = 3; + req->rq_flags |= SMB_REQ_NORETRY; + + result = smb_request_ok(req, SMBwriteX, 6, 0); + if (result >= 0) + result = WVAL(req->rq_header, smb_vwv2); + + smb_rput(req); +out: + return result; +} + +int +smb_proc_create(struct dentry *dentry, __u16 attr, time_t ctime, __u16 *fileid) +{ + struct smb_sb_info *server = server_from_dentry(dentry); + char *p; + int result; + struct smb_request *req; + + result = -ENOMEM; + if (! (req = smb_alloc_request(server, PAGE_SIZE))) + goto out; + + p = smb_setup_header(req, SMBcreate, 3, 0); + WSET(req->rq_header, smb_vwv0, attr); + DSET(req->rq_header, smb_vwv1, utc2local(server, ctime)); + result = smb_simple_encode_path(req, &p, dentry, NULL); + if (result < 0) + goto out_free; + smb_setup_bcc(req, p); + + result = smb_request_ok(req, SMBcreate, 1, 0); + if (result < 0) + goto out_free; + + *fileid = WVAL(req->rq_header, smb_vwv0); + result = 0; + +out_free: + smb_rput(req); +out: + return result; +} + +int +smb_proc_mv(struct dentry *old_dentry, struct dentry *new_dentry) +{ + struct smb_sb_info *server = server_from_dentry(old_dentry); + char *p; + int result; + struct smb_request *req; + + result = -ENOMEM; + if (! 
(req = smb_alloc_request(server, PAGE_SIZE))) + goto out; + + p = smb_setup_header(req, SMBmv, 1, 0); + WSET(req->rq_header, smb_vwv0, aSYSTEM | aHIDDEN | aDIR); + result = smb_simple_encode_path(req, &p, old_dentry, NULL); + if (result < 0) + goto out_free; + result = smb_simple_encode_path(req, &p, new_dentry, NULL); + if (result < 0) + goto out_free; + smb_setup_bcc(req, p); + + if ((result = smb_request_ok(req, SMBmv, 0, 0)) < 0) + goto out_free; + result = 0; + +out_free: + smb_rput(req); +out: + return result; +} + +/* + * Code common to mkdir and rmdir. + */ +static int +smb_proc_generic_command(struct dentry *dentry, __u8 command) +{ + struct smb_sb_info *server = server_from_dentry(dentry); + char *p; + int result; + struct smb_request *req; + + result = -ENOMEM; + if (! (req = smb_alloc_request(server, PAGE_SIZE))) + goto out; + + p = smb_setup_header(req, command, 0, 0); + result = smb_simple_encode_path(req, &p, dentry, NULL); + if (result < 0) + goto out_free; + smb_setup_bcc(req, p); + + result = smb_request_ok(req, command, 0, 0); + if (result < 0) + goto out_free; + result = 0; + +out_free: + smb_rput(req); +out: + return result; +} + +int +smb_proc_mkdir(struct dentry *dentry) +{ + return smb_proc_generic_command(dentry, SMBmkdir); +} + +int +smb_proc_rmdir(struct dentry *dentry) +{ + return smb_proc_generic_command(dentry, SMBrmdir); +} + +#if SMBFS_POSIX_UNLINK +/* + * Removes readonly attribute from a file. Used by unlink to give posix + * semantics. + */ +static int +smb_set_rw(struct dentry *dentry,struct smb_sb_info *server) +{ + int result; + struct smb_fattr fattr; + + /* FIXME: cifsUE should allow removing a readonly file. */ + + /* first get current attribute */ + smb_init_dirent(server, &fattr); + result = server->ops->getattr(server, dentry, &fattr); + smb_finish_dirent(server, &fattr); + if (result < 0) + return result; + + /* if RONLY attribute is set, remove it */ + if (fattr.attr & aRONLY) { /* read only attribute is set */ + fattr.attr &= ~aRONLY; + result = smb_proc_setattr_core(server, dentry, fattr.attr); + } + return result; +} +#endif + +int +smb_proc_unlink(struct dentry *dentry) +{ + struct smb_sb_info *server = server_from_dentry(dentry); + int flag = 0; + char *p; + int result; + struct smb_request *req; + + result = -ENOMEM; + if (! (req = smb_alloc_request(server, PAGE_SIZE))) + goto out; + + retry: + p = smb_setup_header(req, SMBunlink, 1, 0); + WSET(req->rq_header, smb_vwv0, aSYSTEM | aHIDDEN); + result = smb_simple_encode_path(req, &p, dentry, NULL); + if (result < 0) + goto out_free; + smb_setup_bcc(req, p); + + if ((result = smb_request_ok(req, SMBunlink, 0, 0)) < 0) { +#if SMBFS_POSIX_UNLINK + if (result == -EACCES && !flag) { + /* Posix semantics is for the read-only state + of a file to be ignored in unlink(). In the + SMB world a unlink() is refused on a + read-only file. To make things easier for + unix users we try to override the files + permission if the unlink fails with the + right error. 
+ This introduces a race condition that could + lead to a file being written by someone who + shouldn't have access, but as far as I can + tell that is unavoidable */ + + /* remove RONLY attribute and try again */ + result = smb_set_rw(dentry,server); + if (result == 0) { + flag = 1; + req->rq_flags = 0; + goto retry; + } + } +#endif + goto out_free; + } + result = 0; + +out_free: + smb_rput(req); +out: + return result; +} + +int +smb_proc_flush(struct smb_sb_info *server, __u16 fileid) +{ + int result; + struct smb_request *req; + + result = -ENOMEM; + if (! (req = smb_alloc_request(server, 0))) + goto out; + + smb_setup_header(req, SMBflush, 1, 0); + WSET(req->rq_header, smb_vwv0, fileid); + req->rq_flags |= SMB_REQ_NORETRY; + result = smb_request_ok(req, SMBflush, 0, 0); + + smb_rput(req); +out: + return result; +} + +static int +smb_proc_trunc32(struct inode *inode, loff_t length) +{ + /* + * Writing 0bytes is old-SMB magic for truncating files. + * MAX_NON_LFS should prevent this from being called with a too + * large offset. + */ + return smb_proc_write(inode, length, 0, NULL); +} + +static int +smb_proc_trunc64(struct inode *inode, loff_t length) +{ + struct smb_sb_info *server = server_from_inode(inode); + int result; + char *param; + char *data; + struct smb_request *req; + + result = -ENOMEM; + if (! (req = smb_alloc_request(server, 14))) + goto out; + + param = req->rq_buffer; + data = req->rq_buffer + 6; + + /* FIXME: must we also set allocation size? winNT seems to do that */ + WSET(param, 0, SMB_I(inode)->fileid); + WSET(param, 2, SMB_SET_FILE_END_OF_FILE_INFO); + WSET(param, 4, 0); + LSET(data, 0, length); + + req->rq_trans2_command = TRANSACT2_SETFILEINFO; + req->rq_ldata = 8; + req->rq_data = data; + req->rq_lparm = 6; + req->rq_parm = param; + req->rq_flags |= SMB_REQ_NORETRY; + result = smb_add_request(req); + if (result < 0) + goto out_free; + + result = 0; + if (req->rq_rcls != 0) + result = smb_errno(req); + +out_free: + smb_rput(req); +out: + return result; +} + +static int +smb_proc_trunc95(struct inode *inode, loff_t length) +{ + struct smb_sb_info *server = server_from_inode(inode); + int result = smb_proc_trunc32(inode, length); + + /* + * win9x doesn't appear to update the size immediately. + * It will return the old file size after the truncate, + * confusing smbfs. So we force an update. + * + * FIXME: is this still necessary? + */ + smb_proc_flush(server, SMB_I(inode)->fileid); + return result; +} + +static void +smb_init_dirent(struct smb_sb_info *server, struct smb_fattr *fattr) +{ + memset(fattr, 0, sizeof(*fattr)); + + fattr->f_nlink = 1; + fattr->f_uid = server->mnt->uid; + fattr->f_gid = server->mnt->gid; + fattr->f_unix = 0; +} + +static void +smb_finish_dirent(struct smb_sb_info *server, struct smb_fattr *fattr) +{ + if (fattr->f_unix) + return; + + fattr->f_mode = server->mnt->file_mode; + if (fattr->attr & aDIR) { + fattr->f_mode = server->mnt->dir_mode; + fattr->f_size = SMB_ST_BLKSIZE; + } + /* Check the read-only flag */ + if (fattr->attr & aRONLY) + fattr->f_mode &= ~(S_IWUSR | S_IWGRP | S_IWOTH); + + /* How many 512 byte blocks do we need for this file? 
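+ (i.e. the file size rounded up to the next 512-byte boundary)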
*/ + fattr->f_blocks = 0; + if (fattr->f_size != 0) + fattr->f_blocks = 1 + ((fattr->f_size-1) >> 9); + return; +} + +void +smb_init_root_dirent(struct smb_sb_info *server, struct smb_fattr *fattr, + struct super_block *sb) +{ + smb_init_dirent(server, fattr); + fattr->attr = aDIR; + fattr->f_ino = 2; /* traditional root inode number */ + fattr->f_mtime = current_fs_time(sb); + smb_finish_dirent(server, fattr); +} + +/* + * Decode a dirent for old protocols + * + * qname is filled with the decoded, and possibly translated, name. + * fattr receives decoded attributes + * + * Bugs Noted: + * (1) Pathworks servers may pad the name with extra spaces. + */ +static char * +smb_decode_short_dirent(struct smb_sb_info *server, char *p, + struct qstr *qname, struct smb_fattr *fattr, + unsigned char *name_buf) +{ + int len; + + /* + * SMB doesn't have a concept of inode numbers ... + */ + smb_init_dirent(server, fattr); + fattr->f_ino = 0; /* FIXME: do we need this? */ + + p += SMB_STATUS_SIZE; /* reserved (search_status) */ + fattr->attr = *p; + fattr->f_mtime.tv_sec = date_dos2unix(server, WVAL(p, 3), WVAL(p, 1)); + fattr->f_mtime.tv_nsec = 0; + fattr->f_size = DVAL(p, 5); + fattr->f_ctime = fattr->f_mtime; + fattr->f_atime = fattr->f_mtime; + qname->name = p + 9; + len = strnlen(qname->name, 12); + + /* + * Trim trailing blanks for Pathworks servers + */ + while (len > 2 && qname->name[len-1] == ' ') + len--; + + smb_finish_dirent(server, fattr); + +#if 0 + /* FIXME: These only work for ascii chars, and recent smbmount doesn't + allow the flag to be set anyway. It kills const. Remove? */ + switch (server->opt.case_handling) { + case SMB_CASE_UPPER: + str_upper(entry->name, len); + break; + case SMB_CASE_LOWER: + str_lower(entry->name, len); + break; + default: + break; + } +#endif + + qname->len = 0; + len = server->ops->convert(name_buf, SMB_MAXNAMELEN, + qname->name, len, + server->remote_nls, server->local_nls); + if (len > 0) { + qname->len = len; + qname->name = name_buf; + DEBUG1("len=%d, name=%.*s\n",qname->len,qname->len,qname->name); + } + + return p + 22; +} + +/* + * This routine is used to read in directory entries from the network. + * Note that it is for short directory name seeks, i.e.: protocol < + * SMB_PROTOCOL_LANMAN2 + */ +static int +smb_proc_readdir_short(struct file *filp, void *dirent, filldir_t filldir, + struct smb_cache_control *ctl) +{ + struct dentry *dir = filp->f_path.dentry; + struct smb_sb_info *server = server_from_dentry(dir); + struct qstr qname; + struct smb_fattr fattr; + char *p; + int result; + int i, first, entries_seen, entries; + int entries_asked = (server->opt.max_xmit - 100) / SMB_DIRINFO_SIZE; + __u16 bcc; + __u16 count; + char status[SMB_STATUS_SIZE]; + static struct qstr mask = { + .name = "*.*", + .len = 3, + }; + unsigned char *last_status; + struct smb_request *req; + unsigned char *name_buf; + + VERBOSE("%s/%s\n", DENTRY_PATH(dir)); + + lock_kernel(); + + result = -ENOMEM; + if (! (name_buf = kmalloc(SMB_MAXNAMELEN, GFP_KERNEL))) + goto out; + + first = 1; + entries = 0; + entries_seen = 2; /* implicit . and .. */ + + result = -ENOMEM; + if (! 
(req = smb_alloc_request(server, server->opt.max_xmit))) + goto out_name; + + while (1) { + p = smb_setup_header(req, SMBsearch, 2, 0); + WSET(req->rq_header, smb_vwv0, entries_asked); + WSET(req->rq_header, smb_vwv1, aDIR); + if (first == 1) { + result = smb_simple_encode_path(req, &p, dir, &mask); + if (result < 0) + goto out_free; + if (p + 3 > (char *)req->rq_buffer + req->rq_bufsize) { + result = -ENAMETOOLONG; + goto out_free; + } + *p++ = 5; + WSET(p, 0, 0); + p += 2; + first = 0; + } else { + if (p + 5 + SMB_STATUS_SIZE > + (char *)req->rq_buffer + req->rq_bufsize) { + result = -ENAMETOOLONG; + goto out_free; + } + + *p++ = 4; + *p++ = 0; + *p++ = 5; + WSET(p, 0, SMB_STATUS_SIZE); + p += 2; + memcpy(p, status, SMB_STATUS_SIZE); + p += SMB_STATUS_SIZE; + } + + smb_setup_bcc(req, p); + + result = smb_request_ok(req, SMBsearch, 1, -1); + if (result < 0) { + if ((req->rq_rcls == ERRDOS) && + (req->rq_err == ERRnofiles)) + break; + goto out_free; + } + count = WVAL(req->rq_header, smb_vwv0); + if (count <= 0) + break; + + result = -EIO; + bcc = smb_bcc(req->rq_header); + if (bcc != count * SMB_DIRINFO_SIZE + 3) + goto out_free; + p = req->rq_buffer + 3; + + + /* Make sure the response fits in the buffer. Fixed sized + entries means we don't have to check in the decode loop. */ + + last_status = req->rq_buffer + 3 + (count-1) * SMB_DIRINFO_SIZE; + + if (last_status + SMB_DIRINFO_SIZE >= + req->rq_buffer + req->rq_bufsize) { + printk(KERN_ERR "smb_proc_readdir_short: " + "last dir entry outside buffer! " + "%d@%p %d@%p\n", SMB_DIRINFO_SIZE, last_status, + req->rq_bufsize, req->rq_buffer); + goto out_free; + } + + /* Read the last entry into the status field. */ + memcpy(status, last_status, SMB_STATUS_SIZE); + + + /* Now we are ready to parse smb directory entries. */ + + for (i = 0; i < count; i++) { + p = smb_decode_short_dirent(server, p, + &qname, &fattr, name_buf); + if (qname.len == 0) + continue; + + if (entries_seen == 2 && qname.name[0] == '.') { + if (qname.len == 1) + continue; + if (qname.name[1] == '.' && qname.len == 2) + continue; + } + if (!smb_fill_cache(filp, dirent, filldir, ctl, + &qname, &fattr)) + ; /* stop reading? */ + entries_seen++; + } + } + result = entries; + +out_free: + smb_rput(req); +out_name: + kfree(name_buf); +out: + unlock_kernel(); + return result; +} + +static void smb_decode_unix_basic(struct smb_fattr *fattr, struct smb_sb_info *server, char *p) +{ + u64 size, disk_bytes; + + /* FIXME: verify nls support. all is sent as utf8? */ + + fattr->f_unix = 1; + fattr->f_mode = 0; + + /* FIXME: use the uniqueID from the remote instead? */ + /* 0 L file size in bytes */ + /* 8 L file size on disk in bytes (block count) */ + /* 40 L uid */ + /* 48 L gid */ + /* 56 W file type */ + /* 60 L devmajor */ + /* 68 L devminor */ + /* 76 L unique ID (inode) */ + /* 84 L permissions */ + /* 92 L link count */ + + size = LVAL(p, 0); + disk_bytes = LVAL(p, 8); + + /* + * Some samba versions round up on-disk byte usage + * to 1MB boundaries, making it useless. When seeing + * that, use the size instead. 
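+ * (a disk-usage figure with the low 20 bits all zero is assumed to
+ * be such a rounded value, and size+511 is used in its place)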
+ */ + if (!(disk_bytes & 0xfffff)) + disk_bytes = size+511; + + fattr->f_size = size; + fattr->f_blocks = disk_bytes >> 9; + fattr->f_ctime = smb_ntutc2unixutc(LVAL(p, 16)); + fattr->f_atime = smb_ntutc2unixutc(LVAL(p, 24)); + fattr->f_mtime = smb_ntutc2unixutc(LVAL(p, 32)); + + if (server->mnt->flags & SMB_MOUNT_UID) + fattr->f_uid = server->mnt->uid; + else + fattr->f_uid = LVAL(p, 40); + + if (server->mnt->flags & SMB_MOUNT_GID) + fattr->f_gid = server->mnt->gid; + else + fattr->f_gid = LVAL(p, 48); + + fattr->f_mode |= smb_filetype_to_mode(WVAL(p, 56)); + + if (S_ISBLK(fattr->f_mode) || S_ISCHR(fattr->f_mode)) { + __u64 major = LVAL(p, 60); + __u64 minor = LVAL(p, 68); + + fattr->f_rdev = MKDEV(major & 0xffffffff, minor & 0xffffffff); + if (MAJOR(fattr->f_rdev) != (major & 0xffffffff) || + MINOR(fattr->f_rdev) != (minor & 0xffffffff)) + fattr->f_rdev = 0; + } + + fattr->f_mode |= LVAL(p, 84); + + if ( (server->mnt->flags & SMB_MOUNT_DMODE) && + (S_ISDIR(fattr->f_mode)) ) + fattr->f_mode = (server->mnt->dir_mode & S_IRWXUGO) | S_IFDIR; + else if ( (server->mnt->flags & SMB_MOUNT_FMODE) && + !(S_ISDIR(fattr->f_mode)) ) + fattr->f_mode = (server->mnt->file_mode & S_IRWXUGO) | + (fattr->f_mode & S_IFMT); + +} + +/* + * Interpret a long filename structure using the specified info level: + * level 1 for anything below NT1 protocol + * level 260 for NT1 protocol + * + * qname is filled with the decoded, and possibly translated, name + * fattr receives decoded attributes. + * + * Bugs Noted: + * (1) Win NT 4.0 appends a null byte to names and counts it in the length! + */ +static char * +smb_decode_long_dirent(struct smb_sb_info *server, char *p, int level, + struct qstr *qname, struct smb_fattr *fattr, + unsigned char *name_buf) +{ + char *result; + unsigned int len = 0; + int n; + __u16 date, time; + int unicode = (server->mnt->flags & SMB_MOUNT_UNICODE); + + /* + * SMB doesn't have a concept of inode numbers ... + */ + smb_init_dirent(server, fattr); + fattr->f_ino = 0; /* FIXME: do we need this? */ + + switch (level) { + case 1: + len = *((unsigned char *) p + 22); + qname->name = p + 23; + result = p + 24 + len; + + date = WVAL(p, 0); + time = WVAL(p, 2); + fattr->f_ctime.tv_sec = date_dos2unix(server, date, time); + fattr->f_ctime.tv_nsec = 0; + + date = WVAL(p, 4); + time = WVAL(p, 6); + fattr->f_atime.tv_sec = date_dos2unix(server, date, time); + fattr->f_atime.tv_nsec = 0; + + date = WVAL(p, 8); + time = WVAL(p, 10); + fattr->f_mtime.tv_sec = date_dos2unix(server, date, time); + fattr->f_mtime.tv_nsec = 0; + fattr->f_size = DVAL(p, 12); + /* ULONG allocation size */ + fattr->attr = WVAL(p, 20); + + VERBOSE("info 1 at %p, len=%d, name=%.*s\n", + p, len, len, qname->name); + break; + case 260: + result = p + WVAL(p, 0); + len = DVAL(p, 60); + if (len > 255) len = 255; + /* NT4 null terminates, unless we are using unicode ... */ + qname->name = p + 94; + if (!unicode && len && qname->name[len-1] == '\0') + len--; + + fattr->f_ctime = smb_ntutc2unixutc(LVAL(p, 8)); + fattr->f_atime = smb_ntutc2unixutc(LVAL(p, 16)); + fattr->f_mtime = smb_ntutc2unixutc(LVAL(p, 24)); + /* change time (32) */ + fattr->f_size = LVAL(p, 40); + /* alloc size (48) */ + fattr->attr = DVAL(p, 56); + + VERBOSE("info 260 at %p, len=%d, name=%.*s\n", + p, len, len, qname->name); + break; + case SMB_FIND_FILE_UNIX: + result = p + WVAL(p, 0); + qname->name = p + 108; + + len = strlen(qname->name); + /* FIXME: should we check the length?? 
*/ + + p += 8; + smb_decode_unix_basic(fattr, server, p); + VERBOSE("info SMB_FIND_FILE_UNIX at %p, len=%d, name=%.*s\n", + p, len, len, qname->name); + break; + default: + PARANOIA("Unknown info level %d\n", level); + result = p + WVAL(p, 0); + goto out; + } + + smb_finish_dirent(server, fattr); + +#if 0 + /* FIXME: These only work for ascii chars, and recent smbmount doesn't + allow the flag to be set anyway. Remove? */ + switch (server->opt.case_handling) { + case SMB_CASE_UPPER: + str_upper(qname->name, len); + break; + case SMB_CASE_LOWER: + str_lower(qname->name, len); + break; + default: + break; + } +#endif + + qname->len = 0; + n = server->ops->convert(name_buf, SMB_MAXNAMELEN, + qname->name, len, + server->remote_nls, server->local_nls); + if (n > 0) { + qname->len = n; + qname->name = name_buf; + } + +out: + return result; +} + +/* findfirst/findnext flags */ +#define SMB_CLOSE_AFTER_FIRST (1<<0) +#define SMB_CLOSE_IF_END (1<<1) +#define SMB_REQUIRE_RESUME_KEY (1<<2) +#define SMB_CONTINUE_BIT (1<<3) + +/* + * Note: samba-2.0.7 (at least) has a very similar routine, cli_list, in + * source/libsmb/clilist.c. When looking for smb bugs in the readdir code, + * go there for advise. + * + * Bugs Noted: + * (1) When using Info Level 1 Win NT 4.0 truncates directory listings + * for certain patterns of names and/or lengths. The breakage pattern + * is completely reproducible and can be toggled by the creation of a + * single file. (E.g. echo hi >foo breaks, rm -f foo works.) + */ +static int +smb_proc_readdir_long(struct file *filp, void *dirent, filldir_t filldir, + struct smb_cache_control *ctl) +{ + struct dentry *dir = filp->f_path.dentry; + struct smb_sb_info *server = server_from_dentry(dir); + struct qstr qname; + struct smb_fattr fattr; + + unsigned char *p, *lastname; + char *mask, *param; + __u16 command; + int first, entries_seen; + + /* Both NT and OS/2 accept info level 1 (but see note below). */ + int info_level = 260; + const int max_matches = 512; + + unsigned int ff_searchcount = 0; + unsigned int ff_eos = 0; + unsigned int ff_lastname = 0; + unsigned int ff_dir_handle = 0; + unsigned int loop_count = 0; + unsigned int mask_len, i; + int result; + struct smb_request *req; + unsigned char *name_buf; + static struct qstr star = { + .name = "*", + .len = 1, + }; + + lock_kernel(); + + /* + * We always prefer unix style. Use info level 1 for older + * servers that don't do 260. + */ + if (server->opt.capabilities & SMB_CAP_UNIX) + info_level = SMB_FIND_FILE_UNIX; + else if (server->opt.protocol < SMB_PROTOCOL_NT1) + info_level = 1; + + result = -ENOMEM; + if (! (name_buf = kmalloc(SMB_MAXNAMELEN+2, GFP_KERNEL))) + goto out; + if (! 
(req = smb_alloc_request(server, server->opt.max_xmit))) + goto out_name; + param = req->rq_buffer; + + /* + * Encode the initial path + */ + mask = param + 12; + + result = smb_encode_path(server, mask, SMB_MAXPATHLEN+1, dir, &star); + if (result <= 0) + goto out_free; + mask_len = result - 1; /* mask_len is strlen, not #bytes */ + result = 0; + first = 1; + VERBOSE("starting mask_len=%d, mask=%s\n", mask_len, mask); + + entries_seen = 2; + ff_eos = 0; + + while (ff_eos == 0) { + loop_count += 1; + if (loop_count > 10) { + printk(KERN_WARNING "smb_proc_readdir_long: " + "Looping in FIND_NEXT??\n"); + result = -EIO; + break; + } + + if (first != 0) { + command = TRANSACT2_FINDFIRST; + WSET(param, 0, aSYSTEM | aHIDDEN | aDIR); + WSET(param, 2, max_matches); /* max count */ + WSET(param, 4, SMB_CLOSE_IF_END); + WSET(param, 6, info_level); + DSET(param, 8, 0); + } else { + command = TRANSACT2_FINDNEXT; + + VERBOSE("handle=0x%X, lastname=%d, mask=%.*s\n", + ff_dir_handle, ff_lastname, mask_len, mask); + + WSET(param, 0, ff_dir_handle); /* search handle */ + WSET(param, 2, max_matches); /* max count */ + WSET(param, 4, info_level); + DSET(param, 6, 0); + WSET(param, 10, SMB_CONTINUE_BIT|SMB_CLOSE_IF_END); + } + + req->rq_trans2_command = command; + req->rq_ldata = 0; + req->rq_data = NULL; + req->rq_lparm = 12 + mask_len + 1; + req->rq_parm = param; + req->rq_flags = 0; + result = smb_add_request(req); + if (result < 0) { + PARANOIA("error=%d, breaking\n", result); + break; + } + + if (req->rq_rcls == ERRSRV && req->rq_err == ERRerror) { + /* a damn Win95 bug - sometimes it clags if you + ask it too fast */ + schedule_timeout_interruptible(msecs_to_jiffies(200)); + continue; + } + + if (req->rq_rcls != 0) { + result = smb_errno(req); + PARANOIA("name=%s, result=%d, rcls=%d, err=%d\n", + mask, result, req->rq_rcls, req->rq_err); + break; + } + + /* parse out some important return info */ + if (first != 0) { + ff_dir_handle = WVAL(req->rq_parm, 0); + ff_searchcount = WVAL(req->rq_parm, 2); + ff_eos = WVAL(req->rq_parm, 4); + ff_lastname = WVAL(req->rq_parm, 8); + } else { + ff_searchcount = WVAL(req->rq_parm, 0); + ff_eos = WVAL(req->rq_parm, 2); + ff_lastname = WVAL(req->rq_parm, 6); + } + + if (ff_searchcount == 0) + break; + + /* Now we are ready to parse smb directory entries. */ + + /* point to the data bytes */ + p = req->rq_data; + for (i = 0; i < ff_searchcount; i++) { + /* make sure we stay within the buffer */ + if (p >= req->rq_data + req->rq_ldata) { + printk(KERN_ERR "smb_proc_readdir_long: " + "dirent pointer outside buffer! " + "%p %d@%p\n", + p, req->rq_ldata, req->rq_data); + result = -EIO; /* always a comm. error? */ + goto out_free; + } + + p = smb_decode_long_dirent(server, p, info_level, + &qname, &fattr, name_buf); + + /* ignore . and .. from the server */ + if (entries_seen == 2 && qname.name[0] == '.') { + if (qname.len == 1) + continue; + if (qname.name[1] == '.' && qname.len == 2) + continue; + } + + if (!smb_fill_cache(filp, dirent, filldir, ctl, + &qname, &fattr)) + ; /* stop reading? */ + entries_seen++; + } + + VERBOSE("received %d entries, eos=%d\n", ff_searchcount,ff_eos); + + /* + * We might need the lastname for continuations. + * + * Note that some servers (win95?) point to the filename and + * others (NT4, Samba using NT1) to the dir entry. We assume + * here that those who do not point to a filename do not need + * this info to continue the listing. + * + * OS/2 needs this and talks infolevel 1. + * NetApps want lastname with infolevel 260. 
+ * win2k want lastname with infolevel 260, and points to + * the record not to the name. + * Samba+CifsUnixExt doesn't need lastname. + * + * Both are happy if we return the data they point to. So we do. + * (FIXME: above is not true with win2k) + */ + mask_len = 0; + if (info_level != SMB_FIND_FILE_UNIX && + ff_lastname > 0 && ff_lastname < req->rq_ldata) { + lastname = req->rq_data + ff_lastname; + + switch (info_level) { + case 260: + mask_len = req->rq_ldata - ff_lastname; + break; + case 1: + /* lastname points to a length byte */ + mask_len = *lastname++; + if (ff_lastname + 1 + mask_len > req->rq_ldata) + mask_len = req->rq_ldata - ff_lastname - 1; + break; + } + + /* + * Update the mask string for the next message. + */ + if (mask_len > 255) + mask_len = 255; + if (mask_len) + strncpy(mask, lastname, mask_len); + } + mask_len = strnlen(mask, mask_len); + VERBOSE("new mask, len=%d@%d of %d, mask=%.*s\n", + mask_len, ff_lastname, req->rq_ldata, mask_len, mask); + + first = 0; + loop_count = 0; + } + +out_free: + smb_rput(req); +out_name: + kfree(name_buf); +out: + unlock_kernel(); + return result; +} + +/* + * This version uses the trans2 TRANSACT2_FINDFIRST message + * to get the attribute data. + * + * Bugs Noted: + */ +static int +smb_proc_getattr_ff(struct smb_sb_info *server, struct dentry *dentry, + struct smb_fattr *fattr) +{ + char *param, *mask; + __u16 date, time; + int mask_len, result; + struct smb_request *req; + + result = -ENOMEM; + if (! (req = smb_alloc_request(server, PAGE_SIZE))) + goto out; + param = req->rq_buffer; + mask = param + 12; + + mask_len = smb_encode_path(server, mask, SMB_MAXPATHLEN+1, dentry,NULL); + if (mask_len < 0) { + result = mask_len; + goto out_free; + } + VERBOSE("name=%s, len=%d\n", mask, mask_len); + WSET(param, 0, aSYSTEM | aHIDDEN | aDIR); + WSET(param, 2, 1); /* max count */ + WSET(param, 4, 1); /* close after this call */ + WSET(param, 6, 1); /* info_level */ + DSET(param, 8, 0); + + req->rq_trans2_command = TRANSACT2_FINDFIRST; + req->rq_ldata = 0; + req->rq_data = NULL; + req->rq_lparm = 12 + mask_len; + req->rq_parm = param; + req->rq_flags = 0; + result = smb_add_request(req); + if (result < 0) + goto out_free; + if (req->rq_rcls != 0) { + result = smb_errno(req); +#ifdef SMBFS_PARANOIA + if (result != -ENOENT) + PARANOIA("error for %s, rcls=%d, err=%d\n", + mask, req->rq_rcls, req->rq_err); +#endif + goto out_free; + } + /* Make sure we got enough data ... */ + result = -EINVAL; + if (req->rq_ldata < 22 || WVAL(req->rq_parm, 2) != 1) { + PARANOIA("bad result for %s, len=%d, count=%d\n", + mask, req->rq_ldata, WVAL(req->rq_parm, 2)); + goto out_free; + } + + /* + * Decode the response into the fattr ... 
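+ * (the timestamps are returned as DOS date/time word pairs and are
+ * converted with date_dos2unix)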
+ */ + date = WVAL(req->rq_data, 0); + time = WVAL(req->rq_data, 2); + fattr->f_ctime.tv_sec = date_dos2unix(server, date, time); + fattr->f_ctime.tv_nsec = 0; + + date = WVAL(req->rq_data, 4); + time = WVAL(req->rq_data, 6); + fattr->f_atime.tv_sec = date_dos2unix(server, date, time); + fattr->f_atime.tv_nsec = 0; + + date = WVAL(req->rq_data, 8); + time = WVAL(req->rq_data, 10); + fattr->f_mtime.tv_sec = date_dos2unix(server, date, time); + fattr->f_mtime.tv_nsec = 0; + VERBOSE("name=%s, date=%x, time=%x, mtime=%ld\n", + mask, date, time, fattr->f_mtime.tv_sec); + fattr->f_size = DVAL(req->rq_data, 12); + /* ULONG allocation size */ + fattr->attr = WVAL(req->rq_data, 20); + result = 0; + +out_free: + smb_rput(req); +out: + return result; +} + +static int +smb_proc_getattr_core(struct smb_sb_info *server, struct dentry *dir, + struct smb_fattr *fattr) +{ + int result; + char *p; + struct smb_request *req; + + result = -ENOMEM; + if (! (req = smb_alloc_request(server, PAGE_SIZE))) + goto out; + + p = smb_setup_header(req, SMBgetatr, 0, 0); + result = smb_simple_encode_path(req, &p, dir, NULL); + if (result < 0) + goto out_free; + smb_setup_bcc(req, p); + + if ((result = smb_request_ok(req, SMBgetatr, 10, 0)) < 0) + goto out_free; + fattr->attr = WVAL(req->rq_header, smb_vwv0); + fattr->f_mtime.tv_sec = local2utc(server, DVAL(req->rq_header, smb_vwv1)); + fattr->f_mtime.tv_nsec = 0; + fattr->f_size = DVAL(req->rq_header, smb_vwv3); + fattr->f_ctime = fattr->f_mtime; + fattr->f_atime = fattr->f_mtime; +#ifdef SMBFS_DEBUG_TIMESTAMP + printk("getattr_core: %s/%s, mtime=%ld\n", + DENTRY_PATH(dir), fattr->f_mtime); +#endif + result = 0; + +out_free: + smb_rput(req); +out: + return result; +} + +/* + * Bugs Noted: + * (1) Win 95 swaps the date and time fields in the standard info level. + */ +static int +smb_proc_getattr_trans2(struct smb_sb_info *server, struct dentry *dir, + struct smb_request *req, int infolevel) +{ + char *p, *param; + int result; + + param = req->rq_buffer; + WSET(param, 0, infolevel); + DSET(param, 2, 0); + result = smb_encode_path(server, param+6, SMB_MAXPATHLEN+1, dir, NULL); + if (result < 0) + goto out; + p = param + 6 + result; + + req->rq_trans2_command = TRANSACT2_QPATHINFO; + req->rq_ldata = 0; + req->rq_data = NULL; + req->rq_lparm = p - param; + req->rq_parm = param; + req->rq_flags = 0; + result = smb_add_request(req); + if (result < 0) + goto out; + if (req->rq_rcls != 0) { + VERBOSE("for %s: result=%d, rcls=%d, err=%d\n", + ¶m[6], result, req->rq_rcls, req->rq_err); + result = smb_errno(req); + goto out; + } + result = -ENOENT; + if (req->rq_ldata < 22) { + PARANOIA("not enough data for %s, len=%d\n", + ¶m[6], req->rq_ldata); + goto out; + } + + result = 0; +out: + return result; +} + +static int +smb_proc_getattr_trans2_std(struct smb_sb_info *server, struct dentry *dir, + struct smb_fattr *attr) +{ + u16 date, time; + int off_date = 0, off_time = 2; + int result; + struct smb_request *req; + + result = -ENOMEM; + if (! (req = smb_alloc_request(server, PAGE_SIZE))) + goto out; + + result = smb_proc_getattr_trans2(server, dir, req, SMB_INFO_STANDARD); + if (result < 0) + goto out_free; + + /* + * Kludge alert: Win 95 swaps the date and time field, + * contrary to the CIFS docs and Win NT practice. 
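+ * When the SMB_MOUNT_WIN95 flag is set, the 16-bit date and time
+ * words are simply read in the opposite order below.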
+ */ + if (server->mnt->flags & SMB_MOUNT_WIN95) { + off_date = 2; + off_time = 0; + } + date = WVAL(req->rq_data, off_date); + time = WVAL(req->rq_data, off_time); + attr->f_ctime.tv_sec = date_dos2unix(server, date, time); + attr->f_ctime.tv_nsec = 0; + + date = WVAL(req->rq_data, 4 + off_date); + time = WVAL(req->rq_data, 4 + off_time); + attr->f_atime.tv_sec = date_dos2unix(server, date, time); + attr->f_atime.tv_nsec = 0; + + date = WVAL(req->rq_data, 8 + off_date); + time = WVAL(req->rq_data, 8 + off_time); + attr->f_mtime.tv_sec = date_dos2unix(server, date, time); + attr->f_mtime.tv_nsec = 0; +#ifdef SMBFS_DEBUG_TIMESTAMP + printk(KERN_DEBUG "getattr_trans2: %s/%s, date=%x, time=%x, mtime=%ld\n", + DENTRY_PATH(dir), date, time, attr->f_mtime); +#endif + attr->f_size = DVAL(req->rq_data, 12); + attr->attr = WVAL(req->rq_data, 20); + +out_free: + smb_rput(req); +out: + return result; +} + +static int +smb_proc_getattr_trans2_all(struct smb_sb_info *server, struct dentry *dir, + struct smb_fattr *attr) +{ + struct smb_request *req; + int result; + + result = -ENOMEM; + if (! (req = smb_alloc_request(server, PAGE_SIZE))) + goto out; + + result = smb_proc_getattr_trans2(server, dir, req, + SMB_QUERY_FILE_ALL_INFO); + if (result < 0) + goto out_free; + + attr->f_ctime = smb_ntutc2unixutc(LVAL(req->rq_data, 0)); + attr->f_atime = smb_ntutc2unixutc(LVAL(req->rq_data, 8)); + attr->f_mtime = smb_ntutc2unixutc(LVAL(req->rq_data, 16)); + /* change (24) */ + attr->attr = WVAL(req->rq_data, 32); + /* pad? (34) */ + /* allocated size (40) */ + attr->f_size = LVAL(req->rq_data, 48); + +out_free: + smb_rput(req); +out: + return result; +} + +static int +smb_proc_getattr_unix(struct smb_sb_info *server, struct dentry *dir, + struct smb_fattr *attr) +{ + struct smb_request *req; + int result; + + result = -ENOMEM; + if (! (req = smb_alloc_request(server, PAGE_SIZE))) + goto out; + + result = smb_proc_getattr_trans2(server, dir, req, + SMB_QUERY_FILE_UNIX_BASIC); + if (result < 0) + goto out_free; + + smb_decode_unix_basic(attr, server, req->rq_data); + +out_free: + smb_rput(req); +out: + return result; +} + +static int +smb_proc_getattr_95(struct smb_sb_info *server, struct dentry *dir, + struct smb_fattr *attr) +{ + struct inode *inode = dir->d_inode; + int result; + + /* FIXME: why not use the "all" version? */ + result = smb_proc_getattr_trans2_std(server, dir, attr); + if (result < 0) + goto out; + + /* + * None of the getattr versions here can make win9x return the right + * filesize if there are changes made to an open file. + * A seek-to-end does return the right size, but we only need to do + * that on files we have written. 
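+ * (SMB_F_LOCALWRITE marks inodes we have written through; the
+ * smb_proc_seek call below uses mode 2, i.e. seek from end of file,
+ * so its return value is the current file size)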
+ */ + if (inode && SMB_I(inode)->flags & SMB_F_LOCALWRITE && + smb_is_open(inode)) + { + __u16 fileid = SMB_I(inode)->fileid; + attr->f_size = smb_proc_seek(server, fileid, 2, 0); + } + +out: + return result; +} + +static int +smb_proc_ops_wait(struct smb_sb_info *server) +{ + int result; + + result = wait_event_interruptible_timeout(server->conn_wq, + server->conn_complete, 30*HZ); + + if (!result || signal_pending(current)) + return -EIO; + + return 0; +} + +static int +smb_proc_getattr_null(struct smb_sb_info *server, struct dentry *dir, + struct smb_fattr *fattr) +{ + int result; + + if (smb_proc_ops_wait(server) < 0) + return -EIO; + + smb_init_dirent(server, fattr); + result = server->ops->getattr(server, dir, fattr); + smb_finish_dirent(server, fattr); + + return result; +} + +static int +smb_proc_readdir_null(struct file *filp, void *dirent, filldir_t filldir, + struct smb_cache_control *ctl) +{ + struct smb_sb_info *server = server_from_dentry(filp->f_path.dentry); + + if (smb_proc_ops_wait(server) < 0) + return -EIO; + + return server->ops->readdir(filp, dirent, filldir, ctl); +} + +int +smb_proc_getattr(struct dentry *dir, struct smb_fattr *fattr) +{ + struct smb_sb_info *server = server_from_dentry(dir); + int result; + + smb_init_dirent(server, fattr); + result = server->ops->getattr(server, dir, fattr); + smb_finish_dirent(server, fattr); + + return result; +} + + +/* + * Because of bugs in the core protocol, we use this only to set + * attributes. See smb_proc_settime() below for timestamp handling. + * + * Bugs Noted: + * (1) If mtime is non-zero, both Win 3.1 and Win 95 fail + * with an undocumented error (ERRDOS code 50). Setting + * mtime to 0 allows the attributes to be set. + * (2) The extra parameters following the name string aren't + * in the CIFS docs, but seem to be necessary for operation. + */ +static int +smb_proc_setattr_core(struct smb_sb_info *server, struct dentry *dentry, + __u16 attr) +{ + char *p; + int result; + struct smb_request *req; + + result = -ENOMEM; + if (! (req = smb_alloc_request(server, PAGE_SIZE))) + goto out; + + p = smb_setup_header(req, SMBsetatr, 8, 0); + WSET(req->rq_header, smb_vwv0, attr); + DSET(req->rq_header, smb_vwv1, 0); /* mtime */ + WSET(req->rq_header, smb_vwv3, 0); /* reserved values */ + WSET(req->rq_header, smb_vwv4, 0); + WSET(req->rq_header, smb_vwv5, 0); + WSET(req->rq_header, smb_vwv6, 0); + WSET(req->rq_header, smb_vwv7, 0); + result = smb_simple_encode_path(req, &p, dentry, NULL); + if (result < 0) + goto out_free; + if (p + 2 > (char *)req->rq_buffer + req->rq_bufsize) { + result = -ENAMETOOLONG; + goto out_free; + } + *p++ = 4; + *p++ = 0; + smb_setup_bcc(req, p); + + result = smb_request_ok(req, SMBsetatr, 0, 0); + if (result < 0) + goto out_free; + result = 0; + +out_free: + smb_rput(req); +out: + return result; +} + +/* + * Because of bugs in the trans2 setattr messages, we must set + * attributes and timestamps separately. The core SMBsetatr + * message seems to be the only reliable way to set attributes. + */ +int +smb_proc_setattr(struct dentry *dir, struct smb_fattr *fattr) +{ + struct smb_sb_info *server = server_from_dentry(dir); + int result; + + VERBOSE("setting %s/%s, open=%d\n", + DENTRY_PATH(dir), smb_is_open(dir->d_inode)); + result = smb_proc_setattr_core(server, dir, fattr->attr); + return result; +} + +/* + * Sets the timestamps for an file open with write permissions. 
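+ * This sends SMBsetattrE for the open fileid; the creation time
+ * fields are left zeroed, i.e. unchanged.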
+ */ +static int +smb_proc_setattr_ext(struct smb_sb_info *server, + struct inode *inode, struct smb_fattr *fattr) +{ + __u16 date, time; + int result; + struct smb_request *req; + + result = -ENOMEM; + if (! (req = smb_alloc_request(server, 0))) + goto out; + + smb_setup_header(req, SMBsetattrE, 7, 0); + WSET(req->rq_header, smb_vwv0, SMB_I(inode)->fileid); + /* We don't change the creation time */ + WSET(req->rq_header, smb_vwv1, 0); + WSET(req->rq_header, smb_vwv2, 0); + date_unix2dos(server, fattr->f_atime.tv_sec, &date, &time); + WSET(req->rq_header, smb_vwv3, date); + WSET(req->rq_header, smb_vwv4, time); + date_unix2dos(server, fattr->f_mtime.tv_sec, &date, &time); + WSET(req->rq_header, smb_vwv5, date); + WSET(req->rq_header, smb_vwv6, time); +#ifdef SMBFS_DEBUG_TIMESTAMP + printk(KERN_DEBUG "smb_proc_setattr_ext: date=%d, time=%d, mtime=%ld\n", + date, time, fattr->f_mtime); +#endif + + req->rq_flags |= SMB_REQ_NORETRY; + result = smb_request_ok(req, SMBsetattrE, 0, 0); + if (result < 0) + goto out_free; + result = 0; +out_free: + smb_rput(req); +out: + return result; +} + +/* + * Bugs Noted: + * (1) The TRANSACT2_SETPATHINFO message under Win NT 4.0 doesn't + * set the file's attribute flags. + */ +static int +smb_proc_setattr_trans2(struct smb_sb_info *server, + struct dentry *dir, struct smb_fattr *fattr) +{ + __u16 date, time; + char *p, *param; + int result; + char data[26]; + struct smb_request *req; + + result = -ENOMEM; + if (! (req = smb_alloc_request(server, PAGE_SIZE))) + goto out; + param = req->rq_buffer; + + WSET(param, 0, 1); /* Info level SMB_INFO_STANDARD */ + DSET(param, 2, 0); + result = smb_encode_path(server, param+6, SMB_MAXPATHLEN+1, dir, NULL); + if (result < 0) + goto out_free; + p = param + 6 + result; + + WSET(data, 0, 0); /* creation time */ + WSET(data, 2, 0); + date_unix2dos(server, fattr->f_atime.tv_sec, &date, &time); + WSET(data, 4, date); + WSET(data, 6, time); + date_unix2dos(server, fattr->f_mtime.tv_sec, &date, &time); + WSET(data, 8, date); + WSET(data, 10, time); +#ifdef SMBFS_DEBUG_TIMESTAMP + printk(KERN_DEBUG "setattr_trans2: %s/%s, date=%x, time=%x, mtime=%ld\n", + DENTRY_PATH(dir), date, time, fattr->f_mtime); +#endif + DSET(data, 12, 0); /* size */ + DSET(data, 16, 0); /* blksize */ + WSET(data, 20, 0); /* attr */ + DSET(data, 22, 0); /* ULONG EA size */ + + req->rq_trans2_command = TRANSACT2_SETPATHINFO; + req->rq_ldata = 26; + req->rq_data = data; + req->rq_lparm = p - param; + req->rq_parm = param; + req->rq_flags = 0; + result = smb_add_request(req); + if (result < 0) + goto out_free; + result = 0; + if (req->rq_rcls != 0) + result = smb_errno(req); + +out_free: + smb_rput(req); +out: + return result; +} + +/* + * ATTR_MODE 0x001 + * ATTR_UID 0x002 + * ATTR_GID 0x004 + * ATTR_SIZE 0x008 + * ATTR_ATIME 0x010 + * ATTR_MTIME 0x020 + * ATTR_CTIME 0x040 + * ATTR_ATIME_SET 0x080 + * ATTR_MTIME_SET 0x100 + * ATTR_FORCE 0x200 + * ATTR_ATTR_FLAG 0x400 + * + * major/minor should only be set by mknod. + */ +int +smb_proc_setattr_unix(struct dentry *d, struct iattr *attr, + unsigned int major, unsigned int minor) +{ + struct smb_sb_info *server = server_from_dentry(d); + u64 nttime; + char *p, *param; + int result; + char data[100]; + struct smb_request *req; + + result = -ENOMEM; + if (! 
(req = smb_alloc_request(server, PAGE_SIZE)))
+		goto out;
+	param = req->rq_buffer;
+
+	DEBUG1("valid flags = 0x%04x\n", attr->ia_valid);
+
+	WSET(param, 0, SMB_SET_FILE_UNIX_BASIC);
+	DSET(param, 2, 0);
+	result = smb_encode_path(server, param+6, SMB_MAXPATHLEN+1, d, NULL);
+	if (result < 0)
+		goto out_free;
+	p = param + 6 + result;
+
+	/* 0 L file size in bytes */
+	/* 8 L file size on disk in bytes (block count) */
+	/* 40 L uid */
+	/* 48 L gid */
+	/* 56 W file type enum */
+	/* 60 L devmajor */
+	/* 68 L devminor */
+	/* 76 L unique ID (inode) */
+	/* 84 L permissions */
+	/* 92 L link count */
+	LSET(data, 0, SMB_SIZE_NO_CHANGE);
+	LSET(data, 8, SMB_SIZE_NO_CHANGE);
+	LSET(data, 16, SMB_TIME_NO_CHANGE);
+	LSET(data, 24, SMB_TIME_NO_CHANGE);
+	LSET(data, 32, SMB_TIME_NO_CHANGE);
+	LSET(data, 40, SMB_UID_NO_CHANGE);
+	LSET(data, 48, SMB_GID_NO_CHANGE);
+	DSET(data, 56, smb_filetype_from_mode(attr->ia_mode));
+	LSET(data, 60, major);
+	LSET(data, 68, minor);
+	LSET(data, 76, 0);
+	LSET(data, 84, SMB_MODE_NO_CHANGE);
+	LSET(data, 92, 0);
+
+	if (attr->ia_valid & ATTR_SIZE) {
+		LSET(data, 0, attr->ia_size);
+		LSET(data, 8, 0); /* can't set anyway */
+	}
+
+	/*
+	 * FIXME: check that the conversion function is the correct one
+	 *
+	 * we can't set ctime but we might as well pass this to the server
+	 * and let it ignore it.
+	 */
+	if (attr->ia_valid & ATTR_CTIME) {
+		nttime = smb_unixutc2ntutc(attr->ia_ctime);
+		LSET(data, 16, nttime);
+	}
+	if (attr->ia_valid & ATTR_ATIME) {
+		nttime = smb_unixutc2ntutc(attr->ia_atime);
+		LSET(data, 24, nttime);
+	}
+	if (attr->ia_valid & ATTR_MTIME) {
+		nttime = smb_unixutc2ntutc(attr->ia_mtime);
+		LSET(data, 32, nttime);
+	}
+
+	if (attr->ia_valid & ATTR_UID) {
+		LSET(data, 40, attr->ia_uid);
+	}
+	if (attr->ia_valid & ATTR_GID) {
+		LSET(data, 48, attr->ia_gid);
+	}
+
+	if (attr->ia_valid & ATTR_MODE) {
+		LSET(data, 84, attr->ia_mode);
+	}
+
+	req->rq_trans2_command = TRANSACT2_SETPATHINFO;
+	req->rq_ldata = 100;
+	req->rq_data = data;
+	req->rq_lparm = p - param;
+	req->rq_parm = param;
+	req->rq_flags = 0;
+	result = smb_add_request(req);
+
+out_free:
+	smb_rput(req);
+out:
+	return result;
+}
+
+
+/*
+ * Set the modify and access timestamps for a file.
+ *
+ * Incredibly enough, in all of SMB there is no message to allow
+ * setting both attributes and timestamps at once.
+ *
+ * Bugs Noted:
+ * (1) Win 95 doesn't support the TRANSACT2_SETFILEINFO message
+ *     with info level 1 (INFO_STANDARD).
+ * (2) Win 95 seems not to support setting directory timestamps.
+ * (3) Under the core protocol apparently the only way to set the
+ *     timestamp is to open and close the file.
+ */
+int
+smb_proc_settime(struct dentry *dentry, struct smb_fattr *fattr)
+{
+	struct smb_sb_info *server = server_from_dentry(dentry);
+	struct inode *inode = dentry->d_inode;
+	int result;
+
+	VERBOSE("setting %s/%s, open=%d\n",
+		DENTRY_PATH(dentry), smb_is_open(inode));
+
+	/* setting the time on a Win95 server fails (tridge) */
+	if (server->opt.protocol >= SMB_PROTOCOL_LANMAN2 &&
+	    !(server->mnt->flags & SMB_MOUNT_WIN95)) {
+		if (smb_is_open(inode) && SMB_I(inode)->access != SMB_O_RDONLY)
+			result = smb_proc_setattr_ext(server, inode, fattr);
+		else
+			result = smb_proc_setattr_trans2(server, dentry, fattr);
+	} else {
+		/*
+		 * Fail silently on directories ... timestamp can't be set?
+		 */
+		result = 0;
+		if (S_ISREG(inode->i_mode)) {
+			/*
+			 * Set the mtime by opening and closing the file.
+			 * Note that the file is opened read-only, but this
+			 * still allows us to set the date (tridge)
+			 */
+			result = -EACCES;
+			if (!smb_is_open(inode))
+				smb_proc_open(server, dentry, SMB_O_RDONLY);
+			if (smb_is_open(inode)) {
+				inode->i_mtime = fattr->f_mtime;
+				result = smb_proc_close_inode(server, inode);
+			}
+		}
+	}
+
+	return result;
+}
+
+int
+smb_proc_dskattr(struct dentry *dentry, struct kstatfs *attr)
+{
+	struct smb_sb_info *server = SMB_SB(dentry->d_sb);
+	int result;
+	char *p;
+	long unit;
+	struct smb_request *req;
+
+	result = -ENOMEM;
+	if (! (req = smb_alloc_request(server, 0)))
+		goto out;
+
+	smb_setup_header(req, SMBdskattr, 0, 0);
+	if ((result = smb_request_ok(req, SMBdskattr, 5, 0)) < 0)
+		goto out_free;
+	p = SMB_VWV(req->rq_header);
+	unit = (WVAL(p, 2) * WVAL(p, 4)) >> SMB_ST_BLKSHIFT;
+	attr->f_blocks = WVAL(p, 0) * unit;
+	attr->f_bsize = SMB_ST_BLKSIZE;
+	attr->f_bavail = attr->f_bfree = WVAL(p, 6) * unit;
+	result = 0;
+
+out_free:
+	smb_rput(req);
+out:
+	return result;
+}
+
+int
+smb_proc_read_link(struct smb_sb_info *server, struct dentry *d,
+		   char *buffer, int len)
+{
+	char *p, *param;
+	int result;
+	struct smb_request *req;
+
+	DEBUG1("readlink of %s/%s\n", DENTRY_PATH(d));
+
+	result = -ENOMEM;
+	if (! (req = smb_alloc_request(server, PAGE_SIZE)))
+		goto out;
+	param = req->rq_buffer;
+
+	WSET(param, 0, SMB_QUERY_FILE_UNIX_LINK);
+	DSET(param, 2, 0);
+	result = smb_encode_path(server, param+6, SMB_MAXPATHLEN+1, d, NULL);
+	if (result < 0)
+		goto out_free;
+	p = param + 6 + result;
+
+	req->rq_trans2_command = TRANSACT2_QPATHINFO;
+	req->rq_ldata = 0;
+	req->rq_data = NULL;
+	req->rq_lparm = p - param;
+	req->rq_parm = param;
+	req->rq_flags = 0;
+	result = smb_add_request(req);
+	if (result < 0)
+		goto out_free;
+	DEBUG1("for %s: result=%d, rcls=%d, err=%d\n",
+		&param[6], result, req->rq_rcls, req->rq_err);
+
+	/* copy data up to the \0 or buffer length */
+	result = len;
+	if (req->rq_ldata < len)
+		result = req->rq_ldata;
+	strncpy(buffer, req->rq_data, result);
+
+out_free:
+	smb_rput(req);
+out:
+	return result;
+}
+
+
+/*
+ * Create a symlink object called dentry which points to oldpath.
+ * Samba does not permit dangling links but returns a suitable error message.
+ */
+int
+smb_proc_symlink(struct smb_sb_info *server, struct dentry *d,
+		 const char *oldpath)
+{
+	char *p, *param;
+	int result;
+	struct smb_request *req;
+
+	result = -ENOMEM;
+	if (! (req = smb_alloc_request(server, PAGE_SIZE)))
+		goto out;
+	param = req->rq_buffer;
+
+	WSET(param, 0, SMB_SET_FILE_UNIX_LINK);
+	DSET(param, 2, 0);
+	result = smb_encode_path(server, param + 6, SMB_MAXPATHLEN+1, d, NULL);
+	if (result < 0)
+		goto out_free;
+	p = param + 6 + result;
+
+	req->rq_trans2_command = TRANSACT2_SETPATHINFO;
+	req->rq_ldata = strlen(oldpath) + 1;
+	req->rq_data = (char *) oldpath;
+	req->rq_lparm = p - param;
+	req->rq_parm = param;
+	req->rq_flags = 0;
+	result = smb_add_request(req);
+	if (result < 0)
+		goto out_free;
+
+	DEBUG1("for %s: result=%d, rcls=%d, err=%d\n",
+		&param[6], result, req->rq_rcls, req->rq_err);
+	result = 0;
+
+out_free:
+	smb_rput(req);
+out:
+	return result;
+}
+
+/*
+ * Create a hard link object called new_dentry which points to dentry.
+ */
+int
+smb_proc_link(struct smb_sb_info *server, struct dentry *dentry,
+	      struct dentry *new_dentry)
+{
+	char *p, *param;
+	int result;
+	struct smb_request *req;
+
+	result = -ENOMEM;
+	if (!
(req = smb_alloc_request(server, PAGE_SIZE)))
+		goto out;
+	param = req->rq_buffer;
+
+	WSET(param, 0, SMB_SET_FILE_UNIX_HLINK);
+	DSET(param, 2, 0);
+	result = smb_encode_path(server, param + 6, SMB_MAXPATHLEN+1,
+				 new_dentry, NULL);
+	if (result < 0)
+		goto out_free;
+	p = param + 6 + result;
+
+	/* Grr, pointless separation of parameters and data ... */
+	req->rq_data = p;
+	req->rq_ldata = smb_encode_path(server, p, SMB_MAXPATHLEN+1,
+					dentry, NULL);
+
+	req->rq_trans2_command = TRANSACT2_SETPATHINFO;
+	req->rq_lparm = p - param;
+	req->rq_parm = param;
+	req->rq_flags = 0;
+	result = smb_add_request(req);
+	if (result < 0)
+		goto out_free;
+
+	DEBUG1("for %s: result=%d, rcls=%d, err=%d\n",
+		&param[6], result, req->rq_rcls, req->rq_err);
+	result = 0;
+
+out_free:
+	smb_rput(req);
+out:
+	return result;
+}
+
+static int
+smb_proc_query_cifsunix(struct smb_sb_info *server)
+{
+	int result;
+	int major, minor;
+	u64 caps;
+	char param[2];
+	struct smb_request *req;
+
+	result = -ENOMEM;
+	if (! (req = smb_alloc_request(server, 100)))
+		goto out;
+
+	WSET(param, 0, SMB_QUERY_CIFS_UNIX_INFO);
+
+	req->rq_trans2_command = TRANSACT2_QFSINFO;
+	req->rq_ldata = 0;
+	req->rq_data = NULL;
+	req->rq_lparm = 2;
+	req->rq_parm = param;
+	req->rq_flags = 0;
+	result = smb_add_request(req);
+	if (result < 0)
+		goto out_free;
+
+	if (req->rq_ldata < 12) {
+		PARANOIA("Not enough data\n");
+		goto out_free;
+	}
+	major = WVAL(req->rq_data, 0);
+	minor = WVAL(req->rq_data, 2);
+
+	DEBUG1("Server implements CIFS Extensions for UNIX systems v%d.%d\n",
+		major, minor);
+	/* FIXME: verify that we are ok with this major/minor? */
+
+	caps = LVAL(req->rq_data, 4);
+	DEBUG1("Server capabilities 0x%016llx\n", caps);
+
+out_free:
+	smb_rput(req);
+out:
+	return result;
+}
+
+
+static void
+install_ops(struct smb_ops *dst, struct smb_ops *src)
+{
+	memcpy(dst, src, sizeof(void *) * SMB_OPS_NUM_STATIC);
+}
+
+/* < LANMAN2 */
+static struct smb_ops smb_ops_core =
+{
+	.read = smb_proc_read,
+	.write = smb_proc_write,
+	.readdir = smb_proc_readdir_short,
+	.getattr = smb_proc_getattr_core,
+	.truncate = smb_proc_trunc32,
+};
+
+/* LANMAN2, OS/2, others? */
+static struct smb_ops smb_ops_os2 =
+{
+	.read = smb_proc_read,
+	.write = smb_proc_write,
+	.readdir = smb_proc_readdir_long,
+	.getattr = smb_proc_getattr_trans2_std,
+	.truncate = smb_proc_trunc32,
+};
+
+/* Win95, and possibly some NetApp versions too */
+static struct smb_ops smb_ops_win95 =
+{
+	.read = smb_proc_read,	/* does not support 12word readX */
+	.write = smb_proc_write,
+	.readdir = smb_proc_readdir_long,
+	.getattr = smb_proc_getattr_95,
+	.truncate = smb_proc_trunc95,
+};
+
+/* Samba, NT4 and NT5 */
+static struct smb_ops smb_ops_winNT =
+{
+	.read = smb_proc_readX,
+	.write = smb_proc_writeX,
+	.readdir = smb_proc_readdir_long,
+	.getattr = smb_proc_getattr_trans2_all,
+	.truncate = smb_proc_trunc64,
+};
+
+/* Samba w/ unix extensions. Others? */
+static struct smb_ops smb_ops_unix =
+{
+	.read = smb_proc_readX,
+	.write = smb_proc_writeX,
+	.readdir = smb_proc_readdir_long,
+	.getattr = smb_proc_getattr_unix,
+	/* FIXME: core/ext/time setattr needs to be cleaned up!
*/ + /* .setattr = smb_proc_setattr_unix, */ + .truncate = smb_proc_trunc64, +}; + +/* Place holder until real ops are in place */ +static struct smb_ops smb_ops_null = +{ + .readdir = smb_proc_readdir_null, + .getattr = smb_proc_getattr_null, +}; + +void smb_install_null_ops(struct smb_ops *ops) +{ + install_ops(ops, &smb_ops_null); +} diff --git a/drivers/staging/smbfs/proto.h b/drivers/staging/smbfs/proto.h new file mode 100644 index 00000000000..05939a6f43e --- /dev/null +++ b/drivers/staging/smbfs/proto.h @@ -0,0 +1,87 @@ +/* + * Autogenerated with cproto on: Sat Sep 13 17:18:51 CEST 2003 + */ + +struct smb_request; +struct sock; +struct statfs; + +/* proc.c */ +extern int smb_setcodepage(struct smb_sb_info *server, struct smb_nls_codepage *cp); +extern __u32 smb_len(__u8 *p); +extern int smb_get_rsize(struct smb_sb_info *server); +extern int smb_get_wsize(struct smb_sb_info *server); +extern int smb_errno(struct smb_request *req); +extern int smb_newconn(struct smb_sb_info *server, struct smb_conn_opt *opt); +extern __u8 *smb_setup_header(struct smb_request *req, __u8 command, __u16 wct, __u16 bcc); +extern int smb_open(struct dentry *dentry, int wish); +extern int smb_close(struct inode *ino); +extern int smb_close_fileid(struct dentry *dentry, __u16 fileid); +extern int smb_proc_create(struct dentry *dentry, __u16 attr, time_t ctime, __u16 *fileid); +extern int smb_proc_mv(struct dentry *old_dentry, struct dentry *new_dentry); +extern int smb_proc_mkdir(struct dentry *dentry); +extern int smb_proc_rmdir(struct dentry *dentry); +extern int smb_proc_unlink(struct dentry *dentry); +extern int smb_proc_flush(struct smb_sb_info *server, __u16 fileid); +extern void smb_init_root_dirent(struct smb_sb_info *server, struct smb_fattr *fattr, + struct super_block *sb); +extern int smb_proc_getattr(struct dentry *dir, struct smb_fattr *fattr); +extern int smb_proc_setattr(struct dentry *dir, struct smb_fattr *fattr); +extern int smb_proc_setattr_unix(struct dentry *d, struct iattr *attr, unsigned int major, unsigned int minor); +extern int smb_proc_settime(struct dentry *dentry, struct smb_fattr *fattr); +extern int smb_proc_dskattr(struct dentry *dentry, struct kstatfs *attr); +extern int smb_proc_read_link(struct smb_sb_info *server, struct dentry *d, char *buffer, int len); +extern int smb_proc_symlink(struct smb_sb_info *server, struct dentry *d, const char *oldpath); +extern int smb_proc_link(struct smb_sb_info *server, struct dentry *dentry, struct dentry *new_dentry); +extern void smb_install_null_ops(struct smb_ops *ops); +/* dir.c */ +extern const struct file_operations smb_dir_operations; +extern const struct inode_operations smb_dir_inode_operations; +extern const struct inode_operations smb_dir_inode_operations_unix; +extern void smb_new_dentry(struct dentry *dentry); +extern void smb_renew_times(struct dentry *dentry); +/* cache.c */ +extern void smb_invalid_dir_cache(struct inode *dir); +extern void smb_invalidate_dircache_entries(struct dentry *parent); +extern struct dentry *smb_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos); +extern int smb_fill_cache(struct file *filp, void *dirent, filldir_t filldir, struct smb_cache_control *ctrl, struct qstr *qname, struct smb_fattr *entry); +/* sock.c */ +extern void smb_data_ready(struct sock *sk, int len); +extern int smb_valid_socket(struct inode *inode); +extern void smb_close_socket(struct smb_sb_info *server); +extern int smb_recv_available(struct smb_sb_info *server); +extern int 
smb_receive_header(struct smb_sb_info *server); +extern int smb_receive_drop(struct smb_sb_info *server); +extern int smb_receive(struct smb_sb_info *server, struct smb_request *req); +extern int smb_send_request(struct smb_request *req); +/* inode.c */ +extern struct inode *smb_iget(struct super_block *sb, struct smb_fattr *fattr); +extern void smb_get_inode_attr(struct inode *inode, struct smb_fattr *fattr); +extern void smb_set_inode_attr(struct inode *inode, struct smb_fattr *fattr); +extern void smb_invalidate_inodes(struct smb_sb_info *server); +extern int smb_revalidate_inode(struct dentry *dentry); +extern int smb_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); +extern int smb_notify_change(struct dentry *dentry, struct iattr *attr); +/* file.c */ +extern const struct address_space_operations smb_file_aops; +extern const struct file_operations smb_file_operations; +extern const struct inode_operations smb_file_inode_operations; +/* ioctl.c */ +extern long smb_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); +/* smbiod.c */ +extern void smbiod_wake_up(void); +extern int smbiod_register_server(struct smb_sb_info *server); +extern void smbiod_unregister_server(struct smb_sb_info *server); +extern void smbiod_flush(struct smb_sb_info *server); +extern int smbiod_retry(struct smb_sb_info *server); +/* request.c */ +extern int smb_init_request_cache(void); +extern void smb_destroy_request_cache(void); +extern struct smb_request *smb_alloc_request(struct smb_sb_info *server, int bufsize); +extern void smb_rput(struct smb_request *req); +extern int smb_add_request(struct smb_request *req); +extern int smb_request_send_server(struct smb_sb_info *server); +extern int smb_request_recv(struct smb_sb_info *server); +/* symlink.c */ +extern int smb_symlink(struct inode *inode, struct dentry *dentry, const char *oldname); +extern const struct inode_operations smb_link_inode_operations; diff --git a/drivers/staging/smbfs/request.c b/drivers/staging/smbfs/request.c new file mode 100644 index 00000000000..3e771686430 --- /dev/null +++ b/drivers/staging/smbfs/request.c @@ -0,0 +1,817 @@ +/* + * request.c + * + * Copyright (C) 2001 by Urban Widmark + * + * Please add a note about your changes to smbfs in the ChangeLog file. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include "smb_fs.h" +#include "smbno.h" +#include "smb_mount.h" +#include "smb_debug.h" +#include "request.h" +#include "proto.h" + +/* #define SMB_SLAB_DEBUG (SLAB_RED_ZONE | SLAB_POISON) */ +#define SMB_SLAB_DEBUG 0 + +/* cache for request structures */ +static struct kmem_cache *req_cachep; + +static int smb_request_send_req(struct smb_request *req); + +/* + /proc/slabinfo: + name, active, num, objsize, active_slabs, num_slaps, #pages +*/ + + +int smb_init_request_cache(void) +{ + req_cachep = kmem_cache_create("smb_request", + sizeof(struct smb_request), 0, + SMB_SLAB_DEBUG | SLAB_HWCACHE_ALIGN, + NULL); + if (req_cachep == NULL) + return -ENOMEM; + + return 0; +} + +void smb_destroy_request_cache(void) +{ + kmem_cache_destroy(req_cachep); +} + +/* + * Allocate and initialise a request structure + */ +static struct smb_request *smb_do_alloc_request(struct smb_sb_info *server, + int bufsize) +{ + struct smb_request *req; + unsigned char *buf = NULL; + + req = kmem_cache_zalloc(req_cachep, GFP_KERNEL); + VERBOSE("allocating request: %p\n", req); + if (!req) + goto out; + + if (bufsize > 0) { + buf = kmalloc(bufsize, GFP_NOFS); + if (!buf) { + kmem_cache_free(req_cachep, req); + return NULL; + } + } + + req->rq_buffer = buf; + req->rq_bufsize = bufsize; + req->rq_server = server; + init_waitqueue_head(&req->rq_wait); + INIT_LIST_HEAD(&req->rq_queue); + atomic_set(&req->rq_count, 1); + +out: + return req; +} + +struct smb_request *smb_alloc_request(struct smb_sb_info *server, int bufsize) +{ + struct smb_request *req = NULL; + + for (;;) { + atomic_inc(&server->nr_requests); + if (atomic_read(&server->nr_requests) <= MAX_REQUEST_HARD) { + req = smb_do_alloc_request(server, bufsize); + if (req != NULL) + break; + } + +#if 0 + /* + * Try to free up at least one request in order to stay + * below the hard limit + */ + if (nfs_try_to_free_pages(server)) + continue; + + if (fatal_signal_pending(current)) + return ERR_PTR(-ERESTARTSYS); + current->policy = SCHED_YIELD; + schedule(); +#else + /* FIXME: we want something like nfs does above, but that + requires changes to all callers and can wait. */ + break; +#endif + } + return req; +} + +static void smb_free_request(struct smb_request *req) +{ + atomic_dec(&req->rq_server->nr_requests); + if (req->rq_buffer && !(req->rq_flags & SMB_REQ_STATIC)) + kfree(req->rq_buffer); + kfree(req->rq_trans2buffer); + kmem_cache_free(req_cachep, req); +} + +/* + * What prevents a rget to race with a rput? The count must never drop to zero + * while it is in use. Only rput if it is ok that it is free'd. + */ +static void smb_rget(struct smb_request *req) +{ + atomic_inc(&req->rq_count); +} +void smb_rput(struct smb_request *req) +{ + if (atomic_dec_and_test(&req->rq_count)) { + list_del_init(&req->rq_queue); + smb_free_request(req); + } +} + +/* setup to receive the data part of the SMB */ +static int smb_setup_bcc(struct smb_request *req) +{ + int result = 0; + req->rq_rlen = smb_len(req->rq_header) + 4 - req->rq_bytes_recvd; + + if (req->rq_rlen > req->rq_bufsize) { + PARANOIA("Packet too large %d > %d\n", + req->rq_rlen, req->rq_bufsize); + return -ENOBUFS; + } + + req->rq_iov[0].iov_base = req->rq_buffer; + req->rq_iov[0].iov_len = req->rq_rlen; + req->rq_iovlen = 1; + + return result; +} + +/* + * Prepare a "normal" request structure. 
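(Aside: the reference counting around these helpers is easy to miss. smb_do_alloc_request() returns a request with rq_count = 1, smb_add_request() takes a second reference for the xmitq/recvq, which smbiod drops once the reply is matched, and the caller always finishes with smb_rput(). A minimal caller in the style of proc.c's smb_proc_dskattr() would therefore look like the sketch below; SMBecho, its word counts and the expected reply word count are illustrative assumptions, not taken from this patch.)

	static int smb_proc_echo_example(struct smb_sb_info *server)
	{
		struct smb_request *req;
		int result = -ENOMEM;

		req = smb_alloc_request(server, 0);		/* rq_count == 1 (caller's ref) */
		if (!req)
			goto out;

		smb_setup_header(req, SMBecho, 1, 0);		/* assumed command code */
		WSET(req->rq_header, smb_vwv0, 1);		/* one echo, illustrative */

		/* smb_request_ok() queues the request (extra ref for the queues,
		 * dropped by smbiod on completion), waits and checks rq_rcls */
		result = smb_request_ok(req, SMBecho, 1, 0);

		smb_rput(req);					/* drop the caller's ref */
	out:
		return result;
	}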
+ */ +static int smb_setup_request(struct smb_request *req) +{ + int len = smb_len(req->rq_header) + 4; + req->rq_slen = len; + + /* if we expect a data part in the reply we set the iov's to read it */ + if (req->rq_resp_bcc) + req->rq_setup_read = smb_setup_bcc; + + /* This tries to support re-using the same request */ + req->rq_bytes_sent = 0; + req->rq_rcls = 0; + req->rq_err = 0; + req->rq_errno = 0; + req->rq_fragment = 0; + kfree(req->rq_trans2buffer); + req->rq_trans2buffer = NULL; + + return 0; +} + +/* + * Prepare a transaction2 request structure + */ +static int smb_setup_trans2request(struct smb_request *req) +{ + struct smb_sb_info *server = req->rq_server; + int mparam, mdata; + static unsigned char padding[4]; + + /* I know the following is very ugly, but I want to build the + smb packet as efficiently as possible. */ + + const int smb_parameters = 15; + const int header = SMB_HEADER_LEN + 2 * smb_parameters + 2; + const int oparam = ALIGN(header + 3, sizeof(u32)); + const int odata = ALIGN(oparam + req->rq_lparm, sizeof(u32)); + const int bcc = (req->rq_data ? odata + req->rq_ldata : + oparam + req->rq_lparm) - header; + + if ((bcc + oparam) > server->opt.max_xmit) + return -ENOMEM; + smb_setup_header(req, SMBtrans2, smb_parameters, bcc); + + /* + * max parameters + max data + max setup == bufsize to make NT4 happy + * and not abort the transfer or split into multiple responses. It also + * makes smbfs happy as handling packets larger than the buffer size + * is extra work. + * + * OS/2 is probably going to hate me for this ... + */ + mparam = SMB_TRANS2_MAX_PARAM; + mdata = req->rq_bufsize - mparam; + + mdata = server->opt.max_xmit - mparam - 100; + if (mdata < 1024) { + mdata = 1024; + mparam = 20; + } + +#if 0 + /* NT/win2k has ~4k max_xmit, so with this we request more than it wants + to return as one SMB. Useful for testing the fragmented trans2 + handling. */ + mdata = 8192; +#endif + + WSET(req->rq_header, smb_tpscnt, req->rq_lparm); + WSET(req->rq_header, smb_tdscnt, req->rq_ldata); + WSET(req->rq_header, smb_mprcnt, mparam); + WSET(req->rq_header, smb_mdrcnt, mdata); + WSET(req->rq_header, smb_msrcnt, 0); /* max setup always 0 ? */ + WSET(req->rq_header, smb_flags, 0); + DSET(req->rq_header, smb_timeout, 0); + WSET(req->rq_header, smb_pscnt, req->rq_lparm); + WSET(req->rq_header, smb_psoff, oparam - 4); + WSET(req->rq_header, smb_dscnt, req->rq_ldata); + WSET(req->rq_header, smb_dsoff, req->rq_data ? odata - 4 : 0); + *(req->rq_header + smb_suwcnt) = 0x01; /* setup count */ + *(req->rq_header + smb_suwcnt + 1) = 0x00; /* reserved */ + WSET(req->rq_header, smb_setup0, req->rq_trans2_command); + + req->rq_iovlen = 2; + req->rq_iov[0].iov_base = (void *) req->rq_header; + req->rq_iov[0].iov_len = oparam; + req->rq_iov[1].iov_base = (req->rq_parm==NULL) ? 
padding : req->rq_parm; + req->rq_iov[1].iov_len = req->rq_lparm; + req->rq_slen = oparam + req->rq_lparm; + + if (req->rq_data) { + req->rq_iovlen += 2; + req->rq_iov[2].iov_base = padding; + req->rq_iov[2].iov_len = odata - oparam - req->rq_lparm; + req->rq_iov[3].iov_base = req->rq_data; + req->rq_iov[3].iov_len = req->rq_ldata; + req->rq_slen = odata + req->rq_ldata; + } + + /* always a data part for trans2 replies */ + req->rq_setup_read = smb_setup_bcc; + + return 0; +} + +/* + * Add a request and tell smbiod to process it + */ +int smb_add_request(struct smb_request *req) +{ + long timeleft; + struct smb_sb_info *server = req->rq_server; + int result = 0; + + smb_setup_request(req); + if (req->rq_trans2_command) { + if (req->rq_buffer == NULL) { + PARANOIA("trans2 attempted without response buffer!\n"); + return -EIO; + } + result = smb_setup_trans2request(req); + } + if (result < 0) + return result; + +#ifdef SMB_DEBUG_PACKET_SIZE + add_xmit_stats(req); +#endif + + /* add 'req' to the queue of requests */ + if (smb_lock_server_interruptible(server)) + return -EINTR; + + /* + * Try to send the request as the process. If that fails we queue the + * request and let smbiod send it later. + */ + + /* FIXME: each server has a number on the maximum number of parallel + requests. 10, 50 or so. We should not allow more requests to be + active. */ + if (server->mid > 0xf000) + server->mid = 0; + req->rq_mid = server->mid++; + WSET(req->rq_header, smb_mid, req->rq_mid); + + result = 0; + if (server->state == CONN_VALID) { + if (list_empty(&server->xmitq)) + result = smb_request_send_req(req); + if (result < 0) { + /* Connection lost? */ + server->conn_error = result; + server->state = CONN_INVALID; + } + } + if (result != 1) + list_add_tail(&req->rq_queue, &server->xmitq); + smb_rget(req); + + if (server->state != CONN_VALID) + smbiod_retry(server); + + smb_unlock_server(server); + + smbiod_wake_up(); + + timeleft = wait_event_interruptible_timeout(req->rq_wait, + req->rq_flags & SMB_REQ_RECEIVED, 30*HZ); + if (!timeleft || signal_pending(current)) { + /* + * On timeout or on interrupt we want to try and remove the + * request from the recvq/xmitq. + * First check if the request is still part of a queue. (May + * have been removed by some error condition) + */ + smb_lock_server(server); + if (!list_empty(&req->rq_queue)) { + list_del_init(&req->rq_queue); + smb_rput(req); + } + smb_unlock_server(server); + } + + if (!timeleft) { + PARANOIA("request [%p, mid=%d] timed out!\n", + req, req->rq_mid); + VERBOSE("smb_com: %02x\n", *(req->rq_header + smb_com)); + VERBOSE("smb_rcls: %02x\n", *(req->rq_header + smb_rcls)); + VERBOSE("smb_flg: %02x\n", *(req->rq_header + smb_flg)); + VERBOSE("smb_tid: %04x\n", WVAL(req->rq_header, smb_tid)); + VERBOSE("smb_pid: %04x\n", WVAL(req->rq_header, smb_pid)); + VERBOSE("smb_uid: %04x\n", WVAL(req->rq_header, smb_uid)); + VERBOSE("smb_mid: %04x\n", WVAL(req->rq_header, smb_mid)); + VERBOSE("smb_wct: %02x\n", *(req->rq_header + smb_wct)); + + req->rq_rcls = ERRSRV; + req->rq_err = ERRtimeout; + + /* Just in case it was "stuck" */ + smbiod_wake_up(); + } + VERBOSE("woke up, rcls=%d\n", req->rq_rcls); + + if (req->rq_rcls != 0) + req->rq_errno = smb_errno(req); + if (signal_pending(current)) + req->rq_errno = -ERESTARTSYS; + return req->rq_errno; +} + +/* + * Send a request and place it on the recvq if successfully sent. + * Must be called with the server lock held. 
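(Aside: the offset arithmetic in smb_setup_trans2request() above is easier to follow with numbers. Assuming SMB_HEADER_LEN = 37 from smb.h, a 10-byte parameter block and no data part, roughly the shape of a TRANSACT2_QPATHINFO request, the constants work out as:

	smb_parameters = 15 setup words
	header = 37 + 2*15 + 2      = 69	header, 15 setup words and the 2-byte bcc
	oparam = ALIGN(69 + 3, 4)   = 72	parameter bytes, dword aligned
	odata  = ALIGN(72 + 10, 4)  = 84	only used when rq_data != NULL
	bcc    = (72 + 10) - 69     = 13	bytes following the bcc field
	smb_psoff = 72 - 4          = 68	offsets exclude the 4-byte length prefix

The 10-byte figure is just an example; the point is that parameters and data are each padded to a 32-bit boundary before being handed to the iovec.)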
+ */ +static int smb_request_send_req(struct smb_request *req) +{ + struct smb_sb_info *server = req->rq_server; + int result; + + if (req->rq_bytes_sent == 0) { + WSET(req->rq_header, smb_tid, server->opt.tid); + WSET(req->rq_header, smb_pid, 1); + WSET(req->rq_header, smb_uid, server->opt.server_uid); + } + + result = smb_send_request(req); + if (result < 0 && result != -EAGAIN) + goto out; + + result = 0; + if (!(req->rq_flags & SMB_REQ_TRANSMITTED)) + goto out; + + list_move_tail(&req->rq_queue, &server->recvq); + result = 1; +out: + return result; +} + +/* + * Sends one request for this server. (smbiod) + * Must be called with the server lock held. + * Returns: <0 on error + * 0 if no request could be completely sent + * 1 if all data for one request was sent + */ +int smb_request_send_server(struct smb_sb_info *server) +{ + struct list_head *head; + struct smb_request *req; + int result; + + if (server->state != CONN_VALID) + return 0; + + /* dequeue first request, if any */ + req = NULL; + head = server->xmitq.next; + if (head != &server->xmitq) { + req = list_entry(head, struct smb_request, rq_queue); + } + if (!req) + return 0; + + result = smb_request_send_req(req); + if (result < 0) { + server->conn_error = result; + list_move(&req->rq_queue, &server->xmitq); + result = -EIO; + goto out; + } + +out: + return result; +} + +/* + * Try to find a request matching this "mid". Typically the first entry will + * be the matching one. + */ +static struct smb_request *find_request(struct smb_sb_info *server, int mid) +{ + struct list_head *tmp; + struct smb_request *req = NULL; + + list_for_each(tmp, &server->recvq) { + req = list_entry(tmp, struct smb_request, rq_queue); + if (req->rq_mid == mid) { + break; + } + req = NULL; + } + + if (!req) { + VERBOSE("received reply with mid %d but no request!\n", + WVAL(server->header, smb_mid)); + server->rstate = SMB_RECV_DROP; + } + + return req; +} + +/* + * Called when we have read the smb header and believe this is a response. + */ +static int smb_init_request(struct smb_sb_info *server, struct smb_request *req) +{ + int hdrlen, wct; + + memcpy(req->rq_header, server->header, SMB_HEADER_LEN); + + wct = *(req->rq_header + smb_wct); + if (wct > 20) { + PARANOIA("wct too large, %d > 20\n", wct); + server->rstate = SMB_RECV_DROP; + return 0; + } + + req->rq_resp_wct = wct; + hdrlen = SMB_HEADER_LEN + wct*2 + 2; + VERBOSE("header length: %d smb_wct: %2d\n", hdrlen, wct); + + req->rq_bytes_recvd = SMB_HEADER_LEN; + req->rq_rlen = hdrlen; + req->rq_iov[0].iov_base = req->rq_header; + req->rq_iov[0].iov_len = hdrlen; + req->rq_iovlen = 1; + server->rstate = SMB_RECV_PARAM; + +#ifdef SMB_DEBUG_PACKET_SIZE + add_recv_stats(smb_len(server->header)); +#endif + return 0; +} + +/* + * Reads the SMB parameters + */ +static int smb_recv_param(struct smb_sb_info *server, struct smb_request *req) +{ + int result; + + result = smb_receive(server, req); + if (result < 0) + return result; + if (req->rq_bytes_recvd < req->rq_rlen) + return 0; + + VERBOSE("result: %d smb_bcc: %04x\n", result, + WVAL(req->rq_header, SMB_HEADER_LEN + + (*(req->rq_header + smb_wct) * 2))); + + result = 0; + req->rq_iov[0].iov_base = NULL; + req->rq_rlen = 0; + if (req->rq_callback) + req->rq_callback(req); + else if (req->rq_setup_read) + result = req->rq_setup_read(req); + if (result < 0) { + server->rstate = SMB_RECV_DROP; + return result; + } + + server->rstate = req->rq_rlen > 0 ? 
SMB_RECV_DATA : SMB_RECV_END; + + req->rq_bytes_recvd = 0; // recvd out of the iov + + VERBOSE("rlen: %d\n", req->rq_rlen); + if (req->rq_rlen < 0) { + PARANOIA("Parameters read beyond end of packet!\n"); + server->rstate = SMB_RECV_END; + return -EIO; + } + return 0; +} + +/* + * Reads the SMB data + */ +static int smb_recv_data(struct smb_sb_info *server, struct smb_request *req) +{ + int result; + + result = smb_receive(server, req); + if (result < 0) + goto out; + if (req->rq_bytes_recvd < req->rq_rlen) + goto out; + server->rstate = SMB_RECV_END; +out: + VERBOSE("result: %d\n", result); + return result; +} + +/* + * Receive a transaction2 response + * Return: 0 if the response has been fully read + * 1 if there are further "fragments" to read + * <0 if there is an error + */ +static int smb_recv_trans2(struct smb_sb_info *server, struct smb_request *req) +{ + unsigned char *inbuf; + unsigned int parm_disp, parm_offset, parm_count, parm_tot; + unsigned int data_disp, data_offset, data_count, data_tot; + int hdrlen = SMB_HEADER_LEN + req->rq_resp_wct*2 - 2; + + VERBOSE("handling trans2\n"); + + inbuf = req->rq_header; + data_tot = WVAL(inbuf, smb_tdrcnt); + parm_tot = WVAL(inbuf, smb_tprcnt); + parm_disp = WVAL(inbuf, smb_prdisp); + parm_offset = WVAL(inbuf, smb_proff); + parm_count = WVAL(inbuf, smb_prcnt); + data_disp = WVAL(inbuf, smb_drdisp); + data_offset = WVAL(inbuf, smb_droff); + data_count = WVAL(inbuf, smb_drcnt); + + /* Modify offset for the split header/buffer we use */ + if (data_count || data_offset) { + if (unlikely(data_offset < hdrlen)) + goto out_bad_data; + else + data_offset -= hdrlen; + } + if (parm_count || parm_offset) { + if (unlikely(parm_offset < hdrlen)) + goto out_bad_parm; + else + parm_offset -= hdrlen; + } + + if (parm_count == parm_tot && data_count == data_tot) { + /* + * This packet has all the trans2 data. + * + * We setup the request so that this will be the common + * case. It may be a server error to not return a + * response that fits. 
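(Aside: the multi-fragment path below is where the parm/data displacement fields matter. A made-up two-fragment response with parm_tot = 10 and data_tot = 3000 would be reassembled into rq_trans2buffer like this:

	first fragment:  parm_count = 10, parm_disp = 0, data_count = 1460, data_disp = 0
		rq_parm = rq_trans2buffer;  rq_data = rq_trans2buffer + 10;
		memcpy(rq_parm + 0, inbuf + parm_offset, 10);
		memcpy(rq_data + 0, inbuf + data_offset, 1460);
	second fragment: parm_count = 0, data_count = 1540, data_disp = 1460
		memcpy(rq_data + 1460, inbuf + data_offset, 1540);
	done: rq_lparm (10) >= parm_tot and rq_ldata (3000) >= data_tot

The sizes are illustrative only; the code just requires that each fragment's displacement plus count stays within the totals announced by the server.)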
+ */ + VERBOSE("single trans2 response " + "dcnt=%u, pcnt=%u, doff=%u, poff=%u\n", + data_count, parm_count, + data_offset, parm_offset); + req->rq_ldata = data_count; + req->rq_lparm = parm_count; + req->rq_data = req->rq_buffer + data_offset; + req->rq_parm = req->rq_buffer + parm_offset; + if (unlikely(parm_offset + parm_count > req->rq_rlen)) + goto out_bad_parm; + if (unlikely(data_offset + data_count > req->rq_rlen)) + goto out_bad_data; + return 0; + } + + VERBOSE("multi trans2 response " + "frag=%d, dcnt=%u, pcnt=%u, doff=%u, poff=%u\n", + req->rq_fragment, + data_count, parm_count, + data_offset, parm_offset); + + if (!req->rq_fragment) { + int buf_len; + + /* We got the first trans2 fragment */ + req->rq_fragment = 1; + req->rq_total_data = data_tot; + req->rq_total_parm = parm_tot; + req->rq_ldata = 0; + req->rq_lparm = 0; + + buf_len = data_tot + parm_tot; + if (buf_len > SMB_MAX_PACKET_SIZE) + goto out_too_long; + + req->rq_trans2bufsize = buf_len; + req->rq_trans2buffer = kzalloc(buf_len, GFP_NOFS); + if (!req->rq_trans2buffer) + goto out_no_mem; + + req->rq_parm = req->rq_trans2buffer; + req->rq_data = req->rq_trans2buffer + parm_tot; + } else if (unlikely(req->rq_total_data < data_tot || + req->rq_total_parm < parm_tot)) + goto out_data_grew; + + if (unlikely(parm_disp + parm_count > req->rq_total_parm || + parm_offset + parm_count > req->rq_rlen)) + goto out_bad_parm; + if (unlikely(data_disp + data_count > req->rq_total_data || + data_offset + data_count > req->rq_rlen)) + goto out_bad_data; + + inbuf = req->rq_buffer; + memcpy(req->rq_parm + parm_disp, inbuf + parm_offset, parm_count); + memcpy(req->rq_data + data_disp, inbuf + data_offset, data_count); + + req->rq_ldata += data_count; + req->rq_lparm += parm_count; + + /* + * Check whether we've received all of the data. Note that + * we use the packet totals -- total lengths might shrink! + */ + if (req->rq_ldata >= data_tot && req->rq_lparm >= parm_tot) { + req->rq_ldata = data_tot; + req->rq_lparm = parm_tot; + return 0; + } + return 1; + +out_too_long: + printk(KERN_ERR "smb_trans2: data/param too long, data=%u, parm=%u\n", + data_tot, parm_tot); + goto out_EIO; +out_no_mem: + printk(KERN_ERR "smb_trans2: couldn't allocate data area of %d bytes\n", + req->rq_trans2bufsize); + req->rq_errno = -ENOMEM; + goto out; +out_data_grew: + printk(KERN_ERR "smb_trans2: data/params grew!\n"); + goto out_EIO; +out_bad_parm: + printk(KERN_ERR "smb_trans2: invalid parms, disp=%u, cnt=%u, tot=%u, ofs=%u\n", + parm_disp, parm_count, parm_tot, parm_offset); + goto out_EIO; +out_bad_data: + printk(KERN_ERR "smb_trans2: invalid data, disp=%u, cnt=%u, tot=%u, ofs=%u\n", + data_disp, data_count, data_tot, data_offset); +out_EIO: + req->rq_errno = -EIO; +out: + return req->rq_errno; +} + +/* + * State machine for receiving responses. We handle the fact that we can't + * read the full response in one try by having states telling us how much we + * have read. + * + * Must be called with the server lock held (only called from smbiod). 
+ * + * Return: <0 on error + */ +int smb_request_recv(struct smb_sb_info *server) +{ + struct smb_request *req = NULL; + int result = 0; + + if (smb_recv_available(server) <= 0) + return 0; + + VERBOSE("state: %d\n", server->rstate); + switch (server->rstate) { + case SMB_RECV_DROP: + result = smb_receive_drop(server); + if (result < 0) + break; + if (server->rstate == SMB_RECV_DROP) + break; + server->rstate = SMB_RECV_START; + /* fallthrough */ + case SMB_RECV_START: + server->smb_read = 0; + server->rstate = SMB_RECV_HEADER; + /* fallthrough */ + case SMB_RECV_HEADER: + result = smb_receive_header(server); + if (result < 0) + break; + if (server->rstate == SMB_RECV_HEADER) + break; + if (! (*(server->header + smb_flg) & SMB_FLAGS_REPLY) ) { + server->rstate = SMB_RECV_REQUEST; + break; + } + if (server->rstate != SMB_RECV_HCOMPLETE) + break; + /* fallthrough */ + case SMB_RECV_HCOMPLETE: + req = find_request(server, WVAL(server->header, smb_mid)); + if (!req) + break; + smb_init_request(server, req); + req->rq_rcls = *(req->rq_header + smb_rcls); + req->rq_err = WVAL(req->rq_header, smb_err); + if (server->rstate != SMB_RECV_PARAM) + break; + /* fallthrough */ + case SMB_RECV_PARAM: + if (!req) + req = find_request(server,WVAL(server->header,smb_mid)); + if (!req) + break; + result = smb_recv_param(server, req); + if (result < 0) + break; + if (server->rstate != SMB_RECV_DATA) + break; + /* fallthrough */ + case SMB_RECV_DATA: + if (!req) + req = find_request(server,WVAL(server->header,smb_mid)); + if (!req) + break; + result = smb_recv_data(server, req); + if (result < 0) + break; + break; + + /* We should never be called with any of these states */ + case SMB_RECV_END: + case SMB_RECV_REQUEST: + BUG(); + } + + if (result < 0) { + /* We saw an error */ + return result; + } + + if (server->rstate != SMB_RECV_END) + return 0; + + result = 0; + if (req->rq_trans2_command && req->rq_rcls == SUCCESS) + result = smb_recv_trans2(server, req); + + /* + * Response completely read. Drop any extra bytes sent by the server. + * (Yes, servers sometimes add extra bytes to responses) + */ + VERBOSE("smb_len: %d smb_read: %d\n", + server->smb_len, server->smb_read); + if (server->smb_read < server->smb_len) + smb_receive_drop(server); + + server->rstate = SMB_RECV_START; + + if (!result) { + list_del_init(&req->rq_queue); + req->rq_flags |= SMB_REQ_RECEIVED; + smb_rput(req); + wake_up_interruptible(&req->rq_wait); + } + return 0; +} diff --git a/drivers/staging/smbfs/request.h b/drivers/staging/smbfs/request.h new file mode 100644 index 00000000000..efb21451e7c --- /dev/null +++ b/drivers/staging/smbfs/request.h @@ -0,0 +1,70 @@ +#include +#include +#include +#include + +struct smb_request { + struct list_head rq_queue; /* recvq or xmitq for the server */ + + atomic_t rq_count; + + wait_queue_head_t rq_wait; + int rq_flags; + int rq_mid; /* multiplex ID, set by request.c */ + + struct smb_sb_info *rq_server; + + /* header + word count + parameter words + byte count */ + unsigned char rq_header[SMB_HEADER_LEN + 20*2 + 2]; + + int rq_bufsize; + unsigned char *rq_buffer; + + /* FIXME: this is not good enough for merging IO requests. 
*/ + unsigned char *rq_page; + int rq_rsize; + + int rq_resp_wct; + int rq_resp_bcc; + + int rq_rlen; + int rq_bytes_recvd; + + int rq_slen; + int rq_bytes_sent; + + int rq_iovlen; + struct kvec rq_iov[4]; + + int (*rq_setup_read) (struct smb_request *); + void (*rq_callback) (struct smb_request *); + + /* ------ trans2 stuff ------ */ + + u16 rq_trans2_command; /* 0 if not a trans2 request */ + unsigned int rq_ldata; + unsigned char *rq_data; + unsigned int rq_lparm; + unsigned char *rq_parm; + + int rq_fragment; + u32 rq_total_data; + u32 rq_total_parm; + int rq_trans2bufsize; + unsigned char *rq_trans2buffer; + + /* ------ response ------ */ + + unsigned short rq_rcls; + unsigned short rq_err; + int rq_errno; +}; + +#define SMB_REQ_STATIC 0x0001 /* rq_buffer is static */ +#define SMB_REQ_NORETRY 0x0002 /* request is invalid after retry */ + +#define SMB_REQ_TRANSMITTED 0x4000 /* all data has been sent */ +#define SMB_REQ_RECEIVED 0x8000 /* reply received, smbiod is done */ + +#define xSMB_REQ_NOREPLY 0x0004 /* we don't want the reply (if any) */ +#define xSMB_REQ_NORECEIVER 0x0008 /* caller doesn't wait for response */ diff --git a/drivers/staging/smbfs/smb.h b/drivers/staging/smbfs/smb.h new file mode 100644 index 00000000000..82fefddc598 --- /dev/null +++ b/drivers/staging/smbfs/smb.h @@ -0,0 +1,118 @@ +/* + * smb.h + * + * Copyright (C) 1995, 1996 by Paal-Kr. Engstad and Volker Lendecke + * Copyright (C) 1997 by Volker Lendecke + * + */ + +#ifndef _LINUX_SMB_H +#define _LINUX_SMB_H + +#include +#include +#ifdef __KERNEL__ +#include +#endif + +enum smb_protocol { + SMB_PROTOCOL_NONE, + SMB_PROTOCOL_CORE, + SMB_PROTOCOL_COREPLUS, + SMB_PROTOCOL_LANMAN1, + SMB_PROTOCOL_LANMAN2, + SMB_PROTOCOL_NT1 +}; + +enum smb_case_hndl { + SMB_CASE_DEFAULT, + SMB_CASE_LOWER, + SMB_CASE_UPPER +}; + +struct smb_dskattr { + __u16 total; + __u16 allocblocks; + __u16 blocksize; + __u16 free; +}; + +struct smb_conn_opt { + + /* The socket */ + unsigned int fd; + + enum smb_protocol protocol; + enum smb_case_hndl case_handling; + + /* Connection-Options */ + + __u32 max_xmit; + __u16 server_uid; + __u16 tid; + + /* The following are LANMAN 1.0 options */ + __u16 secmode; + __u16 maxmux; + __u16 maxvcs; + __u16 rawmode; + __u32 sesskey; + + /* The following are NT LM 0.12 options */ + __u32 maxraw; + __u32 capabilities; + __s16 serverzone; +}; + +#ifdef __KERNEL__ + +#define SMB_NLS_MAXNAMELEN 20 +struct smb_nls_codepage { + char local_name[SMB_NLS_MAXNAMELEN]; + char remote_name[SMB_NLS_MAXNAMELEN]; +}; + + +#define SMB_MAXNAMELEN 255 +#define SMB_MAXPATHLEN 1024 + +/* + * Contains all relevant data on a SMB networked file. + */ +struct smb_fattr { + __u16 attr; + + unsigned long f_ino; + umode_t f_mode; + nlink_t f_nlink; + uid_t f_uid; + gid_t f_gid; + dev_t f_rdev; + loff_t f_size; + struct timespec f_atime; + struct timespec f_mtime; + struct timespec f_ctime; + unsigned long f_blocks; + int f_unix; +}; + +enum smb_conn_state { + CONN_VALID, /* everything's fine */ + CONN_INVALID, /* Something went wrong, but did not + try to reconnect yet. */ + CONN_RETRIED, /* Tried a reconnection, but was refused */ + CONN_RETRYING /* Currently trying to reconnect */ +}; + +#define SMB_HEADER_LEN 37 /* includes everything up to, but not + * including smb_bcc */ + +#define SMB_INITIAL_PACKET_SIZE 4000 +#define SMB_MAX_PACKET_SIZE 32768 + +/* reserve this much space for trans2 parameters. Shouldn't have to be more + than 10 or so, but OS/2 seems happier like this. 
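(Aside: the fixed request header used throughout request.c is dimensioned from these constants. rq_header[] in request.h is declared as SMB_HEADER_LEN + 20*2 + 2 bytes:

	sizeof(rq_header) = 37 + 40 + 2 = 79 bytes
		37  fixed header through the word count byte (smb_wct at offset 36, smb_vwv0 at 37)
		40  at most 20 parameter words (smb_init_request() drops replies with wct > 20)
		 2  the byte count (bcc) field itself

so a reply header always fits without touching rq_buffer.)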
*/ +#define SMB_TRANS2_MAX_PARAM 64 + +#endif +#endif diff --git a/drivers/staging/smbfs/smb_debug.h b/drivers/staging/smbfs/smb_debug.h new file mode 100644 index 00000000000..fc4b1a5dd75 --- /dev/null +++ b/drivers/staging/smbfs/smb_debug.h @@ -0,0 +1,34 @@ +/* + * Defines some debug macros for smbfs. + */ + +/* This makes a dentry parent/child name pair. Useful for debugging printk's */ +#define DENTRY_PATH(dentry) \ + (dentry)->d_parent->d_name.name,(dentry)->d_name.name + +/* + * safety checks that should never happen ??? + * these are normally enabled. + */ +#ifdef SMBFS_PARANOIA +# define PARANOIA(f, a...) printk(KERN_NOTICE "%s: " f, __func__ , ## a) +#else +# define PARANOIA(f, a...) do { ; } while(0) +#endif + +/* lots of debug messages */ +#ifdef SMBFS_DEBUG_VERBOSE +# define VERBOSE(f, a...) printk(KERN_DEBUG "%s: " f, __func__ , ## a) +#else +# define VERBOSE(f, a...) do { ; } while(0) +#endif + +/* + * "normal" debug messages, but not with a normal DEBUG define ... way + * too common name. + */ +#ifdef SMBFS_DEBUG +#define DEBUG1(f, a...) printk(KERN_DEBUG "%s: " f, __func__ , ## a) +#else +#define DEBUG1(f, a...) do { ; } while(0) +#endif diff --git a/drivers/staging/smbfs/smb_fs.h b/drivers/staging/smbfs/smb_fs.h new file mode 100644 index 00000000000..20a05c188eb --- /dev/null +++ b/drivers/staging/smbfs/smb_fs.h @@ -0,0 +1,153 @@ +/* + * smb_fs.h + * + * Copyright (C) 1995 by Paal-Kr. Engstad and Volker Lendecke + * Copyright (C) 1997 by Volker Lendecke + * + */ + +#ifndef _LINUX_SMB_FS_H +#define _LINUX_SMB_FS_H + +#include "smb.h" + +/* + * ioctl commands + */ +#define SMB_IOC_GETMOUNTUID _IOR('u', 1, __kernel_old_uid_t) +#define SMB_IOC_NEWCONN _IOW('u', 2, struct smb_conn_opt) + +/* __kernel_uid_t can never change, so we have to use __kernel_uid32_t */ +#define SMB_IOC_GETMOUNTUID32 _IOR('u', 3, __kernel_uid32_t) + + +#ifdef __KERNEL__ +#include "smb_fs_i.h" +#include "smb_fs_sb.h" +#include "smb_mount.h" + +#include +#include +#include +#include +#include + +static inline struct smb_sb_info *SMB_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline struct smb_inode_info *SMB_I(struct inode *inode) +{ + return container_of(inode, struct smb_inode_info, vfs_inode); +} + +/* macro names are short for word, double-word, long value (?) */ +#define WVAL(buf, pos) (get_unaligned_le16((u8 *)(buf) + (pos))) +#define DVAL(buf, pos) (get_unaligned_le32((u8 *)(buf) + (pos))) +#define LVAL(buf, pos) (get_unaligned_le64((u8 *)(buf) + (pos))) + +#define WSET(buf, pos, val) put_unaligned_le16((val), (u8 *)(buf) + (pos)) +#define DSET(buf, pos, val) put_unaligned_le32((val), (u8 *)(buf) + (pos)) +#define LSET(buf, pos, val) put_unaligned_le64((val), (u8 *)(buf) + (pos)) + +/* where to find the base of the SMB packet proper */ +#define smb_base(buf) ((u8 *)(((u8 *)(buf))+4)) + +/* + * Flags for the in-memory inode + */ +#define SMB_F_LOCALWRITE 0x02 /* file modified locally */ + + +/* NT1 protocol capability bits */ +#define SMB_CAP_RAW_MODE 0x00000001 +#define SMB_CAP_MPX_MODE 0x00000002 +#define SMB_CAP_UNICODE 0x00000004 +#define SMB_CAP_LARGE_FILES 0x00000008 +#define SMB_CAP_NT_SMBS 0x00000010 +#define SMB_CAP_RPC_REMOTE_APIS 0x00000020 +#define SMB_CAP_STATUS32 0x00000040 +#define SMB_CAP_LEVEL_II_OPLOCKS 0x00000080 +#define SMB_CAP_LOCK_AND_READ 0x00000100 +#define SMB_CAP_NT_FIND 0x00000200 +#define SMB_CAP_DFS 0x00001000 +#define SMB_CAP_LARGE_READX 0x00004000 +#define SMB_CAP_LARGE_WRITEX 0x00008000 +#define SMB_CAP_UNIX 0x00800000 /* unofficial ... 
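(Aside: everything on the wire is little-endian and potentially unaligned, which is what the WVAL/WSET family in smb_fs.h hides. A small self-contained illustration, not part of the driver:

	static void smb_pack_example(void)
	{
		unsigned char buf[8];

		WSET(buf, 0, 0x1234);		/* buf[0..1] = 34 12 */
		DSET(buf, 2, 0xdeadbeef);	/* buf[2..5] = ef be ad de */

		/* reads are symmetric regardless of host byte order or alignment */
		BUG_ON(WVAL(buf, 0) != 0x1234);
		BUG_ON(DVAL(buf, 2) != 0xdeadbeef);
	}

These are the same macros proc.c uses to build parameter blocks, e.g. WSET(param, 0, SMB_QUERY_FILE_UNIX_LINK) in smb_proc_read_link().)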
*/ + + +/* + * This is the time we allow an inode, dentry or dir cache to live. It is bad + * for performance to have shorter ttl on an inode than on the cache. It can + * cause refresh on each inode for a dir listing ... one-by-one + */ +#define SMB_MAX_AGE(server) (((server)->mnt->ttl * HZ) / 1000) + +static inline void +smb_age_dentry(struct smb_sb_info *server, struct dentry *dentry) +{ + dentry->d_time = jiffies - SMB_MAX_AGE(server); +} + +struct smb_cache_head { + time_t mtime; /* unused */ + unsigned long time; /* cache age */ + unsigned long end; /* last valid fpos in cache */ + int eof; +}; + +#define SMB_DIRCACHE_SIZE ((int)(PAGE_CACHE_SIZE/sizeof(struct dentry *))) +union smb_dir_cache { + struct smb_cache_head head; + struct dentry *dentry[SMB_DIRCACHE_SIZE]; +}; + +#define SMB_FIRSTCACHE_SIZE ((int)((SMB_DIRCACHE_SIZE * \ + sizeof(struct dentry *) - sizeof(struct smb_cache_head)) / \ + sizeof(struct dentry *))) + +#define SMB_DIRCACHE_START (SMB_DIRCACHE_SIZE - SMB_FIRSTCACHE_SIZE) + +struct smb_cache_control { + struct smb_cache_head head; + struct page *page; + union smb_dir_cache *cache; + unsigned long fpos, ofs; + int filled, valid, idx; +}; + +#define SMB_OPS_NUM_STATIC 5 +struct smb_ops { + int (*read)(struct inode *inode, loff_t offset, int count, + char *data); + int (*write)(struct inode *inode, loff_t offset, int count, const + char *data); + int (*readdir)(struct file *filp, void *dirent, filldir_t filldir, + struct smb_cache_control *ctl); + + int (*getattr)(struct smb_sb_info *server, struct dentry *dir, + struct smb_fattr *fattr); + /* int (*setattr)(...); */ /* setattr is really icky! */ + + int (*truncate)(struct inode *inode, loff_t length); + + + /* --- --- --- end of "static" entries --- --- --- */ + + int (*convert)(unsigned char *output, int olen, + const unsigned char *input, int ilen, + struct nls_table *nls_from, + struct nls_table *nls_to); +}; + +static inline int +smb_is_open(struct inode *i) +{ + return (SMB_I(i)->open == server_from_inode(i)->generation); +} + +extern void smb_install_null_ops(struct smb_ops *); +#endif /* __KERNEL__ */ + +#endif /* _LINUX_SMB_FS_H */ diff --git a/drivers/staging/smbfs/smb_fs_i.h b/drivers/staging/smbfs/smb_fs_i.h new file mode 100644 index 00000000000..8ccf4eca2c3 --- /dev/null +++ b/drivers/staging/smbfs/smb_fs_i.h @@ -0,0 +1,37 @@ +/* + * smb_fs_i.h + * + * Copyright (C) 1995 by Paal-Kr. Engstad and Volker Lendecke + * Copyright (C) 1997 by Volker Lendecke + * + */ + +#ifndef _LINUX_SMB_FS_I +#define _LINUX_SMB_FS_I + +#include +#include + +/* + * smb fs inode data (in memory only) + */ +struct smb_inode_info { + + /* + * file handles are local to a connection. A file is open if + * (open == generation). + */ + unsigned int open; /* open generation */ + __u16 fileid; /* What id to handle a file with? */ + __u16 attr; /* Attribute fields, DOS value */ + + __u16 access; /* Access mode */ + __u16 flags; + unsigned long oldmtime; /* last time refreshed */ + unsigned long closed; /* timestamp when closed */ + unsigned openers; /* number of fileid users */ + + struct inode vfs_inode; /* must be at the end */ +}; + +#endif diff --git a/drivers/staging/smbfs/smb_fs_sb.h b/drivers/staging/smbfs/smb_fs_sb.h new file mode 100644 index 00000000000..ca058afda90 --- /dev/null +++ b/drivers/staging/smbfs/smb_fs_sb.h @@ -0,0 +1,100 @@ +/* + * smb_fs_sb.h + * + * Copyright (C) 1995 by Paal-Kr. 
Engstad and Volker Lendecke + * Copyright (C) 1997 by Volker Lendecke + * + */ + +#ifndef _SMB_FS_SB +#define _SMB_FS_SB + +#include +#include +#include "smb.h" + +/* + * Upper limit on the total number of active smb_request structs. + */ +#define MAX_REQUEST_HARD 256 + +enum smb_receive_state { + SMB_RECV_START, /* No data read, looking for length + sig */ + SMB_RECV_HEADER, /* Reading the header data */ + SMB_RECV_HCOMPLETE, /* Done with the header */ + SMB_RECV_PARAM, /* Reading parameter words */ + SMB_RECV_DATA, /* Reading data bytes */ + SMB_RECV_END, /* End of request */ + SMB_RECV_DROP, /* Dropping this SMB */ + SMB_RECV_REQUEST, /* Received a request and not a reply */ +}; + +/* structure access macros */ +#define server_from_inode(inode) SMB_SB((inode)->i_sb) +#define server_from_dentry(dentry) SMB_SB((dentry)->d_sb) +#define SB_of(server) ((server)->super_block) + +struct smb_sb_info { + /* List of all smbfs superblocks */ + struct list_head entry; + + enum smb_conn_state state; + struct file * sock_file; + int conn_error; + enum smb_receive_state rstate; + + atomic_t nr_requests; + struct list_head xmitq; + struct list_head recvq; + u16 mid; + + struct smb_mount_data_kernel *mnt; + + /* Connections are counted. Each time a new socket arrives, + * generation is incremented. + */ + unsigned int generation; + struct pid *conn_pid; + struct smb_conn_opt opt; + wait_queue_head_t conn_wq; + int conn_complete; + struct semaphore sem; + + unsigned char header[SMB_HEADER_LEN + 20*2 + 2]; + u32 header_len; + u32 smb_len; + u32 smb_read; + + /* We use our own data_ready callback, but need the original one */ + void *data_ready; + + /* nls pointers for codepage conversions */ + struct nls_table *remote_nls; + struct nls_table *local_nls; + + struct smb_ops *ops; + + struct super_block *super_block; + + struct backing_dev_info bdi; +}; + +static inline int +smb_lock_server_interruptible(struct smb_sb_info *server) +{ + return down_interruptible(&(server->sem)); +} + +static inline void +smb_lock_server(struct smb_sb_info *server) +{ + down(&(server->sem)); +} + +static inline void +smb_unlock_server(struct smb_sb_info *server) +{ + up(&(server->sem)); +} + +#endif diff --git a/drivers/staging/smbfs/smb_mount.h b/drivers/staging/smbfs/smb_mount.h new file mode 100644 index 00000000000..d10f00cb570 --- /dev/null +++ b/drivers/staging/smbfs/smb_mount.h @@ -0,0 +1,65 @@ +/* + * smb_mount.h + * + * Copyright (C) 1995, 1996 by Paal-Kr. Engstad and Volker Lendecke + * Copyright (C) 1997 by Volker Lendecke + * + */ + +#ifndef _LINUX_SMB_MOUNT_H +#define _LINUX_SMB_MOUNT_H + +#include + +#define SMB_MOUNT_VERSION 6 + +struct smb_mount_data { + int version; + __kernel_uid_t mounted_uid; /* Who may umount() this filesystem? 
*/ + __kernel_uid_t uid; + __kernel_gid_t gid; + __kernel_mode_t file_mode; + __kernel_mode_t dir_mode; +}; + + +#ifdef __KERNEL__ + +/* "vers" in big-endian */ +#define SMB_MOUNT_ASCII 0x76657273 + +#define SMB_MOUNT_OLDVERSION 6 +#undef SMB_MOUNT_VERSION +#define SMB_MOUNT_VERSION 7 + +/* flags */ +#define SMB_MOUNT_WIN95 0x0001 /* Win 95 server */ +#define SMB_MOUNT_OLDATTR 0x0002 /* Use core getattr (Win 95 speedup) */ +#define SMB_MOUNT_DIRATTR 0x0004 /* Use find_first for getattr */ +#define SMB_MOUNT_CASE 0x0008 /* Be case sensitive */ +#define SMB_MOUNT_UNICODE 0x0010 /* Server talks unicode */ +#define SMB_MOUNT_UID 0x0020 /* Use user specified uid */ +#define SMB_MOUNT_GID 0x0040 /* Use user specified gid */ +#define SMB_MOUNT_FMODE 0x0080 /* Use user specified file mode */ +#define SMB_MOUNT_DMODE 0x0100 /* Use user specified dir mode */ + +struct smb_mount_data_kernel { + int version; + + uid_t mounted_uid; /* Who may umount() this filesystem? */ + uid_t uid; + gid_t gid; + mode_t file_mode; + mode_t dir_mode; + + u32 flags; + + /* maximum age in jiffies (inode, dentry and dircache) */ + int ttl; + + struct smb_nls_codepage codepage; +}; + +#endif + +#endif diff --git a/drivers/staging/smbfs/smbfs.txt b/drivers/staging/smbfs/smbfs.txt new file mode 100644 index 00000000000..194fb0decd2 --- /dev/null +++ b/drivers/staging/smbfs/smbfs.txt @@ -0,0 +1,8 @@ +Smbfs is a filesystem that implements the SMB protocol, which is the +protocol used by Windows for Workgroups, Windows 95 and Windows NT. +Smbfs was inspired by Samba, the program written by Andrew Tridgell +that turns any Unix host into a file server for DOS or Windows clients. + +Smbfs is a SMB client, but uses parts of samba for its operation. For +more info on samba, including documentation, please go to +http://www.samba.org/ and then on to your nearest mirror. diff --git a/drivers/staging/smbfs/smbiod.c b/drivers/staging/smbfs/smbiod.c new file mode 100644 index 00000000000..ec998920f8d --- /dev/null +++ b/drivers/staging/smbfs/smbiod.c @@ -0,0 +1,343 @@ +/* + * smbiod.c + * + * Copyright (C) 2000, Charles Loep / Corel Corp. 
+ * Copyright (C) 2001, Urban Widmark + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "smb_fs.h" +#include "smbno.h" +#include "smb_mount.h" +#include "smb_debug.h" +#include "request.h" +#include "proto.h" + +enum smbiod_state { + SMBIOD_DEAD, + SMBIOD_STARTING, + SMBIOD_RUNNING, +}; + +static enum smbiod_state smbiod_state = SMBIOD_DEAD; +static struct task_struct *smbiod_thread; +static DECLARE_WAIT_QUEUE_HEAD(smbiod_wait); +static LIST_HEAD(smb_servers); +static DEFINE_SPINLOCK(servers_lock); + +#define SMBIOD_DATA_READY (1<<0) +static unsigned long smbiod_flags; + +static int smbiod(void *); +static int smbiod_start(void); + +/* + * called when there's work for us to do + */ +void smbiod_wake_up(void) +{ + if (smbiod_state == SMBIOD_DEAD) + return; + set_bit(SMBIOD_DATA_READY, &smbiod_flags); + wake_up_interruptible(&smbiod_wait); +} + +/* + * start smbiod if none is running + */ +static int smbiod_start(void) +{ + struct task_struct *tsk; + int err = 0; + + if (smbiod_state != SMBIOD_DEAD) + return 0; + smbiod_state = SMBIOD_STARTING; + __module_get(THIS_MODULE); + spin_unlock(&servers_lock); + tsk = kthread_run(smbiod, NULL, "smbiod"); + if (IS_ERR(tsk)) { + err = PTR_ERR(tsk); + module_put(THIS_MODULE); + } + + spin_lock(&servers_lock); + if (err < 0) { + smbiod_state = SMBIOD_DEAD; + smbiod_thread = NULL; + } else { + smbiod_state = SMBIOD_RUNNING; + smbiod_thread = tsk; + } + return err; +} + +/* + * register a server & start smbiod if necessary + */ +int smbiod_register_server(struct smb_sb_info *server) +{ + int ret; + spin_lock(&servers_lock); + list_add(&server->entry, &smb_servers); + VERBOSE("%p\n", server); + ret = smbiod_start(); + spin_unlock(&servers_lock); + return ret; +} + +/* + * Unregister a server + * Must be called with the server lock held. + */ +void smbiod_unregister_server(struct smb_sb_info *server) +{ + spin_lock(&servers_lock); + list_del_init(&server->entry); + VERBOSE("%p\n", server); + spin_unlock(&servers_lock); + + smbiod_wake_up(); + smbiod_flush(server); +} + +void smbiod_flush(struct smb_sb_info *server) +{ + struct list_head *tmp, *n; + struct smb_request *req; + + list_for_each_safe(tmp, n, &server->xmitq) { + req = list_entry(tmp, struct smb_request, rq_queue); + req->rq_errno = -EIO; + list_del_init(&req->rq_queue); + smb_rput(req); + wake_up_interruptible(&req->rq_wait); + } + list_for_each_safe(tmp, n, &server->recvq) { + req = list_entry(tmp, struct smb_request, rq_queue); + req->rq_errno = -EIO; + list_del_init(&req->rq_queue); + smb_rput(req); + wake_up_interruptible(&req->rq_wait); + } +} + +/* + * Wake up smbmount and make it reconnect to the server. + * This must be called with the server locked. + * + * FIXME: add smbconnect version to this + */ +int smbiod_retry(struct smb_sb_info *server) +{ + struct list_head *head; + struct smb_request *req; + struct pid *pid = get_pid(server->conn_pid); + int result = 0; + + VERBOSE("state: %d\n", server->state); + if (server->state == CONN_VALID || server->state == CONN_RETRYING) + goto out; + + smb_invalidate_inodes(server); + + /* + * Some requests are meaningless after a retry, so we abort them. + * One example are all requests using 'fileid' since the files are + * closed on retry. 
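(Aside: the flag doing the work here is SMB_REQ_NORETRY from request.h. Requests that embed a fileid set it so that, after a reconnect, they are failed with -EIO instead of being replayed against a handle that no longer exists; smb_proc_setattr_ext() earlier in this patch is one such caller:

	WSET(req->rq_header, smb_vwv0, SMB_I(inode)->fileid);
	...
	req->rq_flags |= SMB_REQ_NORETRY;
	result = smb_request_ok(req, SMBsetattrE, 0, 0);

The xmitq walk below aborts exactly those requests; everything still sitting on the recvq is aborted unconditionally.)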
+ */ + head = server->xmitq.next; + while (head != &server->xmitq) { + req = list_entry(head, struct smb_request, rq_queue); + head = head->next; + + req->rq_bytes_sent = 0; + if (req->rq_flags & SMB_REQ_NORETRY) { + VERBOSE("aborting request %p on xmitq\n", req); + req->rq_errno = -EIO; + list_del_init(&req->rq_queue); + smb_rput(req); + wake_up_interruptible(&req->rq_wait); + } + } + + /* + * FIXME: test the code for retrying request we already sent + */ + head = server->recvq.next; + while (head != &server->recvq) { + req = list_entry(head, struct smb_request, rq_queue); + head = head->next; +#if 0 + if (req->rq_flags & SMB_REQ_RETRY) { + /* must move the request to the xmitq */ + VERBOSE("retrying request %p on recvq\n", req); + list_move(&req->rq_queue, &server->xmitq); + continue; + } +#endif + + VERBOSE("aborting request %p on recvq\n", req); + /* req->rq_rcls = ???; */ /* FIXME: set smb error code too? */ + req->rq_errno = -EIO; + list_del_init(&req->rq_queue); + smb_rput(req); + wake_up_interruptible(&req->rq_wait); + } + + smb_close_socket(server); + + if (!pid) { + /* FIXME: this is fatal, umount? */ + printk(KERN_ERR "smb_retry: no connection process\n"); + server->state = CONN_RETRIED; + goto out; + } + + /* + * Change state so that only one retry per server will be started. + */ + server->state = CONN_RETRYING; + + /* + * Note: use the "priv" flag, as a user process may need to reconnect. + */ + result = kill_pid(pid, SIGUSR1, 1); + if (result) { + /* FIXME: this is most likely fatal, umount? */ + printk(KERN_ERR "smb_retry: signal failed [%d]\n", result); + goto out; + } + VERBOSE("signalled pid %d\n", pid_nr(pid)); + + /* FIXME: The retried requests should perhaps get a "time boost". */ + +out: + put_pid(pid); + return result; +} + +/* + * Currently handles lockingX packets. + */ +static void smbiod_handle_request(struct smb_sb_info *server) +{ + PARANOIA("smbiod got a request ... and we don't implement oplocks!\n"); + server->rstate = SMB_RECV_DROP; +} + +/* + * Do some IO for one server. + */ +static void smbiod_doio(struct smb_sb_info *server) +{ + int result; + int maxwork = 7; + + if (server->state != CONN_VALID) + goto out; + + do { + result = smb_request_recv(server); + if (result < 0) { + server->state = CONN_INVALID; + smbiod_retry(server); + goto out; /* reconnecting is slow */ + } else if (server->rstate == SMB_RECV_REQUEST) + smbiod_handle_request(server); + } while (result > 0 && maxwork-- > 0); + + /* + * If there is more to read then we want to be sure to wake up again. + */ + if (server->state != CONN_VALID) + goto out; + if (smb_recv_available(server) > 0) + set_bit(SMBIOD_DATA_READY, &smbiod_flags); + + do { + result = smb_request_send_server(server); + if (result < 0) { + server->state = CONN_INVALID; + smbiod_retry(server); + goto out; /* reconnecting is slow */ + } + } while (result > 0); + + /* + * If the last request was not sent out we want to wake up again. + */ + if (!list_empty(&server->xmitq)) + set_bit(SMBIOD_DATA_READY, &smbiod_flags); + +out: + return; +} + +/* + * smbiod kernel thread + */ +static int smbiod(void *unused) +{ + VERBOSE("SMB Kernel thread starting (%d) ...\n", current->pid); + + for (;;) { + struct smb_sb_info *server; + struct list_head *pos, *n; + + /* FIXME: Use poll? 
*/ + wait_event_interruptible(smbiod_wait, + test_bit(SMBIOD_DATA_READY, &smbiod_flags)); + if (signal_pending(current)) { + spin_lock(&servers_lock); + smbiod_state = SMBIOD_DEAD; + spin_unlock(&servers_lock); + break; + } + + clear_bit(SMBIOD_DATA_READY, &smbiod_flags); + + spin_lock(&servers_lock); + if (list_empty(&smb_servers)) { + smbiod_state = SMBIOD_DEAD; + spin_unlock(&servers_lock); + break; + } + + list_for_each_safe(pos, n, &smb_servers) { + server = list_entry(pos, struct smb_sb_info, entry); + VERBOSE("checking server %p\n", server); + + if (server->state == CONN_VALID) { + spin_unlock(&servers_lock); + + smb_lock_server(server); + smbiod_doio(server); + smb_unlock_server(server); + + spin_lock(&servers_lock); + } + } + spin_unlock(&servers_lock); + } + + VERBOSE("SMB Kernel thread exiting (%d) ...\n", current->pid); + module_put_and_exit(0); +} diff --git a/drivers/staging/smbfs/smbno.h b/drivers/staging/smbfs/smbno.h new file mode 100644 index 00000000000..f99e02d9ffe --- /dev/null +++ b/drivers/staging/smbfs/smbno.h @@ -0,0 +1,363 @@ +#ifndef _SMBNO_H_ +#define _SMBNO_H_ + +/* these define the attribute byte as seen by DOS */ +#define aRONLY (1L<<0) +#define aHIDDEN (1L<<1) +#define aSYSTEM (1L<<2) +#define aVOLID (1L<<3) +#define aDIR (1L<<4) +#define aARCH (1L<<5) + +/* error classes */ +#define SUCCESS 0 /* The request was successful. */ +#define ERRDOS 0x01 /* Error is from the core DOS operating system set. */ +#define ERRSRV 0x02 /* Error is generated by the server network file manager.*/ +#define ERRHRD 0x03 /* Error is an hardware error. */ +#define ERRCMD 0xFF /* Command was not in the "SMB" format. */ + +/* SMB X/Open error codes for the ERRdos error class */ + +#define ERRbadfunc 1 /* Invalid function (or system call) */ +#define ERRbadfile 2 /* File not found (pathname error) */ +#define ERRbadpath 3 /* Directory not found */ +#define ERRnofids 4 /* Too many open files */ +#define ERRnoaccess 5 /* Access denied */ +#define ERRbadfid 6 /* Invalid fid */ +#define ERRbadmcb 7 /* Memory control blocks destroyed */ +#define ERRnomem 8 /* Out of memory */ +#define ERRbadmem 9 /* Invalid memory block address */ +#define ERRbadenv 10 /* Invalid environment */ +#define ERRbadformat 11 /* Invalid format */ +#define ERRbadaccess 12 /* Invalid open mode */ +#define ERRbaddata 13 /* Invalid data (only from ioctl call) */ +#define ERRres 14 /* reserved */ +#define ERRbaddrive 15 /* Invalid drive */ +#define ERRremcd 16 /* Attempt to delete current directory */ +#define ERRdiffdevice 17 /* rename/move across different filesystems */ +#define ERRnofiles 18 /* no more files found in file search */ +#define ERRbadshare 32 /* Share mode on file conflict with open mode */ +#define ERRlock 33 /* Lock request conflicts with existing lock */ +#define ERRfilexists 80 /* File in operation already exists */ +#define ERRbadpipe 230 /* Named pipe invalid */ +#define ERRpipebusy 231 /* All instances of pipe are busy */ +#define ERRpipeclosing 232 /* named pipe close in progress */ +#define ERRnotconnected 233 /* No process on other end of named pipe */ +#define ERRmoredata 234 /* More data to be returned */ + +#define ERROR_INVALID_PARAMETER 87 +#define ERROR_DISK_FULL 112 +#define ERROR_INVALID_NAME 123 +#define ERROR_DIR_NOT_EMPTY 145 +#define ERROR_NOT_LOCKED 158 +#define ERROR_ALREADY_EXISTS 183 /* see also 80 ? 
*/ +#define ERROR_EAS_DIDNT_FIT 275 /* Extended attributes didn't fit */ +#define ERROR_EAS_NOT_SUPPORTED 282 /* Extended attributes not supported */ + +/* Error codes for the ERRSRV class */ + +#define ERRerror 1 /* Non specific error code */ +#define ERRbadpw 2 /* Bad password */ +#define ERRbadtype 3 /* reserved */ +#define ERRaccess 4 /* No permissions to do the requested operation */ +#define ERRinvnid 5 /* tid invalid */ +#define ERRinvnetname 6 /* Invalid servername */ +#define ERRinvdevice 7 /* Invalid device */ +#define ERRqfull 49 /* Print queue full */ +#define ERRqtoobig 50 /* Queued item too big */ +#define ERRinvpfid 52 /* Invalid print file in smb_fid */ +#define ERRsmbcmd 64 /* Unrecognised command */ +#define ERRsrverror 65 /* smb server internal error */ +#define ERRfilespecs 67 /* fid and pathname invalid combination */ +#define ERRbadlink 68 /* reserved */ +#define ERRbadpermits 69 /* Access specified for a file is not valid */ +#define ERRbadpid 70 /* reserved */ +#define ERRsetattrmode 71 /* attribute mode invalid */ +#define ERRpaused 81 /* Message server paused */ +#define ERRmsgoff 82 /* Not receiving messages */ +#define ERRnoroom 83 /* No room for message */ +#define ERRrmuns 87 /* too many remote usernames */ +#define ERRtimeout 88 /* operation timed out */ +#define ERRnoresource 89 /* No resources currently available for request. */ +#define ERRtoomanyuids 90 /* too many userids */ +#define ERRbaduid 91 /* bad userid */ +#define ERRuseMPX 250 /* temporarily unable to use raw mode, use MPX mode */ +#define ERRuseSTD 251 /* temporarily unable to use raw mode, use std.mode */ +#define ERRcontMPX 252 /* resume MPX mode */ +#define ERRbadPW /* reserved */ +#define ERRnosupport 0xFFFF + +/* Error codes for the ERRHRD class */ + +#define ERRnowrite 19 /* read only media */ +#define ERRbadunit 20 /* Unknown device */ +#define ERRnotready 21 /* Drive not ready */ +#define ERRbadcmd 22 /* Unknown command */ +#define ERRdata 23 /* Data (CRC) error */ +#define ERRbadreq 24 /* Bad request structure length */ +#define ERRseek 25 +#define ERRbadmedia 26 +#define ERRbadsector 27 +#define ERRnopaper 28 +#define ERRwrite 29 /* write fault */ +#define ERRread 30 /* read fault */ +#define ERRgeneral 31 /* General hardware failure */ +#define ERRwrongdisk 34 +#define ERRFCBunavail 35 +#define ERRsharebufexc 36 /* share buffer exceeded */ +#define ERRdiskfull 39 + +/* + * Access modes when opening a file + */ +#define SMB_ACCMASK 0x0003 +#define SMB_O_RDONLY 0x0000 +#define SMB_O_WRONLY 0x0001 +#define SMB_O_RDWR 0x0002 + +/* offsets into message for common items */ +#define smb_com 8 +#define smb_rcls 9 +#define smb_reh 10 +#define smb_err 11 +#define smb_flg 13 +#define smb_flg2 14 +#define smb_reb 13 +#define smb_tid 28 +#define smb_pid 30 +#define smb_uid 32 +#define smb_mid 34 +#define smb_wct 36 +#define smb_vwv 37 +#define smb_vwv0 37 +#define smb_vwv1 39 +#define smb_vwv2 41 +#define smb_vwv3 43 +#define smb_vwv4 45 +#define smb_vwv5 47 +#define smb_vwv6 49 +#define smb_vwv7 51 +#define smb_vwv8 53 +#define smb_vwv9 55 +#define smb_vwv10 57 +#define smb_vwv11 59 +#define smb_vwv12 61 +#define smb_vwv13 63 +#define smb_vwv14 65 + +/* these are the trans2 sub fields for primary requests */ +#define smb_tpscnt smb_vwv0 +#define smb_tdscnt smb_vwv1 +#define smb_mprcnt smb_vwv2 +#define smb_mdrcnt smb_vwv3 +#define smb_msrcnt smb_vwv4 +#define smb_flags smb_vwv5 +#define smb_timeout smb_vwv6 +#define smb_pscnt smb_vwv9 +#define smb_psoff smb_vwv10 +#define smb_dscnt smb_vwv11 
+#define smb_dsoff smb_vwv12 +#define smb_suwcnt smb_vwv13 +#define smb_setup smb_vwv14 +#define smb_setup0 smb_setup +#define smb_setup1 (smb_setup+2) +#define smb_setup2 (smb_setup+4) + +/* these are for the secondary requests */ +#define smb_spscnt smb_vwv2 +#define smb_spsoff smb_vwv3 +#define smb_spsdisp smb_vwv4 +#define smb_sdscnt smb_vwv5 +#define smb_sdsoff smb_vwv6 +#define smb_sdsdisp smb_vwv7 +#define smb_sfid smb_vwv8 + +/* and these for responses */ +#define smb_tprcnt smb_vwv0 +#define smb_tdrcnt smb_vwv1 +#define smb_prcnt smb_vwv3 +#define smb_proff smb_vwv4 +#define smb_prdisp smb_vwv5 +#define smb_drcnt smb_vwv6 +#define smb_droff smb_vwv7 +#define smb_drdisp smb_vwv8 + +/* the complete */ +#define SMBmkdir 0x00 /* create directory */ +#define SMBrmdir 0x01 /* delete directory */ +#define SMBopen 0x02 /* open file */ +#define SMBcreate 0x03 /* create file */ +#define SMBclose 0x04 /* close file */ +#define SMBflush 0x05 /* flush file */ +#define SMBunlink 0x06 /* delete file */ +#define SMBmv 0x07 /* rename file */ +#define SMBgetatr 0x08 /* get file attributes */ +#define SMBsetatr 0x09 /* set file attributes */ +#define SMBread 0x0A /* read from file */ +#define SMBwrite 0x0B /* write to file */ +#define SMBlock 0x0C /* lock byte range */ +#define SMBunlock 0x0D /* unlock byte range */ +#define SMBctemp 0x0E /* create temporary file */ +#define SMBmknew 0x0F /* make new file */ +#define SMBchkpth 0x10 /* check directory path */ +#define SMBexit 0x11 /* process exit */ +#define SMBlseek 0x12 /* seek */ +#define SMBtcon 0x70 /* tree connect */ +#define SMBtconX 0x75 /* tree connect and X*/ +#define SMBtdis 0x71 /* tree disconnect */ +#define SMBnegprot 0x72 /* negotiate protocol */ +#define SMBdskattr 0x80 /* get disk attributes */ +#define SMBsearch 0x81 /* search directory */ +#define SMBsplopen 0xC0 /* open print spool file */ +#define SMBsplwr 0xC1 /* write to print spool file */ +#define SMBsplclose 0xC2 /* close print spool file */ +#define SMBsplretq 0xC3 /* return print queue */ +#define SMBsends 0xD0 /* send single block message */ +#define SMBsendb 0xD1 /* send broadcast message */ +#define SMBfwdname 0xD2 /* forward user name */ +#define SMBcancelf 0xD3 /* cancel forward */ +#define SMBgetmac 0xD4 /* get machine name */ +#define SMBsendstrt 0xD5 /* send start of multi-block message */ +#define SMBsendend 0xD6 /* send end of multi-block message */ +#define SMBsendtxt 0xD7 /* send text of multi-block message */ + +/* Core+ protocol */ +#define SMBlockread 0x13 /* Lock a range and read */ +#define SMBwriteunlock 0x14 /* Unlock a range then write */ +#define SMBreadbraw 0x1a /* read a block of data with no smb header */ +#define SMBwritebraw 0x1d /* write a block of data with no smb header */ +#define SMBwritec 0x20 /* secondary write request */ +#define SMBwriteclose 0x2c /* write a file then close it */ + +/* dos extended protocol */ +#define SMBreadBraw 0x1A /* read block raw */ +#define SMBreadBmpx 0x1B /* read block multiplexed */ +#define SMBreadBs 0x1C /* read block (secondary response) */ +#define SMBwriteBraw 0x1D /* write block raw */ +#define SMBwriteBmpx 0x1E /* write block multiplexed */ +#define SMBwriteBs 0x1F /* write block (secondary request) */ +#define SMBwriteC 0x20 /* write complete response */ +#define SMBsetattrE 0x22 /* set file attributes expanded */ +#define SMBgetattrE 0x23 /* get file attributes expanded */ +#define SMBlockingX 0x24 /* lock/unlock byte ranges and X */ +#define SMBtrans 0x25 /* transaction - name, bytes in/out */ 
+#define SMBtranss 0x26 /* transaction (secondary request/response) */ +#define SMBioctl 0x27 /* IOCTL */ +#define SMBioctls 0x28 /* IOCTL (secondary request/response) */ +#define SMBcopy 0x29 /* copy */ +#define SMBmove 0x2A /* move */ +#define SMBecho 0x2B /* echo */ +#define SMBopenX 0x2D /* open and X */ +#define SMBreadX 0x2E /* read and X */ +#define SMBwriteX 0x2F /* write and X */ +#define SMBsesssetupX 0x73 /* Session Set Up & X (including User Logon) */ +#define SMBtconX 0x75 /* tree connect and X */ +#define SMBffirst 0x82 /* find first */ +#define SMBfunique 0x83 /* find unique */ +#define SMBfclose 0x84 /* find close */ +#define SMBinvalid 0xFE /* invalid command */ + + +/* Extended 2.0 protocol */ +#define SMBtrans2 0x32 /* TRANS2 protocol set */ +#define SMBtranss2 0x33 /* TRANS2 protocol set, secondary command */ +#define SMBfindclose 0x34 /* Terminate a TRANSACT2_FINDFIRST */ +#define SMBfindnclose 0x35 /* Terminate a TRANSACT2_FINDNOTIFYFIRST */ +#define SMBulogoffX 0x74 /* user logoff */ + +/* these are the TRANS2 sub commands */ +#define TRANSACT2_OPEN 0 +#define TRANSACT2_FINDFIRST 1 +#define TRANSACT2_FINDNEXT 2 +#define TRANSACT2_QFSINFO 3 +#define TRANSACT2_SETFSINFO 4 +#define TRANSACT2_QPATHINFO 5 +#define TRANSACT2_SETPATHINFO 6 +#define TRANSACT2_QFILEINFO 7 +#define TRANSACT2_SETFILEINFO 8 +#define TRANSACT2_FSCTL 9 +#define TRANSACT2_IOCTL 10 +#define TRANSACT2_FINDNOTIFYFIRST 11 +#define TRANSACT2_FINDNOTIFYNEXT 12 +#define TRANSACT2_MKDIR 13 + +/* Information Levels - Shared? */ +#define SMB_INFO_STANDARD 1 +#define SMB_INFO_QUERY_EA_SIZE 2 +#define SMB_INFO_QUERY_EAS_FROM_LIST 3 +#define SMB_INFO_QUERY_ALL_EAS 4 +#define SMB_INFO_IS_NAME_VALID 6 + +/* Information Levels - TRANSACT2_FINDFIRST */ +#define SMB_FIND_FILE_DIRECTORY_INFO 0x101 +#define SMB_FIND_FILE_FULL_DIRECTORY_INFO 0x102 +#define SMB_FIND_FILE_NAMES_INFO 0x103 +#define SMB_FIND_FILE_BOTH_DIRECTORY_INFO 0x104 + +/* Information Levels - TRANSACT2_QPATHINFO */ +#define SMB_QUERY_FILE_BASIC_INFO 0x101 +#define SMB_QUERY_FILE_STANDARD_INFO 0x102 +#define SMB_QUERY_FILE_EA_INFO 0x103 +#define SMB_QUERY_FILE_NAME_INFO 0x104 +#define SMB_QUERY_FILE_ALL_INFO 0x107 +#define SMB_QUERY_FILE_ALT_NAME_INFO 0x108 +#define SMB_QUERY_FILE_STREAM_INFO 0x109 +#define SMB_QUERY_FILE_COMPRESSION_INFO 0x10b + +/* Information Levels - TRANSACT2_SETFILEINFO */ +#define SMB_SET_FILE_BASIC_INFO 0x101 +#define SMB_SET_FILE_DISPOSITION_INFO 0x102 +#define SMB_SET_FILE_ALLOCATION_INFO 0x103 +#define SMB_SET_FILE_END_OF_FILE_INFO 0x104 + +/* smb_flg field flags */ +#define SMB_FLAGS_SUPPORT_LOCKREAD 0x01 +#define SMB_FLAGS_CLIENT_BUF_AVAIL 0x02 +#define SMB_FLAGS_RESERVED 0x04 +#define SMB_FLAGS_CASELESS_PATHNAMES 0x08 +#define SMB_FLAGS_CANONICAL_PATHNAMES 0x10 +#define SMB_FLAGS_REQUEST_OPLOCK 0x20 +#define SMB_FLAGS_REQUEST_BATCH_OPLOCK 0x40 +#define SMB_FLAGS_REPLY 0x80 + +/* smb_flg2 field flags (samba-2.2.0/source/include/smb.h) */ +#define SMB_FLAGS2_LONG_PATH_COMPONENTS 0x0001 +#define SMB_FLAGS2_EXTENDED_ATTRIBUTES 0x0002 +#define SMB_FLAGS2_DFS_PATHNAMES 0x1000 +#define SMB_FLAGS2_READ_PERMIT_NO_EXECUTE 0x2000 +#define SMB_FLAGS2_32_BIT_ERROR_CODES 0x4000 +#define SMB_FLAGS2_UNICODE_STRINGS 0x8000 + + +/* + * UNIX stuff (from samba trans2.h) + */ +#define MIN_UNIX_INFO_LEVEL 0x200 +#define MAX_UNIX_INFO_LEVEL 0x2FF +#define SMB_FIND_FILE_UNIX 0x202 +#define SMB_QUERY_FILE_UNIX_BASIC 0x200 +#define SMB_QUERY_FILE_UNIX_LINK 0x201 +#define SMB_QUERY_FILE_UNIX_HLINK 0x202 +#define SMB_SET_FILE_UNIX_BASIC 0x200 
+#define SMB_SET_FILE_UNIX_LINK 0x201 +#define SMB_SET_FILE_UNIX_HLINK 0x203 +#define SMB_QUERY_CIFS_UNIX_INFO 0x200 + +/* values which means "don't change it" */ +#define SMB_MODE_NO_CHANGE 0xFFFFFFFF +#define SMB_UID_NO_CHANGE 0xFFFFFFFF +#define SMB_GID_NO_CHANGE 0xFFFFFFFF +#define SMB_TIME_NO_CHANGE 0xFFFFFFFFFFFFFFFFULL +#define SMB_SIZE_NO_CHANGE 0xFFFFFFFFFFFFFFFFULL + +/* UNIX filetype mappings. */ +#define UNIX_TYPE_FILE 0 +#define UNIX_TYPE_DIR 1 +#define UNIX_TYPE_SYMLINK 2 +#define UNIX_TYPE_CHARDEV 3 +#define UNIX_TYPE_BLKDEV 4 +#define UNIX_TYPE_FIFO 5 +#define UNIX_TYPE_SOCKET 6 +#define UNIX_TYPE_UNKNOWN 0xFFFFFFFF + +#endif /* _SMBNO_H_ */ diff --git a/drivers/staging/smbfs/sock.c b/drivers/staging/smbfs/sock.c new file mode 100644 index 00000000000..9e264090e61 --- /dev/null +++ b/drivers/staging/smbfs/sock.c @@ -0,0 +1,385 @@ +/* + * sock.c + * + * Copyright (C) 1995, 1996 by Paal-Kr. Engstad and Volker Lendecke + * Copyright (C) 1997 by Volker Lendecke + * + * Please add a note about your changes to smbfs in the ChangeLog file. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "smb_fs.h" +#include "smb.h" +#include "smbno.h" +#include "smb_debug.h" +#include "proto.h" +#include "request.h" + + +static int +_recvfrom(struct socket *socket, unsigned char *ubuf, int size, unsigned flags) +{ + struct kvec iov = {ubuf, size}; + struct msghdr msg = {.msg_flags = flags}; + msg.msg_flags |= MSG_DONTWAIT | MSG_NOSIGNAL; + return kernel_recvmsg(socket, &msg, &iov, 1, size, msg.msg_flags); +} + +/* + * Return the server this socket belongs to + */ +static struct smb_sb_info * +server_from_socket(struct socket *socket) +{ + return socket->sk->sk_user_data; +} + +/* + * Called when there is data on the socket. 
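+ * Runs as the socket's sk_data_ready callback: it first calls the
+ * previous callback kept in server->data_ready (smb_close_socket puts
+ * that one back) and then wakes smbiod to do the actual receive.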
+ */ +void +smb_data_ready(struct sock *sk, int len) +{ + struct smb_sb_info *server = server_from_socket(sk->sk_socket); + void (*data_ready)(struct sock *, int) = server->data_ready; + + data_ready(sk, len); + VERBOSE("(%p, %d)\n", sk, len); + smbiod_wake_up(); +} + +int +smb_valid_socket(struct inode * inode) +{ + return (inode && S_ISSOCK(inode->i_mode) && + SOCKET_I(inode)->type == SOCK_STREAM); +} + +static struct socket * +server_sock(struct smb_sb_info *server) +{ + struct file *file; + + if (server && (file = server->sock_file)) + { +#ifdef SMBFS_PARANOIA + if (!smb_valid_socket(file->f_path.dentry->d_inode)) + PARANOIA("bad socket!\n"); +#endif + return SOCKET_I(file->f_path.dentry->d_inode); + } + return NULL; +} + +void +smb_close_socket(struct smb_sb_info *server) +{ + struct file * file = server->sock_file; + + if (file) { + struct socket *sock = server_sock(server); + + VERBOSE("closing socket %p\n", sock); + sock->sk->sk_data_ready = server->data_ready; + server->sock_file = NULL; + fput(file); + } +} + +static int +smb_get_length(struct socket *socket, unsigned char *header) +{ + int result; + + result = _recvfrom(socket, header, 4, MSG_PEEK); + if (result == -EAGAIN) + return -ENODATA; + if (result < 0) { + PARANOIA("recv error = %d\n", -result); + return result; + } + if (result < 4) + return -ENODATA; + + switch (header[0]) { + case 0x00: + case 0x82: + break; + + case 0x85: + DEBUG1("Got SESSION KEEP ALIVE\n"); + _recvfrom(socket, header, 4, 0); /* read away */ + return -ENODATA; + + default: + PARANOIA("Invalid NBT packet, code=%x\n", header[0]); + return -EIO; + } + + /* The length in the RFC NB header is the raw data length */ + return smb_len(header); +} + +int +smb_recv_available(struct smb_sb_info *server) +{ + mm_segment_t oldfs; + int avail, err; + struct socket *sock = server_sock(server); + + oldfs = get_fs(); + set_fs(get_ds()); + err = sock->ops->ioctl(sock, SIOCINQ, (unsigned long) &avail); + set_fs(oldfs); + return (err >= 0) ? avail : err; +} + +/* + * Adjust the kvec to move on 'n' bytes (from nfs/sunrpc) + */ +static int +smb_move_iov(struct kvec **data, size_t *num, struct kvec *vec, unsigned amount) +{ + struct kvec *iv = *data; + int i; + int len; + + /* + * Eat any sent kvecs + */ + while (iv->iov_len <= amount) { + amount -= iv->iov_len; + iv++; + (*num)--; + } + + /* + * And chew down the partial one + */ + vec[0].iov_len = iv->iov_len-amount; + vec[0].iov_base =((unsigned char *)iv->iov_base)+amount; + iv++; + + len = vec[0].iov_len; + + /* + * And copy any others + */ + for (i = 1; i < *num; i++) { + vec[i] = *iv++; + len += vec[i].iov_len; + } + + *data = vec; + return len; +} + +/* + * smb_receive_header + * Only called by the smbiod thread. 
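+ *
+ * Peeks at the 4 byte NetBIOS session header to learn the packet
+ * length, then pulls in as much of the SMB header as the non-blocking
+ * socket will give us; rstate becomes SMB_RECV_HCOMPLETE once the
+ * whole header has arrived.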
+ */ +int +smb_receive_header(struct smb_sb_info *server) +{ + struct socket *sock; + int result = 0; + unsigned char peek_buf[4]; + + result = -EIO; + sock = server_sock(server); + if (!sock) + goto out; + if (sock->sk->sk_state != TCP_ESTABLISHED) + goto out; + + if (!server->smb_read) { + result = smb_get_length(sock, peek_buf); + if (result < 0) { + if (result == -ENODATA) + result = 0; + goto out; + } + server->smb_len = result + 4; + + if (server->smb_len < SMB_HEADER_LEN) { + PARANOIA("short packet: %d\n", result); + server->rstate = SMB_RECV_DROP; + result = -EIO; + goto out; + } + if (server->smb_len > SMB_MAX_PACKET_SIZE) { + PARANOIA("long packet: %d\n", result); + server->rstate = SMB_RECV_DROP; + result = -EIO; + goto out; + } + } + + result = _recvfrom(sock, server->header + server->smb_read, + SMB_HEADER_LEN - server->smb_read, 0); + VERBOSE("_recvfrom: %d\n", result); + if (result < 0) { + VERBOSE("receive error: %d\n", result); + goto out; + } + server->smb_read += result; + + if (server->smb_read == SMB_HEADER_LEN) + server->rstate = SMB_RECV_HCOMPLETE; +out: + return result; +} + +static char drop_buffer[PAGE_SIZE]; + +/* + * smb_receive_drop - read and throw away the data + * Only called by the smbiod thread. + * + * FIXME: we are in the kernel, could we just tell the socket that we want + * to drop stuff from the buffer? + */ +int +smb_receive_drop(struct smb_sb_info *server) +{ + struct socket *sock; + unsigned int flags; + struct kvec iov; + struct msghdr msg; + int rlen = smb_len(server->header) - server->smb_read + 4; + int result = -EIO; + + if (rlen > PAGE_SIZE) + rlen = PAGE_SIZE; + + sock = server_sock(server); + if (!sock) + goto out; + if (sock->sk->sk_state != TCP_ESTABLISHED) + goto out; + + flags = MSG_DONTWAIT | MSG_NOSIGNAL; + iov.iov_base = drop_buffer; + iov.iov_len = PAGE_SIZE; + msg.msg_flags = flags; + msg.msg_name = NULL; + msg.msg_namelen = 0; + msg.msg_control = NULL; + + result = kernel_recvmsg(sock, &msg, &iov, 1, rlen, flags); + + VERBOSE("read: %d\n", result); + if (result < 0) { + VERBOSE("receive error: %d\n", result); + goto out; + } + server->smb_read += result; + + if (server->smb_read >= server->smb_len) + server->rstate = SMB_RECV_END; + +out: + return result; +} + +/* + * smb_receive + * Only called by the smbiod thread. + */ +int +smb_receive(struct smb_sb_info *server, struct smb_request *req) +{ + struct socket *sock; + unsigned int flags; + struct kvec iov[4]; + struct kvec *p = req->rq_iov; + size_t num = req->rq_iovlen; + struct msghdr msg; + int rlen; + int result = -EIO; + + sock = server_sock(server); + if (!sock) + goto out; + if (sock->sk->sk_state != TCP_ESTABLISHED) + goto out; + + flags = MSG_DONTWAIT | MSG_NOSIGNAL; + msg.msg_flags = flags; + msg.msg_name = NULL; + msg.msg_namelen = 0; + msg.msg_control = NULL; + + /* Dont repeat bytes and count available bufferspace */ + rlen = min_t(int, smb_move_iov(&p, &num, iov, req->rq_bytes_recvd), + (req->rq_rlen - req->rq_bytes_recvd)); + + result = kernel_recvmsg(sock, &msg, p, num, rlen, flags); + + VERBOSE("read: %d\n", result); + if (result < 0) { + VERBOSE("receive error: %d\n", result); + goto out; + } + req->rq_bytes_recvd += result; + server->smb_read += result; + +out: + return result; +} + +/* + * Try to send a SMB request. This may return after sending only parts of the + * request. SMB_REQ_TRANSMITTED will be set if a request was fully sent. 
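+ * A partially sent request stays on the transmit queue; smbiod notices
+ * the non-empty xmitq and will send the rest on a later pass.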
+ * + * Parts of this was taken from xprt_sendmsg from net/sunrpc/xprt.c + */ +int +smb_send_request(struct smb_request *req) +{ + struct smb_sb_info *server = req->rq_server; + struct socket *sock; + struct msghdr msg = {.msg_flags = MSG_NOSIGNAL | MSG_DONTWAIT}; + int slen = req->rq_slen - req->rq_bytes_sent; + int result = -EIO; + struct kvec iov[4]; + struct kvec *p = req->rq_iov; + size_t num = req->rq_iovlen; + + sock = server_sock(server); + if (!sock) + goto out; + if (sock->sk->sk_state != TCP_ESTABLISHED) + goto out; + + /* Dont repeat bytes */ + if (req->rq_bytes_sent) + smb_move_iov(&p, &num, iov, req->rq_bytes_sent); + + result = kernel_sendmsg(sock, &msg, p, num, slen); + + if (result >= 0) { + req->rq_bytes_sent += result; + if (req->rq_bytes_sent >= req->rq_slen) + req->rq_flags |= SMB_REQ_TRANSMITTED; + } +out: + return result; +} diff --git a/drivers/staging/smbfs/symlink.c b/drivers/staging/smbfs/symlink.c new file mode 100644 index 00000000000..632c4acd062 --- /dev/null +++ b/drivers/staging/smbfs/symlink.c @@ -0,0 +1,67 @@ +/* + * symlink.c + * + * Copyright (C) 2002 by John Newbigin + * + * Please add a note about your changes to smbfs in the ChangeLog file. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "smbno.h" +#include "smb_fs.h" +#include "smb_debug.h" +#include "proto.h" + +int smb_symlink(struct inode *inode, struct dentry *dentry, const char *oldname) +{ + DEBUG1("create symlink %s -> %s/%s\n", oldname, DENTRY_PATH(dentry)); + + return smb_proc_symlink(server_from_dentry(dentry), dentry, oldname); +} + +static void *smb_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + char *link = __getname(); + DEBUG1("followlink of %s/%s\n", DENTRY_PATH(dentry)); + + if (!link) { + link = ERR_PTR(-ENOMEM); + } else { + int len = smb_proc_read_link(server_from_dentry(dentry), + dentry, link, PATH_MAX - 1); + if (len < 0) { + __putname(link); + link = ERR_PTR(len); + } else { + link[len] = 0; + } + } + nd_set_link(nd, link); + return NULL; +} + +static void smb_put_link(struct dentry *dentry, struct nameidata *nd, void *p) +{ + char *s = nd_get_link(nd); + if (!IS_ERR(s)) + __putname(s); +} + +const struct inode_operations smb_link_inode_operations = +{ + .readlink = generic_readlink, + .follow_link = smb_follow_link, + .put_link = smb_put_link, +}; diff --git a/fs/Kconfig b/fs/Kconfig index 30da8ee16a9..25ce2dc1c6d 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -233,7 +233,6 @@ config NFS_COMMON default y source "net/sunrpc/Kconfig" -source "fs/smbfs/Kconfig" source "fs/ceph/Kconfig" source "fs/cifs/Kconfig" source "fs/ncpfs/Kconfig" diff --git a/fs/Makefile b/fs/Makefile index e571feddd7b..9284c74c2db 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -91,7 +91,6 @@ obj-$(CONFIG_NFSD) += nfsd/ obj-$(CONFIG_LOCKD) += lockd/ obj-$(CONFIG_NLS) += nls/ obj-$(CONFIG_SYSV_FS) += sysv/ -obj-$(CONFIG_SMB_FS) += smbfs/ obj-$(CONFIG_CIFS) += cifs/ obj-$(CONFIG_NCP_FS) += ncpfs/ obj-$(CONFIG_HPFS_FS) += hpfs/ diff --git a/fs/compat.c b/fs/compat.c index 718c7062aec..b42f29a44ed 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -29,8 +29,6 @@ #include #include #include -#include -#include #include #include #include @@ -745,30 +743,6 @@ static void *do_ncp_super_data_conv(void *raw_data) return raw_data; } -struct compat_smb_mount_data { - compat_int_t version; - __compat_uid_t mounted_uid; - __compat_uid_t uid; - __compat_gid_t gid; - compat_mode_t file_mode; - compat_mode_t dir_mode; -}; - -static 
void *do_smb_super_data_conv(void *raw_data) -{ - struct smb_mount_data *s = raw_data; - struct compat_smb_mount_data *c_s = raw_data; - - if (c_s->version != SMB_MOUNT_OLDVERSION) - goto out; - s->dir_mode = c_s->dir_mode; - s->file_mode = c_s->file_mode; - s->gid = c_s->gid; - s->uid = c_s->uid; - s->mounted_uid = c_s->mounted_uid; - out: - return raw_data; -} struct compat_nfs_string { compat_uint_t len; @@ -835,7 +809,6 @@ static int do_nfs4_super_data_conv(void *raw_data) return 0; } -#define SMBFS_NAME "smbfs" #define NCPFS_NAME "ncpfs" #define NFS4_NAME "nfs4" @@ -870,9 +843,7 @@ asmlinkage long compat_sys_mount(const char __user * dev_name, retval = -EINVAL; if (kernel_type && data_page) { - if (!strcmp(kernel_type, SMBFS_NAME)) { - do_smb_super_data_conv((void *)data_page); - } else if (!strcmp(kernel_type, NCPFS_NAME)) { + if (!strcmp(kernel_type, NCPFS_NAME)) { do_ncp_super_data_conv((void *)data_page); } else if (!strcmp(kernel_type, NFS4_NAME)) { if (do_nfs4_super_data_conv((void *) data_page)) diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 03e59aa318e..34cf03cd791 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -46,7 +46,6 @@ #include #include #include -#include #include #include #include @@ -558,25 +557,6 @@ static int mt_ioctl_trans(unsigned int fd, unsigned int cmd, void __user *argp) #endif /* CONFIG_BLOCK */ -static int do_smb_getmountuid(unsigned int fd, unsigned int cmd, - compat_uid_t __user *argp) -{ - mm_segment_t old_fs = get_fs(); - __kernel_uid_t kuid; - int err; - - cmd = SMB_IOC_GETMOUNTUID; - - set_fs(KERNEL_DS); - err = sys_ioctl(fd, cmd, (unsigned long)&kuid); - set_fs(old_fs); - - if (err >= 0) - err = put_user(kuid, argp); - - return err; -} - /* Bluetooth ioctls */ #define HCIUARTSETPROTO _IOW('U', 200, int) #define HCIUARTGETPROTO _IOR('U', 201, int) @@ -1265,8 +1245,6 @@ COMPATIBLE_IOCTL(OSS_GETVERSION) /* Raw devices */ COMPATIBLE_IOCTL(RAW_SETBIND) COMPATIBLE_IOCTL(RAW_GETBIND) -/* SMB ioctls which do not need any translations */ -COMPATIBLE_IOCTL(SMB_IOC_NEWCONN) /* Watchdog */ COMPATIBLE_IOCTL(WDIOC_GETSUPPORT) COMPATIBLE_IOCTL(WDIOC_GETSTATUS) @@ -1528,10 +1506,6 @@ static long do_ioctl_trans(int fd, unsigned int cmd, case RAW_GETBIND: return raw_ioctl(fd, cmd, argp); #endif - /* One SMB ioctl needs translations. */ -#define SMB_IOC_GETMOUNTUID_32 _IOR('u', 1, compat_uid_t) - case SMB_IOC_GETMOUNTUID_32: - return do_smb_getmountuid(fd, cmd, argp); /* Serial */ case TIOCGSERIAL: case TIOCSSERIAL: diff --git a/fs/smbfs/Kconfig b/fs/smbfs/Kconfig deleted file mode 100644 index e668127c8b2..00000000000 --- a/fs/smbfs/Kconfig +++ /dev/null @@ -1,55 +0,0 @@ -config SMB_FS - tristate "SMB file system support (OBSOLETE, please use CIFS)" - depends on INET - select NLS - help - SMB (Server Message Block) is the protocol Windows for Workgroups - (WfW), Windows 95/98, Windows NT and OS/2 Lan Manager use to share - files and printers over local networks. Saying Y here allows you to - mount their file systems (often called "shares" in this context) and - access them just like any other Unix directory. Currently, this - works only if the Windows machines use TCP/IP as the underlying - transport protocol, and not NetBEUI. For details, read - and the SMB-HOWTO, - available from . - - Note: if you just want your box to act as an SMB *server* and make - files and printing services available to Windows clients (which need - to have a TCP/IP stack), you don't need to say Y here; you can use - the program SAMBA (available from ) - for that. 
- - General information about how to connect Linux, Windows machines and - Macs is on the WWW at . - - To compile the SMB support as a module, choose M here: - the module will be called smbfs. Most people say N, however. - -config SMB_NLS_DEFAULT - bool "Use a default NLS" - depends on SMB_FS - help - Enabling this will make smbfs use nls translations by default. You - need to specify the local charset (CONFIG_NLS_DEFAULT) in the nls - settings and you need to give the default nls for the SMB server as - CONFIG_SMB_NLS_REMOTE. - - The nls settings can be changed at mount time, if your smbmount - supports that, using the codepage and iocharset parameters. - - smbmount from samba 2.2.0 or later supports this. - -config SMB_NLS_REMOTE - string "Default Remote NLS Option" - depends on SMB_NLS_DEFAULT - default "cp437" - help - This setting allows you to specify a default value for which - codepage the server uses. If this field is left blank no - translations will be done by default. The local codepage/charset - default to CONFIG_NLS_DEFAULT. - - The nls settings can be changed at mount time, if your smbmount - supports that, using the codepage and iocharset parameters. - - smbmount from samba 2.2.0 or later supports this. diff --git a/fs/smbfs/Makefile b/fs/smbfs/Makefile deleted file mode 100644 index 4faf8c4722c..00000000000 --- a/fs/smbfs/Makefile +++ /dev/null @@ -1,18 +0,0 @@ -# -# Makefile for the linux smb-filesystem routines. -# - -obj-$(CONFIG_SMB_FS) += smbfs.o - -smbfs-objs := proc.o dir.o cache.o sock.o inode.o file.o ioctl.o getopt.o \ - symlink.o smbiod.o request.o - -# If you want debugging output, you may add these flags to the EXTRA_CFLAGS -# SMBFS_PARANOIA should normally be enabled. - -EXTRA_CFLAGS += -DSMBFS_PARANOIA -#EXTRA_CFLAGS += -DSMBFS_DEBUG -#EXTRA_CFLAGS += -DSMBFS_DEBUG_VERBOSE -#EXTRA_CFLAGS += -DDEBUG_SMB_TIMESTAMP -#EXTRA_CFLAGS += -Werror - diff --git a/fs/smbfs/cache.c b/fs/smbfs/cache.c deleted file mode 100644 index 8c177eb7e34..00000000000 --- a/fs/smbfs/cache.c +++ /dev/null @@ -1,208 +0,0 @@ -/* - * cache.c - * - * Copyright (C) 1997 by Bill Hawes - * - * Routines to support directory cacheing using the page cache. - * This cache code is almost directly taken from ncpfs. - * - * Please add a note about your changes to smbfs in the ChangeLog file. - */ - -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "smb_debug.h" -#include "proto.h" - -/* - * Force the next attempt to use the cache to be a timeout. - * If we can't find the page that's fine, it will cause a refresh. 
- */ -void -smb_invalid_dir_cache(struct inode * dir) -{ - struct smb_sb_info *server = server_from_inode(dir); - union smb_dir_cache *cache = NULL; - struct page *page = NULL; - - page = grab_cache_page(&dir->i_data, 0); - if (!page) - goto out; - - if (!PageUptodate(page)) - goto out_unlock; - - cache = kmap(page); - cache->head.time = jiffies - SMB_MAX_AGE(server); - - kunmap(page); - SetPageUptodate(page); -out_unlock: - unlock_page(page); - page_cache_release(page); -out: - return; -} - -/* - * Mark all dentries for 'parent' as invalid, forcing them to be re-read - */ -void -smb_invalidate_dircache_entries(struct dentry *parent) -{ - struct smb_sb_info *server = server_from_dentry(parent); - struct list_head *next; - struct dentry *dentry; - - spin_lock(&dcache_lock); - next = parent->d_subdirs.next; - while (next != &parent->d_subdirs) { - dentry = list_entry(next, struct dentry, d_u.d_child); - dentry->d_fsdata = NULL; - smb_age_dentry(server, dentry); - next = next->next; - } - spin_unlock(&dcache_lock); -} - -/* - * dget, but require that fpos and parent matches what the dentry contains. - * dentry is not known to be a valid pointer at entry. - */ -struct dentry * -smb_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos) -{ - struct dentry *dent = dentry; - struct list_head *next; - - if (d_validate(dent, parent)) { - if (dent->d_name.len <= SMB_MAXNAMELEN && - (unsigned long)dent->d_fsdata == fpos) { - if (!dent->d_inode) { - dput(dent); - dent = NULL; - } - return dent; - } - dput(dent); - } - - /* If a pointer is invalid, we search the dentry. */ - spin_lock(&dcache_lock); - next = parent->d_subdirs.next; - while (next != &parent->d_subdirs) { - dent = list_entry(next, struct dentry, d_u.d_child); - if ((unsigned long)dent->d_fsdata == fpos) { - if (dent->d_inode) - dget_locked(dent); - else - dent = NULL; - goto out_unlock; - } - next = next->next; - } - dent = NULL; -out_unlock: - spin_unlock(&dcache_lock); - return dent; -} - - -/* - * Create dentry/inode for this file and add it to the dircache. 
- */ -int -smb_fill_cache(struct file *filp, void *dirent, filldir_t filldir, - struct smb_cache_control *ctrl, struct qstr *qname, - struct smb_fattr *entry) -{ - struct dentry *newdent, *dentry = filp->f_path.dentry; - struct inode *newino, *inode = dentry->d_inode; - struct smb_cache_control ctl = *ctrl; - int valid = 0; - int hashed = 0; - ino_t ino = 0; - - qname->hash = full_name_hash(qname->name, qname->len); - - if (dentry->d_op && dentry->d_op->d_hash) - if (dentry->d_op->d_hash(dentry, qname) != 0) - goto end_advance; - - newdent = d_lookup(dentry, qname); - - if (!newdent) { - newdent = d_alloc(dentry, qname); - if (!newdent) - goto end_advance; - } else { - hashed = 1; - memcpy((char *) newdent->d_name.name, qname->name, - newdent->d_name.len); - } - - if (!newdent->d_inode) { - smb_renew_times(newdent); - entry->f_ino = iunique(inode->i_sb, 2); - newino = smb_iget(inode->i_sb, entry); - if (newino) { - smb_new_dentry(newdent); - d_instantiate(newdent, newino); - if (!hashed) - d_rehash(newdent); - } - } else - smb_set_inode_attr(newdent->d_inode, entry); - - if (newdent->d_inode) { - ino = newdent->d_inode->i_ino; - newdent->d_fsdata = (void *) ctl.fpos; - smb_new_dentry(newdent); - } - - if (ctl.idx >= SMB_DIRCACHE_SIZE) { - if (ctl.page) { - kunmap(ctl.page); - SetPageUptodate(ctl.page); - unlock_page(ctl.page); - page_cache_release(ctl.page); - } - ctl.cache = NULL; - ctl.idx -= SMB_DIRCACHE_SIZE; - ctl.ofs += 1; - ctl.page = grab_cache_page(&inode->i_data, ctl.ofs); - if (ctl.page) - ctl.cache = kmap(ctl.page); - } - if (ctl.cache) { - ctl.cache->dentry[ctl.idx] = newdent; - valid = 1; - } - dput(newdent); - -end_advance: - if (!valid) - ctl.valid = 0; - if (!ctl.filled && (ctl.fpos == filp->f_pos)) { - if (!ino) - ino = find_inode_number(dentry, qname); - if (!ino) - ino = iunique(inode->i_sb, 2); - ctl.filled = filldir(dirent, qname->name, qname->len, - filp->f_pos, ino, DT_UNKNOWN); - if (!ctl.filled) - filp->f_pos += 1; - } - ctl.fpos += 1; - ctl.idx += 1; - *ctrl = ctl; - return (ctl.valid || !ctl.filled); -} diff --git a/fs/smbfs/dir.c b/fs/smbfs/dir.c deleted file mode 100644 index 00a70cab1f3..00000000000 --- a/fs/smbfs/dir.c +++ /dev/null @@ -1,702 +0,0 @@ -/* - * dir.c - * - * Copyright (C) 1995, 1996 by Paal-Kr. Engstad and Volker Lendecke - * Copyright (C) 1997 by Volker Lendecke - * - * Please add a note about your changes to smbfs in the ChangeLog file. 
- */ - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "smb_debug.h" -#include "proto.h" - -static int smb_readdir(struct file *, void *, filldir_t); -static int smb_dir_open(struct inode *, struct file *); - -static struct dentry *smb_lookup(struct inode *, struct dentry *, struct nameidata *); -static int smb_create(struct inode *, struct dentry *, int, struct nameidata *); -static int smb_mkdir(struct inode *, struct dentry *, int); -static int smb_rmdir(struct inode *, struct dentry *); -static int smb_unlink(struct inode *, struct dentry *); -static int smb_rename(struct inode *, struct dentry *, - struct inode *, struct dentry *); -static int smb_make_node(struct inode *,struct dentry *,int,dev_t); -static int smb_link(struct dentry *, struct inode *, struct dentry *); - -const struct file_operations smb_dir_operations = -{ - .llseek = generic_file_llseek, - .read = generic_read_dir, - .readdir = smb_readdir, - .unlocked_ioctl = smb_ioctl, - .open = smb_dir_open, -}; - -const struct inode_operations smb_dir_inode_operations = -{ - .create = smb_create, - .lookup = smb_lookup, - .unlink = smb_unlink, - .mkdir = smb_mkdir, - .rmdir = smb_rmdir, - .rename = smb_rename, - .getattr = smb_getattr, - .setattr = smb_notify_change, -}; - -const struct inode_operations smb_dir_inode_operations_unix = -{ - .create = smb_create, - .lookup = smb_lookup, - .unlink = smb_unlink, - .mkdir = smb_mkdir, - .rmdir = smb_rmdir, - .rename = smb_rename, - .getattr = smb_getattr, - .setattr = smb_notify_change, - .symlink = smb_symlink, - .mknod = smb_make_node, - .link = smb_link, -}; - -/* - * Read a directory, using filldir to fill the dirent memory. - * smb_proc_readdir does the actual reading from the smb server. - * - * The cache code is almost directly taken from ncpfs - */ -static int -smb_readdir(struct file *filp, void *dirent, filldir_t filldir) -{ - struct dentry *dentry = filp->f_path.dentry; - struct inode *dir = dentry->d_inode; - struct smb_sb_info *server = server_from_dentry(dentry); - union smb_dir_cache *cache = NULL; - struct smb_cache_control ctl; - struct page *page = NULL; - int result; - - ctl.page = NULL; - ctl.cache = NULL; - - VERBOSE("reading %s/%s, f_pos=%d\n", - DENTRY_PATH(dentry), (int) filp->f_pos); - - result = 0; - - lock_kernel(); - - switch ((unsigned int) filp->f_pos) { - case 0: - if (filldir(dirent, ".", 1, 0, dir->i_ino, DT_DIR) < 0) - goto out; - filp->f_pos = 1; - /* fallthrough */ - case 1: - if (filldir(dirent, "..", 2, 1, parent_ino(dentry), DT_DIR) < 0) - goto out; - filp->f_pos = 2; - } - - /* - * Make sure our inode is up-to-date. - */ - result = smb_revalidate_inode(dentry); - if (result) - goto out; - - - page = grab_cache_page(&dir->i_data, 0); - if (!page) - goto read_really; - - ctl.cache = cache = kmap(page); - ctl.head = cache->head; - - if (!PageUptodate(page) || !ctl.head.eof) { - VERBOSE("%s/%s, page uptodate=%d, eof=%d\n", - DENTRY_PATH(dentry), PageUptodate(page),ctl.head.eof); - goto init_cache; - } - - if (filp->f_pos == 2) { - if (jiffies - ctl.head.time >= SMB_MAX_AGE(server)) - goto init_cache; - - /* - * N.B. ncpfs checks mtime of dentry too here, we don't. - * 1. common smb servers do not update mtime on dir changes - * 2. it requires an extra smb request - * (revalidate has the same timeout as ctl.head.time) - * - * Instead smbfs invalidates its own cache on local changes - * and remote changes are not seen until timeout. 
- */ - } - - if (filp->f_pos > ctl.head.end) - goto finished; - - ctl.fpos = filp->f_pos + (SMB_DIRCACHE_START - 2); - ctl.ofs = ctl.fpos / SMB_DIRCACHE_SIZE; - ctl.idx = ctl.fpos % SMB_DIRCACHE_SIZE; - - for (;;) { - if (ctl.ofs != 0) { - ctl.page = find_lock_page(&dir->i_data, ctl.ofs); - if (!ctl.page) - goto invalid_cache; - ctl.cache = kmap(ctl.page); - if (!PageUptodate(ctl.page)) - goto invalid_cache; - } - while (ctl.idx < SMB_DIRCACHE_SIZE) { - struct dentry *dent; - int res; - - dent = smb_dget_fpos(ctl.cache->dentry[ctl.idx], - dentry, filp->f_pos); - if (!dent) - goto invalid_cache; - - res = filldir(dirent, dent->d_name.name, - dent->d_name.len, filp->f_pos, - dent->d_inode->i_ino, DT_UNKNOWN); - dput(dent); - if (res) - goto finished; - filp->f_pos += 1; - ctl.idx += 1; - if (filp->f_pos > ctl.head.end) - goto finished; - } - if (ctl.page) { - kunmap(ctl.page); - SetPageUptodate(ctl.page); - unlock_page(ctl.page); - page_cache_release(ctl.page); - ctl.page = NULL; - } - ctl.idx = 0; - ctl.ofs += 1; - } -invalid_cache: - if (ctl.page) { - kunmap(ctl.page); - unlock_page(ctl.page); - page_cache_release(ctl.page); - ctl.page = NULL; - } - ctl.cache = cache; -init_cache: - smb_invalidate_dircache_entries(dentry); - ctl.head.time = jiffies; - ctl.head.eof = 0; - ctl.fpos = 2; - ctl.ofs = 0; - ctl.idx = SMB_DIRCACHE_START; - ctl.filled = 0; - ctl.valid = 1; -read_really: - result = server->ops->readdir(filp, dirent, filldir, &ctl); - if (result == -ERESTARTSYS && page) - ClearPageUptodate(page); - if (ctl.idx == -1) - goto invalid_cache; /* retry */ - ctl.head.end = ctl.fpos - 1; - ctl.head.eof = ctl.valid; -finished: - if (page) { - cache->head = ctl.head; - kunmap(page); - if (result != -ERESTARTSYS) - SetPageUptodate(page); - unlock_page(page); - page_cache_release(page); - } - if (ctl.page) { - kunmap(ctl.page); - SetPageUptodate(ctl.page); - unlock_page(ctl.page); - page_cache_release(ctl.page); - } -out: - unlock_kernel(); - return result; -} - -static int -smb_dir_open(struct inode *dir, struct file *file) -{ - struct dentry *dentry = file->f_path.dentry; - struct smb_sb_info *server; - int error = 0; - - VERBOSE("(%s/%s)\n", dentry->d_parent->d_name.name, - file->f_path.dentry->d_name.name); - - /* - * Directory timestamps in the core protocol aren't updated - * when a file is added, so we give them a very short TTL. - */ - lock_kernel(); - server = server_from_dentry(dentry); - if (server->opt.protocol < SMB_PROTOCOL_LANMAN2) { - unsigned long age = jiffies - SMB_I(dir)->oldmtime; - if (age > 2*HZ) - smb_invalid_dir_cache(dir); - } - - /* - * Note: in order to allow the smbmount process to open the - * mount point, we only revalidate if the connection is valid or - * if the process is trying to access something other than the root. 
- */ - if (server->state == CONN_VALID || !IS_ROOT(dentry)) - error = smb_revalidate_inode(dentry); - unlock_kernel(); - return error; -} - -/* - * Dentry operations routines - */ -static int smb_lookup_validate(struct dentry *, struct nameidata *); -static int smb_hash_dentry(struct dentry *, struct qstr *); -static int smb_compare_dentry(struct dentry *, struct qstr *, struct qstr *); -static int smb_delete_dentry(struct dentry *); - -static const struct dentry_operations smbfs_dentry_operations = -{ - .d_revalidate = smb_lookup_validate, - .d_hash = smb_hash_dentry, - .d_compare = smb_compare_dentry, - .d_delete = smb_delete_dentry, -}; - -static const struct dentry_operations smbfs_dentry_operations_case = -{ - .d_revalidate = smb_lookup_validate, - .d_delete = smb_delete_dentry, -}; - - -/* - * This is the callback when the dcache has a lookup hit. - */ -static int -smb_lookup_validate(struct dentry * dentry, struct nameidata *nd) -{ - struct smb_sb_info *server = server_from_dentry(dentry); - struct inode * inode = dentry->d_inode; - unsigned long age = jiffies - dentry->d_time; - int valid; - - /* - * The default validation is based on dentry age: - * we believe in dentries for a few seconds. (But each - * successful server lookup renews the timestamp.) - */ - valid = (age <= SMB_MAX_AGE(server)); -#ifdef SMBFS_DEBUG_VERBOSE - if (!valid) - VERBOSE("%s/%s not valid, age=%lu\n", - DENTRY_PATH(dentry), age); -#endif - - if (inode) { - lock_kernel(); - if (is_bad_inode(inode)) { - PARANOIA("%s/%s has dud inode\n", DENTRY_PATH(dentry)); - valid = 0; - } else if (!valid) - valid = (smb_revalidate_inode(dentry) == 0); - unlock_kernel(); - } else { - /* - * What should we do for negative dentries? - */ - } - return valid; -} - -static int -smb_hash_dentry(struct dentry *dir, struct qstr *this) -{ - unsigned long hash; - int i; - - hash = init_name_hash(); - for (i=0; i < this->len ; i++) - hash = partial_name_hash(tolower(this->name[i]), hash); - this->hash = end_name_hash(hash); - - return 0; -} - -static int -smb_compare_dentry(struct dentry *dir, struct qstr *a, struct qstr *b) -{ - int i, result = 1; - - if (a->len != b->len) - goto out; - for (i=0; i < a->len; i++) { - if (tolower(a->name[i]) != tolower(b->name[i])) - goto out; - } - result = 0; -out: - return result; -} - -/* - * This is the callback from dput() when d_count is going to 0. - * We use this to unhash dentries with bad inodes. - */ -static int -smb_delete_dentry(struct dentry * dentry) -{ - if (dentry->d_inode) { - if (is_bad_inode(dentry->d_inode)) { - PARANOIA("bad inode, unhashing %s/%s\n", - DENTRY_PATH(dentry)); - return 1; - } - } else { - /* N.B. Unhash negative dentries? */ - } - return 0; -} - -/* - * Initialize a new dentry - */ -void -smb_new_dentry(struct dentry *dentry) -{ - struct smb_sb_info *server = server_from_dentry(dentry); - - if (server->mnt->flags & SMB_MOUNT_CASE) - dentry->d_op = &smbfs_dentry_operations_case; - else - dentry->d_op = &smbfs_dentry_operations; - dentry->d_time = jiffies; -} - - -/* - * Whenever a lookup succeeds, we know the parent directories - * are all valid, so we want to update the dentry timestamps. - * N.B. Move this to dcache? 
- */ -void -smb_renew_times(struct dentry * dentry) -{ - dget(dentry); - spin_lock(&dentry->d_lock); - for (;;) { - struct dentry *parent; - - dentry->d_time = jiffies; - if (IS_ROOT(dentry)) - break; - parent = dentry->d_parent; - dget(parent); - spin_unlock(&dentry->d_lock); - dput(dentry); - dentry = parent; - spin_lock(&dentry->d_lock); - } - spin_unlock(&dentry->d_lock); - dput(dentry); -} - -static struct dentry * -smb_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) -{ - struct smb_fattr finfo; - struct inode *inode; - int error; - struct smb_sb_info *server; - - error = -ENAMETOOLONG; - if (dentry->d_name.len > SMB_MAXNAMELEN) - goto out; - - /* Do not allow lookup of names with backslashes in */ - error = -EINVAL; - if (memchr(dentry->d_name.name, '\\', dentry->d_name.len)) - goto out; - - lock_kernel(); - error = smb_proc_getattr(dentry, &finfo); -#ifdef SMBFS_PARANOIA - if (error && error != -ENOENT) - PARANOIA("find %s/%s failed, error=%d\n", - DENTRY_PATH(dentry), error); -#endif - - inode = NULL; - if (error == -ENOENT) - goto add_entry; - if (!error) { - error = -EACCES; - finfo.f_ino = iunique(dentry->d_sb, 2); - inode = smb_iget(dir->i_sb, &finfo); - if (inode) { - add_entry: - server = server_from_dentry(dentry); - if (server->mnt->flags & SMB_MOUNT_CASE) - dentry->d_op = &smbfs_dentry_operations_case; - else - dentry->d_op = &smbfs_dentry_operations; - - d_add(dentry, inode); - smb_renew_times(dentry); - error = 0; - } - } - unlock_kernel(); -out: - return ERR_PTR(error); -} - -/* - * This code is common to all routines creating a new inode. - */ -static int -smb_instantiate(struct dentry *dentry, __u16 fileid, int have_id) -{ - struct smb_sb_info *server = server_from_dentry(dentry); - struct inode *inode; - int error; - struct smb_fattr fattr; - - VERBOSE("file %s/%s, fileid=%u\n", DENTRY_PATH(dentry), fileid); - - error = smb_proc_getattr(dentry, &fattr); - if (error) - goto out_close; - - smb_renew_times(dentry); - fattr.f_ino = iunique(dentry->d_sb, 2); - inode = smb_iget(dentry->d_sb, &fattr); - if (!inode) - goto out_no_inode; - - if (have_id) { - struct smb_inode_info *ei = SMB_I(inode); - ei->fileid = fileid; - ei->access = SMB_O_RDWR; - ei->open = server->generation; - } - d_instantiate(dentry, inode); -out: - return error; - -out_no_inode: - error = -EACCES; -out_close: - if (have_id) { - PARANOIA("%s/%s failed, error=%d, closing %u\n", - DENTRY_PATH(dentry), error, fileid); - smb_close_fileid(dentry, fileid); - } - goto out; -} - -/* N.B. How should the mode argument be used? */ -static int -smb_create(struct inode *dir, struct dentry *dentry, int mode, - struct nameidata *nd) -{ - struct smb_sb_info *server = server_from_dentry(dentry); - __u16 fileid; - int error; - struct iattr attr; - - VERBOSE("creating %s/%s, mode=%d\n", DENTRY_PATH(dentry), mode); - - lock_kernel(); - smb_invalid_dir_cache(dir); - error = smb_proc_create(dentry, 0, get_seconds(), &fileid); - if (!error) { - if (server->opt.capabilities & SMB_CAP_UNIX) { - /* Set attributes for new file */ - attr.ia_valid = ATTR_MODE; - attr.ia_mode = mode; - error = smb_proc_setattr_unix(dentry, &attr, 0, 0); - } - error = smb_instantiate(dentry, fileid, 1); - } else { - PARANOIA("%s/%s failed, error=%d\n", - DENTRY_PATH(dentry), error); - } - unlock_kernel(); - return error; -} - -/* N.B. How should the mode argument be used? 
*/ -static int -smb_mkdir(struct inode *dir, struct dentry *dentry, int mode) -{ - struct smb_sb_info *server = server_from_dentry(dentry); - int error; - struct iattr attr; - - lock_kernel(); - smb_invalid_dir_cache(dir); - error = smb_proc_mkdir(dentry); - if (!error) { - if (server->opt.capabilities & SMB_CAP_UNIX) { - /* Set attributes for new directory */ - attr.ia_valid = ATTR_MODE; - attr.ia_mode = mode; - error = smb_proc_setattr_unix(dentry, &attr, 0, 0); - } - error = smb_instantiate(dentry, 0, 0); - } - unlock_kernel(); - return error; -} - -static int -smb_rmdir(struct inode *dir, struct dentry *dentry) -{ - struct inode *inode = dentry->d_inode; - int error; - - /* - * Close the directory if it's open. - */ - lock_kernel(); - smb_close(inode); - - /* - * Check that nobody else is using the directory.. - */ - error = -EBUSY; - if (!d_unhashed(dentry)) - goto out; - - smb_invalid_dir_cache(dir); - error = smb_proc_rmdir(dentry); - -out: - unlock_kernel(); - return error; -} - -static int -smb_unlink(struct inode *dir, struct dentry *dentry) -{ - int error; - - /* - * Close the file if it's open. - */ - lock_kernel(); - smb_close(dentry->d_inode); - - smb_invalid_dir_cache(dir); - error = smb_proc_unlink(dentry); - if (!error) - smb_renew_times(dentry); - unlock_kernel(); - return error; -} - -static int -smb_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) -{ - int error; - - /* - * Close any open files, and check whether to delete the - * target before attempting the rename. - */ - lock_kernel(); - if (old_dentry->d_inode) - smb_close(old_dentry->d_inode); - if (new_dentry->d_inode) { - smb_close(new_dentry->d_inode); - error = smb_proc_unlink(new_dentry); - if (error) { - VERBOSE("unlink %s/%s, error=%d\n", - DENTRY_PATH(new_dentry), error); - goto out; - } - /* FIXME */ - d_delete(new_dentry); - } - - smb_invalid_dir_cache(old_dir); - smb_invalid_dir_cache(new_dir); - error = smb_proc_mv(old_dentry, new_dentry); - if (!error) { - smb_renew_times(old_dentry); - smb_renew_times(new_dentry); - } -out: - unlock_kernel(); - return error; -} - -/* - * FIXME: samba servers won't let you create device nodes unless uid/gid - * matches the connection credentials (and we don't know which those are ...) - */ -static int -smb_make_node(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) -{ - int error; - struct iattr attr; - - attr.ia_valid = ATTR_MODE | ATTR_UID | ATTR_GID; - attr.ia_mode = mode; - current_euid_egid(&attr.ia_uid, &attr.ia_gid); - - if (!new_valid_dev(dev)) - return -EINVAL; - - smb_invalid_dir_cache(dir); - error = smb_proc_setattr_unix(dentry, &attr, MAJOR(dev), MINOR(dev)); - if (!error) { - error = smb_instantiate(dentry, 0, 0); - } - return error; -} - -/* - * dentry = existing file - * new_dentry = new file - */ -static int -smb_link(struct dentry *dentry, struct inode *dir, struct dentry *new_dentry) -{ - int error; - - DEBUG1("smb_link old=%s/%s new=%s/%s\n", - DENTRY_PATH(dentry), DENTRY_PATH(new_dentry)); - smb_invalid_dir_cache(dir); - error = smb_proc_link(server_from_dentry(dentry), dentry, new_dentry); - if (!error) { - smb_renew_times(dentry); - error = smb_instantiate(new_dentry, 0, 0); - } - return error; -} diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c deleted file mode 100644 index 8e187a0f94b..00000000000 --- a/fs/smbfs/file.c +++ /dev/null @@ -1,454 +0,0 @@ -/* - * file.c - * - * Copyright (C) 1995, 1996, 1997 by Paal-Kr. 
Engstad and Volker Lendecke - * Copyright (C) 1997 by Volker Lendecke - * - * Please add a note about your changes to smbfs in the ChangeLog file. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include - -#include "smb_debug.h" -#include "proto.h" - -static int -smb_fsync(struct file *file, int datasync) -{ - struct dentry *dentry = file->f_path.dentry; - struct smb_sb_info *server = server_from_dentry(dentry); - int result; - - VERBOSE("sync file %s/%s\n", DENTRY_PATH(dentry)); - - /* - * The VFS will writepage() all dirty pages for us, but we - * should send a SMBflush to the server, letting it know that - * we want things synchronized with actual storage. - * - * Note: this function requires all pages to have been written already - * (should be ok with writepage_sync) - */ - result = smb_proc_flush(server, SMB_I(dentry->d_inode)->fileid); - return result; -} - -/* - * Read a page synchronously. - */ -static int -smb_readpage_sync(struct dentry *dentry, struct page *page) -{ - char *buffer = kmap(page); - loff_t offset = (loff_t)page->index << PAGE_CACHE_SHIFT; - struct smb_sb_info *server = server_from_dentry(dentry); - unsigned int rsize = smb_get_rsize(server); - int count = PAGE_SIZE; - int result; - - VERBOSE("file %s/%s, count=%d@%Ld, rsize=%d\n", - DENTRY_PATH(dentry), count, offset, rsize); - - result = smb_open(dentry, SMB_O_RDONLY); - if (result < 0) - goto io_error; - - do { - if (count < rsize) - rsize = count; - - result = server->ops->read(dentry->d_inode,offset,rsize,buffer); - if (result < 0) - goto io_error; - - count -= result; - offset += result; - buffer += result; - dentry->d_inode->i_atime = - current_fs_time(dentry->d_inode->i_sb); - if (result < rsize) - break; - } while (count); - - memset(buffer, 0, count); - flush_dcache_page(page); - SetPageUptodate(page); - result = 0; - -io_error: - kunmap(page); - unlock_page(page); - return result; -} - -/* - * We are called with the page locked and we unlock it when done. - */ -static int -smb_readpage(struct file *file, struct page *page) -{ - int error; - struct dentry *dentry = file->f_path.dentry; - - page_cache_get(page); - error = smb_readpage_sync(dentry, page); - page_cache_release(page); - return error; -} - -/* - * Write a page synchronously. - * Offset is the data offset within the page. - */ -static int -smb_writepage_sync(struct inode *inode, struct page *page, - unsigned long pageoffset, unsigned int count) -{ - loff_t offset; - char *buffer = kmap(page) + pageoffset; - struct smb_sb_info *server = server_from_inode(inode); - unsigned int wsize = smb_get_wsize(server); - int ret = 0; - - offset = ((loff_t)page->index << PAGE_CACHE_SHIFT) + pageoffset; - VERBOSE("file ino=%ld, fileid=%d, count=%d@%Ld, wsize=%d\n", - inode->i_ino, SMB_I(inode)->fileid, count, offset, wsize); - - do { - int write_ret; - - if (count < wsize) - wsize = count; - - write_ret = server->ops->write(inode, offset, wsize, buffer); - if (write_ret < 0) { - PARANOIA("failed write, wsize=%d, write_ret=%d\n", - wsize, write_ret); - ret = write_ret; - break; - } - /* N.B. what if result < wsize?? */ -#ifdef SMBFS_PARANOIA - if (write_ret < wsize) - PARANOIA("short write, wsize=%d, write_ret=%d\n", - wsize, write_ret); -#endif - buffer += wsize; - offset += wsize; - count -= wsize; - /* - * Update the inode now rather than waiting for a refresh. 
- */ - inode->i_mtime = inode->i_atime = current_fs_time(inode->i_sb); - SMB_I(inode)->flags |= SMB_F_LOCALWRITE; - if (offset > inode->i_size) - inode->i_size = offset; - } while (count); - - kunmap(page); - return ret; -} - -/* - * Write a page to the server. This will be used for NFS swapping only - * (for now), and we currently do this synchronously only. - * - * We are called with the page locked and we unlock it when done. - */ -static int -smb_writepage(struct page *page, struct writeback_control *wbc) -{ - struct address_space *mapping = page->mapping; - struct inode *inode; - unsigned long end_index; - unsigned offset = PAGE_CACHE_SIZE; - int err; - - BUG_ON(!mapping); - inode = mapping->host; - BUG_ON(!inode); - - end_index = inode->i_size >> PAGE_CACHE_SHIFT; - - /* easy case */ - if (page->index < end_index) - goto do_it; - /* things got complicated... */ - offset = inode->i_size & (PAGE_CACHE_SIZE-1); - /* OK, are we completely out? */ - if (page->index >= end_index+1 || !offset) - return 0; /* truncated - don't care */ -do_it: - page_cache_get(page); - err = smb_writepage_sync(inode, page, 0, offset); - SetPageUptodate(page); - unlock_page(page); - page_cache_release(page); - return err; -} - -static int -smb_updatepage(struct file *file, struct page *page, unsigned long offset, - unsigned int count) -{ - struct dentry *dentry = file->f_path.dentry; - - DEBUG1("(%s/%s %d@%lld)\n", DENTRY_PATH(dentry), count, - ((unsigned long long)page->index << PAGE_CACHE_SHIFT) + offset); - - return smb_writepage_sync(dentry->d_inode, page, offset, count); -} - -static ssize_t -smb_file_aio_read(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) -{ - struct file * file = iocb->ki_filp; - struct dentry * dentry = file->f_path.dentry; - ssize_t status; - - VERBOSE("file %s/%s, count=%lu@%lu\n", DENTRY_PATH(dentry), - (unsigned long) iocb->ki_left, (unsigned long) pos); - - status = smb_revalidate_inode(dentry); - if (status) { - PARANOIA("%s/%s validation failed, error=%Zd\n", - DENTRY_PATH(dentry), status); - goto out; - } - - VERBOSE("before read, size=%ld, flags=%x, atime=%ld\n", - (long)dentry->d_inode->i_size, - dentry->d_inode->i_flags, dentry->d_inode->i_atime.tv_sec); - - status = generic_file_aio_read(iocb, iov, nr_segs, pos); -out: - return status; -} - -static int -smb_file_mmap(struct file * file, struct vm_area_struct * vma) -{ - struct dentry * dentry = file->f_path.dentry; - int status; - - VERBOSE("file %s/%s, address %lu - %lu\n", - DENTRY_PATH(dentry), vma->vm_start, vma->vm_end); - - status = smb_revalidate_inode(dentry); - if (status) { - PARANOIA("%s/%s validation failed, error=%d\n", - DENTRY_PATH(dentry), status); - goto out; - } - status = generic_file_mmap(file, vma); -out: - return status; -} - -static ssize_t -smb_file_splice_read(struct file *file, loff_t *ppos, - struct pipe_inode_info *pipe, size_t count, - unsigned int flags) -{ - struct dentry *dentry = file->f_path.dentry; - ssize_t status; - - VERBOSE("file %s/%s, pos=%Ld, count=%lu\n", - DENTRY_PATH(dentry), *ppos, count); - - status = smb_revalidate_inode(dentry); - if (status) { - PARANOIA("%s/%s validation failed, error=%Zd\n", - DENTRY_PATH(dentry), status); - goto out; - } - status = generic_file_splice_read(file, ppos, pipe, count, flags); -out: - return status; -} - -/* - * This does the "real" work of the write. The generic routine has - * allocated the page, locked it, done all the page alignment stuff - * calculations etc. 
Now we should just copy the data from user - * space and write it back to the real medium.. - * - * If the writer ends up delaying the write, the writer needs to - * increment the page use counts until he is done with the page. - */ -static int smb_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, - struct page **pagep, void **fsdata) -{ - pgoff_t index = pos >> PAGE_CACHE_SHIFT; - *pagep = grab_cache_page_write_begin(mapping, index, flags); - if (!*pagep) - return -ENOMEM; - return 0; -} - -static int smb_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) -{ - int status; - unsigned offset = pos & (PAGE_CACHE_SIZE - 1); - - lock_kernel(); - status = smb_updatepage(file, page, offset, copied); - unlock_kernel(); - - if (!status) { - if (!PageUptodate(page) && copied == PAGE_CACHE_SIZE) - SetPageUptodate(page); - status = copied; - } - - unlock_page(page); - page_cache_release(page); - - return status; -} - -const struct address_space_operations smb_file_aops = { - .readpage = smb_readpage, - .writepage = smb_writepage, - .write_begin = smb_write_begin, - .write_end = smb_write_end, -}; - -/* - * Write to a file (through the page cache). - */ -static ssize_t -smb_file_aio_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) -{ - struct file * file = iocb->ki_filp; - struct dentry * dentry = file->f_path.dentry; - ssize_t result; - - VERBOSE("file %s/%s, count=%lu@%lu\n", - DENTRY_PATH(dentry), - (unsigned long) iocb->ki_left, (unsigned long) pos); - - result = smb_revalidate_inode(dentry); - if (result) { - PARANOIA("%s/%s validation failed, error=%Zd\n", - DENTRY_PATH(dentry), result); - goto out; - } - - result = smb_open(dentry, SMB_O_WRONLY); - if (result) - goto out; - - if (iocb->ki_left > 0) { - result = generic_file_aio_write(iocb, iov, nr_segs, pos); - VERBOSE("pos=%ld, size=%ld, mtime=%ld, atime=%ld\n", - (long) file->f_pos, (long) dentry->d_inode->i_size, - dentry->d_inode->i_mtime.tv_sec, - dentry->d_inode->i_atime.tv_sec); - } -out: - return result; -} - -static int -smb_file_open(struct inode *inode, struct file * file) -{ - int result; - struct dentry *dentry = file->f_path.dentry; - int smb_mode = (file->f_mode & O_ACCMODE) - 1; - - lock_kernel(); - result = smb_open(dentry, smb_mode); - if (result) - goto out; - SMB_I(inode)->openers++; -out: - unlock_kernel(); - return result; -} - -static int -smb_file_release(struct inode *inode, struct file * file) -{ - lock_kernel(); - if (!--SMB_I(inode)->openers) { - /* We must flush any dirty pages now as we won't be able to - write anything after close. mmap can trigger this. - "openers" should perhaps include mmap'ers ... */ - filemap_write_and_wait(inode->i_mapping); - smb_close(inode); - } - unlock_kernel(); - return 0; -} - -/* - * Check whether the required access is compatible with - * an inode's permission. SMB doesn't recognize superuser - * privileges, so we need our own check for this. 
- */ -static int -smb_file_permission(struct inode *inode, int mask) -{ - int mode = inode->i_mode; - int error = 0; - - VERBOSE("mode=%x, mask=%x\n", mode, mask); - - /* Look at user permissions */ - mode >>= 6; - if (mask & ~mode & (MAY_READ | MAY_WRITE | MAY_EXEC)) - error = -EACCES; - return error; -} - -static loff_t smb_remote_llseek(struct file *file, loff_t offset, int origin) -{ - loff_t ret; - lock_kernel(); - ret = generic_file_llseek_unlocked(file, offset, origin); - unlock_kernel(); - return ret; -} - -const struct file_operations smb_file_operations = -{ - .llseek = smb_remote_llseek, - .read = do_sync_read, - .aio_read = smb_file_aio_read, - .write = do_sync_write, - .aio_write = smb_file_aio_write, - .unlocked_ioctl = smb_ioctl, - .mmap = smb_file_mmap, - .open = smb_file_open, - .release = smb_file_release, - .fsync = smb_fsync, - .splice_read = smb_file_splice_read, -}; - -const struct inode_operations smb_file_inode_operations = -{ - .permission = smb_file_permission, - .getattr = smb_getattr, - .setattr = smb_notify_change, -}; diff --git a/fs/smbfs/getopt.c b/fs/smbfs/getopt.c deleted file mode 100644 index 7ae0f5273ab..00000000000 --- a/fs/smbfs/getopt.c +++ /dev/null @@ -1,64 +0,0 @@ -/* - * getopt.c - */ - -#include -#include -#include - -#include "getopt.h" - -/** - * smb_getopt - option parser - * @caller: name of the caller, for error messages - * @options: the options string - * @opts: an array of &struct option entries controlling parser operations - * @optopt: output; will contain the current option - * @optarg: output; will contain the value (if one exists) - * @flag: output; may be NULL; should point to a long for or'ing flags - * @value: output; may be NULL; will be overwritten with the integer value - * of the current argument. - * - * Helper to parse options on the format used by mount ("a=b,c=d,e,f"). - * Returns opts->val if a matching entry in the 'opts' array is found, - * 0 when no more tokens are found, -1 if an error is encountered. - */ -int smb_getopt(char *caller, char **options, struct option *opts, - char **optopt, char **optarg, unsigned long *flag, - unsigned long *value) -{ - char *token; - char *val; - int i; - - do { - if ((token = strsep(options, ",")) == NULL) - return 0; - } while (*token == '\0'); - *optopt = token; - - *optarg = NULL; - if ((val = strchr (token, '=')) != NULL) { - *val++ = 0; - if (value) - *value = simple_strtoul(val, NULL, 0); - *optarg = val; - } - - for (i = 0; opts[i].name != NULL; i++) { - if (!strcmp(opts[i].name, token)) { - if (!opts[i].flag && (!val || !*val)) { - printk("%s: the %s option requires an argument\n", - caller, token); - return -1; - } - - if (flag && opts[i].flag) - *flag |= opts[i].flag; - - return opts[i].val; - } - } - printk("%s: Unrecognized mount option %s\n", caller, token); - return -1; -} diff --git a/fs/smbfs/getopt.h b/fs/smbfs/getopt.h deleted file mode 100644 index 146219ac7c4..00000000000 --- a/fs/smbfs/getopt.h +++ /dev/null @@ -1,14 +0,0 @@ -#ifndef _LINUX_GETOPT_H -#define _LINUX_GETOPT_H - -struct option { - const char *name; - unsigned long flag; - int val; -}; - -extern int smb_getopt(char *caller, char **options, struct option *opts, - char **optopt, char **optarg, unsigned long *flag, - unsigned long *value); - -#endif /* _LINUX_GETOPT_H */ diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c deleted file mode 100644 index 450c9194198..00000000000 --- a/fs/smbfs/inode.c +++ /dev/null @@ -1,839 +0,0 @@ -/* - * inode.c - * - * Copyright (C) 1995, 1996 by Paal-Kr. 
Engstad and Volker Lendecke - * Copyright (C) 1997 by Volker Lendecke - * - * Please add a note about your changes to smbfs in the ChangeLog file. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include "smb_debug.h" -#include "getopt.h" -#include "proto.h" - -/* Always pick a default string */ -#ifdef CONFIG_SMB_NLS_REMOTE -#define SMB_NLS_REMOTE CONFIG_SMB_NLS_REMOTE -#else -#define SMB_NLS_REMOTE "" -#endif - -#define SMB_TTL_DEFAULT 1000 - -static void smb_evict_inode(struct inode *); -static void smb_put_super(struct super_block *); -static int smb_statfs(struct dentry *, struct kstatfs *); -static int smb_show_options(struct seq_file *, struct vfsmount *); - -static struct kmem_cache *smb_inode_cachep; - -static struct inode *smb_alloc_inode(struct super_block *sb) -{ - struct smb_inode_info *ei; - ei = (struct smb_inode_info *)kmem_cache_alloc(smb_inode_cachep, GFP_KERNEL); - if (!ei) - return NULL; - return &ei->vfs_inode; -} - -static void smb_destroy_inode(struct inode *inode) -{ - kmem_cache_free(smb_inode_cachep, SMB_I(inode)); -} - -static void init_once(void *foo) -{ - struct smb_inode_info *ei = (struct smb_inode_info *) foo; - - inode_init_once(&ei->vfs_inode); -} - -static int init_inodecache(void) -{ - smb_inode_cachep = kmem_cache_create("smb_inode_cache", - sizeof(struct smb_inode_info), - 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), - init_once); - if (smb_inode_cachep == NULL) - return -ENOMEM; - return 0; -} - -static void destroy_inodecache(void) -{ - kmem_cache_destroy(smb_inode_cachep); -} - -static int smb_remount(struct super_block *sb, int *flags, char *data) -{ - *flags |= MS_NODIRATIME; - return 0; -} - -static const struct super_operations smb_sops = -{ - .alloc_inode = smb_alloc_inode, - .destroy_inode = smb_destroy_inode, - .drop_inode = generic_delete_inode, - .evict_inode = smb_evict_inode, - .put_super = smb_put_super, - .statfs = smb_statfs, - .show_options = smb_show_options, - .remount_fs = smb_remount, -}; - - -/* We are always generating a new inode here */ -struct inode * -smb_iget(struct super_block *sb, struct smb_fattr *fattr) -{ - struct smb_sb_info *server = SMB_SB(sb); - struct inode *result; - - DEBUG1("smb_iget: %p\n", fattr); - - result = new_inode(sb); - if (!result) - return result; - result->i_ino = fattr->f_ino; - SMB_I(result)->open = 0; - SMB_I(result)->fileid = 0; - SMB_I(result)->access = 0; - SMB_I(result)->flags = 0; - SMB_I(result)->closed = 0; - SMB_I(result)->openers = 0; - smb_set_inode_attr(result, fattr); - if (S_ISREG(result->i_mode)) { - result->i_op = &smb_file_inode_operations; - result->i_fop = &smb_file_operations; - result->i_data.a_ops = &smb_file_aops; - } else if (S_ISDIR(result->i_mode)) { - if (server->opt.capabilities & SMB_CAP_UNIX) - result->i_op = &smb_dir_inode_operations_unix; - else - result->i_op = &smb_dir_inode_operations; - result->i_fop = &smb_dir_operations; - } else if (S_ISLNK(result->i_mode)) { - result->i_op = &smb_link_inode_operations; - } else { - init_special_inode(result, result->i_mode, fattr->f_rdev); - } - insert_inode_hash(result); - return result; -} - -/* - * Copy the inode data to a smb_fattr structure. 
- */ -void -smb_get_inode_attr(struct inode *inode, struct smb_fattr *fattr) -{ - memset(fattr, 0, sizeof(struct smb_fattr)); - fattr->f_mode = inode->i_mode; - fattr->f_nlink = inode->i_nlink; - fattr->f_ino = inode->i_ino; - fattr->f_uid = inode->i_uid; - fattr->f_gid = inode->i_gid; - fattr->f_size = inode->i_size; - fattr->f_mtime = inode->i_mtime; - fattr->f_ctime = inode->i_ctime; - fattr->f_atime = inode->i_atime; - fattr->f_blocks = inode->i_blocks; - - fattr->attr = SMB_I(inode)->attr; - /* - * Keep the attributes in sync with the inode permissions. - */ - if (fattr->f_mode & S_IWUSR) - fattr->attr &= ~aRONLY; - else - fattr->attr |= aRONLY; -} - -/* - * Update the inode, possibly causing it to invalidate its pages if mtime/size - * is different from last time. - */ -void -smb_set_inode_attr(struct inode *inode, struct smb_fattr *fattr) -{ - struct smb_inode_info *ei = SMB_I(inode); - - /* - * A size change should have a different mtime, or same mtime - * but different size. - */ - time_t last_time = inode->i_mtime.tv_sec; - loff_t last_sz = inode->i_size; - - inode->i_mode = fattr->f_mode; - inode->i_nlink = fattr->f_nlink; - inode->i_uid = fattr->f_uid; - inode->i_gid = fattr->f_gid; - inode->i_ctime = fattr->f_ctime; - inode->i_blocks = fattr->f_blocks; - inode->i_size = fattr->f_size; - inode->i_mtime = fattr->f_mtime; - inode->i_atime = fattr->f_atime; - ei->attr = fattr->attr; - - /* - * Update the "last time refreshed" field for revalidation. - */ - ei->oldmtime = jiffies; - - if (inode->i_mtime.tv_sec != last_time || inode->i_size != last_sz) { - VERBOSE("%ld changed, old=%ld, new=%ld, oz=%ld, nz=%ld\n", - inode->i_ino, - (long) last_time, (long) inode->i_mtime.tv_sec, - (long) last_sz, (long) inode->i_size); - - if (!S_ISDIR(inode->i_mode)) - invalidate_remote_inode(inode); - } -} - -/* - * This is called if the connection has gone bad ... - * try to kill off all the current inodes. - */ -void -smb_invalidate_inodes(struct smb_sb_info *server) -{ - VERBOSE("\n"); - shrink_dcache_sb(SB_of(server)); - invalidate_inodes(SB_of(server)); -} - -/* - * This is called to update the inode attributes after - * we've made changes to a file or directory. - */ -static int -smb_refresh_inode(struct dentry *dentry) -{ - struct inode *inode = dentry->d_inode; - int error; - struct smb_fattr fattr; - - error = smb_proc_getattr(dentry, &fattr); - if (!error) { - smb_renew_times(dentry); - /* - * Check whether the type part of the mode changed, - * and don't update the attributes if it did. - * - * And don't dick with the root inode - */ - if (inode->i_ino == 2) - return error; - if (S_ISLNK(inode->i_mode)) - return error; /* VFS will deal with it */ - - if ((inode->i_mode & S_IFMT) == (fattr.f_mode & S_IFMT)) { - smb_set_inode_attr(inode, &fattr); - } else { - /* - * Big trouble! The inode has become a new object, - * so any operations attempted on it are invalid. - * - * To limit damage, mark the inode as bad so that - * subsequent lookup validations will fail. - */ - PARANOIA("%s/%s changed mode, %07o to %07o\n", - DENTRY_PATH(dentry), - inode->i_mode, fattr.f_mode); - - fattr.f_mode = inode->i_mode; /* save mode */ - make_bad_inode(inode); - inode->i_mode = fattr.f_mode; /* restore mode */ - /* - * No need to worry about unhashing the dentry: the - * lookup validation will see that the inode is bad. - * But we do want to invalidate the caches ... 
- */ - if (!S_ISDIR(inode->i_mode)) - invalidate_remote_inode(inode); - else - smb_invalid_dir_cache(inode); - error = -EIO; - } - } - return error; -} - -/* - * This is called when we want to check whether the inode - * has changed on the server. If it has changed, we must - * invalidate our local caches. - */ -int -smb_revalidate_inode(struct dentry *dentry) -{ - struct smb_sb_info *s = server_from_dentry(dentry); - struct inode *inode = dentry->d_inode; - int error = 0; - - DEBUG1("smb_revalidate_inode\n"); - lock_kernel(); - - /* - * Check whether we've recently refreshed the inode. - */ - if (time_before(jiffies, SMB_I(inode)->oldmtime + SMB_MAX_AGE(s))) { - VERBOSE("up-to-date, ino=%ld, jiffies=%lu, oldtime=%lu\n", - inode->i_ino, jiffies, SMB_I(inode)->oldmtime); - goto out; - } - - error = smb_refresh_inode(dentry); -out: - unlock_kernel(); - return error; -} - -/* - * This routine is called when i_nlink == 0 and i_count goes to 0. - * All blocking cleanup operations need to go here to avoid races. - */ -static void -smb_evict_inode(struct inode *ino) -{ - DEBUG1("ino=%ld\n", ino->i_ino); - truncate_inode_pages(&ino->i_data, 0); - end_writeback(ino); - lock_kernel(); - if (smb_close(ino)) - PARANOIA("could not close inode %ld\n", ino->i_ino); - unlock_kernel(); -} - -static struct option opts[] = { - { "version", 0, 'v' }, - { "win95", SMB_MOUNT_WIN95, 1 }, - { "oldattr", SMB_MOUNT_OLDATTR, 1 }, - { "dirattr", SMB_MOUNT_DIRATTR, 1 }, - { "case", SMB_MOUNT_CASE, 1 }, - { "uid", 0, 'u' }, - { "gid", 0, 'g' }, - { "file_mode", 0, 'f' }, - { "dir_mode", 0, 'd' }, - { "iocharset", 0, 'i' }, - { "codepage", 0, 'c' }, - { "ttl", 0, 't' }, - { NULL, 0, 0} -}; - -static int -parse_options(struct smb_mount_data_kernel *mnt, char *options) -{ - int c; - unsigned long flags; - unsigned long value; - char *optarg; - char *optopt; - - flags = 0; - while ( (c = smb_getopt("smbfs", &options, opts, - &optopt, &optarg, &flags, &value)) > 0) { - - VERBOSE("'%s' -> '%s'\n", optopt, optarg ? optarg : ""); - switch (c) { - case 1: - /* got a "flag" option */ - break; - case 'v': - if (value != SMB_MOUNT_VERSION) { - printk ("smbfs: Bad mount version %ld, expected %d\n", - value, SMB_MOUNT_VERSION); - return 0; - } - mnt->version = value; - break; - case 'u': - mnt->uid = value; - flags |= SMB_MOUNT_UID; - break; - case 'g': - mnt->gid = value; - flags |= SMB_MOUNT_GID; - break; - case 'f': - mnt->file_mode = (value & S_IRWXUGO) | S_IFREG; - flags |= SMB_MOUNT_FMODE; - break; - case 'd': - mnt->dir_mode = (value & S_IRWXUGO) | S_IFDIR; - flags |= SMB_MOUNT_DMODE; - break; - case 'i': - strlcpy(mnt->codepage.local_name, optarg, - SMB_NLS_MAXNAMELEN); - break; - case 'c': - strlcpy(mnt->codepage.remote_name, optarg, - SMB_NLS_MAXNAMELEN); - break; - case 't': - mnt->ttl = value; - break; - default: - printk ("smbfs: Unrecognized mount option %s\n", - optopt); - return -1; - } - } - mnt->flags = flags; - return c; -} - -/* - * smb_show_options() is for displaying mount options in /proc/mounts. - * It tries to avoid showing settings that were not changed from their - * defaults. 
- */ -static int -smb_show_options(struct seq_file *s, struct vfsmount *m) -{ - struct smb_mount_data_kernel *mnt = SMB_SB(m->mnt_sb)->mnt; - int i; - - for (i = 0; opts[i].name != NULL; i++) - if (mnt->flags & opts[i].flag) - seq_printf(s, ",%s", opts[i].name); - - if (mnt->flags & SMB_MOUNT_UID) - seq_printf(s, ",uid=%d", mnt->uid); - if (mnt->flags & SMB_MOUNT_GID) - seq_printf(s, ",gid=%d", mnt->gid); - if (mnt->mounted_uid != 0) - seq_printf(s, ",mounted_uid=%d", mnt->mounted_uid); - - /* - * Defaults for file_mode and dir_mode are unknown to us; they - * depend on the current umask of the user doing the mount. - */ - if (mnt->flags & SMB_MOUNT_FMODE) - seq_printf(s, ",file_mode=%04o", mnt->file_mode & S_IRWXUGO); - if (mnt->flags & SMB_MOUNT_DMODE) - seq_printf(s, ",dir_mode=%04o", mnt->dir_mode & S_IRWXUGO); - - if (strcmp(mnt->codepage.local_name, CONFIG_NLS_DEFAULT)) - seq_printf(s, ",iocharset=%s", mnt->codepage.local_name); - if (strcmp(mnt->codepage.remote_name, SMB_NLS_REMOTE)) - seq_printf(s, ",codepage=%s", mnt->codepage.remote_name); - - if (mnt->ttl != SMB_TTL_DEFAULT) - seq_printf(s, ",ttl=%d", mnt->ttl); - - return 0; -} - -static void -smb_unload_nls(struct smb_sb_info *server) -{ - unload_nls(server->remote_nls); - unload_nls(server->local_nls); -} - -static void -smb_put_super(struct super_block *sb) -{ - struct smb_sb_info *server = SMB_SB(sb); - - lock_kernel(); - - smb_lock_server(server); - server->state = CONN_INVALID; - smbiod_unregister_server(server); - - smb_close_socket(server); - - if (server->conn_pid) - kill_pid(server->conn_pid, SIGTERM, 1); - - bdi_destroy(&server->bdi); - kfree(server->ops); - smb_unload_nls(server); - sb->s_fs_info = NULL; - smb_unlock_server(server); - put_pid(server->conn_pid); - kfree(server); - - unlock_kernel(); -} - -static int smb_fill_super(struct super_block *sb, void *raw_data, int silent) -{ - struct smb_sb_info *server; - struct smb_mount_data_kernel *mnt; - struct smb_mount_data *oldmnt; - struct inode *root_inode; - struct smb_fattr root; - int ver; - void *mem; - static int warn_count; - - if (warn_count < 5) { - warn_count++; - printk(KERN_EMERG "smbfs is deprecated and will be removed" - " from the 2.6.27 kernel. Please migrate to cifs\n"); - } - - if (!raw_data) - goto out_no_data; - - oldmnt = (struct smb_mount_data *) raw_data; - ver = oldmnt->version; - if (ver != SMB_MOUNT_OLDVERSION && cpu_to_be32(ver) != SMB_MOUNT_ASCII) - goto out_wrong_data; - - sb->s_flags |= MS_NODIRATIME; - sb->s_blocksize = 1024; /* Eh... Is this correct? 
*/ - sb->s_blocksize_bits = 10; - sb->s_magic = SMB_SUPER_MAGIC; - sb->s_op = &smb_sops; - sb->s_time_gran = 100; - - server = kzalloc(sizeof(struct smb_sb_info), GFP_KERNEL); - if (!server) - goto out_no_server; - sb->s_fs_info = server; - - if (bdi_setup_and_register(&server->bdi, "smbfs", BDI_CAP_MAP_COPY)) - goto out_bdi; - - sb->s_bdi = &server->bdi; - - server->super_block = sb; - server->mnt = NULL; - server->sock_file = NULL; - init_waitqueue_head(&server->conn_wq); - init_MUTEX(&server->sem); - INIT_LIST_HEAD(&server->entry); - INIT_LIST_HEAD(&server->xmitq); - INIT_LIST_HEAD(&server->recvq); - server->conn_error = 0; - server->conn_pid = NULL; - server->state = CONN_INVALID; /* no connection yet */ - server->generation = 0; - - /* Allocate the global temp buffer and some superblock helper structs */ - /* FIXME: move these to the smb_sb_info struct */ - VERBOSE("alloc chunk = %lu\n", sizeof(struct smb_ops) + - sizeof(struct smb_mount_data_kernel)); - mem = kmalloc(sizeof(struct smb_ops) + - sizeof(struct smb_mount_data_kernel), GFP_KERNEL); - if (!mem) - goto out_no_mem; - - server->ops = mem; - smb_install_null_ops(server->ops); - server->mnt = mem + sizeof(struct smb_ops); - - /* Setup NLS stuff */ - server->remote_nls = NULL; - server->local_nls = NULL; - - mnt = server->mnt; - - memset(mnt, 0, sizeof(struct smb_mount_data_kernel)); - strlcpy(mnt->codepage.local_name, CONFIG_NLS_DEFAULT, - SMB_NLS_MAXNAMELEN); - strlcpy(mnt->codepage.remote_name, SMB_NLS_REMOTE, - SMB_NLS_MAXNAMELEN); - - mnt->ttl = SMB_TTL_DEFAULT; - if (ver == SMB_MOUNT_OLDVERSION) { - mnt->version = oldmnt->version; - - SET_UID(mnt->uid, oldmnt->uid); - SET_GID(mnt->gid, oldmnt->gid); - - mnt->file_mode = (oldmnt->file_mode & S_IRWXUGO) | S_IFREG; - mnt->dir_mode = (oldmnt->dir_mode & S_IRWXUGO) | S_IFDIR; - - mnt->flags = (oldmnt->file_mode >> 9) | SMB_MOUNT_UID | - SMB_MOUNT_GID | SMB_MOUNT_FMODE | SMB_MOUNT_DMODE; - } else { - mnt->file_mode = S_IRWXU | S_IRGRP | S_IXGRP | - S_IROTH | S_IXOTH | S_IFREG; - mnt->dir_mode = S_IRWXU | S_IRGRP | S_IXGRP | - S_IROTH | S_IXOTH | S_IFDIR; - if (parse_options(mnt, raw_data)) - goto out_bad_option; - } - mnt->mounted_uid = current_uid(); - smb_setcodepage(server, &mnt->codepage); - - /* - * Display the enabled options - * Note: smb_proc_getattr uses these in 2.4 (but was changed in 2.2) - */ - if (mnt->flags & SMB_MOUNT_OLDATTR) - printk("SMBFS: Using core getattr (Win 95 speedup)\n"); - else if (mnt->flags & SMB_MOUNT_DIRATTR) - printk("SMBFS: Using dir ff getattr\n"); - - if (smbiod_register_server(server) < 0) { - printk(KERN_ERR "smbfs: failed to start smbiod\n"); - goto out_no_smbiod; - } - - /* - * Keep the super block locked while we get the root inode. 
- */ - smb_init_root_dirent(server, &root, sb); - root_inode = smb_iget(sb, &root); - if (!root_inode) - goto out_no_root; - - sb->s_root = d_alloc_root(root_inode); - if (!sb->s_root) - goto out_no_root; - - smb_new_dentry(sb->s_root); - - return 0; - -out_no_root: - iput(root_inode); -out_no_smbiod: - smb_unload_nls(server); -out_bad_option: - kfree(mem); -out_no_mem: - bdi_destroy(&server->bdi); -out_bdi: - if (!server->mnt) - printk(KERN_ERR "smb_fill_super: allocation failure\n"); - sb->s_fs_info = NULL; - kfree(server); - goto out_fail; -out_wrong_data: - printk(KERN_ERR "smbfs: mount_data version %d is not supported\n", ver); - goto out_fail; -out_no_data: - printk(KERN_ERR "smb_fill_super: missing data argument\n"); -out_fail: - return -EINVAL; -out_no_server: - printk(KERN_ERR "smb_fill_super: cannot allocate struct smb_sb_info\n"); - return -ENOMEM; -} - -static int -smb_statfs(struct dentry *dentry, struct kstatfs *buf) -{ - int result; - - lock_kernel(); - - result = smb_proc_dskattr(dentry, buf); - - unlock_kernel(); - - buf->f_type = SMB_SUPER_MAGIC; - buf->f_namelen = SMB_MAXPATHLEN; - return result; -} - -int smb_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) -{ - int err = smb_revalidate_inode(dentry); - if (!err) - generic_fillattr(dentry->d_inode, stat); - return err; -} - -int -smb_notify_change(struct dentry *dentry, struct iattr *attr) -{ - struct inode *inode = dentry->d_inode; - struct smb_sb_info *server = server_from_dentry(dentry); - unsigned int mask = (S_IFREG | S_IFDIR | S_IRWXUGO); - int error, changed, refresh = 0; - struct smb_fattr fattr; - - lock_kernel(); - - error = smb_revalidate_inode(dentry); - if (error) - goto out; - - if ((error = inode_change_ok(inode, attr)) < 0) - goto out; - - error = -EPERM; - if ((attr->ia_valid & ATTR_UID) && (attr->ia_uid != server->mnt->uid)) - goto out; - - if ((attr->ia_valid & ATTR_GID) && (attr->ia_uid != server->mnt->gid)) - goto out; - - if ((attr->ia_valid & ATTR_MODE) && (attr->ia_mode & ~mask)) - goto out; - - if ((attr->ia_valid & ATTR_SIZE) != 0) { - VERBOSE("changing %s/%s, old size=%ld, new size=%ld\n", - DENTRY_PATH(dentry), - (long) inode->i_size, (long) attr->ia_size); - - filemap_write_and_wait(inode->i_mapping); - - error = smb_open(dentry, O_WRONLY); - if (error) - goto out; - error = server->ops->truncate(inode, attr->ia_size); - if (error) - goto out; - truncate_setsize(inode, attr->ia_size); - refresh = 1; - } - - if (server->opt.capabilities & SMB_CAP_UNIX) { - /* For now we don't want to set the size with setattr_unix */ - attr->ia_valid &= ~ATTR_SIZE; - /* FIXME: only call if we actually want to set something? */ - error = smb_proc_setattr_unix(dentry, attr, 0, 0); - if (!error) - refresh = 1; - - goto out; - } - - /* - * Initialize the fattr and check for changed fields. - * Note: CTIME under SMB is creation time rather than - * change time, so we don't attempt to change it. - */ - smb_get_inode_attr(inode, &fattr); - - changed = 0; - if ((attr->ia_valid & ATTR_MTIME) != 0) { - fattr.f_mtime = attr->ia_mtime; - changed = 1; - } - if ((attr->ia_valid & ATTR_ATIME) != 0) { - fattr.f_atime = attr->ia_atime; - /* Earlier protocols don't have an access time */ - if (server->opt.protocol >= SMB_PROTOCOL_LANMAN2) - changed = 1; - } - if (changed) { - error = smb_proc_settime(dentry, &fattr); - if (error) - goto out; - refresh = 1; - } - - /* - * Check for mode changes ... we're extremely limited in - * what can be set for SMB servers: just the read-only bit. 
- */ - if ((attr->ia_valid & ATTR_MODE) != 0) { - VERBOSE("%s/%s mode change, old=%x, new=%x\n", - DENTRY_PATH(dentry), fattr.f_mode, attr->ia_mode); - changed = 0; - if (attr->ia_mode & S_IWUSR) { - if (fattr.attr & aRONLY) { - fattr.attr &= ~aRONLY; - changed = 1; - } - } else { - if (!(fattr.attr & aRONLY)) { - fattr.attr |= aRONLY; - changed = 1; - } - } - if (changed) { - error = smb_proc_setattr(dentry, &fattr); - if (error) - goto out; - refresh = 1; - } - } - error = 0; - -out: - if (refresh) - smb_refresh_inode(dentry); - unlock_kernel(); - return error; -} - -static int smb_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) -{ - return get_sb_nodev(fs_type, flags, data, smb_fill_super, mnt); -} - -static struct file_system_type smb_fs_type = { - .owner = THIS_MODULE, - .name = "smbfs", - .get_sb = smb_get_sb, - .kill_sb = kill_anon_super, - .fs_flags = FS_BINARY_MOUNTDATA, -}; - -static int __init init_smb_fs(void) -{ - int err; - DEBUG1("registering ...\n"); - - err = init_inodecache(); - if (err) - goto out_inode; - err = smb_init_request_cache(); - if (err) - goto out_request; - err = register_filesystem(&smb_fs_type); - if (err) - goto out; - return 0; -out: - smb_destroy_request_cache(); -out_request: - destroy_inodecache(); -out_inode: - return err; -} - -static void __exit exit_smb_fs(void) -{ - DEBUG1("unregistering ...\n"); - unregister_filesystem(&smb_fs_type); - smb_destroy_request_cache(); - destroy_inodecache(); -} - -module_init(init_smb_fs) -module_exit(exit_smb_fs) -MODULE_LICENSE("GPL"); diff --git a/fs/smbfs/ioctl.c b/fs/smbfs/ioctl.c deleted file mode 100644 index 07215312ad3..00000000000 --- a/fs/smbfs/ioctl.c +++ /dev/null @@ -1,69 +0,0 @@ -/* - * ioctl.c - * - * Copyright (C) 1995, 1996 by Volker Lendecke - * Copyright (C) 1997 by Volker Lendecke - * - * Please add a note about your changes to smbfs in the ChangeLog file. - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include - -#include "proto.h" - -long -smb_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) -{ - struct smb_sb_info *server = server_from_inode(filp->f_path.dentry->d_inode); - struct smb_conn_opt opt; - int result = -EINVAL; - - lock_kernel(); - switch (cmd) { - uid16_t uid16; - uid_t uid32; - case SMB_IOC_GETMOUNTUID: - SET_UID(uid16, server->mnt->mounted_uid); - result = put_user(uid16, (uid16_t __user *) arg); - break; - case SMB_IOC_GETMOUNTUID32: - SET_UID(uid32, server->mnt->mounted_uid); - result = put_user(uid32, (uid_t __user *) arg); - break; - - case SMB_IOC_NEWCONN: - /* arg is smb_conn_opt, or NULL if no connection was made */ - if (!arg) { - result = 0; - smb_lock_server(server); - server->state = CONN_RETRIED; - printk(KERN_ERR "Connection attempt failed! [%d]\n", - server->conn_error); - smbiod_flush(server); - smb_unlock_server(server); - break; - } - - result = -EFAULT; - if (!copy_from_user(&opt, (void __user *)arg, sizeof(opt))) - result = smb_newconn(server, &opt); - break; - default: - break; - } - unlock_kernel(); - - return result; -} diff --git a/fs/smbfs/proc.c b/fs/smbfs/proc.c deleted file mode 100644 index 71c29b6670b..00000000000 --- a/fs/smbfs/proc.c +++ /dev/null @@ -1,3507 +0,0 @@ -/* - * proc.c - * - * Copyright (C) 1995, 1996 by Paal-Kr. Engstad and Volker Lendecke - * Copyright (C) 1997 by Volker Lendecke - * - * Please add a note about your changes to smbfs in the ChangeLog file. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include -#include - -#include "smb_debug.h" -#include "proto.h" -#include "request.h" - - -/* Features. Undefine if they cause problems, this should perhaps be a - config option. */ -#define SMBFS_POSIX_UNLINK 1 - -/* Allow smb_retry to be interrupted. */ -#define SMB_RETRY_INTR - -#define SMB_VWV(packet) ((packet) + SMB_HEADER_LEN) -#define SMB_CMD(packet) (*(packet+8)) -#define SMB_WCT(packet) (*(packet+SMB_HEADER_LEN - 1)) - -#define SMB_DIRINFO_SIZE 43 -#define SMB_STATUS_SIZE 21 - -#define SMB_ST_BLKSIZE (PAGE_SIZE) -#define SMB_ST_BLKSHIFT (PAGE_SHIFT) - -static struct smb_ops smb_ops_core; -static struct smb_ops smb_ops_os2; -static struct smb_ops smb_ops_win95; -static struct smb_ops smb_ops_winNT; -static struct smb_ops smb_ops_unix; -static struct smb_ops smb_ops_null; - -static void -smb_init_dirent(struct smb_sb_info *server, struct smb_fattr *fattr); -static void -smb_finish_dirent(struct smb_sb_info *server, struct smb_fattr *fattr); -static int -smb_proc_getattr_core(struct smb_sb_info *server, struct dentry *dir, - struct smb_fattr *fattr); -static int -smb_proc_getattr_ff(struct smb_sb_info *server, struct dentry *dentry, - struct smb_fattr *fattr); -static int -smb_proc_setattr_core(struct smb_sb_info *server, struct dentry *dentry, - u16 attr); -static int -smb_proc_setattr_ext(struct smb_sb_info *server, - struct inode *inode, struct smb_fattr *fattr); -static int -smb_proc_query_cifsunix(struct smb_sb_info *server); -static void -install_ops(struct smb_ops *dst, struct smb_ops *src); - - -static void -str_upper(char *name, int len) -{ - while (len--) - { - if (*name >= 'a' && *name <= 'z') - *name -= ('a' - 'A'); - name++; - } -} - -#if 0 -static void -str_lower(char *name, int len) -{ - while (len--) - { - if (*name >= 'A' && *name <= 'Z') - *name += ('a' - 'A'); - name++; - } -} -#endif - -/* reverse a string inline. This is used by the dircache walking routines */ -static void reverse_string(char *buf, int len) -{ - char c; - char *end = buf+len-1; - - while(buf < end) { - c = *buf; - *(buf++) = *end; - *(end--) = c; - } -} - -/* no conversion, just a wrapper for memcpy. */ -static int convert_memcpy(unsigned char *output, int olen, - const unsigned char *input, int ilen, - struct nls_table *nls_from, - struct nls_table *nls_to) -{ - if (olen < ilen) - return -ENAMETOOLONG; - memcpy(output, input, ilen); - return ilen; -} - -static inline int write_char(unsigned char ch, char *output, int olen) -{ - if (olen < 4) - return -ENAMETOOLONG; - sprintf(output, ":x%02x", ch); - return 4; -} - -static inline int write_unichar(wchar_t ch, char *output, int olen) -{ - if (olen < 5) - return -ENAMETOOLONG; - sprintf(output, ":%04x", ch); - return 5; -} - -/* convert from one "codepage" to another (possibly being utf8). 
*/ -static int convert_cp(unsigned char *output, int olen, - const unsigned char *input, int ilen, - struct nls_table *nls_from, - struct nls_table *nls_to) -{ - int len = 0; - int n; - wchar_t ch; - - while (ilen > 0) { - /* convert by changing to unicode and back to the new cp */ - n = nls_from->char2uni(input, ilen, &ch); - if (n == -EINVAL) { - ilen--; - n = write_char(*input++, output, olen); - if (n < 0) - goto fail; - output += n; - olen -= n; - len += n; - continue; - } else if (n < 0) - goto fail; - input += n; - ilen -= n; - - n = nls_to->uni2char(ch, output, olen); - if (n == -EINVAL) - n = write_unichar(ch, output, olen); - if (n < 0) - goto fail; - output += n; - olen -= n; - - len += n; - } - return len; -fail: - return n; -} - -/* ----------------------------------------------------------- */ - -/* - * nls_unicode - * - * This encodes/decodes little endian unicode format - */ - -static int uni2char(wchar_t uni, unsigned char *out, int boundlen) -{ - if (boundlen < 2) - return -EINVAL; - *out++ = uni & 0xff; - *out++ = uni >> 8; - return 2; -} - -static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) -{ - if (boundlen < 2) - return -EINVAL; - *uni = (rawstring[1] << 8) | rawstring[0]; - return 2; -} - -static struct nls_table unicode_table = { - .charset = "unicode", - .uni2char = uni2char, - .char2uni = char2uni, -}; - -/* ----------------------------------------------------------- */ - -static int setcodepage(struct nls_table **p, char *name) -{ - struct nls_table *nls; - - if (!name || !*name) { - nls = NULL; - } else if ( (nls = load_nls(name)) == NULL) { - printk (KERN_ERR "smbfs: failed to load nls '%s'\n", name); - return -EINVAL; - } - - /* if already set, unload the previous one. */ - if (*p && *p != &unicode_table) - unload_nls(*p); - *p = nls; - - return 0; -} - -/* Handles all changes to codepage settings. */ -int smb_setcodepage(struct smb_sb_info *server, struct smb_nls_codepage *cp) -{ - int n = 0; - - smb_lock_server(server); - - /* Don't load any nls_* at all, if no remote is requested */ - if (!*cp->remote_name) - goto out; - - /* local */ - n = setcodepage(&server->local_nls, cp->local_name); - if (n != 0) - goto out; - - /* remote */ - if (!strcmp(cp->remote_name, "unicode")) { - server->remote_nls = &unicode_table; - } else { - n = setcodepage(&server->remote_nls, cp->remote_name); - if (n != 0) - setcodepage(&server->local_nls, NULL); - } - -out: - if (server->local_nls != NULL && server->remote_nls != NULL) - server->ops->convert = convert_cp; - else - server->ops->convert = convert_memcpy; - - smb_unlock_server(server); - return n; -} - - -/*****************************************************************************/ -/* */ -/* Encoding/Decoding section */ -/* */ -/*****************************************************************************/ - -static __u8 * -smb_encode_smb_length(__u8 * p, __u32 len) -{ - *p = 0; - *(p+1) = 0; - *(p+2) = (len & 0xFF00) >> 8; - *(p+3) = (len & 0xFF); - if (len > 0xFFFF) - { - *(p+1) = 1; - } - return p + 4; -} - -/* - * smb_build_path: build the path to entry and name storing it in buf. - * The path returned will have the trailing '\0'. 
- */ -static int smb_build_path(struct smb_sb_info *server, unsigned char *buf, - int maxlen, - struct dentry *entry, struct qstr *name) -{ - unsigned char *path = buf; - int len; - int unicode = (server->mnt->flags & SMB_MOUNT_UNICODE) != 0; - - if (maxlen < (2< SMB_MAXPATHLEN + 1) - maxlen = SMB_MAXPATHLEN + 1; - - if (entry == NULL) - goto test_name_and_out; - - /* - * If IS_ROOT, we have to do no walking at all. - */ - if (IS_ROOT(entry) && !name) { - *path++ = '\\'; - if (unicode) *path++ = '\0'; - *path++ = '\0'; - if (unicode) *path++ = '\0'; - return path-buf; - } - - /* - * Build the path string walking the tree backward from end to ROOT - * and store it in reversed order [see reverse_string()] - */ - dget(entry); - spin_lock(&entry->d_lock); - while (!IS_ROOT(entry)) { - struct dentry *parent; - - if (maxlen < (3<d_lock); - dput(entry); - return -ENAMETOOLONG; - } - - len = server->ops->convert(path, maxlen-2, - entry->d_name.name, entry->d_name.len, - server->local_nls, server->remote_nls); - if (len < 0) { - spin_unlock(&entry->d_lock); - dput(entry); - return len; - } - reverse_string(path, len); - path += len; - if (unicode) { - /* Note: reverse order */ - *path++ = '\0'; - maxlen--; - } - *path++ = '\\'; - maxlen -= len+1; - - parent = entry->d_parent; - dget(parent); - spin_unlock(&entry->d_lock); - dput(entry); - entry = parent; - spin_lock(&entry->d_lock); - } - spin_unlock(&entry->d_lock); - dput(entry); - reverse_string(buf, path-buf); - - /* maxlen has space for at least one char */ -test_name_and_out: - if (name) { - if (maxlen < (3<ops->convert(path, maxlen-2, - name->name, name->len, - server->local_nls, server->remote_nls); - if (len < 0) - return len; - path += len; - maxlen -= len+1; - } - /* maxlen has space for at least one char */ - *path++ = '\0'; - if (unicode) *path++ = '\0'; - return path-buf; -} - -static int smb_encode_path(struct smb_sb_info *server, char *buf, int maxlen, - struct dentry *dir, struct qstr *name) -{ - int result; - - result = smb_build_path(server, buf, maxlen, dir, name); - if (result < 0) - goto out; - if (server->opt.protocol <= SMB_PROTOCOL_COREPLUS) - str_upper(buf, result); -out: - return result; -} - -/* encode_path for non-trans2 request SMBs */ -static int smb_simple_encode_path(struct smb_request *req, char **p, - struct dentry * entry, struct qstr * name) -{ - struct smb_sb_info *server = req->rq_server; - char *s = *p; - int res; - int maxlen = ((char *)req->rq_buffer + req->rq_bufsize) - s; - int unicode = (server->mnt->flags & SMB_MOUNT_UNICODE); - - if (!maxlen) - return -ENAMETOOLONG; - *s++ = 4; /* ASCII data format */ - - /* - * SMB Unicode strings must be 16bit aligned relative the start of the - * packet. If they are not they must be padded with 0. - */ - if (unicode) { - int align = s - (char *)req->rq_buffer; - if (!(align & 1)) { - *s++ = '\0'; - maxlen--; - } - } - - res = smb_encode_path(server, s, maxlen-1, entry, name); - if (res < 0) - return res; - *p = s + res; - return 0; -} - -/* The following are taken directly from msdos-fs */ - -/* Linear day numbers of the respective 1sts in non-leap years. 
*/ - -static int day_n[] = -{0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 0, 0, 0, 0}; - /* JanFebMarApr May Jun Jul Aug Sep Oct Nov Dec */ - - -static time_t -utc2local(struct smb_sb_info *server, time_t time) -{ - return time - server->opt.serverzone*60; -} - -static time_t -local2utc(struct smb_sb_info *server, time_t time) -{ - return time + server->opt.serverzone*60; -} - -/* Convert a MS-DOS time/date pair to a UNIX date (seconds since 1 1 70). */ - -static time_t -date_dos2unix(struct smb_sb_info *server, __u16 date, __u16 time) -{ - int month, year; - time_t secs; - - /* first subtract and mask after that... Otherwise, if - date == 0, bad things happen */ - month = ((date >> 5) - 1) & 15; - year = date >> 9; - secs = (time & 31) * 2 + 60 * ((time >> 5) & 63) + (time >> 11) * 3600 + 86400 * - ((date & 31) - 1 + day_n[month] + (year / 4) + year * 365 - ((year & 3) == 0 && - month < 2 ? 1 : 0) + 3653); - /* days since 1.1.70 plus 80's leap day */ - return local2utc(server, secs); -} - - -/* Convert linear UNIX date to a MS-DOS time/date pair. */ - -static void -date_unix2dos(struct smb_sb_info *server, - int unix_date, __u16 *date, __u16 *time) -{ - int day, year, nl_day, month; - - unix_date = utc2local(server, unix_date); - if (unix_date < 315532800) - unix_date = 315532800; - - *time = (unix_date % 60) / 2 + - (((unix_date / 60) % 60) << 5) + - (((unix_date / 3600) % 24) << 11); - - day = unix_date / 86400 - 3652; - year = day / 365; - if ((year + 3) / 4 + 365 * year > day) - year--; - day -= (year + 3) / 4 + 365 * year; - if (day == 59 && !(year & 3)) { - nl_day = day; - month = 2; - } else { - nl_day = (year & 3) || day <= 59 ? day : day - 1; - for (month = 1; month < 12; month++) - if (day_n[month] > nl_day) - break; - } - *date = nl_day - day_n[month - 1] + 1 + (month << 5) + (year << 9); -} - -/* The following are taken from fs/ntfs/util.c */ - -#define NTFS_TIME_OFFSET ((u64)(369*365 + 89) * 24 * 3600 * 10000000) - -/* - * Convert the NT UTC (based 1601-01-01, in hundred nanosecond units) - * into Unix UTC (based 1970-01-01, in seconds). - */ -static struct timespec -smb_ntutc2unixutc(u64 ntutc) -{ - struct timespec ts; - /* FIXME: what about the timezone difference? */ - /* Subtract the NTFS time offset, then convert to 1s intervals. */ - u64 t = ntutc - NTFS_TIME_OFFSET; - ts.tv_nsec = do_div(t, 10000000) * 100; - ts.tv_sec = t; - return ts; -} - -/* Convert the Unix UTC into NT time */ -static u64 -smb_unixutc2ntutc(struct timespec ts) -{ - /* Note: timezone conversion is probably wrong. 
*/ - /* return ((u64)utc2local(server, t)) * 10000000 + NTFS_TIME_OFFSET; */ - return ((u64)ts.tv_sec) * 10000000 + ts.tv_nsec/100 + NTFS_TIME_OFFSET; -} - -#define MAX_FILE_MODE 6 -static mode_t file_mode[] = { - S_IFREG, S_IFDIR, S_IFLNK, S_IFCHR, S_IFBLK, S_IFIFO, S_IFSOCK -}; - -static int smb_filetype_to_mode(u32 filetype) -{ - if (filetype > MAX_FILE_MODE) { - PARANOIA("Filetype out of range: %d\n", filetype); - return S_IFREG; - } - return file_mode[filetype]; -} - -static u32 smb_filetype_from_mode(int mode) -{ - if (S_ISREG(mode)) - return UNIX_TYPE_FILE; - if (S_ISDIR(mode)) - return UNIX_TYPE_DIR; - if (S_ISLNK(mode)) - return UNIX_TYPE_SYMLINK; - if (S_ISCHR(mode)) - return UNIX_TYPE_CHARDEV; - if (S_ISBLK(mode)) - return UNIX_TYPE_BLKDEV; - if (S_ISFIFO(mode)) - return UNIX_TYPE_FIFO; - if (S_ISSOCK(mode)) - return UNIX_TYPE_SOCKET; - return UNIX_TYPE_UNKNOWN; -} - - -/*****************************************************************************/ -/* */ -/* Support section. */ -/* */ -/*****************************************************************************/ - -__u32 -smb_len(__u8 * p) -{ - return ((*(p+1) & 0x1) << 16L) | (*(p+2) << 8L) | *(p+3); -} - -static __u16 -smb_bcc(__u8 * packet) -{ - int pos = SMB_HEADER_LEN + SMB_WCT(packet) * sizeof(__u16); - return WVAL(packet, pos); -} - -/* smb_valid_packet: We check if packet fulfills the basic - requirements of a smb packet */ - -static int -smb_valid_packet(__u8 * packet) -{ - return (packet[4] == 0xff - && packet[5] == 'S' - && packet[6] == 'M' - && packet[7] == 'B' - && (smb_len(packet) + 4 == SMB_HEADER_LEN - + SMB_WCT(packet) * 2 + smb_bcc(packet))); -} - -/* smb_verify: We check if we got the answer we expected, and if we - got enough data. If bcc == -1, we don't care. */ - -static int -smb_verify(__u8 * packet, int command, int wct, int bcc) -{ - if (SMB_CMD(packet) != command) - goto bad_command; - if (SMB_WCT(packet) < wct) - goto bad_wct; - if (bcc != -1 && smb_bcc(packet) < bcc) - goto bad_bcc; - return 0; - -bad_command: - printk(KERN_ERR "smb_verify: command=%x, SMB_CMD=%x??\n", - command, SMB_CMD(packet)); - goto fail; -bad_wct: - printk(KERN_ERR "smb_verify: command=%x, wct=%d, SMB_WCT=%d??\n", - command, wct, SMB_WCT(packet)); - goto fail; -bad_bcc: - printk(KERN_ERR "smb_verify: command=%x, bcc=%d, SMB_BCC=%d??\n", - command, bcc, smb_bcc(packet)); -fail: - return -EIO; -} - -/* - * Returns the maximum read or write size for the "payload". Making all of the - * packet fit within the negotiated max_xmit size. - * - * N.B. Since this value is usually computed before locking the server, - * the server's packet size must never be decreased! - */ -static inline int -smb_get_xmitsize(struct smb_sb_info *server, int overhead) -{ - return server->opt.max_xmit - overhead; -} - -/* - * Calculate the maximum read size - */ -int -smb_get_rsize(struct smb_sb_info *server) -{ - /* readX has 12 parameters, read has 5 */ - int overhead = SMB_HEADER_LEN + 12 * sizeof(__u16) + 2 + 1 + 2; - int size = smb_get_xmitsize(server, overhead); - - VERBOSE("xmit=%d, size=%d\n", server->opt.max_xmit, size); - - return size; -} - -/* - * Calculate the maximum write size - */ -int -smb_get_wsize(struct smb_sb_info *server) -{ - /* writeX has 14 parameters, write has 5 */ - int overhead = SMB_HEADER_LEN + 14 * sizeof(__u16) + 2 + 1 + 2; - int size = smb_get_xmitsize(server, overhead); - - VERBOSE("xmit=%d, size=%d\n", server->opt.max_xmit, size); - - return size; -} - -/* - * Convert SMB error codes to -E... errno values. 
- */ -int -smb_errno(struct smb_request *req) -{ - int errcls = req->rq_rcls; - int error = req->rq_err; - char *class = "Unknown"; - - VERBOSE("errcls %d code %d from command 0x%x\n", - errcls, error, SMB_CMD(req->rq_header)); - - if (errcls == ERRDOS) { - switch (error) { - case ERRbadfunc: - return -EINVAL; - case ERRbadfile: - case ERRbadpath: - return -ENOENT; - case ERRnofids: - return -EMFILE; - case ERRnoaccess: - return -EACCES; - case ERRbadfid: - return -EBADF; - case ERRbadmcb: - return -EREMOTEIO; - case ERRnomem: - return -ENOMEM; - case ERRbadmem: - return -EFAULT; - case ERRbadenv: - case ERRbadformat: - return -EREMOTEIO; - case ERRbadaccess: - return -EACCES; - case ERRbaddata: - return -E2BIG; - case ERRbaddrive: - return -ENXIO; - case ERRremcd: - return -EREMOTEIO; - case ERRdiffdevice: - return -EXDEV; - case ERRnofiles: - return -ENOENT; - case ERRbadshare: - return -ETXTBSY; - case ERRlock: - return -EDEADLK; - case ERRfilexists: - return -EEXIST; - case ERROR_INVALID_PARAMETER: - return -EINVAL; - case ERROR_DISK_FULL: - return -ENOSPC; - case ERROR_INVALID_NAME: - return -ENOENT; - case ERROR_DIR_NOT_EMPTY: - return -ENOTEMPTY; - case ERROR_NOT_LOCKED: - return -ENOLCK; - case ERROR_ALREADY_EXISTS: - return -EEXIST; - default: - class = "ERRDOS"; - goto err_unknown; - } - } else if (errcls == ERRSRV) { - switch (error) { - /* N.B. This is wrong ... EIO ? */ - case ERRerror: - return -ENFILE; - case ERRbadpw: - return -EINVAL; - case ERRbadtype: - case ERRtimeout: - return -EIO; - case ERRaccess: - return -EACCES; - /* - * This is a fatal error, as it means the "tree ID" - * for this connection is no longer valid. We map - * to a special error code and get a new connection. - */ - case ERRinvnid: - return -EBADSLT; - default: - class = "ERRSRV"; - goto err_unknown; - } - } else if (errcls == ERRHRD) { - switch (error) { - case ERRnowrite: - return -EROFS; - case ERRbadunit: - return -ENODEV; - case ERRnotready: - return -EUCLEAN; - case ERRbadcmd: - case ERRdata: - return -EIO; - case ERRbadreq: - return -ERANGE; - case ERRbadshare: - return -ETXTBSY; - case ERRlock: - return -EDEADLK; - case ERRdiskfull: - return -ENOSPC; - default: - class = "ERRHRD"; - goto err_unknown; - } - } else if (errcls == ERRCMD) { - class = "ERRCMD"; - } else if (errcls == SUCCESS) { - return 0; /* This is the only valid 0 return */ - } - -err_unknown: - printk(KERN_ERR "smb_errno: class %s, code %d from command 0x%x\n", - class, error, SMB_CMD(req->rq_header)); - return -EIO; -} - -/* smb_request_ok: We expect the server to be locked. Then we do the - request and check the answer completely. When smb_request_ok - returns 0, you can be quite sure that everything went well. When - the answer is <=0, the returned number is a valid unix errno. */ - -static int -smb_request_ok(struct smb_request *req, int command, int wct, int bcc) -{ - int result; - - req->rq_resp_wct = wct; - req->rq_resp_bcc = bcc; - - result = smb_add_request(req); - if (result != 0) { - DEBUG1("smb_request failed\n"); - goto out; - } - - if (smb_valid_packet(req->rq_header) != 0) { - PARANOIA("invalid packet!\n"); - goto out; - } - - result = smb_verify(req->rq_header, command, wct, bcc); - -out: - return result; -} - -/* - * This implements the NEWCONN ioctl. It installs the server pid, - * sets server->state to CONN_VALID, and wakes up the waiting process. 
- */ -int -smb_newconn(struct smb_sb_info *server, struct smb_conn_opt *opt) -{ - struct file *filp; - struct sock *sk; - int error; - - VERBOSE("fd=%d, pid=%d\n", opt->fd, current->pid); - - smb_lock_server(server); - - /* - * Make sure we don't already have a valid connection ... - */ - error = -EINVAL; - if (server->state == CONN_VALID) - goto out; - - error = -EACCES; - if (current_uid() != server->mnt->mounted_uid && - !capable(CAP_SYS_ADMIN)) - goto out; - - error = -EBADF; - filp = fget(opt->fd); - if (!filp) - goto out; - if (!smb_valid_socket(filp->f_path.dentry->d_inode)) - goto out_putf; - - server->sock_file = filp; - server->conn_pid = get_pid(task_pid(current)); - server->opt = *opt; - server->generation += 1; - server->state = CONN_VALID; - error = 0; - - if (server->conn_error) { - /* - * conn_error is the returncode we originally decided to - * drop the old connection on. This message should be positive - * and not make people ask questions on why smbfs is printing - * error messages ... - */ - printk(KERN_INFO "SMB connection re-established (%d)\n", - server->conn_error); - server->conn_error = 0; - } - - /* - * Store the server in sock user_data (Only used by sunrpc) - */ - sk = SOCKET_I(filp->f_path.dentry->d_inode)->sk; - sk->sk_user_data = server; - - /* chain into the data_ready callback */ - server->data_ready = xchg(&sk->sk_data_ready, smb_data_ready); - - /* check if we have an old smbmount that uses seconds for the - serverzone */ - if (server->opt.serverzone > 12*60 || server->opt.serverzone < -12*60) - server->opt.serverzone /= 60; - - /* now that we have an established connection we can detect the server - type and enable bug workarounds */ - if (server->opt.protocol < SMB_PROTOCOL_LANMAN2) - install_ops(server->ops, &smb_ops_core); - else if (server->opt.protocol == SMB_PROTOCOL_LANMAN2) - install_ops(server->ops, &smb_ops_os2); - else if (server->opt.protocol == SMB_PROTOCOL_NT1 && - (server->opt.max_xmit < 0x1000) && - !(server->opt.capabilities & SMB_CAP_NT_SMBS)) { - /* FIXME: can we kill the WIN95 flag now? */ - server->mnt->flags |= SMB_MOUNT_WIN95; - VERBOSE("detected WIN95 server\n"); - install_ops(server->ops, &smb_ops_win95); - } else { - /* - * Samba has max_xmit 65535 - * NT4spX has max_xmit 4536 (or something like that) - * win2k has ... - */ - VERBOSE("detected NT1 (Samba, NT4/5) server\n"); - install_ops(server->ops, &smb_ops_winNT); - } - - /* FIXME: the win9x code wants to modify these ... (seek/trunc bug) */ - if (server->mnt->flags & SMB_MOUNT_OLDATTR) { - server->ops->getattr = smb_proc_getattr_core; - } else if (server->mnt->flags & SMB_MOUNT_DIRATTR) { - server->ops->getattr = smb_proc_getattr_ff; - } - - /* Decode server capabilities */ - if (server->opt.capabilities & SMB_CAP_LARGE_FILES) { - /* Should be ok to set this now, as no one can access the - mount until the connection has been established. */ - SB_of(server)->s_maxbytes = ~0ULL >> 1; - VERBOSE("LFS enabled\n"); - } - if (server->opt.capabilities & SMB_CAP_UNICODE) { - server->mnt->flags |= SMB_MOUNT_UNICODE; - VERBOSE("Unicode enabled\n"); - } else { - server->mnt->flags &= ~SMB_MOUNT_UNICODE; - } -#if 0 - /* flags we may test for other patches ... 
*/ - if (server->opt.capabilities & SMB_CAP_LARGE_READX) { - VERBOSE("Large reads enabled\n"); - } - if (server->opt.capabilities & SMB_CAP_LARGE_WRITEX) { - VERBOSE("Large writes enabled\n"); - } -#endif - if (server->opt.capabilities & SMB_CAP_UNIX) { - struct inode *inode; - VERBOSE("Using UNIX CIFS extensions\n"); - install_ops(server->ops, &smb_ops_unix); - inode = SB_of(server)->s_root->d_inode; - if (inode) - inode->i_op = &smb_dir_inode_operations_unix; - } - - VERBOSE("protocol=%d, max_xmit=%d, pid=%d capabilities=0x%x\n", - server->opt.protocol, server->opt.max_xmit, - pid_nr(server->conn_pid), server->opt.capabilities); - - /* FIXME: this really should be done by smbmount. */ - if (server->opt.max_xmit > SMB_MAX_PACKET_SIZE) { - server->opt.max_xmit = SMB_MAX_PACKET_SIZE; - } - - smb_unlock_server(server); - smbiod_wake_up(); - if (server->opt.capabilities & SMB_CAP_UNIX) - smb_proc_query_cifsunix(server); - - server->conn_complete++; - wake_up_interruptible_all(&server->conn_wq); - return error; - -out: - smb_unlock_server(server); - smbiod_wake_up(); - return error; - -out_putf: - fput(filp); - goto out; -} - -/* smb_setup_header: We completely set up the packet. You only have to - insert the command-specific fields */ - -__u8 * -smb_setup_header(struct smb_request *req, __u8 command, __u16 wct, __u16 bcc) -{ - __u32 xmit_len = SMB_HEADER_LEN + wct * sizeof(__u16) + bcc + 2; - __u8 *p = req->rq_header; - struct smb_sb_info *server = req->rq_server; - - p = smb_encode_smb_length(p, xmit_len - 4); - - *p++ = 0xff; - *p++ = 'S'; - *p++ = 'M'; - *p++ = 'B'; - *p++ = command; - - memset(p, '\0', 19); - p += 19; - p += 8; - - if (server->opt.protocol > SMB_PROTOCOL_CORE) { - int flags = SMB_FLAGS_CASELESS_PATHNAMES; - int flags2 = SMB_FLAGS2_LONG_PATH_COMPONENTS | - SMB_FLAGS2_EXTENDED_ATTRIBUTES; /* EA? not really ... */ - - *(req->rq_header + smb_flg) = flags; - if (server->mnt->flags & SMB_MOUNT_UNICODE) - flags2 |= SMB_FLAGS2_UNICODE_STRINGS; - WSET(req->rq_header, smb_flg2, flags2); - } - *p++ = wct; /* wct */ - p += 2 * wct; - WSET(p, 0, bcc); - - /* Include the header in the data to send */ - req->rq_iovlen = 1; - req->rq_iov[0].iov_base = req->rq_header; - req->rq_iov[0].iov_len = xmit_len - bcc; - - return req->rq_buffer; -} - -static void -smb_setup_bcc(struct smb_request *req, __u8 *p) -{ - u16 bcc = p - req->rq_buffer; - u8 *pbcc = req->rq_header + SMB_HEADER_LEN + 2*SMB_WCT(req->rq_header); - - WSET(pbcc, 0, bcc); - - smb_encode_smb_length(req->rq_header, SMB_HEADER_LEN + - 2*SMB_WCT(req->rq_header) - 2 + bcc); - - /* Include the "bytes" in the data to send */ - req->rq_iovlen = 2; - req->rq_iov[1].iov_base = req->rq_buffer; - req->rq_iov[1].iov_len = bcc; -} - -static int -smb_proc_seek(struct smb_sb_info *server, __u16 fileid, - __u16 mode, off_t offset) -{ - int result; - struct smb_request *req; - - result = -ENOMEM; - if (! 
(req = smb_alloc_request(server, 0))) - goto out; - - smb_setup_header(req, SMBlseek, 4, 0); - WSET(req->rq_header, smb_vwv0, fileid); - WSET(req->rq_header, smb_vwv1, mode); - DSET(req->rq_header, smb_vwv2, offset); - req->rq_flags |= SMB_REQ_NORETRY; - - result = smb_request_ok(req, SMBlseek, 2, 0); - if (result < 0) { - result = 0; - goto out_free; - } - - result = DVAL(req->rq_header, smb_vwv0); -out_free: - smb_rput(req); -out: - return result; -} - -static int -smb_proc_open(struct smb_sb_info *server, struct dentry *dentry, int wish) -{ - struct inode *ino = dentry->d_inode; - struct smb_inode_info *ei = SMB_I(ino); - int mode, read_write = 0x42, read_only = 0x40; - int res; - char *p; - struct smb_request *req; - - /* - * Attempt to open r/w, unless there are no write privileges. - */ - mode = read_write; - if (!(ino->i_mode & (S_IWUSR | S_IWGRP | S_IWOTH))) - mode = read_only; -#if 0 - /* FIXME: why is this code not in? below we fix it so that a caller - wanting RO doesn't get RW. smb_revalidate_inode does some - optimization based on access mode. tail -f needs it to be correct. - - We must open rw since we don't do the open if called a second time - with different 'wish'. Is that not supported by smb servers? */ - if (!(wish & (O_WRONLY | O_RDWR))) - mode = read_only; -#endif - - res = -ENOMEM; - if (! (req = smb_alloc_request(server, PAGE_SIZE))) - goto out; - - retry: - p = smb_setup_header(req, SMBopen, 2, 0); - WSET(req->rq_header, smb_vwv0, mode); - WSET(req->rq_header, smb_vwv1, aSYSTEM | aHIDDEN | aDIR); - res = smb_simple_encode_path(req, &p, dentry, NULL); - if (res < 0) - goto out_free; - smb_setup_bcc(req, p); - - res = smb_request_ok(req, SMBopen, 7, 0); - if (res != 0) { - if (mode == read_write && - (res == -EACCES || res == -ETXTBSY || res == -EROFS)) - { - VERBOSE("%s/%s R/W failed, error=%d, retrying R/O\n", - DENTRY_PATH(dentry), res); - mode = read_only; - req->rq_flags = 0; - goto retry; - } - goto out_free; - } - /* We should now have data in vwv[0..6]. */ - - ei->fileid = WVAL(req->rq_header, smb_vwv0); - ei->attr = WVAL(req->rq_header, smb_vwv1); - /* smb_vwv2 has mtime */ - /* smb_vwv4 has size */ - ei->access = (WVAL(req->rq_header, smb_vwv6) & SMB_ACCMASK); - ei->open = server->generation; - -out_free: - smb_rput(req); -out: - return res; -} - -/* - * Make sure the file is open, and check that the access - * is compatible with the desired access. - */ -int -smb_open(struct dentry *dentry, int wish) -{ - struct inode *inode = dentry->d_inode; - int result; - __u16 access; - - result = -ENOENT; - if (!inode) { - printk(KERN_ERR "smb_open: no inode for dentry %s/%s\n", - DENTRY_PATH(dentry)); - goto out; - } - - if (!smb_is_open(inode)) { - struct smb_sb_info *server = server_from_inode(inode); - result = 0; - if (!smb_is_open(inode)) - result = smb_proc_open(server, dentry, wish); - if (result) - goto out; - /* - * A successful open means the path is still valid ... - */ - smb_renew_times(dentry); - } - - /* - * Check whether the access is compatible with the desired mode. - */ - result = 0; - access = SMB_I(inode)->access; - if (access != wish && access != SMB_O_RDWR) { - PARANOIA("%s/%s access denied, access=%x, wish=%x\n", - DENTRY_PATH(dentry), access, wish); - result = -EACCES; - } -out: - return result; -} - -static int -smb_proc_close(struct smb_sb_info *server, __u16 fileid, __u32 mtime) -{ - struct smb_request *req; - int result = -ENOMEM; - - if (! 
(req = smb_alloc_request(server, 0))) - goto out; - - smb_setup_header(req, SMBclose, 3, 0); - WSET(req->rq_header, smb_vwv0, fileid); - DSET(req->rq_header, smb_vwv1, utc2local(server, mtime)); - req->rq_flags |= SMB_REQ_NORETRY; - result = smb_request_ok(req, SMBclose, 0, 0); - - smb_rput(req); -out: - return result; -} - -/* - * Win NT 4.0 has an apparent bug in that it fails to update the - * modify time when writing to a file. As a workaround, we update - * both modify and access time locally, and post the times to the - * server when closing the file. - */ -static int -smb_proc_close_inode(struct smb_sb_info *server, struct inode * ino) -{ - struct smb_inode_info *ei = SMB_I(ino); - int result = 0; - if (smb_is_open(ino)) - { - /* - * We clear the open flag in advance, in case another - * process observes the value while we block below. - */ - ei->open = 0; - - /* - * Kludge alert: SMB timestamps are accurate only to - * two seconds ... round the times to avoid needless - * cache invalidations! - */ - if (ino->i_mtime.tv_sec & 1) { - ino->i_mtime.tv_sec--; - ino->i_mtime.tv_nsec = 0; - } - if (ino->i_atime.tv_sec & 1) { - ino->i_atime.tv_sec--; - ino->i_atime.tv_nsec = 0; - } - /* - * If the file is open with write permissions, - * update the time stamps to sync mtime and atime. - */ - if ((server->opt.capabilities & SMB_CAP_UNIX) == 0 && - (server->opt.protocol >= SMB_PROTOCOL_LANMAN2) && - !(ei->access == SMB_O_RDONLY)) - { - struct smb_fattr fattr; - smb_get_inode_attr(ino, &fattr); - smb_proc_setattr_ext(server, ino, &fattr); - } - - result = smb_proc_close(server, ei->fileid, ino->i_mtime.tv_sec); - /* - * Force a revalidation after closing ... some servers - * don't post the size until the file has been closed. - */ - if (server->opt.protocol < SMB_PROTOCOL_NT1) - ei->oldmtime = 0; - ei->closed = jiffies; - } - return result; -} - -int -smb_close(struct inode *ino) -{ - int result = 0; - - if (smb_is_open(ino)) { - struct smb_sb_info *server = server_from_inode(ino); - result = smb_proc_close_inode(server, ino); - } - return result; -} - -/* - * This is used to close a file following a failed instantiate. - * Since we don't have an inode, we can't use any of the above. - */ -int -smb_close_fileid(struct dentry *dentry, __u16 fileid) -{ - struct smb_sb_info *server = server_from_dentry(dentry); - int result; - - result = smb_proc_close(server, fileid, get_seconds()); - return result; -} - -/* In smb_proc_read and smb_proc_write we do not retry, because the - file-id would not be valid after a reconnection. */ - -static void -smb_proc_read_data(struct smb_request *req) -{ - req->rq_iov[0].iov_base = req->rq_buffer; - req->rq_iov[0].iov_len = 3; - - req->rq_iov[1].iov_base = req->rq_page; - req->rq_iov[1].iov_len = req->rq_rsize; - req->rq_iovlen = 2; - - req->rq_rlen = smb_len(req->rq_header) + 4 - req->rq_bytes_recvd; -} - -static int -smb_proc_read(struct inode *inode, loff_t offset, int count, char *data) -{ - struct smb_sb_info *server = server_from_inode(inode); - __u16 returned_count, data_len; - unsigned char *buf; - int result; - struct smb_request *req; - u8 rbuf[4]; - - result = -ENOMEM; - if (! 
(req = smb_alloc_request(server, 0))) - goto out; - - smb_setup_header(req, SMBread, 5, 0); - buf = req->rq_header; - WSET(buf, smb_vwv0, SMB_I(inode)->fileid); - WSET(buf, smb_vwv1, count); - DSET(buf, smb_vwv2, offset); - WSET(buf, smb_vwv4, 0); - - req->rq_page = data; - req->rq_rsize = count; - req->rq_callback = smb_proc_read_data; - req->rq_buffer = rbuf; - req->rq_flags |= SMB_REQ_NORETRY | SMB_REQ_STATIC; - - result = smb_request_ok(req, SMBread, 5, -1); - if (result < 0) - goto out_free; - returned_count = WVAL(req->rq_header, smb_vwv0); - - data_len = WVAL(rbuf, 1); - - if (returned_count != data_len) { - printk(KERN_NOTICE "smb_proc_read: returned != data_len\n"); - printk(KERN_NOTICE "smb_proc_read: ret_c=%d, data_len=%d\n", - returned_count, data_len); - } - result = data_len; - -out_free: - smb_rput(req); -out: - VERBOSE("ino=%ld, fileid=%d, count=%d, result=%d\n", - inode->i_ino, SMB_I(inode)->fileid, count, result); - return result; -} - -static int -smb_proc_write(struct inode *inode, loff_t offset, int count, const char *data) -{ - struct smb_sb_info *server = server_from_inode(inode); - int result; - u16 fileid = SMB_I(inode)->fileid; - u8 buf[4]; - struct smb_request *req; - - result = -ENOMEM; - if (! (req = smb_alloc_request(server, 0))) - goto out; - - VERBOSE("ino=%ld, fileid=%d, count=%d@%Ld\n", - inode->i_ino, fileid, count, offset); - - smb_setup_header(req, SMBwrite, 5, count + 3); - WSET(req->rq_header, smb_vwv0, fileid); - WSET(req->rq_header, smb_vwv1, count); - DSET(req->rq_header, smb_vwv2, offset); - WSET(req->rq_header, smb_vwv4, 0); - - buf[0] = 1; - WSET(buf, 1, count); /* yes, again ... */ - req->rq_iov[1].iov_base = buf; - req->rq_iov[1].iov_len = 3; - req->rq_iov[2].iov_base = (char *) data; - req->rq_iov[2].iov_len = count; - req->rq_iovlen = 3; - req->rq_flags |= SMB_REQ_NORETRY; - - result = smb_request_ok(req, SMBwrite, 1, 0); - if (result >= 0) - result = WVAL(req->rq_header, smb_vwv0); - - smb_rput(req); -out: - return result; -} - -/* - * In smb_proc_readX and smb_proc_writeX we do not retry, because the - * file-id would not be valid after a reconnection. - */ - -#define SMB_READX_MAX_PAD 64 -static void -smb_proc_readX_data(struct smb_request *req) -{ - /* header length, excluding the netbios length (-4) */ - int hdrlen = SMB_HEADER_LEN + req->rq_resp_wct*2 - 2; - int data_off = WVAL(req->rq_header, smb_vwv6); - - /* - * Some genius made the padding to the data bytes arbitrary. - * So we must first calculate the amount of padding used by the server. - */ - data_off -= hdrlen; - if (data_off > SMB_READX_MAX_PAD || data_off < 0) { - PARANOIA("offset is larger than SMB_READX_MAX_PAD or negative!\n"); - PARANOIA("%d > %d || %d < 0\n", data_off, SMB_READX_MAX_PAD, data_off); - req->rq_rlen = req->rq_bufsize + 1; - return; - } - req->rq_iov[0].iov_base = req->rq_buffer; - req->rq_iov[0].iov_len = data_off; - - req->rq_iov[1].iov_base = req->rq_page; - req->rq_iov[1].iov_len = req->rq_rsize; - req->rq_iovlen = 2; - - req->rq_rlen = smb_len(req->rq_header) + 4 - req->rq_bytes_recvd; -} - -static int -smb_proc_readX(struct inode *inode, loff_t offset, int count, char *data) -{ - struct smb_sb_info *server = server_from_inode(inode); - unsigned char *buf; - int result; - struct smb_request *req; - static char pad[SMB_READX_MAX_PAD]; - - result = -ENOMEM; - if (! 
(req = smb_alloc_request(server, 0))) - goto out; - - smb_setup_header(req, SMBreadX, 12, 0); - buf = req->rq_header; - WSET(buf, smb_vwv0, 0x00ff); - WSET(buf, smb_vwv1, 0); - WSET(buf, smb_vwv2, SMB_I(inode)->fileid); - DSET(buf, smb_vwv3, (u32)offset); /* low 32 bits */ - WSET(buf, smb_vwv5, count); - WSET(buf, smb_vwv6, 0); - DSET(buf, smb_vwv7, 0); - WSET(buf, smb_vwv9, 0); - DSET(buf, smb_vwv10, (u32)(offset >> 32)); /* high 32 bits */ - WSET(buf, smb_vwv11, 0); - - req->rq_page = data; - req->rq_rsize = count; - req->rq_callback = smb_proc_readX_data; - req->rq_buffer = pad; - req->rq_bufsize = SMB_READX_MAX_PAD; - req->rq_flags |= SMB_REQ_STATIC | SMB_REQ_NORETRY; - - result = smb_request_ok(req, SMBreadX, 12, -1); - if (result < 0) - goto out_free; - result = WVAL(req->rq_header, smb_vwv5); - -out_free: - smb_rput(req); -out: - VERBOSE("ino=%ld, fileid=%d, count=%d, result=%d\n", - inode->i_ino, SMB_I(inode)->fileid, count, result); - return result; -} - -static int -smb_proc_writeX(struct inode *inode, loff_t offset, int count, const char *data) -{ - struct smb_sb_info *server = server_from_inode(inode); - int result; - u8 *p; - static u8 pad[4]; - struct smb_request *req; - - result = -ENOMEM; - if (! (req = smb_alloc_request(server, 0))) - goto out; - - VERBOSE("ino=%ld, fileid=%d, count=%d@%Ld\n", - inode->i_ino, SMB_I(inode)->fileid, count, offset); - - p = smb_setup_header(req, SMBwriteX, 14, count + 1); - WSET(req->rq_header, smb_vwv0, 0x00ff); - WSET(req->rq_header, smb_vwv1, 0); - WSET(req->rq_header, smb_vwv2, SMB_I(inode)->fileid); - DSET(req->rq_header, smb_vwv3, (u32)offset); /* low 32 bits */ - DSET(req->rq_header, smb_vwv5, 0); - WSET(req->rq_header, smb_vwv7, 0); /* write mode */ - WSET(req->rq_header, smb_vwv8, 0); - WSET(req->rq_header, smb_vwv9, 0); - WSET(req->rq_header, smb_vwv10, count); /* data length */ - WSET(req->rq_header, smb_vwv11, smb_vwv12 + 2 + 1); - DSET(req->rq_header, smb_vwv12, (u32)(offset >> 32)); - - req->rq_iov[1].iov_base = pad; - req->rq_iov[1].iov_len = 1; - req->rq_iov[2].iov_base = (char *) data; - req->rq_iov[2].iov_len = count; - req->rq_iovlen = 3; - req->rq_flags |= SMB_REQ_NORETRY; - - result = smb_request_ok(req, SMBwriteX, 6, 0); - if (result >= 0) - result = WVAL(req->rq_header, smb_vwv2); - - smb_rput(req); -out: - return result; -} - -int -smb_proc_create(struct dentry *dentry, __u16 attr, time_t ctime, __u16 *fileid) -{ - struct smb_sb_info *server = server_from_dentry(dentry); - char *p; - int result; - struct smb_request *req; - - result = -ENOMEM; - if (! (req = smb_alloc_request(server, PAGE_SIZE))) - goto out; - - p = smb_setup_header(req, SMBcreate, 3, 0); - WSET(req->rq_header, smb_vwv0, attr); - DSET(req->rq_header, smb_vwv1, utc2local(server, ctime)); - result = smb_simple_encode_path(req, &p, dentry, NULL); - if (result < 0) - goto out_free; - smb_setup_bcc(req, p); - - result = smb_request_ok(req, SMBcreate, 1, 0); - if (result < 0) - goto out_free; - - *fileid = WVAL(req->rq_header, smb_vwv0); - result = 0; - -out_free: - smb_rput(req); -out: - return result; -} - -int -smb_proc_mv(struct dentry *old_dentry, struct dentry *new_dentry) -{ - struct smb_sb_info *server = server_from_dentry(old_dentry); - char *p; - int result; - struct smb_request *req; - - result = -ENOMEM; - if (! 
(req = smb_alloc_request(server, PAGE_SIZE))) - goto out; - - p = smb_setup_header(req, SMBmv, 1, 0); - WSET(req->rq_header, smb_vwv0, aSYSTEM | aHIDDEN | aDIR); - result = smb_simple_encode_path(req, &p, old_dentry, NULL); - if (result < 0) - goto out_free; - result = smb_simple_encode_path(req, &p, new_dentry, NULL); - if (result < 0) - goto out_free; - smb_setup_bcc(req, p); - - if ((result = smb_request_ok(req, SMBmv, 0, 0)) < 0) - goto out_free; - result = 0; - -out_free: - smb_rput(req); -out: - return result; -} - -/* - * Code common to mkdir and rmdir. - */ -static int -smb_proc_generic_command(struct dentry *dentry, __u8 command) -{ - struct smb_sb_info *server = server_from_dentry(dentry); - char *p; - int result; - struct smb_request *req; - - result = -ENOMEM; - if (! (req = smb_alloc_request(server, PAGE_SIZE))) - goto out; - - p = smb_setup_header(req, command, 0, 0); - result = smb_simple_encode_path(req, &p, dentry, NULL); - if (result < 0) - goto out_free; - smb_setup_bcc(req, p); - - result = smb_request_ok(req, command, 0, 0); - if (result < 0) - goto out_free; - result = 0; - -out_free: - smb_rput(req); -out: - return result; -} - -int -smb_proc_mkdir(struct dentry *dentry) -{ - return smb_proc_generic_command(dentry, SMBmkdir); -} - -int -smb_proc_rmdir(struct dentry *dentry) -{ - return smb_proc_generic_command(dentry, SMBrmdir); -} - -#if SMBFS_POSIX_UNLINK -/* - * Removes readonly attribute from a file. Used by unlink to give posix - * semantics. - */ -static int -smb_set_rw(struct dentry *dentry,struct smb_sb_info *server) -{ - int result; - struct smb_fattr fattr; - - /* FIXME: cifsUE should allow removing a readonly file. */ - - /* first get current attribute */ - smb_init_dirent(server, &fattr); - result = server->ops->getattr(server, dentry, &fattr); - smb_finish_dirent(server, &fattr); - if (result < 0) - return result; - - /* if RONLY attribute is set, remove it */ - if (fattr.attr & aRONLY) { /* read only attribute is set */ - fattr.attr &= ~aRONLY; - result = smb_proc_setattr_core(server, dentry, fattr.attr); - } - return result; -} -#endif - -int -smb_proc_unlink(struct dentry *dentry) -{ - struct smb_sb_info *server = server_from_dentry(dentry); - int flag = 0; - char *p; - int result; - struct smb_request *req; - - result = -ENOMEM; - if (! (req = smb_alloc_request(server, PAGE_SIZE))) - goto out; - - retry: - p = smb_setup_header(req, SMBunlink, 1, 0); - WSET(req->rq_header, smb_vwv0, aSYSTEM | aHIDDEN); - result = smb_simple_encode_path(req, &p, dentry, NULL); - if (result < 0) - goto out_free; - smb_setup_bcc(req, p); - - if ((result = smb_request_ok(req, SMBunlink, 0, 0)) < 0) { -#if SMBFS_POSIX_UNLINK - if (result == -EACCES && !flag) { - /* Posix semantics is for the read-only state - of a file to be ignored in unlink(). In the - SMB world a unlink() is refused on a - read-only file. To make things easier for - unix users we try to override the files - permission if the unlink fails with the - right error. 
- This introduces a race condition that could - lead to a file being written by someone who - shouldn't have access, but as far as I can - tell that is unavoidable */ - - /* remove RONLY attribute and try again */ - result = smb_set_rw(dentry,server); - if (result == 0) { - flag = 1; - req->rq_flags = 0; - goto retry; - } - } -#endif - goto out_free; - } - result = 0; - -out_free: - smb_rput(req); -out: - return result; -} - -int -smb_proc_flush(struct smb_sb_info *server, __u16 fileid) -{ - int result; - struct smb_request *req; - - result = -ENOMEM; - if (! (req = smb_alloc_request(server, 0))) - goto out; - - smb_setup_header(req, SMBflush, 1, 0); - WSET(req->rq_header, smb_vwv0, fileid); - req->rq_flags |= SMB_REQ_NORETRY; - result = smb_request_ok(req, SMBflush, 0, 0); - - smb_rput(req); -out: - return result; -} - -static int -smb_proc_trunc32(struct inode *inode, loff_t length) -{ - /* - * Writing 0bytes is old-SMB magic for truncating files. - * MAX_NON_LFS should prevent this from being called with a too - * large offset. - */ - return smb_proc_write(inode, length, 0, NULL); -} - -static int -smb_proc_trunc64(struct inode *inode, loff_t length) -{ - struct smb_sb_info *server = server_from_inode(inode); - int result; - char *param; - char *data; - struct smb_request *req; - - result = -ENOMEM; - if (! (req = smb_alloc_request(server, 14))) - goto out; - - param = req->rq_buffer; - data = req->rq_buffer + 6; - - /* FIXME: must we also set allocation size? winNT seems to do that */ - WSET(param, 0, SMB_I(inode)->fileid); - WSET(param, 2, SMB_SET_FILE_END_OF_FILE_INFO); - WSET(param, 4, 0); - LSET(data, 0, length); - - req->rq_trans2_command = TRANSACT2_SETFILEINFO; - req->rq_ldata = 8; - req->rq_data = data; - req->rq_lparm = 6; - req->rq_parm = param; - req->rq_flags |= SMB_REQ_NORETRY; - result = smb_add_request(req); - if (result < 0) - goto out_free; - - result = 0; - if (req->rq_rcls != 0) - result = smb_errno(req); - -out_free: - smb_rput(req); -out: - return result; -} - -static int -smb_proc_trunc95(struct inode *inode, loff_t length) -{ - struct smb_sb_info *server = server_from_inode(inode); - int result = smb_proc_trunc32(inode, length); - - /* - * win9x doesn't appear to update the size immediately. - * It will return the old file size after the truncate, - * confusing smbfs. So we force an update. - * - * FIXME: is this still necessary? - */ - smb_proc_flush(server, SMB_I(inode)->fileid); - return result; -} - -static void -smb_init_dirent(struct smb_sb_info *server, struct smb_fattr *fattr) -{ - memset(fattr, 0, sizeof(*fattr)); - - fattr->f_nlink = 1; - fattr->f_uid = server->mnt->uid; - fattr->f_gid = server->mnt->gid; - fattr->f_unix = 0; -} - -static void -smb_finish_dirent(struct smb_sb_info *server, struct smb_fattr *fattr) -{ - if (fattr->f_unix) - return; - - fattr->f_mode = server->mnt->file_mode; - if (fattr->attr & aDIR) { - fattr->f_mode = server->mnt->dir_mode; - fattr->f_size = SMB_ST_BLKSIZE; - } - /* Check the read-only flag */ - if (fattr->attr & aRONLY) - fattr->f_mode &= ~(S_IWUSR | S_IWGRP | S_IWOTH); - - /* How many 512 byte blocks do we need for this file? 
*/ - fattr->f_blocks = 0; - if (fattr->f_size != 0) - fattr->f_blocks = 1 + ((fattr->f_size-1) >> 9); - return; -} - -void -smb_init_root_dirent(struct smb_sb_info *server, struct smb_fattr *fattr, - struct super_block *sb) -{ - smb_init_dirent(server, fattr); - fattr->attr = aDIR; - fattr->f_ino = 2; /* traditional root inode number */ - fattr->f_mtime = current_fs_time(sb); - smb_finish_dirent(server, fattr); -} - -/* - * Decode a dirent for old protocols - * - * qname is filled with the decoded, and possibly translated, name. - * fattr receives decoded attributes - * - * Bugs Noted: - * (1) Pathworks servers may pad the name with extra spaces. - */ -static char * -smb_decode_short_dirent(struct smb_sb_info *server, char *p, - struct qstr *qname, struct smb_fattr *fattr, - unsigned char *name_buf) -{ - int len; - - /* - * SMB doesn't have a concept of inode numbers ... - */ - smb_init_dirent(server, fattr); - fattr->f_ino = 0; /* FIXME: do we need this? */ - - p += SMB_STATUS_SIZE; /* reserved (search_status) */ - fattr->attr = *p; - fattr->f_mtime.tv_sec = date_dos2unix(server, WVAL(p, 3), WVAL(p, 1)); - fattr->f_mtime.tv_nsec = 0; - fattr->f_size = DVAL(p, 5); - fattr->f_ctime = fattr->f_mtime; - fattr->f_atime = fattr->f_mtime; - qname->name = p + 9; - len = strnlen(qname->name, 12); - - /* - * Trim trailing blanks for Pathworks servers - */ - while (len > 2 && qname->name[len-1] == ' ') - len--; - - smb_finish_dirent(server, fattr); - -#if 0 - /* FIXME: These only work for ascii chars, and recent smbmount doesn't - allow the flag to be set anyway. It kills const. Remove? */ - switch (server->opt.case_handling) { - case SMB_CASE_UPPER: - str_upper(entry->name, len); - break; - case SMB_CASE_LOWER: - str_lower(entry->name, len); - break; - default: - break; - } -#endif - - qname->len = 0; - len = server->ops->convert(name_buf, SMB_MAXNAMELEN, - qname->name, len, - server->remote_nls, server->local_nls); - if (len > 0) { - qname->len = len; - qname->name = name_buf; - DEBUG1("len=%d, name=%.*s\n",qname->len,qname->len,qname->name); - } - - return p + 22; -} - -/* - * This routine is used to read in directory entries from the network. - * Note that it is for short directory name seeks, i.e.: protocol < - * SMB_PROTOCOL_LANMAN2 - */ -static int -smb_proc_readdir_short(struct file *filp, void *dirent, filldir_t filldir, - struct smb_cache_control *ctl) -{ - struct dentry *dir = filp->f_path.dentry; - struct smb_sb_info *server = server_from_dentry(dir); - struct qstr qname; - struct smb_fattr fattr; - char *p; - int result; - int i, first, entries_seen, entries; - int entries_asked = (server->opt.max_xmit - 100) / SMB_DIRINFO_SIZE; - __u16 bcc; - __u16 count; - char status[SMB_STATUS_SIZE]; - static struct qstr mask = { - .name = "*.*", - .len = 3, - }; - unsigned char *last_status; - struct smb_request *req; - unsigned char *name_buf; - - VERBOSE("%s/%s\n", DENTRY_PATH(dir)); - - lock_kernel(); - - result = -ENOMEM; - if (! (name_buf = kmalloc(SMB_MAXNAMELEN, GFP_KERNEL))) - goto out; - - first = 1; - entries = 0; - entries_seen = 2; /* implicit . and .. */ - - result = -ENOMEM; - if (! 
(req = smb_alloc_request(server, server->opt.max_xmit))) - goto out_name; - - while (1) { - p = smb_setup_header(req, SMBsearch, 2, 0); - WSET(req->rq_header, smb_vwv0, entries_asked); - WSET(req->rq_header, smb_vwv1, aDIR); - if (first == 1) { - result = smb_simple_encode_path(req, &p, dir, &mask); - if (result < 0) - goto out_free; - if (p + 3 > (char *)req->rq_buffer + req->rq_bufsize) { - result = -ENAMETOOLONG; - goto out_free; - } - *p++ = 5; - WSET(p, 0, 0); - p += 2; - first = 0; - } else { - if (p + 5 + SMB_STATUS_SIZE > - (char *)req->rq_buffer + req->rq_bufsize) { - result = -ENAMETOOLONG; - goto out_free; - } - - *p++ = 4; - *p++ = 0; - *p++ = 5; - WSET(p, 0, SMB_STATUS_SIZE); - p += 2; - memcpy(p, status, SMB_STATUS_SIZE); - p += SMB_STATUS_SIZE; - } - - smb_setup_bcc(req, p); - - result = smb_request_ok(req, SMBsearch, 1, -1); - if (result < 0) { - if ((req->rq_rcls == ERRDOS) && - (req->rq_err == ERRnofiles)) - break; - goto out_free; - } - count = WVAL(req->rq_header, smb_vwv0); - if (count <= 0) - break; - - result = -EIO; - bcc = smb_bcc(req->rq_header); - if (bcc != count * SMB_DIRINFO_SIZE + 3) - goto out_free; - p = req->rq_buffer + 3; - - - /* Make sure the response fits in the buffer. Fixed sized - entries means we don't have to check in the decode loop. */ - - last_status = req->rq_buffer + 3 + (count-1) * SMB_DIRINFO_SIZE; - - if (last_status + SMB_DIRINFO_SIZE >= - req->rq_buffer + req->rq_bufsize) { - printk(KERN_ERR "smb_proc_readdir_short: " - "last dir entry outside buffer! " - "%d@%p %d@%p\n", SMB_DIRINFO_SIZE, last_status, - req->rq_bufsize, req->rq_buffer); - goto out_free; - } - - /* Read the last entry into the status field. */ - memcpy(status, last_status, SMB_STATUS_SIZE); - - - /* Now we are ready to parse smb directory entries. */ - - for (i = 0; i < count; i++) { - p = smb_decode_short_dirent(server, p, - &qname, &fattr, name_buf); - if (qname.len == 0) - continue; - - if (entries_seen == 2 && qname.name[0] == '.') { - if (qname.len == 1) - continue; - if (qname.name[1] == '.' && qname.len == 2) - continue; - } - if (!smb_fill_cache(filp, dirent, filldir, ctl, - &qname, &fattr)) - ; /* stop reading? */ - entries_seen++; - } - } - result = entries; - -out_free: - smb_rput(req); -out_name: - kfree(name_buf); -out: - unlock_kernel(); - return result; -} - -static void smb_decode_unix_basic(struct smb_fattr *fattr, struct smb_sb_info *server, char *p) -{ - u64 size, disk_bytes; - - /* FIXME: verify nls support. all is sent as utf8? */ - - fattr->f_unix = 1; - fattr->f_mode = 0; - - /* FIXME: use the uniqueID from the remote instead? */ - /* 0 L file size in bytes */ - /* 8 L file size on disk in bytes (block count) */ - /* 40 L uid */ - /* 48 L gid */ - /* 56 W file type */ - /* 60 L devmajor */ - /* 68 L devminor */ - /* 76 L unique ID (inode) */ - /* 84 L permissions */ - /* 92 L link count */ - - size = LVAL(p, 0); - disk_bytes = LVAL(p, 8); - - /* - * Some samba versions round up on-disk byte usage - * to 1MB boundaries, making it useless. When seeing - * that, use the size instead. 
- */ - if (!(disk_bytes & 0xfffff)) - disk_bytes = size+511; - - fattr->f_size = size; - fattr->f_blocks = disk_bytes >> 9; - fattr->f_ctime = smb_ntutc2unixutc(LVAL(p, 16)); - fattr->f_atime = smb_ntutc2unixutc(LVAL(p, 24)); - fattr->f_mtime = smb_ntutc2unixutc(LVAL(p, 32)); - - if (server->mnt->flags & SMB_MOUNT_UID) - fattr->f_uid = server->mnt->uid; - else - fattr->f_uid = LVAL(p, 40); - - if (server->mnt->flags & SMB_MOUNT_GID) - fattr->f_gid = server->mnt->gid; - else - fattr->f_gid = LVAL(p, 48); - - fattr->f_mode |= smb_filetype_to_mode(WVAL(p, 56)); - - if (S_ISBLK(fattr->f_mode) || S_ISCHR(fattr->f_mode)) { - __u64 major = LVAL(p, 60); - __u64 minor = LVAL(p, 68); - - fattr->f_rdev = MKDEV(major & 0xffffffff, minor & 0xffffffff); - if (MAJOR(fattr->f_rdev) != (major & 0xffffffff) || - MINOR(fattr->f_rdev) != (minor & 0xffffffff)) - fattr->f_rdev = 0; - } - - fattr->f_mode |= LVAL(p, 84); - - if ( (server->mnt->flags & SMB_MOUNT_DMODE) && - (S_ISDIR(fattr->f_mode)) ) - fattr->f_mode = (server->mnt->dir_mode & S_IRWXUGO) | S_IFDIR; - else if ( (server->mnt->flags & SMB_MOUNT_FMODE) && - !(S_ISDIR(fattr->f_mode)) ) - fattr->f_mode = (server->mnt->file_mode & S_IRWXUGO) | - (fattr->f_mode & S_IFMT); - -} - -/* - * Interpret a long filename structure using the specified info level: - * level 1 for anything below NT1 protocol - * level 260 for NT1 protocol - * - * qname is filled with the decoded, and possibly translated, name - * fattr receives decoded attributes. - * - * Bugs Noted: - * (1) Win NT 4.0 appends a null byte to names and counts it in the length! - */ -static char * -smb_decode_long_dirent(struct smb_sb_info *server, char *p, int level, - struct qstr *qname, struct smb_fattr *fattr, - unsigned char *name_buf) -{ - char *result; - unsigned int len = 0; - int n; - __u16 date, time; - int unicode = (server->mnt->flags & SMB_MOUNT_UNICODE); - - /* - * SMB doesn't have a concept of inode numbers ... - */ - smb_init_dirent(server, fattr); - fattr->f_ino = 0; /* FIXME: do we need this? */ - - switch (level) { - case 1: - len = *((unsigned char *) p + 22); - qname->name = p + 23; - result = p + 24 + len; - - date = WVAL(p, 0); - time = WVAL(p, 2); - fattr->f_ctime.tv_sec = date_dos2unix(server, date, time); - fattr->f_ctime.tv_nsec = 0; - - date = WVAL(p, 4); - time = WVAL(p, 6); - fattr->f_atime.tv_sec = date_dos2unix(server, date, time); - fattr->f_atime.tv_nsec = 0; - - date = WVAL(p, 8); - time = WVAL(p, 10); - fattr->f_mtime.tv_sec = date_dos2unix(server, date, time); - fattr->f_mtime.tv_nsec = 0; - fattr->f_size = DVAL(p, 12); - /* ULONG allocation size */ - fattr->attr = WVAL(p, 20); - - VERBOSE("info 1 at %p, len=%d, name=%.*s\n", - p, len, len, qname->name); - break; - case 260: - result = p + WVAL(p, 0); - len = DVAL(p, 60); - if (len > 255) len = 255; - /* NT4 null terminates, unless we are using unicode ... */ - qname->name = p + 94; - if (!unicode && len && qname->name[len-1] == '\0') - len--; - - fattr->f_ctime = smb_ntutc2unixutc(LVAL(p, 8)); - fattr->f_atime = smb_ntutc2unixutc(LVAL(p, 16)); - fattr->f_mtime = smb_ntutc2unixutc(LVAL(p, 24)); - /* change time (32) */ - fattr->f_size = LVAL(p, 40); - /* alloc size (48) */ - fattr->attr = DVAL(p, 56); - - VERBOSE("info 260 at %p, len=%d, name=%.*s\n", - p, len, len, qname->name); - break; - case SMB_FIND_FILE_UNIX: - result = p + WVAL(p, 0); - qname->name = p + 108; - - len = strlen(qname->name); - /* FIXME: should we check the length?? 
*/ - - p += 8; - smb_decode_unix_basic(fattr, server, p); - VERBOSE("info SMB_FIND_FILE_UNIX at %p, len=%d, name=%.*s\n", - p, len, len, qname->name); - break; - default: - PARANOIA("Unknown info level %d\n", level); - result = p + WVAL(p, 0); - goto out; - } - - smb_finish_dirent(server, fattr); - -#if 0 - /* FIXME: These only work for ascii chars, and recent smbmount doesn't - allow the flag to be set anyway. Remove? */ - switch (server->opt.case_handling) { - case SMB_CASE_UPPER: - str_upper(qname->name, len); - break; - case SMB_CASE_LOWER: - str_lower(qname->name, len); - break; - default: - break; - } -#endif - - qname->len = 0; - n = server->ops->convert(name_buf, SMB_MAXNAMELEN, - qname->name, len, - server->remote_nls, server->local_nls); - if (n > 0) { - qname->len = n; - qname->name = name_buf; - } - -out: - return result; -} - -/* findfirst/findnext flags */ -#define SMB_CLOSE_AFTER_FIRST (1<<0) -#define SMB_CLOSE_IF_END (1<<1) -#define SMB_REQUIRE_RESUME_KEY (1<<2) -#define SMB_CONTINUE_BIT (1<<3) - -/* - * Note: samba-2.0.7 (at least) has a very similar routine, cli_list, in - * source/libsmb/clilist.c. When looking for smb bugs in the readdir code, - * go there for advise. - * - * Bugs Noted: - * (1) When using Info Level 1 Win NT 4.0 truncates directory listings - * for certain patterns of names and/or lengths. The breakage pattern - * is completely reproducible and can be toggled by the creation of a - * single file. (E.g. echo hi >foo breaks, rm -f foo works.) - */ -static int -smb_proc_readdir_long(struct file *filp, void *dirent, filldir_t filldir, - struct smb_cache_control *ctl) -{ - struct dentry *dir = filp->f_path.dentry; - struct smb_sb_info *server = server_from_dentry(dir); - struct qstr qname; - struct smb_fattr fattr; - - unsigned char *p, *lastname; - char *mask, *param; - __u16 command; - int first, entries_seen; - - /* Both NT and OS/2 accept info level 1 (but see note below). */ - int info_level = 260; - const int max_matches = 512; - - unsigned int ff_searchcount = 0; - unsigned int ff_eos = 0; - unsigned int ff_lastname = 0; - unsigned int ff_dir_handle = 0; - unsigned int loop_count = 0; - unsigned int mask_len, i; - int result; - struct smb_request *req; - unsigned char *name_buf; - static struct qstr star = { - .name = "*", - .len = 1, - }; - - lock_kernel(); - - /* - * We always prefer unix style. Use info level 1 for older - * servers that don't do 260. - */ - if (server->opt.capabilities & SMB_CAP_UNIX) - info_level = SMB_FIND_FILE_UNIX; - else if (server->opt.protocol < SMB_PROTOCOL_NT1) - info_level = 1; - - result = -ENOMEM; - if (! (name_buf = kmalloc(SMB_MAXNAMELEN+2, GFP_KERNEL))) - goto out; - if (! 
(req = smb_alloc_request(server, server->opt.max_xmit))) - goto out_name; - param = req->rq_buffer; - - /* - * Encode the initial path - */ - mask = param + 12; - - result = smb_encode_path(server, mask, SMB_MAXPATHLEN+1, dir, &star); - if (result <= 0) - goto out_free; - mask_len = result - 1; /* mask_len is strlen, not #bytes */ - result = 0; - first = 1; - VERBOSE("starting mask_len=%d, mask=%s\n", mask_len, mask); - - entries_seen = 2; - ff_eos = 0; - - while (ff_eos == 0) { - loop_count += 1; - if (loop_count > 10) { - printk(KERN_WARNING "smb_proc_readdir_long: " - "Looping in FIND_NEXT??\n"); - result = -EIO; - break; - } - - if (first != 0) { - command = TRANSACT2_FINDFIRST; - WSET(param, 0, aSYSTEM | aHIDDEN | aDIR); - WSET(param, 2, max_matches); /* max count */ - WSET(param, 4, SMB_CLOSE_IF_END); - WSET(param, 6, info_level); - DSET(param, 8, 0); - } else { - command = TRANSACT2_FINDNEXT; - - VERBOSE("handle=0x%X, lastname=%d, mask=%.*s\n", - ff_dir_handle, ff_lastname, mask_len, mask); - - WSET(param, 0, ff_dir_handle); /* search handle */ - WSET(param, 2, max_matches); /* max count */ - WSET(param, 4, info_level); - DSET(param, 6, 0); - WSET(param, 10, SMB_CONTINUE_BIT|SMB_CLOSE_IF_END); - } - - req->rq_trans2_command = command; - req->rq_ldata = 0; - req->rq_data = NULL; - req->rq_lparm = 12 + mask_len + 1; - req->rq_parm = param; - req->rq_flags = 0; - result = smb_add_request(req); - if (result < 0) { - PARANOIA("error=%d, breaking\n", result); - break; - } - - if (req->rq_rcls == ERRSRV && req->rq_err == ERRerror) { - /* a damn Win95 bug - sometimes it clags if you - ask it too fast */ - schedule_timeout_interruptible(msecs_to_jiffies(200)); - continue; - } - - if (req->rq_rcls != 0) { - result = smb_errno(req); - PARANOIA("name=%s, result=%d, rcls=%d, err=%d\n", - mask, result, req->rq_rcls, req->rq_err); - break; - } - - /* parse out some important return info */ - if (first != 0) { - ff_dir_handle = WVAL(req->rq_parm, 0); - ff_searchcount = WVAL(req->rq_parm, 2); - ff_eos = WVAL(req->rq_parm, 4); - ff_lastname = WVAL(req->rq_parm, 8); - } else { - ff_searchcount = WVAL(req->rq_parm, 0); - ff_eos = WVAL(req->rq_parm, 2); - ff_lastname = WVAL(req->rq_parm, 6); - } - - if (ff_searchcount == 0) - break; - - /* Now we are ready to parse smb directory entries. */ - - /* point to the data bytes */ - p = req->rq_data; - for (i = 0; i < ff_searchcount; i++) { - /* make sure we stay within the buffer */ - if (p >= req->rq_data + req->rq_ldata) { - printk(KERN_ERR "smb_proc_readdir_long: " - "dirent pointer outside buffer! " - "%p %d@%p\n", - p, req->rq_ldata, req->rq_data); - result = -EIO; /* always a comm. error? */ - goto out_free; - } - - p = smb_decode_long_dirent(server, p, info_level, - &qname, &fattr, name_buf); - - /* ignore . and .. from the server */ - if (entries_seen == 2 && qname.name[0] == '.') { - if (qname.len == 1) - continue; - if (qname.name[1] == '.' && qname.len == 2) - continue; - } - - if (!smb_fill_cache(filp, dirent, filldir, ctl, - &qname, &fattr)) - ; /* stop reading? */ - entries_seen++; - } - - VERBOSE("received %d entries, eos=%d\n", ff_searchcount,ff_eos); - - /* - * We might need the lastname for continuations. - * - * Note that some servers (win95?) point to the filename and - * others (NT4, Samba using NT1) to the dir entry. We assume - * here that those who do not point to a filename do not need - * this info to continue the listing. - * - * OS/2 needs this and talks infolevel 1. - * NetApps want lastname with infolevel 260. 
- * win2k want lastname with infolevel 260, and points to - * the record not to the name. - * Samba+CifsUnixExt doesn't need lastname. - * - * Both are happy if we return the data they point to. So we do. - * (FIXME: above is not true with win2k) - */ - mask_len = 0; - if (info_level != SMB_FIND_FILE_UNIX && - ff_lastname > 0 && ff_lastname < req->rq_ldata) { - lastname = req->rq_data + ff_lastname; - - switch (info_level) { - case 260: - mask_len = req->rq_ldata - ff_lastname; - break; - case 1: - /* lastname points to a length byte */ - mask_len = *lastname++; - if (ff_lastname + 1 + mask_len > req->rq_ldata) - mask_len = req->rq_ldata - ff_lastname - 1; - break; - } - - /* - * Update the mask string for the next message. - */ - if (mask_len > 255) - mask_len = 255; - if (mask_len) - strncpy(mask, lastname, mask_len); - } - mask_len = strnlen(mask, mask_len); - VERBOSE("new mask, len=%d@%d of %d, mask=%.*s\n", - mask_len, ff_lastname, req->rq_ldata, mask_len, mask); - - first = 0; - loop_count = 0; - } - -out_free: - smb_rput(req); -out_name: - kfree(name_buf); -out: - unlock_kernel(); - return result; -} - -/* - * This version uses the trans2 TRANSACT2_FINDFIRST message - * to get the attribute data. - * - * Bugs Noted: - */ -static int -smb_proc_getattr_ff(struct smb_sb_info *server, struct dentry *dentry, - struct smb_fattr *fattr) -{ - char *param, *mask; - __u16 date, time; - int mask_len, result; - struct smb_request *req; - - result = -ENOMEM; - if (! (req = smb_alloc_request(server, PAGE_SIZE))) - goto out; - param = req->rq_buffer; - mask = param + 12; - - mask_len = smb_encode_path(server, mask, SMB_MAXPATHLEN+1, dentry,NULL); - if (mask_len < 0) { - result = mask_len; - goto out_free; - } - VERBOSE("name=%s, len=%d\n", mask, mask_len); - WSET(param, 0, aSYSTEM | aHIDDEN | aDIR); - WSET(param, 2, 1); /* max count */ - WSET(param, 4, 1); /* close after this call */ - WSET(param, 6, 1); /* info_level */ - DSET(param, 8, 0); - - req->rq_trans2_command = TRANSACT2_FINDFIRST; - req->rq_ldata = 0; - req->rq_data = NULL; - req->rq_lparm = 12 + mask_len; - req->rq_parm = param; - req->rq_flags = 0; - result = smb_add_request(req); - if (result < 0) - goto out_free; - if (req->rq_rcls != 0) { - result = smb_errno(req); -#ifdef SMBFS_PARANOIA - if (result != -ENOENT) - PARANOIA("error for %s, rcls=%d, err=%d\n", - mask, req->rq_rcls, req->rq_err); -#endif - goto out_free; - } - /* Make sure we got enough data ... */ - result = -EINVAL; - if (req->rq_ldata < 22 || WVAL(req->rq_parm, 2) != 1) { - PARANOIA("bad result for %s, len=%d, count=%d\n", - mask, req->rq_ldata, WVAL(req->rq_parm, 2)); - goto out_free; - } - - /* - * Decode the response into the fattr ... 
- */ - date = WVAL(req->rq_data, 0); - time = WVAL(req->rq_data, 2); - fattr->f_ctime.tv_sec = date_dos2unix(server, date, time); - fattr->f_ctime.tv_nsec = 0; - - date = WVAL(req->rq_data, 4); - time = WVAL(req->rq_data, 6); - fattr->f_atime.tv_sec = date_dos2unix(server, date, time); - fattr->f_atime.tv_nsec = 0; - - date = WVAL(req->rq_data, 8); - time = WVAL(req->rq_data, 10); - fattr->f_mtime.tv_sec = date_dos2unix(server, date, time); - fattr->f_mtime.tv_nsec = 0; - VERBOSE("name=%s, date=%x, time=%x, mtime=%ld\n", - mask, date, time, fattr->f_mtime.tv_sec); - fattr->f_size = DVAL(req->rq_data, 12); - /* ULONG allocation size */ - fattr->attr = WVAL(req->rq_data, 20); - result = 0; - -out_free: - smb_rput(req); -out: - return result; -} - -static int -smb_proc_getattr_core(struct smb_sb_info *server, struct dentry *dir, - struct smb_fattr *fattr) -{ - int result; - char *p; - struct smb_request *req; - - result = -ENOMEM; - if (! (req = smb_alloc_request(server, PAGE_SIZE))) - goto out; - - p = smb_setup_header(req, SMBgetatr, 0, 0); - result = smb_simple_encode_path(req, &p, dir, NULL); - if (result < 0) - goto out_free; - smb_setup_bcc(req, p); - - if ((result = smb_request_ok(req, SMBgetatr, 10, 0)) < 0) - goto out_free; - fattr->attr = WVAL(req->rq_header, smb_vwv0); - fattr->f_mtime.tv_sec = local2utc(server, DVAL(req->rq_header, smb_vwv1)); - fattr->f_mtime.tv_nsec = 0; - fattr->f_size = DVAL(req->rq_header, smb_vwv3); - fattr->f_ctime = fattr->f_mtime; - fattr->f_atime = fattr->f_mtime; -#ifdef SMBFS_DEBUG_TIMESTAMP - printk("getattr_core: %s/%s, mtime=%ld\n", - DENTRY_PATH(dir), fattr->f_mtime); -#endif - result = 0; - -out_free: - smb_rput(req); -out: - return result; -} - -/* - * Bugs Noted: - * (1) Win 95 swaps the date and time fields in the standard info level. - */ -static int -smb_proc_getattr_trans2(struct smb_sb_info *server, struct dentry *dir, - struct smb_request *req, int infolevel) -{ - char *p, *param; - int result; - - param = req->rq_buffer; - WSET(param, 0, infolevel); - DSET(param, 2, 0); - result = smb_encode_path(server, param+6, SMB_MAXPATHLEN+1, dir, NULL); - if (result < 0) - goto out; - p = param + 6 + result; - - req->rq_trans2_command = TRANSACT2_QPATHINFO; - req->rq_ldata = 0; - req->rq_data = NULL; - req->rq_lparm = p - param; - req->rq_parm = param; - req->rq_flags = 0; - result = smb_add_request(req); - if (result < 0) - goto out; - if (req->rq_rcls != 0) { - VERBOSE("for %s: result=%d, rcls=%d, err=%d\n", - ¶m[6], result, req->rq_rcls, req->rq_err); - result = smb_errno(req); - goto out; - } - result = -ENOENT; - if (req->rq_ldata < 22) { - PARANOIA("not enough data for %s, len=%d\n", - ¶m[6], req->rq_ldata); - goto out; - } - - result = 0; -out: - return result; -} - -static int -smb_proc_getattr_trans2_std(struct smb_sb_info *server, struct dentry *dir, - struct smb_fattr *attr) -{ - u16 date, time; - int off_date = 0, off_time = 2; - int result; - struct smb_request *req; - - result = -ENOMEM; - if (! (req = smb_alloc_request(server, PAGE_SIZE))) - goto out; - - result = smb_proc_getattr_trans2(server, dir, req, SMB_INFO_STANDARD); - if (result < 0) - goto out_free; - - /* - * Kludge alert: Win 95 swaps the date and time field, - * contrary to the CIFS docs and Win NT practice. 
- */ - if (server->mnt->flags & SMB_MOUNT_WIN95) { - off_date = 2; - off_time = 0; - } - date = WVAL(req->rq_data, off_date); - time = WVAL(req->rq_data, off_time); - attr->f_ctime.tv_sec = date_dos2unix(server, date, time); - attr->f_ctime.tv_nsec = 0; - - date = WVAL(req->rq_data, 4 + off_date); - time = WVAL(req->rq_data, 4 + off_time); - attr->f_atime.tv_sec = date_dos2unix(server, date, time); - attr->f_atime.tv_nsec = 0; - - date = WVAL(req->rq_data, 8 + off_date); - time = WVAL(req->rq_data, 8 + off_time); - attr->f_mtime.tv_sec = date_dos2unix(server, date, time); - attr->f_mtime.tv_nsec = 0; -#ifdef SMBFS_DEBUG_TIMESTAMP - printk(KERN_DEBUG "getattr_trans2: %s/%s, date=%x, time=%x, mtime=%ld\n", - DENTRY_PATH(dir), date, time, attr->f_mtime); -#endif - attr->f_size = DVAL(req->rq_data, 12); - attr->attr = WVAL(req->rq_data, 20); - -out_free: - smb_rput(req); -out: - return result; -} - -static int -smb_proc_getattr_trans2_all(struct smb_sb_info *server, struct dentry *dir, - struct smb_fattr *attr) -{ - struct smb_request *req; - int result; - - result = -ENOMEM; - if (! (req = smb_alloc_request(server, PAGE_SIZE))) - goto out; - - result = smb_proc_getattr_trans2(server, dir, req, - SMB_QUERY_FILE_ALL_INFO); - if (result < 0) - goto out_free; - - attr->f_ctime = smb_ntutc2unixutc(LVAL(req->rq_data, 0)); - attr->f_atime = smb_ntutc2unixutc(LVAL(req->rq_data, 8)); - attr->f_mtime = smb_ntutc2unixutc(LVAL(req->rq_data, 16)); - /* change (24) */ - attr->attr = WVAL(req->rq_data, 32); - /* pad? (34) */ - /* allocated size (40) */ - attr->f_size = LVAL(req->rq_data, 48); - -out_free: - smb_rput(req); -out: - return result; -} - -static int -smb_proc_getattr_unix(struct smb_sb_info *server, struct dentry *dir, - struct smb_fattr *attr) -{ - struct smb_request *req; - int result; - - result = -ENOMEM; - if (! (req = smb_alloc_request(server, PAGE_SIZE))) - goto out; - - result = smb_proc_getattr_trans2(server, dir, req, - SMB_QUERY_FILE_UNIX_BASIC); - if (result < 0) - goto out_free; - - smb_decode_unix_basic(attr, server, req->rq_data); - -out_free: - smb_rput(req); -out: - return result; -} - -static int -smb_proc_getattr_95(struct smb_sb_info *server, struct dentry *dir, - struct smb_fattr *attr) -{ - struct inode *inode = dir->d_inode; - int result; - - /* FIXME: why not use the "all" version? */ - result = smb_proc_getattr_trans2_std(server, dir, attr); - if (result < 0) - goto out; - - /* - * None of the getattr versions here can make win9x return the right - * filesize if there are changes made to an open file. - * A seek-to-end does return the right size, but we only need to do - * that on files we have written. 
- */ - if (inode && SMB_I(inode)->flags & SMB_F_LOCALWRITE && - smb_is_open(inode)) - { - __u16 fileid = SMB_I(inode)->fileid; - attr->f_size = smb_proc_seek(server, fileid, 2, 0); - } - -out: - return result; -} - -static int -smb_proc_ops_wait(struct smb_sb_info *server) -{ - int result; - - result = wait_event_interruptible_timeout(server->conn_wq, - server->conn_complete, 30*HZ); - - if (!result || signal_pending(current)) - return -EIO; - - return 0; -} - -static int -smb_proc_getattr_null(struct smb_sb_info *server, struct dentry *dir, - struct smb_fattr *fattr) -{ - int result; - - if (smb_proc_ops_wait(server) < 0) - return -EIO; - - smb_init_dirent(server, fattr); - result = server->ops->getattr(server, dir, fattr); - smb_finish_dirent(server, fattr); - - return result; -} - -static int -smb_proc_readdir_null(struct file *filp, void *dirent, filldir_t filldir, - struct smb_cache_control *ctl) -{ - struct smb_sb_info *server = server_from_dentry(filp->f_path.dentry); - - if (smb_proc_ops_wait(server) < 0) - return -EIO; - - return server->ops->readdir(filp, dirent, filldir, ctl); -} - -int -smb_proc_getattr(struct dentry *dir, struct smb_fattr *fattr) -{ - struct smb_sb_info *server = server_from_dentry(dir); - int result; - - smb_init_dirent(server, fattr); - result = server->ops->getattr(server, dir, fattr); - smb_finish_dirent(server, fattr); - - return result; -} - - -/* - * Because of bugs in the core protocol, we use this only to set - * attributes. See smb_proc_settime() below for timestamp handling. - * - * Bugs Noted: - * (1) If mtime is non-zero, both Win 3.1 and Win 95 fail - * with an undocumented error (ERRDOS code 50). Setting - * mtime to 0 allows the attributes to be set. - * (2) The extra parameters following the name string aren't - * in the CIFS docs, but seem to be necessary for operation. - */ -static int -smb_proc_setattr_core(struct smb_sb_info *server, struct dentry *dentry, - __u16 attr) -{ - char *p; - int result; - struct smb_request *req; - - result = -ENOMEM; - if (! (req = smb_alloc_request(server, PAGE_SIZE))) - goto out; - - p = smb_setup_header(req, SMBsetatr, 8, 0); - WSET(req->rq_header, smb_vwv0, attr); - DSET(req->rq_header, smb_vwv1, 0); /* mtime */ - WSET(req->rq_header, smb_vwv3, 0); /* reserved values */ - WSET(req->rq_header, smb_vwv4, 0); - WSET(req->rq_header, smb_vwv5, 0); - WSET(req->rq_header, smb_vwv6, 0); - WSET(req->rq_header, smb_vwv7, 0); - result = smb_simple_encode_path(req, &p, dentry, NULL); - if (result < 0) - goto out_free; - if (p + 2 > (char *)req->rq_buffer + req->rq_bufsize) { - result = -ENAMETOOLONG; - goto out_free; - } - *p++ = 4; - *p++ = 0; - smb_setup_bcc(req, p); - - result = smb_request_ok(req, SMBsetatr, 0, 0); - if (result < 0) - goto out_free; - result = 0; - -out_free: - smb_rput(req); -out: - return result; -} - -/* - * Because of bugs in the trans2 setattr messages, we must set - * attributes and timestamps separately. The core SMBsetatr - * message seems to be the only reliable way to set attributes. - */ -int -smb_proc_setattr(struct dentry *dir, struct smb_fattr *fattr) -{ - struct smb_sb_info *server = server_from_dentry(dir); - int result; - - VERBOSE("setting %s/%s, open=%d\n", - DENTRY_PATH(dir), smb_is_open(dir->d_inode)); - result = smb_proc_setattr_core(server, dir, fattr->attr); - return result; -} - -/* - * Sets the timestamps for an file open with write permissions. 
- */ -static int -smb_proc_setattr_ext(struct smb_sb_info *server, - struct inode *inode, struct smb_fattr *fattr) -{ - __u16 date, time; - int result; - struct smb_request *req; - - result = -ENOMEM; - if (! (req = smb_alloc_request(server, 0))) - goto out; - - smb_setup_header(req, SMBsetattrE, 7, 0); - WSET(req->rq_header, smb_vwv0, SMB_I(inode)->fileid); - /* We don't change the creation time */ - WSET(req->rq_header, smb_vwv1, 0); - WSET(req->rq_header, smb_vwv2, 0); - date_unix2dos(server, fattr->f_atime.tv_sec, &date, &time); - WSET(req->rq_header, smb_vwv3, date); - WSET(req->rq_header, smb_vwv4, time); - date_unix2dos(server, fattr->f_mtime.tv_sec, &date, &time); - WSET(req->rq_header, smb_vwv5, date); - WSET(req->rq_header, smb_vwv6, time); -#ifdef SMBFS_DEBUG_TIMESTAMP - printk(KERN_DEBUG "smb_proc_setattr_ext: date=%d, time=%d, mtime=%ld\n", - date, time, fattr->f_mtime); -#endif - - req->rq_flags |= SMB_REQ_NORETRY; - result = smb_request_ok(req, SMBsetattrE, 0, 0); - if (result < 0) - goto out_free; - result = 0; -out_free: - smb_rput(req); -out: - return result; -} - -/* - * Bugs Noted: - * (1) The TRANSACT2_SETPATHINFO message under Win NT 4.0 doesn't - * set the file's attribute flags. - */ -static int -smb_proc_setattr_trans2(struct smb_sb_info *server, - struct dentry *dir, struct smb_fattr *fattr) -{ - __u16 date, time; - char *p, *param; - int result; - char data[26]; - struct smb_request *req; - - result = -ENOMEM; - if (! (req = smb_alloc_request(server, PAGE_SIZE))) - goto out; - param = req->rq_buffer; - - WSET(param, 0, 1); /* Info level SMB_INFO_STANDARD */ - DSET(param, 2, 0); - result = smb_encode_path(server, param+6, SMB_MAXPATHLEN+1, dir, NULL); - if (result < 0) - goto out_free; - p = param + 6 + result; - - WSET(data, 0, 0); /* creation time */ - WSET(data, 2, 0); - date_unix2dos(server, fattr->f_atime.tv_sec, &date, &time); - WSET(data, 4, date); - WSET(data, 6, time); - date_unix2dos(server, fattr->f_mtime.tv_sec, &date, &time); - WSET(data, 8, date); - WSET(data, 10, time); -#ifdef SMBFS_DEBUG_TIMESTAMP - printk(KERN_DEBUG "setattr_trans2: %s/%s, date=%x, time=%x, mtime=%ld\n", - DENTRY_PATH(dir), date, time, fattr->f_mtime); -#endif - DSET(data, 12, 0); /* size */ - DSET(data, 16, 0); /* blksize */ - WSET(data, 20, 0); /* attr */ - DSET(data, 22, 0); /* ULONG EA size */ - - req->rq_trans2_command = TRANSACT2_SETPATHINFO; - req->rq_ldata = 26; - req->rq_data = data; - req->rq_lparm = p - param; - req->rq_parm = param; - req->rq_flags = 0; - result = smb_add_request(req); - if (result < 0) - goto out_free; - result = 0; - if (req->rq_rcls != 0) - result = smb_errno(req); - -out_free: - smb_rput(req); -out: - return result; -} - -/* - * ATTR_MODE 0x001 - * ATTR_UID 0x002 - * ATTR_GID 0x004 - * ATTR_SIZE 0x008 - * ATTR_ATIME 0x010 - * ATTR_MTIME 0x020 - * ATTR_CTIME 0x040 - * ATTR_ATIME_SET 0x080 - * ATTR_MTIME_SET 0x100 - * ATTR_FORCE 0x200 - * ATTR_ATTR_FLAG 0x400 - * - * major/minor should only be set by mknod. - */ -int -smb_proc_setattr_unix(struct dentry *d, struct iattr *attr, - unsigned int major, unsigned int minor) -{ - struct smb_sb_info *server = server_from_dentry(d); - u64 nttime; - char *p, *param; - int result; - char data[100]; - struct smb_request *req; - - result = -ENOMEM; - if (! 
(req = smb_alloc_request(server, PAGE_SIZE))) - goto out; - param = req->rq_buffer; - - DEBUG1("valid flags = 0x%04x\n", attr->ia_valid); - - WSET(param, 0, SMB_SET_FILE_UNIX_BASIC); - DSET(param, 2, 0); - result = smb_encode_path(server, param+6, SMB_MAXPATHLEN+1, d, NULL); - if (result < 0) - goto out_free; - p = param + 6 + result; - - /* 0 L file size in bytes */ - /* 8 L file size on disk in bytes (block count) */ - /* 40 L uid */ - /* 48 L gid */ - /* 56 W file type enum */ - /* 60 L devmajor */ - /* 68 L devminor */ - /* 76 L unique ID (inode) */ - /* 84 L permissions */ - /* 92 L link count */ - LSET(data, 0, SMB_SIZE_NO_CHANGE); - LSET(data, 8, SMB_SIZE_NO_CHANGE); - LSET(data, 16, SMB_TIME_NO_CHANGE); - LSET(data, 24, SMB_TIME_NO_CHANGE); - LSET(data, 32, SMB_TIME_NO_CHANGE); - LSET(data, 40, SMB_UID_NO_CHANGE); - LSET(data, 48, SMB_GID_NO_CHANGE); - DSET(data, 56, smb_filetype_from_mode(attr->ia_mode)); - LSET(data, 60, major); - LSET(data, 68, minor); - LSET(data, 76, 0); - LSET(data, 84, SMB_MODE_NO_CHANGE); - LSET(data, 92, 0); - - if (attr->ia_valid & ATTR_SIZE) { - LSET(data, 0, attr->ia_size); - LSET(data, 8, 0); /* can't set anyway */ - } - - /* - * FIXME: check the conversion function it the correct one - * - * we can't set ctime but we might as well pass this to the server - * and let it ignore it. - */ - if (attr->ia_valid & ATTR_CTIME) { - nttime = smb_unixutc2ntutc(attr->ia_ctime); - LSET(data, 16, nttime); - } - if (attr->ia_valid & ATTR_ATIME) { - nttime = smb_unixutc2ntutc(attr->ia_atime); - LSET(data, 24, nttime); - } - if (attr->ia_valid & ATTR_MTIME) { - nttime = smb_unixutc2ntutc(attr->ia_mtime); - LSET(data, 32, nttime); - } - - if (attr->ia_valid & ATTR_UID) { - LSET(data, 40, attr->ia_uid); - } - if (attr->ia_valid & ATTR_GID) { - LSET(data, 48, attr->ia_gid); - } - - if (attr->ia_valid & ATTR_MODE) { - LSET(data, 84, attr->ia_mode); - } - - req->rq_trans2_command = TRANSACT2_SETPATHINFO; - req->rq_ldata = 100; - req->rq_data = data; - req->rq_lparm = p - param; - req->rq_parm = param; - req->rq_flags = 0; - result = smb_add_request(req); - -out_free: - smb_rput(req); -out: - return result; -} - - -/* - * Set the modify and access timestamps for a file. - * - * Incredibly enough, in all of SMB there is no message to allow - * setting both attributes and timestamps at once. - * - * Bugs Noted: - * (1) Win 95 doesn't support the TRANSACT2_SETFILEINFO message - * with info level 1 (INFO_STANDARD). - * (2) Win 95 seems not to support setting directory timestamps. - * (3) Under the core protocol apparently the only way to set the - * timestamp is to open and close the file. - */ -int -smb_proc_settime(struct dentry *dentry, struct smb_fattr *fattr) -{ - struct smb_sb_info *server = server_from_dentry(dentry); - struct inode *inode = dentry->d_inode; - int result; - - VERBOSE("setting %s/%s, open=%d\n", - DENTRY_PATH(dentry), smb_is_open(inode)); - - /* setting the time on a Win95 server fails (tridge) */ - if (server->opt.protocol >= SMB_PROTOCOL_LANMAN2 && - !(server->mnt->flags & SMB_MOUNT_WIN95)) { - if (smb_is_open(inode) && SMB_I(inode)->access != SMB_O_RDONLY) - result = smb_proc_setattr_ext(server, inode, fattr); - else - result = smb_proc_setattr_trans2(server, dentry, fattr); - } else { - /* - * Fail silently on directories ... timestamp can't be set? - */ - result = 0; - if (S_ISREG(inode->i_mode)) { - /* - * Set the mtime by opening and closing the file. 
- * Note that the file is opened read-only, but this - * still allows us to set the date (tridge) - */ - result = -EACCES; - if (!smb_is_open(inode)) - smb_proc_open(server, dentry, SMB_O_RDONLY); - if (smb_is_open(inode)) { - inode->i_mtime = fattr->f_mtime; - result = smb_proc_close_inode(server, inode); - } - } - } - - return result; -} - -int -smb_proc_dskattr(struct dentry *dentry, struct kstatfs *attr) -{ - struct smb_sb_info *server = SMB_SB(dentry->d_sb); - int result; - char *p; - long unit; - struct smb_request *req; - - result = -ENOMEM; - if (! (req = smb_alloc_request(server, 0))) - goto out; - - smb_setup_header(req, SMBdskattr, 0, 0); - if ((result = smb_request_ok(req, SMBdskattr, 5, 0)) < 0) - goto out_free; - p = SMB_VWV(req->rq_header); - unit = (WVAL(p, 2) * WVAL(p, 4)) >> SMB_ST_BLKSHIFT; - attr->f_blocks = WVAL(p, 0) * unit; - attr->f_bsize = SMB_ST_BLKSIZE; - attr->f_bavail = attr->f_bfree = WVAL(p, 6) * unit; - result = 0; - -out_free: - smb_rput(req); -out: - return result; -} - -int -smb_proc_read_link(struct smb_sb_info *server, struct dentry *d, - char *buffer, int len) -{ - char *p, *param; - int result; - struct smb_request *req; - - DEBUG1("readlink of %s/%s\n", DENTRY_PATH(d)); - - result = -ENOMEM; - if (! (req = smb_alloc_request(server, PAGE_SIZE))) - goto out; - param = req->rq_buffer; - - WSET(param, 0, SMB_QUERY_FILE_UNIX_LINK); - DSET(param, 2, 0); - result = smb_encode_path(server, param+6, SMB_MAXPATHLEN+1, d, NULL); - if (result < 0) - goto out_free; - p = param + 6 + result; - - req->rq_trans2_command = TRANSACT2_QPATHINFO; - req->rq_ldata = 0; - req->rq_data = NULL; - req->rq_lparm = p - param; - req->rq_parm = param; - req->rq_flags = 0; - result = smb_add_request(req); - if (result < 0) - goto out_free; - DEBUG1("for %s: result=%d, rcls=%d, err=%d\n", - ¶m[6], result, req->rq_rcls, req->rq_err); - - /* copy data up to the \0 or buffer length */ - result = len; - if (req->rq_ldata < len) - result = req->rq_ldata; - strncpy(buffer, req->rq_data, result); - -out_free: - smb_rput(req); -out: - return result; -} - - -/* - * Create a symlink object called dentry which points to oldpath. - * Samba does not permit dangling links but returns a suitable error message. - */ -int -smb_proc_symlink(struct smb_sb_info *server, struct dentry *d, - const char *oldpath) -{ - char *p, *param; - int result; - struct smb_request *req; - - result = -ENOMEM; - if (! (req = smb_alloc_request(server, PAGE_SIZE))) - goto out; - param = req->rq_buffer; - - WSET(param, 0, SMB_SET_FILE_UNIX_LINK); - DSET(param, 2, 0); - result = smb_encode_path(server, param + 6, SMB_MAXPATHLEN+1, d, NULL); - if (result < 0) - goto out_free; - p = param + 6 + result; - - req->rq_trans2_command = TRANSACT2_SETPATHINFO; - req->rq_ldata = strlen(oldpath) + 1; - req->rq_data = (char *) oldpath; - req->rq_lparm = p - param; - req->rq_parm = param; - req->rq_flags = 0; - result = smb_add_request(req); - if (result < 0) - goto out_free; - - DEBUG1("for %s: result=%d, rcls=%d, err=%d\n", - ¶m[6], result, req->rq_rcls, req->rq_err); - result = 0; - -out_free: - smb_rput(req); -out: - return result; -} - -/* - * Create a hard link object called new_dentry which points to dentry. - */ -int -smb_proc_link(struct smb_sb_info *server, struct dentry *dentry, - struct dentry *new_dentry) -{ - char *p, *param; - int result; - struct smb_request *req; - - result = -ENOMEM; - if (! 
(req = smb_alloc_request(server, PAGE_SIZE))) - goto out; - param = req->rq_buffer; - - WSET(param, 0, SMB_SET_FILE_UNIX_HLINK); - DSET(param, 2, 0); - result = smb_encode_path(server, param + 6, SMB_MAXPATHLEN+1, - new_dentry, NULL); - if (result < 0) - goto out_free; - p = param + 6 + result; - - /* Grr, pointless separation of parameters and data ... */ - req->rq_data = p; - req->rq_ldata = smb_encode_path(server, p, SMB_MAXPATHLEN+1, - dentry, NULL); - - req->rq_trans2_command = TRANSACT2_SETPATHINFO; - req->rq_lparm = p - param; - req->rq_parm = param; - req->rq_flags = 0; - result = smb_add_request(req); - if (result < 0) - goto out_free; - - DEBUG1("for %s: result=%d, rcls=%d, err=%d\n", - ¶m[6], result, req->rq_rcls, req->rq_err); - result = 0; - -out_free: - smb_rput(req); -out: - return result; -} - -static int -smb_proc_query_cifsunix(struct smb_sb_info *server) -{ - int result; - int major, minor; - u64 caps; - char param[2]; - struct smb_request *req; - - result = -ENOMEM; - if (! (req = smb_alloc_request(server, 100))) - goto out; - - WSET(param, 0, SMB_QUERY_CIFS_UNIX_INFO); - - req->rq_trans2_command = TRANSACT2_QFSINFO; - req->rq_ldata = 0; - req->rq_data = NULL; - req->rq_lparm = 2; - req->rq_parm = param; - req->rq_flags = 0; - result = smb_add_request(req); - if (result < 0) - goto out_free; - - if (req->rq_ldata < 12) { - PARANOIA("Not enough data\n"); - goto out_free; - } - major = WVAL(req->rq_data, 0); - minor = WVAL(req->rq_data, 2); - - DEBUG1("Server implements CIFS Extensions for UNIX systems v%d.%d\n", - major, minor); - /* FIXME: verify that we are ok with this major/minor? */ - - caps = LVAL(req->rq_data, 4); - DEBUG1("Server capabilities 0x%016llx\n", caps); - -out_free: - smb_rput(req); -out: - return result; -} - - -static void -install_ops(struct smb_ops *dst, struct smb_ops *src) -{ - memcpy(dst, src, sizeof(void *) * SMB_OPS_NUM_STATIC); -} - -/* < LANMAN2 */ -static struct smb_ops smb_ops_core = -{ - .read = smb_proc_read, - .write = smb_proc_write, - .readdir = smb_proc_readdir_short, - .getattr = smb_proc_getattr_core, - .truncate = smb_proc_trunc32, -}; - -/* LANMAN2, OS/2, others? */ -static struct smb_ops smb_ops_os2 = -{ - .read = smb_proc_read, - .write = smb_proc_write, - .readdir = smb_proc_readdir_long, - .getattr = smb_proc_getattr_trans2_std, - .truncate = smb_proc_trunc32, -}; - -/* Win95, and possibly some NetApp versions too */ -static struct smb_ops smb_ops_win95 = -{ - .read = smb_proc_read, /* does not support 12word readX */ - .write = smb_proc_write, - .readdir = smb_proc_readdir_long, - .getattr = smb_proc_getattr_95, - .truncate = smb_proc_trunc95, -}; - -/* Samba, NT4 and NT5 */ -static struct smb_ops smb_ops_winNT = -{ - .read = smb_proc_readX, - .write = smb_proc_writeX, - .readdir = smb_proc_readdir_long, - .getattr = smb_proc_getattr_trans2_all, - .truncate = smb_proc_trunc64, -}; - -/* Samba w/ unix extensions. Others? */ -static struct smb_ops smb_ops_unix = -{ - .read = smb_proc_readX, - .write = smb_proc_writeX, - .readdir = smb_proc_readdir_long, - .getattr = smb_proc_getattr_unix, - /* FIXME: core/ext/time setattr needs to be cleaned up! 
*/ - /* .setattr = smb_proc_setattr_unix, */ - .truncate = smb_proc_trunc64, -}; - -/* Place holder until real ops are in place */ -static struct smb_ops smb_ops_null = -{ - .readdir = smb_proc_readdir_null, - .getattr = smb_proc_getattr_null, -}; - -void smb_install_null_ops(struct smb_ops *ops) -{ - install_ops(ops, &smb_ops_null); -} diff --git a/fs/smbfs/proto.h b/fs/smbfs/proto.h deleted file mode 100644 index 05939a6f43e..00000000000 --- a/fs/smbfs/proto.h +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Autogenerated with cproto on: Sat Sep 13 17:18:51 CEST 2003 - */ - -struct smb_request; -struct sock; -struct statfs; - -/* proc.c */ -extern int smb_setcodepage(struct smb_sb_info *server, struct smb_nls_codepage *cp); -extern __u32 smb_len(__u8 *p); -extern int smb_get_rsize(struct smb_sb_info *server); -extern int smb_get_wsize(struct smb_sb_info *server); -extern int smb_errno(struct smb_request *req); -extern int smb_newconn(struct smb_sb_info *server, struct smb_conn_opt *opt); -extern __u8 *smb_setup_header(struct smb_request *req, __u8 command, __u16 wct, __u16 bcc); -extern int smb_open(struct dentry *dentry, int wish); -extern int smb_close(struct inode *ino); -extern int smb_close_fileid(struct dentry *dentry, __u16 fileid); -extern int smb_proc_create(struct dentry *dentry, __u16 attr, time_t ctime, __u16 *fileid); -extern int smb_proc_mv(struct dentry *old_dentry, struct dentry *new_dentry); -extern int smb_proc_mkdir(struct dentry *dentry); -extern int smb_proc_rmdir(struct dentry *dentry); -extern int smb_proc_unlink(struct dentry *dentry); -extern int smb_proc_flush(struct smb_sb_info *server, __u16 fileid); -extern void smb_init_root_dirent(struct smb_sb_info *server, struct smb_fattr *fattr, - struct super_block *sb); -extern int smb_proc_getattr(struct dentry *dir, struct smb_fattr *fattr); -extern int smb_proc_setattr(struct dentry *dir, struct smb_fattr *fattr); -extern int smb_proc_setattr_unix(struct dentry *d, struct iattr *attr, unsigned int major, unsigned int minor); -extern int smb_proc_settime(struct dentry *dentry, struct smb_fattr *fattr); -extern int smb_proc_dskattr(struct dentry *dentry, struct kstatfs *attr); -extern int smb_proc_read_link(struct smb_sb_info *server, struct dentry *d, char *buffer, int len); -extern int smb_proc_symlink(struct smb_sb_info *server, struct dentry *d, const char *oldpath); -extern int smb_proc_link(struct smb_sb_info *server, struct dentry *dentry, struct dentry *new_dentry); -extern void smb_install_null_ops(struct smb_ops *ops); -/* dir.c */ -extern const struct file_operations smb_dir_operations; -extern const struct inode_operations smb_dir_inode_operations; -extern const struct inode_operations smb_dir_inode_operations_unix; -extern void smb_new_dentry(struct dentry *dentry); -extern void smb_renew_times(struct dentry *dentry); -/* cache.c */ -extern void smb_invalid_dir_cache(struct inode *dir); -extern void smb_invalidate_dircache_entries(struct dentry *parent); -extern struct dentry *smb_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos); -extern int smb_fill_cache(struct file *filp, void *dirent, filldir_t filldir, struct smb_cache_control *ctrl, struct qstr *qname, struct smb_fattr *entry); -/* sock.c */ -extern void smb_data_ready(struct sock *sk, int len); -extern int smb_valid_socket(struct inode *inode); -extern void smb_close_socket(struct smb_sb_info *server); -extern int smb_recv_available(struct smb_sb_info *server); -extern int smb_receive_header(struct smb_sb_info *server); -extern 
int smb_receive_drop(struct smb_sb_info *server); -extern int smb_receive(struct smb_sb_info *server, struct smb_request *req); -extern int smb_send_request(struct smb_request *req); -/* inode.c */ -extern struct inode *smb_iget(struct super_block *sb, struct smb_fattr *fattr); -extern void smb_get_inode_attr(struct inode *inode, struct smb_fattr *fattr); -extern void smb_set_inode_attr(struct inode *inode, struct smb_fattr *fattr); -extern void smb_invalidate_inodes(struct smb_sb_info *server); -extern int smb_revalidate_inode(struct dentry *dentry); -extern int smb_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); -extern int smb_notify_change(struct dentry *dentry, struct iattr *attr); -/* file.c */ -extern const struct address_space_operations smb_file_aops; -extern const struct file_operations smb_file_operations; -extern const struct inode_operations smb_file_inode_operations; -/* ioctl.c */ -extern long smb_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); -/* smbiod.c */ -extern void smbiod_wake_up(void); -extern int smbiod_register_server(struct smb_sb_info *server); -extern void smbiod_unregister_server(struct smb_sb_info *server); -extern void smbiod_flush(struct smb_sb_info *server); -extern int smbiod_retry(struct smb_sb_info *server); -/* request.c */ -extern int smb_init_request_cache(void); -extern void smb_destroy_request_cache(void); -extern struct smb_request *smb_alloc_request(struct smb_sb_info *server, int bufsize); -extern void smb_rput(struct smb_request *req); -extern int smb_add_request(struct smb_request *req); -extern int smb_request_send_server(struct smb_sb_info *server); -extern int smb_request_recv(struct smb_sb_info *server); -/* symlink.c */ -extern int smb_symlink(struct inode *inode, struct dentry *dentry, const char *oldname); -extern const struct inode_operations smb_link_inode_operations; diff --git a/fs/smbfs/request.c b/fs/smbfs/request.c deleted file mode 100644 index 45f45933e86..00000000000 --- a/fs/smbfs/request.c +++ /dev/null @@ -1,818 +0,0 @@ -/* - * request.c - * - * Copyright (C) 2001 by Urban Widmark - * - * Please add a note about your changes to smbfs in the ChangeLog file. 
- */ - -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "smb_debug.h" -#include "request.h" -#include "proto.h" - -/* #define SMB_SLAB_DEBUG (SLAB_RED_ZONE | SLAB_POISON) */ -#define SMB_SLAB_DEBUG 0 - -/* cache for request structures */ -static struct kmem_cache *req_cachep; - -static int smb_request_send_req(struct smb_request *req); - -/* - /proc/slabinfo: - name, active, num, objsize, active_slabs, num_slaps, #pages -*/ - - -int smb_init_request_cache(void) -{ - req_cachep = kmem_cache_create("smb_request", - sizeof(struct smb_request), 0, - SMB_SLAB_DEBUG | SLAB_HWCACHE_ALIGN, - NULL); - if (req_cachep == NULL) - return -ENOMEM; - - return 0; -} - -void smb_destroy_request_cache(void) -{ - kmem_cache_destroy(req_cachep); -} - -/* - * Allocate and initialise a request structure - */ -static struct smb_request *smb_do_alloc_request(struct smb_sb_info *server, - int bufsize) -{ - struct smb_request *req; - unsigned char *buf = NULL; - - req = kmem_cache_zalloc(req_cachep, GFP_KERNEL); - VERBOSE("allocating request: %p\n", req); - if (!req) - goto out; - - if (bufsize > 0) { - buf = kmalloc(bufsize, GFP_NOFS); - if (!buf) { - kmem_cache_free(req_cachep, req); - return NULL; - } - } - - req->rq_buffer = buf; - req->rq_bufsize = bufsize; - req->rq_server = server; - init_waitqueue_head(&req->rq_wait); - INIT_LIST_HEAD(&req->rq_queue); - atomic_set(&req->rq_count, 1); - -out: - return req; -} - -struct smb_request *smb_alloc_request(struct smb_sb_info *server, int bufsize) -{ - struct smb_request *req = NULL; - - for (;;) { - atomic_inc(&server->nr_requests); - if (atomic_read(&server->nr_requests) <= MAX_REQUEST_HARD) { - req = smb_do_alloc_request(server, bufsize); - if (req != NULL) - break; - } - -#if 0 - /* - * Try to free up at least one request in order to stay - * below the hard limit - */ - if (nfs_try_to_free_pages(server)) - continue; - - if (fatal_signal_pending(current)) - return ERR_PTR(-ERESTARTSYS); - current->policy = SCHED_YIELD; - schedule(); -#else - /* FIXME: we want something like nfs does above, but that - requires changes to all callers and can wait. */ - break; -#endif - } - return req; -} - -static void smb_free_request(struct smb_request *req) -{ - atomic_dec(&req->rq_server->nr_requests); - if (req->rq_buffer && !(req->rq_flags & SMB_REQ_STATIC)) - kfree(req->rq_buffer); - kfree(req->rq_trans2buffer); - kmem_cache_free(req_cachep, req); -} - -/* - * What prevents a rget to race with a rput? The count must never drop to zero - * while it is in use. Only rput if it is ok that it is free'd. - */ -static void smb_rget(struct smb_request *req) -{ - atomic_inc(&req->rq_count); -} -void smb_rput(struct smb_request *req) -{ - if (atomic_dec_and_test(&req->rq_count)) { - list_del_init(&req->rq_queue); - smb_free_request(req); - } -} - -/* setup to receive the data part of the SMB */ -static int smb_setup_bcc(struct smb_request *req) -{ - int result = 0; - req->rq_rlen = smb_len(req->rq_header) + 4 - req->rq_bytes_recvd; - - if (req->rq_rlen > req->rq_bufsize) { - PARANOIA("Packet too large %d > %d\n", - req->rq_rlen, req->rq_bufsize); - return -ENOBUFS; - } - - req->rq_iov[0].iov_base = req->rq_buffer; - req->rq_iov[0].iov_len = req->rq_rlen; - req->rq_iovlen = 1; - - return result; -} - -/* - * Prepare a "normal" request structure. 
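The comment on smb_rget()/smb_rput() above states the invariant: the count starts at 1 for the creator, must never reach zero while a queue still points at the request, and the final put is what frees it. A minimal userspace sketch of that get/put idiom, using C11 atomics in place of the kernel's atomic_t; all names here are illustrative.

/* Userspace model of the smb_rget()/smb_rput() reference counting. */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct demo_req {
	atomic_int count;
	char payload[32];
};

static struct demo_req *req_alloc(void)
{
	struct demo_req *r = calloc(1, sizeof(*r));
	if (r)
		atomic_init(&r->count, 1);	/* the creator owns one reference */
	return r;
}

static void req_get(struct demo_req *r)
{
	atomic_fetch_add(&r->count, 1);		/* caller must already hold a ref */
}

static void req_put(struct demo_req *r)
{
	if (atomic_fetch_sub(&r->count, 1) == 1) {
		printf("last reference dropped, freeing\n");
		free(r);
	}
}

int main(void)
{
	struct demo_req *r = req_alloc();
	if (!r)
		return 1;
	req_get(r);	/* e.g. a transmit queue takes its own reference */
	req_put(r);	/* queue is done with it */
	req_put(r);	/* original owner is done -> object is freed */
	return 0;
}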
- */ -static int smb_setup_request(struct smb_request *req) -{ - int len = smb_len(req->rq_header) + 4; - req->rq_slen = len; - - /* if we expect a data part in the reply we set the iov's to read it */ - if (req->rq_resp_bcc) - req->rq_setup_read = smb_setup_bcc; - - /* This tries to support re-using the same request */ - req->rq_bytes_sent = 0; - req->rq_rcls = 0; - req->rq_err = 0; - req->rq_errno = 0; - req->rq_fragment = 0; - kfree(req->rq_trans2buffer); - req->rq_trans2buffer = NULL; - - return 0; -} - -/* - * Prepare a transaction2 request structure - */ -static int smb_setup_trans2request(struct smb_request *req) -{ - struct smb_sb_info *server = req->rq_server; - int mparam, mdata; - static unsigned char padding[4]; - - /* I know the following is very ugly, but I want to build the - smb packet as efficiently as possible. */ - - const int smb_parameters = 15; - const int header = SMB_HEADER_LEN + 2 * smb_parameters + 2; - const int oparam = ALIGN(header + 3, sizeof(u32)); - const int odata = ALIGN(oparam + req->rq_lparm, sizeof(u32)); - const int bcc = (req->rq_data ? odata + req->rq_ldata : - oparam + req->rq_lparm) - header; - - if ((bcc + oparam) > server->opt.max_xmit) - return -ENOMEM; - smb_setup_header(req, SMBtrans2, smb_parameters, bcc); - - /* - * max parameters + max data + max setup == bufsize to make NT4 happy - * and not abort the transfer or split into multiple responses. It also - * makes smbfs happy as handling packets larger than the buffer size - * is extra work. - * - * OS/2 is probably going to hate me for this ... - */ - mparam = SMB_TRANS2_MAX_PARAM; - mdata = req->rq_bufsize - mparam; - - mdata = server->opt.max_xmit - mparam - 100; - if (mdata < 1024) { - mdata = 1024; - mparam = 20; - } - -#if 0 - /* NT/win2k has ~4k max_xmit, so with this we request more than it wants - to return as one SMB. Useful for testing the fragmented trans2 - handling. */ - mdata = 8192; -#endif - - WSET(req->rq_header, smb_tpscnt, req->rq_lparm); - WSET(req->rq_header, smb_tdscnt, req->rq_ldata); - WSET(req->rq_header, smb_mprcnt, mparam); - WSET(req->rq_header, smb_mdrcnt, mdata); - WSET(req->rq_header, smb_msrcnt, 0); /* max setup always 0 ? */ - WSET(req->rq_header, smb_flags, 0); - DSET(req->rq_header, smb_timeout, 0); - WSET(req->rq_header, smb_pscnt, req->rq_lparm); - WSET(req->rq_header, smb_psoff, oparam - 4); - WSET(req->rq_header, smb_dscnt, req->rq_ldata); - WSET(req->rq_header, smb_dsoff, req->rq_data ? odata - 4 : 0); - *(req->rq_header + smb_suwcnt) = 0x01; /* setup count */ - *(req->rq_header + smb_suwcnt + 1) = 0x00; /* reserved */ - WSET(req->rq_header, smb_setup0, req->rq_trans2_command); - - req->rq_iovlen = 2; - req->rq_iov[0].iov_base = (void *) req->rq_header; - req->rq_iov[0].iov_len = oparam; - req->rq_iov[1].iov_base = (req->rq_parm==NULL) ? 
padding : req->rq_parm; - req->rq_iov[1].iov_len = req->rq_lparm; - req->rq_slen = oparam + req->rq_lparm; - - if (req->rq_data) { - req->rq_iovlen += 2; - req->rq_iov[2].iov_base = padding; - req->rq_iov[2].iov_len = odata - oparam - req->rq_lparm; - req->rq_iov[3].iov_base = req->rq_data; - req->rq_iov[3].iov_len = req->rq_ldata; - req->rq_slen = odata + req->rq_ldata; - } - - /* always a data part for trans2 replies */ - req->rq_setup_read = smb_setup_bcc; - - return 0; -} - -/* - * Add a request and tell smbiod to process it - */ -int smb_add_request(struct smb_request *req) -{ - long timeleft; - struct smb_sb_info *server = req->rq_server; - int result = 0; - - smb_setup_request(req); - if (req->rq_trans2_command) { - if (req->rq_buffer == NULL) { - PARANOIA("trans2 attempted without response buffer!\n"); - return -EIO; - } - result = smb_setup_trans2request(req); - } - if (result < 0) - return result; - -#ifdef SMB_DEBUG_PACKET_SIZE - add_xmit_stats(req); -#endif - - /* add 'req' to the queue of requests */ - if (smb_lock_server_interruptible(server)) - return -EINTR; - - /* - * Try to send the request as the process. If that fails we queue the - * request and let smbiod send it later. - */ - - /* FIXME: each server has a number on the maximum number of parallel - requests. 10, 50 or so. We should not allow more requests to be - active. */ - if (server->mid > 0xf000) - server->mid = 0; - req->rq_mid = server->mid++; - WSET(req->rq_header, smb_mid, req->rq_mid); - - result = 0; - if (server->state == CONN_VALID) { - if (list_empty(&server->xmitq)) - result = smb_request_send_req(req); - if (result < 0) { - /* Connection lost? */ - server->conn_error = result; - server->state = CONN_INVALID; - } - } - if (result != 1) - list_add_tail(&req->rq_queue, &server->xmitq); - smb_rget(req); - - if (server->state != CONN_VALID) - smbiod_retry(server); - - smb_unlock_server(server); - - smbiod_wake_up(); - - timeleft = wait_event_interruptible_timeout(req->rq_wait, - req->rq_flags & SMB_REQ_RECEIVED, 30*HZ); - if (!timeleft || signal_pending(current)) { - /* - * On timeout or on interrupt we want to try and remove the - * request from the recvq/xmitq. - * First check if the request is still part of a queue. (May - * have been removed by some error condition) - */ - smb_lock_server(server); - if (!list_empty(&req->rq_queue)) { - list_del_init(&req->rq_queue); - smb_rput(req); - } - smb_unlock_server(server); - } - - if (!timeleft) { - PARANOIA("request [%p, mid=%d] timed out!\n", - req, req->rq_mid); - VERBOSE("smb_com: %02x\n", *(req->rq_header + smb_com)); - VERBOSE("smb_rcls: %02x\n", *(req->rq_header + smb_rcls)); - VERBOSE("smb_flg: %02x\n", *(req->rq_header + smb_flg)); - VERBOSE("smb_tid: %04x\n", WVAL(req->rq_header, smb_tid)); - VERBOSE("smb_pid: %04x\n", WVAL(req->rq_header, smb_pid)); - VERBOSE("smb_uid: %04x\n", WVAL(req->rq_header, smb_uid)); - VERBOSE("smb_mid: %04x\n", WVAL(req->rq_header, smb_mid)); - VERBOSE("smb_wct: %02x\n", *(req->rq_header + smb_wct)); - - req->rq_rcls = ERRSRV; - req->rq_err = ERRtimeout; - - /* Just in case it was "stuck" */ - smbiod_wake_up(); - } - VERBOSE("woke up, rcls=%d\n", req->rq_rcls); - - if (req->rq_rcls != 0) - req->rq_errno = smb_errno(req); - if (signal_pending(current)) - req->rq_errno = -ERESTARTSYS; - return req->rq_errno; -} - -/* - * Send a request and place it on the recvq if successfully sent. - * Must be called with the server lock held. 
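The offset arithmetic in smb_setup_trans2request() above is easier to follow with concrete numbers: with SMB_HEADER_LEN = 37 and a 15-word trans2 header, the parameter block starts a few bytes past the header rounded up to a 4-byte boundary, and the data block at the next 4-byte boundary after the parameters. A standalone rendering of the same computation; the parameter and data sizes are made-up sample values.

/* Worked example of the trans2 offset arithmetic shown above. */
#include <stdio.h>

#define SMB_HEADER_LEN	37
#define ALIGN4(x)	(((x) + 3) & ~3)

int main(void)
{
	const int smb_parameters = 15;		/* trans2 word count */
	const int header = SMB_HEADER_LEN + 2 * smb_parameters + 2;
	int lparm = 6;				/* sample parameter byte count */
	int ldata = 0;				/* sample: no data section */

	int oparam = ALIGN4(header + 3);
	int odata  = ALIGN4(oparam + lparm);
	int bcc    = (ldata ? odata + ldata : oparam + lparm) - header;

	printf("header=%d oparam=%d odata=%d bcc=%d\n",
	       header, oparam, odata, bcc);
	/* prints: header=69 oparam=72 odata=80 bcc=9 */
	return 0;
}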
- */ -static int smb_request_send_req(struct smb_request *req) -{ - struct smb_sb_info *server = req->rq_server; - int result; - - if (req->rq_bytes_sent == 0) { - WSET(req->rq_header, smb_tid, server->opt.tid); - WSET(req->rq_header, smb_pid, 1); - WSET(req->rq_header, smb_uid, server->opt.server_uid); - } - - result = smb_send_request(req); - if (result < 0 && result != -EAGAIN) - goto out; - - result = 0; - if (!(req->rq_flags & SMB_REQ_TRANSMITTED)) - goto out; - - list_move_tail(&req->rq_queue, &server->recvq); - result = 1; -out: - return result; -} - -/* - * Sends one request for this server. (smbiod) - * Must be called with the server lock held. - * Returns: <0 on error - * 0 if no request could be completely sent - * 1 if all data for one request was sent - */ -int smb_request_send_server(struct smb_sb_info *server) -{ - struct list_head *head; - struct smb_request *req; - int result; - - if (server->state != CONN_VALID) - return 0; - - /* dequeue first request, if any */ - req = NULL; - head = server->xmitq.next; - if (head != &server->xmitq) { - req = list_entry(head, struct smb_request, rq_queue); - } - if (!req) - return 0; - - result = smb_request_send_req(req); - if (result < 0) { - server->conn_error = result; - list_move(&req->rq_queue, &server->xmitq); - result = -EIO; - goto out; - } - -out: - return result; -} - -/* - * Try to find a request matching this "mid". Typically the first entry will - * be the matching one. - */ -static struct smb_request *find_request(struct smb_sb_info *server, int mid) -{ - struct list_head *tmp; - struct smb_request *req = NULL; - - list_for_each(tmp, &server->recvq) { - req = list_entry(tmp, struct smb_request, rq_queue); - if (req->rq_mid == mid) { - break; - } - req = NULL; - } - - if (!req) { - VERBOSE("received reply with mid %d but no request!\n", - WVAL(server->header, smb_mid)); - server->rstate = SMB_RECV_DROP; - } - - return req; -} - -/* - * Called when we have read the smb header and believe this is a response. - */ -static int smb_init_request(struct smb_sb_info *server, struct smb_request *req) -{ - int hdrlen, wct; - - memcpy(req->rq_header, server->header, SMB_HEADER_LEN); - - wct = *(req->rq_header + smb_wct); - if (wct > 20) { - PARANOIA("wct too large, %d > 20\n", wct); - server->rstate = SMB_RECV_DROP; - return 0; - } - - req->rq_resp_wct = wct; - hdrlen = SMB_HEADER_LEN + wct*2 + 2; - VERBOSE("header length: %d smb_wct: %2d\n", hdrlen, wct); - - req->rq_bytes_recvd = SMB_HEADER_LEN; - req->rq_rlen = hdrlen; - req->rq_iov[0].iov_base = req->rq_header; - req->rq_iov[0].iov_len = hdrlen; - req->rq_iovlen = 1; - server->rstate = SMB_RECV_PARAM; - -#ifdef SMB_DEBUG_PACKET_SIZE - add_recv_stats(smb_len(server->header)); -#endif - return 0; -} - -/* - * Reads the SMB parameters - */ -static int smb_recv_param(struct smb_sb_info *server, struct smb_request *req) -{ - int result; - - result = smb_receive(server, req); - if (result < 0) - return result; - if (req->rq_bytes_recvd < req->rq_rlen) - return 0; - - VERBOSE("result: %d smb_bcc: %04x\n", result, - WVAL(req->rq_header, SMB_HEADER_LEN + - (*(req->rq_header + smb_wct) * 2))); - - result = 0; - req->rq_iov[0].iov_base = NULL; - req->rq_rlen = 0; - if (req->rq_callback) - req->rq_callback(req); - else if (req->rq_setup_read) - result = req->rq_setup_read(req); - if (result < 0) { - server->rstate = SMB_RECV_DROP; - return result; - } - - server->rstate = req->rq_rlen > 0 ? 
SMB_RECV_DATA : SMB_RECV_END; - - req->rq_bytes_recvd = 0; // recvd out of the iov - - VERBOSE("rlen: %d\n", req->rq_rlen); - if (req->rq_rlen < 0) { - PARANOIA("Parameters read beyond end of packet!\n"); - server->rstate = SMB_RECV_END; - return -EIO; - } - return 0; -} - -/* - * Reads the SMB data - */ -static int smb_recv_data(struct smb_sb_info *server, struct smb_request *req) -{ - int result; - - result = smb_receive(server, req); - if (result < 0) - goto out; - if (req->rq_bytes_recvd < req->rq_rlen) - goto out; - server->rstate = SMB_RECV_END; -out: - VERBOSE("result: %d\n", result); - return result; -} - -/* - * Receive a transaction2 response - * Return: 0 if the response has been fully read - * 1 if there are further "fragments" to read - * <0 if there is an error - */ -static int smb_recv_trans2(struct smb_sb_info *server, struct smb_request *req) -{ - unsigned char *inbuf; - unsigned int parm_disp, parm_offset, parm_count, parm_tot; - unsigned int data_disp, data_offset, data_count, data_tot; - int hdrlen = SMB_HEADER_LEN + req->rq_resp_wct*2 - 2; - - VERBOSE("handling trans2\n"); - - inbuf = req->rq_header; - data_tot = WVAL(inbuf, smb_tdrcnt); - parm_tot = WVAL(inbuf, smb_tprcnt); - parm_disp = WVAL(inbuf, smb_prdisp); - parm_offset = WVAL(inbuf, smb_proff); - parm_count = WVAL(inbuf, smb_prcnt); - data_disp = WVAL(inbuf, smb_drdisp); - data_offset = WVAL(inbuf, smb_droff); - data_count = WVAL(inbuf, smb_drcnt); - - /* Modify offset for the split header/buffer we use */ - if (data_count || data_offset) { - if (unlikely(data_offset < hdrlen)) - goto out_bad_data; - else - data_offset -= hdrlen; - } - if (parm_count || parm_offset) { - if (unlikely(parm_offset < hdrlen)) - goto out_bad_parm; - else - parm_offset -= hdrlen; - } - - if (parm_count == parm_tot && data_count == data_tot) { - /* - * This packet has all the trans2 data. - * - * We setup the request so that this will be the common - * case. It may be a server error to not return a - * response that fits. 
- */ - VERBOSE("single trans2 response " - "dcnt=%u, pcnt=%u, doff=%u, poff=%u\n", - data_count, parm_count, - data_offset, parm_offset); - req->rq_ldata = data_count; - req->rq_lparm = parm_count; - req->rq_data = req->rq_buffer + data_offset; - req->rq_parm = req->rq_buffer + parm_offset; - if (unlikely(parm_offset + parm_count > req->rq_rlen)) - goto out_bad_parm; - if (unlikely(data_offset + data_count > req->rq_rlen)) - goto out_bad_data; - return 0; - } - - VERBOSE("multi trans2 response " - "frag=%d, dcnt=%u, pcnt=%u, doff=%u, poff=%u\n", - req->rq_fragment, - data_count, parm_count, - data_offset, parm_offset); - - if (!req->rq_fragment) { - int buf_len; - - /* We got the first trans2 fragment */ - req->rq_fragment = 1; - req->rq_total_data = data_tot; - req->rq_total_parm = parm_tot; - req->rq_ldata = 0; - req->rq_lparm = 0; - - buf_len = data_tot + parm_tot; - if (buf_len > SMB_MAX_PACKET_SIZE) - goto out_too_long; - - req->rq_trans2bufsize = buf_len; - req->rq_trans2buffer = kzalloc(buf_len, GFP_NOFS); - if (!req->rq_trans2buffer) - goto out_no_mem; - - req->rq_parm = req->rq_trans2buffer; - req->rq_data = req->rq_trans2buffer + parm_tot; - } else if (unlikely(req->rq_total_data < data_tot || - req->rq_total_parm < parm_tot)) - goto out_data_grew; - - if (unlikely(parm_disp + parm_count > req->rq_total_parm || - parm_offset + parm_count > req->rq_rlen)) - goto out_bad_parm; - if (unlikely(data_disp + data_count > req->rq_total_data || - data_offset + data_count > req->rq_rlen)) - goto out_bad_data; - - inbuf = req->rq_buffer; - memcpy(req->rq_parm + parm_disp, inbuf + parm_offset, parm_count); - memcpy(req->rq_data + data_disp, inbuf + data_offset, data_count); - - req->rq_ldata += data_count; - req->rq_lparm += parm_count; - - /* - * Check whether we've received all of the data. Note that - * we use the packet totals -- total lengths might shrink! - */ - if (req->rq_ldata >= data_tot && req->rq_lparm >= parm_tot) { - req->rq_ldata = data_tot; - req->rq_lparm = parm_tot; - return 0; - } - return 1; - -out_too_long: - printk(KERN_ERR "smb_trans2: data/param too long, data=%u, parm=%u\n", - data_tot, parm_tot); - goto out_EIO; -out_no_mem: - printk(KERN_ERR "smb_trans2: couldn't allocate data area of %d bytes\n", - req->rq_trans2bufsize); - req->rq_errno = -ENOMEM; - goto out; -out_data_grew: - printk(KERN_ERR "smb_trans2: data/params grew!\n"); - goto out_EIO; -out_bad_parm: - printk(KERN_ERR "smb_trans2: invalid parms, disp=%u, cnt=%u, tot=%u, ofs=%u\n", - parm_disp, parm_count, parm_tot, parm_offset); - goto out_EIO; -out_bad_data: - printk(KERN_ERR "smb_trans2: invalid data, disp=%u, cnt=%u, tot=%u, ofs=%u\n", - data_disp, data_count, data_tot, data_offset); -out_EIO: - req->rq_errno = -EIO; -out: - return req->rq_errno; -} - -/* - * State machine for receiving responses. We handle the fact that we can't - * read the full response in one try by having states telling us how much we - * have read. - * - * Must be called with the server lock held (only called from smbiod). 
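The state machine described in this comment (and driven by the switch in smb_request_recv() just below) only advances when the piece currently being collected — header, then parameter words, then data — has arrived in full; a short read simply leaves the state unchanged until more bytes show up. A compact standalone model of that idea; read_some() and the byte counts are invented for the demonstration.

/* Standalone sketch of an incremental, resumable receive state machine. */
#include <stdio.h>

enum rstate { RECV_HEADER, RECV_PARAM, RECV_DATA, RECV_END };

/* pretend the peer trickles the reply in at most 3 bytes per pass */
static int read_some(int wanted) { return wanted > 3 ? 3 : wanted; }

int main(void)
{
	enum rstate state = RECV_HEADER;
	int need = 8, got = 0;		/* sample sizes for each stage */

	while (state != RECV_END) {
		got += read_some(need - got);
		if (got < need)
			continue;	/* piece incomplete: stay in this state */
		switch (state) {
		case RECV_HEADER: state = RECV_PARAM; need = 6;  got = 0; break;
		case RECV_PARAM:  state = RECV_DATA;  need = 10; got = 0; break;
		case RECV_DATA:   state = RECV_END;                       break;
		default:                                                  break;
		}
		printf("advanced to state %d\n", state);
	}
	return 0;
}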
- * - * Return: <0 on error - */ -int smb_request_recv(struct smb_sb_info *server) -{ - struct smb_request *req = NULL; - int result = 0; - - if (smb_recv_available(server) <= 0) - return 0; - - VERBOSE("state: %d\n", server->rstate); - switch (server->rstate) { - case SMB_RECV_DROP: - result = smb_receive_drop(server); - if (result < 0) - break; - if (server->rstate == SMB_RECV_DROP) - break; - server->rstate = SMB_RECV_START; - /* fallthrough */ - case SMB_RECV_START: - server->smb_read = 0; - server->rstate = SMB_RECV_HEADER; - /* fallthrough */ - case SMB_RECV_HEADER: - result = smb_receive_header(server); - if (result < 0) - break; - if (server->rstate == SMB_RECV_HEADER) - break; - if (! (*(server->header + smb_flg) & SMB_FLAGS_REPLY) ) { - server->rstate = SMB_RECV_REQUEST; - break; - } - if (server->rstate != SMB_RECV_HCOMPLETE) - break; - /* fallthrough */ - case SMB_RECV_HCOMPLETE: - req = find_request(server, WVAL(server->header, smb_mid)); - if (!req) - break; - smb_init_request(server, req); - req->rq_rcls = *(req->rq_header + smb_rcls); - req->rq_err = WVAL(req->rq_header, smb_err); - if (server->rstate != SMB_RECV_PARAM) - break; - /* fallthrough */ - case SMB_RECV_PARAM: - if (!req) - req = find_request(server,WVAL(server->header,smb_mid)); - if (!req) - break; - result = smb_recv_param(server, req); - if (result < 0) - break; - if (server->rstate != SMB_RECV_DATA) - break; - /* fallthrough */ - case SMB_RECV_DATA: - if (!req) - req = find_request(server,WVAL(server->header,smb_mid)); - if (!req) - break; - result = smb_recv_data(server, req); - if (result < 0) - break; - break; - - /* We should never be called with any of these states */ - case SMB_RECV_END: - case SMB_RECV_REQUEST: - BUG(); - } - - if (result < 0) { - /* We saw an error */ - return result; - } - - if (server->rstate != SMB_RECV_END) - return 0; - - result = 0; - if (req->rq_trans2_command && req->rq_rcls == SUCCESS) - result = smb_recv_trans2(server, req); - - /* - * Response completely read. Drop any extra bytes sent by the server. - * (Yes, servers sometimes add extra bytes to responses) - */ - VERBOSE("smb_len: %d smb_read: %d\n", - server->smb_len, server->smb_read); - if (server->smb_read < server->smb_len) - smb_receive_drop(server); - - server->rstate = SMB_RECV_START; - - if (!result) { - list_del_init(&req->rq_queue); - req->rq_flags |= SMB_REQ_RECEIVED; - smb_rput(req); - wake_up_interruptible(&req->rq_wait); - } - return 0; -} diff --git a/fs/smbfs/request.h b/fs/smbfs/request.h deleted file mode 100644 index efb21451e7c..00000000000 --- a/fs/smbfs/request.h +++ /dev/null @@ -1,70 +0,0 @@ -#include -#include -#include -#include - -struct smb_request { - struct list_head rq_queue; /* recvq or xmitq for the server */ - - atomic_t rq_count; - - wait_queue_head_t rq_wait; - int rq_flags; - int rq_mid; /* multiplex ID, set by request.c */ - - struct smb_sb_info *rq_server; - - /* header + word count + parameter words + byte count */ - unsigned char rq_header[SMB_HEADER_LEN + 20*2 + 2]; - - int rq_bufsize; - unsigned char *rq_buffer; - - /* FIXME: this is not good enough for merging IO requests. 
*/ - unsigned char *rq_page; - int rq_rsize; - - int rq_resp_wct; - int rq_resp_bcc; - - int rq_rlen; - int rq_bytes_recvd; - - int rq_slen; - int rq_bytes_sent; - - int rq_iovlen; - struct kvec rq_iov[4]; - - int (*rq_setup_read) (struct smb_request *); - void (*rq_callback) (struct smb_request *); - - /* ------ trans2 stuff ------ */ - - u16 rq_trans2_command; /* 0 if not a trans2 request */ - unsigned int rq_ldata; - unsigned char *rq_data; - unsigned int rq_lparm; - unsigned char *rq_parm; - - int rq_fragment; - u32 rq_total_data; - u32 rq_total_parm; - int rq_trans2bufsize; - unsigned char *rq_trans2buffer; - - /* ------ response ------ */ - - unsigned short rq_rcls; - unsigned short rq_err; - int rq_errno; -}; - -#define SMB_REQ_STATIC 0x0001 /* rq_buffer is static */ -#define SMB_REQ_NORETRY 0x0002 /* request is invalid after retry */ - -#define SMB_REQ_TRANSMITTED 0x4000 /* all data has been sent */ -#define SMB_REQ_RECEIVED 0x8000 /* reply received, smbiod is done */ - -#define xSMB_REQ_NOREPLY 0x0004 /* we don't want the reply (if any) */ -#define xSMB_REQ_NORECEIVER 0x0008 /* caller doesn't wait for response */ diff --git a/fs/smbfs/smb_debug.h b/fs/smbfs/smb_debug.h deleted file mode 100644 index fc4b1a5dd75..00000000000 --- a/fs/smbfs/smb_debug.h +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Defines some debug macros for smbfs. - */ - -/* This makes a dentry parent/child name pair. Useful for debugging printk's */ -#define DENTRY_PATH(dentry) \ - (dentry)->d_parent->d_name.name,(dentry)->d_name.name - -/* - * safety checks that should never happen ??? - * these are normally enabled. - */ -#ifdef SMBFS_PARANOIA -# define PARANOIA(f, a...) printk(KERN_NOTICE "%s: " f, __func__ , ## a) -#else -# define PARANOIA(f, a...) do { ; } while(0) -#endif - -/* lots of debug messages */ -#ifdef SMBFS_DEBUG_VERBOSE -# define VERBOSE(f, a...) printk(KERN_DEBUG "%s: " f, __func__ , ## a) -#else -# define VERBOSE(f, a...) do { ; } while(0) -#endif - -/* - * "normal" debug messages, but not with a normal DEBUG define ... way - * too common name. - */ -#ifdef SMBFS_DEBUG -#define DEBUG1(f, a...) printk(KERN_DEBUG "%s: " f, __func__ , ## a) -#else -#define DEBUG1(f, a...) do { ; } while(0) -#endif diff --git a/fs/smbfs/smbiod.c b/fs/smbfs/smbiod.c deleted file mode 100644 index 0e39a924f10..00000000000 --- a/fs/smbfs/smbiod.c +++ /dev/null @@ -1,344 +0,0 @@ -/* - * smbiod.c - * - * Copyright (C) 2000, Charles Loep / Corel Corp. 
- * Copyright (C) 2001, Urban Widmark - */ - - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include - -#include "smb_debug.h" -#include "request.h" -#include "proto.h" - -enum smbiod_state { - SMBIOD_DEAD, - SMBIOD_STARTING, - SMBIOD_RUNNING, -}; - -static enum smbiod_state smbiod_state = SMBIOD_DEAD; -static struct task_struct *smbiod_thread; -static DECLARE_WAIT_QUEUE_HEAD(smbiod_wait); -static LIST_HEAD(smb_servers); -static DEFINE_SPINLOCK(servers_lock); - -#define SMBIOD_DATA_READY (1<<0) -static unsigned long smbiod_flags; - -static int smbiod(void *); -static int smbiod_start(void); - -/* - * called when there's work for us to do - */ -void smbiod_wake_up(void) -{ - if (smbiod_state == SMBIOD_DEAD) - return; - set_bit(SMBIOD_DATA_READY, &smbiod_flags); - wake_up_interruptible(&smbiod_wait); -} - -/* - * start smbiod if none is running - */ -static int smbiod_start(void) -{ - struct task_struct *tsk; - int err = 0; - - if (smbiod_state != SMBIOD_DEAD) - return 0; - smbiod_state = SMBIOD_STARTING; - __module_get(THIS_MODULE); - spin_unlock(&servers_lock); - tsk = kthread_run(smbiod, NULL, "smbiod"); - if (IS_ERR(tsk)) { - err = PTR_ERR(tsk); - module_put(THIS_MODULE); - } - - spin_lock(&servers_lock); - if (err < 0) { - smbiod_state = SMBIOD_DEAD; - smbiod_thread = NULL; - } else { - smbiod_state = SMBIOD_RUNNING; - smbiod_thread = tsk; - } - return err; -} - -/* - * register a server & start smbiod if necessary - */ -int smbiod_register_server(struct smb_sb_info *server) -{ - int ret; - spin_lock(&servers_lock); - list_add(&server->entry, &smb_servers); - VERBOSE("%p\n", server); - ret = smbiod_start(); - spin_unlock(&servers_lock); - return ret; -} - -/* - * Unregister a server - * Must be called with the server lock held. - */ -void smbiod_unregister_server(struct smb_sb_info *server) -{ - spin_lock(&servers_lock); - list_del_init(&server->entry); - VERBOSE("%p\n", server); - spin_unlock(&servers_lock); - - smbiod_wake_up(); - smbiod_flush(server); -} - -void smbiod_flush(struct smb_sb_info *server) -{ - struct list_head *tmp, *n; - struct smb_request *req; - - list_for_each_safe(tmp, n, &server->xmitq) { - req = list_entry(tmp, struct smb_request, rq_queue); - req->rq_errno = -EIO; - list_del_init(&req->rq_queue); - smb_rput(req); - wake_up_interruptible(&req->rq_wait); - } - list_for_each_safe(tmp, n, &server->recvq) { - req = list_entry(tmp, struct smb_request, rq_queue); - req->rq_errno = -EIO; - list_del_init(&req->rq_queue); - smb_rput(req); - wake_up_interruptible(&req->rq_wait); - } -} - -/* - * Wake up smbmount and make it reconnect to the server. - * This must be called with the server locked. - * - * FIXME: add smbconnect version to this - */ -int smbiod_retry(struct smb_sb_info *server) -{ - struct list_head *head; - struct smb_request *req; - struct pid *pid = get_pid(server->conn_pid); - int result = 0; - - VERBOSE("state: %d\n", server->state); - if (server->state == CONN_VALID || server->state == CONN_RETRYING) - goto out; - - smb_invalidate_inodes(server); - - /* - * Some requests are meaningless after a retry, so we abort them. - * One example are all requests using 'fileid' since the files are - * closed on retry. 
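smbiod_wake_up() above is the producer half of a classic flag-and-wake scheme: set SMBIOD_DATA_READY, wake the thread, and let the thread clear the bit before each pass over the registered servers. A rough userspace analogue, with pthreads standing in for wait_event_interruptible()/wake_up_interruptible(); the structure, not the primitives, is the point, and all names are illustrative.

/* Userspace analogue of the smbiod wake-up/clear-flag loop. */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static int data_ready;
static int quit;

static void wake_worker(void)
{
	pthread_mutex_lock(&lock);
	data_ready = 1;			/* like set_bit(SMBIOD_DATA_READY, ...) */
	pthread_cond_signal(&cond);	/* like wake_up_interruptible() */
	pthread_mutex_unlock(&lock);
}

static void *worker(void *unused)
{
	(void)unused;
	pthread_mutex_lock(&lock);
	for (;;) {
		while (!data_ready && !quit)
			pthread_cond_wait(&cond, &lock);
		if (quit)
			break;
		data_ready = 0;		/* clear the flag before doing the work */
		pthread_mutex_unlock(&lock);
		printf("worker: one pass over the registered servers\n");
		pthread_mutex_lock(&lock);
	}
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, worker, NULL);
	wake_worker();			/* "data arrived on some socket" */

	pthread_mutex_lock(&lock);
	quit = 1;
	pthread_cond_signal(&cond);
	pthread_mutex_unlock(&lock);
	pthread_join(t, NULL);
	return 0;
}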
- */ - head = server->xmitq.next; - while (head != &server->xmitq) { - req = list_entry(head, struct smb_request, rq_queue); - head = head->next; - - req->rq_bytes_sent = 0; - if (req->rq_flags & SMB_REQ_NORETRY) { - VERBOSE("aborting request %p on xmitq\n", req); - req->rq_errno = -EIO; - list_del_init(&req->rq_queue); - smb_rput(req); - wake_up_interruptible(&req->rq_wait); - } - } - - /* - * FIXME: test the code for retrying request we already sent - */ - head = server->recvq.next; - while (head != &server->recvq) { - req = list_entry(head, struct smb_request, rq_queue); - head = head->next; -#if 0 - if (req->rq_flags & SMB_REQ_RETRY) { - /* must move the request to the xmitq */ - VERBOSE("retrying request %p on recvq\n", req); - list_move(&req->rq_queue, &server->xmitq); - continue; - } -#endif - - VERBOSE("aborting request %p on recvq\n", req); - /* req->rq_rcls = ???; */ /* FIXME: set smb error code too? */ - req->rq_errno = -EIO; - list_del_init(&req->rq_queue); - smb_rput(req); - wake_up_interruptible(&req->rq_wait); - } - - smb_close_socket(server); - - if (!pid) { - /* FIXME: this is fatal, umount? */ - printk(KERN_ERR "smb_retry: no connection process\n"); - server->state = CONN_RETRIED; - goto out; - } - - /* - * Change state so that only one retry per server will be started. - */ - server->state = CONN_RETRYING; - - /* - * Note: use the "priv" flag, as a user process may need to reconnect. - */ - result = kill_pid(pid, SIGUSR1, 1); - if (result) { - /* FIXME: this is most likely fatal, umount? */ - printk(KERN_ERR "smb_retry: signal failed [%d]\n", result); - goto out; - } - VERBOSE("signalled pid %d\n", pid_nr(pid)); - - /* FIXME: The retried requests should perhaps get a "time boost". */ - -out: - put_pid(pid); - return result; -} - -/* - * Currently handles lockingX packets. - */ -static void smbiod_handle_request(struct smb_sb_info *server) -{ - PARANOIA("smbiod got a request ... and we don't implement oplocks!\n"); - server->rstate = SMB_RECV_DROP; -} - -/* - * Do some IO for one server. - */ -static void smbiod_doio(struct smb_sb_info *server) -{ - int result; - int maxwork = 7; - - if (server->state != CONN_VALID) - goto out; - - do { - result = smb_request_recv(server); - if (result < 0) { - server->state = CONN_INVALID; - smbiod_retry(server); - goto out; /* reconnecting is slow */ - } else if (server->rstate == SMB_RECV_REQUEST) - smbiod_handle_request(server); - } while (result > 0 && maxwork-- > 0); - - /* - * If there is more to read then we want to be sure to wake up again. - */ - if (server->state != CONN_VALID) - goto out; - if (smb_recv_available(server) > 0) - set_bit(SMBIOD_DATA_READY, &smbiod_flags); - - do { - result = smb_request_send_server(server); - if (result < 0) { - server->state = CONN_INVALID; - smbiod_retry(server); - goto out; /* reconnecting is slow */ - } - } while (result > 0); - - /* - * If the last request was not sent out we want to wake up again. - */ - if (!list_empty(&server->xmitq)) - set_bit(SMBIOD_DATA_READY, &smbiod_flags); - -out: - return; -} - -/* - * smbiod kernel thread - */ -static int smbiod(void *unused) -{ - VERBOSE("SMB Kernel thread starting (%d) ...\n", current->pid); - - for (;;) { - struct smb_sb_info *server; - struct list_head *pos, *n; - - /* FIXME: Use poll? 
*/ - wait_event_interruptible(smbiod_wait, - test_bit(SMBIOD_DATA_READY, &smbiod_flags)); - if (signal_pending(current)) { - spin_lock(&servers_lock); - smbiod_state = SMBIOD_DEAD; - spin_unlock(&servers_lock); - break; - } - - clear_bit(SMBIOD_DATA_READY, &smbiod_flags); - - spin_lock(&servers_lock); - if (list_empty(&smb_servers)) { - smbiod_state = SMBIOD_DEAD; - spin_unlock(&servers_lock); - break; - } - - list_for_each_safe(pos, n, &smb_servers) { - server = list_entry(pos, struct smb_sb_info, entry); - VERBOSE("checking server %p\n", server); - - if (server->state == CONN_VALID) { - spin_unlock(&servers_lock); - - smb_lock_server(server); - smbiod_doio(server); - smb_unlock_server(server); - - spin_lock(&servers_lock); - } - } - spin_unlock(&servers_lock); - } - - VERBOSE("SMB Kernel thread exiting (%d) ...\n", current->pid); - module_put_and_exit(0); -} diff --git a/fs/smbfs/sock.c b/fs/smbfs/sock.c deleted file mode 100644 index e37fe4deebd..00000000000 --- a/fs/smbfs/sock.c +++ /dev/null @@ -1,386 +0,0 @@ -/* - * sock.c - * - * Copyright (C) 1995, 1996 by Paal-Kr. Engstad and Volker Lendecke - * Copyright (C) 1997 by Volker Lendecke - * - * Please add a note about your changes to smbfs in the ChangeLog file. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include - -#include "smb_debug.h" -#include "proto.h" -#include "request.h" - - -static int -_recvfrom(struct socket *socket, unsigned char *ubuf, int size, unsigned flags) -{ - struct kvec iov = {ubuf, size}; - struct msghdr msg = {.msg_flags = flags}; - msg.msg_flags |= MSG_DONTWAIT | MSG_NOSIGNAL; - return kernel_recvmsg(socket, &msg, &iov, 1, size, msg.msg_flags); -} - -/* - * Return the server this socket belongs to - */ -static struct smb_sb_info * -server_from_socket(struct socket *socket) -{ - return socket->sk->sk_user_data; -} - -/* - * Called when there is data on the socket. 
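smb_data_ready() just below chains to the socket's original sk_data_ready callback (which was saved in server->data_ready when the connection was handed over, in proc.c, outside this hunk) and then pokes smbiod; smb_close_socket() later restores the saved pointer. A standalone model of that save, chain, restore pattern using plain function pointers; demo_sock and the callbacks are illustrative only.

/* Standalone model of wrapping and restoring a data-ready callback. */
#include <stdio.h>

struct demo_sock {
	void (*data_ready)(struct demo_sock *);
};

static void (*saved_ready)(struct demo_sock *);

static void original_ready(struct demo_sock *sk)
{
	(void)sk;
	printf("network core bookkeeping\n");
}

static void hooked_ready(struct demo_sock *sk)
{
	saved_ready(sk);			/* keep the original behaviour ... */
	printf("... then wake the I/O thread\n");
}

int main(void)
{
	struct demo_sock sk = { original_ready };

	saved_ready = sk.data_ready;		/* install the hook */
	sk.data_ready = hooked_ready;

	sk.data_ready(&sk);			/* "data arrived" */

	sk.data_ready = saved_ready;		/* teardown: restore the callback */
	sk.data_ready(&sk);
	return 0;
}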
- */ -void -smb_data_ready(struct sock *sk, int len) -{ - struct smb_sb_info *server = server_from_socket(sk->sk_socket); - void (*data_ready)(struct sock *, int) = server->data_ready; - - data_ready(sk, len); - VERBOSE("(%p, %d)\n", sk, len); - smbiod_wake_up(); -} - -int -smb_valid_socket(struct inode * inode) -{ - return (inode && S_ISSOCK(inode->i_mode) && - SOCKET_I(inode)->type == SOCK_STREAM); -} - -static struct socket * -server_sock(struct smb_sb_info *server) -{ - struct file *file; - - if (server && (file = server->sock_file)) - { -#ifdef SMBFS_PARANOIA - if (!smb_valid_socket(file->f_path.dentry->d_inode)) - PARANOIA("bad socket!\n"); -#endif - return SOCKET_I(file->f_path.dentry->d_inode); - } - return NULL; -} - -void -smb_close_socket(struct smb_sb_info *server) -{ - struct file * file = server->sock_file; - - if (file) { - struct socket *sock = server_sock(server); - - VERBOSE("closing socket %p\n", sock); - sock->sk->sk_data_ready = server->data_ready; - server->sock_file = NULL; - fput(file); - } -} - -static int -smb_get_length(struct socket *socket, unsigned char *header) -{ - int result; - - result = _recvfrom(socket, header, 4, MSG_PEEK); - if (result == -EAGAIN) - return -ENODATA; - if (result < 0) { - PARANOIA("recv error = %d\n", -result); - return result; - } - if (result < 4) - return -ENODATA; - - switch (header[0]) { - case 0x00: - case 0x82: - break; - - case 0x85: - DEBUG1("Got SESSION KEEP ALIVE\n"); - _recvfrom(socket, header, 4, 0); /* read away */ - return -ENODATA; - - default: - PARANOIA("Invalid NBT packet, code=%x\n", header[0]); - return -EIO; - } - - /* The length in the RFC NB header is the raw data length */ - return smb_len(header); -} - -int -smb_recv_available(struct smb_sb_info *server) -{ - mm_segment_t oldfs; - int avail, err; - struct socket *sock = server_sock(server); - - oldfs = get_fs(); - set_fs(get_ds()); - err = sock->ops->ioctl(sock, SIOCINQ, (unsigned long) &avail); - set_fs(oldfs); - return (err >= 0) ? avail : err; -} - -/* - * Adjust the kvec to move on 'n' bytes (from nfs/sunrpc) - */ -static int -smb_move_iov(struct kvec **data, size_t *num, struct kvec *vec, unsigned amount) -{ - struct kvec *iv = *data; - int i; - int len; - - /* - * Eat any sent kvecs - */ - while (iv->iov_len <= amount) { - amount -= iv->iov_len; - iv++; - (*num)--; - } - - /* - * And chew down the partial one - */ - vec[0].iov_len = iv->iov_len-amount; - vec[0].iov_base =((unsigned char *)iv->iov_base)+amount; - iv++; - - len = vec[0].iov_len; - - /* - * And copy any others - */ - for (i = 1; i < *num; i++) { - vec[i] = *iv++; - len += vec[i].iov_len; - } - - *data = vec; - return len; -} - -/* - * smb_receive_header - * Only called by the smbiod thread. 
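smb_get_length() above peeks at the 4-byte NetBIOS session header: byte 0 is the packet type (0x00 session message, 0x85 keep-alive) and the remaining bytes carry the length of the SMB that follows, which smb_len() — defined in proc.c and not shown in this hunk — extracts. A standalone approximation of that parsing; the 17-bit extraction is an assumption based on the NBT session-message format rather than a copy of smb_len().

/* Standalone sketch of NetBIOS session-header length parsing. */
#include <stdint.h>
#include <stdio.h>

static uint32_t nbt_len(const uint8_t *p)
{
	/* low bit of byte 1 is the length-extension bit, bytes 2-3 the rest */
	return ((uint32_t)(p[1] & 0x01) << 16) | ((uint32_t)p[2] << 8) | p[3];
}

int main(void)
{
	/* type 0x00 = session message, sample length bytes follow */
	uint8_t hdr[4] = { 0x00, 0x01, 0x02, 0x03 };

	if (hdr[0] != 0x00 && hdr[0] != 0x82) {
		fprintf(stderr, "not a session message\n");
		return 1;
	}
	printf("SMB payload length: %u bytes\n", nbt_len(hdr));	/* 66051 */
	return 0;
}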
- */ -int -smb_receive_header(struct smb_sb_info *server) -{ - struct socket *sock; - int result = 0; - unsigned char peek_buf[4]; - - result = -EIO; - sock = server_sock(server); - if (!sock) - goto out; - if (sock->sk->sk_state != TCP_ESTABLISHED) - goto out; - - if (!server->smb_read) { - result = smb_get_length(sock, peek_buf); - if (result < 0) { - if (result == -ENODATA) - result = 0; - goto out; - } - server->smb_len = result + 4; - - if (server->smb_len < SMB_HEADER_LEN) { - PARANOIA("short packet: %d\n", result); - server->rstate = SMB_RECV_DROP; - result = -EIO; - goto out; - } - if (server->smb_len > SMB_MAX_PACKET_SIZE) { - PARANOIA("long packet: %d\n", result); - server->rstate = SMB_RECV_DROP; - result = -EIO; - goto out; - } - } - - result = _recvfrom(sock, server->header + server->smb_read, - SMB_HEADER_LEN - server->smb_read, 0); - VERBOSE("_recvfrom: %d\n", result); - if (result < 0) { - VERBOSE("receive error: %d\n", result); - goto out; - } - server->smb_read += result; - - if (server->smb_read == SMB_HEADER_LEN) - server->rstate = SMB_RECV_HCOMPLETE; -out: - return result; -} - -static char drop_buffer[PAGE_SIZE]; - -/* - * smb_receive_drop - read and throw away the data - * Only called by the smbiod thread. - * - * FIXME: we are in the kernel, could we just tell the socket that we want - * to drop stuff from the buffer? - */ -int -smb_receive_drop(struct smb_sb_info *server) -{ - struct socket *sock; - unsigned int flags; - struct kvec iov; - struct msghdr msg; - int rlen = smb_len(server->header) - server->smb_read + 4; - int result = -EIO; - - if (rlen > PAGE_SIZE) - rlen = PAGE_SIZE; - - sock = server_sock(server); - if (!sock) - goto out; - if (sock->sk->sk_state != TCP_ESTABLISHED) - goto out; - - flags = MSG_DONTWAIT | MSG_NOSIGNAL; - iov.iov_base = drop_buffer; - iov.iov_len = PAGE_SIZE; - msg.msg_flags = flags; - msg.msg_name = NULL; - msg.msg_namelen = 0; - msg.msg_control = NULL; - - result = kernel_recvmsg(sock, &msg, &iov, 1, rlen, flags); - - VERBOSE("read: %d\n", result); - if (result < 0) { - VERBOSE("receive error: %d\n", result); - goto out; - } - server->smb_read += result; - - if (server->smb_read >= server->smb_len) - server->rstate = SMB_RECV_END; - -out: - return result; -} - -/* - * smb_receive - * Only called by the smbiod thread. - */ -int -smb_receive(struct smb_sb_info *server, struct smb_request *req) -{ - struct socket *sock; - unsigned int flags; - struct kvec iov[4]; - struct kvec *p = req->rq_iov; - size_t num = req->rq_iovlen; - struct msghdr msg; - int rlen; - int result = -EIO; - - sock = server_sock(server); - if (!sock) - goto out; - if (sock->sk->sk_state != TCP_ESTABLISHED) - goto out; - - flags = MSG_DONTWAIT | MSG_NOSIGNAL; - msg.msg_flags = flags; - msg.msg_name = NULL; - msg.msg_namelen = 0; - msg.msg_control = NULL; - - /* Dont repeat bytes and count available bufferspace */ - rlen = min_t(int, smb_move_iov(&p, &num, iov, req->rq_bytes_recvd), - (req->rq_rlen - req->rq_bytes_recvd)); - - result = kernel_recvmsg(sock, &msg, p, num, rlen, flags); - - VERBOSE("read: %d\n", result); - if (result < 0) { - VERBOSE("receive error: %d\n", result); - goto out; - } - req->rq_bytes_recvd += result; - server->smb_read += result; - -out: - return result; -} - -/* - * Try to send a SMB request. This may return after sending only parts of the - * request. SMB_REQ_TRANSMITTED will be set if a request was fully sent. 
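smb_send_request() below and smb_move_iov() above cope with short sends (and partially received replies) by re-deriving the io vector so the next attempt starts at the first byte not yet transferred, instead of retransmitting or rereading anything. A standalone sketch of that vector-advancing step; the buffers and byte counts are invented sample data.

/* Standalone sketch of skipping already-transferred bytes in an iovec. */
#include <stdio.h>
#include <sys/uio.h>

/* skip 'done' bytes of 'src[num]', writing the remainder into 'out' */
static int advance_iov(const struct iovec *src, int num,
		       struct iovec *out, size_t done)
{
	int i, n = 0;

	for (i = 0; i < num; i++) {
		if (done >= src[i].iov_len) {
			done -= src[i].iov_len;	/* this piece is finished */
			continue;
		}
		out[n].iov_base = (char *)src[i].iov_base + done;
		out[n].iov_len  = src[i].iov_len - done;
		done = 0;
		n++;
	}
	return n;
}

int main(void)
{
	char a[] = "HEADER", b[] = "PAYLOAD";
	struct iovec req[2] = { { a, 6 }, { b, 7 } }, rest[2];
	int n = advance_iov(req, 2, rest, 8);	/* pretend 8 bytes already went out */

	printf("%d vector(s) left, next chunk: %.*s\n",
	       n, (int)rest[0].iov_len, (char *)rest[0].iov_base);
	return 0;
}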
- * - * Parts of this was taken from xprt_sendmsg from net/sunrpc/xprt.c - */ -int -smb_send_request(struct smb_request *req) -{ - struct smb_sb_info *server = req->rq_server; - struct socket *sock; - struct msghdr msg = {.msg_flags = MSG_NOSIGNAL | MSG_DONTWAIT}; - int slen = req->rq_slen - req->rq_bytes_sent; - int result = -EIO; - struct kvec iov[4]; - struct kvec *p = req->rq_iov; - size_t num = req->rq_iovlen; - - sock = server_sock(server); - if (!sock) - goto out; - if (sock->sk->sk_state != TCP_ESTABLISHED) - goto out; - - /* Dont repeat bytes */ - if (req->rq_bytes_sent) - smb_move_iov(&p, &num, iov, req->rq_bytes_sent); - - result = kernel_sendmsg(sock, &msg, p, num, slen); - - if (result >= 0) { - req->rq_bytes_sent += result; - if (req->rq_bytes_sent >= req->rq_slen) - req->rq_flags |= SMB_REQ_TRANSMITTED; - } -out: - return result; -} diff --git a/fs/smbfs/symlink.c b/fs/smbfs/symlink.c deleted file mode 100644 index 00b2909bd46..00000000000 --- a/fs/smbfs/symlink.c +++ /dev/null @@ -1,68 +0,0 @@ -/* - * symlink.c - * - * Copyright (C) 2002 by John Newbigin - * - * Please add a note about your changes to smbfs in the ChangeLog file. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include - -#include "smb_debug.h" -#include "proto.h" - -int smb_symlink(struct inode *inode, struct dentry *dentry, const char *oldname) -{ - DEBUG1("create symlink %s -> %s/%s\n", oldname, DENTRY_PATH(dentry)); - - return smb_proc_symlink(server_from_dentry(dentry), dentry, oldname); -} - -static void *smb_follow_link(struct dentry *dentry, struct nameidata *nd) -{ - char *link = __getname(); - DEBUG1("followlink of %s/%s\n", DENTRY_PATH(dentry)); - - if (!link) { - link = ERR_PTR(-ENOMEM); - } else { - int len = smb_proc_read_link(server_from_dentry(dentry), - dentry, link, PATH_MAX - 1); - if (len < 0) { - __putname(link); - link = ERR_PTR(len); - } else { - link[len] = 0; - } - } - nd_set_link(nd, link); - return NULL; -} - -static void smb_put_link(struct dentry *dentry, struct nameidata *nd, void *p) -{ - char *s = nd_get_link(nd); - if (!IS_ERR(s)) - __putname(s); -} - -const struct inode_operations smb_link_inode_operations = -{ - .readlink = generic_readlink, - .follow_link = smb_follow_link, - .put_link = smb_put_link, -}; diff --git a/include/linux/Kbuild b/include/linux/Kbuild index 626b629429f..98d520d371e 100644 --- a/include/linux/Kbuild +++ b/include/linux/Kbuild @@ -326,10 +326,6 @@ header-y += serio.h header-y += shm.h header-y += signal.h header-y += signalfd.h -header-y += smb.h -header-y += smb_fs.h -header-y += smb_mount.h -header-y += smbno.h header-y += snmp.h header-y += socket.h header-y += sockios.h diff --git a/include/linux/smb.h b/include/linux/smb.h deleted file mode 100644 index 82fefddc598..00000000000 --- a/include/linux/smb.h +++ /dev/null @@ -1,118 +0,0 @@ -/* - * smb.h - * - * Copyright (C) 1995, 1996 by Paal-Kr. 
Engstad and Volker Lendecke - * Copyright (C) 1997 by Volker Lendecke - * - */ - -#ifndef _LINUX_SMB_H -#define _LINUX_SMB_H - -#include -#include -#ifdef __KERNEL__ -#include -#endif - -enum smb_protocol { - SMB_PROTOCOL_NONE, - SMB_PROTOCOL_CORE, - SMB_PROTOCOL_COREPLUS, - SMB_PROTOCOL_LANMAN1, - SMB_PROTOCOL_LANMAN2, - SMB_PROTOCOL_NT1 -}; - -enum smb_case_hndl { - SMB_CASE_DEFAULT, - SMB_CASE_LOWER, - SMB_CASE_UPPER -}; - -struct smb_dskattr { - __u16 total; - __u16 allocblocks; - __u16 blocksize; - __u16 free; -}; - -struct smb_conn_opt { - - /* The socket */ - unsigned int fd; - - enum smb_protocol protocol; - enum smb_case_hndl case_handling; - - /* Connection-Options */ - - __u32 max_xmit; - __u16 server_uid; - __u16 tid; - - /* The following are LANMAN 1.0 options */ - __u16 secmode; - __u16 maxmux; - __u16 maxvcs; - __u16 rawmode; - __u32 sesskey; - - /* The following are NT LM 0.12 options */ - __u32 maxraw; - __u32 capabilities; - __s16 serverzone; -}; - -#ifdef __KERNEL__ - -#define SMB_NLS_MAXNAMELEN 20 -struct smb_nls_codepage { - char local_name[SMB_NLS_MAXNAMELEN]; - char remote_name[SMB_NLS_MAXNAMELEN]; -}; - - -#define SMB_MAXNAMELEN 255 -#define SMB_MAXPATHLEN 1024 - -/* - * Contains all relevant data on a SMB networked file. - */ -struct smb_fattr { - __u16 attr; - - unsigned long f_ino; - umode_t f_mode; - nlink_t f_nlink; - uid_t f_uid; - gid_t f_gid; - dev_t f_rdev; - loff_t f_size; - struct timespec f_atime; - struct timespec f_mtime; - struct timespec f_ctime; - unsigned long f_blocks; - int f_unix; -}; - -enum smb_conn_state { - CONN_VALID, /* everything's fine */ - CONN_INVALID, /* Something went wrong, but did not - try to reconnect yet. */ - CONN_RETRIED, /* Tried a reconnection, but was refused */ - CONN_RETRYING /* Currently trying to reconnect */ -}; - -#define SMB_HEADER_LEN 37 /* includes everything up to, but not - * including smb_bcc */ - -#define SMB_INITIAL_PACKET_SIZE 4000 -#define SMB_MAX_PACKET_SIZE 32768 - -/* reserve this much space for trans2 parameters. Shouldn't have to be more - than 10 or so, but OS/2 seems happier like this. */ -#define SMB_TRANS2_MAX_PARAM 64 - -#endif -#endif diff --git a/include/linux/smb_fs.h b/include/linux/smb_fs.h deleted file mode 100644 index 923cd8a247b..00000000000 --- a/include/linux/smb_fs.h +++ /dev/null @@ -1,153 +0,0 @@ -/* - * smb_fs.h - * - * Copyright (C) 1995 by Paal-Kr. Engstad and Volker Lendecke - * Copyright (C) 1997 by Volker Lendecke - * - */ - -#ifndef _LINUX_SMB_FS_H -#define _LINUX_SMB_FS_H - -#include - -/* - * ioctl commands - */ -#define SMB_IOC_GETMOUNTUID _IOR('u', 1, __kernel_old_uid_t) -#define SMB_IOC_NEWCONN _IOW('u', 2, struct smb_conn_opt) - -/* __kernel_uid_t can never change, so we have to use __kernel_uid32_t */ -#define SMB_IOC_GETMOUNTUID32 _IOR('u', 3, __kernel_uid32_t) - - -#ifdef __KERNEL__ -#include -#include - -#include -#include -#include -#include -#include -#include - -static inline struct smb_sb_info *SMB_SB(struct super_block *sb) -{ - return sb->s_fs_info; -} - -static inline struct smb_inode_info *SMB_I(struct inode *inode) -{ - return container_of(inode, struct smb_inode_info, vfs_inode); -} - -/* macro names are short for word, double-word, long value (?) 
*/ -#define WVAL(buf, pos) (get_unaligned_le16((u8 *)(buf) + (pos))) -#define DVAL(buf, pos) (get_unaligned_le32((u8 *)(buf) + (pos))) -#define LVAL(buf, pos) (get_unaligned_le64((u8 *)(buf) + (pos))) - -#define WSET(buf, pos, val) put_unaligned_le16((val), (u8 *)(buf) + (pos)) -#define DSET(buf, pos, val) put_unaligned_le32((val), (u8 *)(buf) + (pos)) -#define LSET(buf, pos, val) put_unaligned_le64((val), (u8 *)(buf) + (pos)) - -/* where to find the base of the SMB packet proper */ -#define smb_base(buf) ((u8 *)(((u8 *)(buf))+4)) - -/* - * Flags for the in-memory inode - */ -#define SMB_F_LOCALWRITE 0x02 /* file modified locally */ - - -/* NT1 protocol capability bits */ -#define SMB_CAP_RAW_MODE 0x00000001 -#define SMB_CAP_MPX_MODE 0x00000002 -#define SMB_CAP_UNICODE 0x00000004 -#define SMB_CAP_LARGE_FILES 0x00000008 -#define SMB_CAP_NT_SMBS 0x00000010 -#define SMB_CAP_RPC_REMOTE_APIS 0x00000020 -#define SMB_CAP_STATUS32 0x00000040 -#define SMB_CAP_LEVEL_II_OPLOCKS 0x00000080 -#define SMB_CAP_LOCK_AND_READ 0x00000100 -#define SMB_CAP_NT_FIND 0x00000200 -#define SMB_CAP_DFS 0x00001000 -#define SMB_CAP_LARGE_READX 0x00004000 -#define SMB_CAP_LARGE_WRITEX 0x00008000 -#define SMB_CAP_UNIX 0x00800000 /* unofficial ... */ - - -/* - * This is the time we allow an inode, dentry or dir cache to live. It is bad - * for performance to have shorter ttl on an inode than on the cache. It can - * cause refresh on each inode for a dir listing ... one-by-one - */ -#define SMB_MAX_AGE(server) (((server)->mnt->ttl * HZ) / 1000) - -static inline void -smb_age_dentry(struct smb_sb_info *server, struct dentry *dentry) -{ - dentry->d_time = jiffies - SMB_MAX_AGE(server); -} - -struct smb_cache_head { - time_t mtime; /* unused */ - unsigned long time; /* cache age */ - unsigned long end; /* last valid fpos in cache */ - int eof; -}; - -#define SMB_DIRCACHE_SIZE ((int)(PAGE_CACHE_SIZE/sizeof(struct dentry *))) -union smb_dir_cache { - struct smb_cache_head head; - struct dentry *dentry[SMB_DIRCACHE_SIZE]; -}; - -#define SMB_FIRSTCACHE_SIZE ((int)((SMB_DIRCACHE_SIZE * \ - sizeof(struct dentry *) - sizeof(struct smb_cache_head)) / \ - sizeof(struct dentry *))) - -#define SMB_DIRCACHE_START (SMB_DIRCACHE_SIZE - SMB_FIRSTCACHE_SIZE) - -struct smb_cache_control { - struct smb_cache_head head; - struct page *page; - union smb_dir_cache *cache; - unsigned long fpos, ofs; - int filled, valid, idx; -}; - -#define SMB_OPS_NUM_STATIC 5 -struct smb_ops { - int (*read)(struct inode *inode, loff_t offset, int count, - char *data); - int (*write)(struct inode *inode, loff_t offset, int count, const - char *data); - int (*readdir)(struct file *filp, void *dirent, filldir_t filldir, - struct smb_cache_control *ctl); - - int (*getattr)(struct smb_sb_info *server, struct dentry *dir, - struct smb_fattr *fattr); - /* int (*setattr)(...); */ /* setattr is really icky! 
*/ - - int (*truncate)(struct inode *inode, loff_t length); - - - /* --- --- --- end of "static" entries --- --- --- */ - - int (*convert)(unsigned char *output, int olen, - const unsigned char *input, int ilen, - struct nls_table *nls_from, - struct nls_table *nls_to); -}; - -static inline int -smb_is_open(struct inode *i) -{ - return (SMB_I(i)->open == server_from_inode(i)->generation); -} - -extern void smb_install_null_ops(struct smb_ops *); -#endif /* __KERNEL__ */ - -#endif /* _LINUX_SMB_FS_H */ diff --git a/include/linux/smb_fs_i.h b/include/linux/smb_fs_i.h deleted file mode 100644 index 8ccf4eca2c3..00000000000 --- a/include/linux/smb_fs_i.h +++ /dev/null @@ -1,37 +0,0 @@ -/* - * smb_fs_i.h - * - * Copyright (C) 1995 by Paal-Kr. Engstad and Volker Lendecke - * Copyright (C) 1997 by Volker Lendecke - * - */ - -#ifndef _LINUX_SMB_FS_I -#define _LINUX_SMB_FS_I - -#include -#include - -/* - * smb fs inode data (in memory only) - */ -struct smb_inode_info { - - /* - * file handles are local to a connection. A file is open if - * (open == generation). - */ - unsigned int open; /* open generation */ - __u16 fileid; /* What id to handle a file with? */ - __u16 attr; /* Attribute fields, DOS value */ - - __u16 access; /* Access mode */ - __u16 flags; - unsigned long oldmtime; /* last time refreshed */ - unsigned long closed; /* timestamp when closed */ - unsigned openers; /* number of fileid users */ - - struct inode vfs_inode; /* must be at the end */ -}; - -#endif diff --git a/include/linux/smb_fs_sb.h b/include/linux/smb_fs_sb.h deleted file mode 100644 index bb947dd1fba..00000000000 --- a/include/linux/smb_fs_sb.h +++ /dev/null @@ -1,100 +0,0 @@ -/* - * smb_fs_sb.h - * - * Copyright (C) 1995 by Paal-Kr. Engstad and Volker Lendecke - * Copyright (C) 1997 by Volker Lendecke - * - */ - -#ifndef _SMB_FS_SB -#define _SMB_FS_SB - -#include -#include -#include - -/* - * Upper limit on the total number of active smb_request structs. - */ -#define MAX_REQUEST_HARD 256 - -enum smb_receive_state { - SMB_RECV_START, /* No data read, looking for length + sig */ - SMB_RECV_HEADER, /* Reading the header data */ - SMB_RECV_HCOMPLETE, /* Done with the header */ - SMB_RECV_PARAM, /* Reading parameter words */ - SMB_RECV_DATA, /* Reading data bytes */ - SMB_RECV_END, /* End of request */ - SMB_RECV_DROP, /* Dropping this SMB */ - SMB_RECV_REQUEST, /* Received a request and not a reply */ -}; - -/* structure access macros */ -#define server_from_inode(inode) SMB_SB((inode)->i_sb) -#define server_from_dentry(dentry) SMB_SB((dentry)->d_sb) -#define SB_of(server) ((server)->super_block) - -struct smb_sb_info { - /* List of all smbfs superblocks */ - struct list_head entry; - - enum smb_conn_state state; - struct file * sock_file; - int conn_error; - enum smb_receive_state rstate; - - atomic_t nr_requests; - struct list_head xmitq; - struct list_head recvq; - u16 mid; - - struct smb_mount_data_kernel *mnt; - - /* Connections are counted. Each time a new socket arrives, - * generation is incremented. 
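The generation counter described here is what smb_is_open() above checks: a file handle is only valid if it was opened on the current incarnation of the connection, so bumping the generation on reconnect invalidates every cached fileid at once. A toy model of that check; conn and fhandle are illustrative types, not smbfs structures.

/* Toy model of generation-based handle invalidation. */
#include <stdio.h>

struct conn    { unsigned int generation; };
struct fhandle { unsigned int open_gen; int fileid; };

static int handle_is_open(const struct fhandle *f, const struct conn *c)
{
	return f->open_gen == c->generation;
}

int main(void)
{
	struct conn c = { .generation = 1 };
	struct fhandle f = { .open_gen = c.generation, .fileid = 42 };

	printf("before reconnect: open=%d\n", handle_is_open(&f, &c));
	c.generation++;			/* the connection was rebuilt */
	printf("after reconnect:  open=%d\n", handle_is_open(&f, &c));
	return 0;
}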
- */ - unsigned int generation; - struct pid *conn_pid; - struct smb_conn_opt opt; - wait_queue_head_t conn_wq; - int conn_complete; - struct semaphore sem; - - unsigned char header[SMB_HEADER_LEN + 20*2 + 2]; - u32 header_len; - u32 smb_len; - u32 smb_read; - - /* We use our own data_ready callback, but need the original one */ - void *data_ready; - - /* nls pointers for codepage conversions */ - struct nls_table *remote_nls; - struct nls_table *local_nls; - - struct smb_ops *ops; - - struct super_block *super_block; - - struct backing_dev_info bdi; -}; - -static inline int -smb_lock_server_interruptible(struct smb_sb_info *server) -{ - return down_interruptible(&(server->sem)); -} - -static inline void -smb_lock_server(struct smb_sb_info *server) -{ - down(&(server->sem)); -} - -static inline void -smb_unlock_server(struct smb_sb_info *server) -{ - up(&(server->sem)); -} - -#endif diff --git a/include/linux/smb_mount.h b/include/linux/smb_mount.h deleted file mode 100644 index d10f00cb570..00000000000 --- a/include/linux/smb_mount.h +++ /dev/null @@ -1,65 +0,0 @@ -/* - * smb_mount.h - * - * Copyright (C) 1995, 1996 by Paal-Kr. Engstad and Volker Lendecke - * Copyright (C) 1997 by Volker Lendecke - * - */ - -#ifndef _LINUX_SMB_MOUNT_H -#define _LINUX_SMB_MOUNT_H - -#include - -#define SMB_MOUNT_VERSION 6 - -struct smb_mount_data { - int version; - __kernel_uid_t mounted_uid; /* Who may umount() this filesystem? */ - __kernel_uid_t uid; - __kernel_gid_t gid; - __kernel_mode_t file_mode; - __kernel_mode_t dir_mode; -}; - - -#ifdef __KERNEL__ - -/* "vers" in big-endian */ -#define SMB_MOUNT_ASCII 0x76657273 - -#define SMB_MOUNT_OLDVERSION 6 -#undef SMB_MOUNT_VERSION -#define SMB_MOUNT_VERSION 7 - -/* flags */ -#define SMB_MOUNT_WIN95 0x0001 /* Win 95 server */ -#define SMB_MOUNT_OLDATTR 0x0002 /* Use core getattr (Win 95 speedup) */ -#define SMB_MOUNT_DIRATTR 0x0004 /* Use find_first for getattr */ -#define SMB_MOUNT_CASE 0x0008 /* Be case sensitive */ -#define SMB_MOUNT_UNICODE 0x0010 /* Server talks unicode */ -#define SMB_MOUNT_UID 0x0020 /* Use user specified uid */ -#define SMB_MOUNT_GID 0x0040 /* Use user specified gid */ -#define SMB_MOUNT_FMODE 0x0080 /* Use user specified file mode */ -#define SMB_MOUNT_DMODE 0x0100 /* Use user specified dir mode */ - -struct smb_mount_data_kernel { - int version; - - uid_t mounted_uid; /* Who may umount() this filesystem? */ - uid_t uid; - gid_t gid; - mode_t file_mode; - mode_t dir_mode; - - u32 flags; - - /* maximum age in jiffies (inode, dentry and dircache) */ - int ttl; - - struct smb_nls_codepage codepage; -}; - -#endif - -#endif diff --git a/include/linux/smbno.h b/include/linux/smbno.h deleted file mode 100644 index f99e02d9ffe..00000000000 --- a/include/linux/smbno.h +++ /dev/null @@ -1,363 +0,0 @@ -#ifndef _SMBNO_H_ -#define _SMBNO_H_ - -/* these define the attribute byte as seen by DOS */ -#define aRONLY (1L<<0) -#define aHIDDEN (1L<<1) -#define aSYSTEM (1L<<2) -#define aVOLID (1L<<3) -#define aDIR (1L<<4) -#define aARCH (1L<<5) - -/* error classes */ -#define SUCCESS 0 /* The request was successful. */ -#define ERRDOS 0x01 /* Error is from the core DOS operating system set. */ -#define ERRSRV 0x02 /* Error is generated by the server network file manager.*/ -#define ERRHRD 0x03 /* Error is an hardware error. */ -#define ERRCMD 0xFF /* Command was not in the "SMB" format. 
*/ - -/* SMB X/Open error codes for the ERRdos error class */ - -#define ERRbadfunc 1 /* Invalid function (or system call) */ -#define ERRbadfile 2 /* File not found (pathname error) */ -#define ERRbadpath 3 /* Directory not found */ -#define ERRnofids 4 /* Too many open files */ -#define ERRnoaccess 5 /* Access denied */ -#define ERRbadfid 6 /* Invalid fid */ -#define ERRbadmcb 7 /* Memory control blocks destroyed */ -#define ERRnomem 8 /* Out of memory */ -#define ERRbadmem 9 /* Invalid memory block address */ -#define ERRbadenv 10 /* Invalid environment */ -#define ERRbadformat 11 /* Invalid format */ -#define ERRbadaccess 12 /* Invalid open mode */ -#define ERRbaddata 13 /* Invalid data (only from ioctl call) */ -#define ERRres 14 /* reserved */ -#define ERRbaddrive 15 /* Invalid drive */ -#define ERRremcd 16 /* Attempt to delete current directory */ -#define ERRdiffdevice 17 /* rename/move across different filesystems */ -#define ERRnofiles 18 /* no more files found in file search */ -#define ERRbadshare 32 /* Share mode on file conflict with open mode */ -#define ERRlock 33 /* Lock request conflicts with existing lock */ -#define ERRfilexists 80 /* File in operation already exists */ -#define ERRbadpipe 230 /* Named pipe invalid */ -#define ERRpipebusy 231 /* All instances of pipe are busy */ -#define ERRpipeclosing 232 /* named pipe close in progress */ -#define ERRnotconnected 233 /* No process on other end of named pipe */ -#define ERRmoredata 234 /* More data to be returned */ - -#define ERROR_INVALID_PARAMETER 87 -#define ERROR_DISK_FULL 112 -#define ERROR_INVALID_NAME 123 -#define ERROR_DIR_NOT_EMPTY 145 -#define ERROR_NOT_LOCKED 158 -#define ERROR_ALREADY_EXISTS 183 /* see also 80 ? */ -#define ERROR_EAS_DIDNT_FIT 275 /* Extended attributes didn't fit */ -#define ERROR_EAS_NOT_SUPPORTED 282 /* Extended attributes not supported */ - -/* Error codes for the ERRSRV class */ - -#define ERRerror 1 /* Non specific error code */ -#define ERRbadpw 2 /* Bad password */ -#define ERRbadtype 3 /* reserved */ -#define ERRaccess 4 /* No permissions to do the requested operation */ -#define ERRinvnid 5 /* tid invalid */ -#define ERRinvnetname 6 /* Invalid servername */ -#define ERRinvdevice 7 /* Invalid device */ -#define ERRqfull 49 /* Print queue full */ -#define ERRqtoobig 50 /* Queued item too big */ -#define ERRinvpfid 52 /* Invalid print file in smb_fid */ -#define ERRsmbcmd 64 /* Unrecognised command */ -#define ERRsrverror 65 /* smb server internal error */ -#define ERRfilespecs 67 /* fid and pathname invalid combination */ -#define ERRbadlink 68 /* reserved */ -#define ERRbadpermits 69 /* Access specified for a file is not valid */ -#define ERRbadpid 70 /* reserved */ -#define ERRsetattrmode 71 /* attribute mode invalid */ -#define ERRpaused 81 /* Message server paused */ -#define ERRmsgoff 82 /* Not receiving messages */ -#define ERRnoroom 83 /* No room for message */ -#define ERRrmuns 87 /* too many remote usernames */ -#define ERRtimeout 88 /* operation timed out */ -#define ERRnoresource 89 /* No resources currently available for request. 
*/ -#define ERRtoomanyuids 90 /* too many userids */ -#define ERRbaduid 91 /* bad userid */ -#define ERRuseMPX 250 /* temporarily unable to use raw mode, use MPX mode */ -#define ERRuseSTD 251 /* temporarily unable to use raw mode, use std.mode */ -#define ERRcontMPX 252 /* resume MPX mode */ -#define ERRbadPW /* reserved */ -#define ERRnosupport 0xFFFF - -/* Error codes for the ERRHRD class */ - -#define ERRnowrite 19 /* read only media */ -#define ERRbadunit 20 /* Unknown device */ -#define ERRnotready 21 /* Drive not ready */ -#define ERRbadcmd 22 /* Unknown command */ -#define ERRdata 23 /* Data (CRC) error */ -#define ERRbadreq 24 /* Bad request structure length */ -#define ERRseek 25 -#define ERRbadmedia 26 -#define ERRbadsector 27 -#define ERRnopaper 28 -#define ERRwrite 29 /* write fault */ -#define ERRread 30 /* read fault */ -#define ERRgeneral 31 /* General hardware failure */ -#define ERRwrongdisk 34 -#define ERRFCBunavail 35 -#define ERRsharebufexc 36 /* share buffer exceeded */ -#define ERRdiskfull 39 - -/* - * Access modes when opening a file - */ -#define SMB_ACCMASK 0x0003 -#define SMB_O_RDONLY 0x0000 -#define SMB_O_WRONLY 0x0001 -#define SMB_O_RDWR 0x0002 - -/* offsets into message for common items */ -#define smb_com 8 -#define smb_rcls 9 -#define smb_reh 10 -#define smb_err 11 -#define smb_flg 13 -#define smb_flg2 14 -#define smb_reb 13 -#define smb_tid 28 -#define smb_pid 30 -#define smb_uid 32 -#define smb_mid 34 -#define smb_wct 36 -#define smb_vwv 37 -#define smb_vwv0 37 -#define smb_vwv1 39 -#define smb_vwv2 41 -#define smb_vwv3 43 -#define smb_vwv4 45 -#define smb_vwv5 47 -#define smb_vwv6 49 -#define smb_vwv7 51 -#define smb_vwv8 53 -#define smb_vwv9 55 -#define smb_vwv10 57 -#define smb_vwv11 59 -#define smb_vwv12 61 -#define smb_vwv13 63 -#define smb_vwv14 65 - -/* these are the trans2 sub fields for primary requests */ -#define smb_tpscnt smb_vwv0 -#define smb_tdscnt smb_vwv1 -#define smb_mprcnt smb_vwv2 -#define smb_mdrcnt smb_vwv3 -#define smb_msrcnt smb_vwv4 -#define smb_flags smb_vwv5 -#define smb_timeout smb_vwv6 -#define smb_pscnt smb_vwv9 -#define smb_psoff smb_vwv10 -#define smb_dscnt smb_vwv11 -#define smb_dsoff smb_vwv12 -#define smb_suwcnt smb_vwv13 -#define smb_setup smb_vwv14 -#define smb_setup0 smb_setup -#define smb_setup1 (smb_setup+2) -#define smb_setup2 (smb_setup+4) - -/* these are for the secondary requests */ -#define smb_spscnt smb_vwv2 -#define smb_spsoff smb_vwv3 -#define smb_spsdisp smb_vwv4 -#define smb_sdscnt smb_vwv5 -#define smb_sdsoff smb_vwv6 -#define smb_sdsdisp smb_vwv7 -#define smb_sfid smb_vwv8 - -/* and these for responses */ -#define smb_tprcnt smb_vwv0 -#define smb_tdrcnt smb_vwv1 -#define smb_prcnt smb_vwv3 -#define smb_proff smb_vwv4 -#define smb_prdisp smb_vwv5 -#define smb_drcnt smb_vwv6 -#define smb_droff smb_vwv7 -#define smb_drdisp smb_vwv8 - -/* the complete */ -#define SMBmkdir 0x00 /* create directory */ -#define SMBrmdir 0x01 /* delete directory */ -#define SMBopen 0x02 /* open file */ -#define SMBcreate 0x03 /* create file */ -#define SMBclose 0x04 /* close file */ -#define SMBflush 0x05 /* flush file */ -#define SMBunlink 0x06 /* delete file */ -#define SMBmv 0x07 /* rename file */ -#define SMBgetatr 0x08 /* get file attributes */ -#define SMBsetatr 0x09 /* set file attributes */ -#define SMBread 0x0A /* read from file */ -#define SMBwrite 0x0B /* write to file */ -#define SMBlock 0x0C /* lock byte range */ -#define SMBunlock 0x0D /* unlock byte range */ -#define SMBctemp 0x0E /* create temporary file */ 
-#define SMBmknew 0x0F /* make new file */ -#define SMBchkpth 0x10 /* check directory path */ -#define SMBexit 0x11 /* process exit */ -#define SMBlseek 0x12 /* seek */ -#define SMBtcon 0x70 /* tree connect */ -#define SMBtconX 0x75 /* tree connect and X*/ -#define SMBtdis 0x71 /* tree disconnect */ -#define SMBnegprot 0x72 /* negotiate protocol */ -#define SMBdskattr 0x80 /* get disk attributes */ -#define SMBsearch 0x81 /* search directory */ -#define SMBsplopen 0xC0 /* open print spool file */ -#define SMBsplwr 0xC1 /* write to print spool file */ -#define SMBsplclose 0xC2 /* close print spool file */ -#define SMBsplretq 0xC3 /* return print queue */ -#define SMBsends 0xD0 /* send single block message */ -#define SMBsendb 0xD1 /* send broadcast message */ -#define SMBfwdname 0xD2 /* forward user name */ -#define SMBcancelf 0xD3 /* cancel forward */ -#define SMBgetmac 0xD4 /* get machine name */ -#define SMBsendstrt 0xD5 /* send start of multi-block message */ -#define SMBsendend 0xD6 /* send end of multi-block message */ -#define SMBsendtxt 0xD7 /* send text of multi-block message */ - -/* Core+ protocol */ -#define SMBlockread 0x13 /* Lock a range and read */ -#define SMBwriteunlock 0x14 /* Unlock a range then write */ -#define SMBreadbraw 0x1a /* read a block of data with no smb header */ -#define SMBwritebraw 0x1d /* write a block of data with no smb header */ -#define SMBwritec 0x20 /* secondary write request */ -#define SMBwriteclose 0x2c /* write a file then close it */ - -/* dos extended protocol */ -#define SMBreadBraw 0x1A /* read block raw */ -#define SMBreadBmpx 0x1B /* read block multiplexed */ -#define SMBreadBs 0x1C /* read block (secondary response) */ -#define SMBwriteBraw 0x1D /* write block raw */ -#define SMBwriteBmpx 0x1E /* write block multiplexed */ -#define SMBwriteBs 0x1F /* write block (secondary request) */ -#define SMBwriteC 0x20 /* write complete response */ -#define SMBsetattrE 0x22 /* set file attributes expanded */ -#define SMBgetattrE 0x23 /* get file attributes expanded */ -#define SMBlockingX 0x24 /* lock/unlock byte ranges and X */ -#define SMBtrans 0x25 /* transaction - name, bytes in/out */ -#define SMBtranss 0x26 /* transaction (secondary request/response) */ -#define SMBioctl 0x27 /* IOCTL */ -#define SMBioctls 0x28 /* IOCTL (secondary request/response) */ -#define SMBcopy 0x29 /* copy */ -#define SMBmove 0x2A /* move */ -#define SMBecho 0x2B /* echo */ -#define SMBopenX 0x2D /* open and X */ -#define SMBreadX 0x2E /* read and X */ -#define SMBwriteX 0x2F /* write and X */ -#define SMBsesssetupX 0x73 /* Session Set Up & X (including User Logon) */ -#define SMBtconX 0x75 /* tree connect and X */ -#define SMBffirst 0x82 /* find first */ -#define SMBfunique 0x83 /* find unique */ -#define SMBfclose 0x84 /* find close */ -#define SMBinvalid 0xFE /* invalid command */ - - -/* Extended 2.0 protocol */ -#define SMBtrans2 0x32 /* TRANS2 protocol set */ -#define SMBtranss2 0x33 /* TRANS2 protocol set, secondary command */ -#define SMBfindclose 0x34 /* Terminate a TRANSACT2_FINDFIRST */ -#define SMBfindnclose 0x35 /* Terminate a TRANSACT2_FINDNOTIFYFIRST */ -#define SMBulogoffX 0x74 /* user logoff */ - -/* these are the TRANS2 sub commands */ -#define TRANSACT2_OPEN 0 -#define TRANSACT2_FINDFIRST 1 -#define TRANSACT2_FINDNEXT 2 -#define TRANSACT2_QFSINFO 3 -#define TRANSACT2_SETFSINFO 4 -#define TRANSACT2_QPATHINFO 5 -#define TRANSACT2_SETPATHINFO 6 -#define TRANSACT2_QFILEINFO 7 -#define TRANSACT2_SETFILEINFO 8 -#define TRANSACT2_FSCTL 9 -#define 
TRANSACT2_IOCTL 10 -#define TRANSACT2_FINDNOTIFYFIRST 11 -#define TRANSACT2_FINDNOTIFYNEXT 12 -#define TRANSACT2_MKDIR 13 - -/* Information Levels - Shared? */ -#define SMB_INFO_STANDARD 1 -#define SMB_INFO_QUERY_EA_SIZE 2 -#define SMB_INFO_QUERY_EAS_FROM_LIST 3 -#define SMB_INFO_QUERY_ALL_EAS 4 -#define SMB_INFO_IS_NAME_VALID 6 - -/* Information Levels - TRANSACT2_FINDFIRST */ -#define SMB_FIND_FILE_DIRECTORY_INFO 0x101 -#define SMB_FIND_FILE_FULL_DIRECTORY_INFO 0x102 -#define SMB_FIND_FILE_NAMES_INFO 0x103 -#define SMB_FIND_FILE_BOTH_DIRECTORY_INFO 0x104 - -/* Information Levels - TRANSACT2_QPATHINFO */ -#define SMB_QUERY_FILE_BASIC_INFO 0x101 -#define SMB_QUERY_FILE_STANDARD_INFO 0x102 -#define SMB_QUERY_FILE_EA_INFO 0x103 -#define SMB_QUERY_FILE_NAME_INFO 0x104 -#define SMB_QUERY_FILE_ALL_INFO 0x107 -#define SMB_QUERY_FILE_ALT_NAME_INFO 0x108 -#define SMB_QUERY_FILE_STREAM_INFO 0x109 -#define SMB_QUERY_FILE_COMPRESSION_INFO 0x10b - -/* Information Levels - TRANSACT2_SETFILEINFO */ -#define SMB_SET_FILE_BASIC_INFO 0x101 -#define SMB_SET_FILE_DISPOSITION_INFO 0x102 -#define SMB_SET_FILE_ALLOCATION_INFO 0x103 -#define SMB_SET_FILE_END_OF_FILE_INFO 0x104 - -/* smb_flg field flags */ -#define SMB_FLAGS_SUPPORT_LOCKREAD 0x01 -#define SMB_FLAGS_CLIENT_BUF_AVAIL 0x02 -#define SMB_FLAGS_RESERVED 0x04 -#define SMB_FLAGS_CASELESS_PATHNAMES 0x08 -#define SMB_FLAGS_CANONICAL_PATHNAMES 0x10 -#define SMB_FLAGS_REQUEST_OPLOCK 0x20 -#define SMB_FLAGS_REQUEST_BATCH_OPLOCK 0x40 -#define SMB_FLAGS_REPLY 0x80 - -/* smb_flg2 field flags (samba-2.2.0/source/include/smb.h) */ -#define SMB_FLAGS2_LONG_PATH_COMPONENTS 0x0001 -#define SMB_FLAGS2_EXTENDED_ATTRIBUTES 0x0002 -#define SMB_FLAGS2_DFS_PATHNAMES 0x1000 -#define SMB_FLAGS2_READ_PERMIT_NO_EXECUTE 0x2000 -#define SMB_FLAGS2_32_BIT_ERROR_CODES 0x4000 -#define SMB_FLAGS2_UNICODE_STRINGS 0x8000 - - -/* - * UNIX stuff (from samba trans2.h) - */ -#define MIN_UNIX_INFO_LEVEL 0x200 -#define MAX_UNIX_INFO_LEVEL 0x2FF -#define SMB_FIND_FILE_UNIX 0x202 -#define SMB_QUERY_FILE_UNIX_BASIC 0x200 -#define SMB_QUERY_FILE_UNIX_LINK 0x201 -#define SMB_QUERY_FILE_UNIX_HLINK 0x202 -#define SMB_SET_FILE_UNIX_BASIC 0x200 -#define SMB_SET_FILE_UNIX_LINK 0x201 -#define SMB_SET_FILE_UNIX_HLINK 0x203 -#define SMB_QUERY_CIFS_UNIX_INFO 0x200 - -/* values which means "don't change it" */ -#define SMB_MODE_NO_CHANGE 0xFFFFFFFF -#define SMB_UID_NO_CHANGE 0xFFFFFFFF -#define SMB_GID_NO_CHANGE 0xFFFFFFFF -#define SMB_TIME_NO_CHANGE 0xFFFFFFFFFFFFFFFFULL -#define SMB_SIZE_NO_CHANGE 0xFFFFFFFFFFFFFFFFULL - -/* UNIX filetype mappings. */ -#define UNIX_TYPE_FILE 0 -#define UNIX_TYPE_DIR 1 -#define UNIX_TYPE_SYMLINK 2 -#define UNIX_TYPE_CHARDEV 3 -#define UNIX_TYPE_BLKDEV 4 -#define UNIX_TYPE_FIFO 5 -#define UNIX_TYPE_SOCKET 6 -#define UNIX_TYPE_UNKNOWN 0xFFFFFFFF - -#endif /* _SMBNO_H_ */ -- cgit v1.2.3 From 955a857e062642cd3ebe1dc7bb38c0f85d8f8f17 Mon Sep 17 00:00:00 2001 From: Bryan Schumaker Date: Wed, 29 Sep 2010 15:41:49 -0400 Subject: NFS: new idmapper This patch creates a new idmapper system that uses the request-key function to place a call into userspace to map user and group ids to names. The old idmapper was single threaded, which prevented more than one request from running at a single time. This means that a user would have to wait for an upcall to finish before accessing a cached result. The upcall result is stored on a keyring of type id_resolver. See the file Documentation/filesystems/nfs/idmapper.txt for instructions. 
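As an illustration only (not part of this patch), the new upcall path can be exercised from
userspace once request-key is configured as described in idmapper.txt. The sketch below uses
the libkeyutils API; the "id_resolver" key type and the "uid:user@domain" description format
come from this patch, while the sample name, the destination keyring, and the assumption that
the instantiated payload holds the mapped id as a string are illustrative guesses.

	#include <stdio.h>
	#include <stdlib.h>
	#include <keyutils.h>

	int main(void)
	{
		key_serial_t key;
		void *payload = NULL;
		long len;

		/* Resolve a name; on a cache miss the kernel invokes
		 * /sbin/request-key, which runs the configured program. */
		key = request_key("id_resolver", "uid:someone@example.com",
				  NULL, KEY_SPEC_SESSION_KEYRING);
		if (key < 0) {
			perror("request_key");
			return 1;
		}

		/* Read back the instantiated payload (assumed to be the
		 * mapped id as a string). */
		len = keyctl_read_alloc(key, &payload);
		if (len < 0) {
			perror("keyctl_read_alloc");
			return 1;
		}
		printf("resolved: %.*s\n", (int)len, (char *)payload);
		free(payload);
		return 0;
	}

Build with -lkeyutils; the same lookup can also be issued with the keyctl(1) utility for quick
testing.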
Signed-off-by: Bryan Schumaker [Trond: fix up the return value of nfs_idmap_lookup_name and clean up code] Signed-off-by: Trond Myklebust --- Documentation/filesystems/nfs/00-INDEX | 2 + Documentation/filesystems/nfs/idmapper.txt | 67 +++++++++ fs/nfs/Kconfig | 11 ++ fs/nfs/idmap.c | 211 ++++++++++++++++++++++++++++- fs/nfs/inode.c | 7 + fs/nfs/nfs4xdr.c | 4 +- fs/nfs/sysctl.c | 2 + include/linux/nfs_idmap.h | 31 ++++- 8 files changed, 329 insertions(+), 6 deletions(-) create mode 100644 Documentation/filesystems/nfs/idmapper.txt (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/nfs/00-INDEX b/Documentation/filesystems/nfs/00-INDEX index 2f68cd68876..3225a566211 100644 --- a/Documentation/filesystems/nfs/00-INDEX +++ b/Documentation/filesystems/nfs/00-INDEX @@ -14,3 +14,5 @@ nfsroot.txt - short guide on setting up a diskless box with NFS root filesystem. rpc-cache.txt - introduction to the caching mechanisms in the sunrpc layer. +idmapper.txt + - information for configuring request-keys to be used by idmapper diff --git a/Documentation/filesystems/nfs/idmapper.txt b/Documentation/filesystems/nfs/idmapper.txt new file mode 100644 index 00000000000..c3852041a21 --- /dev/null +++ b/Documentation/filesystems/nfs/idmapper.txt @@ -0,0 +1,67 @@ + +========= +ID Mapper +========= +Id mapper is used by NFS to translate user and group ids into names, and to +translate user and group names into ids. Part of this translation involves +performing an upcall to userspace to request the information. Id mapper will +user request-key to perform this upcall and cache the result. The program +/usr/sbin/nfs.upcall should be called by request-key, and will perform the +translation and initialize a key with the resulting information. + + NFS_USE_NEW_IDMAPPER must be selected when configuring the kernel to use this + feature. + +=========== +Configuring +=========== +The file /etc/request-key.conf will need to be modified so /sbin/request-key can +direct the upcall. The following line should be added: + +#OP TYPE DESCRIPTION CALLOUT INFO PROGRAM ARG1 ARG2 ARG3 ... +#====== ======= =============== =============== =============================== +create id_resolver * * /usr/sbin/nfs.upcall %k %d 600 + +This will direct all id_resolver requests to the program /usr/sbin/nfs.upcall. +The last parameter, 600, defines how many seconds into the future the key will +expire. This parameter is optional for /usr/sbin/nfs.upcall. When the timeout +is not specified, nfs.upcall will default to 600 seconds. + +id mapper uses for key descriptions: + uid: Find the UID for the given user + gid: Find the GID for the given group + user: Find the user name for the given UID + group: Find the group name for the given GID + +You can handle any of these individually, rather than using the generic upcall +program. If you would like to use your own program for a uid lookup then you +would edit your request-key.conf so it look similar to this: + +#OP TYPE DESCRIPTION CALLOUT INFO PROGRAM ARG1 ARG2 ARG3 ... +#====== ======= =============== =============== =============================== +create id_resolver uid:* * /some/other/program %k %d 600 +create id_resolver * * /usr/sbin/nfs.upcall %k %d 600 + +Notice that the new line was added above the line for the generic program. +request-key will find the first matching line and corresponding program. In +this case, /some/other/program will handle all uid lookups and +/usr/sbin/nfs.upcall will handle gid, user, and group lookups. 
+ +See for more information about the +request-key function. + + +========== +nfs.upcall +========== +nfs.upcall is designed to be called by request-key, and should not be run "by +hand". This program takes two arguments, a serialized key and a key +description. The serialized key is first converted into a key_serial_t, and +then passed as an argument to keyctl_instantiate (both are part of keyutils.h). + +The actual lookups are performed by functions found in nfsidmap.h. nfs.upcall +determines the correct function to call by looking at the first part of the +description string. For example, a uid lookup description will appear as +"uid:user@domain". + +nfs.upcall will return 0 if the key was instantiated, and non-zero otherwise. diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index 6c2aad49d73..3f69752d6f1 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -116,3 +116,14 @@ config NFS_USE_KERNEL_DNS select DNS_RESOLVER select KEYS default y + +config NFS_USE_NEW_IDMAPPER + bool "Use the new idmapper upcall routine" + depends on NFS_V4 && KEYS + help + Say Y here if you want NFS to use the new idmapper upcall functions. + You will need /sbin/request-key (usually provided by the keyutils + package). For details, read + . + + If you are unsure, say N. diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index 21a84d45916..dec47ed8b6b 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -34,6 +34,212 @@ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +#ifdef CONFIG_NFS_USE_NEW_IDMAPPER + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define NFS_UINT_MAXLEN 11 + +const struct cred *id_resolver_cache; + +struct key_type key_type_id_resolver = { + .name = "id_resolver", + .instantiate = user_instantiate, + .match = user_match, + .revoke = user_revoke, + .destroy = user_destroy, + .describe = user_describe, + .read = user_read, +}; + +int nfs_idmap_init(void) +{ + struct cred *cred; + struct key *keyring; + int ret = 0; + + printk(KERN_NOTICE "Registering the %s key type\n", key_type_id_resolver.name); + + cred = prepare_kernel_cred(NULL); + if (!cred) + return -ENOMEM; + + keyring = key_alloc(&key_type_keyring, ".id_resolver", 0, 0, cred, + (KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW | KEY_USR_READ, + KEY_ALLOC_NOT_IN_QUOTA); + if (IS_ERR(keyring)) { + ret = PTR_ERR(keyring); + goto failed_put_cred; + } + + ret = key_instantiate_and_link(keyring, NULL, 0, NULL, NULL); + if (ret < 0) + goto failed_put_key; + + ret = register_key_type(&key_type_id_resolver); + if (ret < 0) + goto failed_put_key; + + cred->thread_keyring = keyring; + cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; + id_resolver_cache = cred; + return 0; + +failed_put_key: + key_put(keyring); +failed_put_cred: + put_cred(cred); + return ret; +} + +void nfs_idmap_quit(void) +{ + key_revoke(id_resolver_cache->thread_keyring); + unregister_key_type(&key_type_id_resolver); + put_cred(id_resolver_cache); +} + +/* + * Assemble the description to pass to request_key() + * This function will allocate a new string and update dest to point + * at it. The caller is responsible for freeing dest. + * + * On error 0 is returned. Otherwise, the length of dest is returned. 
+ */ +static ssize_t nfs_idmap_get_desc(const char *name, size_t namelen, + const char *type, size_t typelen, char **desc) +{ + char *cp; + size_t desclen = typelen + namelen + 2; + + *desc = kmalloc(desclen, GFP_KERNEL); + if (!desc) + return -ENOMEM; + + cp = *desc; + memcpy(cp, type, typelen); + cp += typelen; + *cp++ = ':'; + + memcpy(cp, name, namelen); + cp += namelen; + *cp = '\0'; + return desclen; +} + +static ssize_t nfs_idmap_request_key(const char *name, size_t namelen, + const char *type, void *data, size_t data_size) +{ + const struct cred *saved_cred; + struct key *rkey; + char *desc; + struct user_key_payload *payload; + ssize_t ret; + + ret = nfs_idmap_get_desc(name, namelen, type, strlen(type), &desc); + if (ret <= 0) + goto out; + + saved_cred = override_creds(id_resolver_cache); + rkey = request_key(&key_type_id_resolver, desc, ""); + revert_creds(saved_cred); + kfree(desc); + if (IS_ERR(rkey)) { + ret = PTR_ERR(rkey); + goto out; + } + + rcu_read_lock(); + rkey->perm |= KEY_USR_VIEW; + + ret = key_validate(rkey); + if (ret < 0) + goto out_up; + + payload = rcu_dereference(rkey->payload.data); + if (IS_ERR_OR_NULL(payload)) { + ret = PTR_ERR(payload); + goto out_up; + } + + ret = payload->datalen; + if (ret > 0 && ret <= data_size) + memcpy(data, payload->data, ret); + else + ret = -EINVAL; + +out_up: + rcu_read_unlock(); + key_put(rkey); +out: + return ret; +} + + +/* ID -> Name */ +static ssize_t nfs_idmap_lookup_name(__u32 id, const char *type, char *buf, size_t buflen) +{ + char id_str[NFS_UINT_MAXLEN]; + int id_len; + ssize_t ret; + + id_len = snprintf(id_str, sizeof(id_str), "%u", id); + ret = nfs_idmap_request_key(id_str, id_len, type, buf, buflen); + if (ret < 0) + return -EINVAL; + return ret; +} + +/* Name -> ID */ +static int nfs_idmap_lookup_id(const char *name, size_t namelen, + const char *type, __u32 *id) +{ + char id_str[NFS_UINT_MAXLEN]; + long id_long; + ssize_t data_size; + int ret = 0; + + data_size = nfs_idmap_request_key(name, namelen, type, id_str, NFS_UINT_MAXLEN); + if (data_size <= 0) { + ret = -EINVAL; + } else { + ret = strict_strtol(id_str, 10, &id_long); + *id = (__u32)id_long; + } + return ret; +} + +int nfs_map_name_to_uid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *uid) +{ + return nfs_idmap_lookup_id(name, namelen, "uid", uid); +} + +int nfs_map_group_to_gid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *gid) +{ + return nfs_idmap_lookup_id(name, namelen, "gid", gid); +} + +int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen) +{ + return nfs_idmap_lookup_name(uid, "user", buf, buflen); +} +int nfs_map_gid_to_group(struct nfs_client *clp, __u32 gid, char *buf, size_t buflen) +{ + return nfs_idmap_lookup_name(gid, "group", buf, buflen); +} + +#else /* CONFIG_NFS_USE_IDMAPPER not defined */ + #include #include #include @@ -503,16 +709,17 @@ int nfs_map_group_to_gid(struct nfs_client *clp, const char *name, size_t namele return nfs_idmap_id(idmap, &idmap->idmap_group_hash, name, namelen, uid); } -int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf) +int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen) { struct idmap *idmap = clp->cl_idmap; return nfs_idmap_name(idmap, &idmap->idmap_user_hash, uid, buf); } -int nfs_map_gid_to_group(struct nfs_client *clp, __u32 uid, char *buf) +int nfs_map_gid_to_group(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen) { struct idmap *idmap = clp->cl_idmap; return 
nfs_idmap_name(idmap, &idmap->idmap_group_hash, uid, buf); } +#endif /* CONFIG_NFS_USE_NEW_IDMAPPER */ diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 18be041abd2..f2d2c801e0a 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1526,6 +1526,10 @@ static int __init init_nfs_fs(void) { int err; + err = nfs_idmap_init(); + if (err < 0) + goto out9; + err = nfs_dns_resolver_init(); if (err < 0) goto out8; @@ -1590,6 +1594,8 @@ out6: out7: nfs_dns_resolver_destroy(); out8: + nfs_idmap_quit(); +out9: return err; } @@ -1602,6 +1608,7 @@ static void __exit exit_nfs_fs(void) nfs_destroy_nfspagecache(); nfs_fscache_unregister(); nfs_dns_resolver_destroy(); + nfs_idmap_quit(); #ifdef CONFIG_PROC_FS rpc_proc_unregister("nfs"); #endif diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 3feace66b98..6ea5c9392fe 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -816,7 +816,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const if (iap->ia_valid & ATTR_MODE) len += 4; if (iap->ia_valid & ATTR_UID) { - owner_namelen = nfs_map_uid_to_name(server->nfs_client, iap->ia_uid, owner_name); + owner_namelen = nfs_map_uid_to_name(server->nfs_client, iap->ia_uid, owner_name, IDMAP_NAMESZ); if (owner_namelen < 0) { dprintk("nfs: couldn't resolve uid %d to string\n", iap->ia_uid); @@ -828,7 +828,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const len += 4 + (XDR_QUADLEN(owner_namelen) << 2); } if (iap->ia_valid & ATTR_GID) { - owner_grouplen = nfs_map_gid_to_group(server->nfs_client, iap->ia_gid, owner_group); + owner_grouplen = nfs_map_gid_to_group(server->nfs_client, iap->ia_gid, owner_group, IDMAP_NAMESZ); if (owner_grouplen < 0) { dprintk("nfs: couldn't resolve gid %d to string\n", iap->ia_gid); diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c index ad4d2e787b2..978aaeb8a09 100644 --- a/fs/nfs/sysctl.c +++ b/fs/nfs/sysctl.c @@ -32,6 +32,7 @@ static ctl_table nfs_cb_sysctls[] = { .extra1 = (int *)&nfs_set_port_min, .extra2 = (int *)&nfs_set_port_max, }, +#ifndef CONFIG_NFS_USE_NEW_IDMAPPER { .procname = "idmap_cache_timeout", .data = &nfs_idmap_cache_timeout, @@ -39,6 +40,7 @@ static ctl_table nfs_cb_sysctls[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, +#endif /* CONFIG_NFS_USE_NEW_IDMAPPER */ #endif { .procname = "nfs_mountpoint_timeout", diff --git a/include/linux/nfs_idmap.h b/include/linux/nfs_idmap.h index 91a1c24e0cb..e8352dc5afb 100644 --- a/include/linux/nfs_idmap.h +++ b/include/linux/nfs_idmap.h @@ -66,13 +66,40 @@ struct idmap_msg { /* Forward declaration to make this header independent of others */ struct nfs_client; +#ifdef CONFIG_NFS_USE_NEW_IDMAPPER + +int nfs_idmap_init(void); +void nfs_idmap_quit(void); + +static inline int nfs_idmap_new(struct nfs_client *clp) +{ + return 0; +} + +static inline void nfs_idmap_delete(struct nfs_client *clp) +{ +} + +#else /* CONFIG_NFS_USE_NEW_IDMAPPER not set */ + +static inline int nfs_idmap_init(void) +{ + return 0; +} + +static inline void nfs_idmap_quit(void) +{ +} + int nfs_idmap_new(struct nfs_client *); void nfs_idmap_delete(struct nfs_client *); +#endif /* CONFIG_NFS_USE_NEW_IDMAPPER */ + int nfs_map_name_to_uid(struct nfs_client *, const char *, size_t, __u32 *); int nfs_map_group_to_gid(struct nfs_client *, const char *, size_t, __u32 *); -int nfs_map_uid_to_name(struct nfs_client *, __u32, char *); -int nfs_map_gid_to_group(struct nfs_client *, __u32, char *); +int nfs_map_uid_to_name(struct nfs_client *, __u32, char *, size_t); +int nfs_map_gid_to_group(struct 
nfs_client *, __u32, char *, size_t); extern unsigned int nfs_idmap_cache_timeout; #endif /* __KERNEL__ */ -- cgit v1.2.3 From 7bdb0d18bfd381cc5491eb95973ec5604b356c7e Mon Sep 17 00:00:00 2001 From: Tristan Ye Date: Mon, 11 Oct 2010 16:46:39 +0800 Subject: ocfs2: Add a mount option "coherency=*" to handle cluster coherency for O_DIRECT writes. Currently, the default behavior of O_DIRECT writes was allowing concurrent writing among nodes to the same file, with no cluster coherency guaranteed (no EX lock held). This can leave stale data in the cache for buffered reads on other nodes. The new mount option introduce a chance to choose two different behaviors for O_DIRECT writes: * coherency=full, as the default value, will disallow concurrent O_DIRECT writes by taking EX locks. * coherency=buffered, allow concurrent O_DIRECT writes without EX lock among nodes, which gains high performance at risk of getting stale data on other nodes. Signed-off-by: Tristan Ye Signed-off-by: Joel Becker --- Documentation/filesystems/ocfs2.txt | 7 +++++++ fs/ocfs2/file.c | 29 +++++++++++++++++++++++++++-- fs/ocfs2/ocfs2.h | 3 +++ fs/ocfs2/super.c | 15 +++++++++++++++ 4 files changed, 52 insertions(+), 2 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/ocfs2.txt b/Documentation/filesystems/ocfs2.txt index 1f7ae144f6d..5393e661169 100644 --- a/Documentation/filesystems/ocfs2.txt +++ b/Documentation/filesystems/ocfs2.txt @@ -87,3 +87,10 @@ dir_resv_level= (*) By default, directory reservations will scale with file reservations - users should rarely need to change this value. If allocation reservations are turned off, this option will have no effect. +coherency=full (*) Disallow concurrent O_DIRECT writes, cluster inode + lock will be taken to force other nodes drop cache, + therefore full cluster coherency is guaranteed even + for O_DIRECT writes. +coherency=buffered Allow concurrent O_DIRECT writes without EX lock among + nodes, which gains high performance at risk of getting + stale data on other nodes. diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 13af9937bdd..9e8cc4346b7 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2225,6 +2225,8 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, struct file *file = iocb->ki_filp; struct inode *inode = file->f_path.dentry->d_inode; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + int full_coherency = !(osb->s_mount_opt & + OCFS2_MOUNT_COHERENCY_BUFFERED); mlog_entry("(0x%p, %u, '%.*s')\n", file, (unsigned int)nr_segs, @@ -2248,14 +2250,37 @@ relock: have_alloc_sem = 1; } - /* concurrent O_DIRECT writes are allowed */ - rw_level = !direct_io; + /* + * Concurrent O_DIRECT writes are allowed with + * mount_option "coherency=buffered". + */ + rw_level = (!direct_io || full_coherency); + ret = ocfs2_rw_lock(inode, rw_level); if (ret < 0) { mlog_errno(ret); goto out_sems; } + /* + * O_DIRECT writes with "coherency=full" need to take EX cluster + * inode_lock to guarantee coherency. + */ + if (direct_io && full_coherency) { + /* + * We need to take and drop the inode lock to force + * other nodes to drop their caches. Buffered I/O + * already does this in write_begin(). 
+ */ + ret = ocfs2_inode_lock(inode, NULL, 1); + if (ret < 0) { + mlog_errno(ret); + goto out_sems; + } + + ocfs2_inode_unlock(inode, 1); + } + can_do_direct = direct_io; ret = ocfs2_prepare_inode_for_write(file, ppos, iocb->ki_left, appending, diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 687e291d73f..3064feef143 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -263,6 +263,9 @@ enum ocfs2_mount_options control lists */ OCFS2_MOUNT_USRQUOTA = 1 << 10, /* We support user quotas */ OCFS2_MOUNT_GRPQUOTA = 1 << 11, /* We support group quotas */ + + OCFS2_MOUNT_COHERENCY_BUFFERED = 1 << 12 /* Allow concurrent O_DIRECT + writes */ }; #define OCFS2_OSB_SOFT_RO 0x0001 diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index b578644b663..9122d59f812 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -177,6 +177,8 @@ enum { Opt_noacl, Opt_usrquota, Opt_grpquota, + Opt_coherency_buffered, + Opt_coherency_full, Opt_resv_level, Opt_dir_resv_level, Opt_err, @@ -205,6 +207,8 @@ static const match_table_t tokens = { {Opt_noacl, "noacl"}, {Opt_usrquota, "usrquota"}, {Opt_grpquota, "grpquota"}, + {Opt_coherency_buffered, "coherency=buffered"}, + {Opt_coherency_full, "coherency=full"}, {Opt_resv_level, "resv_level=%u"}, {Opt_dir_resv_level, "dir_resv_level=%u"}, {Opt_err, NULL} @@ -1452,6 +1456,12 @@ static int ocfs2_parse_options(struct super_block *sb, case Opt_grpquota: mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA; break; + case Opt_coherency_buffered: + mopt->mount_opt |= OCFS2_MOUNT_COHERENCY_BUFFERED; + break; + case Opt_coherency_full: + mopt->mount_opt &= ~OCFS2_MOUNT_COHERENCY_BUFFERED; + break; case Opt_acl: mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL; mopt->mount_opt &= ~OCFS2_MOUNT_NO_POSIX_ACL; @@ -1550,6 +1560,11 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt) if (opts & OCFS2_MOUNT_GRPQUOTA) seq_printf(s, ",grpquota"); + if (opts & OCFS2_MOUNT_COHERENCY_BUFFERED) + seq_printf(s, ",coherency=buffered"); + else + seq_printf(s, ",coherency=full"); + if (opts & OCFS2_MOUNT_NOUSERXATTR) seq_printf(s, ",nouser_xattr"); else -- cgit v1.2.3 From f4a3e0bceb57466c31757f25e4e0ed108d1299ec Mon Sep 17 00:00:00 2001 From: "Dr. Werner Fink" Date: Wed, 22 Sep 2010 12:45:40 +0200 Subject: tty: Add a new file /proc/tty/consoles Add a new file /proc/tty/consoles to be able to determine the registered system console lines. If the reading process holds /dev/console open at the regular standard input stream the active device will be marked by an asterisk. Show possible operations and also decode the used flags of the listed console lines. Signed-off-by: Werner Fink Cc: Alan Cox Signed-off-by: Greg Kroah-Hartman --- Documentation/filesystems/proc.txt | 32 ++++++++ fs/proc/proc_tty.c | 158 +++++++++++++++++++++++++++++++++++++ 2 files changed, 190 insertions(+) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index a6aca874088..98223a67694 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -1075,6 +1075,7 @@ Table 1-11: Files in /proc/tty drivers list of drivers and their usage ldiscs registered line disciplines driver/serial usage statistic and status of single tty lines + consoles registered system console lines .............................................................................. 
To see which tty's are currently in use, you can simply look into the file @@ -1093,6 +1094,37 @@ To see which tty's are currently in use, you can simply look into the file /dev/tty /dev/tty 5 0 system:/dev/tty unknown /dev/tty 4 1-63 console +To see which character device lines are currently used for the system console +/dev/console, you may simply look into the file /proc/tty/consoles: + + > cat /proc/tty/consoles + tty0 -WU (ECp) 4:7 + ttyS0 -W- (Ep) 4:64 + +The columns are: + + device name of the device + operations R = can do read operations + W = can do write operations + U = can do unblank + flags E = it is enabled + C = it is prefered console + B = it is primary boot console + p = it is used for printk buffer + b = it is not a TTY but a Braille device + a = it is safe to use when cpu is offline + * = it is standard input of the reading process + major:minor major and minor number of the device separated by a colon + +If the reading process holds /dev/console open at the regular standard input +stream the active device will be marked by an asterisk: + + > cat /proc/tty/consoles < /dev/console + tty0 -WU (ECp*) 4:7 + ttyS0 -W- (Ep) 4:64 + > tty + /dev/pts/3 + 1.8 Miscellaneous kernel statistics in /proc/stat ------------------------------------------------- diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c index 83adcc86943..dc44f94022f 100644 --- a/fs/proc/proc_tty.c +++ b/fs/proc/proc_tty.c @@ -12,7 +12,10 @@ #include #include #include +#include +#include #include +#include #include /* @@ -136,6 +139,160 @@ static const struct file_operations proc_tty_drivers_operations = { .release = seq_release, }; +/* + * The device ID of file descriptor 0 of the current reading + * task if a character device... + */ +static dev_t current_dev; + +/* + * This is the handler for /proc/tty/consoles + */ +static int show_console_dev(struct seq_file *m, void *v) +{ + const struct tty_driver *driver; + struct console *con; + int index, len; + char flags[10]; + dev_t dev; + + if (v == SEQ_START_TOKEN) + return 0; + con = (struct console *)v; + if (!con) + return 0; + driver = con->device(con, &index); + if (!driver) + return 0; + dev = MKDEV(driver->major, driver->minor_start) + index; + + index = 0; + if (con->flags & CON_ENABLED) + flags[index++] = 'E'; + if (con->flags & CON_CONSDEV) + flags[index++] = 'C'; + if (con->flags & CON_BOOT) + flags[index++] = 'B'; + if (con->flags & CON_PRINTBUFFER) + flags[index++] = 'p'; + if (con->flags & CON_BRL) + flags[index++] = 'b'; + if (con->flags & CON_ANYTIME) + flags[index++] = 'a'; + if (current_dev == dev) + flags[index++] = '*'; + flags[index] = 0; + + seq_printf(m, "%s%d%n", con->name, con->index, &len); + len = 21 - len; + if (len < 1) + len = 1; + seq_printf(m, "%*c", len, ' '); + seq_printf(m, "%c%c%c (%s)%n", con->read ? 'R' : '-', + con->write ? 'W' : '-', con->unblank ? 
'U' : '-', + flags, &len); + len = 13 - len; + if (len < 1) + len = 1; + seq_printf(m, "%*c%4d:%d\n", len, ' ', MAJOR(dev), MINOR(dev)); + + return 0; +} + +/* iterator for consoles */ +static void *c_start(struct seq_file *m, loff_t *pos) +{ + struct console *con; + loff_t off = 0; + + if (*pos == 0) + return SEQ_START_TOKEN; + + acquire_console_sem(); + for (con = console_drivers; con; con = con->next) { + if (!con->device) + continue; + if (++off == *pos) + break; + } + release_console_sem(); + + return con; +} + +static void *c_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct console *con; + + acquire_console_sem(); + if (v == SEQ_START_TOKEN) + con = console_drivers; + else + con = ((struct console *)v)->next; + for (; con; con = con->next) { + if (!con->device) + continue; + ++*pos; + break; + } + release_console_sem(); + + return con; +} + +static void c_stop(struct seq_file *m, void *v) +{ +} + +static const struct seq_operations tty_consoles_op = { + .start = c_start, + .next = c_next, + .stop = c_stop, + .show = show_console_dev +}; + +/* + * Used for open /proc/tty/consoles. Before this detect + * the device ID of file descriptor 0 of the current + * reading task if a character device... + */ +static int tty_consoles_open(struct inode *inode, struct file *file) +{ + struct files_struct *curfiles; + + current_dev = 0; + curfiles = get_files_struct(current); + if (curfiles) { + const struct file *curfp; + spin_lock(&curfiles->file_lock); + curfp = fcheck_files(curfiles, 0); + if (curfp && curfp->private_data) { + const struct inode *inode; + dget(curfp->f_dentry); + inode = curfp->f_dentry->d_inode; + if (S_ISCHR(inode->i_mode)) { + struct tty_struct *tty; + tty = (struct tty_struct *)curfp->private_data; + if (tty && tty->magic == TTY_MAGIC) { + tty = tty_pair_get_tty(tty); + current_dev = tty_devnum(tty); + } + } + dput(curfp->f_dentry); + } + spin_unlock(&curfiles->file_lock); + put_files_struct(curfiles); + } + return seq_open(file, &tty_consoles_op); +} + +static const struct file_operations proc_tty_consoles_operations = { + .open = tty_consoles_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + /* * This function is called by tty_register_driver() to handle * registering the driver's /proc handler into /proc/tty/driver/ @@ -186,4 +343,5 @@ void __init proc_tty_init(void) proc_tty_driver = proc_mkdir_mode("tty/driver", S_IRUSR|S_IXUSR, NULL); proc_create("tty/ldiscs", 0, NULL, &tty_ldiscs_proc_fops); proc_create("tty/drivers", 0, NULL, &proc_tty_drivers_operations); + proc_create("tty/consoles", 0, NULL, &proc_tty_consoles_operations); } -- cgit v1.2.3 From 6c2754c28f2388a276fe21edde826f2113c8f60e Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 23 Oct 2010 08:14:12 -0700 Subject: Revert "tty: Add a new file /proc/tty/consoles" This reverts commit f4a3e0bceb57466c31757f25e4e0ed108d1299ec. Jiri Sladby points out that the tty structure we're using may already be gone, and Al Viro doesn't hold back in complaining about the random loading of 'filp->private_data' which doesn't have to be a pointer at all, nor does checking the magic field for TTY_MAGIC prove anything. Belated review by Al: "a) global variable depending on stdin of the last opener? Affecting output of read(2)? Really? 
b) iterator is broken; list should be locked in ->start(), unlocked in ->stop() and *NOT* unlocked/relocked in ->next() c) ->show() ought to do nothing in case of ->device == NULL, instead of skipping those in ->next()/->start() d) regardless of the merits of the bright idea about asterisk at that line in output *and* regardless of (a), the implementation is not only atrociously ugly, it's actually very likely to be a roothole. Verifying that Cthulhu knows what number happens to be address of a tty_struct by blindly dereferencing memory at that address... Ouch. Please revert that crap." And Christoph pipes in and NAK's the approach of walking fd tables etc too. So it's pretty unanimous. Noticed-by: Jri Slaby Requested-by: Al Viro Cc: Greg Kroah-Hartman Cc: Werner Fink Cc: Alan Cox Cc: Christoph Hellwig Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 32 -------- fs/proc/proc_tty.c | 158 ------------------------------------- 2 files changed, 190 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 98223a67694..a6aca874088 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -1075,7 +1075,6 @@ Table 1-11: Files in /proc/tty drivers list of drivers and their usage ldiscs registered line disciplines driver/serial usage statistic and status of single tty lines - consoles registered system console lines .............................................................................. To see which tty's are currently in use, you can simply look into the file @@ -1094,37 +1093,6 @@ To see which tty's are currently in use, you can simply look into the file /dev/tty /dev/tty 5 0 system:/dev/tty unknown /dev/tty 4 1-63 console -To see which character device lines are currently used for the system console -/dev/console, you may simply look into the file /proc/tty/consoles: - - > cat /proc/tty/consoles - tty0 -WU (ECp) 4:7 - ttyS0 -W- (Ep) 4:64 - -The columns are: - - device name of the device - operations R = can do read operations - W = can do write operations - U = can do unblank - flags E = it is enabled - C = it is prefered console - B = it is primary boot console - p = it is used for printk buffer - b = it is not a TTY but a Braille device - a = it is safe to use when cpu is offline - * = it is standard input of the reading process - major:minor major and minor number of the device separated by a colon - -If the reading process holds /dev/console open at the regular standard input -stream the active device will be marked by an asterisk: - - > cat /proc/tty/consoles < /dev/console - tty0 -WU (ECp*) 4:7 - ttyS0 -W- (Ep) 4:64 - > tty - /dev/pts/3 - 1.8 Miscellaneous kernel statistics in /proc/stat ------------------------------------------------- diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c index dc44f94022f..83adcc86943 100644 --- a/fs/proc/proc_tty.c +++ b/fs/proc/proc_tty.c @@ -12,10 +12,7 @@ #include #include #include -#include -#include #include -#include #include /* @@ -139,160 +136,6 @@ static const struct file_operations proc_tty_drivers_operations = { .release = seq_release, }; -/* - * The device ID of file descriptor 0 of the current reading - * task if a character device... 
- */ -static dev_t current_dev; - -/* - * This is the handler for /proc/tty/consoles - */ -static int show_console_dev(struct seq_file *m, void *v) -{ - const struct tty_driver *driver; - struct console *con; - int index, len; - char flags[10]; - dev_t dev; - - if (v == SEQ_START_TOKEN) - return 0; - con = (struct console *)v; - if (!con) - return 0; - driver = con->device(con, &index); - if (!driver) - return 0; - dev = MKDEV(driver->major, driver->minor_start) + index; - - index = 0; - if (con->flags & CON_ENABLED) - flags[index++] = 'E'; - if (con->flags & CON_CONSDEV) - flags[index++] = 'C'; - if (con->flags & CON_BOOT) - flags[index++] = 'B'; - if (con->flags & CON_PRINTBUFFER) - flags[index++] = 'p'; - if (con->flags & CON_BRL) - flags[index++] = 'b'; - if (con->flags & CON_ANYTIME) - flags[index++] = 'a'; - if (current_dev == dev) - flags[index++] = '*'; - flags[index] = 0; - - seq_printf(m, "%s%d%n", con->name, con->index, &len); - len = 21 - len; - if (len < 1) - len = 1; - seq_printf(m, "%*c", len, ' '); - seq_printf(m, "%c%c%c (%s)%n", con->read ? 'R' : '-', - con->write ? 'W' : '-', con->unblank ? 'U' : '-', - flags, &len); - len = 13 - len; - if (len < 1) - len = 1; - seq_printf(m, "%*c%4d:%d\n", len, ' ', MAJOR(dev), MINOR(dev)); - - return 0; -} - -/* iterator for consoles */ -static void *c_start(struct seq_file *m, loff_t *pos) -{ - struct console *con; - loff_t off = 0; - - if (*pos == 0) - return SEQ_START_TOKEN; - - acquire_console_sem(); - for (con = console_drivers; con; con = con->next) { - if (!con->device) - continue; - if (++off == *pos) - break; - } - release_console_sem(); - - return con; -} - -static void *c_next(struct seq_file *m, void *v, loff_t *pos) -{ - struct console *con; - - acquire_console_sem(); - if (v == SEQ_START_TOKEN) - con = console_drivers; - else - con = ((struct console *)v)->next; - for (; con; con = con->next) { - if (!con->device) - continue; - ++*pos; - break; - } - release_console_sem(); - - return con; -} - -static void c_stop(struct seq_file *m, void *v) -{ -} - -static const struct seq_operations tty_consoles_op = { - .start = c_start, - .next = c_next, - .stop = c_stop, - .show = show_console_dev -}; - -/* - * Used for open /proc/tty/consoles. Before this detect - * the device ID of file descriptor 0 of the current - * reading task if a character device... 
- */ -static int tty_consoles_open(struct inode *inode, struct file *file) -{ - struct files_struct *curfiles; - - current_dev = 0; - curfiles = get_files_struct(current); - if (curfiles) { - const struct file *curfp; - spin_lock(&curfiles->file_lock); - curfp = fcheck_files(curfiles, 0); - if (curfp && curfp->private_data) { - const struct inode *inode; - dget(curfp->f_dentry); - inode = curfp->f_dentry->d_inode; - if (S_ISCHR(inode->i_mode)) { - struct tty_struct *tty; - tty = (struct tty_struct *)curfp->private_data; - if (tty && tty->magic == TTY_MAGIC) { - tty = tty_pair_get_tty(tty); - current_dev = tty_devnum(tty); - } - } - dput(curfp->f_dentry); - } - spin_unlock(&curfiles->file_lock); - put_files_struct(curfiles); - } - return seq_open(file, &tty_consoles_op); -} - -static const struct file_operations proc_tty_consoles_operations = { - .open = tty_consoles_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - /* * This function is called by tty_register_driver() to handle * registering the driver's /proc handler into /proc/tty/driver/ @@ -343,5 +186,4 @@ void __init proc_tty_init(void) proc_tty_driver = proc_mkdir_mode("tty/driver", S_IRUSR|S_IXUSR, NULL); proc_create("tty/ldiscs", 0, NULL, &tty_ldiscs_proc_fops); proc_create("tty/drivers", 0, NULL, &proc_tty_drivers_operations); - proc_create("tty/consoles", 0, NULL, &proc_tty_consoles_operations); } -- cgit v1.2.3 From 02c35fca7cf4ea2dfdc6db279e230cacbbf4b870 Mon Sep 17 00:00:00 2001 From: Fred Isaman Date: Wed, 20 Oct 2010 00:17:59 -0400 Subject: NFSv4.1: pnfs: full mount/umount infrastructure Allow a module implementing a layout type to register, and have its mount/umount routines called for filesystems that the server declares support it. Signed-off-by: Fred Isaman Signed-off-by: Marc Eshel Signed-off-by: Andy Adamson Signed-off-by: Bian Naimeng Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust --- Documentation/filesystems/nfs/00-INDEX | 2 + Documentation/filesystems/nfs/pnfs.txt | 48 +++++++++++++++++++ fs/nfs/Kconfig | 8 +++- fs/nfs/pnfs.c | 88 +++++++++++++++++++++++++++++++++- fs/nfs/pnfs.h | 9 ++++ 5 files changed, 151 insertions(+), 4 deletions(-) create mode 100644 Documentation/filesystems/nfs/pnfs.txt (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/nfs/00-INDEX b/Documentation/filesystems/nfs/00-INDEX index 3225a566211..a57e12411d2 100644 --- a/Documentation/filesystems/nfs/00-INDEX +++ b/Documentation/filesystems/nfs/00-INDEX @@ -12,6 +12,8 @@ nfs-rdma.txt - how to install and setup the Linux NFS/RDMA client and server software nfsroot.txt - short guide on setting up a diskless box with NFS root filesystem. +pnfs.txt + - short explanation of some of the internals of the pnfs client code rpc-cache.txt - introduction to the caching mechanisms in the sunrpc layer. idmapper.txt diff --git a/Documentation/filesystems/nfs/pnfs.txt b/Documentation/filesystems/nfs/pnfs.txt new file mode 100644 index 00000000000..bc0b9cfe095 --- /dev/null +++ b/Documentation/filesystems/nfs/pnfs.txt @@ -0,0 +1,48 @@ +Reference counting in pnfs: +========================== + +The are several inter-related caches. We have layouts which can +reference multiple devices, each of which can reference multiple data servers. +Each data server can be referenced by multiple devices. Each device +can be referenced by multiple layouts. To keep all of this straight, +we need to reference count. 
+ + +struct pnfs_layout_hdr +---------------------- +The on-the-wire command LAYOUTGET corresponds to struct +pnfs_layout_segment, usually referred to by the variable name lseg. +Each nfs_inode may hold a pointer to a cache of of these layout +segments in nfsi->layout, of type struct pnfs_layout_hdr. + +We reference the header for the inode pointing to it, across each +outstanding RPC call that references it (LAYOUTGET, LAYOUTRETURN, +LAYOUTCOMMIT), and for each lseg held within. + +Each header is also (when non-empty) put on a list associated with +struct nfs_client (cl_layouts). Being put on this list does not bump +the reference count, as the layout is kept around by the lseg that +keeps it in the list. + +deviceid_cache +-------------- +lsegs reference device ids, which are resolved per nfs_client and +layout driver type. The device ids are held in a RCU cache (struct +nfs4_deviceid_cache). The cache itself is referenced across each +mount. The entries (struct nfs4_deviceid) themselves are held across +the lifetime of each lseg referencing them. + +RCU is used because the deviceid is basically a write once, read many +data structure. The hlist size of 32 buckets needs better +justification, but seems reasonable given that we can have multiple +deviceid's per filesystem, and multiple filesystems per nfs_client. + +The hash code is copied from the nfsd code base. A discussion of +hashing and variations of this algorithm can be found at: +http://groups.google.com/group/comp.lang.c/browse_thread/thread/9522965e2b8d3809 + +data server cache +----------------- +file driver devices refer to data servers, which are kept in a module +level cache. Its reference is held over the lifetime of the deviceid +pointing to it. diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index 3f69752d6f1..d94311943ec 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -75,13 +75,17 @@ config NFS_V4 config NFS_V4_1 bool "NFS client support for NFSv4.1 (EXPERIMENTAL)" - depends on NFS_V4 && EXPERIMENTAL + depends on NFS_FS && NFS_V4 && EXPERIMENTAL + select PNFS_FILE_LAYOUT help This option enables support for minor version 1 of the NFSv4 protocol - (draft-ietf-nfsv4-minorversion1) in the kernel's NFS client. + (RFC 5661) in the kernel's NFS client. If unsure, say N. +config PNFS_FILE_LAYOUT + tristate + config ROOT_NFS bool "Root file system on NFS" depends on NFS_FS=y && IP_PNP diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index b483026e82a..cf795625610 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -32,16 +32,51 @@ #define NFSDBG_FACILITY NFSDBG_PNFS -/* STUB that returns the equivalent of "no module found" */ +/* Locking: + * + * pnfs_spinlock: + * protects pnfs_modules_tbl. 
+ */ +static DEFINE_SPINLOCK(pnfs_spinlock); + +/* + * pnfs_modules_tbl holds all pnfs modules + */ +static LIST_HEAD(pnfs_modules_tbl); + +/* Return the registered pnfs layout driver module matching given id */ +static struct pnfs_layoutdriver_type * +find_pnfs_driver_locked(u32 id) +{ + struct pnfs_layoutdriver_type *local; + + list_for_each_entry(local, &pnfs_modules_tbl, pnfs_tblid) + if (local->id == id) + goto out; + local = NULL; +out: + dprintk("%s: Searching for id %u, found %p\n", __func__, id, local); + return local; +} + static struct pnfs_layoutdriver_type * find_pnfs_driver(u32 id) { - return NULL; + struct pnfs_layoutdriver_type *local; + + spin_lock(&pnfs_spinlock); + local = find_pnfs_driver_locked(id); + spin_unlock(&pnfs_spinlock); + return local; } void unset_pnfs_layoutdriver(struct nfs_server *nfss) { + if (nfss->pnfs_curr_ld) { + nfss->pnfs_curr_ld->uninitialize_mountpoint(nfss); + module_put(nfss->pnfs_curr_ld->owner); + } nfss->pnfs_curr_ld = NULL; } @@ -74,7 +109,18 @@ set_pnfs_layoutdriver(struct nfs_server *server, u32 id) goto out_no_driver; } } + if (!try_module_get(ld_type->owner)) { + dprintk("%s: Could not grab reference on module\n", __func__); + goto out_no_driver; + } server->pnfs_curr_ld = ld_type; + if (ld_type->initialize_mountpoint(server)) { + printk(KERN_ERR + "%s: Error initializing mount point for layout driver %u.\n", + __func__, id); + module_put(ld_type->owner); + goto out_no_driver; + } dprintk("%s: pNFS module for %u set\n", __func__, id); return; @@ -82,3 +128,41 @@ out_no_driver: dprintk("%s: Using NFSv4 I/O\n", __func__); server->pnfs_curr_ld = NULL; } + +int +pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *ld_type) +{ + int status = -EINVAL; + struct pnfs_layoutdriver_type *tmp; + + if (ld_type->id == 0) { + printk(KERN_ERR "%s id 0 is reserved\n", __func__); + return status; + } + + spin_lock(&pnfs_spinlock); + tmp = find_pnfs_driver_locked(ld_type->id); + if (!tmp) { + list_add(&ld_type->pnfs_tblid, &pnfs_modules_tbl); + status = 0; + dprintk("%s Registering id:%u name:%s\n", __func__, ld_type->id, + ld_type->name); + } else { + printk(KERN_ERR "%s Module with id %d already loaded!\n", + __func__, ld_type->id); + } + spin_unlock(&pnfs_spinlock); + + return status; +} +EXPORT_SYMBOL_GPL(pnfs_register_layoutdriver); + +void +pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *ld_type) +{ + dprintk("%s Deregistering id:%u\n", __func__, ld_type->id); + spin_lock(&pnfs_spinlock); + list_del(&ld_type->pnfs_tblid); + spin_unlock(&pnfs_spinlock); +} +EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver); diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index c628ef131d8..61531f33857 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -36,8 +36,17 @@ /* Per-layout driver specific registration structure */ struct pnfs_layoutdriver_type { + struct list_head pnfs_tblid; + const u32 id; + const char *name; + struct module *owner; + int (*initialize_mountpoint) (struct nfs_server *); + int (*uninitialize_mountpoint) (struct nfs_server *); }; +extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *); +extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *); + void set_pnfs_layoutdriver(struct nfs_server *, u32 id); void unset_pnfs_layoutdriver(struct nfs_server *); -- cgit v1.2.3 From d9d1dc802ffae507956ceb350eb3f4a995734f1e Mon Sep 17 00:00:00 2001 From: Valerie Aurora Date: Mon, 30 Aug 2010 17:23:12 -0400 Subject: Documentation: Fix trivial typo in filesystems/sharedsubtree.txt Documentation: Fix trivial typo in 
filesystems/sharedsubtree.txt This typo is easy to ignore unless you have spent a great deal of time thinking about how to eliminate duplicate dentries in unions. Signed-off-by: Valerie Aurora Signed-off-by: Al Viro --- Documentation/filesystems/sharedsubtree.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/sharedsubtree.txt b/Documentation/filesystems/sharedsubtree.txt index fc0e39af43c..4ede421c968 100644 --- a/Documentation/filesystems/sharedsubtree.txt +++ b/Documentation/filesystems/sharedsubtree.txt @@ -62,10 +62,10 @@ replicas continue to be exactly same. # mount /dev/sd0 /tmp/a #ls /tmp/a - t1 t2 t2 + t1 t2 t3 #ls /mnt/a - t1 t2 t2 + t1 t2 t3 Note that the mount has propagated to the mount at /mnt as well. -- cgit v1.2.3 From e1455d1bdccbe056ba53479741b1452106ce59aa Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 6 Oct 2010 10:46:53 +0200 Subject: update block_device_operations documentation Updated Documentation/filesystems/Locking to match the code. Signed-off-by: Christoph Hellwig --- Documentation/filesystems/Locking | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 2db4283efa8..8a817f656f0 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -349,21 +349,36 @@ call this method upon the IO completion. --------------------------- block_device_operations ----------------------- prototypes: - int (*open) (struct inode *, struct file *); - int (*release) (struct inode *, struct file *); - int (*ioctl) (struct inode *, struct file *, unsigned, unsigned long); + int (*open) (struct block_device *, fmode_t); + int (*release) (struct gendisk *, fmode_t); + int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); + int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); + int (*direct_access) (struct block_device *, sector_t, void **, unsigned long *); int (*media_changed) (struct gendisk *); + void (*unlock_native_capacity) (struct gendisk *); int (*revalidate_disk) (struct gendisk *); + int (*getgeo)(struct block_device *, struct hd_geometry *); + void (*swap_slot_free_notify) (struct block_device *, unsigned long); locking rules: - BKL bd_sem -open: yes yes -release: yes yes -ioctl: yes no + BKL bd_mutex +open: no yes +release: no yes +ioctl: no no +compat_ioctl: no no +direct_access: no no media_changed: no no +unlock_native_capacity: no no revalidate_disk: no no +getgeo: no no +swap_slot_free_notify: no no (see below) + +media_changed, unlock_native_capacity and revalidate_disk are called only from +check_disk_change(). + +swap_slot_free_notify is called with swap_lock and sometimes the page lock +held. -The last two are called only from check_disk_change(). --------------------------- file_operations ------------------------------- prototypes: -- cgit v1.2.3 From eb1c86b8b501ad9a073d9d519105979d31fa0ef2 Mon Sep 17 00:00:00 2001 From: Bryan Schumaker Date: Tue, 26 Oct 2010 13:27:42 -0400 Subject: NFS: rename nfs.upcall -> nfs.idmap This patch renames the idmapper upcall program from nfs.upcall to nfs.idmap in the NFS documentation. This is because the program has been renamed in the nfs-utils source. 
Signed-off-by: Bryan Schumaker Signed-off-by: Trond Myklebust --- Documentation/filesystems/nfs/idmapper.txt | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/nfs/idmapper.txt b/Documentation/filesystems/nfs/idmapper.txt index c3852041a21..b9b4192ea8b 100644 --- a/Documentation/filesystems/nfs/idmapper.txt +++ b/Documentation/filesystems/nfs/idmapper.txt @@ -6,7 +6,7 @@ Id mapper is used by NFS to translate user and group ids into names, and to translate user and group names into ids. Part of this translation involves performing an upcall to userspace to request the information. Id mapper will user request-key to perform this upcall and cache the result. The program -/usr/sbin/nfs.upcall should be called by request-key, and will perform the +/usr/sbin/nfs.idmap should be called by request-key, and will perform the translation and initialize a key with the resulting information. NFS_USE_NEW_IDMAPPER must be selected when configuring the kernel to use this @@ -20,12 +20,12 @@ direct the upcall. The following line should be added: #OP TYPE DESCRIPTION CALLOUT INFO PROGRAM ARG1 ARG2 ARG3 ... #====== ======= =============== =============== =============================== -create id_resolver * * /usr/sbin/nfs.upcall %k %d 600 +create id_resolver * * /usr/sbin/nfs.idmap %k %d 600 -This will direct all id_resolver requests to the program /usr/sbin/nfs.upcall. +This will direct all id_resolver requests to the program /usr/sbin/nfs.idmap. The last parameter, 600, defines how many seconds into the future the key will -expire. This parameter is optional for /usr/sbin/nfs.upcall. When the timeout -is not specified, nfs.upcall will default to 600 seconds. +expire. This parameter is optional for /usr/sbin/nfs.idmap. When the timeout +is not specified, nfs.idmap will default to 600 seconds. id mapper uses for key descriptions: uid: Find the UID for the given user @@ -39,29 +39,29 @@ would edit your request-key.conf so it look similar to this: #OP TYPE DESCRIPTION CALLOUT INFO PROGRAM ARG1 ARG2 ARG3 ... #====== ======= =============== =============== =============================== -create id_resolver uid:* * /some/other/program %k %d 600 -create id_resolver * * /usr/sbin/nfs.upcall %k %d 600 +create id_resolver uid:* * /some/other/program %k %d 600 +create id_resolver * * /usr/sbin/nfs.idmap %k %d 600 Notice that the new line was added above the line for the generic program. request-key will find the first matching line and corresponding program. In this case, /some/other/program will handle all uid lookups and -/usr/sbin/nfs.upcall will handle gid, user, and group lookups. +/usr/sbin/nfs.idmap will handle gid, user, and group lookups. See for more information about the request-key function. -========== -nfs.upcall -========== -nfs.upcall is designed to be called by request-key, and should not be run "by +========= +nfs.idmap +========= +nfs.idmap is designed to be called by request-key, and should not be run "by hand". This program takes two arguments, a serialized key and a key description. The serialized key is first converted into a key_serial_t, and then passed as an argument to keyctl_instantiate (both are part of keyutils.h). -The actual lookups are performed by functions found in nfsidmap.h. nfs.upcall +The actual lookups are performed by functions found in nfsidmap.h. nfs.idmap determines the correct function to call by looking at the first part of the description string. 
For example, a uid lookup description will appear as "uid:user@domain". -nfs.upcall will return 0 if the key was instantiated, and non-zero otherwise. +nfs.idmap will return 0 if the key was instantiated, and non-zero otherwise. -- cgit v1.2.3 From 0f4d208f1975f16f269134cee5f44c1f048581da Mon Sep 17 00:00:00 2001 From: Matt Mackall Date: Tue, 26 Oct 2010 14:21:22 -0700 Subject: Documentation/filesystems/proc.txt: improve smaps field documentation Signed-off-by: Matt Mackall Cc: Nikanth Karthikesan Cc: Balbir Singh Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index a6aca874088..a563b74c7ae 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -374,13 +374,13 @@ Swap: 0 kB KernelPageSize: 4 kB MMUPageSize: 4 kB -The first of these lines shows the same information as is displayed for the -mapping in /proc/PID/maps. The remaining lines show the size of the mapping, -the amount of the mapping that is currently resident in RAM, the "proportional -set size” (divide each shared page by the number of processes sharing it), the -number of clean and dirty shared pages in the mapping, and the number of clean -and dirty private pages in the mapping. The "Referenced" indicates the amount -of memory currently marked as referenced or accessed. +The first of these lines shows the same information as is displayed for the +mapping in /proc/PID/maps. The remaining lines show the size of the mapping +(size), the amount of the mapping that is currently resident in RAM (RSS), the +process' proportional share of this mapping (PSS), the number of clean and +dirty shared pages in the mapping, and the number of clean and dirty private +pages in the mapping. The "Referenced" indicates the amount of memory +currently marked as referenced or accessed. This file is only present if the CONFIG_MMU kernel configuration option is enabled. -- cgit v1.2.3 From b40d4f84becd69275451baee7f0801c85eb58437 Mon Sep 17 00:00:00 2001 From: Nikanth Karthikesan Date: Wed, 27 Oct 2010 15:34:10 -0700 Subject: /proc/pid/smaps: export amount of anonymous memory in a mapping Export the number of anonymous pages in a mapping via smaps. Even the private pages in a mapping backed by a file, would be marked as anonymous, when they are modified. Export this information to user-space via smaps. Exporting this count will help gdb to make a better decision on which areas need to be dumped in its coredump; and should be useful to others studying the memory usage of a process. 
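As a quick illustration of how the new field can be consumed from user space, the following sketch (hypothetical, not part of this patch) sums the per-mapping "Anonymous:" lines of /proc/PID/smaps to obtain a process-wide total in kB, assuming the smaps format shown above:

/*
 * Hypothetical example, not part of this patch: sum the "Anonymous:"
 * lines of /proc/PID/smaps to report the total anonymous memory
 * currently mapped by a process, in kB.
 */
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
	char path[64], line[256];
	unsigned long kb, total = 0;
	FILE *f;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <pid>\n", argv[0]);
		return 1;
	}

	snprintf(path, sizeof(path), "/proc/%s/smaps", argv[1]);
	f = fopen(path, "r");
	if (!f) {
		perror(path);
		return 1;
	}

	/* Each mapping contributes one "Anonymous: N kB" line. */
	while (fgets(line, sizeof(line), f))
		if (sscanf(line, "Anonymous: %lu kB", &kb) == 1)
			total += kb;

	fclose(f);
	printf("Anonymous: %lu kB\n", total);
	return 0;
}
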
Signed-off-by: Nikanth Karthikesan Acked-by: Hugh Dickins Reviewed-by: KOSAKI Motohiro Cc: Matt Mackall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 13 ++++++++++--- fs/proc/task_mmu.c | 6 ++++++ 2 files changed, 16 insertions(+), 3 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index a563b74c7ae..976de6e19dd 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -370,6 +370,7 @@ Shared_Dirty: 0 kB Private_Clean: 0 kB Private_Dirty: 0 kB Referenced: 892 kB +Anonymous: 0 kB Swap: 0 kB KernelPageSize: 4 kB MMUPageSize: 4 kB @@ -378,9 +379,15 @@ The first of these lines shows the same information as is displayed for the mapping in /proc/PID/maps. The remaining lines show the size of the mapping (size), the amount of the mapping that is currently resident in RAM (RSS), the process' proportional share of this mapping (PSS), the number of clean and -dirty shared pages in the mapping, and the number of clean and dirty private -pages in the mapping. The "Referenced" indicates the amount of memory -currently marked as referenced or accessed. +dirty private pages in the mapping. Note that even a page which is part of a +MAP_SHARED mapping, but has only a single pte mapped, i.e. is currently used +by only one process, is accounted as private and not as shared. "Referenced" +indicates the amount of memory currently marked as referenced or accessed. +"Anonymous" shows the amount of memory that does not belong to any file. Even +a mapping associated with a file may contain anonymous pages: when MAP_PRIVATE +and a page is modified, the file page is replaced by a private anonymous copy. +"Swap" shows how much would-be-anonymous memory is also used, but out on +swap. This file is only present if the CONFIG_MMU kernel configuration option is enabled. diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 871e25ed006..da6b01d70f0 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -327,6 +327,7 @@ struct mem_size_stats { unsigned long private_clean; unsigned long private_dirty; unsigned long referenced; + unsigned long anonymous; unsigned long swap; u64 pss; }; @@ -357,6 +358,9 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, if (!page) continue; + if (PageAnon(page)) + mss->anonymous += PAGE_SIZE; + mss->resident += PAGE_SIZE; /* Accumulate the size in pages that have been accessed. 
*/ if (pte_young(ptent) || PageReferenced(page)) @@ -410,6 +414,7 @@ static int show_smap(struct seq_file *m, void *v) "Private_Clean: %8lu kB\n" "Private_Dirty: %8lu kB\n" "Referenced: %8lu kB\n" + "Anonymous: %8lu kB\n" "Swap: %8lu kB\n" "KernelPageSize: %8lu kB\n" "MMUPageSize: %8lu kB\n", @@ -421,6 +426,7 @@ static int show_smap(struct seq_file *m, void *v) mss.private_clean >> 10, mss.private_dirty >> 10, mss.referenced >> 10, + mss.anonymous >> 10, mss.swap >> 10, vma_kernel_pagesize(vma) >> 10, vma_mmu_pagesize(vma) >> 10); -- cgit v1.2.3 From 03f890f8c2f5c9008d3d8f6d85267717ced4bd79 Mon Sep 17 00:00:00 2001 From: Nikanth Karthikesan Date: Wed, 27 Oct 2010 15:34:11 -0700 Subject: /proc/pid/pagemap: document in Documentation/filesystems/proc.txt Document /proc/pid/pagemap in Documentation/filesystems/proc.txt Signed-off-by: Nikanth Karthikesan Cc: Richard Guenther Cc: Balbir Singh Cc: KOSAKI Motohiro Acked-by: Matt Mackall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 976de6e19dd..e73df2722ff 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -136,6 +136,7 @@ Table 1-1: Process specific entries in /proc statm Process memory status information status Process status in human readable form wchan If CONFIG_KALLSYMS is set, a pre-decoded wchan + pagemap Page table stack Report full stack trace, enable via CONFIG_STACKTRACE smaps a extension based on maps, showing the memory consumption of each mapping @@ -404,6 +405,9 @@ To clear the bits for the file mapped pages associated with the process > echo 3 > /proc/PID/clear_refs Any other value written to /proc/PID/clear_refs will have no effect. +The /proc/pid/pagemap gives the PFN, which can be used to find the pageflags +using /proc/kpageflags and number of times a page is mapped using +/proc/kpagecount. For detailed explanation, see Documentation/vm/pagemap.txt. 1.2 Kernel data --------------- -- cgit v1.2.3 From bfff68738f1cb5c93dab1114634cea02aae9e7ba Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Wed, 27 Oct 2010 21:30:05 -0400 Subject: ext4: add support for lazy inode table initialization When the lazy_itable_init extended option is passed to mke2fs, it considerably speeds up filesystem creation because inode tables are not zeroed out. The fact that parts of the inode table are uninitialized is not a problem so long as the block group descriptors, which contain information regarding how much of the inode table has been initialized, has not been corrupted However, if the block group checksums are not valid, e2fsck must scan the entire inode table, and the the old, uninitialized data could potentially cause e2fsck to report false problems. Hence, it is important for the inode tables to be initialized as soon as possble. This commit adds this feature so that mke2fs can safely use the lazy inode table initialization feature to speed up formatting file systems. This is done via a new new kernel thread called ext4lazyinit, which is created on demand and destroyed, when it is no longer needed. There is only one thread for all ext4 filesystems in the system. When the first filesystem with inititable mount option is mounted, ext4lazyinit thread is created, then the filesystem can register its request in the request list. 
This thread then walks through the list of requests picking up scheduled requests and invoking ext4_init_inode_table(). Next schedule time for the request is computed by multiplying the time it took to zero out last inode table with wait multiplier, which can be set with the (init_itable=n) mount option (default is 10). We are doing this so we do not take the whole I/O bandwidth. When the thread is no longer necessary (request list is empty) it frees the appropriate structures and exits (and can be created later later by another filesystem). We do not disturb regular inode allocations in any way, it just do not care whether the inode table is, or is not zeroed. But when zeroing, we have to skip used inodes, obviously. Also we should prevent new inode allocations from the group, while zeroing is on the way. For that we take write alloc_sem lock in ext4_init_inode_table() and read alloc_sem in the ext4_claim_inode, so when we are unlucky and allocator hits the group which is currently being zeroed, it just has to wait. This can be suppresed using the mount option no_init_itable. Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" --- Documentation/filesystems/ext4.txt | 14 ++ fs/ext4/ext4.h | 40 ++++ fs/ext4/ialloc.c | 120 ++++++++++ fs/ext4/super.c | 440 ++++++++++++++++++++++++++++++++++++- 4 files changed, 611 insertions(+), 3 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt index e1def1786e5..6ab9442d7ee 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4.txt @@ -353,6 +353,20 @@ noauto_da_alloc replacing existing files via patterns such as system crashes before the delayed allocation blocks are forced to disk. +noinit_itable Do not initialize any uninitialized inode table + blocks in the background. This feature may be + used by installation CD's so that the install + process can complete as quickly as possible; the + inode table initialization process would then be + deferred until the next time the file system + is unmounted. + +init_itable=n The lazy itable init code will wait n times the + number of milliseconds it took to zero out the + previous block group's inode table. This + minimizes the impact on the systme performance + while file system's inode table is being initialized. + discard Controls whether ext4 should issue discard/TRIM nodiscard(*) commands to the underlying block device when blocks are freed. 
This is useful for SSD devices diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index b364b9df09b..0fe078d368d 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -890,6 +890,7 @@ struct ext4_inode_info { #define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ #define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ #define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */ +#define EXT4_MOUNT_INIT_INODE_TABLE 0x80000000 /* Initialize uninitialized itables */ #define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt #define set_opt(o, opt) o |= EXT4_MOUNT_##opt @@ -1173,6 +1174,11 @@ struct ext4_sb_info { /* timer for periodic error stats printing */ struct timer_list s_err_report; + + /* Lazy inode table initialization info */ + struct ext4_li_request *s_li_request; + /* Wait multiplier for lazy initialization thread */ + unsigned int s_li_wait_mult; }; static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) @@ -1536,6 +1542,38 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, extern struct proc_dir_entry *ext4_proc_root; +/* + * Timeout and state flag for lazy initialization inode thread. + */ +#define EXT4_DEF_LI_WAIT_MULT 10 +#define EXT4_DEF_LI_MAX_START_DELAY 5 +#define EXT4_LAZYINIT_QUIT 0x0001 +#define EXT4_LAZYINIT_RUNNING 0x0002 + +/* + * Lazy inode table initialization info + */ +struct ext4_lazy_init { + unsigned long li_state; + + wait_queue_head_t li_wait_daemon; + wait_queue_head_t li_wait_task; + struct timer_list li_timer; + struct task_struct *li_task; + + struct list_head li_request_list; + struct mutex li_list_mtx; +}; + +struct ext4_li_request { + struct super_block *lr_super; + struct ext4_sb_info *lr_sbi; + ext4_group_t lr_next_group; + struct list_head lr_request; + unsigned long lr_next_sched; + unsigned long lr_timeout; +}; + /* * Function prototypes */ @@ -1611,6 +1649,8 @@ extern unsigned ext4_init_inode_bitmap(struct super_block *sb, ext4_group_t group, struct ext4_group_desc *desc); extern void mark_bitmap_end(int start_bit, int end_bit, char *bitmap); +extern int ext4_init_inode_table(struct super_block *sb, + ext4_group_t group, int barrier); /* mballoc.c */ extern long ext4_mb_stats; diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 45853e0d1f2..e428f23215c 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -107,6 +107,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) desc = ext4_get_group_desc(sb, block_group, NULL); if (!desc) return NULL; + bitmap_blk = ext4_inode_bitmap(sb, desc); bh = sb_getblk(sb, bitmap_blk); if (unlikely(!bh)) { @@ -123,6 +124,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) unlock_buffer(bh); return bh; } + ext4_lock_group(sb, block_group); if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { ext4_init_inode_bitmap(sb, bh, block_group, desc); @@ -133,6 +135,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) return bh; } ext4_unlock_group(sb, block_group); + if (buffer_uptodate(bh)) { /* * if not uninit if bh is uptodate, @@ -712,8 +715,17 @@ static int ext4_claim_inode(struct super_block *sb, { int free = 0, retval = 0, count; struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_group_info *grp = ext4_get_group_info(sb, group); struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); + /* + * We have to be sure that new inode allocation does not race with + * inode table initialization, because otherwise we may end up + * allocating and writing 
new inode right before sb_issue_zeroout + * takes place and overwriting our new inode with zeroes. So we + * take alloc_sem to prevent it. + */ + down_read(&grp->alloc_sem); ext4_lock_group(sb, group); if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) { /* not a free inode */ @@ -724,6 +736,7 @@ static int ext4_claim_inode(struct super_block *sb, if ((group == 0 && ino < EXT4_FIRST_INO(sb)) || ino > EXT4_INODES_PER_GROUP(sb)) { ext4_unlock_group(sb, group); + up_read(&grp->alloc_sem); ext4_error(sb, "reserved inode or inode > inodes count - " "block_group = %u, inode=%lu", group, ino + group * EXT4_INODES_PER_GROUP(sb)); @@ -772,6 +785,7 @@ static int ext4_claim_inode(struct super_block *sb, gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); err_ret: ext4_unlock_group(sb, group); + up_read(&grp->alloc_sem); return retval; } @@ -1205,3 +1219,109 @@ unsigned long ext4_count_dirs(struct super_block * sb) } return count; } + +/* + * Zeroes not yet zeroed inode table - just write zeroes through the whole + * inode table. Must be called without any spinlock held. The only place + * where it is called from on active part of filesystem is ext4lazyinit + * thread, so we do not need any special locks, however we have to prevent + * inode allocation from the current group, so we take alloc_sem lock, to + * block ext4_claim_inode until we are finished. + */ +extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, + int barrier) +{ + struct ext4_group_info *grp = ext4_get_group_info(sb, group); + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_group_desc *gdp = NULL; + struct buffer_head *group_desc_bh; + handle_t *handle; + ext4_fsblk_t blk; + int num, ret = 0, used_blks = 0; + unsigned long flags = BLKDEV_IFL_WAIT; + + /* This should not happen, but just to be sure check this */ + if (sb->s_flags & MS_RDONLY) { + ret = 1; + goto out; + } + + gdp = ext4_get_group_desc(sb, group, &group_desc_bh); + if (!gdp) + goto out; + + /* + * We do not need to lock this, because we are the only one + * handling this flag. + */ + if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)) + goto out; + + handle = ext4_journal_start_sb(sb, 1); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + + down_write(&grp->alloc_sem); + /* + * If inode bitmap was already initialized there may be some + * used inodes so we need to skip blocks with used inodes in + * inode table. + */ + if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT))) + used_blks = DIV_ROUND_UP((EXT4_INODES_PER_GROUP(sb) - + ext4_itable_unused_count(sb, gdp)), + sbi->s_inodes_per_block); + + blk = ext4_inode_table(sb, gdp) + used_blks; + num = sbi->s_itb_per_group - used_blks; + + BUFFER_TRACE(group_desc_bh, "get_write_access"); + ret = ext4_journal_get_write_access(handle, + group_desc_bh); + if (ret) + goto err_out; + + if (unlikely(num > EXT4_INODES_PER_GROUP(sb))) { + ext4_error(sb, "Something is wrong with group %u\n" + "Used itable blocks: %d" + "Itable blocks per group: %lu\n", + group, used_blks, sbi->s_itb_per_group); + ret = 1; + goto err_out; + } + + /* + * Skip zeroout if the inode table is full. But we set the ZEROED + * flag anyway, because obviously, when it is full it does not need + * further zeroing. 
+ */ + if (unlikely(num == 0)) + goto skip_zeroout; + + ext4_debug("going to zero out inode table in group %d\n", + group); + if (barrier) + flags |= BLKDEV_IFL_BARRIER; + ret = sb_issue_zeroout(sb, blk, num, GFP_NOFS, flags); + if (ret < 0) + goto err_out; + +skip_zeroout: + ext4_lock_group(sb, group); + gdp->bg_flags |= cpu_to_le16(EXT4_BG_INODE_ZEROED); + gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); + ext4_unlock_group(sb, group); + + BUFFER_TRACE(group_desc_bh, + "call ext4_handle_dirty_metadata"); + ret = ext4_handle_dirty_metadata(handle, NULL, + group_desc_bh); + +err_out: + up_write(&grp->alloc_sem); + ext4_journal_stop(handle); +out: + return ret; +} diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 751997d2cef..5066537e5a3 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -41,6 +41,9 @@ #include #include +#include +#include + #include "ext4.h" #include "ext4_jbd2.h" #include "xattr.h" @@ -52,6 +55,8 @@ struct proc_dir_entry *ext4_proc_root; static struct kset *ext4_kset; +struct ext4_lazy_init *ext4_li_info; +struct mutex ext4_li_mtx; static int ext4_load_journal(struct super_block *, struct ext4_super_block *, unsigned long journal_devnum); @@ -70,6 +75,8 @@ static void ext4_write_super(struct super_block *sb); static int ext4_freeze(struct super_block *sb); static int ext4_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, struct vfsmount *mnt); +static void ext4_destroy_lazyinit_thread(void); +static void ext4_unregister_li_request(struct super_block *sb); #if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) static struct file_system_type ext3_fs_type = { @@ -720,6 +727,7 @@ static void ext4_put_super(struct super_block *sb) } del_timer(&sbi->s_err_report); + ext4_unregister_li_request(sb); ext4_release_system_zone(sb); ext4_mb_release(sb); ext4_ext_release(sb); @@ -1046,6 +1054,12 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) !(def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY)) seq_puts(seq, ",block_validity"); + if (!test_opt(sb, INIT_INODE_TABLE)) + seq_puts(seq, ",noinit_inode_table"); + else if (sbi->s_li_wait_mult) + seq_printf(seq, ",init_inode_table=%u", + (unsigned) sbi->s_li_wait_mult); + ext4_show_quota_options(seq, sb); return 0; @@ -1220,6 +1234,7 @@ enum { Opt_inode_readahead_blks, Opt_journal_ioprio, Opt_dioread_nolock, Opt_dioread_lock, Opt_discard, Opt_nodiscard, + Opt_init_inode_table, Opt_noinit_inode_table, }; static const match_table_t tokens = { @@ -1290,6 +1305,9 @@ static const match_table_t tokens = { {Opt_dioread_lock, "dioread_lock"}, {Opt_discard, "discard"}, {Opt_nodiscard, "nodiscard"}, + {Opt_init_inode_table, "init_itable=%u"}, + {Opt_init_inode_table, "init_itable"}, + {Opt_noinit_inode_table, "noinit_itable"}, {Opt_err, NULL}, }; @@ -1760,6 +1778,20 @@ set_qf_format: case Opt_dioread_lock: clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK); break; + case Opt_init_inode_table: + set_opt(sbi->s_mount_opt, INIT_INODE_TABLE); + if (args[0].from) { + if (match_int(&args[0], &option)) + return 0; + } else + option = EXT4_DEF_LI_WAIT_MULT; + if (option < 0) + return 0; + sbi->s_li_wait_mult = option; + break; + case Opt_noinit_inode_table: + clear_opt(sbi->s_mount_opt, INIT_INODE_TABLE); + break; default: ext4_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" " @@ -1943,7 +1975,8 @@ int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 block_group, } /* Called at mount-time, super-block is locked */ -static int 
ext4_check_descriptors(struct super_block *sb) +static int ext4_check_descriptors(struct super_block *sb, + ext4_group_t *first_not_zeroed) { struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block); @@ -1952,7 +1985,7 @@ static int ext4_check_descriptors(struct super_block *sb) ext4_fsblk_t inode_bitmap; ext4_fsblk_t inode_table; int flexbg_flag = 0; - ext4_group_t i; + ext4_group_t i, grp = sbi->s_groups_count; if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) flexbg_flag = 1; @@ -1968,6 +2001,10 @@ static int ext4_check_descriptors(struct super_block *sb) last_block = first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1); + if ((grp == sbi->s_groups_count) && + !(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))) + grp = i; + block_bitmap = ext4_block_bitmap(sb, gdp); if (block_bitmap < first_block || block_bitmap > last_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " @@ -2005,6 +2042,8 @@ static int ext4_check_descriptors(struct super_block *sb) if (!flexbg_flag) first_block += EXT4_BLOCKS_PER_GROUP(sb); } + if (NULL != first_not_zeroed) + *first_not_zeroed = grp; ext4_free_blocks_count_set(sbi->s_es, ext4_count_free_blocks(sb)); sbi->s_es->s_free_inodes_count =cpu_to_le32(ext4_count_free_inodes(sb)); @@ -2543,6 +2582,378 @@ static void print_daily_error_info(unsigned long arg) mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); /* Once a day */ } +static void ext4_lazyinode_timeout(unsigned long data) +{ + struct task_struct *p = (struct task_struct *)data; + wake_up_process(p); +} + +/* Find next suitable group and run ext4_init_inode_table */ +static int ext4_run_li_request(struct ext4_li_request *elr) +{ + struct ext4_group_desc *gdp = NULL; + ext4_group_t group, ngroups; + struct super_block *sb; + unsigned long timeout = 0; + int ret = 0; + + sb = elr->lr_super; + ngroups = EXT4_SB(sb)->s_groups_count; + + for (group = elr->lr_next_group; group < ngroups; group++) { + gdp = ext4_get_group_desc(sb, group, NULL); + if (!gdp) { + ret = 1; + break; + } + + if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))) + break; + } + + if (group == ngroups) + ret = 1; + + if (!ret) { + timeout = jiffies; + ret = ext4_init_inode_table(sb, group, + elr->lr_timeout ? 0 : 1); + if (elr->lr_timeout == 0) { + timeout = jiffies - timeout; + if (elr->lr_sbi->s_li_wait_mult) + timeout *= elr->lr_sbi->s_li_wait_mult; + else + timeout *= 20; + elr->lr_timeout = timeout; + } + elr->lr_next_sched = jiffies + elr->lr_timeout; + elr->lr_next_group = group + 1; + } + + return ret; +} + +/* + * Remove lr_request from the list_request and free the + * request tructure. Should be called with li_list_mtx held + */ +static void ext4_remove_li_request(struct ext4_li_request *elr) +{ + struct ext4_sb_info *sbi; + + if (!elr) + return; + + sbi = elr->lr_sbi; + + list_del(&elr->lr_request); + sbi->s_li_request = NULL; + kfree(elr); +} + +static void ext4_unregister_li_request(struct super_block *sb) +{ + struct ext4_li_request *elr = EXT4_SB(sb)->s_li_request; + + if (!ext4_li_info) + return; + + mutex_lock(&ext4_li_info->li_list_mtx); + ext4_remove_li_request(elr); + mutex_unlock(&ext4_li_info->li_list_mtx); +} + +/* + * This is the function where ext4lazyinit thread lives. It walks + * through the request list searching for next scheduled filesystem. + * When such a fs is found, run the lazy initialization request + * (ext4_rn_li_request) and keep track of the time spend in this + * function. 
Based on that time we compute next schedule time of + * the request. When walking through the list is complete, compute + * next waking time and put itself into sleep. + */ +static int ext4_lazyinit_thread(void *arg) +{ + struct ext4_lazy_init *eli = (struct ext4_lazy_init *)arg; + struct list_head *pos, *n; + struct ext4_li_request *elr; + unsigned long next_wakeup; + DEFINE_WAIT(wait); + int ret; + + BUG_ON(NULL == eli); + + eli->li_timer.data = (unsigned long)current; + eli->li_timer.function = ext4_lazyinode_timeout; + + eli->li_task = current; + wake_up(&eli->li_wait_task); + +cont_thread: + while (true) { + next_wakeup = MAX_JIFFY_OFFSET; + + mutex_lock(&eli->li_list_mtx); + if (list_empty(&eli->li_request_list)) { + mutex_unlock(&eli->li_list_mtx); + goto exit_thread; + } + + list_for_each_safe(pos, n, &eli->li_request_list) { + elr = list_entry(pos, struct ext4_li_request, + lr_request); + + if (time_after_eq(jiffies, elr->lr_next_sched)) + ret = ext4_run_li_request(elr); + + if (ret) { + ret = 0; + ext4_remove_li_request(elr); + continue; + } + + if (time_before(elr->lr_next_sched, next_wakeup)) + next_wakeup = elr->lr_next_sched; + } + mutex_unlock(&eli->li_list_mtx); + + if (freezing(current)) + refrigerator(); + + if (time_after_eq(jiffies, next_wakeup)) { + cond_resched(); + continue; + } + + eli->li_timer.expires = next_wakeup; + add_timer(&eli->li_timer); + prepare_to_wait(&eli->li_wait_daemon, &wait, + TASK_INTERRUPTIBLE); + if (time_before(jiffies, next_wakeup)) + schedule(); + finish_wait(&eli->li_wait_daemon, &wait); + } + +exit_thread: + /* + * It looks like the request list is empty, but we need + * to check it under the li_list_mtx lock, to prevent any + * additions into it, and of course we should lock ext4_li_mtx + * to atomically free the list and ext4_li_info, because at + * this point another ext4 filesystem could be registering + * new one. + */ + mutex_lock(&ext4_li_mtx); + mutex_lock(&eli->li_list_mtx); + if (!list_empty(&eli->li_request_list)) { + mutex_unlock(&eli->li_list_mtx); + mutex_unlock(&ext4_li_mtx); + goto cont_thread; + } + mutex_unlock(&eli->li_list_mtx); + del_timer_sync(&ext4_li_info->li_timer); + eli->li_task = NULL; + wake_up(&eli->li_wait_task); + + kfree(ext4_li_info); + ext4_li_info = NULL; + mutex_unlock(&ext4_li_mtx); + + return 0; +} + +static void ext4_clear_request_list(void) +{ + struct list_head *pos, *n; + struct ext4_li_request *elr; + + mutex_lock(&ext4_li_info->li_list_mtx); + if (list_empty(&ext4_li_info->li_request_list)) + return; + + list_for_each_safe(pos, n, &ext4_li_info->li_request_list) { + elr = list_entry(pos, struct ext4_li_request, + lr_request); + ext4_remove_li_request(elr); + } + mutex_unlock(&ext4_li_info->li_list_mtx); +} + +static int ext4_run_lazyinit_thread(void) +{ + struct task_struct *t; + + t = kthread_run(ext4_lazyinit_thread, ext4_li_info, "ext4lazyinit"); + if (IS_ERR(t)) { + int err = PTR_ERR(t); + ext4_clear_request_list(); + del_timer_sync(&ext4_li_info->li_timer); + kfree(ext4_li_info); + ext4_li_info = NULL; + printk(KERN_CRIT "EXT4: error %d creating inode table " + "initialization thread\n", + err); + return err; + } + ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING; + + wait_event(ext4_li_info->li_wait_task, ext4_li_info->li_task != NULL); + return 0; +} + +/* + * Check whether it make sense to run itable init. thread or not. + * If there is at least one uninitialized inode table, return + * corresponding group number, else the loop goes through all + * groups and return total number of groups. 
+ */ +static ext4_group_t ext4_has_uninit_itable(struct super_block *sb) +{ + ext4_group_t group, ngroups = EXT4_SB(sb)->s_groups_count; + struct ext4_group_desc *gdp = NULL; + + for (group = 0; group < ngroups; group++) { + gdp = ext4_get_group_desc(sb, group, NULL); + if (!gdp) + continue; + + if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))) + break; + } + + return group; +} + +static int ext4_li_info_new(void) +{ + struct ext4_lazy_init *eli = NULL; + + eli = kzalloc(sizeof(*eli), GFP_KERNEL); + if (!eli) + return -ENOMEM; + + eli->li_task = NULL; + INIT_LIST_HEAD(&eli->li_request_list); + mutex_init(&eli->li_list_mtx); + + init_waitqueue_head(&eli->li_wait_daemon); + init_waitqueue_head(&eli->li_wait_task); + init_timer(&eli->li_timer); + eli->li_state |= EXT4_LAZYINIT_QUIT; + + ext4_li_info = eli; + + return 0; +} + +static struct ext4_li_request *ext4_li_request_new(struct super_block *sb, + ext4_group_t start) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_li_request *elr; + unsigned long rnd; + + elr = kzalloc(sizeof(*elr), GFP_KERNEL); + if (!elr) + return NULL; + + elr->lr_super = sb; + elr->lr_sbi = sbi; + elr->lr_next_group = start; + + /* + * Randomize first schedule time of the request to + * spread the inode table initialization requests + * better. + */ + get_random_bytes(&rnd, sizeof(rnd)); + elr->lr_next_sched = jiffies + (unsigned long)rnd % + (EXT4_DEF_LI_MAX_START_DELAY * HZ); + + return elr; +} + +static int ext4_register_li_request(struct super_block *sb, + ext4_group_t first_not_zeroed) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_li_request *elr; + ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; + int ret = 0; + + if (sbi->s_li_request != NULL) + goto out; + + if (first_not_zeroed == ngroups || + (sb->s_flags & MS_RDONLY) || + !test_opt(sb, INIT_INODE_TABLE)) { + sbi->s_li_request = NULL; + goto out; + } + + if (first_not_zeroed == ngroups) { + sbi->s_li_request = NULL; + goto out; + } + + elr = ext4_li_request_new(sb, first_not_zeroed); + if (!elr) { + ret = -ENOMEM; + goto out; + } + + mutex_lock(&ext4_li_mtx); + + if (NULL == ext4_li_info) { + ret = ext4_li_info_new(); + if (ret) + goto out; + } + + mutex_lock(&ext4_li_info->li_list_mtx); + list_add(&elr->lr_request, &ext4_li_info->li_request_list); + mutex_unlock(&ext4_li_info->li_list_mtx); + + sbi->s_li_request = elr; + + if (!(ext4_li_info->li_state & EXT4_LAZYINIT_RUNNING)) { + ret = ext4_run_lazyinit_thread(); + if (ret) + goto out; + } + + mutex_unlock(&ext4_li_mtx); + +out: + if (ret) { + mutex_unlock(&ext4_li_mtx); + kfree(elr); + } + return ret; +} + +/* + * We do not need to lock anything since this is called on + * module unload. + */ +static void ext4_destroy_lazyinit_thread(void) +{ + /* + * If thread exited earlier + * there's nothing to be done. 
+ */ + if (!ext4_li_info) + return; + + ext4_clear_request_list(); + + while (ext4_li_info->li_task) { + wake_up(&ext4_li_info->li_wait_daemon); + wait_event(ext4_li_info->li_wait_task, + ext4_li_info->li_task == NULL); + } +} + static int ext4_fill_super(struct super_block *sb, void *data, int silent) __releases(kernel_lock) __acquires(kernel_lock) @@ -2568,6 +2979,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) __u64 blocks_count; int err; unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; + ext4_group_t first_not_zeroed; sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) @@ -2630,6 +3042,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) /* Set defaults before we parse the mount options */ def_mount_opts = le32_to_cpu(es->s_default_mount_opts); + set_opt(sbi->s_mount_opt, INIT_INODE_TABLE); if (def_mount_opts & EXT4_DEFM_DEBUG) set_opt(sbi->s_mount_opt, DEBUG); if (def_mount_opts & EXT4_DEFM_BSDGROUPS) { @@ -2909,7 +3322,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount2; } } - if (!ext4_check_descriptors(sb)) { + if (!ext4_check_descriptors(sb, &first_not_zeroed)) { ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); goto failed_mount2; } @@ -3130,6 +3543,10 @@ no_journal: goto failed_mount4; } + err = ext4_register_li_request(sb, first_not_zeroed); + if (err) + goto failed_mount4; + sbi->s_kobj.kset = ext4_kset; init_completion(&sbi->s_kobj_unregister); err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL, @@ -3847,6 +4264,19 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) enable_quota = 1; } } + + /* + * Reinitialize lazy itable initialization thread based on + * current settings + */ + if ((sb->s_flags & MS_RDONLY) || !test_opt(sb, INIT_INODE_TABLE)) + ext4_unregister_li_request(sb); + else { + ext4_group_t first_not_zeroed; + first_not_zeroed = ext4_has_uninit_itable(sb); + ext4_register_li_request(sb, first_not_zeroed); + } + ext4_setup_system_zone(sb); if (sbi->s_journal == NULL) ext4_commit_super(sb, 1); @@ -4317,6 +4747,9 @@ static int __init init_ext4_fs(void) err = register_filesystem(&ext4_fs_type); if (err) goto out; + + ext4_li_info = NULL; + mutex_init(&ext4_li_mtx); return 0; out: unregister_as_ext2(); @@ -4336,6 +4769,7 @@ out4: static void __exit exit_ext4_fs(void) { + ext4_destroy_lazyinit_thread(); unregister_as_ext2(); unregister_as_ext3(); unregister_filesystem(&ext4_fs_type); -- cgit v1.2.3 From 76381a42e4a5606774fd48413e6282cd7130ff2c Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 28 Sep 2010 00:27:41 +0530 Subject: fs/9p: Add access = client option to opt in acl evaluation. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Venkateswararao Jujjuri Signed-off-by: Eric Van Hensbergen --- Documentation/filesystems/9p.txt | 4 ++- fs/9p/acl.c | 78 +++++++++++++++++++++++++++++++++++++++- fs/9p/fid.c | 1 + fs/9p/v9fs.c | 22 +++++++++++- fs/9p/v9fs.h | 8 +++-- fs/9p/vfs_super.c | 4 +-- 6 files changed, 110 insertions(+), 7 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/9p.txt b/Documentation/filesystems/9p.txt index f9765e8cf08..b22abba78fe 100644 --- a/Documentation/filesystems/9p.txt +++ b/Documentation/filesystems/9p.txt @@ -111,7 +111,7 @@ OPTIONS This can be used to share devices/named pipes/sockets between hosts. This functionality will be expanded in later versions. - access there are three access modes. + access there are four access modes. 
user = if a user tries to access a file on v9fs filesystem for the first time, v9fs sends an attach command (Tattach) for that user. @@ -120,6 +120,8 @@ OPTIONS the files on the mounted filesystem any = v9fs does single attach and performs all operations as one user + client = ACL based access check on the 9p client + side for access validation cachetag cache tag to use the specified persistent cache. cache tags for existing cache sessions can be listed at diff --git a/fs/9p/acl.c b/fs/9p/acl.c index 8b3c54a4958..12d602351db 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -22,6 +22,7 @@ #include "xattr.h" #include "acl.h" #include "v9fs_vfs.h" +#include "v9fs.h" static struct posix_acl *__v9fs_get_acl(struct p9_fid *fid, char *name) { @@ -55,7 +56,14 @@ int v9fs_get_acl(struct inode *inode, struct p9_fid *fid) { int retval = 0; struct posix_acl *pacl, *dacl; + struct v9fs_session_info *v9ses; + v9ses = v9fs_inode2v9ses(inode); + if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) { + set_cached_acl(inode, ACL_TYPE_DEFAULT, NULL); + set_cached_acl(inode, ACL_TYPE_ACCESS, NULL); + return 0; + } /* get the default/access acl values and cache them */ dacl = __v9fs_get_acl(fid, POSIX_ACL_XATTR_DEFAULT); pacl = __v9fs_get_acl(fid, POSIX_ACL_XATTR_ACCESS); @@ -85,7 +93,18 @@ static struct posix_acl *v9fs_get_cached_acl(struct inode *inode, int type) int v9fs_check_acl(struct inode *inode, int mask) { - struct posix_acl *acl = v9fs_get_cached_acl(inode, ACL_TYPE_ACCESS); + struct posix_acl *acl; + struct v9fs_session_info *v9ses; + + v9ses = v9fs_inode2v9ses(inode); + if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) { + /* + * On access = client mode get the acl + * values from the server + */ + return 0; + } + acl = v9fs_get_cached_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR(acl)) return PTR_ERR(acl); @@ -204,15 +223,41 @@ cleanup: } +static int v9fs_remote_get_acl(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + char *full_name; + + switch (type) { + case ACL_TYPE_ACCESS: + full_name = POSIX_ACL_XATTR_ACCESS; + break; + case ACL_TYPE_DEFAULT: + full_name = POSIX_ACL_XATTR_DEFAULT; + break; + default: + BUG(); + } + return v9fs_xattr_get(dentry, full_name, buffer, size); +} + static int v9fs_xattr_get_acl(struct dentry *dentry, const char *name, void *buffer, size_t size, int type) { + struct v9fs_session_info *v9ses; struct posix_acl *acl; int error; if (strcmp(name, "") != 0) return -EINVAL; + v9ses = v9fs_inode2v9ses(dentry->d_inode); + /* + * We allow set/get/list of acl when access=client is not specified + */ + if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) + return v9fs_remote_get_acl(dentry, name, buffer, size, type); + acl = v9fs_get_cached_acl(dentry->d_inode, type); if (IS_ERR(acl)) return PTR_ERR(acl); @@ -224,16 +269,47 @@ static int v9fs_xattr_get_acl(struct dentry *dentry, const char *name, return error; } +static int v9fs_remote_set_acl(struct dentry *dentry, const char *name, + const void *value, size_t size, + int flags, int type) +{ + char *full_name; + + switch (type) { + case ACL_TYPE_ACCESS: + full_name = POSIX_ACL_XATTR_ACCESS; + break; + case ACL_TYPE_DEFAULT: + full_name = POSIX_ACL_XATTR_DEFAULT; + break; + default: + BUG(); + } + return v9fs_xattr_set(dentry, full_name, value, size, flags); +} + + static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name, const void *value, size_t size, int flags, int type) { int retval; struct posix_acl *acl; + struct v9fs_session_info *v9ses; struct inode 
*inode = dentry->d_inode; if (strcmp(name, "") != 0) return -EINVAL; + + v9ses = v9fs_inode2v9ses(dentry->d_inode); + /* + * set the attribute on the remote. Without even looking at the + * xattr value. We leave it to the server to validate + */ + if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) + return v9fs_remote_set_acl(dentry, name, + value, size, flags, type); + if (S_ISLNK(inode->i_mode)) return -EOPNOTSUPP; if (!is_owner_or_cap(inode)) diff --git a/fs/9p/fid.c b/fs/9p/fid.c index 6406f896bf9..b00223c99d7 100644 --- a/fs/9p/fid.c +++ b/fs/9p/fid.c @@ -149,6 +149,7 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry) switch (access) { case V9FS_ACCESS_SINGLE: case V9FS_ACCESS_USER: + case V9FS_ACCESS_CLIENT: uid = current_fsuid(); any = 0; break; diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 38dc0e06759..2f77cd33ba8 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -193,7 +193,17 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) v9ses->flags |= V9FS_ACCESS_USER; else if (strcmp(s, "any") == 0) v9ses->flags |= V9FS_ACCESS_ANY; - else { + else if (strcmp(s, "client") == 0) { +#ifdef CONFIG_9P_FS_POSIX_ACL + v9ses->flags |= V9FS_ACCESS_CLIENT; +#else + P9_DPRINTK(P9_DEBUG_ERROR, + "access=client option not supported\n"); + kfree(s); + ret = -EINVAL; + goto free_and_return; +#endif + } else { v9ses->flags |= V9FS_ACCESS_SINGLE; v9ses->uid = simple_strtoul(s, &e, 10); if (*e != '\0') @@ -278,6 +288,16 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ; + if (!v9fs_proto_dotl(v9ses) && + ((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_CLIENT)) { + /* + * We support ACCESS_CLIENT only for dotl. + * Fall back to ACCESS_USER + */ + v9ses->flags &= ~V9FS_ACCESS_MASK; + v9ses->flags |= V9FS_ACCESS_USER; + } + /*FIXME !! 
*/ /* for legacy mode, fall back to V9FS_ACCESS_ANY */ if (!(v9fs_proto_dotu(v9ses) || v9fs_proto_dotl(v9ses)) && ((v9ses->flags&V9FS_ACCESS_MASK) == V9FS_ACCESS_USER)) { diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index 4c963c9fc41..8bb7792afe2 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -33,13 +33,17 @@ * * Session flags reflect options selected by users at mount time */ +#define V9FS_ACCESS_ANY (V9FS_ACCESS_SINGLE | \ + V9FS_ACCESS_USER | \ + V9FS_ACCESS_CLIENT) +#define V9FS_ACCESS_MASK V9FS_ACCESS_ANY + enum p9_session_flags { V9FS_PROTO_2000U = 0x01, V9FS_PROTO_2000L = 0x02, V9FS_ACCESS_SINGLE = 0x04, V9FS_ACCESS_USER = 0x08, - V9FS_ACCESS_ANY = 0x0C, - V9FS_ACCESS_MASK = 0x0C, + V9FS_ACCESS_CLIENT = 0x10 }; /* possible values of ->cache */ diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 14da5778d44..174643f4f90 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -90,7 +90,8 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses, MS_NOATIME; #ifdef CONFIG_9P_FS_POSIX_ACL - sb->s_flags |= MS_POSIXACL; + if ((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_CLIENT) + sb->s_flags |= MS_POSIXACL; #endif save_mount_options(sb, data); @@ -181,7 +182,6 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags, retval = v9fs_get_acl(inode, fid); if (retval) goto release_sb; - v9fs_fid_add(root, fid); P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n"); -- cgit v1.2.3 From bb8430a2c8fe2b726033017daadf73c69b0348ea Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 31 Oct 2010 08:35:31 -0400 Subject: locks: remove fl_copy_lock lock_manager operation This one was only used for a nasty hack in nfsd, which has recently been removed. Signed-off-by: Christoph Hellwig Signed-off-by: Linus Torvalds --- Documentation/filesystems/Locking | 2 -- fs/locks.c | 5 +---- include/linux/fs.h | 1 - 3 files changed, 1 insertion(+), 7 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 8a817f656f0..a91f3089001 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -322,7 +322,6 @@ fl_release_private: yes yes prototypes: int (*fl_compare_owner)(struct file_lock *, struct file_lock *); void (*fl_notify)(struct file_lock *); /* unblock callback */ - void (*fl_copy_lock)(struct file_lock *, struct file_lock *); void (*fl_release_private)(struct file_lock *); void (*fl_break)(struct file_lock *); /* break_lease callback */ @@ -330,7 +329,6 @@ locking rules: BKL may block fl_compare_owner: yes no fl_notify: yes no -fl_copy_lock: yes no fl_release_private: yes yes fl_break: yes no diff --git a/fs/locks.c b/fs/locks.c index a2ab790471b..65765cb6afe 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -235,11 +235,8 @@ static void locks_copy_private(struct file_lock *new, struct file_lock *fl) fl->fl_ops->fl_copy_lock(new, fl); new->fl_ops = fl->fl_ops; } - if (fl->fl_lmops) { - if (fl->fl_lmops->fl_copy_lock) - fl->fl_lmops->fl_copy_lock(new, fl); + if (fl->fl_lmops) new->fl_lmops = fl->fl_lmops; - } } /* diff --git a/include/linux/fs.h b/include/linux/fs.h index 1eb29399a4f..334d68a1710 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1056,7 +1056,6 @@ struct lock_manager_operations { int (*fl_compare_owner)(struct file_lock *, struct file_lock *); void (*fl_notify)(struct file_lock *); /* unblock callback */ int (*fl_grant)(struct file_lock *, struct file_lock *, int); - void (*fl_copy_lock)(struct file_lock *, struct file_lock *); 
void (*fl_release_private)(struct file_lock *); void (*fl_break)(struct file_lock *); int (*fl_mylease)(struct file_lock *, struct file_lock *); -- cgit v1.2.3 From 5d0af85cd0964bb845b63d5059bb20e8f7731e65 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 Oct 2010 21:37:10 +0000 Subject: xfs: remove experimental tag from the delaylog option We promised to do this for 2.6.37, and the code looks stable enough to keep that promise. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Alex Elder --- Documentation/filesystems/xfs-delayed-logging-design.txt | 11 ----------- fs/xfs/linux-2.6/xfs_super.c | 3 --- 2 files changed, 14 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/xfs-delayed-logging-design.txt b/Documentation/filesystems/xfs-delayed-logging-design.txt index 96d0df28bed..7445bf335da 100644 --- a/Documentation/filesystems/xfs-delayed-logging-design.txt +++ b/Documentation/filesystems/xfs-delayed-logging-design.txt @@ -794,17 +794,6 @@ designed. Roadmap: -2.6.37 Remove experimental tag from mount option - => should be roughly 6 months after initial merge - => enough time to: - => gain confidence and fix problems reported by early - adopters (a.k.a. guinea pigs) - => address worst performance regressions and undesired - behaviours - => start tuning/optimising code for parallelism - => start tuning/optimising algorithms consuming - excessive CPU time - 2.6.39 Switch default mount option to use delayed logging => should be roughly 12 months after initial merge => enough time to shake out remaining problems before next round of diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 9f3a78fe6ae..064f964d4f3 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -353,9 +353,6 @@ xfs_parseargs( mp->m_qflags &= ~XFS_OQUOTA_ENFD; } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) { mp->m_flags |= XFS_MOUNT_DELAYLOG; - cmn_err(CE_WARN, - "Enabling EXPERIMENTAL delayed logging feature " - "- use at your own risk.\n"); } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) { mp->m_flags &= ~XFS_MOUNT_DELAYLOG; } else if (!strcmp(this_char, "ihashsize")) { -- cgit v1.2.3 From 23308ba54dcdb54481163bfb07dd8aeca76a7a2e Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Thu, 4 Nov 2010 16:20:24 +0100 Subject: console: add /proc/consoles It allows users to see what consoles are currently known to the system and with what flags. It is based on Werner's patch, the part about traversing fds was removed, the code was moved to kernel/printk.c, where consoles are handled and it makes more sense to me. Signed-off-by: Jiri Slaby [cleanups] Signed-off-by: "Dr. Werner Fink" Cc: Al Viro Cc: Greg Kroah-Hartman Signed-off-by: Greg Kroah-Hartman --- Documentation/filesystems/proc.txt | 24 ++++++++ fs/proc/Makefile | 1 + fs/proc/proc_console.c | 114 +++++++++++++++++++++++++++++++++++++ 3 files changed, 139 insertions(+) create mode 100644 fs/proc/proc_console.c (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index e73df2722ff..9471225212c 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -1181,6 +1181,30 @@ Table 1-12: Files in /proc/fs/ext4/ mb_groups details of multiblock allocator buddy cache of free blocks .............................................................................. +2.0 /proc/consoles +------------------ +Shows registered system console lines. 
+ +To see which character device lines are currently used for the system console +/dev/console, you may simply look into the file /proc/consoles: + + > cat /proc/consoles + tty0 -WU (ECp) 4:7 + ttyS0 -W- (Ep) 4:64 + +The columns are: + + device name of the device + operations R = can do read operations + W = can do write operations + U = can do unblank + flags E = it is enabled + C = it is prefered console + B = it is primary boot console + p = it is used for printk buffer + b = it is not a TTY but a Braille device + a = it is safe to use when cpu is offline + major:minor major and minor number of the device separated by a colon ------------------------------------------------------------------------------ Summary diff --git a/fs/proc/Makefile b/fs/proc/Makefile index 2758e2afc51..288a49e098b 100644 --- a/fs/proc/Makefile +++ b/fs/proc/Makefile @@ -15,6 +15,7 @@ proc-y += devices.o proc-y += interrupts.o proc-y += loadavg.o proc-y += meminfo.o +proc-y += proc_console.o proc-y += stat.o proc-y += uptime.o proc-y += version.o diff --git a/fs/proc/proc_console.c b/fs/proc/proc_console.c new file mode 100644 index 00000000000..8a707609f52 --- /dev/null +++ b/fs/proc/proc_console.c @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2010 Werner Fink, Jiri Slaby + * + * Licensed under GPLv2 + */ + +#include +#include +#include +#include +#include + +/* + * This is handler for /proc/consoles + */ +static int show_console_dev(struct seq_file *m, void *v) +{ + static const struct { + short flag; + char name; + } con_flags[] = { + { CON_ENABLED, 'E' }, + { CON_CONSDEV, 'C' }, + { CON_BOOT, 'B' }, + { CON_PRINTBUFFER, 'p' }, + { CON_BRL, 'b' }, + { CON_ANYTIME, 'a' }, + }; + char flags[ARRAY_SIZE(con_flags) + 1]; + struct console *con = v; + unsigned int a; + int len; + dev_t dev = 0; + + if (con->device) { + const struct tty_driver *driver; + int index; + driver = con->device(con, &index); + if (driver) { + dev = MKDEV(driver->major, driver->minor_start); + dev += index; + } + } + + for (a = 0; a < ARRAY_SIZE(con_flags); a++) + flags[a] = (con->flags & con_flags[a].flag) ? + con_flags[a].name : ' '; + flags[a] = 0; + + seq_printf(m, "%s%d%n", con->name, con->index, &len); + len = 21 - len; + if (len < 1) + len = 1; + seq_printf(m, "%*c%c%c%c (%s)", len, ' ', con->read ? 'R' : '-', + con->write ? 'W' : '-', con->unblank ? 
'U' : '-', + flags); + if (dev) + seq_printf(m, " %4d:%d", MAJOR(dev), MINOR(dev)); + + seq_printf(m, "\n"); + + return 0; +} + +static void *c_start(struct seq_file *m, loff_t *pos) +{ + struct console *con; + loff_t off = 0; + + acquire_console_sem(); + for_each_console(con) + if (off++ == *pos) + break; + + return con; +} + +static void *c_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct console *con = v; + ++*pos; + return con->next; +} + +static void c_stop(struct seq_file *m, void *v) +{ + release_console_sem(); +} + +static const struct seq_operations consoles_op = { + .start = c_start, + .next = c_next, + .stop = c_stop, + .show = show_console_dev +}; + +static int consoles_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &consoles_op); +} + +static const struct file_operations proc_consoles_operations = { + .open = consoles_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int register_proc_consoles(void) +{ + proc_create("consoles", 0, NULL, &proc_consoles_operations); + return 0; +} +module_init(register_proc_consoles); -- cgit v1.2.3 From 09c9feb94672bdb3ca6d424a292ffc26eff8ca0b Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 18 Nov 2010 12:27:32 -0800 Subject: Documentation: make configfs example code simpler, clearer If "p" is NULL then it will cause an oops when we pass it to simple_strtoul(). In this case "p" can not be NULL so I removed the check. I also changed the check a little to make it more explicit that we are testing whether p points to the NUL char. Signed-off-by: Dan Carpenter Acked-by: Joel Becker Signed-off-by: Randy Dunlap Signed-off-by: Linus Torvalds --- Documentation/filesystems/configfs/configfs_example_explicit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/configfs/configfs_example_explicit.c b/Documentation/filesystems/configfs/configfs_example_explicit.c index d428cc9f07f..fd53869f563 100644 --- a/Documentation/filesystems/configfs/configfs_example_explicit.c +++ b/Documentation/filesystems/configfs/configfs_example_explicit.c @@ -89,7 +89,7 @@ static ssize_t childless_storeme_write(struct childless *childless, char *p = (char *) page; tmp = simple_strtoul(p, &p, 10); - if (!p || (*p && (*p != '\n'))) + if ((*p != '\0') && (*p != '\n')) return -EINVAL; if (tmp > INT_MAX) -- cgit v1.2.3 From 6072d13c429373c5d63b69dadbbef40a9b035552 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 1 Dec 2010 13:35:19 -0500 Subject: Call the filesystem back whenever a page is removed from the page cache NFS needs to be able to release objects that are stored in the page cache once the page itself is no longer visible from the page cache. This patch adds a callback to the address space operations that allows filesystems to perform page cleanups once the page has been removed from the page cache. 
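For illustration only (the examplefs_* names are hypothetical and not part of this patch), a filesystem opting in to the new hook would add a ->freepage() entry to its address_space_operations. Unlike ->releasepage(), which may refuse to give the page up, ->freepage() is a pure notification: by the time it runs the page is already gone from the page cache, possibly via memory reclaim, so it must not block and must not assume the original mapping is still reachable.

/*
 * Hypothetical sketch of a filesystem using the new callback.
 * Names are invented for illustration.
 */
#include <linux/fs.h>

static void examplefs_freepage(struct page *page)
{
	/*
	 * Drop whatever per-page private state the filesystem kept
	 * while the page was visible in the page cache (for example,
	 * an entry in a private tracking structure).  No sleeping
	 * locks, no I/O: this may be called from reclaim.
	 */
}

static const struct address_space_operations examplefs_aops = {
	/* ... the usual readpage/writepage/releasepage entries ... */
	.freepage	= examplefs_freepage,
};
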
Original patch by: Linus Torvalds [trondmy: cover the cases of invalidate_inode_pages2() and truncate_inode_pages()] Signed-off-by: Trond Myklebust --- Documentation/filesystems/Locking | 7 ++++++- Documentation/filesystems/vfs.txt | 7 +++++++ include/linux/fs.h | 1 + mm/filemap.c | 5 +++++ mm/truncate.c | 4 ++++ mm/vmscan.c | 7 +++++++ 6 files changed, 30 insertions(+), 1 deletion(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index a91f3089001..b6426f15b4a 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -173,12 +173,13 @@ prototypes: sector_t (*bmap)(struct address_space *, sector_t); int (*invalidatepage) (struct page *, unsigned long); int (*releasepage) (struct page *, int); + void (*freepage)(struct page *); int (*direct_IO)(int, struct kiocb *, const struct iovec *iov, loff_t offset, unsigned long nr_segs); int (*launder_page) (struct page *); locking rules: - All except set_page_dirty may block + All except set_page_dirty and freepage may block BKL PageLocked(page) i_mutex writepage: no yes, unlocks (see below) @@ -193,6 +194,7 @@ perform_write: no n/a yes bmap: no invalidatepage: no yes releasepage: no yes +freepage: no yes direct_IO: no launder_page: no yes @@ -288,6 +290,9 @@ buffers from the page in preparation for freeing it. It returns zero to indicate that the buffers are (or may be) freeable. If ->releasepage is zero, the kernel assumes that the fs has no private interest in the buffers. + ->freepage() is called when the kernel is done dropping the page +from the page cache. + ->launder_page() may be called prior to releasing a page if it is still found to be dirty. It returns zero if the page was successfully cleaned, or an error value if not. Note that in order to prevent the page diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index ed7e5efc06d..3b14a557eca 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -534,6 +534,7 @@ struct address_space_operations { sector_t (*bmap)(struct address_space *, sector_t); int (*invalidatepage) (struct page *, unsigned long); int (*releasepage) (struct page *, int); + void (*freepage)(struct page *); ssize_t (*direct_IO)(int, struct kiocb *, const struct iovec *iov, loff_t offset, unsigned long nr_segs); struct page* (*get_xip_page)(struct address_space *, sector_t, @@ -679,6 +680,12 @@ struct address_space_operations { need to ensure this. Possibly it can clear the PageUptodate bit if it cannot free private data yet. + freepage: freepage is called once the page is no longer visible in + the page cache in order to allow the cleanup of any private + data. Since it may be called by the memory reclaimer, it + should not assume that the original address_space mapping still + exists, and it should not block. 
+ direct_IO: called by the generic read/write routines to perform direct_IO - that is IO requests which bypass the page cache and transfer data directly between the storage and the diff --git a/include/linux/fs.h b/include/linux/fs.h index c9e06cc70da..090f0eacde2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -602,6 +602,7 @@ struct address_space_operations { sector_t (*bmap)(struct address_space *, sector_t); void (*invalidatepage) (struct page *, unsigned long); int (*releasepage) (struct page *, gfp_t); + void (*freepage)(struct page *); ssize_t (*direct_IO)(int, struct kiocb *, const struct iovec *iov, loff_t offset, unsigned long nr_segs); int (*get_xip_mem)(struct address_space *, pgoff_t, int, diff --git a/mm/filemap.c b/mm/filemap.c index ea89840fc65..6b9aee20f24 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -143,13 +143,18 @@ void __remove_from_page_cache(struct page *page) void remove_from_page_cache(struct page *page) { struct address_space *mapping = page->mapping; + void (*freepage)(struct page *); BUG_ON(!PageLocked(page)); + freepage = mapping->a_ops->freepage; spin_lock_irq(&mapping->tree_lock); __remove_from_page_cache(page); spin_unlock_irq(&mapping->tree_lock); mem_cgroup_uncharge_cache_page(page); + + if (freepage) + freepage(page); } EXPORT_SYMBOL(remove_from_page_cache); diff --git a/mm/truncate.c b/mm/truncate.c index ba887bff48c..3c2d5ddfa0d 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -390,6 +390,10 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) __remove_from_page_cache(page); spin_unlock_irq(&mapping->tree_lock); mem_cgroup_uncharge_cache_page(page); + + if (mapping->a_ops->freepage) + mapping->a_ops->freepage(page); + page_cache_release(page); /* pagecache ref */ return 1; failed: diff --git a/mm/vmscan.c b/mm/vmscan.c index d31d7ce52c0..9ca587c6927 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -494,9 +494,16 @@ static int __remove_mapping(struct address_space *mapping, struct page *page) spin_unlock_irq(&mapping->tree_lock); swapcache_free(swap, page); } else { + void (*freepage)(struct page *); + + freepage = mapping->a_ops->freepage; + __remove_from_page_cache(page); spin_unlock_irq(&mapping->tree_lock); mem_cgroup_uncharge_cache_page(page); + + if (freepage != NULL) + freepage(page); } return 1; -- cgit v1.2.3 From 4fe65cab844e6d3d7d310e66a501d5e7242ecb54 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 2 Dec 2010 14:31:19 -0800 Subject: Documentation/filesystems/vfs.txt: fix ->repeasepage() description ->releasepage() does not remove the page from the mapping. Acked-by: Neil Brown Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/vfs.txt | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index ed7e5efc06d..55c28b79d8d 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -660,11 +660,10 @@ struct address_space_operations { releasepage: releasepage is called on PagePrivate pages to indicate that the page should be freed if possible. ->releasepage should remove any private data from the page and clear the - PagePrivate flag. It may also remove the page from the - address_space. If this fails for some reason, it may indicate - failure with a 0 return value. - This is used in two distinct though related cases. 
The first - is when the VM finds a clean page with no active users and + PagePrivate flag. If releasepage() fails for some reason, it must + indicate failure with a 0 return value. + releasepage() is used in two distinct though related cases. The + first is when the VM finds a clean page with no active users and wants to make it a free page. If ->releasepage succeeds, the page will be removed from the address_space and become free. -- cgit v1.2.3 From b83be6f20a0e468f715b14225c9f897538dfe5ad Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 16 Dec 2010 12:04:54 +0100 Subject: update Documentation/filesystems/Locking Mostly inspired by all the recent BKL removal changes, but a lot of older updates also weren't properly recorded. Signed-off-by: Christoph Hellwig Signed-off-by: Linus Torvalds --- Documentation/filesystems/Locking | 214 ++++++++++++++++++-------------------- 1 file changed, 102 insertions(+), 112 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index b6426f15b4a..7686e768449 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -18,7 +18,6 @@ prototypes: char *(*d_dname)((struct dentry *dentry, char *buffer, int buflen); locking rules: - none have BKL dcache_lock rename_lock ->d_lock may block d_revalidate: no no no yes d_hash no no no yes @@ -42,18 +41,23 @@ ata *); int (*rename) (struct inode *, struct dentry *, struct inode *, struct dentry *); int (*readlink) (struct dentry *, char __user *,int); - int (*follow_link) (struct dentry *, struct nameidata *); + void * (*follow_link) (struct dentry *, struct nameidata *); + void (*put_link) (struct dentry *, struct nameidata *, void *); void (*truncate) (struct inode *); int (*permission) (struct inode *, int, struct nameidata *); + int (*check_acl)(struct inode *, int); int (*setattr) (struct dentry *, struct iattr *); int (*getattr) (struct vfsmount *, struct dentry *, struct kstat *); int (*setxattr) (struct dentry *, const char *,const void *,size_t,int); ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t); ssize_t (*listxattr) (struct dentry *, char *, size_t); int (*removexattr) (struct dentry *, const char *); + void (*truncate_range)(struct inode *, loff_t, loff_t); + long (*fallocate)(struct inode *inode, int mode, loff_t offset, loff_t len); + int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, u64 len); locking rules: - all may block, none have BKL + all may block i_mutex(inode) lookup: yes create: yes @@ -66,19 +70,24 @@ rmdir: yes (both) (see below) rename: yes (all) (see below) readlink: no follow_link: no +put_link: no truncate: yes (see below) setattr: yes permission: no +check_acl: no getattr: no setxattr: yes getxattr: no listxattr: no removexattr: yes +truncate_range: yes +fallocate: no +fiemap: no Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_mutex on victim. cross-directory ->rename() has (per-superblock) ->s_vfs_rename_sem. ->truncate() is never called directly - it's a callback, not a -method. It's called by vmtruncate() - library function normally used by +method. It's called by vmtruncate() - deprecated library function used by ->setattr(). Locking information above applies to that call (i.e. is inherited from ->setattr() - vmtruncate() is used when ATTR_SIZE had been passed). 
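To make the ->setattr()/vmtruncate() relationship described above concrete, a minimal old-style setattr might look like the sketch below. This is illustrative only: 'foofs' is an invented name, vmtruncate() is the deprecated path the text refers to, and i_mutex is already held by the caller as the locking table states.

	#include <linux/fs.h>
	#include <linux/mm.h>

	static int foofs_setattr(struct dentry *dentry, struct iattr *attr)
	{
		struct inode *inode = dentry->d_inode;
		int error;

		error = inode_change_ok(inode, attr);	/* generic sanity checks */
		if (error)
			return error;

		if ((attr->ia_valid & ATTR_SIZE) &&
		    attr->ia_size != i_size_read(inode)) {
			error = vmtruncate(inode, attr->ia_size);
			if (error)
				return error;
		}

		setattr_copy(inode, attr);
		mark_inode_dirty(inode);
		return 0;
	}
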
@@ -91,7 +100,7 @@ prototypes: struct inode *(*alloc_inode)(struct super_block *sb); void (*destroy_inode)(struct inode *); void (*dirty_inode) (struct inode *); - int (*write_inode) (struct inode *, int); + int (*write_inode) (struct inode *, struct writeback_control *wbc); int (*drop_inode) (struct inode *); void (*evict_inode) (struct inode *); void (*put_super) (struct super_block *); @@ -105,10 +114,11 @@ prototypes: int (*show_options)(struct seq_file *, struct vfsmount *); ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); + int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t); + int (*trim_fs) (struct super_block *, struct fstrim_range *); locking rules: All may block [not true, see below] - None have BKL s_umount alloc_inode: destroy_inode: @@ -127,6 +137,8 @@ umount_begin: no show_options: no (namespace_sem) quota_read: no (see below) quota_write: no (see below) +bdev_try_to_free_page: no (see below) +trim_fs: no ->statfs() has s_umount (shared) when called by ustat(2) (native or compat), but that's an accident of bad API; s_umount is used to pin @@ -139,19 +151,25 @@ be the only ones operating on the quota file by the quota code (via dqio_sem) (unless an admin really wants to screw up something and writes to quota files with quotas on). For other details about locking see also dquot_operations section. +->bdev_try_to_free_page is called from the ->releasepage handler of +the block device inode. See there for more details. --------------------------- file_system_type --------------------------- prototypes: int (*get_sb) (struct file_system_type *, int, const char *, void *, struct vfsmount *); + struct dentry *(*mount) (struct file_system_type *, int, + const char *, void *); void (*kill_sb) (struct super_block *); locking rules: - may block BKL -get_sb yes no -kill_sb yes no + may block +get_sb yes +mount yes +kill_sb yes ->get_sb() returns error or 0 with locked superblock attached to the vfsmount (exclusive on ->s_umount). +->mount() returns ERR_PTR or the root dentry. ->kill_sb() takes a write-locked superblock, does all shutdown work on it, unlocks and drops the reference. 
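Under the convention just described, most filesystems implement ->mount() as a thin wrapper around one of the generic helpers and let a matching kill_sb helper undo it. The sketch below is a hypothetical block-device-backed example; 'foofs' and foofs_fill_super() are invented names, while mount_bdev() and kill_block_super() are the usual generic pair.

	#include <linux/fs.h>
	#include <linux/module.h>

	/* foofs_fill_super() is assumed to be defined elsewhere. */

	static struct dentry *foofs_mount(struct file_system_type *fs_type,
					  int flags, const char *dev_name,
					  void *data)
	{
		/* Returns the root dentry on success, ERR_PTR() on failure. */
		return mount_bdev(fs_type, flags, dev_name, data,
				  foofs_fill_super);
	}

	static struct file_system_type foofs_fs_type = {
		.owner		= THIS_MODULE,
		.name		= "foofs",
		.mount		= foofs_mount,
		.kill_sb	= kill_block_super,
		.fs_flags	= FS_REQUIRES_DEV,
	};
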
@@ -176,27 +194,35 @@ prototypes: void (*freepage)(struct page *); int (*direct_IO)(int, struct kiocb *, const struct iovec *iov, loff_t offset, unsigned long nr_segs); - int (*launder_page) (struct page *); + int (*get_xip_mem)(struct address_space *, pgoff_t, int, void **, + unsigned long *); + int (*migratepage)(struct address_space *, struct page *, struct page *); + int (*launder_page)(struct page *); + int (*is_partially_uptodate)(struct page *, read_descriptor_t *, unsigned long); + int (*error_remove_page)(struct address_space *, struct page *); locking rules: All except set_page_dirty and freepage may block - BKL PageLocked(page) i_mutex -writepage: no yes, unlocks (see below) -readpage: no yes, unlocks -sync_page: no maybe -writepages: no -set_page_dirty no no -readpages: no -write_begin: no locks the page yes -write_end: no yes, unlocks yes -perform_write: no n/a yes -bmap: no -invalidatepage: no yes -releasepage: no yes -freepage: no yes -direct_IO: no -launder_page: no yes + PageLocked(page) i_mutex +writepage: yes, unlocks (see below) +readpage: yes, unlocks +sync_page: maybe +writepages: +set_page_dirty no +readpages: +write_begin: locks the page yes +write_end: yes, unlocks yes +bmap: +invalidatepage: yes +releasepage: yes +freepage: yes +direct_IO: +get_xip_mem: maybe +migratepage: yes (both) +launder_page: yes +is_partially_uptodate: yes +error_remove_page: yes ->write_begin(), ->write_end(), ->sync_page() and ->readpage() may be called from the request handler (/dev/loop). @@ -276,9 +302,8 @@ under spinlock (it cannot block) and is sometimes called with the page not locked. ->bmap() is currently used by legacy ioctl() (FIBMAP) provided by some -filesystems and by the swapper. The latter will eventually go away. All -instances do not actually need the BKL. Please, keep it that way and don't -breed new callers. +filesystems and by the swapper. The latter will eventually go away. Please, +keep it that way and don't breed new callers. ->invalidatepage() is called when the filesystem must attempt to drop some or all of the buffers from the page when it is being truncated. It @@ -299,47 +324,37 @@ cleaned, or an error value if not. Note that in order to prevent the page getting mapped back in and redirtied, it needs to be kept locked across the entire operation. - Note: currently almost all instances of address_space methods are -using BKL for internal serialization and that's one of the worst sources -of contention. Normally they are calling library functions (in fs/buffer.c) -and pass foo_get_block() as a callback (on local block-based filesystems, -indeed). BKL is not needed for library stuff and is usually taken by -foo_get_block(). It's an overkill, since block bitmaps can be protected by -internal fs locking and real critical areas are much smaller than the areas -filesystems protect now. 
- ----------------------- file_lock_operations ------------------------------ prototypes: - void (*fl_insert)(struct file_lock *); /* lock insertion callback */ - void (*fl_remove)(struct file_lock *); /* lock removal callback */ void (*fl_copy_lock)(struct file_lock *, struct file_lock *); void (*fl_release_private)(struct file_lock *); locking rules: - BKL may block -fl_insert: yes no -fl_remove: yes no -fl_copy_lock: yes no -fl_release_private: yes yes + file_lock_lock may block +fl_copy_lock: yes no +fl_release_private: maybe no ----------------------- lock_manager_operations --------------------------- prototypes: int (*fl_compare_owner)(struct file_lock *, struct file_lock *); void (*fl_notify)(struct file_lock *); /* unblock callback */ + int (*fl_grant)(struct file_lock *, struct file_lock *, int); void (*fl_release_private)(struct file_lock *); void (*fl_break)(struct file_lock *); /* break_lease callback */ + int (*fl_mylease)(struct file_lock *, struct file_lock *); + int (*fl_change)(struct file_lock **, int); locking rules: - BKL may block -fl_compare_owner: yes no -fl_notify: yes no -fl_release_private: yes yes -fl_break: yes no - - Currently only NFSD and NLM provide instances of this class. None of the -them block. If you have out-of-tree instances - please, show up. Locking -in that area will change. + file_lock_lock may block +fl_compare_owner: yes no +fl_notify: yes no +fl_grant: no no +fl_release_private: maybe no +fl_break: yes no +fl_mylease: yes no +fl_change yes no + --------------------------- buffer_head ----------------------------------- prototypes: void (*b_end_io)(struct buffer_head *bh, int uptodate); @@ -364,17 +379,17 @@ prototypes: void (*swap_slot_free_notify) (struct block_device *, unsigned long); locking rules: - BKL bd_mutex -open: no yes -release: no yes -ioctl: no no -compat_ioctl: no no -direct_access: no no -media_changed: no no -unlock_native_capacity: no no -revalidate_disk: no no -getgeo: no no -swap_slot_free_notify: no no (see below) + bd_mutex +open: yes +release: yes +ioctl: no +compat_ioctl: no +direct_access: no +media_changed: no +unlock_native_capacity: no +revalidate_disk: no +getgeo: no +swap_slot_free_notify: no (see below) media_changed, unlock_native_capacity and revalidate_disk are called only from check_disk_change(). @@ -413,34 +428,21 @@ prototypes: unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); int (*check_flags)(int); + int (*flock) (struct file *, int, struct file_lock *); + ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, + size_t, unsigned int); + ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, + size_t, unsigned int); + int (*setlease)(struct file *, long, struct file_lock **); }; locking rules: - All may block. - BKL -llseek: no (see below) -read: no -aio_read: no -write: no -aio_write: no -readdir: no -poll: no -unlocked_ioctl: no -compat_ioctl: no -mmap: no -open: no -flush: no -release: no -fsync: no (see below) -aio_fsync: no -fasync: no -lock: yes -readv: no -writev: no -sendfile: no -sendpage: no -get_unmapped_area: no -check_flags: no + All may block except for ->setlease. + No VFS locks held on entry except for ->fsync and ->setlease. + +->fsync() has i_mutex on inode. + +->setlease has the file_list_lock held and must not sleep. ->llseek() locking has moved from llseek to the individual llseek implementations. 
If your fs is not using generic_file_llseek, you @@ -450,17 +452,10 @@ mutex or just to use i_size_read() instead. Note: this does not protect the file->f_pos against concurrent modifications since this is something the userspace has to take care about. -Note: ext2_release() was *the* source of contention on fs-intensive -loads and dropping BKL on ->release() helps to get rid of that (we still -grab BKL for cases when we close a file that had been opened r/w, but that -can and should be done using the internal locking with smaller critical areas). -Current worst offender is ext2_get_block()... - -->fasync() is called without BKL protection, and is responsible for -maintaining the FASYNC bit in filp->f_flags. Most instances call -fasync_helper(), which does that maintenance, so it's not normally -something one needs to worry about. Return values > 0 will be mapped to -zero in the VFS layer. +->fasync() is responsible for maintaining the FASYNC bit in filp->f_flags. +Most instances call fasync_helper(), which does that maintenance, so it's +not normally something one needs to worry about. Return values > 0 will be +mapped to zero in the VFS layer. ->readdir() and ->ioctl() on directories must be changed. Ideally we would move ->readdir() to inode_operations and use a separate method for directory @@ -471,8 +466,6 @@ components. And there are other reasons why the current interface is a mess... ->read on directories probably must go away - we should just enforce -EISDIR in sys_read() and friends. -->fsync() has i_mutex on inode. - --------------------------- dquot_operations ------------------------------- prototypes: int (*write_dquot) (struct dquot *); @@ -507,12 +500,12 @@ prototypes: int (*access)(struct vm_area_struct *, unsigned long, void*, int, int); locking rules: - BKL mmap_sem PageLocked(page) -open: no yes -close: no yes -fault: no yes can return with page locked -page_mkwrite: no yes can return with page locked -access: no yes + mmap_sem PageLocked(page) +open: yes +close: yes +fault: yes can return with page locked +page_mkwrite: yes can return with page locked +access: yes ->fault() is called when a previously not present pte is about to be faulted in. The filesystem must find and return the page associated @@ -539,6 +532,3 @@ VM_IO | VM_PFNMAP VMAs. (if you break something or notice that it is broken and do not fix it yourself - at least put it here) - -ipc/shm.c::shm_delete() - may need BKL. -->read() and ->write() in many drivers are (probably) missing BKL. -- cgit v1.2.3 From 8a87694ed159d7abd2c9ed657416696c05db2252 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 4 Jan 2011 07:14:24 +0100 Subject: remove trim_fs method from Documentation/filesystems/Locking The ->trim_fs has been removed meanwhile, so remove it from the documentation as well. 
Signed-off-by: Christoph Hellwig Reported-by: Ryusuke Konishi Signed-off-by: Linus Torvalds --- Documentation/filesystems/Locking | 2 -- 1 file changed, 2 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 7686e768449..33fa3e5d38f 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -115,7 +115,6 @@ prototypes: ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t); - int (*trim_fs) (struct super_block *, struct fstrim_range *); locking rules: All may block [not true, see below] @@ -138,7 +137,6 @@ show_options: no (namespace_sem) quota_read: no (see below) quota_write: no (see below) bdev_try_to_free_page: no (see below) -trim_fs: no ->statfs() has s_umount (shared) when called by ustat(2) (native or compat), but that's an accident of bad API; s_umount is used to pin -- cgit v1.2.3 From fe15ce446beb3a33583af81ffe6c9d01a75314ed Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:23 +1100 Subject: fs: change d_delete semantics Change d_delete from a dentry deletion notification to a dentry caching advise, more like ->drop_inode. Require it to be constant and idempotent, and not take d_lock. This is how all existing filesystems use the callback anyway. This makes fine grained dentry locking of dput and dentry lru scanning much simpler. Signed-off-by: Nick Piggin --- Documentation/filesystems/porting | 8 ++++++++ Documentation/filesystems/vfs.txt | 27 +++++++++++++-------------- arch/ia64/kernel/perfmon.c | 2 +- drivers/staging/smbfs/dir.c | 4 ++-- fs/9p/vfs_dentry.c | 4 ++-- fs/afs/dir.c | 4 ++-- fs/btrfs/inode.c | 2 +- fs/coda/dir.c | 4 ++-- fs/configfs/dir.c | 2 +- fs/dcache.c | 2 -- fs/gfs2/dentry.c | 2 +- fs/hostfs/hostfs_kern.c | 2 +- fs/libfs.c | 2 +- fs/ncpfs/dir.c | 4 ++-- fs/nfs/dir.c | 2 +- fs/proc/base.c | 2 +- fs/proc/generic.c | 2 +- fs/proc/proc_sysctl.c | 2 +- fs/sysfs/dir.c | 2 +- include/linux/dcache.h | 6 +++--- kernel/cgroup.c | 2 +- net/sunrpc/rpc_pipe.c | 2 +- 22 files changed, 47 insertions(+), 42 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting index b12c8953868..9e71c9ad310 100644 --- a/Documentation/filesystems/porting +++ b/Documentation/filesystems/porting @@ -318,3 +318,11 @@ if it's zero is not *and* *never* *had* *been* enough. Final unlink() and iput( may happen while the inode is in the middle of ->write_inode(); e.g. if you blindly free the on-disk inode, you may end up doing that while ->write_inode() is writing to it. + +--- +[mandatory] + + .d_delete() now only advises the dcache as to whether or not to cache +unreferenced dentries, and is now only called when the dentry refcount goes to +0. Even on 0 refcount transition, it must be able to tolerate being called 0, +1, or more times (eg. constant, idempotent). 
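Given the rules above, the simplest conforming ->d_delete() is a constant function. The following hypothetical example ('foofs' is an invented name) mirrors the pattern used by the in-tree filesystems converted in this patch: it is constant, idempotent, takes no locks, and merely advises the dcache not to keep unreferenced dentries around.

	#include <linux/dcache.h>

	/* Never cache unreferenced dentries; constant and idempotent. */
	static int foofs_d_delete(const struct dentry *dentry)
	{
		return 1;	/* 1 = drop immediately, 0 = keep it cached */
	}

	static const struct dentry_operations foofs_dentry_operations = {
		.d_delete	= foofs_d_delete,
	};
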
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 20899e095e7..95c0a93f056 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -847,9 +847,9 @@ defined: struct dentry_operations { int (*d_revalidate)(struct dentry *, struct nameidata *); - int (*d_hash) (struct dentry *, struct qstr *); - int (*d_compare) (struct dentry *, struct qstr *, struct qstr *); - int (*d_delete)(struct dentry *); + int (*d_hash)(struct dentry *, struct qstr *); + int (*d_compare)(struct dentry *, struct qstr *, struct qstr *); + int (*d_delete)(const struct dentry *); void (*d_release)(struct dentry *); void (*d_iput)(struct dentry *, struct inode *); char *(*d_dname)(struct dentry *, char *, int); @@ -864,9 +864,11 @@ struct dentry_operations { d_compare: called when a dentry should be compared with another - d_delete: called when the last reference to a dentry is - deleted. This means no-one is using the dentry, however it is - still valid and in the dcache + d_delete: called when the last reference to a dentry is dropped and the + dcache is deciding whether or not to cache it. Return 1 to delete + immediately, or 0 to cache the dentry. Default is NULL which means to + always cache a reachable dentry. d_delete must be constant and + idempotent. d_release: called when a dentry is really deallocated @@ -910,14 +912,11 @@ manipulate dentries: the usage count) dput: close a handle for a dentry (decrements the usage count). If - the usage count drops to 0, the "d_delete" method is called - and the dentry is placed on the unused list if the dentry is - still in its parents hash list. Putting the dentry on the - unused list just means that if the system needs some RAM, it - goes through the unused list of dentries and deallocates them. - If the dentry has already been unhashed and the usage count - drops to 0, in this case the dentry is deallocated after the - "d_delete" method is called + the usage count drops to 0, and the dentry is still in its + parent's hash, the "d_delete" method is called to check whether + it should be cached. If it should not be cached, or if the dentry + is not hashed, it is deleted. Otherwise cached dentries are put + into an LRU list to be reclaimed on memory shortage. d_drop: this unhashes a dentry from its parents hash list. A subsequent call to dput() will deallocate the dentry if its diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index 39e534f5a3b..d39d8a53b57 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -2185,7 +2185,7 @@ static const struct file_operations pfm_file_ops = { }; static int -pfmfs_delete_dentry(struct dentry *dentry) +pfmfs_delete_dentry(const struct dentry *dentry) { return 1; } diff --git a/drivers/staging/smbfs/dir.c b/drivers/staging/smbfs/dir.c index f088ea2f6ac..2270d4822c2 100644 --- a/drivers/staging/smbfs/dir.c +++ b/drivers/staging/smbfs/dir.c @@ -276,7 +276,7 @@ smb_dir_open(struct inode *dir, struct file *file) static int smb_lookup_validate(struct dentry *, struct nameidata *); static int smb_hash_dentry(struct dentry *, struct qstr *); static int smb_compare_dentry(struct dentry *, struct qstr *, struct qstr *); -static int smb_delete_dentry(struct dentry *); +static int smb_delete_dentry(const struct dentry *); static const struct dentry_operations smbfs_dentry_operations = { @@ -367,7 +367,7 @@ out: * We use this to unhash dentries with bad inodes. 
*/ static int -smb_delete_dentry(struct dentry * dentry) +smb_delete_dentry(const struct dentry *dentry) { if (dentry->d_inode) { if (is_bad_inode(dentry->d_inode)) { diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c index cbf4e50f393..466d2a4fc5c 100644 --- a/fs/9p/vfs_dentry.c +++ b/fs/9p/vfs_dentry.c @@ -51,7 +51,7 @@ * */ -static int v9fs_dentry_delete(struct dentry *dentry) +static int v9fs_dentry_delete(const struct dentry *dentry) { P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_name.name, dentry); @@ -68,7 +68,7 @@ static int v9fs_dentry_delete(struct dentry *dentry) * */ -static int v9fs_cached_dentry_delete(struct dentry *dentry) +static int v9fs_cached_dentry_delete(const struct dentry *dentry) { struct inode *inode = dentry->d_inode; P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_name.name, diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 5439e1bc9a8..2c18cde2700 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -23,7 +23,7 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, static int afs_dir_open(struct inode *inode, struct file *file); static int afs_readdir(struct file *file, void *dirent, filldir_t filldir); static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd); -static int afs_d_delete(struct dentry *dentry); +static int afs_d_delete(const struct dentry *dentry); static void afs_d_release(struct dentry *dentry); static int afs_lookup_filldir(void *_cookie, const char *name, int nlen, loff_t fpos, u64 ino, unsigned dtype); @@ -730,7 +730,7 @@ out_bad: * - called from dput() when d_count is going to 0. * - return 1 to request dentry be unhashed, 0 otherwise */ -static int afs_d_delete(struct dentry *dentry) +static int afs_d_delete(const struct dentry *dentry) { _enter("%s", dentry->d_name.name); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 72f31ecb5c9..7ce9f893278 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4127,7 +4127,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) return inode; } -static int btrfs_dentry_delete(struct dentry *dentry) +static int btrfs_dentry_delete(const struct dentry *dentry) { struct btrfs_root *root; diff --git a/fs/coda/dir.c b/fs/coda/dir.c index 5d8b3553960..4cce3b07d9d 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -47,7 +47,7 @@ static int coda_readdir(struct file *file, void *buf, filldir_t filldir); /* dentry ops */ static int coda_dentry_revalidate(struct dentry *de, struct nameidata *nd); -static int coda_dentry_delete(struct dentry *); +static int coda_dentry_delete(const struct dentry *); /* support routines */ static int coda_venus_readdir(struct file *coda_file, void *buf, @@ -577,7 +577,7 @@ out: * This is the callback from dput() when d_count is going to 0. * We use this to unhash dentries with bad inodes. */ -static int coda_dentry_delete(struct dentry * dentry) +static int coda_dentry_delete(const struct dentry * dentry) { int flags; diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 57870696941..20024a9ef5a 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -67,7 +67,7 @@ static void configfs_d_iput(struct dentry * dentry, * We _must_ delete our dentries on last dput, as the chain-to-parent * behavior is required to clear the parents of default_groups. 
*/ -static int configfs_d_delete(struct dentry *dentry) +static int configfs_d_delete(const struct dentry *dentry) { return 1; } diff --git a/fs/dcache.c b/fs/dcache.c index b2cb2662ca0..6ee6bc40cb6 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -453,8 +453,6 @@ static void prune_one_dentry(struct dentry * dentry) if (!atomic_dec_and_lock(&dentry->d_count, &dentry->d_lock)) return; - if (dentry->d_op && dentry->d_op->d_delete) - dentry->d_op->d_delete(dentry); dentry_lru_del(dentry); __d_drop(dentry); dentry = d_kill(dentry); diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c index 6798755b385..e80fea2f65f 100644 --- a/fs/gfs2/dentry.c +++ b/fs/gfs2/dentry.c @@ -106,7 +106,7 @@ static int gfs2_dhash(struct dentry *dentry, struct qstr *str) return 0; } -static int gfs2_dentry_delete(struct dentry *dentry) +static int gfs2_dentry_delete(const struct dentry *dentry) { struct gfs2_inode *ginode; diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 2c0f148a49e..cfe8bc7de51 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -32,7 +32,7 @@ static inline struct hostfs_inode_info *HOSTFS_I(struct inode *inode) #define FILE_HOSTFS_I(file) HOSTFS_I((file)->f_path.dentry->d_inode) -static int hostfs_d_delete(struct dentry *dentry) +static int hostfs_d_delete(const struct dentry *dentry) { return 1; } diff --git a/fs/libfs.c b/fs/libfs.c index a3accdf528a..b9d25d83e22 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -37,7 +37,7 @@ int simple_statfs(struct dentry *dentry, struct kstatfs *buf) * Retaining negative dentries for an in-memory filesystem just wastes * memory and lookup time: arrange for them to be deleted immediately. */ -static int simple_delete_dentry(struct dentry *dentry) +static int simple_delete_dentry(const struct dentry *dentry) { return 1; } diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index f22b12e7d33..d6e6453881c 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -76,7 +76,7 @@ const struct inode_operations ncp_dir_inode_operations = static int ncp_lookup_validate(struct dentry *, struct nameidata *); static int ncp_hash_dentry(struct dentry *, struct qstr *); static int ncp_compare_dentry (struct dentry *, struct qstr *, struct qstr *); -static int ncp_delete_dentry(struct dentry *); +static int ncp_delete_dentry(const struct dentry *); static const struct dentry_operations ncp_dentry_operations = { @@ -162,7 +162,7 @@ ncp_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b) * Closing files can be safely postponed until iput() - it's done there anyway. */ static int -ncp_delete_dentry(struct dentry * dentry) +ncp_delete_dentry(const struct dentry * dentry) { struct inode *inode = dentry->d_inode; diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 996dd8989a9..9184c7c80f7 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1117,7 +1117,7 @@ out_error: /* * This is called from dput() when d_count is going to 0. */ -static int nfs_dentry_delete(struct dentry *dentry) +static int nfs_dentry_delete(const struct dentry *dentry) { dfprintk(VFS, "NFS: dentry_delete(%s/%s, %x)\n", dentry->d_parent->d_name.name, dentry->d_name.name, diff --git a/fs/proc/base.c b/fs/proc/base.c index 182845147fe..d932fdb6a24 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1744,7 +1744,7 @@ static int pid_revalidate(struct dentry *dentry, struct nameidata *nd) return 0; } -static int pid_delete_dentry(struct dentry * dentry) +static int pid_delete_dentry(const struct dentry * dentry) { /* Is the task we represent dead? 
* If so, then don't put the dentry on the lru list, diff --git a/fs/proc/generic.c b/fs/proc/generic.c index dd29f033766..1d607be36d9 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -400,7 +400,7 @@ static const struct inode_operations proc_link_inode_operations = { * smarter: we could keep a "volatile" flag in the * inode to indicate which ones to keep. */ -static int proc_delete_dentry(struct dentry * dentry) +static int proc_delete_dentry(const struct dentry * dentry) { return 1; } diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index b652cb00906..a256d770ea1 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -392,7 +392,7 @@ static int proc_sys_revalidate(struct dentry *dentry, struct nameidata *nd) return !PROC_I(dentry->d_inode)->sysctl->unregistering; } -static int proc_sys_delete(struct dentry *dentry) +static int proc_sys_delete(const struct dentry *dentry) { return !!PROC_I(dentry->d_inode)->sysctl->unregistering; } diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 7e54bac8c4b..27e1102e303 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -231,7 +231,7 @@ void release_sysfs_dirent(struct sysfs_dirent * sd) goto repeat; } -static int sysfs_dentry_delete(struct dentry *dentry) +static int sysfs_dentry_delete(const struct dentry *dentry) { struct sysfs_dirent *sd = dentry->d_fsdata; return !!(sd->s_flags & SYSFS_FLAG_REMOVED); diff --git a/include/linux/dcache.h b/include/linux/dcache.h index fff975576b5..cbfc9567e4e 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -133,9 +133,9 @@ enum dentry_d_lock_class struct dentry_operations { int (*d_revalidate)(struct dentry *, struct nameidata *); - int (*d_hash) (struct dentry *, struct qstr *); - int (*d_compare) (struct dentry *, struct qstr *, struct qstr *); - int (*d_delete)(struct dentry *); + int (*d_hash)(struct dentry *, struct qstr *); + int (*d_compare)(struct dentry *, struct qstr *, struct qstr *); + int (*d_delete)(const struct dentry *); void (*d_release)(struct dentry *); void (*d_iput)(struct dentry *, struct inode *); char *(*d_dname)(struct dentry *, char *, int); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 163c890f436..746055b214d 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2198,7 +2198,7 @@ static inline struct cftype *__file_cft(struct file *file) return __d_cft(file->f_dentry); } -static int cgroup_delete_dentry(struct dentry *dentry) +static int cgroup_delete_dentry(const struct dentry *dentry) { return 1; } diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index 10a17a37ec4..a0dc1a86fce 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -430,7 +430,7 @@ void rpc_put_mount(void) } EXPORT_SYMBOL_GPL(rpc_put_mount); -static int rpc_delete_dentry(struct dentry *dentry) +static int rpc_delete_dentry(const struct dentry *dentry) { return 1; } -- cgit v1.2.3 From 621e155a3591962420eacdd39f6f0aa29ceb221e Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:27 +1100 Subject: fs: change d_compare for rcu-walk Change d_compare so it may be called from lock-free RCU lookups. This does put significant restrictions on what may be done from the callback, however there don't seem to have been any problems with in-tree fses. If some strange use case pops up that _really_ cannot cope with the rcu-walk rules, we can just add new rcu-unaware callbacks, which would cause name lookup to drop out of rcu-walk mode. For in-tree filesystems, this is just a mechanical change. 
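To show what the "mechanical change" amounts to in practice, here is a hypothetical case-insensitive comparison under the new calling convention ('foofs' is an invented name). As required for rcu-walk it only looks at the arguments it is handed, stores nothing, takes no locks, and returns 0 on a match and 1 otherwise.

	#include <linux/ctype.h>
	#include <linux/dcache.h>

	static int foofs_d_compare(const struct dentry *parent,
				   const struct inode *pinode,
				   const struct dentry *dentry,
				   const struct inode *inode,
				   unsigned int len, const char *str,
				   const struct qstr *name)
	{
		unsigned int i;

		if (len != name->len)
			return 1;
		for (i = 0; i < len; i++)
			if (tolower(str[i]) != tolower(name->name[i]))
				return 1;
		return 0;
	}
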
Signed-off-by: Nick Piggin --- Documentation/filesystems/Locking | 4 +- Documentation/filesystems/porting | 7 +++ Documentation/filesystems/vfs.txt | 26 +++++++++-- drivers/staging/smbfs/dir.c | 16 ++++--- fs/adfs/dir.c | 8 ++-- fs/affs/namei.c | 46 ++++++++++++-------- fs/cifs/dir.c | 12 ++--- fs/dcache.c | 4 +- fs/fat/namei_msdos.c | 14 +++--- fs/fat/namei_vfat.c | 33 ++++++++------ fs/hfs/hfs_fs.h | 5 ++- fs/hfs/string.c | 14 +++--- fs/hfsplus/hfsplus_fs.h | 5 ++- fs/hfsplus/unicode.c | 15 ++++--- fs/hpfs/dentry.c | 22 ++++++---- fs/isofs/inode.c | 92 ++++++++++++++++++++------------------- fs/isofs/namei.c | 3 +- fs/jfs/namei.c | 11 +++-- fs/ncpfs/dir.c | 25 ++++++----- fs/ncpfs/ncplib_kernel.h | 8 ++-- fs/proc/proc_sysctl.c | 13 +++--- include/linux/dcache.h | 12 +++-- include/linux/ncp_fs.h | 4 +- 23 files changed, 242 insertions(+), 157 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 33fa3e5d38f..9a76f8d8bf9 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -11,7 +11,9 @@ be able to use diff(1). prototypes: int (*d_revalidate)(struct dentry *, int); int (*d_hash) (struct dentry *, struct qstr *); - int (*d_compare) (struct dentry *, struct qstr *, struct qstr *); + int (*d_compare)(const struct dentry *, const struct inode *, + const struct dentry *, const struct inode *, + unsigned int, const char *, const struct qstr *); int (*d_delete)(struct dentry *); void (*d_release)(struct dentry *); void (*d_iput)(struct dentry *, struct inode *); diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting index 9e71c9ad310..d44511e2082 100644 --- a/Documentation/filesystems/porting +++ b/Documentation/filesystems/porting @@ -326,3 +326,10 @@ to it. unreferenced dentries, and is now only called when the dentry refcount goes to 0. Even on 0 refcount transition, it must be able to tolerate being called 0, 1, or more times (eg. constant, idempotent). + +--- +[mandatory] + + .d_compare() calling convention and locking rules are significantly +changed. Read updated documentation in Documentation/filesystems/vfs.txt (and +look at examples of other filesystems) for guidance. diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 95c0a93f056..250681b8c7c 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -848,7 +848,9 @@ defined: struct dentry_operations { int (*d_revalidate)(struct dentry *, struct nameidata *); int (*d_hash)(struct dentry *, struct qstr *); - int (*d_compare)(struct dentry *, struct qstr *, struct qstr *); + int (*d_compare)(const struct dentry *, const struct inode *, + const struct dentry *, const struct inode *, + unsigned int, const char *, const struct qstr *); int (*d_delete)(const struct dentry *); void (*d_release)(struct dentry *); void (*d_iput)(struct dentry *, struct inode *); @@ -860,9 +862,27 @@ struct dentry_operations { dcache. Most filesystems leave this as NULL, because all their dentries in the dcache are valid - d_hash: called when the VFS adds a dentry to the hash table + d_hash: called when the VFS adds a dentry to the hash table. The first + dentry passed to d_hash is the parent directory that the name is + to be hashed into. - d_compare: called when a dentry should be compared with another + d_compare: called to compare a dentry name with a given name. 
The first + dentry is the parent of the dentry to be compared, the second is + the parent's inode, then the dentry and inode (may be NULL) of the + child dentry. len and name string are properties of the dentry to be + compared. qstr is the name to compare it with. + + Must be constant and idempotent, and should not take locks if + possible, and should not or store into the dentry or inodes. + Should not dereference pointers outside the dentry or inodes without + lots of care (eg. d_parent, d_inode, d_name should not be used). + + However, our vfsmount is pinned, and RCU held, so the dentries and + inodes won't disappear, neither will our sb or filesystem module. + ->i_sb and ->d_sb may be used. + + It is a tricky calling convention because it needs to be called under + "rcu-walk", ie. without any locks or references on things. d_delete: called when the last reference to a dentry is dropped and the dcache is deciding whether or not to cache it. Return 1 to delete diff --git a/drivers/staging/smbfs/dir.c b/drivers/staging/smbfs/dir.c index 2270d4822c2..bd63229f43f 100644 --- a/drivers/staging/smbfs/dir.c +++ b/drivers/staging/smbfs/dir.c @@ -275,7 +275,10 @@ smb_dir_open(struct inode *dir, struct file *file) */ static int smb_lookup_validate(struct dentry *, struct nameidata *); static int smb_hash_dentry(struct dentry *, struct qstr *); -static int smb_compare_dentry(struct dentry *, struct qstr *, struct qstr *); +static int smb_compare_dentry(const struct dentry *, + const struct inode *, + const struct dentry *, const struct inode *, + unsigned int, const char *, const struct qstr *); static int smb_delete_dentry(const struct dentry *); static const struct dentry_operations smbfs_dentry_operations = @@ -347,14 +350,17 @@ smb_hash_dentry(struct dentry *dir, struct qstr *this) } static int -smb_compare_dentry(struct dentry *dir, struct qstr *a, struct qstr *b) +smb_compare_dentry(const struct dentry *parent, + const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) { int i, result = 1; - if (a->len != b->len) + if (len != name->len) goto out; - for (i=0; i < a->len; i++) { - if (tolower(a->name[i]) != tolower(b->name[i])) + for (i=0; i < len; i++) { + if (tolower(str[i]) != tolower(name->name[i])) goto out; } result = 0; diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c index f4287e4de74..c8ed66162bd 100644 --- a/fs/adfs/dir.c +++ b/fs/adfs/dir.c @@ -237,17 +237,19 @@ adfs_hash(struct dentry *parent, struct qstr *qstr) * requirements of the underlying filesystem. 
*/ static int -adfs_compare(struct dentry *parent, struct qstr *entry, struct qstr *name) +adfs_compare(const struct dentry *parent, const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) { int i; - if (entry->len != name->len) + if (len != name->len) return 1; for (i = 0; i < name->len; i++) { char a, b; - a = entry->name[i]; + a = str[i]; b = name->name[i]; if (a >= 'A' && a <= 'Z') diff --git a/fs/affs/namei.c b/fs/affs/namei.c index 914d1c0bc07..547d5deb0d4 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -14,10 +14,16 @@ typedef int (*toupper_t)(int); static int affs_toupper(int ch); static int affs_hash_dentry(struct dentry *, struct qstr *); -static int affs_compare_dentry(struct dentry *, struct qstr *, struct qstr *); +static int affs_compare_dentry(const struct dentry *parent, + const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name); static int affs_intl_toupper(int ch); static int affs_intl_hash_dentry(struct dentry *, struct qstr *); -static int affs_intl_compare_dentry(struct dentry *, struct qstr *, struct qstr *); +static int affs_intl_compare_dentry(const struct dentry *parent, + const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name); const struct dentry_operations affs_dentry_operations = { .d_hash = affs_hash_dentry, @@ -88,29 +94,29 @@ affs_intl_hash_dentry(struct dentry *dentry, struct qstr *qstr) return __affs_hash_dentry(dentry, qstr, affs_intl_toupper); } -static inline int -__affs_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b, toupper_t toupper) +static inline int __affs_compare_dentry(unsigned int len, + const char *str, const struct qstr *name, toupper_t toupper) { - const u8 *aname = a->name; - const u8 *bname = b->name; - int len; + const u8 *aname = str; + const u8 *bname = name->name; - /* 'a' is the qstr of an already existing dentry, so the name - * must be valid. 'b' must be validated first. + /* + * 'str' is the name of an already existing dentry, so the name + * must be valid. 'name' must be validated first. */ - if (affs_check_name(b->name,b->len)) + if (affs_check_name(name->name, name->len)) return 1; - /* If the names are longer than the allowed 30 chars, + /* + * If the names are longer than the allowed 30 chars, * the excess is ignored, so their length may differ. 
*/ - len = a->len; if (len >= 30) { - if (b->len < 30) + if (name->len < 30) return 1; len = 30; - } else if (len != b->len) + } else if (len != name->len) return 1; for (; len > 0; len--) @@ -121,14 +127,18 @@ __affs_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b, tou } static int -affs_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b) +affs_compare_dentry(const struct dentry *parent, const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) { - return __affs_compare_dentry(dentry, a, b, affs_toupper); + return __affs_compare_dentry(len, str, name, affs_toupper); } static int -affs_intl_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b) +affs_intl_compare_dentry(const struct dentry *parent,const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) { - return __affs_compare_dentry(dentry, a, b, affs_intl_toupper); + return __affs_compare_dentry(len, str, name, affs_intl_toupper); } /* diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 521d841b1fd..c60133f0d8e 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -715,13 +715,15 @@ static int cifs_ci_hash(struct dentry *dentry, struct qstr *q) return 0; } -static int cifs_ci_compare(struct dentry *dentry, struct qstr *a, - struct qstr *b) +static int cifs_ci_compare(const struct dentry *parent, + const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) { - struct nls_table *codepage = CIFS_SB(dentry->d_inode->i_sb)->local_nls; + struct nls_table *codepage = CIFS_SB(pinode->i_sb)->local_nls; - if ((a->len == b->len) && - (nls_strnicmp(codepage, a->name, b->name, a->len) == 0)) + if ((name->len == len) && + (nls_strnicmp(codepage, name->name, str, len) == 0)) return 0; return 1; } diff --git a/fs/dcache.c b/fs/dcache.c index 814e5f491e9..7075555fbb0 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1437,7 +1437,9 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name) */ qstr = &dentry->d_name; if (parent->d_op && parent->d_op->d_compare) { - if (parent->d_op->d_compare(parent, qstr, name)) + if (parent->d_op->d_compare(parent, parent->d_inode, + dentry, dentry->d_inode, + qstr->len, qstr->name, name)) goto next; } else { if (qstr->len != len) diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c index 3345aabd1dd..99d3c7ac973 100644 --- a/fs/fat/namei_msdos.c +++ b/fs/fat/namei_msdos.c @@ -164,16 +164,18 @@ static int msdos_hash(struct dentry *dentry, struct qstr *qstr) * Compare two msdos names. If either of the names are invalid, * we fall back to doing the standard name comparison. 
*/ -static int msdos_cmp(struct dentry *dentry, struct qstr *a, struct qstr *b) +static int msdos_cmp(const struct dentry *parent, const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) { - struct fat_mount_options *options = &MSDOS_SB(dentry->d_sb)->options; + struct fat_mount_options *options = &MSDOS_SB(parent->d_sb)->options; unsigned char a_msdos_name[MSDOS_NAME], b_msdos_name[MSDOS_NAME]; int error; - error = msdos_format_name(a->name, a->len, a_msdos_name, options); + error = msdos_format_name(name->name, name->len, a_msdos_name, options); if (error) goto old_compare; - error = msdos_format_name(b->name, b->len, b_msdos_name, options); + error = msdos_format_name(str, len, b_msdos_name, options); if (error) goto old_compare; error = memcmp(a_msdos_name, b_msdos_name, MSDOS_NAME); @@ -182,8 +184,8 @@ out: old_compare: error = 1; - if (a->len == b->len) - error = memcmp(a->name, b->name, a->len); + if (name->len == len) + error = memcmp(name->name, str, len); goto out; } diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index b936703b892..95e00ab84c3 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -85,15 +85,18 @@ static int vfat_revalidate_ci(struct dentry *dentry, struct nameidata *nd) } /* returns the length of a struct qstr, ignoring trailing dots */ -static unsigned int vfat_striptail_len(struct qstr *qstr) +static unsigned int __vfat_striptail_len(unsigned int len, const char *name) { - unsigned int len = qstr->len; - - while (len && qstr->name[len - 1] == '.') + while (len && name[len - 1] == '.') len--; return len; } +static unsigned int vfat_striptail_len(const struct qstr *qstr) +{ + return __vfat_striptail_len(qstr->len, qstr->name); +} + /* * Compute the hash for the vfat name corresponding to the dentry. * Note: if the name is invalid, we leave the hash code unchanged so @@ -133,16 +136,18 @@ static int vfat_hashi(struct dentry *dentry, struct qstr *qstr) /* * Case insensitive compare of two vfat names. */ -static int vfat_cmpi(struct dentry *dentry, struct qstr *a, struct qstr *b) +static int vfat_cmpi(const struct dentry *parent, const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) { - struct nls_table *t = MSDOS_SB(dentry->d_inode->i_sb)->nls_io; + struct nls_table *t = MSDOS_SB(parent->d_sb)->nls_io; unsigned int alen, blen; /* A filename cannot end in '.' or we treat it like it has none */ - alen = vfat_striptail_len(a); - blen = vfat_striptail_len(b); + alen = vfat_striptail_len(name); + blen = __vfat_striptail_len(len, str); if (alen == blen) { - if (nls_strnicmp(t, a->name, b->name, alen) == 0) + if (nls_strnicmp(t, name->name, str, alen) == 0) return 0; } return 1; @@ -151,15 +156,17 @@ static int vfat_cmpi(struct dentry *dentry, struct qstr *a, struct qstr *b) /* * Case sensitive compare of two vfat names. */ -static int vfat_cmp(struct dentry *dentry, struct qstr *a, struct qstr *b) +static int vfat_cmp(const struct dentry *parent, const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) { unsigned int alen, blen; /* A filename cannot end in '.' 
or we treat it like it has none */ - alen = vfat_striptail_len(a); - blen = vfat_striptail_len(b); + alen = vfat_striptail_len(name); + blen = __vfat_striptail_len(len, str); if (alen == blen) { - if (strncmp(a->name, b->name, alen) == 0) + if (strncmp(name->name, str, alen) == 0) return 0; } return 1; diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h index c8cffb81e84..8cd876f0e96 100644 --- a/fs/hfs/hfs_fs.h +++ b/fs/hfs/hfs_fs.h @@ -216,7 +216,10 @@ extern const struct dentry_operations hfs_dentry_operations; extern int hfs_hash_dentry(struct dentry *, struct qstr *); extern int hfs_strcmp(const unsigned char *, unsigned int, const unsigned char *, unsigned int); -extern int hfs_compare_dentry(struct dentry *, struct qstr *, struct qstr *); +extern int hfs_compare_dentry(const struct dentry *parent, + const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name); /* trans.c */ extern void hfs_asc2mac(struct super_block *, struct hfs_name *, struct qstr *); diff --git a/fs/hfs/string.c b/fs/hfs/string.c index 927a5af7942..aaf90d0d694 100644 --- a/fs/hfs/string.c +++ b/fs/hfs/string.c @@ -92,21 +92,21 @@ int hfs_strcmp(const unsigned char *s1, unsigned int len1, * Test for equality of two strings in the HFS filename character ordering. * return 1 on failure and 0 on success */ -int hfs_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr *s2) +int hfs_compare_dentry(const struct dentry *parent, const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) { const unsigned char *n1, *n2; - int len; - len = s1->len; if (len >= HFS_NAMELEN) { - if (s2->len < HFS_NAMELEN) + if (name->len < HFS_NAMELEN) return 1; len = HFS_NAMELEN; - } else if (len != s2->len) + } else if (len != name->len) return 1; - n1 = s1->name; - n2 = s2->name; + n1 = str; + n2 = name->name; while (len--) { if (caseorder[*n1++] != caseorder[*n2++]) return 1; diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index cb3653efb57..7aa96eefe48 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -380,7 +380,10 @@ int hfsplus_strcmp(const struct hfsplus_unistr *, const struct hfsplus_unistr *) int hfsplus_uni2asc(struct super_block *, const struct hfsplus_unistr *, char *, int *); int hfsplus_asc2uni(struct super_block *, struct hfsplus_unistr *, const char *, int); int hfsplus_hash_dentry(struct dentry *dentry, struct qstr *str); -int hfsplus_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr *s2); +int hfsplus_compare_dentry(const struct dentry *parent, + const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name); /* wrapper.c */ int hfsplus_read_wrapper(struct super_block *); diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c index b66d67de882..b178c997efa 100644 --- a/fs/hfsplus/unicode.c +++ b/fs/hfsplus/unicode.c @@ -363,9 +363,12 @@ int hfsplus_hash_dentry(struct dentry *dentry, struct qstr *str) * Composed unicode characters are decomposed and case-folding is performed * if the appropriate bits are (un)set on the superblock. 
*/ -int hfsplus_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr *s2) +int hfsplus_compare_dentry(const struct dentry *parent, + const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) { - struct super_block *sb = dentry->d_sb; + struct super_block *sb = parent->d_sb; int casefold, decompose, size; int dsize1, dsize2, len1, len2; const u16 *dstr1, *dstr2; @@ -375,10 +378,10 @@ int hfsplus_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr * casefold = test_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags); decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags); - astr1 = s1->name; - len1 = s1->len; - astr2 = s2->name; - len2 = s2->len; + astr1 = str; + len1 = len; + astr2 = name->name; + len2 = name->len; dsize1 = dsize2 = 0; dstr1 = dstr2 = NULL; diff --git a/fs/hpfs/dentry.c b/fs/hpfs/dentry.c index 67d9d36b3d5..dd9b1e74a73 100644 --- a/fs/hpfs/dentry.c +++ b/fs/hpfs/dentry.c @@ -34,19 +34,25 @@ static int hpfs_hash_dentry(struct dentry *dentry, struct qstr *qstr) return 0; } -static int hpfs_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b) +static int hpfs_compare_dentry(const struct dentry *parent, + const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) { - unsigned al=a->len; - unsigned bl=b->len; - hpfs_adjust_length(a->name, &al); + unsigned al = len; + unsigned bl = name->len; + + hpfs_adjust_length(str, &al); /*hpfs_adjust_length(b->name, &bl);*/ - /* 'a' is the qstr of an already existing dentry, so the name - * must be valid. 'b' must be validated first. + + /* + * 'str' is the nane of an already existing dentry, so the name + * must be valid. 'name' must be validated first. 
*/ - if (hpfs_chk_name(b->name, &bl)) + if (hpfs_chk_name(name->name, &bl)) return 1; - if (hpfs_compare_names(dentry->d_sb, a->name, al, b->name, bl, 0)) + if (hpfs_compare_names(parent->d_sb, str, al, name->name, bl, 0)) return 1; return 0; } diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index bfdeb82a53b..7b0fbc61af8 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -28,14 +28,26 @@ static int isofs_hashi(struct dentry *parent, struct qstr *qstr); static int isofs_hash(struct dentry *parent, struct qstr *qstr); -static int isofs_dentry_cmpi(struct dentry *dentry, struct qstr *a, struct qstr *b); -static int isofs_dentry_cmp(struct dentry *dentry, struct qstr *a, struct qstr *b); +static int isofs_dentry_cmpi(const struct dentry *parent, + const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name); +static int isofs_dentry_cmp(const struct dentry *parent, + const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name); #ifdef CONFIG_JOLIET static int isofs_hashi_ms(struct dentry *parent, struct qstr *qstr); static int isofs_hash_ms(struct dentry *parent, struct qstr *qstr); -static int isofs_dentry_cmpi_ms(struct dentry *dentry, struct qstr *a, struct qstr *b); -static int isofs_dentry_cmp_ms(struct dentry *dentry, struct qstr *a, struct qstr *b); +static int isofs_dentry_cmpi_ms(const struct dentry *parent, + const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name); +static int isofs_dentry_cmp_ms(const struct dentry *parent, + const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name); #endif static void isofs_put_super(struct super_block *sb) @@ -206,49 +218,31 @@ isofs_hashi_common(struct dentry *dentry, struct qstr *qstr, int ms) } /* - * Case insensitive compare of two isofs names. + * Compare of two isofs names. */ -static int isofs_dentry_cmpi_common(struct dentry *dentry, struct qstr *a, - struct qstr *b, int ms) +static int isofs_dentry_cmp_common( + unsigned int len, const char *str, + const struct qstr *name, int ms, int ci) { int alen, blen; /* A filename cannot end in '.' or we treat it like it has none */ - alen = a->len; - blen = b->len; + alen = name->len; + blen = len; if (ms) { - while (alen && a->name[alen-1] == '.') + while (alen && name->name[alen-1] == '.') alen--; - while (blen && b->name[blen-1] == '.') + while (blen && str[blen-1] == '.') blen--; } if (alen == blen) { - if (strnicmp(a->name, b->name, alen) == 0) - return 0; - } - return 1; -} - -/* - * Case sensitive compare of two isofs names. - */ -static int isofs_dentry_cmp_common(struct dentry *dentry, struct qstr *a, - struct qstr *b, int ms) -{ - int alen, blen; - - /* A filename cannot end in '.' 
or we treat it like it has none */ - alen = a->len; - blen = b->len; - if (ms) { - while (alen && a->name[alen-1] == '.') - alen--; - while (blen && b->name[blen-1] == '.') - blen--; - } - if (alen == blen) { - if (strncmp(a->name, b->name, alen) == 0) - return 0; + if (ci) { + if (strnicmp(name->name, str, alen) == 0) + return 0; + } else { + if (strncmp(name->name, str, alen) == 0) + return 0; + } } return 1; } @@ -266,15 +260,19 @@ isofs_hashi(struct dentry *dentry, struct qstr *qstr) } static int -isofs_dentry_cmp(struct dentry *dentry,struct qstr *a,struct qstr *b) +isofs_dentry_cmp(const struct dentry *parent, const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) { - return isofs_dentry_cmp_common(dentry, a, b, 0); + return isofs_dentry_cmp_common(len, str, name, 0, 0); } static int -isofs_dentry_cmpi(struct dentry *dentry,struct qstr *a,struct qstr *b) +isofs_dentry_cmpi(const struct dentry *parent, const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) { - return isofs_dentry_cmpi_common(dentry, a, b, 0); + return isofs_dentry_cmp_common(len, str, name, 0, 1); } #ifdef CONFIG_JOLIET @@ -291,15 +289,19 @@ isofs_hashi_ms(struct dentry *dentry, struct qstr *qstr) } static int -isofs_dentry_cmp_ms(struct dentry *dentry,struct qstr *a,struct qstr *b) +isofs_dentry_cmp_ms(const struct dentry *parent, const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) { - return isofs_dentry_cmp_common(dentry, a, b, 1); + return isofs_dentry_cmp_common(len, str, name, 1, 0); } static int -isofs_dentry_cmpi_ms(struct dentry *dentry,struct qstr *a,struct qstr *b) +isofs_dentry_cmpi_ms(const struct dentry *parent, const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) { - return isofs_dentry_cmpi_common(dentry, a, b, 1); + return isofs_dentry_cmp_common(len, str, name, 1, 1); } #endif diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c index 0d23abfd428..715f7d31804 100644 --- a/fs/isofs/namei.c +++ b/fs/isofs/namei.c @@ -37,7 +37,8 @@ isofs_cmp(struct dentry *dentry, const char *compare, int dlen) qstr.name = compare; qstr.len = dlen; - return dentry->d_op->d_compare(dentry, &dentry->d_name, &qstr); + return dentry->d_op->d_compare(NULL, NULL, NULL, NULL, + dentry->d_name.len, dentry->d_name.name, &qstr); } /* diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 2da1546161f..92129016cd7 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -1587,14 +1587,17 @@ static int jfs_ci_hash(struct dentry *dir, struct qstr *this) return 0; } -static int jfs_ci_compare(struct dentry *dir, struct qstr *a, struct qstr *b) +static int jfs_ci_compare(const struct dentry *parent, + const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) { int i, result = 1; - if (a->len != b->len) + if (len != name->len) goto out; - for (i=0; i < a->len; i++) { - if (tolower(a->name[i]) != tolower(b->name[i])) + for (i=0; i < len; i++) { + if (tolower(str[i]) != tolower(name->name[i])) goto out; } result = 0; diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index e80ea4e37c4..3bcc68aed41 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -75,7 +75,9 @@ const struct inode_operations 
ncp_dir_inode_operations = */ static int ncp_lookup_validate(struct dentry *, struct nameidata *); static int ncp_hash_dentry(struct dentry *, struct qstr *); -static int ncp_compare_dentry (struct dentry *, struct qstr *, struct qstr *); +static int ncp_compare_dentry(const struct dentry *, const struct inode *, + const struct dentry *, const struct inode *, + unsigned int, const char *, const struct qstr *); static int ncp_delete_dentry(const struct dentry *); static const struct dentry_operations ncp_dentry_operations = @@ -113,10 +115,10 @@ static inline int ncp_preserve_entry_case(struct inode *i, __u32 nscreator) #define ncp_preserve_case(i) (ncp_namespace(i) != NW_NS_DOS) -static inline int ncp_case_sensitive(struct dentry *dentry) +static inline int ncp_case_sensitive(const struct inode *i) { #ifdef CONFIG_NCPFS_NFS_NS - return ncp_namespace(dentry->d_inode) == NW_NS_NFS; + return ncp_namespace(i) == NW_NS_NFS; #else return 0; #endif /* CONFIG_NCPFS_NFS_NS */ @@ -129,12 +131,13 @@ static inline int ncp_case_sensitive(struct dentry *dentry) static int ncp_hash_dentry(struct dentry *dentry, struct qstr *this) { - if (!ncp_case_sensitive(dentry)) { + if (!ncp_case_sensitive(dentry->d_inode)) { + struct super_block *sb = dentry->d_sb; struct nls_table *t; unsigned long hash; int i; - t = NCP_IO_TABLE(dentry); + t = NCP_IO_TABLE(sb); hash = init_name_hash(); for (i=0; i<this->len ; i++) hash = partial_name_hash(ncp_tolower(t, this->name[i]), @@ -145,15 +148,17 @@ ncp_hash_dentry(struct dentry *dentry, struct qstr *this) } static int -ncp_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b) +ncp_compare_dentry(const struct dentry *parent, const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) { - if (a->len != b->len) + if (len != name->len) return 1; - if (ncp_case_sensitive(dentry)) - return strncmp(a->name, b->name, a->len); + if (ncp_case_sensitive(pinode)) + return strncmp(str, name->name, len); - return ncp_strnicmp(NCP_IO_TABLE(dentry), a->name, b->name, a->len); + return ncp_strnicmp(NCP_IO_TABLE(pinode->i_sb), str, name->name, len); } /* diff --git a/fs/ncpfs/ncplib_kernel.h b/fs/ncpfs/ncplib_kernel.h index 3c57eca634c..244d1b73fda 100644 --- a/fs/ncpfs/ncplib_kernel.h +++ b/fs/ncpfs/ncplib_kernel.h @@ -135,7 +135,7 @@ int ncp__vol2io(struct ncp_server *, unsigned char *, unsigned int *, const unsigned char *, unsigned int, int); #define NCP_ESC ':' -#define NCP_IO_TABLE(dentry) (NCP_SERVER((dentry)->d_inode)->nls_io) +#define NCP_IO_TABLE(sb) (NCP_SBP(sb)->nls_io) #define ncp_tolower(t, c) nls_tolower(t, c) #define ncp_toupper(t, c) nls_toupper(t, c) #define ncp_strnicmp(t, s1, s2, len) \ @@ -150,15 +150,15 @@ int ncp__io2vol(unsigned char *, unsigned int *, int ncp__vol2io(unsigned char *, unsigned int *, const unsigned char *, unsigned int, int); -#define NCP_IO_TABLE(dentry) NULL +#define NCP_IO_TABLE(sb) NULL #define ncp_tolower(t, c) tolower(c) #define ncp_toupper(t, c) toupper(c) #define ncp_io2vol(S,m,i,n,k,U) ncp__io2vol(m,i,n,k,U) #define ncp_vol2io(S,m,i,n,k,U) ncp__vol2io(m,i,n,k,U) -static inline int ncp_strnicmp(struct nls_table *t, const unsigned char *s1, - const unsigned char *s2, int len) +static inline int ncp_strnicmp(const struct nls_table *t, + const unsigned char *s1, const unsigned char *s2, int len) { while (len--) { if (tolower(*s1++) != tolower(*s2++)) diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index a256d770ea1..ae4b0fd9033 100644 ---
a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -397,15 +397,16 @@ static int proc_sys_delete(const struct dentry *dentry) return !!PROC_I(dentry->d_inode)->sysctl->unregistering; } -static int proc_sys_compare(struct dentry *dir, struct qstr *qstr, - struct qstr *name) +static int proc_sys_compare(const struct dentry *parent, + const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) { - struct dentry *dentry = container_of(qstr, struct dentry, d_name); - if (qstr->len != name->len) + if (name->len != len) return 1; - if (memcmp(qstr->name, name->name, name->len)) + if (memcmp(name->name, str, len)) return 1; - return !sysctl_is_seen(PROC_I(dentry->d_inode)->sysctl); + return !sysctl_is_seen(PROC_I(inode)->sysctl); } static const struct dentry_operations proc_sys_dentry_operations = { diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 6cdf4995c90..75a072bf2a3 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -134,7 +134,9 @@ enum dentry_d_lock_class struct dentry_operations { int (*d_revalidate)(struct dentry *, struct nameidata *); int (*d_hash)(struct dentry *, struct qstr *); - int (*d_compare)(struct dentry *, struct qstr *, struct qstr *); + int (*d_compare)(const struct dentry *, const struct inode *, + const struct dentry *, const struct inode *, + unsigned int, const char *, const struct qstr *); int (*d_delete)(const struct dentry *); void (*d_release)(struct dentry *); void (*d_iput)(struct dentry *, struct inode *); @@ -145,12 +147,8 @@ struct dentry_operations { * Locking rules for dentry_operations callbacks are to be found in * Documentation/filesystems/Locking. Keep it updated! * - * the dentry parameter passed to d_hash and d_compare is the parent - * directory of the entries to be compared. It is used in case these - * functions need any directory specific information for determining - * equivalency classes. Using the dentry itself might not work, as it - * might be a negative dentry which has no information associated with - * it. + * Further descriptions are found in Documentation/filesystems/vfs.txt. + * Keep it updated too! */ /* d_flags entries */ diff --git a/include/linux/ncp_fs.h b/include/linux/ncp_fs.h index ef663061d5a..1c27f201c85 100644 --- a/include/linux/ncp_fs.h +++ b/include/linux/ncp_fs.h @@ -184,13 +184,13 @@ struct ncp_entry_info { __u8 file_handle[6]; }; -static inline struct ncp_server *NCP_SBP(struct super_block *sb) +static inline struct ncp_server *NCP_SBP(const struct super_block *sb) { return sb->s_fs_info; } #define NCP_SERVER(inode) NCP_SBP((inode)->i_sb) -static inline struct ncp_inode_info *NCP_FINFO(struct inode *inode) +static inline struct ncp_inode_info *NCP_FINFO(const struct inode *inode) { return container_of(inode, struct ncp_inode_info, vfs_inode); } -- cgit v1.2.3 From b1e6a015a580ad145689ad1d6b4aa0e03e6c868b Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:28 +1100 Subject: fs: change d_hash for rcu-walk Change d_hash so it may be called from lock-free RCU lookups. See similar patch for d_compare for details. For in-tree filesystems, this is just a mechanical change.
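For illustration only (this sketch is not part of the patch series): a hypothetical case-insensitive hash routine converted to the new prototype could look like the code below. The filesystem name examplefs and the use of tolower() are invented; the only substantive change from the old convention is the const dentry plus the extra const inode argument, matching int (*d_hash)(const struct dentry *, const struct inode *, struct qstr *).

        #include <linux/ctype.h>
        #include <linux/dcache.h>

        /* hypothetical conversion to the new d_hash prototype */
        static int examplefs_hash(const struct dentry *dentry,
                                  const struct inode *inode, struct qstr *qstr)
        {
                unsigned long hash;
                unsigned int i;

                /* only the qstr being hashed is written; the dentry is read-only */
                hash = init_name_hash();
                for (i = 0; i < qstr->len; i++)
                        hash = partial_name_hash(tolower(qstr->name[i]), hash);
                qstr->hash = end_name_hash(hash);
                return 0;
        }

As the vfs.txt update below notes, the same locking and synchronisation rules as d_compare apply regarding what such a routine may safely dereference.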
Signed-off-by: Nick Piggin --- Documentation/filesystems/Locking | 5 +++-- Documentation/filesystems/porting | 7 +++++++ Documentation/filesystems/vfs.txt | 8 ++++++-- drivers/staging/smbfs/cache.c | 2 +- drivers/staging/smbfs/dir.c | 6 ++++-- fs/adfs/dir.c | 3 ++- fs/affs/namei.c | 20 ++++++++++++-------- fs/cifs/dir.c | 5 +++-- fs/cifs/readdir.c | 2 +- fs/dcache.c | 2 +- fs/ecryptfs/inode.c | 4 ++-- fs/fat/namei_msdos.c | 3 ++- fs/fat/namei_vfat.c | 8 +++++--- fs/gfs2/dentry.c | 3 ++- fs/hfs/hfs_fs.h | 3 ++- fs/hfs/string.c | 3 ++- fs/hfsplus/hfsplus_fs.h | 3 ++- fs/hfsplus/unicode.c | 3 ++- fs/hpfs/dentry.c | 3 ++- fs/isofs/inode.c | 28 ++++++++++++++++++---------- fs/jfs/namei.c | 3 ++- fs/namei.c | 5 +++-- fs/ncpfs/dir.c | 10 ++++++---- fs/sysv/namei.c | 3 ++- include/linux/dcache.h | 3 ++- 25 files changed, 94 insertions(+), 51 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 9a76f8d8bf9..a15ee207b44 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -10,7 +10,8 @@ be able to use diff(1). --------------------------- dentry_operations -------------------------- prototypes: int (*d_revalidate)(struct dentry *, int); - int (*d_hash) (struct dentry *, struct qstr *); + int (*d_hash)(const struct dentry *, const struct inode *, + struct qstr *); int (*d_compare)(const struct dentry *, const struct inode *, const struct dentry *, const struct inode *, unsigned int, const char *, const struct qstr *); @@ -22,7 +23,7 @@ prototypes: locking rules: dcache_lock rename_lock ->d_lock may block d_revalidate: no no no yes -d_hash no no no yes +d_hash no no no no d_compare: no yes no no d_delete: yes no yes no d_release: no no no yes diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting index d44511e2082..9fd31940a8e 100644 --- a/Documentation/filesystems/porting +++ b/Documentation/filesystems/porting @@ -333,3 +333,10 @@ unreferenced dentries, and is now only called when the dentry refcount goes to .d_compare() calling convention and locking rules are significantly changed. Read updated documentation in Documentation/filesystems/vfs.txt (and look at examples of other filesystems) for guidance. + +--- +[mandatory] + + .d_hash() calling convention and locking rules are significantly +changed. Read updated documentation in Documentation/filesystems/vfs.txt (and +look at examples of other filesystems) for guidance. diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 250681b8c7c..69b10ff5ec8 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -847,7 +847,8 @@ defined: struct dentry_operations { int (*d_revalidate)(struct dentry *, struct nameidata *); - int (*d_hash)(struct dentry *, struct qstr *); + int (*d_hash)(const struct dentry *, const struct inode *, + struct qstr *); int (*d_compare)(const struct dentry *, const struct inode *, const struct dentry *, const struct inode *, unsigned int, const char *, const struct qstr *); @@ -864,7 +865,10 @@ struct dentry_operations { d_hash: called when the VFS adds a dentry to the hash table. The first dentry passed to d_hash is the parent directory that the name is - to be hashed into. + to be hashed into. The inode is the dentry's inode. + + Same locking and synchronisation rules as d_compare regarding + what is safe to dereference etc. d_compare: called to compare a dentry name with a given name. 
The first dentry is the parent of the dentry to be compared, the second is diff --git a/drivers/staging/smbfs/cache.c b/drivers/staging/smbfs/cache.c index dbd2e1df3ba..0beded260b0 100644 --- a/drivers/staging/smbfs/cache.c +++ b/drivers/staging/smbfs/cache.c @@ -134,7 +134,7 @@ smb_fill_cache(struct file *filp, void *dirent, filldir_t filldir, qname->hash = full_name_hash(qname->name, qname->len); if (dentry->d_op && dentry->d_op->d_hash) - if (dentry->d_op->d_hash(dentry, qname) != 0) + if (dentry->d_op->d_hash(dentry, inode, qname) != 0) goto end_advance; newdent = d_lookup(dentry, qname); diff --git a/drivers/staging/smbfs/dir.c b/drivers/staging/smbfs/dir.c index bd63229f43f..5f79799d5d4 100644 --- a/drivers/staging/smbfs/dir.c +++ b/drivers/staging/smbfs/dir.c @@ -274,7 +274,8 @@ smb_dir_open(struct inode *dir, struct file *file) * Dentry operations routines */ static int smb_lookup_validate(struct dentry *, struct nameidata *); -static int smb_hash_dentry(struct dentry *, struct qstr *); +static int smb_hash_dentry(const struct dentry *, const struct inode *, + struct qstr *); static int smb_compare_dentry(const struct dentry *, const struct inode *, const struct dentry *, const struct inode *, @@ -336,7 +337,8 @@ smb_lookup_validate(struct dentry * dentry, struct nameidata *nd) } static int -smb_hash_dentry(struct dentry *dir, struct qstr *this) +smb_hash_dentry(const struct dentry *dir, const struct inode *inode, + struct qstr *this) { unsigned long hash; int i; diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c index c8ed66162bd..a11e5e10271 100644 --- a/fs/adfs/dir.c +++ b/fs/adfs/dir.c @@ -201,7 +201,8 @@ const struct file_operations adfs_dir_operations = { }; static int -adfs_hash(struct dentry *parent, struct qstr *qstr) +adfs_hash(const struct dentry *parent, const struct inode *inode, + struct qstr *qstr) { const unsigned int name_len = ADFS_SB(parent->d_sb)->s_namelen; const unsigned char *name; diff --git a/fs/affs/namei.c b/fs/affs/namei.c index 547d5deb0d4..5aca08c2110 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -13,13 +13,15 @@ typedef int (*toupper_t)(int); static int affs_toupper(int ch); -static int affs_hash_dentry(struct dentry *, struct qstr *); +static int affs_hash_dentry(const struct dentry *, + const struct inode *, struct qstr *); static int affs_compare_dentry(const struct dentry *parent, const struct inode *pinode, const struct dentry *dentry, const struct inode *inode, unsigned int len, const char *str, const struct qstr *name); static int affs_intl_toupper(int ch); -static int affs_intl_hash_dentry(struct dentry *, struct qstr *); +static int affs_intl_hash_dentry(const struct dentry *, + const struct inode *, struct qstr *); static int affs_intl_compare_dentry(const struct dentry *parent, const struct inode *pinode, const struct dentry *dentry, const struct inode *inode, @@ -64,13 +66,13 @@ affs_get_toupper(struct super_block *sb) * Note: the dentry argument is the parent dentry. 
*/ static inline int -__affs_hash_dentry(struct dentry *dentry, struct qstr *qstr, toupper_t toupper) +__affs_hash_dentry(struct qstr *qstr, toupper_t toupper) { const u8 *name = qstr->name; unsigned long hash; int i; - i = affs_check_name(qstr->name,qstr->len); + i = affs_check_name(qstr->name, qstr->len); if (i) return i; @@ -84,14 +86,16 @@ __affs_hash_dentry(struct dentry *dentry, struct qstr *qstr, toupper_t toupper) } static int -affs_hash_dentry(struct dentry *dentry, struct qstr *qstr) +affs_hash_dentry(const struct dentry *dentry, const struct inode *inode, + struct qstr *qstr) { - return __affs_hash_dentry(dentry, qstr, affs_toupper); + return __affs_hash_dentry(qstr, affs_toupper); } static int -affs_intl_hash_dentry(struct dentry *dentry, struct qstr *qstr) +affs_intl_hash_dentry(const struct dentry *dentry, const struct inode *inode, + struct qstr *qstr) { - return __affs_hash_dentry(dentry, qstr, affs_intl_toupper); + return __affs_hash_dentry(qstr, affs_intl_toupper); } static inline int __affs_compare_dentry(unsigned int len, diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index c60133f0d8e..88bfe686ac0 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -700,9 +700,10 @@ const struct dentry_operations cifs_dentry_ops = { /* d_delete: cifs_d_delete, */ /* not needed except for debugging */ }; -static int cifs_ci_hash(struct dentry *dentry, struct qstr *q) +static int cifs_ci_hash(const struct dentry *dentry, const struct inode *inode, + struct qstr *q) { - struct nls_table *codepage = CIFS_SB(dentry->d_inode->i_sb)->local_nls; + struct nls_table *codepage = CIFS_SB(dentry->d_sb)->local_nls; unsigned long hash; int i; diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index a73eb9f4bda..ee463aeca0b 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -79,7 +79,7 @@ cifs_readdir_lookup(struct dentry *parent, struct qstr *name, cFYI(1, "For %s", name->name); if (parent->d_op && parent->d_op->d_hash) - parent->d_op->d_hash(parent, name); + parent->d_op->d_hash(parent, parent->d_inode, name); else name->hash = full_name_hash(name->name, name->len); diff --git a/fs/dcache.c b/fs/dcache.c index 7075555fbb0..6f59f375ed9 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1478,7 +1478,7 @@ struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name) */ name->hash = full_name_hash(name->name, name->len); if (dir->d_op && dir->d_op->d_hash) { - if (dir->d_op->d_hash(dir, name) < 0) + if (dir->d_op->d_hash(dir, dir->d_inode, name) < 0) goto out; } dentry = d_lookup(dir, name); diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 9d1a22d6276..a1ed7a7cb17 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -454,7 +454,7 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, lower_name.hash = ecryptfs_dentry->d_name.hash; if (lower_dir_dentry->d_op && lower_dir_dentry->d_op->d_hash) { rc = lower_dir_dentry->d_op->d_hash(lower_dir_dentry, - &lower_name); + lower_dir_dentry->d_inode, &lower_name); if (rc < 0) goto out_d_drop; } @@ -489,7 +489,7 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, lower_name.hash = full_name_hash(lower_name.name, lower_name.len); if (lower_dir_dentry->d_op && lower_dir_dentry->d_op->d_hash) { rc = lower_dir_dentry->d_op->d_hash(lower_dir_dentry, - &lower_name); + lower_dir_dentry->d_inode, &lower_name); if (rc < 0) goto out_d_drop; } diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c index 99d3c7ac973..3b3e072d898 100644 --- a/fs/fat/namei_msdos.c +++ b/fs/fat/namei_msdos.c @@ -148,7 
+148,8 @@ static int msdos_find(struct inode *dir, const unsigned char *name, int len, * that the existing dentry can be used. The msdos fs routines will * return ENOENT or EINVAL as appropriate. */ -static int msdos_hash(struct dentry *dentry, struct qstr *qstr) +static int msdos_hash(const struct dentry *dentry, const struct inode *inode, + struct qstr *qstr) { struct fat_mount_options *options = &MSDOS_SB(dentry->d_sb)->options; unsigned char msdos_name[MSDOS_NAME]; diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index 95e00ab84c3..4fc06278db4 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -103,7 +103,8 @@ static unsigned int vfat_striptail_len(const struct qstr *qstr) * that the existing dentry can be used. The vfat fs routines will * return ENOENT or EINVAL as appropriate. */ -static int vfat_hash(struct dentry *dentry, struct qstr *qstr) +static int vfat_hash(const struct dentry *dentry, const struct inode *inode, + struct qstr *qstr) { qstr->hash = full_name_hash(qstr->name, vfat_striptail_len(qstr)); return 0; @@ -115,9 +116,10 @@ static int vfat_hash(struct dentry *dentry, struct qstr *qstr) * that the existing dentry can be used. The vfat fs routines will * return ENOENT or EINVAL as appropriate. */ -static int vfat_hashi(struct dentry *dentry, struct qstr *qstr) +static int vfat_hashi(const struct dentry *dentry, const struct inode *inode, + struct qstr *qstr) { - struct nls_table *t = MSDOS_SB(dentry->d_inode->i_sb)->nls_io; + struct nls_table *t = MSDOS_SB(dentry->d_sb)->nls_io; const unsigned char *name; unsigned int len; unsigned long hash; diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c index e80fea2f65f..50497f65763 100644 --- a/fs/gfs2/dentry.c +++ b/fs/gfs2/dentry.c @@ -100,7 +100,8 @@ fail: return 0; } -static int gfs2_dhash(struct dentry *dentry, struct qstr *str) +static int gfs2_dhash(const struct dentry *dentry, const struct inode *inode, + struct qstr *str) { str->hash = gfs2_disk_hash(str->name, str->len); return 0; diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h index 8cd876f0e96..ad97c2d5828 100644 --- a/fs/hfs/hfs_fs.h +++ b/fs/hfs/hfs_fs.h @@ -213,7 +213,8 @@ extern int hfs_part_find(struct super_block *, sector_t *, sector_t *); /* string.c */ extern const struct dentry_operations hfs_dentry_operations; -extern int hfs_hash_dentry(struct dentry *, struct qstr *); +extern int hfs_hash_dentry(const struct dentry *, const struct inode *, + struct qstr *); extern int hfs_strcmp(const unsigned char *, unsigned int, const unsigned char *, unsigned int); extern int hfs_compare_dentry(const struct dentry *parent, diff --git a/fs/hfs/string.c b/fs/hfs/string.c index aaf90d0d694..495a976a3cc 100644 --- a/fs/hfs/string.c +++ b/fs/hfs/string.c @@ -51,7 +51,8 @@ static unsigned char caseorder[256] = { /* * Hash a string to an integer in a case-independent way */ -int hfs_hash_dentry(struct dentry *dentry, struct qstr *this) +int hfs_hash_dentry(const struct dentry *dentry, const struct inode *inode, + struct qstr *this) { const unsigned char *name = this->name; unsigned int hash, len = this->len; diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index 7aa96eefe48..a5308f491e3 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -379,7 +379,8 @@ int hfsplus_strcasecmp(const struct hfsplus_unistr *, const struct hfsplus_unist int hfsplus_strcmp(const struct hfsplus_unistr *, const struct hfsplus_unistr *); int hfsplus_uni2asc(struct super_block *, const struct hfsplus_unistr *, char *, int *); int hfsplus_asc2uni(struct 
super_block *, struct hfsplus_unistr *, const char *, int); -int hfsplus_hash_dentry(struct dentry *dentry, struct qstr *str); +int hfsplus_hash_dentry(const struct dentry *dentry, const struct inode *inode, + struct qstr *str); int hfsplus_compare_dentry(const struct dentry *parent, const struct inode *pinode, const struct dentry *dentry, const struct inode *inode, diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c index b178c997efa..d800aa0f2c8 100644 --- a/fs/hfsplus/unicode.c +++ b/fs/hfsplus/unicode.c @@ -320,7 +320,8 @@ int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr, * Composed unicode characters are decomposed and case-folding is performed * if the appropriate bits are (un)set on the superblock. */ -int hfsplus_hash_dentry(struct dentry *dentry, struct qstr *str) +int hfsplus_hash_dentry(const struct dentry *dentry, const struct inode *inode, + struct qstr *str) { struct super_block *sb = dentry->d_sb; const char *astr; diff --git a/fs/hpfs/dentry.c b/fs/hpfs/dentry.c index dd9b1e74a73..35526df1fd3 100644 --- a/fs/hpfs/dentry.c +++ b/fs/hpfs/dentry.c @@ -12,7 +12,8 @@ * Note: the dentry argument is the parent dentry. */ -static int hpfs_hash_dentry(struct dentry *dentry, struct qstr *qstr) +static int hpfs_hash_dentry(const struct dentry *dentry, const struct inode *inode, + struct qstr *qstr) { unsigned long hash; int i; diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 7b0fbc61af8..d204ee4235f 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -26,8 +26,10 @@ #define BEQUIET -static int isofs_hashi(struct dentry *parent, struct qstr *qstr); -static int isofs_hash(struct dentry *parent, struct qstr *qstr); +static int isofs_hashi(const struct dentry *parent, const struct inode *inode, + struct qstr *qstr); +static int isofs_hash(const struct dentry *parent, const struct inode *inode, + struct qstr *qstr); static int isofs_dentry_cmpi(const struct dentry *parent, const struct inode *pinode, const struct dentry *dentry, const struct inode *inode, @@ -38,8 +40,10 @@ static int isofs_dentry_cmp(const struct dentry *parent, unsigned int len, const char *str, const struct qstr *name); #ifdef CONFIG_JOLIET -static int isofs_hashi_ms(struct dentry *parent, struct qstr *qstr); -static int isofs_hash_ms(struct dentry *parent, struct qstr *qstr); +static int isofs_hashi_ms(const struct dentry *parent, const struct inode *inode, + struct qstr *qstr); +static int isofs_hash_ms(const struct dentry *parent, const struct inode *inode, + struct qstr *qstr); static int isofs_dentry_cmpi_ms(const struct dentry *parent, const struct inode *pinode, const struct dentry *dentry, const struct inode *inode, @@ -172,7 +176,7 @@ struct iso9660_options{ * Compute the hash for the isofs name corresponding to the dentry. */ static int -isofs_hash_common(struct dentry *dentry, struct qstr *qstr, int ms) +isofs_hash_common(const struct dentry *dentry, struct qstr *qstr, int ms) { const char *name; int len; @@ -193,7 +197,7 @@ isofs_hash_common(struct dentry *dentry, struct qstr *qstr, int ms) * Compute the hash for the isofs name corresponding to the dentry. 
*/ static int -isofs_hashi_common(struct dentry *dentry, struct qstr *qstr, int ms) +isofs_hashi_common(const struct dentry *dentry, struct qstr *qstr, int ms) { const char *name; int len; @@ -248,13 +252,15 @@ static int isofs_dentry_cmp_common( } static int -isofs_hash(struct dentry *dentry, struct qstr *qstr) +isofs_hash(const struct dentry *dentry, const struct inode *inode, + struct qstr *qstr) { return isofs_hash_common(dentry, qstr, 0); } static int -isofs_hashi(struct dentry *dentry, struct qstr *qstr) +isofs_hashi(const struct dentry *dentry, const struct inode *inode, + struct qstr *qstr) { return isofs_hashi_common(dentry, qstr, 0); } @@ -277,13 +283,15 @@ isofs_dentry_cmpi(const struct dentry *parent, const struct inode *pinode, #ifdef CONFIG_JOLIET static int -isofs_hash_ms(struct dentry *dentry, struct qstr *qstr) +isofs_hash_ms(const struct dentry *dentry, const struct inode *inode, + struct qstr *qstr) { return isofs_hash_common(dentry, qstr, 1); } static int -isofs_hashi_ms(struct dentry *dentry, struct qstr *qstr) +isofs_hashi_ms(const struct dentry *dentry, const struct inode *inode, + struct qstr *qstr) { return isofs_hashi_common(dentry, qstr, 1); } diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 92129016cd7..57f90dad891 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -1574,7 +1574,8 @@ const struct file_operations jfs_dir_operations = { .llseek = generic_file_llseek, }; -static int jfs_ci_hash(struct dentry *dir, struct qstr *this) +static int jfs_ci_hash(const struct dentry *dir, const struct inode *inode, + struct qstr *this) { unsigned long hash; int i; diff --git a/fs/namei.c b/fs/namei.c index 4ff7ca53053..f3b5ca40465 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -731,7 +731,8 @@ static int do_lookup(struct nameidata *nd, struct qstr *name, * to use its own hash.. */ if (nd->path.dentry->d_op && nd->path.dentry->d_op->d_hash) { - int err = nd->path.dentry->d_op->d_hash(nd->path.dentry, name); + int err = nd->path.dentry->d_op->d_hash(nd->path.dentry, + nd->path.dentry->d_inode, name); if (err < 0) return err; } @@ -1134,7 +1135,7 @@ static struct dentry *__lookup_hash(struct qstr *name, * to use its own hash.. */ if (base->d_op && base->d_op->d_hash) { - err = base->d_op->d_hash(base, name); + err = base->d_op->d_hash(base, inode, name); dentry = ERR_PTR(err); if (err < 0) goto out; diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index 3bcc68aed41..bbbf7922f42 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -74,7 +74,8 @@ const struct inode_operations ncp_dir_inode_operations = * Dentry operations routines */ static int ncp_lookup_validate(struct dentry *, struct nameidata *); -static int ncp_hash_dentry(struct dentry *, struct qstr *); +static int ncp_hash_dentry(const struct dentry *, const struct inode *, + struct qstr *); static int ncp_compare_dentry(const struct dentry *, const struct inode *, const struct dentry *, const struct inode *, unsigned int, const char *, const struct qstr *); @@ -129,9 +130,10 @@ static inline int ncp_case_sensitive(const struct inode *i) * is case-sensitive. 
*/ static int -ncp_hash_dentry(struct dentry *dentry, struct qstr *this) +ncp_hash_dentry(const struct dentry *dentry, const struct inode *inode, + struct qstr *this) { - if (!ncp_case_sensitive(dentry->d_inode)) { + if (!ncp_case_sensitive(inode)) { struct super_block *sb = dentry->d_sb; struct nls_table *t; unsigned long hash; @@ -597,7 +599,7 @@ ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir, qname.hash = full_name_hash(qname.name, qname.len); if (dentry->d_op && dentry->d_op->d_hash) - if (dentry->d_op->d_hash(dentry, &qname) != 0) + if (dentry->d_op->d_hash(dentry, dentry->d_inode, &qname) != 0) goto end_advance; newdent = d_lookup(dentry, &qname); diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c index 11e7f7d11cd..7507aeb4c90 100644 --- a/fs/sysv/namei.c +++ b/fs/sysv/namei.c @@ -27,7 +27,8 @@ static int add_nondir(struct dentry *dentry, struct inode *inode) return err; } -static int sysv_hash(struct dentry *dentry, struct qstr *qstr) +static int sysv_hash(const struct dentry *dentry, const struct inode *inode, + struct qstr *qstr) { /* Truncate the name in place, avoids having to define a compare function. */ diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 75a072bf2a3..1149e706f04 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -133,7 +133,8 @@ enum dentry_d_lock_class struct dentry_operations { int (*d_revalidate)(struct dentry *, struct nameidata *); - int (*d_hash)(struct dentry *, struct qstr *); + int (*d_hash)(const struct dentry *, const struct inode *, + struct qstr *); int (*d_compare)(const struct dentry *, const struct inode *, const struct dentry *, const struct inode *, unsigned int, const char *, const struct qstr *); -- cgit v1.2.3 From b5c84bf6f6fa3a7dfdcb556023a62953574b60ee Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:38 +1100 Subject: fs: dcache remove dcache_lock dcache_lock no longer protects anything. remove it. 
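As an illustration of the conversion pattern (not part of the patch itself): wherever dcache_lock used to be nested around the per-dentry lock, the outer lock simply goes away and ->d_lock alone protects the dentry, as the d_drop() and usbfs hunks below show. A hypothetical helper written against the new rules, with an invented name:

        #include <linux/dcache.h>
        #include <linux/spinlock.h>

        /* unhash a dentry under the post-dcache_lock locking rules */
        static void example_unhash(struct dentry *dentry)
        {
                /* previously: spin_lock(&dcache_lock) was taken first */
                spin_lock(&dentry->d_lock);
                __d_drop(dentry);       /* needs only ->d_lock now */
                spin_unlock(&dentry->d_lock);
                /* previously: spin_unlock(&dcache_lock) came last */
        }

Per the porting note added below, ->d_lock protects all of the dcache state of a given dentry, so most filesystems need nothing beyond this.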
Signed-off-by: Nick Piggin --- Documentation/filesystems/Locking | 16 +-- Documentation/filesystems/dentry-locking.txt | 40 ++++--- Documentation/filesystems/porting | 8 +- arch/powerpc/platforms/cell/spufs/inode.c | 5 +- drivers/infiniband/hw/ipath/ipath_fs.c | 6 +- drivers/infiniband/hw/qib/qib_fs.c | 3 - drivers/staging/pohmelfs/path_entry.c | 2 - drivers/staging/smbfs/cache.c | 4 - drivers/usb/core/inode.c | 3 - fs/9p/vfs_inode.c | 2 - fs/affs/amigaffs.c | 2 - fs/autofs4/autofs_i.h | 3 + fs/autofs4/expire.c | 10 +- fs/autofs4/root.c | 44 ++++---- fs/autofs4/waitq.c | 7 +- fs/ceph/dir.c | 6 +- fs/ceph/inode.c | 4 - fs/cifs/inode.c | 3 - fs/coda/cache.c | 2 - fs/configfs/configfs_internal.h | 2 - fs/configfs/inode.c | 6 +- fs/dcache.c | 160 ++++----------------------- fs/exportfs/expfs.c | 4 - fs/libfs.c | 8 -- fs/namei.c | 9 +- fs/ncpfs/dir.c | 3 - fs/ncpfs/ncplib_kernel.h | 4 - fs/nfs/dir.c | 3 - fs/nfs/getroot.c | 2 - fs/nfs/namespace.c | 3 - fs/notify/fsnotify.c | 2 - fs/ocfs2/dcache.c | 2 - include/linux/dcache.h | 5 +- include/linux/fs.h | 6 +- include/linux/fsnotify.h | 2 - include/linux/fsnotify_backend.h | 11 +- include/linux/namei.h | 1 - kernel/cgroup.c | 6 - mm/filemap.c | 3 - security/selinux/selinuxfs.c | 4 - 40 files changed, 109 insertions(+), 307 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index a15ee207b44..bdad6414dfa 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -21,14 +21,14 @@ prototypes: char *(*d_dname)((struct dentry *dentry, char *buffer, int buflen); locking rules: - dcache_lock rename_lock ->d_lock may block -d_revalidate: no no no yes -d_hash no no no no -d_compare: no yes no no -d_delete: yes no yes no -d_release: no no no yes -d_iput: no no no yes -d_dname: no no no no + rename_lock ->d_lock may block +d_revalidate: no no yes +d_hash no no no +d_compare: yes no no +d_delete: no yes no +d_release: no no yes +d_iput: no no yes +d_dname: no no no --------------------------- inode_operations --------------------------- prototypes: diff --git a/Documentation/filesystems/dentry-locking.txt b/Documentation/filesystems/dentry-locking.txt index 79334ed5daa..30b6a40f565 100644 --- a/Documentation/filesystems/dentry-locking.txt +++ b/Documentation/filesystems/dentry-locking.txt @@ -31,6 +31,7 @@ significant change is the way d_lookup traverses the hash chain, it doesn't acquire the dcache_lock for this and rely on RCU to ensure that the dentry has not been *freed*. +dcache_lock no longer exists, dentry locking is explained in fs/dcache.c Dcache locking details ====================== @@ -50,14 +51,12 @@ Safe lock-free look-up of dcache hash table Dcache is a complex data structure with the hash table entries also linked together in other lists. In 2.4 kernel, dcache_lock protected -all the lists. We applied RCU only on hash chain walking. The rest of -the lists are still protected by dcache_lock. Some of the important -changes are : +all the lists. RCU dentry hash walking works like this: 1. The deletion from hash chain is done using hlist_del_rcu() macro which doesn't initialize next pointer of the deleted dentry and this allows us to walk safely lock-free while a deletion is - happening. + happening. This is a standard hlist_rcu iteration. 2. 
Insertion of a dentry into the hash table is done using hlist_add_head_rcu() which take care of ordering the writes - the @@ -66,19 +65,18 @@ changes are : which has since been replaced by hlist_for_each_entry_rcu(), while walking the hash chain. The only requirement is that all initialization to the dentry must be done before - hlist_add_head_rcu() since we don't have dcache_lock protection - while traversing the hash chain. This isn't different from the - existing code. - -3. The dentry looked up without holding dcache_lock by cannot be - returned for walking if it is unhashed. It then may have a NULL - d_inode or other bogosity since RCU doesn't protect the other - fields in the dentry. We therefore use a flag DCACHE_UNHASHED to - indicate unhashed dentries and use this in conjunction with a - per-dentry lock (d_lock). Once looked up without the dcache_lock, - we acquire the per-dentry lock (d_lock) and check if the dentry is - unhashed. If so, the look-up is failed. If not, the reference count - of the dentry is increased and the dentry is returned. + hlist_add_head_rcu() since we don't have lock protection + while traversing the hash chain. + +3. The dentry looked up without holding locks cannot be returned for + walking if it is unhashed. It then may have a NULL d_inode or other + bogosity since RCU doesn't protect the other fields in the dentry. We + therefore use a flag DCACHE_UNHASHED to indicate unhashed dentries + and use this in conjunction with a per-dentry lock (d_lock). Once + looked up without locks, we acquire the per-dentry lock (d_lock) and + check if the dentry is unhashed. If so, the look-up is failed. If not, + the reference count of the dentry is increased and the dentry is + returned. 4. Once a dentry is looked up, it must be ensured during the path walk for that component it doesn't go away. In pre-2.5.10 code, this was @@ -86,10 +84,10 @@ changes are : In some sense, dcache_rcu path walking looks like the pre-2.5.10 version. -5. All dentry hash chain updates must take the dcache_lock as well as - the per-dentry lock in that order. dput() does this to ensure that - a dentry that has just been looked up in another CPU doesn't get - deleted before dget() can be done on it. +5. All dentry hash chain updates must take the per-dentry lock (see + fs/dcache.c). This excludes dput() to ensure that a dentry that has + been looked up concurrently does not get deleted before dget() can + take a ref. 6. There are several ways to do reference counting of RCU protected objects. One such example is in ipv4 route cache where deferred diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting index 9fd31940a8e..1eb76959d09 100644 --- a/Documentation/filesystems/porting +++ b/Documentation/filesystems/porting @@ -216,7 +216,6 @@ had ->revalidate()) add calls in ->follow_link()/->readlink(). ->d_parent changes are not protected by BKL anymore. Read access is safe if at least one of the following is true: * filesystem has no cross-directory rename() - * dcache_lock is held * we know that parent had been locked (e.g. we are looking at ->d_parent of ->lookup() argument). * we are called from ->rename(). @@ -340,3 +339,10 @@ look at examples of other filesystems) for guidance. .d_hash() calling convention and locking rules are significantly changed. Read updated documentation in Documentation/filesystems/vfs.txt (and look at examples of other filesystems) for guidance. + +--- +[mandatory] + dcache_lock is gone, replaced by fine grained locks. 
See fs/dcache.c +for details of what locks to replace dcache_lock with in order to protect +particular things. Most of the time, a filesystem only needs ->d_lock, which +protects *all* the dcache state of a given dentry. diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c index 5aef1a7f5e4..2662b50ea8d 100644 --- a/arch/powerpc/platforms/cell/spufs/inode.c +++ b/arch/powerpc/platforms/cell/spufs/inode.c @@ -159,21 +159,18 @@ static void spufs_prune_dir(struct dentry *dir) mutex_lock(&dir->d_inode->i_mutex); list_for_each_entry_safe(dentry, tmp, &dir->d_subdirs, d_u.d_child) { - spin_lock(&dcache_lock); spin_lock(&dentry->d_lock); if (!(d_unhashed(dentry)) && dentry->d_inode) { dget_locked_dlock(dentry); __d_drop(dentry); spin_unlock(&dentry->d_lock); simple_unlink(dir->d_inode, dentry); - /* XXX: what is dcache_lock protecting here? Other + /* XXX: what was dcache_lock protecting here? Other * filesystems (IB, configfs) release dcache_lock * before unlink */ - spin_unlock(&dcache_lock); dput(dentry); } else { spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); } } shrink_dcache_parent(dir); diff --git a/drivers/infiniband/hw/ipath/ipath_fs.c b/drivers/infiniband/hw/ipath/ipath_fs.c index 18aee04a841..925e88227de 100644 --- a/drivers/infiniband/hw/ipath/ipath_fs.c +++ b/drivers/infiniband/hw/ipath/ipath_fs.c @@ -277,18 +277,14 @@ static int remove_file(struct dentry *parent, char *name) goto bail; } - spin_lock(&dcache_lock); spin_lock(&tmp->d_lock); if (!(d_unhashed(tmp) && tmp->d_inode)) { dget_locked_dlock(tmp); __d_drop(tmp); spin_unlock(&tmp->d_lock); - spin_unlock(&dcache_lock); simple_unlink(parent->d_inode, tmp); - } else { + } else spin_unlock(&tmp->d_lock); - spin_unlock(&dcache_lock); - } ret = 0; bail: diff --git a/drivers/infiniband/hw/qib/qib_fs.c b/drivers/infiniband/hw/qib/qib_fs.c index fe4b242f009..49af4a6538b 100644 --- a/drivers/infiniband/hw/qib/qib_fs.c +++ b/drivers/infiniband/hw/qib/qib_fs.c @@ -453,17 +453,14 @@ static int remove_file(struct dentry *parent, char *name) goto bail; } - spin_lock(&dcache_lock); spin_lock(&tmp->d_lock); if (!(d_unhashed(tmp) && tmp->d_inode)) { dget_locked_dlock(tmp); __d_drop(tmp); spin_unlock(&tmp->d_lock); - spin_unlock(&dcache_lock); simple_unlink(parent->d_inode, tmp); } else { spin_unlock(&tmp->d_lock); - spin_unlock(&dcache_lock); } ret = 0; diff --git a/drivers/staging/pohmelfs/path_entry.c b/drivers/staging/pohmelfs/path_entry.c index bbe42f42ca8..400a9fc386a 100644 --- a/drivers/staging/pohmelfs/path_entry.c +++ b/drivers/staging/pohmelfs/path_entry.c @@ -101,7 +101,6 @@ rename_retry: d = first; seq = read_seqbegin(&rename_lock); rcu_read_lock(); - spin_lock(&dcache_lock); if (!IS_ROOT(d) && d_unhashed(d)) len += UNHASHED_OBSCURE_STRING_SIZE; /* Obscure " (deleted)" string */ @@ -110,7 +109,6 @@ rename_retry: len += d->d_name.len + 1; /* Plus slash */ d = d->d_parent; } - spin_unlock(&dcache_lock); rcu_read_unlock(); if (read_seqretry(&rename_lock, seq)) goto rename_retry; diff --git a/drivers/staging/smbfs/cache.c b/drivers/staging/smbfs/cache.c index 920434b6c07..75dfd403fb9 100644 --- a/drivers/staging/smbfs/cache.c +++ b/drivers/staging/smbfs/cache.c @@ -62,7 +62,6 @@ smb_invalidate_dircache_entries(struct dentry *parent) struct list_head *next; struct dentry *dentry; - spin_lock(&dcache_lock); spin_lock(&parent->d_lock); next = parent->d_subdirs.next; while (next != &parent->d_subdirs) { @@ -72,7 +71,6 @@ smb_invalidate_dircache_entries(struct dentry *parent) next = 
next->next; } spin_unlock(&parent->d_lock); - spin_unlock(&dcache_lock); } /* @@ -98,7 +96,6 @@ smb_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos) } /* If a pointer is invalid, we search the dentry. */ - spin_lock(&dcache_lock); spin_lock(&parent->d_lock); next = parent->d_subdirs.next; while (next != &parent->d_subdirs) { @@ -115,7 +112,6 @@ smb_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos) dent = NULL; out_unlock: spin_unlock(&parent->d_lock); - spin_unlock(&dcache_lock); return dent; } diff --git a/drivers/usb/core/inode.c b/drivers/usb/core/inode.c index 89a0e836658..1b125c224dc 100644 --- a/drivers/usb/core/inode.c +++ b/drivers/usb/core/inode.c @@ -343,7 +343,6 @@ static int usbfs_empty (struct dentry *dentry) { struct list_head *list; - spin_lock(&dcache_lock); spin_lock(&dentry->d_lock); list_for_each(list, &dentry->d_subdirs) { struct dentry *de = list_entry(list, struct dentry, d_u.d_child); @@ -352,13 +351,11 @@ static int usbfs_empty (struct dentry *dentry) if (usbfs_positive(de)) { spin_unlock(&de->d_lock); spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); return 0; } spin_unlock(&de->d_lock); } spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); return 1; } diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 47dfd5d29a6..1073bca8488 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -270,13 +270,11 @@ static struct dentry *v9fs_dentry_from_dir_inode(struct inode *inode) { struct dentry *dentry; - spin_lock(&dcache_lock); spin_lock(&dcache_inode_lock); /* Directory should have only one entry. */ BUG_ON(S_ISDIR(inode->i_mode) && !list_is_singular(&inode->i_dentry)); dentry = list_entry(inode->i_dentry.next, struct dentry, d_alias); spin_unlock(&dcache_inode_lock); - spin_unlock(&dcache_lock); return dentry; } diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c index 2321cc92d44..600101a21ba 100644 --- a/fs/affs/amigaffs.c +++ b/fs/affs/amigaffs.c @@ -128,7 +128,6 @@ affs_fix_dcache(struct dentry *dentry, u32 entry_ino) void *data = dentry->d_fsdata; struct list_head *head, *next; - spin_lock(&dcache_lock); spin_lock(&dcache_inode_lock); head = &inode->i_dentry; next = head->next; @@ -141,7 +140,6 @@ affs_fix_dcache(struct dentry *dentry, u32 entry_ino) next = next->next; } spin_unlock(&dcache_inode_lock); - spin_unlock(&dcache_lock); } diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index 9d2ae9b30d9..0fffe1c24ce 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -16,6 +16,7 @@ #include #include #include +#include #include /* This is the range of ioctl() numbers we claim as ours */ @@ -60,6 +61,8 @@ do { \ current->pid, __func__, ##args); \ } while (0) +extern spinlock_t autofs4_lock; + /* Unified info structure. This is pointed to by both the dentry and inode structures. Each file in the filesystem has an instance of this structure. 
It holds a reference to the dentry, so dentries are never diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index 968c1434af6..2f7951d67d1 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -102,7 +102,7 @@ static struct dentry *get_next_positive_dentry(struct dentry *prev, if (prev == NULL) return dget(prev); - spin_lock(&dcache_lock); + spin_lock(&autofs4_lock); relock: p = prev; spin_lock(&p->d_lock); @@ -114,7 +114,7 @@ again: if (p == root) { spin_unlock(&p->d_lock); - spin_unlock(&dcache_lock); + spin_unlock(&autofs4_lock); dput(prev); return NULL; } @@ -144,7 +144,7 @@ again: dget_dlock(ret); spin_unlock(&ret->d_lock); spin_unlock(&p->d_lock); - spin_unlock(&dcache_lock); + spin_unlock(&autofs4_lock); dput(prev); @@ -408,13 +408,13 @@ found: ino->flags |= AUTOFS_INF_EXPIRING; init_completion(&ino->expire_complete); spin_unlock(&sbi->fs_lock); - spin_lock(&dcache_lock); + spin_lock(&autofs4_lock); spin_lock(&expired->d_parent->d_lock); spin_lock_nested(&expired->d_lock, DENTRY_D_LOCK_NESTED); list_move(&expired->d_parent->d_subdirs, &expired->d_u.d_child); spin_unlock(&expired->d_lock); spin_unlock(&expired->d_parent->d_lock); - spin_unlock(&dcache_lock); + spin_unlock(&autofs4_lock); return expired; } diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 7a9ed6b8829..10ca68a96dc 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -23,6 +23,8 @@ #include "autofs_i.h" +DEFINE_SPINLOCK(autofs4_lock); + static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *); static int autofs4_dir_unlink(struct inode *,struct dentry *); static int autofs4_dir_rmdir(struct inode *,struct dentry *); @@ -142,15 +144,15 @@ static int autofs4_dir_open(struct inode *inode, struct file *file) * autofs file system so just let the libfs routines handle * it. 
*/ - spin_lock(&dcache_lock); + spin_lock(&autofs4_lock); spin_lock(&dentry->d_lock); if (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) { spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); + spin_unlock(&autofs4_lock); return -ENOENT; } spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); + spin_unlock(&autofs4_lock); out: return dcache_dir_open(inode, file); @@ -255,11 +257,11 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd) /* We trigger a mount for almost all flags */ lookup_type = autofs4_need_mount(nd->flags); spin_lock(&sbi->fs_lock); - spin_lock(&dcache_lock); + spin_lock(&autofs4_lock); spin_lock(&dentry->d_lock); if (!(lookup_type || ino->flags & AUTOFS_INF_PENDING)) { spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); + spin_unlock(&autofs4_lock); spin_unlock(&sbi->fs_lock); goto follow; } @@ -272,7 +274,7 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd) if (ino->flags & AUTOFS_INF_PENDING || (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs))) { spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); + spin_unlock(&autofs4_lock); spin_unlock(&sbi->fs_lock); status = try_to_fill_dentry(dentry, nd->flags); @@ -282,7 +284,7 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd) goto follow; } spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); + spin_unlock(&autofs4_lock); spin_unlock(&sbi->fs_lock); follow: /* @@ -353,14 +355,14 @@ static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd) return 0; /* Check for a non-mountpoint directory with no contents */ - spin_lock(&dcache_lock); + spin_lock(&autofs4_lock); spin_lock(&dentry->d_lock); if (S_ISDIR(dentry->d_inode->i_mode) && !d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) { DPRINTK("dentry=%p %.*s, emptydir", dentry, dentry->d_name.len, dentry->d_name.name); spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); + spin_unlock(&autofs4_lock); /* The daemon never causes a mount to trigger */ if (oz_mode) @@ -377,7 +379,7 @@ static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd) return status; } spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); + spin_unlock(&autofs4_lock); return 1; } @@ -432,7 +434,7 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry) const unsigned char *str = name->name; struct list_head *p, *head; - spin_lock(&dcache_lock); + spin_lock(&autofs4_lock); spin_lock(&sbi->lookup_lock); head = &sbi->active_list; list_for_each(p, head) { @@ -465,14 +467,14 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry) dget_dlock(active); spin_unlock(&active->d_lock); spin_unlock(&sbi->lookup_lock); - spin_unlock(&dcache_lock); + spin_unlock(&autofs4_lock); return active; } next: spin_unlock(&active->d_lock); } spin_unlock(&sbi->lookup_lock); - spin_unlock(&dcache_lock); + spin_unlock(&autofs4_lock); return NULL; } @@ -487,7 +489,7 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry) const unsigned char *str = name->name; struct list_head *p, *head; - spin_lock(&dcache_lock); + spin_lock(&autofs4_lock); spin_lock(&sbi->lookup_lock); head = &sbi->expiring_list; list_for_each(p, head) { @@ -520,14 +522,14 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry) dget_dlock(expiring); spin_unlock(&expiring->d_lock); spin_unlock(&sbi->lookup_lock); - spin_unlock(&dcache_lock); + spin_unlock(&autofs4_lock); return expiring; } next: spin_unlock(&expiring->d_lock); } 
spin_unlock(&sbi->lookup_lock); - spin_unlock(&dcache_lock); + spin_unlock(&autofs4_lock); return NULL; } @@ -763,12 +765,12 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry) dir->i_mtime = CURRENT_TIME; - spin_lock(&dcache_lock); + spin_lock(&autofs4_lock); autofs4_add_expiring(dentry); spin_lock(&dentry->d_lock); __d_drop(dentry); spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); + spin_unlock(&autofs4_lock); return 0; } @@ -785,20 +787,20 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry) if (!autofs4_oz_mode(sbi)) return -EACCES; - spin_lock(&dcache_lock); + spin_lock(&autofs4_lock); spin_lock(&sbi->lookup_lock); spin_lock(&dentry->d_lock); if (!list_empty(&dentry->d_subdirs)) { spin_unlock(&dentry->d_lock); spin_unlock(&sbi->lookup_lock); - spin_unlock(&dcache_lock); + spin_unlock(&autofs4_lock); return -ENOTEMPTY; } __autofs4_add_expiring(dentry); spin_unlock(&sbi->lookup_lock); __d_drop(dentry); spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); + spin_unlock(&autofs4_lock); if (atomic_dec_and_test(&ino->count)) { p_ino = autofs4_dentry_ino(dentry->d_parent); diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 4be8f778a41..c5f8459c905 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -194,14 +194,15 @@ static int autofs4_getpath(struct autofs_sb_info *sbi, rename_retry: buf = *name; len = 0; + seq = read_seqbegin(&rename_lock); rcu_read_lock(); - spin_lock(&dcache_lock); + spin_lock(&autofs4_lock); for (tmp = dentry ; tmp != root ; tmp = tmp->d_parent) len += tmp->d_name.len + 1; if (!len || --len > NAME_MAX) { - spin_unlock(&dcache_lock); + spin_unlock(&autofs4_lock); rcu_read_unlock(); if (read_seqretry(&rename_lock, seq)) goto rename_retry; @@ -217,7 +218,7 @@ rename_retry: p -= tmp->d_name.len; strncpy(p, tmp->d_name.name, tmp->d_name.len); } - spin_unlock(&dcache_lock); + spin_unlock(&autofs4_lock); rcu_read_unlock(); if (read_seqretry(&rename_lock, seq)) goto rename_retry; diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 2c924e8d85f..58abc3da611 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -112,7 +112,6 @@ static int __dcache_readdir(struct file *filp, dout("__dcache_readdir %p at %llu (last %p)\n", dir, filp->f_pos, last); - spin_lock(&dcache_lock); spin_lock(&parent->d_lock); /* start at beginning? 
*/ @@ -156,7 +155,6 @@ more: dget_dlock(dentry); spin_unlock(&dentry->d_lock); spin_unlock(&parent->d_lock); - spin_unlock(&dcache_lock); dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos, dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode); @@ -182,21 +180,19 @@ more: filp->f_pos++; - /* make sure a dentry wasn't dropped while we didn't have dcache_lock */ + /* make sure a dentry wasn't dropped while we didn't have parent lock */ if (!ceph_i_test(dir, CEPH_I_COMPLETE)) { dout(" lost I_COMPLETE on %p; falling back to mds\n", dir); err = -EAGAIN; goto out; } - spin_lock(&dcache_lock); spin_lock(&parent->d_lock); p = p->prev; /* advance to next dentry */ goto more; out_unlock: spin_unlock(&parent->d_lock); - spin_unlock(&dcache_lock); out: if (last) dput(last); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 2c694447336..2a48cafc174 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -841,7 +841,6 @@ static void ceph_set_dentry_offset(struct dentry *dn) di->offset = ceph_inode(inode)->i_max_offset++; spin_unlock(&inode->i_lock); - spin_lock(&dcache_lock); spin_lock(&dir->d_lock); spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED); list_move(&dn->d_u.d_child, &dir->d_subdirs); @@ -849,7 +848,6 @@ static void ceph_set_dentry_offset(struct dentry *dn) dn->d_u.d_child.prev, dn->d_u.d_child.next); spin_unlock(&dn->d_lock); spin_unlock(&dir->d_lock); - spin_unlock(&dcache_lock); } /* @@ -1233,13 +1231,11 @@ retry_lookup: goto retry_lookup; } else { /* reorder parent's d_subdirs */ - spin_lock(&dcache_lock); spin_lock(&parent->d_lock); spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED); list_move(&dn->d_u.d_child, &parent->d_subdirs); spin_unlock(&dn->d_lock); spin_unlock(&parent->d_lock); - spin_unlock(&dcache_lock); } di = dn->d_fsdata; diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 003698365ec..99b9a2cc14b 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -809,17 +809,14 @@ inode_has_hashed_dentries(struct inode *inode) { struct dentry *dentry; - spin_lock(&dcache_lock); spin_lock(&dcache_inode_lock); list_for_each_entry(dentry, &inode->i_dentry, d_alias) { if (!d_unhashed(dentry) || IS_ROOT(dentry)) { spin_unlock(&dcache_inode_lock); - spin_unlock(&dcache_lock); return true; } } spin_unlock(&dcache_inode_lock); - spin_unlock(&dcache_lock); return false; } diff --git a/fs/coda/cache.c b/fs/coda/cache.c index 859393fca2b..5525e1c660f 100644 --- a/fs/coda/cache.c +++ b/fs/coda/cache.c @@ -93,7 +93,6 @@ static void coda_flag_children(struct dentry *parent, int flag) struct list_head *child; struct dentry *de; - spin_lock(&dcache_lock); spin_lock(&parent->d_lock); list_for_each(child, &parent->d_subdirs) { @@ -104,7 +103,6 @@ static void coda_flag_children(struct dentry *parent, int flag) coda_flag_inode(de->d_inode, flag); } spin_unlock(&parent->d_lock); - spin_unlock(&dcache_lock); return; } diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h index e58b4c30e21..026cf68553a 100644 --- a/fs/configfs/configfs_internal.h +++ b/fs/configfs/configfs_internal.h @@ -120,7 +120,6 @@ static inline struct config_item *configfs_get_config_item(struct dentry *dentry { struct config_item * item = NULL; - spin_lock(&dcache_lock); spin_lock(&dentry->d_lock); if (!d_unhashed(dentry)) { struct configfs_dirent * sd = dentry->d_fsdata; @@ -131,7 +130,6 @@ static inline struct config_item *configfs_get_config_item(struct dentry *dentry item = config_item_get(sd->s_element); } spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); return 
item; } diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index 79b37765d8f..fb3a55fff82 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -250,18 +250,14 @@ void configfs_drop_dentry(struct configfs_dirent * sd, struct dentry * parent) struct dentry * dentry = sd->s_dentry; if (dentry) { - spin_lock(&dcache_lock); spin_lock(&dentry->d_lock); if (!(d_unhashed(dentry) && dentry->d_inode)) { dget_locked_dlock(dentry); __d_drop(dentry); spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); simple_unlink(parent->d_inode, dentry); - } else { + } else spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); - } } } diff --git a/fs/dcache.c b/fs/dcache.c index a9bc4ecc21e..0dbae053b66 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -54,11 +54,10 @@ * - d_alias, d_inode * * Ordering: - * dcache_lock - * dcache_inode_lock - * dentry->d_lock - * dcache_lru_lock - * dcache_hash_lock + * dcache_inode_lock + * dentry->d_lock + * dcache_lru_lock + * dcache_hash_lock * * If there is an ancestor relationship: * dentry->d_parent->...->d_parent->d_lock @@ -77,12 +76,10 @@ EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure); __cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_inode_lock); static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_hash_lock); static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_lru_lock); -__cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_lock); __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock); EXPORT_SYMBOL(rename_lock); EXPORT_SYMBOL(dcache_inode_lock); -EXPORT_SYMBOL(dcache_lock); static struct kmem_cache *dentry_cache __read_mostly; @@ -139,7 +136,7 @@ static void __d_free(struct rcu_head *head) } /* - * no dcache_lock, please. + * no locks, please. */ static void d_free(struct dentry *dentry) { @@ -162,7 +159,6 @@ static void d_free(struct dentry *dentry) static void dentry_iput(struct dentry * dentry) __releases(dentry->d_lock) __releases(dcache_inode_lock) - __releases(dcache_lock) { struct inode *inode = dentry->d_inode; if (inode) { @@ -170,7 +166,6 @@ static void dentry_iput(struct dentry * dentry) list_del_init(&dentry->d_alias); spin_unlock(&dentry->d_lock); spin_unlock(&dcache_inode_lock); - spin_unlock(&dcache_lock); if (!inode->i_nlink) fsnotify_inoderemove(inode); if (dentry->d_op && dentry->d_op->d_iput) @@ -180,7 +175,6 @@ static void dentry_iput(struct dentry * dentry) } else { spin_unlock(&dentry->d_lock); spin_unlock(&dcache_inode_lock); - spin_unlock(&dcache_lock); } } @@ -235,14 +229,13 @@ static void dentry_lru_move_tail(struct dentry *dentry) * * If this is the root of the dentry tree, return NULL. * - * dcache_lock and d_lock and d_parent->d_lock must be held by caller, and - * are dropped by d_kill. + * dentry->d_lock and parent->d_lock must be held by caller, and are dropped by + * d_kill. 
*/ static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent) __releases(dentry->d_lock) __releases(parent->d_lock) __releases(dcache_inode_lock) - __releases(dcache_lock) { dentry->d_parent = NULL; list_del(&dentry->d_u.d_child); @@ -285,11 +278,9 @@ EXPORT_SYMBOL(__d_drop); void d_drop(struct dentry *dentry) { - spin_lock(&dcache_lock); spin_lock(&dentry->d_lock); __d_drop(dentry); spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); } EXPORT_SYMBOL(d_drop); @@ -337,21 +328,10 @@ repeat: else parent = dentry->d_parent; if (dentry->d_count == 1) { - if (!spin_trylock(&dcache_lock)) { - /* - * Something of a livelock possibility we could avoid - * by taking dcache_lock and trying again, but we - * want to reduce dcache_lock anyway so this will - * get improved. - */ -drop1: - spin_unlock(&dentry->d_lock); - goto repeat; - } if (!spin_trylock(&dcache_inode_lock)) { drop2: - spin_unlock(&dcache_lock); - goto drop1; + spin_unlock(&dentry->d_lock); + goto repeat; } if (parent && !spin_trylock(&parent->d_lock)) { spin_unlock(&dcache_inode_lock); @@ -363,7 +343,6 @@ drop2: spin_unlock(&dentry->d_lock); if (parent) spin_unlock(&parent->d_lock); - spin_unlock(&dcache_lock); return; } @@ -387,7 +366,6 @@ drop2: if (parent) spin_unlock(&parent->d_lock); spin_unlock(&dcache_inode_lock); - spin_unlock(&dcache_lock); return; unhash_it: @@ -418,11 +396,9 @@ int d_invalidate(struct dentry * dentry) /* * If it's already been dropped, return OK. */ - spin_lock(&dcache_lock); spin_lock(&dentry->d_lock); if (d_unhashed(dentry)) { spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); return 0; } /* @@ -431,9 +407,7 @@ int d_invalidate(struct dentry * dentry) */ if (!list_empty(&dentry->d_subdirs)) { spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); shrink_dcache_parent(dentry); - spin_lock(&dcache_lock); spin_lock(&dentry->d_lock); } @@ -450,19 +424,17 @@ int d_invalidate(struct dentry * dentry) if (dentry->d_count > 1) { if (dentry->d_inode && S_ISDIR(dentry->d_inode->i_mode)) { spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); return -EBUSY; } } __d_drop(dentry); spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); return 0; } EXPORT_SYMBOL(d_invalidate); -/* This must be called with dcache_lock and d_lock held */ +/* This must be called with d_lock held */ static inline struct dentry * __dget_locked_dlock(struct dentry *dentry) { dentry->d_count++; @@ -470,7 +442,7 @@ static inline struct dentry * __dget_locked_dlock(struct dentry *dentry) return dentry; } -/* This should be called _only_ with dcache_lock held */ +/* This must be called with d_lock held */ static inline struct dentry * __dget_locked(struct dentry *dentry) { spin_lock(&dentry->d_lock); @@ -575,11 +547,9 @@ struct dentry *d_find_alias(struct inode *inode) struct dentry *de = NULL; if (!list_empty(&inode->i_dentry)) { - spin_lock(&dcache_lock); spin_lock(&dcache_inode_lock); de = __d_find_alias(inode, 0); spin_unlock(&dcache_inode_lock); - spin_unlock(&dcache_lock); } return de; } @@ -593,7 +563,6 @@ void d_prune_aliases(struct inode *inode) { struct dentry *dentry; restart: - spin_lock(&dcache_lock); spin_lock(&dcache_inode_lock); list_for_each_entry(dentry, &inode->i_dentry, d_alias) { spin_lock(&dentry->d_lock); @@ -602,14 +571,12 @@ restart: __d_drop(dentry); spin_unlock(&dentry->d_lock); spin_unlock(&dcache_inode_lock); - spin_unlock(&dcache_lock); dput(dentry); goto restart; } spin_unlock(&dentry->d_lock); } spin_unlock(&dcache_inode_lock); - spin_unlock(&dcache_lock); } 
EXPORT_SYMBOL(d_prune_aliases); @@ -625,17 +592,14 @@ static void prune_one_dentry(struct dentry *dentry, struct dentry *parent) __releases(dentry->d_lock) __releases(parent->d_lock) __releases(dcache_inode_lock) - __releases(dcache_lock) { __d_drop(dentry); dentry = d_kill(dentry, parent); /* - * Prune ancestors. Locking is simpler than in dput(), - * because dcache_lock needs to be taken anyway. + * Prune ancestors. */ while (dentry) { - spin_lock(&dcache_lock); spin_lock(&dcache_inode_lock); again: spin_lock(&dentry->d_lock); @@ -653,7 +617,6 @@ again: spin_unlock(&parent->d_lock); spin_unlock(&dentry->d_lock); spin_unlock(&dcache_inode_lock); - spin_unlock(&dcache_lock); return; } @@ -702,8 +665,7 @@ relock: spin_unlock(&dcache_lru_lock); prune_one_dentry(dentry, parent); - /* dcache_lock, dcache_inode_lock and dentry->d_lock dropped */ - spin_lock(&dcache_lock); + /* dcache_inode_lock and dentry->d_lock dropped */ spin_lock(&dcache_inode_lock); spin_lock(&dcache_lru_lock); } @@ -725,7 +687,6 @@ static void __shrink_dcache_sb(struct super_block *sb, int *count, int flags) LIST_HEAD(tmp); int cnt = *count; - spin_lock(&dcache_lock); spin_lock(&dcache_inode_lock); relock: spin_lock(&dcache_lru_lock); @@ -766,7 +727,6 @@ relock: list_splice(&referenced, &sb->s_dentry_lru); spin_unlock(&dcache_lru_lock); spin_unlock(&dcache_inode_lock); - spin_unlock(&dcache_lock); } /** @@ -788,7 +748,6 @@ static void prune_dcache(int count) if (unused == 0 || count == 0) return; - spin_lock(&dcache_lock); if (count >= unused) prune_ratio = 1; else @@ -825,11 +784,9 @@ static void prune_dcache(int count) if (down_read_trylock(&sb->s_umount)) { if ((sb->s_root != NULL) && (!list_empty(&sb->s_dentry_lru))) { - spin_unlock(&dcache_lock); __shrink_dcache_sb(sb, &w_count, DCACHE_REFERENCED); pruned -= w_count; - spin_lock(&dcache_lock); } up_read(&sb->s_umount); } @@ -845,7 +802,6 @@ static void prune_dcache(int count) if (p) __put_super(p); spin_unlock(&sb_lock); - spin_unlock(&dcache_lock); } /** @@ -859,7 +815,6 @@ void shrink_dcache_sb(struct super_block *sb) { LIST_HEAD(tmp); - spin_lock(&dcache_lock); spin_lock(&dcache_inode_lock); spin_lock(&dcache_lru_lock); while (!list_empty(&sb->s_dentry_lru)) { @@ -868,7 +823,6 @@ void shrink_dcache_sb(struct super_block *sb) } spin_unlock(&dcache_lru_lock); spin_unlock(&dcache_inode_lock); - spin_unlock(&dcache_lock); } EXPORT_SYMBOL(shrink_dcache_sb); @@ -885,12 +839,10 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry) BUG_ON(!IS_ROOT(dentry)); /* detach this root from the system */ - spin_lock(&dcache_lock); spin_lock(&dentry->d_lock); dentry_lru_del(dentry); __d_drop(dentry); spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); for (;;) { /* descend to the first leaf in the current subtree */ @@ -899,7 +851,6 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry) /* this is a branch with children - detach all of them * from the system in one go */ - spin_lock(&dcache_lock); spin_lock(&dentry->d_lock); list_for_each_entry(loop, &dentry->d_subdirs, d_u.d_child) { @@ -910,7 +861,6 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry) spin_unlock(&loop->d_lock); } spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); /* move to the first child */ dentry = list_entry(dentry->d_subdirs.next, @@ -977,8 +927,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry) /* * destroy the dentries attached to a superblock on unmounting - * - we don't need to use dentry->d_lock, and only need 
dcache_lock when - * removing the dentry from the system lists and hashes because: + * - we don't need to use dentry->d_lock because: * - the superblock is detached from all mountings and open files, so the * dentry trees will not be rearranged by the VFS * - s_umount is write-locked, so the memory pressure shrinker will ignore @@ -1029,7 +978,6 @@ rename_retry: this_parent = parent; seq = read_seqbegin(&rename_lock); - spin_lock(&dcache_lock); if (d_mountpoint(parent)) goto positive; spin_lock(&this_parent->d_lock); @@ -1075,7 +1023,6 @@ resume: if (this_parent != child->d_parent || read_seqretry(&rename_lock, seq)) { spin_unlock(&this_parent->d_lock); - spin_unlock(&dcache_lock); rcu_read_unlock(); goto rename_retry; } @@ -1084,12 +1031,10 @@ resume: goto resume; } spin_unlock(&this_parent->d_lock); - spin_unlock(&dcache_lock); if (read_seqretry(&rename_lock, seq)) goto rename_retry; return 0; /* No mount points found in tree */ positive: - spin_unlock(&dcache_lock); if (read_seqretry(&rename_lock, seq)) goto rename_retry; return 1; @@ -1121,7 +1066,6 @@ rename_retry: this_parent = parent; seq = read_seqbegin(&rename_lock); - spin_lock(&dcache_lock); spin_lock(&this_parent->d_lock); repeat: next = this_parent->d_subdirs.next; @@ -1185,7 +1129,6 @@ resume: if (this_parent != child->d_parent || read_seqretry(&rename_lock, seq)) { spin_unlock(&this_parent->d_lock); - spin_unlock(&dcache_lock); rcu_read_unlock(); goto rename_retry; } @@ -1195,7 +1138,6 @@ resume: } out: spin_unlock(&this_parent->d_lock); - spin_unlock(&dcache_lock); if (read_seqretry(&rename_lock, seq)) goto rename_retry; return found; @@ -1297,7 +1239,6 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name) INIT_LIST_HEAD(&dentry->d_u.d_child); if (parent) { - spin_lock(&dcache_lock); spin_lock(&parent->d_lock); spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); dentry->d_parent = dget_dlock(parent); @@ -1305,7 +1246,6 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name) list_add(&dentry->d_u.d_child, &parent->d_subdirs); spin_unlock(&dentry->d_lock); spin_unlock(&parent->d_lock); - spin_unlock(&dcache_lock); } this_cpu_inc(nr_dentry); @@ -1325,7 +1265,6 @@ struct dentry *d_alloc_name(struct dentry *parent, const char *name) } EXPORT_SYMBOL(d_alloc_name); -/* the caller must hold dcache_lock */ static void __d_instantiate(struct dentry *dentry, struct inode *inode) { spin_lock(&dentry->d_lock); @@ -1354,11 +1293,9 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode) void d_instantiate(struct dentry *entry, struct inode * inode) { BUG_ON(!list_empty(&entry->d_alias)); - spin_lock(&dcache_lock); spin_lock(&dcache_inode_lock); __d_instantiate(entry, inode); spin_unlock(&dcache_inode_lock); - spin_unlock(&dcache_lock); security_d_instantiate(entry, inode); } EXPORT_SYMBOL(d_instantiate); @@ -1422,11 +1359,9 @@ struct dentry *d_instantiate_unique(struct dentry *entry, struct inode *inode) BUG_ON(!list_empty(&entry->d_alias)); - spin_lock(&dcache_lock); spin_lock(&dcache_inode_lock); result = __d_instantiate_unique(entry, inode); spin_unlock(&dcache_inode_lock); - spin_unlock(&dcache_lock); if (!result) { security_d_instantiate(entry, inode); @@ -1515,12 +1450,11 @@ struct dentry *d_obtain_alias(struct inode *inode) } tmp->d_parent = tmp; /* make sure dput doesn't croak */ - spin_lock(&dcache_lock); + spin_lock(&dcache_inode_lock); res = __d_find_alias(inode, 0); if (res) { spin_unlock(&dcache_inode_lock); - spin_unlock(&dcache_lock); dput(tmp); goto out_iput; 
} @@ -1538,7 +1472,6 @@ struct dentry *d_obtain_alias(struct inode *inode) spin_unlock(&tmp->d_lock); spin_unlock(&dcache_inode_lock); - spin_unlock(&dcache_lock); return tmp; out_iput: @@ -1568,21 +1501,18 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) struct dentry *new = NULL; if (inode && S_ISDIR(inode->i_mode)) { - spin_lock(&dcache_lock); spin_lock(&dcache_inode_lock); new = __d_find_alias(inode, 1); if (new) { BUG_ON(!(new->d_flags & DCACHE_DISCONNECTED)); spin_unlock(&dcache_inode_lock); - spin_unlock(&dcache_lock); security_d_instantiate(new, inode); d_move(new, dentry); iput(inode); } else { - /* already taking dcache_lock, so d_add() by hand */ + /* already taking dcache_inode_lock, so d_add() by hand */ __d_instantiate(dentry, inode); spin_unlock(&dcache_inode_lock); - spin_unlock(&dcache_lock); security_d_instantiate(dentry, inode); d_rehash(dentry); } @@ -1655,12 +1585,10 @@ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode, * Negative dentry: instantiate it unless the inode is a directory and * already has a dentry. */ - spin_lock(&dcache_lock); spin_lock(&dcache_inode_lock); if (!S_ISDIR(inode->i_mode) || list_empty(&inode->i_dentry)) { __d_instantiate(found, inode); spin_unlock(&dcache_inode_lock); - spin_unlock(&dcache_lock); security_d_instantiate(found, inode); return found; } @@ -1672,7 +1600,6 @@ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode, new = list_entry(inode->i_dentry.next, struct dentry, d_alias); dget_locked(new); spin_unlock(&dcache_inode_lock); - spin_unlock(&dcache_lock); security_d_instantiate(found, inode); d_move(new, found); iput(inode); @@ -1843,7 +1770,6 @@ int d_validate(struct dentry *dentry, struct dentry *dparent) { struct dentry *child; - spin_lock(&dcache_lock); spin_lock(&dparent->d_lock); list_for_each_entry(child, &dparent->d_subdirs, d_u.d_child) { if (dentry == child) { @@ -1851,12 +1777,10 @@ int d_validate(struct dentry *dentry, struct dentry *dparent) __dget_locked_dlock(dentry); spin_unlock(&dentry->d_lock); spin_unlock(&dparent->d_lock); - spin_unlock(&dcache_lock); return 1; } } spin_unlock(&dparent->d_lock); - spin_unlock(&dcache_lock); return 0; } @@ -1889,7 +1813,6 @@ void d_delete(struct dentry * dentry) /* * Are we the only user? */ - spin_lock(&dcache_lock); spin_lock(&dcache_inode_lock); spin_lock(&dentry->d_lock); isdir = S_ISDIR(dentry->d_inode->i_mode); @@ -1905,7 +1828,6 @@ void d_delete(struct dentry * dentry) spin_unlock(&dentry->d_lock); spin_unlock(&dcache_inode_lock); - spin_unlock(&dcache_lock); fsnotify_nameremove(dentry, isdir); } @@ -1932,13 +1854,11 @@ static void _d_rehash(struct dentry * entry) void d_rehash(struct dentry * entry) { - spin_lock(&dcache_lock); spin_lock(&entry->d_lock); spin_lock(&dcache_hash_lock); _d_rehash(entry); spin_unlock(&dcache_hash_lock); spin_unlock(&entry->d_lock); - spin_unlock(&dcache_lock); } EXPORT_SYMBOL(d_rehash); @@ -1961,11 +1881,9 @@ void dentry_update_name_case(struct dentry *dentry, struct qstr *name) BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); BUG_ON(dentry->d_name.len != name->len); /* d_lookup gives this */ - spin_lock(&dcache_lock); spin_lock(&dentry->d_lock); memcpy((unsigned char *)dentry->d_name.name, name->name, name->len); spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); } EXPORT_SYMBOL(dentry_update_name_case); @@ -2058,14 +1976,14 @@ static void dentry_unlock_parents_for_move(struct dentry *dentry, * The hash value has to match the hash queue that the dentry is on.. 
*/ /* - * d_move_locked - move a dentry + * d_move - move a dentry * @dentry: entry to move * @target: new dentry * * Update the dcache to reflect the move of a file name. Negative * dcache entries should not be moved in this way. */ -static void d_move_locked(struct dentry * dentry, struct dentry * target) +void d_move(struct dentry * dentry, struct dentry * target) { if (!dentry->d_inode) printk(KERN_WARNING "VFS: moving negative dcache entry\n"); @@ -2114,22 +2032,6 @@ static void d_move_locked(struct dentry * dentry, struct dentry * target) spin_unlock(&dentry->d_lock); write_sequnlock(&rename_lock); } - -/** - * d_move - move a dentry - * @dentry: entry to move - * @target: new dentry - * - * Update the dcache to reflect the move of a file name. Negative - * dcache entries should not be moved in this way. - */ - -void d_move(struct dentry * dentry, struct dentry * target) -{ - spin_lock(&dcache_lock); - d_move_locked(dentry, target); - spin_unlock(&dcache_lock); -} EXPORT_SYMBOL(d_move); /** @@ -2155,13 +2057,12 @@ struct dentry *d_ancestor(struct dentry *p1, struct dentry *p2) * This helper attempts to cope with remotely renamed directories * * It assumes that the caller is already holding - * dentry->d_parent->d_inode->i_mutex and the dcache_lock + * dentry->d_parent->d_inode->i_mutex and the dcache_inode_lock * * Note: If ever the locking in lock_rename() changes, then please * remember to update this too... */ static struct dentry *__d_unalias(struct dentry *dentry, struct dentry *alias) - __releases(dcache_lock) __releases(dcache_inode_lock) { struct mutex *m1 = NULL, *m2 = NULL; @@ -2185,11 +2086,10 @@ static struct dentry *__d_unalias(struct dentry *dentry, struct dentry *alias) goto out_err; m2 = &alias->d_parent->d_inode->i_mutex; out_unalias: - d_move_locked(alias, dentry); + d_move(alias, dentry); ret = alias; out_err: spin_unlock(&dcache_inode_lock); - spin_unlock(&dcache_lock); if (m2) mutex_unlock(m2); if (m1) @@ -2249,7 +2149,6 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode) BUG_ON(!d_unhashed(dentry)); - spin_lock(&dcache_lock); spin_lock(&dcache_inode_lock); if (!inode) { @@ -2295,7 +2194,6 @@ found: spin_unlock(&dcache_hash_lock); spin_unlock(&actual->d_lock); spin_unlock(&dcache_inode_lock); - spin_unlock(&dcache_lock); out_nolock: if (actual == dentry) { security_d_instantiate(dentry, inode); @@ -2307,7 +2205,6 @@ out_nolock: shouldnt_be_hashed: spin_unlock(&dcache_inode_lock); - spin_unlock(&dcache_lock); BUG(); } EXPORT_SYMBOL_GPL(d_materialise_unique); @@ -2421,11 +2318,9 @@ char *__d_path(const struct path *path, struct path *root, int error; prepend(&res, &buflen, "\0", 1); - spin_lock(&dcache_lock); write_seqlock(&rename_lock); error = prepend_path(path, root, &res, &buflen); write_sequnlock(&rename_lock); - spin_unlock(&dcache_lock); if (error) return ERR_PTR(error); @@ -2487,14 +2382,12 @@ char *d_path(const struct path *path, char *buf, int buflen) return path->dentry->d_op->d_dname(path->dentry, buf, buflen); get_fs_root(current->fs, &root); - spin_lock(&dcache_lock); write_seqlock(&rename_lock); tmp = root; error = path_with_deleted(path, &tmp, &res, &buflen); if (error) res = ERR_PTR(error); write_sequnlock(&rename_lock); - spin_unlock(&dcache_lock); path_put(&root); return res; } @@ -2520,14 +2413,12 @@ char *d_path_with_unreachable(const struct path *path, char *buf, int buflen) return path->dentry->d_op->d_dname(path->dentry, buf, buflen); get_fs_root(current->fs, &root); - spin_lock(&dcache_lock); 
write_seqlock(&rename_lock); tmp = root; error = path_with_deleted(path, &tmp, &res, &buflen); if (!error && !path_equal(&tmp, &root)) error = prepend_unreachable(&res, &buflen); write_sequnlock(&rename_lock); - spin_unlock(&dcache_lock); path_put(&root); if (error) res = ERR_PTR(error); @@ -2594,11 +2485,9 @@ char *dentry_path_raw(struct dentry *dentry, char *buf, int buflen) { char *retval; - spin_lock(&dcache_lock); write_seqlock(&rename_lock); retval = __dentry_path(dentry, buf, buflen); write_sequnlock(&rename_lock); - spin_unlock(&dcache_lock); return retval; } @@ -2609,7 +2498,6 @@ char *dentry_path(struct dentry *dentry, char *buf, int buflen) char *p = NULL; char *retval; - spin_lock(&dcache_lock); write_seqlock(&rename_lock); if (d_unlinked(dentry)) { p = buf + buflen; @@ -2619,12 +2507,10 @@ char *dentry_path(struct dentry *dentry, char *buf, int buflen) } retval = __dentry_path(dentry, buf, buflen); write_sequnlock(&rename_lock); - spin_unlock(&dcache_lock); if (!IS_ERR(retval) && p) *p = '/'; /* restore '/' overriden with '\0' */ return retval; Elong: - spin_unlock(&dcache_lock); return ERR_PTR(-ENAMETOOLONG); } @@ -2658,7 +2544,6 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) get_fs_root_and_pwd(current->fs, &root, &pwd); error = -ENOENT; - spin_lock(&dcache_lock); write_seqlock(&rename_lock); if (!d_unlinked(pwd.dentry)) { unsigned long len; @@ -2669,7 +2554,6 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) prepend(&cwd, &buflen, "\0", 1); error = prepend_path(&pwd, &tmp, &cwd, &buflen); write_sequnlock(&rename_lock); - spin_unlock(&dcache_lock); if (error) goto out; @@ -2690,7 +2574,6 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) } } else { write_sequnlock(&rename_lock); - spin_unlock(&dcache_lock); } out: @@ -2776,7 +2659,6 @@ void d_genocide(struct dentry *root) rename_retry: this_parent = root; seq = read_seqbegin(&rename_lock); - spin_lock(&dcache_lock); spin_lock(&this_parent->d_lock); repeat: next = this_parent->d_subdirs.next; @@ -2823,7 +2705,6 @@ resume: if (this_parent != child->d_parent || read_seqretry(&rename_lock, seq)) { spin_unlock(&this_parent->d_lock); - spin_unlock(&dcache_lock); rcu_read_unlock(); goto rename_retry; } @@ -2832,7 +2713,6 @@ resume: goto resume; } spin_unlock(&this_parent->d_lock); - spin_unlock(&dcache_lock); if (read_seqretry(&rename_lock, seq)) goto rename_retry; } diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 84b8c460a78..53a5c08fb63 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -47,24 +47,20 @@ find_acceptable_alias(struct dentry *result, if (acceptable(context, result)) return result; - spin_lock(&dcache_lock); spin_lock(&dcache_inode_lock); list_for_each_entry(dentry, &result->d_inode->i_dentry, d_alias) { dget_locked(dentry); spin_unlock(&dcache_inode_lock); - spin_unlock(&dcache_lock); if (toput) dput(toput); if (dentry != result && acceptable(context, dentry)) { dput(result); return dentry; } - spin_lock(&dcache_lock); spin_lock(&dcache_inode_lock); toput = dentry; } spin_unlock(&dcache_inode_lock); - spin_unlock(&dcache_lock); if (toput) dput(toput); diff --git a/fs/libfs.c b/fs/libfs.c index cc4794914b5..28b36663c44 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -100,7 +100,6 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int origin) struct dentry *cursor = file->private_data; loff_t n = file->f_pos - 2; - spin_lock(&dcache_lock); spin_lock(&dentry->d_lock); /* d_lock not required for cursor */ 
list_del(&cursor->d_u.d_child); @@ -116,7 +115,6 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int origin) } list_add_tail(&cursor->d_u.d_child, p); spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); } } mutex_unlock(&dentry->d_inode->i_mutex); @@ -159,7 +157,6 @@ int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir) i++; /* fallthrough */ default: - spin_lock(&dcache_lock); spin_lock(&dentry->d_lock); if (filp->f_pos == 2) list_move(q, &dentry->d_subdirs); @@ -175,13 +172,11 @@ int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir) spin_unlock(&next->d_lock); spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); if (filldir(dirent, next->d_name.name, next->d_name.len, filp->f_pos, next->d_inode->i_ino, dt_type(next->d_inode)) < 0) return 0; - spin_lock(&dcache_lock); spin_lock(&dentry->d_lock); spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED); /* next is still alive */ @@ -191,7 +186,6 @@ int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir) filp->f_pos++; } spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); } return 0; } @@ -285,7 +279,6 @@ int simple_empty(struct dentry *dentry) struct dentry *child; int ret = 0; - spin_lock(&dcache_lock); spin_lock(&dentry->d_lock); list_for_each_entry(child, &dentry->d_subdirs, d_u.d_child) { spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED); @@ -298,7 +291,6 @@ int simple_empty(struct dentry *dentry) ret = 1; out: spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); return ret; } diff --git a/fs/namei.c b/fs/namei.c index cbfa5fb3107..5642bc2be41 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -612,8 +612,8 @@ int follow_up(struct path *path) return 1; } -/* no need for dcache_lock, as serialization is taken care in - * namespace.c +/* + * serialization is taken care of in namespace.c */ static int __follow_mount(struct path *path) { @@ -645,9 +645,6 @@ static void follow_mount(struct path *path) } } -/* no need for dcache_lock, as serialization is taken care in - * namespace.c - */ int follow_down(struct path *path) { struct vfsmount *mounted; @@ -2131,12 +2128,10 @@ void dentry_unhash(struct dentry *dentry) { dget(dentry); shrink_dcache_parent(dentry); - spin_lock(&dcache_lock); spin_lock(&dentry->d_lock); if (dentry->d_count == 2) __d_drop(dentry); spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); } int vfs_rmdir(struct inode *dir, struct dentry *dentry) diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index 102278ed38b..de15c533311 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -391,7 +391,6 @@ ncp_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos) } /* If a pointer is invalid, we search the dentry. 
*/ - spin_lock(&dcache_lock); spin_lock(&parent->d_lock); next = parent->d_subdirs.next; while (next != &parent->d_subdirs) { @@ -402,13 +401,11 @@ ncp_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos) else dent = NULL; spin_unlock(&parent->d_lock); - spin_unlock(&dcache_lock); goto out; } next = next->next; } spin_unlock(&parent->d_lock); - spin_unlock(&dcache_lock); return NULL; out: diff --git a/fs/ncpfs/ncplib_kernel.h b/fs/ncpfs/ncplib_kernel.h index c4b718ff9a6..1220df75ff2 100644 --- a/fs/ncpfs/ncplib_kernel.h +++ b/fs/ncpfs/ncplib_kernel.h @@ -193,7 +193,6 @@ ncp_renew_dentries(struct dentry *parent) struct list_head *next; struct dentry *dentry; - spin_lock(&dcache_lock); spin_lock(&parent->d_lock); next = parent->d_subdirs.next; while (next != &parent->d_subdirs) { @@ -207,7 +206,6 @@ ncp_renew_dentries(struct dentry *parent) next = next->next; } spin_unlock(&parent->d_lock); - spin_unlock(&dcache_lock); } static inline void @@ -217,7 +215,6 @@ ncp_invalidate_dircache_entries(struct dentry *parent) struct list_head *next; struct dentry *dentry; - spin_lock(&dcache_lock); spin_lock(&parent->d_lock); next = parent->d_subdirs.next; while (next != &parent->d_subdirs) { @@ -227,7 +224,6 @@ ncp_invalidate_dircache_entries(struct dentry *parent) next = next->next; } spin_unlock(&parent->d_lock); - spin_unlock(&dcache_lock); } struct ncp_cache_head { diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 12de824edb5..eb77471b882 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1718,11 +1718,9 @@ static int nfs_unlink(struct inode *dir, struct dentry *dentry) dfprintk(VFS, "NFS: unlink(%s/%ld, %s)\n", dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); - spin_lock(&dcache_lock); spin_lock(&dentry->d_lock); if (dentry->d_count > 1) { spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); /* Start asynchronous writeout of the inode */ write_inode_now(dentry->d_inode, 0); error = nfs_sillyrename(dir, dentry); @@ -1733,7 +1731,6 @@ static int nfs_unlink(struct inode *dir, struct dentry *dentry) need_rehash = 1; } spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); error = nfs_safe_remove(dentry); if (!error || error == -ENOENT) { nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c index 850f67d5f0a..b3e36c3430d 100644 --- a/fs/nfs/getroot.c +++ b/fs/nfs/getroot.c @@ -63,13 +63,11 @@ static int nfs_superblock_set_dummy_root(struct super_block *sb, struct inode *i * This again causes shrink_dcache_for_umount_subtree() to * Oops, since the test for IS_ROOT() will fail. 
*/ - spin_lock(&dcache_lock); spin_lock(&dcache_inode_lock); spin_lock(&sb->s_root->d_lock); list_del_init(&sb->s_root->d_alias); spin_unlock(&sb->s_root->d_lock); spin_unlock(&dcache_inode_lock); - spin_unlock(&dcache_lock); } return 0; } diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 78c0ebb0b07..74aaf3963c1 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -60,7 +60,6 @@ rename_retry: seq = read_seqbegin(&rename_lock); rcu_read_lock(); - spin_lock(&dcache_lock); while (!IS_ROOT(dentry) && dentry != droot) { namelen = dentry->d_name.len; buflen -= namelen + 1; @@ -71,7 +70,6 @@ rename_retry: *--end = '/'; dentry = dentry->d_parent; } - spin_unlock(&dcache_lock); rcu_read_unlock(); if (read_seqretry(&rename_lock, seq)) goto rename_retry; @@ -91,7 +89,6 @@ rename_retry: memcpy(end, base, namelen); return end; Elong_unlock: - spin_unlock(&dcache_lock); rcu_read_unlock(); if (read_seqretry(&rename_lock, seq)) goto rename_retry; diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index ae769fc9b66..9be6ec1f36d 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -59,7 +59,6 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode) /* determine if the children should tell inode about their events */ watched = fsnotify_inode_watches_children(inode); - spin_lock(&dcache_lock); spin_lock(&dcache_inode_lock); /* run all of the dentries associated with this inode. Since this is a * directory, there damn well better only be one item on this list */ @@ -84,7 +83,6 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode) spin_unlock(&alias->d_lock); } spin_unlock(&dcache_inode_lock); - spin_unlock(&dcache_lock); } /* Notify this dentry's parent about a child's events. */ diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c index c31b5c647ac..b7de749bdd1 100644 --- a/fs/ocfs2/dcache.c +++ b/fs/ocfs2/dcache.c @@ -169,7 +169,6 @@ struct dentry *ocfs2_find_local_alias(struct inode *inode, struct list_head *p; struct dentry *dentry = NULL; - spin_lock(&dcache_lock); spin_lock(&dcache_inode_lock); list_for_each(p, &inode->i_dentry) { dentry = list_entry(p, struct dentry, d_alias); @@ -189,7 +188,6 @@ struct dentry *ocfs2_find_local_alias(struct inode *inode, } spin_unlock(&dcache_inode_lock); - spin_unlock(&dcache_lock); return dentry; } diff --git a/include/linux/dcache.h b/include/linux/dcache.h index c963ebada92..a2ceb94b0e3 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -183,7 +183,6 @@ struct dentry_operations { #define DCACHE_GENOCIDE 0x0200 extern spinlock_t dcache_inode_lock; -extern spinlock_t dcache_lock; extern seqlock_t rename_lock; static inline int dname_external(struct dentry *dentry) @@ -296,8 +295,8 @@ extern char *dentry_path(struct dentry *, char *, int); * destroyed when it has references. dget() should never be * called for dentries with zero reference counter. For these cases * (preferably none, functions in dcache.c are sufficient for normal - * needs and they take necessary precautions) you should hold dcache_lock - * and call dget_locked() instead of dget(). + * needs and they take necessary precautions) you should hold d_lock + * and call dget_dlock() instead of dget(). 
*/ static inline struct dentry *dget_dlock(struct dentry *dentry) { diff --git a/include/linux/fs.h b/include/linux/fs.h index 090f0eacde2..296cf2fde94 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1378,7 +1378,7 @@ struct super_block { #else struct list_head s_files; #endif - /* s_dentry_lru and s_nr_dentry_unused are protected by dcache_lock */ + /* s_dentry_lru, s_nr_dentry_unused protected by dcache.c lru locks */ struct list_head s_dentry_lru; /* unused dentry lru */ int s_nr_dentry_unused; /* # of dentry on lru */ @@ -2446,6 +2446,10 @@ static inline ino_t parent_ino(struct dentry *dentry) { ino_t res; + /* + * Don't strictly need d_lock here? If the parent ino could change + * then surely we'd have a deeper race in the caller? + */ spin_lock(&dentry->d_lock); res = dentry->d_parent->d_inode->i_ino; spin_unlock(&dentry->d_lock); diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index b10bcdeaef7..2a53f10712b 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -17,7 +17,6 @@ /* * fsnotify_d_instantiate - instantiate a dentry for inode - * Called with dcache_lock held. */ static inline void fsnotify_d_instantiate(struct dentry *dentry, struct inode *inode) @@ -62,7 +61,6 @@ static inline int fsnotify_perm(struct file *file, int mask) /* * fsnotify_d_move - dentry has been moved - * Called with dcache_lock and dentry->d_lock held. */ static inline void fsnotify_d_move(struct dentry *dentry) { diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 7380763595d..69ad89b5048 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -329,9 +329,15 @@ static inline void __fsnotify_update_dcache_flags(struct dentry *dentry) { struct dentry *parent; - assert_spin_locked(&dcache_lock); assert_spin_locked(&dentry->d_lock); + /* + * Serialisation of setting PARENT_WATCHED on the dentries is provided + * by d_lock. If inotify_inode_watched changes after we have taken + * d_lock, the following __fsnotify_update_child_dentry_flags call will + * find our entry, so it will spin until we complete here, and update + * us with the new state. + */ parent = dentry->d_parent; if (parent->d_inode && fsnotify_inode_watches_children(parent->d_inode)) dentry->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED; @@ -341,15 +347,12 @@ static inline void __fsnotify_update_dcache_flags(struct dentry *dentry) /* * fsnotify_d_instantiate - instantiate a dentry for inode - * Called with dcache_lock held. 
*/ static inline void __fsnotify_d_instantiate(struct dentry *dentry, struct inode *inode) { if (!inode) return; - assert_spin_locked(&dcache_lock); - spin_lock(&dentry->d_lock); __fsnotify_update_dcache_flags(dentry); spin_unlock(&dentry->d_lock); diff --git a/include/linux/namei.h b/include/linux/namei.h index 05b441d9364..aec730b5393 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -41,7 +41,6 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND}; * - require a directory * - ending slashes ok even for nonexistent files * - internal "there are more path components" flag - * - locked when lookup done with dcache_lock held * - dentry cache is untrusted; force a real lookup */ #define LOOKUP_FOLLOW 1 diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 7b4705b51d4..1864cb6a6a5 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -876,7 +876,6 @@ static void cgroup_clear_directory(struct dentry *dentry) struct list_head *node; BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); - spin_lock(&dcache_lock); spin_lock(&dentry->d_lock); node = dentry->d_subdirs.next; while (node != &dentry->d_subdirs) { @@ -891,18 +890,15 @@ static void cgroup_clear_directory(struct dentry *dentry) dget_locked_dlock(d); spin_unlock(&d->d_lock); spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); d_delete(d); simple_unlink(dentry->d_inode, d); dput(d); - spin_lock(&dcache_lock); spin_lock(&dentry->d_lock); } else spin_unlock(&d->d_lock); node = dentry->d_subdirs.next; } spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); } /* @@ -914,14 +910,12 @@ static void cgroup_d_remove_dir(struct dentry *dentry) cgroup_clear_directory(dentry); - spin_lock(&dcache_lock); parent = dentry->d_parent; spin_lock(&parent->d_lock); spin_lock(&dentry->d_lock); list_del_init(&dentry->d_u.d_child); spin_unlock(&dentry->d_lock); spin_unlock(&parent->d_lock); - spin_unlock(&dcache_lock); remove_dir(dentry); } diff --git a/mm/filemap.c b/mm/filemap.c index 6b9aee20f24..ca389394fa2 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -102,9 +102,6 @@ * ->inode_lock (zap_pte_range->set_page_dirty) * ->private_lock (zap_pte_range->__set_page_dirty_buffers) * - * ->task->proc_lock - * ->dcache_lock (proc_pid_lookup) - * * (code doesn't rely on that order, so you could switch it around) * ->tasklist_lock (memory_failure, collect_procs_ao) * ->i_mmap_lock diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c index 017ec096446..2285d693f29 100644 --- a/security/selinux/selinuxfs.c +++ b/security/selinux/selinuxfs.c @@ -1145,7 +1145,6 @@ static void sel_remove_entries(struct dentry *de) { struct list_head *node; - spin_lock(&dcache_lock); spin_lock(&de->d_lock); node = de->d_subdirs.next; while (node != &de->d_subdirs) { @@ -1158,11 +1157,9 @@ static void sel_remove_entries(struct dentry *de) dget_locked_dlock(d); spin_unlock(&de->d_lock); spin_unlock(&d->d_lock); - spin_unlock(&dcache_lock); d_delete(d); simple_unlink(de->d_inode, d); dput(d); - spin_lock(&dcache_lock); spin_lock(&de->d_lock); } else spin_unlock(&d->d_lock); @@ -1170,7 +1167,6 @@ static void sel_remove_entries(struct dentry *de) } spin_unlock(&de->d_lock); - spin_unlock(&dcache_lock); } #define BOOL_DIR_NAME "booleans" -- cgit v1.2.3 From fa0d7e3de6d6fc5004ad9dea0dd6b286af8f03e9 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:49 +1100 Subject: fs: icache RCU free inodes RCU free the struct inode. This will allow: - Subsequent store-free path walking patch. 
The inode must be consulted for permissions when walking, so an RCU inode reference is a must. - sb_inode_list_lock to be moved inside i_lock because sb list walkers who want to take i_lock no longer need to take sb_inode_list_lock to walk the list in the first place. This will simplify and optimize locking. - Could remove some nested trylock loops in dcache code - Could potentially simplify things a bit in VM land. Do not need to take the page lock to follow page->mapping. The downside of this is the performance cost of using RCU. In a simple creat/unlink microbenchmark, performance drops by about 10% due to the inability to reuse cache-hot slab objects. As iterations increase and RCU freeing starts kicking over, this increases to about 20%. In cases where inode lifetimes are longer (i.e. many inodes may be allocated during the average life span of a single inode), a lot of this cache reuse is not applicable, so the regression caused by this patch is smaller. The cache-hot regression could largely be avoided by using SLAB_DESTROY_BY_RCU; however, this adds some complexity to list walking and store-free path walking, so I prefer to implement this at a later date if it is shown to be a win in real situations. I haven't found a regression in any non-micro benchmark, so I doubt it will be a problem. Signed-off-by: Nick Piggin --- Documentation/filesystems/porting | 14 ++++++++++++++ arch/powerpc/platforms/cell/spufs/inode.c | 10 ++++++++-- drivers/staging/pohmelfs/inode.c | 9 ++++++++- drivers/staging/smbfs/inode.c | 9 ++++++++- fs/9p/vfs_inode.c | 9 ++++++++- fs/adfs/super.c | 9 ++++++++- fs/affs/super.c | 9 ++++++++- fs/afs/super.c | 10 +++++++++- fs/befs/linuxvfs.c | 10 ++++++++-- fs/bfs/inode.c | 9 ++++++++- fs/block_dev.c | 9 ++++++++- fs/btrfs/inode.c | 9 ++++++++- fs/ceph/inode.c | 11 ++++++++++- fs/cifs/cifsfs.c | 9 ++++++++- fs/coda/inode.c | 9 ++++++++- fs/ecryptfs/super.c | 12 +++++++++++- fs/efs/super.c | 9 ++++++++- fs/exofs/super.c | 9 ++++++++- fs/ext2/super.c | 9 ++++++++- fs/ext3/super.c | 9 ++++++++- fs/ext4/super.c | 9 ++++++++- fs/fat/inode.c | 9 ++++++++- fs/freevxfs/vxfs_inode.c | 9 ++++++++- fs/fuse/inode.c | 9 ++++++++- fs/gfs2/super.c | 9 ++++++++- fs/hfs/super.c | 9 ++++++++- fs/hfsplus/super.c | 10 +++++++++- fs/hostfs/hostfs_kern.c | 9 ++++++++- fs/hpfs/super.c | 9 ++++++++- fs/hppfs/hppfs.c | 9 ++++++++- fs/hugetlbfs/inode.c | 9 ++++++++- fs/inode.c | 10 +++++++++- fs/isofs/inode.c | 9 ++++++++- fs/jffs2/super.c | 9 ++++++++- fs/jfs/super.c | 10 +++++++++- fs/logfs/inode.c | 9 ++++++++- fs/minix/inode.c | 9 ++++++++- fs/ncpfs/inode.c | 9 ++++++++- fs/nfs/inode.c | 9 ++++++++- fs/nilfs2/super.c | 10 +++++++++- fs/ntfs/inode.c | 9 ++++++++- fs/ocfs2/dlmfs/dlmfs.c | 9 ++++++++- fs/ocfs2/super.c | 9 ++++++++- fs/openpromfs/inode.c | 9 ++++++++- fs/proc/inode.c | 9 ++++++++- fs/qnx4/inode.c | 9 ++++++++- fs/reiserfs/super.c | 9 ++++++++- fs/romfs/super.c | 9 ++++++++- fs/squashfs/super.c | 9 ++++++++- fs/sysv/inode.c | 9 ++++++++- fs/ubifs/super.c | 10 +++++++++- fs/udf/super.c | 9 ++++++++- fs/ufs/super.c | 9 ++++++++- fs/xfs/xfs_iget.c | 13 ++++++++++++- include/linux/fs.h | 5 ++++- include/linux/net.h | 1 - ipc/mqueue.c | 9 ++++++++- mm/shmem.c | 9 ++++++++- net/socket.c | 16 ++++++++-------- net/sunrpc/rpc_pipe.c | 10 +++++++++- 60 files changed, 490 insertions(+), 68 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/porting index 1eb76959d09..ccf0ce7866b 100644 ---
a/Documentation/filesystems/porting +++ b/Documentation/filesystems/porting @@ -346,3 +346,17 @@ look at examples of other filesystems) for guidance. for details of what locks to replace dcache_lock with in order to protect particular things. Most of the time, a filesystem only needs ->d_lock, which protects *all* the dcache state of a given dentry. + +-- +[mandatory] + + Filesystems must RCU-free their inodes, if they can have been accessed +via rcu-walk path walk (basically, if the file can have had a path name in the +vfs namespace). + + i_dentry and i_rcu share storage in a union, and the vfs expects +i_dentry to be reinitialized before it is freed, so an: + + INIT_LIST_HEAD(&inode->i_dentry); + +must be done in the RCU callback. diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c index 03185de7cd4..856e9c39806 100644 --- a/arch/powerpc/platforms/cell/spufs/inode.c +++ b/arch/powerpc/platforms/cell/spufs/inode.c @@ -71,12 +71,18 @@ spufs_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } -static void -spufs_destroy_inode(struct inode *inode) +static void spufs_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(spufs_inode_cache, SPUFS_I(inode)); } +static void spufs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, spufs_i_callback); +} + static void spufs_init_once(void *p) { diff --git a/drivers/staging/pohmelfs/inode.c b/drivers/staging/pohmelfs/inode.c index 61685ccceda..cc8d2840f9b 100644 --- a/drivers/staging/pohmelfs/inode.c +++ b/drivers/staging/pohmelfs/inode.c @@ -826,6 +826,13 @@ const struct address_space_operations pohmelfs_aops = { .set_page_dirty = __set_page_dirty_nobuffers, }; +static void pohmelfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(pohmelfs_inode_cache, POHMELFS_I(inode)); +} + /* * ->detroy_inode() callback. Deletes inode from the caches * and frees private data. 
@@ -842,8 +849,8 @@ static void pohmelfs_destroy_inode(struct inode *inode) dprintk("%s: pi: %p, inode: %p, ino: %llu.\n", __func__, pi, &pi->vfs_inode, pi->ino); - kmem_cache_free(pohmelfs_inode_cache, pi); atomic_long_dec(&psb->total_inodes); + call_rcu(&inode->i_rcu, pohmelfs_i_callback); } /* diff --git a/drivers/staging/smbfs/inode.c b/drivers/staging/smbfs/inode.c index 540a984bb51..244319dc970 100644 --- a/drivers/staging/smbfs/inode.c +++ b/drivers/staging/smbfs/inode.c @@ -62,11 +62,18 @@ static struct inode *smb_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } -static void smb_destroy_inode(struct inode *inode) +static void smb_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(smb_inode_cachep, SMB_I(inode)); } +static void smb_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, smb_i_callback); +} + static void init_once(void *foo) { struct smb_inode_info *ei = (struct smb_inode_info *) foo; diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 1073bca8488..f6f9081e6d2 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -237,10 +237,17 @@ struct inode *v9fs_alloc_inode(struct super_block *sb) * */ -void v9fs_destroy_inode(struct inode *inode) +static void v9fs_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(vcookie_cache, v9fs_inode2cookie(inode)); } + +void v9fs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, v9fs_i_callback); +} #endif /** diff --git a/fs/adfs/super.c b/fs/adfs/super.c index 959dbff2d42..47dffc513a2 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -240,11 +240,18 @@ static struct inode *adfs_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } -static void adfs_destroy_inode(struct inode *inode) +static void adfs_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(adfs_inode_cachep, ADFS_I(inode)); } +static void adfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, adfs_i_callback); +} + static void init_once(void *foo) { struct adfs_inode_info *ei = (struct adfs_inode_info *) foo; diff --git a/fs/affs/super.c b/fs/affs/super.c index 0cf7f4384cb..4c18fcfb0a1 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -95,11 +95,18 @@ static struct inode *affs_alloc_inode(struct super_block *sb) return &i->vfs_inode; } -static void affs_destroy_inode(struct inode *inode) +static void affs_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(affs_inode_cachep, AFFS_I(inode)); } +static void affs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, affs_i_callback); +} + static void init_once(void *foo) { struct affs_inode_info *ei = (struct affs_inode_info *) foo; diff --git a/fs/afs/super.c b/fs/afs/super.c index 27201cffece..f901a9d7c11 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -498,6 +498,14 @@ static struct inode *afs_alloc_inode(struct super_block *sb) return &vnode->vfs_inode; } +static void afs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + struct afs_vnode *vnode = AFS_FS_I(inode); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(afs_inode_cachep, vnode); +} + /* * destroy an AFS inode struct */ @@ 
-511,7 +519,7 @@ static void afs_destroy_inode(struct inode *inode) ASSERTCMP(vnode->server, ==, NULL); - kmem_cache_free(afs_inode_cachep, vnode); + call_rcu(&inode->i_rcu, afs_i_callback); atomic_dec(&afs_count_active_inodes); } diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index aa4e7c7ae3c..de93581b79a 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -284,12 +284,18 @@ befs_alloc_inode(struct super_block *sb) return &bi->vfs_inode; } -static void -befs_destroy_inode(struct inode *inode) +static void befs_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(befs_inode_cachep, BEFS_I(inode)); } +static void befs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, befs_i_callback); +} + static void init_once(void *foo) { struct befs_inode_info *bi = (struct befs_inode_info *) foo; diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index 76db6d7d49b..a8e37f81d09 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -248,11 +248,18 @@ static struct inode *bfs_alloc_inode(struct super_block *sb) return &bi->vfs_inode; } -static void bfs_destroy_inode(struct inode *inode) +static void bfs_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(bfs_inode_cachep, BFS_I(inode)); } +static void bfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, bfs_i_callback); +} + static void init_once(void *foo) { struct bfs_inode_info *bi = foo; diff --git a/fs/block_dev.c b/fs/block_dev.c index 4230252fd68..771f2352701 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -409,13 +409,20 @@ static struct inode *bdev_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } -static void bdev_destroy_inode(struct inode *inode) +static void bdev_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); struct bdev_inode *bdi = BDEV_I(inode); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(bdev_cachep, bdi); } +static void bdev_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, bdev_i_callback); +} + static void init_once(void *foo) { struct bdev_inode *ei = (struct bdev_inode *) foo; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7ce9f893278..f9d2994a42a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6495,6 +6495,13 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) return inode; } +static void btrfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); +} + void btrfs_destroy_inode(struct inode *inode) { struct btrfs_ordered_extent *ordered; @@ -6564,7 +6571,7 @@ void btrfs_destroy_inode(struct inode *inode) inode_tree_del(inode); btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); free: - kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); + call_rcu(&inode->i_rcu, btrfs_i_callback); } int btrfs_drop_inode(struct inode *inode) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 2a48cafc174..47f8c8baf3b 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -368,6 +368,15 @@ struct inode *ceph_alloc_inode(struct super_block *sb) return &ci->vfs_inode; } +static void ceph_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + struct ceph_inode_info *ci = ceph_inode(inode); + + INIT_LIST_HEAD(&inode->i_dentry); + 
kmem_cache_free(ceph_inode_cachep, ci); +} + void ceph_destroy_inode(struct inode *inode) { struct ceph_inode_info *ci = ceph_inode(inode); @@ -407,7 +416,7 @@ void ceph_destroy_inode(struct inode *inode) if (ci->i_xattrs.prealloc_blob) ceph_buffer_put(ci->i_xattrs.prealloc_blob); - kmem_cache_free(ceph_inode_cachep, ci); + call_rcu(&inode->i_rcu, ceph_i_callback); } diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 3936aa7f2c2..223717dcc40 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -334,10 +334,17 @@ cifs_alloc_inode(struct super_block *sb) return &cifs_inode->vfs_inode; } +static void cifs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(cifs_inode_cachep, CIFS_I(inode)); +} + static void cifs_destroy_inode(struct inode *inode) { - kmem_cache_free(cifs_inode_cachep, CIFS_I(inode)); + call_rcu(&inode->i_rcu, cifs_i_callback); } static void diff --git a/fs/coda/inode.c b/fs/coda/inode.c index 5ea57c8c7f9..50dc7d189f5 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -56,11 +56,18 @@ static struct inode *coda_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } -static void coda_destroy_inode(struct inode *inode) +static void coda_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(coda_inode_cachep, ITOC(inode)); } +static void coda_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, coda_i_callback); +} + static void init_once(void *foo) { struct coda_inode_info *ei = (struct coda_inode_info *) foo; diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c index 2720178b771..3042fe123a3 100644 --- a/fs/ecryptfs/super.c +++ b/fs/ecryptfs/super.c @@ -62,6 +62,16 @@ out: return inode; } +static void ecryptfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + struct ecryptfs_inode_info *inode_info; + inode_info = ecryptfs_inode_to_private(inode); + + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(ecryptfs_inode_info_cache, inode_info); +} + /** * ecryptfs_destroy_inode * @inode: The ecryptfs inode @@ -88,7 +98,7 @@ static void ecryptfs_destroy_inode(struct inode *inode) } } ecryptfs_destroy_crypt_stat(&inode_info->crypt_stat); - kmem_cache_free(ecryptfs_inode_info_cache, inode_info); + call_rcu(&inode->i_rcu, ecryptfs_i_callback); } /** diff --git a/fs/efs/super.c b/fs/efs/super.c index 5073a07652c..0f31acb0131 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -65,11 +65,18 @@ static struct inode *efs_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } -static void efs_destroy_inode(struct inode *inode) +static void efs_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(efs_inode_cachep, INODE_INFO(inode)); } +static void efs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, efs_i_callback); +} + static void init_once(void *foo) { struct efs_inode_info *ei = (struct efs_inode_info *) foo; diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 79c3ae6e045..8c6c4669b38 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -150,12 +150,19 @@ static struct inode *exofs_alloc_inode(struct super_block *sb) return &oi->vfs_inode; } +static void exofs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + 
INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(exofs_inode_cachep, exofs_i(inode)); +} + /* * Remove an inode from the cache */ static void exofs_destroy_inode(struct inode *inode) { - kmem_cache_free(exofs_inode_cachep, exofs_i(inode)); + call_rcu(&inode->i_rcu, exofs_i_callback); } /* diff --git a/fs/ext2/super.c b/fs/ext2/super.c index d89e0b6a2d7..e0c6380ff99 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -161,11 +161,18 @@ static struct inode *ext2_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } -static void ext2_destroy_inode(struct inode *inode) +static void ext2_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(ext2_inode_cachep, EXT2_I(inode)); } +static void ext2_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, ext2_i_callback); +} + static void init_once(void *foo) { struct ext2_inode_info *ei = (struct ext2_inode_info *) foo; diff --git a/fs/ext3/super.c b/fs/ext3/super.c index acf8695fa8f..77ce1616f72 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -479,6 +479,13 @@ static struct inode *ext3_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } +static void ext3_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(ext3_inode_cachep, EXT3_I(inode)); +} + static void ext3_destroy_inode(struct inode *inode) { if (!list_empty(&(EXT3_I(inode)->i_orphan))) { @@ -489,7 +496,7 @@ static void ext3_destroy_inode(struct inode *inode) false); dump_stack(); } - kmem_cache_free(ext3_inode_cachep, EXT3_I(inode)); + call_rcu(&inode->i_rcu, ext3_i_callback); } static void init_once(void *foo) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index fb15c9c0be7..cd37f9d5e44 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -841,6 +841,13 @@ static int ext4_drop_inode(struct inode *inode) return drop; } +static void ext4_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(ext4_inode_cachep, EXT4_I(inode)); +} + static void ext4_destroy_inode(struct inode *inode) { ext4_ioend_wait(inode); @@ -853,7 +860,7 @@ static void ext4_destroy_inode(struct inode *inode) true); dump_stack(); } - kmem_cache_free(ext4_inode_cachep, EXT4_I(inode)); + call_rcu(&inode->i_rcu, ext4_i_callback); } static void init_once(void *foo) diff --git a/fs/fat/inode.c b/fs/fat/inode.c index ad6998a92c3..8cccfebee18 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -514,11 +514,18 @@ static struct inode *fat_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } -static void fat_destroy_inode(struct inode *inode) +static void fat_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(fat_inode_cachep, MSDOS_I(inode)); } +static void fat_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, fat_i_callback); +} + static void init_once(void *foo) { struct msdos_inode_info *ei = (struct msdos_inode_info *)foo; diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c index 8c04eac5079..2ba6719ac61 100644 --- a/fs/freevxfs/vxfs_inode.c +++ b/fs/freevxfs/vxfs_inode.c @@ -337,6 +337,13 @@ vxfs_iget(struct super_block *sbp, ino_t ino) return ip; } +static void vxfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, 
i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(vxfs_inode_cachep, inode->i_private); +} + /** * vxfs_evict_inode - remove inode from main memory * @ip: inode to discard. @@ -350,5 +357,5 @@ vxfs_evict_inode(struct inode *ip) { truncate_inode_pages(&ip->i_data, 0); end_writeback(ip); - kmem_cache_free(vxfs_inode_cachep, ip->i_private); + call_rcu(&ip->i_rcu, vxfs_i_callback); } diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index cfce3ad86a9..44e0a6c57e8 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -99,6 +99,13 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) return inode; } +static void fuse_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(fuse_inode_cachep, inode); +} + static void fuse_destroy_inode(struct inode *inode) { struct fuse_inode *fi = get_fuse_inode(inode); @@ -106,7 +113,7 @@ static void fuse_destroy_inode(struct inode *inode) BUG_ON(!list_empty(&fi->queued_writes)); if (fi->forget_req) fuse_request_free(fi->forget_req); - kmem_cache_free(fuse_inode_cachep, inode); + call_rcu(&inode->i_rcu, fuse_i_callback); } void fuse_send_forget(struct fuse_conn *fc, struct fuse_req *req, diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 2b2c4997430..16c2ecac7eb 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1405,11 +1405,18 @@ static struct inode *gfs2_alloc_inode(struct super_block *sb) return &ip->i_inode; } -static void gfs2_destroy_inode(struct inode *inode) +static void gfs2_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(gfs2_inode_cachep, inode); } +static void gfs2_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, gfs2_i_callback); +} + const struct super_operations gfs2_super_ops = { .alloc_inode = gfs2_alloc_inode, .destroy_inode = gfs2_destroy_inode, diff --git a/fs/hfs/super.c b/fs/hfs/super.c index 4824c27cebb..ef4ee5716ab 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -167,11 +167,18 @@ static struct inode *hfs_alloc_inode(struct super_block *sb) return i ? &i->vfs_inode : NULL; } -static void hfs_destroy_inode(struct inode *inode) +static void hfs_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(hfs_inode_cachep, HFS_I(inode)); } +static void hfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, hfs_i_callback); +} + static const struct super_operations hfs_super_operations = { .alloc_inode = hfs_alloc_inode, .destroy_inode = hfs_destroy_inode, diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 52cc746d3ba..182e83a9079 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -488,11 +488,19 @@ static struct inode *hfsplus_alloc_inode(struct super_block *sb) return i ? 
&i->vfs_inode : NULL; } -static void hfsplus_destroy_inode(struct inode *inode) +static void hfsplus_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(hfsplus_inode_cachep, HFSPLUS_I(inode)); } +static void hfsplus_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, hfsplus_i_callback); +} + #define HFSPLUS_INODE_SIZE sizeof(struct hfsplus_inode_info) static struct dentry *hfsplus_mount(struct file_system_type *fs_type, diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 39dc505ed27..861113fcfc8 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -247,11 +247,18 @@ static void hostfs_evict_inode(struct inode *inode) } } -static void hostfs_destroy_inode(struct inode *inode) +static void hostfs_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kfree(HOSTFS_I(inode)); } +static void hostfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, hostfs_i_callback); +} + static int hostfs_show_options(struct seq_file *seq, struct vfsmount *vfs) { const char *root_path = vfs->mnt_sb->s_fs_info; diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index 6c5f01597c3..49935ba78db 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -177,11 +177,18 @@ static struct inode *hpfs_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } -static void hpfs_destroy_inode(struct inode *inode) +static void hpfs_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(hpfs_inode_cachep, hpfs_i(inode)); } +static void hpfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, hpfs_i_callback); +} + static void init_once(void *foo) { struct hpfs_inode_info *ei = (struct hpfs_inode_info *) foo; diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c index f702b5f713f..87ed48e0343 100644 --- a/fs/hppfs/hppfs.c +++ b/fs/hppfs/hppfs.c @@ -632,11 +632,18 @@ void hppfs_evict_inode(struct inode *ino) mntput(ino->i_sb->s_fs_info); } -static void hppfs_destroy_inode(struct inode *inode) +static void hppfs_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kfree(HPPFS_I(inode)); } +static void hppfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, hppfs_i_callback); +} + static const struct super_operations hppfs_sbops = { .alloc_inode = hppfs_alloc_inode, .destroy_inode = hppfs_destroy_inode, diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index a5fe68189ee..9885082b470 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -663,11 +663,18 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb) return &p->vfs_inode; } +static void hugetlbfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode)); +} + static void hugetlbfs_destroy_inode(struct inode *inode) { hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb)); mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy); - kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode)); + call_rcu(&inode->i_rcu, hugetlbfs_i_callback); } static const struct address_space_operations hugetlbfs_aops = { diff --git a/fs/inode.c b/fs/inode.c index 5a0a898f55d..6751dfe8cc0 
100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -272,6 +272,13 @@ void __destroy_inode(struct inode *inode) } EXPORT_SYMBOL(__destroy_inode); +static void i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(inode_cachep, inode); +} + static void destroy_inode(struct inode *inode) { BUG_ON(!list_empty(&inode->i_lru)); @@ -279,7 +286,7 @@ static void destroy_inode(struct inode *inode) if (inode->i_sb->s_op->destroy_inode) inode->i_sb->s_op->destroy_inode(inode); else - kmem_cache_free(inode_cachep, (inode)); + call_rcu(&inode->i_rcu, i_callback); } /* @@ -432,6 +439,7 @@ void end_writeback(struct inode *inode) BUG_ON(!(inode->i_state & I_FREEING)); BUG_ON(inode->i_state & I_CLEAR); inode_sync_wait(inode); + /* don't need i_lock here, no concurrent mods to i_state */ inode->i_state = I_FREEING | I_CLEAR; } EXPORT_SYMBOL(end_writeback); diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index d204ee4235f..d8f3a652243 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -81,11 +81,18 @@ static struct inode *isofs_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } -static void isofs_destroy_inode(struct inode *inode) +static void isofs_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(isofs_inode_cachep, ISOFS_I(inode)); } +static void isofs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, isofs_i_callback); +} + static void init_once(void *foo) { struct iso_inode_info *ei = foo; diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index c86041b866a..853b8e30008 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -40,11 +40,18 @@ static struct inode *jffs2_alloc_inode(struct super_block *sb) return &f->vfs_inode; } -static void jffs2_destroy_inode(struct inode *inode) +static void jffs2_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(jffs2_inode_cachep, JFFS2_INODE_INFO(inode)); } +static void jffs2_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, jffs2_i_callback); +} + static void jffs2_i_init_once(void *foo) { struct jffs2_inode_info *f = foo; diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 0669fc1cc3b..b715b0f7bdf 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -115,6 +115,14 @@ static struct inode *jfs_alloc_inode(struct super_block *sb) return &jfs_inode->vfs_inode; } +static void jfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + struct jfs_inode_info *ji = JFS_IP(inode); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(jfs_inode_cachep, ji); +} + static void jfs_destroy_inode(struct inode *inode) { struct jfs_inode_info *ji = JFS_IP(inode); @@ -128,7 +136,7 @@ static void jfs_destroy_inode(struct inode *inode) ji->active_ag = -1; } spin_unlock_irq(&ji->ag_lock); - kmem_cache_free(jfs_inode_cachep, ji); + call_rcu(&inode->i_rcu, jfs_i_callback); } static int jfs_statfs(struct dentry *dentry, struct kstatfs *buf) diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c index d8c71ece098..03b8c240aed 100644 --- a/fs/logfs/inode.c +++ b/fs/logfs/inode.c @@ -141,13 +141,20 @@ struct inode *logfs_safe_iget(struct super_block *sb, ino_t ino, int *is_cached) return __logfs_iget(sb, ino); } +static void logfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct 
inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(logfs_inode_cache, logfs_inode(inode)); +} + static void __logfs_destroy_inode(struct inode *inode) { struct logfs_inode *li = logfs_inode(inode); BUG_ON(li->li_block); list_del(&li->li_freeing_list); - kmem_cache_free(logfs_inode_cache, li); + call_rcu(&inode->i_rcu, logfs_i_callback); } static void logfs_destroy_inode(struct inode *inode) diff --git a/fs/minix/inode.c b/fs/minix/inode.c index fb2020858a3..ae0b83f476a 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -68,11 +68,18 @@ static struct inode *minix_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } -static void minix_destroy_inode(struct inode *inode) +static void minix_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(minix_inode_cachep, minix_i(inode)); } +static void minix_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, minix_i_callback); +} + static void init_once(void *foo) { struct minix_inode_info *ei = (struct minix_inode_info *) foo; diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index 8fb93b604e7..60047dbeb38 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -58,11 +58,18 @@ static struct inode *ncp_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } -static void ncp_destroy_inode(struct inode *inode) +static void ncp_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(ncp_inode_cachep, NCP_FINFO(inode)); } +static void ncp_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, ncp_i_callback); +} + static void init_once(void *foo) { struct ncp_inode_info *ei = (struct ncp_inode_info *) foo; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index e67e31c7341..017daa3bed3 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1438,11 +1438,18 @@ struct inode *nfs_alloc_inode(struct super_block *sb) return &nfsi->vfs_inode; } -void nfs_destroy_inode(struct inode *inode) +static void nfs_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(nfs_inode_cachep, NFS_I(inode)); } +void nfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, nfs_i_callback); +} + static inline void nfs4_init_once(struct nfs_inode *nfsi) { #ifdef CONFIG_NFS_V4 diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index d36fc7ee615..e2dcc9c733f 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -162,10 +162,13 @@ struct inode *nilfs_alloc_inode(struct super_block *sb) return &ii->vfs_inode; } -void nilfs_destroy_inode(struct inode *inode) +static void nilfs_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); struct nilfs_mdt_info *mdi = NILFS_MDT(inode); + INIT_LIST_HEAD(&inode->i_dentry); + if (mdi) { kfree(mdi->mi_bgl); /* kfree(NULL) is safe */ kfree(mdi); @@ -173,6 +176,11 @@ void nilfs_destroy_inode(struct inode *inode) kmem_cache_free(nilfs_inode_cachep, NILFS_I(inode)); } +void nilfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, nilfs_i_callback); +} + static int nilfs_sync_super(struct nilfs_sb_info *sbi, int flag) { struct the_nilfs *nilfs = sbi->s_nilfs; diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index 93622b175fc..a627ed82c0a 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -332,6 +332,13 @@ struct inode 
*ntfs_alloc_big_inode(struct super_block *sb) return NULL; } +static void ntfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(ntfs_big_inode_cache, NTFS_I(inode)); +} + void ntfs_destroy_big_inode(struct inode *inode) { ntfs_inode *ni = NTFS_I(inode); @@ -340,7 +347,7 @@ void ntfs_destroy_big_inode(struct inode *inode) BUG_ON(ni->page); if (!atomic_dec_and_test(&ni->count)) BUG(); - kmem_cache_free(ntfs_big_inode_cache, NTFS_I(inode)); + call_rcu(&inode->i_rcu, ntfs_i_callback); } static inline ntfs_inode *ntfs_alloc_extent_inode(void) diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index b2df490a19e..8c5c0eddc36 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -351,11 +351,18 @@ static struct inode *dlmfs_alloc_inode(struct super_block *sb) return &ip->ip_vfs_inode; } -static void dlmfs_destroy_inode(struct inode *inode) +static void dlmfs_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(dlmfs_inode_cache, DLMFS_I(inode)); } +static void dlmfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, dlmfs_i_callback); +} + static void dlmfs_evict_inode(struct inode *inode) { int status; diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index cfeab7ce369..17ff46fa8a1 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -569,11 +569,18 @@ static struct inode *ocfs2_alloc_inode(struct super_block *sb) return &oi->vfs_inode; } -static void ocfs2_destroy_inode(struct inode *inode) +static void ocfs2_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(ocfs2_inode_cachep, OCFS2_I(inode)); } +static void ocfs2_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, ocfs2_i_callback); +} + static unsigned long long ocfs2_max_file_offset(unsigned int bbits, unsigned int cbits) { diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index 911e61f348f..a2a5bff774e 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -343,11 +343,18 @@ static struct inode *openprom_alloc_inode(struct super_block *sb) return &oi->vfs_inode; } -static void openprom_destroy_inode(struct inode *inode) +static void openprom_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(op_inode_cachep, OP_I(inode)); } +static void openprom_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, openprom_i_callback); +} + static struct inode *openprom_iget(struct super_block *sb, ino_t ino) { struct inode *inode; diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 3ddb6068177..6bcb926b101 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -65,11 +65,18 @@ static struct inode *proc_alloc_inode(struct super_block *sb) return inode; } -static void proc_destroy_inode(struct inode *inode) +static void proc_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(proc_inode_cachep, PROC_I(inode)); } +static void proc_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, proc_i_callback); +} + static void init_once(void *foo) { struct proc_inode *ei = (struct proc_inode *) foo; diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index 
fcada42f1aa..e63b4171d58 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -425,11 +425,18 @@ static struct inode *qnx4_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } -static void qnx4_destroy_inode(struct inode *inode) +static void qnx4_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(qnx4_inode_cachep, qnx4_i(inode)); } +static void qnx4_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, qnx4_i_callback); +} + static void init_once(void *foo) { struct qnx4_inode_info *ei = (struct qnx4_inode_info *) foo; diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index b243117b875..2575682a9ea 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -529,11 +529,18 @@ static struct inode *reiserfs_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } -static void reiserfs_destroy_inode(struct inode *inode) +static void reiserfs_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(reiserfs_inode_cachep, REISERFS_I(inode)); } +static void reiserfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, reiserfs_i_callback); +} + static void init_once(void *foo) { struct reiserfs_inode_info *ei = (struct reiserfs_inode_info *)foo; diff --git a/fs/romfs/super.c b/fs/romfs/super.c index 6647f90e55c..2305e3121cb 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -400,11 +400,18 @@ static struct inode *romfs_alloc_inode(struct super_block *sb) /* * return a spent inode to the slab cache */ -static void romfs_destroy_inode(struct inode *inode) +static void romfs_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(romfs_inode_cachep, ROMFS_I(inode)); } +static void romfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, romfs_i_callback); +} + /* * get filesystem statistics */ diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 24de30ba34c..20700b9f2b4 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -440,11 +440,18 @@ static struct inode *squashfs_alloc_inode(struct super_block *sb) } -static void squashfs_destroy_inode(struct inode *inode) +static void squashfs_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(squashfs_inode_cachep, squashfs_i(inode)); } +static void squashfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, squashfs_i_callback); +} + static struct file_system_type squashfs_fs_type = { .owner = THIS_MODULE, diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index de44d067b9e..0630eb969a2 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -333,11 +333,18 @@ static struct inode *sysv_alloc_inode(struct super_block *sb) return &si->vfs_inode; } -static void sysv_destroy_inode(struct inode *inode) +static void sysv_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(sysv_inode_cachep, SYSV_I(inode)); } +static void sysv_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, sysv_i_callback); +} + static void init_once(void *p) { struct sysv_inode_info *si = (struct sysv_inode_info *)p; diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 91fac54c70e..6e11c2975dc 100644 --- 
a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -272,12 +272,20 @@ static struct inode *ubifs_alloc_inode(struct super_block *sb) return &ui->vfs_inode; }; +static void ubifs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + struct ubifs_inode *ui = ubifs_inode(inode); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(ubifs_inode_slab, ui); +} + static void ubifs_destroy_inode(struct inode *inode) { struct ubifs_inode *ui = ubifs_inode(inode); kfree(ui->data); - kmem_cache_free(ubifs_inode_slab, inode); + call_rcu(&inode->i_rcu, ubifs_i_callback); } /* diff --git a/fs/udf/super.c b/fs/udf/super.c index 4a5c7c61836..b539d53320f 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -139,11 +139,18 @@ static struct inode *udf_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } -static void udf_destroy_inode(struct inode *inode) +static void udf_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(udf_inode_cachep, UDF_I(inode)); } +static void udf_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, udf_i_callback); +} + static void init_once(void *foo) { struct udf_inode_info *ei = (struct udf_inode_info *)foo; diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 2c47daed56d..2c61ac5d4e4 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -1412,11 +1412,18 @@ static struct inode *ufs_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } -static void ufs_destroy_inode(struct inode *inode) +static void ufs_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(ufs_inode_cachep, UFS_I(inode)); } +static void ufs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, ufs_i_callback); +} + static void init_once(void *foo) { struct ufs_inode_info *ei = (struct ufs_inode_info *) foo; diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index 0cdd26932d8..d7de5a3f786 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -91,6 +91,17 @@ xfs_inode_alloc( return ip; } +STATIC void +xfs_inode_free_callback( + struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + struct xfs_inode *ip = XFS_I(inode); + + INIT_LIST_HEAD(&inode->i_dentry); + kmem_zone_free(xfs_inode_zone, ip); +} + void xfs_inode_free( struct xfs_inode *ip) @@ -134,7 +145,7 @@ xfs_inode_free( ASSERT(!spin_is_locked(&ip->i_flags_lock)); ASSERT(completion_done(&ip->i_flush)); - kmem_zone_free(xfs_inode_zone, ip); + call_rcu(&ip->i_vnode.i_rcu, xfs_inode_free_callback); } /* diff --git a/include/linux/fs.h b/include/linux/fs.h index 296cf2fde94..1ff4d0a33b2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -737,7 +737,10 @@ struct inode { struct list_head i_wb_list; /* backing dev IO list */ struct list_head i_lru; /* inode LRU list */ struct list_head i_sb_list; - struct list_head i_dentry; + union { + struct list_head i_dentry; + struct rcu_head i_rcu; + }; unsigned long i_ino; atomic_t i_count; unsigned int i_nlink; diff --git a/include/linux/net.h b/include/linux/net.h index 16faa130088..06bde490847 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -120,7 +120,6 @@ enum sock_shutdown_cmd { struct socket_wq { wait_queue_head_t wait; struct fasync_struct *fasync_list; - struct rcu_head rcu; } ____cacheline_aligned_in_smp; /** diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 035f4399edb..14fb6d67e6a 
100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -237,11 +237,18 @@ static struct inode *mqueue_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } -static void mqueue_destroy_inode(struct inode *inode) +static void mqueue_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(mqueue_inode_cachep, MQUEUE_I(inode)); } +static void mqueue_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, mqueue_i_callback); +} + static void mqueue_evict_inode(struct inode *inode) { struct mqueue_inode_info *info; diff --git a/mm/shmem.c b/mm/shmem.c index 47fdeeb9d63..5ee67c99060 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2415,13 +2415,20 @@ static struct inode *shmem_alloc_inode(struct super_block *sb) return &p->vfs_inode; } +static void shmem_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); +} + static void shmem_destroy_inode(struct inode *inode) { if ((inode->i_mode & S_IFMT) == S_IFREG) { /* only struct inode is valid if it's an inline symlink */ mpol_free_shared_policy(&SHMEM_I(inode)->policy); } - kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); + call_rcu(&inode->i_rcu, shmem_i_callback); } static void init_once(void *foo) diff --git a/net/socket.c b/net/socket.c index 088fb3fd45e..97fff3a4e72 100644 --- a/net/socket.c +++ b/net/socket.c @@ -262,20 +262,20 @@ static struct inode *sock_alloc_inode(struct super_block *sb) } -static void wq_free_rcu(struct rcu_head *head) +static void sock_free_rcu(struct rcu_head *head) { - struct socket_wq *wq = container_of(head, struct socket_wq, rcu); + struct inode *inode = container_of(head, struct inode, i_rcu); + struct socket_alloc *ei = container_of(inode, struct socket_alloc, + vfs_inode); - kfree(wq); + kfree(ei->socket.wq); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(sock_inode_cachep, ei); } static void sock_destroy_inode(struct inode *inode) { - struct socket_alloc *ei; - - ei = container_of(inode, struct socket_alloc, vfs_inode); - call_rcu(&ei->socket.wq->rcu, wq_free_rcu); - kmem_cache_free(sock_inode_cachep, ei); + call_rcu(&inode->i_rcu, sock_free_rcu); } static void init_once(void *foo) diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index a0dc1a86fce..2899fe27f88 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -162,11 +162,19 @@ rpc_alloc_inode(struct super_block *sb) } static void -rpc_destroy_inode(struct inode *inode) +rpc_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(rpc_inode_cachep, RPC_I(inode)); } +static void +rpc_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, rpc_i_callback); +} + static int rpc_pipe_open(struct inode *inode, struct file *filp) { -- cgit v1.2.3 From 31e6b01f4183ff419a6d1f86177cbf4662347cec Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:52 +1100 Subject: fs: rcu-walk for path lookup Perform common cases of path lookups without any stores or locking in the ancestor dentry elements. This is called rcu-walk, as opposed to the current algorithm which is a refcount based walk, or ref-walk. This results in far fewer atomic operations on every path element, significantly improving path lookup performance. 
It also avoids cacheline bouncing on common dentries, significantly improving scalability.

The overall design is like this:

* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk.
* Take the RCU lock for the entire path walk, starting with the acquiring of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are not required for dentry persistence.
* synchronize_rcu is called when unregistering a filesystem, so we can access d_ops and i_ops during rcu-walk.
* Similarly take the vfsmount lock for the entire path walk. So now mnt refcounts are not required for persistence. Also we are free to perform mount lookups, and to assume dentry mount points and mount roots are stable up and down the path.
* Have a per-dentry seqlock to protect the dentry name, parent, and inode, so we can load this tuple atomically, and also check whether any of its members have changed.
* Dentry lookups (based on parent, candidate string tuple) recheck the parent sequence after the child is found in case anything changed in the parent during the path walk.
* inode is also RCU protected so we can load d_inode and use the inode for limited things.
* i_mode, i_uid, i_gid can be tested for exec permissions during path walk.
* i_op can be loaded.

When we reach the destination dentry, we lock it, recheck the lookup sequence, and increment its refcount and mountpoint refcount. RCU and vfsmount locks are dropped. This is termed "dropping rcu-walk". If the dentry refcount does not match, we cannot drop rcu-walk gracefully at the current point in the lookup, so instead return -ECHILD (for want of a better errno). This signals the path walking code to re-do the entire lookup with a ref-walk.

Aside from the final dentry, there are other situations that may be encountered where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take a reference on the last good dentry) and continue with a ref-walk. Again, if we cannot drop rcu-walk gracefully, we return -ECHILD and do the whole lookup using ref-walk. But it is very important that we can continue with ref-walk for most cases, particularly to avoid the overhead of double lookups, and to gain the scalability advantages on common path elements (like cwd and root).

The cases where rcu-walk cannot continue are:
* NULL dentry (ie. any uncached path element)
* parent with d_inode->i_op->permission or ACLs
* dentries with d_revalidate
* Following links

In future patches, permission checks and d_revalidate become rcu-walk aware. It may be possible eventually to make following links rcu-walk aware.

Uncached path elements will always require dropping to ref-walk mode, at the very least because i_mutex needs to be grabbed, and objects allocated.
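As a rough illustration (not code added by this patch), "dropping rcu-walk" on a dentry boils down to taking its d_lock, re-validating the d_seq snapshot taken during the walk, and only then converting the speculative result into a normal reference. The sketch below assumes the d_count-under-d_lock and d_seq rules this series introduces; the function name is made up, and it omits the parent handling and the vfsmount-lock drop that the real helpers (e.g. __d_rcu_to_refcount() and the namei code) also perform.

        /* Illustrative sketch only -- not the code this patch adds. */
        #include <linux/dcache.h>
        #include <linux/errno.h>
        #include <linux/rcupdate.h>
        #include <linux/seqlock.h>
        #include <linux/spinlock.h>

        static int drop_rcu_walk_sketch(struct dentry *dentry, unsigned seq)
        {
                spin_lock(&dentry->d_lock);
                if (read_seqcount_retry(&dentry->d_seq, seq)) {
                        /*
                         * The dentry changed since the rcu-walk snapshot:
                         * we cannot take a reference here, so fail with
                         * -ECHILD and let the caller restart the whole
                         * lookup in ref-walk mode.
                         */
                        spin_unlock(&dentry->d_lock);
                        return -ECHILD;
                }
                dentry->d_count++;      /* plain int, protected by d_lock */
                spin_unlock(&dentry->d_lock);
                rcu_read_unlock();      /* leave rcu-walk; dentry is now pinned */
                return 0;
        }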
Signed-off-by: Nick Piggin --- Documentation/filesystems/dentry-locking.txt | 172 ------- Documentation/filesystems/path-lookup.txt | 345 +++++++++++++ fs/dcache.c | 203 +++++++- fs/filesystems.c | 3 + fs/namei.c | 743 ++++++++++++++++++++++----- fs/proc/proc_sysctl.c | 4 + include/linux/dcache.h | 32 +- include/linux/namei.h | 15 +- include/linux/security.h | 8 +- security/security.c | 9 + 10 files changed, 1194 insertions(+), 340 deletions(-) delete mode 100644 Documentation/filesystems/dentry-locking.txt create mode 100644 Documentation/filesystems/path-lookup.txt (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/dentry-locking.txt b/Documentation/filesystems/dentry-locking.txt deleted file mode 100644 index 30b6a40f565..00000000000 --- a/Documentation/filesystems/dentry-locking.txt +++ /dev/null @@ -1,172 +0,0 @@ -RCU-based dcache locking model -============================== - -On many workloads, the most common operation on dcache is to look up a -dentry, given a parent dentry and the name of the child. Typically, -for every open(), stat() etc., the dentry corresponding to the -pathname will be looked up by walking the tree starting with the first -component of the pathname and using that dentry along with the next -component to look up the next level and so on. Since it is a frequent -operation for workloads like multiuser environments and web servers, -it is important to optimize this path. - -Prior to 2.5.10, dcache_lock was acquired in d_lookup and thus in -every component during path look-up. Since 2.5.10 onwards, fast-walk -algorithm changed this by holding the dcache_lock at the beginning and -walking as many cached path component dentries as possible. This -significantly decreases the number of acquisition of -dcache_lock. However it also increases the lock hold time -significantly and affects performance in large SMP machines. Since -2.5.62 kernel, dcache has been using a new locking model that uses RCU -to make dcache look-up lock-free. - -The current dcache locking model is not very different from the -existing dcache locking model. Prior to 2.5.62 kernel, dcache_lock -protected the hash chain, d_child, d_alias, d_lru lists as well as -d_inode and several other things like mount look-up. RCU-based changes -affect only the way the hash chain is protected. For everything else -the dcache_lock must be taken for both traversing as well as -updating. The hash chain updates too take the dcache_lock. The -significant change is the way d_lookup traverses the hash chain, it -doesn't acquire the dcache_lock for this and rely on RCU to ensure -that the dentry has not been *freed*. - -dcache_lock no longer exists, dentry locking is explained in fs/dcache.c - -Dcache locking details -====================== - -For many multi-user workloads, open() and stat() on files are very -frequently occurring operations. Both involve walking of path names to -find the dentry corresponding to the concerned file. In 2.4 kernel, -dcache_lock was held during look-up of each path component. Contention -and cache-line bouncing of this global lock caused significant -scalability problems. With the introduction of RCU in Linux kernel, -this was worked around by making the look-up of path components during -path walking lock-free. - - -Safe lock-free look-up of dcache hash table -=========================================== - -Dcache is a complex data structure with the hash table entries also -linked together in other lists. In 2.4 kernel, dcache_lock protected -all the lists. 
RCU dentry hash walking works like this: - -1. The deletion from hash chain is done using hlist_del_rcu() macro - which doesn't initialize next pointer of the deleted dentry and - this allows us to walk safely lock-free while a deletion is - happening. This is a standard hlist_rcu iteration. - -2. Insertion of a dentry into the hash table is done using - hlist_add_head_rcu() which take care of ordering the writes - the - writes to the dentry must be visible before the dentry is - inserted. This works in conjunction with hlist_for_each_rcu(), - which has since been replaced by hlist_for_each_entry_rcu(), while - walking the hash chain. The only requirement is that all - initialization to the dentry must be done before - hlist_add_head_rcu() since we don't have lock protection - while traversing the hash chain. - -3. The dentry looked up without holding locks cannot be returned for - walking if it is unhashed. It then may have a NULL d_inode or other - bogosity since RCU doesn't protect the other fields in the dentry. We - therefore use a flag DCACHE_UNHASHED to indicate unhashed dentries - and use this in conjunction with a per-dentry lock (d_lock). Once - looked up without locks, we acquire the per-dentry lock (d_lock) and - check if the dentry is unhashed. If so, the look-up is failed. If not, - the reference count of the dentry is increased and the dentry is - returned. - -4. Once a dentry is looked up, it must be ensured during the path walk - for that component it doesn't go away. In pre-2.5.10 code, this was - done holding a reference to the dentry. dcache_rcu does the same. - In some sense, dcache_rcu path walking looks like the pre-2.5.10 - version. - -5. All dentry hash chain updates must take the per-dentry lock (see - fs/dcache.c). This excludes dput() to ensure that a dentry that has - been looked up concurrently does not get deleted before dget() can - take a ref. - -6. There are several ways to do reference counting of RCU protected - objects. One such example is in ipv4 route cache where deferred - freeing (using call_rcu()) is done as soon as the reference count - goes to zero. This cannot be done in the case of dentries because - tearing down of dentries require blocking (dentry_iput()) which - isn't supported from RCU callbacks. Instead, tearing down of - dentries happen synchronously in dput(), but actual freeing happens - later when RCU grace period is over. This allows safe lock-free - walking of the hash chains, but a matched dentry may have been - partially torn down. The checking of DCACHE_UNHASHED flag with - d_lock held detects such dentries and prevents them from being - returned from look-up. - - -Maintaining POSIX rename semantics -================================== - -Since look-up of dentries is lock-free, it can race against a -concurrent rename operation. For example, during rename of file A to -B, look-up of either A or B must succeed. So, if look-up of B happens -after A has been removed from the hash chain but not added to the new -hash chain, it may fail. Also, a comparison while the name is being -written concurrently by a rename may result in false positive matches -violating rename semantics. Issues related to race with rename are -handled as described below : - -1. Look-up can be done in two ways - d_lookup() which is safe from - simultaneous renames and __d_lookup() which is not. If - __d_lookup() fails, it must be followed up by a d_lookup() to - correctly determine whether a dentry is in the hash table or - not. 
d_lookup() protects look-ups using a sequence lock - (rename_lock). - -2. The name associated with a dentry (d_name) may be changed if a - rename is allowed to happen simultaneously. To avoid memcmp() in - __d_lookup() go out of bounds due to a rename and false positive - comparison, the name comparison is done while holding the - per-dentry lock. This prevents concurrent renames during this - operation. - -3. Hash table walking during look-up may move to a different bucket as - the current dentry is moved to a different bucket due to rename. - But we use hlists in dcache hash table and they are - null-terminated. So, even if a dentry moves to a different bucket, - hash chain walk will terminate. [with a list_head list, it may not - since termination is when the list_head in the original bucket is - reached]. Since we redo the d_parent check and compare name while - holding d_lock, lock-free look-up will not race against d_move(). - -4. There can be a theoretical race when a dentry keeps coming back to - original bucket due to double moves. Due to this look-up may - consider that it has never moved and can end up in a infinite loop. - But this is not any worse that theoretical livelocks we already - have in the kernel. - - -Important guidelines for filesystem developers related to dcache_rcu -==================================================================== - -1. Existing dcache interfaces (pre-2.5.62) exported to filesystem - don't change. Only dcache internal implementation changes. However - filesystems *must not* delete from the dentry hash chains directly - using the list macros like allowed earlier. They must use dcache - APIs like d_drop() or __d_drop() depending on the situation. - -2. d_flags is now protected by a per-dentry lock (d_lock). All access - to d_flags must be protected by it. - -3. For a hashed dentry, checking of d_count needs to be protected by - d_lock. - - -Papers and other documentation on dcache locking -================================================ - -1. Scaling dcache with RCU (http://linuxjournal.com/article.php?sid=7124). - -2. http://lse.sourceforge.net/locking/dcache/dcache.html - - - diff --git a/Documentation/filesystems/path-lookup.txt b/Documentation/filesystems/path-lookup.txt new file mode 100644 index 00000000000..09b2878724a --- /dev/null +++ b/Documentation/filesystems/path-lookup.txt @@ -0,0 +1,345 @@ +Path walking and name lookup locking +==================================== + +Path resolution is the finding a dentry corresponding to a path name string, by +performing a path walk. Typically, for every open(), stat() etc., the path name +will be resolved. Paths are resolved by walking the namespace tree, starting +with the first component of the pathname (eg. root or cwd) with a known dentry, +then finding the child of that dentry, which is named the next component in the +path string. Then repeating the lookup from the child dentry and finding its +child with the next element, and so on. + +Since it is a frequent operation for workloads like multiuser environments and +web servers, it is important to optimize this code. + +Path walking synchronisation history: +Prior to 2.5.10, dcache_lock was acquired in d_lookup (dcache hash lookup) and +thus in every component during path look-up. Since 2.5.10 onwards, fast-walk +algorithm changed this by holding the dcache_lock at the beginning and walking +as many cached path component dentries as possible. This significantly +decreases the number of acquisition of dcache_lock. 
However it also increases +the lock hold time significantly and affects performance in large SMP machines. +Since 2.5.62 kernel, dcache has been using a new locking model that uses RCU to +make dcache look-up lock-free. + +All the above algorithms required taking a lock and reference count on the +dentry that was looked up, so that may be used as the basis for walking the +next path element. This is inefficient and unscalable. It is inefficient +because of the locks and atomic operations required for every dentry element +slows things down. It is not scalable because many parallel applications that +are path-walk intensive tend to do path lookups starting from a common dentry +(usually, the root "/" or current working directory). So contention on these +common path elements causes lock and cacheline queueing. + +Since 2.6.38, RCU is used to make a significant part of the entire path walk +(including dcache look-up) completely "store-free" (so, no locks, atomics, or +even stores into cachelines of common dentries). This is known as "rcu-walk" +path walking. + +Path walking overview +===================== + +A name string specifies a start (root directory, cwd, fd-relative) and a +sequence of elements (directory entry names), which together refer to a path in +the namespace. A path is represented as a (dentry, vfsmount) tuple. The name +elements are sub-strings, seperated by '/'. + +Name lookups will want to find a particular path that a name string refers to +(usually the final element, or parent of final element). This is done by taking +the path given by the name's starting point (which we know in advance -- eg. +current->fs->cwd or current->fs->root) as the first parent of the lookup. Then +iteratively for each subsequent name element, look up the child of the current +parent with the given name and if it is not the desired entry, make it the +parent for the next lookup. + +A parent, of course, must be a directory, and we must have appropriate +permissions on the parent inode to be able to walk into it. + +Turning the child into a parent for the next lookup requires more checks and +procedures. Symlinks essentially substitute the symlink name for the target +name in the name string, and require some recursive path walking. Mount points +must be followed into (thus changing the vfsmount that subsequent path elements +refer to), switching from the mount point path to the root of the particular +mounted vfsmount. These behaviours are variously modified depending on the +exact path walking flags. + +Path walking then must, broadly, do several particular things: +- find the start point of the walk; +- perform permissions and validity checks on inodes; +- perform dcache hash name lookups on (parent, name element) tuples; +- traverse mount points; +- traverse symlinks; +- lookup and create missing parts of the path on demand. + +Safe store-free look-up of dcache hash table +============================================ + +Dcache name lookup +------------------ +In order to lookup a dcache (parent, name) tuple, we take a hash on the tuple +and use that to select a bucket in the dcache-hash table. The list of entries +in that bucket is then walked, and we do a full comparison of each entry +against our (parent, name) tuple. + +The hash lists are RCU protected, so list walking is not serialised with +concurrent updates (insertion, deletion from the hash). This is a standard RCU +list application with the exception of renames, which will be covered below. 
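Concretely, that bucket walk is an RCU hlist iteration with a per-entry d_seq snapshot. The following is a rough, illustration-only skeleton, not the __d_lookup_rcu() introduced later in this patch: it omits the d_op->d_compare case, the careful re-read of the name length under d_seq, and the rename_lock retry described below.

        /* Illustration only -- simplified, not the lookup this patch adds. */
        #include <linux/dcache.h>
        #include <linux/rculist.h>
        #include <linux/seqlock.h>
        #include <linux/string.h>

        static struct dentry *bucket_walk_sketch(struct hlist_head *head,
                                                 struct dentry *parent,
                                                 const struct qstr *name,
                                                 unsigned *seq)
        {
                struct hlist_node *node;
                struct dentry *dentry;

                /*
                 * Caller holds rcu_read_lock(); hlist_add_head_rcu() and
                 * hlist_del_rcu() keep the chain walkable concurrently
                 * with updates.
                 */
                hlist_for_each_entry_rcu(dentry, node, head, d_hash) {
                        if (dentry->d_name.hash != name->hash)
                                continue;
                        /* snapshot d_seq so name/parent/inode are coherent */
                        *seq = read_seqcount_begin(&dentry->d_seq);
                        if (dentry->d_parent != parent || d_unhashed(dentry))
                                continue;
                        if (dentry->d_name.len != name->len)
                                continue;
                        if (memcmp(dentry->d_name.name, name->name, name->len))
                                continue;
                        return dentry;  /* only usable after rechecking *seq */
                }
                return NULL;
        }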
+ +Parent and name members of a dentry, as well as its membership in the dcache +hash, and its inode are protected by the per-dentry d_lock spinlock. A +reference is taken on the dentry (while the fields are verified under d_lock), +and this stabilises its d_inode pointer and actual inode. This gives a stable +point to perform the next step of our path walk against. + +These members are also protected by d_seq seqlock, although this offers +read-only protection and no durability of results, so care must be taken when +using d_seq for synchronisation (see seqcount based lookups, below). + +Renames +------- +Back to the rename case. In usual RCU protected lists, the only operations that +will happen to an object is insertion, and then eventually removal from the +list. The object will not be reused until an RCU grace period is complete. +This ensures the RCU list traversal primitives can run over the object without +problems (see RCU documentation for how this works). + +However when a dentry is renamed, its hash value can change, requiring it to be +moved to a new hash list. Allocating and inserting a new alias would be +expensive and also problematic for directory dentries. Latency would be far to +high to wait for a grace period after removing the dentry and before inserting +it in the new hash bucket. So what is done is to insert the dentry into the +new list immediately. + +However, when the dentry's list pointers are updated to point to objects in the +new list before waiting for a grace period, this can result in a concurrent RCU +lookup of the old list veering off into the new (incorrect) list and missing +the remaining dentries on the list. + +There is no fundamental problem with walking down the wrong list, because the +dentry comparisons will never match. However it is fatal to miss a matching +dentry. So a seqlock is used to detect when a rename has occurred, and so the +lookup can be retried. + + 1 2 3 + +---+ +---+ +---+ +hlist-->| N-+->| N-+->| N-+-> +head <--+-P |<-+-P |<-+-P | + +---+ +---+ +---+ + +Rename of dentry 2 may require it deleted from the above list, and inserted +into a new list. Deleting 2 gives the following list. + + 1 3 + +---+ +---+ (don't worry, the longer pointers do not +hlist-->| N-+-------->| N-+-> impose a measurable performance overhead +head <--+-P |<--------+-P | on modern CPUs) + +---+ +---+ + ^ 2 ^ + | +---+ | + | | N-+----+ + +----+-P | + +---+ + +This is a standard RCU-list deletion, which leaves the deleted object's +pointers intact, so a concurrent list walker that is currently looking at +object 2 will correctly continue to object 3 when it is time to traverse the +next object. + +However, when inserting object 2 onto a new list, we end up with this: + + 1 3 + +---+ +---+ +hlist-->| N-+-------->| N-+-> +head <--+-P |<--------+-P | + +---+ +---+ + 2 + +---+ + | N-+----> + <----+-P | + +---+ + +Because we didn't wait for a grace period, there may be a concurrent lookup +still at 2. Now when it follows 2's 'next' pointer, it will walk off into +another list without ever having checked object 3. + +A related, but distinctly different, issue is that of rename atomicity versus +lookup operations. If a file is renamed from 'A' to 'B', a lookup must only +find either 'A' or 'B'. So if a lookup of 'A' returns NULL, a subsequent lookup +of 'B' must succeed (note the reverse is not true). + +Between deleting the dentry from the old hash list, and inserting it on the new +hash list, a lookup may find neither 'A' nor 'B' matching the dentry. 
The same +rename seqlock is also used to cover this race in much the same way, by +retrying a negative lookup result if a rename was in progress. + +Seqcount based lookups +---------------------- +In refcount based dcache lookups, d_lock is used to serialise access to +the dentry, stabilising it while comparing its name and parent and then +taking a reference count (the reference count then gives a stable place to +start the next part of the path walk from). + +As explained above, we would like to do path walking without taking locks or +reference counts on intermediate dentries along the path. To do this, a per +dentry seqlock (d_seq) is used to take a "coherent snapshot" of what the dentry +looks like (its name, parent, and inode). That snapshot is then used to start +the next part of the path walk. When loading the coherent snapshot under d_seq, +care must be taken to load the members up-front, and use those pointers rather +than reloading from the dentry later on (otherwise we'd have interesting things +like d_inode going NULL underneath us, if the name was unlinked). + +Also important is to avoid performing any destructive operations (pretty much: +no non-atomic stores to shared data), and to recheck the seqcount when we are +"done" with the operation. Retry or abort if the seqcount does not match. +Avoiding destructive or changing operations means we can easily unwind from +failure. + +What this means is that a caller, provided they are holding RCU lock to +protect the dentry object from disappearing, can perform a seqcount based +lookup which does not increment the refcount on the dentry or write to +it in any way. This returned dentry can be used for subsequent operations, +provided that d_seq is rechecked after that operation is complete. + +Inodes are also rcu freed, so the seqcount lookup dentry's inode may also be +queried for permissions. + +With this two parts of the puzzle, we can do path lookups without taking +locks or refcounts on dentry elements. + +RCU-walk path walking design +============================ + +Path walking code now has two distinct modes, ref-walk and rcu-walk. ref-walk +is the traditional[*] way of performing dcache lookups using d_lock to +serialise concurrent modifications to the dentry and take a reference count on +it. ref-walk is simple and obvious, and may sleep, take locks, etc while path +walking is operating on each dentry. rcu-walk uses seqcount based dentry +lookups, and can perform lookup of intermediate elements without any stores to +shared data in the dentry or inode. rcu-walk can not be applied to all cases, +eg. if the filesystem must sleep or perform non trivial operations, rcu-walk +must be switched to ref-walk mode. + +[*] RCU is still used for the dentry hash lookup in ref-walk, but not the full + path walk. + +Where ref-walk uses a stable, refcounted ``parent'' to walk the remaining +path string, rcu-walk uses a d_seq protected snapshot. When looking up a +child of this parent snapshot, we open d_seq critical section on the child +before closing d_seq critical section on the parent. This gives an interlocking +ladder of snapshots to walk down. + + + proc 101 + /----------------\ + / comm: "vi" \ + / fs.root: dentry0 \ + \ fs.cwd: dentry2 / + \ / + \----------------/ + +So when vi wants to open("/home/npiggin/test.c", O_RDWR), then it will +start from current->fs->root, which is a pinned dentry. Alternatively, +"./test.c" would start from cwd; both names refer to the same path in +the context of proc101. 
+ + dentry 0 + +---------------------+ rcu-walk begins here, we note d_seq, check the + | name: "/" | inode's permission, and then look up the next + | inode: 10 | path element which is "home"... + | children:"home", ...| + +---------------------+ + | + dentry 1 V + +---------------------+ ... which brings us here. We find dentry1 via + | name: "home" | hash lookup, then note d_seq and compare name + | inode: 678 | string and parent pointer. When we have a match, + | children:"npiggin" | we now recheck the d_seq of dentry0. Then we + +---------------------+ check inode and look up the next element. + | + dentry2 V + +---------------------+ Note: if dentry0 is now modified, lookup is + | name: "npiggin" | not necessarily invalid, so we need only keep a + | inode: 543 | parent for d_seq verification, and grandparents + | children:"a.c", ... | can be forgotten. + +---------------------+ + | + dentry3 V + +---------------------+ At this point we have our destination dentry. + | name: "a.c" | We now take its d_lock, verify d_seq of this + | inode: 14221 | dentry. If that checks out, we can increment + | children:NULL | its refcount because we're holding d_lock. + +---------------------+ + +Taking a refcount on a dentry from rcu-walk mode, by taking its d_lock, +re-checking its d_seq, and then incrementing its refcount is called +"dropping rcu" or dropping from rcu-walk into ref-walk mode. + +It is, in some sense, a bit of a house of cards. If the seqcount check of the +parent snapshot fails, the house comes down, because we had closed the d_seq +section on the grandparent, so we have nothing left to stand on. In that case, +the path walk must be fully restarted (which we do in ref-walk mode, to avoid +live locks). It is costly to have a full restart, but fortunately they are +quite rare. + +When we reach a point where sleeping is required, or a filesystem callout +requires ref-walk, then instead of restarting the walk, we attempt to drop rcu +at the last known good dentry we have. Avoiding a full restart in ref-walk in +these cases is fundamental for performance and scalability because blocking +operations such as creates and unlinks are not uncommon. + +The detailed design for rcu-walk is like this: +* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk. +* Take the RCU lock for the entire path walk, starting with the acquiring + of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are + not required for dentry persistence. +* synchronize_rcu is called when unregistering a filesystem, so we can + access d_ops and i_ops during rcu-walk. +* Similarly take the vfsmount lock for the entire path walk. So now mnt + refcounts are not required for persistence. Also we are free to perform mount + lookups, and to assume dentry mount points and mount roots are stable up and + down the path. +* Have a per-dentry seqlock to protect the dentry name, parent, and inode, + so we can load this tuple atomically, and also check whether any of its + members have changed. +* Dentry lookups (based on parent, candidate string tuple) recheck the parent + sequence after the child is found in case anything changed in the parent + during the path walk. +* inode is also RCU protected so we can load d_inode and use the inode for + limited things. +* i_mode, i_uid, i_gid can be tested for exec permissions during path walk. +* i_op can be loaded. +* When the destination dentry is reached, drop rcu there (ie. take d_lock, + verify d_seq, increment refcount). 
+* If seqlock verification fails anywhere along the path, do a full restart + of the path lookup in ref-walk mode. -ECHILD tends to be used (for want of + a better errno) to signal an rcu-walk failure. + +The cases where rcu-walk cannot continue are: +* NULL dentry (ie. any uncached path element) +* parent with d_inode->i_op->permission or ACLs +* dentries with d_revalidate +* Following links + +In future patches, permission checks and d_revalidate become rcu-walk aware. It +may be possible eventually to make following links rcu-walk aware. + +Uncached path elements will always require dropping to ref-walk mode, at the +very least because i_mutex needs to be grabbed, and objects allocated. + +Final note: +"store-free" path walking is not strictly store free. We take vfsmount lock +and refcounts (both of which can be made per-cpu), and we also store to the +stack (which is essentially CPU-local), and we also have to take locks and +refcount on final dentry. + +The point is that shared data, where practically possible, is not locked +or stored into. The result is massive improvements in performance and +scalability of path resolution. + + +Papers and other documentation on dcache locking +================================================ + +1. Scaling dcache with RCU (http://linuxjournal.com/article.php?sid=7124). + +2. http://lse.sourceforge.net/locking/dcache/dcache.html diff --git a/fs/dcache.c b/fs/dcache.c index dc0551c9755..187fea04010 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -152,9 +152,23 @@ static void d_free(struct dentry *dentry) call_rcu(&dentry->d_u.d_rcu, __d_free); } +/** + * dentry_rcuwalk_barrier - invalidate in-progress rcu-walk lookups + * After this call, in-progress rcu-walk path lookup will fail. This + * should be called after unhashing, and after changing d_inode (if + * the dentry has not already been unhashed). + */ +static inline void dentry_rcuwalk_barrier(struct dentry *dentry) +{ + assert_spin_locked(&dentry->d_lock); + /* Go through a barrier */ + write_seqcount_barrier(&dentry->d_seq); +} + /* * Release the dentry's inode, using the filesystem - * d_iput() operation if defined. + * d_iput() operation if defined. Dentry has no refcount + * and is unhashed. */ static void dentry_iput(struct dentry * dentry) __releases(dentry->d_lock) @@ -178,6 +192,28 @@ static void dentry_iput(struct dentry * dentry) } } +/* + * Release the dentry's inode, using the filesystem + * d_iput() operation if defined. dentry remains in-use. + */ +static void dentry_unlink_inode(struct dentry * dentry) + __releases(dentry->d_lock) + __releases(dcache_inode_lock) +{ + struct inode *inode = dentry->d_inode; + dentry->d_inode = NULL; + list_del_init(&dentry->d_alias); + dentry_rcuwalk_barrier(dentry); + spin_unlock(&dentry->d_lock); + spin_unlock(&dcache_inode_lock); + if (!inode->i_nlink) + fsnotify_inoderemove(inode); + if (dentry->d_op && dentry->d_op->d_iput) + dentry->d_op->d_iput(dentry, inode); + else + iput(inode); +} + /* * dentry_lru_(add|del|move_tail) must be called with d_lock held. 
*/ @@ -272,6 +308,7 @@ void __d_drop(struct dentry *dentry) spin_lock(&dcache_hash_lock); hlist_del_rcu(&dentry->d_hash); spin_unlock(&dcache_hash_lock); + dentry_rcuwalk_barrier(dentry); } } EXPORT_SYMBOL(__d_drop); @@ -309,6 +346,7 @@ relock: spin_unlock(&dcache_inode_lock); goto relock; } + if (ref) dentry->d_count--; /* if dentry was on the d_lru list delete it from there */ @@ -1221,6 +1259,7 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name) dentry->d_count = 1; dentry->d_flags = DCACHE_UNHASHED; spin_lock_init(&dentry->d_lock); + seqcount_init(&dentry->d_seq); dentry->d_inode = NULL; dentry->d_parent = NULL; dentry->d_sb = NULL; @@ -1269,6 +1308,7 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode) if (inode) list_add(&dentry->d_alias, &inode->i_dentry); dentry->d_inode = inode; + dentry_rcuwalk_barrier(dentry); spin_unlock(&dentry->d_lock); fsnotify_d_instantiate(dentry, inode); } @@ -1610,6 +1650,111 @@ err_out: } EXPORT_SYMBOL(d_add_ci); +/** + * __d_lookup_rcu - search for a dentry (racy, store-free) + * @parent: parent dentry + * @name: qstr of name we wish to find + * @seq: returns d_seq value at the point where the dentry was found + * @inode: returns dentry->d_inode when the inode was found valid. + * Returns: dentry, or NULL + * + * __d_lookup_rcu is the dcache lookup function for rcu-walk name + * resolution (store-free path walking) design described in + * Documentation/filesystems/path-lookup.txt. + * + * This is not to be used outside core vfs. + * + * __d_lookup_rcu must only be used in rcu-walk mode, ie. with vfsmount lock + * held, and rcu_read_lock held. The returned dentry must not be stored into + * without taking d_lock and checking d_seq sequence count against @seq + * returned here. + * + * A refcount may be taken on the found dentry with the __d_rcu_to_refcount + * function. + * + * Alternatively, __d_lookup_rcu may be called again to look up the child of + * the returned dentry, so long as its parent's seqlock is checked after the + * child is looked up. Thus, an interlocking stepping of sequence lock checks + * is formed, giving integrity down the path walk. + */ +struct dentry *__d_lookup_rcu(struct dentry *parent, struct qstr *name, + unsigned *seq, struct inode **inode) +{ + unsigned int len = name->len; + unsigned int hash = name->hash; + const unsigned char *str = name->name; + struct hlist_head *head = d_hash(parent, hash); + struct hlist_node *node; + struct dentry *dentry; + + /* + * Note: There is significant duplication with __d_lookup_rcu which is + * required to prevent single threaded performance regressions + * especially on architectures where smp_rmb (in seqcounts) are costly. + * Keep the two functions in sync. + */ + + /* + * The hash list is protected using RCU. + * + * Carefully use d_seq when comparing a candidate dentry, to avoid + * races with d_move(). + * + * It is possible that concurrent renames can mess up our list + * walk here and result in missing our dentry, resulting in the + * false-negative result. d_lookup() protects against concurrent + * renames using rename_lock seqlock. + * + * See Documentation/vfs/dcache-locking.txt for more details. 
+ */ + hlist_for_each_entry_rcu(dentry, node, head, d_hash) { + struct inode *i; + const char *tname; + int tlen; + + if (dentry->d_name.hash != hash) + continue; + +seqretry: + *seq = read_seqcount_begin(&dentry->d_seq); + if (dentry->d_parent != parent) + continue; + if (d_unhashed(dentry)) + continue; + tlen = dentry->d_name.len; + tname = dentry->d_name.name; + i = dentry->d_inode; + /* + * This seqcount check is required to ensure name and + * len are loaded atomically, so as not to walk off the + * edge of memory when walking. If we could load this + * atomically some other way, we could drop this check. + */ + if (read_seqcount_retry(&dentry->d_seq, *seq)) + goto seqretry; + if (parent->d_op && parent->d_op->d_compare) { + if (parent->d_op->d_compare(parent, *inode, + dentry, i, + tlen, tname, name)) + continue; + } else { + if (tlen != len) + continue; + if (memcmp(tname, str, tlen)) + continue; + } + /* + * No extra seqcount check is required after the name + * compare. The caller must perform a seqcount check in + * order to do anything useful with the returned dentry + * anyway. + */ + *inode = i; + return dentry; + } + return NULL; +} + /** * d_lookup - search for a dentry * @parent: parent dentry @@ -1621,9 +1766,9 @@ EXPORT_SYMBOL(d_add_ci); * dentry is returned. The caller must use dput to free the entry when it has * finished using it. %NULL is returned if the dentry does not exist. */ -struct dentry * d_lookup(struct dentry * parent, struct qstr * name) +struct dentry *d_lookup(struct dentry *parent, struct qstr *name) { - struct dentry * dentry = NULL; + struct dentry *dentry; unsigned seq; do { @@ -1636,7 +1781,7 @@ struct dentry * d_lookup(struct dentry * parent, struct qstr * name) } EXPORT_SYMBOL(d_lookup); -/* +/** * __d_lookup - search for a dentry (racy) * @parent: parent dentry * @name: qstr of name we wish to find @@ -1651,16 +1796,23 @@ EXPORT_SYMBOL(d_lookup); * * __d_lookup callers must be commented. */ -struct dentry * __d_lookup(struct dentry * parent, struct qstr * name) +struct dentry *__d_lookup(struct dentry *parent, struct qstr *name) { unsigned int len = name->len; unsigned int hash = name->hash; const unsigned char *str = name->name; struct hlist_head *head = d_hash(parent,hash); - struct dentry *found = NULL; struct hlist_node *node; + struct dentry *found = NULL; struct dentry *dentry; + /* + * Note: There is significant duplication with __d_lookup_rcu which is + * required to prevent single threaded performance regressions + * especially on architectures where smp_rmb (in seqcounts) are costly. + * Keep the two functions in sync. + */ + /* * The hash list is protected using RCU. * @@ -1677,24 +1829,15 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name) rcu_read_lock(); hlist_for_each_entry_rcu(dentry, node, head, d_hash) { - struct qstr *qstr; + const char *tname; + int tlen; if (dentry->d_name.hash != hash) continue; - if (dentry->d_parent != parent) - continue; spin_lock(&dentry->d_lock); - - /* - * Recheck the dentry after taking the lock - d_move may have - * changed things. Don't bother checking the hash because - * we're about to compare the whole name anyway. - */ if (dentry->d_parent != parent) goto next; - - /* non-existing due to RCU? */ if (d_unhashed(dentry)) goto next; @@ -1702,16 +1845,17 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name) * It is safe to compare names since d_move() cannot * change the qstr (protected by d_lock). 
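 *
 * Note that __d_lookup_rcu() above achieves the same guarantee without
 * taking d_lock: there the name and length are loaded under d_seq and
 * rechecked with read_seqcount_retry() before being used.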
*/ - qstr = &dentry->d_name; + tlen = dentry->d_name.len; + tname = dentry->d_name.name; if (parent->d_op && parent->d_op->d_compare) { if (parent->d_op->d_compare(parent, parent->d_inode, dentry, dentry->d_inode, - qstr->len, qstr->name, name)) + tlen, tname, name)) goto next; } else { - if (qstr->len != len) + if (tlen != len) goto next; - if (memcmp(qstr->name, str, len)) + if (memcmp(tname, str, tlen)) goto next; } @@ -1821,7 +1965,7 @@ again: goto again; } dentry->d_flags &= ~DCACHE_CANT_MOUNT; - dentry_iput(dentry); + dentry_unlink_inode(dentry); fsnotify_nameremove(dentry, isdir); return; } @@ -1884,7 +2028,9 @@ void dentry_update_name_case(struct dentry *dentry, struct qstr *name) BUG_ON(dentry->d_name.len != name->len); /* d_lookup gives this */ spin_lock(&dentry->d_lock); + write_seqcount_begin(&dentry->d_seq); memcpy((unsigned char *)dentry->d_name.name, name->name, name->len); + write_seqcount_end(&dentry->d_seq); spin_unlock(&dentry->d_lock); } EXPORT_SYMBOL(dentry_update_name_case); @@ -1997,6 +2143,9 @@ void d_move(struct dentry * dentry, struct dentry * target) dentry_lock_for_move(dentry, target); + write_seqcount_begin(&dentry->d_seq); + write_seqcount_begin(&target->d_seq); + /* Move the dentry to the target hash queue, if on different bucket */ spin_lock(&dcache_hash_lock); if (!d_unhashed(dentry)) @@ -2005,6 +2154,7 @@ void d_move(struct dentry * dentry, struct dentry * target) spin_unlock(&dcache_hash_lock); /* Unhash the target: dput() will then get rid of it */ + /* __d_drop does write_seqcount_barrier, but they're OK to nest. */ __d_drop(target); list_del(&dentry->d_u.d_child); @@ -2028,6 +2178,9 @@ void d_move(struct dentry * dentry, struct dentry * target) list_add(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs); + write_seqcount_end(&target->d_seq); + write_seqcount_end(&dentry->d_seq); + dentry_unlock_parents_for_move(dentry, target); spin_unlock(&target->d_lock); fsnotify_d_move(dentry); @@ -2110,6 +2263,9 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon) dentry_lock_for_move(anon, dentry); + write_seqcount_begin(&dentry->d_seq); + write_seqcount_begin(&anon->d_seq); + dparent = dentry->d_parent; aparent = anon->d_parent; @@ -2130,6 +2286,9 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon) else INIT_LIST_HEAD(&anon->d_u.d_child); + write_seqcount_end(&dentry->d_seq); + write_seqcount_end(&anon->d_seq); + dentry_unlock_parents_for_move(anon, dentry); spin_unlock(&dentry->d_lock); diff --git a/fs/filesystems.c b/fs/filesystems.c index 68ba492d8ee..751d6b255a1 100644 --- a/fs/filesystems.c +++ b/fs/filesystems.c @@ -115,6 +115,9 @@ int unregister_filesystem(struct file_system_type * fs) tmp = &(*tmp)->next; } write_unlock(&file_systems_lock); + + synchronize_rcu(); + return -EINVAL; } diff --git a/fs/namei.c b/fs/namei.c index 5642bc2be41..8d3f15b3a54 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -169,8 +169,8 @@ EXPORT_SYMBOL(putname); /* * This does basic POSIX ACL permission checking */ -static int acl_permission_check(struct inode *inode, int mask, - int (*check_acl)(struct inode *inode, int mask)) +static inline int __acl_permission_check(struct inode *inode, int mask, + int (*check_acl)(struct inode *inode, int mask), int rcu) { umode_t mode = inode->i_mode; @@ -180,9 +180,13 @@ static int acl_permission_check(struct inode *inode, int mask, mode >>= 6; else { if (IS_POSIXACL(inode) && (mode & S_IRWXG) && check_acl) { - int error = check_acl(inode, mask); - if (error != -EAGAIN) - return 
error; + if (rcu) { + return -ECHILD; + } else { + int error = check_acl(inode, mask); + if (error != -EAGAIN) + return error; + } } if (in_group_p(inode->i_gid)) @@ -197,6 +201,12 @@ static int acl_permission_check(struct inode *inode, int mask, return -EACCES; } +static inline int acl_permission_check(struct inode *inode, int mask, + int (*check_acl)(struct inode *inode, int mask)) +{ + return __acl_permission_check(inode, mask, check_acl, 0); +} + /** * generic_permission - check for access rights on a Posix-like filesystem * @inode: inode to check access rights for @@ -374,6 +384,173 @@ void path_put(struct path *path) } EXPORT_SYMBOL(path_put); +/** + * nameidata_drop_rcu - drop this nameidata out of rcu-walk + * @nd: nameidata pathwalk data to drop + * @Returns: 0 on success, -ECHLID on failure + * + * Path walking has 2 modes, rcu-walk and ref-walk (see + * Documentation/filesystems/path-lookup.txt). __drop_rcu* functions attempt + * to drop out of rcu-walk mode and take normal reference counts on dentries + * and vfsmounts to transition to rcu-walk mode. __drop_rcu* functions take + * refcounts at the last known good point before rcu-walk got stuck, so + * ref-walk may continue from there. If this is not successful (eg. a seqcount + * has changed), then failure is returned and path walk restarts from the + * beginning in ref-walk mode. + * + * nameidata_drop_rcu attempts to drop the current nd->path and nd->root into + * ref-walk. Must be called from rcu-walk context. + */ +static int nameidata_drop_rcu(struct nameidata *nd) +{ + struct fs_struct *fs = current->fs; + struct dentry *dentry = nd->path.dentry; + + BUG_ON(!(nd->flags & LOOKUP_RCU)); + if (nd->root.mnt) { + spin_lock(&fs->lock); + if (nd->root.mnt != fs->root.mnt || + nd->root.dentry != fs->root.dentry) + goto err_root; + } + spin_lock(&dentry->d_lock); + if (!__d_rcu_to_refcount(dentry, nd->seq)) + goto err; + BUG_ON(nd->inode != dentry->d_inode); + spin_unlock(&dentry->d_lock); + if (nd->root.mnt) { + path_get(&nd->root); + spin_unlock(&fs->lock); + } + mntget(nd->path.mnt); + + rcu_read_unlock(); + br_read_unlock(vfsmount_lock); + nd->flags &= ~LOOKUP_RCU; + return 0; +err: + spin_unlock(&dentry->d_lock); +err_root: + if (nd->root.mnt) + spin_unlock(&fs->lock); + return -ECHILD; +} + +/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */ +static inline int nameidata_drop_rcu_maybe(struct nameidata *nd) +{ + if (nd->flags & LOOKUP_RCU) + return nameidata_drop_rcu(nd); + return 0; +} + +/** + * nameidata_dentry_drop_rcu - drop nameidata and dentry out of rcu-walk + * @nd: nameidata pathwalk data to drop + * @dentry: dentry to drop + * @Returns: 0 on success, -ECHLID on failure + * + * nameidata_dentry_drop_rcu attempts to drop the current nd->path and nd->root, + * and dentry into ref-walk. @dentry must be a path found by a do_lookup call on + * @nd. Must be called from rcu-walk context. + */ +static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry) +{ + struct fs_struct *fs = current->fs; + struct dentry *parent = nd->path.dentry; + + BUG_ON(!(nd->flags & LOOKUP_RCU)); + if (nd->root.mnt) { + spin_lock(&fs->lock); + if (nd->root.mnt != fs->root.mnt || + nd->root.dentry != fs->root.dentry) + goto err_root; + } + spin_lock(&parent->d_lock); + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); + if (!__d_rcu_to_refcount(dentry, nd->seq)) + goto err; + /* + * If the sequence check on the child dentry passed, then the child has + * not been removed from its parent. 
This means the parent dentry must + * be valid and able to take a reference at this point. + */ + BUG_ON(!IS_ROOT(dentry) && dentry->d_parent != parent); + BUG_ON(!parent->d_count); + parent->d_count++; + spin_unlock(&dentry->d_lock); + spin_unlock(&parent->d_lock); + if (nd->root.mnt) { + path_get(&nd->root); + spin_unlock(&fs->lock); + } + mntget(nd->path.mnt); + + rcu_read_unlock(); + br_read_unlock(vfsmount_lock); + nd->flags &= ~LOOKUP_RCU; + return 0; +err: + spin_unlock(&dentry->d_lock); + spin_unlock(&parent->d_lock); +err_root: + if (nd->root.mnt) + spin_unlock(&fs->lock); + return -ECHILD; +} + +/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */ +static inline int nameidata_dentry_drop_rcu_maybe(struct nameidata *nd, struct dentry *dentry) +{ + if (nd->flags & LOOKUP_RCU) + return nameidata_dentry_drop_rcu(nd, dentry); + return 0; +} + +/** + * nameidata_drop_rcu_last - drop nameidata ending path walk out of rcu-walk + * @nd: nameidata pathwalk data to drop + * @Returns: 0 on success, -ECHLID on failure + * + * nameidata_drop_rcu_last attempts to drop the current nd->path into ref-walk. + * nd->path should be the final element of the lookup, so nd->root is discarded. + * Must be called from rcu-walk context. + */ +static int nameidata_drop_rcu_last(struct nameidata *nd) +{ + struct dentry *dentry = nd->path.dentry; + + BUG_ON(!(nd->flags & LOOKUP_RCU)); + nd->flags &= ~LOOKUP_RCU; + nd->root.mnt = NULL; + spin_lock(&dentry->d_lock); + if (!__d_rcu_to_refcount(dentry, nd->seq)) + goto err_unlock; + BUG_ON(nd->inode != dentry->d_inode); + spin_unlock(&dentry->d_lock); + + mntget(nd->path.mnt); + + rcu_read_unlock(); + br_read_unlock(vfsmount_lock); + + return 0; + +err_unlock: + spin_unlock(&dentry->d_lock); + rcu_read_unlock(); + br_read_unlock(vfsmount_lock); + return -ECHILD; +} + +/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */ +static inline int nameidata_drop_rcu_last_maybe(struct nameidata *nd) +{ + if (likely(nd->flags & LOOKUP_RCU)) + return nameidata_drop_rcu_last(nd); + return 0; +} + /** * release_open_intent - free up open intent resources * @nd: pointer to nameidata @@ -459,26 +636,40 @@ force_reval_path(struct path *path, struct nameidata *nd) * short-cut DAC fails, then call ->permission() to do more * complete permission check. 
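 *
 * In rcu-walk mode (rcu != 0) we must not call into ->permission() or
 * ->check_acl(), since those may block or take references; instead
 * -ECHILD is returned and the caller drops into ref-walk mode and
 * repeats the check there.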
*/ -static int exec_permission(struct inode *inode) +static inline int __exec_permission(struct inode *inode, int rcu) { int ret; if (inode->i_op->permission) { + if (rcu) + return -ECHILD; ret = inode->i_op->permission(inode, MAY_EXEC); if (!ret) goto ok; return ret; } - ret = acl_permission_check(inode, MAY_EXEC, inode->i_op->check_acl); + ret = __acl_permission_check(inode, MAY_EXEC, inode->i_op->check_acl, rcu); if (!ret) goto ok; + if (rcu && ret == -ECHILD) + return ret; if (capable(CAP_DAC_OVERRIDE) || capable(CAP_DAC_READ_SEARCH)) goto ok; return ret; ok: - return security_inode_permission(inode, MAY_EXEC); + return security_inode_exec_permission(inode, rcu); +} + +static int exec_permission(struct inode *inode) +{ + return __exec_permission(inode, 0); +} + +static int exec_permission_rcu(struct inode *inode) +{ + return __exec_permission(inode, 1); } static __always_inline void set_root(struct nameidata *nd) @@ -489,8 +680,20 @@ static __always_inline void set_root(struct nameidata *nd) static int link_path_walk(const char *, struct nameidata *); +static __always_inline void set_root_rcu(struct nameidata *nd) +{ + if (!nd->root.mnt) { + struct fs_struct *fs = current->fs; + spin_lock(&fs->lock); + nd->root = fs->root; + spin_unlock(&fs->lock); + } +} + static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link) { + int ret; + if (IS_ERR(link)) goto fail; @@ -500,8 +703,10 @@ static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *l nd->path = nd->root; path_get(&nd->root); } + nd->inode = nd->path.dentry->d_inode; - return link_path_walk(link, nd); + ret = link_path_walk(link, nd); + return ret; fail: path_put(&nd->path); return PTR_ERR(link); @@ -516,11 +721,12 @@ static void path_put_conditional(struct path *path, struct nameidata *nd) static inline void path_to_nameidata(struct path *path, struct nameidata *nd) { - dput(nd->path.dentry); - if (nd->path.mnt != path->mnt) { - mntput(nd->path.mnt); - nd->path.mnt = path->mnt; + if (!(nd->flags & LOOKUP_RCU)) { + dput(nd->path.dentry); + if (nd->path.mnt != path->mnt) + mntput(nd->path.mnt); } + nd->path.mnt = path->mnt; nd->path.dentry = path->dentry; } @@ -535,9 +741,11 @@ __do_follow_link(struct path *path, struct nameidata *nd, void **p) if (path->mnt != nd->path.mnt) { path_to_nameidata(path, nd); + nd->inode = nd->path.dentry->d_inode; dget(dentry); } mntget(path->mnt); + nd->last_type = LAST_BIND; *p = dentry->d_inode->i_op->follow_link(dentry, nd); error = PTR_ERR(*p); @@ -591,6 +799,20 @@ loop: return err; } +static int follow_up_rcu(struct path *path) +{ + struct vfsmount *parent; + struct dentry *mountpoint; + + parent = path->mnt->mnt_parent; + if (parent == path->mnt) + return 0; + mountpoint = path->mnt->mnt_mountpoint; + path->dentry = mountpoint; + path->mnt = parent; + return 1; +} + int follow_up(struct path *path) { struct vfsmount *parent; @@ -615,6 +837,21 @@ int follow_up(struct path *path) /* * serialization is taken care of in namespace.c */ +static void __follow_mount_rcu(struct nameidata *nd, struct path *path, + struct inode **inode) +{ + while (d_mountpoint(path->dentry)) { + struct vfsmount *mounted; + mounted = __lookup_mnt(path->mnt, path->dentry, 1); + if (!mounted) + return; + path->mnt = mounted; + path->dentry = mounted->mnt_root; + nd->seq = read_seqcount_begin(&path->dentry->d_seq); + *inode = path->dentry->d_inode; + } +} + static int __follow_mount(struct path *path) { int res = 0; @@ -660,7 +897,42 @@ int follow_down(struct path *path) return 0; 
} -static __always_inline void follow_dotdot(struct nameidata *nd) +static int follow_dotdot_rcu(struct nameidata *nd) +{ + struct inode *inode = nd->inode; + + set_root_rcu(nd); + + while(1) { + if (nd->path.dentry == nd->root.dentry && + nd->path.mnt == nd->root.mnt) { + break; + } + if (nd->path.dentry != nd->path.mnt->mnt_root) { + struct dentry *old = nd->path.dentry; + struct dentry *parent = old->d_parent; + unsigned seq; + + seq = read_seqcount_begin(&parent->d_seq); + if (read_seqcount_retry(&old->d_seq, nd->seq)) + return -ECHILD; + inode = parent->d_inode; + nd->path.dentry = parent; + nd->seq = seq; + break; + } + if (!follow_up_rcu(&nd->path)) + break; + nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); + inode = nd->path.dentry->d_inode; + } + __follow_mount_rcu(nd, &nd->path, &inode); + nd->inode = inode; + + return 0; +} + +static void follow_dotdot(struct nameidata *nd) { set_root(nd); @@ -681,6 +953,7 @@ static __always_inline void follow_dotdot(struct nameidata *nd) break; } follow_mount(&nd->path); + nd->inode = nd->path.dentry->d_inode; } /* @@ -718,18 +991,17 @@ static struct dentry *d_alloc_and_lookup(struct dentry *parent, * It _is_ time-critical. */ static int do_lookup(struct nameidata *nd, struct qstr *name, - struct path *path) + struct path *path, struct inode **inode) { struct vfsmount *mnt = nd->path.mnt; - struct dentry *dentry, *parent; + struct dentry *dentry, *parent = nd->path.dentry; struct inode *dir; /* * See if the low-level filesystem might want * to use its own hash.. */ - if (nd->path.dentry->d_op && nd->path.dentry->d_op->d_hash) { - int err = nd->path.dentry->d_op->d_hash(nd->path.dentry, - nd->path.dentry->d_inode, name); + if (parent->d_op && parent->d_op->d_hash) { + int err = parent->d_op->d_hash(parent, nd->inode, name); if (err < 0) return err; } @@ -739,21 +1011,48 @@ static int do_lookup(struct nameidata *nd, struct qstr *name, * of a false negative due to a concurrent rename, we're going to * do the non-racy lookup, below. 
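 *
 * In rcu-walk mode the lookup below uses __d_lookup_rcu() instead,
 * rechecks the parent's d_seq afterwards, and drops back to ref-walk
 * (returning -ECHILD on failure) before calling ->d_revalidate(),
 * which is not yet rcu-walk aware at this point in the series.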
*/ - dentry = __d_lookup(nd->path.dentry, name); - if (!dentry) - goto need_lookup; + if (nd->flags & LOOKUP_RCU) { + unsigned seq; + + *inode = nd->inode; + dentry = __d_lookup_rcu(parent, name, &seq, inode); + if (!dentry) { + if (nameidata_drop_rcu(nd)) + return -ECHILD; + goto need_lookup; + } + /* Memory barrier in read_seqcount_begin of child is enough */ + if (__read_seqcount_retry(&parent->d_seq, nd->seq)) + return -ECHILD; + + nd->seq = seq; + if (dentry->d_op && dentry->d_op->d_revalidate) { + /* We commonly drop rcu-walk here */ + if (nameidata_dentry_drop_rcu(nd, dentry)) + return -ECHILD; + goto need_revalidate; + } + path->mnt = mnt; + path->dentry = dentry; + __follow_mount_rcu(nd, path, inode); + } else { + dentry = __d_lookup(parent, name); + if (!dentry) + goto need_lookup; found: - if (dentry->d_op && dentry->d_op->d_revalidate) - goto need_revalidate; + if (dentry->d_op && dentry->d_op->d_revalidate) + goto need_revalidate; done: - path->mnt = mnt; - path->dentry = dentry; - __follow_mount(path); + path->mnt = mnt; + path->dentry = dentry; + __follow_mount(path); + *inode = path->dentry->d_inode; + } return 0; need_lookup: - parent = nd->path.dentry; dir = parent->d_inode; + BUG_ON(nd->inode != dir); mutex_lock(&dir->i_mutex); /* @@ -815,7 +1114,6 @@ static inline int follow_on_final(struct inode *inode, unsigned lookup_flags) static int link_path_walk(const char *name, struct nameidata *nd) { struct path next; - struct inode *inode; int err; unsigned int lookup_flags = nd->flags; @@ -824,18 +1122,28 @@ static int link_path_walk(const char *name, struct nameidata *nd) if (!*name) goto return_reval; - inode = nd->path.dentry->d_inode; if (nd->depth) lookup_flags = LOOKUP_FOLLOW | (nd->flags & LOOKUP_CONTINUE); /* At this point we know we have a real path component. */ for(;;) { + struct inode *inode; unsigned long hash; struct qstr this; unsigned int c; nd->flags |= LOOKUP_CONTINUE; - err = exec_permission(inode); + if (nd->flags & LOOKUP_RCU) { + err = exec_permission_rcu(nd->inode); + if (err == -ECHILD) { + if (nameidata_drop_rcu(nd)) + return -ECHILD; + goto exec_again; + } + } else { +exec_again: + err = exec_permission(nd->inode); + } if (err) break; @@ -866,37 +1174,44 @@ static int link_path_walk(const char *name, struct nameidata *nd) if (this.name[0] == '.') switch (this.len) { default: break; - case 2: + case 2: if (this.name[1] != '.') break; - follow_dotdot(nd); - inode = nd->path.dentry->d_inode; + if (nd->flags & LOOKUP_RCU) { + if (follow_dotdot_rcu(nd)) + return -ECHILD; + } else + follow_dotdot(nd); /* fallthrough */ case 1: continue; } /* This does the actual lookups.. 
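 *
 * do_lookup() now also returns the inode it found, so that the walk can
 * keep using nd->inode rather than dereferencing ->d_inode on a dentry
 * that rcu-walk holds no reference on.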
*/ - err = do_lookup(nd, &this, &next); + err = do_lookup(nd, &this, &next, &inode); if (err) break; - err = -ENOENT; - inode = next.dentry->d_inode; if (!inode) goto out_dput; if (inode->i_op->follow_link) { + /* We commonly drop rcu-walk here */ + if (nameidata_dentry_drop_rcu_maybe(nd, next.dentry)) + return -ECHILD; + BUG_ON(inode != next.dentry->d_inode); err = do_follow_link(&next, nd); if (err) goto return_err; + nd->inode = nd->path.dentry->d_inode; err = -ENOENT; - inode = nd->path.dentry->d_inode; - if (!inode) + if (!nd->inode) break; - } else + } else { path_to_nameidata(&next, nd); + nd->inode = inode; + } err = -ENOTDIR; - if (!inode->i_op->lookup) + if (!nd->inode->i_op->lookup) break; continue; /* here ends the main loop */ @@ -911,32 +1226,39 @@ last_component: if (this.name[0] == '.') switch (this.len) { default: break; - case 2: + case 2: if (this.name[1] != '.') break; - follow_dotdot(nd); - inode = nd->path.dentry->d_inode; + if (nd->flags & LOOKUP_RCU) { + if (follow_dotdot_rcu(nd)) + return -ECHILD; + } else + follow_dotdot(nd); /* fallthrough */ case 1: goto return_reval; } - err = do_lookup(nd, &this, &next); + err = do_lookup(nd, &this, &next, &inode); if (err) break; - inode = next.dentry->d_inode; if (follow_on_final(inode, lookup_flags)) { + if (nameidata_dentry_drop_rcu_maybe(nd, next.dentry)) + return -ECHILD; + BUG_ON(inode != next.dentry->d_inode); err = do_follow_link(&next, nd); if (err) goto return_err; - inode = nd->path.dentry->d_inode; - } else + nd->inode = nd->path.dentry->d_inode; + } else { path_to_nameidata(&next, nd); + nd->inode = inode; + } err = -ENOENT; - if (!inode) + if (!nd->inode) break; if (lookup_flags & LOOKUP_DIRECTORY) { err = -ENOTDIR; - if (!inode->i_op->lookup) + if (!nd->inode->i_op->lookup) break; } goto return_base; @@ -958,6 +1280,8 @@ return_reval: */ if (nd->path.dentry && nd->path.dentry->d_sb && (nd->path.dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)) { + if (nameidata_drop_rcu_maybe(nd)) + return -ECHILD; err = -ESTALE; /* Note: we do not d_invalidate() */ if (!nd->path.dentry->d_op->d_revalidate( @@ -965,16 +1289,34 @@ return_reval: break; } return_base: + if (nameidata_drop_rcu_last_maybe(nd)) + return -ECHILD; return 0; out_dput: - path_put_conditional(&next, nd); + if (!(nd->flags & LOOKUP_RCU)) + path_put_conditional(&next, nd); break; } - path_put(&nd->path); + if (!(nd->flags & LOOKUP_RCU)) + path_put(&nd->path); return_err: return err; } +static inline int path_walk_rcu(const char *name, struct nameidata *nd) +{ + current->total_link_count = 0; + + return link_path_walk(name, nd); +} + +static inline int path_walk_simple(const char *name, struct nameidata *nd) +{ + current->total_link_count = 0; + + return link_path_walk(name, nd); +} + static int path_walk(const char *name, struct nameidata *nd) { struct path save = nd->path; @@ -1000,6 +1342,88 @@ static int path_walk(const char *name, struct nameidata *nd) return result; } +static void path_finish_rcu(struct nameidata *nd) +{ + if (nd->flags & LOOKUP_RCU) { + /* RCU dangling. Cancel it. */ + nd->flags &= ~LOOKUP_RCU; + nd->root.mnt = NULL; + rcu_read_unlock(); + br_read_unlock(vfsmount_lock); + } + if (nd->file) + fput(nd->file); +} + +static int path_init_rcu(int dfd, const char *name, unsigned int flags, struct nameidata *nd) +{ + int retval = 0; + int fput_needed; + struct file *file; + + nd->last_type = LAST_ROOT; /* if there are only slashes... 
*/ + nd->flags = flags | LOOKUP_RCU; + nd->depth = 0; + nd->root.mnt = NULL; + nd->file = NULL; + + if (*name=='/') { + struct fs_struct *fs = current->fs; + + br_read_lock(vfsmount_lock); + rcu_read_lock(); + + spin_lock(&fs->lock); + nd->root = fs->root; + nd->path = nd->root; + nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); + spin_unlock(&fs->lock); + + } else if (dfd == AT_FDCWD) { + struct fs_struct *fs = current->fs; + + br_read_lock(vfsmount_lock); + rcu_read_lock(); + + spin_lock(&fs->lock); + nd->path = fs->pwd; + nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); + spin_unlock(&fs->lock); + } else { + struct dentry *dentry; + + file = fget_light(dfd, &fput_needed); + retval = -EBADF; + if (!file) + goto out_fail; + + dentry = file->f_path.dentry; + + retval = -ENOTDIR; + if (!S_ISDIR(dentry->d_inode->i_mode)) + goto fput_fail; + + retval = file_permission(file, MAY_EXEC); + if (retval) + goto fput_fail; + + nd->path = file->f_path; + if (fput_needed) + nd->file = file; + + nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); + br_read_lock(vfsmount_lock); + rcu_read_lock(); + } + nd->inode = nd->path.dentry->d_inode; + return 0; + +fput_fail: + fput_light(file, fput_needed); +out_fail: + return retval; +} + static int path_init(int dfd, const char *name, unsigned int flags, struct nameidata *nd) { int retval = 0; @@ -1040,6 +1464,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, struct namei fput_light(file, fput_needed); } + nd->inode = nd->path.dentry->d_inode; return 0; fput_fail: @@ -1052,16 +1477,53 @@ out_fail: static int do_path_lookup(int dfd, const char *name, unsigned int flags, struct nameidata *nd) { - int retval = path_init(dfd, name, flags, nd); - if (!retval) - retval = path_walk(name, nd); - if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry && - nd->path.dentry->d_inode)) - audit_inode(name, nd->path.dentry); + int retval; + + /* + * Path walking is largely split up into 2 different synchronisation + * schemes, rcu-walk and ref-walk (explained in + * Documentation/filesystems/path-lookup.txt). These share much of the + * path walk code, but some things particularly setup, cleanup, and + * following mounts are sufficiently divergent that functions are + * duplicated. Typically there is a function foo(), and its RCU + * analogue, foo_rcu(). + * + * -ECHILD is the error number of choice (just to avoid clashes) that + * is returned if some aspect of an rcu-walk fails. Such an error must + * be handled by restarting a traditional ref-walk (which will always + * be able to complete). 
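+ *
+ * The resulting shape of do_path_lookup() is, roughly:
+ *
+ *	retval = path_init_rcu(dfd, name, flags, nd);
+ *	if (!retval)
+ *		retval = path_walk_rcu(name, nd);
+ *	path_finish_rcu(nd);
+ *	if (retval == -ECHILD || retval == -ESTALE)
+ *		retry with path_init()/path_walk(), adding LOOKUP_REVAL
+ *		for the -ESTALE case.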
+ */ + retval = path_init_rcu(dfd, name, flags, nd); + if (unlikely(retval)) + return retval; + retval = path_walk_rcu(name, nd); + path_finish_rcu(nd); if (nd->root.mnt) { path_put(&nd->root); nd->root.mnt = NULL; } + + if (unlikely(retval == -ECHILD || retval == -ESTALE)) { + /* slower, locked walk */ + if (retval == -ESTALE) + flags |= LOOKUP_REVAL; + retval = path_init(dfd, name, flags, nd); + if (unlikely(retval)) + return retval; + retval = path_walk(name, nd); + if (nd->root.mnt) { + path_put(&nd->root); + nd->root.mnt = NULL; + } + } + + if (likely(!retval)) { + if (unlikely(!audit_dummy_context())) { + if (nd->path.dentry && nd->inode) + audit_inode(name, nd->path.dentry); + } + } + return retval; } @@ -1104,10 +1566,11 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, path_get(&nd->path); nd->root = nd->path; path_get(&nd->root); + nd->inode = nd->path.dentry->d_inode; retval = path_walk(name, nd); if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry && - nd->path.dentry->d_inode)) + nd->inode)) audit_inode(name, nd->path.dentry); path_put(&nd->root); @@ -1488,6 +1951,7 @@ out_unlock: mutex_unlock(&dir->d_inode->i_mutex); dput(nd->path.dentry); nd->path.dentry = path->dentry; + if (error) return error; /* Don't check for write permission, don't truncate */ @@ -1582,6 +2046,9 @@ exit: return ERR_PTR(error); } +/* + * Handle O_CREAT case for do_filp_open + */ static struct file *do_last(struct nameidata *nd, struct path *path, int open_flag, int acc_mode, int mode, const char *pathname) @@ -1603,42 +2070,16 @@ static struct file *do_last(struct nameidata *nd, struct path *path, } /* fallthrough */ case LAST_ROOT: - if (open_flag & O_CREAT) - goto exit; - /* fallthrough */ + goto exit; case LAST_BIND: audit_inode(pathname, dir); goto ok; } /* trailing slashes? */ - if (nd->last.name[nd->last.len]) { - if (open_flag & O_CREAT) - goto exit; - nd->flags |= LOOKUP_DIRECTORY | LOOKUP_FOLLOW; - } - - /* just plain open? 
*/ - if (!(open_flag & O_CREAT)) { - error = do_lookup(nd, &nd->last, path); - if (error) - goto exit; - error = -ENOENT; - if (!path->dentry->d_inode) - goto exit_dput; - if (path->dentry->d_inode->i_op->follow_link) - return NULL; - error = -ENOTDIR; - if (nd->flags & LOOKUP_DIRECTORY) { - if (!path->dentry->d_inode->i_op->lookup) - goto exit_dput; - } - path_to_nameidata(path, nd); - audit_inode(pathname, nd->path.dentry); - goto ok; - } + if (nd->last.name[nd->last.len]) + goto exit; - /* OK, it's O_CREAT */ mutex_lock(&dir->d_inode->i_mutex); path->dentry = lookup_hash(nd); @@ -1709,8 +2150,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path, return NULL; path_to_nameidata(path, nd); + nd->inode = path->dentry->d_inode; error = -EISDIR; - if (S_ISDIR(path->dentry->d_inode->i_mode)) + if (S_ISDIR(nd->inode->i_mode)) goto exit; ok: filp = finish_open(nd, open_flag, acc_mode); @@ -1741,7 +2183,7 @@ struct file *do_filp_open(int dfd, const char *pathname, struct path path; int count = 0; int flag = open_to_namei_flags(open_flag); - int force_reval = 0; + int flags; if (!(open_flag & O_CREAT)) mode = 0; @@ -1770,54 +2212,84 @@ struct file *do_filp_open(int dfd, const char *pathname, if (open_flag & O_APPEND) acc_mode |= MAY_APPEND; - /* find the parent */ -reval: - error = path_init(dfd, pathname, LOOKUP_PARENT, &nd); + flags = LOOKUP_OPEN; + if (open_flag & O_CREAT) { + flags |= LOOKUP_CREATE; + if (open_flag & O_EXCL) + flags |= LOOKUP_EXCL; + } + if (open_flag & O_DIRECTORY) + flags |= LOOKUP_DIRECTORY; + if (!(open_flag & O_NOFOLLOW)) + flags |= LOOKUP_FOLLOW; + + filp = get_empty_filp(); + if (!filp) + return ERR_PTR(-ENFILE); + + filp->f_flags = open_flag; + nd.intent.open.file = filp; + nd.intent.open.flags = flag; + nd.intent.open.create_mode = mode; + + if (open_flag & O_CREAT) + goto creat; + + /* !O_CREAT, simple open */ + error = do_path_lookup(dfd, pathname, flags, &nd); + if (unlikely(error)) + goto out_filp; + error = -ELOOP; + if (!(nd.flags & LOOKUP_FOLLOW)) { + if (nd.inode->i_op->follow_link) + goto out_path; + } + error = -ENOTDIR; + if (nd.flags & LOOKUP_DIRECTORY) { + if (!nd.inode->i_op->lookup) + goto out_path; + } + audit_inode(pathname, nd.path.dentry); + filp = finish_open(&nd, open_flag, acc_mode); + return filp; + +creat: + /* OK, have to create the file. Find the parent. */ + error = path_init_rcu(dfd, pathname, + LOOKUP_PARENT | (flags & LOOKUP_REVAL), &nd); if (error) - return ERR_PTR(error); - if (force_reval) - nd.flags |= LOOKUP_REVAL; + goto out_filp; + error = path_walk_rcu(pathname, &nd); + path_finish_rcu(&nd); + if (unlikely(error == -ECHILD || error == -ESTALE)) { + /* slower, locked walk */ + if (error == -ESTALE) { +reval: + flags |= LOOKUP_REVAL; + } + error = path_init(dfd, pathname, + LOOKUP_PARENT | (flags & LOOKUP_REVAL), &nd); + if (error) + goto out_filp; - current->total_link_count = 0; - error = link_path_walk(pathname, &nd); - if (error) { - filp = ERR_PTR(error); - goto out; + error = path_walk_simple(pathname, &nd); } - if (unlikely(!audit_dummy_context()) && (open_flag & O_CREAT)) + if (unlikely(error)) + goto out_filp; + if (unlikely(!audit_dummy_context())) audit_inode(pathname, nd.path.dentry); /* * We have the parent and last component. 
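 *
 * For O_CREAT the parent directory is now also looked up in rcu-walk
 * mode first (path_init_rcu()/path_walk_rcu() above), falling back to
 * the locked walk only when that returns -ECHILD or -ESTALE.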
*/ - - error = -ENFILE; - filp = get_empty_filp(); - if (filp == NULL) - goto exit_parent; - nd.intent.open.file = filp; - filp->f_flags = open_flag; - nd.intent.open.flags = flag; - nd.intent.open.create_mode = mode; - nd.flags &= ~LOOKUP_PARENT; - nd.flags |= LOOKUP_OPEN; - if (open_flag & O_CREAT) { - nd.flags |= LOOKUP_CREATE; - if (open_flag & O_EXCL) - nd.flags |= LOOKUP_EXCL; - } - if (open_flag & O_DIRECTORY) - nd.flags |= LOOKUP_DIRECTORY; - if (!(open_flag & O_NOFOLLOW)) - nd.flags |= LOOKUP_FOLLOW; + nd.flags = flags; filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname); while (unlikely(!filp)) { /* trailing symlink */ struct path holder; - struct inode *inode = path.dentry->d_inode; void *cookie; error = -ELOOP; /* S_ISDIR part is a temporary automount kludge */ - if (!(nd.flags & LOOKUP_FOLLOW) && !S_ISDIR(inode->i_mode)) + if (!(nd.flags & LOOKUP_FOLLOW) && !S_ISDIR(nd.inode->i_mode)) goto exit_dput; if (count++ == 32) goto exit_dput; @@ -1838,36 +2310,33 @@ reval: goto exit_dput; error = __do_follow_link(&path, &nd, &cookie); if (unlikely(error)) { + if (!IS_ERR(cookie) && nd.inode->i_op->put_link) + nd.inode->i_op->put_link(path.dentry, &nd, cookie); /* nd.path had been dropped */ - if (!IS_ERR(cookie) && inode->i_op->put_link) - inode->i_op->put_link(path.dentry, &nd, cookie); - path_put(&path); - release_open_intent(&nd); - filp = ERR_PTR(error); - goto out; + nd.path = path; + goto out_path; } holder = path; nd.flags &= ~LOOKUP_PARENT; filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname); - if (inode->i_op->put_link) - inode->i_op->put_link(holder.dentry, &nd, cookie); + if (nd.inode->i_op->put_link) + nd.inode->i_op->put_link(holder.dentry, &nd, cookie); path_put(&holder); } out: if (nd.root.mnt) path_put(&nd.root); - if (filp == ERR_PTR(-ESTALE) && !force_reval) { - force_reval = 1; + if (filp == ERR_PTR(-ESTALE) && !(flags & LOOKUP_REVAL)) goto reval; - } return filp; exit_dput: path_put_conditional(&path, &nd); +out_path: + path_put(&nd.path); +out_filp: if (!IS_ERR(nd.intent.open.file)) release_open_intent(&nd); -exit_parent: - path_put(&nd.path); filp = ERR_PTR(error); goto out; } diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index ae4b0fd9033..998e3a715bc 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -402,6 +402,10 @@ static int proc_sys_compare(const struct dentry *parent, const struct dentry *dentry, const struct inode *inode, unsigned int len, const char *str, const struct qstr *name) { + /* Although proc doesn't have negative dentries, rcu-walk means + * that inode here can be NULL */ + if (!inode) + return 0; if (name->len != len) return 1; if (memcmp(name->name, str, len)) diff --git a/include/linux/dcache.h b/include/linux/dcache.h index ca648685f0c..c2e7390289c 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -90,6 +91,7 @@ struct dentry { unsigned int d_count; /* protected by d_lock */ unsigned int d_flags; /* protected by d_lock */ spinlock_t d_lock; /* per dentry lock */ + seqcount_t d_seq; /* per dentry seqlock */ int d_mounted; struct inode *d_inode; /* Where the name belongs to - NULL is * negative */ @@ -266,9 +268,33 @@ extern void d_move(struct dentry *, struct dentry *); extern struct dentry *d_ancestor(struct dentry *, struct dentry *); /* appendix may either be NULL or be used for transname suffixes */ -extern struct dentry * d_lookup(struct dentry *, struct qstr *); -extern struct dentry * 
__d_lookup(struct dentry *, struct qstr *); -extern struct dentry * d_hash_and_lookup(struct dentry *, struct qstr *); +extern struct dentry *d_lookup(struct dentry *, struct qstr *); +extern struct dentry *d_hash_and_lookup(struct dentry *, struct qstr *); +extern struct dentry *__d_lookup(struct dentry *, struct qstr *); +extern struct dentry *__d_lookup_rcu(struct dentry *parent, struct qstr *name, + unsigned *seq, struct inode **inode); + +/** + * __d_rcu_to_refcount - take a refcount on dentry if sequence check is ok + * @dentry: dentry to take a ref on + * @seq: seqcount to verify against + * @Returns: 0 on failure, else 1. + * + * __d_rcu_to_refcount operates on a dentry,seq pair that was returned + * by __d_lookup_rcu, to get a reference on an rcu-walk dentry. + */ +static inline int __d_rcu_to_refcount(struct dentry *dentry, unsigned seq) +{ + int ret = 0; + + assert_spin_locked(&dentry->d_lock); + if (!read_seqcount_retry(&dentry->d_seq, seq)) { + ret = 1; + dentry->d_count++; + } + + return ret; +} /* validate "insecure" dentry pointer */ extern int d_validate(struct dentry *, struct dentry *); diff --git a/include/linux/namei.h b/include/linux/namei.h index aec730b5393..18d06add0a4 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -19,7 +19,10 @@ struct nameidata { struct path path; struct qstr last; struct path root; + struct file *file; + struct inode *inode; /* path.dentry.d_inode */ unsigned int flags; + unsigned seq; int last_type; unsigned depth; char *saved_names[MAX_NESTED_LINKS + 1]; @@ -43,11 +46,13 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND}; * - internal "there are more path components" flag * - dentry cache is untrusted; force a real lookup */ -#define LOOKUP_FOLLOW 1 -#define LOOKUP_DIRECTORY 2 -#define LOOKUP_CONTINUE 4 -#define LOOKUP_PARENT 16 -#define LOOKUP_REVAL 64 +#define LOOKUP_FOLLOW 0x0001 +#define LOOKUP_DIRECTORY 0x0002 +#define LOOKUP_CONTINUE 0x0004 + +#define LOOKUP_PARENT 0x0010 +#define LOOKUP_REVAL 0x0020 +#define LOOKUP_RCU 0x0040 /* * Intent data */ diff --git a/include/linux/security.h b/include/linux/security.h index fd4d55fb884..ed95401970c 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -457,7 +457,6 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * called when the actual read/write operations are performed. * @inode contains the inode structure to check. * @mask contains the permission mask. - * @nd contains the nameidata (may be NULL). * Return 0 if permission is granted. * @inode_setattr: * Check permission before setting file attributes. 
Note that the kernel @@ -1713,6 +1712,7 @@ int security_inode_rename(struct inode *old_dir, struct dentry *old_dentry, int security_inode_readlink(struct dentry *dentry); int security_inode_follow_link(struct dentry *dentry, struct nameidata *nd); int security_inode_permission(struct inode *inode, int mask); +int security_inode_exec_permission(struct inode *inode, unsigned int flags); int security_inode_setattr(struct dentry *dentry, struct iattr *attr); int security_inode_getattr(struct vfsmount *mnt, struct dentry *dentry); int security_inode_setxattr(struct dentry *dentry, const char *name, @@ -2102,6 +2102,12 @@ static inline int security_inode_permission(struct inode *inode, int mask) return 0; } +static inline int security_inode_exec_permission(struct inode *inode, + unsigned int flags) +{ + return 0; +} + static inline int security_inode_setattr(struct dentry *dentry, struct iattr *attr) { diff --git a/security/security.c b/security/security.c index 1b798d3df71..c645e263ca8 100644 --- a/security/security.c +++ b/security/security.c @@ -513,6 +513,15 @@ int security_inode_permission(struct inode *inode, int mask) return security_ops->inode_permission(inode, mask); } +int security_inode_exec_permission(struct inode *inode, unsigned int flags) +{ + if (unlikely(IS_PRIVATE(inode))) + return 0; + if (flags) + return -ECHILD; + return security_ops->inode_permission(inode, MAY_EXEC); +} + int security_inode_setattr(struct dentry *dentry, struct iattr *attr) { if (unlikely(IS_PRIVATE(dentry->d_inode))) -- cgit v1.2.3 From 34286d6662308d82aed891852d04c7c3a2649b16 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:57 +1100 Subject: fs: rcu-walk aware d_revalidate method Require filesystems be aware of .d_revalidate being called in rcu-walk mode (nd->flags & LOOKUP_RCU). For now do a simple push down, returning -ECHILD from all implementations. Signed-off-by: Nick Piggin --- Documentation/filesystems/Locking | 18 +++++------ Documentation/filesystems/path-lookup.txt | 5 ++- Documentation/filesystems/porting | 20 ++++++++++++ Documentation/filesystems/vfs.txt | 9 ++++++ drivers/staging/autofs/root.c | 5 ++- drivers/staging/smbfs/dir.c | 16 ++++++--- fs/afs/dir.c | 4 +++ fs/autofs4/root.c | 13 ++++++-- fs/ceph/dir.c | 7 +++- fs/cifs/dir.c | 3 ++ fs/coda/dir.c | 7 +++- fs/ecryptfs/dentry.c | 9 ++++-- fs/fat/namei_vfat.c | 6 ++++ fs/fuse/dir.c | 6 +++- fs/gfs2/dentry.c | 17 +++++++--- fs/hfs/sysdep.c | 7 +++- fs/jfs/namei.c | 2 ++ fs/namei.c | 54 ++++++++++++++++++++----------- fs/ncpfs/dir.c | 4 +++ fs/ncpfs/inode.c | 1 + fs/nfs/dir.c | 8 +++-- fs/ocfs2/dcache.c | 10 ++++-- fs/proc/base.c | 33 +++++++++++++++---- fs/proc/proc_sysctl.c | 3 ++ fs/reiserfs/xattr.c | 2 ++ fs/sysfs/dir.c | 6 +++- include/linux/dcache.h | 1 - 27 files changed, 215 insertions(+), 61 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index bdad6414dfa..e90ffe61eb6 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -9,7 +9,7 @@ be able to use diff(1). 
--------------------------- dentry_operations -------------------------- prototypes: - int (*d_revalidate)(struct dentry *, int); + int (*d_revalidate)(struct dentry *, struct nameidata *); int (*d_hash)(const struct dentry *, const struct inode *, struct qstr *); int (*d_compare)(const struct dentry *, const struct inode *, @@ -21,14 +21,14 @@ prototypes: char *(*d_dname)((struct dentry *dentry, char *buffer, int buflen); locking rules: - rename_lock ->d_lock may block -d_revalidate: no no yes -d_hash no no no -d_compare: yes no no -d_delete: no yes no -d_release: no no yes -d_iput: no no yes -d_dname: no no no + rename_lock ->d_lock may block rcu-walk +d_revalidate: no no yes (ref-walk) maybe +d_hash no no no maybe +d_compare: yes no no maybe +d_delete: no yes no no +d_release: no no yes no +d_iput: no no yes no +d_dname: no no no no --------------------------- inode_operations --------------------------- prototypes: diff --git a/Documentation/filesystems/path-lookup.txt b/Documentation/filesystems/path-lookup.txt index 09b2878724a..8789d1810be 100644 --- a/Documentation/filesystems/path-lookup.txt +++ b/Documentation/filesystems/path-lookup.txt @@ -317,11 +317,10 @@ The detailed design for rcu-walk is like this: The cases where rcu-walk cannot continue are: * NULL dentry (ie. any uncached path element) * parent with d_inode->i_op->permission or ACLs -* dentries with d_revalidate * Following links -In future patches, permission checks and d_revalidate become rcu-walk aware. It -may be possible eventually to make following links rcu-walk aware. +In future patches, permission checks become rcu-walk aware. It may be possible +eventually to make following links rcu-walk aware. Uncached path elements will always require dropping to ref-walk mode, at the very least because i_mutex needs to be grabbed, and objects allocated. diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting index ccf0ce7866b..cd9756a2709 100644 --- a/Documentation/filesystems/porting +++ b/Documentation/filesystems/porting @@ -360,3 +360,23 @@ i_dentry to be reinitialized before it is freed, so an: INIT_LIST_HEAD(&inode->i_dentry); must be done in the RCU callback. + +-- +[recommended] + vfs now tries to do path walking in "rcu-walk mode", which avoids +atomic operations and scalability hazards on dentries and inodes (see +Documentation/filesystems/path-walk.txt). d_hash and d_compare changes (above) +are examples of the changes required to support this. For more complex +filesystem callbacks, the vfs drops out of rcu-walk mode before the fs call, so +no changes are required to the filesystem. However, this is costly and loses +the benefits of rcu-walk mode. We will begin to add filesystem callbacks that +are rcu-walk aware, shown below. Filesystems should take advantage of this +where possible. + +-- +[mandatory] + d_revalidate is a callback that is made on every path element (if +the filesystem provides it), which requires dropping out of rcu-walk mode. This +may now be called in rcu-walk mode (nd->flags & LOOKUP_RCU). -ECHILD should be +returned if the filesystem cannot handle rcu-walk. See +Documentation/filesystems/vfs.txt for more details. diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 69b10ff5ec8..c936b491238 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -863,6 +863,15 @@ struct dentry_operations { dcache. 
Most filesystems leave this as NULL, because all their dentries in the dcache are valid + d_revalidate may be called in rcu-walk mode (nd->flags & LOOKUP_RCU). + If in rcu-walk mode, the filesystem must revalidate the dentry without + blocking or storing to the dentry, d_parent and d_inode should not be + used without care (because they can go NULL), instead nd->inode should + be used. + + If a situation is encountered that rcu-walk cannot handle, return + -ECHILD and it will be called again in ref-walk mode. + d_hash: called when the VFS adds a dentry to the hash table. The first dentry passed to d_hash is the parent directory that the name is to be hashed into. The inode is the dentry's inode. diff --git a/drivers/staging/autofs/root.c b/drivers/staging/autofs/root.c index b09adb57971..bf0e9755da6 100644 --- a/drivers/staging/autofs/root.c +++ b/drivers/staging/autofs/root.c @@ -154,13 +154,16 @@ static int try_to_fill_dentry(struct dentry *dentry, struct super_block *sb, str * yet completely filled in, and revalidate has to delay such * lookups.. */ -static int autofs_revalidate(struct dentry * dentry, struct nameidata *nd) +static int autofs_revalidate(struct dentry *dentry, struct nameidata *nd) { struct inode * dir; struct autofs_sb_info *sbi; struct autofs_dir_ent *ent; int res; + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + lock_kernel(); dir = dentry->d_parent->d_inode; sbi = autofs_sbi(dir->i_sb); diff --git a/drivers/staging/smbfs/dir.c b/drivers/staging/smbfs/dir.c index 78f09412740..dd612f50749 100644 --- a/drivers/staging/smbfs/dir.c +++ b/drivers/staging/smbfs/dir.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "smb_fs.h" #include "smb_mount.h" @@ -301,13 +302,20 @@ static const struct dentry_operations smbfs_dentry_operations_case = * This is the callback when the dcache has a lookup hit. */ static int -smb_lookup_validate(struct dentry * dentry, struct nameidata *nd) +smb_lookup_validate(struct dentry *dentry, struct nameidata *nd) { - struct smb_sb_info *server = server_from_dentry(dentry); - struct inode * inode = dentry->d_inode; - unsigned long age = jiffies - dentry->d_time; + struct smb_sb_info *server; + struct inode *inode; + unsigned long age; int valid; + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + + server = server_from_dentry(dentry); + inode = dentry->d_inode; + age = jiffies - dentry->d_time; + /* * The default validation is based on dentry age: * we believe in dentries for a few seconds. (But each diff --git a/fs/afs/dir.c b/fs/afs/dir.c index b8bb7e7148d..34a3263d60a 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -607,6 +608,9 @@ static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd) void *dir_version; int ret; + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + vnode = AFS_FS_I(dentry->d_inode); if (dentry->d_inode) diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index bfe3f2eb684..651e4ef563b 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -315,12 +315,19 @@ out_error: */ static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd) { - struct inode *dir = dentry->d_parent->d_inode; - struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); - int oz_mode = autofs4_oz_mode(sbi); + struct inode *dir; + struct autofs_sb_info *sbi; + int oz_mode; int flags = nd ? 
nd->flags : 0; int status = 1; + if (flags & LOOKUP_RCU) + return -ECHILD; + + dir = dentry->d_parent->d_inode; + sbi = autofs4_sbi(dir->i_sb); + oz_mode = autofs4_oz_mode(sbi); + /* Pending dentry */ spin_lock(&sbi->fs_lock); if (autofs4_ispending(dentry)) { diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index cc01cf82676..fa7ca04ee81 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -990,7 +990,12 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry) */ static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd) { - struct inode *dir = dentry->d_parent->d_inode; + struct inode *dir; + + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + + dir = dentry->d_parent->d_inode; dout("d_revalidate %p '%.*s' inode %p offset %lld\n", dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode, diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index e3b10ca6d45..db2a58c00f7 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -656,6 +656,9 @@ lookup_out: static int cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd) { + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + if (direntry->d_inode) { if (cifs_revalidate_dentry(direntry)) return 0; diff --git a/fs/coda/dir.c b/fs/coda/dir.c index aa40c811f8d..619a8303766 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -18,6 +18,7 @@ #include #include #include +#include #include @@ -541,9 +542,13 @@ out: /* called when a cache lookup succeeds */ static int coda_dentry_revalidate(struct dentry *de, struct nameidata *nd) { - struct inode *inode = de->d_inode; + struct inode *inode; struct coda_inode_info *cii; + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + + inode = de->d_inode; if (!inode || coda_isroot(inode)) goto out; if (is_bad_inode(inode)) diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c index 906e803f7f7..6fc4f319b55 100644 --- a/fs/ecryptfs/dentry.c +++ b/fs/ecryptfs/dentry.c @@ -44,12 +44,17 @@ */ static int ecryptfs_d_revalidate(struct dentry *dentry, struct nameidata *nd) { - struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); - struct vfsmount *lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry); + struct dentry *lower_dentry; + struct vfsmount *lower_mnt; struct dentry *dentry_save; struct vfsmount *vfsmount_save; int rc = 1; + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + + lower_dentry = ecryptfs_dentry_to_lower(dentry); + lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry); if (!lower_dentry->d_op || !lower_dentry->d_op->d_revalidate) goto out; dentry_save = nd->path.dentry; diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index 3be5ed7d859..e3ffc5e1233 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -43,6 +43,9 @@ static int vfat_revalidate_shortname(struct dentry *dentry) static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd) { + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + /* This is not negative dentry. Always valid. */ if (dentry->d_inode) return 1; @@ -51,6 +54,9 @@ static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd) static int vfat_revalidate_ci(struct dentry *dentry, struct nameidata *nd) { + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + /* * This is not negative dentry. Always valid. 
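 * (In rcu-walk mode we have already returned -ECHILD above, before this
 * check is reached.)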
* diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index c9a8a426a39..07f4b5e675f 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -156,8 +156,12 @@ u64 fuse_get_attr_version(struct fuse_conn *fc) */ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd) { - struct inode *inode = entry->d_inode; + struct inode *inode; + + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + inode = entry->d_inode; if (inode && is_bad_inode(inode)) return 0; else if (fuse_dentry_time(entry) < get_jiffies_64()) { diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c index 50497f65763..4a456338b87 100644 --- a/fs/gfs2/dentry.c +++ b/fs/gfs2/dentry.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include "gfs2.h" @@ -34,15 +35,23 @@ static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd) { - struct dentry *parent = dget_parent(dentry); - struct gfs2_sbd *sdp = GFS2_SB(parent->d_inode); - struct gfs2_inode *dip = GFS2_I(parent->d_inode); - struct inode *inode = dentry->d_inode; + struct dentry *parent; + struct gfs2_sbd *sdp; + struct gfs2_inode *dip; + struct inode *inode; struct gfs2_holder d_gh; struct gfs2_inode *ip = NULL; int error; int had_lock = 0; + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + + parent = dget_parent(dentry); + sdp = GFS2_SB(parent->d_inode); + dip = GFS2_I(parent->d_inode); + inode = dentry->d_inode; + if (inode) { if (is_bad_inode(inode)) goto invalid; diff --git a/fs/hfs/sysdep.c b/fs/hfs/sysdep.c index 7478f5c219a..19cf291eb91 100644 --- a/fs/hfs/sysdep.c +++ b/fs/hfs/sysdep.c @@ -8,15 +8,20 @@ * This file contains the code to do various system dependent things. */ +#include #include "hfs_fs.h" /* dentry case-handling: just lowercase everything */ static int hfs_revalidate_dentry(struct dentry *dentry, struct nameidata *nd) { - struct inode *inode = dentry->d_inode; + struct inode *inode; int diff; + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + + inode = dentry->d_inode; if(!inode) return 1; diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index a151cbdec62..4414e3a4226 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -1608,6 +1608,8 @@ out: static int jfs_ci_revalidate(struct dentry *dentry, struct nameidata *nd) { + if (nd->flags & LOOKUP_RCU) + return -ECHILD; /* * This is not negative dentry. Always valid. * diff --git a/fs/namei.c b/fs/namei.c index 90bd2873e11..6e275363e89 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -563,10 +563,26 @@ void release_open_intent(struct nameidata *nd) fput(nd->intent.open.file); } +static int d_revalidate(struct dentry *dentry, struct nameidata *nd) +{ + int status; + + status = dentry->d_op->d_revalidate(dentry, nd); + if (status == -ECHILD) { + if (nameidata_dentry_drop_rcu(nd, dentry)) + return status; + status = dentry->d_op->d_revalidate(dentry, nd); + } + + return status; +} + static inline struct dentry * do_revalidate(struct dentry *dentry, struct nameidata *nd) { - int status = dentry->d_op->d_revalidate(dentry, nd); + int status; + + status = d_revalidate(dentry, nd); if (unlikely(status <= 0)) { /* * The dentry failed validation. @@ -574,14 +590,20 @@ do_revalidate(struct dentry *dentry, struct nameidata *nd) * the dentry otherwise d_revalidate is asking us * to return a fail status. 
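 *
 * With the d_revalidate() wrapper above, a filesystem that cannot
 * revalidate under rcu-walk simply begins its method with:
 *
 *	if (nd->flags & LOOKUP_RCU)
 *		return -ECHILD;
 *
 * and the wrapper drops out of rcu-walk and calls it a second time.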
*/ - if (!status) { + if (status < 0) { + /* If we're in rcu-walk, we don't have a ref */ + if (!(nd->flags & LOOKUP_RCU)) + dput(dentry); + dentry = ERR_PTR(status); + + } else { + /* Don't d_invalidate in rcu-walk mode */ + if (nameidata_dentry_drop_rcu_maybe(nd, dentry)) + return ERR_PTR(-ECHILD); if (!d_invalidate(dentry)) { dput(dentry); dentry = NULL; } - } else { - dput(dentry); - dentry = ERR_PTR(status); } } return dentry; @@ -626,7 +648,7 @@ force_reval_path(struct path *path, struct nameidata *nd) if (!need_reval_dot(dentry)) return 0; - status = dentry->d_op->d_revalidate(dentry, nd); + status = d_revalidate(dentry, nd); if (status > 0) return 0; @@ -1039,12 +1061,8 @@ static int do_lookup(struct nameidata *nd, struct qstr *name, return -ECHILD; nd->seq = seq; - if (dentry->d_flags & DCACHE_OP_REVALIDATE) { - /* We commonly drop rcu-walk here */ - if (nameidata_dentry_drop_rcu(nd, dentry)) - return -ECHILD; + if (dentry->d_flags & DCACHE_OP_REVALIDATE) goto need_revalidate; - } path->mnt = mnt; path->dentry = dentry; __follow_mount_rcu(nd, path, inode); @@ -1292,12 +1310,11 @@ return_reval: * We may need to check the cached dentry for staleness. */ if (need_reval_dot(nd->path.dentry)) { - if (nameidata_drop_rcu_maybe(nd)) - return -ECHILD; - err = -ESTALE; /* Note: we do not d_invalidate() */ - if (!nd->path.dentry->d_op->d_revalidate( - nd->path.dentry, nd)) + err = d_revalidate(nd->path.dentry, nd); + if (!err) + err = -ESTALE; + if (err < 0) break; } return_base: @@ -2080,10 +2097,11 @@ static struct file *do_last(struct nameidata *nd, struct path *path, dir = nd->path.dentry; case LAST_DOT: if (need_reval_dot(dir)) { - if (!dir->d_op->d_revalidate(dir, nd)) { + error = d_revalidate(nd->path.dentry, nd); + if (!error) error = -ESTALE; + if (error < 0) goto exit; - } } /* fallthrough */ case LAST_ROOT: diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index 4b9cbb28d7f..28f136d4aae 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -308,6 +309,9 @@ ncp_lookup_validate(struct dentry *dentry, struct nameidata *nd) int res, val = 0, len; __u8 __name[NCP_MAXPATHLEN + 1]; + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + parent = dget_parent(dentry); dir = parent->d_inode; diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index 0c75a5f3caf..9531c052d7a 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -29,6 +29,7 @@ #include #include #include +#include #include diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 37e0a8bb077..58beace14b1 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -938,7 +938,8 @@ static int nfs_check_verifier(struct inode *dir, struct dentry *dentry) * component of the path. * We check for this using LOOKUP_CONTINUE and LOOKUP_PARENT. */ -static inline unsigned int nfs_lookup_check_intent(struct nameidata *nd, unsigned int mask) +static inline unsigned int nfs_lookup_check_intent(struct nameidata *nd, + unsigned int mask) { if (nd->flags & (LOOKUP_CONTINUE|LOOKUP_PARENT)) return 0; @@ -1018,7 +1019,7 @@ int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry, * If the parent directory is seen to have changed, we throw out the * cached dentry and do a new lookup. 
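 *
 * Like the other ->d_revalidate() instances converted in this patch, the
 * rcu-walk case below bails out with -ECHILD: revalidation here takes a
 * reference on the parent and may need to talk to the server, neither of
 * which is possible under rcu-walk.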
*/ -static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd) +static int nfs_lookup_revalidate(struct dentry *dentry, struct nameidata *nd) { struct inode *dir; struct inode *inode; @@ -1027,6 +1028,9 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd) struct nfs_fattr *fattr = NULL; int error; + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + parent = dget_parent(dentry); dir = parent->d_inode; nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE); diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c index 4d54c60ceee..0310b16a723 100644 --- a/fs/ocfs2/dcache.c +++ b/fs/ocfs2/dcache.c @@ -52,9 +52,15 @@ void ocfs2_dentry_attach_gen(struct dentry *dentry) static int ocfs2_dentry_revalidate(struct dentry *dentry, struct nameidata *nd) { - struct inode *inode = dentry->d_inode; + struct inode *inode; int ret = 0; /* if all else fails, just return false */ - struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); + struct ocfs2_super *osb; + + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + + inode = dentry->d_inode; + osb = OCFS2_SB(dentry->d_sb); mlog_entry("(0x%p, '%.*s')\n", dentry, dentry->d_name.len, dentry->d_name.name); diff --git a/fs/proc/base.c b/fs/proc/base.c index 85f0a80912a..dc5b2fcadc3 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1719,10 +1719,16 @@ static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat */ static int pid_revalidate(struct dentry *dentry, struct nameidata *nd) { - struct inode *inode = dentry->d_inode; - struct task_struct *task = get_proc_task(inode); + struct inode *inode; + struct task_struct *task; const struct cred *cred; + if (nd && nd->flags & LOOKUP_RCU) + return -ECHILD; + + inode = dentry->d_inode; + task = get_proc_task(inode); + if (task) { if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) || task_dumpable(task)) { @@ -1888,12 +1894,19 @@ static int proc_fd_link(struct inode *inode, struct path *path) static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd) { - struct inode *inode = dentry->d_inode; - struct task_struct *task = get_proc_task(inode); - int fd = proc_fd(inode); + struct inode *inode; + struct task_struct *task; + int fd; struct files_struct *files; const struct cred *cred; + if (nd && nd->flags & LOOKUP_RCU) + return -ECHILD; + + inode = dentry->d_inode; + task = get_proc_task(inode); + fd = proc_fd(inode); + if (task) { files = get_files_struct(task); if (files) { @@ -2563,8 +2576,14 @@ static const struct pid_entry proc_base_stuff[] = { */ static int proc_base_revalidate(struct dentry *dentry, struct nameidata *nd) { - struct inode *inode = dentry->d_inode; - struct task_struct *task = get_proc_task(inode); + struct inode *inode; + struct task_struct *task; + + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + + inode = dentry->d_inode; + task = get_proc_task(inode); if (task) { put_task_struct(task); return 1; diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 35efd85a4d3..c9097f43b42 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -5,6 +5,7 @@ #include #include #include +#include #include "internal.h" static const struct dentry_operations proc_sys_dentry_operations; @@ -389,6 +390,8 @@ static const struct inode_operations proc_sys_dir_operations = { static int proc_sys_revalidate(struct dentry *dentry, struct nameidata *nd) { + if (nd->flags & LOOKUP_RCU) + return -ECHILD; return !PROC_I(dentry->d_inode)->sysctl->unregistering; } diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index e0f0d7ea10a..9ea22a56cdf 
100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -972,6 +972,8 @@ int reiserfs_permission(struct inode *inode, int mask) static int xattr_hide_revalidate(struct dentry *dentry, struct nameidata *nd) { + if (nd->flags & LOOKUP_RCU) + return -ECHILD; return -EPERM; } diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 3e076caa8da..ea9120a830d 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -239,9 +239,13 @@ static int sysfs_dentry_delete(const struct dentry *dentry) static int sysfs_dentry_revalidate(struct dentry *dentry, struct nameidata *nd) { - struct sysfs_dirent *sd = dentry->d_fsdata; + struct sysfs_dirent *sd; int is_dir; + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + + sd = dentry->d_fsdata; mutex_lock(&sysfs_mutex); /* The sysfs dirent has been deleted */ diff --git a/include/linux/dcache.h b/include/linux/dcache.h index b1aeda07725..8b2064d0292 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -190,7 +190,6 @@ struct dentry_operations { #define DCACHE_OP_REVALIDATE 0x4000 #define DCACHE_OP_DELETE 0x8000 - extern spinlock_t dcache_inode_lock; extern seqlock_t rename_lock; -- cgit v1.2.3 From b74c79e99389cd79b31fcc08f82c24e492e63c7e Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:58 +1100 Subject: fs: provide rcu-walk aware permission i_ops Signed-off-by: Nick Piggin --- Documentation/filesystems/Locking | 6 +-- Documentation/filesystems/path-lookup.txt | 44 ++++++++++++++++-- Documentation/filesystems/porting | 5 +++ Documentation/filesystems/vfs.txt | 10 ++++- drivers/staging/smbfs/file.c | 5 ++- fs/9p/acl.c | 5 ++- fs/9p/acl.h | 2 +- fs/afs/internal.h | 2 +- fs/afs/security.c | 7 ++- fs/bad_inode.c | 5 ++- fs/btrfs/acl.c | 6 ++- fs/btrfs/ctree.h | 2 +- fs/btrfs/inode.c | 7 ++- fs/ceph/inode.c | 11 +++-- fs/ceph/super.h | 2 +- fs/cifs/cifsfs.c | 7 ++- fs/coda/dir.c | 5 ++- fs/coda/pioctl.c | 6 ++- fs/ecryptfs/inode.c | 4 +- fs/ext2/acl.c | 8 +++- fs/ext2/acl.h | 2 +- fs/ext3/acl.c | 8 +++- fs/ext3/acl.h | 2 +- fs/ext4/acl.c | 8 +++- fs/ext4/acl.h | 2 +- fs/fuse/dir.c | 10 +++-- fs/generic_acl.c | 8 +++- fs/gfs2/acl.c | 5 ++- fs/gfs2/acl.h | 2 +- fs/gfs2/file.c | 2 +- fs/gfs2/inode.c | 4 +- fs/gfs2/inode.h | 2 +- fs/gfs2/ops_inode.c | 18 +++++--- fs/hostfs/hostfs_kern.c | 7 ++- fs/hpfs/namei.c | 2 +- fs/jffs2/acl.c | 5 ++- fs/jffs2/acl.h | 2 +- fs/jfs/acl.c | 8 +++- fs/jfs/jfs_acl.h | 2 +- fs/logfs/dir.c | 6 ++- fs/namei.c | 75 ++++++++++++------------------- fs/nfs/dir.c | 7 ++- fs/nilfs2/inode.c | 10 +++-- fs/nilfs2/nilfs.h | 2 +- fs/ocfs2/acl.c | 8 +++- fs/ocfs2/acl.h | 2 +- fs/ocfs2/file.c | 7 ++- fs/ocfs2/file.h | 2 +- fs/proc/base.c | 6 ++- fs/proc/proc_sysctl.c | 5 ++- fs/reiserfs/xattr.c | 14 ++++-- fs/sysfs/inode.c | 11 +++-- fs/sysfs/sysfs.h | 2 +- fs/xfs/linux-2.6/xfs_acl.c | 8 +++- fs/xfs/xfs_acl.h | 2 +- include/linux/coda_linux.h | 2 +- include/linux/fs.h | 10 +++-- include/linux/generic_acl.h | 2 +- include/linux/nfs_fs.h | 2 +- include/linux/reiserfs_xattr.h | 2 +- 60 files changed, 287 insertions(+), 146 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index e90ffe61eb6..977d8919cc6 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -47,8 +47,8 @@ ata *); void * (*follow_link) (struct dentry *, struct nameidata *); void (*put_link) (struct dentry *, struct nameidata *, void *); void (*truncate) (struct inode *); - int (*permission) (struct inode *, int, struct nameidata *); - int 
(*check_acl)(struct inode *, int); + int (*permission) (struct inode *, int, unsigned int); + int (*check_acl)(struct inode *, int, unsigned int); int (*setattr) (struct dentry *, struct iattr *); int (*getattr) (struct vfsmount *, struct dentry *, struct kstat *); int (*setxattr) (struct dentry *, const char *,const void *,size_t,int); @@ -76,7 +76,7 @@ follow_link: no put_link: no truncate: yes (see below) setattr: yes -permission: no +permission: no (may not block if called in rcu-walk mode) check_acl: no getattr: no setxattr: yes diff --git a/Documentation/filesystems/path-lookup.txt b/Documentation/filesystems/path-lookup.txt index 8789d1810be..eb59c8b44be 100644 --- a/Documentation/filesystems/path-lookup.txt +++ b/Documentation/filesystems/path-lookup.txt @@ -316,11 +316,9 @@ The detailed design for rcu-walk is like this: The cases where rcu-walk cannot continue are: * NULL dentry (ie. any uncached path element) -* parent with d_inode->i_op->permission or ACLs * Following links -In future patches, permission checks become rcu-walk aware. It may be possible -eventually to make following links rcu-walk aware. +It may be possible eventually to make following links rcu-walk aware. Uncached path elements will always require dropping to ref-walk mode, at the very least because i_mutex needs to be grabbed, and objects allocated. @@ -336,9 +334,49 @@ or stored into. The result is massive improvements in performance and scalability of path resolution. +Interesting statistics +====================== + +The following table gives rcu lookup statistics for a few simple workloads +(2s12c24t Westmere, debian non-graphical system). Ungraceful are attempts to +drop rcu that fail due to d_seq failure and require the entire path lookup +again. Other cases are successful rcu-drops that are required before the final +element, nodentry for missing dentry, revalidate for filesystem revalidate +routine requiring rcu drop, permission for permission check requiring drop, +and link for symlink traversal requiring drop. + + rcu-lookups restart nodentry link revalidate permission +bootup 47121 0 4624 1010 10283 7852 +dbench 25386793 0 6778659(26.7%) 55 549 1156 +kbuild 2696672 10 64442(2.3%) 108764(4.0%) 1 1590 +git diff 39605 0 28 2 0 106 +vfstest 24185492 4945 708725(2.9%) 1076136(4.4%) 0 2651 + +What this shows is that failed rcu-walk lookups, ie. ones that are restarted +entirely with ref-walk, are quite rare. Even the "vfstest" case, which +specifically has concurrent renames/mkdir/rmdir/creat/unlink/etc to exercise +such races, is not showing a huge amount of restarts. + +Dropping from rcu-walk to ref-walk means that we have encountered a dentry where +the reference count needs to be taken for some reason. This is either because +we have reached the target of the path walk, or because we have encountered a +condition that can't be resolved in rcu-walk mode. Ideally, we drop rcu-walk +only when we have reached the target dentry, so the other statistics show where +this does not happen. + +Note that a graceful drop from rcu-walk mode due to something such as the +dentry not existing (which can be common) is not necessarily a failure of the +rcu-walk scheme, because some elements of the path may have been walked in +rcu-walk mode. The further we get from common path elements (such as cwd or +root), the less contended the dentry is likely to be. The closer we are to +common path elements, the more likely they will exist in dentry cache.
+ + Papers and other documentation on dcache locking ================================================ 1. Scaling dcache with RCU (http://linuxjournal.com/article.php?sid=7124). 2. http://lse.sourceforge.net/locking/dcache/dcache.html + + diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting index cd9756a2709..07a32b42cf9 100644 --- a/Documentation/filesystems/porting +++ b/Documentation/filesystems/porting @@ -379,4 +379,9 @@ where possible. the filesystem provides it), which requires dropping out of rcu-walk mode. This may now be called in rcu-walk mode (nd->flags & LOOKUP_RCU). -ECHILD should be returned if the filesystem cannot handle rcu-walk. See +Documentation/filesystems/vfs.txt for more details. + + permission and check_acl are inode permission checks that are called +on many or all directory inodes on the way down a path walk (to check for +exec permission). These must now be rcu-walk aware (flags & IPERM_RCU). See Documentation/filesystems/vfs.txt for more details. diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index c936b491238..fbb324e2bd4 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -325,7 +325,8 @@ struct inode_operations { void * (*follow_link) (struct dentry *, struct nameidata *); void (*put_link) (struct dentry *, struct nameidata *, void *); void (*truncate) (struct inode *); - int (*permission) (struct inode *, int, struct nameidata *); + int (*permission) (struct inode *, int, unsigned int); + int (*check_acl)(struct inode *, int, unsigned int); int (*setattr) (struct dentry *, struct iattr *); int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *); int (*setxattr) (struct dentry *, const char *,const void *,size_t,int); @@ -414,6 +415,13 @@ otherwise noted. permission: called by the VFS to check for access rights on a POSIX-like filesystem. + May be called in rcu-walk mode (flags & IPERM_RCU). If in rcu-walk + mode, the filesystem must check the permission without blocking or + storing to the inode. + + If a situation is encountered that rcu-walk cannot handle, return + -ECHILD and it will be called again in ref-walk mode. + setattr: called by the VFS to set attributes for a file. This method is called by chmod(2) and related system calls. diff --git a/drivers/staging/smbfs/file.c b/drivers/staging/smbfs/file.c index 5dcd19c60eb..31372e7b12d 100644 --- a/drivers/staging/smbfs/file.c +++ b/drivers/staging/smbfs/file.c @@ -407,11 +407,14 @@ smb_file_release(struct inode *inode, struct file * file) * privileges, so we need our own check for this. 
*/ static int -smb_file_permission(struct inode *inode, int mask) +smb_file_permission(struct inode *inode, int mask, unsigned int flags) { int mode = inode->i_mode; int error = 0; + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + VERBOSE("mode=%x, mask=%x\n", mode, mask); /* Look at user permissions */ diff --git a/fs/9p/acl.c b/fs/9p/acl.c index 12d602351db..6e58c4ca1e6 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -91,11 +91,14 @@ static struct posix_acl *v9fs_get_cached_acl(struct inode *inode, int type) return acl; } -int v9fs_check_acl(struct inode *inode, int mask) +int v9fs_check_acl(struct inode *inode, int mask, unsigned int flags) { struct posix_acl *acl; struct v9fs_session_info *v9ses; + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + v9ses = v9fs_inode2v9ses(inode); if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) { /* diff --git a/fs/9p/acl.h b/fs/9p/acl.h index 59e18c2e8c7..7ef3ac9f6d9 100644 --- a/fs/9p/acl.h +++ b/fs/9p/acl.h @@ -16,7 +16,7 @@ #ifdef CONFIG_9P_FS_POSIX_ACL extern int v9fs_get_acl(struct inode *, struct p9_fid *); -extern int v9fs_check_acl(struct inode *inode, int mask); +extern int v9fs_check_acl(struct inode *inode, int mask, unsigned int flags); extern int v9fs_acl_chmod(struct dentry *); extern int v9fs_set_create_acl(struct dentry *, struct posix_acl *, struct posix_acl *); diff --git a/fs/afs/internal.h b/fs/afs/internal.h index cca8eef736f..6d4bc1c8ff6 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -624,7 +624,7 @@ extern void afs_clear_permits(struct afs_vnode *); extern void afs_cache_permit(struct afs_vnode *, struct key *, long); extern void afs_zap_permits(struct rcu_head *); extern struct key *afs_request_key(struct afs_cell *); -extern int afs_permission(struct inode *, int); +extern int afs_permission(struct inode *, int, unsigned int); /* * server.c diff --git a/fs/afs/security.c b/fs/afs/security.c index bb4ed144d0e..f44b9d35537 100644 --- a/fs/afs/security.c +++ b/fs/afs/security.c @@ -285,13 +285,16 @@ static int afs_check_permit(struct afs_vnode *vnode, struct key *key, * - AFS ACLs are attached to directories only, and a file is controlled by its * parent directory's ACL */ -int afs_permission(struct inode *inode, int mask) +int afs_permission(struct inode *inode, int mask, unsigned int flags) { struct afs_vnode *vnode = AFS_FS_I(inode); afs_access_t uninitialized_var(access); struct key *key; int ret; + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + _enter("{{%x:%u},%lx},%x,", vnode->fid.vid, vnode->fid.vnode, vnode->flags, mask); @@ -347,7 +350,7 @@ int afs_permission(struct inode *inode, int mask) } key_put(key); - ret = generic_permission(inode, mask, NULL); + ret = generic_permission(inode, mask, flags, NULL); _leave(" = %d", ret); return ret; diff --git a/fs/bad_inode.c b/fs/bad_inode.c index f024d8aadde..9ad2369d9e3 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -229,8 +229,11 @@ static int bad_inode_readlink(struct dentry *dentry, char __user *buffer, return -EIO; } -static int bad_inode_permission(struct inode *inode, int mask) +static int bad_inode_permission(struct inode *inode, int mask, unsigned int flags) { + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + return -EIO; } diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 2222d161c7b..cb518a4b917 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -185,13 +185,15 @@ static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name, return ret; } -int btrfs_check_acl(struct inode *inode, int mask) +int btrfs_check_acl(struct inode *inode, 
int mask, unsigned int flags) { struct posix_acl *acl; int error = -EAGAIN; - acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS); + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR(acl)) return PTR_ERR(acl); if (acl) { diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index af52f6d7a4d..a142d204b52 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2544,7 +2544,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait); /* acl.c */ #ifdef CONFIG_BTRFS_FS_POSIX_ACL -int btrfs_check_acl(struct inode *inode, int mask); +int btrfs_check_acl(struct inode *inode, int mask, unsigned int flags); #else #define btrfs_check_acl NULL #endif diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 63e4546b478..5cf0db0081f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7211,11 +7211,14 @@ static int btrfs_set_page_dirty(struct page *page) return __set_page_dirty_nobuffers(page); } -static int btrfs_permission(struct inode *inode, int mask) +static int btrfs_permission(struct inode *inode, int mask, unsigned int flags) { + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE)) return -EACCES; - return generic_permission(inode, mask, btrfs_check_acl); + return generic_permission(inode, mask, flags, btrfs_check_acl); } static const struct inode_operations btrfs_dir_inode_operations = { diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 47f8c8baf3b..e61de4f7b99 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1781,12 +1781,17 @@ int ceph_do_getattr(struct inode *inode, int mask) * Check inode permissions. We verify we have a valid value for * the AUTH cap, then call the generic handler. */ -int ceph_permission(struct inode *inode, int mask) +int ceph_permission(struct inode *inode, int mask, unsigned int flags) { - int err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED); + int err; + + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + + err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED); if (!err) - err = generic_permission(inode, mask, NULL); + err = generic_permission(inode, mask, flags, NULL); return err; } diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 7f01728a465..4553d8829ed 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -665,7 +665,7 @@ extern void ceph_queue_invalidate(struct inode *inode); extern void ceph_queue_writeback(struct inode *inode); extern int ceph_do_getattr(struct inode *inode, int mask); -extern int ceph_permission(struct inode *inode, int mask); +extern int ceph_permission(struct inode *inode, int mask, unsigned int flags); extern int ceph_setattr(struct dentry *dentry, struct iattr *attr); extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 223717dcc40..8e21e0fe65d 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -283,10 +283,13 @@ cifs_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } -static int cifs_permission(struct inode *inode, int mask) +static int cifs_permission(struct inode *inode, int mask, unsigned int flags) { struct cifs_sb_info *cifs_sb; + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + cifs_sb = CIFS_SB(inode->i_sb); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) { @@ -298,7 +301,7 @@ static int cifs_permission(struct inode *inode, int mask) on the client (above and beyond ACL on servers) for servers which do not support setting and viewing mode bits, so allowing client to check permissions is useful */ - return 
generic_permission(inode, mask, NULL); + return generic_permission(inode, mask, flags, NULL); } static struct kmem_cache *cifs_inode_cachep; diff --git a/fs/coda/dir.c b/fs/coda/dir.c index 619a8303766..29badd91360 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -135,10 +135,13 @@ exit: } -int coda_permission(struct inode *inode, int mask) +int coda_permission(struct inode *inode, int mask, unsigned int flags) { int error; + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + mask &= MAY_READ | MAY_WRITE | MAY_EXEC; if (!mask) diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c index 2fd89b5c5c7..741f0bd0391 100644 --- a/fs/coda/pioctl.c +++ b/fs/coda/pioctl.c @@ -24,7 +24,7 @@ #include /* pioctl ops */ -static int coda_ioctl_permission(struct inode *inode, int mask); +static int coda_ioctl_permission(struct inode *inode, int mask, unsigned int flags); static long coda_pioctl(struct file *filp, unsigned int cmd, unsigned long user_data); @@ -41,8 +41,10 @@ const struct file_operations coda_ioctl_operations = { }; /* the coda pioctl inode ops */ -static int coda_ioctl_permission(struct inode *inode, int mask) +static int coda_ioctl_permission(struct inode *inode, int mask, unsigned int flags) { + if (flags & IPERM_FLAG_RCU) + return -ECHILD; return (mask & MAY_EXEC) ? -EACCES : 0; } diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index f91b35db4c6..337352a9475 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -980,8 +980,10 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length) } static int -ecryptfs_permission(struct inode *inode, int mask) +ecryptfs_permission(struct inode *inode, int mask, unsigned int flags) { + if (flags & IPERM_FLAG_RCU) + return -ECHILD; return inode_permission(ecryptfs_inode_to_lower(inode), mask); } diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index 2bcc0431bad..dd9bb3f0c8d 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -232,10 +232,14 @@ ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl) } int -ext2_check_acl(struct inode *inode, int mask) +ext2_check_acl(struct inode *inode, int mask, unsigned int flags) { - struct posix_acl *acl = ext2_get_acl(inode, ACL_TYPE_ACCESS); + struct posix_acl *acl; + + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + acl = ext2_get_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR(acl)) return PTR_ERR(acl); if (acl) { diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h index 3ff6cbb9ac4..c939b7b1209 100644 --- a/fs/ext2/acl.h +++ b/fs/ext2/acl.h @@ -54,7 +54,7 @@ static inline int ext2_acl_count(size_t size) #ifdef CONFIG_EXT2_FS_POSIX_ACL /* acl.c */ -extern int ext2_check_acl (struct inode *, int); +extern int ext2_check_acl (struct inode *, int, unsigned int); extern int ext2_acl_chmod (struct inode *); extern int ext2_init_acl (struct inode *, struct inode *); diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c index 8a11fe21218..9e49da8302d 100644 --- a/fs/ext3/acl.c +++ b/fs/ext3/acl.c @@ -240,10 +240,14 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type, } int -ext3_check_acl(struct inode *inode, int mask) +ext3_check_acl(struct inode *inode, int mask, unsigned int flags) { - struct posix_acl *acl = ext3_get_acl(inode, ACL_TYPE_ACCESS); + struct posix_acl *acl; + + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + acl = ext3_get_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR(acl)) return PTR_ERR(acl); if (acl) { diff --git a/fs/ext3/acl.h b/fs/ext3/acl.h index 597334626de..5faf8048e90 100644 --- a/fs/ext3/acl.h +++ b/fs/ext3/acl.h @@ -54,7 +54,7 @@ static inline int ext3_acl_count(size_t size) #ifdef 
CONFIG_EXT3_FS_POSIX_ACL /* acl.c */ -extern int ext3_check_acl (struct inode *, int); +extern int ext3_check_acl (struct inode *, int, unsigned int); extern int ext3_acl_chmod (struct inode *); extern int ext3_init_acl (handle_t *, struct inode *, struct inode *); diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index 5e2ed4504ea..373dcaeedba 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -238,10 +238,14 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type, } int -ext4_check_acl(struct inode *inode, int mask) +ext4_check_acl(struct inode *inode, int mask, unsigned int flags) { - struct posix_acl *acl = ext4_get_acl(inode, ACL_TYPE_ACCESS); + struct posix_acl *acl; + + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + acl = ext4_get_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR(acl)) return PTR_ERR(acl); if (acl) { diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h index 9d843d5deac..dec821168fd 100644 --- a/fs/ext4/acl.h +++ b/fs/ext4/acl.h @@ -54,7 +54,7 @@ static inline int ext4_acl_count(size_t size) #ifdef CONFIG_EXT4_FS_POSIX_ACL /* acl.c */ -extern int ext4_check_acl(struct inode *, int); +extern int ext4_check_acl(struct inode *, int, unsigned int); extern int ext4_acl_chmod(struct inode *); extern int ext4_init_acl(handle_t *, struct inode *, struct inode *); diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 07f4b5e675f..f738599fd8c 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -985,12 +985,15 @@ static int fuse_access(struct inode *inode, int mask) * access request is sent. Execute permission is still checked * locally based on file mode. */ -static int fuse_permission(struct inode *inode, int mask) +static int fuse_permission(struct inode *inode, int mask, unsigned int flags) { struct fuse_conn *fc = get_fuse_conn(inode); bool refreshed = false; int err = 0; + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + if (!fuse_allow_task(fc, current)) return -EACCES; @@ -1005,7 +1008,7 @@ static int fuse_permission(struct inode *inode, int mask) } if (fc->flags & FUSE_DEFAULT_PERMISSIONS) { - err = generic_permission(inode, mask, NULL); + err = generic_permission(inode, mask, flags, NULL); /* If permission is denied, try to refresh file attributes. 
This is also needed, because the root @@ -1013,7 +1016,8 @@ static int fuse_permission(struct inode *inode, int mask) if (err == -EACCES && !refreshed) { err = fuse_do_getattr(inode, NULL, NULL); if (!err) - err = generic_permission(inode, mask, NULL); + err = generic_permission(inode, mask, + flags, NULL); } /* Note: the opposite of the above test does not diff --git a/fs/generic_acl.c b/fs/generic_acl.c index 6bc9e3a5a69..62800428213 100644 --- a/fs/generic_acl.c +++ b/fs/generic_acl.c @@ -190,10 +190,14 @@ generic_acl_chmod(struct inode *inode) } int -generic_check_acl(struct inode *inode, int mask) +generic_check_acl(struct inode *inode, int mask, unsigned int flags) { - struct posix_acl *acl = get_cached_acl(inode, ACL_TYPE_ACCESS); + struct posix_acl *acl; + + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + acl = get_cached_acl(inode, ACL_TYPE_ACCESS); if (acl) { int error = posix_acl_permission(inode, acl, mask); posix_acl_release(acl); diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 48171f4c943..7118f1a780a 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -75,11 +75,14 @@ static struct posix_acl *gfs2_acl_get(struct gfs2_inode *ip, int type) * Returns: errno */ -int gfs2_check_acl(struct inode *inode, int mask) +int gfs2_check_acl(struct inode *inode, int mask, unsigned int flags) { struct posix_acl *acl; int error; + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + acl = gfs2_acl_get(GFS2_I(inode), ACL_TYPE_ACCESS); if (IS_ERR(acl)) return PTR_ERR(acl); diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h index b522b0cb39e..a93907c8159 100644 --- a/fs/gfs2/acl.h +++ b/fs/gfs2/acl.h @@ -16,7 +16,7 @@ #define GFS2_POSIX_ACL_DEFAULT "posix_acl_default" #define GFS2_ACL_MAX_ENTRIES 25 -extern int gfs2_check_acl(struct inode *inode, int mask); +extern int gfs2_check_acl(struct inode *inode, int mask, unsigned int); extern int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode); extern int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr); extern const struct xattr_handler gfs2_xattr_system_handler; diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index aa996471ec5..fca6689e12e 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -241,7 +241,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask) !capable(CAP_LINUX_IMMUTABLE)) goto out; if (!IS_IMMUTABLE(inode)) { - error = gfs2_permission(inode, MAY_WRITE); + error = gfs2_permission(inode, MAY_WRITE, 0); if (error) goto out; } diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index e1213f7f921..6163be9686b 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -509,7 +509,7 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, } if (!is_root) { - error = gfs2_permission(dir, MAY_EXEC); + error = gfs2_permission(dir, MAY_EXEC, 0); if (error) goto out; } @@ -539,7 +539,7 @@ static int create_ok(struct gfs2_inode *dip, const struct qstr *name, { int error; - error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC); + error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, 0); if (error) return error; diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index d8499fadcc5..732a183efdb 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -113,7 +113,7 @@ extern struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, extern struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name, unsigned int mode, dev_t dev); -extern int gfs2_permission(struct inode *inode, int mask); +extern int gfs2_permission(struct inode *inode, int mask, unsigned int flags); extern int 
gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr); extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name); extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf); diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c index f28f89796f4..5c63a06fcd7 100644 --- a/fs/gfs2/ops_inode.c +++ b/fs/gfs2/ops_inode.c @@ -166,7 +166,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, if (error) goto out_child; - error = gfs2_permission(dir, MAY_WRITE | MAY_EXEC); + error = gfs2_permission(dir, MAY_WRITE | MAY_EXEC, 0); if (error) goto out_gunlock; @@ -289,7 +289,7 @@ static int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, if (IS_APPEND(&dip->i_inode)) return -EPERM; - error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC); + error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, 0); if (error) return error; @@ -822,7 +822,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, } } } else { - error = gfs2_permission(ndir, MAY_WRITE | MAY_EXEC); + error = gfs2_permission(ndir, MAY_WRITE | MAY_EXEC, 0); if (error) goto out_gunlock; @@ -857,7 +857,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, /* Check out the dir to be renamed */ if (dir_rename) { - error = gfs2_permission(odentry->d_inode, MAY_WRITE); + error = gfs2_permission(odentry->d_inode, MAY_WRITE, 0); if (error) goto out_gunlock; } @@ -1041,13 +1041,17 @@ static void gfs2_put_link(struct dentry *dentry, struct nameidata *nd, void *p) * Returns: errno */ -int gfs2_permission(struct inode *inode, int mask) +int gfs2_permission(struct inode *inode, int mask, unsigned int flags) { - struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_inode *ip; struct gfs2_holder i_gh; int error; int unlock = 0; + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + + ip = GFS2_I(inode); if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) { error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); if (error) @@ -1058,7 +1062,7 @@ int gfs2_permission(struct inode *inode, int mask) if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode)) error = -EACCES; else - error = generic_permission(inode, mask, gfs2_check_acl); + error = generic_permission(inode, mask, flags, gfs2_check_acl); if (unlock) gfs2_glock_dq_uninit(&i_gh); diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 0bc81cf256b..d3244d949a4 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -749,11 +749,14 @@ int hostfs_rename(struct inode *from_ino, struct dentry *from, return err; } -int hostfs_permission(struct inode *ino, int desired) +int hostfs_permission(struct inode *ino, int desired, unsigned int flags) { char *name; int r = 0, w = 0, x = 0, err; + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + if (desired & MAY_READ) r = 1; if (desired & MAY_WRITE) w = 1; if (desired & MAY_EXEC) x = 1; @@ -768,7 +771,7 @@ int hostfs_permission(struct inode *ino, int desired) err = access_file(name, r, w, x); __putname(name); if (!err) - err = generic_permission(ino, desired, NULL); + err = generic_permission(ino, desired, flags, NULL); return err; } diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c index 11c2b4080f6..f4ad9e31ddc 100644 --- a/fs/hpfs/namei.c +++ b/fs/hpfs/namei.c @@ -419,7 +419,7 @@ again: unlock_kernel(); return -ENOSPC; } - if (generic_permission(inode, MAY_WRITE, NULL) || + if (generic_permission(inode, MAY_WRITE, 0, NULL) || !S_ISREG(inode->i_mode) || get_write_access(inode)) { d_rehash(dentry); diff --git a/fs/jffs2/acl.c 
b/fs/jffs2/acl.c index 54a92fd02bb..95b79672150 100644 --- a/fs/jffs2/acl.c +++ b/fs/jffs2/acl.c @@ -259,11 +259,14 @@ static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl) return rc; } -int jffs2_check_acl(struct inode *inode, int mask) +int jffs2_check_acl(struct inode *inode, int mask, unsigned int flags) { struct posix_acl *acl; int rc; + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + acl = jffs2_get_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR(acl)) return PTR_ERR(acl); diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h index 5e42de8d954..3119f59253d 100644 --- a/fs/jffs2/acl.h +++ b/fs/jffs2/acl.h @@ -26,7 +26,7 @@ struct jffs2_acl_header { #ifdef CONFIG_JFFS2_FS_POSIX_ACL -extern int jffs2_check_acl(struct inode *, int); +extern int jffs2_check_acl(struct inode *, int, unsigned int); extern int jffs2_acl_chmod(struct inode *); extern int jffs2_init_acl_pre(struct inode *, struct inode *, int *); extern int jffs2_init_acl_post(struct inode *); diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index 1057a4998e4..e5de9422fa3 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -114,10 +114,14 @@ out: return rc; } -int jfs_check_acl(struct inode *inode, int mask) +int jfs_check_acl(struct inode *inode, int mask, unsigned int flags) { - struct posix_acl *acl = jfs_get_acl(inode, ACL_TYPE_ACCESS); + struct posix_acl *acl; + + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + acl = jfs_get_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR(acl)) return PTR_ERR(acl); if (acl) { diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h index 54e07559878..f9285c4900f 100644 --- a/fs/jfs/jfs_acl.h +++ b/fs/jfs/jfs_acl.h @@ -20,7 +20,7 @@ #ifdef CONFIG_JFS_POSIX_ACL -int jfs_check_acl(struct inode *, int); +int jfs_check_acl(struct inode *, int, unsigned int flags); int jfs_init_acl(tid_t, struct inode *, struct inode *); int jfs_acl_chmod(struct inode *inode); diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c index 409dfd65e9a..f9ddf0c388c 100644 --- a/fs/logfs/dir.c +++ b/fs/logfs/dir.c @@ -555,9 +555,11 @@ static int logfs_symlink(struct inode *dir, struct dentry *dentry, return __logfs_create(dir, dentry, inode, target, destlen); } -static int logfs_permission(struct inode *inode, int mask) +static int logfs_permission(struct inode *inode, int mask, unsigned int flags) { - return generic_permission(inode, mask, NULL); + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + return generic_permission(inode, mask, flags, NULL); } static int logfs_link(struct dentry *old_dentry, struct inode *dir, diff --git a/fs/namei.c b/fs/namei.c index 6e275363e89..4e957bf744a 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -169,8 +169,8 @@ EXPORT_SYMBOL(putname); /* * This does basic POSIX ACL permission checking */ -static inline int __acl_permission_check(struct inode *inode, int mask, - int (*check_acl)(struct inode *inode, int mask), int rcu) +static int acl_permission_check(struct inode *inode, int mask, unsigned int flags, + int (*check_acl)(struct inode *inode, int mask, unsigned int flags)) { umode_t mode = inode->i_mode; @@ -180,13 +180,9 @@ static inline int __acl_permission_check(struct inode *inode, int mask, mode >>= 6; else { if (IS_POSIXACL(inode) && (mode & S_IRWXG) && check_acl) { - if (rcu) { - return -ECHILD; - } else { - int error = check_acl(inode, mask); - if (error != -EAGAIN) - return error; - } + int error = check_acl(inode, mask, flags); + if (error != -EAGAIN) + return error; } if (in_group_p(inode->i_gid)) @@ -201,32 +197,31 @@ static inline int __acl_permission_check(struct inode *inode, int mask, return -EACCES; 
} -static inline int acl_permission_check(struct inode *inode, int mask, - int (*check_acl)(struct inode *inode, int mask)) -{ - return __acl_permission_check(inode, mask, check_acl, 0); -} - /** - * generic_permission - check for access rights on a Posix-like filesystem + * generic_permission - check for access rights on a Posix-like filesystem * @inode: inode to check access rights for * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) * @check_acl: optional callback to check for Posix ACLs + * @flags IPERM_FLAG_ flags. * * Used to check for read/write/execute permissions on a file. * We use "fsuid" for this, letting us set arbitrary permissions * for filesystem access without changing the "normal" uids which - * are used for other things.. + * are used for other things. + * + * generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk + * request cannot be satisfied (eg. requires blocking or too much complexity). + * It would then be called again in ref-walk mode. */ -int generic_permission(struct inode *inode, int mask, - int (*check_acl)(struct inode *inode, int mask)) +int generic_permission(struct inode *inode, int mask, unsigned int flags, + int (*check_acl)(struct inode *inode, int mask, unsigned int flags)) { int ret; /* * Do the basic POSIX ACL permission checks. */ - ret = acl_permission_check(inode, mask, check_acl); + ret = acl_permission_check(inode, mask, flags, check_acl); if (ret != -EACCES) return ret; @@ -281,9 +276,10 @@ int inode_permission(struct inode *inode, int mask) } if (inode->i_op->permission) - retval = inode->i_op->permission(inode, mask); + retval = inode->i_op->permission(inode, mask, 0); else - retval = generic_permission(inode, mask, inode->i_op->check_acl); + retval = generic_permission(inode, mask, 0, + inode->i_op->check_acl); if (retval) return retval; @@ -668,22 +664,19 @@ force_reval_path(struct path *path, struct nameidata *nd) * short-cut DAC fails, then call ->permission() to do more * complete permission check. 
*/ -static inline int __exec_permission(struct inode *inode, int rcu) +static inline int exec_permission(struct inode *inode, unsigned int flags) { int ret; if (inode->i_op->permission) { - if (rcu) - return -ECHILD; - ret = inode->i_op->permission(inode, MAY_EXEC); - if (!ret) - goto ok; - return ret; + ret = inode->i_op->permission(inode, MAY_EXEC, flags); + } else { + ret = acl_permission_check(inode, MAY_EXEC, flags, + inode->i_op->check_acl); } - ret = __acl_permission_check(inode, MAY_EXEC, inode->i_op->check_acl, rcu); - if (!ret) + if (likely(!ret)) goto ok; - if (rcu && ret == -ECHILD) + if (ret == -ECHILD) return ret; if (capable(CAP_DAC_OVERRIDE) || capable(CAP_DAC_READ_SEARCH)) @@ -691,17 +684,7 @@ static inline int __exec_permission(struct inode *inode, int rcu) return ret; ok: - return security_inode_exec_permission(inode, rcu); -} - -static int exec_permission(struct inode *inode) -{ - return __exec_permission(inode, 0); -} - -static int exec_permission_rcu(struct inode *inode) -{ - return __exec_permission(inode, 1); + return security_inode_exec_permission(inode, flags); } static __always_inline void set_root(struct nameidata *nd) @@ -1165,7 +1148,7 @@ static int link_path_walk(const char *name, struct nameidata *nd) nd->flags |= LOOKUP_CONTINUE; if (nd->flags & LOOKUP_RCU) { - err = exec_permission_rcu(nd->inode); + err = exec_permission(nd->inode, IPERM_FLAG_RCU); if (err == -ECHILD) { if (nameidata_drop_rcu(nd)) return -ECHILD; @@ -1173,7 +1156,7 @@ static int link_path_walk(const char *name, struct nameidata *nd) } } else { exec_again: - err = exec_permission(nd->inode); + err = exec_permission(nd->inode, 0); } if (err) break; @@ -1620,7 +1603,7 @@ static struct dentry *__lookup_hash(struct qstr *name, struct dentry *dentry; int err; - err = exec_permission(inode); + err = exec_permission(inode, 0); if (err) return ERR_PTR(err); diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 58beace14b1..d33da530097 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2189,11 +2189,14 @@ int nfs_may_open(struct inode *inode, struct rpc_cred *cred, int openflags) return nfs_do_access(inode, cred, nfs_open_permission_mask(openflags)); } -int nfs_permission(struct inode *inode, int mask) +int nfs_permission(struct inode *inode, int mask, unsigned int flags) { struct rpc_cred *cred; int res = 0; + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + nfs_inc_stats(inode, NFSIOS_VFSACCESS); if ((mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0) @@ -2241,7 +2244,7 @@ out: out_notsup: res = nfs_revalidate_inode(NFS_SERVER(inode), inode); if (res == 0) - res = generic_permission(inode, mask, NULL); + res = generic_permission(inode, mask, flags, NULL); goto out; } diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 71d4bc8464e..77b48c8fab1 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -785,15 +785,19 @@ out_err: return err; } -int nilfs_permission(struct inode *inode, int mask) +int nilfs_permission(struct inode *inode, int mask, unsigned int flags) { - struct nilfs_root *root = NILFS_I(inode)->i_root; + struct nilfs_root *root; + + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + root = NILFS_I(inode)->i_root; if ((mask & MAY_WRITE) && root && root->cno != NILFS_CPTREE_CURRENT_CNO) return -EROFS; /* snapshot is not writable */ - return generic_permission(inode, mask, NULL); + return generic_permission(inode, mask, flags, NULL); } int nilfs_load_inode_block(struct nilfs_sb_info *sbi, struct inode *inode, diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index f7560da5a56..0ca98823db5 100644 
--- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -256,7 +256,7 @@ extern void nilfs_update_inode(struct inode *, struct buffer_head *); extern void nilfs_truncate(struct inode *); extern void nilfs_evict_inode(struct inode *); extern int nilfs_setattr(struct dentry *, struct iattr *); -int nilfs_permission(struct inode *inode, int mask); +int nilfs_permission(struct inode *inode, int mask, unsigned int flags); extern int nilfs_load_inode_block(struct nilfs_sb_info *, struct inode *, struct buffer_head **); extern int nilfs_inode_dirty(struct inode *); diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index 391915093fe..704f6b1742f 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -291,13 +291,17 @@ static int ocfs2_set_acl(handle_t *handle, return ret; } -int ocfs2_check_acl(struct inode *inode, int mask) +int ocfs2_check_acl(struct inode *inode, int mask, unsigned int flags) { - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_super *osb; struct buffer_head *di_bh = NULL; struct posix_acl *acl; int ret = -EAGAIN; + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + + osb = OCFS2_SB(inode->i_sb); if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) return ret; diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h index 5c5d31f0585..4fe7c9cf4bf 100644 --- a/fs/ocfs2/acl.h +++ b/fs/ocfs2/acl.h @@ -26,7 +26,7 @@ struct ocfs2_acl_entry { __le32 e_id; }; -extern int ocfs2_check_acl(struct inode *, int); +extern int ocfs2_check_acl(struct inode *, int, unsigned int); extern int ocfs2_acl_chmod(struct inode *); extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *, struct buffer_head *, struct buffer_head *, diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index f6cba566429..bdadbae0909 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1307,10 +1307,13 @@ bail: return err; } -int ocfs2_permission(struct inode *inode, int mask) +int ocfs2_permission(struct inode *inode, int mask, unsigned int flags) { int ret; + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + mlog_entry_void(); ret = ocfs2_inode_lock(inode, NULL, 0); @@ -1320,7 +1323,7 @@ int ocfs2_permission(struct inode *inode, int mask) goto out; } - ret = generic_permission(inode, mask, ocfs2_check_acl); + ret = generic_permission(inode, mask, flags, ocfs2_check_acl); ocfs2_inode_unlock(inode, 0); out: diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h index 97bf761c9e7..f5afbbef670 100644 --- a/fs/ocfs2/file.h +++ b/fs/ocfs2/file.h @@ -61,7 +61,7 @@ int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh, int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); -int ocfs2_permission(struct inode *inode, int mask); +int ocfs2_permission(struct inode *inode, int mask, unsigned int flags); int ocfs2_should_update_atime(struct inode *inode, struct vfsmount *vfsmnt); diff --git a/fs/proc/base.c b/fs/proc/base.c index dc5b2fcadc3..b953d41d9ab 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2114,11 +2114,13 @@ static const struct file_operations proc_fd_operations = { * /proc/pid/fd needs a special permission handler so that a process can still * access /proc/self/fd after it has executed a setuid(). 
*/ -static int proc_fd_permission(struct inode *inode, int mask) +static int proc_fd_permission(struct inode *inode, int mask, unsigned int flags) { int rv; - rv = generic_permission(inode, mask, NULL); + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + rv = generic_permission(inode, mask, flags, NULL); if (rv == 0) return 0; if (task_pid(current) == proc_pid(inode)) diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index c9097f43b42..09a1f92a34e 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -295,7 +295,7 @@ out: return ret; } -static int proc_sys_permission(struct inode *inode, int mask) +static int proc_sys_permission(struct inode *inode, int mask,unsigned int flags) { /* * sysctl entries that are not writeable, @@ -305,6 +305,9 @@ static int proc_sys_permission(struct inode *inode, int mask) struct ctl_table *table; int error; + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + /* Executable files are not allowed under /proc/sys/ */ if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode)) return -EACCES; diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 9ea22a56cdf..3cfb2e93364 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -870,11 +870,14 @@ out: return err; } -static int reiserfs_check_acl(struct inode *inode, int mask) +static int reiserfs_check_acl(struct inode *inode, int mask, unsigned int flags) { struct posix_acl *acl; int error = -EAGAIN; /* do regular unix permission checks by default */ + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS); if (acl) { @@ -951,8 +954,10 @@ static int xattr_mount_check(struct super_block *s) return 0; } -int reiserfs_permission(struct inode *inode, int mask) +int reiserfs_permission(struct inode *inode, int mask, unsigned int flags) { + if (flags & IPERM_FLAG_RCU) + return -ECHILD; /* * We don't do permission checks on the internal objects. * Permissions are determined by the "owning" object. @@ -965,9 +970,10 @@ int reiserfs_permission(struct inode *inode, int mask) * Stat data v1 doesn't support ACLs. 
*/ if (get_inode_sd_version(inode) != STAT_DATA_V1) - return generic_permission(inode, mask, reiserfs_check_acl); + return generic_permission(inode, mask, flags, + reiserfs_check_acl); #endif - return generic_permission(inode, mask, NULL); + return generic_permission(inode, mask, flags, NULL); } static int xattr_hide_revalidate(struct dentry *dentry, struct nameidata *nd) diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index cffb1fd8ba3..30ac2734558 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -348,13 +348,18 @@ int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, const cha return -ENOENT; } -int sysfs_permission(struct inode *inode, int mask) +int sysfs_permission(struct inode *inode, int mask, unsigned int flags) { - struct sysfs_dirent *sd = inode->i_private; + struct sysfs_dirent *sd; + + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + + sd = inode->i_private; mutex_lock(&sysfs_mutex); sysfs_refresh_inode(sd, inode); mutex_unlock(&sysfs_mutex); - return generic_permission(inode, mask, NULL); + return generic_permission(inode, mask, flags, NULL); } diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index d9be60a2e95..ffaaa816bfb 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -200,7 +200,7 @@ static inline void __sysfs_put(struct sysfs_dirent *sd) struct inode *sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd); void sysfs_evict_inode(struct inode *inode); int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr); -int sysfs_permission(struct inode *inode, int mask); +int sysfs_permission(struct inode *inode, int mask, unsigned int flags); int sysfs_setattr(struct dentry *dentry, struct iattr *iattr); int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c index b2771862fd3..4b11eaf6a58 100644 --- a/fs/xfs/linux-2.6/xfs_acl.c +++ b/fs/xfs/linux-2.6/xfs_acl.c @@ -219,12 +219,16 @@ xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) } int -xfs_check_acl(struct inode *inode, int mask) +xfs_check_acl(struct inode *inode, int mask, unsigned int flags) { - struct xfs_inode *ip = XFS_I(inode); + struct xfs_inode *ip; struct posix_acl *acl; int error = -EAGAIN; + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + + ip = XFS_I(inode); trace_xfs_check_acl(ip); /* diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h index 0135e2a669d..11dd72070cb 100644 --- a/fs/xfs/xfs_acl.h +++ b/fs/xfs/xfs_acl.h @@ -42,7 +42,7 @@ struct xfs_acl { #define SGI_ACL_DEFAULT_SIZE (sizeof(SGI_ACL_DEFAULT)-1) #ifdef CONFIG_XFS_POSIX_ACL -extern int xfs_check_acl(struct inode *inode, int mask); +extern int xfs_check_acl(struct inode *inode, int mask, unsigned int flags); extern struct posix_acl *xfs_get_acl(struct inode *inode, int type); extern int xfs_inherit_acl(struct inode *inode, struct posix_acl *default_acl); extern int xfs_acl_chmod(struct inode *inode); diff --git a/include/linux/coda_linux.h b/include/linux/coda_linux.h index 2e914d0771b..4ccc59c1ea8 100644 --- a/include/linux/coda_linux.h +++ b/include/linux/coda_linux.h @@ -37,7 +37,7 @@ extern const struct file_operations coda_ioctl_operations; /* operations shared over more than one file */ int coda_open(struct inode *i, struct file *f); int coda_release(struct inode *i, struct file *f); -int coda_permission(struct inode *inode, int mask); +int coda_permission(struct inode *inode, int mask, unsigned int flags); int 
coda_revalidate_inode(struct dentry *); int coda_getattr(struct vfsmount *, struct dentry *, struct kstat *); int coda_setattr(struct dentry *, struct iattr *); diff --git a/include/linux/fs.h b/include/linux/fs.h index a04aa96acb7..d5a4d42f655 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1550,11 +1550,13 @@ struct file_operations { int (*setlease)(struct file *, long, struct file_lock **); }; +#define IPERM_FLAG_RCU 0x0001 + struct inode_operations { struct dentry * (*lookup) (struct inode *,struct dentry *, struct nameidata *); void * (*follow_link) (struct dentry *, struct nameidata *); - int (*permission) (struct inode *, int); - int (*check_acl)(struct inode *, int); + int (*permission) (struct inode *, int, unsigned int); + int (*check_acl)(struct inode *, int, unsigned int); int (*readlink) (struct dentry *, char __user *,int); void (*put_link) (struct dentry *, struct nameidata *, void *); @@ -2165,8 +2167,8 @@ extern sector_t bmap(struct inode *, sector_t); #endif extern int notify_change(struct dentry *, struct iattr *); extern int inode_permission(struct inode *, int); -extern int generic_permission(struct inode *, int, - int (*check_acl)(struct inode *, int)); +extern int generic_permission(struct inode *, int, unsigned int, + int (*check_acl)(struct inode *, int, unsigned int)); static inline bool execute_ok(struct inode *inode) { diff --git a/include/linux/generic_acl.h b/include/linux/generic_acl.h index 574bea4013b..0437e377b55 100644 --- a/include/linux/generic_acl.h +++ b/include/linux/generic_acl.h @@ -10,6 +10,6 @@ extern const struct xattr_handler generic_acl_default_handler; int generic_acl_init(struct inode *, struct inode *); int generic_acl_chmod(struct inode *); -int generic_check_acl(struct inode *inode, int mask); +int generic_check_acl(struct inode *inode, int mask, unsigned int flags); #endif /* LINUX_GENERIC_ACL_H */ diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 29d504d5d1c..0779bb8f95b 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -351,7 +351,7 @@ extern int nfs_refresh_inode(struct inode *, struct nfs_fattr *); extern int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr); extern int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fattr); extern int nfs_getattr(struct vfsmount *, struct dentry *, struct kstat *); -extern int nfs_permission(struct inode *, int); +extern int nfs_permission(struct inode *, int, unsigned int); extern int nfs_open(struct inode *, struct file *); extern int nfs_release(struct inode *, struct file *); extern int nfs_attribute_timeout(struct inode *inode); diff --git a/include/linux/reiserfs_xattr.h b/include/linux/reiserfs_xattr.h index b2cf2089769..3b94c91f20a 100644 --- a/include/linux/reiserfs_xattr.h +++ b/include/linux/reiserfs_xattr.h @@ -41,7 +41,7 @@ int reiserfs_xattr_init(struct super_block *sb, int mount_flags); int reiserfs_lookup_privroot(struct super_block *sb); int reiserfs_delete_xattrs(struct inode *inode); int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs); -int reiserfs_permission(struct inode *inode, int mask); +int reiserfs_permission(struct inode *inode, int mask, unsigned int flags); #ifdef CONFIG_REISERFS_FS_XATTR #define has_xattr_dir(inode) (REISERFS_I(inode)->i_flags & i_has_xattr_dir) -- cgit v1.2.3 From ec26fba40fa92c7cc5c61d40746f499dcefc67be Mon Sep 17 00:00:00 2001 From: "J. 
Bruce Fields" Date: Tue, 11 Jan 2011 10:30:48 -0500 Subject: Documentation: fl_mylease no longer exists Signed-off-by: J. Bruce Fields --- Documentation/filesystems/Locking | 2 -- 1 file changed, 2 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 33fa3e5d38f..01d7cbdcd3b 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -340,7 +340,6 @@ prototypes: int (*fl_grant)(struct file_lock *, struct file_lock *, int); void (*fl_release_private)(struct file_lock *); void (*fl_break)(struct file_lock *); /* break_lease callback */ - int (*fl_mylease)(struct file_lock *, struct file_lock *); int (*fl_change)(struct file_lock **, int); locking rules: @@ -350,7 +349,6 @@ fl_notify: yes no fl_grant: no no fl_release_private: maybe no fl_break: yes no -fl_mylease: yes no fl_change yes no --------------------------- buffer_head ----------------------------------- -- cgit v1.2.3 From 2818ef50c4dc103ce52e12d14ce2dfbde5268120 Mon Sep 17 00:00:00 2001 From: Anton Altaparmakov Date: Wed, 12 Jan 2011 10:34:35 +0000 Subject: NTFS: writev() fix and maintenance/contact details update Fix writev() to not keep writing the first segment over and over again instead of moving onto subsequent segments and update the NTFS entry in MAINTAINERS to reflect that Tuxera Inc. now supports the NTFS driver. Signed-off-by: Anton Altaparmakov Signed-off-by: Linus Torvalds --- Documentation/filesystems/ntfs.txt | 3 +++ MAINTAINERS | 6 +++--- fs/ntfs/Makefile | 2 +- fs/ntfs/file.c | 35 +++++++++++++++++------------------ fs/ntfs/super.c | 6 +++--- 5 files changed, 27 insertions(+), 25 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/ntfs.txt b/Documentation/filesystems/ntfs.txt index ac2a261c5f7..6ef8cf3bc9a 100644 --- a/Documentation/filesystems/ntfs.txt +++ b/Documentation/filesystems/ntfs.txt @@ -457,6 +457,9 @@ ChangeLog Note, a technical ChangeLog aimed at kernel hackers is in fs/ntfs/ChangeLog. +2.1.30: + - Fix writev() (it kept writing the first segment over and over again + instead of moving onto subsequent segments). 2.1.29: - Fix a deadlock when mounting read-write. 2.1.28: diff --git a/MAINTAINERS b/MAINTAINERS index 42f991e5a85..64d7621ab35 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4383,11 +4383,11 @@ F: Documentation/scsi/NinjaSCSI.txt F: drivers/scsi/nsp32* NTFS FILESYSTEM -M: Anton Altaparmakov +M: Anton Altaparmakov L: linux-ntfs-dev@lists.sourceforge.net -W: http://www.linux-ntfs.org/ +W: http://www.tuxera.com/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/aia21/ntfs-2.6.git -S: Maintained +S: Supported F: Documentation/filesystems/ntfs.txt F: fs/ntfs/ diff --git a/fs/ntfs/Makefile b/fs/ntfs/Makefile index 58b6be99254..4ff028fcfd6 100644 --- a/fs/ntfs/Makefile +++ b/fs/ntfs/Makefile @@ -6,7 +6,7 @@ ntfs-objs := aops.o attrib.o collate.o compress.o debug.o dir.o file.o \ index.o inode.o mft.o mst.o namei.o runlist.o super.o sysctl.o \ unistr.o upcase.o -EXTRA_CFLAGS = -DNTFS_VERSION=\"2.1.29\" +EXTRA_CFLAGS = -DNTFS_VERSION=\"2.1.30\" ifeq ($(CONFIG_NTFS_DEBUG),y) EXTRA_CFLAGS += -DDEBUG diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 113ebd9f25a..f4b1057abdd 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -1,7 +1,7 @@ /* * file.c - NTFS kernel file operations. Part of the Linux-NTFS project. * - * Copyright (c) 2001-2007 Anton Altaparmakov + * Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc. 
* * This program/include file is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as published @@ -1380,15 +1380,14 @@ static inline void ntfs_set_next_iovec(const struct iovec **iovp, * pages (out to offset + bytes), to emulate ntfs_copy_from_user()'s * single-segment behaviour. * - * We call the same helper (__ntfs_copy_from_user_iovec_inatomic()) both - * when atomic and when not atomic. This is ok because - * __ntfs_copy_from_user_iovec_inatomic() calls __copy_from_user_inatomic() - * and it is ok to call this when non-atomic. - * Infact, the only difference between __copy_from_user_inatomic() and + * We call the same helper (__ntfs_copy_from_user_iovec_inatomic()) both when + * atomic and when not atomic. This is ok because it calls + * __copy_from_user_inatomic() and it is ok to call this when non-atomic. In + * fact, the only difference between __copy_from_user_inatomic() and * __copy_from_user() is that the latter calls might_sleep() and the former - * should not zero the tail of the buffer on error. And on many - * architectures __copy_from_user_inatomic() is just defined to - * __copy_from_user() so it makes no difference at all on those architectures. + * should not zero the tail of the buffer on error. And on many architectures + * __copy_from_user_inatomic() is just defined to __copy_from_user() so it + * makes no difference at all on those architectures. */ static inline size_t ntfs_copy_from_user_iovec(struct page **pages, unsigned nr_pages, unsigned ofs, const struct iovec **iov, @@ -1409,28 +1408,28 @@ static inline size_t ntfs_copy_from_user_iovec(struct page **pages, if (unlikely(copied != len)) { /* Do it the slow way. */ addr = kmap(*pages); - copied = __ntfs_copy_from_user_iovec_inatomic(addr + ofs, - *iov, *iov_ofs, len); - /* - * Zero the rest of the target like __copy_from_user(). - */ - memset(addr + ofs + copied, 0, len - copied); - kunmap(*pages); + copied = __ntfs_copy_from_user_iovec_inatomic(addr + + ofs, *iov, *iov_ofs, len); if (unlikely(copied != len)) goto err_out; + kunmap(*pages); } total += len; + ntfs_set_next_iovec(iov, iov_ofs, len); bytes -= len; if (!bytes) break; - ntfs_set_next_iovec(iov, iov_ofs, len); ofs = 0; } while (++pages < last_page); out: return total; err_out: - total += copied; + BUG_ON(copied > len); /* Zero the rest of the target like __copy_from_user(). */ + memset(addr + ofs + copied, 0, len - copied); + kunmap(*pages); + total += copied; + ntfs_set_next_iovec(iov, iov_ofs, copied); while (++pages < last_page) { bytes -= len; if (!bytes) diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index a30ecacc01f..29099a07b9f 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -1,7 +1,7 @@ /* * super.c - NTFS kernel super block handling. Part of the Linux-NTFS project. * - * Copyright (c) 2001-2007 Anton Altaparmakov + * Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc. 
* Copyright (c) 2001,2002 Richard Russon * * This program/include file is free software; you can redistribute it and/or @@ -3193,8 +3193,8 @@ static void __exit exit_ntfs_fs(void) ntfs_sysctl(0); } -MODULE_AUTHOR("Anton Altaparmakov "); -MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2007 Anton Altaparmakov"); +MODULE_AUTHOR("Anton Altaparmakov "); +MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc."); MODULE_VERSION(NTFS_VERSION); MODULE_LICENSE("GPL"); #ifdef DEBUG -- cgit v1.2.3 From 924241575a85249b9d410e38f5b2fcad9035e45c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 5 Jan 2011 15:00:07 -0500 Subject: fs: add documentation on fallocate hole punching This patch simply adds documentation on how to handle the hole punching mode of fallocate for any filesystem wishing to use it. Signed-off-by: Josef Bacik Signed-off-by: Al Viro --- Documentation/filesystems/porting | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting index 07a32b42cf9..266d2059b9b 100644 --- a/Documentation/filesystems/porting +++ b/Documentation/filesystems/porting @@ -385,3 +385,12 @@ Documentation/filesystems/vfs.txt for more details. on many or all directory inodes on the way down a path walk (to check for exec permission). These must now be rcu-walk aware (flags & IPERM_RCU). See Documentation/filesystems/vfs.txt for more details. + +-- +[mandatory] + In ->fallocate() you must check the mode option passed in. If your +filesystem does not support hole punching (deallocating space in the middle of a +file) you must return -EOPNOTSUPP if FALLOC_FL_PUNCH_HOLE is set in mode. +Currently you can only have FALLOC_FL_PUNCH_HOLE with FALLOC_FL_KEEP_SIZE set, +so the i_size should not change when hole punching, even when puching the end of +a file off. -- cgit v1.2.3 From 2d90508f638241a2e7422d884767398296ebe720 Mon Sep 17 00:00:00 2001 From: Nikanth Karthikesan Date: Thu, 13 Jan 2011 15:45:53 -0800 Subject: mm: smaps: export mlock information Currently there is no way to find whether a process has locked its pages in memory or not. And which of the memory regions are locked in memory. Add a new field "Locked" to export this information via the smaps file. Signed-off-by: Nikanth Karthikesan Acked-by: Balbir Singh Acked-by: Wu Fengguang Cc: Matt Mackall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 3 +++ fs/proc/task_mmu.c | 7 +++++-- 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 9471225212c..ef757fca470 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -375,6 +375,7 @@ Anonymous: 0 kB Swap: 0 kB KernelPageSize: 4 kB MMUPageSize: 4 kB +Locked: 374 kB The first of these lines shows the same information as is displayed for the mapping in /proc/PID/maps. The remaining lines show the size of the mapping @@ -670,6 +671,8 @@ varies by architecture and compile options. The following is from a > cat /proc/meminfo +The "Locked" indicates whether the mapping is locked in memory or not. 
+ MemTotal: 16344972 kB MemFree: 13634064 kB diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index c3755bd8dd3..60b914860f8 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -418,7 +418,8 @@ static int show_smap(struct seq_file *m, void *v) "Anonymous: %8lu kB\n" "Swap: %8lu kB\n" "KernelPageSize: %8lu kB\n" - "MMUPageSize: %8lu kB\n", + "MMUPageSize: %8lu kB\n" + "Locked: %8lu kB\n", (vma->vm_end - vma->vm_start) >> 10, mss.resident >> 10, (unsigned long)(mss.pss >> (10 + PSS_SHIFT)), @@ -430,7 +431,9 @@ static int show_smap(struct seq_file *m, void *v) mss.anonymous >> 10, mss.swap >> 10, vma_kernel_pagesize(vma) >> 10, - vma_mmu_pagesize(vma) >> 10); + vma_mmu_pagesize(vma) >> 10, + (vma->vm_flags & VM_LOCKED) ? + (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0); if (m->count < m->size) /* vma is copied successfully */ m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0; -- cgit v1.2.3 From dabb16f639820267b3850d804571c70bd93d4e07 Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Thu, 13 Jan 2011 15:46:05 -0800 Subject: oom: allow a non-CAP_SYS_RESOURCE proces to oom_score_adj down We'd like to be able to oom_score_adj a process up/down as it enters/leaves the foreground. Currently, it is not possible to oom_adj down without CAP_SYS_RESOURCE. This patch allows a task to decrease its oom_score_adj back to the value that a CAP_SYS_RESOURCE thread set it to or its inherited value at fork. Assuming the thread that has forked it has oom_score_adj of 0, each process could decrease it back from 0 upon activation unless a CAP_SYS_RESOURCE thread elevated it to something higher. Alternative considered: * a setuid binary * a daemon with CAP_SYS_RESOURCE Since you don't wan't all processes to be able to reduce their oom_adj, a setuid or daemon implementation would be complex. The alternatives also have much higher overhead. This patch updated from original patch based on feedback from David Rientjes. Signed-off-by: Mandeep Singh Baines Acked-by: David Rientjes Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Rik van Riel Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 4 ++++ fs/proc/base.c | 4 +++- include/linux/sched.h | 2 ++ kernel/fork.c | 1 + 4 files changed, 10 insertions(+), 1 deletion(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index ef757fca470..23cae6548d3 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -1323,6 +1323,10 @@ scaled linearly with /proc//oom_score_adj. Writing to /proc//oom_score_adj or /proc//oom_adj will change the other with its scaled value. +The value of /proc//oom_score_adj may be reduced no lower than the last +value set by a CAP_SYS_RESOURCE process. To reduce the value any lower +requires CAP_SYS_RESOURCE. + NOTICE: /proc//oom_adj is deprecated and will be removed, please see Documentation/feature-removal-schedule.txt. 
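As an illustration of the foreground/background use case described in the changelog, a minimal userspace sketch that raises its own badness score while backgrounded and restores it on activation, assuming the value inherited at fork was 0 (writing below the minimum last set by a CAP_SYS_RESOURCE process fails with EACCES):

        #include <fcntl.h>
        #include <stdio.h>
        #include <unistd.h>

        /* Write an adjustment to our own oom_score_adj file. */
        static int set_oom_score_adj(int adj)
        {
                char buf[16];
                int fd, len, ret = -1;

                fd = open("/proc/self/oom_score_adj", O_WRONLY);
                if (fd < 0)
                        return -1;
                len = snprintf(buf, sizeof(buf), "%d", adj);
                if (write(fd, buf, len) == (ssize_t)len)
                        ret = 0;
                close(fd);
                return ret;
        }

The process would call set_oom_score_adj(500) as it leaves the foreground and set_oom_score_adj(0) as it re-enters it; the second write needs no CAP_SYS_RESOURCE because 0 is not below the value inherited at fork.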
diff --git a/fs/proc/base.c b/fs/proc/base.c index 93f1cdd5d3d..9d096e82b20 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1151,7 +1151,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, goto err_task_lock; } - if (oom_score_adj < task->signal->oom_score_adj && + if (oom_score_adj < task->signal->oom_score_adj_min && !capable(CAP_SYS_RESOURCE)) { err = -EACCES; goto err_sighand; @@ -1164,6 +1164,8 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, atomic_dec(&task->mm->oom_disable_count); } task->signal->oom_score_adj = oom_score_adj; + if (has_capability_noaudit(current, CAP_SYS_RESOURCE)) + task->signal->oom_score_adj_min = oom_score_adj; /* * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is * always attainable. diff --git a/include/linux/sched.h b/include/linux/sched.h index 07402530fc7..f23b5bb6f52 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -634,6 +634,8 @@ struct signal_struct { int oom_adj; /* OOM kill score adjustment (bit shift) */ int oom_score_adj; /* OOM kill score adjustment */ + int oom_score_adj_min; /* OOM kill score adjustment minimum value. + * Only settable by CAP_SYS_RESOURCE. */ struct mutex cred_guard_mutex; /* guard against foreign influences on * credential calculations diff --git a/kernel/fork.c b/kernel/fork.c index 76a1fdd80bd..1499607e4da 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -910,6 +910,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->oom_adj = current->signal->oom_adj; sig->oom_score_adj = current->signal->oom_score_adj; + sig->oom_score_adj_min = current->signal->oom_score_adj_min; mutex_init(&sig->cred_guard_mutex); -- cgit v1.2.3 From a82416da83722944d78d933301e32e7c5ad70edd Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 14 Jan 2011 02:26:53 +0000 Subject: fs: small rcu-walk documentation fixes Signed-off-by: Nick Piggin --- Documentation/filesystems/porting | 8 ++++---- Documentation/filesystems/vfs.txt | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting index 07a32b42cf9..7fcac6393cd 100644 --- a/Documentation/filesystems/porting +++ b/Documentation/filesystems/porting @@ -365,8 +365,8 @@ must be done in the RCU callback. [recommended] vfs now tries to do path walking in "rcu-walk mode", which avoids atomic operations and scalability hazards on dentries and inodes (see -Documentation/filesystems/path-walk.txt). d_hash and d_compare changes (above) -are examples of the changes required to support this. For more complex +Documentation/filesystems/path-lookup.txt). d_hash and d_compare changes +(above) are examples of the changes required to support this. For more complex filesystem callbacks, the vfs drops out of rcu-walk mode before the fs call, so no changes are required to the filesystem. However, this is costly and loses the benefits of rcu-walk mode. We will begin to add filesystem callbacks that @@ -383,5 +383,5 @@ Documentation/filesystems/vfs.txt for more details. permission and check_acl are inode permission checks that are called on many or all directory inodes on the way down a path walk (to check for -exec permission). These must now be rcu-walk aware (flags & IPERM_RCU). See -Documentation/filesystems/vfs.txt for more details. +exec permission). These must now be rcu-walk aware (flags & IPERM_FLAG_RCU). 
+See Documentation/filesystems/vfs.txt for more details. diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index fbb324e2bd4..cae6d27c9f5 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -415,8 +415,8 @@ otherwise noted. permission: called by the VFS to check for access rights on a POSIX-like filesystem. - May be called in rcu-walk mode (flags & IPERM_RCU). If in rcu-walk - mode, the filesystem must check the permission without blocking or + May be called in rcu-walk mode (flags & IPERM_FLAG_RCU). If in rcu-walk + mode, the filesystem must check the permission without blocking or storing to the inode. If a situation is encountered that rcu-walk cannot handle, return -- cgit v1.2.3 From 9875cf806403fae66b2410a3c2cc820d97731e04 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Jan 2011 18:45:21 +0000 Subject: Add a dentry op to handle automounting rather than abusing follow_link() Add a dentry op (d_automount) to handle automounting directories rather than abusing the follow_link() inode operation. The operation is keyed off a new dentry flag (DCACHE_NEED_AUTOMOUNT). This also makes it easier to add an AT_ flag to suppress terminal segment automount during pathwalk and removes the need for the kludge code in the pathwalk algorithm to handle directories with follow_link() semantics. The ->d_automount() dentry operation: struct vfsmount *(*d_automount)(struct path *mountpoint); takes a pointer to the directory to be mounted upon, which is expected to provide sufficient data to determine what should be mounted. If successful, it should return the vfsmount struct it creates (which it should also have added to the namespace using do_add_mount() or similar). If there's a collision with another automount attempt, NULL should be returned. If the directory specified by the parameter should be used directly rather than being mounted upon, -EISDIR should be returned. In any other case, an error code should be returned. The ->d_automount() operation is called with no locks held and may sleep. At this point the pathwalk algorithm will be in ref-walk mode. Within fs/namei.c itself, a new pathwalk subroutine (follow_automount()) is added to handle mountpoints. It will return -EREMOTE if the automount flag was set, but no d_automount() op was supplied, -ELOOP if we've encountered too many symlinks or mountpoints, -EISDIR if the walk point should be used without mounting and 0 if successful. The path will be updated to point to the mounted filesystem if a successful automount took place. __follow_mount() is replaced by follow_managed() which is more generic (especially with the patch that adds ->d_manage()). This handles transits from directories during pathwalk, including automounting and skipping over mountpoints (and holding processes with the next patch). __follow_mount_rcu() will jump out of RCU-walk mode if it encounters an automount point with nothing mounted on it. follow_dotdot*() does not handle automounts as you don't want to trigger them whilst following "..". I've also extracted the mount/don't-mount logic from autofs4 and included it here. It makes the mount go ahead anyway if someone calls open() or creat(), tries to traverse the directory, tries to chdir/chroot/etc. into the directory, or sticks a '/' on the end of the pathname. If they do a stat(), however, they'll only trigger the automount if they didn't also say O_NOFOLLOW. 
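To make the expected shape concrete, a d_automount() implementation under this scheme might look roughly like the sketch below, patterned on the AFS conversion; examplefs_build_mount() and examplefs_automount_list are hypothetical stand-ins for the filesystem's own vfsmount construction and expiration list:

        static struct vfsmount *examplefs_d_automount(struct path *path)
        {
                struct vfsmount *mnt;
                int err;

                /* Hypothetical helper: build the vfsmount to be attached on
                 * top of path->dentry (e.g. via vfs_kern_mount()).  It may
                 * return ERR_PTR(-EISDIR) to have the directory used as-is. */
                mnt = examplefs_build_mount(path);
                if (IS_ERR(mnt))
                        return mnt;

                mntget(mnt);
                err = do_add_mount(mnt, path, MNT_SHRINKABLE,
                                   &examplefs_automount_list);
                switch (err) {
                case 0:
                        return mnt;     /* mounted; pathwalk follows it */
                case -EBUSY:
                        mntput(mnt);    /* racing automount won; use theirs */
                        return NULL;
                default:
                        mntput(mnt);
                        return ERR_PTR(err);
                }
        }

Returning NULL tells follow_automount() that another automount got there first and that the path should simply transit whatever is now mounted on the directory.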
I've also added an inode flag (S_AUTOMOUNT) so that filesystems can mark their inodes as automount points. This flag is automatically propagated to the dentry as DCACHE_NEED_AUTOMOUNT by __d_instantiate(). This saves NFS and could save AFS a private flag bit apiece, but is not strictly necessary. It would be preferable to do the propagation in d_set_d_op(), but that doesn't normally have access to the inode. [AV: fixed breakage in case if __follow_mount_rcu() fails and nameidata_drop_rcu() succeeds in RCU case of do_lookup(); we need to fall through to non-RCU case after that, rather than just returning with ungrabbed *path] Signed-off-by: David Howells Was-Acked-by: Ian Kent Signed-off-by: Al Viro --- Documentation/filesystems/Locking | 2 + Documentation/filesystems/vfs.txt | 14 +++ fs/dcache.c | 5 +- fs/namei.c | 226 ++++++++++++++++++++++++++++---------- include/linux/dcache.h | 7 +- include/linux/fs.h | 2 + 6 files changed, 198 insertions(+), 58 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 977d8919cc6..5f0c52a0738 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -19,6 +19,7 @@ prototypes: void (*d_release)(struct dentry *); void (*d_iput)(struct dentry *, struct inode *); char *(*d_dname)((struct dentry *dentry, char *buffer, int buflen); + struct vfsmount *(*d_automount)(struct path *path); locking rules: rename_lock ->d_lock may block rcu-walk @@ -29,6 +30,7 @@ d_delete: no yes no no d_release: no no yes no d_iput: no no yes no d_dname: no no no no +d_automount: no no yes no --------------------------- inode_operations --------------------------- prototypes: diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index cae6d27c9f5..726a4f6fa3c 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -864,6 +864,7 @@ struct dentry_operations { void (*d_release)(struct dentry *); void (*d_iput)(struct dentry *, struct inode *); char *(*d_dname)(struct dentry *, char *, int); + struct vfsmount *(*d_automount)(struct path *); }; d_revalidate: called when the VFS needs to revalidate a dentry. This @@ -930,6 +931,19 @@ struct dentry_operations { at the end of the buffer, and returns a pointer to the first char. dynamic_dname() helper function is provided to take care of this. + d_automount: called when an automount dentry is to be traversed (optional). + This should create a new VFS mount record, mount it on the directory + and return the record to the caller. The caller is supplied with a + path parameter giving the automount directory to describe the automount + target and the parent VFS mount record to provide inheritable mount + parameters. NULL should be returned if someone else managed to make + the automount first. If the automount failed, then an error code + should be returned. + + This function is only used if DCACHE_NEED_AUTOMOUNT is set on the + dentry. This is set by __d_instantiate() if S_AUTOMOUNT is set on the + inode being added. 
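A filesystem opts in per inode; for example, a sketch of marking a referral point when its inode is built (examplefs_make_referral_inode() is a hypothetical helper, and the owning dentry_operations would also need .d_automount set):

        static struct inode *examplefs_make_referral_inode(struct super_block *sb)
        {
                struct inode *inode = new_inode(sb);

                if (!inode)
                        return NULL;
                inode->i_mode = S_IFDIR | 0555;
                /* Have __d_instantiate() flag any attached dentry with
                 * DCACHE_NEED_AUTOMOUNT. */
                inode->i_flags |= S_AUTOMOUNT;
                return inode;
        }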
+ Example : static char *pipefs_dname(struct dentry *dent, char *buffer, int buflen) diff --git a/fs/dcache.c b/fs/dcache.c index 0c6d5c549d8..51f7bb6463a 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1380,8 +1380,11 @@ EXPORT_SYMBOL(d_set_d_op); static void __d_instantiate(struct dentry *dentry, struct inode *inode) { spin_lock(&dentry->d_lock); - if (inode) + if (inode) { + if (unlikely(IS_AUTOMOUNT(inode))) + dentry->d_flags |= DCACHE_NEED_AUTOMOUNT; list_add(&dentry->d_alias, &inode->i_dentry); + } dentry->d_inode = inode; dentry_rcuwalk_barrier(dentry); spin_unlock(&dentry->d_lock); diff --git a/fs/namei.c b/fs/namei.c index 529e917ad2f..16109da68bb 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -896,51 +896,120 @@ int follow_up(struct path *path) } /* - * serialization is taken care of in namespace.c + * Perform an automount + * - return -EISDIR to tell follow_managed() to stop and return the path we + * were called with. */ -static void __follow_mount_rcu(struct nameidata *nd, struct path *path, - struct inode **inode) +static int follow_automount(struct path *path, unsigned flags, + bool *need_mntput) { - while (d_mountpoint(path->dentry)) { - struct vfsmount *mounted; - mounted = __lookup_mnt(path->mnt, path->dentry, 1); - if (!mounted) - return; - path->mnt = mounted; - path->dentry = mounted->mnt_root; - nd->seq = read_seqcount_begin(&path->dentry->d_seq); - *inode = path->dentry->d_inode; + struct vfsmount *mnt; + + if (!path->dentry->d_op || !path->dentry->d_op->d_automount) + return -EREMOTE; + + /* We want to mount if someone is trying to open/create a file of any + * type under the mountpoint, wants to traverse through the mountpoint + * or wants to open the mounted directory. + * + * We don't want to mount if someone's just doing a stat and they've + * set AT_SYMLINK_NOFOLLOW - unless they're stat'ing a directory and + * appended a '/' to the name. + */ + if (!(flags & LOOKUP_FOLLOW) && + !(flags & (LOOKUP_CONTINUE | LOOKUP_DIRECTORY | + LOOKUP_OPEN | LOOKUP_CREATE))) + return -EISDIR; + + current->total_link_count++; + if (current->total_link_count >= 40) + return -ELOOP; + + mnt = path->dentry->d_op->d_automount(path); + if (IS_ERR(mnt)) { + /* + * The filesystem is allowed to return -EISDIR here to indicate + * it doesn't want to automount. For instance, autofs would do + * this so that its userspace daemon can mount on this dentry. + * + * However, we can only permit this if it's a terminal point in + * the path being looked up; if it wasn't then the remainder of + * the path is inaccessible and we should say so. + */ + if (PTR_ERR(mnt) == -EISDIR && (flags & LOOKUP_CONTINUE)) + return -EREMOTE; + return PTR_ERR(mnt); } -} + if (!mnt) /* mount collision */ + return 0; -static int __follow_mount(struct path *path) -{ - int res = 0; - while (d_mountpoint(path->dentry)) { - struct vfsmount *mounted = lookup_mnt(path); - if (!mounted) - break; - dput(path->dentry); - if (res) - mntput(path->mnt); - path->mnt = mounted; - path->dentry = dget(mounted->mnt_root); - res = 1; + if (mnt->mnt_sb == path->mnt->mnt_sb && + mnt->mnt_root == path->dentry) { + mntput(mnt); + return -ELOOP; } - return res; + + dput(path->dentry); + if (*need_mntput) + mntput(path->mnt); + path->mnt = mnt; + path->dentry = dget(mnt->mnt_root); + *need_mntput = true; + return 0; } -static void follow_mount(struct path *path) +/* + * Handle a dentry that is managed in some way. + * - Flagged as mountpoint + * - Flagged as automount point + * + * This may only be called in refwalk mode. 
+ * + * Serialization is taken care of in namespace.c + */ +static int follow_managed(struct path *path, unsigned flags) { - while (d_mountpoint(path->dentry)) { - struct vfsmount *mounted = lookup_mnt(path); - if (!mounted) - break; - dput(path->dentry); - mntput(path->mnt); - path->mnt = mounted; - path->dentry = dget(mounted->mnt_root); + unsigned managed; + bool need_mntput = false; + int ret; + + /* Given that we're not holding a lock here, we retain the value in a + * local variable for each dentry as we look at it so that we don't see + * the components of that value change under us */ + while (managed = ACCESS_ONCE(path->dentry->d_flags), + managed &= DCACHE_MANAGED_DENTRY, + unlikely(managed != 0)) { + /* Transit to a mounted filesystem. */ + if (managed & DCACHE_MOUNTED) { + struct vfsmount *mounted = lookup_mnt(path); + if (mounted) { + dput(path->dentry); + if (need_mntput) + mntput(path->mnt); + path->mnt = mounted; + path->dentry = dget(mounted->mnt_root); + need_mntput = true; + continue; + } + + /* Something is mounted on this dentry in another + * namespace and/or whatever was mounted there in this + * namespace got unmounted before we managed to get the + * vfsmount_lock */ + } + + /* Handle an automount point */ + if (managed & DCACHE_NEED_AUTOMOUNT) { + ret = follow_automount(path, flags, &need_mntput); + if (ret < 0) + return ret == -EISDIR ? 0 : ret; + continue; + } + + /* We didn't change the current path point */ + break; } + return 0; } int follow_down(struct path *path) @@ -958,13 +1027,37 @@ int follow_down(struct path *path) return 0; } +/* + * Skip to top of mountpoint pile in rcuwalk mode. We abort the rcu-walk if we + * meet an automount point and we're not walking to "..". True is returned to + * continue, false to abort. + */ +static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, + struct inode **inode, bool reverse_transit) +{ + while (d_mountpoint(path->dentry)) { + struct vfsmount *mounted; + mounted = __lookup_mnt(path->mnt, path->dentry, 1); + if (!mounted) + break; + path->mnt = mounted; + path->dentry = mounted->mnt_root; + nd->seq = read_seqcount_begin(&path->dentry->d_seq); + *inode = path->dentry->d_inode; + } + + if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT)) + return reverse_transit; + return true; +} + static int follow_dotdot_rcu(struct nameidata *nd) { struct inode *inode = nd->inode; set_root_rcu(nd); - while(1) { + while (1) { if (nd->path.dentry == nd->root.dentry && nd->path.mnt == nd->root.mnt) { break; @@ -987,12 +1080,28 @@ static int follow_dotdot_rcu(struct nameidata *nd) nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); inode = nd->path.dentry->d_inode; } - __follow_mount_rcu(nd, &nd->path, &inode); + __follow_mount_rcu(nd, &nd->path, &inode, true); nd->inode = inode; return 0; } +/* + * Skip to top of mountpoint pile in refwalk mode for follow_dotdot() + */ +static void follow_mount(struct path *path) +{ + while (d_mountpoint(path->dentry)) { + struct vfsmount *mounted = lookup_mnt(path); + if (!mounted) + break; + dput(path->dentry); + mntput(path->mnt); + path->mnt = mounted; + path->dentry = dget(mounted->mnt_root); + } +} + static void follow_dotdot(struct nameidata *nd) { set_root(nd); @@ -1057,12 +1166,14 @@ static int do_lookup(struct nameidata *nd, struct qstr *name, struct vfsmount *mnt = nd->path.mnt; struct dentry *dentry, *parent = nd->path.dentry; struct inode *dir; + int err; + /* * See if the low-level filesystem might want * to use its own hash.. 
*/ if (unlikely(parent->d_flags & DCACHE_OP_HASH)) { - int err = parent->d_op->d_hash(parent, nd->inode, name); + err = parent->d_op->d_hash(parent, nd->inode, name); if (err < 0) return err; } @@ -1092,20 +1203,25 @@ static int do_lookup(struct nameidata *nd, struct qstr *name, done2: path->mnt = mnt; path->dentry = dentry; - __follow_mount_rcu(nd, path, inode); - } else { - dentry = __d_lookup(parent, name); - if (!dentry) - goto need_lookup; + if (likely(__follow_mount_rcu(nd, path, inode, false))) + return 0; + if (nameidata_drop_rcu(nd)) + return -ECHILD; + /* fallthru */ + } + dentry = __d_lookup(parent, name); + if (!dentry) + goto need_lookup; found: - if (dentry->d_flags & DCACHE_OP_REVALIDATE) - goto need_revalidate; + if (dentry->d_flags & DCACHE_OP_REVALIDATE) + goto need_revalidate; done: - path->mnt = mnt; - path->dentry = dentry; - __follow_mount(path); - *inode = path->dentry->d_inode; - } + path->mnt = mnt; + path->dentry = dentry; + err = follow_managed(path, nd->flags); + if (unlikely(err < 0)) + return err; + *inode = path->dentry->d_inode; return 0; need_lookup: @@ -2203,11 +2319,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path, if (open_flag & O_EXCL) goto exit_dput; - if (__follow_mount(path)) { - error = -ELOOP; - if (open_flag & O_NOFOLLOW) - goto exit_dput; - } + error = follow_managed(path, nd->flags); + if (error < 0) + goto exit_dput; error = -ENOENT; if (!path->dentry->d_inode) diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 59fcd24b146..ee6c26d142c 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -167,6 +167,7 @@ struct dentry_operations { void (*d_release)(struct dentry *); void (*d_iput)(struct dentry *, struct inode *); char *(*d_dname)(struct dentry *, char *, int); + struct vfsmount *(*d_automount)(struct path *); } ____cacheline_aligned; /* @@ -205,13 +206,17 @@ struct dentry_operations { #define DCACHE_CANT_MOUNT 0x0100 #define DCACHE_GENOCIDE 0x0200 -#define DCACHE_MOUNTED 0x0400 /* is a mountpoint */ #define DCACHE_OP_HASH 0x1000 #define DCACHE_OP_COMPARE 0x2000 #define DCACHE_OP_REVALIDATE 0x4000 #define DCACHE_OP_DELETE 0x8000 +#define DCACHE_MOUNTED 0x10000 /* is a mountpoint */ +#define DCACHE_NEED_AUTOMOUNT 0x20000 /* handle automount on this dir */ +#define DCACHE_MANAGED_DENTRY \ + (DCACHE_MOUNTED|DCACHE_NEED_AUTOMOUNT) + extern seqlock_t rename_lock; static inline int dname_external(struct dentry *dentry) diff --git a/include/linux/fs.h b/include/linux/fs.h index 3984f2358d1..4c00ed53be8 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -242,6 +242,7 @@ struct inodes_stat_t { #define S_SWAPFILE 256 /* Do not truncate: swapon got its bmaps */ #define S_PRIVATE 512 /* Inode is fs-internal */ #define S_IMA 1024 /* Inode has an associated IMA struct */ +#define S_AUTOMOUNT 2048 /* Automount/referral quasi-directory */ /* * Note that nosuid etc flags are inode-specific: setting some file-system @@ -277,6 +278,7 @@ struct inodes_stat_t { #define IS_SWAPFILE(inode) ((inode)->i_flags & S_SWAPFILE) #define IS_PRIVATE(inode) ((inode)->i_flags & S_PRIVATE) #define IS_IMA(inode) ((inode)->i_flags & S_IMA) +#define IS_AUTOMOUNT(inode) ((inode)->i_flags & S_AUTOMOUNT) /* the read-only stuff doesn't really belong here, but any other place is probably as bad and I don't want to create yet another include file. 
*/ -- cgit v1.2.3 From cc53ce53c86924bfe98a12ea20b7465038a08792 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Jan 2011 18:45:26 +0000 Subject: Add a dentry op to allow processes to be held during pathwalk transit Add a dentry op (d_manage) to permit a filesystem to hold a process and make it sleep when it tries to transit away from one of that filesystem's directories during a pathwalk. The operation is keyed off a new dentry flag (DCACHE_MANAGE_TRANSIT). The filesystem is allowed to be selective about which processes it holds and which it permits to continue on or prohibits from transiting from each flagged directory. This will allow autofs to hold up client processes whilst letting its userspace daemon through to maintain the directory or the stuff behind it or mounted upon it. The ->d_manage() dentry operation: int (*d_manage)(struct path *path, bool mounting_here); takes a pointer to the directory about to be transited away from and a flag indicating whether the transit is undertaken by do_add_mount() or do_move_mount() skipping through a pile of filesystems mounted on a mountpoint. It should return 0 if successful and to let the process continue on its way; -EISDIR to prohibit the caller from skipping to overmounted filesystems or automounting, and to use this directory; or some other error code to return to the user. ->d_manage() is called with namespace_sem writelocked if mounting_here is true and no other locks held, so it may sleep. However, if mounting_here is true, it may not initiate or wait for a mount or unmount upon the parameter directory, even if the act is actually performed by userspace. Within fs/namei.c, follow_managed() is extended to check with d_manage() first on each managed directory, before transiting away from it or attempting to automount upon it. follow_down() is renamed follow_down_one() and should only be used where the filesystem deliberately intends to avoid management steps (e.g. autofs). A new follow_down() is added that incorporates the loop done by all other callers of follow_down() (do_add/move_mount(), autofs and NFSD; whilst AFS, NFS and CIFS do use it, their use is removed by converting them to use d_automount()). The new follow_down() calls d_manage() as appropriate. It also takes an extra parameter to indicate if it is being called from mount code (with namespace_sem writelocked) which it passes to d_manage(). follow_down() ignores automount points so that it can be used to mount on them. __follow_mount_rcu() is made to abort rcu-walk mode if it hits a directory with DCACHE_MANAGE_TRANSIT set on the basis that we're probably going to have to sleep. It would be possible to enter d_manage() in rcu-walk mode too, and have that determine whether to abort or not itself. That would allow the autofs daemon to continue on in rcu-walk mode. Note that DCACHE_MANAGE_TRANSIT on a directory should be cleared when it isn't required as every tranist from that directory will cause d_manage() to be invoked. It can always be set again when necessary. ========================== WHAT THIS MEANS FOR AUTOFS ========================== Autofs currently uses the lookup() inode op and the d_revalidate() dentry op to trigger the automounting of indirect mounts, and both of these can be called with i_mutex held. autofs knows that the i_mutex will be held by the caller in lookup(), and so can drop it before invoking the daemon - but this isn't so for d_revalidate(), since the lock is only held on _some_ of the code paths that call it. 
This means that autofs can't risk dropping i_mutex from its d_revalidate() function before it calls the daemon. The bug could manifest itself as, for example, a process that's trying to validate an automount dentry that gets made to wait because that dentry is expired and needs cleaning up: mkdir S ffffffff8014e05a 0 32580 24956 Call Trace: [] :autofs4:autofs4_wait+0x674/0x897 [] avc_has_perm+0x46/0x58 [] autoremove_wake_function+0x0/0x2e [] :autofs4:autofs4_expire_wait+0x41/0x6b [] :autofs4:autofs4_revalidate+0x91/0x149 [] __lookup_hash+0xa0/0x12f [] lookup_create+0x46/0x80 [] sys_mkdirat+0x56/0xe4 versus the automount daemon which wants to remove that dentry, but can't because the normal process is holding the i_mutex lock: automount D ffffffff8014e05a 0 32581 1 32561 Call Trace: [] __mutex_lock_slowpath+0x60/0x9b [] do_path_lookup+0x2ca/0x2f1 [] .text.lock.mutex+0xf/0x14 [] do_rmdir+0x77/0xde [] tracesys+0x71/0xe0 [] tracesys+0xd5/0xe0 which means that the system is deadlocked. This patch allows autofs to hold up normal processes whilst the daemon goes ahead and does things to the dentry tree behind the automouter point without risking a deadlock as almost no locks are held in d_manage() and none in d_automount(). Signed-off-by: David Howells Was-Acked-by: Ian Kent Signed-off-by: Al Viro --- Documentation/filesystems/Locking | 2 ++ Documentation/filesystems/vfs.txt | 21 +++++++++++- drivers/staging/autofs/dirhash.c | 5 ++- fs/afs/mntpt.c | 5 +-- fs/autofs4/autofs_i.h | 13 ------- fs/autofs4/dev-ioctl.c | 2 +- fs/autofs4/expire.c | 2 +- fs/autofs4/root.c | 11 +++--- fs/cifs/cifs_dfs_ref.c | 5 +-- fs/namei.c | 72 +++++++++++++++++++++++++++++++++++++-- fs/namespace.c | 14 ++++---- fs/nfs/namespace.c | 5 +-- fs/nfsd/vfs.c | 5 +-- include/linux/dcache.h | 11 ++++-- include/linux/namei.h | 3 +- 15 files changed, 126 insertions(+), 50 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 5f0c52a0738..cbf98b989b1 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -20,6 +20,7 @@ prototypes: void (*d_iput)(struct dentry *, struct inode *); char *(*d_dname)((struct dentry *dentry, char *buffer, int buflen); struct vfsmount *(*d_automount)(struct path *path); + int (*d_manage)(struct dentry *, bool); locking rules: rename_lock ->d_lock may block rcu-walk @@ -31,6 +32,7 @@ d_release: no no yes no d_iput: no no yes no d_dname: no no no no d_automount: no no yes no +d_manage: no no yes no --------------------------- inode_operations --------------------------- prototypes: diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 726a4f6fa3c..4682586b147 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -865,6 +865,7 @@ struct dentry_operations { void (*d_iput)(struct dentry *, struct inode *); char *(*d_dname)(struct dentry *, char *, int); struct vfsmount *(*d_automount)(struct path *); + int (*d_manage)(struct dentry *, bool); }; d_revalidate: called when the VFS needs to revalidate a dentry. This @@ -938,12 +939,30 @@ struct dentry_operations { target and the parent VFS mount record to provide inheritable mount parameters. NULL should be returned if someone else managed to make the automount first. If the automount failed, then an error code - should be returned. + should be returned. 
If -EISDIR is returned, then the directory will + be treated as an ordinary directory and returned to pathwalk to + continue walking. This function is only used if DCACHE_NEED_AUTOMOUNT is set on the dentry. This is set by __d_instantiate() if S_AUTOMOUNT is set on the inode being added. + d_manage: called to allow the filesystem to manage the transition from a + dentry (optional). This allows autofs, for example, to hold up clients + waiting to explore behind a 'mountpoint' whilst letting the daemon go + past and construct the subtree there. 0 should be returned to let the + calling process continue. -EISDIR can be returned to tell pathwalk to + use this directory as an ordinary directory and to ignore anything + mounted on it and not to check the automount flag. Any other error + code will abort pathwalk completely. + + If the 'mounting_here' parameter is true, then namespace_sem is being + held by the caller and the function should not initiate any mounts or + unmounts that it will then wait for. + + This function is only used if DCACHE_MANAGE_TRANSIT is set on the + dentry being transited from. + Example : static char *pipefs_dname(struct dentry *dent, char *buffer, int buflen) diff --git a/drivers/staging/autofs/dirhash.c b/drivers/staging/autofs/dirhash.c index d3f42c8325f..a08bd735503 100644 --- a/drivers/staging/autofs/dirhash.c +++ b/drivers/staging/autofs/dirhash.c @@ -88,14 +88,13 @@ struct autofs_dir_ent *autofs_expire(struct super_block *sb, } path.mnt = mnt; path_get(&path); - if (!follow_down(&path)) { + if (!follow_down_one(&path)) { path_put(&path); DPRINTK(("autofs: not expirable\ (not a mounted directory): %s\n", ent->name)); continue; } - while (d_mountpoint(path.dentry) && follow_down(&path)) - ; + follow_down(&path, false); // TODO: need to check error umount_ok = may_umount(path.mnt); path_put(&path); diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index e83c0336e7b..f3e891d57a2 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -273,10 +273,7 @@ static void *afs_mntpt_follow_link(struct dentry *dentry, struct nameidata *nd) break; case -EBUSY: /* someone else made a mount here whilst we were busy */ - while (d_mountpoint(nd->path.dentry) && - follow_down(&nd->path)) - ; - err = 0; + err = follow_down(&nd->path, false); default: mntput(newmnt); break; diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index 0fffe1c24ce..eb67953452b 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -229,19 +229,6 @@ int autofs4_wait(struct autofs_sb_info *,struct dentry *, enum autofs_notify); int autofs4_wait_release(struct autofs_sb_info *,autofs_wqt_t,int); void autofs4_catatonic_mode(struct autofs_sb_info *); -static inline int autofs4_follow_mount(struct path *path) -{ - int res = 0; - - while (d_mountpoint(path->dentry)) { - int followed = follow_down(path); - if (!followed) - break; - res = 1; - } - return res; -} - static inline u32 autofs4_get_dev(struct autofs_sb_info *sbi) { return new_encode_dev(sbi->sb->s_dev); diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index eff9a419469..1442da4860e 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -551,7 +551,7 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp, err = have_submounts(path.dentry); - if (follow_down(&path)) + if (follow_down_one(&path)) magic = path.mnt->mnt_sb->s_magic; } diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index cc1d0136590..6a930b90d38 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -56,7 +56,7 @@ static int 
autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry) path_get(&path); - if (!follow_down(&path)) + if (!follow_down_one(&path)) goto done; if (is_autofs4_dentry(path.dentry)) { diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 651e4ef563b..20225636a4e 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -234,7 +234,7 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd) nd->flags); /* * For an expire of a covered direct or offset mount we need - * to break out of follow_down() at the autofs mount trigger + * to break out of follow_down_one() at the autofs mount trigger * (d_mounted--), so we can see the expiring flag, and manage * the blocking and following here until the expire is completed. */ @@ -243,7 +243,7 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd) if (ino->flags & AUTOFS_INF_EXPIRING) { spin_unlock(&sbi->fs_lock); /* Follow down to our covering mount. */ - if (!follow_down(&nd->path)) + if (!follow_down_one(&nd->path)) goto done; goto follow; } @@ -292,11 +292,10 @@ follow: * multi-mount with no root offset so we don't need * to follow it. */ - if (d_mountpoint(dentry)) { - if (!autofs4_follow_mount(&nd->path)) { - status = -ENOENT; + if (d_managed(dentry)) { + status = follow_down(&nd->path, false); + if (status < 0) goto out_error; - } } done: diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index c68a056f27f..83479cf63f9 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -273,10 +273,7 @@ static int add_mount_helper(struct vfsmount *newmnt, struct nameidata *nd, break; case -EBUSY: /* someone else made a mount here whilst we were busy */ - while (d_mountpoint(nd->path.dentry) && - follow_down(&nd->path)) - ; - err = 0; + err = follow_down(&nd->path, false); default: mntput(newmnt); break; diff --git a/fs/namei.c b/fs/namei.c index 16109da68bb..9d3033dc22e 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -960,6 +960,7 @@ static int follow_automount(struct path *path, unsigned flags, /* * Handle a dentry that is managed in some way. + * - Flagged for transit management (autofs) * - Flagged as mountpoint * - Flagged as automount point * @@ -979,6 +980,16 @@ static int follow_managed(struct path *path, unsigned flags) while (managed = ACCESS_ONCE(path->dentry->d_flags), managed &= DCACHE_MANAGED_DENTRY, unlikely(managed != 0)) { + /* Allow the filesystem to manage the transit without i_mutex + * being held. */ + if (managed & DCACHE_MANAGE_TRANSIT) { + BUG_ON(!path->dentry->d_op); + BUG_ON(!path->dentry->d_op->d_manage); + ret = path->dentry->d_op->d_manage(path->dentry, false); + if (ret < 0) + return ret == -EISDIR ? 0 : ret; + } + /* Transit to a mounted filesystem. */ if (managed & DCACHE_MOUNTED) { struct vfsmount *mounted = lookup_mnt(path); @@ -1012,7 +1023,7 @@ static int follow_managed(struct path *path, unsigned flags) return 0; } -int follow_down(struct path *path) +int follow_down_one(struct path *path) { struct vfsmount *mounted; @@ -1029,14 +1040,19 @@ int follow_down(struct path *path) /* * Skip to top of mountpoint pile in rcuwalk mode. We abort the rcu-walk if we - * meet an automount point and we're not walking to "..". True is returned to + * meet a managed dentry and we're not walking to "..". True is returned to * continue, false to abort. */ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, struct inode **inode, bool reverse_transit) { + unsigned abort_mask = + reverse_transit ? 
0 : DCACHE_MANAGE_TRANSIT; + while (d_mountpoint(path->dentry)) { struct vfsmount *mounted; + if (path->dentry->d_flags & abort_mask) + return true; mounted = __lookup_mnt(path->mnt, path->dentry, 1); if (!mounted) break; @@ -1086,6 +1102,57 @@ static int follow_dotdot_rcu(struct nameidata *nd) return 0; } +/* + * Follow down to the covering mount currently visible to userspace. At each + * point, the filesystem owning that dentry may be queried as to whether the + * caller is permitted to proceed or not. + * + * Care must be taken as namespace_sem may be held (indicated by mounting_here + * being true). + */ +int follow_down(struct path *path, bool mounting_here) +{ + unsigned managed; + int ret; + + while (managed = ACCESS_ONCE(path->dentry->d_flags), + unlikely(managed & DCACHE_MANAGED_DENTRY)) { + /* Allow the filesystem to manage the transit without i_mutex + * being held. + * + * We indicate to the filesystem if someone is trying to mount + * something here. This gives autofs the chance to deny anyone + * other than its daemon the right to mount on its + * superstructure. + * + * The filesystem may sleep at this point. + */ + if (managed & DCACHE_MANAGE_TRANSIT) { + BUG_ON(!path->dentry->d_op); + BUG_ON(!path->dentry->d_op->d_manage); + ret = path->dentry->d_op->d_manage(path->dentry, mounting_here); + if (ret < 0) + return ret == -EISDIR ? 0 : ret; + } + + /* Transit to a mounted filesystem. */ + if (managed & DCACHE_MOUNTED) { + struct vfsmount *mounted = lookup_mnt(path); + if (!mounted) + break; + dput(path->dentry); + mntput(path->mnt); + path->mnt = mounted; + path->dentry = dget(mounted->mnt_root); + continue; + } + + /* Don't handle automount points here */ + break; + } + return 0; +} + /* * Skip to top of mountpoint pile in refwalk mode for follow_dotdot() */ @@ -3530,6 +3597,7 @@ const struct inode_operations page_symlink_inode_operations = { }; EXPORT_SYMBOL(user_path_at); +EXPORT_SYMBOL(follow_down_one); EXPORT_SYMBOL(follow_down); EXPORT_SYMBOL(follow_up); EXPORT_SYMBOL(get_write_access); /* binfmt_aout */ diff --git a/fs/namespace.c b/fs/namespace.c index 3ddfd9046c4..d94ccd6ddaf 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1844,9 +1844,10 @@ static int do_move_mount(struct path *path, char *old_name) return err; down_write(&namespace_sem); - while (d_mountpoint(path->dentry) && - follow_down(path)) - ; + err = follow_down(path, true); + if (err < 0) + goto out; + err = -EINVAL; if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt)) goto out; @@ -1940,9 +1941,10 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path, down_write(&namespace_sem); /* Something was mounted here while we slept */ - while (d_mountpoint(path->dentry) && - follow_down(path)) - ; + err = follow_down(path, true); + if (err < 0) + goto unlock; + err = -EINVAL; if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(path->mnt)) goto unlock; diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 74aaf3963c1..bfcb933e575 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -176,10 +176,7 @@ out_err: path_put(&nd->path); goto out; out_follow: - while (d_mountpoint(nd->path.dentry) && - follow_down(&nd->path)) - ; - err = 0; + err = follow_down(&nd->path, false); goto out; } diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 230b79fbf00..0f79e33a65d 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -88,8 +88,9 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp, .dentry = dget(dentry)}; int err = 0; - while (d_mountpoint(path.dentry) && follow_down(&path)) - ; + err = 
follow_down(&path, false); + if (err < 0) + goto out; exp2 = rqst_exp_get_by_name(rqstp, &path); if (IS_ERR(exp2)) { diff --git a/include/linux/dcache.h b/include/linux/dcache.h index ee6c26d142c..1a87760d653 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -168,6 +168,7 @@ struct dentry_operations { void (*d_iput)(struct dentry *, struct inode *); char *(*d_dname)(struct dentry *, char *, int); struct vfsmount *(*d_automount)(struct path *); + int (*d_manage)(struct dentry *, bool); } ____cacheline_aligned; /* @@ -214,8 +215,9 @@ struct dentry_operations { #define DCACHE_MOUNTED 0x10000 /* is a mountpoint */ #define DCACHE_NEED_AUTOMOUNT 0x20000 /* handle automount on this dir */ +#define DCACHE_MANAGE_TRANSIT 0x40000 /* manage transit from this dirent */ #define DCACHE_MANAGED_DENTRY \ - (DCACHE_MOUNTED|DCACHE_NEED_AUTOMOUNT) + (DCACHE_MOUNTED|DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT) extern seqlock_t rename_lock; @@ -404,7 +406,12 @@ static inline void dont_mount(struct dentry *dentry) extern void dput(struct dentry *); -static inline int d_mountpoint(struct dentry *dentry) +static inline bool d_managed(struct dentry *dentry) +{ + return dentry->d_flags & DCACHE_MANAGED_DENTRY; +} + +static inline bool d_mountpoint(struct dentry *dentry) { return dentry->d_flags & DCACHE_MOUNTED; } diff --git a/include/linux/namei.h b/include/linux/namei.h index 18d06add0a4..8ef2c789c2a 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -79,7 +79,8 @@ extern struct file *lookup_instantiate_filp(struct nameidata *nd, struct dentry extern struct dentry *lookup_one_len(const char *, struct dentry *, int); -extern int follow_down(struct path *); +extern int follow_down_one(struct path *); +extern int follow_down(struct path *, bool); extern int follow_up(struct path *); extern struct dentry *lock_rename(struct dentry *, struct dentry *); -- cgit v1.2.3 From ab90911ff90cdab59b31c045c3f0ae480d14f29d Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Jan 2011 18:46:51 +0000 Subject: Allow d_manage() to be used in RCU-walk mode Allow d_manage() to be called from pathwalk when it is in RCU-walk mode as well as when it is in Ref-walk mode. This permits __follow_mount_rcu() to call d_manage() directly. d_manage() needs a parameter to indicate that it is in RCU-walk mode as it isn't allowed to sleep if in that mode (but should return -ECHILD instead). autofs4_d_manage() can then be set to retain RCU-walk mode if the daemon accesses it and otherwise request dropping back to ref-walk mode. 
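A d_manage() implementation following this pattern might look roughly like the sketch below; examplefs_wait_for_maintenance() is a hypothetical helper that blocks until the daemon has finished with the directory:

        static int examplefs_d_manage(struct dentry *dentry, bool mounting_here,
                                      bool rcu_walk)
        {
                /* Never hold up the mount code (namespace_sem is held) or
                 * anything else trying to mount on this directory. */
                if (mounting_here)
                        return 0;

                /* Waiting below may sleep; in RCU-walk mode ask pathwalk to
                 * drop back to ref-walk and call us again. */
                if (rcu_walk)
                        return -ECHILD;

                examplefs_wait_for_maintenance(dentry);
                return 0;
        }

The autofs4_d_manage() change in this patch does exactly this: it returns -ECHILD before waiting for pending expires when called in RCU-walk mode.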
Signed-off-by: David Howells Signed-off-by: Al Viro --- Documentation/filesystems/Locking | 2 +- Documentation/filesystems/vfs.txt | 7 ++++++- fs/autofs4/root.c | 8 ++++++-- fs/namei.c | 15 ++++++++------- include/linux/dcache.h | 2 +- 5 files changed, 22 insertions(+), 12 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index cbf98b989b1..39707748ed2 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -32,7 +32,7 @@ d_release: no no yes no d_iput: no no yes no d_dname: no no no no d_automount: no no yes no -d_manage: no no yes no +d_manage: no no yes (ref-walk) maybe --------------------------- inode_operations --------------------------- prototypes: diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 4682586b147..3c4b2f1b64d 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -865,7 +865,7 @@ struct dentry_operations { void (*d_iput)(struct dentry *, struct inode *); char *(*d_dname)(struct dentry *, char *, int); struct vfsmount *(*d_automount)(struct path *); - int (*d_manage)(struct dentry *, bool); + int (*d_manage)(struct dentry *, bool, bool); }; d_revalidate: called when the VFS needs to revalidate a dentry. This @@ -960,6 +960,11 @@ struct dentry_operations { held by the caller and the function should not initiate any mounts or unmounts that it will then wait for. + If the 'rcu_walk' parameter is true, then the caller is doing a + pathwalk in RCU-walk mode. Sleeping is not permitted in this mode, + and the caller can be asked to leave it and call again by returing + -ECHILD. + This function is only used if DCACHE_MANAGE_TRANSIT is set on the dentry being transited from. diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 9194e274f84..dbd95512808 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -36,7 +36,7 @@ static long autofs4_root_compat_ioctl(struct file *,unsigned int,unsigned long); static int autofs4_dir_open(struct inode *inode, struct file *file); static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *); static struct vfsmount *autofs4_d_automount(struct path *); -static int autofs4_d_manage(struct dentry *, bool); +static int autofs4_d_manage(struct dentry *, bool, bool); const struct file_operations autofs4_root_operations = { .open = dcache_dir_open, @@ -450,7 +450,7 @@ done: return NULL; } -int autofs4_d_manage(struct dentry *dentry, bool mounting_here) +int autofs4_d_manage(struct dentry *dentry, bool mounting_here, bool rcu_walk) { struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); @@ -464,6 +464,10 @@ int autofs4_d_manage(struct dentry *dentry, bool mounting_here) return 0; } + /* We need to sleep, so we need pathwalk to be in ref-mode */ + if (rcu_walk) + return -ECHILD; + /* Wait for pending expires */ do_expire_wait(dentry); diff --git a/fs/namei.c b/fs/namei.c index 37385201271..5c89695ae1e 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -987,7 +987,8 @@ static int follow_managed(struct path *path, unsigned flags) if (managed & DCACHE_MANAGE_TRANSIT) { BUG_ON(!path->dentry->d_op); BUG_ON(!path->dentry->d_op->d_manage); - ret = path->dentry->d_op->d_manage(path->dentry, false); + ret = path->dentry->d_op->d_manage(path->dentry, + false, false); if (ret < 0) return ret == -EISDIR ? 
0 : ret; } @@ -1048,13 +1049,12 @@ int follow_down_one(struct path *path) static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, struct inode **inode, bool reverse_transit) { - unsigned abort_mask = - reverse_transit ? 0 : DCACHE_MANAGE_TRANSIT; - while (d_mountpoint(path->dentry)) { struct vfsmount *mounted; - if (path->dentry->d_flags & abort_mask) - return true; + if (unlikely(path->dentry->d_flags & DCACHE_MANAGE_TRANSIT) && + !reverse_transit && + path->dentry->d_op->d_manage(path->dentry, false, true) < 0) + return false; mounted = __lookup_mnt(path->mnt, path->dentry, 1); if (!mounted) break; @@ -1132,7 +1132,8 @@ int follow_down(struct path *path, bool mounting_here) if (managed & DCACHE_MANAGE_TRANSIT) { BUG_ON(!path->dentry->d_op); BUG_ON(!path->dentry->d_op->d_manage); - ret = path->dentry->d_op->d_manage(path->dentry, mounting_here); + ret = path->dentry->d_op->d_manage( + path->dentry, mounting_here, false); if (ret < 0) return ret == -EISDIR ? 0 : ret; } diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 1a87760d653..f958c19e3ca 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -168,7 +168,7 @@ struct dentry_operations { void (*d_iput)(struct dentry *, struct inode *); char *(*d_dname)(struct dentry *, char *, int); struct vfsmount *(*d_automount)(struct path *); - int (*d_manage)(struct dentry *, bool); + int (*d_manage)(struct dentry *, bool, bool); } ____cacheline_aligned; /* -- cgit v1.2.3 From ea5b778a8b98c85a87d66bf844904f9c3802b869 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Jan 2011 19:10:03 +0000 Subject: Unexport do_add_mount() and add in follow_automount(), not ->d_automount() Unexport do_add_mount() and make ->d_automount() return the vfsmount to be added rather than calling do_add_mount() itself. follow_automount() will then do the addition. This slightly complicates things as ->d_automount() normally wants to add the new vfsmount to an expiration list and start an expiration timer. The problem with that is that the vfsmount will be deleted if it has a refcount of 1 and the timer will not repeat if the expiration list is empty. To this end, we require the vfsmount to be returned from d_automount() with a refcount of (at least) 2. One of these refs will be dropped unconditionally. In addition, follow_automount() must get a 3rd ref around the call to do_add_mount() lest it eat a ref and return an error, leaving the mount we have open to being expired as we would otherwise have only 1 ref on it. d_automount() should also add the the vfsmount to the expiration list (by calling mnt_set_expiry()) and start the expiration timer before returning, if this mechanism is to be used. The vfsmount will be unlinked from the expiration list by follow_automount() if do_add_mount() fails. This patch also fixes the call to do_add_mount() for AFS to propagate the mount flags from the parent vfsmount. 
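Under the new scheme a d_automount() no longer attaches the mount itself; a sketch following the AFS conversion in this patch, where examplefs_build_mount(), examplefs_automount_list, examplefs_wq, examplefs_expiry_timer and examplefs_expiry_timeout are hypothetical stand-ins for the filesystem's own mount construction, expiration list and expiry work:

        static struct vfsmount *examplefs_d_automount(struct path *path)
        {
                struct vfsmount *mnt;

                mnt = examplefs_build_mount(path);      /* hypothetical helper */
                if (IS_ERR(mnt))
                        return mnt;

                mntget(mnt);    /* second ref: prevent immediate expiration */
                mnt_set_expiry(mnt, &examplefs_automount_list);
                queue_delayed_work(examplefs_wq, &examplefs_expiry_timer,
                                   examplefs_expiry_timeout * HZ);
                return mnt;     /* caller attaches it and drops one ref */
        }

follow_automount() then performs the do_add_mount() itself, drops the extra reference afterwards, and unlinks the mount from the expiration list if the attach fails.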
Signed-off-by: David Howells Signed-off-by: Al Viro --- Documentation/filesystems/vfs.txt | 23 ++++++++++++--------- fs/afs/mntpt.c | 25 ++++++----------------- fs/cifs/cifs_dfs_ref.c | 26 ++++++------------------ fs/internal.h | 2 ++ fs/namei.c | 42 ++++++++++++++++++++++++++++++++------- fs/namespace.c | 41 ++++++++++++++++++++++++++++++-------- fs/nfs/namespace.c | 24 ++++------------------ include/linux/mount.h | 7 +------ 8 files changed, 101 insertions(+), 89 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 3c4b2f1b64d..94cf97b901d 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -933,15 +933,20 @@ struct dentry_operations { dynamic_dname() helper function is provided to take care of this. d_automount: called when an automount dentry is to be traversed (optional). - This should create a new VFS mount record, mount it on the directory - and return the record to the caller. The caller is supplied with a - path parameter giving the automount directory to describe the automount - target and the parent VFS mount record to provide inheritable mount - parameters. NULL should be returned if someone else managed to make - the automount first. If the automount failed, then an error code - should be returned. If -EISDIR is returned, then the directory will - be treated as an ordinary directory and returned to pathwalk to - continue walking. + This should create a new VFS mount record and return the record to the + caller. The caller is supplied with a path parameter giving the + automount directory to describe the automount target and the parent + VFS mount record to provide inheritable mount parameters. NULL should + be returned if someone else managed to make the automount first. If + the vfsmount creation failed, then an error code should be returned. + If -EISDIR is returned, then the directory will be treated as an + ordinary directory and returned to pathwalk to continue walking. + + If a vfsmount is returned, the caller will attempt to mount it on the + mountpoint and will remove the vfsmount from its expiration list in + the case of failure. The vfsmount should be returned with 2 refs on + it to prevent automatic expiration - the caller will clean up the + additional ref. This function is only used if DCACHE_NEED_AUTOMOUNT is set on the dentry. 
This is set by __d_instantiate() if S_AUTOMOUNT is set on the diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index d23b2e344a7..aa59184151d 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -241,7 +241,6 @@ error_no_devname: struct vfsmount *afs_d_automount(struct path *path) { struct vfsmount *newmnt; - int err; _enter("{%s,%s}", path->mnt->mnt_devname, path->dentry->d_name.name); @@ -249,24 +248,12 @@ struct vfsmount *afs_d_automount(struct path *path) if (IS_ERR(newmnt)) return newmnt; - mntget(newmnt); - err = do_add_mount(newmnt, path, MNT_SHRINKABLE, &afs_vfsmounts); - switch (err) { - case 0: - queue_delayed_work(afs_wq, &afs_mntpt_expiry_timer, - afs_mntpt_expiry_timeout * HZ); - _leave(" = %p {%s}", newmnt, newmnt->mnt_devname); - return newmnt; - case -EBUSY: - /* someone else made a mount here whilst we were busy */ - mntput(newmnt); - _leave(" = NULL [EBUSY]"); - return NULL; - default: - mntput(newmnt); - _leave(" = %d", err); - return ERR_PTR(err); - } + mntget(newmnt); /* prevent immediate expiration */ + mnt_set_expiry(newmnt, &afs_vfsmounts); + queue_delayed_work(afs_wq, &afs_mntpt_expiry_timer, + afs_mntpt_expiry_timeout * HZ); + _leave(" = %p {%s}", newmnt, newmnt->mnt_devname); + return newmnt; } /* diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index 0fc163808de..7ed36536e75 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -351,7 +351,6 @@ free_xid: struct vfsmount *cifs_dfs_d_automount(struct path *path) { struct vfsmount *newmnt; - int err; cFYI(1, "in %s", __func__); @@ -361,25 +360,12 @@ struct vfsmount *cifs_dfs_d_automount(struct path *path) return newmnt; } - mntget(newmnt); - err = do_add_mount(newmnt, path, path->mnt->mnt_flags | MNT_SHRINKABLE, - &cifs_dfs_automount_list); - switch (err) { - case 0: - schedule_delayed_work(&cifs_dfs_automount_task, - cifs_dfs_mountpoint_expiry_timeout); - cFYI(1, "leaving %s [ok]" , __func__); - return newmnt; - case -EBUSY: - /* someone else made a mount here whilst we were busy */ - mntput(newmnt); - cFYI(1, "leaving %s [EBUSY]" , __func__); - return NULL; - default: - mntput(newmnt); - cFYI(1, "leaving %s [error %d]" , __func__, err); - return ERR_PTR(err); - } + mntget(newmnt); /* prevent immediate expiration */ + mnt_set_expiry(newmnt, &cifs_dfs_automount_list); + schedule_delayed_work(&cifs_dfs_automount_task, + cifs_dfs_mountpoint_expiry_timeout); + cFYI(1, "leaving %s [ok]" , __func__); + return newmnt; } const struct inode_operations cifs_dfs_referral_inode_operations = { diff --git a/fs/internal.h b/fs/internal.h index 9687c2ee273..4931060fd08 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -70,6 +70,8 @@ extern void mnt_set_mountpoint(struct vfsmount *, struct dentry *, extern void release_mounts(struct list_head *); extern void umount_tree(struct vfsmount *, int, struct list_head *); extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int); +extern int do_add_mount(struct vfsmount *, struct path *, int); +extern void mnt_clear_expiry(struct vfsmount *); extern void __init mnt_init(void); diff --git a/fs/namei.c b/fs/namei.c index 5c89695ae1e..c2e37727e3a 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -900,6 +900,7 @@ static int follow_automount(struct path *path, unsigned flags, bool *need_mntput) { struct vfsmount *mnt; + int err; if (!path->dentry->d_op || !path->dentry->d_op->d_automount) return -EREMOTE; @@ -942,22 +943,49 @@ static int follow_automount(struct path *path, unsigned flags, return -EREMOTE; return PTR_ERR(mnt); } + if (!mnt) /* mount collision */ 
return 0; + /* The new mount record should have at least 2 refs to prevent it being + * expired before we get a chance to add it + */ + BUG_ON(mnt_get_count(mnt) < 2); + if (mnt->mnt_sb == path->mnt->mnt_sb && mnt->mnt_root == path->dentry) { + mnt_clear_expiry(mnt); + mntput(mnt); mntput(mnt); return -ELOOP; } - dput(path->dentry); - if (*need_mntput) - mntput(path->mnt); - path->mnt = mnt; - path->dentry = dget(mnt->mnt_root); - *need_mntput = true; - return 0; + /* We need to add the mountpoint to the parent. The filesystem may + * have placed it on an expiry list, and so we need to make sure it + * won't be expired under us if do_add_mount() fails (do_add_mount() + * will eat a reference unconditionally). + */ + mntget(mnt); + err = do_add_mount(mnt, path, path->mnt->mnt_flags | MNT_SHRINKABLE); + switch (err) { + case -EBUSY: + /* Someone else made a mount here whilst we were busy */ + err = 0; + default: + mnt_clear_expiry(mnt); + mntput(mnt); + mntput(mnt); + return err; + case 0: + mntput(mnt); + dput(path->dentry); + if (*need_mntput) + mntput(path->mnt); + path->mnt = mnt; + path->dentry = dget(mnt->mnt_root); + *need_mntput = true; + return 0; + } } /* diff --git a/fs/namespace.c b/fs/namespace.c index d94ccd6ddaf..bfcb701f949 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1925,15 +1925,14 @@ static int do_new_mount(struct path *path, char *type, int flags, if (IS_ERR(mnt)) return PTR_ERR(mnt); - return do_add_mount(mnt, path, mnt_flags, NULL); + return do_add_mount(mnt, path, mnt_flags); } /* * add a mount into a namespace's mount tree - * - provide the option of adding the new mount to an expiration list + * - this unconditionally eats one of the caller's references to newmnt. */ -int do_add_mount(struct vfsmount *newmnt, struct path *path, - int mnt_flags, struct list_head *fslist) +int do_add_mount(struct vfsmount *newmnt, struct path *path, int mnt_flags) { int err; @@ -1963,9 +1962,6 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path, if ((err = graft_tree(newmnt, path))) goto unlock; - if (fslist) /* add to the specified expiration list */ - list_add_tail(&newmnt->mnt_expire, fslist); - up_write(&namespace_sem); return 0; @@ -1975,7 +1971,36 @@ unlock: return err; } -EXPORT_SYMBOL_GPL(do_add_mount); +/** + * mnt_set_expiry - Put a mount on an expiration list + * @mnt: The mount to list. + * @expiry_list: The list to add the mount to. 
+ */ +void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list) +{ + down_write(&namespace_sem); + br_write_lock(vfsmount_lock); + + list_add_tail(&mnt->mnt_expire, expiry_list); + + br_write_unlock(vfsmount_lock); + up_write(&namespace_sem); +} +EXPORT_SYMBOL(mnt_set_expiry); + +/* + * Remove a vfsmount from any expiration list it may be on + */ +void mnt_clear_expiry(struct vfsmount *mnt) +{ + if (!list_empty(&mnt->mnt_expire)) { + down_write(&namespace_sem); + br_write_lock(vfsmount_lock); + list_del_init(&mnt->mnt_expire); + br_write_unlock(vfsmount_lock); + up_write(&namespace_sem); + } +} /* * process a list of expirable mountpoints with the intent of discarding any diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index f3fbb1bf3f1..f32b8603dca 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -149,26 +149,10 @@ struct vfsmount *nfs_d_automount(struct path *path) if (IS_ERR(mnt)) goto out; - mntget(mnt); - err = do_add_mount(mnt, path, path->mnt->mnt_flags | MNT_SHRINKABLE, - &nfs_automount_list); - switch (err) { - case 0: - dprintk("%s: done, success\n", __func__); - schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout); - break; - case -EBUSY: - /* someone else made a mount here whilst we were busy */ - mntput(mnt); - dprintk("%s: done, collision\n", __func__); - mnt = NULL; - break; - default: - mntput(mnt); - dprintk("%s: done, error %d\n", __func__, err); - mnt = ERR_PTR(err); - break; - } + dprintk("%s: done, success\n", __func__); + mntget(mnt); /* prevent immediate expiration */ + mnt_set_expiry(mnt, &nfs_automount_list); + schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout); out: nfs_free_fattr(fattr); diff --git a/include/linux/mount.h b/include/linux/mount.h index 1869ea24a73..af4765ea8fd 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -110,12 +110,7 @@ extern struct vfsmount *vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data); -struct nameidata; - -struct path; -extern int do_add_mount(struct vfsmount *newmnt, struct path *path, - int mnt_flags, struct list_head *fslist); - +extern void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list); extern void mark_mounts_for_expiry(struct list_head *mounts); extern dev_t name_to_dev_t(char *name); -- cgit v1.2.3 From 2fe17c1075836b66678ed2a305fd09b6773883aa Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 14 Jan 2011 13:07:43 +0100 Subject: fallocate should be a file operation Currently all filesystems except XFS implement fallocate asynchronously, while XFS forced a commit. Both of these are suboptimal - in case of O_SYNC I/O we really want our allocation on disk, especially for the !KEEP_SIZE case where we actually grow the file with user-visible zeroes. On the other hand always commiting the transaction is a bad idea for fast-path uses of fallocate like for example in recent Samba versions. Given that block allocation is a data plane operation anyway change it from an inode operation to a file operation so that we have the file structure available that lets us check for O_SYNC. This also includes moving the code around for a few of the filesystems, and remove the already unnedded S_ISDIR checks given that we only wire up fallocate for regular files. 
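The conversion pattern is the same in every filesystem touched by this patch: the handler moves from inode_operations to file_operations, takes a struct file, and recovers the inode from file->f_path.dentry->d_inode, which also puts f_flags (and hence O_SYNC) within reach; the old S_ISDIR checks disappear because ->fallocate is only wired up for regular files. A rough, hypothetical sketch of that shape follows (the example_* helpers are invented for illustration and are not taken from any of the filesystems converted below):

    static long example_fallocate(struct file *file, int mode,
    			      loff_t offset, loff_t len)
    {
    	struct inode *inode = file->f_path.dentry->d_inode;
    	int ret;

    	/* Most implementations at this point support only FALLOC_FL_KEEP_SIZE. */
    	if (mode & ~FALLOC_FL_KEEP_SIZE)
    		return -EOPNOTSUPP;

    	ret = example_prealloc_blocks(inode, offset, len, mode);  /* hypothetical */
    	if (ret)
    		return ret;

    	/* Having the file available is what lets an O_SYNC allocation hit disk. */
    	if (file->f_flags & O_SYNC)
    		ret = example_commit_allocation(inode);           /* hypothetical */
    	return ret;
    }

    static const struct file_operations example_file_operations = {
    	/* read/write/mmap and friends elided */
    	.fallocate	= example_fallocate,
    };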
Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- Documentation/filesystems/Locking | 3 +- fs/btrfs/file.c | 113 +++++++++++++++++ fs/btrfs/inode.c | 111 ---------------- fs/ext4/ext4.h | 2 +- fs/ext4/extents.c | 9 +- fs/ext4/file.c | 2 +- fs/gfs2/file.c | 258 ++++++++++++++++++++++++++++++++++++++ fs/gfs2/ops_inode.c | 258 -------------------------------------- fs/ocfs2/file.c | 8 +- fs/open.c | 4 +- fs/xfs/linux-2.6/xfs_file.c | 56 +++++++++ fs/xfs/linux-2.6/xfs_iops.c | 60 --------- include/linux/fs.h | 4 +- 13 files changed, 440 insertions(+), 448 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 651d5237c15..4471a416c27 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -60,7 +60,6 @@ ata *); ssize_t (*listxattr) (struct dentry *, char *, size_t); int (*removexattr) (struct dentry *, const char *); void (*truncate_range)(struct inode *, loff_t, loff_t); - long (*fallocate)(struct inode *inode, int mode, loff_t offset, loff_t len); int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, u64 len); locking rules: @@ -88,7 +87,6 @@ getxattr: no listxattr: no removexattr: yes truncate_range: yes -fallocate: no fiemap: no Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_mutex on victim. @@ -437,6 +435,7 @@ prototypes: ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); int (*setlease)(struct file *, long, struct file_lock **); + long (*fallocate)(struct file *, int, loff_t, loff_t); }; locking rules: diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 66836d85763..a9e0a4eaf3d 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -1237,6 +1238,117 @@ static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) return 0; } +static long btrfs_fallocate(struct file *file, int mode, + loff_t offset, loff_t len) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct extent_state *cached_state = NULL; + u64 cur_offset; + u64 last_byte; + u64 alloc_start; + u64 alloc_end; + u64 alloc_hint = 0; + u64 locked_end; + u64 mask = BTRFS_I(inode)->root->sectorsize - 1; + struct extent_map *em; + int ret; + + alloc_start = offset & ~mask; + alloc_end = (offset + len + mask) & ~mask; + + /* We only support the FALLOC_FL_KEEP_SIZE mode */ + if (mode & ~FALLOC_FL_KEEP_SIZE) + return -EOPNOTSUPP; + + /* + * wait for ordered IO before we have any locks. We'll loop again + * below with the locks held. 
+ */ + btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start); + + mutex_lock(&inode->i_mutex); + ret = inode_newsize_ok(inode, alloc_end); + if (ret) + goto out; + + if (alloc_start > inode->i_size) { + ret = btrfs_cont_expand(inode, alloc_start); + if (ret) + goto out; + } + + ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start); + if (ret) + goto out; + + locked_end = alloc_end - 1; + while (1) { + struct btrfs_ordered_extent *ordered; + + /* the extent lock is ordered inside the running + * transaction + */ + lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start, + locked_end, 0, &cached_state, GFP_NOFS); + ordered = btrfs_lookup_first_ordered_extent(inode, + alloc_end - 1); + if (ordered && + ordered->file_offset + ordered->len > alloc_start && + ordered->file_offset < alloc_end) { + btrfs_put_ordered_extent(ordered); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, + alloc_start, locked_end, + &cached_state, GFP_NOFS); + /* + * we can't wait on the range with the transaction + * running or with the extent lock held + */ + btrfs_wait_ordered_range(inode, alloc_start, + alloc_end - alloc_start); + } else { + if (ordered) + btrfs_put_ordered_extent(ordered); + break; + } + } + + cur_offset = alloc_start; + while (1) { + em = btrfs_get_extent(inode, NULL, 0, cur_offset, + alloc_end - cur_offset, 0); + BUG_ON(IS_ERR(em) || !em); + last_byte = min(extent_map_end(em), alloc_end); + last_byte = (last_byte + mask) & ~mask; + if (em->block_start == EXTENT_MAP_HOLE || + (cur_offset >= inode->i_size && + !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { + ret = btrfs_prealloc_file_range(inode, mode, cur_offset, + last_byte - cur_offset, + 1 << inode->i_blkbits, + offset + len, + &alloc_hint); + if (ret < 0) { + free_extent_map(em); + break; + } + } + free_extent_map(em); + + cur_offset = last_byte; + if (cur_offset >= alloc_end) { + ret = 0; + break; + } + } + unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, + &cached_state, GFP_NOFS); + + btrfs_free_reserved_data_space(inode, alloc_end - alloc_start); +out: + mutex_unlock(&inode->i_mutex); + return ret; +} + const struct file_operations btrfs_file_operations = { .llseek = generic_file_llseek, .read = do_sync_read, @@ -1248,6 +1360,7 @@ const struct file_operations btrfs_file_operations = { .open = generic_file_open, .release = btrfs_release_file, .fsync = btrfs_sync_file, + .fallocate = btrfs_fallocate, .unlocked_ioctl = btrfs_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = btrfs_ioctl, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 64daf2acd0d..902afbf5081 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7098,116 +7098,6 @@ int btrfs_prealloc_file_range_trans(struct inode *inode, min_size, actual_len, alloc_hint, trans); } -static long btrfs_fallocate(struct inode *inode, int mode, - loff_t offset, loff_t len) -{ - struct extent_state *cached_state = NULL; - u64 cur_offset; - u64 last_byte; - u64 alloc_start; - u64 alloc_end; - u64 alloc_hint = 0; - u64 locked_end; - u64 mask = BTRFS_I(inode)->root->sectorsize - 1; - struct extent_map *em; - int ret; - - alloc_start = offset & ~mask; - alloc_end = (offset + len + mask) & ~mask; - - /* We only support the FALLOC_FL_KEEP_SIZE mode */ - if (mode & ~FALLOC_FL_KEEP_SIZE) - return -EOPNOTSUPP; - - /* - * wait for ordered IO before we have any locks. We'll loop again - * below with the locks held. 
- */ - btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start); - - mutex_lock(&inode->i_mutex); - ret = inode_newsize_ok(inode, alloc_end); - if (ret) - goto out; - - if (alloc_start > inode->i_size) { - ret = btrfs_cont_expand(inode, alloc_start); - if (ret) - goto out; - } - - ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start); - if (ret) - goto out; - - locked_end = alloc_end - 1; - while (1) { - struct btrfs_ordered_extent *ordered; - - /* the extent lock is ordered inside the running - * transaction - */ - lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start, - locked_end, 0, &cached_state, GFP_NOFS); - ordered = btrfs_lookup_first_ordered_extent(inode, - alloc_end - 1); - if (ordered && - ordered->file_offset + ordered->len > alloc_start && - ordered->file_offset < alloc_end) { - btrfs_put_ordered_extent(ordered); - unlock_extent_cached(&BTRFS_I(inode)->io_tree, - alloc_start, locked_end, - &cached_state, GFP_NOFS); - /* - * we can't wait on the range with the transaction - * running or with the extent lock held - */ - btrfs_wait_ordered_range(inode, alloc_start, - alloc_end - alloc_start); - } else { - if (ordered) - btrfs_put_ordered_extent(ordered); - break; - } - } - - cur_offset = alloc_start; - while (1) { - em = btrfs_get_extent(inode, NULL, 0, cur_offset, - alloc_end - cur_offset, 0); - BUG_ON(IS_ERR(em) || !em); - last_byte = min(extent_map_end(em), alloc_end); - last_byte = (last_byte + mask) & ~mask; - if (em->block_start == EXTENT_MAP_HOLE || - (cur_offset >= inode->i_size && - !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { - ret = btrfs_prealloc_file_range(inode, mode, cur_offset, - last_byte - cur_offset, - 1 << inode->i_blkbits, - offset + len, - &alloc_hint); - if (ret < 0) { - free_extent_map(em); - break; - } - } - free_extent_map(em); - - cur_offset = last_byte; - if (cur_offset >= alloc_end) { - ret = 0; - break; - } - } - unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, - &cached_state, GFP_NOFS); - - btrfs_free_reserved_data_space(inode, alloc_end - alloc_start); -out: - mutex_unlock(&inode->i_mutex); - return ret; -} - static int btrfs_set_page_dirty(struct page *page) { return __set_page_dirty_nobuffers(page); @@ -7310,7 +7200,6 @@ static const struct inode_operations btrfs_file_inode_operations = { .listxattr = btrfs_listxattr, .removexattr = btrfs_removexattr, .permission = btrfs_permission, - .fallocate = btrfs_fallocate, .fiemap = btrfs_fiemap, }; static const struct inode_operations btrfs_special_inode_operations = { diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 1de65f57203..0c8d97b56f3 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2065,7 +2065,7 @@ extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, extern void ext4_ext_truncate(struct inode *); extern void ext4_ext_init(struct super_block *); extern void ext4_ext_release(struct super_block *); -extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, +extern long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len); extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, ssize_t len); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 4bdd160854e..63a75810b7c 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3627,14 +3627,15 @@ static void ext4_falloc_update_inode(struct inode *inode, } /* - * preallocate space for a file. This implements ext4's fallocate inode + * preallocate space for a file. 
This implements ext4's fallocate file * operation, which gets called from sys_fallocate system call. * For block-mapped files, posix_fallocate should fall back to the method * of writing zeroes to the required new blocks (the same behavior which is * expected for file systems which do not support fallocate() system call). */ -long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) +long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { + struct inode *inode = file->f_path.dentry->d_inode; handle_t *handle; loff_t new_size; unsigned int max_blocks; @@ -3655,10 +3656,6 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) return -EOPNOTSUPP; - /* preallocation to directories is currently not supported */ - if (S_ISDIR(inode->i_mode)) - return -ENODEV; - map.m_lblk = offset >> blkbits; /* * We can't just convert len to max_blocks because diff --git a/fs/ext4/file.c b/fs/ext4/file.c index bb003dc9fff..2e8322c8aa8 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -210,6 +210,7 @@ const struct file_operations ext4_file_operations = { .fsync = ext4_sync_file, .splice_read = generic_file_splice_read, .splice_write = generic_file_splice_write, + .fallocate = ext4_fallocate, }; const struct inode_operations ext4_file_inode_operations = { @@ -223,7 +224,6 @@ const struct inode_operations ext4_file_inode_operations = { .removexattr = generic_removexattr, #endif .check_acl = ext4_check_acl, - .fallocate = ext4_fallocate, .fiemap = ext4_fiemap, }; diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index fca6689e12e..7cfdcb91336 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -19,6 +19,8 @@ #include #include #include +#include +#include #include #include #include @@ -610,6 +612,260 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov, return generic_file_aio_write(iocb, iov, nr_segs, pos); } +static void empty_write_end(struct page *page, unsigned from, + unsigned to) +{ + struct gfs2_inode *ip = GFS2_I(page->mapping->host); + + page_zero_new_buffers(page, from, to); + flush_dcache_page(page); + mark_page_accessed(page); + + if (!gfs2_is_writeback(ip)) + gfs2_page_add_databufs(ip, page, from, to); + + block_commit_write(page, from, to); +} + +static int write_empty_blocks(struct page *page, unsigned from, unsigned to) +{ + unsigned start, end, next; + struct buffer_head *bh, *head; + int error; + + if (!page_has_buffers(page)) { + error = __block_write_begin(page, from, to - from, gfs2_block_map); + if (unlikely(error)) + return error; + + empty_write_end(page, from, to); + return 0; + } + + bh = head = page_buffers(page); + next = end = 0; + while (next < from) { + next += bh->b_size; + bh = bh->b_this_page; + } + start = next; + do { + next += bh->b_size; + if (buffer_mapped(bh)) { + if (end) { + error = __block_write_begin(page, start, end - start, + gfs2_block_map); + if (unlikely(error)) + return error; + empty_write_end(page, start, end); + end = 0; + } + start = next; + } + else + end = next; + bh = bh->b_this_page; + } while (next < to); + + if (end) { + error = __block_write_begin(page, start, end - start, gfs2_block_map); + if (unlikely(error)) + return error; + empty_write_end(page, start, end); + } + + return 0; +} + +static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len, + int mode) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct buffer_head *dibh; + int error; + u64 start = offset >> PAGE_CACHE_SHIFT; + unsigned 
int start_offset = offset & ~PAGE_CACHE_MASK; + u64 end = (offset + len - 1) >> PAGE_CACHE_SHIFT; + pgoff_t curr; + struct page *page; + unsigned int end_offset = (offset + len) & ~PAGE_CACHE_MASK; + unsigned int from, to; + + if (!end_offset) + end_offset = PAGE_CACHE_SIZE; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (unlikely(error)) + goto out; + + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + + if (gfs2_is_stuffed(ip)) { + error = gfs2_unstuff_dinode(ip, NULL); + if (unlikely(error)) + goto out; + } + + curr = start; + offset = start << PAGE_CACHE_SHIFT; + from = start_offset; + to = PAGE_CACHE_SIZE; + while (curr <= end) { + page = grab_cache_page_write_begin(inode->i_mapping, curr, + AOP_FLAG_NOFS); + if (unlikely(!page)) { + error = -ENOMEM; + goto out; + } + + if (curr == end) + to = end_offset; + error = write_empty_blocks(page, from, to); + if (!error && offset + to > inode->i_size && + !(mode & FALLOC_FL_KEEP_SIZE)) { + i_size_write(inode, offset + to); + } + unlock_page(page); + page_cache_release(page); + if (error) + goto out; + curr++; + offset += PAGE_CACHE_SIZE; + from = 0; + } + + gfs2_dinode_out(ip, dibh->b_data); + mark_inode_dirty(inode); + + brelse(dibh); + +out: + return error; +} + +static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len, + unsigned int *data_blocks, unsigned int *ind_blocks) +{ + const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + unsigned int max_blocks = ip->i_alloc->al_rgd->rd_free_clone; + unsigned int tmp, max_data = max_blocks - 3 * (sdp->sd_max_height - 1); + + for (tmp = max_data; tmp > sdp->sd_diptrs;) { + tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs); + max_data -= tmp; + } + /* This calculation isn't the exact reverse of gfs2_write_calc_reserve, + so it might end up with fewer data blocks */ + if (max_data <= *data_blocks) + return; + *data_blocks = max_data; + *ind_blocks = max_blocks - max_data; + *len = ((loff_t)max_data - 3) << sdp->sd_sb.sb_bsize_shift; + if (*len > max) { + *len = max; + gfs2_write_calc_reserv(ip, max, data_blocks, ind_blocks); + } +} + +static long gfs2_fallocate(struct file *file, int mode, loff_t offset, + loff_t len) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_inode *ip = GFS2_I(inode); + unsigned int data_blocks = 0, ind_blocks = 0, rblocks; + loff_t bytes, max_bytes; + struct gfs2_alloc *al; + int error; + loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift; + next = (next + 1) << sdp->sd_sb.sb_bsize_shift; + + /* We only support the FALLOC_FL_KEEP_SIZE mode */ + if (mode & ~FALLOC_FL_KEEP_SIZE) + return -EOPNOTSUPP; + + offset = (offset >> sdp->sd_sb.sb_bsize_shift) << + sdp->sd_sb.sb_bsize_shift; + + len = next - offset; + bytes = sdp->sd_max_rg_data * sdp->sd_sb.sb_bsize / 2; + if (!bytes) + bytes = UINT_MAX; + + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh); + error = gfs2_glock_nq(&ip->i_gh); + if (unlikely(error)) + goto out_uninit; + + if (!gfs2_write_alloc_required(ip, offset, len)) + goto out_unlock; + + while (len > 0) { + if (len < bytes) + bytes = len; + al = gfs2_alloc_get(ip); + if (!al) { + error = -ENOMEM; + goto out_unlock; + } + + error = gfs2_quota_lock_check(ip); + if (error) + goto out_alloc_put; + +retry: + gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks); + + al->al_requested = data_blocks + ind_blocks; + error = gfs2_inplace_reserve(ip); + if (error) { + if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) { + bytes >>= 1; + goto retry; + } + goto out_qunlock; + } + 
max_bytes = bytes; + calc_max_reserv(ip, len, &max_bytes, &data_blocks, &ind_blocks); + al->al_requested = data_blocks + ind_blocks; + + rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA + + RES_RG_HDR + gfs2_rg_blocks(al); + if (gfs2_is_jdata(ip)) + rblocks += data_blocks ? data_blocks : 1; + + error = gfs2_trans_begin(sdp, rblocks, + PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize); + if (error) + goto out_trans_fail; + + error = fallocate_chunk(inode, offset, max_bytes, mode); + gfs2_trans_end(sdp); + + if (error) + goto out_trans_fail; + + len -= max_bytes; + offset += max_bytes; + gfs2_inplace_release(ip); + gfs2_quota_unlock(ip); + gfs2_alloc_put(ip); + } + goto out_unlock; + +out_trans_fail: + gfs2_inplace_release(ip); +out_qunlock: + gfs2_quota_unlock(ip); +out_alloc_put: + gfs2_alloc_put(ip); +out_unlock: + gfs2_glock_dq(&ip->i_gh); +out_uninit: + gfs2_holder_uninit(&ip->i_gh); + return error; +} + #ifdef CONFIG_GFS2_FS_LOCKING_DLM /** @@ -765,6 +1021,7 @@ const struct file_operations gfs2_file_fops = { .splice_read = generic_file_splice_read, .splice_write = generic_file_splice_write, .setlease = gfs2_setlease, + .fallocate = gfs2_fallocate, }; const struct file_operations gfs2_dir_fops = { @@ -794,6 +1051,7 @@ const struct file_operations gfs2_file_fops_nolock = { .splice_read = generic_file_splice_read, .splice_write = generic_file_splice_write, .setlease = generic_setlease, + .fallocate = gfs2_fallocate, }; const struct file_operations gfs2_dir_fops_nolock = { diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c index c09528c07f3..d8b26ac2e20 100644 --- a/fs/gfs2/ops_inode.c +++ b/fs/gfs2/ops_inode.c @@ -18,8 +18,6 @@ #include #include #include -#include -#include #include #include "gfs2.h" @@ -1257,261 +1255,6 @@ static int gfs2_removexattr(struct dentry *dentry, const char *name) return ret; } -static void empty_write_end(struct page *page, unsigned from, - unsigned to) -{ - struct gfs2_inode *ip = GFS2_I(page->mapping->host); - - page_zero_new_buffers(page, from, to); - flush_dcache_page(page); - mark_page_accessed(page); - - if (!gfs2_is_writeback(ip)) - gfs2_page_add_databufs(ip, page, from, to); - - block_commit_write(page, from, to); -} - - -static int write_empty_blocks(struct page *page, unsigned from, unsigned to) -{ - unsigned start, end, next; - struct buffer_head *bh, *head; - int error; - - if (!page_has_buffers(page)) { - error = __block_write_begin(page, from, to - from, gfs2_block_map); - if (unlikely(error)) - return error; - - empty_write_end(page, from, to); - return 0; - } - - bh = head = page_buffers(page); - next = end = 0; - while (next < from) { - next += bh->b_size; - bh = bh->b_this_page; - } - start = next; - do { - next += bh->b_size; - if (buffer_mapped(bh)) { - if (end) { - error = __block_write_begin(page, start, end - start, - gfs2_block_map); - if (unlikely(error)) - return error; - empty_write_end(page, start, end); - end = 0; - } - start = next; - } - else - end = next; - bh = bh->b_this_page; - } while (next < to); - - if (end) { - error = __block_write_begin(page, start, end - start, gfs2_block_map); - if (unlikely(error)) - return error; - empty_write_end(page, start, end); - } - - return 0; -} - -static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len, - int mode) -{ - struct gfs2_inode *ip = GFS2_I(inode); - struct buffer_head *dibh; - int error; - u64 start = offset >> PAGE_CACHE_SHIFT; - unsigned int start_offset = offset & ~PAGE_CACHE_MASK; - u64 end = (offset + len - 1) >> PAGE_CACHE_SHIFT; - pgoff_t curr; - struct 
page *page; - unsigned int end_offset = (offset + len) & ~PAGE_CACHE_MASK; - unsigned int from, to; - - if (!end_offset) - end_offset = PAGE_CACHE_SIZE; - - error = gfs2_meta_inode_buffer(ip, &dibh); - if (unlikely(error)) - goto out; - - gfs2_trans_add_bh(ip->i_gl, dibh, 1); - - if (gfs2_is_stuffed(ip)) { - error = gfs2_unstuff_dinode(ip, NULL); - if (unlikely(error)) - goto out; - } - - curr = start; - offset = start << PAGE_CACHE_SHIFT; - from = start_offset; - to = PAGE_CACHE_SIZE; - while (curr <= end) { - page = grab_cache_page_write_begin(inode->i_mapping, curr, - AOP_FLAG_NOFS); - if (unlikely(!page)) { - error = -ENOMEM; - goto out; - } - - if (curr == end) - to = end_offset; - error = write_empty_blocks(page, from, to); - if (!error && offset + to > inode->i_size && - !(mode & FALLOC_FL_KEEP_SIZE)) { - i_size_write(inode, offset + to); - } - unlock_page(page); - page_cache_release(page); - if (error) - goto out; - curr++; - offset += PAGE_CACHE_SIZE; - from = 0; - } - - gfs2_dinode_out(ip, dibh->b_data); - mark_inode_dirty(inode); - - brelse(dibh); - -out: - return error; -} - -static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len, - unsigned int *data_blocks, unsigned int *ind_blocks) -{ - const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - unsigned int max_blocks = ip->i_alloc->al_rgd->rd_free_clone; - unsigned int tmp, max_data = max_blocks - 3 * (sdp->sd_max_height - 1); - - for (tmp = max_data; tmp > sdp->sd_diptrs;) { - tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs); - max_data -= tmp; - } - /* This calculation isn't the exact reverse of gfs2_write_calc_reserve, - so it might end up with fewer data blocks */ - if (max_data <= *data_blocks) - return; - *data_blocks = max_data; - *ind_blocks = max_blocks - max_data; - *len = ((loff_t)max_data - 3) << sdp->sd_sb.sb_bsize_shift; - if (*len > max) { - *len = max; - gfs2_write_calc_reserv(ip, max, data_blocks, ind_blocks); - } -} - -static long gfs2_fallocate(struct inode *inode, int mode, loff_t offset, - loff_t len) -{ - struct gfs2_sbd *sdp = GFS2_SB(inode); - struct gfs2_inode *ip = GFS2_I(inode); - unsigned int data_blocks = 0, ind_blocks = 0, rblocks; - loff_t bytes, max_bytes; - struct gfs2_alloc *al; - int error; - loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift; - next = (next + 1) << sdp->sd_sb.sb_bsize_shift; - - /* We only support the FALLOC_FL_KEEP_SIZE mode */ - if (mode & ~FALLOC_FL_KEEP_SIZE) - return -EOPNOTSUPP; - - offset = (offset >> sdp->sd_sb.sb_bsize_shift) << - sdp->sd_sb.sb_bsize_shift; - - len = next - offset; - bytes = sdp->sd_max_rg_data * sdp->sd_sb.sb_bsize / 2; - if (!bytes) - bytes = UINT_MAX; - - gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh); - error = gfs2_glock_nq(&ip->i_gh); - if (unlikely(error)) - goto out_uninit; - - if (!gfs2_write_alloc_required(ip, offset, len)) - goto out_unlock; - - while (len > 0) { - if (len < bytes) - bytes = len; - al = gfs2_alloc_get(ip); - if (!al) { - error = -ENOMEM; - goto out_unlock; - } - - error = gfs2_quota_lock_check(ip); - if (error) - goto out_alloc_put; - -retry: - gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks); - - al->al_requested = data_blocks + ind_blocks; - error = gfs2_inplace_reserve(ip); - if (error) { - if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) { - bytes >>= 1; - goto retry; - } - goto out_qunlock; - } - max_bytes = bytes; - calc_max_reserv(ip, len, &max_bytes, &data_blocks, &ind_blocks); - al->al_requested = data_blocks + ind_blocks; - - rblocks = RES_DINODE + ind_blocks + 
RES_STATFS + RES_QUOTA + - RES_RG_HDR + gfs2_rg_blocks(al); - if (gfs2_is_jdata(ip)) - rblocks += data_blocks ? data_blocks : 1; - - error = gfs2_trans_begin(sdp, rblocks, - PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize); - if (error) - goto out_trans_fail; - - error = fallocate_chunk(inode, offset, max_bytes, mode); - gfs2_trans_end(sdp); - - if (error) - goto out_trans_fail; - - len -= max_bytes; - offset += max_bytes; - gfs2_inplace_release(ip); - gfs2_quota_unlock(ip); - gfs2_alloc_put(ip); - } - goto out_unlock; - -out_trans_fail: - gfs2_inplace_release(ip); -out_qunlock: - gfs2_quota_unlock(ip); -out_alloc_put: - gfs2_alloc_put(ip); -out_unlock: - gfs2_glock_dq(&ip->i_gh); -out_uninit: - gfs2_holder_uninit(&ip->i_gh); - return error; -} - - static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { @@ -1562,7 +1305,6 @@ const struct inode_operations gfs2_file_iops = { .getxattr = gfs2_getxattr, .listxattr = gfs2_listxattr, .removexattr = gfs2_removexattr, - .fallocate = gfs2_fallocate, .fiemap = gfs2_fiemap, }; diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index cf254ce8c94..a6651956482 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1989,9 +1989,10 @@ int ocfs2_change_file_space(struct file *file, unsigned int cmd, return __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0); } -static long ocfs2_fallocate(struct inode *inode, int mode, loff_t offset, +static long ocfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { + struct inode *inode = file->f_path.dentry->d_inode; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_space_resv sr; int change_size = 1; @@ -2002,9 +2003,6 @@ static long ocfs2_fallocate(struct inode *inode, int mode, loff_t offset, if (!ocfs2_writes_unwritten_extents(osb)) return -EOPNOTSUPP; - if (S_ISDIR(inode->i_mode)) - return -ENODEV; - if (mode & FALLOC_FL_KEEP_SIZE) change_size = 0; @@ -2612,7 +2610,6 @@ const struct inode_operations ocfs2_file_iops = { .getxattr = generic_getxattr, .listxattr = ocfs2_listxattr, .removexattr = generic_removexattr, - .fallocate = ocfs2_fallocate, .fiemap = ocfs2_fiemap, }; @@ -2644,6 +2641,7 @@ const struct file_operations ocfs2_fops = { .flock = ocfs2_flock, .splice_read = ocfs2_file_splice_read, .splice_write = ocfs2_file_splice_write, + .fallocate = ocfs2_fallocate, }; const struct file_operations ocfs2_dops = { diff --git a/fs/open.c b/fs/open.c index 5b6ef7e2859..e52389e1f05 100644 --- a/fs/open.c +++ b/fs/open.c @@ -255,10 +255,10 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0)) return -EFBIG; - if (!inode->i_op->fallocate) + if (!file->f_op->fallocate) return -EOPNOTSUPP; - return inode->i_op->fallocate(inode, mode, offset, len); + return file->f_op->fallocate(file, mode, offset, len); } SYSCALL_DEFINE(fallocate)(int fd, int mode, loff_t offset, loff_t len) diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index ef51eb43e13..a55c1b46b21 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -37,6 +37,7 @@ #include "xfs_trace.h" #include +#include static const struct vm_operations_struct xfs_file_vm_ops; @@ -882,6 +883,60 @@ out_unlock: return ret; } +STATIC long +xfs_file_fallocate( + struct file *file, + int mode, + loff_t offset, + loff_t len) +{ + struct inode *inode = file->f_path.dentry->d_inode; + long error; + loff_t new_size = 0; + xfs_flock64_t bf; + xfs_inode_t *ip = XFS_I(inode); + int 
cmd = XFS_IOC_RESVSP; + + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + return -EOPNOTSUPP; + + bf.l_whence = 0; + bf.l_start = offset; + bf.l_len = len; + + xfs_ilock(ip, XFS_IOLOCK_EXCL); + + if (mode & FALLOC_FL_PUNCH_HOLE) + cmd = XFS_IOC_UNRESVSP; + + /* check the new inode size is valid before allocating */ + if (!(mode & FALLOC_FL_KEEP_SIZE) && + offset + len > i_size_read(inode)) { + new_size = offset + len; + error = inode_newsize_ok(inode, new_size); + if (error) + goto out_unlock; + } + + error = -xfs_change_file_space(ip, cmd, &bf, 0, XFS_ATTR_NOLOCK); + if (error) + goto out_unlock; + + /* Change file size if needed */ + if (new_size) { + struct iattr iattr; + + iattr.ia_valid = ATTR_SIZE; + iattr.ia_size = new_size; + error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK); + } + +out_unlock: + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return error; +} + + STATIC int xfs_file_open( struct inode *inode, @@ -1000,6 +1055,7 @@ const struct file_operations xfs_file_operations = { .open = xfs_file_open, .release = xfs_file_release, .fsync = xfs_file_fsync, + .fallocate = xfs_file_fallocate, }; const struct file_operations xfs_dir_file_operations = { diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index a4ecc2188a0..bd5727852fd 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -46,7 +46,6 @@ #include #include #include -#include #include #include @@ -505,64 +504,6 @@ xfs_vn_setattr( return -xfs_setattr(XFS_I(dentry->d_inode), iattr, 0); } -STATIC long -xfs_vn_fallocate( - struct inode *inode, - int mode, - loff_t offset, - loff_t len) -{ - long error; - loff_t new_size = 0; - xfs_flock64_t bf; - xfs_inode_t *ip = XFS_I(inode); - int cmd = XFS_IOC_RESVSP; - - if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) - return -EOPNOTSUPP; - - /* preallocation on directories not yet supported */ - error = -ENODEV; - if (S_ISDIR(inode->i_mode)) - goto out_error; - - bf.l_whence = 0; - bf.l_start = offset; - bf.l_len = len; - - xfs_ilock(ip, XFS_IOLOCK_EXCL); - - if (mode & FALLOC_FL_PUNCH_HOLE) - cmd = XFS_IOC_UNRESVSP; - - /* check the new inode size is valid before allocating */ - if (!(mode & FALLOC_FL_KEEP_SIZE) && - offset + len > i_size_read(inode)) { - new_size = offset + len; - error = inode_newsize_ok(inode, new_size); - if (error) - goto out_unlock; - } - - error = -xfs_change_file_space(ip, cmd, &bf, 0, XFS_ATTR_NOLOCK); - if (error) - goto out_unlock; - - /* Change file size if needed */ - if (new_size) { - struct iattr iattr; - - iattr.ia_valid = ATTR_SIZE; - iattr.ia_size = new_size; - error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK); - } - -out_unlock: - xfs_iunlock(ip, XFS_IOLOCK_EXCL); -out_error: - return error; -} - #define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) /* @@ -656,7 +597,6 @@ static const struct inode_operations xfs_inode_operations = { .getxattr = generic_getxattr, .removexattr = generic_removexattr, .listxattr = xfs_vn_listxattr, - .fallocate = xfs_vn_fallocate, .fiemap = xfs_vn_fiemap, }; diff --git a/include/linux/fs.h b/include/linux/fs.h index 177b4ddea41..09b5bd6a7c6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1552,6 +1552,8 @@ struct file_operations { ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); int (*setlease)(struct file *, long, struct file_lock **); + long (*fallocate)(struct file *file, int mode, loff_t offset, + 
loff_t len); }; #define IPERM_FLAG_RCU 0x0001 @@ -1582,8 +1584,6 @@ struct inode_operations { ssize_t (*listxattr) (struct dentry *, char *, size_t); int (*removexattr) (struct dentry *, const char *); void (*truncate_range)(struct inode *, loff_t, loff_t); - long (*fallocate)(struct inode *inode, int mode, loff_t offset, - loff_t len); int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, u64 len); } ____cacheline_aligned; -- cgit v1.2.3
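Nothing changes for userspace: fallocate(2) still enters do_fallocate(), which now simply dispatches through file->f_op->fallocate() instead of the inode operation. A small, self-contained user program exercising the call, shown only to illustrate the path and the EOPNOTSUPP fallback and not part of the patch, might look like:

    /* build: gcc -D_GNU_SOURCE -o prealloc prealloc.c */
    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>
    #include <linux/falloc.h>

    int main(int argc, char **argv)
    {
    	int fd;

    	if (argc != 2) {
    		fprintf(stderr, "usage: %s <file>\n", argv[0]);
    		return 1;
    	}

    	fd = open(argv[1], O_CREAT | O_WRONLY, 0644);
    	if (fd < 0) {
    		perror("open");
    		return 1;
    	}

    	/* Preallocate 16MB without changing the visible file size. */
    	if (fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 16 * 1024 * 1024) < 0)
    		perror("fallocate");	/* EOPNOTSUPP if the fs has no handler */

    	close(fd);
    	return 0;
    }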