From b93d87c19821ba7d3ee11557403d782e541071ad Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 7 Nov 2011 16:37:57 -0500 Subject: nfsd4: fix lockowner matching Lockowners are looked up by file as well as by owner, but we were forgetting to do a comparison on the file. This could cause an incorrect result from lockt. (Note looking up the inode from the lockowner is pretty awkward here. The data structures need fixing.) Cc: stable@kernel.org Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 47e94e33a97..5abced7a740 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3809,16 +3809,29 @@ nevermind: deny->ld_type = NFS4_WRITE_LT; } +static bool same_lockowner_ino(struct nfs4_lockowner *lo, struct inode *inode, clientid_t *clid, struct xdr_netobj *owner) +{ + struct nfs4_ol_stateid *lst; + + if (!same_owner_str(&lo->lo_owner, owner, clid)) + return false; + lst = list_first_entry(&lo->lo_owner.so_stateids, + struct nfs4_ol_stateid, st_perstateowner); + return lst->st_file->fi_inode == inode; +} + static struct nfs4_lockowner * find_lockowner_str(struct inode *inode, clientid_t *clid, struct xdr_netobj *owner) { unsigned int hashval = lock_ownerstr_hashval(inode, clid->cl_id, owner); + struct nfs4_lockowner *lo; struct nfs4_stateowner *op; list_for_each_entry(op, &lock_ownerstr_hashtbl[hashval], so_strhash) { - if (same_owner_str(op, owner, clid)) - return lockowner(op); + lo = lockowner(op); + if (same_lockowner_ino(lo, inode, clid, owner)) + return lo; } return NULL; } -- cgit v1.2.3 From 684e563858018d27acb8f00e30c026215bbd0ffb Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 4 Nov 2011 17:08:10 -0400 Subject: nfsd4: cleanup lock clientid handling in sessions case I'd rather the "ignore clientid in sessions case" rule be enforced in just one place. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 5abced7a740..9354ddebbee 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3946,10 +3946,15 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * lock stateid. */ struct nfs4_ol_stateid *open_stp = NULL; - + + if (nfsd4_has_session(cstate)) + /* See rfc 5661 18.10.3: given clientid is ignored: */ + memcpy(&lock->v.new.clientid, + &cstate->session->se_client->cl_clientid, + sizeof(clientid_t)); + status = nfserr_stale_clientid; - if (!nfsd4_has_session(cstate) && - STALE_CLIENTID(&lock->lk_new_clientid)) + if (STALE_CLIENTID(&lock->lk_new_clientid)) goto out; /* validate and update open stateid and open seqid */ @@ -3961,8 +3966,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; open_sop = openowner(open_stp->st_stateowner); status = nfserr_bad_stateid; - if (!nfsd4_has_session(cstate) && - !same_clid(&open_sop->oo_owner.so_client->cl_clientid, + if (!same_clid(&open_sop->oo_owner.so_client->cl_clientid, &lock->v.new.clientid)) goto out; /* create lockowner and lock stateid */ -- cgit v1.2.3 From 64a284d07c7d84299a90826950079a8ef11e8204 Mon Sep 17 00:00:00 2001 From: "J. 
Bruce Fields" Date: Thu, 20 Oct 2011 06:57:46 -0400 Subject: nfsd4: maintain one seqid stream per (lockowner, file) Instead of creating a new lockowner and stateid for every open_to_lockowner call, reuse the existing lockowner if it exists. Reported-by: Trond Myklebust Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 58 +++++++++++++++++++++++++++++++++++------------------ 1 file changed, 38 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 9354ddebbee..d09d5242c08 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3905,6 +3905,37 @@ static void get_lock_access(struct nfs4_ol_stateid *lock_stp, u32 access) __set_bit(access, &lock_stp->st_access_bmap); } +__be32 lookup_or_create_lock_state(struct nfsd4_compound_state *cstate, struct nfs4_ol_stateid *ost, struct nfsd4_lock *lock, struct nfs4_ol_stateid **lst, bool *new) +{ + struct nfs4_file *fi = ost->st_file; + struct nfs4_openowner *oo = openowner(ost->st_stateowner); + struct nfs4_client *cl = oo->oo_owner.so_client; + struct nfs4_lockowner *lo; + unsigned int strhashval; + + lo = find_lockowner_str(fi->fi_inode, &cl->cl_clientid, &lock->v.new.owner); + if (lo) { + if (!cstate->minorversion) + return nfserr_bad_seqid; + /* XXX: a lockowner always has exactly one stateid: */ + *lst = list_first_entry(&lo->lo_owner.so_stateids, + struct nfs4_ol_stateid, st_perstateowner); + return nfs_ok; + } + strhashval = lock_ownerstr_hashval(fi->fi_inode, cl->cl_clientid.cl_id, + &lock->v.new.owner); + lo = alloc_init_lock_stateowner(strhashval, cl, ost, lock); + if (lo == NULL) + return nfserr_jukebox; + *lst = alloc_init_lock_stateid(lo, fi, ost); + if (*lst == NULL) { + release_lockowner(lo); + return nfserr_jukebox; + } + *new = true; + return nfs_ok; +} + /* * LOCK operation */ @@ -3920,7 +3951,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct file_lock file_lock; struct file_lock conflock; __be32 status = 0; - unsigned int strhashval; + bool new_state = false; int lkflg; int err; @@ -3969,21 +4000,9 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (!same_clid(&open_sop->oo_owner.so_client->cl_clientid, &lock->v.new.clientid)) goto out; - /* create lockowner and lock stateid */ - fp = open_stp->st_file; - strhashval = lock_ownerstr_hashval(fp->fi_inode, - open_sop->oo_owner.so_client->cl_clientid.cl_id, - &lock->v.new.owner); - /* XXX: Do we need to check for duplicate stateowners on - * the same file, or should they just be allowed (and - * create new stateids)? 
*/ - status = nfserr_jukebox; - lock_sop = alloc_init_lock_stateowner(strhashval, - open_sop->oo_owner.so_client, open_stp, lock); - if (lock_sop == NULL) - goto out; - lock_stp = alloc_init_lock_stateid(lock_sop, fp, open_stp); - if (lock_stp == NULL) + status = lookup_or_create_lock_state(cstate, open_stp, lock, + &lock_stp, &new_state); + if (status) goto out; } else { /* lock (lock owner + lock stateid) already exists */ @@ -3993,10 +4012,9 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, NFS4_LOCK_STID, &lock_stp); if (status) goto out; - lock_sop = lockowner(lock_stp->st_stateowner); - fp = lock_stp->st_file; } - /* lock_sop and lock_stp have been created or found */ + lock_sop = lockowner(lock_stp->st_stateowner); + fp = lock_stp->st_file; lkflg = setlkflg(lock->lk_type); status = nfs4_check_openmode(lock_stp, lkflg); @@ -4071,7 +4089,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, break; } out: - if (status && lock->lk_is_new && lock_sop) + if (status && new_state) release_lockowner(lock_sop); if (!cstate->replay_owner) nfs4_unlock_state(); -- cgit v1.2.3 From 65178db42a02c7984f711614546e97e9952d8e01 Mon Sep 17 00:00:00 2001 From: Bryan Schumaker Date: Tue, 1 Nov 2011 13:35:21 -0400 Subject: NFSD: Added fault injection Fault injection on the NFS server makes it easier to test the client's state manager and recovery threads. Simulating errors on the server is easier than finding the right conditions that cause them naturally. This patch uses debugfs to add a simple framework for fault injection to the server. This framework is a config option, and can be enabled through CONFIG_NFSD_FAULT_INJECTION. Assuming you have debugfs mounted to /sys/debug, a set of files will be created in /sys/debug/nfsd/. Writing to any of these files will cause the corresponding action and write a log entry to dmesg. Signed-off-by: Bryan Schumaker Signed-off-by: J. Bruce Fields --- fs/nfsd/Kconfig | 10 +++++ fs/nfsd/Makefile | 1 + fs/nfsd/fault_inject.c | 91 ++++++++++++++++++++++++++++++++++++++ fs/nfsd/fault_inject.h | 28 ++++++++++++ fs/nfsd/nfs4state.c | 115 +++++++++++++++++++++++++++++++++++++++++++++++++ fs/nfsd/nfsctl.c | 7 +++ 6 files changed, 252 insertions(+) create mode 100644 fs/nfsd/fault_inject.c create mode 100644 fs/nfsd/fault_inject.h (limited to 'fs') diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index 10e6366608f..8df1ea4a6ff 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -80,3 +80,13 @@ config NFSD_V4 available from http://linux-nfs.org/. If unsure, say N. + +config NFSD_FAULT_INJECTION + bool "NFS server manual fault injection" + depends on NFSD_V4 && DEBUG_KERNEL + help + This option enables support for manually injecting faults + into the NFS server. This is intended to be used for + testing error recovery on the NFS client. + + If unsure, say N. 
diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile index 9b118ee2019..af32ef06b4f 100644 --- a/fs/nfsd/Makefile +++ b/fs/nfsd/Makefile @@ -6,6 +6,7 @@ obj-$(CONFIG_NFSD) += nfsd.o nfsd-y := nfssvc.o nfsctl.o nfsproc.o nfsfh.o vfs.o \ export.o auth.o lockd.o nfscache.o nfsxdr.o stats.o +nfsd-$(CONFIG_NFSD_FAULT_INJECTION) += fault_inject.o nfsd-$(CONFIG_NFSD_V2_ACL) += nfs2acl.o nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs3xdr.o nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o diff --git a/fs/nfsd/fault_inject.c b/fs/nfsd/fault_inject.c new file mode 100644 index 00000000000..ce7f0758d84 --- /dev/null +++ b/fs/nfsd/fault_inject.c @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2011 Bryan Schumaker + * + * Uses debugfs to create fault injection points for client testing + */ + +#include +#include +#include +#include + +#include "state.h" +#include "fault_inject.h" + +struct nfsd_fault_inject_op { + char *file; + void (*func)(u64); +}; + +static struct nfsd_fault_inject_op inject_ops[] = { + { + .file = "forget_clients", + .func = nfsd_forget_clients, + }, + { + .file = "forget_locks", + .func = nfsd_forget_locks, + }, + { + .file = "forget_openowners", + .func = nfsd_forget_openowners, + }, + { + .file = "forget_delegations", + .func = nfsd_forget_delegations, + }, + { + .file = "recall_delegations", + .func = nfsd_recall_delegations, + }, +}; + +static long int NUM_INJECT_OPS = sizeof(inject_ops) / sizeof(struct nfsd_fault_inject_op); +static struct dentry *debug_dir; + +static int nfsd_inject_set(void *op_ptr, u64 val) +{ + struct nfsd_fault_inject_op *op = op_ptr; + + if (val == 0) + printk(KERN_INFO "NFSD Fault Injection: %s (all)", op->file); + else + printk(KERN_INFO "NFSD Fault Injection: %s (n = %llu)", op->file, val); + + op->func(val); + return 0; +} + +static int nfsd_inject_get(void *data, u64 *val) +{ + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(fops_nfsd, nfsd_inject_get, nfsd_inject_set, "%llu\n"); + +void nfsd_fault_inject_cleanup(void) +{ + debugfs_remove_recursive(debug_dir); +} + +int nfsd_fault_inject_init(void) +{ + unsigned int i; + struct nfsd_fault_inject_op *op; + mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; + + debug_dir = debugfs_create_dir("nfsd", NULL); + if (!debug_dir) + goto fail; + + for (i = 0; i < NUM_INJECT_OPS; i++) { + op = &inject_ops[i]; + if (!debugfs_create_file(op->file, mode, debug_dir, op, &fops_nfsd)) + goto fail; + } + return 0; + +fail: + nfsd_fault_inject_cleanup(); + return -ENOMEM; +} diff --git a/fs/nfsd/fault_inject.h b/fs/nfsd/fault_inject.h new file mode 100644 index 00000000000..90bd0570956 --- /dev/null +++ b/fs/nfsd/fault_inject.h @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2011 Bryan Schumaker + * + * Function definitions for fault injection + */ + +#ifndef LINUX_NFSD_FAULT_INJECT_H +#define LINUX_NFSD_FAULT_INJECT_H + +#ifdef CONFIG_NFSD_FAULT_INJECTION +int nfsd_fault_inject_init(void); +void nfsd_fault_inject_cleanup(void); +void nfsd_forget_clients(u64); +void nfsd_forget_locks(u64); +void nfsd_forget_openowners(u64); +void nfsd_forget_delegations(u64); +void nfsd_recall_delegations(u64); +#else /* CONFIG_NFSD_FAULT_INJECTION */ +static inline int nfsd_fault_inject_init(void) { return 0; } +static inline void nfsd_fault_inject_cleanup(void) {} +static inline void nfsd_forget_clients(u64 num) {} +static inline void nfsd_forget_locks(u64 num) {} +static inline void nfsd_forget_openowners(u64 num) {} +static inline void nfsd_forget_delegations(u64 num) {} +static inline void nfsd_recall_delegations(u64 num) {} +#endif /* CONFIG_NFSD_FAULT_INJECTION */ + +#endif /* 
LINUX_NFSD_FAULT_INJECT_H */ diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index d09d5242c08..a511eff0569 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4429,6 +4429,121 @@ nfs4_check_open_reclaim(clientid_t *clid) return nfs4_find_reclaim_client(clid) ? nfs_ok : nfserr_reclaim_bad; } +#ifdef CONFIG_NFSD_FAULT_INJECTION + +void nfsd_forget_clients(u64 num) +{ + struct nfs4_client *clp, *next; + int count = 0; + + nfs4_lock_state(); + list_for_each_entry_safe(clp, next, &client_lru, cl_lru) { + nfsd4_remove_clid_dir(clp); + expire_client(clp); + if (++count == num) + break; + } + nfs4_unlock_state(); + + printk(KERN_INFO "NFSD: Forgot %d clients", count); +} + +static void release_lockowner_sop(struct nfs4_stateowner *sop) +{ + release_lockowner(lockowner(sop)); +} + +static void release_openowner_sop(struct nfs4_stateowner *sop) +{ + release_openowner(openowner(sop)); +} + +static int nfsd_release_n_owners(u64 num, + struct list_head hashtbl[], + unsigned int hashtbl_size, + void (*release_sop)(struct nfs4_stateowner *)) +{ + int i, count = 0; + struct nfs4_stateowner *sop, *next; + + for (i = 0; i < hashtbl_size; i++) { + list_for_each_entry_safe(sop, next, &hashtbl[i], so_strhash) { + release_sop(sop); + if (++count == num) + return count; + } + } + return count; +} + +void nfsd_forget_locks(u64 num) +{ + int count; + + nfs4_lock_state(); + count = nfsd_release_n_owners(num, lock_ownerstr_hashtbl, + LOCK_HASH_SIZE, release_lockowner_sop); + nfs4_unlock_state(); + + printk(KERN_INFO "NFSD: Forgot %d locks", count); +} + +void nfsd_forget_openowners(u64 num) +{ + int count; + + nfs4_lock_state(); + count = nfsd_release_n_owners(num, open_ownerstr_hashtbl, + OPEN_OWNER_HASH_SIZE, release_openowner_sop); + nfs4_unlock_state(); + + printk(KERN_INFO "NFSD: Forgot %d open owners", count); +} + +int nfsd_process_n_delegations(u64 num, void (*deleg_func)(struct nfs4_delegation *)) +{ + int i, count = 0; + struct nfs4_file *fp; + struct nfs4_delegation *dp, *next; + + for (i = 0; i < FILE_HASH_SIZE; i++) { + list_for_each_entry(fp, &file_hashtbl[i], fi_hash) { + list_for_each_entry_safe(dp, next, &fp->fi_delegations, dl_perfile) { + deleg_func(dp); + if (++count == num) + return count; + } + } + } + return count; +} + +void nfsd_forget_delegations(u64 num) +{ + unsigned int count; + + nfs4_lock_state(); + count = nfsd_process_n_delegations(num, unhash_delegation); + nfs4_unlock_state(); + + printk(KERN_INFO "NFSD: Forgot %d delegations", count); +} + +void nfsd_recall_delegations(u64 num) +{ + unsigned int count; + + nfs4_lock_state(); + spin_lock(&recall_lock); + count = nfsd_process_n_delegations(num, nfsd_break_one_deleg); + spin_unlock(&recall_lock); + nfs4_unlock_state(); + + printk(KERN_INFO "NFSD: Recalled %d delegations", count); +} + +#endif /* CONFIG_NFSD_FAULT_INJECTION */ + /* initialization to perform at module load time: */ int diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index c45a2ea4a09..b2e8093ebc2 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -18,6 +18,7 @@ #include "idmap.h" #include "nfsd.h" #include "cache.h" +#include "fault_inject.h" /* * We have a single directory with several nodes in it. 
@@ -1131,6 +1132,9 @@ static int __init init_nfsd(void) retval = nfs4_state_init(); /* nfs4 locking state */ if (retval) return retval; + retval = nfsd_fault_inject_init(); /* nfsd fault injection controls */ + if (retval) + goto out_free_slabs; nfsd_stat_init(); /* Statistics */ retval = nfsd_reply_cache_init(); if (retval) @@ -1161,6 +1165,8 @@ out_free_cache: nfsd_reply_cache_shutdown(); out_free_stat: nfsd_stat_shutdown(); + nfsd_fault_inject_cleanup(); +out_free_slabs: nfsd4_free_slabs(); return retval; } @@ -1175,6 +1181,7 @@ static void __exit exit_nfsd(void) nfsd_lockd_shutdown(); nfsd_idmap_shutdown(); nfsd4_free_slabs(); + nfsd_fault_inject_cleanup(); unregister_filesystem(&nfsd_fs_type); } -- cgit v1.2.3 From 72083396074035ffa5cf81b6bb3e55f1d615badf Mon Sep 17 00:00:00 2001 From: Bryan Schumaker Date: Tue, 1 Nov 2011 15:24:59 -0400 Subject: NFSD: Call nfsd4_init_slabs() from init_nfsd() init_nfsd() was calling free_slabs() during cleanup code, but the call to init_slabs() was hidden in nfsd4_state_init(). This could be confusing to people unfamiliar with the code. Signed-off-by: Bryan Schumaker Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 10 +++------- fs/nfsd/nfsctl.c | 3 ++- fs/nfsd/nfsd.h | 6 ++++-- 3 files changed, 9 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index a511eff0569..1d6812db509 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2301,7 +2301,7 @@ nfsd4_free_slabs(void) nfsd4_free_slab(&deleg_slab); } -static int +int nfsd4_init_slabs(void) { openowner_slab = kmem_cache_create("nfsd4_openowners", @@ -4546,14 +4546,11 @@ void nfsd_recall_delegations(u64 num) /* initialization to perform at module load time: */ -int +void nfs4_state_init(void) { - int i, status; + int i; - status = nfsd4_init_slabs(); - if (status) - return status; for (i = 0; i < CLIENT_HASH_SIZE; i++) { INIT_LIST_HEAD(&conf_id_hashtbl[i]); INIT_LIST_HEAD(&conf_str_hashtbl[i]); @@ -4577,7 +4574,6 @@ nfs4_state_init(void) INIT_LIST_HEAD(&client_lru); INIT_LIST_HEAD(&del_recall_lru); reclaim_str_hashtbl_size = 0; - return 0; } static void diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index b2e8093ebc2..8daa935f329 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1129,9 +1129,10 @@ static int __init init_nfsd(void) int retval; printk(KERN_INFO "Installing knfsd (copyright (C) 1996 okir@monad.swb.de).\n"); - retval = nfs4_state_init(); /* nfs4 locking state */ + retval = nfsd4_init_slabs(); if (retval) return retval; + nfs4_state_init(); retval = nfsd_fault_inject_init(); /* nfsd fault injection controls */ if (retval) goto out_free_slabs; diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 58134a23fdf..4e2ee29f2cf 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -104,14 +104,16 @@ static inline int nfsd_v4client(struct svc_rqst *rq) */ #ifdef CONFIG_NFSD_V4 extern unsigned int max_delegations; -int nfs4_state_init(void); +void nfs4_state_init(void); +int nfsd4_init_slabs(void); void nfsd4_free_slabs(void); int nfs4_state_start(void); void nfs4_state_shutdown(void); void nfs4_reset_lease(time_t leasetime); int nfs4_reset_recoverydir(char *recdir); #else -static inline int nfs4_state_init(void) { return 0; } +static inline void nfs4_state_init(void) { } +static inline int nfsd4_init_slabs(void) { return 0; } static inline void nfsd4_free_slabs(void) { } static inline int nfs4_state_start(void) { return 0; } static inline void nfs4_state_shutdown(void) { } -- cgit v1.2.3 From 
c7e8472cf8ae877b232eb506284a5b15cf7efb2c Mon Sep 17 00:00:00 2001 From: Bryan Schumaker Date: Tue, 1 Nov 2011 14:18:28 -0400 Subject: NFSD: Remove unnecessary whitespace The close parenthesis was hard to find with it spaced so far over. Signed-off-by: Bryan Schumaker [bfields@redhat.com: get all these lines under 80 chars while we're here] Signed-off-by: J. Bruce Fields --- fs/nfsd/nfsd.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 4e2ee29f2cf..1d1e8589b4c 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -340,15 +340,15 @@ static inline u32 nfsd_suppattrs2(u32 minorversion) } /* These will return ERR_INVAL if specified in GETATTR or READDIR. */ -#define NFSD_WRITEONLY_ATTRS_WORD1 \ -(FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET) +#define NFSD_WRITEONLY_ATTRS_WORD1 \ + (FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET) /* These are the only attrs allowed in CREATE/OPEN/SETATTR. */ -#define NFSD_WRITEABLE_ATTRS_WORD0 \ -(FATTR4_WORD0_SIZE | FATTR4_WORD0_ACL ) -#define NFSD_WRITEABLE_ATTRS_WORD1 \ -(FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP \ - | FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET) +#define NFSD_WRITEABLE_ATTRS_WORD0 \ + (FATTR4_WORD0_SIZE | FATTR4_WORD0_ACL) +#define NFSD_WRITEABLE_ATTRS_WORD1 \ + (FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP \ + | FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET) #define NFSD_WRITEABLE_ATTRS_WORD2 0 #define NFSD_SUPPATTR_EXCLCREAT_WORD0 \ -- cgit v1.2.3 From 06f1f864d4ae5804e83785308d41f14a08e4b980 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 7 Nov 2011 16:58:18 -0500 Subject: nfsd4: hash lockowners to simplify RELEASE_LOCKOWNER Hash lockowners on just the owner string rather than on (owner, inode). This makes the owner-string lookup needed for RELEASE_LOCKOWNER simpler (currently it's doing at a linear search through the entire hash table!). That may come at the expense of making (owner, inode) lookups more expensive if a client reuses the same lockowner across multiple files. We might add a separate lookup for that. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 42 ++++++++++++++---------------------------- 1 file changed, 14 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 1d6812db509..eec990022ad 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3746,15 +3746,6 @@ last_byte_offset(u64 start, u64 len) return end > start ? end - 1: NFS4_MAX_UINT64; } -static inline unsigned int -lock_ownerstr_hashval(struct inode *inode, u32 cl_id, - struct xdr_netobj *ownername) -{ - return (file_hashval(inode) + cl_id - + opaque_hashval(ownername->data, ownername->len)) - & LOCK_HASH_MASK; -} - static struct list_head lock_ownerstr_hashtbl[LOCK_HASH_SIZE]; /* @@ -3824,7 +3815,7 @@ static struct nfs4_lockowner * find_lockowner_str(struct inode *inode, clientid_t *clid, struct xdr_netobj *owner) { - unsigned int hashval = lock_ownerstr_hashval(inode, clid->cl_id, owner); + unsigned int hashval = open_ownerstr_hashval(clid->cl_id, owner); struct nfs4_lockowner *lo; struct nfs4_stateowner *op; @@ -3847,7 +3838,7 @@ static void hash_lockowner(struct nfs4_lockowner *lo, unsigned int strhashval, s * Called in nfsd4_lock - therefore, OPEN and OPEN_CONFIRM (if needed) has * occurred. 
* - * strhashval = lock_ownerstr_hashval + * strhashval = open_ownerstr_hashval */ static struct nfs4_lockowner * @@ -3922,7 +3913,7 @@ __be32 lookup_or_create_lock_state(struct nfsd4_compound_state *cstate, struct n struct nfs4_ol_stateid, st_perstateowner); return nfs_ok; } - strhashval = lock_ownerstr_hashval(fi->fi_inode, cl->cl_clientid.cl_id, + strhashval = open_ownerstr_hashval(cl->cl_clientid.cl_id, &lock->v.new.owner); lo = alloc_init_lock_stateowner(strhashval, cl, ost, lock); if (lo == NULL) @@ -4286,7 +4277,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, struct nfs4_ol_stateid *stp; struct xdr_netobj *owner = &rlockowner->rl_owner; struct list_head matches; - int i; + unsigned int hashval = open_ownerstr_hashval(clid->cl_id, owner); __be32 status; dprintk("nfsd4_release_lockowner clientid: (%08x/%08x):\n", @@ -4301,22 +4292,17 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, nfs4_lock_state(); status = nfserr_locks_held; - /* XXX: we're doing a linear search through all the lockowners. - * Yipes! For now we'll just hope clients aren't really using - * release_lockowner much, but eventually we have to fix these - * data structures. */ INIT_LIST_HEAD(&matches); - for (i = 0; i < LOCK_HASH_SIZE; i++) { - list_for_each_entry(sop, &lock_ownerstr_hashtbl[i], so_strhash) { - if (!same_owner_str(sop, owner, clid)) - continue; - list_for_each_entry(stp, &sop->so_stateids, - st_perstateowner) { - lo = lockowner(sop); - if (check_for_locks(stp->st_file, lo)) - goto out; - list_add(&lo->lo_list, &matches); - } + + list_for_each_entry(sop, &lock_ownerstr_hashtbl[hashval], so_strhash) { + if (!same_owner_str(sop, owner, clid)) + continue; + list_for_each_entry(stp, &sop->so_stateids, + st_perstateowner) { + lo = lockowner(sop); + if (check_for_locks(stp->st_file, lo)) + goto out; + list_add(&lo->lo_list, &matches); } } /* Clients probably won't expect us to return with some (but not all) -- cgit v1.2.3 From 16bfdaafa2c66d8cc81422a97a105a849ca65b3e Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 7 Nov 2011 17:23:30 -0500 Subject: nfsd4: share open and lock owner hash tables Now that they're used in the same way, it's a little simpler to put open and lock owners in the same hash table, and I can't see a reason not to. Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4state.c | 71 ++++++++++++++++++++++++----------------------------- 1 file changed, 32 insertions(+), 39 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index eec990022ad..dcbe7f37759 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -133,21 +133,21 @@ unsigned int max_delegations; * Open owner state (share locks) */ -/* hash tables for open owners */ -#define OPEN_OWNER_HASH_BITS 8 -#define OPEN_OWNER_HASH_SIZE (1 << OPEN_OWNER_HASH_BITS) -#define OPEN_OWNER_HASH_MASK (OPEN_OWNER_HASH_SIZE - 1) +/* hash tables for lock and open owners */ +#define OWNER_HASH_BITS 8 +#define OWNER_HASH_SIZE (1 << OWNER_HASH_BITS) +#define OWNER_HASH_MASK (OWNER_HASH_SIZE - 1) -static unsigned int open_ownerstr_hashval(u32 clientid, struct xdr_netobj *ownername) +static unsigned int ownerstr_hashval(u32 clientid, struct xdr_netobj *ownername) { unsigned int ret; ret = opaque_hashval(ownername->data, ownername->len); ret += clientid; - return ret & OPEN_OWNER_HASH_MASK; + return ret & OWNER_HASH_MASK; } -static struct list_head open_ownerstr_hashtbl[OPEN_OWNER_HASH_SIZE]; +static struct list_head ownerstr_hashtbl[OWNER_HASH_SIZE]; /* hash table for nfs4_file */ #define FILE_HASH_BITS 8 @@ -2373,7 +2373,7 @@ static inline void *alloc_stateowner(struct kmem_cache *slab, struct xdr_netobj static void hash_openowner(struct nfs4_openowner *oo, struct nfs4_client *clp, unsigned int strhashval) { - list_add(&oo->oo_owner.so_strhash, &open_ownerstr_hashtbl[strhashval]); + list_add(&oo->oo_owner.so_strhash, &ownerstr_hashtbl[strhashval]); list_add(&oo->oo_perclient, &clp->cl_openowners); } @@ -2436,7 +2436,9 @@ find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open) struct nfs4_stateowner *so; struct nfs4_openowner *oo; - list_for_each_entry(so, &open_ownerstr_hashtbl[hashval], so_strhash) { + list_for_each_entry(so, &ownerstr_hashtbl[hashval], so_strhash) { + if (!so->so_is_open_owner) + continue; if (same_owner_str(so, &open->op_owner, &open->op_clientid)) { oo = openowner(so); renew_client(oo->oo_owner.so_client); @@ -2580,7 +2582,7 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate, if (open->op_file == NULL) return nfserr_jukebox; - strhashval = open_ownerstr_hashval(clientid->cl_id, &open->op_owner); + strhashval = ownerstr_hashval(clientid->cl_id, &open->op_owner); oo = find_openstateowner_str(strhashval, open); open->op_openowner = oo; if (!oo) { @@ -3718,13 +3720,7 @@ out: } -/* - * Lock owner state (byte-range locks) - */ #define LOFF_OVERFLOW(start, len) ((u64)(len) > ~(u64)(start)) -#define LOCK_HASH_BITS 8 -#define LOCK_HASH_SIZE (1 << LOCK_HASH_BITS) -#define LOCK_HASH_MASK (LOCK_HASH_SIZE - 1) static inline u64 end_offset(u64 start, u64 len) @@ -3746,8 +3742,6 @@ last_byte_offset(u64 start, u64 len) return end > start ? 
end - 1: NFS4_MAX_UINT64; } -static struct list_head lock_ownerstr_hashtbl[LOCK_HASH_SIZE]; - /* * TODO: Linux file offsets are _signed_ 64-bit quantities, which means that * we can't properly handle lock requests that go beyond the (2^63 - 1)-th @@ -3815,11 +3809,13 @@ static struct nfs4_lockowner * find_lockowner_str(struct inode *inode, clientid_t *clid, struct xdr_netobj *owner) { - unsigned int hashval = open_ownerstr_hashval(clid->cl_id, owner); + unsigned int hashval = ownerstr_hashval(clid->cl_id, owner); struct nfs4_lockowner *lo; struct nfs4_stateowner *op; - list_for_each_entry(op, &lock_ownerstr_hashtbl[hashval], so_strhash) { + list_for_each_entry(op, &ownerstr_hashtbl[hashval], so_strhash) { + if (op->so_is_open_owner) + continue; lo = lockowner(op); if (same_lockowner_ino(lo, inode, clid, owner)) return lo; @@ -3829,7 +3825,7 @@ find_lockowner_str(struct inode *inode, clientid_t *clid, static void hash_lockowner(struct nfs4_lockowner *lo, unsigned int strhashval, struct nfs4_client *clp, struct nfs4_ol_stateid *open_stp) { - list_add(&lo->lo_owner.so_strhash, &lock_ownerstr_hashtbl[strhashval]); + list_add(&lo->lo_owner.so_strhash, &ownerstr_hashtbl[strhashval]); list_add(&lo->lo_perstateid, &open_stp->st_lockowners); } @@ -3838,7 +3834,7 @@ static void hash_lockowner(struct nfs4_lockowner *lo, unsigned int strhashval, s * Called in nfsd4_lock - therefore, OPEN and OPEN_CONFIRM (if needed) has * occurred. * - * strhashval = open_ownerstr_hashval + * strhashval = ownerstr_hashval */ static struct nfs4_lockowner * @@ -3913,7 +3909,7 @@ __be32 lookup_or_create_lock_state(struct nfsd4_compound_state *cstate, struct n struct nfs4_ol_stateid, st_perstateowner); return nfs_ok; } - strhashval = open_ownerstr_hashval(cl->cl_clientid.cl_id, + strhashval = ownerstr_hashval(cl->cl_clientid.cl_id, &lock->v.new.owner); lo = alloc_init_lock_stateowner(strhashval, cl, ost, lock); if (lo == NULL) @@ -4277,7 +4273,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, struct nfs4_ol_stateid *stp; struct xdr_netobj *owner = &rlockowner->rl_owner; struct list_head matches; - unsigned int hashval = open_ownerstr_hashval(clid->cl_id, owner); + unsigned int hashval = ownerstr_hashval(clid->cl_id, owner); __be32 status; dprintk("nfsd4_release_lockowner clientid: (%08x/%08x):\n", @@ -4294,7 +4290,9 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, status = nfserr_locks_held; INIT_LIST_HEAD(&matches); - list_for_each_entry(sop, &lock_ownerstr_hashtbl[hashval], so_strhash) { + list_for_each_entry(sop, &ownerstr_hashtbl[hashval], so_strhash) { + if (sop->so_is_open_owner) + continue; if (!same_owner_str(sop, owner, clid)) continue; list_for_each_entry(stp, &sop->so_stateids, @@ -4444,16 +4442,16 @@ static void release_openowner_sop(struct nfs4_stateowner *sop) release_openowner(openowner(sop)); } -static int nfsd_release_n_owners(u64 num, - struct list_head hashtbl[], - unsigned int hashtbl_size, +static int nfsd_release_n_owners(u64 num, bool is_open_owner void (*release_sop)(struct nfs4_stateowner *)) { int i, count = 0; struct nfs4_stateowner *sop, *next; - for (i = 0; i < hashtbl_size; i++) { - list_for_each_entry_safe(sop, next, &hashtbl[i], so_strhash) { + for (i = 0; i < OWNER_HASH_SIZE; i++) { + list_for_each_entry_safe(sop, next, &ownerstr_hashtbl[i], so_strhash) { + if (sop->so_is_open_owner != is_open_owner) + continue; release_sop(sop); if (++count == num) return count; @@ -4467,8 +4465,7 @@ void nfsd_forget_locks(u64 num) int count; nfs4_lock_state(); - count = nfsd_release_n_owners(num, 
lock_ownerstr_hashtbl, - LOCK_HASH_SIZE, release_lockowner_sop); + count = nfsd_release_n_owners(num, false, release_lockowner_sop); nfs4_unlock_state(); printk(KERN_INFO "NFSD: Forgot %d locks", count); @@ -4479,8 +4476,7 @@ void nfsd_forget_openowners(u64 num) int count; nfs4_lock_state(); - count = nfsd_release_n_owners(num, open_ownerstr_hashtbl, - OPEN_OWNER_HASH_SIZE, release_openowner_sop); + count = nfsd_release_n_owners(num, true, release_openowner_sop); nfs4_unlock_state(); printk(KERN_INFO "NFSD: Forgot %d open owners", count); @@ -4549,11 +4545,8 @@ nfs4_state_init(void) for (i = 0; i < FILE_HASH_SIZE; i++) { INIT_LIST_HEAD(&file_hashtbl[i]); } - for (i = 0; i < OPEN_OWNER_HASH_SIZE; i++) { - INIT_LIST_HEAD(&open_ownerstr_hashtbl[i]); - } - for (i = 0; i < LOCK_HASH_SIZE; i++) { - INIT_LIST_HEAD(&lock_ownerstr_hashtbl[i]); + for (i = 0; i < OWNER_HASH_SIZE; i++) { + INIT_LIST_HEAD(&ownerstr_hashtbl[i]); } memset(&onestateid, ~0, sizeof(stateid_t)); INIT_LIST_HEAD(&close_lru); -- cgit v1.2.3 From 63894ab9f63a688f6b0b8cdd01ac0a9f36d507b8 Mon Sep 17 00:00:00 2001 From: Eryu Guan Date: Tue, 1 Nov 2011 10:06:19 +0800 Subject: ext3: call ext3_mark_recovery_complete() when recovery is really needed Call ext3_mark_recovery_complete() in ext3_fill_super() only if needs_recovery is non-zero. Besides that, print out "recovery complete" message after calling ext3_mark_recovery_complete(). Cc: Jan Kara Signed-off-by: Eryu Guan Signed-off-by: Jan Kara --- fs/ext3/super.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 922d289aeeb..767fa3a2bd1 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -2060,9 +2060,10 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) EXT3_SB(sb)->s_mount_state |= EXT3_ORPHAN_FS; ext3_orphan_cleanup(sb, es); EXT3_SB(sb)->s_mount_state &= ~EXT3_ORPHAN_FS; - if (needs_recovery) + if (needs_recovery) { + ext3_mark_recovery_complete(sb, es); ext3_msg(sb, KERN_INFO, "recovery complete"); - ext3_mark_recovery_complete(sb, es); + } ext3_msg(sb, KERN_INFO, "mounted filesystem with %s data mode", test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ? "journal": test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered": -- cgit v1.2.3 From 353de31b86850309b205a3d4cc1294052471b14d Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 15 Nov 2011 19:25:03 -0500 Subject: nfsd4: fix CONFIG_NFSD_FAULT_INJECTION compile error Reported-by: Stephen Rothwell Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index dcbe7f37759..7e4edbde737 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4442,7 +4442,7 @@ static void release_openowner_sop(struct nfs4_stateowner *sop) release_openowner(openowner(sop)); } -static int nfsd_release_n_owners(u64 num, bool is_open_owner +static int nfsd_release_n_owners(u64 num, bool is_open_owner, void (*release_sop)(struct nfs4_stateowner *)) { int i, count = 0; -- cgit v1.2.3 From 009673b439cf74d70a486fca0177e274febd81a7 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 7 Nov 2011 17:40:10 -0500 Subject: nfsd4: add a separate (lockowner, inode) lookup Address the possible performance regression mentioned in "nfsd4: hash lockowners to simplify RELEASE_LOCKOWNER" by providing a separate (lockowner, inode) hash. 
Really, I doubt this matters much, but I think it's likely we'll change these data structures here and I'd rather that the need for (owner, inode) lookups be well-documented. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 29 +++++++++++++++++++++++------ fs/nfsd/state.h | 1 + 2 files changed, 24 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 7e4edbde737..a84978c13bb 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -514,6 +514,7 @@ static void unhash_lockowner(struct nfs4_lockowner *lo) list_del(&lo->lo_owner.so_strhash); list_del(&lo->lo_perstateid); + list_del(&lo->lo_owner_ino_hash); while (!list_empty(&lo->lo_owner.so_stateids)) { stp = list_first_entry(&lo->lo_owner.so_stateids, struct nfs4_ol_stateid, st_perstateowner); @@ -3722,6 +3723,10 @@ out: #define LOFF_OVERFLOW(start, len) ((u64)(len) > ~(u64)(start)) +#define LOCKOWNER_INO_HASH_BITS 8 +#define LOCKOWNER_INO_HASH_SIZE (1 << LOCKOWNER_INO_HASH_BITS) +#define LOCKOWNER_INO_HASH_MASK (LOCKOWNER_INO_HASH_SIZE - 1) + static inline u64 end_offset(u64 start, u64 len) { @@ -3742,6 +3747,15 @@ last_byte_offset(u64 start, u64 len) return end > start ? end - 1: NFS4_MAX_UINT64; } +static unsigned int lockowner_ino_hashval(struct inode *inode, u32 cl_id, struct xdr_netobj *ownername) +{ + return (file_hashval(inode) + cl_id + + opaque_hashval(ownername->data, ownername->len)) + & LOCKOWNER_INO_HASH_MASK; +} + +static struct list_head lockowner_ino_hashtbl[LOCKOWNER_INO_HASH_SIZE]; + /* * TODO: Linux file offsets are _signed_ 64-bit quantities, which means that * we can't properly handle lock requests that go beyond the (2^63 - 1)-th @@ -3809,14 +3823,10 @@ static struct nfs4_lockowner * find_lockowner_str(struct inode *inode, clientid_t *clid, struct xdr_netobj *owner) { - unsigned int hashval = ownerstr_hashval(clid->cl_id, owner); + unsigned int hashval = lockowner_ino_hashval(inode, clid->cl_id, owner); struct nfs4_lockowner *lo; - struct nfs4_stateowner *op; - list_for_each_entry(op, &ownerstr_hashtbl[hashval], so_strhash) { - if (op->so_is_open_owner) - continue; - lo = lockowner(op); + list_for_each_entry(lo, &lockowner_ino_hashtbl[hashval], lo_owner_ino_hash) { if (same_lockowner_ino(lo, inode, clid, owner)) return lo; } @@ -3825,7 +3835,12 @@ find_lockowner_str(struct inode *inode, clientid_t *clid, static void hash_lockowner(struct nfs4_lockowner *lo, unsigned int strhashval, struct nfs4_client *clp, struct nfs4_ol_stateid *open_stp) { + struct inode *inode = open_stp->st_file->fi_inode; + unsigned int inohash = lockowner_ino_hashval(inode, + clp->cl_clientid.cl_id, &lo->lo_owner.so_owner); + list_add(&lo->lo_owner.so_strhash, &ownerstr_hashtbl[strhashval]); + list_add(&lo->lo_owner_ino_hash, &lockowner_ino_hashtbl[inohash]); list_add(&lo->lo_perstateid, &open_stp->st_lockowners); } @@ -4548,6 +4563,8 @@ nfs4_state_init(void) for (i = 0; i < OWNER_HASH_SIZE; i++) { INIT_LIST_HEAD(&ownerstr_hashtbl[i]); } + for (i = 0; i < LOCKOWNER_INO_HASH_SIZE; i++) + INIT_LIST_HEAD(&lockowner_ino_hashtbl[i]); memset(&onestateid, ~0, sizeof(stateid_t)); INIT_LIST_HEAD(&close_lru); INIT_LIST_HEAD(&client_lru); diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index a3cf38476a1..89c2cd84d79 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -366,6 +366,7 @@ struct nfs4_openowner { struct nfs4_lockowner { struct nfs4_stateowner lo_owner; /* must be first element */ + struct list_head lo_owner_ino_hash; /* hash by owner,file */ struct list_head lo_perstateid; /* 
for lockowners only */ struct list_head lo_list; /* for temporary uses */ }; -- cgit v1.2.3 From 9beb3bf5a92bb8fc6503f844bf0772df29f14a02 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Wed, 26 Oct 2011 15:24:55 -0500 Subject: dlm: convert rsb list to rb_tree Change the linked lists to rb_tree's in the rsb hash table to speed up searches. Slow rsb searches were having a large impact on gfs2 performance due to the large number of dlm locks gfs2 uses. Signed-off-by: Bob Peterson Signed-off-by: David Teigland --- fs/dlm/debug_fs.c | 28 +++++++++-------- fs/dlm/dlm_internal.h | 9 ++++-- fs/dlm/lock.c | 87 +++++++++++++++++++++++++++++++++++++++++---------- fs/dlm/lockspace.c | 23 ++++++-------- fs/dlm/recover.c | 21 ++++++++----- 5 files changed, 113 insertions(+), 55 deletions(-) (limited to 'fs') diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c index 59779237e2b..3dca2b39e83 100644 --- a/fs/dlm/debug_fs.c +++ b/fs/dlm/debug_fs.c @@ -393,6 +393,7 @@ static const struct seq_operations format3_seq_ops; static void *table_seq_start(struct seq_file *seq, loff_t *pos) { + struct rb_node *node; struct dlm_ls *ls = seq->private; struct rsbtbl_iter *ri; struct dlm_rsb *r; @@ -418,9 +419,10 @@ static void *table_seq_start(struct seq_file *seq, loff_t *pos) ri->format = 3; spin_lock(&ls->ls_rsbtbl[bucket].lock); - if (!list_empty(&ls->ls_rsbtbl[bucket].list)) { - list_for_each_entry(r, &ls->ls_rsbtbl[bucket].list, - res_hashchain) { + if (!RB_EMPTY_ROOT(&ls->ls_rsbtbl[bucket].keep)) { + for (node = rb_first(&ls->ls_rsbtbl[bucket].keep); node; + node = rb_next(node)) { + r = rb_entry(node, struct dlm_rsb, res_hashnode); if (!entry--) { dlm_hold_rsb(r); ri->rsb = r; @@ -449,9 +451,9 @@ static void *table_seq_start(struct seq_file *seq, loff_t *pos) } spin_lock(&ls->ls_rsbtbl[bucket].lock); - if (!list_empty(&ls->ls_rsbtbl[bucket].list)) { - r = list_first_entry(&ls->ls_rsbtbl[bucket].list, - struct dlm_rsb, res_hashchain); + if (!RB_EMPTY_ROOT(&ls->ls_rsbtbl[bucket].keep)) { + node = rb_first(&ls->ls_rsbtbl[bucket].keep); + r = rb_entry(node, struct dlm_rsb, res_hashnode); dlm_hold_rsb(r); ri->rsb = r; ri->bucket = bucket; @@ -467,7 +469,7 @@ static void *table_seq_next(struct seq_file *seq, void *iter_ptr, loff_t *pos) { struct dlm_ls *ls = seq->private; struct rsbtbl_iter *ri = iter_ptr; - struct list_head *next; + struct rb_node *next; struct dlm_rsb *r, *rp; loff_t n = *pos; unsigned bucket; @@ -480,10 +482,10 @@ static void *table_seq_next(struct seq_file *seq, void *iter_ptr, loff_t *pos) spin_lock(&ls->ls_rsbtbl[bucket].lock); rp = ri->rsb; - next = rp->res_hashchain.next; + next = rb_next(&rp->res_hashnode); - if (next != &ls->ls_rsbtbl[bucket].list) { - r = list_entry(next, struct dlm_rsb, res_hashchain); + if (next) { + r = rb_entry(next, struct dlm_rsb, res_hashnode); dlm_hold_rsb(r); ri->rsb = r; spin_unlock(&ls->ls_rsbtbl[bucket].lock); @@ -511,9 +513,9 @@ static void *table_seq_next(struct seq_file *seq, void *iter_ptr, loff_t *pos) } spin_lock(&ls->ls_rsbtbl[bucket].lock); - if (!list_empty(&ls->ls_rsbtbl[bucket].list)) { - r = list_first_entry(&ls->ls_rsbtbl[bucket].list, - struct dlm_rsb, res_hashchain); + if (!RB_EMPTY_ROOT(&ls->ls_rsbtbl[bucket].keep)) { + next = rb_first(&ls->ls_rsbtbl[bucket].keep); + r = rb_entry(next, struct dlm_rsb, res_hashnode); dlm_hold_rsb(r); ri->rsb = r; ri->bucket = bucket; diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index fe2860c0244..5685a9a5dba 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -103,8 +103,8 @@ struct 
dlm_dirtable { }; struct dlm_rsbtable { - struct list_head list; - struct list_head toss; + struct rb_root keep; + struct rb_root toss; spinlock_t lock; }; @@ -285,7 +285,10 @@ struct dlm_rsb { unsigned long res_toss_time; uint32_t res_first_lkid; struct list_head res_lookup; /* lkbs waiting on first */ - struct list_head res_hashchain; /* rsbtbl */ + union { + struct list_head res_hashchain; + struct rb_node res_hashnode; /* rsbtbl */ + }; struct list_head res_grantqueue; struct list_head res_convertqueue; struct list_head res_waitqueue; diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 83b5e32514e..d47183043c5 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -56,6 +56,7 @@ L: receive_xxxx_reply() <- R: send_xxxx_reply() */ #include +#include #include #include "dlm_internal.h" #include @@ -380,6 +381,8 @@ static int get_rsb_struct(struct dlm_ls *ls, char *name, int len, r = list_first_entry(&ls->ls_new_rsb, struct dlm_rsb, res_hashchain); list_del(&r->res_hashchain); + /* Convert the empty list_head to a NULL rb_node for tree usage: */ + memset(&r->res_hashnode, 0, sizeof(struct rb_node)); ls->ls_new_rsb_count--; spin_unlock(&ls->ls_new_rsb_spin); @@ -388,7 +391,6 @@ static int get_rsb_struct(struct dlm_ls *ls, char *name, int len, memcpy(r->res_name, name, len); mutex_init(&r->res_mutex); - INIT_LIST_HEAD(&r->res_hashchain); INIT_LIST_HEAD(&r->res_lookup); INIT_LIST_HEAD(&r->res_grantqueue); INIT_LIST_HEAD(&r->res_convertqueue); @@ -400,14 +402,31 @@ static int get_rsb_struct(struct dlm_ls *ls, char *name, int len, return 0; } -static int search_rsb_list(struct list_head *head, char *name, int len, +static int rsb_cmp(struct dlm_rsb *r, const char *name, int nlen) +{ + char maxname[DLM_RESNAME_MAXLEN]; + + memset(maxname, 0, DLM_RESNAME_MAXLEN); + memcpy(maxname, name, nlen); + return memcmp(r->res_name, maxname, DLM_RESNAME_MAXLEN); +} + +static int search_rsb_tree(struct rb_root *tree, char *name, int len, unsigned int flags, struct dlm_rsb **r_ret) { + struct rb_node *node = tree->rb_node; struct dlm_rsb *r; int error = 0; - - list_for_each_entry(r, head, res_hashchain) { - if (len == r->res_length && !memcmp(name, r->res_name, len)) + int rc; + + while (node) { + r = rb_entry(node, struct dlm_rsb, res_hashnode); + rc = rsb_cmp(r, name, len); + if (rc < 0) + node = node->rb_left; + else if (rc > 0) + node = node->rb_right; + else goto found; } *r_ret = NULL; @@ -420,22 +439,54 @@ static int search_rsb_list(struct list_head *head, char *name, int len, return error; } +static int rsb_insert(struct dlm_rsb *rsb, struct rb_root *tree) +{ + struct rb_node **newn = &tree->rb_node; + struct rb_node *parent = NULL; + int rc; + + while (*newn) { + struct dlm_rsb *cur = rb_entry(*newn, struct dlm_rsb, + res_hashnode); + + parent = *newn; + rc = rsb_cmp(cur, rsb->res_name, rsb->res_length); + if (rc < 0) + newn = &parent->rb_left; + else if (rc > 0) + newn = &parent->rb_right; + else { + log_print("rsb_insert match"); + dlm_dump_rsb(rsb); + dlm_dump_rsb(cur); + return -EEXIST; + } + } + + rb_link_node(&rsb->res_hashnode, parent, newn); + rb_insert_color(&rsb->res_hashnode, tree); + return 0; +} + static int _search_rsb(struct dlm_ls *ls, char *name, int len, int b, unsigned int flags, struct dlm_rsb **r_ret) { struct dlm_rsb *r; int error; - error = search_rsb_list(&ls->ls_rsbtbl[b].list, name, len, flags, &r); + error = search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, flags, &r); if (!error) { kref_get(&r->res_ref); goto out; } - error = search_rsb_list(&ls->ls_rsbtbl[b].toss, name, len, 
flags, &r); + error = search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, flags, &r); if (error) goto out; - list_move(&r->res_hashchain, &ls->ls_rsbtbl[b].list); + rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss); + error = rsb_insert(r, &ls->ls_rsbtbl[b].keep); + if (error) + return error; if (dlm_no_directory(ls)) goto out; @@ -527,8 +578,7 @@ static int find_rsb(struct dlm_ls *ls, char *name, int namelen, nodeid = 0; r->res_nodeid = nodeid; } - list_add(&r->res_hashchain, &ls->ls_rsbtbl[bucket].list); - error = 0; + error = rsb_insert(r, &ls->ls_rsbtbl[bucket].keep); out_unlock: spin_unlock(&ls->ls_rsbtbl[bucket].lock); out: @@ -556,7 +606,8 @@ static void toss_rsb(struct kref *kref) DLM_ASSERT(list_empty(&r->res_root_list), dlm_print_rsb(r);); kref_init(&r->res_ref); - list_move(&r->res_hashchain, &ls->ls_rsbtbl[r->res_bucket].toss); + rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[r->res_bucket].keep); + rsb_insert(r, &ls->ls_rsbtbl[r->res_bucket].toss); r->res_toss_time = jiffies; if (r->res_lvbptr) { dlm_free_lvb(r->res_lvbptr); @@ -1082,19 +1133,19 @@ static void dir_remove(struct dlm_rsb *r) r->res_name, r->res_length); } -/* FIXME: shouldn't this be able to exit as soon as one non-due rsb is - found since they are in order of newest to oldest? */ +/* FIXME: make this more efficient */ static int shrink_bucket(struct dlm_ls *ls, int b) { + struct rb_node *n; struct dlm_rsb *r; int count = 0, found; for (;;) { found = 0; spin_lock(&ls->ls_rsbtbl[b].lock); - list_for_each_entry_reverse(r, &ls->ls_rsbtbl[b].toss, - res_hashchain) { + for (n = rb_first(&ls->ls_rsbtbl[b].toss); n; n = rb_next(n)) { + r = rb_entry(n, struct dlm_rsb, res_hashnode); if (!time_after_eq(jiffies, r->res_toss_time + dlm_config.ci_toss_secs * HZ)) continue; @@ -1108,7 +1159,7 @@ static int shrink_bucket(struct dlm_ls *ls, int b) } if (kref_put(&r->res_ref, kill_rsb)) { - list_del(&r->res_hashchain); + rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss); spin_unlock(&ls->ls_rsbtbl[b].lock); if (is_master(r)) @@ -4441,10 +4492,12 @@ int dlm_purge_locks(struct dlm_ls *ls) static struct dlm_rsb *find_purged_rsb(struct dlm_ls *ls, int bucket) { + struct rb_node *n; struct dlm_rsb *r, *r_ret = NULL; spin_lock(&ls->ls_rsbtbl[bucket].lock); - list_for_each_entry(r, &ls->ls_rsbtbl[bucket].list, res_hashchain) { + for (n = rb_first(&ls->ls_rsbtbl[bucket].keep); n; n = rb_next(n)) { + r = rb_entry(n, struct dlm_rsb, res_hashnode); if (!rsb_flag(r, RSB_LOCKS_PURGED)) continue; hold_rsb(r); diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index a1d8f1af144..1d16a23b0a0 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -457,8 +457,8 @@ static int new_lockspace(const char *name, int namelen, void **lockspace, if (!ls->ls_rsbtbl) goto out_lsfree; for (i = 0; i < size; i++) { - INIT_LIST_HEAD(&ls->ls_rsbtbl[i].list); - INIT_LIST_HEAD(&ls->ls_rsbtbl[i].toss); + ls->ls_rsbtbl[i].keep.rb_node = NULL; + ls->ls_rsbtbl[i].toss.rb_node = NULL; spin_lock_init(&ls->ls_rsbtbl[i].lock); } @@ -685,7 +685,7 @@ static int lockspace_busy(struct dlm_ls *ls, int force) static int release_lockspace(struct dlm_ls *ls, int force) { struct dlm_rsb *rsb; - struct list_head *head; + struct rb_node *n; int i, busy, rv; busy = lockspace_busy(ls, force); @@ -746,20 +746,15 @@ static int release_lockspace(struct dlm_ls *ls, int force) */ for (i = 0; i < ls->ls_rsbtbl_size; i++) { - head = &ls->ls_rsbtbl[i].list; - while (!list_empty(head)) { - rsb = list_entry(head->next, struct dlm_rsb, - res_hashchain); - - list_del(&rsb->res_hashchain); + 
while ((n = rb_first(&ls->ls_rsbtbl[i].keep))) { + rsb = rb_entry(n, struct dlm_rsb, res_hashnode); + rb_erase(n, &ls->ls_rsbtbl[i].keep); dlm_free_rsb(rsb); } - head = &ls->ls_rsbtbl[i].toss; - while (!list_empty(head)) { - rsb = list_entry(head->next, struct dlm_rsb, - res_hashchain); - list_del(&rsb->res_hashchain); + while ((n = rb_first(&ls->ls_rsbtbl[i].toss))) { + rsb = rb_entry(n, struct dlm_rsb, res_hashnode); + rb_erase(n, &ls->ls_rsbtbl[i].toss); dlm_free_rsb(rsb); } } diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c index 14638235f7b..50467cefdbd 100644 --- a/fs/dlm/recover.c +++ b/fs/dlm/recover.c @@ -715,6 +715,7 @@ void dlm_recover_rsbs(struct dlm_ls *ls) int dlm_create_root_list(struct dlm_ls *ls) { + struct rb_node *n; struct dlm_rsb *r; int i, error = 0; @@ -727,7 +728,8 @@ int dlm_create_root_list(struct dlm_ls *ls) for (i = 0; i < ls->ls_rsbtbl_size; i++) { spin_lock(&ls->ls_rsbtbl[i].lock); - list_for_each_entry(r, &ls->ls_rsbtbl[i].list, res_hashchain) { + for (n = rb_first(&ls->ls_rsbtbl[i].keep); n; n = rb_next(n)) { + r = rb_entry(n, struct dlm_rsb, res_hashnode); list_add(&r->res_root_list, &ls->ls_root_list); dlm_hold_rsb(r); } @@ -741,7 +743,8 @@ int dlm_create_root_list(struct dlm_ls *ls) continue; } - list_for_each_entry(r, &ls->ls_rsbtbl[i].toss, res_hashchain) { + for (n = rb_first(&ls->ls_rsbtbl[i].toss); n; n = rb_next(n)) { + r = rb_entry(n, struct dlm_rsb, res_hashnode); list_add(&r->res_root_list, &ls->ls_root_list); dlm_hold_rsb(r); } @@ -771,16 +774,18 @@ void dlm_release_root_list(struct dlm_ls *ls) void dlm_clear_toss_list(struct dlm_ls *ls) { - struct dlm_rsb *r, *safe; + struct rb_node *n, *next; + struct dlm_rsb *rsb; int i; for (i = 0; i < ls->ls_rsbtbl_size; i++) { spin_lock(&ls->ls_rsbtbl[i].lock); - list_for_each_entry_safe(r, safe, &ls->ls_rsbtbl[i].toss, - res_hashchain) { - if (dlm_no_directory(ls) || !is_master(r)) { - list_del(&r->res_hashchain); - dlm_free_rsb(r); + for (n = rb_first(&ls->ls_rsbtbl[i].toss); n; n = next) { + next = rb_next(n);; + rsb = rb_entry(n, struct dlm_rsb, res_hashnode); + if (dlm_no_directory(ls) || !is_master(rsb)) { + rb_erase(n, &ls->ls_rsbtbl[i].toss); + dlm_free_rsb(rsb); } } spin_unlock(&ls->ls_rsbtbl[i].lock); -- cgit v1.2.3 From 0720a06a7518c9d0c0125bd5d1f3b6264c55c3dd Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Thu, 17 Nov 2011 16:42:19 -0500 Subject: NLS: improve UTF8 -> UTF16 string conversion routine The utf8s_to_utf16s conversion routine needs to be improved. Unlike its utf16s_to_utf8s sibling, it doesn't accept arguments specifying the maximum length of the output buffer or the endianness of its 16-bit output. This patch (as1501) adds the two missing arguments, and adjusts the only two places in the kernel where the function is called. A follow-on patch will add a third caller that does utilize the new capabilities. The two conversion routines are still annoyingly inconsistent in the way they handle invalid byte combinations. But that's a subject for a different patch. 
Signed-off-by: Alan Stern CC: Clemens Ladisch Signed-off-by: Greg Kroah-Hartman --- drivers/hv/hv_kvp.c | 10 ++++++---- fs/fat/namei_vfat.c | 3 ++- fs/nls/nls_base.c | 43 +++++++++++++++++++++++++++++++++---------- include/linux/nls.h | 5 +++-- 4 files changed, 44 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/drivers/hv/hv_kvp.c b/drivers/hv/hv_kvp.c index 89f52440fcf..0e8343f585b 100644 --- a/drivers/hv/hv_kvp.c +++ b/drivers/hv/hv_kvp.c @@ -212,11 +212,13 @@ kvp_respond_to_host(char *key, char *value, int error) * The windows host expects the key/value pair to be encoded * in utf16. */ - keylen = utf8s_to_utf16s(key_name, strlen(key_name), - (wchar_t *)kvp_data->data.key); + keylen = utf8s_to_utf16s(key_name, strlen(key_name), UTF16_HOST_ENDIAN, + (wchar_t *) kvp_data->data.key, + HV_KVP_EXCHANGE_MAX_KEY_SIZE / 2); kvp_data->data.key_size = 2*(keylen + 1); /* utf16 encoding */ - valuelen = utf8s_to_utf16s(value, strlen(value), - (wchar_t *)kvp_data->data.value); + valuelen = utf8s_to_utf16s(value, strlen(value), UTF16_HOST_ENDIAN, + (wchar_t *) kvp_data->data.value, + HV_KVP_EXCHANGE_MAX_VALUE_SIZE / 2); kvp_data->data.value_size = 2*(valuelen + 1); /* utf16 encoding */ kvp_data->data.value_type = REG_SZ; /* all our values are strings */ diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index a87a65663c2..c25cf151b84 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -512,7 +512,8 @@ xlate_to_uni(const unsigned char *name, int len, unsigned char *outname, int charlen; if (utf8) { - *outlen = utf8s_to_utf16s(name, len, (wchar_t *)outname); + *outlen = utf8s_to_utf16s(name, len, UTF16_HOST_ENDIAN, + (wchar_t *) outname, FAT_LFN_LEN + 2); if (*outlen < 0) return *outlen; else if (*outlen > FAT_LFN_LEN) diff --git a/fs/nls/nls_base.c b/fs/nls/nls_base.c index 44a88a9fa2c..0eb059ec6f2 100644 --- a/fs/nls/nls_base.c +++ b/fs/nls/nls_base.c @@ -114,34 +114,57 @@ int utf32_to_utf8(unicode_t u, u8 *s, int maxlen) } EXPORT_SYMBOL(utf32_to_utf8); -int utf8s_to_utf16s(const u8 *s, int len, wchar_t *pwcs) +static inline void put_utf16(wchar_t *s, unsigned c, enum utf16_endian endian) +{ + switch (endian) { + default: + *s = (wchar_t) c; + break; + case UTF16_LITTLE_ENDIAN: + *s = __cpu_to_le16(c); + break; + case UTF16_BIG_ENDIAN: + *s = __cpu_to_be16(c); + break; + } +} + +int utf8s_to_utf16s(const u8 *s, int len, enum utf16_endian endian, + wchar_t *pwcs, int maxlen) { u16 *op; int size; unicode_t u; op = pwcs; - while (*s && len > 0) { + while (len > 0 && maxlen > 0 && *s) { if (*s & 0x80) { size = utf8_to_utf32(s, len, &u); if (size < 0) return -EINVAL; + s += size; + len -= size; if (u >= PLANE_SIZE) { + if (maxlen < 2) + break; u -= PLANE_SIZE; - *op++ = (wchar_t) (SURROGATE_PAIR | - ((u >> 10) & SURROGATE_BITS)); - *op++ = (wchar_t) (SURROGATE_PAIR | + put_utf16(op++, SURROGATE_PAIR | + ((u >> 10) & SURROGATE_BITS), + endian); + put_utf16(op++, SURROGATE_PAIR | SURROGATE_LOW | - (u & SURROGATE_BITS)); + (u & SURROGATE_BITS), + endian); + maxlen -= 2; } else { - *op++ = (wchar_t) u; + put_utf16(op++, u, endian); + maxlen--; } - s += size; - len -= size; } else { - *op++ = *s++; + put_utf16(op++, *s++, endian); len--; + maxlen--; } } return op - pwcs; diff --git a/include/linux/nls.h b/include/linux/nls.h index d47beef08df..5dc635f8d79 100644 --- a/include/linux/nls.h +++ b/include/linux/nls.h @@ -43,7 +43,7 @@ enum utf16_endian { UTF16_BIG_ENDIAN }; -/* nls.c */ +/* nls_base.c */ extern int register_nls(struct nls_table *); extern int unregister_nls(struct 
nls_table *); extern struct nls_table *load_nls(char *); @@ -52,7 +52,8 @@ extern struct nls_table *load_nls_default(void); extern int utf8_to_utf32(const u8 *s, int len, unicode_t *pu); extern int utf32_to_utf8(unicode_t u, u8 *s, int maxlen); -extern int utf8s_to_utf16s(const u8 *s, int len, wchar_t *pwcs); +extern int utf8s_to_utf16s(const u8 *s, int len, + enum utf16_endian endian, wchar_t *pwcs, int maxlen); extern int utf16s_to_utf8s(const wchar_t *pwcs, int len, enum utf16_endian endian, u8 *s, int maxlen); -- cgit v1.2.3 From 8c111b3f56332a216b18cd57950bdf04ac8f2a98 Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Sat, 19 Nov 2011 17:34:29 +0800 Subject: jbd: clear revoked flag on buffers before a new transaction started Currently, we clear revoked flag only when a block is reused. However, this can tigger a false journal error. Consider a situation when a block is used as a meta block and is deleted(revoked) in ordered mode, then the block is allocated as a data block to a file. At this moment, user changes the file's journal mode from ordered to journaled and truncates the file. The block will be considered re-revoked by journal because it has revoked flag still pending from the last transaction and an assertion triggers. We fix the problem by keeping the revoked status more uptodate - we clear revoked flag when switching revoke tables to reflect there is no revoked buffers in current transaction any more. Signed-off-by: Yongqiang Yang Signed-off-by: Jan Kara --- fs/jbd/commit.c | 6 ++++++ fs/jbd/revoke.c | 34 ++++++++++++++++++++++++++++++++++ include/linux/jbd.h | 1 + 3 files changed, 41 insertions(+) (limited to 'fs') diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index 8799207df05..f2b9a571f4c 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -391,6 +391,12 @@ void journal_commit_transaction(journal_t *journal) jbd_debug (3, "JBD: commit phase 1\n"); + /* + * Clear revoked flag to reflect there is no revoked buffers + * in the next transaction which is going to be started. + */ + journal_clear_buffer_revoked_flags(journal); + /* * Switch to a new revoke table. */ diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c index 305a9076315..25c713e7071 100644 --- a/fs/jbd/revoke.c +++ b/fs/jbd/revoke.c @@ -47,6 +47,10 @@ * overwriting the new data. We don't even need to clear the revoke * bit here. * + * We cache revoke status of a buffer in the current transaction in b_states + * bits. As the name says, revokevalid flag indicates that the cached revoke + * status of a buffer is valid and we can rely on the cached status. + * * Revoke information on buffers is a tri-state value: * * RevokeValid clear: no cached revoke status, need to look it up @@ -479,6 +483,36 @@ int journal_cancel_revoke(handle_t *handle, struct journal_head *jh) return did_revoke; } +/* + * journal_clear_revoked_flags clears revoked flag of buffers in + * revoke table to reflect there is no revoked buffer in the next + * transaction which is going to be started. 
+ */ +void journal_clear_buffer_revoked_flags(journal_t *journal) +{ + struct jbd_revoke_table_s *revoke = journal->j_revoke; + int i = 0; + + for (i = 0; i < revoke->hash_size; i++) { + struct list_head *hash_list; + struct list_head *list_entry; + hash_list = &revoke->hash_table[i]; + + list_for_each(list_entry, hash_list) { + struct jbd_revoke_record_s *record; + struct buffer_head *bh; + record = (struct jbd_revoke_record_s *)list_entry; + bh = __find_get_block(journal->j_fs_dev, + record->blocknr, + journal->j_blocksize); + if (bh) { + clear_buffer_revoked(bh); + __brelse(bh); + } + } + } +} + /* journal_switch_revoke table select j_revoke for next transaction * we do not want to suspend any processing until all revokes are * written -bzzz diff --git a/include/linux/jbd.h b/include/linux/jbd.h index c7acdde3243..492cc12244f 100644 --- a/include/linux/jbd.h +++ b/include/linux/jbd.h @@ -913,6 +913,7 @@ extern int journal_set_revoke(journal_t *, unsigned int, tid_t); extern int journal_test_revoke(journal_t *, unsigned int, tid_t); extern void journal_clear_revoke(journal_t *); extern void journal_switch_revoke_table(journal_t *journal); +extern void journal_clear_buffer_revoked_flags(journal_t *journal); /* * The log thread user interface: -- cgit v1.2.3 From eaecf43a6970c8d0ef54a31427c82a99e4863fe8 Mon Sep 17 00:00:00 2001 From: Thomas Meyer Date: Fri, 18 Nov 2011 00:00:52 +0100 Subject: UBIFS: Use kmemdup rather than duplicating its implementation The semantic patch that makes this change is available in scripts/coccinelle/api/memdup.cocci. Signed-off-by: Thomas Meyer Signed-off-by: Artem Bityutskiy --- fs/ubifs/lpt.c | 6 ++---- fs/ubifs/tnc.c | 3 +-- fs/ubifs/xattr.c | 6 ++---- 3 files changed, 5 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c index 6189c74d97f..66d59d0a140 100644 --- a/fs/ubifs/lpt.c +++ b/fs/ubifs/lpt.c @@ -1986,12 +1986,11 @@ again: if (path[h].in_tree) continue; - nnode = kmalloc(sz, GFP_NOFS); + nnode = kmemdup(&path[h].nnode, sz, GFP_NOFS); if (!nnode) { err = -ENOMEM; goto out; } - memcpy(nnode, &path[h].nnode, sz); parent = nnode->parent; parent->nbranch[nnode->iip].nnode = nnode; path[h].ptr.nnode = nnode; @@ -2004,12 +2003,11 @@ again: const size_t sz = sizeof(struct ubifs_pnode); struct ubifs_nnode *parent; - pnode = kmalloc(sz, GFP_NOFS); + pnode = kmemdup(&path[h].pnode, sz, GFP_NOFS); if (!pnode) { err = -ENOMEM; goto out; } - memcpy(pnode, &path[h].pnode, sz); parent = pnode->parent; parent->nbranch[pnode->iip].pnode = pnode; path[h].ptr.pnode = pnode; diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c index 06673864768..e14ee53159d 100644 --- a/fs/ubifs/tnc.c +++ b/fs/ubifs/tnc.c @@ -344,12 +344,11 @@ static int lnc_add(struct ubifs_info *c, struct ubifs_zbranch *zbr, return err; } - lnc_node = kmalloc(zbr->len, GFP_NOFS); + lnc_node = kmemdup(node, zbr->len, GFP_NOFS); if (!lnc_node) /* We don't have to have the cache, so no error */ return 0; - memcpy(lnc_node, node, zbr->len); zbr->leaf = lnc_node; return 0; } diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index bf18f7a0454..85b27226875 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -138,12 +138,11 @@ static int create_xattr(struct ubifs_info *c, struct inode *host, ui = ubifs_inode(inode); ui->xattr = 1; ui->flags |= UBIFS_XATTR_FL; - ui->data = kmalloc(size, GFP_NOFS); + ui->data = kmemdup(value, size, GFP_NOFS); if (!ui->data) { err = -ENOMEM; goto out_free; } - memcpy(ui->data, value, size); inode->i_size = ui->ui_size = size; ui->data_len = 
size; @@ -204,12 +203,11 @@ static int change_xattr(struct ubifs_info *c, struct inode *host, return err; kfree(ui->data); - ui->data = kmalloc(size, GFP_NOFS); + ui->data = kmemdup(value, size, GFP_NOFS); if (!ui->data) { err = -ENOMEM; goto out_free; } - memcpy(ui->data, value, size); inode->i_size = ui->ui_size = size; ui->data_len = size; -- cgit v1.2.3 From bcdd0c1600903e9222abfcde28947406020ccb5d Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 22 Nov 2011 11:00:20 +0300 Subject: ext3: NULL dereference in ext3_evict_inode() This is an fsfuzzer bug. ->s_journal is set at the end of ext3_load_journal() but we try to use it in the error handling from ext3_get_journal() while it's still NULL. [ 337.039041] BUG: unable to handle kernel NULL pointer dereference at 0000000000000024 [ 337.040380] IP: [] _raw_spin_lock+0x9/0x30 [ 337.041687] PGD 0 [ 337.043118] Oops: 0002 [#1] SMP [ 337.044483] CPU 3 [ 337.044495] Modules linked in: ecb md4 cifs fuse kvm_intel kvm brcmsmac brcmutil crc8 cordic r8169 [last unloaded: scsi_wait_scan] [ 337.047633] [ 337.049259] Pid: 8308, comm: mount Not tainted 3.2.0-rc2-next-20111121+ #24 SAMSUNG ELECTRONICS CO., LTD. RV411/RV511/E3511/S3511 /RV411/RV511/E3511/S3511 [ 337.051064] RIP: 0010:[] [] _raw_spin_lock+0x9/0x30 [ 337.052879] RSP: 0018:ffff8800b1d11ae8 EFLAGS: 00010282 [ 337.054668] RAX: 0000000000000100 RBX: 0000000000000000 RCX: ffff8800b77c2000 [ 337.056400] RDX: ffff8800a97b5c00 RSI: 0000000000000000 RDI: 0000000000000024 [ 337.058099] RBP: ffff8800b1d11ae8 R08: 6000000000000000 R09: e018000000000000 [ 337.059841] R10: ff67366cc2607c03 R11: 00000000110688e6 R12: 0000000000000000 [ 337.061607] R13: 0000000000000000 R14: 0000000000000000 R15: ffff8800a78f06e8 [ 337.063385] FS: 00007f9d95652800(0000) GS:ffff8800b7180000(0000) knlGS:0000000000000000 [ 337.065110] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 337.066801] CR2: 0000000000000024 CR3: 00000000aef2c000 CR4: 00000000000006e0 [ 337.068581] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 337.070321] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 [ 337.072105] Process mount (pid: 8308, threadinfo ffff8800b1d10000, task ffff8800b1d02be0) [ 337.073800] Stack: [ 337.075487] ffff8800b1d11b08 ffffffff811f48cf ffff88007ac9b158 0000000000000000 [ 337.077255] ffff8800b1d11b38 ffffffff8119405d ffff88007ac9b158 ffff88007ac9b250 [ 337.078851] ffffffff8181bda0 ffffffff8181bda0 ffff8800b1d11b68 ffffffff81131e31 [ 337.080284] Call Trace: [ 337.081706] [] log_start_commit+0x1f/0x40 [ 337.083107] [] ext3_evict_inode+0x1fd/0x2a0 [ 337.084490] [] evict+0xa1/0x1a0 [ 337.085857] [] iput+0x101/0x210 [ 337.087220] [] iget_failed+0x21/0x30 [ 337.088581] [] ext3_iget+0x15c/0x450 [ 337.089936] [] ? ext3_rsv_window_add+0x81/0x100 [ 337.091284] [] ext3_get_journal+0x15/0xde [ 337.092641] [] ext3_fill_super+0xf2b/0x1c30 [ 337.093991] [] ? register_shrinker+0x4d/0x60 [ 337.095332] [] mount_bdev+0x1a2/0x1e0 [ 337.096680] [] ? ext3_setup_super+0x210/0x210 [ 337.098026] [] ext3_mount+0x10/0x20 [ 337.099362] [] mount_fs+0x3e/0x1b0 [ 337.100759] [] ? __alloc_percpu+0xb/0x10 [ 337.102330] [] vfs_kern_mount+0x65/0xc0 [ 337.103889] [] do_kern_mount+0x4f/0x100 [ 337.105442] [] do_mount+0x19c/0x890 [ 337.106989] [] ? memdup_user+0x46/0x90 [ 337.108572] [] ? 
strndup_user+0x53/0x70 [ 337.110114] [] sys_mount+0x8b/0xe0 [ 337.111617] [] system_call_fastpath+0x16/0x1b [ 337.113133] Code: 38 c2 74 0f 66 0f 1f 44 00 00 f3 90 0f b6 03 38 c2 75 f7 48 83 c4 08 5b 5d c3 0f 1f 84 00 00 00 00 00 55 b8 00 01 00 00 48 89 e5 66 0f c1 07 0f b6 d4 38 c2 74 0c 0f 1f 00 f3 90 0f b6 07 38 [ 337.116588] RIP [] _raw_spin_lock+0x9/0x30 [ 337.118260] RSP [ 337.119998] CR2: 0000000000000024 [ 337.188701] ---[ end trace c36d790becac1615 ]--- Signed-off-by: Dan Carpenter Signed-off-by: Jan Kara --- fs/ext3/inode.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 85fe655fe3e..9da9125ba3e 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -223,8 +223,12 @@ void ext3_evict_inode (struct inode *inode) * * Note that directories do not have this problem because they don't * use page cache. + * + * The s_journal check handles the case when ext3_get_journal() fails + * and puts the journal inode. */ if (inode->i_nlink && ext3_should_journal_data(inode) && + EXT3_SB(inode->i_sb)->s_journal && (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) { tid_t commit_tid = atomic_read(&ei->i_datasync_tid); journal_t *journal = EXT3_SB(inode->i_sb)->s_journal; -- cgit v1.2.3 From 67114fe6103183190fb0a24f58e5695a80beb2ec Mon Sep 17 00:00:00 2001 From: Thomas Meyer Date: Thu, 17 Nov 2011 23:43:40 +0100 Subject: nfsd4: Use kmemdup rather than duplicating its implementation The semantic patch that makes this change is available in scripts/coccinelle/api/memdup.cocci. Signed-off-by: Thomas Meyer Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 3 +-- fs/nfsd/nfs4xdr.c | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index a84978c13bb..6ab677913f9 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -986,12 +986,11 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name) clp = kzalloc(sizeof(struct nfs4_client), GFP_KERNEL); if (clp == NULL) return NULL; - clp->cl_name.data = kmalloc(name.len, GFP_KERNEL); + clp->cl_name.data = kmemdup(name.data, name.len, GFP_KERNEL); if (clp->cl_name.data == NULL) { kfree(clp); return NULL; } - memcpy(clp->cl_name.data, name.data, name.len); clp->cl_name.len = name.len; return clp; } diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index b6fa792d6b8..0ec5a1b9700 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -215,10 +215,9 @@ defer_free(struct nfsd4_compoundargs *argp, static char *savemem(struct nfsd4_compoundargs *argp, __be32 *p, int nbytes) { if (p == argp->tmp) { - p = kmalloc(nbytes, GFP_KERNEL); + p = kmemdup(argp->tmp, nbytes, GFP_KERNEL); if (!p) return NULL; - memcpy(p, argp->tmp, nbytes); } else { BUG_ON(p != argp->tmpp); argp->tmpp = NULL; -- cgit v1.2.3 From 045ddc8991698a8e9c5668c6190faa8b5d516dc0 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Mon, 21 Nov 2011 10:15:13 -0500 Subject: NLS: rename "maxlen" to "maxout" in UTF conversion routines As requested by NamJae Jeon, this patch (as1503) changes the name of the "maxlen" parameters to "maxout" in the various UTF conversion routines. This should make the role of that parameter clearer. The patch also renames the "len" parameters to "inlen", for the same reason. 
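For reference, a minimal caller-side sketch of the bounded, endian-aware conversion API discussed in these two NLS patches (the wrapper name and buffer handling are illustrative assumptions, not code from any caller in this series):

/* Hypothetical helper: convert a NUL-terminated UTF-8 name into a
 * little-endian UTF-16 buffer of at most out_words 16-bit units.
 * Returns the number of units written, or -EINVAL on malformed UTF-8. */
#include <linux/nls.h>
#include <linux/string.h>

static int encode_name_utf16le(const char *name, wchar_t *out, int out_words)
{
        return utf8s_to_utf16s((const u8 *)name, strlen(name),
                               UTF16_LITTLE_ENDIAN, out, out_words);
}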
Signed-off-by: Alan Stern Reviewed-by: NamJae Jeon Signed-off-by: Greg Kroah-Hartman --- fs/nls/nls_base.c | 46 +++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/nls/nls_base.c b/fs/nls/nls_base.c index 0eb059ec6f2..fea6bd5831d 100644 --- a/fs/nls/nls_base.c +++ b/fs/nls/nls_base.c @@ -52,7 +52,7 @@ static const struct utf8_table utf8_table[] = #define SURROGATE_LOW 0x00000400 #define SURROGATE_BITS 0x000003ff -int utf8_to_utf32(const u8 *s, int len, unicode_t *pu) +int utf8_to_utf32(const u8 *s, int inlen, unicode_t *pu) { unsigned long l; int c0, c, nc; @@ -71,7 +71,7 @@ int utf8_to_utf32(const u8 *s, int len, unicode_t *pu) *pu = (unicode_t) l; return nc; } - if (len <= nc) + if (inlen <= nc) return -1; s++; c = (*s ^ 0x80) & 0xFF; @@ -83,7 +83,7 @@ int utf8_to_utf32(const u8 *s, int len, unicode_t *pu) } EXPORT_SYMBOL(utf8_to_utf32); -int utf32_to_utf8(unicode_t u, u8 *s, int maxlen) +int utf32_to_utf8(unicode_t u, u8 *s, int maxout) { unsigned long l; int c, nc; @@ -97,7 +97,7 @@ int utf32_to_utf8(unicode_t u, u8 *s, int maxlen) return -1; nc = 0; - for (t = utf8_table; t->cmask && maxlen; t++, maxlen--) { + for (t = utf8_table; t->cmask && maxout; t++, maxout--) { nc++; if (l <= t->lmask) { c = t->shift; @@ -129,24 +129,24 @@ static inline void put_utf16(wchar_t *s, unsigned c, enum utf16_endian endian) } } -int utf8s_to_utf16s(const u8 *s, int len, enum utf16_endian endian, - wchar_t *pwcs, int maxlen) +int utf8s_to_utf16s(const u8 *s, int inlen, enum utf16_endian endian, + wchar_t *pwcs, int maxout) { u16 *op; int size; unicode_t u; op = pwcs; - while (len > 0 && maxlen > 0 && *s) { + while (inlen > 0 && maxout > 0 && *s) { if (*s & 0x80) { - size = utf8_to_utf32(s, len, &u); + size = utf8_to_utf32(s, inlen, &u); if (size < 0) return -EINVAL; s += size; - len -= size; + inlen -= size; if (u >= PLANE_SIZE) { - if (maxlen < 2) + if (maxout < 2) break; u -= PLANE_SIZE; put_utf16(op++, SURROGATE_PAIR | @@ -156,15 +156,15 @@ int utf8s_to_utf16s(const u8 *s, int len, enum utf16_endian endian, SURROGATE_LOW | (u & SURROGATE_BITS), endian); - maxlen -= 2; + maxout -= 2; } else { put_utf16(op++, u, endian); - maxlen--; + maxout--; } } else { put_utf16(op++, *s++, endian); - len--; - maxlen--; + inlen--; + maxout--; } } return op - pwcs; @@ -183,27 +183,27 @@ static inline unsigned long get_utf16(unsigned c, enum utf16_endian endian) } } -int utf16s_to_utf8s(const wchar_t *pwcs, int len, enum utf16_endian endian, - u8 *s, int maxlen) +int utf16s_to_utf8s(const wchar_t *pwcs, int inlen, enum utf16_endian endian, + u8 *s, int maxout) { u8 *op; int size; unsigned long u, v; op = s; - while (len > 0 && maxlen > 0) { + while (inlen > 0 && maxout > 0) { u = get_utf16(*pwcs, endian); if (!u) break; pwcs++; - len--; + inlen--; if (u > 0x7f) { if ((u & SURROGATE_MASK) == SURROGATE_PAIR) { if (u & SURROGATE_LOW) { /* Ignore character and move on */ continue; } - if (len <= 0) + if (inlen <= 0) break; v = get_utf16(*pwcs, endian); if ((v & SURROGATE_MASK) != SURROGATE_PAIR || @@ -214,18 +214,18 @@ int utf16s_to_utf8s(const wchar_t *pwcs, int len, enum utf16_endian endian, u = PLANE_SIZE + ((u & SURROGATE_BITS) << 10) + (v & SURROGATE_BITS); pwcs++; - len--; + inlen--; } - size = utf32_to_utf8(u, op, maxlen); + size = utf32_to_utf8(u, op, maxout); if (size == -1) { /* Ignore character and move on */ } else { op += size; - maxlen -= size; + maxout -= size; } } else { *op++ = (u8) u; - maxlen--; + maxout--; } } return op - s; 
-- cgit v1.2.3 From b2ea70afade7080360ac55c4e64ff7a5fafdb67b Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Fri, 18 Nov 2011 12:14:49 +0200 Subject: nfsd: Fix oops when parsing a 0 length export expkey_parse() oopses when handling a 0 length export. This is easily triggerable from usermode by writing 0 bytes into '/proc/[proc id]/net/rpc/nfsd.fh/channel'. Below is the log: [ 1402.286893] BUG: unable to handle kernel paging request at ffff880077c49fff [ 1402.287632] IP: [] expkey_parse+0x28/0x2e1 [ 1402.287632] PGD 2206063 PUD 1fdfd067 PMD 1ffbc067 PTE 8000000077c49160 [ 1402.287632] Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC [ 1402.287632] CPU 1 [ 1402.287632] Pid: 20198, comm: trinity Not tainted 3.2.0-rc2-sasha-00058-gc65cd37 #6 [ 1402.287632] RIP: 0010:[] [] expkey_parse+0x28/0x2e1 [ 1402.287632] RSP: 0018:ffff880077f0fd68 EFLAGS: 00010292 [ 1402.287632] RAX: ffff880077c49fff RBX: 00000000ffffffea RCX: 0000000001043400 [ 1402.287632] RDX: 0000000000000000 RSI: ffff880077c4a000 RDI: ffffffff82283de0 [ 1402.287632] RBP: ffff880077f0fe18 R08: 0000000000000001 R09: ffff880000000000 [ 1402.287632] R10: 0000000000000000 R11: 0000000000000001 R12: ffff880077c4a000 [ 1402.287632] R13: ffffffff82283de0 R14: 0000000001043400 R15: ffffffff82283de0 [ 1402.287632] FS: 00007f25fec3f700(0000) GS:ffff88007d400000(0000) knlGS:0000000000000000 [ 1402.287632] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [ 1402.287632] CR2: ffff880077c49fff CR3: 0000000077e1d000 CR4: 00000000000406e0 [ 1402.287632] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 1402.287632] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 [ 1402.287632] Process trinity (pid: 20198, threadinfo ffff880077f0e000, task ffff880077db17b0) [ 1402.287632] Stack: [ 1402.287632] ffff880077db17b0 ffff880077c4a000 ffff880077f0fdb8 ffffffff810b411e [ 1402.287632] ffff880000000000 ffff880077db17b0 ffff880077c4a000 ffffffff82283de0 [ 1402.287632] 0000000001043400 ffffffff82283de0 ffff880077f0fde8 ffffffff81111f63 [ 1402.287632] Call Trace: [ 1402.287632] [] ? lock_release+0x1af/0x1bc [ 1402.287632] [] ? might_fault+0x97/0x9e [ 1402.287632] [] ? might_fault+0x4e/0x9e [ 1402.287632] [] cache_do_downcall+0x3e/0x4f [ 1402.287632] [] cache_write.clone.16+0xbb/0x130 [ 1402.287632] [] ? cache_write_pipefs+0x1a/0x1a [ 1402.287632] [] cache_write_procfs+0x19/0x1b [ 1402.287632] [] proc_reg_write+0x8e/0xad [ 1402.287632] [] vfs_write+0xaa/0xfd [ 1402.287632] [] ? fget_light+0x35/0x9e [ 1402.287632] [] sys_write+0x48/0x6f [ 1402.287632] [] system_call_fastpath+0x16/0x1b [ 1402.287632] Code: c0 c9 c3 55 48 63 d2 48 89 e5 48 8d 44 32 ff 41 57 41 56 41 55 41 54 53 bb ea ff ff ff 48 81 ec 88 00 00 00 48 89 b5 58 ff ff ff [ 1402.287632] 38 0a 0f 85 89 02 00 00 c6 00 00 48 8b 3d 44 4a e5 01 48 85 [ 1402.287632] RIP [] expkey_parse+0x28/0x2e1 [ 1402.287632] RSP [ 1402.287632] CR2: ffff880077c49fff [ 1402.287632] ---[ end trace 368ef53ff773a5e3 ]--- Cc: "J. Bruce Fields" Cc: Neil Brown Cc: linux-nfs@vger.kernel.org Cc: stable@kernel.org Signed-off-by: Sasha Levin Signed-off-by: J. 
Bruce Fields --- fs/nfsd/export.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 62f3b9074e8..5f312abf124 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -87,7 +87,7 @@ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen) struct svc_expkey key; struct svc_expkey *ek = NULL; - if (mesg[mlen-1] != '\n') + if (mlen < 1 || mesg[mlen-1] != '\n') return -EINVAL; mesg[mlen-1] = 0; -- cgit v1.2.3 From 0cf99b91c669510b785b459c211772091a94efd5 Mon Sep 17 00:00:00 2001 From: Mi Jinlong Date: Wed, 23 Nov 2011 10:48:40 +0800 Subject: nfsd41: allow non-reclaim open-by-fh's in 4.1 With NFSv4.0 it was safe to assume that open-by-filehandles were always reclaims. With NFSv4.1 there are non-reclaim open-by-filehandle operations, so we should ensure we're only insisting on reclaims in the OPEN_CLAIM_PREVIOUS case. Signed-off-by: Mi Jinlong Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4proc.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index fa383361bc6..9415bc415ce 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -266,10 +266,6 @@ do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_ { __be32 status; - /* Only reclaims from previously confirmed clients are valid */ - if ((status = nfs4_check_open_reclaim(&open->op_clientid))) - return status; - /* We don't know the target directory, and therefore can not * set the change info */ @@ -373,6 +369,9 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, break; case NFS4_OPEN_CLAIM_PREVIOUS: open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; + status = nfs4_check_open_reclaim(&open->op_clientid); + if (status) + goto out; case NFS4_OPEN_CLAIM_FH: case NFS4_OPEN_CLAIM_DELEG_CUR_FH: status = do_open_fhandle(rqstp, &cstate->current_fh, -- cgit v1.2.3 From f5c8593b94190aabdcf207a544f082c7816c4fe6 Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Wed, 7 Dec 2011 12:57:56 +0300 Subject: NFSd: use network-namespace-aware cache registering routines v2: cache_register_net() and cache_unregister_net() GPL exports added This is a cleanup patch. Hopefully, the generic cache_register() and cache_unregister() will be removed some day. Signed-off-by: Stanislav Kinsbursky Signed-off-by: J. 
Bruce Fields --- fs/nfsd/export.c | 10 +++++----- fs/nfsd/nfs4idmap.c | 11 ++++++----- net/sunrpc/cache.c | 2 ++ 3 files changed, 13 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 5f312abf124..cf8a6bd062f 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -1226,12 +1226,12 @@ nfsd_export_init(void) int rv; dprintk("nfsd: initializing export module.\n"); - rv = cache_register(&svc_export_cache); + rv = cache_register_net(&svc_export_cache, &init_net); if (rv) return rv; - rv = cache_register(&svc_expkey_cache); + rv = cache_register_net(&svc_expkey_cache, &init_net); if (rv) - cache_unregister(&svc_export_cache); + cache_unregister_net(&svc_export_cache, &init_net); return rv; } @@ -1255,8 +1255,8 @@ nfsd_export_shutdown(void) dprintk("nfsd: shutting down export module.\n"); - cache_unregister(&svc_expkey_cache); - cache_unregister(&svc_export_cache); + cache_unregister_net(&svc_expkey_cache, &init_net); + cache_unregister_net(&svc_export_cache, &init_net); svcauth_unix_purge(); dprintk("nfsd: export shutdown complete.\n"); diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c index 55780a22fdb..94096273cd6 100644 --- a/fs/nfsd/nfs4idmap.c +++ b/fs/nfsd/nfs4idmap.c @@ -36,6 +36,7 @@ #include #include #include +#include #include "idmap.h" #include "nfsd.h" @@ -466,20 +467,20 @@ nfsd_idmap_init(void) { int rv; - rv = cache_register(&idtoname_cache); + rv = cache_register_net(&idtoname_cache, &init_net); if (rv) return rv; - rv = cache_register(&nametoid_cache); + rv = cache_register_net(&nametoid_cache, &init_net); if (rv) - cache_unregister(&idtoname_cache); + cache_unregister_net(&idtoname_cache, &init_net); return rv; } void nfsd_idmap_shutdown(void) { - cache_unregister(&idtoname_cache); - cache_unregister(&nametoid_cache); + cache_unregister_net(&idtoname_cache, &init_net); + cache_unregister_net(&nametoid_cache, &init_net); } static int diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 72ad836e4fe..b8daa577734 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -1641,6 +1641,7 @@ int cache_register_net(struct cache_detail *cd, struct net *net) sunrpc_destroy_cache_detail(cd); return ret; } +EXPORT_SYMBOL_GPL(cache_register_net); int cache_register(struct cache_detail *cd) { @@ -1653,6 +1654,7 @@ void cache_unregister_net(struct cache_detail *cd, struct net *net) remove_cache_proc_entries(cd, net); sunrpc_destroy_cache_detail(cd); } +EXPORT_SYMBOL_GPL(cache_unregister_net); void cache_unregister(struct cache_detail *cd) { -- cgit v1.2.3 From f32f3c2d3f09a586349ca9180885dc8741290fd9 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 12 Dec 2011 15:00:35 -0500 Subject: nfsd4: initialize special stateid's at compile time Stateid's with "other" ("opaque") field all zeros or all ones are reserved. We define all_ones separately on the off chance there will be more such some day, though currently all the other special stateid's have zero other field. Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4state.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 6ab677913f9..213da7b7e7d 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -49,12 +49,20 @@ time_t nfsd4_lease = 90; /* default lease time */ time_t nfsd4_grace = 90; static time_t boot_time; -static stateid_t zerostateid; /* bits all 0 */ -static stateid_t onestateid; /* bits all 1 */ + +#define all_ones {{~0,~0},~0} +static const stateid_t one_stateid = { + .si_generation = ~0, + .si_opaque = all_ones, +}; +static const stateid_t zero_stateid = { + /* all fields zero */ +}; + static u64 current_sessionid = 1; -#define ZERO_STATEID(stateid) (!memcmp((stateid), &zerostateid, sizeof(stateid_t))) -#define ONE_STATEID(stateid) (!memcmp((stateid), &onestateid, sizeof(stateid_t))) +#define ZERO_STATEID(stateid) (!memcmp((stateid), &zero_stateid, sizeof(stateid_t))) +#define ONE_STATEID(stateid) (!memcmp((stateid), &one_stateid, sizeof(stateid_t))) /* forward declarations */ static int check_for_locks(struct nfs4_file *filp, struct nfs4_lockowner *lowner); @@ -4564,7 +4572,6 @@ nfs4_state_init(void) } for (i = 0; i < LOCKOWNER_INO_HASH_SIZE; i++) INIT_LIST_HEAD(&lockowner_ino_hashtbl[i]); - memset(&onestateid, ~0, sizeof(stateid_t)); INIT_LIST_HEAD(&close_lru); INIT_LIST_HEAD(&client_lru); INIT_LIST_HEAD(&del_recall_lru); -- cgit v1.2.3 From c07c3d193412bbf4e9f405e75dc84e35e77fac28 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 13 Dec 2011 11:58:48 +0100 Subject: fuse: llseek optimize SEEK_CUR and SEEK_SET Use generic_file_llseek() instead of open coding the seek function. i_mutex protection is only necessary for SEEK_END (and SEEK_HOLE, SEEK_DATA), so move SEEK_CUR and SEEK_SET out from under i_mutex. 
Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 48 ++++++++---------------------------------------- 1 file changed, 8 insertions(+), 40 deletions(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 0c84100acd4..e0b60acfe72 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1555,48 +1555,16 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int origin) loff_t retval; struct inode *inode = file->f_path.dentry->d_inode; - mutex_lock(&inode->i_mutex); - if (origin != SEEK_CUR && origin != SEEK_SET) { - retval = fuse_update_attributes(inode, NULL, file, NULL); - if (retval) - goto exit; - } + /* No i_mutex protection necessary for SEEK_CUR and SEEK_SET */ + if (origin == SEEK_CUR || origin == SEEK_SET) + return generic_file_llseek(file, offset, origin); - switch (origin) { - case SEEK_END: - offset += i_size_read(inode); - break; - case SEEK_CUR: - if (offset == 0) { - retval = file->f_pos; - goto exit; - } - offset += file->f_pos; - break; - case SEEK_DATA: - if (offset >= i_size_read(inode)) { - retval = -ENXIO; - goto exit; - } - break; - case SEEK_HOLE: - if (offset >= i_size_read(inode)) { - retval = -ENXIO; - goto exit; - } - offset = i_size_read(inode); - break; - } - retval = -EINVAL; - if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) { - if (offset != file->f_pos) { - file->f_pos = offset; - file->f_version = 0; - } - retval = offset; - } -exit: + mutex_lock(&inode->i_mutex); + retval = fuse_update_attributes(inode, NULL, file, NULL); + if (!retval) + retval = generic_file_llseek(file, offset, origin); mutex_unlock(&inode->i_mutex); + return retval; } -- cgit v1.2.3 From c411cc88d873b3f68635a04691f7f115c46bc39e Mon Sep 17 00:00:00 2001 From: Thomas Meyer Date: Tue, 29 Nov 2011 22:08:00 +0100 Subject: fuse: Use kcalloc instead of kzalloc to allocate array The advantage of kcalloc is that it will prevent integer overflows which could result from the multiplication of the number of elements and size, and it is also a bit nicer to read. The semantic patch that makes this change is available in https://lkml.org/lkml/2011/11/25/107 Signed-off-by: Thomas Meyer Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index e0b60acfe72..c297425cba7 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1776,7 +1776,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE); err = -ENOMEM; - pages = kzalloc(sizeof(pages[0]) * FUSE_MAX_PAGES_PER_REQ, GFP_KERNEL); + pages = kcalloc(FUSE_MAX_PAGES_PER_REQ, sizeof(pages[0]), GFP_KERNEL); iov_page = (struct iovec *) __get_free_page(GFP_KERNEL); if (!pages || !iov_page) goto out; -- cgit v1.2.3 From b18da0c56e9ff43a007b6c8e302c62e720964151 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 13 Dec 2011 11:58:49 +0100 Subject: fuse: support ioctl on directories Multiplexing filesystems may want to support ioctls on the underlying files and directories (e.g. FS_IOC_{GET,SET}FLAGS). Ioctl support on directories was missing so add it now. 
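To make the directory-ioctl use case concrete, here is a small user-space sketch of the sort of call this change allows a FUSE server to see (the helper name is made up and error handling is minimal; treat it as an illustration, not part of the patch):

/* Hypothetical illustration: issue FS_IOC_GETFLAGS on a directory fd.
 * With FUSE_IOCTL_DIR support the kernel can forward this to the daemon. */
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

static int get_dir_flags(const char *path, long *flags)
{
        int fd = open(path, O_RDONLY | O_DIRECTORY);
        int ret;

        if (fd < 0)
                return -1;
        ret = ioctl(fd, FS_IOC_GETFLAGS, flags);  /* attr flags returned via *flags */
        close(fd);
        return ret;
}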
Reported-by: Antonio SJ Musumeci Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 26 ++++++++++++++++++++++++++ fs/fuse/file.c | 8 ++++---- fs/fuse/fuse_i.h | 2 ++ include/linux/fuse.h | 7 ++++++- 4 files changed, 38 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 9f63e493a9b..344577933f6 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1182,6 +1182,30 @@ static int fuse_dir_fsync(struct file *file, loff_t start, loff_t end, return fuse_fsync_common(file, start, end, datasync, 1); } +static long fuse_dir_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct fuse_conn *fc = get_fuse_conn(file->f_mapping->host); + + /* FUSE_IOCTL_DIR only supported for API version >= 7.18 */ + if (fc->minor < 18) + return -ENOTTY; + + return fuse_ioctl_common(file, cmd, arg, FUSE_IOCTL_DIR); +} + +static long fuse_dir_compat_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct fuse_conn *fc = get_fuse_conn(file->f_mapping->host); + + if (fc->minor < 18) + return -ENOTTY; + + return fuse_ioctl_common(file, cmd, arg, + FUSE_IOCTL_COMPAT | FUSE_IOCTL_DIR); +} + static bool update_mtime(unsigned ivalid) { /* Always update if mtime is explicitly set */ @@ -1596,6 +1620,8 @@ static const struct file_operations fuse_dir_operations = { .open = fuse_dir_open, .release = fuse_dir_release, .fsync = fuse_dir_fsync, + .unlocked_ioctl = fuse_dir_ioctl, + .compat_ioctl = fuse_dir_compat_ioctl, }; static const struct inode_operations fuse_common_inode_operations = { diff --git a/fs/fuse/file.c b/fs/fuse/file.c index c297425cba7..4a199fd93fb 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1926,8 +1926,8 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, } EXPORT_SYMBOL_GPL(fuse_do_ioctl); -static long fuse_file_ioctl_common(struct file *file, unsigned int cmd, - unsigned long arg, unsigned int flags) +long fuse_ioctl_common(struct file *file, unsigned int cmd, + unsigned long arg, unsigned int flags) { struct inode *inode = file->f_dentry->d_inode; struct fuse_conn *fc = get_fuse_conn(inode); @@ -1944,13 +1944,13 @@ static long fuse_file_ioctl_common(struct file *file, unsigned int cmd, static long fuse_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { - return fuse_file_ioctl_common(file, cmd, arg, 0); + return fuse_ioctl_common(file, cmd, arg, 0); } static long fuse_file_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { - return fuse_file_ioctl_common(file, cmd, arg, FUSE_IOCTL_COMPAT); + return fuse_ioctl_common(file, cmd, arg, FUSE_IOCTL_COMPAT); } /* diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index cf6db0a9321..09337bcc255 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -765,6 +765,8 @@ ssize_t fuse_direct_io(struct file *file, const char __user *buf, size_t count, loff_t *ppos, int write); long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, unsigned int flags); +long fuse_ioctl_common(struct file *file, unsigned int cmd, + unsigned long arg, unsigned int flags); unsigned fuse_file_poll(struct file *file, poll_table *wait); int fuse_dev_release(struct inode *inode, struct file *file); diff --git a/include/linux/fuse.h b/include/linux/fuse.h index 464cff52686..446c89718b9 100644 --- a/include/linux/fuse.h +++ b/include/linux/fuse.h @@ -50,6 +50,9 @@ * * 7.17 * - add FUSE_FLOCK_LOCKS and FUSE_RELEASE_FLOCK_UNLOCK + * + * 7.18 + * - add FUSE_IOCTL_DIR flag */ #ifndef _LINUX_FUSE_H @@ -81,7 +84,7 @@ #define 
FUSE_KERNEL_VERSION 7 /** Minor version number of this interface */ -#define FUSE_KERNEL_MINOR_VERSION 17 +#define FUSE_KERNEL_MINOR_VERSION 18 /** The node ID of the root inode */ #define FUSE_ROOT_ID 1 @@ -214,6 +217,7 @@ struct fuse_file_lock { * FUSE_IOCTL_UNRESTRICTED: not restricted to well-formed ioctls, retry allowed * FUSE_IOCTL_RETRY: retry with new iovecs * FUSE_IOCTL_32BIT: 32bit ioctl + * FUSE_IOCTL_DIR: is a directory * * FUSE_IOCTL_MAX_IOV: maximum of in_iovecs + out_iovecs */ @@ -221,6 +225,7 @@ struct fuse_file_lock { #define FUSE_IOCTL_UNRESTRICTED (1 << 1) #define FUSE_IOCTL_RETRY (1 << 2) #define FUSE_IOCTL_32BIT (1 << 3) +#define FUSE_IOCTL_DIR (1 << 4) #define FUSE_IOCTL_MAX_IOV 256 -- cgit v1.2.3 From 451d0f599934fd97faf54a5d7954b518e66192cb Mon Sep 17 00:00:00 2001 From: John Muir Date: Tue, 6 Dec 2011 21:50:06 +0100 Subject: FUSE: Notifying the kernel of deletion. Allows a FUSE file-system to tell the kernel when a file or directory is deleted. If the specified dentry has the specified inode number, the kernel will unhash it. The current 'fuse_notify_inval_entry' does not properly cause the kernel to clean up directories that are in use, and as a result the users of those directories see incorrect semantics from the file-system. The error condition seen when 'fuse_notify_inval_entry' is used to notify of a deleted directory is avoided when 'fuse_notify_delete' is used instead. The following scenario demonstrates the difference: 1. User A chdirs into 'testdir' and starts reading 'testfile'. 2. User B runs rm -rf 'testdir'. 3. User B creates 'testdir'. 4. User C chdirs into 'testdir'. If you run the above within the same machine on any file-system (including fuse file-systems), there is no problem: user C is able to chdir into the new testdir. The old testdir is removed from the dentry tree, but is still open by user A. If operations 2 and 3 are performed via the network such that the fuse file-system uses one of the notify functions to tell the kernel that the nodes are gone, then the following error occurs for user C while user A holds the original directory open: muirj@empacher:~> ls /test/testdir ls: cannot access /test/testdir: No such file or directory The issue here is that the kernel still has a dentry for testdir, and so it is requesting the attributes for the old directory, while the file-system is responding that the directory no longer exists. If, on the other hand, the file-system can notify the kernel that the directory is deleted using the new 'fuse_notify_delete' function, then the above ls will find the new directory as expected. 
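For orientation, a rough daemon-side sketch of how such a deletion notification is pushed over /dev/fuse (this assumes raw channel access rather than a library helper, and the function name is invented; the message layout follows the struct added by this patch):

/* Hypothetical sketch: send FUSE_NOTIFY_DELETE for (parent, child, name).
 * Notifications use unique == 0 and carry the notify code in hdr.error;
 * the name is written including its terminating NUL. */
#include <stdint.h>
#include <string.h>
#include <sys/uio.h>
#include <linux/fuse.h>

static int notify_delete(int fuse_fd, uint64_t parent, uint64_t child,
                         const char *name)
{
        struct fuse_out_header hdr;
        struct fuse_notify_delete_out arg;
        struct iovec iov[3];

        memset(&hdr, 0, sizeof(hdr));
        memset(&arg, 0, sizeof(arg));
        arg.parent = parent;
        arg.child = child;
        arg.namelen = strlen(name);
        hdr.unique = 0;                    /* 0 marks an unsolicited notification */
        hdr.error = FUSE_NOTIFY_DELETE;    /* notify code travels in the error field */
        hdr.len = sizeof(hdr) + sizeof(arg) + arg.namelen + 1;

        iov[0] = (struct iovec){ &hdr, sizeof(hdr) };
        iov[1] = (struct iovec){ &arg, sizeof(arg) };
        iov[2] = (struct iovec){ (void *)name, arg.namelen + 1 };
        return writev(fuse_fd, iov, 3) < 0 ? -1 : 0;
}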
Signed-off-by: John Muir Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++- fs/fuse/dir.c | 32 +++++++++++++++++++++++++++-- fs/fuse/fuse_i.h | 8 +++++++- include/linux/fuse.h | 9 +++++++++ 4 files changed, 102 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 2aaf3eaaf13..5f3368ab0fa 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1378,7 +1378,59 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size, down_read(&fc->killsb); err = -ENOENT; if (fc->sb) - err = fuse_reverse_inval_entry(fc->sb, outarg.parent, &name); + err = fuse_reverse_inval_entry(fc->sb, outarg.parent, 0, &name); + up_read(&fc->killsb); + kfree(buf); + return err; + +err: + kfree(buf); + fuse_copy_finish(cs); + return err; +} + +static int fuse_notify_delete(struct fuse_conn *fc, unsigned int size, + struct fuse_copy_state *cs) +{ + struct fuse_notify_delete_out outarg; + int err = -ENOMEM; + char *buf; + struct qstr name; + + buf = kzalloc(FUSE_NAME_MAX + 1, GFP_KERNEL); + if (!buf) + goto err; + + err = -EINVAL; + if (size < sizeof(outarg)) + goto err; + + err = fuse_copy_one(cs, &outarg, sizeof(outarg)); + if (err) + goto err; + + err = -ENAMETOOLONG; + if (outarg.namelen > FUSE_NAME_MAX) + goto err; + + err = -EINVAL; + if (size != sizeof(outarg) + outarg.namelen + 1) + goto err; + + name.name = buf; + name.len = outarg.namelen; + err = fuse_copy_one(cs, buf, outarg.namelen + 1); + if (err) + goto err; + fuse_copy_finish(cs); + buf[outarg.namelen] = 0; + name.hash = full_name_hash(name.name, name.len); + + down_read(&fc->killsb); + err = -ENOENT; + if (fc->sb) + err = fuse_reverse_inval_entry(fc->sb, outarg.parent, + outarg.child, &name); up_read(&fc->killsb); kfree(buf); return err; @@ -1597,6 +1649,9 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, case FUSE_NOTIFY_RETRIEVE: return fuse_notify_retrieve(fc, size, cs); + case FUSE_NOTIFY_DELETE: + return fuse_notify_delete(fc, size, cs); + default: fuse_copy_finish(cs); return -EINVAL; diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 344577933f6..bef8c3011d3 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -868,7 +868,7 @@ int fuse_update_attributes(struct inode *inode, struct kstat *stat, } int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, - struct qstr *name) + u64 child_nodeid, struct qstr *name) { int err = -ENOTDIR; struct inode *parent; @@ -895,8 +895,36 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, fuse_invalidate_attr(parent); fuse_invalidate_entry(entry); + + if (child_nodeid != 0 && entry->d_inode) { + mutex_lock(&entry->d_inode->i_mutex); + if (get_node_id(entry->d_inode) != child_nodeid) { + err = -ENOENT; + goto badentry; + } + if (d_mountpoint(entry)) { + err = -EBUSY; + goto badentry; + } + if (S_ISDIR(entry->d_inode->i_mode)) { + shrink_dcache_parent(entry); + if (!simple_empty(entry)) { + err = -ENOTEMPTY; + goto badentry; + } + entry->d_inode->i_flags |= S_DEAD; + } + dont_mount(entry); + clear_nlink(entry->d_inode); + err = 0; + badentry: + mutex_unlock(&entry->d_inode->i_mutex); + if (!err) + d_delete(entry); + } else { + err = 0; + } dput(entry); - err = 0; unlock: mutex_unlock(&parent->i_mutex); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 09337bcc255..a571584a091 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -755,9 +755,15 @@ int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid, /** * File-system tells the 
kernel to invalidate parent attributes and * the dentry matching parent/name. + * + * If the child_nodeid is non-zero and: + * - matches the inode number for the dentry matching parent/name, + * - is not a mount point + * - is a file or oan empty directory + * then the dentry is unhashed (d_delete()). */ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, - struct qstr *name); + u64 child_nodeid, struct qstr *name); int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file, bool isdir); diff --git a/include/linux/fuse.h b/include/linux/fuse.h index 446c89718b9..8ba2c9460b2 100644 --- a/include/linux/fuse.h +++ b/include/linux/fuse.h @@ -53,6 +53,7 @@ * * 7.18 * - add FUSE_IOCTL_DIR flag + * - add FUSE_NOTIFY_DELETE */ #ifndef _LINUX_FUSE_H @@ -288,6 +289,7 @@ enum fuse_notify_code { FUSE_NOTIFY_INVAL_ENTRY = 3, FUSE_NOTIFY_STORE = 4, FUSE_NOTIFY_RETRIEVE = 5, + FUSE_NOTIFY_DELETE = 6, FUSE_NOTIFY_CODE_MAX, }; @@ -611,6 +613,13 @@ struct fuse_notify_inval_entry_out { __u32 padding; }; +struct fuse_notify_delete_out { + __u64 parent; + __u64 child; + __u32 namelen; + __u32 padding; +}; + struct fuse_notify_store_out { __u64 nodeid; __u64 offset; -- cgit v1.2.3 From 1ff97647f066aef72ae68042c9abc4a837a12e6d Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 13 Dec 2011 11:06:49 -0800 Subject: char_dev.c: fix up some whitespace errors Remove some minor whitespace errors (2 trailing spaces, and one space needed for a comma) to make the file checkpatch.pl clean with the exception of the exports, which is fine for now. Signed-off-by: Greg Kroah-Hartman --- fs/char_dev.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/char_dev.c b/fs/char_dev.c index dca9e5e0f73..3f152b92a94 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -272,7 +272,7 @@ int __register_chrdev(unsigned int major, unsigned int baseminor, cd = __register_chrdev_region(major, baseminor, count, name); if (IS_ERR(cd)) return PTR_ERR(cd); - + cdev = cdev_alloc(); if (!cdev) goto out2; @@ -280,7 +280,7 @@ int __register_chrdev(unsigned int major, unsigned int baseminor, cdev->owner = fops->owner; cdev->ops = fops; kobject_set_name(&cdev->kobj, "%s", name); - + err = cdev_add(cdev, MKDEV(cd->major, baseminor), count); if (err) goto out; @@ -405,7 +405,7 @@ static int chrdev_open(struct inode *inode, struct file *filp) goto out_cdev_put; if (filp->f_op->open) { - ret = filp->f_op->open(inode,filp); + ret = filp->f_op->open(inode, filp); if (ret) goto out_cdev_put; } -- cgit v1.2.3 From 39c4cc0fcc38cff29d5b9884372b17c894c9c080 Mon Sep 17 00:00:00 2001 From: Bryan Schumaker Date: Tue, 13 Dec 2011 16:35:58 -0500 Subject: NFSD: Only reinitialize the recall_lru list under the recall lock unhash_delegation() will grab the recall lock before calling list_del_init() in each of these places. This patch removes the redundant calls. Signed-off-by: Bryan Schumaker Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4state.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 213da7b7e7d..19ca9b54200 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1066,7 +1066,6 @@ expire_client(struct nfs4_client *clp) spin_unlock(&recall_lock); while (!list_empty(&reaplist)) { dp = list_entry(reaplist.next, struct nfs4_delegation, dl_recall_lru); - list_del_init(&dp->dl_recall_lru); unhash_delegation(dp); } while (!list_empty(&clp->cl_openowners)) { @@ -3133,7 +3132,6 @@ nfs4_laundromat(void) spin_unlock(&recall_lock); list_for_each_safe(pos, next, &reaplist) { dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); - list_del_init(&dp->dl_recall_lru); unhash_delegation(dp); } test_val = nfsd4_lease; @@ -4674,7 +4672,6 @@ __nfs4_state_shutdown(void) spin_unlock(&recall_lock); list_for_each_safe(pos, next, &reaplist) { dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); - list_del_init(&dp->dl_recall_lru); unhash_delegation(dp); } -- cgit v1.2.3 From 6e736be7f282fff705db7c34a15313281b372a76 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 Dec 2011 00:33:38 +0100 Subject: block: make ioc get/put interface more conventional and fix race on allocation Ignoring copy_io() during fork, io_context can be allocated from two places - current_io_context() and set_task_ioprio(). The former is always called from the local task while the latter can be called from a different task. The synchronization between them is peculiar and dubious. * current_io_context() doesn't grab task_lock() and assumes that if it saw %NULL ->io_context, it would stay that way until allocation and assignment is complete. It has smp_wmb() between alloc/init and assignment. * set_task_ioprio() grabs task_lock() for assignment and does smp_read_barrier_depends() between "ioc = task->io_context" and "if (ioc)". Unfortunately, this doesn't achieve anything - the latter is not a dependent load of the former. i.e., if ioc itself were being dereferenced "ioc->xxx", it would mean something (not sure what, though) but as the code currently stands, the dependent read barrier is a noop. As only one of the two test-assignment sequences is task_lock() protected, the task_lock() can't do much about the race between the two. Nothing prevents current_io_context() and set_task_ioprio() from each allocating their own ioc for the same task and overwriting the other's. Also, set_task_ioprio() can race with an exiting task and create a new ioc after exit_io_context() is finished. ioc get/put doesn't have any reason to be complex. The only hot path is accessing the existing ioc of %current, which is simple to achieve given that ->io_context is never destroyed as long as the task is alive. All other paths can happily go through task_lock() like all other task sub-structures without impacting anything. This patch updates ioc get/put so that it becomes more conventional. * alloc_io_context() is replaced with get_task_io_context(). This is the only interface which can acquire access to the ioc of another task. On return, the caller has an explicit reference to the object which should be put using put_io_context() afterwards. * The functionality of current_io_context() remains the same but when creating a new ioc, it shares the code path with get_task_io_context() and always goes through task_lock(). * get_io_context() now means incrementing ref on an ioc which the caller already has access to (be that an explicit refcnt or implicit %current one). 
* PF_EXITING inhibits creation of new io_context and once exit_io_context() is finished, it's guaranteed that both ioc acquisition functions return %NULL. * All users are updated. Most are trivial but smp_read_barrier_depends() removal from cfq_get_io_context() needs a bit of explanation. I suppose the original intention was to ensure ioc->ioprio is visible when set_task_ioprio() allocates new io_context and installs it; however, this wouldn't have worked because set_task_ioprio() doesn't have wmb between init and install. There are other problems with this which will be fixed in another patch. * While at it, use NUMA_NO_NODE instead of -1 for wildcard node specification. -v2: Vivek spotted contamination from debug patch. Removed. Signed-off-by: Tejun Heo Cc: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 9 +++-- block/blk-ioc.c | 99 +++++++++++++++++++++++++++++++---------------- block/blk.h | 1 + block/cfq-iosched.c | 18 ++++----- fs/ioprio.c | 21 ++-------- include/linux/iocontext.h | 4 +- kernel/fork.c | 8 ++-- 7 files changed, 91 insertions(+), 69 deletions(-) (limited to 'fs') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 8f630cec906..4b001dcd85b 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1645,11 +1645,12 @@ static void blkiocg_attach_task(struct cgroup *cgrp, struct task_struct *tsk) { struct io_context *ioc; - task_lock(tsk); - ioc = tsk->io_context; - if (ioc) + /* we don't lose anything even if ioc allocation fails */ + ioc = get_task_io_context(tsk, GFP_ATOMIC, NUMA_NO_NODE); + if (ioc) { ioc->cgroup_changed = 1; - task_unlock(tsk); + put_io_context(ioc); + } } void blkio_policy_register(struct blkio_policy_type *blkiop) diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 8bebf06bac7..b13ed96776c 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -16,6 +16,19 @@ */ static struct kmem_cache *iocontext_cachep; +/** + * get_io_context - increment reference count to io_context + * @ioc: io_context to get + * + * Increment reference count to @ioc. 
+ */ +void get_io_context(struct io_context *ioc) +{ + BUG_ON(atomic_long_read(&ioc->refcount) <= 0); + atomic_long_inc(&ioc->refcount); +} +EXPORT_SYMBOL(get_io_context); + static void cfq_dtor(struct io_context *ioc) { if (!hlist_empty(&ioc->cic_list)) { @@ -71,6 +84,9 @@ void exit_io_context(struct task_struct *task) { struct io_context *ioc; + /* PF_EXITING prevents new io_context from being attached to @task */ + WARN_ON_ONCE(!(current->flags & PF_EXITING)); + task_lock(task); ioc = task->io_context; task->io_context = NULL; @@ -82,7 +98,9 @@ void exit_io_context(struct task_struct *task) put_io_context(ioc); } -struct io_context *alloc_io_context(gfp_t gfp_flags, int node) +static struct io_context *create_task_io_context(struct task_struct *task, + gfp_t gfp_flags, int node, + bool take_ref) { struct io_context *ioc; @@ -98,6 +116,20 @@ struct io_context *alloc_io_context(gfp_t gfp_flags, int node) INIT_RADIX_TREE(&ioc->radix_root, GFP_ATOMIC | __GFP_HIGH); INIT_HLIST_HEAD(&ioc->cic_list); + /* try to install, somebody might already have beaten us to it */ + task_lock(task); + + if (!task->io_context && !(task->flags & PF_EXITING)) { + task->io_context = ioc; + } else { + kmem_cache_free(iocontext_cachep, ioc); + ioc = task->io_context; + } + + if (ioc && take_ref) + get_io_context(ioc); + + task_unlock(task); return ioc; } @@ -114,46 +146,47 @@ struct io_context *alloc_io_context(gfp_t gfp_flags, int node) */ struct io_context *current_io_context(gfp_t gfp_flags, int node) { - struct task_struct *tsk = current; - struct io_context *ret; - - ret = tsk->io_context; - if (likely(ret)) - return ret; - - ret = alloc_io_context(gfp_flags, node); - if (ret) { - /* make sure set_task_ioprio() sees the settings above */ - smp_wmb(); - tsk->io_context = ret; - } + might_sleep_if(gfp_flags & __GFP_WAIT); - return ret; + if (current->io_context) + return current->io_context; + + return create_task_io_context(current, gfp_flags, node, false); } +EXPORT_SYMBOL(current_io_context); -/* - * If the current task has no IO context then create one and initialise it. - * If it does have a context, take a ref on it. +/** + * get_task_io_context - get io_context of a task + * @task: task of interest + * @gfp_flags: allocation flags, used if allocation is necessary + * @node: allocation node, used if allocation is necessary + * + * Return io_context of @task. If it doesn't exist, it is created with + * @gfp_flags and @node. The returned io_context has its reference count + * incremented. * - * This is always called in the context of the task which submitted the I/O. + * This function always goes through task_lock() and it's better to use + * current_io_context() + get_io_context() for %current. */ -struct io_context *get_io_context(gfp_t gfp_flags, int node) +struct io_context *get_task_io_context(struct task_struct *task, + gfp_t gfp_flags, int node) { - struct io_context *ioc = NULL; - - /* - * Check for unlikely race with exiting task. ioc ref count is - * zero when ioc is being detached. 
- */ - do { - ioc = current_io_context(gfp_flags, node); - if (unlikely(!ioc)) - break; - } while (!atomic_long_inc_not_zero(&ioc->refcount)); + struct io_context *ioc; - return ioc; + might_sleep_if(gfp_flags & __GFP_WAIT); + + task_lock(task); + ioc = task->io_context; + if (likely(ioc)) { + get_io_context(ioc); + task_unlock(task); + return ioc; + } + task_unlock(task); + + return create_task_io_context(task, gfp_flags, node, true); } -EXPORT_SYMBOL(get_io_context); +EXPORT_SYMBOL(get_task_io_context); static int __init blk_ioc_init(void) { diff --git a/block/blk.h b/block/blk.h index aae4d88fc52..fc3c41b2fd2 100644 --- a/block/blk.h +++ b/block/blk.h @@ -122,6 +122,7 @@ static inline int blk_should_fake_timeout(struct request_queue *q) } #endif +void get_io_context(struct io_context *ioc); struct io_context *current_io_context(gfp_t gfp_flags, int node); int ll_back_merge_fn(struct request_queue *q, struct request *req, diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index ec3f5e8ba56..d42d89ccce1 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -14,6 +14,7 @@ #include #include #include +#include "blk.h" #include "cfq.h" /* @@ -3194,13 +3195,13 @@ static struct cfq_io_context * cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) { struct io_context *ioc = NULL; - struct cfq_io_context *cic; + struct cfq_io_context *cic = NULL; might_sleep_if(gfp_mask & __GFP_WAIT); - ioc = get_io_context(gfp_mask, cfqd->queue->node); + ioc = current_io_context(gfp_mask, cfqd->queue->node); if (!ioc) - return NULL; + goto err; cic = cfq_cic_lookup(cfqd, ioc); if (cic) @@ -3211,10 +3212,10 @@ cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) goto err; if (cfq_cic_link(cfqd, ioc, cic, gfp_mask)) - goto err_free; - + goto err; out: - smp_read_barrier_depends(); + get_io_context(ioc); + if (unlikely(ioc->ioprio_changed)) cfq_ioc_set_ioprio(ioc); @@ -3223,10 +3224,9 @@ out: cfq_ioc_set_cgroup(ioc); #endif return cic; -err_free: - cfq_cic_free(cic); err: - put_io_context(ioc); + if (cic) + cfq_cic_free(cic); return NULL; } diff --git a/fs/ioprio.c b/fs/ioprio.c index f79dab83e17..998ec239d1e 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -48,28 +48,13 @@ int set_task_ioprio(struct task_struct *task, int ioprio) if (err) return err; - task_lock(task); - do { - ioc = task->io_context; - /* see wmb() in current_io_context() */ - smp_read_barrier_depends(); - if (ioc) - break; - - ioc = alloc_io_context(GFP_ATOMIC, -1); - if (!ioc) { - err = -ENOMEM; - break; - } - task->io_context = ioc; - } while (1); - - if (!err) { + ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE); + if (ioc) { ioc->ioprio = ioprio; ioc->ioprio_changed = 1; + put_io_context(ioc); } - task_unlock(task); return err; } EXPORT_SYMBOL_GPL(set_task_ioprio); diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index 8a6ecb66346..28bb621ef5a 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -78,8 +78,8 @@ struct task_struct; #ifdef CONFIG_BLOCK void put_io_context(struct io_context *ioc); void exit_io_context(struct task_struct *task); -struct io_context *get_io_context(gfp_t gfp_flags, int node); -struct io_context *alloc_io_context(gfp_t gfp_flags, int node); +struct io_context *get_task_io_context(struct task_struct *task, + gfp_t gfp_flags, int node); #else struct io_context; static inline void put_io_context(struct io_context *ioc) { } diff --git a/kernel/fork.c b/kernel/fork.c index da4a6a10d08..5bcfc739bb7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -870,6 
+870,7 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk) { #ifdef CONFIG_BLOCK struct io_context *ioc = current->io_context; + struct io_context *new_ioc; if (!ioc) return 0; @@ -881,11 +882,12 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk) if (unlikely(!tsk->io_context)) return -ENOMEM; } else if (ioprio_valid(ioc->ioprio)) { - tsk->io_context = alloc_io_context(GFP_KERNEL, -1); - if (unlikely(!tsk->io_context)) + new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE); + if (unlikely(!new_ioc)) return -ENOMEM; - tsk->io_context->ioprio = ioc->ioprio; + new_ioc->ioprio = ioc->ioprio; + put_io_context(new_ioc); } #endif return 0; -- cgit v1.2.3 From dc86900e0a8f665122de6faadd27fb4c6d2b3e4d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 Dec 2011 00:33:38 +0100 Subject: block, cfq: move ioc ioprio/cgroup changed handling to cic ioprio/cgroup change was handled by marking the changed state in ioc and, on the following access to the ioc, performing RCU-protected iteration through all cic's grabbing the matching queue_lock. This patch moves the changed state to each cic. When ioprio or cgroup changes, the respective bit is set on all cic's of the ioc and when each of those cic (not ioc) is accessed, change is applied for that specific ioc-queue pair. This also fixes the following two race conditions between setting and clearing of changed states. * Missing barrier between assign/load of ioprio and ioprio_changed allowed applying old ioprio. * Change requests could happen between application of change and clearing of changed variables. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 2 +- block/blk-ioc.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ block/cfq-iosched.c | 28 +++++++++------------------- fs/ioprio.c | 3 +-- include/linux/iocontext.h | 14 +++++++++----- 5 files changed, 65 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 4b001dcd85b..dc00835aab6 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1648,7 +1648,7 @@ static void blkiocg_attach_task(struct cgroup *cgrp, struct task_struct *tsk) /* we don't lose anything even if ioc allocation fails */ ioc = get_task_io_context(tsk, GFP_ATOMIC, NUMA_NO_NODE); if (ioc) { - ioc->cgroup_changed = 1; + ioc_cgroup_changed(ioc); put_io_context(ioc); } } diff --git a/block/blk-ioc.c b/block/blk-ioc.c index b13ed96776c..6f59fbad93d 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -188,6 +188,51 @@ struct io_context *get_task_io_context(struct task_struct *task, } EXPORT_SYMBOL(get_task_io_context); +void ioc_set_changed(struct io_context *ioc, int which) +{ + struct cfq_io_context *cic; + struct hlist_node *n; + + hlist_for_each_entry(cic, n, &ioc->cic_list, cic_list) + set_bit(which, &cic->changed); +} + +/** + * ioc_ioprio_changed - notify ioprio change + * @ioc: io_context of interest + * @ioprio: new ioprio + * + * @ioc's ioprio has changed to @ioprio. Set %CIC_IOPRIO_CHANGED for all + * cic's. iosched is responsible for checking the bit and applying it on + * request issue path. + */ +void ioc_ioprio_changed(struct io_context *ioc, int ioprio) +{ + unsigned long flags; + + spin_lock_irqsave(&ioc->lock, flags); + ioc->ioprio = ioprio; + ioc_set_changed(ioc, CIC_IOPRIO_CHANGED); + spin_unlock_irqrestore(&ioc->lock, flags); +} + +/** + * ioc_cgroup_changed - notify cgroup change + * @ioc: io_context of interest + * + * @ioc's cgroup has changed. 
Set %CIC_CGROUP_CHANGED for all cic's. + * iosched is responsible for checking the bit and applying it on request + * issue path. + */ +void ioc_cgroup_changed(struct io_context *ioc) +{ + unsigned long flags; + + spin_lock_irqsave(&ioc->lock, flags); + ioc_set_changed(ioc, CIC_CGROUP_CHANGED); + spin_unlock_irqrestore(&ioc->lock, flags); +} + static int __init blk_ioc_init(void) { iocontext_cachep = kmem_cache_create("blkdev_ioc", diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index a612ca65f37..51aece2eea7 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -2904,7 +2904,7 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc) cfq_clear_cfqq_prio_changed(cfqq); } -static void changed_ioprio(struct io_context *ioc, struct cfq_io_context *cic) +static void changed_ioprio(struct cfq_io_context *cic) { struct cfq_data *cfqd = cic_to_cfqd(cic); struct cfq_queue *cfqq; @@ -2933,12 +2933,6 @@ static void changed_ioprio(struct io_context *ioc, struct cfq_io_context *cic) spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); } -static void cfq_ioc_set_ioprio(struct io_context *ioc) -{ - call_for_each_cic(ioc, changed_ioprio); - ioc->ioprio_changed = 0; -} - static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, pid_t pid, bool is_sync) { @@ -2960,7 +2954,7 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, } #ifdef CONFIG_CFQ_GROUP_IOSCHED -static void changed_cgroup(struct io_context *ioc, struct cfq_io_context *cic) +static void changed_cgroup(struct cfq_io_context *cic) { struct cfq_queue *sync_cfqq = cic_to_cfqq(cic, 1); struct cfq_data *cfqd = cic_to_cfqd(cic); @@ -2986,12 +2980,6 @@ static void changed_cgroup(struct io_context *ioc, struct cfq_io_context *cic) spin_unlock_irqrestore(q->queue_lock, flags); } - -static void cfq_ioc_set_cgroup(struct io_context *ioc) -{ - call_for_each_cic(ioc, changed_cgroup); - ioc->cgroup_changed = 0; -} #endif /* CONFIG_CFQ_GROUP_IOSCHED */ static struct cfq_queue * @@ -3222,13 +3210,15 @@ cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) out: get_io_context(ioc); - if (unlikely(ioc->ioprio_changed)) - cfq_ioc_set_ioprio(ioc); - + if (unlikely(cic->changed)) { + if (test_and_clear_bit(CIC_IOPRIO_CHANGED, &cic->changed)) + changed_ioprio(cic); #ifdef CONFIG_CFQ_GROUP_IOSCHED - if (unlikely(ioc->cgroup_changed)) - cfq_ioc_set_cgroup(ioc); + if (test_and_clear_bit(CIC_CGROUP_CHANGED, &cic->changed)) + changed_cgroup(cic); #endif + } + return cic; err: if (cic) diff --git a/fs/ioprio.c b/fs/ioprio.c index 998ec239d1e..0f1b9515213 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -50,8 +50,7 @@ int set_task_ioprio(struct task_struct *task, int ioprio) ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE); if (ioc) { - ioc->ioprio = ioprio; - ioc->ioprio_changed = 1; + ioc_ioprio_changed(ioc, ioprio); put_io_context(ioc); } diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index 079aea8fd8a..2c2b6da96b3 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -13,6 +13,11 @@ struct cfq_ttime { unsigned long ttime_mean; }; +enum { + CIC_IOPRIO_CHANGED, + CIC_CGROUP_CHANGED, +}; + struct cfq_io_context { void *key; struct request_queue *q; @@ -26,6 +31,8 @@ struct cfq_io_context { struct list_head queue_list; struct hlist_node cic_list; + unsigned long changed; + void (*dtor)(struct io_context *); /* destructor */ void (*exit)(struct io_context *); /* called on task exit */ @@ -44,11 +51,6 @@ struct io_context { spinlock_t lock; 
unsigned short ioprio; - unsigned short ioprio_changed; - -#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) - unsigned short cgroup_changed; -#endif /* * For request batching @@ -81,6 +83,8 @@ void put_io_context(struct io_context *ioc); void exit_io_context(struct task_struct *task); struct io_context *get_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node); +void ioc_ioprio_changed(struct io_context *ioc, int ioprio); +void ioc_cgroup_changed(struct io_context *ioc); #else struct io_context; static inline void put_io_context(struct io_context *ioc) { } -- cgit v1.2.3 From b2efa05265d62bc29f3a64400fad4b44340eedb8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 Dec 2011 00:33:39 +0100 Subject: block, cfq: unlink cfq_io_context's immediately cic is association between io_context and request_queue. A cic is linked from both ioc and q and should be destroyed when either one goes away. As ioc and q both have their own locks, locking becomes a bit complex - both orders work for removal from one but not from the other. Currently, cfq tries to circumvent this locking order issue with RCU. ioc->lock nests inside queue_lock but the radix tree and cic's are also protected by RCU allowing either side to walk their lists without grabbing lock. This rather unconventional use of RCU quickly devolves into extremely fragile convolution. e.g. The following is from cfqd going away too soon after ioc and q exits raced. general protection fault: 0000 [#1] PREEMPT SMP CPU 2 Modules linked in: [ 88.503444] Pid: 599, comm: hexdump Not tainted 3.1.0-rc10-work+ #158 Bochs Bochs RIP: 0010:[] [] cfq_exit_single_io_context+0x58/0xf0 ... Call Trace: [] call_for_each_cic+0x5a/0x90 [] cfq_exit_io_context+0x15/0x20 [] exit_io_context+0x100/0x140 [] do_exit+0x579/0x850 [] do_group_exit+0x5b/0xd0 [] sys_exit_group+0x17/0x20 [] system_call_fastpath+0x16/0x1b The only real hot path here is cic lookup during request initialization and avoiding extra locking requires very confined use of RCU. This patch makes cic removal from both ioc and request_queue perform double-locking and unlink immediately. * From q side, the change is almost trivial as ioc->lock nests inside queue_lock. It just needs to grab each ioc->lock as it walks cic_list and unlink it. * From ioc side, it's a bit more difficult because of inversed lock order. ioc needs its lock to walk its cic_list but can't grab the matching queue_lock and needs to perform unlock-relock dancing. Unlinking is now wholly done from put_io_context() and fast path is optimized by using the queue_lock the caller already holds, which is by far the most common case. If the ioc accessed multiple devices, it tries with trylock. In unlikely cases of fast path failure, it falls back to full double-locking dance from workqueue. Double-locking isn't the prettiest thing in the world but it's *far* simpler and more understandable than RCU trick without adding any meaningful overhead. This still leaves a lot of now unnecessary RCU logics. Future patches will trim them. -v2: Vivek pointed out that cic->q was being dereferenced after cic->release() was called. Updated to use local variable @this_q instead. 
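To make the locking scheme above concrete, here is a minimal sketch of the "fast path with trylock, slow path via workqueue" release pattern the patch describes. The names (my_obj, my_obj_put, do_release) are illustrative only and are not part of the patch; the real implementation is the put_io_context()/ioc_release_fn() pair in the diff below, which additionally handles irq-safe locking, lockdep subclasses and walking cic's on multiple queues.

#include <linux/types.h>
#include <linux/atomic.h>
#include <linux/spinlock.h>
#include <linux/workqueue.h>

struct my_obj {
	atomic_t		refcount;
	spinlock_t		inner_lock;	/* plays the role of ioc->lock */
	spinlock_t		*outer_lock;	/* plays the role of queue_lock */
	struct work_struct	release_work;	/* slow-path release */
};

static void do_release(struct my_obj *obj)
{
	/* unlink obj from the structure protected by the outer lock */
}

static void my_obj_put(struct my_obj *obj, bool outer_already_held)
{
	if (!atomic_dec_and_test(&obj->refcount))
		return;

	/* the inner lock nests inside the outer one, so it is always safe */
	spin_lock(&obj->inner_lock);

	if (outer_already_held || spin_trylock(obj->outer_lock)) {
		/* fast path: both locks held, release in place */
		do_release(obj);
		if (!outer_already_held)
			spin_unlock(obj->outer_lock);
		spin_unlock(&obj->inner_lock);
		return;
	}

	/*
	 * Inverted order and trylock failed: punt to process context,
	 * where the work item can take the outer lock first and then
	 * the inner lock in the correct order.
	 */
	spin_unlock(&obj->inner_lock);
	schedule_work(&obj->release_work);
}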
Signed-off-by: Tejun Heo Cc: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 2 +- block/blk-ioc.c | 166 ++++++++++++++++++++++++++++++++++++++-------- block/cfq-iosched.c | 44 +++--------- fs/ioprio.c | 2 +- include/linux/blkdev.h | 3 + include/linux/iocontext.h | 12 ++-- kernel/fork.c | 2 +- 7 files changed, 159 insertions(+), 72 deletions(-) (limited to 'fs') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index dc00835aab6..27886935804 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1649,7 +1649,7 @@ static void blkiocg_attach_task(struct cgroup *cgrp, struct task_struct *tsk) ioc = get_task_io_context(tsk, GFP_ATOMIC, NUMA_NO_NODE); if (ioc) { ioc_cgroup_changed(ioc); - put_io_context(ioc); + put_io_context(ioc, NULL); } } diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 6f59fbad93d..fb23965595d 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -29,55 +29,164 @@ void get_io_context(struct io_context *ioc) } EXPORT_SYMBOL(get_io_context); -static void cfq_dtor(struct io_context *ioc) +/* + * Releasing ioc may nest into another put_io_context() leading to nested + * fast path release. As the ioc's can't be the same, this is okay but + * makes lockdep whine. Keep track of nesting and use it as subclass. + */ +#ifdef CONFIG_LOCKDEP +#define ioc_release_depth(q) ((q) ? (q)->ioc_release_depth : 0) +#define ioc_release_depth_inc(q) (q)->ioc_release_depth++ +#define ioc_release_depth_dec(q) (q)->ioc_release_depth-- +#else +#define ioc_release_depth(q) 0 +#define ioc_release_depth_inc(q) do { } while (0) +#define ioc_release_depth_dec(q) do { } while (0) +#endif + +/* + * Slow path for ioc release in put_io_context(). Performs double-lock + * dancing to unlink all cic's and then frees ioc. + */ +static void ioc_release_fn(struct work_struct *work) { - if (!hlist_empty(&ioc->cic_list)) { - struct cfq_io_context *cic; + struct io_context *ioc = container_of(work, struct io_context, + release_work); + struct request_queue *last_q = NULL; + + spin_lock_irq(&ioc->lock); + + while (!hlist_empty(&ioc->cic_list)) { + struct cfq_io_context *cic = hlist_entry(ioc->cic_list.first, + struct cfq_io_context, + cic_list); + struct request_queue *this_q = cic->q; + + if (this_q != last_q) { + /* + * Need to switch to @this_q. Once we release + * @ioc->lock, it can go away along with @cic. + * Hold on to it. + */ + __blk_get_queue(this_q); + + /* + * blk_put_queue() might sleep thanks to kobject + * idiocy. Always release both locks, put and + * restart. + */ + if (last_q) { + spin_unlock(last_q->queue_lock); + spin_unlock_irq(&ioc->lock); + blk_put_queue(last_q); + } else { + spin_unlock_irq(&ioc->lock); + } + + last_q = this_q; + spin_lock_irq(this_q->queue_lock); + spin_lock(&ioc->lock); + continue; + } + ioc_release_depth_inc(this_q); + cic->exit(cic); + cic->release(cic); + ioc_release_depth_dec(this_q); + } - cic = hlist_entry(ioc->cic_list.first, struct cfq_io_context, - cic_list); - cic->dtor(ioc); + if (last_q) { + spin_unlock(last_q->queue_lock); + spin_unlock_irq(&ioc->lock); + blk_put_queue(last_q); + } else { + spin_unlock_irq(&ioc->lock); } + + kmem_cache_free(iocontext_cachep, ioc); } /** * put_io_context - put a reference of io_context * @ioc: io_context to put + * @locked_q: request_queue the caller is holding queue_lock of (hint) * * Decrement reference count of @ioc and release it if the count reaches - * zero. + * zero. If the caller is holding queue_lock of a queue, it can indicate + * that with @locked_q. 
This is an optimization hint and the caller is + * allowed to pass in %NULL even when it's holding a queue_lock. */ -void put_io_context(struct io_context *ioc) +void put_io_context(struct io_context *ioc, struct request_queue *locked_q) { + struct request_queue *last_q = locked_q; + unsigned long flags; + if (ioc == NULL) return; BUG_ON(atomic_long_read(&ioc->refcount) <= 0); + if (locked_q) + lockdep_assert_held(locked_q->queue_lock); if (!atomic_long_dec_and_test(&ioc->refcount)) return; - rcu_read_lock(); - cfq_dtor(ioc); - rcu_read_unlock(); - - kmem_cache_free(iocontext_cachep, ioc); -} -EXPORT_SYMBOL(put_io_context); + /* + * Destroy @ioc. This is a bit messy because cic's are chained + * from both ioc and queue, and ioc->lock nests inside queue_lock. + * The inner ioc->lock should be held to walk our cic_list and then + * for each cic the outer matching queue_lock should be grabbed. + * ie. We need to do reverse-order double lock dancing. + * + * Another twist is that we are often called with one of the + * matching queue_locks held as indicated by @locked_q, which + * prevents performing double-lock dance for other queues. + * + * So, we do it in two stages. The fast path uses the queue_lock + * the caller is holding and, if other queues need to be accessed, + * uses trylock to avoid introducing locking dependency. This can + * handle most cases, especially if @ioc was performing IO on only + * single device. + * + * If trylock doesn't cut it, we defer to @ioc->release_work which + * can do all the double-locking dancing. + */ + spin_lock_irqsave_nested(&ioc->lock, flags, + ioc_release_depth(locked_q)); + + while (!hlist_empty(&ioc->cic_list)) { + struct cfq_io_context *cic = hlist_entry(ioc->cic_list.first, + struct cfq_io_context, + cic_list); + struct request_queue *this_q = cic->q; + + if (this_q != last_q) { + if (last_q && last_q != locked_q) + spin_unlock(last_q->queue_lock); + last_q = NULL; + + if (!spin_trylock(this_q->queue_lock)) + break; + last_q = this_q; + continue; + } + ioc_release_depth_inc(this_q); + cic->exit(cic); + cic->release(cic); + ioc_release_depth_dec(this_q); + } -static void cfq_exit(struct io_context *ioc) -{ - rcu_read_lock(); + if (last_q && last_q != locked_q) + spin_unlock(last_q->queue_lock); - if (!hlist_empty(&ioc->cic_list)) { - struct cfq_io_context *cic; + spin_unlock_irqrestore(&ioc->lock, flags); - cic = hlist_entry(ioc->cic_list.first, struct cfq_io_context, - cic_list); - cic->exit(ioc); - } - rcu_read_unlock(); + /* if no cic's left, we're done; otherwise, kick release_work */ + if (hlist_empty(&ioc->cic_list)) + kmem_cache_free(iocontext_cachep, ioc); + else + schedule_work(&ioc->release_work); } +EXPORT_SYMBOL(put_io_context); /* Called by the exiting task */ void exit_io_context(struct task_struct *task) @@ -92,10 +201,8 @@ void exit_io_context(struct task_struct *task) task->io_context = NULL; task_unlock(task); - if (atomic_dec_and_test(&ioc->nr_tasks)) - cfq_exit(ioc); - - put_io_context(ioc); + atomic_dec(&ioc->nr_tasks); + put_io_context(ioc, NULL); } static struct io_context *create_task_io_context(struct task_struct *task, @@ -115,6 +222,7 @@ static struct io_context *create_task_io_context(struct task_struct *task, spin_lock_init(&ioc->lock); INIT_RADIX_TREE(&ioc->radix_root, GFP_ATOMIC | __GFP_HIGH); INIT_HLIST_HEAD(&ioc->cic_list); + INIT_WORK(&ioc->release_work, ioc_release_fn); /* try to install, somebody might already have beaten us to it */ task_lock(task); diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 
e617b088c59..6cc60656040 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1778,7 +1778,7 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, cfqd->active_queue = NULL; if (cfqd->active_cic) { - put_io_context(cfqd->active_cic->ioc); + put_io_context(cfqd->active_cic->ioc, cfqd->queue); cfqd->active_cic = NULL; } } @@ -2812,38 +2812,6 @@ static void cfq_exit_cic(struct cfq_io_context *cic) } } -static void cfq_exit_single_io_context(struct io_context *ioc, - struct cfq_io_context *cic) -{ - struct cfq_data *cfqd = cic_to_cfqd(cic); - - if (cfqd) { - struct request_queue *q = cfqd->queue; - unsigned long flags; - - spin_lock_irqsave(q->queue_lock, flags); - - /* - * Ensure we get a fresh copy of the ->key to prevent - * race between exiting task and queue - */ - smp_read_barrier_depends(); - if (cic->key == cfqd) - cfq_exit_cic(cic); - - spin_unlock_irqrestore(q->queue_lock, flags); - } -} - -/* - * The process that ioc belongs to has exited, we need to clean up - * and put the internal structures we have that belongs to that process. - */ -static void cfq_exit_io_context(struct io_context *ioc) -{ - call_for_each_cic(ioc, cfq_exit_single_io_context); -} - static struct cfq_io_context * cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) { @@ -2855,8 +2823,8 @@ cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) cic->ttime.last_end_request = jiffies; INIT_LIST_HEAD(&cic->queue_list); INIT_HLIST_NODE(&cic->cic_list); - cic->dtor = cfq_free_io_context; - cic->exit = cfq_exit_io_context; + cic->exit = cfq_exit_cic; + cic->release = cfq_release_cic; elv_ioc_count_inc(cfq_ioc_count); } @@ -3726,7 +3694,7 @@ static void cfq_put_request(struct request *rq) BUG_ON(!cfqq->allocated[rw]); cfqq->allocated[rw]--; - put_io_context(RQ_CIC(rq)->ioc); + put_io_context(RQ_CIC(rq)->ioc, cfqq->cfqd->queue); rq->elevator_private[0] = NULL; rq->elevator_private[1] = NULL; @@ -3937,8 +3905,12 @@ static void cfq_exit_queue(struct elevator_queue *e) struct cfq_io_context *cic = list_entry(cfqd->cic_list.next, struct cfq_io_context, queue_list); + struct io_context *ioc = cic->ioc; + spin_lock(&ioc->lock); cfq_exit_cic(cic); + cfq_release_cic(cic); + spin_unlock(&ioc->lock); } cfq_put_async_queues(cfqd); diff --git a/fs/ioprio.c b/fs/ioprio.c index 0f1b9515213..f84b380d65e 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -51,7 +51,7 @@ int set_task_ioprio(struct task_struct *task, int ioprio) ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE); if (ioc) { ioc_ioprio_changed(ioc, ioprio); - put_io_context(ioc); + put_io_context(ioc, NULL); } return err; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d1b6f4ed1f9..65c2f8c7008 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -393,6 +393,9 @@ struct request_queue { /* Throttle data */ struct throtl_data *td; #endif +#ifdef CONFIG_LOCKDEP + int ioc_release_depth; +#endif }; #define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */ diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index 2c2b6da96b3..01e86312878 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -3,6 +3,7 @@ #include #include +#include struct cfq_queue; struct cfq_ttime { @@ -33,8 +34,8 @@ struct cfq_io_context { unsigned long changed; - void (*dtor)(struct io_context *); /* destructor */ - void (*exit)(struct io_context *); /* called on task exit */ + void (*exit)(struct cfq_io_context *); + void (*release)(struct cfq_io_context *); struct rcu_head rcu_head; }; @@ -61,6 +62,8 
@@ struct io_context { struct radix_tree_root radix_root; struct hlist_head cic_list; void __rcu *ioc_data; + + struct work_struct release_work; }; static inline struct io_context *ioc_task_link(struct io_context *ioc) @@ -79,7 +82,7 @@ static inline struct io_context *ioc_task_link(struct io_context *ioc) struct task_struct; #ifdef CONFIG_BLOCK -void put_io_context(struct io_context *ioc); +void put_io_context(struct io_context *ioc, struct request_queue *locked_q); void exit_io_context(struct task_struct *task); struct io_context *get_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node); @@ -87,7 +90,8 @@ void ioc_ioprio_changed(struct io_context *ioc, int ioprio); void ioc_cgroup_changed(struct io_context *ioc); #else struct io_context; -static inline void put_io_context(struct io_context *ioc) { } +static inline void put_io_context(struct io_context *ioc, + struct request_queue *locked_q) { } static inline void exit_io_context(struct task_struct *task) { } #endif diff --git a/kernel/fork.c b/kernel/fork.c index 5bcfc739bb7..2753449f203 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -887,7 +887,7 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk) return -ENOMEM; new_ioc->ioprio = ioc->ioprio; - put_io_context(new_ioc); + put_io_context(new_ioc, NULL); } #endif return 0; -- cgit v1.2.3 From 2d3475c0ad625f7a43844a6cd0130d136416e604 Mon Sep 17 00:00:00 2001 From: Bryan Schumaker Date: Wed, 14 Dec 2011 14:39:56 -0500 Subject: NFSD: forget_delegations should use list_for_each_entry_safe Otherwise the for loop could try to use a file recently removed from the file_hashtbl list and oops. Signed-off-by: Bryan Schumaker Tested-by: Casey Bodley Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 19ca9b54200..7355fe405d6 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4505,18 +4505,19 @@ void nfsd_forget_openowners(u64 num) int nfsd_process_n_delegations(u64 num, void (*deleg_func)(struct nfs4_delegation *)) { int i, count = 0; - struct nfs4_file *fp; - struct nfs4_delegation *dp, *next; + struct nfs4_file *fp, *fnext; + struct nfs4_delegation *dp, *dnext; for (i = 0; i < FILE_HASH_SIZE; i++) { - list_for_each_entry(fp, &file_hashtbl[i], fi_hash) { - list_for_each_entry_safe(dp, next, &fp->fi_delegations, dl_perfile) { + list_for_each_entry_safe(fp, fnext, &file_hashtbl[i], fi_hash) { + list_for_each_entry_safe(dp, dnext, &fp->fi_delegations, dl_perfile) { deleg_func(dp); if (++count == num) return count; } } } + return count; } -- cgit v1.2.3 From 1bc36b6426ae49139e9f56491db76b95921454d7 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 19 Oct 2011 11:44:41 +0200 Subject: writeback: Include all dirty inodes in background writeback Current livelock avoidance code makes background work to include only inodes that were dirtied before background writeback has started. However background writeback can be running for a long time and thus excluding newly dirtied inodes can eventually exclude significant portion of dirty inodes making background writeback inefficient. Since background writeback avoids livelocking the flusher thread by yielding to any other work, there is no real reason why background work should not include all dirty inodes so change the logic in wb_writeback(). 
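The net effect on wb_writeback()'s expiry cutoff can be summarized as follows; this is a condensed paraphrase of the hunk in the diff below, not additional code in the patch:

	if (work->for_kupdate) {
		/* kupdate: only write inodes dirtied before the expire interval */
		oldest_jif = jiffies -
			msecs_to_jiffies(dirty_expire_interval * 10);
	} else if (work->for_background)
		/*
		 * background: include every inode dirtied up to now; livelock
		 * is avoided because background work yields to any other
		 * queued work.
		 */
		oldest_jif = jiffies;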
Signed-off-by: Jan Kara Signed-off-by: Wu Fengguang --- fs/fs-writeback.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 517f211a3bd..92d353e069d 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -743,11 +743,17 @@ static long wb_writeback(struct bdi_writeback *wb, if (work->for_background && !over_bground_thresh(wb->bdi)) break; + /* + * Kupdate and background works are special and we want to + * include all inodes that need writing. Livelock avoidance is + * handled by these works yielding to any other work so we are + * safe. + */ if (work->for_kupdate) { oldest_jif = jiffies - msecs_to_jiffies(dirty_expire_interval * 10); - work->older_than_this = &oldest_jif; - } + } else if (work->for_background) + oldest_jif = jiffies; trace_writeback_start(wb->bdi, work); if (list_empty(&wb->b_io)) -- cgit v1.2.3 From 32c7f202a4801252a0f3578807b75a961f792870 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Mon, 8 Aug 2011 15:19:47 -0600 Subject: btrfs: fix dirtied pages accounting on sub-page writes When doing 1KB sequential writes to the same page, balance_dirty_pages_ratelimited_nr() should be called once instead of 4 times, the latter makes the dirtier tasks be throttled much too heavy. Fix it with proper de-accounting on clear_page_dirty_for_io(). CC: Chris Mason Signed-off-by: Wu Fengguang --- fs/btrfs/file.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 97fbe939c05..bfb620ead29 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1136,7 +1136,8 @@ again: GFP_NOFS); } for (i = 0; i < num_pages; i++) { - clear_page_dirty_for_io(pages[i]); + if (clear_page_dirty_for_io(pages[i])) + account_page_redirty(pages[i]); set_page_extent_mapped(pages[i]); WARN_ON(!PageLocked(pages[i])); } -- cgit v1.2.3 From 60e07cf515e541ea3e13b888d273c9b19a2ad9dd Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Sun, 18 Dec 2011 15:49:54 -0500 Subject: ext4: do not reference pa_inode from group_pa pa_inode in group_pa is set NULL in ext4_mb_new_group_pa, so pa_inode should be not referenced. 
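The practical consequence is easiest to see in the tracepoint assignment (paraphrasing the hunks in the diff below): for a group preallocation pa->pa_inode is NULL, so deriving the device from it would dereference a NULL pointer; the fix is to hand the superblock to the tracepoint directly:

	/* before: unsafe for group PAs, where pa->pa_inode == NULL */
	__entry->dev = pa->pa_inode->i_sb->s_dev;

	/* after: the caller passes sb explicitly */
	trace_ext4_mb_release_group_pa(sb, pa);
	__entry->dev = sb->s_dev;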
Reported-by: Wu Fengguang Signed-off-by: Yongqiang Yang Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 2 +- include/trace/events/ext4.h | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index e2d8be8f28b..cb990b21c69 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3671,7 +3671,7 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b, ext4_group_t group; ext4_grpblk_t bit; - trace_ext4_mb_release_group_pa(pa); + trace_ext4_mb_release_group_pa(sb, pa); BUG_ON(pa->pa_deleted == 0); ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); BUG_ON(group != e4b->bd_group && pa->pa_len != 0); diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 748ff7cbe55..319538bf17d 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -573,9 +573,9 @@ TRACE_EVENT(ext4_mb_release_inode_pa, ); TRACE_EVENT(ext4_mb_release_group_pa, - TP_PROTO(struct ext4_prealloc_space *pa), + TP_PROTO(struct super_block *sb, struct ext4_prealloc_space *pa), - TP_ARGS(pa), + TP_ARGS(sb, pa), TP_STRUCT__entry( __field( dev_t, dev ) @@ -585,7 +585,7 @@ TRACE_EVENT(ext4_mb_release_group_pa, ), TP_fast_assign( - __entry->dev = pa->pa_inode->i_sb->s_dev; + __entry->dev = sb->s_dev; __entry->pa_pstart = pa->pa_pstart; __entry->pa_len = pa->pa_len; ), -- cgit v1.2.3 From 5635a62b83c04d05e4eb4575a1c3de51a35bacdc Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Sun, 18 Dec 2011 16:13:58 -0500 Subject: ext4: add missing space to ext4_msg output in ext4_fill_super() Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 3e1329e2f82..35377d57ec4 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3508,7 +3508,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) * of the filesystem. */ if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) { - ext4_msg(sb, KERN_WARNING, "bad geometry: first data" + ext4_msg(sb, KERN_WARNING, "bad geometry: first data " "block %u is beyond end of filesystem (%llu)", le32_to_cpu(es->s_first_data_block), ext4_blocks_count(es)); -- cgit v1.2.3 From acd6ad83517639e8f09a8c5525b1dccd81cd2a10 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sun, 18 Dec 2011 17:37:02 -0500 Subject: ext4: fix error handling on inode bitmap corruption When insert_inode_locked() fails in ext4_new_inode() it most likely means inode bitmap got corrupted and we allocated again inode which is already in use. Also doing unlock_new_inode() during error recovery is wrong since the inode does not have I_NEW set. Fix the problem by jumping to fail: (instead of fail_drop:) which declares filesystem error and does not call unlock_new_inode(). Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" --- fs/ext4/ialloc.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 00beb4f9cc4..8fb6844f973 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -885,8 +885,12 @@ got: if (IS_DIRSYNC(inode)) ext4_handle_sync(handle); if (insert_inode_locked(inode) < 0) { - err = -EINVAL; - goto fail_drop; + /* + * Likely a bitmap corruption causing inode to be allocated + * twice. 
+ */ + err = -EIO; + goto fail; } spin_lock(&sbi->s_next_gen_lock); inode->i_generation = sbi->s_next_generation++; -- cgit v1.2.3 From 14d7f3efe923bc60839c65f9818793c64b4d708b Mon Sep 17 00:00:00 2001 From: Curt Wohlgemuth Date: Sun, 18 Dec 2011 17:39:02 -0500 Subject: ext4: remove unused local variable In get_implied_cluster_alloc(), rr_cluster_end was being defined and set, but was never used. Removed this. Signed-off-by: Curt Wohlgemuth Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 607b1557d29..4423b11476a 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3625,7 +3625,7 @@ static int get_implied_cluster_alloc(struct super_block *sb, struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_lblk_t c_offset = map->m_lblk & (sbi->s_cluster_ratio-1); ext4_lblk_t ex_cluster_start, ex_cluster_end; - ext4_lblk_t rr_cluster_start, rr_cluster_end; + ext4_lblk_t rr_cluster_start; ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block); ext4_fsblk_t ee_start = ext4_ext_pblock(ex); unsigned short ee_len = ext4_ext_get_actual_len(ex); @@ -3636,7 +3636,6 @@ static int get_implied_cluster_alloc(struct super_block *sb, /* The requested region passed into ext4_map_blocks() */ rr_cluster_start = EXT4_B2C(sbi, map->m_lblk); - rr_cluster_end = EXT4_B2C(sbi, map->m_lblk + map->m_len - 1); if ((rr_cluster_start == ex_cluster_end) || (rr_cluster_start == ex_cluster_start)) { -- cgit v1.2.3 From 8c48f7e88e293b9dd422bd8884842aea85d30b22 Mon Sep 17 00:00:00 2001 From: Robin Dong Date: Sun, 18 Dec 2011 23:05:43 -0500 Subject: ext4: optimize ext4_find_delalloc_range() in nodelalloc mode We found performance regression when using bigalloc with "nodelalloc" (1MB cluster size): 1. mke2fs -C 1048576 -O ^has_journal,bigalloc /dev/sda 2. mount -o nodelalloc /dev/sda /test/ 3. time dd if=/dev/zero of=/test/io bs=1048576 count=1024 The "dd" will cost about 2 seconds to finish, but if we mke2fs without "bigalloc", "dd" will only cost less than 1 second. The reason is: when using ext4 with "nodelalloc", it will call ext4_find_delalloc_cluster() nearly everytime it call ext4_ext_map_blocks(), and ext4_find_delalloc_range() will also scan all pages in cluster because no buffer is "delayed". A cluster has 256 pages (1MB cluster), so it will scan 256 * 256k pags when creating a 1G file. That severely hurts the performance. Therefore, we return immediately from ext4_find_delalloc_range() in nodelalloc mode, since by definition there can't be any delalloc pages. Signed-off-by: Robin Dong Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 4423b11476a..5684f251092 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3281,6 +3281,9 @@ static int ext4_find_delalloc_range(struct inode *inode, ext4_lblk_t i, pg_lblk; pgoff_t index; + if (!test_opt(inode->i_sb, DELALLOC)) + return 0; + /* reverse search wont work if fs block size is less than page size */ if (inode->i_blkbits < PAGE_CACHE_SHIFT) search_hint_reverse = 0; -- cgit v1.2.3 From 5db0276014b80484689eb6c1bf7b94af1c7d5b1a Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Tue, 1 Nov 2011 17:04:16 +0100 Subject: Btrfs: add optional integrity check code The two files added in this patch contain all the code that is required to implement the integrity checks. 
Signed-off-by: Stefan Behrens --- fs/btrfs/check-integrity.c | 3068 ++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/check-integrity.h | 36 + 2 files changed, 3104 insertions(+) create mode 100644 fs/btrfs/check-integrity.c create mode 100644 fs/btrfs/check-integrity.h (limited to 'fs') diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c new file mode 100644 index 00000000000..ad0b3ba735b --- /dev/null +++ b/fs/btrfs/check-integrity.c @@ -0,0 +1,3068 @@ +/* + * Copyright (C) STRATO AG 2011. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +/* + * This module can be used to catch cases when the btrfs kernel + * code executes write requests to the disk that bring the file + * system in an inconsistent state. In such a state, a power-loss + * or kernel panic event would cause that the data on disk is + * lost or at least damaged. + * + * Code is added that examines all block write requests during + * runtime (including writes of the super block). Three rules + * are verified and an error is printed on violation of the + * rules: + * 1. It is not allowed to write a disk block which is + * currently referenced by the super block (either directly + * or indirectly). + * 2. When a super block is written, it is verified that all + * referenced (directly or indirectly) blocks fulfill the + * following requirements: + * 2a. All referenced blocks have either been present when + * the file system was mounted, (i.e., they have been + * referenced by the super block) or they have been + * written since then and the write completion callback + * was called and a FLUSH request to the device where + * these blocks are located was received and completed. + * 2b. All referenced blocks need to have a generation + * number which is equal to the parent's number. + * + * One issue that was found using this module was that the log + * tree on disk became temporarily corrupted because disk blocks + * that had been in use for the log tree had been freed and + * reused too early, while being referenced by the written super + * block. + * + * The search term in the kernel log that can be used to filter + * on the existence of detected integrity issues is + * "btrfs: attempt". + * + * The integrity check is enabled via mount options. These + * mount options are only supported if the integrity check + * tool is compiled by defining BTRFS_FS_CHECK_INTEGRITY. 
+ * + * Example #1, apply integrity checks to all metadata: + * mount /dev/sdb1 /mnt -o check_int + * + * Example #2, apply integrity checks to all metadata and + * to data extents: + * mount /dev/sdb1 /mnt -o check_int_data + * + * Example #3, apply integrity checks to all metadata and dump + * the tree that the super block references to kernel messages + * each time after a super block was written: + * mount /dev/sdb1 /mnt -o check_int,check_int_print_mask=263 + * + * If the integrity check tool is included and activated in + * the mount options, plenty of kernel memory is used, and + * plenty of additional CPU cycles are spent. Enabling this + * functionality is not intended for normal use. In most + * cases, unless you are a btrfs developer who needs to verify + * the integrity of (super)-block write requests, do not + * enable the config option BTRFS_FS_CHECK_INTEGRITY to + * include and compile the integrity check tool. + */ + +#include +#include +#include +#include +#include +#include +#include +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "extent_io.h" +#include "disk-io.h" +#include "volumes.h" +#include "print-tree.h" +#include "locking.h" +#include "check-integrity.h" + +#define BTRFSIC_BLOCK_HASHTABLE_SIZE 0x10000 +#define BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE 0x10000 +#define BTRFSIC_DEV2STATE_HASHTABLE_SIZE 0x100 +#define BTRFSIC_BLOCK_MAGIC_NUMBER 0x14491051 +#define BTRFSIC_BLOCK_LINK_MAGIC_NUMBER 0x11070807 +#define BTRFSIC_DEV2STATE_MAGIC_NUMBER 0x20111530 +#define BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER 20111300 +#define BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL (200 - 6) /* in characters, + * excluding " [...]" */ +#define BTRFSIC_BLOCK_SIZE PAGE_SIZE + +#define BTRFSIC_GENERATION_UNKNOWN ((u64)-1) + +/* + * The definition of the bitmask fields for the print_mask. + * They are specified with the mount option check_integrity_print_mask. 
+ */ +#define BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE 0x00000001 +#define BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION 0x00000002 +#define BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE 0x00000004 +#define BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE 0x00000008 +#define BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH 0x00000010 +#define BTRFSIC_PRINT_MASK_END_IO_BIO_BH 0x00000020 +#define BTRFSIC_PRINT_MASK_VERBOSE 0x00000040 +#define BTRFSIC_PRINT_MASK_VERY_VERBOSE 0x00000080 +#define BTRFSIC_PRINT_MASK_INITIAL_TREE 0x00000100 +#define BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES 0x00000200 +#define BTRFSIC_PRINT_MASK_INITIAL_DATABASE 0x00000400 +#define BTRFSIC_PRINT_MASK_NUM_COPIES 0x00000800 +#define BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS 0x00001000 + +struct btrfsic_dev_state; +struct btrfsic_state; + +struct btrfsic_block { + u32 magic_num; /* only used for debug purposes */ + unsigned int is_metadata:1; /* if it is meta-data, not data-data */ + unsigned int is_superblock:1; /* if it is one of the superblocks */ + unsigned int is_iodone:1; /* if is done by lower subsystem */ + unsigned int iodone_w_error:1; /* error was indicated to endio */ + unsigned int never_written:1; /* block was added because it was + * referenced, not because it was + * written */ + unsigned int mirror_num:2; /* large enough to hold + * BTRFS_SUPER_MIRROR_MAX */ + struct btrfsic_dev_state *dev_state; + u64 dev_bytenr; /* key, physical byte num on disk */ + u64 logical_bytenr; /* logical byte num on disk */ + u64 generation; + struct btrfs_disk_key disk_key; /* extra info to print in case of + * issues, will not always be correct */ + struct list_head collision_resolving_node; /* list node */ + struct list_head all_blocks_node; /* list node */ + + /* the following two lists contain block_link items */ + struct list_head ref_to_list; /* list */ + struct list_head ref_from_list; /* list */ + struct btrfsic_block *next_in_same_bio; + void *orig_bio_bh_private; + union { + bio_end_io_t *bio; + bh_end_io_t *bh; + } orig_bio_bh_end_io; + int submit_bio_bh_rw; + u64 flush_gen; /* only valid if !never_written */ +}; + +/* + * Elements of this type are allocated dynamically and required because + * each block object can refer to and can be ref from multiple blocks. + * The key to lookup them in the hashtable is the dev_bytenr of + * the block ref to plus the one from the block refered from. + * The fact that they are searchable via a hashtable and that a + * ref_cnt is maintained is not required for the btrfs integrity + * check algorithm itself, it is only used to make the output more + * beautiful in case that an error is detected (an error is defined + * as a write operation to a block while that block is still referenced). 
+ */ +struct btrfsic_block_link { + u32 magic_num; /* only used for debug purposes */ + u32 ref_cnt; + struct list_head node_ref_to; /* list node */ + struct list_head node_ref_from; /* list node */ + struct list_head collision_resolving_node; /* list node */ + struct btrfsic_block *block_ref_to; + struct btrfsic_block *block_ref_from; + u64 parent_generation; +}; + +struct btrfsic_dev_state { + u32 magic_num; /* only used for debug purposes */ + struct block_device *bdev; + struct btrfsic_state *state; + struct list_head collision_resolving_node; /* list node */ + struct btrfsic_block dummy_block_for_bio_bh_flush; + u64 last_flush_gen; + char name[BDEVNAME_SIZE]; +}; + +struct btrfsic_block_hashtable { + struct list_head table[BTRFSIC_BLOCK_HASHTABLE_SIZE]; +}; + +struct btrfsic_block_link_hashtable { + struct list_head table[BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE]; +}; + +struct btrfsic_dev_state_hashtable { + struct list_head table[BTRFSIC_DEV2STATE_HASHTABLE_SIZE]; +}; + +struct btrfsic_block_data_ctx { + u64 start; /* virtual bytenr */ + u64 dev_bytenr; /* physical bytenr on device */ + u32 len; + struct btrfsic_dev_state *dev; + char *data; + struct buffer_head *bh; /* do not use if set to NULL */ +}; + +/* This structure is used to implement recursion without occupying + * any stack space, refer to btrfsic_process_metablock() */ +struct btrfsic_stack_frame { + u32 magic; + u32 nr; + int error; + int i; + int limit_nesting; + int num_copies; + int mirror_num; + struct btrfsic_block *block; + struct btrfsic_block_data_ctx *block_ctx; + struct btrfsic_block *next_block; + struct btrfsic_block_data_ctx next_block_ctx; + struct btrfs_header *hdr; + struct btrfsic_stack_frame *prev; +}; + +/* Some state per mounted filesystem */ +struct btrfsic_state { + u32 print_mask; + int include_extent_data; + int csum_size; + struct list_head all_blocks_list; + struct btrfsic_block_hashtable block_hashtable; + struct btrfsic_block_link_hashtable block_link_hashtable; + struct btrfs_root *root; + u64 max_superblock_generation; + struct btrfsic_block *latest_superblock; +}; + +static void btrfsic_block_init(struct btrfsic_block *b); +static struct btrfsic_block *btrfsic_block_alloc(void); +static void btrfsic_block_free(struct btrfsic_block *b); +static void btrfsic_block_link_init(struct btrfsic_block_link *n); +static struct btrfsic_block_link *btrfsic_block_link_alloc(void); +static void btrfsic_block_link_free(struct btrfsic_block_link *n); +static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds); +static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void); +static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds); +static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h); +static void btrfsic_block_hashtable_add(struct btrfsic_block *b, + struct btrfsic_block_hashtable *h); +static void btrfsic_block_hashtable_remove(struct btrfsic_block *b); +static struct btrfsic_block *btrfsic_block_hashtable_lookup( + struct block_device *bdev, + u64 dev_bytenr, + struct btrfsic_block_hashtable *h); +static void btrfsic_block_link_hashtable_init( + struct btrfsic_block_link_hashtable *h); +static void btrfsic_block_link_hashtable_add( + struct btrfsic_block_link *l, + struct btrfsic_block_link_hashtable *h); +static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l); +static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup( + struct block_device *bdev_ref_to, + u64 dev_bytenr_ref_to, + struct block_device *bdev_ref_from, + u64 
dev_bytenr_ref_from, + struct btrfsic_block_link_hashtable *h); +static void btrfsic_dev_state_hashtable_init( + struct btrfsic_dev_state_hashtable *h); +static void btrfsic_dev_state_hashtable_add( + struct btrfsic_dev_state *ds, + struct btrfsic_dev_state_hashtable *h); +static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds); +static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup( + struct block_device *bdev, + struct btrfsic_dev_state_hashtable *h); +static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void); +static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf); +static int btrfsic_process_superblock(struct btrfsic_state *state, + struct btrfs_fs_devices *fs_devices); +static int btrfsic_process_metablock(struct btrfsic_state *state, + struct btrfsic_block *block, + struct btrfsic_block_data_ctx *block_ctx, + struct btrfs_header *hdr, + int limit_nesting, int force_iodone_flag); +static int btrfsic_create_link_to_next_block( + struct btrfsic_state *state, + struct btrfsic_block *block, + struct btrfsic_block_data_ctx + *block_ctx, u64 next_bytenr, + int limit_nesting, + struct btrfsic_block_data_ctx *next_block_ctx, + struct btrfsic_block **next_blockp, + int force_iodone_flag, + int *num_copiesp, int *mirror_nump, + struct btrfs_disk_key *disk_key, + u64 parent_generation); +static int btrfsic_handle_extent_data(struct btrfsic_state *state, + struct btrfsic_block *block, + struct btrfsic_block_data_ctx *block_ctx, + u32 item_offset, int force_iodone_flag); +static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len, + struct btrfsic_block_data_ctx *block_ctx_out, + int mirror_num); +static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr, + u32 len, struct block_device *bdev, + struct btrfsic_block_data_ctx *block_ctx_out); +static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx); +static int btrfsic_read_block(struct btrfsic_state *state, + struct btrfsic_block_data_ctx *block_ctx); +static void btrfsic_dump_database(struct btrfsic_state *state); +static int btrfsic_test_for_metadata(struct btrfsic_state *state, + const u8 *data, unsigned int size); +static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, + u64 dev_bytenr, u8 *mapped_data, + unsigned int len, struct bio *bio, + int *bio_is_patched, + struct buffer_head *bh, + int submit_bio_bh_rw); +static int btrfsic_process_written_superblock( + struct btrfsic_state *state, + struct btrfsic_block *const block, + struct btrfs_super_block *const super_hdr); +static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status); +static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate); +static int btrfsic_is_block_ref_by_superblock(const struct btrfsic_state *state, + const struct btrfsic_block *block, + int recursion_level); +static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state, + struct btrfsic_block *const block, + int recursion_level); +static void btrfsic_print_add_link(const struct btrfsic_state *state, + const struct btrfsic_block_link *l); +static void btrfsic_print_rem_link(const struct btrfsic_state *state, + const struct btrfsic_block_link *l); +static char btrfsic_get_block_type(const struct btrfsic_state *state, + const struct btrfsic_block *block); +static void btrfsic_dump_tree(const struct btrfsic_state *state); +static void btrfsic_dump_tree_sub(const struct btrfsic_state *state, + const struct btrfsic_block *block, + int indent_level); +static struct 
btrfsic_block_link *btrfsic_block_link_lookup_or_add( + struct btrfsic_state *state, + struct btrfsic_block_data_ctx *next_block_ctx, + struct btrfsic_block *next_block, + struct btrfsic_block *from_block, + u64 parent_generation); +static struct btrfsic_block *btrfsic_block_lookup_or_add( + struct btrfsic_state *state, + struct btrfsic_block_data_ctx *block_ctx, + const char *additional_string, + int is_metadata, + int is_iodone, + int never_written, + int mirror_num, + int *was_created); +static int btrfsic_process_superblock_dev_mirror( + struct btrfsic_state *state, + struct btrfsic_dev_state *dev_state, + struct btrfs_device *device, + int superblock_mirror_num, + struct btrfsic_dev_state **selected_dev_state, + struct btrfs_super_block *selected_super); +static struct btrfsic_dev_state *btrfsic_dev_state_lookup( + struct block_device *bdev); +static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, + u64 bytenr, + struct btrfsic_dev_state *dev_state, + u64 dev_bytenr, char *data); + +static struct mutex btrfsic_mutex; +static int btrfsic_is_initialized; +static struct btrfsic_dev_state_hashtable btrfsic_dev_state_hashtable; + + +static void btrfsic_block_init(struct btrfsic_block *b) +{ + b->magic_num = BTRFSIC_BLOCK_MAGIC_NUMBER; + b->dev_state = NULL; + b->dev_bytenr = 0; + b->logical_bytenr = 0; + b->generation = BTRFSIC_GENERATION_UNKNOWN; + b->disk_key.objectid = 0; + b->disk_key.type = 0; + b->disk_key.offset = 0; + b->is_metadata = 0; + b->is_superblock = 0; + b->is_iodone = 0; + b->iodone_w_error = 0; + b->never_written = 0; + b->mirror_num = 0; + b->next_in_same_bio = NULL; + b->orig_bio_bh_private = NULL; + b->orig_bio_bh_end_io.bio = NULL; + INIT_LIST_HEAD(&b->collision_resolving_node); + INIT_LIST_HEAD(&b->all_blocks_node); + INIT_LIST_HEAD(&b->ref_to_list); + INIT_LIST_HEAD(&b->ref_from_list); + b->submit_bio_bh_rw = 0; + b->flush_gen = 0; +} + +static struct btrfsic_block *btrfsic_block_alloc(void) +{ + struct btrfsic_block *b; + + b = kzalloc(sizeof(*b), GFP_NOFS); + if (NULL != b) + btrfsic_block_init(b); + + return b; +} + +static void btrfsic_block_free(struct btrfsic_block *b) +{ + BUG_ON(!(NULL == b || BTRFSIC_BLOCK_MAGIC_NUMBER == b->magic_num)); + kfree(b); +} + +static void btrfsic_block_link_init(struct btrfsic_block_link *l) +{ + l->magic_num = BTRFSIC_BLOCK_LINK_MAGIC_NUMBER; + l->ref_cnt = 1; + INIT_LIST_HEAD(&l->node_ref_to); + INIT_LIST_HEAD(&l->node_ref_from); + INIT_LIST_HEAD(&l->collision_resolving_node); + l->block_ref_to = NULL; + l->block_ref_from = NULL; +} + +static struct btrfsic_block_link *btrfsic_block_link_alloc(void) +{ + struct btrfsic_block_link *l; + + l = kzalloc(sizeof(*l), GFP_NOFS); + if (NULL != l) + btrfsic_block_link_init(l); + + return l; +} + +static void btrfsic_block_link_free(struct btrfsic_block_link *l) +{ + BUG_ON(!(NULL == l || BTRFSIC_BLOCK_LINK_MAGIC_NUMBER == l->magic_num)); + kfree(l); +} + +static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds) +{ + ds->magic_num = BTRFSIC_DEV2STATE_MAGIC_NUMBER; + ds->bdev = NULL; + ds->state = NULL; + ds->name[0] = '\0'; + INIT_LIST_HEAD(&ds->collision_resolving_node); + ds->last_flush_gen = 0; + btrfsic_block_init(&ds->dummy_block_for_bio_bh_flush); + ds->dummy_block_for_bio_bh_flush.is_iodone = 1; + ds->dummy_block_for_bio_bh_flush.dev_state = ds; +} + +static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void) +{ + struct btrfsic_dev_state *ds; + + ds = kzalloc(sizeof(*ds), GFP_NOFS); + if (NULL != ds) + btrfsic_dev_state_init(ds); + + return ds; +} + 
+static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds) +{ + BUG_ON(!(NULL == ds || + BTRFSIC_DEV2STATE_MAGIC_NUMBER == ds->magic_num)); + kfree(ds); +} + +static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h) +{ + int i; + + for (i = 0; i < BTRFSIC_BLOCK_HASHTABLE_SIZE; i++) + INIT_LIST_HEAD(h->table + i); +} + +static void btrfsic_block_hashtable_add(struct btrfsic_block *b, + struct btrfsic_block_hashtable *h) +{ + const unsigned int hashval = + (((unsigned int)(b->dev_bytenr >> 16)) ^ + ((unsigned int)((uintptr_t)b->dev_state->bdev))) & + (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1); + + list_add(&b->collision_resolving_node, h->table + hashval); +} + +static void btrfsic_block_hashtable_remove(struct btrfsic_block *b) +{ + list_del(&b->collision_resolving_node); +} + +static struct btrfsic_block *btrfsic_block_hashtable_lookup( + struct block_device *bdev, + u64 dev_bytenr, + struct btrfsic_block_hashtable *h) +{ + const unsigned int hashval = + (((unsigned int)(dev_bytenr >> 16)) ^ + ((unsigned int)((uintptr_t)bdev))) & + (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1); + struct list_head *elem; + + list_for_each(elem, h->table + hashval) { + struct btrfsic_block *const b = + list_entry(elem, struct btrfsic_block, + collision_resolving_node); + + if (b->dev_state->bdev == bdev && b->dev_bytenr == dev_bytenr) + return b; + } + + return NULL; +} + +static void btrfsic_block_link_hashtable_init( + struct btrfsic_block_link_hashtable *h) +{ + int i; + + for (i = 0; i < BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE; i++) + INIT_LIST_HEAD(h->table + i); +} + +static void btrfsic_block_link_hashtable_add( + struct btrfsic_block_link *l, + struct btrfsic_block_link_hashtable *h) +{ + const unsigned int hashval = + (((unsigned int)(l->block_ref_to->dev_bytenr >> 16)) ^ + ((unsigned int)(l->block_ref_from->dev_bytenr >> 16)) ^ + ((unsigned int)((uintptr_t)l->block_ref_to->dev_state->bdev)) ^ + ((unsigned int)((uintptr_t)l->block_ref_from->dev_state->bdev))) + & (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1); + + BUG_ON(NULL == l->block_ref_to); + BUG_ON(NULL == l->block_ref_from); + list_add(&l->collision_resolving_node, h->table + hashval); +} + +static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l) +{ + list_del(&l->collision_resolving_node); +} + +static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup( + struct block_device *bdev_ref_to, + u64 dev_bytenr_ref_to, + struct block_device *bdev_ref_from, + u64 dev_bytenr_ref_from, + struct btrfsic_block_link_hashtable *h) +{ + const unsigned int hashval = + (((unsigned int)(dev_bytenr_ref_to >> 16)) ^ + ((unsigned int)(dev_bytenr_ref_from >> 16)) ^ + ((unsigned int)((uintptr_t)bdev_ref_to)) ^ + ((unsigned int)((uintptr_t)bdev_ref_from))) & + (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1); + struct list_head *elem; + + list_for_each(elem, h->table + hashval) { + struct btrfsic_block_link *const l = + list_entry(elem, struct btrfsic_block_link, + collision_resolving_node); + + BUG_ON(NULL == l->block_ref_to); + BUG_ON(NULL == l->block_ref_from); + if (l->block_ref_to->dev_state->bdev == bdev_ref_to && + l->block_ref_to->dev_bytenr == dev_bytenr_ref_to && + l->block_ref_from->dev_state->bdev == bdev_ref_from && + l->block_ref_from->dev_bytenr == dev_bytenr_ref_from) + return l; + } + + return NULL; +} + +static void btrfsic_dev_state_hashtable_init( + struct btrfsic_dev_state_hashtable *h) +{ + int i; + + for (i = 0; i < BTRFSIC_DEV2STATE_HASHTABLE_SIZE; i++) + INIT_LIST_HEAD(h->table + i); +} + +static void 
btrfsic_dev_state_hashtable_add( + struct btrfsic_dev_state *ds, + struct btrfsic_dev_state_hashtable *h) +{ + const unsigned int hashval = + (((unsigned int)((uintptr_t)ds->bdev)) & + (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1)); + + list_add(&ds->collision_resolving_node, h->table + hashval); +} + +static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds) +{ + list_del(&ds->collision_resolving_node); +} + +static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup( + struct block_device *bdev, + struct btrfsic_dev_state_hashtable *h) +{ + const unsigned int hashval = + (((unsigned int)((uintptr_t)bdev)) & + (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1)); + struct list_head *elem; + + list_for_each(elem, h->table + hashval) { + struct btrfsic_dev_state *const ds = + list_entry(elem, struct btrfsic_dev_state, + collision_resolving_node); + + if (ds->bdev == bdev) + return ds; + } + + return NULL; +} + +static int btrfsic_process_superblock(struct btrfsic_state *state, + struct btrfs_fs_devices *fs_devices) +{ + int ret; + struct btrfs_super_block *selected_super; + struct list_head *dev_head = &fs_devices->devices; + struct btrfs_device *device; + struct btrfsic_dev_state *selected_dev_state = NULL; + int pass; + + BUG_ON(NULL == state); + selected_super = kmalloc(sizeof(*selected_super), GFP_NOFS); + if (NULL == selected_super) { + printk(KERN_INFO "btrfsic: error, kmalloc failed!\n"); + return -1; + } + + list_for_each_entry(device, dev_head, dev_list) { + int i; + struct btrfsic_dev_state *dev_state; + + if (!device->bdev || !device->name) + continue; + + dev_state = btrfsic_dev_state_lookup(device->bdev); + BUG_ON(NULL == dev_state); + for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { + ret = btrfsic_process_superblock_dev_mirror( + state, dev_state, device, i, + &selected_dev_state, selected_super); + if (0 != ret && 0 == i) { + kfree(selected_super); + return ret; + } + } + } + + if (NULL == state->latest_superblock) { + printk(KERN_INFO "btrfsic: no superblock found!\n"); + kfree(selected_super); + return -1; + } + + state->csum_size = btrfs_super_csum_size(selected_super); + + for (pass = 0; pass < 3; pass++) { + int num_copies; + int mirror_num; + u64 next_bytenr; + + switch (pass) { + case 0: + next_bytenr = btrfs_super_root(selected_super); + if (state->print_mask & + BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) + printk(KERN_INFO "root@%llu\n", + (unsigned long long)next_bytenr); + break; + case 1: + next_bytenr = btrfs_super_chunk_root(selected_super); + if (state->print_mask & + BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) + printk(KERN_INFO "chunk@%llu\n", + (unsigned long long)next_bytenr); + break; + case 2: + next_bytenr = btrfs_super_log_root(selected_super); + if (0 == next_bytenr) + continue; + if (state->print_mask & + BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) + printk(KERN_INFO "log@%llu\n", + (unsigned long long)next_bytenr); + break; + } + + num_copies = + btrfs_num_copies(&state->root->fs_info->mapping_tree, + next_bytenr, PAGE_SIZE); + if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) + printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", + (unsigned long long)next_bytenr, num_copies); + + for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { + struct btrfsic_block *next_block; + struct btrfsic_block_data_ctx tmp_next_block_ctx; + struct btrfsic_block_link *l; + struct btrfs_header *hdr; + + ret = btrfsic_map_block(state, next_bytenr, PAGE_SIZE, + &tmp_next_block_ctx, + mirror_num); + if (ret) { + printk(KERN_INFO 
"btrfsic:" + " btrfsic_map_block(root @%llu," + " mirror %d) failed!\n", + (unsigned long long)next_bytenr, + mirror_num); + kfree(selected_super); + return -1; + } + + next_block = btrfsic_block_hashtable_lookup( + tmp_next_block_ctx.dev->bdev, + tmp_next_block_ctx.dev_bytenr, + &state->block_hashtable); + BUG_ON(NULL == next_block); + + l = btrfsic_block_link_hashtable_lookup( + tmp_next_block_ctx.dev->bdev, + tmp_next_block_ctx.dev_bytenr, + state->latest_superblock->dev_state-> + bdev, + state->latest_superblock->dev_bytenr, + &state->block_link_hashtable); + BUG_ON(NULL == l); + + ret = btrfsic_read_block(state, &tmp_next_block_ctx); + if (ret < (int)BTRFSIC_BLOCK_SIZE) { + printk(KERN_INFO + "btrfsic: read @logical %llu failed!\n", + (unsigned long long) + tmp_next_block_ctx.start); + btrfsic_release_block_ctx(&tmp_next_block_ctx); + kfree(selected_super); + return -1; + } + + hdr = (struct btrfs_header *)tmp_next_block_ctx.data; + ret = btrfsic_process_metablock(state, + next_block, + &tmp_next_block_ctx, + hdr, + BTRFS_MAX_LEVEL + 3, 1); + btrfsic_release_block_ctx(&tmp_next_block_ctx); + } + } + + kfree(selected_super); + return ret; +} + +static int btrfsic_process_superblock_dev_mirror( + struct btrfsic_state *state, + struct btrfsic_dev_state *dev_state, + struct btrfs_device *device, + int superblock_mirror_num, + struct btrfsic_dev_state **selected_dev_state, + struct btrfs_super_block *selected_super) +{ + struct btrfs_super_block *super_tmp; + u64 dev_bytenr; + struct buffer_head *bh; + struct btrfsic_block *superblock_tmp; + int pass; + struct block_device *const superblock_bdev = device->bdev; + + /* super block bytenr is always the unmapped device bytenr */ + dev_bytenr = btrfs_sb_offset(superblock_mirror_num); + bh = __bread(superblock_bdev, dev_bytenr / 4096, 4096); + if (NULL == bh) + return -1; + super_tmp = (struct btrfs_super_block *) + (bh->b_data + (dev_bytenr & 4095)); + + if (btrfs_super_bytenr(super_tmp) != dev_bytenr || + strncmp((char *)(&(super_tmp->magic)), BTRFS_MAGIC, + sizeof(super_tmp->magic)) || + memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE)) { + brelse(bh); + return 0; + } + + superblock_tmp = + btrfsic_block_hashtable_lookup(superblock_bdev, + dev_bytenr, + &state->block_hashtable); + if (NULL == superblock_tmp) { + superblock_tmp = btrfsic_block_alloc(); + if (NULL == superblock_tmp) { + printk(KERN_INFO "btrfsic: error, kmalloc failed!\n"); + brelse(bh); + return -1; + } + /* for superblock, only the dev_bytenr makes sense */ + superblock_tmp->dev_bytenr = dev_bytenr; + superblock_tmp->dev_state = dev_state; + superblock_tmp->logical_bytenr = dev_bytenr; + superblock_tmp->generation = btrfs_super_generation(super_tmp); + superblock_tmp->is_metadata = 1; + superblock_tmp->is_superblock = 1; + superblock_tmp->is_iodone = 1; + superblock_tmp->never_written = 0; + superblock_tmp->mirror_num = 1 + superblock_mirror_num; + if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE) + printk(KERN_INFO "New initial S-block (bdev %p, %s)" + " @%llu (%s/%llu/%d)\n", + superblock_bdev, device->name, + (unsigned long long)dev_bytenr, + dev_state->name, + (unsigned long long)dev_bytenr, + superblock_mirror_num); + list_add(&superblock_tmp->all_blocks_node, + &state->all_blocks_list); + btrfsic_block_hashtable_add(superblock_tmp, + &state->block_hashtable); + } + + /* select the one with the highest generation field */ + if (btrfs_super_generation(super_tmp) > + state->max_superblock_generation || + 0 == state->max_superblock_generation) { + 
memcpy(selected_super, super_tmp, sizeof(*selected_super)); + *selected_dev_state = dev_state; + state->max_superblock_generation = + btrfs_super_generation(super_tmp); + state->latest_superblock = superblock_tmp; + } + + for (pass = 0; pass < 3; pass++) { + u64 next_bytenr; + int num_copies; + int mirror_num; + const char *additional_string = NULL; + struct btrfs_disk_key tmp_disk_key; + + tmp_disk_key.type = BTRFS_ROOT_ITEM_KEY; + tmp_disk_key.offset = 0; + switch (pass) { + case 0: + tmp_disk_key.objectid = + cpu_to_le64(BTRFS_ROOT_TREE_OBJECTID); + additional_string = "initial root "; + next_bytenr = btrfs_super_root(super_tmp); + break; + case 1: + tmp_disk_key.objectid = + cpu_to_le64(BTRFS_CHUNK_TREE_OBJECTID); + additional_string = "initial chunk "; + next_bytenr = btrfs_super_chunk_root(super_tmp); + break; + case 2: + tmp_disk_key.objectid = + cpu_to_le64(BTRFS_TREE_LOG_OBJECTID); + additional_string = "initial log "; + next_bytenr = btrfs_super_log_root(super_tmp); + if (0 == next_bytenr) + continue; + break; + } + + num_copies = + btrfs_num_copies(&state->root->fs_info->mapping_tree, + next_bytenr, PAGE_SIZE); + if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) + printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", + (unsigned long long)next_bytenr, num_copies); + for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { + struct btrfsic_block *next_block; + struct btrfsic_block_data_ctx tmp_next_block_ctx; + struct btrfsic_block_link *l; + + if (btrfsic_map_block(state, next_bytenr, PAGE_SIZE, + &tmp_next_block_ctx, + mirror_num)) { + printk(KERN_INFO "btrfsic: btrfsic_map_block(" + "bytenr @%llu, mirror %d) failed!\n", + (unsigned long long)next_bytenr, + mirror_num); + brelse(bh); + return -1; + } + + next_block = btrfsic_block_lookup_or_add( + state, &tmp_next_block_ctx, + additional_string, 1, 1, 0, + mirror_num, NULL); + if (NULL == next_block) { + btrfsic_release_block_ctx(&tmp_next_block_ctx); + brelse(bh); + return -1; + } + + next_block->disk_key = tmp_disk_key; + next_block->generation = BTRFSIC_GENERATION_UNKNOWN; + l = btrfsic_block_link_lookup_or_add( + state, &tmp_next_block_ctx, + next_block, superblock_tmp, + BTRFSIC_GENERATION_UNKNOWN); + btrfsic_release_block_ctx(&tmp_next_block_ctx); + if (NULL == l) { + brelse(bh); + return -1; + } + } + } + if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES) + btrfsic_dump_tree_sub(state, superblock_tmp, 0); + + brelse(bh); + return 0; +} + +static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void) +{ + struct btrfsic_stack_frame *sf; + + sf = kzalloc(sizeof(*sf), GFP_NOFS); + if (NULL == sf) + printk(KERN_INFO "btrfsic: alloc memory failed!\n"); + else + sf->magic = BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER; + return sf; +} + +static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf) +{ + BUG_ON(!(NULL == sf || + BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER == sf->magic)); + kfree(sf); +} + +static int btrfsic_process_metablock( + struct btrfsic_state *state, + struct btrfsic_block *const first_block, + struct btrfsic_block_data_ctx *const first_block_ctx, + struct btrfs_header *const first_hdr, + int first_limit_nesting, int force_iodone_flag) +{ + struct btrfsic_stack_frame initial_stack_frame = { 0 }; + struct btrfsic_stack_frame *sf; + struct btrfsic_stack_frame *next_stack; + + sf = &initial_stack_frame; + sf->error = 0; + sf->i = -1; + sf->limit_nesting = first_limit_nesting; + sf->block = first_block; + sf->block_ctx = first_block_ctx; + sf->next_block = NULL; + sf->hdr = first_hdr; 
+ sf->prev = NULL; + +continue_with_new_stack_frame: + sf->block->generation = le64_to_cpu(sf->hdr->generation); + if (0 == sf->hdr->level) { + struct btrfs_leaf *const leafhdr = + (struct btrfs_leaf *)sf->hdr; + + if (-1 == sf->i) { + sf->nr = le32_to_cpu(leafhdr->header.nritems); + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "leaf %llu items %d generation %llu" + " owner %llu\n", + (unsigned long long) + sf->block_ctx->start, + sf->nr, + (unsigned long long) + le64_to_cpu(leafhdr->header.generation), + (unsigned long long) + le64_to_cpu(leafhdr->header.owner)); + } + +continue_with_current_leaf_stack_frame: + if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) { + sf->i++; + sf->num_copies = 0; + } + + if (sf->i < sf->nr) { + struct btrfs_item *disk_item = leafhdr->items + sf->i; + struct btrfs_disk_key *disk_key = &disk_item->key; + u8 type; + const u32 item_offset = le32_to_cpu(disk_item->offset); + + type = disk_key->type; + + if (BTRFS_ROOT_ITEM_KEY == type) { + const struct btrfs_root_item *const root_item = + (struct btrfs_root_item *) + (sf->block_ctx->data + + offsetof(struct btrfs_leaf, items) + + item_offset); + const u64 next_bytenr = + le64_to_cpu(root_item->bytenr); + + sf->error = + btrfsic_create_link_to_next_block( + state, + sf->block, + sf->block_ctx, + next_bytenr, + sf->limit_nesting, + &sf->next_block_ctx, + &sf->next_block, + force_iodone_flag, + &sf->num_copies, + &sf->mirror_num, + disk_key, + le64_to_cpu(root_item-> + generation)); + if (sf->error) + goto one_stack_frame_backwards; + + if (NULL != sf->next_block) { + struct btrfs_header *const next_hdr = + (struct btrfs_header *) + sf->next_block_ctx.data; + + next_stack = + btrfsic_stack_frame_alloc(); + if (NULL == next_stack) { + btrfsic_release_block_ctx( + &sf-> + next_block_ctx); + goto one_stack_frame_backwards; + } + + next_stack->i = -1; + next_stack->block = sf->next_block; + next_stack->block_ctx = + &sf->next_block_ctx; + next_stack->next_block = NULL; + next_stack->hdr = next_hdr; + next_stack->limit_nesting = + sf->limit_nesting - 1; + next_stack->prev = sf; + sf = next_stack; + goto continue_with_new_stack_frame; + } + } else if (BTRFS_EXTENT_DATA_KEY == type && + state->include_extent_data) { + sf->error = btrfsic_handle_extent_data( + state, + sf->block, + sf->block_ctx, + item_offset, + force_iodone_flag); + if (sf->error) + goto one_stack_frame_backwards; + } + + goto continue_with_current_leaf_stack_frame; + } + } else { + struct btrfs_node *const nodehdr = (struct btrfs_node *)sf->hdr; + + if (-1 == sf->i) { + sf->nr = le32_to_cpu(nodehdr->header.nritems); + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO "node %llu level %d items %d" + " generation %llu owner %llu\n", + (unsigned long long) + sf->block_ctx->start, + nodehdr->header.level, sf->nr, + (unsigned long long) + le64_to_cpu(nodehdr->header.generation), + (unsigned long long) + le64_to_cpu(nodehdr->header.owner)); + } + +continue_with_current_node_stack_frame: + if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) { + sf->i++; + sf->num_copies = 0; + } + + if (sf->i < sf->nr) { + struct btrfs_key_ptr *disk_key_ptr = + nodehdr->ptrs + sf->i; + const u64 next_bytenr = + le64_to_cpu(disk_key_ptr->blockptr); + + sf->error = btrfsic_create_link_to_next_block( + state, + sf->block, + sf->block_ctx, + next_bytenr, + sf->limit_nesting, + &sf->next_block_ctx, + &sf->next_block, + force_iodone_flag, + &sf->num_copies, + &sf->mirror_num, + &disk_key_ptr->key, + 
le64_to_cpu(disk_key_ptr->generation)); + if (sf->error) + goto one_stack_frame_backwards; + + if (NULL != sf->next_block) { + struct btrfs_header *const next_hdr = + (struct btrfs_header *) + sf->next_block_ctx.data; + + next_stack = btrfsic_stack_frame_alloc(); + if (NULL == next_stack) + goto one_stack_frame_backwards; + + next_stack->i = -1; + next_stack->block = sf->next_block; + next_stack->block_ctx = &sf->next_block_ctx; + next_stack->next_block = NULL; + next_stack->hdr = next_hdr; + next_stack->limit_nesting = + sf->limit_nesting - 1; + next_stack->prev = sf; + sf = next_stack; + goto continue_with_new_stack_frame; + } + + goto continue_with_current_node_stack_frame; + } + } + +one_stack_frame_backwards: + if (NULL != sf->prev) { + struct btrfsic_stack_frame *const prev = sf->prev; + + /* the one for the initial block is freed in the caller */ + btrfsic_release_block_ctx(sf->block_ctx); + + if (sf->error) { + prev->error = sf->error; + btrfsic_stack_frame_free(sf); + sf = prev; + goto one_stack_frame_backwards; + } + + btrfsic_stack_frame_free(sf); + sf = prev; + goto continue_with_new_stack_frame; + } else { + BUG_ON(&initial_stack_frame != sf); + } + + return sf->error; +} + +static int btrfsic_create_link_to_next_block( + struct btrfsic_state *state, + struct btrfsic_block *block, + struct btrfsic_block_data_ctx *block_ctx, + u64 next_bytenr, + int limit_nesting, + struct btrfsic_block_data_ctx *next_block_ctx, + struct btrfsic_block **next_blockp, + int force_iodone_flag, + int *num_copiesp, int *mirror_nump, + struct btrfs_disk_key *disk_key, + u64 parent_generation) +{ + struct btrfsic_block *next_block = NULL; + int ret; + struct btrfsic_block_link *l; + int did_alloc_block_link; + int block_was_created; + + *next_blockp = NULL; + if (0 == *num_copiesp) { + *num_copiesp = + btrfs_num_copies(&state->root->fs_info->mapping_tree, + next_bytenr, PAGE_SIZE); + if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) + printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", + (unsigned long long)next_bytenr, *num_copiesp); + *mirror_nump = 1; + } + + if (*mirror_nump > *num_copiesp) + return 0; + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "btrfsic_create_link_to_next_block(mirror_num=%d)\n", + *mirror_nump); + ret = btrfsic_map_block(state, next_bytenr, + BTRFSIC_BLOCK_SIZE, + next_block_ctx, *mirror_nump); + if (ret) { + printk(KERN_INFO + "btrfsic: btrfsic_map_block(@%llu, mirror=%d) failed!\n", + (unsigned long long)next_bytenr, *mirror_nump); + btrfsic_release_block_ctx(next_block_ctx); + *next_blockp = NULL; + return -1; + } + + next_block = btrfsic_block_lookup_or_add(state, + next_block_ctx, "referenced ", + 1, force_iodone_flag, + !force_iodone_flag, + *mirror_nump, + &block_was_created); + if (NULL == next_block) { + btrfsic_release_block_ctx(next_block_ctx); + *next_blockp = NULL; + return -1; + } + if (block_was_created) { + l = NULL; + next_block->generation = BTRFSIC_GENERATION_UNKNOWN; + } else { + if (next_block->logical_bytenr != next_bytenr && + !(!next_block->is_metadata && + 0 == next_block->logical_bytenr)) { + printk(KERN_INFO + "Referenced block @%llu (%s/%llu/%d)" + " found in hash table, %c," + " bytenr mismatch (!= stored %llu).\n", + (unsigned long long)next_bytenr, + next_block_ctx->dev->name, + (unsigned long long)next_block_ctx->dev_bytenr, + *mirror_nump, + btrfsic_get_block_type(state, next_block), + (unsigned long long)next_block->logical_bytenr); + } else if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + 
printk(KERN_INFO + "Referenced block @%llu (%s/%llu/%d)" + " found in hash table, %c.\n", + (unsigned long long)next_bytenr, + next_block_ctx->dev->name, + (unsigned long long)next_block_ctx->dev_bytenr, + *mirror_nump, + btrfsic_get_block_type(state, next_block)); + next_block->logical_bytenr = next_bytenr; + + next_block->mirror_num = *mirror_nump; + l = btrfsic_block_link_hashtable_lookup( + next_block_ctx->dev->bdev, + next_block_ctx->dev_bytenr, + block_ctx->dev->bdev, + block_ctx->dev_bytenr, + &state->block_link_hashtable); + } + + next_block->disk_key = *disk_key; + if (NULL == l) { + l = btrfsic_block_link_alloc(); + if (NULL == l) { + printk(KERN_INFO "btrfsic: error, kmalloc failed!\n"); + btrfsic_release_block_ctx(next_block_ctx); + *next_blockp = NULL; + return -1; + } + + did_alloc_block_link = 1; + l->block_ref_to = next_block; + l->block_ref_from = block; + l->ref_cnt = 1; + l->parent_generation = parent_generation; + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + btrfsic_print_add_link(state, l); + + list_add(&l->node_ref_to, &block->ref_to_list); + list_add(&l->node_ref_from, &next_block->ref_from_list); + + btrfsic_block_link_hashtable_add(l, + &state->block_link_hashtable); + } else { + did_alloc_block_link = 0; + if (0 == limit_nesting) { + l->ref_cnt++; + l->parent_generation = parent_generation; + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + btrfsic_print_add_link(state, l); + } + } + + if (limit_nesting > 0 && did_alloc_block_link) { + ret = btrfsic_read_block(state, next_block_ctx); + if (ret < (int)BTRFSIC_BLOCK_SIZE) { + printk(KERN_INFO + "btrfsic: read block @logical %llu failed!\n", + (unsigned long long)next_bytenr); + btrfsic_release_block_ctx(next_block_ctx); + *next_blockp = NULL; + return -1; + } + + *next_blockp = next_block; + } else { + *next_blockp = NULL; + } + (*mirror_nump)++; + + return 0; +} + +static int btrfsic_handle_extent_data( + struct btrfsic_state *state, + struct btrfsic_block *block, + struct btrfsic_block_data_ctx *block_ctx, + u32 item_offset, int force_iodone_flag) +{ + int ret; + struct btrfs_file_extent_item *file_extent_item = + (struct btrfs_file_extent_item *)(block_ctx->data + + offsetof(struct btrfs_leaf, + items) + item_offset); + u64 next_bytenr = + le64_to_cpu(file_extent_item->disk_bytenr) + + le64_to_cpu(file_extent_item->offset); + u64 num_bytes = le64_to_cpu(file_extent_item->num_bytes); + u64 generation = le64_to_cpu(file_extent_item->generation); + struct btrfsic_block_link *l; + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE) + printk(KERN_INFO "extent_data: type %u, disk_bytenr = %llu," + " offset = %llu, num_bytes = %llu\n", + file_extent_item->type, + (unsigned long long) + le64_to_cpu(file_extent_item->disk_bytenr), + (unsigned long long) + le64_to_cpu(file_extent_item->offset), + (unsigned long long) + le64_to_cpu(file_extent_item->num_bytes)); + if (BTRFS_FILE_EXTENT_REG != file_extent_item->type || + ((u64)0) == le64_to_cpu(file_extent_item->disk_bytenr)) + return 0; + while (num_bytes > 0) { + u32 chunk_len; + int num_copies; + int mirror_num; + + if (num_bytes > BTRFSIC_BLOCK_SIZE) + chunk_len = BTRFSIC_BLOCK_SIZE; + else + chunk_len = num_bytes; + + num_copies = + btrfs_num_copies(&state->root->fs_info->mapping_tree, + next_bytenr, PAGE_SIZE); + if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) + printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", + (unsigned long long)next_bytenr, num_copies); + for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { + struct 
btrfsic_block_data_ctx next_block_ctx; + struct btrfsic_block *next_block; + int block_was_created; + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO "btrfsic_handle_extent_data(" + "mirror_num=%d)\n", mirror_num); + if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE) + printk(KERN_INFO + "\tdisk_bytenr = %llu, num_bytes %u\n", + (unsigned long long)next_bytenr, + chunk_len); + ret = btrfsic_map_block(state, next_bytenr, + chunk_len, &next_block_ctx, + mirror_num); + if (ret) { + printk(KERN_INFO + "btrfsic: btrfsic_map_block(@%llu," + " mirror=%d) failed!\n", + (unsigned long long)next_bytenr, + mirror_num); + return -1; + } + + next_block = btrfsic_block_lookup_or_add( + state, + &next_block_ctx, + "referenced ", + 0, + force_iodone_flag, + !force_iodone_flag, + mirror_num, + &block_was_created); + if (NULL == next_block) { + printk(KERN_INFO + "btrfsic: error, kmalloc failed!\n"); + btrfsic_release_block_ctx(&next_block_ctx); + return -1; + } + if (!block_was_created) { + if (next_block->logical_bytenr != next_bytenr && + !(!next_block->is_metadata && + 0 == next_block->logical_bytenr)) { + printk(KERN_INFO + "Referenced block" + " @%llu (%s/%llu/%d)" + " found in hash table, D," + " bytenr mismatch" + " (!= stored %llu).\n", + (unsigned long long)next_bytenr, + next_block_ctx.dev->name, + (unsigned long long) + next_block_ctx.dev_bytenr, + mirror_num, + (unsigned long long) + next_block->logical_bytenr); + } + next_block->logical_bytenr = next_bytenr; + next_block->mirror_num = mirror_num; + } + + l = btrfsic_block_link_lookup_or_add(state, + &next_block_ctx, + next_block, block, + generation); + btrfsic_release_block_ctx(&next_block_ctx); + if (NULL == l) + return -1; + } + + next_bytenr += chunk_len; + num_bytes -= chunk_len; + } + + return 0; +} + +static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len, + struct btrfsic_block_data_ctx *block_ctx_out, + int mirror_num) +{ + int ret; + u64 length; + struct btrfs_bio *multi = NULL; + struct btrfs_device *device; + + length = len; + ret = btrfs_map_block(&state->root->fs_info->mapping_tree, READ, + bytenr, &length, &multi, mirror_num); + + device = multi->stripes[0].dev; + block_ctx_out->dev = btrfsic_dev_state_lookup(device->bdev); + block_ctx_out->dev_bytenr = multi->stripes[0].physical; + block_ctx_out->start = bytenr; + block_ctx_out->len = len; + block_ctx_out->data = NULL; + block_ctx_out->bh = NULL; + + if (0 == ret) + kfree(multi); + if (NULL == block_ctx_out->dev) { + ret = -ENXIO; + printk(KERN_INFO "btrfsic: error, cannot lookup dev (#1)!\n"); + } + + return ret; +} + +static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr, + u32 len, struct block_device *bdev, + struct btrfsic_block_data_ctx *block_ctx_out) +{ + block_ctx_out->dev = btrfsic_dev_state_lookup(bdev); + block_ctx_out->dev_bytenr = bytenr; + block_ctx_out->start = bytenr; + block_ctx_out->len = len; + block_ctx_out->data = NULL; + block_ctx_out->bh = NULL; + if (NULL != block_ctx_out->dev) { + return 0; + } else { + printk(KERN_INFO "btrfsic: error, cannot lookup dev (#2)!\n"); + return -ENXIO; + } +} + +static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx) +{ + if (NULL != block_ctx->bh) { + brelse(block_ctx->bh); + block_ctx->bh = NULL; + } +} + +static int btrfsic_read_block(struct btrfsic_state *state, + struct btrfsic_block_data_ctx *block_ctx) +{ + block_ctx->bh = NULL; + if (block_ctx->dev_bytenr & 4095) { + printk(KERN_INFO + "btrfsic: read_block() with 
unaligned bytenr %llu\n", + (unsigned long long)block_ctx->dev_bytenr); + return -1; + } + if (block_ctx->len > 4096) { + printk(KERN_INFO + "btrfsic: read_block() with too huge size %d\n", + block_ctx->len); + return -1; + } + + block_ctx->bh = __bread(block_ctx->dev->bdev, + block_ctx->dev_bytenr >> 12, 4096); + if (NULL == block_ctx->bh) + return -1; + block_ctx->data = block_ctx->bh->b_data; + + return block_ctx->len; +} + +static void btrfsic_dump_database(struct btrfsic_state *state) +{ + struct list_head *elem_all; + + BUG_ON(NULL == state); + + printk(KERN_INFO "all_blocks_list:\n"); + list_for_each(elem_all, &state->all_blocks_list) { + const struct btrfsic_block *const b_all = + list_entry(elem_all, struct btrfsic_block, + all_blocks_node); + struct list_head *elem_ref_to; + struct list_head *elem_ref_from; + + printk(KERN_INFO "%c-block @%llu (%s/%llu/%d)\n", + btrfsic_get_block_type(state, b_all), + (unsigned long long)b_all->logical_bytenr, + b_all->dev_state->name, + (unsigned long long)b_all->dev_bytenr, + b_all->mirror_num); + + list_for_each(elem_ref_to, &b_all->ref_to_list) { + const struct btrfsic_block_link *const l = + list_entry(elem_ref_to, + struct btrfsic_block_link, + node_ref_to); + + printk(KERN_INFO " %c @%llu (%s/%llu/%d)" + " refers %u* to" + " %c @%llu (%s/%llu/%d)\n", + btrfsic_get_block_type(state, b_all), + (unsigned long long)b_all->logical_bytenr, + b_all->dev_state->name, + (unsigned long long)b_all->dev_bytenr, + b_all->mirror_num, + l->ref_cnt, + btrfsic_get_block_type(state, l->block_ref_to), + (unsigned long long) + l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->name, + (unsigned long long)l->block_ref_to->dev_bytenr, + l->block_ref_to->mirror_num); + } + + list_for_each(elem_ref_from, &b_all->ref_from_list) { + const struct btrfsic_block_link *const l = + list_entry(elem_ref_from, + struct btrfsic_block_link, + node_ref_from); + + printk(KERN_INFO " %c @%llu (%s/%llu/%d)" + " is ref %u* from" + " %c @%llu (%s/%llu/%d)\n", + btrfsic_get_block_type(state, b_all), + (unsigned long long)b_all->logical_bytenr, + b_all->dev_state->name, + (unsigned long long)b_all->dev_bytenr, + b_all->mirror_num, + l->ref_cnt, + btrfsic_get_block_type(state, l->block_ref_from), + (unsigned long long) + l->block_ref_from->logical_bytenr, + l->block_ref_from->dev_state->name, + (unsigned long long) + l->block_ref_from->dev_bytenr, + l->block_ref_from->mirror_num); + } + + printk(KERN_INFO "\n"); + } +} + +/* + * Test whether the disk block contains a tree block (leaf or node) + * (note that this test fails for the super block) + */ +static int btrfsic_test_for_metadata(struct btrfsic_state *state, + const u8 *data, unsigned int size) +{ + struct btrfs_header *h; + u8 csum[BTRFS_CSUM_SIZE]; + u32 crc = ~(u32)0; + int fail = 0; + int crc_fail = 0; + + h = (struct btrfs_header *)data; + + if (memcmp(h->fsid, state->root->fs_info->fsid, BTRFS_UUID_SIZE)) + fail++; + + crc = crc32c(crc, data + BTRFS_CSUM_SIZE, PAGE_SIZE - BTRFS_CSUM_SIZE); + btrfs_csum_final(crc, csum); + if (memcmp(csum, h->csum, state->csum_size)) + crc_fail++; + + return fail || crc_fail; +} + +static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, + u64 dev_bytenr, + u8 *mapped_data, unsigned int len, + struct bio *bio, + int *bio_is_patched, + struct buffer_head *bh, + int submit_bio_bh_rw) +{ + int is_metadata; + struct btrfsic_block *block; + struct btrfsic_block_data_ctx block_ctx; + int ret; + struct btrfsic_state *state = dev_state->state; + struct 
block_device *bdev = dev_state->bdev; + + WARN_ON(len > PAGE_SIZE); + is_metadata = (0 == btrfsic_test_for_metadata(state, mapped_data, len)); + if (NULL != bio_is_patched) + *bio_is_patched = 0; + + block = btrfsic_block_hashtable_lookup(bdev, dev_bytenr, + &state->block_hashtable); + if (NULL != block) { + u64 bytenr; + struct list_head *elem_ref_to; + struct list_head *tmp_ref_to; + + if (block->is_superblock) { + bytenr = le64_to_cpu(((struct btrfs_super_block *) + mapped_data)->bytenr); + is_metadata = 1; + if (state->print_mask & + BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE) { + printk(KERN_INFO + "[before new superblock is written]:\n"); + btrfsic_dump_tree_sub(state, block, 0); + } + } + if (is_metadata) { + if (!block->is_superblock) { + bytenr = le64_to_cpu(((struct btrfs_header *) + mapped_data)->bytenr); + btrfsic_cmp_log_and_dev_bytenr(state, bytenr, + dev_state, + dev_bytenr, + mapped_data); + } + if (block->logical_bytenr != bytenr) { + printk(KERN_INFO + "Written block @%llu (%s/%llu/%d)" + " found in hash table, %c," + " bytenr mismatch" + " (!= stored %llu).\n", + (unsigned long long)bytenr, + dev_state->name, + (unsigned long long)dev_bytenr, + block->mirror_num, + btrfsic_get_block_type(state, block), + (unsigned long long) + block->logical_bytenr); + block->logical_bytenr = bytenr; + } else if (state->print_mask & + BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "Written block @%llu (%s/%llu/%d)" + " found in hash table, %c.\n", + (unsigned long long)bytenr, + dev_state->name, + (unsigned long long)dev_bytenr, + block->mirror_num, + btrfsic_get_block_type(state, block)); + } else { + bytenr = block->logical_bytenr; + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "Written block @%llu (%s/%llu/%d)" + " found in hash table, %c.\n", + (unsigned long long)bytenr, + dev_state->name, + (unsigned long long)dev_bytenr, + block->mirror_num, + btrfsic_get_block_type(state, block)); + } + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "ref_to_list: %cE, ref_from_list: %cE\n", + list_empty(&block->ref_to_list) ? ' ' : '!', + list_empty(&block->ref_from_list) ? 
' ' : '!'); + if (btrfsic_is_block_ref_by_superblock(state, block, 0)) { + printk(KERN_INFO "btrfs: attempt to overwrite %c-block" + " @%llu (%s/%llu/%d), old(gen=%llu," + " objectid=%llu, type=%d, offset=%llu)," + " new(gen=%llu)," + " which is referenced by most recent superblock" + " (superblockgen=%llu)!\n", + btrfsic_get_block_type(state, block), + (unsigned long long)bytenr, + dev_state->name, + (unsigned long long)dev_bytenr, + block->mirror_num, + (unsigned long long)block->generation, + (unsigned long long) + le64_to_cpu(block->disk_key.objectid), + block->disk_key.type, + (unsigned long long) + le64_to_cpu(block->disk_key.offset), + (unsigned long long) + le64_to_cpu(((struct btrfs_header *) + mapped_data)->generation), + (unsigned long long) + state->max_superblock_generation); + btrfsic_dump_tree(state); + } + + if (!block->is_iodone && !block->never_written) { + printk(KERN_INFO "btrfs: attempt to overwrite %c-block" + " @%llu (%s/%llu/%d), oldgen=%llu, newgen=%llu," + " which is not yet iodone!\n", + btrfsic_get_block_type(state, block), + (unsigned long long)bytenr, + dev_state->name, + (unsigned long long)dev_bytenr, + block->mirror_num, + (unsigned long long)block->generation, + (unsigned long long) + le64_to_cpu(((struct btrfs_header *) + mapped_data)->generation)); + /* it would not be safe to go on */ + btrfsic_dump_tree(state); + return; + } + + /* + * Clear all references of this block. Do not free + * the block itself even if is not referenced anymore + * because it still carries valueable information + * like whether it was ever written and IO completed. + */ + list_for_each_safe(elem_ref_to, tmp_ref_to, + &block->ref_to_list) { + struct btrfsic_block_link *const l = + list_entry(elem_ref_to, + struct btrfsic_block_link, + node_ref_to); + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + btrfsic_print_rem_link(state, l); + l->ref_cnt--; + if (0 == l->ref_cnt) { + list_del(&l->node_ref_to); + list_del(&l->node_ref_from); + btrfsic_block_link_hashtable_remove(l); + btrfsic_block_link_free(l); + } + } + + if (block->is_superblock) + ret = btrfsic_map_superblock(state, bytenr, len, + bdev, &block_ctx); + else + ret = btrfsic_map_block(state, bytenr, len, + &block_ctx, 0); + if (ret) { + printk(KERN_INFO + "btrfsic: btrfsic_map_block(root @%llu)" + " failed!\n", (unsigned long long)bytenr); + return; + } + block_ctx.data = mapped_data; + /* the following is required in case of writes to mirrors, + * use the same that was used for the lookup */ + block_ctx.dev = dev_state; + block_ctx.dev_bytenr = dev_bytenr; + + if (is_metadata || state->include_extent_data) { + block->never_written = 0; + block->iodone_w_error = 0; + if (NULL != bio) { + block->is_iodone = 0; + BUG_ON(NULL == bio_is_patched); + if (!*bio_is_patched) { + block->orig_bio_bh_private = + bio->bi_private; + block->orig_bio_bh_end_io.bio = + bio->bi_end_io; + block->next_in_same_bio = NULL; + bio->bi_private = block; + bio->bi_end_io = btrfsic_bio_end_io; + *bio_is_patched = 1; + } else { + struct btrfsic_block *chained_block = + (struct btrfsic_block *) + bio->bi_private; + + BUG_ON(NULL == chained_block); + block->orig_bio_bh_private = + chained_block->orig_bio_bh_private; + block->orig_bio_bh_end_io.bio = + chained_block->orig_bio_bh_end_io. 
+ bio; + block->next_in_same_bio = chained_block; + bio->bi_private = block; + } + } else if (NULL != bh) { + block->is_iodone = 0; + block->orig_bio_bh_private = bh->b_private; + block->orig_bio_bh_end_io.bh = bh->b_end_io; + block->next_in_same_bio = NULL; + bh->b_private = block; + bh->b_end_io = btrfsic_bh_end_io; + } else { + block->is_iodone = 1; + block->orig_bio_bh_private = NULL; + block->orig_bio_bh_end_io.bio = NULL; + block->next_in_same_bio = NULL; + } + } + + block->flush_gen = dev_state->last_flush_gen + 1; + block->submit_bio_bh_rw = submit_bio_bh_rw; + if (is_metadata) { + block->logical_bytenr = bytenr; + block->is_metadata = 1; + if (block->is_superblock) { + ret = btrfsic_process_written_superblock( + state, + block, + (struct btrfs_super_block *) + mapped_data); + if (state->print_mask & + BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE) { + printk(KERN_INFO + "[after new superblock is written]:\n"); + btrfsic_dump_tree_sub(state, block, 0); + } + } else { + block->mirror_num = 0; /* unknown */ + ret = btrfsic_process_metablock( + state, + block, + &block_ctx, + (struct btrfs_header *) + block_ctx.data, + 0, 0); + } + if (ret) + printk(KERN_INFO + "btrfsic: btrfsic_process_metablock" + "(root @%llu) failed!\n", + (unsigned long long)dev_bytenr); + } else { + block->is_metadata = 0; + block->mirror_num = 0; /* unknown */ + block->generation = BTRFSIC_GENERATION_UNKNOWN; + if (!state->include_extent_data + && list_empty(&block->ref_from_list)) { + /* + * disk block is overwritten with extent + * data (not meta data) and we are configured + * to not include extent data: take the + * chance and free the block's memory + */ + btrfsic_block_hashtable_remove(block); + list_del(&block->all_blocks_node); + btrfsic_block_free(block); + } + } + btrfsic_release_block_ctx(&block_ctx); + } else { + /* block has not been found in hash table */ + u64 bytenr; + + if (!is_metadata) { + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO "Written block (%s/%llu/?)" + " !found in hash table, D.\n", + dev_state->name, + (unsigned long long)dev_bytenr); + if (!state->include_extent_data) + return; /* ignore that written D block */ + + /* this is getting ugly for the + * include_extent_data case... 
*/ + bytenr = 0; /* unknown */ + block_ctx.start = bytenr; + block_ctx.len = len; + block_ctx.bh = NULL; + } else { + bytenr = le64_to_cpu(((struct btrfs_header *) + mapped_data)->bytenr); + btrfsic_cmp_log_and_dev_bytenr(state, bytenr, dev_state, + dev_bytenr, + mapped_data); + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "Written block @%llu (%s/%llu/?)" + " !found in hash table, M.\n", + (unsigned long long)bytenr, + dev_state->name, + (unsigned long long)dev_bytenr); + + ret = btrfsic_map_block(state, bytenr, len, &block_ctx, + 0); + if (ret) { + printk(KERN_INFO + "btrfsic: btrfsic_map_block(root @%llu)" + " failed!\n", + (unsigned long long)dev_bytenr); + return; + } + } + block_ctx.data = mapped_data; + /* the following is required in case of writes to mirrors, + * use the same that was used for the lookup */ + block_ctx.dev = dev_state; + block_ctx.dev_bytenr = dev_bytenr; + + block = btrfsic_block_alloc(); + if (NULL == block) { + printk(KERN_INFO "btrfsic: error, kmalloc failed!\n"); + btrfsic_release_block_ctx(&block_ctx); + return; + } + block->dev_state = dev_state; + block->dev_bytenr = dev_bytenr; + block->logical_bytenr = bytenr; + block->is_metadata = is_metadata; + block->never_written = 0; + block->iodone_w_error = 0; + block->mirror_num = 0; /* unknown */ + block->flush_gen = dev_state->last_flush_gen + 1; + block->submit_bio_bh_rw = submit_bio_bh_rw; + if (NULL != bio) { + block->is_iodone = 0; + BUG_ON(NULL == bio_is_patched); + if (!*bio_is_patched) { + block->orig_bio_bh_private = bio->bi_private; + block->orig_bio_bh_end_io.bio = bio->bi_end_io; + block->next_in_same_bio = NULL; + bio->bi_private = block; + bio->bi_end_io = btrfsic_bio_end_io; + *bio_is_patched = 1; + } else { + struct btrfsic_block *chained_block = + (struct btrfsic_block *) + bio->bi_private; + + BUG_ON(NULL == chained_block); + block->orig_bio_bh_private = + chained_block->orig_bio_bh_private; + block->orig_bio_bh_end_io.bio = + chained_block->orig_bio_bh_end_io.bio; + block->next_in_same_bio = chained_block; + bio->bi_private = block; + } + } else if (NULL != bh) { + block->is_iodone = 0; + block->orig_bio_bh_private = bh->b_private; + block->orig_bio_bh_end_io.bh = bh->b_end_io; + block->next_in_same_bio = NULL; + bh->b_private = block; + bh->b_end_io = btrfsic_bh_end_io; + } else { + block->is_iodone = 1; + block->orig_bio_bh_private = NULL; + block->orig_bio_bh_end_io.bio = NULL; + block->next_in_same_bio = NULL; + } + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "New written %c-block @%llu (%s/%llu/%d)\n", + is_metadata ? 'M' : 'D', + (unsigned long long)block->logical_bytenr, + block->dev_state->name, + (unsigned long long)block->dev_bytenr, + block->mirror_num); + list_add(&block->all_blocks_node, &state->all_blocks_list); + btrfsic_block_hashtable_add(block, &state->block_hashtable); + + if (is_metadata) { + ret = btrfsic_process_metablock(state, block, + &block_ctx, + (struct btrfs_header *) + block_ctx.data, 0, 0); + if (ret) + printk(KERN_INFO + "btrfsic: process_metablock(root @%llu)" + " failed!\n", + (unsigned long long)dev_bytenr); + } + btrfsic_release_block_ctx(&block_ctx); + } +} + +static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status) +{ + struct btrfsic_block *block = (struct btrfsic_block *)bp->bi_private; + int iodone_w_error; + + /* mutex is not held! 
This is not save if IO is not yet completed + * on umount */ + iodone_w_error = 0; + if (bio_error_status) + iodone_w_error = 1; + + BUG_ON(NULL == block); + bp->bi_private = block->orig_bio_bh_private; + bp->bi_end_io = block->orig_bio_bh_end_io.bio; + + do { + struct btrfsic_block *next_block; + struct btrfsic_dev_state *const dev_state = block->dev_state; + + if ((dev_state->state->print_mask & + BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) + printk(KERN_INFO + "bio_end_io(err=%d) for %c @%llu (%s/%llu/%d)\n", + bio_error_status, + btrfsic_get_block_type(dev_state->state, block), + (unsigned long long)block->logical_bytenr, + dev_state->name, + (unsigned long long)block->dev_bytenr, + block->mirror_num); + next_block = block->next_in_same_bio; + block->iodone_w_error = iodone_w_error; + if (block->submit_bio_bh_rw & REQ_FLUSH) { + dev_state->last_flush_gen++; + if ((dev_state->state->print_mask & + BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) + printk(KERN_INFO + "bio_end_io() new %s flush_gen=%llu\n", + dev_state->name, + (unsigned long long) + dev_state->last_flush_gen); + } + if (block->submit_bio_bh_rw & REQ_FUA) + block->flush_gen = 0; /* FUA completed means block is + * on disk */ + block->is_iodone = 1; /* for FLUSH, this releases the block */ + block = next_block; + } while (NULL != block); + + bp->bi_end_io(bp, bio_error_status); +} + +static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate) +{ + struct btrfsic_block *block = (struct btrfsic_block *)bh->b_private; + int iodone_w_error = !uptodate; + struct btrfsic_dev_state *dev_state; + + BUG_ON(NULL == block); + dev_state = block->dev_state; + if ((dev_state->state->print_mask & BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) + printk(KERN_INFO + "bh_end_io(error=%d) for %c @%llu (%s/%llu/%d)\n", + iodone_w_error, + btrfsic_get_block_type(dev_state->state, block), + (unsigned long long)block->logical_bytenr, + block->dev_state->name, + (unsigned long long)block->dev_bytenr, + block->mirror_num); + + block->iodone_w_error = iodone_w_error; + if (block->submit_bio_bh_rw & REQ_FLUSH) { + dev_state->last_flush_gen++; + if ((dev_state->state->print_mask & + BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) + printk(KERN_INFO + "bh_end_io() new %s flush_gen=%llu\n", + dev_state->name, + (unsigned long long)dev_state->last_flush_gen); + } + if (block->submit_bio_bh_rw & REQ_FUA) + block->flush_gen = 0; /* FUA completed means block is on disk */ + + bh->b_private = block->orig_bio_bh_private; + bh->b_end_io = block->orig_bio_bh_end_io.bh; + block->is_iodone = 1; /* for FLUSH, this releases the block */ + bh->b_end_io(bh, uptodate); +} + +static int btrfsic_process_written_superblock( + struct btrfsic_state *state, + struct btrfsic_block *const superblock, + struct btrfs_super_block *const super_hdr) +{ + int pass; + + superblock->generation = btrfs_super_generation(super_hdr); + if (!(superblock->generation > state->max_superblock_generation || + 0 == state->max_superblock_generation)) { + if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE) + printk(KERN_INFO + "btrfsic: superblock @%llu (%s/%llu/%d)" + " with old gen %llu <= %llu\n", + (unsigned long long)superblock->logical_bytenr, + superblock->dev_state->name, + (unsigned long long)superblock->dev_bytenr, + superblock->mirror_num, + (unsigned long long) + btrfs_super_generation(super_hdr), + (unsigned long long) + state->max_superblock_generation); + } else { + if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE) + printk(KERN_INFO + "btrfsic: got new superblock @%llu (%s/%llu/%d)" + " with new gen 
%llu > %llu\n", + (unsigned long long)superblock->logical_bytenr, + superblock->dev_state->name, + (unsigned long long)superblock->dev_bytenr, + superblock->mirror_num, + (unsigned long long) + btrfs_super_generation(super_hdr), + (unsigned long long) + state->max_superblock_generation); + + state->max_superblock_generation = + btrfs_super_generation(super_hdr); + state->latest_superblock = superblock; + } + + for (pass = 0; pass < 3; pass++) { + int ret; + u64 next_bytenr; + struct btrfsic_block *next_block; + struct btrfsic_block_data_ctx tmp_next_block_ctx; + struct btrfsic_block_link *l; + int num_copies; + int mirror_num; + const char *additional_string = NULL; + struct btrfs_disk_key tmp_disk_key; + + tmp_disk_key.type = BTRFS_ROOT_ITEM_KEY; + tmp_disk_key.offset = 0; + + switch (pass) { + case 0: + tmp_disk_key.objectid = + cpu_to_le64(BTRFS_ROOT_TREE_OBJECTID); + additional_string = "root "; + next_bytenr = btrfs_super_root(super_hdr); + if (state->print_mask & + BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) + printk(KERN_INFO "root@%llu\n", + (unsigned long long)next_bytenr); + break; + case 1: + tmp_disk_key.objectid = + cpu_to_le64(BTRFS_CHUNK_TREE_OBJECTID); + additional_string = "chunk "; + next_bytenr = btrfs_super_chunk_root(super_hdr); + if (state->print_mask & + BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) + printk(KERN_INFO "chunk@%llu\n", + (unsigned long long)next_bytenr); + break; + case 2: + tmp_disk_key.objectid = + cpu_to_le64(BTRFS_TREE_LOG_OBJECTID); + additional_string = "log "; + next_bytenr = btrfs_super_log_root(super_hdr); + if (0 == next_bytenr) + continue; + if (state->print_mask & + BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) + printk(KERN_INFO "log@%llu\n", + (unsigned long long)next_bytenr); + break; + } + + num_copies = + btrfs_num_copies(&state->root->fs_info->mapping_tree, + next_bytenr, PAGE_SIZE); + if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) + printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", + (unsigned long long)next_bytenr, num_copies); + for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { + int was_created; + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "btrfsic_process_written_superblock(" + "mirror_num=%d)\n", mirror_num); + ret = btrfsic_map_block(state, next_bytenr, PAGE_SIZE, + &tmp_next_block_ctx, + mirror_num); + if (ret) { + printk(KERN_INFO + "btrfsic: btrfsic_map_block(@%llu," + " mirror=%d) failed!\n", + (unsigned long long)next_bytenr, + mirror_num); + return -1; + } + + next_block = btrfsic_block_lookup_or_add( + state, + &tmp_next_block_ctx, + additional_string, + 1, 0, 1, + mirror_num, + &was_created); + if (NULL == next_block) { + printk(KERN_INFO + "btrfsic: error, kmalloc failed!\n"); + btrfsic_release_block_ctx(&tmp_next_block_ctx); + return -1; + } + + next_block->disk_key = tmp_disk_key; + if (was_created) + next_block->generation = + BTRFSIC_GENERATION_UNKNOWN; + l = btrfsic_block_link_lookup_or_add( + state, + &tmp_next_block_ctx, + next_block, + superblock, + BTRFSIC_GENERATION_UNKNOWN); + btrfsic_release_block_ctx(&tmp_next_block_ctx); + if (NULL == l) + return -1; + } + } + + if (-1 == btrfsic_check_all_ref_blocks(state, superblock, 0)) { + WARN_ON(1); + btrfsic_dump_tree(state); + } + + return 0; +} + +static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state, + struct btrfsic_block *const block, + int recursion_level) +{ + struct list_head *elem_ref_to; + int ret = 0; + + if (recursion_level >= 3 + BTRFS_MAX_LEVEL) { + /* + * Note that 
this situation can happen and does not + * indicate an error in regular cases. It happens + * when disk blocks are freed and later reused. + * The check-integrity module is not aware of any + * block free operations, it just recognizes block + * write operations. Therefore it keeps the linkage + * information for a block until a block is + * rewritten. This can temporarily cause incorrect + * and even circular linkage informations. This + * causes no harm unless such blocks are referenced + * by the most recent super block. + */ + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "btrfsic: abort cyclic linkage (case 1).\n"); + + return ret; + } + + /* + * This algorithm is recursive because the amount of used stack + * space is very small and the max recursion depth is limited. + */ + list_for_each(elem_ref_to, &block->ref_to_list) { + const struct btrfsic_block_link *const l = + list_entry(elem_ref_to, struct btrfsic_block_link, + node_ref_to); + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "rl=%d, %c @%llu (%s/%llu/%d)" + " %u* refers to %c @%llu (%s/%llu/%d)\n", + recursion_level, + btrfsic_get_block_type(state, block), + (unsigned long long)block->logical_bytenr, + block->dev_state->name, + (unsigned long long)block->dev_bytenr, + block->mirror_num, + l->ref_cnt, + btrfsic_get_block_type(state, l->block_ref_to), + (unsigned long long) + l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->name, + (unsigned long long)l->block_ref_to->dev_bytenr, + l->block_ref_to->mirror_num); + if (l->block_ref_to->never_written) { + printk(KERN_INFO "btrfs: attempt to write superblock" + " which references block %c @%llu (%s/%llu/%d)" + " which is never written!\n", + btrfsic_get_block_type(state, l->block_ref_to), + (unsigned long long) + l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->name, + (unsigned long long)l->block_ref_to->dev_bytenr, + l->block_ref_to->mirror_num); + ret = -1; + } else if (!l->block_ref_to->is_iodone) { + printk(KERN_INFO "btrfs: attempt to write superblock" + " which references block %c @%llu (%s/%llu/%d)" + " which is not yet iodone!\n", + btrfsic_get_block_type(state, l->block_ref_to), + (unsigned long long) + l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->name, + (unsigned long long)l->block_ref_to->dev_bytenr, + l->block_ref_to->mirror_num); + ret = -1; + } else if (l->parent_generation != + l->block_ref_to->generation && + BTRFSIC_GENERATION_UNKNOWN != + l->parent_generation && + BTRFSIC_GENERATION_UNKNOWN != + l->block_ref_to->generation) { + printk(KERN_INFO "btrfs: attempt to write superblock" + " which references block %c @%llu (%s/%llu/%d)" + " with generation %llu !=" + " parent generation %llu!\n", + btrfsic_get_block_type(state, l->block_ref_to), + (unsigned long long) + l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->name, + (unsigned long long)l->block_ref_to->dev_bytenr, + l->block_ref_to->mirror_num, + (unsigned long long)l->block_ref_to->generation, + (unsigned long long)l->parent_generation); + ret = -1; + } else if (l->block_ref_to->flush_gen > + l->block_ref_to->dev_state->last_flush_gen) { + printk(KERN_INFO "btrfs: attempt to write superblock" + " which references block %c @%llu (%s/%llu/%d)" + " which is not flushed out of disk's write cache" + " (block flush_gen=%llu," + " dev->flush_gen=%llu)!\n", + btrfsic_get_block_type(state, l->block_ref_to), + (unsigned long long) + l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->name, + 
(unsigned long long)l->block_ref_to->dev_bytenr, + l->block_ref_to->mirror_num, + (unsigned long long)block->flush_gen, + (unsigned long long) + l->block_ref_to->dev_state->last_flush_gen); + ret = -1; + } else if (-1 == btrfsic_check_all_ref_blocks(state, + l->block_ref_to, + recursion_level + + 1)) { + ret = -1; + } + } + + return ret; +} + +static int btrfsic_is_block_ref_by_superblock( + const struct btrfsic_state *state, + const struct btrfsic_block *block, + int recursion_level) +{ + struct list_head *elem_ref_from; + + if (recursion_level >= 3 + BTRFS_MAX_LEVEL) { + /* refer to comment at "abort cyclic linkage (case 1)" */ + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "btrfsic: abort cyclic linkage (case 2).\n"); + + return 0; + } + + /* + * This algorithm is recursive because the amount of used stack space + * is very small and the max recursion depth is limited. + */ + list_for_each(elem_ref_from, &block->ref_from_list) { + const struct btrfsic_block_link *const l = + list_entry(elem_ref_from, struct btrfsic_block_link, + node_ref_from); + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "rl=%d, %c @%llu (%s/%llu/%d)" + " is ref %u* from %c @%llu (%s/%llu/%d)\n", + recursion_level, + btrfsic_get_block_type(state, block), + (unsigned long long)block->logical_bytenr, + block->dev_state->name, + (unsigned long long)block->dev_bytenr, + block->mirror_num, + l->ref_cnt, + btrfsic_get_block_type(state, l->block_ref_from), + (unsigned long long) + l->block_ref_from->logical_bytenr, + l->block_ref_from->dev_state->name, + (unsigned long long) + l->block_ref_from->dev_bytenr, + l->block_ref_from->mirror_num); + if (l->block_ref_from->is_superblock && + state->latest_superblock->dev_bytenr == + l->block_ref_from->dev_bytenr && + state->latest_superblock->dev_state->bdev == + l->block_ref_from->dev_state->bdev) + return 1; + else if (btrfsic_is_block_ref_by_superblock(state, + l->block_ref_from, + recursion_level + + 1)) + return 1; + } + + return 0; +} + +static void btrfsic_print_add_link(const struct btrfsic_state *state, + const struct btrfsic_block_link *l) +{ + printk(KERN_INFO + "Add %u* link from %c @%llu (%s/%llu/%d)" + " to %c @%llu (%s/%llu/%d).\n", + l->ref_cnt, + btrfsic_get_block_type(state, l->block_ref_from), + (unsigned long long)l->block_ref_from->logical_bytenr, + l->block_ref_from->dev_state->name, + (unsigned long long)l->block_ref_from->dev_bytenr, + l->block_ref_from->mirror_num, + btrfsic_get_block_type(state, l->block_ref_to), + (unsigned long long)l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->name, + (unsigned long long)l->block_ref_to->dev_bytenr, + l->block_ref_to->mirror_num); +} + +static void btrfsic_print_rem_link(const struct btrfsic_state *state, + const struct btrfsic_block_link *l) +{ + printk(KERN_INFO + "Rem %u* link from %c @%llu (%s/%llu/%d)" + " to %c @%llu (%s/%llu/%d).\n", + l->ref_cnt, + btrfsic_get_block_type(state, l->block_ref_from), + (unsigned long long)l->block_ref_from->logical_bytenr, + l->block_ref_from->dev_state->name, + (unsigned long long)l->block_ref_from->dev_bytenr, + l->block_ref_from->mirror_num, + btrfsic_get_block_type(state, l->block_ref_to), + (unsigned long long)l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->name, + (unsigned long long)l->block_ref_to->dev_bytenr, + l->block_ref_to->mirror_num); +} + +static char btrfsic_get_block_type(const struct btrfsic_state *state, + const struct btrfsic_block *block) +{ + if (block->is_superblock && 
+ state->latest_superblock->dev_bytenr == block->dev_bytenr && + state->latest_superblock->dev_state->bdev == block->dev_state->bdev) + return 'S'; + else if (block->is_superblock) + return 's'; + else if (block->is_metadata) + return 'M'; + else + return 'D'; +} + +static void btrfsic_dump_tree(const struct btrfsic_state *state) +{ + btrfsic_dump_tree_sub(state, state->latest_superblock, 0); +} + +static void btrfsic_dump_tree_sub(const struct btrfsic_state *state, + const struct btrfsic_block *block, + int indent_level) +{ + struct list_head *elem_ref_to; + int indent_add; + static char buf[80]; + int cursor_position; + + /* + * Should better fill an on-stack buffer with a complete line and + * dump it at once when it is time to print a newline character. + */ + + /* + * This algorithm is recursive because the amount of used stack space + * is very small and the max recursion depth is limited. + */ + indent_add = sprintf(buf, "%c-%llu(%s/%llu/%d)", + btrfsic_get_block_type(state, block), + (unsigned long long)block->logical_bytenr, + block->dev_state->name, + (unsigned long long)block->dev_bytenr, + block->mirror_num); + if (indent_level + indent_add > BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) { + printk("[...]\n"); + return; + } + printk(buf); + indent_level += indent_add; + if (list_empty(&block->ref_to_list)) { + printk("\n"); + return; + } + if (block->mirror_num > 1 && + !(state->print_mask & BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS)) { + printk(" [...]\n"); + return; + } + + cursor_position = indent_level; + list_for_each(elem_ref_to, &block->ref_to_list) { + const struct btrfsic_block_link *const l = + list_entry(elem_ref_to, struct btrfsic_block_link, + node_ref_to); + + while (cursor_position < indent_level) { + printk(" "); + cursor_position++; + } + if (l->ref_cnt > 1) + indent_add = sprintf(buf, " %d*--> ", l->ref_cnt); + else + indent_add = sprintf(buf, " --> "); + if (indent_level + indent_add > + BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) { + printk("[...]\n"); + cursor_position = 0; + continue; + } + + printk(buf); + + btrfsic_dump_tree_sub(state, l->block_ref_to, + indent_level + indent_add); + cursor_position = 0; + } +} + +static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add( + struct btrfsic_state *state, + struct btrfsic_block_data_ctx *next_block_ctx, + struct btrfsic_block *next_block, + struct btrfsic_block *from_block, + u64 parent_generation) +{ + struct btrfsic_block_link *l; + + l = btrfsic_block_link_hashtable_lookup(next_block_ctx->dev->bdev, + next_block_ctx->dev_bytenr, + from_block->dev_state->bdev, + from_block->dev_bytenr, + &state->block_link_hashtable); + if (NULL == l) { + l = btrfsic_block_link_alloc(); + if (NULL == l) { + printk(KERN_INFO + "btrfsic: error, kmalloc" " failed!\n"); + return NULL; + } + + l->block_ref_to = next_block; + l->block_ref_from = from_block; + l->ref_cnt = 1; + l->parent_generation = parent_generation; + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + btrfsic_print_add_link(state, l); + + list_add(&l->node_ref_to, &from_block->ref_to_list); + list_add(&l->node_ref_from, &next_block->ref_from_list); + + btrfsic_block_link_hashtable_add(l, + &state->block_link_hashtable); + } else { + l->ref_cnt++; + l->parent_generation = parent_generation; + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + btrfsic_print_add_link(state, l); + } + + return l; +} + +static struct btrfsic_block *btrfsic_block_lookup_or_add( + struct btrfsic_state *state, + struct btrfsic_block_data_ctx *block_ctx, + const char *additional_string, 
+ int is_metadata, + int is_iodone, + int never_written, + int mirror_num, + int *was_created) +{ + struct btrfsic_block *block; + + block = btrfsic_block_hashtable_lookup(block_ctx->dev->bdev, + block_ctx->dev_bytenr, + &state->block_hashtable); + if (NULL == block) { + struct btrfsic_dev_state *dev_state; + + block = btrfsic_block_alloc(); + if (NULL == block) { + printk(KERN_INFO "btrfsic: error, kmalloc failed!\n"); + return NULL; + } + dev_state = btrfsic_dev_state_lookup(block_ctx->dev->bdev); + if (NULL == dev_state) { + printk(KERN_INFO + "btrfsic: error, lookup dev_state failed!\n"); + btrfsic_block_free(block); + return NULL; + } + block->dev_state = dev_state; + block->dev_bytenr = block_ctx->dev_bytenr; + block->logical_bytenr = block_ctx->start; + block->is_metadata = is_metadata; + block->is_iodone = is_iodone; + block->never_written = never_written; + block->mirror_num = mirror_num; + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "New %s%c-block @%llu (%s/%llu/%d)\n", + additional_string, + btrfsic_get_block_type(state, block), + (unsigned long long)block->logical_bytenr, + dev_state->name, + (unsigned long long)block->dev_bytenr, + mirror_num); + list_add(&block->all_blocks_node, &state->all_blocks_list); + btrfsic_block_hashtable_add(block, &state->block_hashtable); + if (NULL != was_created) + *was_created = 1; + } else { + if (NULL != was_created) + *was_created = 0; + } + + return block; +} + +static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, + u64 bytenr, + struct btrfsic_dev_state *dev_state, + u64 dev_bytenr, char *data) +{ + int num_copies; + int mirror_num; + int ret; + struct btrfsic_block_data_ctx block_ctx; + int match = 0; + + num_copies = btrfs_num_copies(&state->root->fs_info->mapping_tree, + bytenr, PAGE_SIZE); + + for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { + ret = btrfsic_map_block(state, bytenr, PAGE_SIZE, + &block_ctx, mirror_num); + if (ret) { + printk(KERN_INFO "btrfsic:" + " btrfsic_map_block(logical @%llu," + " mirror %d) failed!\n", + (unsigned long long)bytenr, mirror_num); + continue; + } + + if (dev_state->bdev == block_ctx.dev->bdev && + dev_bytenr == block_ctx.dev_bytenr) { + match++; + btrfsic_release_block_ctx(&block_ctx); + break; + } + btrfsic_release_block_ctx(&block_ctx); + } + + if (!match) { + printk(KERN_INFO "btrfs: attempt to write M-block which contains logical bytenr that doesn't map to dev+physical bytenr of submit_bio," + " buffer->log_bytenr=%llu, submit_bio(bdev=%s," + " phys_bytenr=%llu)!\n", + (unsigned long long)bytenr, dev_state->name, + (unsigned long long)dev_bytenr); + for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { + ret = btrfsic_map_block(state, bytenr, PAGE_SIZE, + &block_ctx, mirror_num); + if (ret) + continue; + + printk(KERN_INFO "Read logical bytenr @%llu maps to" + " (%s/%llu/%d)\n", + (unsigned long long)bytenr, + block_ctx.dev->name, + (unsigned long long)block_ctx.dev_bytenr, + mirror_num); + } + WARN_ON(1); + } +} + +static struct btrfsic_dev_state *btrfsic_dev_state_lookup( + struct block_device *bdev) +{ + struct btrfsic_dev_state *ds; + + ds = btrfsic_dev_state_hashtable_lookup(bdev, + &btrfsic_dev_state_hashtable); + return ds; +} + +int btrfsic_submit_bh(int rw, struct buffer_head *bh) +{ + struct btrfsic_dev_state *dev_state; + + if (!btrfsic_is_initialized) + return submit_bh(rw, bh); + + mutex_lock(&btrfsic_mutex); + /* since btrfsic_submit_bh() might also be called before + * btrfsic_mount(), this might return NULL */ 
+ dev_state = btrfsic_dev_state_lookup(bh->b_bdev); + + /* Only called to write the superblock (incl. FLUSH/FUA) */ + if (NULL != dev_state && + (rw & WRITE) && bh->b_size > 0) { + u64 dev_bytenr; + + dev_bytenr = 4096 * bh->b_blocknr; + if (dev_state->state->print_mask & + BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) + printk(KERN_INFO + "submit_bh(rw=0x%x, blocknr=%lu (bytenr %llu)," + " size=%lu, data=%p, bdev=%p)\n", + rw, bh->b_blocknr, + (unsigned long long)dev_bytenr, bh->b_size, + bh->b_data, bh->b_bdev); + btrfsic_process_written_block(dev_state, dev_bytenr, + bh->b_data, bh->b_size, NULL, + NULL, bh, rw); + } else if (NULL != dev_state && (rw & REQ_FLUSH)) { + if (dev_state->state->print_mask & + BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) + printk(KERN_INFO + "submit_bh(rw=0x%x) FLUSH, bdev=%p)\n", + rw, bh->b_bdev); + if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) { + if ((dev_state->state->print_mask & + (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | + BTRFSIC_PRINT_MASK_VERBOSE))) + printk(KERN_INFO + "btrfsic_submit_bh(%s) with FLUSH" + " but dummy block already in use" + " (ignored)!\n", + dev_state->name); + } else { + struct btrfsic_block *const block = + &dev_state->dummy_block_for_bio_bh_flush; + + block->is_iodone = 0; + block->never_written = 0; + block->iodone_w_error = 0; + block->flush_gen = dev_state->last_flush_gen + 1; + block->submit_bio_bh_rw = rw; + block->orig_bio_bh_private = bh->b_private; + block->orig_bio_bh_end_io.bh = bh->b_end_io; + block->next_in_same_bio = NULL; + bh->b_private = block; + bh->b_end_io = btrfsic_bh_end_io; + } + } + mutex_unlock(&btrfsic_mutex); + return submit_bh(rw, bh); +} + +void btrfsic_submit_bio(int rw, struct bio *bio) +{ + struct btrfsic_dev_state *dev_state; + + if (!btrfsic_is_initialized) { + submit_bio(rw, bio); + return; + } + + mutex_lock(&btrfsic_mutex); + /* since btrfsic_submit_bio() is also called before + * btrfsic_mount(), this might return NULL */ + dev_state = btrfsic_dev_state_lookup(bio->bi_bdev); + if (NULL != dev_state && + (rw & WRITE) && NULL != bio->bi_io_vec) { + unsigned int i; + u64 dev_bytenr; + int bio_is_patched; + + dev_bytenr = 512 * bio->bi_sector; + bio_is_patched = 0; + if (dev_state->state->print_mask & + BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) + printk(KERN_INFO + "submit_bio(rw=0x%x, bi_vcnt=%u," + " bi_sector=%lu (bytenr %llu), bi_bdev=%p)\n", + rw, bio->bi_vcnt, bio->bi_sector, + (unsigned long long)dev_bytenr, + bio->bi_bdev); + + for (i = 0; i < bio->bi_vcnt; i++) { + u8 *mapped_data; + + mapped_data = kmap(bio->bi_io_vec[i].bv_page); + if ((BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | + BTRFSIC_PRINT_MASK_VERBOSE) == + (dev_state->state->print_mask & + (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | + BTRFSIC_PRINT_MASK_VERBOSE))) + printk(KERN_INFO + "#%u: page=%p, mapped=%p, len=%u," + " offset=%u\n", + i, bio->bi_io_vec[i].bv_page, + mapped_data, + bio->bi_io_vec[i].bv_len, + bio->bi_io_vec[i].bv_offset); + btrfsic_process_written_block(dev_state, dev_bytenr, + mapped_data, + bio->bi_io_vec[i].bv_len, + bio, &bio_is_patched, + NULL, rw); + kunmap(bio->bi_io_vec[i].bv_page); + dev_bytenr += bio->bi_io_vec[i].bv_len; + } + } else if (NULL != dev_state && (rw & REQ_FLUSH)) { + if (dev_state->state->print_mask & + BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) + printk(KERN_INFO + "submit_bio(rw=0x%x) FLUSH, bdev=%p)\n", + rw, bio->bi_bdev); + if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) { + if ((dev_state->state->print_mask & + (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | + BTRFSIC_PRINT_MASK_VERBOSE))) + printk(KERN_INFO + 
"btrfsic_submit_bio(%s) with FLUSH" + " but dummy block already in use" + " (ignored)!\n", + dev_state->name); + } else { + struct btrfsic_block *const block = + &dev_state->dummy_block_for_bio_bh_flush; + + block->is_iodone = 0; + block->never_written = 0; + block->iodone_w_error = 0; + block->flush_gen = dev_state->last_flush_gen + 1; + block->submit_bio_bh_rw = rw; + block->orig_bio_bh_private = bio->bi_private; + block->orig_bio_bh_end_io.bio = bio->bi_end_io; + block->next_in_same_bio = NULL; + bio->bi_private = block; + bio->bi_end_io = btrfsic_bio_end_io; + } + } + mutex_unlock(&btrfsic_mutex); + + submit_bio(rw, bio); +} + +int btrfsic_mount(struct btrfs_root *root, + struct btrfs_fs_devices *fs_devices, + int including_extent_data, u32 print_mask) +{ + int ret; + struct btrfsic_state *state; + struct list_head *dev_head = &fs_devices->devices; + struct btrfs_device *device; + + state = kzalloc(sizeof(*state), GFP_NOFS); + if (NULL == state) { + printk(KERN_INFO "btrfs check-integrity: kmalloc() failed!\n"); + return -1; + } + + if (!btrfsic_is_initialized) { + mutex_init(&btrfsic_mutex); + btrfsic_dev_state_hashtable_init(&btrfsic_dev_state_hashtable); + btrfsic_is_initialized = 1; + } + mutex_lock(&btrfsic_mutex); + state->root = root; + state->print_mask = print_mask; + state->include_extent_data = including_extent_data; + state->csum_size = 0; + INIT_LIST_HEAD(&state->all_blocks_list); + btrfsic_block_hashtable_init(&state->block_hashtable); + btrfsic_block_link_hashtable_init(&state->block_link_hashtable); + state->max_superblock_generation = 0; + state->latest_superblock = NULL; + + list_for_each_entry(device, dev_head, dev_list) { + struct btrfsic_dev_state *ds; + char *p; + + if (!device->bdev || !device->name) + continue; + + ds = btrfsic_dev_state_alloc(); + if (NULL == ds) { + printk(KERN_INFO + "btrfs check-integrity: kmalloc() failed!\n"); + mutex_unlock(&btrfsic_mutex); + return -1; + } + ds->bdev = device->bdev; + ds->state = state; + bdevname(ds->bdev, ds->name); + ds->name[BDEVNAME_SIZE - 1] = '\0'; + for (p = ds->name; *p != '\0'; p++); + while (p > ds->name && *p != '/') + p--; + if (*p == '/') + p++; + strlcpy(ds->name, p, sizeof(ds->name)); + btrfsic_dev_state_hashtable_add(ds, + &btrfsic_dev_state_hashtable); + } + + ret = btrfsic_process_superblock(state, fs_devices); + if (0 != ret) { + mutex_unlock(&btrfsic_mutex); + btrfsic_unmount(root, fs_devices); + return ret; + } + + if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_DATABASE) + btrfsic_dump_database(state); + if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_TREE) + btrfsic_dump_tree(state); + + mutex_unlock(&btrfsic_mutex); + return 0; +} + +void btrfsic_unmount(struct btrfs_root *root, + struct btrfs_fs_devices *fs_devices) +{ + struct list_head *elem_all; + struct list_head *tmp_all; + struct btrfsic_state *state; + struct list_head *dev_head = &fs_devices->devices; + struct btrfs_device *device; + + if (!btrfsic_is_initialized) + return; + + mutex_lock(&btrfsic_mutex); + + state = NULL; + list_for_each_entry(device, dev_head, dev_list) { + struct btrfsic_dev_state *ds; + + if (!device->bdev || !device->name) + continue; + + ds = btrfsic_dev_state_hashtable_lookup( + device->bdev, + &btrfsic_dev_state_hashtable); + if (NULL != ds) { + state = ds->state; + btrfsic_dev_state_hashtable_remove(ds); + btrfsic_dev_state_free(ds); + } + } + + if (NULL == state) { + printk(KERN_INFO + "btrfsic: error, cannot find state information" + " on umount!\n"); + mutex_unlock(&btrfsic_mutex); + return; + } + + /* 
+ * Don't care about keeping the lists' state up to date, + * just free all memory that was allocated dynamically. + * Free the blocks and the block_links. + */ + list_for_each_safe(elem_all, tmp_all, &state->all_blocks_list) { + struct btrfsic_block *const b_all = + list_entry(elem_all, struct btrfsic_block, + all_blocks_node); + struct list_head *elem_ref_to; + struct list_head *tmp_ref_to; + + list_for_each_safe(elem_ref_to, tmp_ref_to, + &b_all->ref_to_list) { + struct btrfsic_block_link *const l = + list_entry(elem_ref_to, + struct btrfsic_block_link, + node_ref_to); + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + btrfsic_print_rem_link(state, l); + + l->ref_cnt--; + if (0 == l->ref_cnt) + btrfsic_block_link_free(l); + } + + if (b_all->is_iodone) + btrfsic_block_free(b_all); + else + printk(KERN_INFO "btrfs: attempt to free %c-block" + " @%llu (%s/%llu/%d) on umount which is" + " not yet iodone!\n", + btrfsic_get_block_type(state, b_all), + (unsigned long long)b_all->logical_bytenr, + b_all->dev_state->name, + (unsigned long long)b_all->dev_bytenr, + b_all->mirror_num); + } + + mutex_unlock(&btrfsic_mutex); + + kfree(state); +} diff --git a/fs/btrfs/check-integrity.h b/fs/btrfs/check-integrity.h new file mode 100644 index 00000000000..8b59175cc50 --- /dev/null +++ b/fs/btrfs/check-integrity.h @@ -0,0 +1,36 @@ +/* + * Copyright (C) STRATO AG 2011. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#if !defined(__BTRFS_CHECK_INTEGRITY__) +#define __BTRFS_CHECK_INTEGRITY__ + +#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY +int btrfsic_submit_bh(int rw, struct buffer_head *bh); +void btrfsic_submit_bio(int rw, struct bio *bio); +#else +#define btrfsic_submit_bh submit_bh +#define btrfsic_submit_bio submit_bio +#endif + +int btrfsic_mount(struct btrfs_root *root, + struct btrfs_fs_devices *fs_devices, + int including_extent_data, u32 print_mask); +void btrfsic_unmount(struct btrfs_root *root, + struct btrfs_fs_devices *fs_devices); + +#endif -- cgit v1.2.3 From c975dd469d748ce619c510050d4fb407c2398591 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Tue, 1 Nov 2011 17:06:04 +0100 Subject: Btrfs: add config option to enable btrfs integrity check Added the BTRFS_FS_CHECK_INTEGRITY option to Kconfig. It depends on BTRFS_FS. Signed-off-by: Stefan Behrens --- fs/btrfs/Kconfig | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index ecb9fd3be14..d33f01c08b6 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -31,3 +31,22 @@ config BTRFS_FS_POSIX_ACL Linux website . If you don't know what Access Control Lists are, say N + +config BTRFS_FS_CHECK_INTEGRITY + bool "Btrfs with integrity check tool compiled in (DANGEROUS)" + depends on BTRFS_FS + help + Adds code that examines all block write requests (including + writes of the super block). 
The goal is to verify that the + state of the filesystem on disk is always consistent, i.e., + after a power-loss or kernel panic event the filesystem is + in a consistent state. + + If the integrity check tool is included and activated in + the mount options, plenty of kernel memory is used, and + plenty of additional CPU cycles are spent. Enabling this + functionality is not intended for normal use. + + In most cases, unless you are a btrfs developer who needs + to verify the integrity of (super)-block write requests + during the run of a regression test, say N -- cgit v1.2.3 From f11e4d7f533249ddfa110116200c5c3a509f9218 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Tue, 1 Nov 2011 17:06:39 +0100 Subject: Btrfs: Makefile changes to optionally include btrfs integrity check If the btrfs integrity check is enabled, the files required to implement the checks are included in the build. Signed-off-by: Stefan Behrens --- fs/btrfs/Makefile | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index c0ddfd29c5e..bc5b3556cee 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -11,3 +11,4 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ reada.o backref.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o +btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o -- cgit v1.2.3 From 21adbd5cbb5344a3fca6bb7ddb2ab6cb03c44546 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Wed, 9 Nov 2011 13:44:05 +0100 Subject: Btrfs: integrate integrity check module into btrfs This is the last part of the patch series. It modifies the btrfs code to use the integrity check module if configured to do so with the define BTRFS_FS_CHECK_INTEGRITY. If this define is not set, the only effective change is that code is added that handles the mount option to activate the integrity check. If the mount option is set and the define BTRFS_FS_CHECK_INTEGRITY is not set, that code complains in the log and the mount fails with EINVAL. Add the mount option to activate the usage of the integrity check code. Add invocation of btrfs integrity check code init and cleanup function on mount and umount, respectively. Add hook to call btrfs integrity check code version of submit_bh/submit_bio. 
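A minimal sketch of the call-site pattern described above (the helper below is hypothetical and not part of this patch; the real conversions appear in the diff that follows). Because check-integrity.h falls back to plain submit_bh()/submit_bio() when CONFIG_BTRFS_FS_CHECK_INTEGRITY is not set, a write path can use the wrappers unconditionally:

    /*
     * Hypothetical example, assuming the check-integrity.h added above:
     * with the config option enabled and "check_int" in the mount options
     * this write is recorded by the integrity checker; otherwise the call
     * compiles down to an ordinary submit_bh().
     */
    #include <linux/buffer_head.h>
    #include "check-integrity.h"

    static int example_write_super_copy(struct buffer_head *bh)
    {
            return btrfsic_submit_bh(WRITE_FUA, bh);
    }

The same applies to the bio-based paths via btrfsic_submit_bio(), as the changes to disk-io.c, extent_io.c, scrub.c and volumes.c below show.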
Signed-off-by: Stefan Behrens --- fs/btrfs/ctree.h | 8 +++++++- fs/btrfs/disk-io.c | 26 ++++++++++++++++++++++++-- fs/btrfs/extent_io.c | 5 +++-- fs/btrfs/scrub.c | 5 +++-- fs/btrfs/super.c | 39 ++++++++++++++++++++++++++++++++++++++- fs/btrfs/volumes.c | 7 ++++--- 6 files changed, 79 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 67385033323..39f6188688e 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -971,7 +971,7 @@ struct btrfs_fs_info { * is required instead of the faster short fsync log commits */ u64 last_trans_log_full_commit; - unsigned long mount_opt:20; + unsigned long mount_opt:21; unsigned long compress_type:4; u64 max_inline; u64 alloc_start; @@ -1155,6 +1155,10 @@ struct btrfs_fs_info { int scrub_workers_refcnt; struct btrfs_workers scrub_workers; +#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY + u32 check_integrity_print_mask; +#endif + /* filesystem state */ u64 fs_state; @@ -1413,6 +1417,8 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16) #define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17) #define BTRFS_MOUNT_RECOVERY (1 << 18) +#define BTRFS_MOUNT_CHECK_INTEGRITY (1 << 19) +#define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 20) #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 3f9d5551e58..f363c6d9c3d 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -43,6 +43,7 @@ #include "tree-log.h" #include "free-space-cache.h" #include "inode-map.h" +#include "check-integrity.h" static struct extent_io_ops btree_extent_io_ops; static void end_workqueue_fn(struct btrfs_work *work); @@ -2001,6 +2002,9 @@ struct btrfs_root *open_ctree(struct super_block *sb, init_waitqueue_head(&fs_info->scrub_pause_wait); init_rwsem(&fs_info->scrub_super_lock); fs_info->scrub_workers_refcnt = 0; +#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY + fs_info->check_integrity_print_mask = 0; +#endif sb->s_blocksize = 4096; sb->s_blocksize_bits = blksize_bits(4096); @@ -2356,6 +2360,19 @@ retry_root_backup: btrfs_set_opt(fs_info->mount_opt, SSD); } +#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY + if (btrfs_test_opt(tree_root, CHECK_INTEGRITY)) { + ret = btrfsic_mount(tree_root, fs_devices, + btrfs_test_opt(tree_root, + CHECK_INTEGRITY_INCLUDING_EXTENT_DATA) ? + 1 : 0, + fs_info->check_integrity_print_mask); + if (ret) + printk(KERN_WARNING "btrfs: failed to initialize" + " integrity check module %s\n", sb->s_id); + } +#endif + /* do not make disk changes in broken FS */ if (btrfs_super_log_root(disk_super) != 0 && !(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) { @@ -2634,7 +2651,7 @@ static int write_dev_supers(struct btrfs_device *device, * we fua the first super. The others we allow * to go down lazy. 
*/ - ret = submit_bh(WRITE_FUA, bh); + ret = btrfsic_submit_bh(WRITE_FUA, bh); if (ret) errors++; } @@ -2711,7 +2728,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait) device->flush_bio = bio; bio_get(bio); - submit_bio(WRITE_FLUSH, bio); + btrfsic_submit_bio(WRITE_FLUSH, bio); return 0; } @@ -3057,6 +3074,11 @@ int close_ctree(struct btrfs_root *root) btrfs_stop_workers(&fs_info->caching_workers); btrfs_stop_workers(&fs_info->readahead_workers); +#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY + if (btrfs_test_opt(root, CHECK_INTEGRITY)) + btrfsic_unmount(root, fs_info->fs_devices); +#endif + btrfs_close_devices(fs_info->fs_devices); btrfs_mapping_tree_free(&fs_info->mapping_tree); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 49f3c9dc09f..246669296e0 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -18,6 +18,7 @@ #include "ctree.h" #include "btrfs_inode.h" #include "volumes.h" +#include "check-integrity.h" static struct kmem_cache *extent_state_cache; static struct kmem_cache *extent_buffer_cache; @@ -1895,7 +1896,7 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, } bio->bi_bdev = dev->bdev; bio_add_page(bio, page, length, start-page_offset(page)); - submit_bio(WRITE_SYNC, bio); + btrfsic_submit_bio(WRITE_SYNC, bio); wait_for_completion(&compl); if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { @@ -2393,7 +2394,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num, ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio, mirror_num, bio_flags, start); else - submit_bio(rw, bio); + btrfsic_submit_bio(rw, bio); if (bio_flagged(bio, BIO_EOPNOTSUPP)) ret = -EOPNOTSUPP; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index ddf2c90d3fc..567e148caca 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -25,6 +25,7 @@ #include "transaction.h" #include "backref.h" #include "extent_io.h" +#include "check-integrity.h" /* * This is only the first step towards a full-features scrub. 
It reads all @@ -732,7 +733,7 @@ static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector, bio_add_page(bio, page, PAGE_SIZE, 0); bio->bi_end_io = scrub_fixup_end_io; bio->bi_private = &complete; - submit_bio(rw, bio); + btrfsic_submit_bio(rw, bio); /* this will also unplug the queue */ wait_for_completion(&complete); @@ -958,7 +959,7 @@ static int scrub_submit(struct scrub_dev *sdev) sdev->curr = -1; atomic_inc(&sdev->in_flight); - submit_bio(READ, sbio->bio); + btrfsic_submit_bio(READ, sbio->bio); return 0; } diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 34a8b6112ea..22a2015f1d7 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -165,7 +165,10 @@ enum { Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, - Opt_inode_cache, Opt_no_space_cache, Opt_recovery, Opt_err, + Opt_inode_cache, Opt_no_space_cache, Opt_recovery, + Opt_check_integrity, Opt_check_integrity_including_extent_data, + Opt_check_integrity_print_mask, + Opt_err, }; static match_table_t tokens = { @@ -200,6 +203,9 @@ static match_table_t tokens = { {Opt_inode_cache, "inode_cache"}, {Opt_no_space_cache, "nospace_cache"}, {Opt_recovery, "recovery"}, + {Opt_check_integrity, "check_int"}, + {Opt_check_integrity_including_extent_data, "check_int_data"}, + {Opt_check_integrity_print_mask, "check_int_print_mask=%d"}, {Opt_err, NULL}, }; @@ -398,6 +404,37 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) printk(KERN_INFO "btrfs: enabling auto recovery"); btrfs_set_opt(info->mount_opt, RECOVERY); break; +#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY + case Opt_check_integrity_including_extent_data: + printk(KERN_INFO "btrfs: enabling check integrity" + " including extent data\n"); + btrfs_set_opt(info->mount_opt, + CHECK_INTEGRITY_INCLUDING_EXTENT_DATA); + btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY); + break; + case Opt_check_integrity: + printk(KERN_INFO "btrfs: enabling check integrity\n"); + btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY); + break; + case Opt_check_integrity_print_mask: + intarg = 0; + match_int(&args[0], &intarg); + if (intarg) { + info->check_integrity_print_mask = intarg; + printk(KERN_INFO "btrfs:" + " check_integrity_print_mask 0x%x\n", + info->check_integrity_print_mask); + } + break; +#else + case Opt_check_integrity_including_extent_data: + case Opt_check_integrity: + case Opt_check_integrity_print_mask: + printk(KERN_ERR "btrfs: support for check_integrity*" + " not compiled in!\n"); + ret = -EINVAL; + goto out; +#endif case Opt_err: printk(KERN_INFO "btrfs: unrecognized mount option " "'%s'\n", p); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f4b839fd3c9..821334f6e3a 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -32,6 +32,7 @@ #include "print-tree.h" #include "volumes.h" #include "async-thread.h" +#include "check-integrity.h" static int init_first_rw_device(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -246,7 +247,7 @@ loop_lock: sync_pending = 0; } - submit_bio(cur->bi_rw, cur); + btrfsic_submit_bio(cur->bi_rw, cur); num_run++; batch_run++; if (need_resched()) @@ -3304,7 +3305,7 @@ static noinline int schedule_bio(struct btrfs_root *root, /* don't bother with additional async steps for reads, right now */ if (!(rw & REQ_WRITE)) { bio_get(bio); - submit_bio(rw, bio); + btrfsic_submit_bio(rw, bio); bio_put(bio); return 0; } @@ -3399,7 +3400,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, 
struct bio *bio, if (async_submit) schedule_bio(root, dev, rw, bio); else - submit_bio(rw, bio); + btrfsic_submit_bio(rw, bio); } else { bio->bi_bdev = root->fs_info->fs_devices->latest_bdev; bio->bi_sector = logical >> 9; -- cgit v1.2.3 From 22cdfca5641817060dd724a9c30442f5c0675fcd Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 21 Dec 2011 14:14:31 -0500 Subject: ext4: remove unneeded file_remove_suid() from ext4_ioctl() In the code to support EXT4_IOC_MOVE_EXT, ext4_ioctl calls file_remove_suid() after the call to ext4_move_extents() if any extents has been moved. There are at least three things wrong with this. First, file_remove_suid() should be called with i_mutex down, which is not here. Second, it should be called before the donor file has been modified, to avoid a potential race condition. Third, and most importantly, it's pointless, because ext4_file_extents() already checks if the donor file has the setuid or setgid bit set, and will return an error in that case. So the first two objections don't really matter, since file_remove_suid() will never need to modify the inode in any case. Signed-off-by: "Theodore Ts'o" --- fs/ext4/ioctl.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index a56796814d6..ff1aab7cd6e 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -247,8 +247,6 @@ setversion_out: err = ext4_move_extents(filp, donor_filp, me.orig_start, me.donor_start, me.len, &me.moved_len); mnt_drop_write(filp->f_path.mnt); - if (me.moved_len > 0) - file_remove_suid(donor_filp); if (copy_to_user((struct move_extent __user *)arg, &me, sizeof(me))) -- cgit v1.2.3 From da5c81356426c476112f2b59fe64bdb1b37f079d Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Tue, 13 Sep 2011 12:29:12 +0200 Subject: Btrfs: generic data structure to build unique lists ulist is a generic data structures to hold a collection of unique u64 values. The only operations it supports is adding to the list and enumerating it. It is possible to store an auxiliary value along with the key. The implementation is preliminary and can probably be sped up significantly. It is used by btrfs_find_all_roots() quota to translate recursions into iterative loops. Signed-off-by: Arne Jansen Signed-off-by: Jan Schmidt --- fs/btrfs/Makefile | 2 +- fs/btrfs/ulist.c | 220 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/ulist.h | 68 +++++++++++++++++ 3 files changed, 289 insertions(+), 1 deletion(-) create mode 100644 fs/btrfs/ulist.c create mode 100644 fs/btrfs/ulist.h (limited to 'fs') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index c0ddfd29c5e..70798407b9a 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -8,6 +8,6 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ export.o tree-log.o free-space-cache.o zlib.o lzo.o \ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ - reada.o backref.o + reada.o backref.o ulist.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c new file mode 100644 index 00000000000..12f5147bd2b --- /dev/null +++ b/fs/btrfs/ulist.c @@ -0,0 +1,220 @@ +/* + * Copyright (C) 2011 STRATO AG + * written by Arne Jansen + * Distributed under the GNU GPL license version 2. + */ + +#include +#include +#include "ulist.h" + +/* + * ulist is a generic data structure to hold a collection of unique u64 + * values. 
The only operations it supports is adding to the list and + * enumerating it. + * It is possible to store an auxiliary value along with the key. + * + * The implementation is preliminary and can probably be sped up + * significantly. A first step would be to store the values in an rbtree + * as soon as ULIST_SIZE is exceeded. + * + * A sample usage for ulists is the enumeration of directed graphs without + * visiting a node twice. The pseudo-code could look like this: + * + * ulist = ulist_alloc(); + * ulist_add(ulist, root); + * elem = NULL; + * + * while ((elem = ulist_next(ulist, elem)) { + * for (all child nodes n in elem) + * ulist_add(ulist, n); + * do something useful with the node; + * } + * ulist_free(ulist); + * + * This assumes the graph nodes are adressable by u64. This stems from the + * usage for tree enumeration in btrfs, where the logical addresses are + * 64 bit. + * + * It is also useful for tree enumeration which could be done elegantly + * recursively, but is not possible due to kernel stack limitations. The + * loop would be similar to the above. + */ + +/** + * ulist_init - freshly initialize a ulist + * @ulist: the ulist to initialize + * + * Note: don't use this function to init an already used ulist, use + * ulist_reinit instead. + */ +void ulist_init(struct ulist *ulist) +{ + ulist->nnodes = 0; + ulist->nodes = ulist->int_nodes; + ulist->nodes_alloced = ULIST_SIZE; +} +EXPORT_SYMBOL(ulist_init); + +/** + * ulist_fini - free up additionally allocated memory for the ulist + * @ulist: the ulist from which to free the additional memory + * + * This is useful in cases where the base 'struct ulist' has been statically + * allocated. + */ +void ulist_fini(struct ulist *ulist) +{ + /* + * The first ULIST_SIZE elements are stored inline in struct ulist. + * Only if more elements are alocated they need to be freed. + */ + if (ulist->nodes_alloced > ULIST_SIZE) + kfree(ulist->nodes); + ulist->nodes_alloced = 0; /* in case ulist_fini is called twice */ +} +EXPORT_SYMBOL(ulist_fini); + +/** + * ulist_reinit - prepare a ulist for reuse + * @ulist: ulist to be reused + * + * Free up all additional memory allocated for the list elements and reinit + * the ulist. + */ +void ulist_reinit(struct ulist *ulist) +{ + ulist_fini(ulist); + ulist_init(ulist); +} +EXPORT_SYMBOL(ulist_reinit); + +/** + * ulist_alloc - dynamically allocate a ulist + * @gfp_mask: allocation flags to for base allocation + * + * The allocated ulist will be returned in an initialized state. + */ +struct ulist *ulist_alloc(unsigned long gfp_mask) +{ + struct ulist *ulist = kmalloc(sizeof(*ulist), gfp_mask); + + if (!ulist) + return NULL; + + ulist_init(ulist); + + return ulist; +} +EXPORT_SYMBOL(ulist_alloc); + +/** + * ulist_free - free dynamically allocated ulist + * @ulist: ulist to free + * + * It is not necessary to call ulist_fini before. + */ +void ulist_free(struct ulist *ulist) +{ + if (!ulist) + return; + ulist_fini(ulist); + kfree(ulist); +} +EXPORT_SYMBOL(ulist_free); + +/** + * ulist_add - add an element to the ulist + * @ulist: ulist to add the element to + * @val: value to add to ulist + * @aux: auxiliary value to store along with val + * @gfp_mask: flags to use for allocation + * + * Note: locking must be provided by the caller. In case of rwlocks write + * locking is needed + * + * Add an element to a ulist. The @val will only be added if it doesn't + * already exist. If it is added, the auxiliary value @aux is stored along with + * it. 
In case @val already exists in the ulist, @aux is ignored, even if + * it differs from the already stored value. + * + * ulist_add returns 0 if @val already exists in ulist and 1 if @val has been + * inserted. + * In case of allocation failure -ENOMEM is returned and the ulist stays + * unaltered. + */ +int ulist_add(struct ulist *ulist, u64 val, unsigned long aux, + unsigned long gfp_mask) +{ + int i; + + for (i = 0; i < ulist->nnodes; ++i) { + if (ulist->nodes[i].val == val) + return 0; + } + + if (ulist->nnodes >= ulist->nodes_alloced) { + u64 new_alloced = ulist->nodes_alloced + 128; + struct ulist_node *new_nodes; + void *old = NULL; + + /* + * if nodes_alloced == ULIST_SIZE no memory has been allocated + * yet, so pass NULL to krealloc + */ + if (ulist->nodes_alloced > ULIST_SIZE) + old = ulist->nodes; + + new_nodes = krealloc(old, sizeof(*new_nodes) * new_alloced, + gfp_mask); + if (!new_nodes) + return -ENOMEM; + + if (!old) + memcpy(new_nodes, ulist->int_nodes, + sizeof(ulist->int_nodes)); + + ulist->nodes = new_nodes; + ulist->nodes_alloced = new_alloced; + } + ulist->nodes[ulist->nnodes].val = val; + ulist->nodes[ulist->nnodes].aux = aux; + ++ulist->nnodes; + + return 1; +} +EXPORT_SYMBOL(ulist_add); + +/** + * ulist_next - iterate ulist + * @ulist: ulist to iterate + * @prev: previously returned element or %NULL to start iteration + * + * Note: locking must be provided by the caller. In case of rwlocks only read + * locking is needed + * + * This function is used to iterate an ulist. The iteration is started with + * @prev = %NULL. It returns the next element from the ulist or %NULL when the + * end is reached. No guarantee is made with respect to the order in which + * the elements are returned. They might neither be returned in order of + * addition nor in ascending order. + * It is allowed to call ulist_add during an enumeration. Newly added items + * are guaranteed to show up in the running enumeration. + */ +struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_node *prev) +{ + int next; + + if (ulist->nnodes == 0) + return NULL; + + if (!prev) + return &ulist->nodes[0]; + + next = (prev - ulist->nodes) + 1; + if (next < 0 || next >= ulist->nnodes) + return NULL; + + return &ulist->nodes[next]; +} +EXPORT_SYMBOL(ulist_next); diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h new file mode 100644 index 00000000000..2e25dec58ec --- /dev/null +++ b/fs/btrfs/ulist.h @@ -0,0 +1,68 @@ +/* + * Copyright (C) 2011 STRATO AG + * written by Arne Jansen + * Distributed under the GNU GPL license version 2. + * + */ + +#ifndef __ULIST__ +#define __ULIST__ + +/* + * ulist is a generic data structure to hold a collection of unique u64 + * values. The only operations it supports is adding to the list and + * enumerating it. + * It is possible to store an auxiliary value along with the key. + * + * The implementation is preliminary and can probably be sped up + * significantly. A first step would be to store the values in an rbtree + * as soon as ULIST_SIZE is exceeded. + */ + +/* + * number of elements statically allocated inside struct ulist + */ +#define ULIST_SIZE 16 + +/* + * element of the list + */ +struct ulist_node { + u64 val; /* value to store */ + unsigned long aux; /* auxiliary value saved along with the val */ +}; + +struct ulist { + /* + * number of elements stored in list + */ + unsigned long nnodes; + + /* + * number of nodes we already have room for + */ + unsigned long nodes_alloced; + + /* + * pointer to the array storing the elements. 
The first ULIST_SIZE + * elements are stored inline. In this case the it points to int_nodes. + * After exceeding ULIST_SIZE, dynamic memory is allocated. + */ + struct ulist_node *nodes; + + /* + * inline storage space for the first ULIST_SIZE entries + */ + struct ulist_node int_nodes[ULIST_SIZE]; +}; + +void ulist_init(struct ulist *ulist); +void ulist_fini(struct ulist *ulist); +void ulist_reinit(struct ulist *ulist); +struct ulist *ulist_alloc(unsigned long gfp_mask); +void ulist_free(struct ulist *ulist); +int ulist_add(struct ulist *ulist, u64 val, unsigned long aux, + unsigned long gfp_mask); +struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_node *prev); + +#endif -- cgit v1.2.3 From c7d22a3c3cdb73d8a0151e2ccc8cf4a48c48310b Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Tue, 22 Nov 2011 15:14:33 +0100 Subject: Btrfs: added helper btrfs_next_item() btrfs_next_item() makes the btrfs path point to the next item, crossing leaf boundaries if needed. Signed-off-by: Arne Jansen Signed-off-by: Jan Schmidt --- fs/btrfs/ctree.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 50634abef9b..3e4a07b7981 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2482,6 +2482,13 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, } int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); +static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p) +{ + ++p->slots[0]; + if (p->slots[0] >= btrfs_header_nritems(p->nodes[0])) + return btrfs_next_leaf(root, p); + return 0; +} int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf); void btrfs_drop_snapshot(struct btrfs_root *root, -- cgit v1.2.3 From 66d7e7f09f77456fe68683247d77721032a00ee5 Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Mon, 12 Sep 2011 15:26:38 +0200 Subject: Btrfs: mark delayed refs as for cow Add a for_cow parameter to add_delayed_*_ref and pass the appropriate value from every call site. The for_cow parameter will later on be used to determine if a ref will change anything with respect to qgroups. Delayed refs coming from relocation are always counted as for_cow, as they don't change subvol quota. Also pass in the fs_info for later use. btrfs_find_all_roots() will use this as an optimization, as changes that are for_cow will not change anything with respect to which root points to a certain leaf. Thus, we don't need to add the current sequence number to those delayed refs. 
Signed-off-by: Arne Jansen Signed-off-by: Jan Schmidt --- fs/btrfs/ctree.c | 42 ++++++++++---------- fs/btrfs/ctree.h | 17 +++++---- fs/btrfs/delayed-ref.c | 50 ++++++++++++++---------- fs/btrfs/delayed-ref.h | 15 +++++--- fs/btrfs/disk-io.c | 3 +- fs/btrfs/extent-tree.c | 101 +++++++++++++++++++++++++++++-------------------- fs/btrfs/file.c | 10 ++--- fs/btrfs/inode.c | 2 +- fs/btrfs/ioctl.c | 5 ++- fs/btrfs/relocation.c | 18 +++++---- fs/btrfs/transaction.c | 4 +- fs/btrfs/tree-log.c | 2 +- 12 files changed, 155 insertions(+), 114 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index dede441bdee..0639a555e16 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -240,7 +240,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, cow = btrfs_alloc_free_block(trans, root, buf->len, 0, new_root_objectid, &disk_key, level, - buf->start, 0); + buf->start, 0, 1); if (IS_ERR(cow)) return PTR_ERR(cow); @@ -261,9 +261,9 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, WARN_ON(btrfs_header_generation(buf) > trans->transid); if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) - ret = btrfs_inc_ref(trans, root, cow, 1); + ret = btrfs_inc_ref(trans, root, cow, 1, 1); else - ret = btrfs_inc_ref(trans, root, cow, 0); + ret = btrfs_inc_ref(trans, root, cow, 0, 1); if (ret) return ret; @@ -350,14 +350,14 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, if ((owner == root->root_key.objectid || root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) { - ret = btrfs_inc_ref(trans, root, buf, 1); + ret = btrfs_inc_ref(trans, root, buf, 1, 1); BUG_ON(ret); if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { - ret = btrfs_dec_ref(trans, root, buf, 0); + ret = btrfs_dec_ref(trans, root, buf, 0, 1); BUG_ON(ret); - ret = btrfs_inc_ref(trans, root, cow, 1); + ret = btrfs_inc_ref(trans, root, cow, 1, 1); BUG_ON(ret); } new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; @@ -365,9 +365,9 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) - ret = btrfs_inc_ref(trans, root, cow, 1); + ret = btrfs_inc_ref(trans, root, cow, 1, 1); else - ret = btrfs_inc_ref(trans, root, cow, 0); + ret = btrfs_inc_ref(trans, root, cow, 0, 1); BUG_ON(ret); } if (new_flags != 0) { @@ -381,11 +381,11 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) { if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) - ret = btrfs_inc_ref(trans, root, cow, 1); + ret = btrfs_inc_ref(trans, root, cow, 1, 1); else - ret = btrfs_inc_ref(trans, root, cow, 0); + ret = btrfs_inc_ref(trans, root, cow, 0, 1); BUG_ON(ret); - ret = btrfs_dec_ref(trans, root, buf, 1); + ret = btrfs_dec_ref(trans, root, buf, 1, 1); BUG_ON(ret); } clean_tree_block(trans, root, buf); @@ -446,7 +446,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start, root->root_key.objectid, &disk_key, - level, search_start, empty_size); + level, search_start, empty_size, 1); if (IS_ERR(cow)) return PTR_ERR(cow); @@ -484,7 +484,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, rcu_assign_pointer(root->node, cow); btrfs_free_tree_block(trans, root, buf, parent_start, - last_ref); + last_ref, 1); free_extent_buffer(buf); add_root_to_dirty_list(root); } else { @@ -500,7 +500,7 @@ static noinline int __btrfs_cow_block(struct 
btrfs_trans_handle *trans, trans->transid); btrfs_mark_buffer_dirty(parent); btrfs_free_tree_block(trans, root, buf, parent_start, - last_ref); + last_ref, 1); } if (unlock_orig) btrfs_tree_unlock(buf); @@ -957,7 +957,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, free_extent_buffer(mid); root_sub_used(root, mid->len); - btrfs_free_tree_block(trans, root, mid, 0, 1); + btrfs_free_tree_block(trans, root, mid, 0, 1, 0); /* once for the root ptr */ free_extent_buffer(mid); return 0; @@ -1015,7 +1015,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (wret) ret = wret; root_sub_used(root, right->len); - btrfs_free_tree_block(trans, root, right, 0, 1); + btrfs_free_tree_block(trans, root, right, 0, 1, 0); free_extent_buffer(right); right = NULL; } else { @@ -1055,7 +1055,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (wret) ret = wret; root_sub_used(root, mid->len); - btrfs_free_tree_block(trans, root, mid, 0, 1); + btrfs_free_tree_block(trans, root, mid, 0, 1, 0); free_extent_buffer(mid); mid = NULL; } else { @@ -2089,7 +2089,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, c = btrfs_alloc_free_block(trans, root, root->nodesize, 0, root->root_key.objectid, &lower_key, - level, root->node->start, 0); + level, root->node->start, 0, 0); if (IS_ERR(c)) return PTR_ERR(c); @@ -2216,7 +2216,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans, split = btrfs_alloc_free_block(trans, root, root->nodesize, 0, root->root_key.objectid, - &disk_key, level, c->start, 0); + &disk_key, level, c->start, 0, 0); if (IS_ERR(split)) return PTR_ERR(split); @@ -2970,7 +2970,7 @@ again: right = btrfs_alloc_free_block(trans, root, root->leafsize, 0, root->root_key.objectid, - &disk_key, 0, l->start, 0); + &disk_key, 0, l->start, 0, 0); if (IS_ERR(right)) return PTR_ERR(right); @@ -3781,7 +3781,7 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, root_sub_used(root, leaf->len); - btrfs_free_tree_block(trans, root, leaf, 0, 1); + btrfs_free_tree_block(trans, root, leaf, 0, 1, 0); return 0; } /* diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 3e4a07b7981..543f60bddb3 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2277,11 +2277,11 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, u32 blocksize, u64 parent, u64 root_objectid, struct btrfs_disk_key *key, int level, - u64 hint, u64 empty_size); + u64 hint, u64 empty_size, int for_cow); void btrfs_free_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, - u64 parent, int last_ref); + u64 parent, int last_ref, int for_cow); struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u32 blocksize, @@ -2301,17 +2301,17 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans, u64 search_end, struct btrfs_key *ins, u64 data); int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref); + struct extent_buffer *buf, int full_backref, int for_cow); int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref); + struct extent_buffer *buf, int full_backref, int for_cow); int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 flags, int is_data); int btrfs_free_extent(struct 
btrfs_trans_handle *trans, struct btrfs_root *root, - u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 owner, u64 offset); + u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, + u64 owner, u64 offset, int for_cow); int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root, @@ -2323,7 +2323,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 owner, u64 offset); + u64 root_objectid, u64 owner, u64 offset, int for_cow); int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, struct btrfs_root *root); @@ -2492,7 +2492,8 @@ static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p) int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf); void btrfs_drop_snapshot(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, int update_ref); + struct btrfs_block_rsv *block_rsv, int update_ref, + int for_reloc); int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *node, diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 125cf76fcd0..3a0f0ab804f 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -390,7 +390,8 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing, * this does all the dirty work in terms of maintaining the correct * overall modification count. */ -static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans, +static noinline int add_delayed_ref_head(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *ref, u64 bytenr, u64 num_bytes, int action, int is_data) @@ -468,10 +469,12 @@ static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans, /* * helper to insert a delayed tree ref into the rbtree. */ -static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans, +static noinline int add_delayed_tree_ref(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *ref, u64 bytenr, u64 num_bytes, u64 parent, - u64 ref_root, int level, int action) + u64 ref_root, int level, int action, + int for_cow) { struct btrfs_delayed_ref_node *existing; struct btrfs_delayed_tree_ref *full_ref; @@ -522,11 +525,12 @@ static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans, /* * helper to insert a delayed data ref into the rbtree. */ -static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans, +static noinline int add_delayed_data_ref(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *ref, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, u64 owner, u64 offset, - int action) + int action, int for_cow) { struct btrfs_delayed_ref_node *existing; struct btrfs_delayed_data_ref *full_ref; @@ -554,6 +558,7 @@ static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans, full_ref->root = ref_root; ref->type = BTRFS_EXTENT_DATA_REF_KEY; } + full_ref->objectid = owner; full_ref->offset = offset; @@ -580,10 +585,12 @@ static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans, * to make sure the delayed ref is eventually processed before this * transaction commits. 
*/ -int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, +int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, int level, int action, - struct btrfs_delayed_extent_op *extent_op) + struct btrfs_delayed_extent_op *extent_op, + int for_cow) { struct btrfs_delayed_tree_ref *ref; struct btrfs_delayed_ref_head *head_ref; @@ -610,12 +617,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, * insert both the head node and the new ref without dropping * the spin lock */ - ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes, - action, 0); + ret = add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr, + num_bytes, action, 0); BUG_ON(ret); - ret = add_delayed_tree_ref(trans, &ref->node, bytenr, num_bytes, - parent, ref_root, level, action); + ret = add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr, + num_bytes, parent, ref_root, level, action, + for_cow); BUG_ON(ret); spin_unlock(&delayed_refs->lock); return 0; @@ -624,11 +632,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, /* * add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref. */ -int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, +int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, u64 owner, u64 offset, int action, - struct btrfs_delayed_extent_op *extent_op) + struct btrfs_delayed_extent_op *extent_op, + int for_cow) { struct btrfs_delayed_data_ref *ref; struct btrfs_delayed_ref_head *head_ref; @@ -655,18 +665,20 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, * insert both the head node and the new ref without dropping * the spin lock */ - ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes, - action, 1); + ret = add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr, + num_bytes, action, 1); BUG_ON(ret); - ret = add_delayed_data_ref(trans, &ref->node, bytenr, num_bytes, - parent, ref_root, owner, offset, action); + ret = add_delayed_data_ref(fs_info, trans, &ref->node, bytenr, + num_bytes, parent, ref_root, owner, offset, + action, for_cow); BUG_ON(ret); spin_unlock(&delayed_refs->lock); return 0; } -int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, +int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, struct btrfs_delayed_extent_op *extent_op) { @@ -683,7 +695,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); - ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, + ret = add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr, num_bytes, BTRFS_UPDATE_DELAYED_HEAD, extent_op->is_data); BUG_ON(ret); diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index e287e3b0eab..8316bff18d3 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -151,16 +151,21 @@ static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) } } -int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, +int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, int level, int action, - struct btrfs_delayed_extent_op *extent_op); -int btrfs_add_delayed_data_ref(struct btrfs_trans_handle 
*trans, + struct btrfs_delayed_extent_op *extent_op, + int for_cow); +int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, u64 owner, u64 offset, int action, - struct btrfs_delayed_extent_op *extent_op); -int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, + struct btrfs_delayed_extent_op *extent_op, + int for_cow); +int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, struct btrfs_delayed_extent_op *extent_op); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 94abc25392f..6f8cd17c9a9 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1243,7 +1243,8 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, root->ref_cows = 0; leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, - BTRFS_TREE_LOG_OBJECTID, NULL, 0, 0, 0); + BTRFS_TREE_LOG_OBJECTID, NULL, + 0, 0, 0, 0); if (IS_ERR(leaf)) { kfree(root); return ERR_CAST(leaf); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 813c6bb96c9..dc8b9a83459 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1872,20 +1872,24 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 owner, u64 offset) + u64 root_objectid, u64 owner, u64 offset, int for_cow) { int ret; + struct btrfs_fs_info *fs_info = root->fs_info; + BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID && root_objectid == BTRFS_TREE_LOG_OBJECTID); if (owner < BTRFS_FIRST_FREE_OBJECTID) { - ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes, + ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, + num_bytes, parent, root_objectid, (int)owner, - BTRFS_ADD_DELAYED_REF, NULL); + BTRFS_ADD_DELAYED_REF, NULL, for_cow); } else { - ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, + ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, + num_bytes, parent, root_objectid, owner, offset, - BTRFS_ADD_DELAYED_REF, NULL); + BTRFS_ADD_DELAYED_REF, NULL, for_cow); } return ret; } @@ -2405,7 +2409,8 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, extent_op->update_key = 0; extent_op->is_data = is_data ? 
1 : 0; - ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op); + ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr, + num_bytes, extent_op); if (ret) kfree(extent_op); return ret; @@ -2590,7 +2595,7 @@ out: static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, - int full_backref, int inc) + int full_backref, int inc, int for_cow) { u64 bytenr; u64 num_bytes; @@ -2603,7 +2608,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, int level; int ret = 0; int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, - u64, u64, u64, u64, u64, u64); + u64, u64, u64, u64, u64, u64, int); ref_root = btrfs_header_owner(buf); nritems = btrfs_header_nritems(buf); @@ -2640,14 +2645,15 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, key.offset -= btrfs_file_extent_offset(buf, fi); ret = process_func(trans, root, bytenr, num_bytes, parent, ref_root, key.objectid, - key.offset); + key.offset, for_cow); if (ret) goto fail; } else { bytenr = btrfs_node_blockptr(buf, i); num_bytes = btrfs_level_size(root, level - 1); ret = process_func(trans, root, bytenr, num_bytes, - parent, ref_root, level - 1, 0); + parent, ref_root, level - 1, 0, + for_cow); if (ret) goto fail; } @@ -2659,15 +2665,15 @@ fail: } int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref) + struct extent_buffer *buf, int full_backref, int for_cow) { - return __btrfs_mod_ref(trans, root, buf, full_backref, 1); + return __btrfs_mod_ref(trans, root, buf, full_backref, 1, for_cow); } int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref) + struct extent_buffer *buf, int full_backref, int for_cow) { - return __btrfs_mod_ref(trans, root, buf, full_backref, 0); + return __btrfs_mod_ref(trans, root, buf, full_backref, 0, for_cow); } static int write_one_cache_group(struct btrfs_trans_handle *trans, @@ -4937,16 +4943,17 @@ out: void btrfs_free_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, - u64 parent, int last_ref) + u64 parent, int last_ref, int for_cow) { struct btrfs_block_group_cache *cache = NULL; int ret; if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { - ret = btrfs_add_delayed_tree_ref(trans, buf->start, buf->len, - parent, root->root_key.objectid, - btrfs_header_level(buf), - BTRFS_DROP_DELAYED_REF, NULL); + ret = btrfs_add_delayed_tree_ref(root->fs_info, trans, + buf->start, buf->len, + parent, root->root_key.objectid, + btrfs_header_level(buf), + BTRFS_DROP_DELAYED_REF, NULL, for_cow); BUG_ON(ret); } @@ -4981,12 +4988,12 @@ out: btrfs_put_block_group(cache); } -int btrfs_free_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 owner, u64 offset) +int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, + u64 owner, u64 offset, int for_cow) { int ret; + struct btrfs_fs_info *fs_info = root->fs_info; /* * tree log blocks never actually go into the extent allocation @@ -4998,14 +5005,17 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, btrfs_pin_extent(root, bytenr, num_bytes, 1); ret = 0; } else if (owner < BTRFS_FIRST_FREE_OBJECTID) { - ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes, + ret = btrfs_add_delayed_tree_ref(fs_info, trans, 
bytenr, + num_bytes, parent, root_objectid, (int)owner, - BTRFS_DROP_DELAYED_REF, NULL); + BTRFS_DROP_DELAYED_REF, NULL, for_cow); BUG_ON(ret); } else { - ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, - parent, root_objectid, owner, - offset, BTRFS_DROP_DELAYED_REF, NULL); + ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, + num_bytes, + parent, root_objectid, owner, + offset, BTRFS_DROP_DELAYED_REF, + NULL, for_cow); BUG_ON(ret); } return ret; @@ -5826,9 +5836,10 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID); - ret = btrfs_add_delayed_data_ref(trans, ins->objectid, ins->offset, - 0, root_objectid, owner, offset, - BTRFS_ADD_DELAYED_EXTENT, NULL); + ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid, + ins->offset, 0, + root_objectid, owner, offset, + BTRFS_ADD_DELAYED_EXTENT, NULL, 0); return ret; } @@ -5998,7 +6009,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, u32 blocksize, u64 parent, u64 root_objectid, struct btrfs_disk_key *key, int level, - u64 hint, u64 empty_size) + u64 hint, u64 empty_size, int for_cow) { struct btrfs_key ins; struct btrfs_block_rsv *block_rsv; @@ -6042,10 +6053,11 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, extent_op->update_flags = 1; extent_op->is_data = 0; - ret = btrfs_add_delayed_tree_ref(trans, ins.objectid, + ret = btrfs_add_delayed_tree_ref(root->fs_info, trans, + ins.objectid, ins.offset, parent, root_objectid, level, BTRFS_ADD_DELAYED_EXTENT, - extent_op); + extent_op, for_cow); BUG_ON(ret); } return buf; @@ -6062,6 +6074,7 @@ struct walk_control { int keep_locks; int reada_slot; int reada_count; + int for_reloc; }; #define DROP_REFERENCE 1 @@ -6200,9 +6213,9 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, /* wc->stage == UPDATE_BACKREF */ if (!(wc->flags[level] & flag)) { BUG_ON(!path->locks[level]); - ret = btrfs_inc_ref(trans, root, eb, 1); + ret = btrfs_inc_ref(trans, root, eb, 1, wc->for_reloc); BUG_ON(ret); - ret = btrfs_dec_ref(trans, root, eb, 0); + ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc); BUG_ON(ret); ret = btrfs_set_disk_extent_flags(trans, root, eb->start, eb->len, flag, 0); @@ -6346,7 +6359,7 @@ skip: } ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent, - root->root_key.objectid, level - 1, 0); + root->root_key.objectid, level - 1, 0, 0); BUG_ON(ret); } btrfs_tree_unlock(next); @@ -6420,9 +6433,11 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, if (wc->refs[level] == 1) { if (level == 0) { if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) - ret = btrfs_dec_ref(trans, root, eb, 1); + ret = btrfs_dec_ref(trans, root, eb, 1, + wc->for_reloc); else - ret = btrfs_dec_ref(trans, root, eb, 0); + ret = btrfs_dec_ref(trans, root, eb, 0, + wc->for_reloc); BUG_ON(ret); } /* make block locked assertion in clean_tree_block happy */ @@ -6449,7 +6464,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, btrfs_header_owner(path->nodes[level + 1])); } - btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1); + btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1, 0); out: wc->refs[level] = 0; wc->flags[level] = 0; @@ -6533,7 +6548,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, * blocks are properly updated. 
*/ void btrfs_drop_snapshot(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, int update_ref) + struct btrfs_block_rsv *block_rsv, int update_ref, + int for_reloc) { struct btrfs_path *path; struct btrfs_trans_handle *trans; @@ -6621,6 +6637,7 @@ void btrfs_drop_snapshot(struct btrfs_root *root, wc->stage = DROP_REFERENCE; wc->update_ref = update_ref; wc->keep_locks = 0; + wc->for_reloc = for_reloc; wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); while (1) { @@ -6705,6 +6722,7 @@ out: * drop subtree rooted at tree block 'node'. * * NOTE: this function will unlock and release tree block 'node' + * only used by relocation code */ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -6749,6 +6767,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, wc->stage = DROP_REFERENCE; wc->update_ref = 0; wc->keep_locks = 1; + wc->for_reloc = 1; wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); while (1) { diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index f2e92828960..d2b60ed6c33 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -678,7 +678,7 @@ next_slot: disk_bytenr, num_bytes, 0, root->root_key.objectid, new_key.objectid, - start - extent_offset); + start - extent_offset, 0); BUG_ON(ret); *hint_byte = disk_bytenr; } @@ -753,7 +753,7 @@ next_slot: disk_bytenr, num_bytes, 0, root->root_key.objectid, key.objectid, key.offset - - extent_offset); + extent_offset, 0); BUG_ON(ret); inode_sub_bytes(inode, extent_end - key.offset); @@ -962,7 +962,7 @@ again: ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, root->root_key.objectid, - ino, orig_offset); + ino, orig_offset, 0); BUG_ON(ret); if (split == start) { @@ -989,7 +989,7 @@ again: del_nr++; ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 0, root->root_key.objectid, - ino, orig_offset); + ino, orig_offset, 0); BUG_ON(ret); } other_start = 0; @@ -1006,7 +1006,7 @@ again: del_nr++; ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 0, root->root_key.objectid, - ino, orig_offset); + ino, orig_offset, 0); BUG_ON(ret); } if (del_nr == 0) { diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index c5ccec23984..ea819386b86 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3139,7 +3139,7 @@ delete: ret = btrfs_free_extent(trans, root, extent_start, extent_num_bytes, 0, btrfs_header_owner(leaf), - ino, extent_offset); + ino, extent_offset, 0); BUG_ON(ret); } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 72d461656f6..c48f2e931ea 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -358,7 +358,7 @@ static noinline int create_subvol(struct btrfs_root *root, return PTR_ERR(trans); leaf = btrfs_alloc_free_block(trans, root, root->leafsize, - 0, objectid, NULL, 0, 0, 0); + 0, objectid, NULL, 0, 0, 0, 0); if (IS_ERR(leaf)) { ret = PTR_ERR(leaf); goto fail; @@ -2425,7 +2425,8 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, disko, diskl, 0, root->root_key.objectid, btrfs_ino(inode), - new_key.offset - datao); + new_key.offset - datao, + 0); BUG_ON(ret); } } else if (type == BTRFS_FILE_EXTENT_INLINE) { diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index dff29d5e151..8c1aae2c845 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1604,12 +1604,12 @@ int replace_file_extents(struct btrfs_trans_handle *trans, ret = btrfs_inc_extent_ref(trans, root, new_bytenr, num_bytes, parent, btrfs_header_owner(leaf), - key.objectid, key.offset); + key.objectid, key.offset, 1); BUG_ON(ret); ret = btrfs_free_extent(trans, root, 
bytenr, num_bytes, parent, btrfs_header_owner(leaf), - key.objectid, key.offset); + key.objectid, key.offset, 1); BUG_ON(ret); } if (dirty) @@ -1778,21 +1778,23 @@ again: ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize, path->nodes[level]->start, - src->root_key.objectid, level - 1, 0); + src->root_key.objectid, level - 1, 0, + 1); BUG_ON(ret); ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize, 0, dest->root_key.objectid, level - 1, - 0); + 0, 1); BUG_ON(ret); ret = btrfs_free_extent(trans, src, new_bytenr, blocksize, path->nodes[level]->start, - src->root_key.objectid, level - 1, 0); + src->root_key.objectid, level - 1, 0, + 1); BUG_ON(ret); ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize, 0, dest->root_key.objectid, level - 1, - 0); + 0, 1); BUG_ON(ret); btrfs_unlock_up_safe(path, 0); @@ -2244,7 +2246,7 @@ again: } else { list_del_init(&reloc_root->root_list); } - btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0); + btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1); } if (found) { @@ -2558,7 +2560,7 @@ static int do_relocation(struct btrfs_trans_handle *trans, node->eb->start, blocksize, upper->eb->start, btrfs_header_owner(upper->eb), - node->level, 0); + node->level, 0, 1); BUG_ON(ret); ret = btrfs_drop_subtree(trans, root, eb, upper->eb); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 81376d94cd3..a2bfedcbcab 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1393,9 +1393,9 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root) if (btrfs_header_backref_rev(root->node) < BTRFS_MIXED_BACKREF_REV) - btrfs_drop_snapshot(root, NULL, 0); + btrfs_drop_snapshot(root, NULL, 0, 0); else - btrfs_drop_snapshot(root, NULL, 1); + btrfs_drop_snapshot(root, NULL, 1, 0); } return 0; } diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index f4d81c06d48..fce7b9ec3bf 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -589,7 +589,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, ret = btrfs_inc_extent_ref(trans, root, ins.objectid, ins.offset, 0, root->root_key.objectid, - key->objectid, offset); + key->objectid, offset, 0); BUG_ON(ret); } else { /* -- cgit v1.2.3 From eebe063b7f916087cd5c61de57b20a3a30894a96 Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Wed, 14 Sep 2011 14:01:24 +0200 Subject: Btrfs: always save ref_root in delayed refs For consistent backref walking and (later) qgroup calculation the information to which root a delayed ref belongs is useful even for shared refs. 
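A brief illustration of the effect (hypothetical consumer, not part of this patch; the structure change itself is in the delayed-ref.h hunk below): with root and parent stored in separate fields instead of a union, the owning root can be read for shared and non-shared refs alike:

    /*
     * Hypothetical consumer: after this change ref->root is valid even
     * when ref->node.type is BTRFS_SHARED_BLOCK_REF_KEY, so backref
     * walking and qgroup accounting need no special case for shared refs.
     */
    static u64 example_delayed_ref_root(struct btrfs_delayed_tree_ref *ref)
    {
            return ref->root;
    }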
Signed-off-by: Arne Jansen Signed-off-by: Jan Schmidt --- fs/btrfs/delayed-ref.c | 18 ++++++++---------- fs/btrfs/delayed-ref.h | 12 ++++-------- 2 files changed, 12 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 3a0f0ab804f..babd37badb4 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -495,13 +495,12 @@ static noinline int add_delayed_tree_ref(struct btrfs_fs_info *fs_info, ref->in_tree = 1; full_ref = btrfs_delayed_node_to_tree_ref(ref); - if (parent) { - full_ref->parent = parent; + full_ref->parent = parent; + full_ref->root = ref_root; + if (parent) ref->type = BTRFS_SHARED_BLOCK_REF_KEY; - } else { - full_ref->root = ref_root; + else ref->type = BTRFS_TREE_BLOCK_REF_KEY; - } full_ref->level = level; trace_btrfs_delayed_tree_ref(ref, full_ref, action); @@ -551,13 +550,12 @@ static noinline int add_delayed_data_ref(struct btrfs_fs_info *fs_info, ref->in_tree = 1; full_ref = btrfs_delayed_node_to_data_ref(ref); - if (parent) { - full_ref->parent = parent; + full_ref->parent = parent; + full_ref->root = ref_root; + if (parent) ref->type = BTRFS_SHARED_DATA_REF_KEY; - } else { - full_ref->root = ref_root; + else ref->type = BTRFS_EXTENT_DATA_REF_KEY; - } full_ref->objectid = owner; full_ref->offset = offset; diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 8316bff18d3..a5fb2bc8373 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -98,19 +98,15 @@ struct btrfs_delayed_ref_head { struct btrfs_delayed_tree_ref { struct btrfs_delayed_ref_node node; - union { - u64 root; - u64 parent; - }; + u64 root; + u64 parent; int level; }; struct btrfs_delayed_data_ref { struct btrfs_delayed_ref_node node; - union { - u64 root; - u64 parent; - }; + u64 root; + u64 parent; u64 objectid; u64 offset; }; -- cgit v1.2.3 From 2aff57b0c052344e8401a8b4a33c2a1ecb0f627c Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Wed, 28 Dec 2011 12:02:13 -0500 Subject: ext4: allocate delalloc blocks before changing journal mode delalloc blocks should be allocated before changing journal mode, otherwise they can not be allocated and even more truncate on delalloc blocks could triggre BUG by flushing delalloc buffers. Signed-off-by: Yongqiang Yang Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 92655fd8965..cb0ba9d77a8 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4647,6 +4647,17 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) return 0; if (is_journal_aborted(journal)) return -EROFS; + /* We have to allocate physical blocks for delalloc blocks + * before flushing journal. otherwise delalloc blocks can not + * be allocated any more. even more truncate on delalloc blocks + * could trigger BUG by flushing delalloc blocks in journal. + * There is no delalloc block in non-journal data mode. + */ + if (val && test_opt(inode->i_sb, DELALLOC)) { + err = ext4_alloc_da_blocks(inode); + if (err < 0) + return err; + } jbd2_journal_lock_updates(journal); jbd2_journal_flush(journal); -- cgit v1.2.3 From 5872ddaaf05bf25e3ab90580295ebc946405928c Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Wed, 28 Dec 2011 13:55:51 -0500 Subject: ext4: flush journal when switching from data=journal mode It's necessary to flush the journal when switching away from data=journal mode. 
This is because there are no revoke records when data blocks are journalled, but revoke records are required in the other journal modes. However, it is not necessary to flush the journal when switching into data=journal mode, and flushing the journal is expensive. So let's avoid it in that case. Signed-off-by: Yongqiang Yang Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index cb0ba9d77a8..1254934de69 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4660,7 +4660,6 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) } jbd2_journal_lock_updates(journal); - jbd2_journal_flush(journal); /* * OK, there are no updates running now, and all cached data is @@ -4672,8 +4671,10 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) if (val) ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); - else + else { + jbd2_journal_flush(journal); ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); + } ext4_set_aops(inode); jbd2_journal_unlock_updates(journal); -- cgit v1.2.3 From 1ba37268cd19e5a2a80924bfe8618bf1ba3e8249 Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Wed, 28 Dec 2011 17:46:46 -0500 Subject: jbd2: clear revoked flag on buffers before a new transaction started Currently, we clear revoked flag only when a block is reused. However, this can tigger a false journal error. Consider a situation when a block is used as a meta block and is deleted(revoked) in ordered mode, then the block is allocated as a data block to a file. At this moment, user changes the file's journal mode from ordered to journaled and truncates the file. The block will be considered re-revoked by journal because it has revoked flag still pending from the last transaction and an assertion triggers. We fix the problem by keeping the revoked status more uptodate - we clear revoked flag when switching revoke tables to reflect there is no revoked buffers in current transaction any more. Signed-off-by: Yongqiang Yang Signed-off-by: "Theodore Ts'o" --- fs/jbd2/commit.c | 6 ++++++ fs/jbd2/revoke.c | 34 ++++++++++++++++++++++++++++++++++ include/linux/jbd2.h | 1 + 3 files changed, 41 insertions(+) (limited to 'fs') diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 68d704db787..5069b847515 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -429,6 +429,12 @@ void jbd2_journal_commit_transaction(journal_t *journal) jbd_debug(3, "JBD2: commit phase 1\n"); + /* + * Clear revoked flag to reflect there is no revoked buffers + * in the next transaction which is going to be started. + */ + jbd2_clear_buffer_revoked_flags(journal); + /* * Switch to a new revoke table. */ diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index 69fd9358811..30b2867d6cc 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -47,6 +47,10 @@ * overwriting the new data. We don't even need to clear the revoke * bit here. * + * We cache revoke status of a buffer in the current transaction in b_states + * bits. As the name says, revokevalid flag indicates that the cached revoke + * status of a buffer is valid and we can rely on the cached status. 
+ * * Revoke information on buffers is a tri-state value: * * RevokeValid clear: no cached revoke status, need to look it up @@ -478,6 +482,36 @@ int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh) return did_revoke; } +/* + * journal_clear_revoked_flag clears revoked flag of buffers in + * revoke table to reflect there is no revoked buffers in the next + * transaction which is going to be started. + */ +void jbd2_clear_buffer_revoked_flags(journal_t *journal) +{ + struct jbd2_revoke_table_s *revoke = journal->j_revoke; + int i = 0; + + for (i = 0; i < revoke->hash_size; i++) { + struct list_head *hash_list; + struct list_head *list_entry; + hash_list = &revoke->hash_table[i]; + + list_for_each(list_entry, hash_list) { + struct jbd2_revoke_record_s *record; + struct buffer_head *bh; + record = (struct jbd2_revoke_record_s *)list_entry; + bh = __find_get_block(journal->j_fs_dev, + record->blocknr, + journal->j_blocksize); + if (bh) { + clear_buffer_revoked(bh); + __brelse(bh); + } + } + } +} + /* journal_switch_revoke table select j_revoke for next transaction * we do not want to suspend any processing until all revokes are * written -bzzz diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 2092ea21e46..5557baefed6 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1151,6 +1151,7 @@ extern int jbd2_journal_set_revoke(journal_t *, unsigned long long, tid_t); extern int jbd2_journal_test_revoke(journal_t *, unsigned long long, tid_t); extern void jbd2_journal_clear_revoke(journal_t *); extern void jbd2_journal_switch_revoke_table(journal_t *journal); +extern void jbd2_clear_buffer_revoked_flags(journal_t *journal); /* * The log thread user interface: -- cgit v1.2.3 From 88635ca277adb67db34e88281817d1ce10713553 Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Wed, 28 Dec 2011 19:00:25 -0500 Subject: ext4: add missing spaces to debugging printk's Fix ext4_debug format in ext4_ext_handle_uninitialized_extents() and ext4_end_io_dio(). 
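As a brief aside (not part of the patch), the bug being fixed here is ordinary C string-literal concatenation: adjacent literals are joined with no separator, so a format string wrapped across lines without a trailing space runs two words together. A minimal standalone example:

#include <stdio.h>

int main(void)
{
	/* Missing space: prints "inode 1, logicalblock 42" */
	printf("inode %lu, logical"
	       "block %llu\n", 1UL, 42ULL);

	/* Space restored: prints "inode 1, logical block 42" */
	printf("inode %lu, logical "
	       "block %llu\n", 1UL, 42ULL);
	return 0;
}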
Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 4 ++-- fs/ext4/inode.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 5684f251092..b35bb40556d 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3456,8 +3456,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, int err = 0; ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; - ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical" - "block %llu, max_blocks %u, flags %d, allocated %u", + ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical " + "block %llu, max_blocks %u, flags %x, allocated %u\n", inode->i_ino, (unsigned long long)map->m_lblk, map->m_len, flags, allocated); ext4_ext_show_leaf(inode, path); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 1254934de69..ef9e8fdddfb 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2760,7 +2760,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, if (!io_end || !size) goto out; - ext_debug("ext4_end_io_dio(): io_end 0x%p" + ext_debug("ext4_end_io_dio(): io_end 0x%p " "for inode %lu, iocb 0x%p, offset %llu, size %llu\n", iocb->private, io_end->inode->i_ino, iocb, offset, size); -- cgit v1.2.3 From 14c83c9fddf2e75bdd0c20f1072f35260e356484 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 28 Dec 2011 20:25:13 -0500 Subject: ext4: avoid counting the number of free inodes twice in find_group_orlov() Signed-off-by: "Theodore Ts'o" --- fs/ext4/ialloc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 8fb6844f973..cdafc05d79c 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -358,7 +358,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_group_t real_ngroups = ext4_get_groups_count(sb); int inodes_per_group = EXT4_INODES_PER_GROUP(sb); - unsigned int freei, avefreei; + unsigned int freei, avefreei, grp_free; ext4_fsblk_t freeb, avefreec; unsigned int ndirs; int max_dirs, min_inodes; @@ -477,8 +477,8 @@ fallback_retry: for (i = 0; i < ngroups; i++) { grp = (parent_group + i) % ngroups; desc = ext4_get_group_desc(sb, grp, NULL); - if (desc && ext4_free_inodes_count(sb, desc) && - ext4_free_inodes_count(sb, desc) >= avefreei) { + grp_free = ext4_free_inodes_count(sb, desc); + if (desc && grp_free && grp_free >= avefreei) { *group = grp; return 0; } -- cgit v1.2.3 From ccb4d7af914e0fe9b2f1022f8ea6c300463fd5e6 Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Wed, 28 Dec 2011 20:25:40 -0500 Subject: ext4: remove no longer used functions in inode.c The functions ext4_block_truncate_page() and ext4_block_zero_page_range() are no longer used, so remove them. 
Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 4 -- fs/ext4/inode.c | 120 -------------------------------------------------------- 2 files changed, 124 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 5b0e26a1272..ae2407f4502 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1880,10 +1880,6 @@ extern int ext4_alloc_da_blocks(struct inode *inode); extern void ext4_set_aops(struct inode *inode); extern int ext4_writepage_trans_blocks(struct inode *); extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); -extern int ext4_block_truncate_page(handle_t *handle, - struct address_space *mapping, loff_t from); -extern int ext4_block_zero_page_range(handle_t *handle, - struct address_space *mapping, loff_t from, loff_t length); extern int ext4_discard_partial_page_buffers(handle_t *handle, struct address_space *mapping, loff_t from, loff_t length, int flags); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index ef9e8fdddfb..e6cc24dfa98 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3301,126 +3301,6 @@ next: return err; } -/* - * ext4_block_truncate_page() zeroes out a mapping from file offset `from' - * up to the end of the block which corresponds to `from'. - * This required during truncate. We need to physically zero the tail end - * of that block so it doesn't yield old data if the file is later grown. - */ -int ext4_block_truncate_page(handle_t *handle, - struct address_space *mapping, loff_t from) -{ - unsigned offset = from & (PAGE_CACHE_SIZE-1); - unsigned length; - unsigned blocksize; - struct inode *inode = mapping->host; - - blocksize = inode->i_sb->s_blocksize; - length = blocksize - (offset & (blocksize - 1)); - - return ext4_block_zero_page_range(handle, mapping, from, length); -} - -/* - * ext4_block_zero_page_range() zeros out a mapping of length 'length' - * starting from file offset 'from'. The range to be zero'd must - * be contained with in one block. If the specified range exceeds - * the end of the block it will be shortened to end of the block - * that cooresponds to 'from' - */ -int ext4_block_zero_page_range(handle_t *handle, - struct address_space *mapping, loff_t from, loff_t length) -{ - ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; - unsigned offset = from & (PAGE_CACHE_SIZE-1); - unsigned blocksize, max, pos; - ext4_lblk_t iblock; - struct inode *inode = mapping->host; - struct buffer_head *bh; - struct page *page; - int err = 0; - - page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT, - mapping_gfp_mask(mapping) & ~__GFP_FS); - if (!page) - return -ENOMEM; - - blocksize = inode->i_sb->s_blocksize; - max = blocksize - (offset & (blocksize - 1)); - - /* - * correct length if it does not fall between - * 'from' and the end of the block - */ - if (length > max || length < 0) - length = max; - - iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); - - if (!page_has_buffers(page)) - create_empty_buffers(page, blocksize, 0); - - /* Find the buffer that contains "offset" */ - bh = page_buffers(page); - pos = blocksize; - while (offset >= pos) { - bh = bh->b_this_page; - iblock++; - pos += blocksize; - } - - err = 0; - if (buffer_freed(bh)) { - BUFFER_TRACE(bh, "freed: skip"); - goto unlock; - } - - if (!buffer_mapped(bh)) { - BUFFER_TRACE(bh, "unmapped"); - ext4_get_block(inode, iblock, bh, 0); - /* unmapped? It's a hole - nothing to do */ - if (!buffer_mapped(bh)) { - BUFFER_TRACE(bh, "still unmapped"); - goto unlock; - } - } - - /* Ok, it's mapped. 
Make sure it's up-to-date */ - if (PageUptodate(page)) - set_buffer_uptodate(bh); - - if (!buffer_uptodate(bh)) { - err = -EIO; - ll_rw_block(READ, 1, &bh); - wait_on_buffer(bh); - /* Uhhuh. Read error. Complain and punt. */ - if (!buffer_uptodate(bh)) - goto unlock; - } - - if (ext4_should_journal_data(inode)) { - BUFFER_TRACE(bh, "get write access"); - err = ext4_journal_get_write_access(handle, bh); - if (err) - goto unlock; - } - - zero_user(page, offset, length); - - BUFFER_TRACE(bh, "zeroed end of block"); - - err = 0; - if (ext4_should_journal_data(inode)) { - err = ext4_handle_dirty_metadata(handle, inode, bh); - } else - mark_buffer_dirty(bh); - -unlock: - unlock_page(page); - page_cache_release(page); - return err; -} - int ext4_can_truncate(struct inode *inode) { if (S_ISREG(inode->i_mode)) -- cgit v1.2.3 From 597d508c17a6dcd17770f4dd9da873d93cc15493 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Wed, 28 Dec 2011 20:32:07 -0500 Subject: ext4: use proper little-endian bitops ext4_{set,clear}_bit() is defined as __test_and_{set,clear}_bit_le() for ext4. Only two ext4_{set,clear}_bit() calls check the return value. The rest of calls ignore the return value and they can be replaced with __{set,clear}_bit_le(). This changes ext4_{set,clear}_bit() from __test_and_{set,clear}_bit_le() to __{set,clear}_bit_le() and introduces ext4_test_and_{set,clear}_bit() for the two places where old bit needs to be returned. This ext4_{set,clear}_bit() change is considered safe, because if someone uses these macros without noticing the change, new ext4_{set,clear}_bit don't have return value and causes compiler errors where the return value is used. This also removes unused ext4_find_first_zero_bit(). Signed-off-by: Akinobu Mita Signed-off-by: Andrew Morton Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 7 ++++--- fs/ext4/ialloc.c | 4 ++-- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index ae2407f4502..0e43bba049a 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -957,12 +957,13 @@ struct ext4_inode_info { #define test_opt2(sb, opt) (EXT4_SB(sb)->s_mount_opt2 & \ EXT4_MOUNT2_##opt) -#define ext4_set_bit __test_and_set_bit_le +#define ext4_test_and_set_bit __test_and_set_bit_le +#define ext4_set_bit __set_bit_le #define ext4_set_bit_atomic ext2_set_bit_atomic -#define ext4_clear_bit __test_and_clear_bit_le +#define ext4_test_and_clear_bit __test_and_clear_bit_le +#define ext4_clear_bit __clear_bit_le #define ext4_clear_bit_atomic ext2_clear_bit_atomic #define ext4_test_bit test_bit_le -#define ext4_find_first_zero_bit find_first_zero_bit_le #define ext4_find_next_zero_bit find_next_zero_bit_le #define ext4_find_next_bit find_next_bit_le diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index cdafc05d79c..72fc9892231 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -252,7 +252,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) fatal = ext4_journal_get_write_access(handle, bh2); } ext4_lock_group(sb, block_group); - cleared = ext4_clear_bit(bit, bitmap_bh->b_data); + cleared = ext4_test_and_clear_bit(bit, bitmap_bh->b_data); if (fatal || !cleared) { ext4_unlock_group(sb, block_group); goto out; @@ -618,7 +618,7 @@ static int ext4_claim_inode(struct super_block *sb, */ down_read(&grp->alloc_sem); ext4_lock_group(sb, group); - if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) { + if (ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data)) { /* not a free inode */ retval = 1; goto err_ret; -- cgit v1.2.3 From 
e552a596687bf0e1802c744a7bb113afbd2bf4d4 Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Thu, 29 Dec 2011 03:50:20 +0000 Subject: Squashfs: add missing block release on error condition squashfs_read_metadata forgets to release the cache block if an error has occurred. Signed-off-by: Phillip Lougher --- fs/squashfs/cache.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c index f744be98cd5..ea6e798e548 100644 --- a/fs/squashfs/cache.c +++ b/fs/squashfs/cache.c @@ -332,17 +332,20 @@ int squashfs_read_metadata(struct super_block *sb, void *buffer, u64 *block, int *offset, int length) { struct squashfs_sb_info *msblk = sb->s_fs_info; - int bytes, copied = length; + int bytes, res = length; struct squashfs_cache_entry *entry; TRACE("Entered squashfs_read_metadata [%llx:%x]\n", *block, *offset); while (length) { entry = squashfs_cache_get(sb, msblk->block_cache, *block, 0); - if (entry->error) - return entry->error; - else if (*offset >= entry->length) - return -EIO; + if (entry->error) { + res = entry->error; + goto error; + } else if (*offset >= entry->length) { + res = -EIO; + goto error; + } bytes = squashfs_copy_data(buffer, entry, *offset, length); if (buffer) @@ -358,7 +361,11 @@ int squashfs_read_metadata(struct super_block *sb, void *buffer, squashfs_cache_put(entry); } - return copied; + return res; + +error: + squashfs_cache_put(entry); + return res; } -- cgit v1.2.3 From d7fbd893388d9e86d29b7cfbd5457bcf03496fbf Mon Sep 17 00:00:00 2001 From: Ajeet Yadav Date: Tue, 27 Dec 2011 15:10:04 +0530 Subject: Squashfs: optimise squashfs_cache_get entry search squashfs_cache_get() iterates over all entries to search for block its looking for. Often get() / put() are called for same block. If we cache the current entry index, then we can optimise the subsequent *_get() calls. Signed-off-by: Ajeet Yadav Signed-off-by: Phillip Lougher --- fs/squashfs/cache.c | 11 ++++++++--- fs/squashfs/squashfs_fs_sb.h | 1 + 2 files changed, 9 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c index ea6e798e548..af0b7380259 100644 --- a/fs/squashfs/cache.c +++ b/fs/squashfs/cache.c @@ -70,11 +70,15 @@ struct squashfs_cache_entry *squashfs_cache_get(struct super_block *sb, spin_lock(&cache->lock); while (1) { - for (i = 0; i < cache->entries; i++) - if (cache->entry[i].block == block) + for (i = cache->curr_blk, n = 0; n < cache->entries; n++) { + if (cache->entry[i].block == block) { + cache->curr_blk = i; break; + } + i = (i + 1) % cache->entries; + } - if (i == cache->entries) { + if (n == cache->entries) { /* * Block not in cache, if all cache entries are used * go to sleep waiting for one to become available. 
@@ -245,6 +249,7 @@ struct squashfs_cache *squashfs_cache_init(char *name, int entries, goto cleanup; } + cache->curr_blk = 0; cache->next_blk = 0; cache->unused = entries; cache->entries = entries; diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h index 651f0b31d29..52934a22f29 100644 --- a/fs/squashfs/squashfs_fs_sb.h +++ b/fs/squashfs/squashfs_fs_sb.h @@ -28,6 +28,7 @@ struct squashfs_cache { char *name; int entries; + int curr_blk; int next_blk; int num_waiters; int unused; -- cgit v1.2.3 From e97a5d893fdf45c20799b72a1c11dca3b282c89c Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Sat, 31 Dec 2011 16:54:46 -0200 Subject: [media] fs/compat_ioctl: it needs to see the DVBv3 compat stuff MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Only the ioctl core should see the DVBv3 compat stuff, as its contents are not available anymore to the drivers. As fs/compat_ioctl also handles DVBv3 ioctl's, it needs those definitions: fs/compat_ioctl.c:1345: error: invalid application of ‘sizeof’ to incomplete type ‘struct dvb_frontend_parameters’ fs/compat_ioctl.c:1345: error: array type has incomplete element type fs/compat_ioctl.c:1345: error: invalid application of ‘sizeof’ to incomplete type ‘struct dvb_frontend_parameters’ fs/compat_ioctl.c:1345: error: invalid application of ‘sizeof’ to incomplete type ‘struct dvb_frontend_parameters’ fs/compat_ioctl.c:1345: error: invalid application of ‘sizeof’ to incomplete type ‘struct dvb_frontend_parameters’ fs/compat_ioctl.c:1345: error: array type has incomplete element type fs/compat_ioctl.c:1345: error: invalid application of ‘sizeof’ to incomplete type ‘struct dvb_frontend_parameters’ fs/compat_ioctl.c:1345: error: invalid application of ‘sizeof’ to incomplete type ‘struct dvb_frontend_parameters’ fs/compat_ioctl.c:1345: error: invalid application of ‘sizeof’ to incomplete type ‘struct dvb_frontend_parameters’ fs/compat_ioctl.c:1345: error: array type has incomplete element type fs/compat_ioctl.c:1345: error: invalid application of ‘sizeof’ to incomplete type ‘struct dvb_frontend_parameters’ fs/compat_ioctl.c:1345: error: invalid application of ‘sizeof’ to incomplete type ‘struct dvb_frontend_parameters’ fs/compat_ioctl.c:1345: error: initializer element is not constant fs/compat_ioctl.c:1345: error: (near initialization for ‘ioctl_pointer[462]’) fs/compat_ioctl.c:1346: error: invalid application of ‘sizeof’ to incomplete type ‘struct dvb_frontend_parameters’ fs/compat_ioctl.c:1346: error: array type has incomplete element type fs/compat_ioctl.c:1346: error: invalid application of ‘sizeof’ to incomplete type ‘struct dvb_frontend_parameters’ fs/compat_ioctl.c:1346: error: invalid application of ‘sizeof’ to incomplete type ‘struct dvb_frontend_parameters’ fs/compat_ioctl.c:1346: error: invalid application of ‘sizeof’ to incomplete type ‘struct dvb_frontend_parameters’ fs/compat_ioctl.c:1346: error: array type has incomplete element type fs/compat_ioctl.c:1346: error: invalid application of ‘sizeof’ to incomplete type ‘struct dvb_frontend_parameters’ fs/compat_ioctl.c:1346: error: invalid application of ‘sizeof’ to incomplete type ‘struct dvb_frontend_parameters’ fs/compat_ioctl.c:1346: error: invalid application of ‘sizeof’ to incomplete type ‘struct dvb_frontend_parameters’ fs/compat_ioctl.c:1346: error: array type has incomplete element type fs/compat_ioctl.c:1346: error: invalid application of ‘sizeof’ to incomplete type ‘struct dvb_frontend_parameters’ fs/compat_ioctl.c:1346: 
error: invalid application of ‘sizeof’ to incomplete type ‘struct dvb_frontend_parameters’ fs/compat_ioctl.c:1346: error: initializer element is not constant fs/compat_ioctl.c:1346: error: (near initialization for ‘ioctl_pointer[463]’) fs/compat_ioctl.c:1347: error: invalid application of ‘sizeof’ to incomplete type ‘struct dvb_frontend_event’ fs/compat_ioctl.c:1347: error: array type has incomplete element type fs/compat_ioctl.c:1347: error: invalid application of ‘sizeof’ to incomplete type ‘struct dvb_frontend_event’ fs/compat_ioctl.c:1347: error: invalid application of ‘sizeof’ to incomplete type ‘struct dvb_frontend_event’ fs/compat_ioctl.c:1347: error: invalid application of ‘sizeof’ to incomplete type ‘struct dvb_frontend_event’ fs/compat_ioctl.c:1347: error: array type has incomplete element type fs/compat_ioctl.c:1347: error: invalid application of ‘sizeof’ to incomplete type ‘struct dvb_frontend_event’ fs/compat_ioctl.c:1347: error: invalid application of ‘sizeof’ to incomplete type ‘struct dvb_frontend_event’ fs/compat_ioctl.c:1347: error: invalid application of ‘sizeof’ to incomplete type ‘struct dvb_frontend_event’ fs/compat_ioctl.c:1347: error: array type has incomplete element type fs/compat_ioctl.c:1347: error: invalid application of ‘sizeof’ to incomplete type ‘struct dvb_frontend_event’ fs/compat_ioctl.c:1347: error: invalid application of ‘sizeof’ to incomplete type ‘struct dvb_frontend_event’ fs/compat_ioctl.c:1347: error: initializer element is not constant fs/compat_ioctl.c:1347: error: (near initialization for ‘ioctl_pointer[464]’) Reported-by: Michael Krufky Signed-off-by: Mauro Carvalho Chehab --- fs/compat_ioctl.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 51352de88ef..32200c2ca7f 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -105,6 +105,7 @@ #include +#define __DVB_CORE__ #include #include #include -- cgit v1.2.3 From cc37f75a9ffbbfcb1c3297534f293c8284e3c5a6 Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Mon, 2 Jan 2012 17:47:14 +0000 Subject: Squashfs: fix mount time sanity check for corrupted superblock A Squashfs filesystem containing nothing but an empty directory, although unusual and ultimately pointless, is still valid. The directory_table >= next_table sanity check rejects these filesystems as invalid because the directory_table is empty and equal to next_table. Signed-off-by: Phillip Lougher --- fs/squashfs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 2da1715452a..4619247d74e 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -290,7 +290,7 @@ handle_fragments: check_directory_table: /* Sanity check directory_table */ - if (msblk->directory_table >= next_table) { + if (msblk->directory_table > next_table) { err = -EINVAL; goto failed_mount; } -- cgit v1.2.3 From aec39680b02ed55fcbb2bc87ad96eba16a651546 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 2 Jan 2012 17:30:05 -0500 Subject: nfsd4: fix spurious 4.1 post-reboot failures In the NFSv4.1 case, this could cause a spurious "NFSD: failed to write recovery record (err -17); please check that /var/lib/nfs/v4recovery exists and is writable. Signed-off-by: J. 
Bruce Fields Reported-by: Steve Dickson --- fs/nfsd/nfs4recover.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index ed083b9a731..28712e20bb1 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -144,8 +144,15 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) status = PTR_ERR(dentry); goto out_unlock; } - status = -EEXIST; if (dentry->d_inode) + /* + * In the 4.1 case, where we're called from + * reclaim_complete(), records from the previous reboot + * may still be left, so this is OK. + * + * In the 4.0 case, we should never get here; but we may + * as well be forgiving and just succeed silently. + */ goto out_put; status = mnt_want_write(rec_file->f_path.mnt); if (status) -- cgit v1.2.3 From 3d4a1c80c4eb97187b3a61b3bfa8c804327f7a45 Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Tue, 3 Jan 2012 02:58:13 +0000 Subject: Squashfs: fix i_blocks calculation with extended regular files The le64_to_cpu() forces the calculation to be unsigned, with the effect that it can underflow leading to an incorrect large value. This bug only triggers in rare(ish) circumstances, an empty file encoded as an extended regular file or a completely sparse file. Normally empty files are encoded as a regular file rather than as an extended regular file (and the regular file i_blocks calculation doesn't have this bug). To save space regular file inodes are optimised to encode the most commonly occurring files. Less common regular files are encoded using extended regular file inodes which contain extra information. Empty files with nlinks greater than 1, and or empty files with extended attributes are encoded using extended regular file inodes and they will hit this bug. Signed-off-by: Phillip Lougher --- fs/squashfs/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c index fd7b3b3bda1..81afbccfa84 100644 --- a/fs/squashfs/inode.c +++ b/fs/squashfs/inode.c @@ -208,8 +208,8 @@ int squashfs_read_inode(struct inode *inode, long long ino) inode->i_op = &squashfs_inode_ops; inode->i_fop = &generic_ro_fops; inode->i_mode |= S_IFREG; - inode->i_blocks = ((inode->i_size - - le64_to_cpu(sqsh_ino->sparse) - 1) >> 9) + 1; + inode->i_blocks = (inode->i_size - + le64_to_cpu(sqsh_ino->sparse) + 511) >> 9; squashfs_i(inode)->fragment_block = frag_blk; squashfs_i(inode)->fragment_size = frag_size; -- cgit v1.2.3 From b1c770c273a4787069306fc82aab245e9ac72e9d Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 21 Dec 2011 00:07:42 +0000 Subject: xfs: fix endian conversion issue in discard code When finding the longest extent in an AG, we read the value directly out of the AGF buffer without endian conversion. This will give an incorrect length, resulting in FITRIM operations potentially not trimming everything that it should. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_discard.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index 8a24f0c6c86..286a051f12c 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -68,7 +68,7 @@ xfs_trim_extents( * Look up the longest btree in the AGF and start with it. 
*/ error = xfs_alloc_lookup_le(cur, 0, - XFS_BUF_TO_AGF(agbp)->agf_longest, &i); + be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest), &i); if (error) goto out_del_cursor; @@ -84,7 +84,7 @@ xfs_trim_extents( if (error) goto out_del_cursor; XFS_WANT_CORRUPTED_GOTO(i == 1, out_del_cursor); - ASSERT(flen <= XFS_BUF_TO_AGF(agbp)->agf_longest); + ASSERT(flen <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest)); /* * Too small? Give up. -- cgit v1.2.3 From 18e3143848f1abdd07e7d9879cf67f4e147ff8b7 Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Tue, 3 Jan 2012 23:18:50 -0500 Subject: ext4: add a function which extends a group without checking parameters This patch added a function named ext4_group_extend_no_check() whose code is copied from ext4_group_extend(). ext4_group_extend_no_check() assumes the parameter is valid and has been checked by caller. Signed-off-by: Yongqiang Yang Signed-off-by: "Theodore Ts'o" --- fs/ext4/resize.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) (limited to 'fs') diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 996780ab4f4..1c3c2b56049 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -968,6 +968,57 @@ exit_put: return err; } /* ext4_group_add */ +/* + * extend a group without checking assuming that checking has been done. + */ +static int ext4_group_extend_no_check(struct super_block *sb, + ext4_fsblk_t o_blocks_count, ext4_grpblk_t add) +{ + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + handle_t *handle; + int err = 0, err2; + + /* We will update the superblock, one block bitmap, and + * one group descriptor via ext4_group_add_blocks(). + */ + handle = ext4_journal_start_sb(sb, 3); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + ext4_warning(sb, "error %d on journal start", err); + return err; + } + + err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); + if (err) { + ext4_warning(sb, "error %d on journal write access", err); + goto errout; + } + + ext4_blocks_count_set(es, o_blocks_count + add); + ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, + o_blocks_count + add); + /* We add the blocks to the bitmap and set the group need init bit */ + err = ext4_group_add_blocks(handle, sb, o_blocks_count, add); + if (err) + goto errout; + ext4_handle_dirty_super(handle, sb); + ext4_debug("freed blocks %llu through %llu\n", o_blocks_count, + o_blocks_count + add); +errout: + err2 = ext4_journal_stop(handle); + if (err2 && !err) + err = err2; + + if (!err) { + if (test_opt(sb, DEBUG)) + printk(KERN_DEBUG "EXT4-fs: extended group to %llu " + "blocks\n", ext4_blocks_count(es)); + update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr, (char *)es, + sizeof(struct ext4_super_block)); + } + return err; +} + /* * Extend the filesystem to the new number of blocks specified. This entry * point is only used to extend the current filesystem to the end of the last -- cgit v1.2.3 From bb08c1e7d8c072da338f6d905a89376b36023017 Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Tue, 3 Jan 2012 23:20:50 -0500 Subject: ext4: add a function which adds a new group descriptors to a fs This patch adds a function named ext4_add_new_descs() which adds one or more new group descriptors to a fs and whose code is copied from ext4_group_add(). The function will be used by new resize implementation. 
Signed-off-by: Yongqiang Yang Signed-off-by: "Theodore Ts'o" --- fs/ext4/resize.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) (limited to 'fs') diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 1c3c2b56049..3bb4e7b502e 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -735,6 +735,52 @@ exit_err: } } +/* + * ext4_add_new_descs() adds @count group descriptor of groups + * starting at @group + * + * @handle: journal handle + * @sb: super block + * @group: the group no. of the first group desc to be added + * @resize_inode: the resize inode + * @count: number of group descriptors to be added + */ +static int ext4_add_new_descs(handle_t *handle, struct super_block *sb, + ext4_group_t group, struct inode *resize_inode, + ext4_group_t count) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + struct buffer_head *gdb_bh; + int i, gdb_off, gdb_num, err = 0; + + for (i = 0; i < count; i++, group++) { + int reserved_gdb = ext4_bg_has_super(sb, group) ? + le16_to_cpu(es->s_reserved_gdt_blocks) : 0; + + gdb_off = group % EXT4_DESC_PER_BLOCK(sb); + gdb_num = group / EXT4_DESC_PER_BLOCK(sb); + + /* + * We will only either add reserved group blocks to a backup group + * or remove reserved blocks for the first group in a new group block. + * Doing both would be mean more complex code, and sane people don't + * use non-sparse filesystems anymore. This is already checked above. + */ + if (gdb_off) { + gdb_bh = sbi->s_group_desc[gdb_num]; + err = ext4_journal_get_write_access(handle, gdb_bh); + + if (!err && reserved_gdb && ext4_bg_num_gdb(sb, group)) + err = reserve_backup_gdb(handle, resize_inode, group); + } else + err = add_new_gdb(handle, resize_inode, group); + if (err) + break; + } + return err; +} + /* Add group descriptor data to an existing or new group descriptor block. * Ensure we handle all possible error conditions _before_ we start modifying * the filesystem, because we cannot abort the transaction and not have it -- cgit v1.2.3 From 28c7bac0091687e6116ebd6c179e154ae4053c90 Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Tue, 3 Jan 2012 23:22:50 -0500 Subject: ext4: add a structure which will be used by 64bit-resize interface This patch adds a structure which will be used by 64bit-resize interface. Two functions which allocate and destroy the structure respectively are added. Signed-off-by: Yongqiang Yang Signed-off-by: "Theodore Ts'o" --- fs/ext4/resize.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) (limited to 'fs') diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 3bb4e7b502e..6076d5e4b51 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -134,6 +134,61 @@ static int verify_group_input(struct super_block *sb, return err; } +/* + * ext4_new_flex_group_data is used by 64bit-resize interface to add a flex + * group each time. + */ +struct ext4_new_flex_group_data { + struct ext4_new_group_data *groups; /* new_group_data for groups + in the flex group */ + __u16 *bg_flags; /* block group flags of groups + in @groups */ + ext4_group_t count; /* number of groups in @groups + */ +}; + +/* + * alloc_flex_gd() allocates a ext4_new_flex_group_data with size of + * @flexbg_size. + * + * Returns NULL on failure otherwise address of the allocated structure. 
+ */ +static struct ext4_new_flex_group_data *alloc_flex_gd(unsigned long flexbg_size) +{ + struct ext4_new_flex_group_data *flex_gd; + + flex_gd = kmalloc(sizeof(*flex_gd), GFP_NOFS); + if (flex_gd == NULL) + goto out3; + + flex_gd->count = flexbg_size; + + flex_gd->groups = kmalloc(sizeof(struct ext4_new_group_data) * + flexbg_size, GFP_NOFS); + if (flex_gd->groups == NULL) + goto out2; + + flex_gd->bg_flags = kmalloc(flexbg_size * sizeof(__u16), GFP_NOFS); + if (flex_gd->bg_flags == NULL) + goto out1; + + return flex_gd; + +out1: + kfree(flex_gd->groups); +out2: + kfree(flex_gd); +out3: + return NULL; +} + +static void free_flex_gd(struct ext4_new_flex_group_data *flex_gd) +{ + kfree(flex_gd->bg_flags); + kfree(flex_gd->groups); + kfree(flex_gd); +} + static struct buffer_head *bclean(handle_t *handle, struct super_block *sb, ext4_fsblk_t blk) { -- cgit v1.2.3 From 33afdcc5402d0abf70ef2dfb96d0b901d20bcc37 Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Tue, 3 Jan 2012 23:32:52 -0500 Subject: ext4: add a function which sets up group blocks of a flex bg This patch adds a function named setup_new_flex_group_blocks() which sets up group blocks of a flex bg. Signed-off-by: Yongqiang Yang Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 8 ++ fs/ext4/resize.c | 250 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 258 insertions(+) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 0e43bba049a..05058e2b7f4 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -511,6 +511,14 @@ struct ext4_new_group_data { __u32 free_blocks_count; }; +/* Indexes used to index group tables in ext4_new_group_data */ +enum { + BLOCK_BITMAP = 0, /* block bitmap */ + INODE_BITMAP, /* inode bitmap */ + INODE_TABLE, /* inode tables */ + GROUP_TABLE_COUNT, +}; + /* * Flags used by ext4_map_blocks() */ diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 6076d5e4b51..e8ccb2f8f45 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -233,6 +233,256 @@ static int extend_or_restart_transaction(handle_t *handle, int thresh) return 0; } +/* + * set_flexbg_block_bitmap() mark @count blocks starting from @block used. + * + * Helper function for ext4_setup_new_group_blocks() which set . 
+ * + * @sb: super block + * @handle: journal handle + * @flex_gd: flex group data + */ +static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle, + struct ext4_new_flex_group_data *flex_gd, + ext4_fsblk_t block, ext4_group_t count) +{ + ext4_group_t count2; + + ext4_debug("mark blocks [%llu/%u] used\n", block, count); + for (count2 = count; count > 0; count -= count2, block += count2) { + ext4_fsblk_t start; + struct buffer_head *bh; + ext4_group_t group; + int err; + + ext4_get_group_no_and_offset(sb, block, &group, NULL); + start = ext4_group_first_block_no(sb, group); + group -= flex_gd->groups[0].group; + + count2 = sb->s_blocksize * 8 - (block - start); + if (count2 > count) + count2 = count; + + if (flex_gd->bg_flags[group] & EXT4_BG_BLOCK_UNINIT) { + BUG_ON(flex_gd->count > 1); + continue; + } + + err = extend_or_restart_transaction(handle, 1); + if (err) + return err; + + bh = sb_getblk(sb, flex_gd->groups[group].block_bitmap); + if (!bh) + return -EIO; + + err = ext4_journal_get_write_access(handle, bh); + if (err) + return err; + ext4_debug("mark block bitmap %#04llx (+%llu/%u)\n", block, + block - start, count2); + ext4_set_bits(bh->b_data, block - start, count2); + + err = ext4_handle_dirty_metadata(handle, NULL, bh); + if (unlikely(err)) + return err; + brelse(bh); + } + + return 0; +} + +/* + * Set up the block and inode bitmaps, and the inode table for the new groups. + * This doesn't need to be part of the main transaction, since we are only + * changing blocks outside the actual filesystem. We still do journaling to + * ensure the recovery is correct in case of a failure just after resize. + * If any part of this fails, we simply abort the resize. + * + * setup_new_flex_group_blocks handles a flex group as follow: + * 1. copy super block and GDT, and initialize group tables if necessary. + * In this step, we only set bits in blocks bitmaps for blocks taken by + * super block and GDT. + * 2. allocate group tables in block bitmaps, that is, set bits in block + * bitmap for blocks taken by group tables. 
+ */ +static int setup_new_flex_group_blocks(struct super_block *sb, + struct ext4_new_flex_group_data *flex_gd) +{ + int group_table_count[] = {1, 1, EXT4_SB(sb)->s_itb_per_group}; + ext4_fsblk_t start; + ext4_fsblk_t block; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + struct ext4_new_group_data *group_data = flex_gd->groups; + __u16 *bg_flags = flex_gd->bg_flags; + handle_t *handle; + ext4_group_t group, count; + struct buffer_head *bh = NULL; + int reserved_gdb, i, j, err = 0, err2; + + BUG_ON(!flex_gd->count || !group_data || + group_data[0].group != sbi->s_groups_count); + + reserved_gdb = le16_to_cpu(es->s_reserved_gdt_blocks); + + /* This transaction may be extended/restarted along the way */ + handle = ext4_journal_start_sb(sb, EXT4_MAX_TRANS_DATA); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + group = group_data[0].group; + for (i = 0; i < flex_gd->count; i++, group++) { + unsigned long gdblocks; + + gdblocks = ext4_bg_num_gdb(sb, group); + start = ext4_group_first_block_no(sb, group); + + /* Copy all of the GDT blocks into the backup in this group */ + for (j = 0, block = start + 1; j < gdblocks; j++, block++) { + struct buffer_head *gdb; + + ext4_debug("update backup group %#04llx\n", block); + err = extend_or_restart_transaction(handle, 1); + if (err) + goto out; + + gdb = sb_getblk(sb, block); + if (!gdb) { + err = -EIO; + goto out; + } + + err = ext4_journal_get_write_access(handle, gdb); + if (err) { + brelse(gdb); + goto out; + } + memcpy(gdb->b_data, sbi->s_group_desc[j]->b_data, + gdb->b_size); + set_buffer_uptodate(gdb); + + err = ext4_handle_dirty_metadata(handle, NULL, gdb); + if (unlikely(err)) { + brelse(gdb); + goto out; + } + brelse(gdb); + } + + /* Zero out all of the reserved backup group descriptor + * table blocks + */ + if (ext4_bg_has_super(sb, group)) { + err = sb_issue_zeroout(sb, gdblocks + start + 1, + reserved_gdb, GFP_NOFS); + if (err) + goto out; + } + + /* Initialize group tables of the grop @group */ + if (!(bg_flags[i] & EXT4_BG_INODE_ZEROED)) + goto handle_bb; + + /* Zero out all of the inode table blocks */ + block = group_data[i].inode_table; + ext4_debug("clear inode table blocks %#04llx -> %#04lx\n", + block, sbi->s_itb_per_group); + err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, + GFP_NOFS); + if (err) + goto out; + +handle_bb: + if (bg_flags[i] & EXT4_BG_BLOCK_UNINIT) + goto handle_ib; + + /* Initialize block bitmap of the @group */ + block = group_data[i].block_bitmap; + err = extend_or_restart_transaction(handle, 1); + if (err) + goto out; + + bh = bclean(handle, sb, block); + if (IS_ERR(bh)) { + err = PTR_ERR(bh); + goto out; + } + if (ext4_bg_has_super(sb, group)) { + ext4_debug("mark backup superblock %#04llx (+0)\n", + start); + ext4_set_bits(bh->b_data, 0, gdblocks + reserved_gdb + + 1); + } + ext4_mark_bitmap_end(group_data[i].blocks_count, + sb->s_blocksize * 8, bh->b_data); + err = ext4_handle_dirty_metadata(handle, NULL, bh); + if (err) + goto out; + brelse(bh); + +handle_ib: + if (bg_flags[i] & EXT4_BG_INODE_UNINIT) + continue; + + /* Initialize inode bitmap of the @group */ + block = group_data[i].inode_bitmap; + err = extend_or_restart_transaction(handle, 1); + if (err) + goto out; + /* Mark unused entries in inode bitmap used */ + bh = bclean(handle, sb, block); + if (IS_ERR(bh)) { + err = PTR_ERR(bh); + goto out; + } + + ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), + sb->s_blocksize * 8, bh->b_data); + err = ext4_handle_dirty_metadata(handle, NULL, bh); + if (err) + 
goto out; + brelse(bh); + } + bh = NULL; + + /* Mark group tables in block bitmap */ + for (j = 0; j < GROUP_TABLE_COUNT; j++) { + count = group_table_count[j]; + start = (&group_data[0].block_bitmap)[j]; + block = start; + for (i = 1; i < flex_gd->count; i++) { + block += group_table_count[j]; + if (block == (&group_data[i].block_bitmap)[j]) { + count += group_table_count[j]; + continue; + } + err = set_flexbg_block_bitmap(sb, handle, + flex_gd, start, count); + if (err) + goto out; + count = group_table_count[j]; + start = group_data[i].block_bitmap; + block = start; + } + + if (count) { + err = set_flexbg_block_bitmap(sb, handle, + flex_gd, start, count); + if (err) + goto out; + } + } + +out: + brelse(bh); + err2 = ext4_journal_stop(handle); + if (err2 && !err) + err = err2; + + return err; +} + /* * Set up the block and inode bitmaps, and the inode table for the new group. * This doesn't need to be part of the main transaction, since we are only -- cgit v1.2.3 From 083f5b24cc55448e0602a807a5c2872e1f3796e2 Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Tue, 3 Jan 2012 23:37:31 -0500 Subject: ext4: add a function which sets up a block group descriptors of a flex bg This patch adds a function named ext4_setup_new_descs which sets up the block group descriptors of a flex bg. Signed-off-by: Yongqiang Yang Signed-off-by: "Theodore Ts'o" --- fs/ext4/resize.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) (limited to 'fs') diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index e8ccb2f8f45..098bdb8e97c 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1086,6 +1086,62 @@ static int ext4_add_new_descs(handle_t *handle, struct super_block *sb, return err; } +/* + * ext4_setup_new_descs() will set up the group descriptor descriptors of a flex bg + */ +static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb, + struct ext4_new_flex_group_data *flex_gd) +{ + struct ext4_new_group_data *group_data = flex_gd->groups; + struct ext4_group_desc *gdp; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct buffer_head *gdb_bh; + ext4_group_t group; + __u16 *bg_flags = flex_gd->bg_flags; + int i, gdb_off, gdb_num, err = 0; + + + for (i = 0; i < flex_gd->count; i++, group_data++, bg_flags++) { + group = group_data->group; + + gdb_off = group % EXT4_DESC_PER_BLOCK(sb); + gdb_num = group / EXT4_DESC_PER_BLOCK(sb); + + /* + * get_write_access() has been called on gdb_bh by ext4_add_new_desc(). + */ + gdb_bh = sbi->s_group_desc[gdb_num]; + /* Update group descriptor block for new group */ + gdp = (struct ext4_group_desc *)((char *)gdb_bh->b_data + + gdb_off * EXT4_DESC_SIZE(sb)); + + memset(gdp, 0, EXT4_DESC_SIZE(sb)); + ext4_block_bitmap_set(sb, gdp, group_data->block_bitmap); + ext4_inode_bitmap_set(sb, gdp, group_data->inode_bitmap); + ext4_inode_table_set(sb, gdp, group_data->inode_table); + ext4_free_group_clusters_set(sb, gdp, + EXT4_B2C(sbi, group_data->free_blocks_count)); + ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb)); + gdp->bg_flags = cpu_to_le16(*bg_flags); + gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); + + err = ext4_handle_dirty_metadata(handle, NULL, gdb_bh); + if (unlikely(err)) { + ext4_std_error(sb, err); + break; + } + + /* + * We can allocate memory for mb_alloc based on the new group + * descriptor + */ + err = ext4_mb_add_groupinfo(sb, group, gdp); + if (err) + break; + } + return err; +} + /* Add group descriptor data to an existing or new group descriptor block. 
* Ensure we handle all possible error conditions _before_ we start modifying * the filesystem, because we cannot abort the transaction and not have it -- cgit v1.2.3 From 2e10e2f2e5a800a54ad2f16dfdd8c034e005958b Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Tue, 3 Jan 2012 23:41:39 -0500 Subject: ext4: add a function which updates the super block during online resizing This patch adds a function named ext4_update_super() which updates super block so the newly created block groups are visible to the file system. This code is copied from ext4_group_add(). The function will be used by new resize implementation. Signed-off-by: Yongqiang Yang Signed-off-by: "Theodore Ts'o" --- fs/ext4/resize.c | 94 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) (limited to 'fs') diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 098bdb8e97c..eb0aebcca55 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1142,6 +1142,100 @@ static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb, return err; } +/* + * ext4_update_super() updates the super block so that the newly added + * groups can be seen by the filesystem. + * + * @sb: super block + * @flex_gd: new added groups + */ +static void ext4_update_super(struct super_block *sb, + struct ext4_new_flex_group_data *flex_gd) +{ + ext4_fsblk_t blocks_count = 0; + ext4_fsblk_t free_blocks = 0; + ext4_fsblk_t reserved_blocks = 0; + struct ext4_new_group_data *group_data = flex_gd->groups; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + int i; + + BUG_ON(flex_gd->count == 0 || group_data == NULL); + /* + * Make the new blocks and inodes valid next. We do this before + * increasing the group count so that once the group is enabled, + * all of its blocks and inodes are already valid. + * + * We always allocate group-by-group, then block-by-block or + * inode-by-inode within a group, so enabling these + * blocks/inodes before the group is live won't actually let us + * allocate the new space yet. + */ + for (i = 0; i < flex_gd->count; i++) { + blocks_count += group_data[i].blocks_count; + free_blocks += group_data[i].free_blocks_count; + } + + reserved_blocks = ext4_r_blocks_count(es) * 100; + do_div(reserved_blocks, ext4_blocks_count(es)); + reserved_blocks *= blocks_count; + do_div(reserved_blocks, 100); + + ext4_blocks_count_set(es, ext4_blocks_count(es) + blocks_count); + le32_add_cpu(&es->s_inodes_count, EXT4_INODES_PER_GROUP(sb) * + flex_gd->count); + + /* + * We need to protect s_groups_count against other CPUs seeing + * inconsistent state in the superblock. + * + * The precise rules we use are: + * + * * Writers must perform a smp_wmb() after updating all + * dependent data and before modifying the groups count + * + * * Readers must perform an smp_rmb() after reading the groups + * count and before reading any dependent data. + * + * NB. These rules can be relaxed when checking the group count + * while freeing data, as we can only allocate from a block + * group after serialising against the group count, and we can + * only then free after serialising in turn against that + * allocation. + */ + smp_wmb(); + + /* Update the global fs size fields */ + sbi->s_groups_count += flex_gd->count; + + /* Update the reserved block counts only once the new group is + * active. 
*/ + ext4_r_blocks_count_set(es, ext4_r_blocks_count(es) + + reserved_blocks); + + /* Update the free space counts */ + percpu_counter_add(&sbi->s_freeclusters_counter, + EXT4_B2C(sbi, free_blocks)); + percpu_counter_add(&sbi->s_freeinodes_counter, + EXT4_INODES_PER_GROUP(sb) * flex_gd->count); + + if (EXT4_HAS_INCOMPAT_FEATURE(sb, + EXT4_FEATURE_INCOMPAT_FLEX_BG) && + sbi->s_log_groups_per_flex) { + ext4_group_t flex_group; + flex_group = ext4_flex_group(sbi, group_data[0].group); + atomic_add(EXT4_B2C(sbi, free_blocks), + &sbi->s_flex_groups[flex_group].free_clusters); + atomic_add(EXT4_INODES_PER_GROUP(sb) * flex_gd->count, + &sbi->s_flex_groups[flex_group].free_inodes); + } + + if (test_opt(sb, DEBUG)) + printk(KERN_DEBUG "EXT4-fs: added group %u:" + "%llu blocks(%llu free %llu reserved)\n", flex_gd->count, + blocks_count, free_blocks, reserved_blocks); +} + /* Add group descriptor data to an existing or new group descriptor block. * Ensure we handle all possible error conditions _before_ we start modifying * the filesystem, because we cannot abort the transaction and not have it -- cgit v1.2.3 From c72df9f928efd5b17e84bdb7b8ec1be3b9c1ea9d Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Tue, 3 Jan 2012 23:43:39 -0500 Subject: ext4: pass verify_reserved_gdb() the number of group decriptors The 64bit resizer adds a flex group each time, so verify_reserved_gdb can not use s_groups_count directly, it should use the number of group decriptors before the added group. Signed-off-by: Yongqiang Yang Signed-off-by: "Theodore Ts'o" --- fs/ext4/resize.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index eb0aebcca55..12eace09654 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -656,10 +656,10 @@ static unsigned ext4_list_backups(struct super_block *sb, unsigned *three, * groups in current filesystem that have BACKUPS, or -ve error code. */ static int verify_reserved_gdb(struct super_block *sb, + ext4_group_t end, struct buffer_head *primary) { const ext4_fsblk_t blk = primary->b_blocknr; - const ext4_group_t end = EXT4_SB(sb)->s_groups_count; unsigned three = 1; unsigned five = 5; unsigned seven = 7; @@ -734,7 +734,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, if (!gdb_bh) return -EIO; - gdbackups = verify_reserved_gdb(sb, gdb_bh); + gdbackups = verify_reserved_gdb(sb, group, gdb_bh); if (gdbackups < 0) { err = gdbackups; goto exit_bh; @@ -897,7 +897,8 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, err = -EIO; goto exit_bh; } - if ((gdbackups = verify_reserved_gdb(sb, primary[res])) < 0) { + gdbackups = verify_reserved_gdb(sb, group, primary[res]); + if (gdbackups < 0) { brelse(primary[res]); err = gdbackups; goto exit_bh; -- cgit v1.2.3 From 3fbea4b3683a5dfa86489ef7799cbe55e8003dfa Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Tue, 3 Jan 2012 23:44:38 -0500 Subject: ext4: add a new function which allocates bitmaps and inode tables This patch adds a new function named ext4_allocates_group_table() which allocates block bitmaps, inode bitmaps and inode tables for a flex groups and is used by resize code. 
Signed-off-by: Yongqiang Yang Signed-off-by: "Theodore Ts'o" --- fs/ext4/resize.c | 111 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 111 insertions(+) (limited to 'fs') diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 12eace09654..a4075de73c7 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -189,6 +189,117 @@ static void free_flex_gd(struct ext4_new_flex_group_data *flex_gd) kfree(flex_gd); } +/* + * ext4_alloc_group_tables() allocates block bitmaps, inode bitmaps + * and inode tables for a flex group. + * + * This function is used by 64bit-resize. Note that this function allocates + * group tables from the 1st group of groups contained by @flexgd, which may + * be a partial of a flex group. + * + * @sb: super block of fs to which the groups belongs + */ +static void ext4_alloc_group_tables(struct super_block *sb, + struct ext4_new_flex_group_data *flex_gd, + int flexbg_size) +{ + struct ext4_new_group_data *group_data = flex_gd->groups; + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + ext4_fsblk_t start_blk; + ext4_fsblk_t last_blk; + ext4_group_t src_group; + ext4_group_t bb_index = 0; + ext4_group_t ib_index = 0; + ext4_group_t it_index = 0; + ext4_group_t group; + ext4_group_t last_group; + unsigned overhead; + + BUG_ON(flex_gd->count == 0 || group_data == NULL); + + src_group = group_data[0].group; + last_group = src_group + flex_gd->count - 1; + + BUG_ON((flexbg_size > 1) && ((src_group & ~(flexbg_size - 1)) != + (last_group & ~(flexbg_size - 1)))); +next_group: + group = group_data[0].group; + start_blk = ext4_group_first_block_no(sb, src_group); + last_blk = start_blk + group_data[src_group - group].blocks_count; + + overhead = ext4_bg_has_super(sb, src_group) ? + (1 + ext4_bg_num_gdb(sb, src_group) + + le16_to_cpu(es->s_reserved_gdt_blocks)) : 0; + + start_blk += overhead; + + BUG_ON(src_group >= group_data[0].group + flex_gd->count); + /* We collect contiguous blocks as much as possible. 
*/ + src_group++; + for (; src_group <= last_group; src_group++) + if (!ext4_bg_has_super(sb, src_group)) + last_blk += group_data[src_group - group].blocks_count; + else + break; + + /* Allocate block bitmaps */ + for (; bb_index < flex_gd->count; bb_index++) { + if (start_blk >= last_blk) + goto next_group; + group_data[bb_index].block_bitmap = start_blk++; + ext4_get_group_no_and_offset(sb, start_blk - 1, &group, NULL); + group -= group_data[0].group; + group_data[group].free_blocks_count--; + if (flexbg_size > 1) + flex_gd->bg_flags[group] &= ~EXT4_BG_BLOCK_UNINIT; + } + + /* Allocate inode bitmaps */ + for (; ib_index < flex_gd->count; ib_index++) { + if (start_blk >= last_blk) + goto next_group; + group_data[ib_index].inode_bitmap = start_blk++; + ext4_get_group_no_and_offset(sb, start_blk - 1, &group, NULL); + group -= group_data[0].group; + group_data[group].free_blocks_count--; + if (flexbg_size > 1) + flex_gd->bg_flags[group] &= ~EXT4_BG_BLOCK_UNINIT; + } + + /* Allocate inode tables */ + for (; it_index < flex_gd->count; it_index++) { + if (start_blk + EXT4_SB(sb)->s_itb_per_group > last_blk) + goto next_group; + group_data[it_index].inode_table = start_blk; + ext4_get_group_no_and_offset(sb, start_blk, &group, NULL); + group -= group_data[0].group; + group_data[group].free_blocks_count -= + EXT4_SB(sb)->s_itb_per_group; + if (flexbg_size > 1) + flex_gd->bg_flags[group] &= ~EXT4_BG_BLOCK_UNINIT; + + start_blk += EXT4_SB(sb)->s_itb_per_group; + } + + if (test_opt(sb, DEBUG)) { + int i; + group = group_data[0].group; + + printk(KERN_DEBUG "EXT4-fs: adding a flex group with " + "%d groups, flexbg size is %d:\n", flex_gd->count, + flexbg_size); + + for (i = 0; i < flex_gd->count; i++) { + printk(KERN_DEBUG "adding %s group %u: %u " + "blocks (%d free)\n", + ext4_bg_has_super(sb, group + i) ? "normal" : + "no-super", group + i, + group_data[i].blocks_count, + group_data[i].free_blocks_count); + } + } +} + static struct buffer_head *bclean(handle_t *handle, struct super_block *sb, ext4_fsblk_t blk) { -- cgit v1.2.3 From 4bac1f8cef7bfd2c62793f75aba66a5b8357dede Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Tue, 3 Jan 2012 23:44:38 -0500 Subject: ext4: add a new function which adds a flex group to a fs This patch adds a new function named ext4_flex_group_add() which adds a flex group to a fs. The function is used by 64bit-resize interface. Signed-off-by: Yongqiang Yang Signed-off-by: "Theodore Ts'o" --- fs/ext4/resize.c | 82 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) (limited to 'fs') diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index a4075de73c7..dac23561f3e 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1348,6 +1348,88 @@ static void ext4_update_super(struct super_block *sb, blocks_count, free_blocks, reserved_blocks); } +/* Add a flex group to an fs. Ensure we handle all possible error conditions + * _before_ we start modifying the filesystem, because we cannot abort the + * transaction and not have it write the data to disk. 
+ */ +static int ext4_flex_group_add(struct super_block *sb, + struct inode *resize_inode, + struct ext4_new_flex_group_data *flex_gd) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + ext4_fsblk_t o_blocks_count; + ext4_grpblk_t last; + ext4_group_t group; + handle_t *handle; + unsigned reserved_gdb; + int err = 0, err2 = 0, credit; + + BUG_ON(!flex_gd->count || !flex_gd->groups || !flex_gd->bg_flags); + + reserved_gdb = le16_to_cpu(es->s_reserved_gdt_blocks); + o_blocks_count = ext4_blocks_count(es); + ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last); + BUG_ON(last); + + err = setup_new_flex_group_blocks(sb, flex_gd); + if (err) + goto exit; + /* + * We will always be modifying at least the superblock and GDT + * block. If we are adding a group past the last current GDT block, + * we will also modify the inode and the dindirect block. If we + * are adding a group with superblock/GDT backups we will also + * modify each of the reserved GDT dindirect blocks. + */ + credit = flex_gd->count * 4 + reserved_gdb; + handle = ext4_journal_start_sb(sb, credit); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto exit; + } + + err = ext4_journal_get_write_access(handle, sbi->s_sbh); + if (err) + goto exit_journal; + + group = flex_gd->groups[0].group; + BUG_ON(group != EXT4_SB(sb)->s_groups_count); + err = ext4_add_new_descs(handle, sb, group, + resize_inode, flex_gd->count); + if (err) + goto exit_journal; + + err = ext4_setup_new_descs(handle, sb, flex_gd); + if (err) + goto exit_journal; + + ext4_update_super(sb, flex_gd); + + err = ext4_handle_dirty_super(handle, sb); + +exit_journal: + err2 = ext4_journal_stop(handle); + if (!err) + err = err2; + + if (!err) { + int i; + update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es, + sizeof(struct ext4_super_block)); + for (i = 0; i < flex_gd->count; i++, group++) { + struct buffer_head *gdb_bh; + int gdb_num; + gdb_num = group / EXT4_BLOCKS_PER_GROUP(sb); + gdb_bh = sbi->s_group_desc[gdb_num]; + update_backups(sb, gdb_bh->b_blocknr, gdb_bh->b_data, + gdb_bh->b_size); + } + } +exit: + return err; +} + /* Add group descriptor data to an existing or new group descriptor block. * Ensure we handle all possible error conditions _before_ we start modifying * the filesystem, because we cannot abort the transaction and not have it -- cgit v1.2.3 From f95a34c66554235b70a681fcd9feebc195f7ec0e Mon Sep 17 00:00:00 2001 From: David Teigland Date: Fri, 14 Oct 2011 12:34:58 -0500 Subject: dlm: move recovery barrier calls Put all the calls to recovery barriers in the same function to clarify where they each happen. Should not change any behavior. Also modify some recovery debug lines to make them consistent. 
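A condensed illustration of where the barriers end up after this change (sketch only; the full ls_recover() in the diff below interleaves more recovery steps, timing and logging): each step now sets its recovery status and waits for the other nodes right next to it, instead of hiding the status update inside the step itself.

static int ls_recover_barrier_sketch(struct dlm_ls *ls,
				     struct dlm_recover *rv, int *neg)
{
	int error;

	/* membership change, then the NODES barrier beside it */
	error = dlm_recover_members(ls, rv, neg);
	if (error)
		return error;
	dlm_set_recover_status(ls, DLM_RS_NODES);
	error = dlm_recover_members_wait(ls);
	if (error)
		return error;

	/* directory rebuild, then the DIR barrier beside it */
	error = dlm_recover_directory(ls);
	if (error)
		return error;
	dlm_set_recover_status(ls, DLM_RS_DIR);
	return dlm_recover_directory_wait(ls);
}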
Signed-off-by: David Teigland --- fs/dlm/dir.c | 1 - fs/dlm/member.c | 7 +------ fs/dlm/recover.c | 2 -- fs/dlm/recoverd.c | 45 +++++++++++++++++++++++++++------------------ 4 files changed, 28 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c index 7b84c1dbc82..83641574b01 100644 --- a/fs/dlm/dir.c +++ b/fs/dlm/dir.c @@ -290,7 +290,6 @@ int dlm_recover_directory(struct dlm_ls *ls) out_status: error = 0; - dlm_set_recover_status(ls, DLM_RS_DIR); log_debug(ls, "dlm_recover_directory %d entries", count); out_free: kfree(last_name); diff --git a/fs/dlm/member.c b/fs/dlm/member.c index b12532e553f..5ebd1df6967 100644 --- a/fs/dlm/member.c +++ b/fs/dlm/member.c @@ -251,7 +251,6 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) ls->ls_low_nodeid = low; make_member_array(ls); - dlm_set_recover_status(ls, DLM_RS_NODES); *neg_out = neg; error = ping_members(ls); @@ -261,12 +260,8 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) ls->ls_members_result = error; complete(&ls->ls_members_done); } - if (error) - goto out; - error = dlm_recover_members_wait(ls); - out: - log_debug(ls, "total members %d error %d", ls->ls_num_nodes, error); + log_debug(ls, "dlm_recover_members %d nodes", ls->ls_num_nodes); return error; } diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c index 50467cefdbd..81b23930449 100644 --- a/fs/dlm/recover.c +++ b/fs/dlm/recover.c @@ -542,8 +542,6 @@ int dlm_recover_locks(struct dlm_ls *ls) out: if (error) recover_list_clear(ls); - else - dlm_set_recover_status(ls, DLM_RS_LOCKS); return error; } diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c index 774da3cf92c..5a9e1a49a86 100644 --- a/fs/dlm/recoverd.c +++ b/fs/dlm/recoverd.c @@ -54,7 +54,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) unsigned long start; int error, neg = 0; - log_debug(ls, "recover %llx", (unsigned long long)rv->seq); + log_debug(ls, "dlm_recover %llx", (unsigned long long)rv->seq); mutex_lock(&ls->ls_recoverd_active); @@ -76,14 +76,22 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) /* * Add or remove nodes from the lockspace's ls_nodes list. - * Also waits for all nodes to complete dlm_recover_members. */ error = dlm_recover_members(ls, rv, &neg); if (error) { - log_debug(ls, "recover_members failed %d", error); + log_debug(ls, "dlm_recover_members error %d", error); goto fail; } + + dlm_set_recover_status(ls, DLM_RS_NODES); + + error = dlm_recover_members_wait(ls); + if (error) { + log_debug(ls, "dlm_recover_members_wait error %d", error); + goto fail; + } + start = jiffies; /* @@ -93,17 +101,15 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) error = dlm_recover_directory(ls); if (error) { - log_debug(ls, "recover_directory failed %d", error); + log_debug(ls, "dlm_recover_directory error %d", error); goto fail; } - /* - * Wait for all nodes to complete directory rebuild. 
- */ + dlm_set_recover_status(ls, DLM_RS_DIR); error = dlm_recover_directory_wait(ls); if (error) { - log_debug(ls, "recover_directory_wait failed %d", error); + log_debug(ls, "dlm_recover_directory_wait error %d", error); goto fail; } @@ -133,7 +139,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) error = dlm_recover_masters(ls); if (error) { - log_debug(ls, "recover_masters failed %d", error); + log_debug(ls, "dlm_recover_masters error %d", error); goto fail; } @@ -143,13 +149,15 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) error = dlm_recover_locks(ls); if (error) { - log_debug(ls, "recover_locks failed %d", error); + log_debug(ls, "dlm_recover_locks error %d", error); goto fail; } + dlm_set_recover_status(ls, DLM_RS_LOCKS); + error = dlm_recover_locks_wait(ls); if (error) { - log_debug(ls, "recover_locks_wait failed %d", error); + log_debug(ls, "dlm_recover_locks_wait error %d", error); goto fail; } @@ -170,7 +178,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) error = dlm_recover_locks_wait(ls); if (error) { - log_debug(ls, "recover_locks_wait failed %d", error); + log_debug(ls, "dlm_recover_locks_wait error %d", error); goto fail; } } @@ -186,9 +194,10 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) dlm_purge_requestqueue(ls); dlm_set_recover_status(ls, DLM_RS_DONE); + error = dlm_recover_done_wait(ls); if (error) { - log_debug(ls, "recover_done_wait failed %d", error); + log_debug(ls, "dlm_recover_done_wait error %d", error); goto fail; } @@ -200,25 +209,25 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) error = enable_locking(ls, rv->seq); if (error) { - log_debug(ls, "enable_locking failed %d", error); + log_debug(ls, "enable_locking error %d", error); goto fail; } error = dlm_process_requestqueue(ls); if (error) { - log_debug(ls, "process_requestqueue failed %d", error); + log_debug(ls, "dlm_process_requestqueue error %d", error); goto fail; } error = dlm_recover_waiters_post(ls); if (error) { - log_debug(ls, "recover_waiters_post failed %d", error); + log_debug(ls, "dlm_recover_waiters_post error %d", error); goto fail; } dlm_grant_after_purge(ls); - log_debug(ls, "recover %llx done: %u ms", + log_debug(ls, "dlm_recover %llx done: %u ms", (unsigned long long)rv->seq, jiffies_to_msecs(jiffies - start)); mutex_unlock(&ls->ls_recoverd_active); @@ -227,7 +236,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) fail: dlm_release_root_list(ls); - log_debug(ls, "recover %llx error %d", + log_debug(ls, "dlm_recover %llx error %d", (unsigned long long)rv->seq, error); mutex_unlock(&ls->ls_recoverd_active); return error; -- cgit v1.2.3 From 757a42719635495779462514458bbfbf12a37dac Mon Sep 17 00:00:00 2001 From: David Teigland Date: Thu, 20 Oct 2011 13:26:28 -0500 Subject: dlm: add node slots and generation Slot numbers are assigned to nodes when they join the lockspace. The slot number chosen is the minimum unused value starting at 1. Once a node is assigned a slot, that slot number will not change while the node remains a lockspace member. If the node leaves and rejoins it can be assigned a new slot number. A new generation number is also added to a lockspace. It is set and incremented during each recovery along with the slot collection/assignment. The slot numbers will be passed to gfs2 which will use them as journal id's. 
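The assignment rule itself is just "lowest unused slot, starting at 1". A minimal stand-alone sketch of that rule (illustration with hypothetical names; the in-kernel dlm_slots_assign() in the diff below also tracks generations, previously held slots and the sparse ls_slots array):

#include <stdio.h>

/* used[i] != 0 means slot i + 1 is already held by a lockspace member */
static int pick_lowest_free_slot(const int *used, int nslots)
{
	int i;

	for (i = 0; i < nslots; i++)
		if (!used[i])
			return i + 1;
	return nslots + 1;	/* all tracked slots taken; hand out a new one */
}

int main(void)
{
	int used[4] = { 1, 0, 1, 0 };	/* slots 1 and 3 taken */

	printf("next slot: %d\n", pick_lowest_free_slot(used, 4));	/* prints 2 */
	return 0;
}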
Signed-off-by: David Teigland --- fs/dlm/dlm_internal.h | 48 ++++++++- fs/dlm/lockspace.c | 5 + fs/dlm/member.c | 284 +++++++++++++++++++++++++++++++++++++++++++++++++- fs/dlm/member.h | 7 ++ fs/dlm/rcom.c | 99 +++++++++++++++--- fs/dlm/rcom.h | 2 +- fs/dlm/recover.c | 64 ++++++++++-- 7 files changed, 480 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index 5685a9a5dba..f4d132c7690 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -117,6 +117,18 @@ struct dlm_member { struct list_head list; int nodeid; int weight; + int slot; + int slot_prev; + uint32_t generation; +}; + +/* + * low nodeid saves array of these in ls_slots + */ + +struct dlm_slot { + int nodeid; + int slot; }; /* @@ -337,7 +349,9 @@ static inline int rsb_flag(struct dlm_rsb *r, enum rsb_flags flag) /* dlm_header is first element of all structs sent between nodes */ #define DLM_HEADER_MAJOR 0x00030000 -#define DLM_HEADER_MINOR 0x00000000 +#define DLM_HEADER_MINOR 0x00000001 + +#define DLM_HEADER_SLOTS 0x00000001 #define DLM_MSG 1 #define DLM_RCOM 2 @@ -425,10 +439,34 @@ union dlm_packet { struct dlm_rcom rcom; }; +#define DLM_RSF_NEED_SLOTS 0x00000001 + +/* RCOM_STATUS data */ +struct rcom_status { + __le32 rs_flags; + __le32 rs_unused1; + __le64 rs_unused2; +}; + +/* RCOM_STATUS_REPLY data */ struct rcom_config { __le32 rf_lvblen; __le32 rf_lsflags; - __le64 rf_unused; + + /* DLM_HEADER_SLOTS adds: */ + __le32 rf_flags; + __le16 rf_our_slot; + __le16 rf_num_slots; + __le32 rf_generation; + __le32 rf_unused1; + __le64 rf_unused2; +}; + +struct rcom_slot { + __le32 ro_nodeid; + __le16 ro_slot; + __le16 ro_unused1; + __le64 ro_unused2; }; struct rcom_lock { @@ -455,6 +493,7 @@ struct dlm_ls { struct list_head ls_list; /* list of lockspaces */ dlm_lockspace_t *ls_local_handle; uint32_t ls_global_id; /* global unique lockspace ID */ + uint32_t ls_generation; uint32_t ls_exflags; int ls_lvblen; int ls_count; /* refcount of processes in @@ -493,6 +532,11 @@ struct dlm_ls { int ls_total_weight; int *ls_node_array; + int ls_slot; + int ls_num_slots; + int ls_slots_size; + struct dlm_slot *ls_slots; + struct dlm_rsb ls_stub_rsb; /* for returning errors */ struct dlm_lkb ls_stub_lkb; /* for returning errors */ struct dlm_message ls_stub_ms; /* for faking a reply */ diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 1d16a23b0a0..1441f04bfab 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -525,6 +525,11 @@ static int new_lockspace(const char *name, int namelen, void **lockspace, if (!ls->ls_recover_buf) goto out_dirfree; + ls->ls_slot = 0; + ls->ls_num_slots = 0; + ls->ls_slots_size = 0; + ls->ls_slots = NULL; + INIT_LIST_HEAD(&ls->ls_recover_list); spin_lock_init(&ls->ls_recover_list_lock); ls->ls_recover_list_count = 0; diff --git a/fs/dlm/member.c b/fs/dlm/member.c index 5ebd1df6967..eebc52aae82 100644 --- a/fs/dlm/member.c +++ b/fs/dlm/member.c @@ -19,6 +19,280 @@ #include "config.h" #include "lowcomms.h" +int dlm_slots_version(struct dlm_header *h) +{ + if ((h->h_version & 0x0000FFFF) < DLM_HEADER_SLOTS) + return 0; + return 1; +} + +void dlm_slot_save(struct dlm_ls *ls, struct dlm_rcom *rc, + struct dlm_member *memb) +{ + struct rcom_config *rf = (struct rcom_config *)rc->rc_buf; + + if (!dlm_slots_version(&rc->rc_header)) + return; + + memb->slot = le16_to_cpu(rf->rf_our_slot); + memb->generation = le32_to_cpu(rf->rf_generation); +} + +void dlm_slots_copy_out(struct dlm_ls *ls, struct dlm_rcom *rc) +{ + struct dlm_slot *slot; + struct 
rcom_slot *ro; + int i; + + ro = (struct rcom_slot *)(rc->rc_buf + sizeof(struct rcom_config)); + + /* ls_slots array is sparse, but not rcom_slots */ + + for (i = 0; i < ls->ls_slots_size; i++) { + slot = &ls->ls_slots[i]; + if (!slot->nodeid) + continue; + ro->ro_nodeid = cpu_to_le32(slot->nodeid); + ro->ro_slot = cpu_to_le16(slot->slot); + ro++; + } +} + +#define SLOT_DEBUG_LINE 128 + +static void log_debug_slots(struct dlm_ls *ls, uint32_t gen, int num_slots, + struct rcom_slot *ro0, struct dlm_slot *array, + int array_size) +{ + char line[SLOT_DEBUG_LINE]; + int len = SLOT_DEBUG_LINE - 1; + int pos = 0; + int ret, i; + + if (!dlm_config.ci_log_debug) + return; + + memset(line, 0, sizeof(line)); + + if (array) { + for (i = 0; i < array_size; i++) { + if (!array[i].nodeid) + continue; + + ret = snprintf(line + pos, len - pos, " %d:%d", + array[i].slot, array[i].nodeid); + if (ret >= len - pos) + break; + pos += ret; + } + } else if (ro0) { + for (i = 0; i < num_slots; i++) { + ret = snprintf(line + pos, len - pos, " %d:%d", + ro0[i].ro_slot, ro0[i].ro_nodeid); + if (ret >= len - pos) + break; + pos += ret; + } + } + + log_debug(ls, "generation %u slots %d%s", gen, num_slots, line); +} + +int dlm_slots_copy_in(struct dlm_ls *ls) +{ + struct dlm_member *memb; + struct dlm_rcom *rc = ls->ls_recover_buf; + struct rcom_config *rf = (struct rcom_config *)rc->rc_buf; + struct rcom_slot *ro0, *ro; + int our_nodeid = dlm_our_nodeid(); + int i, num_slots; + uint32_t gen; + + if (!dlm_slots_version(&rc->rc_header)) + return -1; + + gen = le32_to_cpu(rf->rf_generation); + if (gen <= ls->ls_generation) { + log_error(ls, "dlm_slots_copy_in gen %u old %u", + gen, ls->ls_generation); + } + ls->ls_generation = gen; + + num_slots = le16_to_cpu(rf->rf_num_slots); + if (!num_slots) + return -1; + + ro0 = (struct rcom_slot *)(rc->rc_buf + sizeof(struct rcom_config)); + + for (i = 0, ro = ro0; i < num_slots; i++, ro++) { + ro->ro_nodeid = le32_to_cpu(ro->ro_nodeid); + ro->ro_slot = le16_to_cpu(ro->ro_slot); + } + + log_debug_slots(ls, gen, num_slots, ro0, NULL, 0); + + list_for_each_entry(memb, &ls->ls_nodes, list) { + for (i = 0, ro = ro0; i < num_slots; i++, ro++) { + if (ro->ro_nodeid != memb->nodeid) + continue; + memb->slot = ro->ro_slot; + memb->slot_prev = memb->slot; + break; + } + + if (memb->nodeid == our_nodeid) { + if (ls->ls_slot && ls->ls_slot != memb->slot) { + log_error(ls, "dlm_slots_copy_in our slot " + "changed %d %d", ls->ls_slot, + memb->slot); + return -1; + } + + if (!ls->ls_slot) + ls->ls_slot = memb->slot; + } + + if (!memb->slot) { + log_error(ls, "dlm_slots_copy_in nodeid %d no slot", + memb->nodeid); + return -1; + } + } + + return 0; +} + +/* for any nodes that do not support slots, we will not have set memb->slot + in wait_status_all(), so memb->slot will remain -1, and we will not + assign slots or set ls_num_slots here */ + +int dlm_slots_assign(struct dlm_ls *ls, int *num_slots, int *slots_size, + struct dlm_slot **slots_out, uint32_t *gen_out) +{ + struct dlm_member *memb; + struct dlm_slot *array; + int our_nodeid = dlm_our_nodeid(); + int array_size, max_slots, i; + int need = 0; + int max = 0; + int num = 0; + uint32_t gen = 0; + + /* our own memb struct will have slot -1 gen 0 */ + + list_for_each_entry(memb, &ls->ls_nodes, list) { + if (memb->nodeid == our_nodeid) { + memb->slot = ls->ls_slot; + memb->generation = ls->ls_generation; + break; + } + } + + list_for_each_entry(memb, &ls->ls_nodes, list) { + if (memb->generation > gen) + gen = memb->generation; + + /* node 
doesn't support slots */ + + if (memb->slot == -1) + return -1; + + /* node needs a slot assigned */ + + if (!memb->slot) + need++; + + /* node has a slot assigned */ + + num++; + + if (!max || max < memb->slot) + max = memb->slot; + + /* sanity check, once slot is assigned it shouldn't change */ + + if (memb->slot_prev && memb->slot && memb->slot_prev != memb->slot) { + log_error(ls, "nodeid %d slot changed %d %d", + memb->nodeid, memb->slot_prev, memb->slot); + return -1; + } + memb->slot_prev = memb->slot; + } + + array_size = max + need; + + array = kzalloc(array_size * sizeof(struct dlm_slot), GFP_NOFS); + if (!array) + return -ENOMEM; + + num = 0; + + /* fill in slots (offsets) that are used */ + + list_for_each_entry(memb, &ls->ls_nodes, list) { + if (!memb->slot) + continue; + + if (memb->slot > array_size) { + log_error(ls, "invalid slot number %d", memb->slot); + kfree(array); + return -1; + } + + array[memb->slot - 1].nodeid = memb->nodeid; + array[memb->slot - 1].slot = memb->slot; + num++; + } + + /* assign new slots from unused offsets */ + + list_for_each_entry(memb, &ls->ls_nodes, list) { + if (memb->slot) + continue; + + for (i = 0; i < array_size; i++) { + if (array[i].nodeid) + continue; + + memb->slot = i + 1; + memb->slot_prev = memb->slot; + array[i].nodeid = memb->nodeid; + array[i].slot = memb->slot; + num++; + + if (!ls->ls_slot && memb->nodeid == our_nodeid) + ls->ls_slot = memb->slot; + break; + } + + if (!memb->slot) { + log_error(ls, "no free slot found"); + kfree(array); + return -1; + } + } + + gen++; + + log_debug_slots(ls, gen, num, NULL, array, array_size); + + max_slots = (dlm_config.ci_buffer_size - sizeof(struct dlm_rcom) - + sizeof(struct rcom_config)) / sizeof(struct rcom_slot); + + if (num > max_slots) { + log_error(ls, "num_slots %d exceeds max_slots %d", + num, max_slots); + kfree(array); + return -1; + } + + *gen_out = gen; + *slots_out = array; + *slots_size = array_size; + *num_slots = num; + return 0; +} + static void add_ordered_member(struct dlm_ls *ls, struct dlm_member *new) { struct dlm_member *memb = NULL; @@ -176,7 +450,7 @@ static int ping_members(struct dlm_ls *ls) error = dlm_recovery_stopped(ls); if (error) break; - error = dlm_rcom_status(ls, memb->nodeid); + error = dlm_rcom_status(ls, memb->nodeid, 0); if (error) break; } @@ -322,7 +596,15 @@ int dlm_ls_stop(struct dlm_ls *ls) */ dlm_recoverd_suspend(ls); + + spin_lock(&ls->ls_recover_lock); + kfree(ls->ls_slots); + ls->ls_slots = NULL; + ls->ls_num_slots = 0; + ls->ls_slots_size = 0; ls->ls_recover_status = 0; + spin_unlock(&ls->ls_recover_lock); + dlm_recoverd_resume(ls); if (!ls->ls_recover_begin) diff --git a/fs/dlm/member.h b/fs/dlm/member.h index 7a26fca1e0b..7e87e8a79df 100644 --- a/fs/dlm/member.h +++ b/fs/dlm/member.h @@ -20,6 +20,13 @@ void dlm_clear_members_gone(struct dlm_ls *ls); int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv,int *neg_out); int dlm_is_removed(struct dlm_ls *ls, int nodeid); int dlm_is_member(struct dlm_ls *ls, int nodeid); +int dlm_slots_version(struct dlm_header *h); +void dlm_slot_save(struct dlm_ls *ls, struct dlm_rcom *rc, + struct dlm_member *memb); +void dlm_slots_copy_out(struct dlm_ls *ls, struct dlm_rcom *rc); +int dlm_slots_copy_in(struct dlm_ls *ls); +int dlm_slots_assign(struct dlm_ls *ls, int *num_slots, int *slots_size, + struct dlm_slot **slots_out, uint32_t *gen_out); #endif /* __MEMBER_DOT_H__ */ diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c index f10a50f24e8..ac5c616c969 100644 --- a/fs/dlm/rcom.c +++ b/fs/dlm/rcom.c 
@@ -23,6 +23,7 @@ #include "memory.h" #include "lock.h" #include "util.h" +#include "member.h" static int rcom_response(struct dlm_ls *ls) @@ -72,20 +73,30 @@ static void send_rcom(struct dlm_ls *ls, struct dlm_mhandle *mh, dlm_lowcomms_commit_buffer(mh); } +static void set_rcom_status(struct dlm_ls *ls, struct rcom_status *rs, + uint32_t flags) +{ + rs->rs_flags = cpu_to_le32(flags); +} + /* When replying to a status request, a node also sends back its configuration values. The requesting node then checks that the remote node is configured the same way as itself. */ -static void make_config(struct dlm_ls *ls, struct rcom_config *rf) +static void set_rcom_config(struct dlm_ls *ls, struct rcom_config *rf, + uint32_t num_slots) { rf->rf_lvblen = cpu_to_le32(ls->ls_lvblen); rf->rf_lsflags = cpu_to_le32(ls->ls_exflags); + + rf->rf_our_slot = cpu_to_le16(ls->ls_slot); + rf->rf_num_slots = cpu_to_le16(num_slots); + rf->rf_generation = cpu_to_le32(ls->ls_generation); } -static int check_config(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid) +static int check_rcom_config(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid) { struct rcom_config *rf = (struct rcom_config *) rc->rc_buf; - size_t conf_size = sizeof(struct dlm_rcom) + sizeof(struct rcom_config); if ((rc->rc_header.h_version & 0xFFFF0000) != DLM_HEADER_MAJOR) { log_error(ls, "version mismatch: %x nodeid %d: %x", @@ -94,12 +105,6 @@ static int check_config(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid) return -EPROTO; } - if (rc->rc_header.h_length < conf_size) { - log_error(ls, "config too short: %d nodeid %d", - rc->rc_header.h_length, nodeid); - return -EPROTO; - } - if (le32_to_cpu(rf->rf_lvblen) != ls->ls_lvblen || le32_to_cpu(rf->rf_lsflags) != ls->ls_exflags) { log_error(ls, "config mismatch: %d,%x nodeid %d: %d,%x", @@ -127,7 +132,18 @@ static void disallow_sync_reply(struct dlm_ls *ls) spin_unlock(&ls->ls_rcom_spin); } -int dlm_rcom_status(struct dlm_ls *ls, int nodeid) +/* + * low nodeid gathers one slot value at a time from each node. + * it sets need_slots=0, and saves rf_our_slot returned from each + * rcom_config. + * + * other nodes gather all slot values at once from the low nodeid. + * they set need_slots=1, and ignore the rf_our_slot returned from each + * rcom_config. they use the rf_num_slots returned from the low + * node's rcom_config. 
+ */ + +int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags) { struct dlm_rcom *rc; struct dlm_mhandle *mh; @@ -141,10 +157,13 @@ int dlm_rcom_status(struct dlm_ls *ls, int nodeid) goto out; } - error = create_rcom(ls, nodeid, DLM_RCOM_STATUS, 0, &rc, &mh); + error = create_rcom(ls, nodeid, DLM_RCOM_STATUS, + sizeof(struct rcom_status), &rc, &mh); if (error) goto out; + set_rcom_status(ls, (struct rcom_status *)rc->rc_buf, status_flags); + allow_sync_reply(ls, &rc->rc_id); memset(ls->ls_recover_buf, 0, dlm_config.ci_buffer_size); @@ -161,8 +180,11 @@ int dlm_rcom_status(struct dlm_ls *ls, int nodeid) /* we pretend the remote lockspace exists with 0 status */ log_debug(ls, "remote node %d not ready", nodeid); rc->rc_result = 0; - } else - error = check_config(ls, rc, nodeid); + error = 0; + } else { + error = check_rcom_config(ls, rc, nodeid); + } + /* the caller looks at rc_result for the remote recovery status */ out: return error; @@ -172,17 +194,60 @@ static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in) { struct dlm_rcom *rc; struct dlm_mhandle *mh; - int error, nodeid = rc_in->rc_header.h_nodeid; + struct rcom_status *rs; + uint32_t status; + int nodeid = rc_in->rc_header.h_nodeid; + int len = sizeof(struct rcom_config); + int num_slots = 0; + int error; + + if (!dlm_slots_version(&rc_in->rc_header)) { + status = dlm_recover_status(ls); + goto do_create; + } + + rs = (struct rcom_status *)rc_in->rc_buf; + if (!(rs->rs_flags & DLM_RSF_NEED_SLOTS)) { + status = dlm_recover_status(ls); + goto do_create; + } + + spin_lock(&ls->ls_recover_lock); + status = ls->ls_recover_status; + num_slots = ls->ls_num_slots; + spin_unlock(&ls->ls_recover_lock); + len += num_slots * sizeof(struct rcom_slot); + + do_create: error = create_rcom(ls, nodeid, DLM_RCOM_STATUS_REPLY, - sizeof(struct rcom_config), &rc, &mh); + len, &rc, &mh); if (error) return; + rc->rc_id = rc_in->rc_id; rc->rc_seq_reply = rc_in->rc_seq; - rc->rc_result = dlm_recover_status(ls); - make_config(ls, (struct rcom_config *) rc->rc_buf); + rc->rc_result = status; + + set_rcom_config(ls, (struct rcom_config *)rc->rc_buf, num_slots); + + if (!num_slots) + goto do_send; + + spin_lock(&ls->ls_recover_lock); + if (ls->ls_num_slots != num_slots) { + spin_unlock(&ls->ls_recover_lock); + log_debug(ls, "receive_rcom_status num_slots %d to %d", + num_slots, ls->ls_num_slots); + rc->rc_result = 0; + set_rcom_config(ls, (struct rcom_config *)rc->rc_buf, 0); + goto do_send; + } + + dlm_slots_copy_out(ls, rc); + spin_unlock(&ls->ls_recover_lock); + do_send: send_rcom(ls, mh, rc); } diff --git a/fs/dlm/rcom.h b/fs/dlm/rcom.h index b09abd29ba3..206723ab744 100644 --- a/fs/dlm/rcom.h +++ b/fs/dlm/rcom.h @@ -14,7 +14,7 @@ #ifndef __RCOM_DOT_H__ #define __RCOM_DOT_H__ -int dlm_rcom_status(struct dlm_ls *ls, int nodeid); +int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags); int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name,int last_len); int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid); int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb); diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c index 81b23930449..34d5adf1fce 100644 --- a/fs/dlm/recover.c +++ b/fs/dlm/recover.c @@ -85,14 +85,20 @@ uint32_t dlm_recover_status(struct dlm_ls *ls) return status; } +static void _set_recover_status(struct dlm_ls *ls, uint32_t status) +{ + ls->ls_recover_status |= status; +} + void dlm_set_recover_status(struct dlm_ls *ls, uint32_t status) { 
spin_lock(&ls->ls_recover_lock); - ls->ls_recover_status |= status; + _set_recover_status(ls, status); spin_unlock(&ls->ls_recover_lock); } -static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status) +static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status, + int save_slots) { struct dlm_rcom *rc = ls->ls_recover_buf; struct dlm_member *memb; @@ -106,10 +112,13 @@ static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status) goto out; } - error = dlm_rcom_status(ls, memb->nodeid); + error = dlm_rcom_status(ls, memb->nodeid, 0); if (error) goto out; + if (save_slots) + dlm_slot_save(ls, rc, memb); + if (rc->rc_result & wait_status) break; if (delay < 1000) @@ -121,7 +130,8 @@ static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status) return error; } -static int wait_status_low(struct dlm_ls *ls, uint32_t wait_status) +static int wait_status_low(struct dlm_ls *ls, uint32_t wait_status, + uint32_t status_flags) { struct dlm_rcom *rc = ls->ls_recover_buf; int error = 0, delay = 0, nodeid = ls->ls_low_nodeid; @@ -132,7 +142,7 @@ static int wait_status_low(struct dlm_ls *ls, uint32_t wait_status) goto out; } - error = dlm_rcom_status(ls, nodeid); + error = dlm_rcom_status(ls, nodeid, status_flags); if (error) break; @@ -152,18 +162,56 @@ static int wait_status(struct dlm_ls *ls, uint32_t status) int error; if (ls->ls_low_nodeid == dlm_our_nodeid()) { - error = wait_status_all(ls, status); + error = wait_status_all(ls, status, 0); if (!error) dlm_set_recover_status(ls, status_all); } else - error = wait_status_low(ls, status_all); + error = wait_status_low(ls, status_all, 0); return error; } int dlm_recover_members_wait(struct dlm_ls *ls) { - return wait_status(ls, DLM_RS_NODES); + struct dlm_member *memb; + struct dlm_slot *slots; + int num_slots, slots_size; + int error, rv; + uint32_t gen; + + list_for_each_entry(memb, &ls->ls_nodes, list) { + memb->slot = -1; + memb->generation = 0; + } + + if (ls->ls_low_nodeid == dlm_our_nodeid()) { + error = wait_status_all(ls, DLM_RS_NODES, 1); + if (error) + goto out; + + /* slots array is sparse, slots_size may be > num_slots */ + + rv = dlm_slots_assign(ls, &num_slots, &slots_size, &slots, &gen); + if (!rv) { + spin_lock(&ls->ls_recover_lock); + _set_recover_status(ls, DLM_RS_NODES_ALL); + ls->ls_num_slots = num_slots; + ls->ls_slots_size = slots_size; + ls->ls_slots = slots; + ls->ls_generation = gen; + spin_unlock(&ls->ls_recover_lock); + } else { + dlm_set_recover_status(ls, DLM_RS_NODES_ALL); + } + } else { + error = wait_status_low(ls, DLM_RS_NODES_ALL, DLM_RSF_NEED_SLOTS); + if (error) + goto out; + + dlm_slots_copy_in(ls); + } + out: + return error; } int dlm_recover_directory_wait(struct dlm_ls *ls) -- cgit v1.2.3 From 60f98d1839376d30e13f3e452dce2433fad3060e Mon Sep 17 00:00:00 2001 From: David Teigland Date: Wed, 2 Nov 2011 14:30:58 -0500 Subject: dlm: add recovery callbacks These new callbacks notify the dlm user about lock recovery. GFS2, and possibly others, need to be aware of when the dlm will be doing lock recovery for a failed lockspace member. In the past, this coordination has been done between dlm and file system daemons in userspace, which then direct their kernel counterparts. These callbacks allow the same coordination directly, and more simply. 
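A hedged sketch of how a kernel-side lockspace user might wire up the new callbacks through the extended dlm_new_lockspace() added below (the callback names and bodies here are hypothetical placeholders, not code taken from gfs2):

static void example_recover_prep(void *arg)
{
	/* recovery is starting: stop trusting cached lock state */
}

static void example_recover_slot(void *arg, struct dlm_slot *slot)
{
	/* slot->nodeid failed; e.g. queue journal slot->slot for replay */
}

static void example_recover_done(void *arg, struct dlm_slot *slots,
				 int num_slots, int our_slot,
				 uint32_t generation)
{
	/* recovery for this generation finished; resume normal operation */
}

static const struct dlm_lockspace_ops example_ops = {
	.recover_prep = example_recover_prep,
	.recover_slot = example_recover_slot,
	.recover_done = example_recover_done,
};

static int example_join(void *arg, dlm_lockspace_t **ls)
{
	int ops_result;

	/* lvblen must be a multiple of 8; ops_result reports callback support */
	return dlm_new_lockspace("example", "mycluster", DLM_LSFL_FS, 64,
				 &example_ops, arg, &ops_result, ls);
}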
Signed-off-by: David Teigland --- fs/dlm/config.c | 130 ++++++++++++++++++--------------- fs/dlm/config.h | 17 ++++- fs/dlm/dlm_internal.h | 21 ++---- fs/dlm/lockspace.c | 43 +++++++++-- fs/dlm/member.c | 197 ++++++++++++++++++++++++++++++++------------------ fs/dlm/member.h | 3 +- fs/dlm/recoverd.c | 10 +-- fs/dlm/user.c | 5 +- fs/gfs2/lock_dlm.c | 4 +- fs/ocfs2/stack_user.c | 4 +- include/linux/dlm.h | 71 ++++++++++++++++-- 11 files changed, 333 insertions(+), 172 deletions(-) (limited to 'fs') diff --git a/fs/dlm/config.c b/fs/dlm/config.c index 6cf72fcc0d0..e7e327d43fa 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -2,7 +2,7 @@ ******************************************************************************* ** ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. -** Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. +** Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved. ** ** This copyrighted material is made available to anyone wishing to use, ** modify, copy, or redistribute it subject to the terms and conditions @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -36,6 +37,7 @@ static struct config_group *space_list; static struct config_group *comm_list; static struct dlm_comm *local_comm; +static uint32_t dlm_comm_count; struct dlm_clusters; struct dlm_cluster; @@ -103,6 +105,8 @@ struct dlm_cluster { unsigned int cl_timewarn_cs; unsigned int cl_waitwarn_us; unsigned int cl_new_rsb_count; + unsigned int cl_recover_callbacks; + char cl_cluster_name[DLM_LOCKSPACE_LEN]; }; enum { @@ -118,6 +122,8 @@ enum { CLUSTER_ATTR_TIMEWARN_CS, CLUSTER_ATTR_WAITWARN_US, CLUSTER_ATTR_NEW_RSB_COUNT, + CLUSTER_ATTR_RECOVER_CALLBACKS, + CLUSTER_ATTR_CLUSTER_NAME, }; struct cluster_attribute { @@ -126,6 +132,27 @@ struct cluster_attribute { ssize_t (*store)(struct dlm_cluster *, const char *, size_t); }; +static ssize_t cluster_cluster_name_read(struct dlm_cluster *cl, char *buf) +{ + return sprintf(buf, "%s\n", cl->cl_cluster_name); +} + +static ssize_t cluster_cluster_name_write(struct dlm_cluster *cl, + const char *buf, size_t len) +{ + strncpy(dlm_config.ci_cluster_name, buf, DLM_LOCKSPACE_LEN); + strncpy(cl->cl_cluster_name, buf, DLM_LOCKSPACE_LEN); + return len; +} + +static struct cluster_attribute cluster_attr_cluster_name = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "cluster_name", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = cluster_cluster_name_read, + .store = cluster_cluster_name_write, +}; + static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field, int *info_field, int check_zero, const char *buf, size_t len) @@ -171,6 +198,7 @@ CLUSTER_ATTR(protocol, 0); CLUSTER_ATTR(timewarn_cs, 1); CLUSTER_ATTR(waitwarn_us, 0); CLUSTER_ATTR(new_rsb_count, 0); +CLUSTER_ATTR(recover_callbacks, 0); static struct configfs_attribute *cluster_attrs[] = { [CLUSTER_ATTR_TCP_PORT] = &cluster_attr_tcp_port.attr, @@ -185,6 +213,8 @@ static struct configfs_attribute *cluster_attrs[] = { [CLUSTER_ATTR_TIMEWARN_CS] = &cluster_attr_timewarn_cs.attr, [CLUSTER_ATTR_WAITWARN_US] = &cluster_attr_waitwarn_us.attr, [CLUSTER_ATTR_NEW_RSB_COUNT] = &cluster_attr_new_rsb_count.attr, + [CLUSTER_ATTR_RECOVER_CALLBACKS] = &cluster_attr_recover_callbacks.attr, + [CLUSTER_ATTR_CLUSTER_NAME] = &cluster_attr_cluster_name.attr, NULL, }; @@ -293,6 +323,7 @@ struct dlm_comms { struct dlm_comm { struct config_item item; + int seq; int nodeid; int local; int addr_count; @@ -309,6 +340,7 @@ struct dlm_node { int nodeid; int weight; int new; + int 
comm_seq; /* copy of cm->seq when nd->nodeid is set */ }; static struct configfs_group_operations clusters_ops = { @@ -455,6 +487,9 @@ static struct config_group *make_cluster(struct config_group *g, cl->cl_timewarn_cs = dlm_config.ci_timewarn_cs; cl->cl_waitwarn_us = dlm_config.ci_waitwarn_us; cl->cl_new_rsb_count = dlm_config.ci_new_rsb_count; + cl->cl_recover_callbacks = dlm_config.ci_recover_callbacks; + memcpy(cl->cl_cluster_name, dlm_config.ci_cluster_name, + DLM_LOCKSPACE_LEN); space_list = &sps->ss_group; comm_list = &cms->cs_group; @@ -558,6 +593,11 @@ static struct config_item *make_comm(struct config_group *g, const char *name) return ERR_PTR(-ENOMEM); config_item_init_type_name(&cm->item, name, &comm_type); + + cm->seq = dlm_comm_count++; + if (!cm->seq) + cm->seq = dlm_comm_count++; + cm->nodeid = -1; cm->local = 0; cm->addr_count = 0; @@ -801,7 +841,10 @@ static ssize_t node_nodeid_read(struct dlm_node *nd, char *buf) static ssize_t node_nodeid_write(struct dlm_node *nd, const char *buf, size_t len) { + uint32_t seq = 0; nd->nodeid = simple_strtol(buf, NULL, 0); + dlm_comm_seq(nd->nodeid, &seq); + nd->comm_seq = seq; return len; } @@ -908,13 +951,13 @@ static void put_comm(struct dlm_comm *cm) } /* caller must free mem */ -int dlm_nodeid_list(char *lsname, int **ids_out, int *ids_count_out, - int **new_out, int *new_count_out) +int dlm_config_nodes(char *lsname, struct dlm_config_node **nodes_out, + int *count_out) { struct dlm_space *sp; struct dlm_node *nd; - int i = 0, rv = 0, ids_count = 0, new_count = 0; - int *ids, *new; + struct dlm_config_node *nodes, *node; + int rv, count; sp = get_space(lsname); if (!sp) @@ -927,73 +970,42 @@ int dlm_nodeid_list(char *lsname, int **ids_out, int *ids_count_out, goto out; } - ids_count = sp->members_count; + count = sp->members_count; - ids = kcalloc(ids_count, sizeof(int), GFP_NOFS); - if (!ids) { + nodes = kcalloc(count, sizeof(struct dlm_config_node), GFP_NOFS); + if (!nodes) { rv = -ENOMEM; goto out; } + node = nodes; list_for_each_entry(nd, &sp->members, list) { - ids[i++] = nd->nodeid; - if (nd->new) - new_count++; - } - - if (ids_count != i) - printk(KERN_ERR "dlm: bad nodeid count %d %d\n", ids_count, i); - - if (!new_count) - goto out_ids; + node->nodeid = nd->nodeid; + node->weight = nd->weight; + node->new = nd->new; + node->comm_seq = nd->comm_seq; + node++; - new = kcalloc(new_count, sizeof(int), GFP_NOFS); - if (!new) { - kfree(ids); - rv = -ENOMEM; - goto out; + nd->new = 0; } - i = 0; - list_for_each_entry(nd, &sp->members, list) { - if (nd->new) { - new[i++] = nd->nodeid; - nd->new = 0; - } - } - *new_count_out = new_count; - *new_out = new; - - out_ids: - *ids_count_out = ids_count; - *ids_out = ids; + *count_out = count; + *nodes_out = nodes; + rv = 0; out: mutex_unlock(&sp->members_lock); put_space(sp); return rv; } -int dlm_node_weight(char *lsname, int nodeid) +int dlm_comm_seq(int nodeid, uint32_t *seq) { - struct dlm_space *sp; - struct dlm_node *nd; - int w = -EEXIST; - - sp = get_space(lsname); - if (!sp) - goto out; - - mutex_lock(&sp->members_lock); - list_for_each_entry(nd, &sp->members, list) { - if (nd->nodeid != nodeid) - continue; - w = nd->weight; - break; - } - mutex_unlock(&sp->members_lock); - put_space(sp); - out: - return w; + struct dlm_comm *cm = get_comm(nodeid, NULL); + if (!cm) + return -EEXIST; + *seq = cm->seq; + put_comm(cm); + return 0; } int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr) @@ -1047,6 +1059,8 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num) 
#define DEFAULT_TIMEWARN_CS 500 /* 5 sec = 500 centiseconds */ #define DEFAULT_WAITWARN_US 0 #define DEFAULT_NEW_RSB_COUNT 128 +#define DEFAULT_RECOVER_CALLBACKS 0 +#define DEFAULT_CLUSTER_NAME "" struct dlm_config_info dlm_config = { .ci_tcp_port = DEFAULT_TCP_PORT, @@ -1060,6 +1074,8 @@ struct dlm_config_info dlm_config = { .ci_protocol = DEFAULT_PROTOCOL, .ci_timewarn_cs = DEFAULT_TIMEWARN_CS, .ci_waitwarn_us = DEFAULT_WAITWARN_US, - .ci_new_rsb_count = DEFAULT_NEW_RSB_COUNT + .ci_new_rsb_count = DEFAULT_NEW_RSB_COUNT, + .ci_recover_callbacks = DEFAULT_RECOVER_CALLBACKS, + .ci_cluster_name = DEFAULT_CLUSTER_NAME }; diff --git a/fs/dlm/config.h b/fs/dlm/config.h index 3099d0dd26c..9f5e3663bb0 100644 --- a/fs/dlm/config.h +++ b/fs/dlm/config.h @@ -2,7 +2,7 @@ ******************************************************************************* ** ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. -** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. +** Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved. ** ** This copyrighted material is made available to anyone wishing to use, ** modify, copy, or redistribute it subject to the terms and conditions @@ -14,6 +14,13 @@ #ifndef __CONFIG_DOT_H__ #define __CONFIG_DOT_H__ +struct dlm_config_node { + int nodeid; + int weight; + int new; + uint32_t comm_seq; +}; + #define DLM_MAX_ADDR_COUNT 3 struct dlm_config_info { @@ -29,15 +36,17 @@ struct dlm_config_info { int ci_timewarn_cs; int ci_waitwarn_us; int ci_new_rsb_count; + int ci_recover_callbacks; + char ci_cluster_name[DLM_LOCKSPACE_LEN]; }; extern struct dlm_config_info dlm_config; int dlm_config_init(void); void dlm_config_exit(void); -int dlm_node_weight(char *lsname, int nodeid); -int dlm_nodeid_list(char *lsname, int **ids_out, int *ids_count_out, - int **new_out, int *new_count_out); +int dlm_config_nodes(char *lsname, struct dlm_config_node **nodes_out, + int *count_out); +int dlm_comm_seq(int nodeid, uint32_t *seq); int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr); int dlm_addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid); int dlm_our_nodeid(void); diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index f4d132c7690..3a564d197e9 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -2,7 +2,7 @@ ******************************************************************************* ** ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. -** Copyright (C) 2004-2010 Red Hat, Inc. All rights reserved. +** Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved. ** ** This copyrighted material is made available to anyone wishing to use, ** modify, copy, or redistribute it subject to the terms and conditions @@ -119,28 +119,18 @@ struct dlm_member { int weight; int slot; int slot_prev; + int comm_seq; uint32_t generation; }; -/* - * low nodeid saves array of these in ls_slots - */ - -struct dlm_slot { - int nodeid; - int slot; -}; - /* * Save and manage recovery state for a lockspace. 
*/ struct dlm_recover { struct list_head list; - int *nodeids; /* nodeids of all members */ - int node_count; - int *new; /* nodeids of new members */ - int new_count; + struct dlm_config_node *nodes; + int nodes_count; uint64_t seq; }; @@ -584,6 +574,9 @@ struct dlm_ls { struct list_head ls_root_list; /* root resources */ struct rw_semaphore ls_root_sem; /* protect root_list */ + const struct dlm_lockspace_ops *ls_ops; + void *ls_ops_arg; + int ls_namelen; char ls_name[1]; }; diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 1441f04bfab..a1ea25face8 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -2,7 +2,7 @@ ******************************************************************************* ** ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. -** Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. +** Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved. ** ** This copyrighted material is made available to anyone wishing to use, ** modify, copy, or redistribute it subject to the terms and conditions @@ -386,12 +386,15 @@ static void threads_stop(void) dlm_lowcomms_stop(); } -static int new_lockspace(const char *name, int namelen, void **lockspace, - uint32_t flags, int lvblen) +static int new_lockspace(const char *name, const char *cluster, + uint32_t flags, int lvblen, + const struct dlm_lockspace_ops *ops, void *ops_arg, + int *ops_result, dlm_lockspace_t **lockspace) { struct dlm_ls *ls; int i, size, error; int do_unreg = 0; + int namelen = strlen(name); if (namelen > DLM_LOCKSPACE_LEN) return -EINVAL; @@ -403,8 +406,24 @@ static int new_lockspace(const char *name, int namelen, void **lockspace, return -EINVAL; if (!dlm_user_daemon_available()) { - module_put(THIS_MODULE); - return -EUNATCH; + log_print("dlm user daemon not available"); + error = -EUNATCH; + goto out; + } + + if (ops && ops_result) { + if (!dlm_config.ci_recover_callbacks) + *ops_result = -EOPNOTSUPP; + else + *ops_result = 0; + } + + if (dlm_config.ci_recover_callbacks && cluster && + strncmp(cluster, dlm_config.ci_cluster_name, DLM_LOCKSPACE_LEN)) { + log_print("dlm cluster name %s mismatch %s", + dlm_config.ci_cluster_name, cluster); + error = -EBADR; + goto out; } error = 0; @@ -442,6 +461,11 @@ static int new_lockspace(const char *name, int namelen, void **lockspace, ls->ls_flags = 0; ls->ls_scan_time = jiffies; + if (ops && dlm_config.ci_recover_callbacks) { + ls->ls_ops = ops; + ls->ls_ops_arg = ops_arg; + } + if (flags & DLM_LSFL_TIMEWARN) set_bit(LSFL_TIMEWARN, &ls->ls_flags); @@ -619,8 +643,10 @@ static int new_lockspace(const char *name, int namelen, void **lockspace, return error; } -int dlm_new_lockspace(const char *name, int namelen, void **lockspace, - uint32_t flags, int lvblen) +int dlm_new_lockspace(const char *name, const char *cluster, + uint32_t flags, int lvblen, + const struct dlm_lockspace_ops *ops, void *ops_arg, + int *ops_result, dlm_lockspace_t **lockspace) { int error = 0; @@ -630,7 +656,8 @@ int dlm_new_lockspace(const char *name, int namelen, void **lockspace, if (error) goto out; - error = new_lockspace(name, namelen, lockspace, flags, lvblen); + error = new_lockspace(name, cluster, flags, lvblen, ops, ops_arg, + ops_result, lockspace); if (!error) ls_count++; if (error > 0) diff --git a/fs/dlm/member.c b/fs/dlm/member.c index eebc52aae82..862640a36d5 100644 --- a/fs/dlm/member.c +++ b/fs/dlm/member.c @@ -1,7 +1,7 @@ /****************************************************************************** 
******************************************************************************* ** -** Copyright (C) 2005-2009 Red Hat, Inc. All rights reserved. +** Copyright (C) 2005-2011 Red Hat, Inc. All rights reserved. ** ** This copyrighted material is made available to anyone wishing to use, ** modify, copy, or redistribute it subject to the terms and conditions @@ -27,7 +27,7 @@ int dlm_slots_version(struct dlm_header *h) } void dlm_slot_save(struct dlm_ls *ls, struct dlm_rcom *rc, - struct dlm_member *memb) + struct dlm_member *memb) { struct rcom_config *rf = (struct rcom_config *)rc->rc_buf; @@ -317,59 +317,51 @@ static void add_ordered_member(struct dlm_ls *ls, struct dlm_member *new) } } -static int dlm_add_member(struct dlm_ls *ls, int nodeid) +static int dlm_add_member(struct dlm_ls *ls, struct dlm_config_node *node) { struct dlm_member *memb; - int w, error; + int error; memb = kzalloc(sizeof(struct dlm_member), GFP_NOFS); if (!memb) return -ENOMEM; - w = dlm_node_weight(ls->ls_name, nodeid); - if (w < 0) { - kfree(memb); - return w; - } - - error = dlm_lowcomms_connect_node(nodeid); + error = dlm_lowcomms_connect_node(node->nodeid); if (error < 0) { kfree(memb); return error; } - memb->nodeid = nodeid; - memb->weight = w; + memb->nodeid = node->nodeid; + memb->weight = node->weight; + memb->comm_seq = node->comm_seq; add_ordered_member(ls, memb); ls->ls_num_nodes++; return 0; } -static void dlm_remove_member(struct dlm_ls *ls, struct dlm_member *memb) -{ - list_move(&memb->list, &ls->ls_nodes_gone); - ls->ls_num_nodes--; -} - -int dlm_is_member(struct dlm_ls *ls, int nodeid) +static struct dlm_member *find_memb(struct list_head *head, int nodeid) { struct dlm_member *memb; - list_for_each_entry(memb, &ls->ls_nodes, list) { + list_for_each_entry(memb, head, list) { if (memb->nodeid == nodeid) - return 1; + return memb; } + return NULL; +} + +int dlm_is_member(struct dlm_ls *ls, int nodeid) +{ + if (find_memb(&ls->ls_nodes, nodeid)) + return 1; return 0; } int dlm_is_removed(struct dlm_ls *ls, int nodeid) { - struct dlm_member *memb; - - list_for_each_entry(memb, &ls->ls_nodes_gone, list) { - if (memb->nodeid == nodeid) - return 1; - } + if (find_memb(&ls->ls_nodes_gone, nodeid)) + return 1; return 0; } @@ -460,10 +452,88 @@ static int ping_members(struct dlm_ls *ls) return error; } +static void dlm_lsop_recover_prep(struct dlm_ls *ls) +{ + if (!ls->ls_ops || !ls->ls_ops->recover_prep) + return; + ls->ls_ops->recover_prep(ls->ls_ops_arg); +} + +static void dlm_lsop_recover_slot(struct dlm_ls *ls, struct dlm_member *memb) +{ + struct dlm_slot slot; + uint32_t seq; + int error; + + if (!ls->ls_ops || !ls->ls_ops->recover_slot) + return; + + /* if there is no comms connection with this node + or the present comms connection is newer + than the one when this member was added, then + we consider the node to have failed (versus + being removed due to dlm_release_lockspace) */ + + error = dlm_comm_seq(memb->nodeid, &seq); + + if (!error && seq == memb->comm_seq) + return; + + slot.nodeid = memb->nodeid; + slot.slot = memb->slot; + + ls->ls_ops->recover_slot(ls->ls_ops_arg, &slot); +} + +void dlm_lsop_recover_done(struct dlm_ls *ls) +{ + struct dlm_member *memb; + struct dlm_slot *slots; + int i, num; + + if (!ls->ls_ops || !ls->ls_ops->recover_done) + return; + + num = ls->ls_num_nodes; + + slots = kzalloc(num * sizeof(struct dlm_slot), GFP_KERNEL); + if (!slots) + return; + + i = 0; + list_for_each_entry(memb, &ls->ls_nodes, list) { + if (i == num) { + log_error(ls, "dlm_lsop_recover_done bad 
num %d", num); + goto out; + } + slots[i].nodeid = memb->nodeid; + slots[i].slot = memb->slot; + i++; + } + + ls->ls_ops->recover_done(ls->ls_ops_arg, slots, num, + ls->ls_slot, ls->ls_generation); + out: + kfree(slots); +} + +static struct dlm_config_node *find_config_node(struct dlm_recover *rv, + int nodeid) +{ + int i; + + for (i = 0; i < rv->nodes_count; i++) { + if (rv->nodes[i].nodeid == nodeid) + return &rv->nodes[i]; + } + return NULL; +} + int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) { struct dlm_member *memb, *safe; - int i, error, found, pos = 0, neg = 0, low = -1; + struct dlm_config_node *node; + int i, error, neg = 0, low = -1; /* previously removed members that we've not finished removing need to count as a negative change so the "neg" recovery steps will happen */ @@ -476,46 +546,32 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) /* move departed members from ls_nodes to ls_nodes_gone */ list_for_each_entry_safe(memb, safe, &ls->ls_nodes, list) { - found = 0; - for (i = 0; i < rv->node_count; i++) { - if (memb->nodeid == rv->nodeids[i]) { - found = 1; - break; - } - } + node = find_config_node(rv, memb->nodeid); + if (node && !node->new) + continue; - if (!found) { - neg++; - dlm_remove_member(ls, memb); + if (!node) { log_debug(ls, "remove member %d", memb->nodeid); + } else { + /* removed and re-added */ + log_debug(ls, "remove member %d comm_seq %u %u", + memb->nodeid, memb->comm_seq, node->comm_seq); } - } - - /* Add an entry to ls_nodes_gone for members that were removed and - then added again, so that previous state for these nodes will be - cleared during recovery. */ - for (i = 0; i < rv->new_count; i++) { - if (!dlm_is_member(ls, rv->new[i])) - continue; - log_debug(ls, "new nodeid %d is a re-added member", rv->new[i]); - - memb = kzalloc(sizeof(struct dlm_member), GFP_NOFS); - if (!memb) - return -ENOMEM; - memb->nodeid = rv->new[i]; - list_add_tail(&memb->list, &ls->ls_nodes_gone); neg++; + list_move(&memb->list, &ls->ls_nodes_gone); + ls->ls_num_nodes--; + dlm_lsop_recover_slot(ls, memb); } /* add new members to ls_nodes */ - for (i = 0; i < rv->node_count; i++) { - if (dlm_is_member(ls, rv->nodeids[i])) + for (i = 0; i < rv->nodes_count; i++) { + node = &rv->nodes[i]; + if (dlm_is_member(ls, node->nodeid)) continue; - dlm_add_member(ls, rv->nodeids[i]); - pos++; - log_debug(ls, "add member %d", rv->nodeids[i]); + dlm_add_member(ls, node); + log_debug(ls, "add member %d", node->nodeid); } list_for_each_entry(memb, &ls->ls_nodes, list) { @@ -609,21 +665,22 @@ int dlm_ls_stop(struct dlm_ls *ls) if (!ls->ls_recover_begin) ls->ls_recover_begin = jiffies; + + dlm_lsop_recover_prep(ls); return 0; } int dlm_ls_start(struct dlm_ls *ls) { struct dlm_recover *rv = NULL, *rv_old; - int *ids = NULL, *new = NULL; - int error, ids_count = 0, new_count = 0; + struct dlm_config_node *nodes; + int error, count; rv = kzalloc(sizeof(struct dlm_recover), GFP_NOFS); if (!rv) return -ENOMEM; - error = dlm_nodeid_list(ls->ls_name, &ids, &ids_count, - &new, &new_count); + error = dlm_config_nodes(ls->ls_name, &nodes, &count); if (error < 0) goto fail; @@ -638,10 +695,8 @@ int dlm_ls_start(struct dlm_ls *ls) goto fail; } - rv->nodeids = ids; - rv->node_count = ids_count; - rv->new = new; - rv->new_count = new_count; + rv->nodes = nodes; + rv->nodes_count = count; rv->seq = ++ls->ls_recover_seq; rv_old = ls->ls_recover_args; ls->ls_recover_args = rv; @@ -649,9 +704,8 @@ int dlm_ls_start(struct dlm_ls *ls) if 
(rv_old) { log_error(ls, "unused recovery %llx %d", - (unsigned long long)rv_old->seq, rv_old->node_count); - kfree(rv_old->nodeids); - kfree(rv_old->new); + (unsigned long long)rv_old->seq, rv_old->nodes_count); + kfree(rv_old->nodes); kfree(rv_old); } @@ -660,8 +714,7 @@ int dlm_ls_start(struct dlm_ls *ls) fail: kfree(rv); - kfree(ids); - kfree(new); + kfree(nodes); return error; } diff --git a/fs/dlm/member.h b/fs/dlm/member.h index 7e87e8a79df..3deb70661c6 100644 --- a/fs/dlm/member.h +++ b/fs/dlm/member.h @@ -1,7 +1,7 @@ /****************************************************************************** ******************************************************************************* ** -** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. +** Copyright (C) 2005-2011 Red Hat, Inc. All rights reserved. ** ** This copyrighted material is made available to anyone wishing to use, ** modify, copy, or redistribute it subject to the terms and conditions @@ -27,6 +27,7 @@ void dlm_slots_copy_out(struct dlm_ls *ls, struct dlm_rcom *rc); int dlm_slots_copy_in(struct dlm_ls *ls); int dlm_slots_assign(struct dlm_ls *ls, int *num_slots, int *slots_size, struct dlm_slot **slots_out, uint32_t *gen_out); +void dlm_lsop_recover_done(struct dlm_ls *ls); #endif /* __MEMBER_DOT_H__ */ diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c index 5a9e1a49a86..3780caf7ae0 100644 --- a/fs/dlm/recoverd.c +++ b/fs/dlm/recoverd.c @@ -2,7 +2,7 @@ ******************************************************************************* ** ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. -** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. +** Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved. ** ** This copyrighted material is made available to anyone wishing to use, ** modify, copy, or redistribute it subject to the terms and conditions @@ -227,11 +227,12 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) dlm_grant_after_purge(ls); - log_debug(ls, "dlm_recover %llx done: %u ms", - (unsigned long long)rv->seq, + log_debug(ls, "dlm_recover %llx generation %u done: %u ms", + (unsigned long long)rv->seq, ls->ls_generation, jiffies_to_msecs(jiffies - start)); mutex_unlock(&ls->ls_recoverd_active); + dlm_lsop_recover_done(ls); return 0; fail: @@ -259,8 +260,7 @@ static void do_ls_recovery(struct dlm_ls *ls) if (rv) { ls_recover(ls, rv); - kfree(rv->nodeids); - kfree(rv->new); + kfree(rv->nodes); kfree(rv); } } diff --git a/fs/dlm/user.c b/fs/dlm/user.c index d8ea6075640..eb4ed9ba309 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -392,8 +392,9 @@ static int device_create_lockspace(struct dlm_lspace_params *params) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - error = dlm_new_lockspace(params->name, strlen(params->name), - &lockspace, params->flags, DLM_USER_LVB_LEN); + error = dlm_new_lockspace(params->name, NULL, params->flags, + DLM_USER_LVB_LEN, NULL, NULL, NULL, + &lockspace); if (error) return error; diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 98c80d8c2a6..ce85b62bc0a 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -195,10 +195,10 @@ static int gdlm_mount(struct gfs2_sbd *sdp, const char *fsname) return -EINVAL; } - error = dlm_new_lockspace(fsname, strlen(fsname), &ls->ls_dlm, + error = dlm_new_lockspace(fsname, NULL, DLM_LSFL_FS | DLM_LSFL_NEWEXCL | (ls->ls_nodir ? 
DLM_LSFL_NODIR : 0), - GDLM_LVB_SIZE); + GDLM_LVB_SIZE, NULL, NULL, NULL, &ls->ls_dlm); if (error) printk(KERN_ERR "dlm_new_lockspace error %d", error); diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index a5ebe421195..286edf1e231 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -827,8 +827,8 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn) goto out; } - rc = dlm_new_lockspace(conn->cc_name, strlen(conn->cc_name), - &fsdlm, DLM_LSFL_FS, DLM_LVB_LEN); + rc = dlm_new_lockspace(conn->cc_name, NULL, DLM_LSFL_FS, DLM_LVB_LEN, + NULL, NULL, NULL, &fsdlm); if (rc) { ocfs2_live_connection_drop(control); goto out; diff --git a/include/linux/dlm.h b/include/linux/dlm.h index d4e02f5353a..6c7f6e9546c 100644 --- a/include/linux/dlm.h +++ b/include/linux/dlm.h @@ -2,7 +2,7 @@ ******************************************************************************* ** ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. -** Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. +** Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved. ** ** This copyrighted material is made available to anyone wishing to use, ** modify, copy, or redistribute it subject to the terms and conditions @@ -74,15 +74,76 @@ struct dlm_lksb { #ifdef __KERNEL__ +struct dlm_slot { + int nodeid; /* 1 to MAX_INT */ + int slot; /* 1 to MAX_INT */ +}; + +/* + * recover_prep: called before the dlm begins lock recovery. + * Notfies lockspace user that locks from failed members will be granted. + * recover_slot: called after recover_prep and before recover_done. + * Identifies a failed lockspace member. + * recover_done: called after the dlm completes lock recovery. + * Identifies lockspace members and lockspace generation number. + */ + +struct dlm_lockspace_ops { + void (*recover_prep) (void *ops_arg); + void (*recover_slot) (void *ops_arg, struct dlm_slot *slot); + void (*recover_done) (void *ops_arg, struct dlm_slot *slots, + int num_slots, int our_slot, uint32_t generation); +}; + /* * dlm_new_lockspace * - * Starts a lockspace with the given name. If the named lockspace exists in - * the cluster, the calling node joins it. + * Create/join a lockspace. + * + * name: lockspace name, null terminated, up to DLM_LOCKSPACE_LEN (not + * including terminating null). + * + * cluster: cluster name, null terminated, up to DLM_LOCKSPACE_LEN (not + * including terminating null). Optional. When cluster is null, it + * is not used. When set, dlm_new_lockspace() returns -EBADR if cluster + * is not equal to the dlm cluster name. + * + * flags: + * DLM_LSFL_NODIR + * The dlm should not use a resource directory, but statically assign + * resource mastery to nodes based on the name hash that is otherwise + * used to select the directory node. Must be the same on all nodes. + * DLM_LSFL_TIMEWARN + * The dlm should emit netlink messages if locks have been waiting + * for a configurable amount of time. (Unused.) + * DLM_LSFL_FS + * The lockspace user is in the kernel (i.e. filesystem). Enables + * direct bast/cast callbacks. + * DLM_LSFL_NEWEXCL + * dlm_new_lockspace() should return -EEXIST if the lockspace exists. + * + * lvblen: length of lvb in bytes. Must be multiple of 8. + * dlm_new_lockspace() returns an error if this does not match + * what other nodes are using. + * + * ops: callbacks that indicate lockspace recovery points so the + * caller can coordinate its recovery and know lockspace members. + * This is only used by the initial dlm_new_lockspace() call. + * Optional. 
+ * + * ops_arg: arg for ops callbacks. + * + * ops_result: tells caller if the ops callbacks (if provided) will + * be used or not. 0: will be used, -EXXX will not be used. + * -EOPNOTSUPP: the dlm does not have recovery_callbacks enabled. + * + * lockspace: handle for dlm functions */ -int dlm_new_lockspace(const char *name, int namelen, - dlm_lockspace_t **lockspace, uint32_t flags, int lvblen); +int dlm_new_lockspace(const char *name, const char *cluster, + uint32_t flags, int lvblen, + const struct dlm_lockspace_ops *ops, void *ops_arg, + int *ops_result, dlm_lockspace_t **lockspace); /* * dlm_release_lockspace -- cgit v1.2.3 From 5b25f70f4200766355cdabda604e131d2fb6010d Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Tue, 13 Sep 2011 10:55:48 +0200 Subject: Btrfs: add nested locking mode for paths This patch adds the possibilty to read-lock an extent even if it is already write-locked from the same thread. btrfs_find_all_roots() needs this capability. Signed-off-by: Arne Jansen Signed-off-by: Jan Schmidt --- fs/btrfs/extent_io.c | 1 + fs/btrfs/extent_io.h | 2 ++ fs/btrfs/locking.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 54 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index be1bf627a14..dd8d140eb27 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3571,6 +3571,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, atomic_set(&eb->blocking_writers, 0); atomic_set(&eb->spinning_readers, 0); atomic_set(&eb->spinning_writers, 0); + eb->lock_nested = 0; init_waitqueue_head(&eb->write_lock_wq); init_waitqueue_head(&eb->read_lock_wq); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 7604c300132..bc6a042cb6f 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -129,6 +129,7 @@ struct extent_buffer { struct list_head leak_list; struct rcu_head rcu_head; atomic_t refs; + pid_t lock_owner; /* count of read lock holders on the extent buffer */ atomic_t write_locks; @@ -137,6 +138,7 @@ struct extent_buffer { atomic_t blocking_readers; atomic_t spinning_readers; atomic_t spinning_writers; + int lock_nested; /* protects write locks */ rwlock_t lock; diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index d77b67c4b27..5e178d8f716 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -33,6 +33,14 @@ void btrfs_assert_tree_read_locked(struct extent_buffer *eb); */ void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw) { + if (eb->lock_nested) { + read_lock(&eb->lock); + if (eb->lock_nested && current->pid == eb->lock_owner) { + read_unlock(&eb->lock); + return; + } + read_unlock(&eb->lock); + } if (rw == BTRFS_WRITE_LOCK) { if (atomic_read(&eb->blocking_writers) == 0) { WARN_ON(atomic_read(&eb->spinning_writers) != 1); @@ -57,6 +65,14 @@ void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw) */ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw) { + if (eb->lock_nested) { + read_lock(&eb->lock); + if (&eb->lock_nested && current->pid == eb->lock_owner) { + read_unlock(&eb->lock); + return; + } + read_unlock(&eb->lock); + } if (rw == BTRFS_WRITE_LOCK_BLOCKING) { BUG_ON(atomic_read(&eb->blocking_writers) != 1); write_lock(&eb->lock); @@ -81,12 +97,25 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw) void btrfs_tree_read_lock(struct extent_buffer *eb) { again: + read_lock(&eb->lock); + if (atomic_read(&eb->blocking_writers) && + current->pid == eb->lock_owner) { + /* + * This 
extent is already write-locked by our thread. We allow + * an additional read lock to be added because it's for the same + * thread. btrfs_find_all_roots() depends on this as it may be + * called on a partly (write-)locked tree. + */ + BUG_ON(eb->lock_nested); + eb->lock_nested = 1; + read_unlock(&eb->lock); + return; + } + read_unlock(&eb->lock); wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0); read_lock(&eb->lock); if (atomic_read(&eb->blocking_writers)) { read_unlock(&eb->lock); - wait_event(eb->write_lock_wq, - atomic_read(&eb->blocking_writers) == 0); goto again; } atomic_inc(&eb->read_locks); @@ -129,6 +158,7 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb) } atomic_inc(&eb->write_locks); atomic_inc(&eb->spinning_writers); + eb->lock_owner = current->pid; return 1; } @@ -137,6 +167,15 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb) */ void btrfs_tree_read_unlock(struct extent_buffer *eb) { + if (eb->lock_nested) { + read_lock(&eb->lock); + if (eb->lock_nested && current->pid == eb->lock_owner) { + eb->lock_nested = 0; + read_unlock(&eb->lock); + return; + } + read_unlock(&eb->lock); + } btrfs_assert_tree_read_locked(eb); WARN_ON(atomic_read(&eb->spinning_readers) == 0); atomic_dec(&eb->spinning_readers); @@ -149,6 +188,15 @@ void btrfs_tree_read_unlock(struct extent_buffer *eb) */ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb) { + if (eb->lock_nested) { + read_lock(&eb->lock); + if (eb->lock_nested && current->pid == eb->lock_owner) { + eb->lock_nested = 0; + read_unlock(&eb->lock); + return; + } + read_unlock(&eb->lock); + } btrfs_assert_tree_read_locked(eb); WARN_ON(atomic_read(&eb->blocking_readers) == 0); if (atomic_dec_and_test(&eb->blocking_readers)) @@ -181,6 +229,7 @@ again: WARN_ON(atomic_read(&eb->spinning_writers)); atomic_inc(&eb->spinning_writers); atomic_inc(&eb->write_locks); + eb->lock_owner = current->pid; return 0; } -- cgit v1.2.3 From 00f04b88791ff49dc64ada18819d40a5b0671709 Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Wed, 14 Sep 2011 12:37:00 +0200 Subject: Btrfs: add sequence numbers to delayed refs Sequence numbers are needed to reconstruct the backrefs of a given extent to a certain point in time. The total set of backrefs consist of the set of backrefs recorded on disk plus the enqueued delayed refs for it that existed at that moment. This patch also adds a list that records all delayed refs which are currently in the process of being added. When walking all refs of an extent in btrfs_find_all_roots(), we freeze the current state of delayed refs, honor anythinh up to this point and prevent processing newer delayed refs to assert consistency. 
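As a minimal sketch of that freeze/walk/release pattern (the caller shown here is hypothetical; only struct seq_list and the btrfs_get_delayed_seq()/btrfs_put_delayed_seq() helpers added below are from this patch), a backref walker would pin the current sequence number for the duration of its walk:

static int walk_backrefs_at_frozen_seq(struct btrfs_trans_handle *trans)
{
	struct btrfs_delayed_ref_root *delayed_refs;
	struct seq_list elem;
	int ret = 0;

	delayed_refs = &trans->transaction->delayed_refs;

	/* btrfs_get_delayed_seq() expects delayed_refs->lock to be held */
	spin_lock(&delayed_refs->lock);
	btrfs_get_delayed_seq(delayed_refs, &elem);
	spin_unlock(&delayed_refs->lock);

	/*
	 * ... walk the on-disk refs plus the delayed refs whose seq is not
	 * newer than elem.seq; the actual walk is omitted in this sketch ...
	 */

	/* btrfs_put_delayed_seq() takes the lock itself */
	btrfs_put_delayed_seq(delayed_refs, &elem);
	return ret;
}

Delayed refs that receive a sequence number while the entry sits on seq_head are then held back by btrfs_check_delayed_seq() until the walker drops it.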
Signed-off-by: Arne Jansen Signed-off-by: Jan Schmidt --- fs/btrfs/delayed-ref.c | 34 ++++++++++++++++++++++++ fs/btrfs/delayed-ref.h | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/transaction.c | 4 +++ 3 files changed, 108 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index babd37badb4..a405db0320e 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -101,6 +101,11 @@ static int comp_entry(struct btrfs_delayed_ref_node *ref2, return -1; if (ref1->type > ref2->type) return 1; + /* merging of sequenced refs is not allowed */ + if (ref1->seq < ref2->seq) + return -1; + if (ref1->seq > ref2->seq) + return 1; if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY || ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) { return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2), @@ -209,6 +214,24 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, return 0; } +int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs, + u64 seq) +{ + struct seq_list *elem; + + assert_spin_locked(&delayed_refs->lock); + if (list_empty(&delayed_refs->seq_head)) + return 0; + + elem = list_first_entry(&delayed_refs->seq_head, struct seq_list, list); + if (seq >= elem->seq) { + pr_debug("holding back delayed_ref %llu, lowest is %llu (%p)\n", + seq, elem->seq, delayed_refs); + return 1; + } + return 0; +} + int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, struct list_head *cluster, u64 start) { @@ -438,6 +461,7 @@ static noinline int add_delayed_ref_head(struct btrfs_fs_info *fs_info, ref->action = 0; ref->is_head = 1; ref->in_tree = 1; + ref->seq = 0; head_ref = btrfs_delayed_node_to_head(ref); head_ref->must_insert_reserved = must_insert_reserved; @@ -479,6 +503,7 @@ static noinline int add_delayed_tree_ref(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_node *existing; struct btrfs_delayed_tree_ref *full_ref; struct btrfs_delayed_ref_root *delayed_refs; + u64 seq = 0; if (action == BTRFS_ADD_DELAYED_EXTENT) action = BTRFS_ADD_DELAYED_REF; @@ -494,6 +519,10 @@ static noinline int add_delayed_tree_ref(struct btrfs_fs_info *fs_info, ref->is_head = 0; ref->in_tree = 1; + if (need_ref_seq(for_cow, ref_root)) + seq = inc_delayed_seq(delayed_refs); + ref->seq = seq; + full_ref = btrfs_delayed_node_to_tree_ref(ref); full_ref->parent = parent; full_ref->root = ref_root; @@ -534,6 +563,7 @@ static noinline int add_delayed_data_ref(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_node *existing; struct btrfs_delayed_data_ref *full_ref; struct btrfs_delayed_ref_root *delayed_refs; + u64 seq = 0; if (action == BTRFS_ADD_DELAYED_EXTENT) action = BTRFS_ADD_DELAYED_REF; @@ -549,6 +579,10 @@ static noinline int add_delayed_data_ref(struct btrfs_fs_info *fs_info, ref->is_head = 0; ref->in_tree = 1; + if (need_ref_seq(for_cow, ref_root)) + seq = inc_delayed_seq(delayed_refs); + ref->seq = seq; + full_ref = btrfs_delayed_node_to_data_ref(ref); full_ref->parent = parent; full_ref->root = ref_root; diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index a5fb2bc8373..174416f7882 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -33,6 +33,9 @@ struct btrfs_delayed_ref_node { /* the size of the extent */ u64 num_bytes; + /* seq number to keep track of insertion order */ + u64 seq; + /* ref count on this data structure */ atomic_t refs; @@ -136,6 +139,20 @@ struct btrfs_delayed_ref_root { int flushing; u64 run_delayed_start; + + /* + * seq number of delayed refs. 
We need to know if a backref was being + * added before the currently processed ref or afterwards. + */ + u64 seq; + + /* + * seq_list holds a list of all seq numbers that are currently being + * added to the list. While walking backrefs (btrfs_find_all_roots, + * qgroups), which might take some time, no newer ref must be processed, + * as it might influence the outcome of the walk. + */ + struct list_head seq_head; }; static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) @@ -171,6 +188,59 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *head); int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, struct list_head *cluster, u64 search_start); + +struct seq_list { + struct list_head list; + u64 seq; +}; + +static inline u64 inc_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs) +{ + assert_spin_locked(&delayed_refs->lock); + ++delayed_refs->seq; + return delayed_refs->seq; +} + +static inline void +btrfs_get_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs, + struct seq_list *elem) +{ + assert_spin_locked(&delayed_refs->lock); + elem->seq = delayed_refs->seq; + list_add_tail(&elem->list, &delayed_refs->seq_head); +} + +static inline void +btrfs_put_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs, + struct seq_list *elem) +{ + spin_lock(&delayed_refs->lock); + list_del(&elem->list); + spin_unlock(&delayed_refs->lock); +} + +int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs, + u64 seq); + +/* + * delayed refs with a ref_seq > 0 must be held back during backref walking. + * this only applies to items in one of the fs-trees. for_cow items never need + * to be held back, so they won't get a ref_seq number. + */ +static inline int need_ref_seq(int for_cow, u64 rootid) +{ + if (for_cow) + return 0; + + if (rootid == BTRFS_FS_TREE_OBJECTID) + return 1; + + if ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID) + return 1; + + return 0; +} + /* * a node might live in a head or a regular ref, this lets you * test for the proper type to use. diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index a2bfedcbcab..31a7393af64 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -36,6 +36,8 @@ static noinline void put_transaction(struct btrfs_transaction *transaction) WARN_ON(atomic_read(&transaction->use_count) == 0); if (atomic_dec_and_test(&transaction->use_count)) { BUG_ON(!list_empty(&transaction->list)); + WARN_ON(transaction->delayed_refs.root.rb_node); + WARN_ON(!list_empty(&transaction->delayed_refs.seq_head)); memset(transaction, 0, sizeof(*transaction)); kmem_cache_free(btrfs_transaction_cachep, transaction); } @@ -108,8 +110,10 @@ loop: cur_trans->delayed_refs.num_heads = 0; cur_trans->delayed_refs.flushing = 0; cur_trans->delayed_refs.run_delayed_start = 0; + cur_trans->delayed_refs.seq = 1; spin_lock_init(&cur_trans->commit_lock); spin_lock_init(&cur_trans->delayed_refs.lock); + INIT_LIST_HEAD(&cur_trans->delayed_refs.seq_head); INIT_LIST_HEAD(&cur_trans->pending_snapshots); list_add_tail(&cur_trans->list, &root->fs_info->trans_list); -- cgit v1.2.3 From d1270cd91f308c9d22b2804720c36ccd32dbc35e Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Tue, 13 Sep 2011 15:16:43 +0200 Subject: Btrfs: put back delayed refs that are too new When processing a delayed ref, first check if there are still old refs in the process of being added. If so, put this ref back to the tree. To avoid looping on this ref, choose a newer one in the next loop. 
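To make the "choose a newer one" step concrete (the bytenr values below are invented for illustration, they are not from this patch): with delayed ref heads queued at bytenrs 4096, 8192 and 16384, the reworked lookup that btrfs_find_ref_cluster() now issues behaves roughly like this:

	find_ref_head(&delayed_refs->root,  8192 + 1, &ref, 1);  /* finds the head at 16384 (next bigger) */
	find_ref_head(&delayed_refs->root, 16384 + 1, &ref, 1);  /* wraps around to the head at 4096      */
	find_ref_head(&delayed_refs->root,  4096,     &ref, 1);  /* finds the head at 4096 (exact match)  */

Searching from start + 1 is how the next pass moves past a head it just had to put back instead of looping on it.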
btrfs_find_ref_cluster has to take care of that. Signed-off-by: Arne Jansen Signed-off-by: Jan Schmidt --- fs/btrfs/delayed-ref.c | 43 +++++++++++++++++++++++++------------------ fs/btrfs/extent-tree.c | 27 ++++++++++++++++++++++----- 2 files changed, 47 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index a405db0320e..ee181989d44 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -155,16 +155,22 @@ static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root, /* * find an head entry based on bytenr. This returns the delayed ref - * head if it was able to find one, or NULL if nothing was in that spot + * head if it was able to find one, or NULL if nothing was in that spot. + * If return_bigger is given, the next bigger entry is returned if no exact + * match is found. */ static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root, u64 bytenr, - struct btrfs_delayed_ref_node **last) + struct btrfs_delayed_ref_node **last, + int return_bigger) { - struct rb_node *n = root->rb_node; + struct rb_node *n; struct btrfs_delayed_ref_node *entry; - int cmp; + int cmp = 0; +again: + n = root->rb_node; + entry = NULL; while (n) { entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); WARN_ON(!entry->in_tree); @@ -187,6 +193,19 @@ static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root, else return entry; } + if (entry && return_bigger) { + if (cmp > 0) { + n = rb_next(&entry->rb_node); + if (!n) + n = rb_first(root); + entry = rb_entry(n, struct btrfs_delayed_ref_node, + rb_node); + bytenr = entry->bytenr; + return_bigger = 0; + goto again; + } + return entry; + } return NULL; } @@ -246,20 +265,8 @@ int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, node = rb_first(&delayed_refs->root); } else { ref = NULL; - find_ref_head(&delayed_refs->root, start, &ref); + find_ref_head(&delayed_refs->root, start + 1, &ref, 1); if (ref) { - struct btrfs_delayed_ref_node *tmp; - - node = rb_prev(&ref->rb_node); - while (node) { - tmp = rb_entry(node, - struct btrfs_delayed_ref_node, - rb_node); - if (tmp->bytenr < start) - break; - ref = tmp; - node = rb_prev(&ref->rb_node); - } node = &ref->rb_node; } else node = rb_first(&delayed_refs->root); @@ -748,7 +755,7 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr) struct btrfs_delayed_ref_root *delayed_refs; delayed_refs = &trans->transaction->delayed_refs; - ref = find_ref_head(&delayed_refs->root, bytenr, NULL); + ref = find_ref_head(&delayed_refs->root, bytenr, NULL, 0); if (ref) return btrfs_delayed_node_to_head(ref); return NULL; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index dc8b9a83459..bbcca12fbbb 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2236,6 +2236,28 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, } } + /* + * locked_ref is the head node, so we have to go one + * node back for any delayed ref updates + */ + ref = select_delayed_ref(locked_ref); + + if (ref && ref->seq && + btrfs_check_delayed_seq(delayed_refs, ref->seq)) { + /* + * there are still refs with lower seq numbers in the + * process of being added. Don't run this ref yet. 
+ */ + list_del_init(&locked_ref->cluster); + mutex_unlock(&locked_ref->mutex); + locked_ref = NULL; + delayed_refs->num_heads_ready++; + spin_unlock(&delayed_refs->lock); + cond_resched(); + spin_lock(&delayed_refs->lock); + continue; + } + /* * record the must insert reserved flag before we * drop the spin lock. @@ -2246,11 +2268,6 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, extent_op = locked_ref->extent_op; locked_ref->extent_op = NULL; - /* - * locked_ref is the head node, so we have to go one - * node back for any delayed ref updates - */ - ref = select_delayed_ref(locked_ref); if (!ref) { /* All delayed refs have been processed, Go ahead * and send the head node to run_one_delayed_ref, -- cgit v1.2.3 From a168650c08300434e1456abe7b6451f1448230d3 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Mon, 12 Dec 2011 16:10:07 +0100 Subject: Btrfs: add waitqueue instead of doing busy waiting for more delayed refs Now that we may be holding back delayed refs for a limited period, we might end up having no runnable delayed refs. Without this commit, we'd do busy waiting in that thread until another (runnable) ref arives. Instead, we're detecting this situation and use a waitqueue, such that we only try to run more refs after a) another runnable ref was added or b) delayed refs are no longer held back Signed-off-by: Jan Schmidt --- fs/btrfs/delayed-ref.c | 8 +++++++ fs/btrfs/delayed-ref.h | 7 ++++++ fs/btrfs/extent-tree.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++- fs/btrfs/transaction.c | 1 + 4 files changed, 74 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index ee181989d44..66e4f29505a 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -664,6 +664,9 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, num_bytes, parent, ref_root, level, action, for_cow); BUG_ON(ret); + if (!need_ref_seq(for_cow, ref_root) && + waitqueue_active(&delayed_refs->seq_wait)) + wake_up(&delayed_refs->seq_wait); spin_unlock(&delayed_refs->lock); return 0; } @@ -712,6 +715,9 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, num_bytes, parent, ref_root, owner, offset, action, for_cow); BUG_ON(ret); + if (!need_ref_seq(for_cow, ref_root) && + waitqueue_active(&delayed_refs->seq_wait)) + wake_up(&delayed_refs->seq_wait); spin_unlock(&delayed_refs->lock); return 0; } @@ -739,6 +745,8 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, extent_op->is_data); BUG_ON(ret); + if (waitqueue_active(&delayed_refs->seq_wait)) + wake_up(&delayed_refs->seq_wait); spin_unlock(&delayed_refs->lock); return 0; } diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 174416f7882..d8f244d9492 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -153,6 +153,12 @@ struct btrfs_delayed_ref_root { * as it might influence the outcome of the walk. */ struct list_head seq_head; + + /* + * when the only refs we have in the list must not be processed, we want + * to wait for more refs to show up or for the end of backref walking. 
+ */ + wait_queue_head_t seq_wait; }; static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) @@ -216,6 +222,7 @@ btrfs_put_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs, { spin_lock(&delayed_refs->lock); list_del(&elem->list); + wake_up(&delayed_refs->seq_wait); spin_unlock(&delayed_refs->lock); } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index bbcca12fbbb..0a435e2e143 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2300,7 +2300,12 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, ref->in_tree = 0; rb_erase(&ref->rb_node, &delayed_refs->root); delayed_refs->num_entries--; - + /* + * we modified num_entries, but as we're currently running + * delayed refs, skip + * wake_up(&delayed_refs->seq_wait); + * here. + */ spin_unlock(&delayed_refs->lock); ret = run_one_delayed_ref(trans, root, ref, extent_op, @@ -2317,6 +2322,23 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, return count; } + +static void wait_for_more_refs(struct btrfs_delayed_ref_root *delayed_refs, + unsigned long num_refs) +{ + struct list_head *first_seq = delayed_refs->seq_head.next; + + spin_unlock(&delayed_refs->lock); + pr_debug("waiting for more refs (num %ld, first %p)\n", + num_refs, first_seq); + wait_event(delayed_refs->seq_wait, + num_refs != delayed_refs->num_entries || + delayed_refs->seq_head.next != first_seq); + pr_debug("done waiting for more refs (num %ld, first %p)\n", + delayed_refs->num_entries, delayed_refs->seq_head.next); + spin_lock(&delayed_refs->lock); +} + /* * this starts processing the delayed reference count updates and * extent insertions we have queued up so far. count can be @@ -2332,8 +2354,11 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *ref; struct list_head cluster; int ret; + u64 delayed_start; int run_all = count == (unsigned long)-1; int run_most = 0; + unsigned long num_refs = 0; + int consider_waiting; if (root == root->fs_info->extent_root) root = root->fs_info->tree_root; @@ -2341,6 +2366,7 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, delayed_refs = &trans->transaction->delayed_refs; INIT_LIST_HEAD(&cluster); again: + consider_waiting = 0; spin_lock(&delayed_refs->lock); if (count == 0) { count = delayed_refs->num_entries * 2; @@ -2357,11 +2383,35 @@ again: * of refs to process starting at the first one we are able to * lock */ + delayed_start = delayed_refs->run_delayed_start; ret = btrfs_find_ref_cluster(trans, &cluster, delayed_refs->run_delayed_start); if (ret) break; + if (delayed_start >= delayed_refs->run_delayed_start) { + if (consider_waiting == 0) { + /* + * btrfs_find_ref_cluster looped. let's do one + * more cycle. if we don't run any delayed ref + * during that cycle (because we can't because + * all of them are blocked) and if the number of + * refs doesn't change, we avoid busy waiting. + */ + consider_waiting = 1; + num_refs = delayed_refs->num_entries; + } else { + wait_for_more_refs(delayed_refs, num_refs); + /* + * after waiting, things have changed. we + * dropped the lock and someone else might have + * run some refs, built new clusters and so on. + * therefore, we restart staleness detection. 
+ */ + consider_waiting = 0; + } + } + ret = run_clustered_refs(trans, root, &cluster); BUG_ON(ret < 0); @@ -2369,6 +2419,11 @@ again: if (count == 0) break; + + if (ret || delayed_refs->run_delayed_start == 0) { + /* refs were run, let's reset staleness detection */ + consider_waiting = 0; + } } if (run_all) { @@ -4933,6 +4988,8 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, rb_erase(&head->node.rb_node, &delayed_refs->root); delayed_refs->num_entries--; + if (waitqueue_active(&delayed_refs->seq_wait)) + wake_up(&delayed_refs->seq_wait); /* * we don't take a ref on the node because we're removing it from the diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 31a7393af64..04c5c7c2c32 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -111,6 +111,7 @@ loop: cur_trans->delayed_refs.flushing = 0; cur_trans->delayed_refs.run_delayed_start = 0; cur_trans->delayed_refs.seq = 1; + init_waitqueue_head(&cur_trans->delayed_refs.seq_wait); spin_lock_init(&cur_trans->commit_lock); spin_lock_init(&cur_trans->delayed_refs.lock); INIT_LIST_HEAD(&cur_trans->delayed_refs.seq_head); -- cgit v1.2.3 From 8da6d5815c592b713ecaf4f4f8b631f8359c96c4 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Wed, 23 Nov 2011 18:55:04 +0100 Subject: Btrfs: added btrfs_find_all_roots() This function gets a byte number (a data extent), collects all the leafs pointing to it and walks up the trees to find all fs roots pointing to those leafs. It also returns the list of all leafs pointing to that extent. It does proper locking for the involved trees, can be used on busy file systems and honors delayed refs. Signed-off-by: Arne Jansen Signed-off-by: Jan Schmidt --- fs/btrfs/backref.c | 783 +++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/backref.h | 5 + 2 files changed, 788 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 22c64fff1bd..03c30a1836f 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -19,6 +19,9 @@ #include "ctree.h" #include "disk-io.h" #include "backref.h" +#include "ulist.h" +#include "transaction.h" +#include "delayed-ref.h" struct __data_ref { struct list_head list; @@ -32,6 +35,786 @@ struct __shared_ref { u64 disk_byte; }; +/* + * this structure records all encountered refs on the way up to the root + */ +struct __prelim_ref { + struct list_head list; + u64 root_id; + struct btrfs_key key; + int level; + int count; + u64 parent; + u64 wanted_disk_byte; +}; + +static int __add_prelim_ref(struct list_head *head, u64 root_id, + struct btrfs_key *key, int level, u64 parent, + u64 wanted_disk_byte, int count) +{ + struct __prelim_ref *ref; + + /* in case we're adding delayed refs, we're holding the refs spinlock */ + ref = kmalloc(sizeof(*ref), GFP_ATOMIC); + if (!ref) + return -ENOMEM; + + ref->root_id = root_id; + if (key) + ref->key = *key; + else + memset(&ref->key, 0, sizeof(ref->key)); + + ref->level = level; + ref->count = count; + ref->parent = parent; + ref->wanted_disk_byte = wanted_disk_byte; + list_add_tail(&ref->list, head); + + return 0; +} + +static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, + struct ulist *parents, + struct extent_buffer *eb, int level, + u64 wanted_objectid, u64 wanted_disk_byte) +{ + int ret; + int slot; + struct btrfs_file_extent_item *fi; + struct btrfs_key key; + u64 disk_byte; + +add_parent: + ret = ulist_add(parents, eb->start, 0, GFP_NOFS); + if (ret < 0) + return ret; + + if (level != 0) + return 0; + + /* + * if the current 
leaf is full with EXTENT_DATA items, we must + * check the next one if that holds a reference as well. + * ref->count cannot be used to skip this check. + * repeat this until we don't find any additional EXTENT_DATA items. + */ + while (1) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + return ret; + if (ret) + return 0; + + eb = path->nodes[0]; + for (slot = 0; slot < btrfs_header_nritems(eb); ++slot) { + btrfs_item_key_to_cpu(eb, &key, slot); + if (key.objectid != wanted_objectid || + key.type != BTRFS_EXTENT_DATA_KEY) + return 0; + fi = btrfs_item_ptr(eb, slot, + struct btrfs_file_extent_item); + disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); + if (disk_byte == wanted_disk_byte) + goto add_parent; + } + } + + return 0; +} + +/* + * resolve an indirect backref in the form (root_id, key, level) + * to a logical address + */ +static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, + struct __prelim_ref *ref, + struct ulist *parents) +{ + struct btrfs_path *path; + struct btrfs_root *root; + struct btrfs_key root_key; + struct btrfs_key key = {0}; + struct extent_buffer *eb; + int ret = 0; + int root_level; + int level = ref->level; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + root_key.objectid = ref->root_id; + root_key.type = BTRFS_ROOT_ITEM_KEY; + root_key.offset = (u64)-1; + root = btrfs_read_fs_root_no_name(fs_info, &root_key); + if (IS_ERR(root)) { + ret = PTR_ERR(root); + goto out; + } + + rcu_read_lock(); + root_level = btrfs_header_level(root->node); + rcu_read_unlock(); + + if (root_level + 1 == level) + goto out; + + path->lowest_level = level; + ret = btrfs_search_slot(NULL, root, &ref->key, path, 0, 0); + pr_debug("search slot in root %llu (level %d, ref count %d) returned " + "%d for key (%llu %u %llu)\n", + (unsigned long long)ref->root_id, level, ref->count, ret, + (unsigned long long)ref->key.objectid, ref->key.type, + (unsigned long long)ref->key.offset); + if (ret < 0) + goto out; + + eb = path->nodes[level]; + if (!eb) { + WARN_ON(1); + ret = 1; + goto out; + } + + if (level == 0) { + if (ret == 1 && path->slots[0] >= btrfs_header_nritems(eb)) { + ret = btrfs_next_leaf(root, path); + if (ret) + goto out; + eb = path->nodes[0]; + } + + btrfs_item_key_to_cpu(eb, &key, path->slots[0]); + } + + /* the last two parameters will only be used for level == 0 */ + ret = add_all_parents(root, path, parents, eb, level, key.objectid, + ref->wanted_disk_byte); +out: + btrfs_free_path(path); + return ret; +} + +/* + * resolve all indirect backrefs from the list + */ +static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, + struct list_head *head) +{ + int err; + int ret = 0; + struct __prelim_ref *ref; + struct __prelim_ref *ref_safe; + struct __prelim_ref *new_ref; + struct ulist *parents; + struct ulist_node *node; + + parents = ulist_alloc(GFP_NOFS); + if (!parents) + return -ENOMEM; + + /* + * _safe allows us to insert directly after the current item without + * iterating over the newly inserted items. + * we're also allowed to re-assign ref during iteration. + */ + list_for_each_entry_safe(ref, ref_safe, head, list) { + if (ref->parent) /* already direct */ + continue; + if (ref->count == 0) + continue; + err = __resolve_indirect_ref(fs_info, ref, parents); + if (err) { + if (ret == 0) + ret = err; + continue; + } + + /* we put the first parent into the ref at hand */ + node = ulist_next(parents, NULL); + ref->parent = node ? 
node->val : 0; + + /* additional parents require new refs being added here */ + while ((node = ulist_next(parents, node))) { + new_ref = kmalloc(sizeof(*new_ref), GFP_NOFS); + if (!new_ref) { + ret = -ENOMEM; + break; + } + memcpy(new_ref, ref, sizeof(*ref)); + new_ref->parent = node->val; + list_add(&new_ref->list, &ref->list); + } + ulist_reinit(parents); + } + + ulist_free(parents); + return ret; +} + +/* + * merge two lists of backrefs and adjust counts accordingly + * + * mode = 1: merge identical keys, if key is set + * mode = 2: merge identical parents + */ +static int __merge_refs(struct list_head *head, int mode) +{ + struct list_head *pos1; + + list_for_each(pos1, head) { + struct list_head *n2; + struct list_head *pos2; + struct __prelim_ref *ref1; + + ref1 = list_entry(pos1, struct __prelim_ref, list); + + if (mode == 1 && ref1->key.type == 0) + continue; + for (pos2 = pos1->next, n2 = pos2->next; pos2 != head; + pos2 = n2, n2 = pos2->next) { + struct __prelim_ref *ref2; + + ref2 = list_entry(pos2, struct __prelim_ref, list); + + if (mode == 1) { + if (memcmp(&ref1->key, &ref2->key, + sizeof(ref1->key)) || + ref1->level != ref2->level || + ref1->root_id != ref2->root_id) + continue; + ref1->count += ref2->count; + } else { + if (ref1->parent != ref2->parent) + continue; + ref1->count += ref2->count; + } + list_del(&ref2->list); + kfree(ref2); + } + + } + return 0; +} + +/* + * add all currently queued delayed refs from this head whose seq nr is + * smaller or equal that seq to the list + */ +static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, + struct btrfs_key *info_key, + struct list_head *prefs) +{ + struct btrfs_delayed_extent_op *extent_op = head->extent_op; + struct rb_node *n = &head->node.rb_node; + int sgn; + int ret; + + if (extent_op && extent_op->update_key) + btrfs_disk_key_to_cpu(info_key, &extent_op->key); + + while ((n = rb_prev(n))) { + struct btrfs_delayed_ref_node *node; + node = rb_entry(n, struct btrfs_delayed_ref_node, + rb_node); + if (node->bytenr != head->node.bytenr) + break; + WARN_ON(node->is_head); + + if (node->seq > seq) + continue; + + switch (node->action) { + case BTRFS_ADD_DELAYED_EXTENT: + case BTRFS_UPDATE_DELAYED_HEAD: + WARN_ON(1); + continue; + case BTRFS_ADD_DELAYED_REF: + sgn = 1; + break; + case BTRFS_DROP_DELAYED_REF: + sgn = -1; + break; + default: + BUG_ON(1); + } + switch (node->type) { + case BTRFS_TREE_BLOCK_REF_KEY: { + struct btrfs_delayed_tree_ref *ref; + + ref = btrfs_delayed_node_to_tree_ref(node); + ret = __add_prelim_ref(prefs, ref->root, info_key, + ref->level + 1, 0, node->bytenr, + node->ref_mod * sgn); + break; + } + case BTRFS_SHARED_BLOCK_REF_KEY: { + struct btrfs_delayed_tree_ref *ref; + + ref = btrfs_delayed_node_to_tree_ref(node); + ret = __add_prelim_ref(prefs, ref->root, info_key, + ref->level + 1, ref->parent, + node->bytenr, + node->ref_mod * sgn); + break; + } + case BTRFS_EXTENT_DATA_REF_KEY: { + struct btrfs_delayed_data_ref *ref; + struct btrfs_key key; + + ref = btrfs_delayed_node_to_data_ref(node); + + key.objectid = ref->objectid; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = ref->offset; + ret = __add_prelim_ref(prefs, ref->root, &key, 0, 0, + node->bytenr, + node->ref_mod * sgn); + break; + } + case BTRFS_SHARED_DATA_REF_KEY: { + struct btrfs_delayed_data_ref *ref; + struct btrfs_key key; + + ref = btrfs_delayed_node_to_data_ref(node); + + key.objectid = ref->objectid; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = ref->offset; + ret = __add_prelim_ref(prefs, 
ref->root, &key, 0, + ref->parent, node->bytenr, + node->ref_mod * sgn); + break; + } + default: + WARN_ON(1); + } + BUG_ON(ret); + } + + return 0; +} + +/* + * add all inline backrefs for bytenr to the list + */ +static int __add_inline_refs(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, u64 bytenr, + struct btrfs_key *info_key, int *info_level, + struct list_head *prefs) +{ + int ret; + int slot; + struct extent_buffer *leaf; + struct btrfs_key key; + unsigned long ptr; + unsigned long end; + struct btrfs_extent_item *ei; + u64 flags; + u64 item_size; + + /* + * enumerate all inline refs + */ + leaf = path->nodes[0]; + slot = path->slots[0] - 1; + + item_size = btrfs_item_size_nr(leaf, slot); + BUG_ON(item_size < sizeof(*ei)); + + ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); + flags = btrfs_extent_flags(leaf, ei); + + ptr = (unsigned long)(ei + 1); + end = (unsigned long)ei + item_size; + + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + struct btrfs_tree_block_info *info; + struct btrfs_disk_key disk_key; + + info = (struct btrfs_tree_block_info *)ptr; + *info_level = btrfs_tree_block_level(leaf, info); + btrfs_tree_block_key(leaf, info, &disk_key); + btrfs_disk_key_to_cpu(info_key, &disk_key); + ptr += sizeof(struct btrfs_tree_block_info); + BUG_ON(ptr > end); + } else { + BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA)); + } + + while (ptr < end) { + struct btrfs_extent_inline_ref *iref; + u64 offset; + int type; + + iref = (struct btrfs_extent_inline_ref *)ptr; + type = btrfs_extent_inline_ref_type(leaf, iref); + offset = btrfs_extent_inline_ref_offset(leaf, iref); + + switch (type) { + case BTRFS_SHARED_BLOCK_REF_KEY: + ret = __add_prelim_ref(prefs, 0, info_key, + *info_level + 1, offset, + bytenr, 1); + break; + case BTRFS_SHARED_DATA_REF_KEY: { + struct btrfs_shared_data_ref *sdref; + int count; + + sdref = (struct btrfs_shared_data_ref *)(iref + 1); + count = btrfs_shared_data_ref_count(leaf, sdref); + ret = __add_prelim_ref(prefs, 0, NULL, 0, offset, + bytenr, count); + break; + } + case BTRFS_TREE_BLOCK_REF_KEY: + ret = __add_prelim_ref(prefs, offset, info_key, + *info_level + 1, 0, bytenr, 1); + break; + case BTRFS_EXTENT_DATA_REF_KEY: { + struct btrfs_extent_data_ref *dref; + int count; + u64 root; + + dref = (struct btrfs_extent_data_ref *)(&iref->offset); + count = btrfs_extent_data_ref_count(leaf, dref); + key.objectid = btrfs_extent_data_ref_objectid(leaf, + dref); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = btrfs_extent_data_ref_offset(leaf, dref); + root = btrfs_extent_data_ref_root(leaf, dref); + ret = __add_prelim_ref(prefs, root, &key, 0, 0, bytenr, + count); + break; + } + default: + WARN_ON(1); + } + BUG_ON(ret); + ptr += btrfs_extent_inline_ref_size(type); + } + + return 0; +} + +/* + * add all non-inline backrefs for bytenr to the list + */ +static int __add_keyed_refs(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, u64 bytenr, + struct btrfs_key *info_key, int info_level, + struct list_head *prefs) +{ + struct btrfs_root *extent_root = fs_info->extent_root; + int ret; + int slot; + struct extent_buffer *leaf; + struct btrfs_key key; + + while (1) { + ret = btrfs_next_item(extent_root, path); + if (ret < 0) + break; + if (ret) { + ret = 0; + break; + } + + slot = path->slots[0]; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, slot); + + if (key.objectid != bytenr) + break; + if (key.type < BTRFS_TREE_BLOCK_REF_KEY) + continue; + if (key.type > BTRFS_SHARED_DATA_REF_KEY) + break; + + switch (key.type) { + case 
BTRFS_SHARED_BLOCK_REF_KEY: + ret = __add_prelim_ref(prefs, 0, info_key, + info_level + 1, key.offset, + bytenr, 1); + break; + case BTRFS_SHARED_DATA_REF_KEY: { + struct btrfs_shared_data_ref *sdref; + int count; + + sdref = btrfs_item_ptr(leaf, slot, + struct btrfs_shared_data_ref); + count = btrfs_shared_data_ref_count(leaf, sdref); + ret = __add_prelim_ref(prefs, 0, NULL, 0, key.offset, + bytenr, count); + break; + } + case BTRFS_TREE_BLOCK_REF_KEY: + ret = __add_prelim_ref(prefs, key.offset, info_key, + info_level + 1, 0, bytenr, 1); + break; + case BTRFS_EXTENT_DATA_REF_KEY: { + struct btrfs_extent_data_ref *dref; + int count; + u64 root; + + dref = btrfs_item_ptr(leaf, slot, + struct btrfs_extent_data_ref); + count = btrfs_extent_data_ref_count(leaf, dref); + key.objectid = btrfs_extent_data_ref_objectid(leaf, + dref); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = btrfs_extent_data_ref_offset(leaf, dref); + root = btrfs_extent_data_ref_root(leaf, dref); + ret = __add_prelim_ref(prefs, root, &key, 0, 0, + bytenr, count); + break; + } + default: + WARN_ON(1); + } + BUG_ON(ret); + } + + return ret; +} + +/* + * this adds all existing backrefs (inline backrefs, backrefs and delayed + * refs) for the given bytenr to the refs list, merges duplicates and resolves + * indirect refs to their parent bytenr. + * When roots are found, they're added to the roots list + * + * FIXME some caching might speed things up + */ +static int find_parent_nodes(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 bytenr, + u64 seq, struct ulist *refs, struct ulist *roots) +{ + struct btrfs_key key; + struct btrfs_path *path; + struct btrfs_key info_key = { 0 }; + struct btrfs_delayed_ref_root *delayed_refs = NULL; + struct btrfs_delayed_ref_head *head = NULL; + int info_level = 0; + int ret; + struct list_head prefs_delayed; + struct list_head prefs; + struct __prelim_ref *ref; + + INIT_LIST_HEAD(&prefs); + INIT_LIST_HEAD(&prefs_delayed); + + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = (u64)-1; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + /* + * grab both a lock on the path and a lock on the delayed ref head. 
+ * We need both to get a consistent picture of how the refs look + * at a specified point in time + */ +again: + ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0); + if (ret < 0) + goto out; + BUG_ON(ret == 0); + + /* + * look if there are updates for this ref queued and lock the head + */ + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + head = btrfs_find_delayed_ref_head(trans, bytenr); + if (head) { + if (!mutex_trylock(&head->mutex)) { + atomic_inc(&head->node.refs); + spin_unlock(&delayed_refs->lock); + + btrfs_release_path(path); + + /* + * Mutex was contended, block until it's + * released and try again + */ + mutex_lock(&head->mutex); + mutex_unlock(&head->mutex); + btrfs_put_delayed_ref(&head->node); + goto again; + } + ret = __add_delayed_refs(head, seq, &info_key, &prefs_delayed); + if (ret) + goto out; + } + spin_unlock(&delayed_refs->lock); + + if (path->slots[0]) { + struct extent_buffer *leaf; + int slot; + + leaf = path->nodes[0]; + slot = path->slots[0] - 1; + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid == bytenr && + key.type == BTRFS_EXTENT_ITEM_KEY) { + ret = __add_inline_refs(fs_info, path, bytenr, + &info_key, &info_level, &prefs); + if (ret) + goto out; + ret = __add_keyed_refs(fs_info, path, bytenr, &info_key, + info_level, &prefs); + if (ret) + goto out; + } + } + btrfs_release_path(path); + + /* + * when adding the delayed refs above, the info_key might not have + * been known yet. Go over the list and replace the missing keys + */ + list_for_each_entry(ref, &prefs_delayed, list) { + if ((ref->key.offset | ref->key.type | ref->key.objectid) == 0) + memcpy(&ref->key, &info_key, sizeof(ref->key)); + } + list_splice_init(&prefs_delayed, &prefs); + + ret = __merge_refs(&prefs, 1); + if (ret) + goto out; + + ret = __resolve_indirect_refs(fs_info, &prefs); + if (ret) + goto out; + + ret = __merge_refs(&prefs, 2); + if (ret) + goto out; + + while (!list_empty(&prefs)) { + ref = list_first_entry(&prefs, struct __prelim_ref, list); + list_del(&ref->list); + if (ref->count < 0) + WARN_ON(1); + if (ref->count && ref->root_id && ref->parent == 0) { + /* no parent == root of tree */ + ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS); + BUG_ON(ret < 0); + } + if (ref->count && ref->parent) { + ret = ulist_add(refs, ref->parent, 0, GFP_NOFS); + BUG_ON(ret < 0); + } + kfree(ref); + } + +out: + if (head) + mutex_unlock(&head->mutex); + btrfs_free_path(path); + while (!list_empty(&prefs)) { + ref = list_first_entry(&prefs, struct __prelim_ref, list); + list_del(&ref->list); + kfree(ref); + } + while (!list_empty(&prefs_delayed)) { + ref = list_first_entry(&prefs_delayed, struct __prelim_ref, + list); + list_del(&ref->list); + kfree(ref); + } + + return ret; +} + +/* + * Finds all leafs with a reference to the specified combination of bytenr and + * offset. key_list_head will point to a list of corresponding keys (caller must + * free each list element). The leafs will be stored in the leafs ulist, which + * must be freed with ulist_free. 
+ * + * returns 0 on success, <0 on error + */ +static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 bytenr, + u64 num_bytes, u64 seq, struct ulist **leafs) +{ + struct ulist *tmp; + int ret; + + tmp = ulist_alloc(GFP_NOFS); + if (!tmp) + return -ENOMEM; + *leafs = ulist_alloc(GFP_NOFS); + if (!*leafs) { + ulist_free(tmp); + return -ENOMEM; + } + + ret = find_parent_nodes(trans, fs_info, bytenr, seq, *leafs, tmp); + ulist_free(tmp); + + if (ret < 0 && ret != -ENOENT) { + ulist_free(*leafs); + return ret; + } + + return 0; +} + +/* + * walk all backrefs for a given extent to find all roots that reference this + * extent. Walking a backref means finding all extents that reference this + * extent and in turn walk the backrefs of those, too. Naturally this is a + * recursive process, but here it is implemented in an iterative fashion: We + * find all referencing extents for the extent in question and put them on a + * list. In turn, we find all referencing extents for those, further appending + * to the list. The way we iterate the list allows adding more elements after + * the current while iterating. The process stops when we reach the end of the + * list. Found roots are added to the roots list. + * + * returns 0 on success, < 0 on error. + */ +int btrfs_find_all_roots(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 bytenr, + u64 num_bytes, u64 seq, struct ulist **roots) +{ + struct ulist *tmp; + struct ulist_node *node = NULL; + int ret; + + tmp = ulist_alloc(GFP_NOFS); + if (!tmp) + return -ENOMEM; + *roots = ulist_alloc(GFP_NOFS); + if (!*roots) { + ulist_free(tmp); + return -ENOMEM; + } + + while (1) { + ret = find_parent_nodes(trans, fs_info, bytenr, seq, + tmp, *roots); + if (ret < 0 && ret != -ENOENT) { + ulist_free(tmp); + ulist_free(*roots); + return ret; + } + node = ulist_next(tmp, node); + if (!node) + break; + bytenr = node->val; + } + + ulist_free(tmp); + return 0; +} + + static int __inode_info(u64 inum, u64 ioff, u8 key_type, struct btrfs_root *fs_root, struct btrfs_path *path, struct btrfs_key *found_key) diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 92618837cb8..d00dfa9ca93 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -20,6 +20,7 @@ #define __BTRFS_BACKREF__ #include "ioctl.h" +#include "ulist.h" struct inode_fs_paths { struct btrfs_path *btrfs_path; @@ -54,6 +55,10 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, int paths_from_inode(u64 inum, struct inode_fs_paths *ipath); +int btrfs_find_all_roots(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 bytenr, + u64 num_bytes, u64 seq, struct ulist **roots); + struct btrfs_data_container *init_data_container(u32 total_bytes); struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, struct btrfs_path *path); -- cgit v1.2.3 From 19c5246d251640ac76daa4d34165af78c64b1454 Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Wed, 4 Jan 2012 17:09:44 -0500 Subject: ext4: add new online resize interface This patch adds new online resize interface, whose input argument is a 64-bit integer indicating how many blocks there are in the resized fs. In new resize impelmentation, all work like allocating group tables are done by kernel side, so the new resize interface can support flex_bg feature and prepares ground for suppoting resize with features like bigalloc and exclude bitmap. Besides these, user-space tools just passes in the new number of blocks. 
We delay initializing the bitmaps and inode tables of added groups if possible and add multi groups (a flex groups) each time, so new resize is very fast like mkfs. Signed-off-by: Yongqiang Yang Signed-off-by: "Theodore Ts'o" --- Documentation/filesystems/ext4.txt | 7 ++ fs/ext4/ext4.h | 2 + fs/ext4/ioctl.c | 57 ++++++++++++ fs/ext4/resize.c | 177 +++++++++++++++++++++++++++++++++++++ 4 files changed, 243 insertions(+) (limited to 'fs') diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt index 4917cf24a5e..10ec4639f15 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4.txt @@ -581,6 +581,13 @@ Table of Ext4 specific ioctls behaviour may change in the future as it is not necessary and has been done this way only for sake of simplicity. + + EXT4_IOC_RESIZE_FS Resize the filesystem to a new size. The number + of blocks of resized filesystem is passed in via + 64 bit integer argument. The kernel allocates + bitmaps and inode table, the userspace tool thus + just passes the new number of blocks. + .............................................................................. References diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 05058e2b7f4..4bc0e82a905 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -583,6 +583,7 @@ enum { /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */ #define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12) #define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent) +#define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64) #if defined(__KERNEL__) && defined(CONFIG_COMPAT) /* @@ -1929,6 +1930,7 @@ extern int ext4_group_add(struct super_block *sb, extern int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, ext4_fsblk_t n_blocks_count); +extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count); /* super.c */ extern void *ext4_kvmalloc(size_t size, gfp_t flags); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index ff1aab7cd6e..c1a98804a38 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -18,6 +18,8 @@ #include "ext4_jbd2.h" #include "ext4.h" +#define MAX_32_NUM ((((unsigned long long) 1) << 32) - 1) + long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = filp->f_dentry->d_inode; @@ -329,6 +331,60 @@ mext_out: return err; } + case EXT4_IOC_RESIZE_FS: { + ext4_fsblk_t n_blocks_count; + struct super_block *sb = inode->i_sb; + int err = 0, err2 = 0; + + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { + ext4_msg(sb, KERN_ERR, + "Online resizing not (yet) supported with bigalloc"); + return -EOPNOTSUPP; + } + + if (EXT4_HAS_INCOMPAT_FEATURE(sb, + EXT4_FEATURE_INCOMPAT_META_BG)) { + ext4_msg(sb, KERN_ERR, + "Online resizing not (yet) supported with meta_bg"); + return -EOPNOTSUPP; + } + + if (copy_from_user(&n_blocks_count, (__u64 __user *)arg, + sizeof(__u64))) { + return -EFAULT; + } + + if (n_blocks_count > MAX_32_NUM && + !EXT4_HAS_INCOMPAT_FEATURE(sb, + EXT4_FEATURE_INCOMPAT_64BIT)) { + ext4_msg(sb, KERN_ERR, + "File system only supports 32-bit block numbers"); + return -EOPNOTSUPP; + } + + err = ext4_resize_begin(sb); + if (err) + return err; + + err = mnt_want_write(filp->f_path.mnt); + if (err) + goto resizefs_out; + + err = ext4_resize_fs(sb, n_blocks_count); + if (EXT4_SB(sb)->s_journal) { + jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); + err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal); + jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); + } + if (err == 0) + err = err2; + 
mnt_drop_write(filp->f_path.mnt); +resizefs_out: + ext4_resize_end(sb); + return err; + } + case FITRIM: { struct request_queue *q = bdev_get_queue(sb->s_bdev); @@ -427,6 +483,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) } case EXT4_IOC_MOVE_EXT: case FITRIM: + case EXT4_IOC_RESIZE_FS: break; default: return -ENOIOCTLCMD; diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index dac23561f3e..5fe2a013ee6 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1430,6 +1430,70 @@ exit: return err; } +static int ext4_setup_next_flex_gd(struct super_block *sb, + struct ext4_new_flex_group_data *flex_gd, + ext4_fsblk_t n_blocks_count, + unsigned long flexbg_size) +{ + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + struct ext4_new_group_data *group_data = flex_gd->groups; + ext4_fsblk_t o_blocks_count; + ext4_group_t n_group; + ext4_group_t group; + ext4_group_t last_group; + ext4_grpblk_t last; + ext4_grpblk_t blocks_per_group; + unsigned long i; + + blocks_per_group = EXT4_BLOCKS_PER_GROUP(sb); + + o_blocks_count = ext4_blocks_count(es); + + if (o_blocks_count == n_blocks_count) + return 0; + + ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last); + BUG_ON(last); + ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &last); + + last_group = group | (flexbg_size - 1); + if (last_group > n_group) + last_group = n_group; + + flex_gd->count = last_group - group + 1; + + for (i = 0; i < flex_gd->count; i++) { + int overhead; + + group_data[i].group = group + i; + group_data[i].blocks_count = blocks_per_group; + overhead = ext4_bg_has_super(sb, group + i) ? + (1 + ext4_bg_num_gdb(sb, group + i) + + le16_to_cpu(es->s_reserved_gdt_blocks)) : 0; + group_data[i].free_blocks_count = blocks_per_group - overhead; + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) + flex_gd->bg_flags[i] = EXT4_BG_BLOCK_UNINIT | + EXT4_BG_INODE_UNINIT; + else + flex_gd->bg_flags[i] = EXT4_BG_INODE_ZEROED; + } + + if (last_group == n_group && + EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) + /* We need to initialize block bitmap of last group. */ + flex_gd->bg_flags[i - 1] &= ~EXT4_BG_BLOCK_UNINIT; + + if ((last_group == n_group) && (last != blocks_per_group - 1)) { + group_data[i - 1].blocks_count = last + 1; + group_data[i - 1].free_blocks_count -= blocks_per_group- + last - 1; + } + + return 1; +} + /* Add group descriptor data to an existing or new group descriptor block. 
* Ensure we handle all possible error conditions _before_ we start modifying * the filesystem, because we cannot abort the transaction and not have it @@ -1827,3 +1891,116 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, exit_put: return err; } /* ext4_group_extend */ + +/* + * ext4_resize_fs() resizes a fs to new size specified by @n_blocks_count + * + * @sb: super block of the fs to be resized + * @n_blocks_count: the number of blocks resides in the resized fs + */ +int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count) +{ + struct ext4_new_flex_group_data *flex_gd = NULL; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + struct buffer_head *bh; + struct inode *resize_inode; + ext4_fsblk_t o_blocks_count; + ext4_group_t o_group; + ext4_group_t n_group; + ext4_grpblk_t offset; + unsigned long n_desc_blocks; + unsigned long o_desc_blocks; + unsigned long desc_blocks; + int err = 0, flexbg_size = 1; + + o_blocks_count = ext4_blocks_count(es); + + if (test_opt(sb, DEBUG)) + printk(KERN_DEBUG "EXT4-fs: resizing filesystem from %llu " + "upto %llu blocks\n", o_blocks_count, n_blocks_count); + + if (n_blocks_count < o_blocks_count) { + /* On-line shrinking not supported */ + ext4_warning(sb, "can't shrink FS - resize aborted"); + return -EINVAL; + } + + if (n_blocks_count == o_blocks_count) + /* Nothing need to do */ + return 0; + + ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &offset); + ext4_get_group_no_and_offset(sb, o_blocks_count, &o_group, &offset); + + n_desc_blocks = (n_group + EXT4_DESC_PER_BLOCK(sb)) / + EXT4_DESC_PER_BLOCK(sb); + o_desc_blocks = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / + EXT4_DESC_PER_BLOCK(sb); + desc_blocks = n_desc_blocks - o_desc_blocks; + + if (desc_blocks && + (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_RESIZE_INODE) || + le16_to_cpu(es->s_reserved_gdt_blocks) < desc_blocks)) { + ext4_warning(sb, "No reserved GDT blocks, can't resize"); + return -EPERM; + } + + resize_inode = ext4_iget(sb, EXT4_RESIZE_INO); + if (IS_ERR(resize_inode)) { + ext4_warning(sb, "Error opening resize inode"); + return PTR_ERR(resize_inode); + } + + /* See if the device is actually as big as what was requested */ + bh = sb_bread(sb, n_blocks_count - 1); + if (!bh) { + ext4_warning(sb, "can't read last block, resize aborted"); + return -ENOSPC; + } + brelse(bh); + + if (offset != 0) { + /* extend the last group */ + ext4_grpblk_t add; + add = EXT4_BLOCKS_PER_GROUP(sb) - offset; + err = ext4_group_extend_no_check(sb, o_blocks_count, add); + if (err) + goto out; + } + + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) && + es->s_log_groups_per_flex) + flexbg_size = 1 << es->s_log_groups_per_flex; + + o_blocks_count = ext4_blocks_count(es); + if (o_blocks_count == n_blocks_count) + goto out; + + flex_gd = alloc_flex_gd(flexbg_size); + if (flex_gd == NULL) { + err = -ENOMEM; + goto out; + } + + /* Add flex groups. Note that a regular group is a + * flex group with 1 group. 
+ */ + while (ext4_setup_next_flex_gd(sb, flex_gd, n_blocks_count, + flexbg_size)) { + ext4_alloc_group_tables(sb, flex_gd, flexbg_size); + err = ext4_flex_group_add(sb, resize_inode, flex_gd); + if (unlikely(err)) + break; + } + +out: + if (flex_gd) + free_flex_gd(flex_gd); + + iput(resize_inode); + if (test_opt(sb, DEBUG)) + printk(KERN_DEBUG "EXT4-fs: resized filesystem from %llu " + "upto %llu blocks\n", o_blocks_count, n_blocks_count); + return err; +} -- cgit v1.2.3 From d89651c8e222b2d2797bf66d4eb7064459f4f4f4 Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Wed, 4 Jan 2012 17:09:48 -0500 Subject: ext4: let ext4_group_extend() use common code ext4_group_extend_no_check() is moved out from ext4_group_extend(), this patch lets ext4_group_extend() call ext4_group_extentd_no_check() instead. Signed-off-by: Yongqiang Yang Signed-off-by: "Theodore Ts'o" --- fs/ext4/resize.c | 41 ++--------------------------------------- 1 file changed, 2 insertions(+), 39 deletions(-) (limited to 'fs') diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 5fe2a013ee6..eba706d9276 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1795,8 +1795,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, ext4_grpblk_t last; ext4_grpblk_t add; struct buffer_head *bh; - handle_t *handle; - int err, err2; + int err; ext4_group_t group; o_blocks_count = ext4_blocks_count(es); @@ -1852,43 +1851,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, } brelse(bh); - /* We will update the superblock, one block bitmap, and - * one group descriptor via ext4_free_blocks(). - */ - handle = ext4_journal_start_sb(sb, 3); - if (IS_ERR(handle)) { - err = PTR_ERR(handle); - ext4_warning(sb, "error %d on journal start", err); - goto exit_put; - } - - if ((err = ext4_journal_get_write_access(handle, - EXT4_SB(sb)->s_sbh))) { - ext4_warning(sb, "error %d on journal write access", err); - ext4_journal_stop(handle); - goto exit_put; - } - ext4_blocks_count_set(es, o_blocks_count + add); - ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, - o_blocks_count + add); - /* We add the blocks to the bitmap and set the group need init bit */ - err = ext4_group_add_blocks(handle, sb, o_blocks_count, add); - ext4_handle_dirty_super(handle, sb); - ext4_debug("freed blocks %llu through %llu\n", o_blocks_count, - o_blocks_count + add); - err2 = ext4_journal_stop(handle); - if (!err && err2) - err = err2; - - if (err) - goto exit_put; - - if (test_opt(sb, DEBUG)) - printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n", - ext4_blocks_count(es)); - update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr, (char *)es, - sizeof(struct ext4_super_block)); -exit_put: + err = ext4_group_extend_no_check(sb, o_blocks_count, add); return err; } /* ext4_group_extend */ -- cgit v1.2.3 From 61f296cc49751f1dc992039229d12b0de7e0c2ae Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Wed, 4 Jan 2012 17:09:50 -0500 Subject: ext4: let ext4_group_add() use common code This patch lets ext4_group_add() call ext4_flex_group_add(). Signed-off-by: Yongqiang Yang Signed-off-by: "Theodore Ts'o" --- fs/ext4/resize.c | 309 ++----------------------------------------------------- 1 file changed, 10 insertions(+), 299 deletions(-) (limited to 'fs') diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index eba706d9276..f9d948f0eb8 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -594,137 +594,6 @@ out: return err; } -/* - * Set up the block and inode bitmaps, and the inode table for the new group. 
- * This doesn't need to be part of the main transaction, since we are only - * changing blocks outside the actual filesystem. We still do journaling to - * ensure the recovery is correct in case of a failure just after resize. - * If any part of this fails, we simply abort the resize. - */ -static int setup_new_group_blocks(struct super_block *sb, - struct ext4_new_group_data *input) -{ - struct ext4_sb_info *sbi = EXT4_SB(sb); - ext4_fsblk_t start = ext4_group_first_block_no(sb, input->group); - int reserved_gdb = ext4_bg_has_super(sb, input->group) ? - le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0; - unsigned long gdblocks = ext4_bg_num_gdb(sb, input->group); - struct buffer_head *bh; - handle_t *handle; - ext4_fsblk_t block; - ext4_grpblk_t bit; - int i; - int err = 0, err2; - - /* This transaction may be extended/restarted along the way */ - handle = ext4_journal_start_sb(sb, EXT4_MAX_TRANS_DATA); - - if (IS_ERR(handle)) - return PTR_ERR(handle); - - BUG_ON(input->group != sbi->s_groups_count); - - /* Copy all of the GDT blocks into the backup in this group */ - for (i = 0, bit = 1, block = start + 1; - i < gdblocks; i++, block++, bit++) { - struct buffer_head *gdb; - - ext4_debug("update backup group %#04llx (+%d)\n", block, bit); - err = extend_or_restart_transaction(handle, 1); - if (err) - goto exit_journal; - - gdb = sb_getblk(sb, block); - if (!gdb) { - err = -EIO; - goto exit_journal; - } - if ((err = ext4_journal_get_write_access(handle, gdb))) { - brelse(gdb); - goto exit_journal; - } - memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size); - set_buffer_uptodate(gdb); - err = ext4_handle_dirty_metadata(handle, NULL, gdb); - if (unlikely(err)) { - brelse(gdb); - goto exit_journal; - } - brelse(gdb); - } - - /* Zero out all of the reserved backup group descriptor table blocks */ - ext4_debug("clear inode table blocks %#04llx -> %#04lx\n", - block, sbi->s_itb_per_group); - err = sb_issue_zeroout(sb, gdblocks + start + 1, reserved_gdb, - GFP_NOFS); - if (err) - goto exit_journal; - - err = extend_or_restart_transaction(handle, 2); - if (err) - goto exit_journal; - - bh = bclean(handle, sb, input->block_bitmap); - if (IS_ERR(bh)) { - err = PTR_ERR(bh); - goto exit_journal; - } - - if (ext4_bg_has_super(sb, input->group)) { - ext4_debug("mark backup group tables %#04llx (+0)\n", start); - ext4_set_bits(bh->b_data, 0, gdblocks + reserved_gdb + 1); - } - - ext4_debug("mark block bitmap %#04llx (+%llu)\n", input->block_bitmap, - input->block_bitmap - start); - ext4_set_bit(input->block_bitmap - start, bh->b_data); - ext4_debug("mark inode bitmap %#04llx (+%llu)\n", input->inode_bitmap, - input->inode_bitmap - start); - ext4_set_bit(input->inode_bitmap - start, bh->b_data); - - /* Zero out all of the inode table blocks */ - block = input->inode_table; - ext4_debug("clear inode table blocks %#04llx -> %#04lx\n", - block, sbi->s_itb_per_group); - err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, GFP_NOFS); - if (err) - goto exit_bh; - ext4_set_bits(bh->b_data, input->inode_table - start, - sbi->s_itb_per_group); - - - ext4_mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8, - bh->b_data); - err = ext4_handle_dirty_metadata(handle, NULL, bh); - if (unlikely(err)) { - ext4_std_error(sb, err); - goto exit_bh; - } - brelse(bh); - /* Mark unused entries in inode bitmap used */ - ext4_debug("clear inode bitmap %#04llx (+%llu)\n", - input->inode_bitmap, input->inode_bitmap - start); - if (IS_ERR(bh = bclean(handle, sb, input->inode_bitmap))) { - err = PTR_ERR(bh); - 
goto exit_journal; - } - - ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8, - bh->b_data); - err = ext4_handle_dirty_metadata(handle, NULL, bh); - if (unlikely(err)) - ext4_std_error(sb, err); -exit_bh: - brelse(bh); - -exit_journal: - if ((err2 = ext4_journal_stop(handle)) && !err) - err = err2; - - return err; -} - /* * Iterate through the groups which hold BACKUP superblock/GDT copies in an * ext4 filesystem. The counters should be initialized to 1, 5, and 7 before @@ -1509,16 +1378,15 @@ static int ext4_setup_next_flex_gd(struct super_block *sb, */ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) { + struct ext4_new_flex_group_data flex_gd; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; int reserved_gdb = ext4_bg_has_super(sb, input->group) ? le16_to_cpu(es->s_reserved_gdt_blocks) : 0; - struct buffer_head *primary = NULL; - struct ext4_group_desc *gdp; struct inode *inode = NULL; - handle_t *handle; int gdb_off, gdb_num; - int err, err2; + int err; + __u16 bg_flags = 0; gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb); gdb_off = input->group % EXT4_DESC_PER_BLOCK(sb); @@ -1557,172 +1425,15 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) } - if ((err = verify_group_input(sb, input))) - goto exit_put; - - if ((err = setup_new_group_blocks(sb, input))) - goto exit_put; - - /* - * We will always be modifying at least the superblock and a GDT - * block. If we are adding a group past the last current GDT block, - * we will also modify the inode and the dindirect block. If we - * are adding a group with superblock/GDT backups we will also - * modify each of the reserved GDT dindirect blocks. - */ - handle = ext4_journal_start_sb(sb, - ext4_bg_has_super(sb, input->group) ? - 3 + reserved_gdb : 4); - if (IS_ERR(handle)) { - err = PTR_ERR(handle); - goto exit_put; - } - - if ((err = ext4_journal_get_write_access(handle, sbi->s_sbh))) - goto exit_journal; - - /* - * We will only either add reserved group blocks to a backup group - * or remove reserved blocks for the first group in a new group block. - * Doing both would be mean more complex code, and sane people don't - * use non-sparse filesystems anymore. This is already checked above. - */ - if (gdb_off) { - primary = sbi->s_group_desc[gdb_num]; - if ((err = ext4_journal_get_write_access(handle, primary))) - goto exit_journal; - - if (reserved_gdb && ext4_bg_num_gdb(sb, input->group)) { - err = reserve_backup_gdb(handle, inode, input->group); - if (err) - goto exit_journal; - } - } else { - /* - * Note that we can access new group descriptor block safely - * only if add_new_gdb() succeeds. - */ - err = add_new_gdb(handle, inode, input->group); - if (err) - goto exit_journal; - primary = sbi->s_group_desc[gdb_num]; - } - - /* - * OK, now we've set up the new group. Time to make it active. - * - * so we have to be safe wrt. concurrent accesses the group - * data. So we need to be careful to set all of the relevant - * group descriptor data etc. *before* we enable the group. - * - * The key field here is sbi->s_groups_count: as long as - * that retains its old value, nobody is going to access the new - * group. - * - * So first we update all the descriptor metadata for the new - * group; then we update the total disk blocks count; then we - * update the groups count to enable the group; then finally we - * update the free space counts so that the system can start - * using the new disk blocks. 
- */ - - /* Update group descriptor block for new group */ - gdp = (struct ext4_group_desc *)((char *)primary->b_data + - gdb_off * EXT4_DESC_SIZE(sb)); - - memset(gdp, 0, EXT4_DESC_SIZE(sb)); - ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */ - ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */ - ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */ - ext4_free_group_clusters_set(sb, gdp, input->free_blocks_count); - ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb)); - gdp->bg_flags = cpu_to_le16(EXT4_BG_INODE_ZEROED); - gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp); - - /* - * We can allocate memory for mb_alloc based on the new group - * descriptor - */ - err = ext4_mb_add_groupinfo(sb, input->group, gdp); + err = verify_group_input(sb, input); if (err) - goto exit_journal; - - /* - * Make the new blocks and inodes valid next. We do this before - * increasing the group count so that once the group is enabled, - * all of its blocks and inodes are already valid. - * - * We always allocate group-by-group, then block-by-block or - * inode-by-inode within a group, so enabling these - * blocks/inodes before the group is live won't actually let us - * allocate the new space yet. - */ - ext4_blocks_count_set(es, ext4_blocks_count(es) + - input->blocks_count); - le32_add_cpu(&es->s_inodes_count, EXT4_INODES_PER_GROUP(sb)); - - /* - * We need to protect s_groups_count against other CPUs seeing - * inconsistent state in the superblock. - * - * The precise rules we use are: - * - * * Writers must perform a smp_wmb() after updating all dependent - * data and before modifying the groups count - * - * * Readers must perform an smp_rmb() after reading the groups count - * and before reading any dependent data. - * - * NB. These rules can be relaxed when checking the group count - * while freeing data, as we can only allocate from a block - * group after serialising against the group count, and we can - * only then free after serialising in turn against that - * allocation. - */ - smp_wmb(); - - /* Update the global fs size fields */ - sbi->s_groups_count++; - - err = ext4_handle_dirty_metadata(handle, NULL, primary); - if (unlikely(err)) { - ext4_std_error(sb, err); - goto exit_journal; - } - - /* Update the reserved block counts only once the new group is - * active. 
*/ - ext4_r_blocks_count_set(es, ext4_r_blocks_count(es) + - input->reserved_blocks); - - /* Update the free space counts */ - percpu_counter_add(&sbi->s_freeclusters_counter, - EXT4_B2C(sbi, input->free_blocks_count)); - percpu_counter_add(&sbi->s_freeinodes_counter, - EXT4_INODES_PER_GROUP(sb)); - - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) && - sbi->s_log_groups_per_flex) { - ext4_group_t flex_group; - flex_group = ext4_flex_group(sbi, input->group); - atomic_add(EXT4_B2C(sbi, input->free_blocks_count), - &sbi->s_flex_groups[flex_group].free_clusters); - atomic_add(EXT4_INODES_PER_GROUP(sb), - &sbi->s_flex_groups[flex_group].free_inodes); - } - - ext4_handle_dirty_super(handle, sb); + goto out; -exit_journal: - if ((err2 = ext4_journal_stop(handle)) && !err) - err = err2; - if (!err && primary) { - update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es, - sizeof(struct ext4_super_block)); - update_backups(sb, primary->b_blocknr, primary->b_data, - primary->b_size); - } -exit_put: + flex_gd.count = 1; + flex_gd.groups = input; + flex_gd.bg_flags = &bg_flags; + err = ext4_flex_group_add(sb, inode, &flex_gd); +out: iput(inode); return err; } /* ext4_group_add */ -- cgit v1.2.3 From 014a1770371a028d22f364718c805f4216911ecd Mon Sep 17 00:00:00 2001 From: Djalal Harouni Date: Wed, 4 Jan 2012 17:09:52 -0500 Subject: ext4: add missing ext4_resize_end on error paths Online resize ioctls 'EXT4_IOC_GROUP_EXTEND' and 'EXT4_IOC_GROUP_ADD' call ext4_resize_begin() to check permissions and to set the EXT4_RESIZING bit lock, they do their work and they must finish with ext4_resize_end() which calls clear_bit_unlock() to unlock and to avoid -EBUSY errors for the next resize operations. This patch adds the missing ext4_resize_end() calls on error paths. Patch tested. 
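In sketch form, the fixed EXT4_IOC_GROUP_EXTEND path looks roughly like this (simplified; the journal-commit handling in the actual hunk is left out, and the names follow the patch):

	err = ext4_resize_begin(sb);            /* takes the EXT4_RESIZING bit lock */
	if (err)
		return err;

	if (get_user(n_blocks_count, (__u32 __user *)arg)) {
		err = -EFAULT;
		goto group_extend_out;          /* must not return with the bit still held */
	}

	err = mnt_want_write(filp->f_path.mnt);
	if (err)
		goto group_extend_out;

	err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count);
	mnt_drop_write(filp->f_path.mnt);

group_extend_out:
	ext4_resize_end(sb);                    /* clear_bit_unlock(), reached on every path */
	return err;

The EXT4_IOC_GROUP_ADD path gets the same treatment via a group_add_out label.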
Cc: stable@vger.kernel.org Signed-off-by: Djalal Harouni Signed-off-by: "Theodore Ts'o" --- fs/ext4/ioctl.c | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index c1a98804a38..b81a5f1b697 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -184,19 +184,22 @@ setversion_out: if (err) return err; - if (get_user(n_blocks_count, (__u32 __user *)arg)) - return -EFAULT; + if (get_user(n_blocks_count, (__u32 __user *)arg)) { + err = -EFAULT; + goto group_extend_out; + } if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { ext4_msg(sb, KERN_ERR, "Online resizing not supported with bigalloc"); - return -EOPNOTSUPP; + err = -EOPNOTSUPP; + goto group_extend_out; } err = mnt_want_write(filp->f_path.mnt); if (err) - return err; + goto group_extend_out; err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count); if (EXT4_SB(sb)->s_journal) { @@ -206,9 +209,10 @@ setversion_out: } if (err == 0) err = err2; + mnt_drop_write(filp->f_path.mnt); +group_extend_out: ext4_resize_end(sb); - return err; } @@ -267,19 +271,22 @@ mext_out: return err; if (copy_from_user(&input, (struct ext4_new_group_input __user *)arg, - sizeof(input))) - return -EFAULT; + sizeof(input))) { + err = -EFAULT; + goto group_add_out; + } if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { ext4_msg(sb, KERN_ERR, "Online resizing not supported with bigalloc"); - return -EOPNOTSUPP; + err = -EOPNOTSUPP; + goto group_add_out; } err = mnt_want_write(filp->f_path.mnt); if (err) - return err; + goto group_add_out; err = ext4_group_add(sb, &input); if (EXT4_SB(sb)->s_journal) { @@ -289,9 +296,10 @@ mext_out: } if (err == 0) err = err2; + mnt_drop_write(filp->f_path.mnt); +group_add_out: ext4_resize_end(sb); - return err; } -- cgit v1.2.3 From 1d526fc91bea04ee35b7599bf8b82f86c0aaf46c Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Wed, 4 Jan 2012 21:22:51 -0500 Subject: ext4: Report max_batch_time option correctly Currently the value reported for max_batch_time is really the value of min_batch_time. Reported-by: Russell Coker Signed-off-by: Ben Hutchings --- fs/ext4/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 35377d57ec4..36570b7af7c 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1096,7 +1096,7 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) } if (sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) { seq_printf(seq, ",max_batch_time=%u", - (unsigned) sbi->s_min_batch_time); + (unsigned) sbi->s_max_batch_time); } /* -- cgit v1.2.3 From 9b90e5e02896406a6da28a376568003d14c06770 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 4 Jan 2012 22:01:53 -0500 Subject: ext4: reserve new feature flag codepoints Reserve the ext4 features flags EXT4_FEATURE_RO_COMPAT_METADATA_CSUM, EXT4_FEATURE_INCOMPAT_INLINEDATA, and EXT4_FEATURE_INCOMPAT_LARGEDIR. 
Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 4bc0e82a905..13d15149c85 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1407,6 +1407,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) #define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040 #define EXT4_FEATURE_RO_COMPAT_QUOTA 0x0100 #define EXT4_FEATURE_RO_COMPAT_BIGALLOC 0x0200 +#define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM 0x0400 #define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001 #define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002 @@ -1419,6 +1420,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) #define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200 #define EXT4_FEATURE_INCOMPAT_EA_INODE 0x0400 /* EA in inode */ #define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */ +#define EXT4_FEATURE_INCOMPAT_INLINEDATA 0x2000 /* data in inode */ +#define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000 /* >2GB or 3-lvl htree */ #define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR #define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ -- cgit v1.2.3 From 9837d8e982b7e87a7207f90618e45d460e196e6a Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 4 Jan 2012 22:03:11 -0500 Subject: jbd2: fix hung processes in jbd2_journal_lock_updates() Toshiyuki Okajima found out that when running for ((i=0; i < 100000; i++)); do if ((i%2 == 0)); then chattr +j /mnt/file else chattr -j /mnt/file fi echo "0" >> /mnt/file done process sometimes hangs indefinitely in jbd2_journal_lock_updates(). Toshiyuki identified that the following race happens: jbd2_journal_lock_updates() |jbd2_journal_stop() ---------------------------------------+--------------------------------------- write_lock(&journal->j_state_lock) | . ++journal->j_barrier_count | . spin_lock(&tran->t_handle_lock) | . atomic_read(&tran->t_updates) //not 0 | | atomic_dec_and_test(&tran->t_updates) | // t_updates = 0 | wake_up(&journal->j_wait_updates) prepare_to_wait() | // no process is woken up. spin_unlock(&tran->t_handle_lock) | write_unlock(&journal->j_state_lock) | schedule() // never return | We fix the problem by first calling prepare_to_wait() and only after that checking t_updates in jbd2_journal_lock_updates(). Reported-and-analyzed-by: Toshiyuki Okajima Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" --- fs/jbd2/transaction.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index a0e41a4c080..35ae096bed5 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -517,12 +517,13 @@ void jbd2_journal_lock_updates(journal_t *journal) break; spin_lock(&transaction->t_handle_lock); + prepare_to_wait(&journal->j_wait_updates, &wait, + TASK_UNINTERRUPTIBLE); if (!atomic_read(&transaction->t_updates)) { spin_unlock(&transaction->t_handle_lock); + finish_wait(&journal->j_wait_updates, &wait); break; } - prepare_to_wait(&journal->j_wait_updates, &wait, - TASK_UNINTERRUPTIBLE); spin_unlock(&transaction->t_handle_lock); write_unlock(&journal->j_state_lock); schedule(); -- cgit v1.2.3 From 176576dbc8141528557eeeb007af2d5a2a4891ef Mon Sep 17 00:00:00 2001 From: Djalal Harouni Date: Wed, 4 Jan 2012 22:32:12 -0500 Subject: ext4: make local symbol ext4_initxattrs static The ext4_initxattrs symbol is used only in this file, so it should be declared static. 
Signed-off-by: Djalal Harouni Signed-off-by: "Theodore Ts'o" --- fs/ext4/xattr_security.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c index 34e4350dd4d..e247f8bca8e 100644 --- a/fs/ext4/xattr_security.c +++ b/fs/ext4/xattr_security.c @@ -48,8 +48,9 @@ ext4_xattr_security_set(struct dentry *dentry, const char *name, name, value, size, flags); } -int ext4_initxattrs(struct inode *inode, const struct xattr *xattr_array, - void *fs_info) +static int +ext4_initxattrs(struct inode *inode, const struct xattr *xattr_array, + void *fs_info) { const struct xattr *xattr; handle_t *handle = fs_info; -- cgit v1.2.3 From 5f163cc759a9fa8844a4efcf1f579dc5b2ca2491 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Wed, 4 Jan 2012 22:33:28 -0500 Subject: ext4: make more symbols static A couple more functions can reasonably be made static if desired. Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 4 +++- fs/ext4/ext4.h | 5 ----- fs/ext4/inode.c | 5 ++++- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 12ccacda44e..f9e2cd8cf71 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -23,6 +23,8 @@ #include +static unsigned ext4_num_base_meta_clusters(struct super_block *sb, + ext4_group_t block_group); /* * balloc.c contains the blocks allocation and deallocation routines */ @@ -668,7 +670,7 @@ unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group) * This function returns the number of file system metadata clusters at * the beginning of a block group, including the reserved gdt blocks. */ -unsigned ext4_num_base_meta_clusters(struct super_block *sb, +static unsigned ext4_num_base_meta_clusters(struct super_block *sb, ext4_group_t block_group) { struct ext4_sb_info *sbi = EXT4_SB(sb); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 13d15149c85..e7dc9ad7394 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1803,8 +1803,6 @@ extern void ext4_init_block_bitmap(struct super_block *sb, extern unsigned ext4_free_clusters_after_init(struct super_block *sb, ext4_group_t block_group, struct ext4_group_desc *gdp); -extern unsigned ext4_num_base_meta_clusters(struct super_block *sb, - ext4_group_t block_group); extern unsigned ext4_num_overhead_clusters(struct super_block *sb, ext4_group_t block_group, struct ext4_group_desc *gdp); @@ -1896,9 +1894,6 @@ extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); extern int ext4_discard_partial_page_buffers(handle_t *handle, struct address_space *mapping, loff_t from, loff_t length, int flags); -extern int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, - struct inode *inode, struct page *page, loff_t from, - loff_t length, int flags); extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); extern qsize_t *ext4_get_reserved_space(struct inode *inode); extern void ext4_da_update_reserve_space(struct inode *inode, diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index e6cc24dfa98..a526684cbe3 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -72,6 +72,9 @@ static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode); static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate); static int __ext4_journalled_writepage(struct page *page, unsigned int len); static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh); +static int 
ext4_discard_partial_page_buffers_no_lock(handle_t *handle, + struct inode *inode, struct page *page, loff_t from, + loff_t length, int flags); /* * Test whether an inode is a fast symlink. @@ -3161,7 +3164,7 @@ int ext4_discard_partial_page_buffers(handle_t *handle, * * Returns zero on sucess or negative on failure. */ -int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, +static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, struct inode *inode, struct page *page, loff_t from, loff_t length, int flags) { -- cgit v1.2.3 From 4692cf58aa7b81f721c1653d48db99ea41421d58 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Fri, 2 Dec 2011 14:56:41 +0100 Subject: Btrfs: new backref walking code The old backref iteration code could only safely be used on commit roots. Besides this limitation, it had bugs in finding the roots for these references. This commit replaces large parts of it by btrfs_find_all_roots() which a) really finds all roots and the correct roots, b) works correctly under heavy file system load, c) considers delayed refs. Signed-off-by: Jan Schmidt --- fs/btrfs/backref.c | 354 +++++++++++++++-------------------------------------- fs/btrfs/ioctl.c | 8 +- fs/btrfs/scrub.c | 7 +- 3 files changed, 107 insertions(+), 262 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 03c30a1836f..b9a843226de 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -23,18 +23,6 @@ #include "transaction.h" #include "delayed-ref.h" -struct __data_ref { - struct list_head list; - u64 inum; - u64 root; - u64 extent_data_item_offset; -}; - -struct __shared_ref { - struct list_head list; - u64 disk_byte; -}; - /* * this structure records all encountered refs on the way up to the root */ @@ -964,8 +952,11 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]); if (found_key->type != BTRFS_EXTENT_ITEM_KEY || found_key->objectid > logical || - found_key->objectid + found_key->offset <= logical) + found_key->objectid + found_key->offset <= logical) { + pr_debug("logical %llu is not within any extent\n", + (unsigned long long)logical); return -ENOENT; + } eb = path->nodes[0]; item_size = btrfs_item_size_nr(eb, path->slots[0]); @@ -974,6 +965,13 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); flags = btrfs_extent_flags(eb, ei); + pr_debug("logical %llu is at position %llu within the extent (%llu " + "EXTENT_ITEM %llu) flags %#llx size %u\n", + (unsigned long long)logical, + (unsigned long long)(logical - found_key->objectid), + (unsigned long long)found_key->objectid, + (unsigned long long)found_key->offset, + (unsigned long long)flags, item_size); if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) return BTRFS_EXTENT_FLAG_TREE_BLOCK; if (flags & BTRFS_EXTENT_FLAG_DATA) @@ -1070,128 +1068,11 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb, return 0; } -static int __data_list_add(struct list_head *head, u64 inum, - u64 extent_data_item_offset, u64 root) -{ - struct __data_ref *ref; - - ref = kmalloc(sizeof(*ref), GFP_NOFS); - if (!ref) - return -ENOMEM; - - ref->inum = inum; - ref->extent_data_item_offset = extent_data_item_offset; - ref->root = root; - list_add_tail(&ref->list, head); - - return 0; -} - -static int __data_list_add_eb(struct list_head *head, struct extent_buffer *eb, - struct btrfs_extent_data_ref *dref) -{ - return __data_list_add(head, 
btrfs_extent_data_ref_objectid(eb, dref), - btrfs_extent_data_ref_offset(eb, dref), - btrfs_extent_data_ref_root(eb, dref)); -} - -static int __shared_list_add(struct list_head *head, u64 disk_byte) -{ - struct __shared_ref *ref; - - ref = kmalloc(sizeof(*ref), GFP_NOFS); - if (!ref) - return -ENOMEM; - - ref->disk_byte = disk_byte; - list_add_tail(&ref->list, head); - - return 0; -} - -static int __iter_shared_inline_ref_inodes(struct btrfs_fs_info *fs_info, - u64 logical, u64 inum, - u64 extent_data_item_offset, - u64 extent_offset, - struct btrfs_path *path, - struct list_head *data_refs, - iterate_extent_inodes_t *iterate, - void *ctx) -{ - u64 ref_root; - u32 item_size; - struct btrfs_key key; - struct extent_buffer *eb; - struct btrfs_extent_item *ei; - struct btrfs_extent_inline_ref *eiref; - struct __data_ref *ref; - int ret; - int type; - int last; - unsigned long ptr = 0; - - WARN_ON(!list_empty(data_refs)); - ret = extent_from_logical(fs_info, logical, path, &key); - if (ret & BTRFS_EXTENT_FLAG_DATA) - ret = -EIO; - if (ret < 0) - goto out; - - eb = path->nodes[0]; - ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); - item_size = btrfs_item_size_nr(eb, path->slots[0]); - - ret = 0; - ref_root = 0; - /* - * as done in iterate_extent_inodes, we first build a list of refs to - * iterate, then free the path and then iterate them to avoid deadlocks. - */ - do { - last = __get_extent_inline_ref(&ptr, eb, ei, item_size, - &eiref, &type); - if (last < 0) { - ret = last; - goto out; - } - if (type == BTRFS_TREE_BLOCK_REF_KEY || - type == BTRFS_SHARED_BLOCK_REF_KEY) { - ref_root = btrfs_extent_inline_ref_offset(eb, eiref); - ret = __data_list_add(data_refs, inum, - extent_data_item_offset, - ref_root); - } - } while (!ret && !last); - - btrfs_release_path(path); - - if (ref_root == 0) { - printk(KERN_ERR "btrfs: failed to find tree block ref " - "for shared data backref %llu\n", logical); - WARN_ON(1); - ret = -EIO; - } - -out: - while (!list_empty(data_refs)) { - ref = list_first_entry(data_refs, struct __data_ref, list); - list_del(&ref->list); - if (!ret) - ret = iterate(ref->inum, extent_offset + - ref->extent_data_item_offset, - ref->root, ctx); - kfree(ref); - } - - return ret; -} - -static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info, - u64 logical, u64 orig_extent_item_objectid, - u64 extent_offset, struct btrfs_path *path, - struct list_head *data_refs, - iterate_extent_inodes_t *iterate, - void *ctx) +static int iterate_leaf_refs(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, u64 logical, + u64 orig_extent_item_objectid, + u64 extent_item_pos, u64 root, + iterate_extent_inodes_t *iterate, void *ctx) { u64 disk_byte; struct btrfs_key key; @@ -1199,8 +1080,10 @@ static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info, struct extent_buffer *eb; int slot; int nritems; - int ret; - int found = 0; + int ret = 0; + int extent_type; + u64 data_offset; + u64 data_len; eb = read_tree_block(fs_info->tree_root, logical, fs_info->tree_root->leafsize, 0); @@ -1218,149 +1101,99 @@ static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info, if (key.type != BTRFS_EXTENT_DATA_KEY) continue; fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); - if (!fi) { - free_extent_buffer(eb); - return -EIO; - } + extent_type = btrfs_file_extent_type(eb, fi); + if (extent_type == BTRFS_FILE_EXTENT_INLINE) + continue; + /* don't skip BTRFS_FILE_EXTENT_PREALLOC, we can handle that */ disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); - if 
(disk_byte != orig_extent_item_objectid) { - if (found) - break; - else - continue; - } - ++found; - ret = __iter_shared_inline_ref_inodes(fs_info, logical, - key.objectid, - key.offset, - extent_offset, path, - data_refs, - iterate, ctx); - if (ret) - break; - } + if (disk_byte != orig_extent_item_objectid) + continue; - if (!found) { - printk(KERN_ERR "btrfs: failed to follow shared data backref " - "to parent %llu\n", logical); - WARN_ON(1); - ret = -EIO; + data_offset = btrfs_file_extent_offset(eb, fi); + data_len = btrfs_file_extent_num_bytes(eb, fi); + + if (extent_item_pos < data_offset || + extent_item_pos >= data_offset + data_len) + continue; + + pr_debug("ref for %llu resolved, key (%llu EXTEND_DATA %llu), " + "root %llu\n", orig_extent_item_objectid, + key.objectid, key.offset, root); + ret = iterate(key.objectid, + key.offset + (extent_item_pos - data_offset), + root, ctx); + if (ret) { + pr_debug("stopping iteration because ret=%d\n", ret); + break; + } } free_extent_buffer(eb); + return ret; } /* * calls iterate() for every inode that references the extent identified by - * the given parameters. will use the path given as a parameter and return it - * released. + * the given parameters. * when the iterator function returns a non-zero value, iteration stops. + * path is guaranteed to be in released state when iterate() is called. */ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, struct btrfs_path *path, - u64 extent_item_objectid, - u64 extent_offset, + u64 extent_item_objectid, u64 extent_item_pos, iterate_extent_inodes_t *iterate, void *ctx) { - unsigned long ptr = 0; - int last; int ret; - int type; - u64 logical; - u32 item_size; - struct btrfs_extent_inline_ref *eiref; - struct btrfs_extent_data_ref *dref; - struct extent_buffer *eb; - struct btrfs_extent_item *ei; - struct btrfs_key key; struct list_head data_refs = LIST_HEAD_INIT(data_refs); struct list_head shared_refs = LIST_HEAD_INIT(shared_refs); - struct __data_ref *ref_d; - struct __shared_ref *ref_s; + struct btrfs_trans_handle *trans; + struct ulist *refs; + struct ulist *roots; + struct ulist_node *ref_node = NULL; + struct ulist_node *root_node = NULL; + struct seq_list seq_elem; + struct btrfs_delayed_ref_root *delayed_refs; - eb = path->nodes[0]; - ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); - item_size = btrfs_item_size_nr(eb, path->slots[0]); - - /* first we iterate the inline refs, ... */ - do { - last = __get_extent_inline_ref(&ptr, eb, ei, item_size, - &eiref, &type); - if (last == -ENOENT) { - ret = 0; - break; - } - if (last < 0) { - ret = last; - break; - } + trans = btrfs_join_transaction(fs_info->extent_root); + if (IS_ERR(trans)) + return PTR_ERR(trans); - if (type == BTRFS_EXTENT_DATA_REF_KEY) { - dref = (struct btrfs_extent_data_ref *)(&eiref->offset); - ret = __data_list_add_eb(&data_refs, eb, dref); - } else if (type == BTRFS_SHARED_DATA_REF_KEY) { - logical = btrfs_extent_inline_ref_offset(eb, eiref); - ret = __shared_list_add(&shared_refs, logical); - } - } while (!ret && !last); + pr_debug("resolving all inodes for extent %llu\n", + extent_item_objectid); - /* ... then we proceed to in-tree references and ... 
*/ - while (!ret) { - ++path->slots[0]; - if (path->slots[0] > btrfs_header_nritems(eb)) { - ret = btrfs_next_leaf(fs_info->extent_root, path); - if (ret) { - if (ret == 1) - ret = 0; /* we're done */ - break; - } - eb = path->nodes[0]; - } - btrfs_item_key_to_cpu(eb, &key, path->slots[0]); - if (key.objectid != extent_item_objectid) - break; - if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { - dref = btrfs_item_ptr(eb, path->slots[0], - struct btrfs_extent_data_ref); - ret = __data_list_add_eb(&data_refs, eb, dref); - } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { - ret = __shared_list_add(&shared_refs, key.offset); - } - } + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + btrfs_get_delayed_seq(delayed_refs, &seq_elem); + spin_unlock(&delayed_refs->lock); - btrfs_release_path(path); + ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid, + extent_item_pos, seq_elem.seq, + &refs); - /* - * ... only at the very end we can process the refs we found. this is - * because the iterator function we call is allowed to make tree lookups - * and we have to avoid deadlocks. additionally, we need more tree - * lookups ourselves for shared data refs. - */ - while (!list_empty(&data_refs)) { - ref_d = list_first_entry(&data_refs, struct __data_ref, list); - list_del(&ref_d->list); - if (!ret) - ret = iterate(ref_d->inum, extent_offset + - ref_d->extent_data_item_offset, - ref_d->root, ctx); - kfree(ref_d); - } + if (ret) + goto out; - while (!list_empty(&shared_refs)) { - ref_s = list_first_entry(&shared_refs, struct __shared_ref, - list); - list_del(&ref_s->list); - if (!ret) - ret = __iter_shared_inline_ref(fs_info, - ref_s->disk_byte, - extent_item_objectid, - extent_offset, path, - &data_refs, - iterate, ctx); - kfree(ref_s); + while (!ret && (ref_node = ulist_next(refs, ref_node))) { + ret = btrfs_find_all_roots(trans, fs_info, ref_node->val, -1, + seq_elem.seq, &roots); + if (ret) + break; + while (!ret && (root_node = ulist_next(roots, root_node))) { + pr_debug("root %llu references leaf %llu\n", + root_node->val, ref_node->val); + ret = iterate_leaf_refs(fs_info, path, ref_node->val, + extent_item_objectid, + extent_item_pos, root_node->val, + iterate, ctx); + } } + ulist_free(refs); + ulist_free(roots); +out: + btrfs_put_delayed_seq(delayed_refs, &seq_elem); + btrfs_end_transaction(trans, fs_info->extent_root); return ret; } @@ -1369,19 +1202,20 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, iterate_extent_inodes_t *iterate, void *ctx) { int ret; - u64 offset; + u64 extent_item_pos; struct btrfs_key found_key; ret = extent_from_logical(fs_info, logical, path, &found_key); + btrfs_release_path(path); if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) ret = -EINVAL; if (ret < 0) return ret; - offset = logical - found_key.objectid; + extent_item_pos = logical - found_key.objectid; ret = iterate_extent_inodes(fs_info, path, found_key.objectid, - offset, iterate, ctx); + extent_item_pos, iterate, ctx); return ret; } @@ -1426,6 +1260,10 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root, for (cur = 0; cur < btrfs_item_size(eb, item); cur += len) { name_len = btrfs_inode_ref_name_len(eb, iref); /* path must be released before calling iterate()! 
*/ + pr_debug("following ref at offset %u for inode %llu in " + "tree %llu\n", cur, + (unsigned long long)found_key.objectid, + (unsigned long long)fs_root->objectid); ret = iterate(parent, iref, eb, ctx); if (ret) { free_extent_buffer(eb); @@ -1466,10 +1304,14 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref, return PTR_ERR(fspath); if (fspath > fspath_min) { + pr_debug("path resolved: %s\n", fspath); ipath->fspath->val[i] = (u64)(unsigned long)fspath; ++ipath->fspath->elem_cnt; ipath->fspath->bytes_left = fspath - fspath_min; } else { + pr_debug("missed path, not enough space. missing bytes: %lu, " + "constructed so far: %s\n", + (unsigned long)(fspath_min - fspath), fspath_min); ++ipath->fspath->elem_missed; ipath->fspath->bytes_missing += fspath_min - fspath; ipath->fspath->bytes_left = 0; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index c48f2e931ea..9b0526872b7 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2976,7 +2976,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root, { int ret = 0; int size; - u64 extent_offset; + u64 extent_item_pos; struct btrfs_ioctl_logical_ino_args *loi; struct btrfs_data_container *inodes = NULL; struct btrfs_path *path = NULL; @@ -3007,15 +3007,17 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root, } ret = extent_from_logical(root->fs_info, loi->logical, path, &key); + btrfs_release_path(path); if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) ret = -ENOENT; if (ret < 0) goto out; - extent_offset = loi->logical - key.objectid; + extent_item_pos = loi->logical - key.objectid; ret = iterate_extent_inodes(root->fs_info, path, key.objectid, - extent_offset, build_ino_list, inodes); + extent_item_pos, build_ino_list, + inodes); if (ret < 0) goto out; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index c27bcb67f33..b5edff25a53 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -309,7 +309,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio, u8 ref_level; unsigned long ptr = 0; const int bufsize = 4096; - u64 extent_offset; + u64 extent_item_pos; path = btrfs_alloc_path(); @@ -329,12 +329,13 @@ static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio, if (ret < 0) goto out; - extent_offset = swarn.logical - found_key.objectid; + extent_item_pos = swarn.logical - found_key.objectid; swarn.extent_item_size = found_key.offset; eb = path->nodes[0]; ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); item_size = btrfs_item_size_nr(eb, path->slots[0]); + btrfs_release_path(path); if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) { do { @@ -351,7 +352,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio, } else { swarn.path = path; iterate_extent_inodes(fs_info, path, found_key.objectid, - extent_offset, + extent_item_pos, scrub_print_warning_inode, &swarn); } -- cgit v1.2.3 From 6bf7e080d5bcb0d399ee38ce3dabbfad64448192 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Thu, 1 Dec 2011 14:35:19 +0100 Subject: Btrfs: make sure we're not using obsolete code in btrfs_get_extent There's code in btrfs_get_extent that should never be used. This patch turns a WARN_ON(1) into a BUG(), hoping we can remove the transaction code from btrfs_get_extent soon. 
Signed-off-by: Jan Schmidt --- fs/btrfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ea819386b86..603d740f0f1 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5022,7 +5022,7 @@ again: } flush_dcache_page(page); } else if (create && PageUptodate(page)) { - WARN_ON(1); + BUG(); if (!trans) { kunmap(page); free_extent_map(em); -- cgit v1.2.3 From 68c97153fb7f2877f98aa6c29546381d9cad2fed Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 3 Jan 2012 13:22:46 -0500 Subject: SUNRPC: Clean up the RPCSEC_GSS service ticket requests Instead of hacking specific service names into gss_encode_v1_msg, we should just allow the caller to specify the service name explicitly. Signed-off-by: Trond Myklebust Acked-by: J. Bruce Fields --- fs/nfs/client.c | 2 +- fs/nfsd/nfs4callback.c | 2 +- include/linux/sunrpc/auth.h | 3 ++- include/linux/sunrpc/auth_gss.h | 2 +- net/sunrpc/auth_generic.c | 6 ++++-- net/sunrpc/auth_gss/auth_gss.c | 40 +++++++++++++++++++++++----------------- 6 files changed, 32 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 873bf00d51a..32ea37198e9 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -185,7 +185,7 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_ clp->cl_minorversion = cl_init->minorversion; clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion]; #endif - cred = rpc_lookup_machine_cred(); + cred = rpc_lookup_machine_cred("*"); if (!IS_ERR(cred)) clp->cl_machine_cred = cred; nfs_fscache_get_client_cookie(clp); diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 7748d6a18d9..6f3ebb48b12 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -718,7 +718,7 @@ int set_callback_cred(void) { if (callback_cred) return 0; - callback_cred = rpc_lookup_machine_cred(); + callback_cred = rpc_lookup_machine_cred("nfs"); if (!callback_cred) return -ENOMEM; return 0; diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h index febc4dbec2c..7874a8a5663 100644 --- a/include/linux/sunrpc/auth.h +++ b/include/linux/sunrpc/auth.h @@ -26,6 +26,7 @@ struct auth_cred { uid_t uid; gid_t gid; struct group_info *group_info; + const char *principal; unsigned char machine_cred : 1; }; @@ -127,7 +128,7 @@ void rpc_destroy_generic_auth(void); void rpc_destroy_authunix(void); struct rpc_cred * rpc_lookup_cred(void); -struct rpc_cred * rpc_lookup_machine_cred(void); +struct rpc_cred * rpc_lookup_machine_cred(const char *service_name); int rpcauth_register(const struct rpc_authops *); int rpcauth_unregister(const struct rpc_authops *); struct rpc_auth * rpcauth_create(rpc_authflavor_t, struct rpc_clnt *); diff --git a/include/linux/sunrpc/auth_gss.h b/include/linux/sunrpc/auth_gss.h index 8eee9dbbfe7..f1cfd4c85cd 100644 --- a/include/linux/sunrpc/auth_gss.h +++ b/include/linux/sunrpc/auth_gss.h @@ -82,8 +82,8 @@ struct gss_cred { enum rpc_gss_svc gc_service; struct gss_cl_ctx __rcu *gc_ctx; struct gss_upcall_msg *gc_upcall; + const char *gc_principal; unsigned long gc_upcall_timestamp; - unsigned char gc_machine_cred : 1; }; #endif /* __KERNEL__ */ diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c index e010a015d99..1426ec3d0a5 100644 --- a/net/sunrpc/auth_generic.c +++ b/net/sunrpc/auth_generic.c @@ -41,15 +41,17 @@ EXPORT_SYMBOL_GPL(rpc_lookup_cred); /* * Public call interface for looking up machine creds. 
*/ -struct rpc_cred *rpc_lookup_machine_cred(void) +struct rpc_cred *rpc_lookup_machine_cred(const char *service_name) { struct auth_cred acred = { .uid = RPC_MACHINE_CRED_USERID, .gid = RPC_MACHINE_CRED_GROUPID, + .principal = service_name, .machine_cred = 1, }; - dprintk("RPC: looking up machine cred\n"); + dprintk("RPC: looking up machine cred for service %s\n", + service_name); return generic_auth.au_ops->lookup_cred(&generic_auth, &acred, 0); } EXPORT_SYMBOL_GPL(rpc_lookup_machine_cred); diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index afb56553dfe..28d72d29873 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -392,7 +392,8 @@ static void gss_encode_v0_msg(struct gss_upcall_msg *gss_msg) } static void gss_encode_v1_msg(struct gss_upcall_msg *gss_msg, - struct rpc_clnt *clnt, int machine_cred) + struct rpc_clnt *clnt, + const char *service_name) { struct gss_api_mech *mech = gss_msg->auth->mech; char *p = gss_msg->databuf; @@ -407,12 +408,8 @@ static void gss_encode_v1_msg(struct gss_upcall_msg *gss_msg, p += len; gss_msg->msg.len += len; } - if (machine_cred) { - len = sprintf(p, "service=* "); - p += len; - gss_msg->msg.len += len; - } else if (!strcmp(clnt->cl_program->name, "nfs4_cb")) { - len = sprintf(p, "service=nfs "); + if (service_name != NULL) { + len = sprintf(p, "service=%s ", service_name); p += len; gss_msg->msg.len += len; } @@ -429,17 +426,18 @@ static void gss_encode_v1_msg(struct gss_upcall_msg *gss_msg, } static void gss_encode_msg(struct gss_upcall_msg *gss_msg, - struct rpc_clnt *clnt, int machine_cred) + struct rpc_clnt *clnt, + const char *service_name) { if (pipe_version == 0) gss_encode_v0_msg(gss_msg); else /* pipe_version == 1 */ - gss_encode_v1_msg(gss_msg, clnt, machine_cred); + gss_encode_v1_msg(gss_msg, clnt, service_name); } -static inline struct gss_upcall_msg * -gss_alloc_msg(struct gss_auth *gss_auth, uid_t uid, struct rpc_clnt *clnt, - int machine_cred) +static struct gss_upcall_msg * +gss_alloc_msg(struct gss_auth *gss_auth, struct rpc_clnt *clnt, + uid_t uid, const char *service_name) { struct gss_upcall_msg *gss_msg; int vers; @@ -459,7 +457,7 @@ gss_alloc_msg(struct gss_auth *gss_auth, uid_t uid, struct rpc_clnt *clnt, atomic_set(&gss_msg->count, 1); gss_msg->uid = uid; gss_msg->auth = gss_auth; - gss_encode_msg(gss_msg, clnt, machine_cred); + gss_encode_msg(gss_msg, clnt, service_name); return gss_msg; } @@ -471,7 +469,7 @@ gss_setup_upcall(struct rpc_clnt *clnt, struct gss_auth *gss_auth, struct rpc_cr struct gss_upcall_msg *gss_new, *gss_msg; uid_t uid = cred->cr_uid; - gss_new = gss_alloc_msg(gss_auth, uid, clnt, gss_cred->gc_machine_cred); + gss_new = gss_alloc_msg(gss_auth, clnt, uid, gss_cred->gc_principal); if (IS_ERR(gss_new)) return gss_new; gss_msg = gss_add_msg(gss_new); @@ -995,7 +993,9 @@ gss_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) */ cred->gc_base.cr_flags = 1UL << RPCAUTH_CRED_NEW; cred->gc_service = gss_auth->service; - cred->gc_machine_cred = acred->machine_cred; + cred->gc_principal = NULL; + if (acred->machine_cred) + cred->gc_principal = acred->principal; kref_get(&gss_auth->kref); return &cred->gc_base; @@ -1030,7 +1030,12 @@ gss_match(struct auth_cred *acred, struct rpc_cred *rc, int flags) if (!test_bit(RPCAUTH_CRED_UPTODATE, &rc->cr_flags)) return 0; out: - if (acred->machine_cred != gss_cred->gc_machine_cred) + if (acred->principal != NULL) { + if (gss_cred->gc_principal == NULL) + return 0; + return 
strcmp(acred->principal, gss_cred->gc_principal) == 0; + } + if (gss_cred->gc_principal != NULL) return 0; return rc->cr_uid == acred->uid; } @@ -1104,7 +1109,8 @@ static int gss_renew_cred(struct rpc_task *task) struct rpc_auth *auth = oldcred->cr_auth; struct auth_cred acred = { .uid = oldcred->cr_uid, - .machine_cred = gss_cred->gc_machine_cred, + .principal = gss_cred->gc_principal, + .machine_cred = (gss_cred->gc_principal != NULL ? 1 : 0), }; struct rpc_cred *new; -- cgit v1.2.3 From 2edb6bc3852c681c0d948245bd55108dc6407604 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 16 Nov 2011 11:46:31 +1100 Subject: NFS - fix recent breakage to NFS error handling. From c6d615d2b97fe305cbf123a8751ced859dca1d5e Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 16 Nov 2011 09:39:05 +1100 Subject: [PATCH] NFS - fix recent breakage to NFS error handling. commit 02c24a82187d5a628c68edfe71ae60dc135cd178 made a small and presumably unintended change to write error handling in NFS. Previously an error from filemap_write_and_wait_range would only be of interest if nfs_file_fsync did not return an error. After this commit, an error from filemap_write_and_wait_range would mean that (the rest of) nfs_file_fsync would not even be called. This means that: 1/ you are more likely to see EIO than e.g. EDQUOT or ENOSPC. 2/ NFS_CONTEXT_ERROR_WRITE remains set for longer so more writes are synchronous. This patch restores previous behaviour. Cc: stable@kernel.org Cc: Josef Bacik Cc: Jan Kara Cc: Al Viro Signed-off-by: NeilBrown Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 606ef0f20ae..c43a452f7da 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -272,13 +272,13 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) datasync); ret = filemap_write_and_wait_range(inode->i_mapping, start, end); - if (ret) - return ret; mutex_lock(&inode->i_mutex); nfs_inc_stats(inode, NFSIOS_VFSFSYNC); have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); status = nfs_commit_inode(inode, FLUSH_SYNC); + if (status >= 0 && ret < 0) + status = ret; have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); if (have_error) ret = xchg(&ctx->error, 0); -- cgit v1.2.3 From 8a0d551a59ac92d8ff048d6cb29d3a02073e81e8 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 20 Dec 2011 06:57:45 -0500 Subject: nfs: fix regression in handling of context= option in NFSv4 Setting the security context of a NFSv4 mount via the context= mount option is currently broken. The NFSv4 codepath allocates a parsed options struct, and then parses the mount options to fill it. It eventually calls nfs4_remote_mount which calls security_init_mnt_opts. That clobbers the lsm_opts struct that was populated earlier. This bug also looks like it causes a small memory leak on each v4 mount where context= is used. Fix this by moving the initialization of the lsm_opts into nfs_alloc_parsed_mount_data. Also, add a destructor for nfs_parsed_mount_data to make it easier to free all of the allocations hanging off of it, and to ensure that the security_free_mnt_opts is called whenever security_init_mnt_opts is. I believe this regression was introduced quite some time ago, probably by commit c02d7adf. 
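The resulting nfs_fs_mount() flow, in condensed and illustrative form (function names follow the patch; only the allocation and teardown shape is shown):

	data = nfs_alloc_parsed_mount_data(NFS_DEFAULT_VERSION); /* lsm_opts initialized here */
	mntfh = nfs_alloc_fhandle();
	if (data == NULL || mntfh == NULL)
		goto out;                       /* the destructor tolerates NULL */

	error = nfs_validate_mount_data(raw_data, data, mntfh, dev_name);
	/* parsing may attach a context= label to data->lsm_opts; nothing
	 * after this point calls security_init_mnt_opts() on it again */

	/* ... superblock and root dentry setup elided ... */
out:
	nfs_free_parsed_mount_data(data);       /* the one place security_free_mnt_opts() runs */
	nfs_free_fhandle(mntfh);
	return mntroot;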
Cc: stable@vger.kernel.org Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 43 +++++++++++++++++++------------------------ 1 file changed, 19 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 134777406ee..3ada13c9986 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -909,10 +909,24 @@ static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(unsigned int ve data->auth_flavor_len = 1; data->version = version; data->minorversion = 0; + security_init_mnt_opts(&data->lsm_opts); } return data; } +static void nfs_free_parsed_mount_data(struct nfs_parsed_mount_data *data) +{ + if (data) { + kfree(data->client_address); + kfree(data->mount_server.hostname); + kfree(data->nfs_server.export_path); + kfree(data->nfs_server.hostname); + kfree(data->fscache_uniq); + security_free_mnt_opts(&data->lsm_opts); + kfree(data); + } +} + /* * Sanity-check a server address provided by the mount command. * @@ -2220,9 +2234,7 @@ static struct dentry *nfs_fs_mount(struct file_system_type *fs_type, data = nfs_alloc_parsed_mount_data(NFS_DEFAULT_VERSION); mntfh = nfs_alloc_fhandle(); if (data == NULL || mntfh == NULL) - goto out_free_fh; - - security_init_mnt_opts(&data->lsm_opts); + goto out; /* Validate the mount data */ error = nfs_validate_mount_data(raw_data, data, mntfh, dev_name); @@ -2234,8 +2246,6 @@ static struct dentry *nfs_fs_mount(struct file_system_type *fs_type, #ifdef CONFIG_NFS_V4 if (data->version == 4) { mntroot = nfs4_try_mount(flags, dev_name, data); - kfree(data->client_address); - kfree(data->nfs_server.export_path); goto out; } #endif /* CONFIG_NFS_V4 */ @@ -2290,13 +2300,8 @@ static struct dentry *nfs_fs_mount(struct file_system_type *fs_type, s->s_flags |= MS_ACTIVE; out: - kfree(data->nfs_server.hostname); - kfree(data->mount_server.hostname); - kfree(data->fscache_uniq); - security_free_mnt_opts(&data->lsm_opts); -out_free_fh: + nfs_free_parsed_mount_data(data); nfs_free_fhandle(mntfh); - kfree(data); return mntroot; out_err_nosb: @@ -2623,9 +2628,7 @@ nfs4_remote_mount(struct file_system_type *fs_type, int flags, mntfh = nfs_alloc_fhandle(); if (data == NULL || mntfh == NULL) - goto out_free_fh; - - security_init_mnt_opts(&data->lsm_opts); + goto out; /* Get a volume representation */ server = nfs4_create_server(data, mntfh); @@ -2677,13 +2680,10 @@ nfs4_remote_mount(struct file_system_type *fs_type, int flags, s->s_flags |= MS_ACTIVE; - security_free_mnt_opts(&data->lsm_opts); nfs_free_fhandle(mntfh); return mntroot; out: - security_free_mnt_opts(&data->lsm_opts); -out_free_fh: nfs_free_fhandle(mntfh); return ERR_PTR(error); @@ -2838,7 +2838,7 @@ static struct dentry *nfs4_mount(struct file_system_type *fs_type, data = nfs_alloc_parsed_mount_data(4); if (data == NULL) - goto out_free_data; + goto out; /* Validate the mount data */ error = nfs4_validate_mount_data(raw_data, data, dev_name); @@ -2852,12 +2852,7 @@ static struct dentry *nfs4_mount(struct file_system_type *fs_type, error = PTR_ERR(res); out: - kfree(data->client_address); - kfree(data->nfs_server.export_path); - kfree(data->nfs_server.hostname); - kfree(data->fscache_uniq); -out_free_data: - kfree(data); + nfs_free_parsed_mount_data(data); dprintk("<-- nfs4_mount() = %d%s\n", error, error != 0 ? 
" [error]" : ""); return res; -- cgit v1.2.3 From 61f2e5106582d02f30b6807e3f9c07463c572ccb Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Wed, 9 Nov 2011 13:58:20 -0500 Subject: NFSv4.1: fix backchannel slotid off-by-one bug Cc:stable@kernel.org Signed-off-by: Andy Adamson Signed-off-by: Trond Myklebust --- fs/nfs/callback_proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 43926add945..54cea8ad5a7 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -339,7 +339,7 @@ validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args) dprintk("%s enter. slotid %d seqid %d\n", __func__, args->csa_slotid, args->csa_sequenceid); - if (args->csa_slotid > NFS41_BC_MAX_CALLBACKS) + if (args->csa_slotid >= NFS41_BC_MAX_CALLBACKS) return htonl(NFS4ERR_BADSLOT); slot = tbl->slots + args->csa_slotid; -- cgit v1.2.3 From aacd5537270a752fe12a9914a207284fc2341c6d Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Wed, 9 Nov 2011 13:58:21 -0500 Subject: NFSv4.1: cleanup init and reset of session slot tables We are either initializing or resetting a session. Initialize or reset the session slot tables accordingly. Signed-off-by: Andy Adamson Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 59 +++++++++++++++++++++---------------------------------- 1 file changed, 22 insertions(+), 37 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index d9f4d78c341..a64aa56e140 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -5021,23 +5021,6 @@ out: return ret; } -/* - * Reset the forechannel and backchannel slot tables - */ -static int nfs4_reset_slot_tables(struct nfs4_session *session) -{ - int status; - - status = nfs4_reset_slot_table(&session->fc_slot_table, - session->fc_attrs.max_reqs, 1); - if (status) - return status; - - status = nfs4_reset_slot_table(&session->bc_slot_table, - session->bc_attrs.max_reqs, 0); - return status; -} - /* Destroy the slot table */ static void nfs4_destroy_slot_tables(struct nfs4_session *session) { @@ -5083,29 +5066,35 @@ out: } /* - * Initialize the forechannel and backchannel tables + * Initialize or reset the forechannel and backchannel tables */ -static int nfs4_init_slot_tables(struct nfs4_session *session) +static int nfs4_setup_session_slot_tables(struct nfs4_session *ses) { struct nfs4_slot_table *tbl; - int status = 0; + int status; - tbl = &session->fc_slot_table; + dprintk("--> %s\n", __func__); + /* Fore channel */ + tbl = &ses->fc_slot_table; if (tbl->slots == NULL) { - status = nfs4_init_slot_table(tbl, - session->fc_attrs.max_reqs, 1); + status = nfs4_init_slot_table(tbl, ses->fc_attrs.max_reqs, 1); + if (status) /* -ENOMEM */ + return status; + } else { + status = nfs4_reset_slot_table(tbl, ses->fc_attrs.max_reqs, 1); if (status) return status; } - - tbl = &session->bc_slot_table; + /* Back channel */ + tbl = &ses->bc_slot_table; if (tbl->slots == NULL) { - status = nfs4_init_slot_table(tbl, - session->bc_attrs.max_reqs, 0); + status = nfs4_init_slot_table(tbl, ses->bc_attrs.max_reqs, 0); if (status) - nfs4_destroy_slot_tables(session); - } - + /* Fore and back channel share a connection so get + * both slot tables or neither */ + nfs4_destroy_slot_tables(ses); + } else + status = nfs4_reset_slot_table(tbl, ses->bc_attrs.max_reqs, 0); return status; } @@ -5293,13 +5282,9 @@ int nfs4_proc_create_session(struct nfs_client *clp) if (status) goto out; - /* Init and reset the fore channel */ - status = 
nfs4_init_slot_tables(session); - dprintk("slot table initialization returned %d\n", status); - if (status) - goto out; - status = nfs4_reset_slot_tables(session); - dprintk("slot table reset returned %d\n", status); + /* Init or reset the session slot tables */ + status = nfs4_setup_session_slot_tables(session); + dprintk("slot table setup returned %d\n", status); if (status) goto out; -- cgit v1.2.3 From aabd0b40b327d5c6518c8c908819b9bf864ad56a Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Wed, 9 Nov 2011 13:58:22 -0500 Subject: NFSv4.1: change nfs4_free_slot parameters for dynamic slots Signed-off-by: Andy Adamson Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index a64aa56e140..b2104461ed4 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -363,9 +363,8 @@ static void renew_lease(const struct nfs_server *server, unsigned long timestamp * Must be called while holding tbl->slot_tbl_lock */ static void -nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *free_slot) +nfs4_free_slot(struct nfs4_slot_table *tbl, u8 free_slotid) { - int free_slotid = free_slot - tbl->slots; int slotid = free_slotid; BUG_ON(slotid < 0 || slotid >= NFS4_MAX_SLOT_TABLE); @@ -430,7 +429,7 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) } spin_lock(&tbl->slot_tbl_lock); - nfs4_free_slot(tbl, res->sr_slot); + nfs4_free_slot(tbl, res->sr_slot - tbl->slots); nfs4_check_drain_fc_complete(res->sr_session); spin_unlock(&tbl->slot_tbl_lock); res->sr_slot = NULL; -- cgit v1.2.3 From 0b1c8fc43c1f9fcde2d18182988f05eeaaae509b Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Wed, 9 Nov 2011 13:58:26 -0500 Subject: NFSv4.1: cleanup comment and debug printk Signed-off-by: Andy Adamson Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index b2104461ed4..fcc2408d7ab 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -552,13 +552,10 @@ int nfs41_setup_sequence(struct nfs4_session *session, spin_lock(&tbl->slot_tbl_lock); if (test_bit(NFS4_SESSION_DRAINING, &session->session_state) && !rpc_task_has_priority(task, RPC_PRIORITY_PRIVILEGED)) { - /* - * The state manager will wait until the slot table is empty. - * Schedule the reset thread - */ + /* The state manager will wait until the slot table is empty */ rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL); spin_unlock(&tbl->slot_tbl_lock); - dprintk("%s Schedule Session Reset\n", __func__); + dprintk("%s session is draining\n", __func__); return -EAGAIN; } -- cgit v1.2.3 From 3476f114addb7b96912840a234702f660a1f460b Mon Sep 17 00:00:00 2001 From: Chris Metcalf Date: Thu, 11 Aug 2011 13:54:28 -0700 Subject: nfs: fix a minor do_div portability issue This change modifies filelayout_get_dense_offset() to use the functions in math64.h and thus avoid a 32-bit platform compile error trying to use do_div() on an s64 type. 
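do_div(n, base) expects an unsigned 64-bit lvalue, divides it in place and evaluates to the remainder, which is why handing it a signed 64-bit offset breaks 32-bit builds; the math64.h helpers take and return values instead. A minimal sketch of the idiom the patch switches to (it mirrors the hunk below):

	#include <linux/math64.h>

	u32 stripe_width = flseg->stripe_unit * flseg->dsaddr->stripe_count;
	u64 stripe_no;
	u32 rem;

	offset -= flseg->pattern_offset;
	stripe_no = div_u64(offset, stripe_width);        /* 64-by-32 quotient  */
	div_u64_rem(offset, flseg->stripe_unit, &rem);    /* 64-by-32 remainder */

	return stripe_no * flseg->stripe_unit + rem;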
Signed-off-by: Chris Metcalf Reviewed-by: Boaz Harrosh Signed-off-by: Trond Myklebust --- fs/nfs/nfs4filelayout.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index a62d36b9a99..71ec08617e2 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -49,13 +49,14 @@ filelayout_get_dense_offset(struct nfs4_filelayout_segment *flseg, loff_t offset) { u32 stripe_width = flseg->stripe_unit * flseg->dsaddr->stripe_count; - u64 tmp; + u64 stripe_no; + u32 rem; offset -= flseg->pattern_offset; - tmp = offset; - do_div(tmp, stripe_width); + stripe_no = div_u64(offset, stripe_width); + div_u64_rem(offset, flseg->stripe_unit, &rem); - return tmp * flseg->stripe_unit + do_div(offset, flseg->stripe_unit); + return stripe_no * flseg->stripe_unit + rem; } /* This function is used by the layout driver to calculate the -- cgit v1.2.3 From bf118a342f10dafe44b14451a1392c3254629a1f Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Wed, 7 Dec 2011 11:55:27 -0500 Subject: NFSv4: include bitmap in nfsv4 get acl data The NFSv4 bitmap size is unbounded: a server can return an arbitrary sized bitmap in an FATTR4_WORD0_ACL request. Replace using the nfs4_fattr_bitmap_maxsz as a guess to the maximum bitmask returned by a server with the inclusion of the bitmap (xdr length plus bitmasks) and the acl data xdr length to the (cached) acl page data. This is a general solution to commit e5012d1f "NFSv4.1: update nfs4_fattr_bitmap_maxsz" and fixes hitting a BUG_ON in xdr_shrink_bufhead when getting ACLs. Fix a bug in decode_getacl that returned -EINVAL on ACLs > page when getxattr was called with a NULL buffer, preventing ACL > PAGE_SIZE from being retrieved. Cc: stable@kernel.org Signed-off-by: Andy Adamson Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 96 +++++++++++++++++++++++++++------------------- fs/nfs/nfs4xdr.c | 31 +++++++++++---- include/linux/nfs_xdr.h | 5 +++ include/linux/sunrpc/xdr.h | 2 + net/sunrpc/xdr.c | 3 +- 5 files changed, 89 insertions(+), 48 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index fcc2408d7ab..3b108011845 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3426,19 +3426,6 @@ static inline int nfs4_server_supports_acls(struct nfs_server *server) */ #define NFS4ACL_MAXPAGES (XATTR_SIZE_MAX >> PAGE_CACHE_SHIFT) -static void buf_to_pages(const void *buf, size_t buflen, - struct page **pages, unsigned int *pgbase) -{ - const void *p = buf; - - *pgbase = offset_in_page(buf); - p -= *pgbase; - while (p < buf + buflen) { - *(pages++) = virt_to_page(p); - p += PAGE_CACHE_SIZE; - } -} - static int buf_to_pages_noslab(const void *buf, size_t buflen, struct page **pages, unsigned int *pgbase) { @@ -3535,9 +3522,19 @@ out: nfs4_set_cached_acl(inode, acl); } +/* + * The getxattr API returns the required buffer length when called with a + * NULL buf. The NFSv4 acl tool then calls getxattr again after allocating + * the required buf. On a NULL buf, we send a page of data to the server + * guessing that the ACL request can be serviced by a page. If so, we cache + * up to the page of ACL data, and the 2nd call to getxattr is serviced by + * the cache. If not so, we throw away the page, and cache the required + * length. The next getxattr call will then produce another round trip to + * the server, this time with the input buf of the required size. 
+ */ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t buflen) { - struct page *pages[NFS4ACL_MAXPAGES]; + struct page *pages[NFS4ACL_MAXPAGES] = {NULL, }; struct nfs_getaclargs args = { .fh = NFS_FH(inode), .acl_pages = pages, @@ -3552,41 +3549,60 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu .rpc_argp = &args, .rpc_resp = &res, }; - struct page *localpage = NULL; - int ret; + int ret = -ENOMEM, npages, i, acl_len = 0; - if (buflen < PAGE_SIZE) { - /* As long as we're doing a round trip to the server anyway, - * let's be prepared for a page of acl data. */ - localpage = alloc_page(GFP_KERNEL); - resp_buf = page_address(localpage); - if (localpage == NULL) - return -ENOMEM; - args.acl_pages[0] = localpage; - args.acl_pgbase = 0; - args.acl_len = PAGE_SIZE; - } else { - resp_buf = buf; - buf_to_pages(buf, buflen, args.acl_pages, &args.acl_pgbase); + npages = (buflen + PAGE_SIZE - 1) >> PAGE_SHIFT; + /* As long as we're doing a round trip to the server anyway, + * let's be prepared for a page of acl data. */ + if (npages == 0) + npages = 1; + + for (i = 0; i < npages; i++) { + pages[i] = alloc_page(GFP_KERNEL); + if (!pages[i]) + goto out_free; + } + if (npages > 1) { + /* for decoding across pages */ + args.acl_scratch = alloc_page(GFP_KERNEL); + if (!args.acl_scratch) + goto out_free; } - ret = nfs4_call_sync(NFS_SERVER(inode)->client, NFS_SERVER(inode), &msg, &args.seq_args, &res.seq_res, 0); + args.acl_len = npages * PAGE_SIZE; + args.acl_pgbase = 0; + /* Let decode_getfacl know not to fail if the ACL data is larger than + * the page we send as a guess */ + if (buf == NULL) + res.acl_flags |= NFS4_ACL_LEN_REQUEST; + resp_buf = page_address(pages[0]); + + dprintk("%s buf %p buflen %ld npages %d args.acl_len %ld\n", + __func__, buf, buflen, npages, args.acl_len); + ret = nfs4_call_sync(NFS_SERVER(inode)->client, NFS_SERVER(inode), + &msg, &args.seq_args, &res.seq_res, 0); if (ret) goto out_free; - if (res.acl_len > args.acl_len) - nfs4_write_cached_acl(inode, NULL, res.acl_len); + + acl_len = res.acl_len - res.acl_data_offset; + if (acl_len > args.acl_len) + nfs4_write_cached_acl(inode, NULL, acl_len); else - nfs4_write_cached_acl(inode, resp_buf, res.acl_len); + nfs4_write_cached_acl(inode, resp_buf + res.acl_data_offset, + acl_len); if (buf) { ret = -ERANGE; - if (res.acl_len > buflen) + if (acl_len > buflen) goto out_free; - if (localpage) - memcpy(buf, resp_buf, res.acl_len); + _copy_from_pages(buf, pages, res.acl_data_offset, + res.acl_len); } - ret = res.acl_len; + ret = acl_len; out_free: - if (localpage) - __free_page(localpage); + for (i = 0; i < npages; i++) + if (pages[i]) + __free_page(pages[i]); + if (args.acl_scratch) + __free_page(args.acl_scratch); return ret; } @@ -3617,6 +3633,8 @@ static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen) nfs_zap_acl_cache(inode); ret = nfs4_read_cached_acl(inode, buf, buflen); if (ret != -ENOENT) + /* -ENOENT is returned if there is no ACL or if there is an ACL + * but no cached acl data, just the acl length */ return ret; return nfs4_get_acl_uncached(inode, buf, buflen); } diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index e6161b213ed..dcaf69309d8 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -2517,11 +2517,13 @@ static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr, encode_compound_hdr(xdr, req, &hdr); encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, args->fh, &hdr); - replen = hdr.replen + 
op_decode_hdr_maxsz + nfs4_fattr_bitmap_maxsz + 1; + replen = hdr.replen + op_decode_hdr_maxsz + 1; encode_getattr_two(xdr, FATTR4_WORD0_ACL, 0, &hdr); xdr_inline_pages(&req->rq_rcv_buf, replen << 2, args->acl_pages, args->acl_pgbase, args->acl_len); + xdr_set_scratch_buffer(xdr, page_address(args->acl_scratch), PAGE_SIZE); + encode_nops(&hdr); } @@ -4957,17 +4959,18 @@ decode_restorefh(struct xdr_stream *xdr) } static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req, - size_t *acl_len) + struct nfs_getaclres *res) { - __be32 *savep; + __be32 *savep, *bm_p; uint32_t attrlen, bitmap[3] = {0}; struct kvec *iov = req->rq_rcv_buf.head; int status; - *acl_len = 0; + res->acl_len = 0; if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) goto out; + bm_p = xdr->p; if ((status = decode_attr_bitmap(xdr, bitmap)) != 0) goto out; if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0) @@ -4979,18 +4982,30 @@ static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req, size_t hdrlen; u32 recvd; + /* The bitmap (xdr len + bitmaps) and the attr xdr len words + * are stored with the acl data to handle the problem of + * variable length bitmaps.*/ + xdr->p = bm_p; + res->acl_data_offset = be32_to_cpup(bm_p) + 2; + res->acl_data_offset <<= 2; + /* We ignore &savep and don't do consistency checks on * the attr length. Let userspace figure it out.... */ hdrlen = (u8 *)xdr->p - (u8 *)iov->iov_base; + attrlen += res->acl_data_offset; recvd = req->rq_rcv_buf.len - hdrlen; if (attrlen > recvd) { - dprintk("NFS: server cheating in getattr" - " acl reply: attrlen %u > recvd %u\n", + if (res->acl_flags & NFS4_ACL_LEN_REQUEST) { + /* getxattr interface called with a NULL buf */ + res->acl_len = attrlen; + goto out; + } + dprintk("NFS: acl reply: attrlen %u > recvd %u\n", attrlen, recvd); return -EINVAL; } xdr_read_pages(xdr, attrlen); - *acl_len = attrlen; + res->acl_len = attrlen; } else status = -EOPNOTSUPP; @@ -6028,7 +6043,7 @@ nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr, status = decode_putfh(xdr); if (status) goto out; - status = decode_getacl(xdr, rqstp, &res->acl_len); + status = decode_getacl(xdr, rqstp, res); out: return status; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 2a7c533be5d..6c898afe609 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -602,11 +602,16 @@ struct nfs_getaclargs { size_t acl_len; unsigned int acl_pgbase; struct page ** acl_pages; + struct page * acl_scratch; struct nfs4_sequence_args seq_args; }; +/* getxattr ACL interface flags */ +#define NFS4_ACL_LEN_REQUEST 0x0001 /* zero length getxattr buffer */ struct nfs_getaclres { size_t acl_len; + size_t acl_data_offset; + int acl_flags; struct nfs4_sequence_res seq_res; }; diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index a20970ef9e4..af70af33354 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -191,6 +191,8 @@ extern int xdr_decode_array2(struct xdr_buf *buf, unsigned int base, struct xdr_array2_desc *desc); extern int xdr_encode_array2(struct xdr_buf *buf, unsigned int base, struct xdr_array2_desc *desc); +extern void _copy_from_pages(char *p, struct page **pages, size_t pgbase, + size_t len); /* * Provide some simple tools for XDR buffer overflow-checking etc. 
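For clarity, a small worked example of the acl_data_offset computation introduced in decode_getacl() above (illustrative only; bm_p is the decoder position saved just before the bitmap, as in that hunk, and the helper name below is made up for the example). The reply lays out <bitmap word count> <bitmap word(s)> <attribute length>, so the ACL bytes begin (count + 2) XDR words past bm_p:

    /* Assumes the kernel byteorder helpers (be32_to_cpup). */
    static u32 acl_data_offset_example(const __be32 *bm_p)
    {
    	/* Number of bitmap words the server returned; this can vary,
    	 * which is why the offset is computed rather than guessed. */
    	u32 bm_words = be32_to_cpup(bm_p);

    	/* count word + bitmap words + attribute length word, in bytes:
    	 * e.g. a single bitmap word gives (1 + 2) << 2 = 12 bytes. */
    	return (bm_words + 2) << 2;
    }
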
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 277ebd4bf09..593f4c60530 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -296,7 +296,7 @@ _copy_to_pages(struct page **pages, size_t pgbase, const char *p, size_t len) * Copies data into an arbitrary memory location from an array of pages * The copy is assumed to be non-overlapping. */ -static void +void _copy_from_pages(char *p, struct page **pages, size_t pgbase, size_t len) { struct page **pgfrom; @@ -324,6 +324,7 @@ _copy_from_pages(char *p, struct page **pages, size_t pgbase, size_t len) } while ((len -= copy) != 0); } +EXPORT_SYMBOL_GPL(_copy_from_pages); /* * xdr_shrink_bufhead -- cgit v1.2.3 From 414adf14cd3b52e411f79d941a15d0fd4af427fc Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 6 Dec 2011 16:13:39 -0500 Subject: NFS: Clean up nfs4_find_state_owners_locked() There's no longer a need to check the so_server field in the state owner, because nowadays the RB tree we search for state owners contains owners for that only server. Make nfs4_find_state_owners_locked() use the same tree searching logic as nfs4_insert_state_owner_locked(). Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/nfs4state.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 6a7107ae6b7..6354e4fcc82 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -377,31 +377,22 @@ nfs4_find_state_owner_locked(struct nfs_server *server, struct rpc_cred *cred) { struct rb_node **p = &server->state_owners.rb_node, *parent = NULL; - struct nfs4_state_owner *sp, *res = NULL; + struct nfs4_state_owner *sp; while (*p != NULL) { parent = *p; sp = rb_entry(parent, struct nfs4_state_owner, so_server_node); - if (server < sp->so_server) { - p = &parent->rb_left; - continue; - } - if (server > sp->so_server) { - p = &parent->rb_right; - continue; - } if (cred < sp->so_cred) p = &parent->rb_left; else if (cred > sp->so_cred) p = &parent->rb_right; else { atomic_inc(&sp->so_count); - res = sp; - break; + return sp; } } - return res; + return NULL; } static struct nfs4_state_owner * -- cgit v1.2.3 From a0ea787b027b79cf2e01c6758e5246db06520158 Mon Sep 17 00:00:00 2001 From: Jim Garlick Date: Tue, 3 Jan 2012 15:27:50 -0600 Subject: fs/9p: check schedule_timeout_interruptible return value In v9fs_file_do_lock() we need to check return value of schedule_timeout_interruptible() and exit the loop when it returns nonzero, otherwise the loop is not really interruptible and after the signal, the loop is no longer throttled by P9_LOCK_TIMEOUT. Signed-off-by: Jim Garlick Signed-off-by: Eric Van Hensbergen --- fs/9p/vfs_file.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 62857a810a7..a7ac45d5bd2 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -204,7 +204,8 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl) break; if (status == P9_LOCK_BLOCKED && !IS_SETLKW(cmd)) break; - schedule_timeout_interruptible(P9_LOCK_TIMEOUT); + if (schedule_timeout_interruptible(P9_LOCK_TIMEOUT) != 0) + break; } /* map 9p status to VFS status */ -- cgit v1.2.3 From 5d3851530d6d68564e4e0ce04d0547d4d106fc72 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 28 Nov 2011 10:40:46 -0800 Subject: 9p: Reduce object size with CONFIG_NET_9P_DEBUG Reduce object size by deduplicating formats. Use vsprintf extension %pV. Rename P9_DPRINTK uses to p9_debug, align arguments. 
Add function for _p9_debug and macro to add __func__. Add missing "\n"s to p9_debug uses. Remove embedded function names as p9_debug adds it. Remove P9_EPRINTK macro and convert use to pr_. Add and use pr_fmt and pr_. $ size fs/9p/built-in.o* text data bss dec hex filename 62133 984 16000 79117 1350d fs/9p/built-in.o.new 67342 984 16928 85254 14d06 fs/9p/built-in.o.old $ size net/9p/built-in.o* text data bss dec hex filename 88792 4148 22024 114964 1c114 net/9p/built-in.o.new 94072 4148 23232 121452 1da6c net/9p/built-in.o.old Signed-off-by: Joe Perches Signed-off-by: Eric Van Hensbergen --- fs/9p/cache.c | 64 ++++++------- fs/9p/fid.c | 8 +- fs/9p/v9fs.c | 59 ++++++------ fs/9p/vfs_addr.c | 13 ++- fs/9p/vfs_dentry.c | 12 +-- fs/9p/vfs_dir.c | 13 ++- fs/9p/vfs_file.c | 31 +++---- fs/9p/vfs_inode.c | 86 +++++++++--------- fs/9p/vfs_inode_dotl.c | 85 +++++++++-------- fs/9p/vfs_super.c | 12 +-- fs/9p/xattr.c | 16 ++-- include/net/9p/9p.h | 28 ++---- net/9p/client.c | 242 ++++++++++++++++++++++++------------------------- net/9p/error.c | 6 +- net/9p/mod.c | 31 ++++++- net/9p/protocol.c | 8 +- net/9p/trans_fd.c | 113 ++++++++++++----------- net/9p/trans_rdma.c | 26 +++--- net/9p/trans_virtio.c | 34 +++---- net/9p/util.c | 4 +- 20 files changed, 452 insertions(+), 439 deletions(-) (limited to 'fs') diff --git a/fs/9p/cache.c b/fs/9p/cache.c index 945aa5f02f9..a9ea73d6dcf 100644 --- a/fs/9p/cache.c +++ b/fs/9p/cache.c @@ -62,8 +62,8 @@ static uint16_t v9fs_cache_session_get_key(const void *cookie_netfs_data, uint16_t klen = 0; v9ses = (struct v9fs_session_info *)cookie_netfs_data; - P9_DPRINTK(P9_DEBUG_FSC, "session %p buf %p size %u", v9ses, - buffer, bufmax); + p9_debug(P9_DEBUG_FSC, "session %p buf %p size %u\n", + v9ses, buffer, bufmax); if (v9ses->cachetag) klen = strlen(v9ses->cachetag); @@ -72,7 +72,7 @@ static uint16_t v9fs_cache_session_get_key(const void *cookie_netfs_data, return 0; memcpy(buffer, v9ses->cachetag, klen); - P9_DPRINTK(P9_DEBUG_FSC, "cache session tag %s", v9ses->cachetag); + p9_debug(P9_DEBUG_FSC, "cache session tag %s\n", v9ses->cachetag); return klen; } @@ -91,14 +91,14 @@ void v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses) v9ses->fscache = fscache_acquire_cookie(v9fs_cache_netfs.primary_index, &v9fs_cache_session_index_def, v9ses); - P9_DPRINTK(P9_DEBUG_FSC, "session %p get cookie %p", v9ses, - v9ses->fscache); + p9_debug(P9_DEBUG_FSC, "session %p get cookie %p\n", + v9ses, v9ses->fscache); } void v9fs_cache_session_put_cookie(struct v9fs_session_info *v9ses) { - P9_DPRINTK(P9_DEBUG_FSC, "session %p put cookie %p", v9ses, - v9ses->fscache); + p9_debug(P9_DEBUG_FSC, "session %p put cookie %p\n", + v9ses, v9ses->fscache); fscache_relinquish_cookie(v9ses->fscache, 0); v9ses->fscache = NULL; } @@ -109,8 +109,8 @@ static uint16_t v9fs_cache_inode_get_key(const void *cookie_netfs_data, { const struct v9fs_inode *v9inode = cookie_netfs_data; memcpy(buffer, &v9inode->qid.path, sizeof(v9inode->qid.path)); - P9_DPRINTK(P9_DEBUG_FSC, "inode %p get key %llu", &v9inode->vfs_inode, - v9inode->qid.path); + p9_debug(P9_DEBUG_FSC, "inode %p get key %llu\n", + &v9inode->vfs_inode, v9inode->qid.path); return sizeof(v9inode->qid.path); } @@ -120,8 +120,8 @@ static void v9fs_cache_inode_get_attr(const void *cookie_netfs_data, const struct v9fs_inode *v9inode = cookie_netfs_data; *size = i_size_read(&v9inode->vfs_inode); - P9_DPRINTK(P9_DEBUG_FSC, "inode %p get attr %llu", &v9inode->vfs_inode, - *size); + p9_debug(P9_DEBUG_FSC, "inode %p get attr %llu\n", + 
&v9inode->vfs_inode, *size); } static uint16_t v9fs_cache_inode_get_aux(const void *cookie_netfs_data, @@ -129,8 +129,8 @@ static uint16_t v9fs_cache_inode_get_aux(const void *cookie_netfs_data, { const struct v9fs_inode *v9inode = cookie_netfs_data; memcpy(buffer, &v9inode->qid.version, sizeof(v9inode->qid.version)); - P9_DPRINTK(P9_DEBUG_FSC, "inode %p get aux %u", &v9inode->vfs_inode, - v9inode->qid.version); + p9_debug(P9_DEBUG_FSC, "inode %p get aux %u\n", + &v9inode->vfs_inode, v9inode->qid.version); return sizeof(v9inode->qid.version); } @@ -206,8 +206,8 @@ void v9fs_cache_inode_get_cookie(struct inode *inode) &v9fs_cache_inode_index_def, v9inode); - P9_DPRINTK(P9_DEBUG_FSC, "inode %p get cookie %p", inode, - v9inode->fscache); + p9_debug(P9_DEBUG_FSC, "inode %p get cookie %p\n", + inode, v9inode->fscache); } void v9fs_cache_inode_put_cookie(struct inode *inode) @@ -216,8 +216,8 @@ void v9fs_cache_inode_put_cookie(struct inode *inode) if (!v9inode->fscache) return; - P9_DPRINTK(P9_DEBUG_FSC, "inode %p put cookie %p", inode, - v9inode->fscache); + p9_debug(P9_DEBUG_FSC, "inode %p put cookie %p\n", + inode, v9inode->fscache); fscache_relinquish_cookie(v9inode->fscache, 0); v9inode->fscache = NULL; @@ -229,8 +229,8 @@ void v9fs_cache_inode_flush_cookie(struct inode *inode) if (!v9inode->fscache) return; - P9_DPRINTK(P9_DEBUG_FSC, "inode %p flush cookie %p", inode, - v9inode->fscache); + p9_debug(P9_DEBUG_FSC, "inode %p flush cookie %p\n", + inode, v9inode->fscache); fscache_relinquish_cookie(v9inode->fscache, 1); v9inode->fscache = NULL; @@ -272,8 +272,8 @@ void v9fs_cache_inode_reset_cookie(struct inode *inode) v9inode->fscache = fscache_acquire_cookie(v9ses->fscache, &v9fs_cache_inode_index_def, v9inode); - P9_DPRINTK(P9_DEBUG_FSC, "inode %p revalidating cookie old %p new %p", - inode, old, v9inode->fscache); + p9_debug(P9_DEBUG_FSC, "inode %p revalidating cookie old %p new %p\n", + inode, old, v9inode->fscache); spin_unlock(&v9inode->fscache_lock); } @@ -323,7 +323,7 @@ int __v9fs_readpage_from_fscache(struct inode *inode, struct page *page) int ret; const struct v9fs_inode *v9inode = V9FS_I(inode); - P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page); + p9_debug(P9_DEBUG_FSC, "inode %p page %p\n", inode, page); if (!v9inode->fscache) return -ENOBUFS; @@ -335,13 +335,13 @@ int __v9fs_readpage_from_fscache(struct inode *inode, struct page *page) switch (ret) { case -ENOBUFS: case -ENODATA: - P9_DPRINTK(P9_DEBUG_FSC, "page/inode not in cache %d", ret); + p9_debug(P9_DEBUG_FSC, "page/inode not in cache %d\n", ret); return 1; case 0: - P9_DPRINTK(P9_DEBUG_FSC, "BIO submitted"); + p9_debug(P9_DEBUG_FSC, "BIO submitted\n"); return ret; default: - P9_DPRINTK(P9_DEBUG_FSC, "ret %d", ret); + p9_debug(P9_DEBUG_FSC, "ret %d\n", ret); return ret; } } @@ -361,7 +361,7 @@ int __v9fs_readpages_from_fscache(struct inode *inode, int ret; const struct v9fs_inode *v9inode = V9FS_I(inode); - P9_DPRINTK(P9_DEBUG_FSC, "inode %p pages %u", inode, *nr_pages); + p9_debug(P9_DEBUG_FSC, "inode %p pages %u\n", inode, *nr_pages); if (!v9inode->fscache) return -ENOBUFS; @@ -373,15 +373,15 @@ int __v9fs_readpages_from_fscache(struct inode *inode, switch (ret) { case -ENOBUFS: case -ENODATA: - P9_DPRINTK(P9_DEBUG_FSC, "pages/inodes not in cache %d", ret); + p9_debug(P9_DEBUG_FSC, "pages/inodes not in cache %d\n", ret); return 1; case 0: BUG_ON(!list_empty(pages)); BUG_ON(*nr_pages != 0); - P9_DPRINTK(P9_DEBUG_FSC, "BIO submitted"); + p9_debug(P9_DEBUG_FSC, "BIO submitted\n"); return ret; default: - 
P9_DPRINTK(P9_DEBUG_FSC, "ret %d", ret); + p9_debug(P9_DEBUG_FSC, "ret %d\n", ret); return ret; } } @@ -396,9 +396,9 @@ void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page) int ret; const struct v9fs_inode *v9inode = V9FS_I(inode); - P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page); + p9_debug(P9_DEBUG_FSC, "inode %p page %p\n", inode, page); ret = fscache_write_page(v9inode->fscache, page, GFP_KERNEL); - P9_DPRINTK(P9_DEBUG_FSC, "ret = %d", ret); + p9_debug(P9_DEBUG_FSC, "ret = %d\n", ret); if (ret != 0) v9fs_uncache_page(inode, page); } @@ -409,7 +409,7 @@ void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page) void __v9fs_fscache_wait_on_page_write(struct inode *inode, struct page *page) { const struct v9fs_inode *v9inode = V9FS_I(inode); - P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page); + p9_debug(P9_DEBUG_FSC, "inode %p page %p\n", inode, page); if (PageFsCache(page)) fscache_wait_on_page_write(v9inode->fscache, page); } diff --git a/fs/9p/fid.c b/fs/9p/fid.c index 85b67ffa2a4..da8eefbe830 100644 --- a/fs/9p/fid.c +++ b/fs/9p/fid.c @@ -45,8 +45,8 @@ int v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid) { struct v9fs_dentry *dent; - P9_DPRINTK(P9_DEBUG_VFS, "fid %d dentry %s\n", - fid->fid, dentry->d_name.name); + p9_debug(P9_DEBUG_VFS, "fid %d dentry %s\n", + fid->fid, dentry->d_name.name); dent = dentry->d_fsdata; if (!dent) { @@ -79,8 +79,8 @@ static struct p9_fid *v9fs_fid_find(struct dentry *dentry, u32 uid, int any) struct v9fs_dentry *dent; struct p9_fid *fid, *ret; - P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p) uid %d any %d\n", - dentry->d_name.name, dentry, uid, any); + p9_debug(P9_DEBUG_VFS, " dentry: %s (%p) uid %d any %d\n", + dentry->d_name.name, dentry, uid, any); dent = (struct v9fs_dentry *) dentry->d_fsdata; ret = NULL; if (dent) { diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 2b78014a124..1964f98e74b 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -23,6 +23,8 @@ * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -85,15 +87,15 @@ static int get_cache_mode(char *s) if (!strcmp(s, "loose")) { version = CACHE_LOOSE; - P9_DPRINTK(P9_DEBUG_9P, "Cache mode: loose\n"); + p9_debug(P9_DEBUG_9P, "Cache mode: loose\n"); } else if (!strcmp(s, "fscache")) { version = CACHE_FSCACHE; - P9_DPRINTK(P9_DEBUG_9P, "Cache mode: fscache\n"); + p9_debug(P9_DEBUG_9P, "Cache mode: fscache\n"); } else if (!strcmp(s, "none")) { version = CACHE_NONE; - P9_DPRINTK(P9_DEBUG_9P, "Cache mode: none\n"); + p9_debug(P9_DEBUG_9P, "Cache mode: none\n"); } else - printk(KERN_INFO "9p: Unknown Cache mode %s.\n", s); + pr_info("Unknown Cache mode %s\n", s); return version; } @@ -140,8 +142,8 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) case Opt_debug: r = match_int(&args[0], &option); if (r < 0) { - P9_DPRINTK(P9_DEBUG_ERROR, - "integer field, but no integer?\n"); + p9_debug(P9_DEBUG_ERROR, + "integer field, but no integer?\n"); ret = r; continue; } @@ -154,8 +156,8 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) case Opt_dfltuid: r = match_int(&args[0], &option); if (r < 0) { - P9_DPRINTK(P9_DEBUG_ERROR, - "integer field, but no integer?\n"); + p9_debug(P9_DEBUG_ERROR, + "integer field, but no integer?\n"); ret = r; continue; } @@ -164,8 +166,8 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) case Opt_dfltgid: r = match_int(&args[0], &option); if (r < 0) { - P9_DPRINTK(P9_DEBUG_ERROR, - "integer field, but no integer?\n"); + 
p9_debug(P9_DEBUG_ERROR, + "integer field, but no integer?\n"); ret = r; continue; } @@ -174,8 +176,8 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) case Opt_afid: r = match_int(&args[0], &option); if (r < 0) { - P9_DPRINTK(P9_DEBUG_ERROR, - "integer field, but no integer?\n"); + p9_debug(P9_DEBUG_ERROR, + "integer field, but no integer?\n"); ret = r; continue; } @@ -205,8 +207,8 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) s = match_strdup(&args[0]); if (!s) { ret = -ENOMEM; - P9_DPRINTK(P9_DEBUG_ERROR, - "problem allocating copy of cache arg\n"); + p9_debug(P9_DEBUG_ERROR, + "problem allocating copy of cache arg\n"); goto free_and_return; } ret = get_cache_mode(s); @@ -223,8 +225,8 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) s = match_strdup(&args[0]); if (!s) { ret = -ENOMEM; - P9_DPRINTK(P9_DEBUG_ERROR, - "problem allocating copy of access arg\n"); + p9_debug(P9_DEBUG_ERROR, + "problem allocating copy of access arg\n"); goto free_and_return; } @@ -240,8 +242,8 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) v9ses->uid = simple_strtoul(s, &e, 10); if (*e != '\0') { ret = -EINVAL; - printk(KERN_INFO "9p: Unknown access " - "argument %s.\n", s); + pr_info("Unknown access argument %s\n", + s); kfree(s); goto free_and_return; } @@ -254,9 +256,8 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) #ifdef CONFIG_9P_FS_POSIX_ACL v9ses->flags |= V9FS_POSIX_ACL; #else - P9_DPRINTK(P9_DEBUG_ERROR, - "Not defined CONFIG_9P_FS_POSIX_ACL. " - "Ignoring posixacl option\n"); + p9_debug(P9_DEBUG_ERROR, + "Not defined CONFIG_9P_FS_POSIX_ACL. Ignoring posixacl option\n"); #endif break; @@ -318,7 +319,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, if (IS_ERR(v9ses->clnt)) { retval = PTR_ERR(v9ses->clnt); v9ses->clnt = NULL; - P9_DPRINTK(P9_DEBUG_ERROR, "problem initializing 9p client\n"); + p9_debug(P9_DEBUG_ERROR, "problem initializing 9p client\n"); goto error; } @@ -371,7 +372,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, if (IS_ERR(fid)) { retval = PTR_ERR(fid); fid = NULL; - P9_DPRINTK(P9_DEBUG_ERROR, "cannot attach\n"); + p9_debug(P9_DEBUG_ERROR, "cannot attach\n"); goto error; } @@ -429,7 +430,7 @@ void v9fs_session_close(struct v9fs_session_info *v9ses) */ void v9fs_session_cancel(struct v9fs_session_info *v9ses) { - P9_DPRINTK(P9_DEBUG_ERROR, "cancel session %p\n", v9ses); + p9_debug(P9_DEBUG_ERROR, "cancel session %p\n", v9ses); p9_client_disconnect(v9ses->clnt); } @@ -442,7 +443,7 @@ void v9fs_session_cancel(struct v9fs_session_info *v9ses) { void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses) { - P9_DPRINTK(P9_DEBUG_ERROR, "begin cancel session %p\n", v9ses); + p9_debug(P9_DEBUG_ERROR, "begin cancel session %p\n", v9ses); p9_client_begin_disconnect(v9ses->clnt); } @@ -591,23 +592,23 @@ static void v9fs_cache_unregister(void) static int __init init_v9fs(void) { int err; - printk(KERN_INFO "Installing v9fs 9p2000 file system support\n"); + pr_info("Installing v9fs 9p2000 file system support\n"); /* TODO: Setup list of registered trasnport modules */ err = register_filesystem(&v9fs_fs_type); if (err < 0) { - printk(KERN_ERR "Failed to register filesystem\n"); + pr_err("Failed to register filesystem\n"); return err; } err = v9fs_cache_register(); if (err < 0) { - printk(KERN_ERR "Failed to register v9fs for caching\n"); + pr_err("Failed to register v9fs for caching\n"); goto 
out_fs_unreg; } err = v9fs_sysfs_init(); if (err < 0) { - printk(KERN_ERR "Failed to register with sysfs\n"); + pr_err("Failed to register with sysfs\n"); goto out_sysfs_cleanup; } diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index 2524e4cbb8e..0ad61c6a65a 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -56,7 +56,7 @@ static int v9fs_fid_readpage(struct p9_fid *fid, struct page *page) struct inode *inode; inode = page->mapping->host; - P9_DPRINTK(P9_DEBUG_VFS, "\n"); + p9_debug(P9_DEBUG_VFS, "\n"); BUG_ON(!PageLocked(page)); @@ -116,14 +116,14 @@ static int v9fs_vfs_readpages(struct file *filp, struct address_space *mapping, struct inode *inode; inode = mapping->host; - P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, filp); + p9_debug(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, filp); ret = v9fs_readpages_from_fscache(inode, mapping, pages, &nr_pages); if (ret == 0) return ret; ret = read_cache_pages(mapping, pages, (void *)v9fs_vfs_readpage, filp); - P9_DPRINTK(P9_DEBUG_VFS, " = %d\n", ret); + p9_debug(P9_DEBUG_VFS, " = %d\n", ret); return ret; } @@ -263,10 +263,9 @@ v9fs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, * Now that we do caching with cache mode enabled, We need * to support direct IO */ - P9_DPRINTK(P9_DEBUG_VFS, "v9fs_direct_IO: v9fs_direct_IO (%s) " - "off/no(%lld/%lu) EINVAL\n", - iocb->ki_filp->f_path.dentry->d_name.name, - (long long) pos, nr_segs); + p9_debug(P9_DEBUG_VFS, "v9fs_direct_IO: v9fs_direct_IO (%s) off/no(%lld/%lu) EINVAL\n", + iocb->ki_filp->f_path.dentry->d_name.name, + (long long)pos, nr_segs); return -EINVAL; } diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c index e022890c6f4..d529437ff44 100644 --- a/fs/9p/vfs_dentry.c +++ b/fs/9p/vfs_dentry.c @@ -53,8 +53,8 @@ static int v9fs_dentry_delete(const struct dentry *dentry) { - P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_name.name, - dentry); + p9_debug(P9_DEBUG_VFS, " dentry: %s (%p)\n", + dentry->d_name.name, dentry); return 1; } @@ -66,8 +66,8 @@ static int v9fs_dentry_delete(const struct dentry *dentry) */ static int v9fs_cached_dentry_delete(const struct dentry *dentry) { - P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", - dentry->d_name.name, dentry); + p9_debug(P9_DEBUG_VFS, " dentry: %s (%p)\n", + dentry->d_name.name, dentry); /* Don't cache negative dentries */ if (!dentry->d_inode) @@ -86,8 +86,8 @@ static void v9fs_dentry_release(struct dentry *dentry) struct v9fs_dentry *dent; struct p9_fid *temp, *current_fid; - P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_name.name, - dentry); + p9_debug(P9_DEBUG_VFS, " dentry: %s (%p)\n", + dentry->d_name.name, dentry); dent = dentry->d_fsdata; if (dent) { list_for_each_entry_safe(current_fid, temp, &dent->fidlist, diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index 598fff1a54e..ff911e77965 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -140,7 +140,7 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) int reclen = 0; struct p9_rdir *rdir; - P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name); + p9_debug(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name); fid = filp->private_data; buflen = fid->clnt->msize - P9_IOHDRSZ; @@ -168,7 +168,7 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) err = p9stat_read(fid->clnt, rdir->buf + rdir->head, rdir->tail - rdir->head, &st); if (err) { - P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err); + p9_debug(P9_DEBUG_VFS, "returned %d\n", err); err = -EIO; 
p9stat_free(&st); goto unlock_and_exit; @@ -213,7 +213,7 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent, struct p9_dirent curdirent; u64 oldoffset = 0; - P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name); + p9_debug(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name); fid = filp->private_data; buflen = fid->clnt->msize - P9_READDIRHDRSZ; @@ -244,7 +244,7 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent, rdir->tail - rdir->head, &curdirent); if (err < 0) { - P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err); + p9_debug(P9_DEBUG_VFS, "returned %d\n", err); err = -EIO; goto unlock_and_exit; } @@ -290,9 +290,8 @@ int v9fs_dir_release(struct inode *inode, struct file *filp) struct p9_fid *fid; fid = filp->private_data; - P9_DPRINTK(P9_DEBUG_VFS, - "v9fs_dir_release: inode: %p filp: %p fid: %d\n", - inode, filp, fid ? fid->fid : -1); + p9_debug(P9_DEBUG_VFS, "inode: %p filp: %p fid: %d\n", + inode, filp, fid ? fid->fid : -1); if (fid) p9_client_clunk(fid); return 0; diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index a7ac45d5bd2..fc06fd27065 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -61,7 +61,7 @@ int v9fs_file_open(struct inode *inode, struct file *file) struct p9_fid *fid; int omode; - P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, file); + p9_debug(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, file); v9inode = V9FS_I(inode); v9ses = v9fs_inode2v9ses(inode); if (v9fs_proto_dotl(v9ses)) @@ -135,7 +135,7 @@ static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl) int res = 0; struct inode *inode = filp->f_path.dentry->d_inode; - P9_DPRINTK(P9_DEBUG_VFS, "filp: %p lock: %p\n", filp, fl); + p9_debug(P9_DEBUG_VFS, "filp: %p lock: %p\n", filp, fl); /* No mandatory locks */ if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK) @@ -305,8 +305,8 @@ static int v9fs_file_lock_dotl(struct file *filp, int cmd, struct file_lock *fl) struct inode *inode = filp->f_path.dentry->d_inode; int ret = -ENOLCK; - P9_DPRINTK(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n", filp, - cmd, fl, filp->f_path.dentry->d_name.name); + p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n", + filp, cmd, fl, filp->f_path.dentry->d_name.name); /* No mandatory locks */ if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK) @@ -341,8 +341,8 @@ static int v9fs_file_flock_dotl(struct file *filp, int cmd, struct inode *inode = filp->f_path.dentry->d_inode; int ret = -ENOLCK; - P9_DPRINTK(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n", filp, - cmd, fl, filp->f_path.dentry->d_name.name); + p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n", + filp, cmd, fl, filp->f_path.dentry->d_name.name); /* No mandatory locks */ if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK) @@ -385,8 +385,8 @@ v9fs_fid_readn(struct p9_fid *fid, char *data, char __user *udata, u32 count, { int n, total, size; - P9_DPRINTK(P9_DEBUG_VFS, "fid %d offset %llu count %d\n", fid->fid, - (long long unsigned) offset, count); + p9_debug(P9_DEBUG_VFS, "fid %d offset %llu count %d\n", + fid->fid, (long long unsigned)offset, count); n = 0; total = 0; size = fid->iounit ? fid->iounit : fid->clnt->msize - P9_IOHDRSZ; @@ -444,7 +444,7 @@ v9fs_file_read(struct file *filp, char __user *udata, size_t count, struct p9_fid *fid; size_t size; - P9_DPRINTK(P9_DEBUG_VFS, "count %zu offset %lld\n", count, *offset); + p9_debug(P9_DEBUG_VFS, "count %zu offset %lld\n", count, *offset); fid = filp->private_data; size = fid->iounit ? 
fid->iounit : fid->clnt->msize - P9_IOHDRSZ; @@ -471,8 +471,8 @@ v9fs_file_write_internal(struct inode *inode, struct p9_fid *fid, loff_t origin = *offset; unsigned long pg_start, pg_end; - P9_DPRINTK(P9_DEBUG_VFS, "data %p count %d offset %x\n", data, - (int)count, (int)*offset); + p9_debug(P9_DEBUG_VFS, "data %p count %d offset %x\n", + data, (int)count, (int)*offset); clnt = fid->clnt; do { @@ -553,7 +553,7 @@ static int v9fs_file_fsync(struct file *filp, loff_t start, loff_t end, return retval; mutex_lock(&inode->i_mutex); - P9_DPRINTK(P9_DEBUG_VFS, "filp %p datasync %x\n", filp, datasync); + p9_debug(P9_DEBUG_VFS, "filp %p datasync %x\n", filp, datasync); fid = filp->private_data; v9fs_blank_wstat(&wstat); @@ -576,8 +576,7 @@ int v9fs_file_fsync_dotl(struct file *filp, loff_t start, loff_t end, return retval; mutex_lock(&inode->i_mutex); - P9_DPRINTK(P9_DEBUG_VFS, "v9fs_file_fsync_dotl: filp %p datasync %x\n", - filp, datasync); + p9_debug(P9_DEBUG_VFS, "filp %p datasync %x\n", filp, datasync); fid = filp->private_data; @@ -608,8 +607,8 @@ v9fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) struct inode *inode = filp->f_path.dentry->d_inode; - P9_DPRINTK(P9_DEBUG_VFS, "page %p fid %lx\n", - page, (unsigned long)filp->private_data); + p9_debug(P9_DEBUG_VFS, "page %p fid %lx\n", + page, (unsigned long)filp->private_data); v9inode = V9FS_I(inode); /* make sure the cache has finished storing the page */ diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 879ed885173..85c29733d8b 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -23,6 +23,8 @@ * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -133,9 +135,8 @@ static int p9mode2unixmode(struct v9fs_session_info *v9ses, res |= S_IFBLK; break; default: - P9_DPRINTK(P9_DEBUG_ERROR, - "Unknown special type %c %s\n", type, - stat->extension); + p9_debug(P9_DEBUG_ERROR, "Unknown special type %c %s\n", + type, stat->extension); }; *rdev = MKDEV(major, minor); } else @@ -281,8 +282,8 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses, } else if (v9fs_proto_dotu(v9ses)) { inode->i_op = &v9fs_file_inode_operations; } else { - P9_DPRINTK(P9_DEBUG_ERROR, - "special files without extended mode\n"); + p9_debug(P9_DEBUG_ERROR, + "special files without extended mode\n"); err = -EINVAL; goto error; } @@ -307,8 +308,8 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses, break; case S_IFLNK: if (!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses)) { - P9_DPRINTK(P9_DEBUG_ERROR, "extended modes used with " - "legacy protocol.\n"); + p9_debug(P9_DEBUG_ERROR, + "extended modes used with legacy protocol\n"); err = -EINVAL; goto error; } @@ -335,8 +336,8 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses, break; default: - P9_DPRINTK(P9_DEBUG_ERROR, "BAD mode 0x%x S_IFMT 0x%x\n", - mode, mode & S_IFMT); + p9_debug(P9_DEBUG_ERROR, "BAD mode 0x%x S_IFMT 0x%x\n", + mode, mode & S_IFMT); err = -EINVAL; goto error; } @@ -358,11 +359,12 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode, dev_t rdev) struct inode *inode; struct v9fs_session_info *v9ses = sb->s_fs_info; - P9_DPRINTK(P9_DEBUG_VFS, "super block: %p mode: %o\n", sb, mode); + p9_debug(P9_DEBUG_VFS, "super block: %p mode: %o\n", sb, mode); inode = new_inode(sb); if (!inode) { - P9_EPRINTK(KERN_WARNING, "Problem allocating inode\n"); + pr_warn("%s (%d): Problem allocating inode\n", + __func__, task_pid_nr(current)); return ERR_PTR(-ENOMEM); } err = v9fs_init_inode(v9ses, inode, mode, rdev); @@ -578,15 +580,15 @@ static int 
v9fs_remove(struct inode *dir, struct dentry *dentry, int flags) struct p9_fid *v9fid, *dfid; struct v9fs_session_info *v9ses; - P9_DPRINTK(P9_DEBUG_VFS, "inode: %p dentry: %p rmdir: %x\n", - dir, dentry, flags); + p9_debug(P9_DEBUG_VFS, "inode: %p dentry: %p rmdir: %x\n", + dir, dentry, flags); v9ses = v9fs_inode2v9ses(dir); inode = dentry->d_inode; dfid = v9fs_fid_lookup(dentry->d_parent); if (IS_ERR(dfid)) { retval = PTR_ERR(dfid); - P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", retval); + p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", retval); return retval; } if (v9fs_proto_dotl(v9ses)) @@ -635,7 +637,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir, struct p9_fid *dfid, *ofid, *fid; struct inode *inode; - P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name); + p9_debug(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name); err = 0; ofid = NULL; @@ -644,7 +646,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir, dfid = v9fs_fid_lookup(dentry->d_parent); if (IS_ERR(dfid)) { err = PTR_ERR(dfid); - P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err); + p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err); return ERR_PTR(err); } @@ -652,13 +654,13 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir, ofid = p9_client_walk(dfid, 0, NULL, 1); if (IS_ERR(ofid)) { err = PTR_ERR(ofid); - P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); + p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); return ERR_PTR(err); } err = p9_client_fcreate(ofid, name, perm, mode, extension); if (err < 0) { - P9_DPRINTK(P9_DEBUG_VFS, "p9_client_fcreate failed %d\n", err); + p9_debug(P9_DEBUG_VFS, "p9_client_fcreate failed %d\n", err); goto error; } @@ -666,7 +668,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir, fid = p9_client_walk(dfid, 1, &name, 1); if (IS_ERR(fid)) { err = PTR_ERR(fid); - P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); + p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); fid = NULL; goto error; } @@ -675,7 +677,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir, inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); if (IS_ERR(inode)) { err = PTR_ERR(inode); - P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err); + p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n", err); goto error; } err = v9fs_fid_add(dentry, fid); @@ -793,7 +795,7 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) struct p9_fid *fid; struct v9fs_session_info *v9ses; - P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name); + p9_debug(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name); err = 0; v9ses = v9fs_inode2v9ses(dir); perm = unixmode2p9mode(v9ses, mode | S_IFDIR); @@ -831,8 +833,8 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, char *name; int result = 0; - P9_DPRINTK(P9_DEBUG_VFS, "dir: %p dentry: (%s) %p nameidata: %p\n", - dir, dentry->d_name.name, dentry, nameidata); + p9_debug(P9_DEBUG_VFS, "dir: %p dentry: (%s) %p nameidata: %p\n", + dir, dentry->d_name.name, dentry, nameidata); if (dentry->d_name.len > NAME_MAX) return ERR_PTR(-ENAMETOOLONG); @@ -938,7 +940,7 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct p9_fid *newdirfid; struct p9_wstat wstat; - P9_DPRINTK(P9_DEBUG_VFS, "\n"); + p9_debug(P9_DEBUG_VFS, "\n"); retval = 0; old_inode = old_dentry->d_inode; new_inode = new_dentry->d_inode; @@ -974,8 +976,7 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, * 9P .u 
can only handle file rename in the same directory */ - P9_DPRINTK(P9_DEBUG_ERROR, - "old dir and new dir are different\n"); + p9_debug(P9_DEBUG_ERROR, "old dir and new dir are different\n"); retval = -EXDEV; goto clunk_newdir; } @@ -1031,7 +1032,7 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct p9_fid *fid; struct p9_wstat *st; - P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry); + p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry); err = -EPERM; v9ses = v9fs_dentry2v9ses(dentry); if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { @@ -1068,7 +1069,7 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr) struct p9_fid *fid; struct p9_wstat wstat; - P9_DPRINTK(P9_DEBUG_VFS, "\n"); + p9_debug(P9_DEBUG_VFS, "\n"); retval = inode_change_ok(dentry->d_inode, iattr); if (retval) return retval; @@ -1213,7 +1214,7 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen) struct p9_fid *fid; struct p9_wstat *st; - P9_DPRINTK(P9_DEBUG_VFS, " %s\n", dentry->d_name.name); + p9_debug(P9_DEBUG_VFS, " %s\n", dentry->d_name.name); retval = -EPERM; v9ses = v9fs_dentry2v9ses(dentry); fid = v9fs_fid_lookup(dentry); @@ -1235,8 +1236,8 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen) /* copy extension buffer into buffer */ strncpy(buffer, st->extension, buflen); - P9_DPRINTK(P9_DEBUG_VFS, - "%s -> %s (%s)\n", dentry->d_name.name, st->extension, buffer); + p9_debug(P9_DEBUG_VFS, "%s -> %s (%s)\n", + dentry->d_name.name, st->extension, buffer); retval = strnlen(buffer, buflen); done: @@ -1257,7 +1258,7 @@ static void *v9fs_vfs_follow_link(struct dentry *dentry, struct nameidata *nd) int len = 0; char *link = __getname(); - P9_DPRINTK(P9_DEBUG_VFS, "%s n", dentry->d_name.name); + p9_debug(P9_DEBUG_VFS, "%s\n", dentry->d_name.name); if (!link) link = ERR_PTR(-ENOMEM); @@ -1288,8 +1289,8 @@ v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void *p) { char *s = nd_get_link(nd); - P9_DPRINTK(P9_DEBUG_VFS, " %s %s\n", dentry->d_name.name, - IS_ERR(s) ? "" : s); + p9_debug(P9_DEBUG_VFS, " %s %s\n", + dentry->d_name.name, IS_ERR(s) ? 
"" : s); if (!IS_ERR(s)) __putname(s); } @@ -1312,7 +1313,7 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry, v9ses = v9fs_inode2v9ses(dir); if (!v9fs_proto_dotu(v9ses)) { - P9_DPRINTK(P9_DEBUG_ERROR, "not extended\n"); + p9_debug(P9_DEBUG_ERROR, "not extended\n"); return -EPERM; } @@ -1340,8 +1341,8 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry, static int v9fs_vfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) { - P9_DPRINTK(P9_DEBUG_VFS, " %lu,%s,%s\n", dir->i_ino, - dentry->d_name.name, symname); + p9_debug(P9_DEBUG_VFS, " %lu,%s,%s\n", + dir->i_ino, dentry->d_name.name, symname); return v9fs_vfs_mkspecial(dir, dentry, S_IFLNK, symname); } @@ -1362,9 +1363,8 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir, char *name; struct p9_fid *oldfid; - P9_DPRINTK(P9_DEBUG_VFS, - " %lu,%s,%s\n", dir->i_ino, dentry->d_name.name, - old_dentry->d_name.name); + p9_debug(P9_DEBUG_VFS, " %lu,%s,%s\n", + dir->i_ino, dentry->d_name.name, old_dentry->d_name.name); oldfid = v9fs_fid_clone(old_dentry); if (IS_ERR(oldfid)) @@ -1403,9 +1403,9 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) int retval; char *name; - P9_DPRINTK(P9_DEBUG_VFS, - " %lu,%s mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino, - dentry->d_name.name, mode, MAJOR(rdev), MINOR(rdev)); + p9_debug(P9_DEBUG_VFS, " %lu,%s mode: %x MAJOR: %u MINOR: %u\n", + dir->i_ino, dentry->d_name.name, mode, + MAJOR(rdev), MINOR(rdev)); if (!new_valid_dev(rdev)) return -EINVAL; diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 0b5745e2194..73488fb69d3 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -283,13 +283,13 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode, } name = (char *) dentry->d_name.name; - P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_create_dotl: name:%s flags:0x%x " - "mode:0x%x\n", name, flags, omode); + p9_debug(P9_DEBUG_VFS, "name:%s flags:0x%x mode:0x%x\n", + name, flags, omode); dfid = v9fs_fid_lookup(dentry->d_parent); if (IS_ERR(dfid)) { err = PTR_ERR(dfid); - P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err); + p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err); return err; } @@ -297,7 +297,7 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode, ofid = p9_client_walk(dfid, 0, NULL, 1); if (IS_ERR(ofid)) { err = PTR_ERR(ofid); - P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); + p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); return err; } @@ -307,16 +307,15 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode, /* Update mode based on ACL value */ err = v9fs_acl_mode(dir, &mode, &dacl, &pacl); if (err) { - P9_DPRINTK(P9_DEBUG_VFS, - "Failed to get acl values in creat %d\n", err); + p9_debug(P9_DEBUG_VFS, "Failed to get acl values in creat %d\n", + err); goto error; } err = p9_client_create_dotl(ofid, name, v9fs_open_to_dotl_flags(flags), mode, gid, &qid); if (err < 0) { - P9_DPRINTK(P9_DEBUG_VFS, - "p9_client_open_dotl failed in creat %d\n", - err); + p9_debug(P9_DEBUG_VFS, "p9_client_open_dotl failed in creat %d\n", + err); goto error; } v9fs_invalidate_inode_attr(dir); @@ -325,14 +324,14 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode, fid = p9_client_walk(dfid, 1, &name, 1); if (IS_ERR(fid)) { err = PTR_ERR(fid); - P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); + p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); fid = NULL; goto error; 
} inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); if (IS_ERR(inode)) { err = PTR_ERR(inode); - P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err); + p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n", err); goto error; } err = v9fs_fid_add(dentry, fid); @@ -408,7 +407,7 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir, struct dentry *dir_dentry; struct posix_acl *dacl = NULL, *pacl = NULL; - P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name); + p9_debug(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name); err = 0; v9ses = v9fs_inode2v9ses(dir); @@ -420,7 +419,7 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir, dfid = v9fs_fid_lookup(dir_dentry); if (IS_ERR(dfid)) { err = PTR_ERR(dfid); - P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err); + p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err); dfid = NULL; goto error; } @@ -430,8 +429,8 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir, /* Update mode based on ACL value */ err = v9fs_acl_mode(dir, &mode, &dacl, &pacl); if (err) { - P9_DPRINTK(P9_DEBUG_VFS, - "Failed to get acl values in mkdir %d\n", err); + p9_debug(P9_DEBUG_VFS, "Failed to get acl values in mkdir %d\n", + err); goto error; } name = (char *) dentry->d_name.name; @@ -444,8 +443,8 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir, fid = p9_client_walk(dfid, 1, &name, 1); if (IS_ERR(fid)) { err = PTR_ERR(fid); - P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", - err); + p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", + err); fid = NULL; goto error; } @@ -453,8 +452,8 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir, inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); if (IS_ERR(inode)) { err = PTR_ERR(inode); - P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", - err); + p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n", + err); goto error; } err = v9fs_fid_add(dentry, fid); @@ -495,7 +494,7 @@ v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry, struct p9_fid *fid; struct p9_stat_dotl *st; - P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry); + p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry); err = -EPERM; v9ses = v9fs_dentry2v9ses(dentry); if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { @@ -537,7 +536,7 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr) struct p9_fid *fid; struct p9_iattr_dotl p9attr; - P9_DPRINTK(P9_DEBUG_VFS, "\n"); + p9_debug(P9_DEBUG_VFS, "\n"); retval = inode_change_ok(dentry->d_inode, iattr); if (retval) @@ -670,14 +669,13 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry, struct v9fs_session_info *v9ses; name = (char *) dentry->d_name.name; - P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_symlink_dotl : %lu,%s,%s\n", - dir->i_ino, name, symname); + p9_debug(P9_DEBUG_VFS, "%lu,%s,%s\n", dir->i_ino, name, symname); v9ses = v9fs_inode2v9ses(dir); dfid = v9fs_fid_lookup(dentry->d_parent); if (IS_ERR(dfid)) { err = PTR_ERR(dfid); - P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err); + p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err); return err; } @@ -687,7 +685,7 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry, err = p9_client_symlink(dfid, name, (char *)symname, gid, &qid); if (err < 0) { - P9_DPRINTK(P9_DEBUG_VFS, "p9_client_symlink failed %d\n", err); + p9_debug(P9_DEBUG_VFS, "p9_client_symlink failed %d\n", err); goto error; } @@ -697,8 +695,8 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry, fid = p9_client_walk(dfid, 1, &name, 1); if (IS_ERR(fid)) { err = PTR_ERR(fid); - 
P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", - err); + p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", + err); fid = NULL; goto error; } @@ -707,8 +705,8 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry, inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); if (IS_ERR(inode)) { err = PTR_ERR(inode); - P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", - err); + p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n", + err); goto error; } err = v9fs_fid_add(dentry, fid); @@ -751,9 +749,8 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir, struct p9_fid *dfid, *oldfid; struct v9fs_session_info *v9ses; - P9_DPRINTK(P9_DEBUG_VFS, "dir ino: %lu, old_name: %s, new_name: %s\n", - dir->i_ino, old_dentry->d_name.name, - dentry->d_name.name); + p9_debug(P9_DEBUG_VFS, "dir ino: %lu, old_name: %s, new_name: %s\n", + dir->i_ino, old_dentry->d_name.name, dentry->d_name.name); v9ses = v9fs_inode2v9ses(dir); dir_dentry = v9fs_dentry_from_dir_inode(dir); @@ -770,7 +767,7 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir, err = p9_client_link(dfid, oldfid, (char *)dentry->d_name.name); if (err < 0) { - P9_DPRINTK(P9_DEBUG_VFS, "p9_client_link failed %d\n", err); + p9_debug(P9_DEBUG_VFS, "p9_client_link failed %d\n", err); return err; } @@ -813,9 +810,9 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode, struct dentry *dir_dentry; struct posix_acl *dacl = NULL, *pacl = NULL; - P9_DPRINTK(P9_DEBUG_VFS, - " %lu,%s mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino, - dentry->d_name.name, omode, MAJOR(rdev), MINOR(rdev)); + p9_debug(P9_DEBUG_VFS, " %lu,%s mode: %x MAJOR: %u MINOR: %u\n", + dir->i_ino, dentry->d_name.name, omode, + MAJOR(rdev), MINOR(rdev)); if (!new_valid_dev(rdev)) return -EINVAL; @@ -825,7 +822,7 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode, dfid = v9fs_fid_lookup(dir_dentry); if (IS_ERR(dfid)) { err = PTR_ERR(dfid); - P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err); + p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err); dfid = NULL; goto error; } @@ -835,8 +832,8 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode, /* Update mode based on ACL value */ err = v9fs_acl_mode(dir, &mode, &dacl, &pacl); if (err) { - P9_DPRINTK(P9_DEBUG_VFS, - "Failed to get acl values in mknod %d\n", err); + p9_debug(P9_DEBUG_VFS, "Failed to get acl values in mknod %d\n", + err); goto error; } name = (char *) dentry->d_name.name; @@ -851,8 +848,8 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode, fid = p9_client_walk(dfid, 1, &name, 1); if (IS_ERR(fid)) { err = PTR_ERR(fid); - P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", - err); + p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", + err); fid = NULL; goto error; } @@ -860,8 +857,8 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode, inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); if (IS_ERR(inode)) { err = PTR_ERR(inode); - P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", - err); + p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n", + err); goto error; } err = v9fs_fid_add(dentry, fid); @@ -905,7 +902,7 @@ v9fs_vfs_follow_link_dotl(struct dentry *dentry, struct nameidata *nd) char *link = __getname(); char *target; - P9_DPRINTK(P9_DEBUG_VFS, "%s\n", dentry->d_name.name); + p9_debug(P9_DEBUG_VFS, "%s\n", dentry->d_name.name); if (!link) { link = ERR_PTR(-ENOMEM); diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 
c70251d47ed..06d1973598a 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -121,7 +121,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, struct p9_fid *fid; int retval = 0; - P9_DPRINTK(P9_DEBUG_VFS, " \n"); + p9_debug(P9_DEBUG_VFS, "\n"); v9ses = kzalloc(sizeof(struct v9fs_session_info), GFP_KERNEL); if (!v9ses) @@ -191,7 +191,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, goto release_sb; v9fs_fid_add(root, fid); - P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n"); + p9_debug(P9_DEBUG_VFS, " simple set mount, return 0\n"); return dget(sb->s_root); clunk_fid: @@ -223,7 +223,7 @@ static void v9fs_kill_super(struct super_block *s) { struct v9fs_session_info *v9ses = s->s_fs_info; - P9_DPRINTK(P9_DEBUG_VFS, " %p\n", s); + p9_debug(P9_DEBUG_VFS, " %p\n", s); kill_anon_super(s); @@ -231,7 +231,7 @@ static void v9fs_kill_super(struct super_block *s) v9fs_session_close(v9ses); kfree(v9ses); s->s_fs_info = NULL; - P9_DPRINTK(P9_DEBUG_VFS, "exiting kill_super\n"); + p9_debug(P9_DEBUG_VFS, "exiting kill_super\n"); } static void @@ -303,7 +303,7 @@ static int v9fs_write_inode(struct inode *inode, * send an fsync request to server irrespective of * wbc->sync_mode. */ - P9_DPRINTK(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode); + p9_debug(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode); v9inode = V9FS_I(inode); if (!v9inode->writeback_fid) return 0; @@ -326,7 +326,7 @@ static int v9fs_write_inode_dotl(struct inode *inode, * send an fsync request to server irrespective of * wbc->sync_mode. */ - P9_DPRINTK(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode); + p9_debug(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode); v9inode = V9FS_I(inode); if (!v9inode->writeback_fid) return 0; diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c index d288773871b..29653b70a9c 100644 --- a/fs/9p/xattr.c +++ b/fs/9p/xattr.c @@ -32,8 +32,8 @@ ssize_t v9fs_fid_xattr_get(struct p9_fid *fid, const char *name, attr_fid = p9_client_xattrwalk(fid, name, &attr_size); if (IS_ERR(attr_fid)) { retval = PTR_ERR(attr_fid); - P9_DPRINTK(P9_DEBUG_VFS, - "p9_client_attrwalk failed %zd\n", retval); + p9_debug(P9_DEBUG_VFS, "p9_client_attrwalk failed %zd\n", + retval); attr_fid = NULL; goto error; } @@ -87,8 +87,8 @@ ssize_t v9fs_xattr_get(struct dentry *dentry, const char *name, { struct p9_fid *fid; - P9_DPRINTK(P9_DEBUG_VFS, "%s: name = %s value_len = %zu\n", - __func__, name, buffer_size); + p9_debug(P9_DEBUG_VFS, "name = %s value_len = %zu\n", + name, buffer_size); fid = v9fs_fid_lookup(dentry); if (IS_ERR(fid)) return PTR_ERR(fid); @@ -115,8 +115,8 @@ int v9fs_xattr_set(struct dentry *dentry, const char *name, int retval, msize, write_count; struct p9_fid *fid = NULL; - P9_DPRINTK(P9_DEBUG_VFS, "%s: name = %s value_len = %zu flags = %d\n", - __func__, name, value_len, flags); + p9_debug(P9_DEBUG_VFS, "name = %s value_len = %zu flags = %d\n", + name, value_len, flags); fid = v9fs_fid_clone(dentry); if (IS_ERR(fid)) { @@ -129,8 +129,8 @@ int v9fs_xattr_set(struct dentry *dentry, const char *name, */ retval = p9_client_xattrcreate(fid, name, value_len, flags); if (retval < 0) { - P9_DPRINTK(P9_DEBUG_VFS, - "p9_client_xattrcreate failed %d\n", retval); + p9_debug(P9_DEBUG_VFS, "p9_client_xattrcreate failed %d\n", + retval); goto error; } msize = fid->clnt->msize; diff --git a/include/net/9p/9p.h b/include/net/9p/9p.h index 2d70b95b3b5..7184853ca36 100644 --- a/include/net/9p/9p.h +++ b/include/net/9p/9p.h @@ -63,30 +63,16 @@ enum p9_debug_flags { #ifdef 
CONFIG_NET_9P_DEBUG extern unsigned int p9_debug_level; - -#define P9_DPRINTK(level, format, arg...) \ -do { \ - if ((p9_debug_level & level) == level) {\ - if (level == P9_DEBUG_9P) \ - printk(KERN_NOTICE "(%8.8d) " \ - format , task_pid_nr(current) , ## arg); \ - else \ - printk(KERN_NOTICE "-- %s (%d): " \ - format , __func__, task_pid_nr(current) , ## arg); \ - } \ -} while (0) - +__printf(3, 4) +void _p9_debug(enum p9_debug_flags level, const char *func, + const char *fmt, ...); +#define p9_debug(level, fmt, ...) \ + _p9_debug(level, __func__, fmt, ##__VA_ARGS__) #else -#define P9_DPRINTK(level, format, arg...) do { } while (0) +#define p9_debug(level, fmt, ...) \ + no_printk(fmt, ##__VA_ARGS__) #endif - -#define P9_EPRINTK(level, format, arg...) \ -do { \ - printk(level "9p: %s (%d): " \ - format , __func__, task_pid_nr(current), ## arg); \ -} while (0) - /** * enum p9_msg_t - 9P message types * @P9_TLERROR: not used diff --git a/net/9p/client.c b/net/9p/client.c index 854ca7a911c..776618cd2be 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -23,6 +23,8 @@ * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -81,15 +83,15 @@ static int get_protocol_version(char *s) if (!strcmp(s, "9p2000")) { version = p9_proto_legacy; - P9_DPRINTK(P9_DEBUG_9P, "Protocol version: Legacy\n"); + p9_debug(P9_DEBUG_9P, "Protocol version: Legacy\n"); } else if (!strcmp(s, "9p2000.u")) { version = p9_proto_2000u; - P9_DPRINTK(P9_DEBUG_9P, "Protocol version: 9P2000.u\n"); + p9_debug(P9_DEBUG_9P, "Protocol version: 9P2000.u\n"); } else if (!strcmp(s, "9p2000.L")) { version = p9_proto_2000L; - P9_DPRINTK(P9_DEBUG_9P, "Protocol version: 9P2000.L\n"); + p9_debug(P9_DEBUG_9P, "Protocol version: 9P2000.L\n"); } else - printk(KERN_INFO "9p: Unknown protocol version %s.\n", s); + pr_info("Unknown protocol version %s\n", s); return version; } @@ -119,8 +121,8 @@ static int parse_opts(char *opts, struct p9_client *clnt) tmp_options = kstrdup(opts, GFP_KERNEL); if (!tmp_options) { - P9_DPRINTK(P9_DEBUG_ERROR, - "failed to allocate copy of option string\n"); + p9_debug(P9_DEBUG_ERROR, + "failed to allocate copy of option string\n"); return -ENOMEM; } options = tmp_options; @@ -134,8 +136,8 @@ static int parse_opts(char *opts, struct p9_client *clnt) case Opt_msize: r = match_int(&args[0], &option); if (r < 0) { - P9_DPRINTK(P9_DEBUG_ERROR, - "integer field, but no integer?\n"); + p9_debug(P9_DEBUG_ERROR, + "integer field, but no integer?\n"); ret = r; continue; } @@ -145,15 +147,14 @@ static int parse_opts(char *opts, struct p9_client *clnt) s = match_strdup(&args[0]); if (!s) { ret = -ENOMEM; - P9_DPRINTK(P9_DEBUG_ERROR, - "problem allocating copy of trans arg\n"); + p9_debug(P9_DEBUG_ERROR, + "problem allocating copy of trans arg\n"); goto free_and_return; } clnt->trans_mod = v9fs_get_trans_by_name(s); if (clnt->trans_mod == NULL) { - printk(KERN_INFO - "9p: Could not find " - "request transport: %s\n", s); + pr_info("Could not find request transport: %s\n", + s); ret = -EINVAL; kfree(s); goto free_and_return; @@ -167,8 +168,8 @@ static int parse_opts(char *opts, struct p9_client *clnt) s = match_strdup(&args[0]); if (!s) { ret = -ENOMEM; - P9_DPRINTK(P9_DEBUG_ERROR, - "problem allocating copy of version arg\n"); + p9_debug(P9_DEBUG_ERROR, + "problem allocating copy of version arg\n"); goto free_and_return; } ret = get_protocol_version(s); @@ -225,7 +226,7 @@ p9_tag_alloc(struct p9_client *c, u16 tag, unsigned int max_size) sizeof(struct p9_req_t), GFP_ATOMIC); if (!c->reqs[row]) { - 
printk(KERN_ERR "Couldn't grow tag array\n"); + pr_err("Couldn't grow tag array\n"); spin_unlock_irqrestore(&c->lock, flags); return ERR_PTR(-ENOMEM); } @@ -244,7 +245,7 @@ p9_tag_alloc(struct p9_client *c, u16 tag, unsigned int max_size) if (!req->tc) { req->wq = kmalloc(sizeof(wait_queue_head_t), GFP_NOFS); if (!req->wq) { - printk(KERN_ERR "Couldn't grow tag array\n"); + pr_err("Couldn't grow tag array\n"); return ERR_PTR(-ENOMEM); } init_waitqueue_head(req->wq); @@ -253,7 +254,7 @@ p9_tag_alloc(struct p9_client *c, u16 tag, unsigned int max_size) req->rc = kmalloc(sizeof(struct p9_fcall) + alloc_msize, GFP_NOFS); if ((!req->tc) || (!req->rc)) { - printk(KERN_ERR "Couldn't grow tag array\n"); + pr_err("Couldn't grow tag array\n"); kfree(req->tc); kfree(req->rc); kfree(req->wq); @@ -343,9 +344,9 @@ static void p9_tag_cleanup(struct p9_client *c) for (row = 0; row < (c->max_tag/P9_ROW_MAXTAG); row++) { for (col = 0; col < P9_ROW_MAXTAG; col++) { if (c->reqs[row][col].status != REQ_STATUS_IDLE) { - P9_DPRINTK(P9_DEBUG_MUX, - "Attempting to cleanup non-free tag %d,%d\n", - row, col); + p9_debug(P9_DEBUG_MUX, + "Attempting to cleanup non-free tag %d,%d\n", + row, col); /* TODO: delay execution of cleanup */ return; } @@ -379,7 +380,7 @@ static void p9_tag_cleanup(struct p9_client *c) static void p9_free_req(struct p9_client *c, struct p9_req_t *r) { int tag = r->tc->tag; - P9_DPRINTK(P9_DEBUG_MUX, "clnt %p req %p tag: %d\n", c, r, tag); + p9_debug(P9_DEBUG_MUX, "clnt %p req %p tag: %d\n", c, r, tag); r->status = REQ_STATUS_IDLE; if (tag != P9_NOTAG && p9_idpool_check(tag, c->tagpool)) @@ -394,9 +395,9 @@ static void p9_free_req(struct p9_client *c, struct p9_req_t *r) */ void p9_client_cb(struct p9_client *c, struct p9_req_t *req) { - P9_DPRINTK(P9_DEBUG_MUX, " tag %d\n", req->tc->tag); + p9_debug(P9_DEBUG_MUX, " tag %d\n", req->tc->tag); wake_up(req->wq); - P9_DPRINTK(P9_DEBUG_MUX, "wakeup: %d\n", req->tc->tag); + p9_debug(P9_DEBUG_MUX, "wakeup: %d\n", req->tc->tag); } EXPORT_SYMBOL(p9_client_cb); @@ -431,8 +432,8 @@ p9_parse_header(struct p9_fcall *pdu, int32_t *size, int8_t *type, int16_t *tag, pdu->id = r_type; pdu->tag = r_tag; - P9_DPRINTK(P9_DEBUG_9P, "<<< size=%d type: %d tag: %d\n", pdu->size, - pdu->id, pdu->tag); + p9_debug(P9_DEBUG_9P, "<<< size=%d type: %d tag: %d\n", + pdu->size, pdu->id, pdu->tag); if (type) *type = r_type; @@ -473,7 +474,7 @@ static int p9_check_errors(struct p9_client *c, struct p9_req_t *req) */ trace_9p_protocol_dump(c, req->rc); if (err) { - P9_DPRINTK(P9_DEBUG_ERROR, "couldn't parse header %d\n", err); + p9_debug(P9_DEBUG_ERROR, "couldn't parse header %d\n", err); return err; } if (type != P9_RERROR && type != P9_RLERROR) @@ -492,21 +493,21 @@ static int p9_check_errors(struct p9_client *c, struct p9_req_t *req) if (!err || !IS_ERR_VALUE(err)) { err = p9_errstr2errno(ename, strlen(ename)); - P9_DPRINTK(P9_DEBUG_9P, "<<< RERROR (%d) %s\n", - -ecode, ename); + p9_debug(P9_DEBUG_9P, "<<< RERROR (%d) %s\n", + -ecode, ename); } kfree(ename); } else { err = p9pdu_readf(req->rc, c->proto_version, "d", &ecode); err = -ecode; - P9_DPRINTK(P9_DEBUG_9P, "<<< RLERROR (%d)\n", -ecode); + p9_debug(P9_DEBUG_9P, "<<< RLERROR (%d)\n", -ecode); } return err; out_err: - P9_DPRINTK(P9_DEBUG_ERROR, "couldn't parse error%d\n", err); + p9_debug(P9_DEBUG_ERROR, "couldn't parse error%d\n", err); return err; } @@ -538,7 +539,7 @@ static int p9_check_zc_errors(struct p9_client *c, struct p9_req_t *req, */ trace_9p_protocol_dump(c, req->rc); if (err) { - P9_DPRINTK(P9_DEBUG_ERROR, 
"couldn't parse header %d\n", err); + p9_debug(P9_DEBUG_ERROR, "couldn't parse header %d\n", err); return err; } @@ -601,22 +602,22 @@ static int p9_check_zc_errors(struct p9_client *c, struct p9_req_t *req, if (!err || !IS_ERR_VALUE(err)) { err = p9_errstr2errno(ename, strlen(ename)); - P9_DPRINTK(P9_DEBUG_9P, "<<< RERROR (%d) %s\n", - -ecode, ename); + p9_debug(P9_DEBUG_9P, "<<< RERROR (%d) %s\n", + -ecode, ename); } kfree(ename); } else { err = p9pdu_readf(req->rc, c->proto_version, "d", &ecode); err = -ecode; - P9_DPRINTK(P9_DEBUG_9P, "<<< RLERROR (%d)\n", -ecode); + p9_debug(P9_DEBUG_9P, "<<< RLERROR (%d)\n", -ecode); } return err; out_free: kfree(ename); out_err: - P9_DPRINTK(P9_DEBUG_ERROR, "couldn't parse error%d\n", err); + p9_debug(P9_DEBUG_ERROR, "couldn't parse error%d\n", err); return err; } @@ -645,7 +646,7 @@ static int p9_client_flush(struct p9_client *c, struct p9_req_t *oldreq) if (err) return err; - P9_DPRINTK(P9_DEBUG_9P, ">>> TFLUSH tag %d\n", oldtag); + p9_debug(P9_DEBUG_9P, ">>> TFLUSH tag %d\n", oldtag); req = p9_client_rpc(c, P9_TFLUSH, "w", oldtag); if (IS_ERR(req)) @@ -670,7 +671,7 @@ static struct p9_req_t *p9_client_prepare_req(struct p9_client *c, int tag, err; struct p9_req_t *req; - P9_DPRINTK(P9_DEBUG_MUX, "client %p op %d\n", c, type); + p9_debug(P9_DEBUG_MUX, "client %p op %d\n", c, type); /* we allow for any status other than disconnected */ if (c->status == Disconnected) @@ -744,11 +745,11 @@ p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...) req->status >= REQ_STATUS_RCVD); if (req->status == REQ_STATUS_ERROR) { - P9_DPRINTK(P9_DEBUG_ERROR, "req_status error %d\n", req->t_err); + p9_debug(P9_DEBUG_ERROR, "req_status error %d\n", req->t_err); err = req->t_err; } if ((err == -ERESTARTSYS) && (c->status == Connected)) { - P9_DPRINTK(P9_DEBUG_MUX, "flushing\n"); + p9_debug(P9_DEBUG_MUX, "flushing\n"); sigpending = 1; clear_thread_flag(TIF_SIGPENDING); @@ -827,11 +828,11 @@ static struct p9_req_t *p9_client_zc_rpc(struct p9_client *c, int8_t type, goto reterr; } if (req->status == REQ_STATUS_ERROR) { - P9_DPRINTK(P9_DEBUG_ERROR, "req_status error %d\n", req->t_err); + p9_debug(P9_DEBUG_ERROR, "req_status error %d\n", req->t_err); err = req->t_err; } if ((err == -ERESTARTSYS) && (c->status == Connected)) { - P9_DPRINTK(P9_DEBUG_MUX, "flushing\n"); + p9_debug(P9_DEBUG_MUX, "flushing\n"); sigpending = 1; clear_thread_flag(TIF_SIGPENDING); @@ -865,7 +866,7 @@ static struct p9_fid *p9_fid_create(struct p9_client *clnt) struct p9_fid *fid; unsigned long flags; - P9_DPRINTK(P9_DEBUG_FID, "clnt %p\n", clnt); + p9_debug(P9_DEBUG_FID, "clnt %p\n", clnt); fid = kmalloc(sizeof(struct p9_fid), GFP_KERNEL); if (!fid) return ERR_PTR(-ENOMEM); @@ -898,7 +899,7 @@ static void p9_fid_destroy(struct p9_fid *fid) struct p9_client *clnt; unsigned long flags; - P9_DPRINTK(P9_DEBUG_FID, "fid %d\n", fid->fid); + p9_debug(P9_DEBUG_FID, "fid %d\n", fid->fid); clnt = fid->clnt; p9_idpool_put(fid->fid, clnt->fidpool); spin_lock_irqsave(&clnt->lock, flags); @@ -915,8 +916,8 @@ static int p9_client_version(struct p9_client *c) char *version; int msize; - P9_DPRINTK(P9_DEBUG_9P, ">>> TVERSION msize %d protocol %d\n", - c->msize, c->proto_version); + p9_debug(P9_DEBUG_9P, ">>> TVERSION msize %d protocol %d\n", + c->msize, c->proto_version); switch (c->proto_version) { case p9_proto_2000L: @@ -941,12 +942,12 @@ static int p9_client_version(struct p9_client *c) err = p9pdu_readf(req->rc, c->proto_version, "ds", &msize, &version); if (err) { - P9_DPRINTK(P9_DEBUG_9P, 
"version error %d\n", err); + p9_debug(P9_DEBUG_9P, "version error %d\n", err); trace_9p_protocol_dump(c, req->rc); goto error; } - P9_DPRINTK(P9_DEBUG_9P, "<<< RVERSION msize %d %s\n", msize, version); + p9_debug(P9_DEBUG_9P, "<<< RVERSION msize %d %s\n", msize, version); if (!strncmp(version, "9P2000.L", 8)) c->proto_version = p9_proto_2000L; else if (!strncmp(version, "9P2000.u", 8)) @@ -996,8 +997,8 @@ struct p9_client *p9_client_create(const char *dev_name, char *options) if (clnt->trans_mod == NULL) { err = -EPROTONOSUPPORT; - P9_DPRINTK(P9_DEBUG_ERROR, - "No transport defined or default transport\n"); + p9_debug(P9_DEBUG_ERROR, + "No transport defined or default transport\n"); goto destroy_tagpool; } @@ -1007,8 +1008,8 @@ struct p9_client *p9_client_create(const char *dev_name, char *options) goto put_trans; } - P9_DPRINTK(P9_DEBUG_MUX, "clnt %p trans %p msize %d protocol %d\n", - clnt, clnt->trans_mod, clnt->msize, clnt->proto_version); + p9_debug(P9_DEBUG_MUX, "clnt %p trans %p msize %d protocol %d\n", + clnt, clnt->trans_mod, clnt->msize, clnt->proto_version); err = clnt->trans_mod->create(clnt, dev_name, options); if (err) @@ -1041,7 +1042,7 @@ void p9_client_destroy(struct p9_client *clnt) { struct p9_fid *fid, *fidptr; - P9_DPRINTK(P9_DEBUG_MUX, "clnt %p\n", clnt); + p9_debug(P9_DEBUG_MUX, "clnt %p\n", clnt); if (clnt->trans_mod) clnt->trans_mod->close(clnt); @@ -1049,7 +1050,7 @@ void p9_client_destroy(struct p9_client *clnt) v9fs_put_trans(clnt->trans_mod); list_for_each_entry_safe(fid, fidptr, &clnt->fidlist, flist) { - printk(KERN_INFO "Found fid %d not clunked\n", fid->fid); + pr_info("Found fid %d not clunked\n", fid->fid); p9_fid_destroy(fid); } @@ -1064,14 +1065,14 @@ EXPORT_SYMBOL(p9_client_destroy); void p9_client_disconnect(struct p9_client *clnt) { - P9_DPRINTK(P9_DEBUG_9P, "clnt %p\n", clnt); + p9_debug(P9_DEBUG_9P, "clnt %p\n", clnt); clnt->status = Disconnected; } EXPORT_SYMBOL(p9_client_disconnect); void p9_client_begin_disconnect(struct p9_client *clnt) { - P9_DPRINTK(P9_DEBUG_9P, "clnt %p\n", clnt); + p9_debug(P9_DEBUG_9P, "clnt %p\n", clnt); clnt->status = BeginDisconnect; } EXPORT_SYMBOL(p9_client_begin_disconnect); @@ -1085,8 +1086,8 @@ struct p9_fid *p9_client_attach(struct p9_client *clnt, struct p9_fid *afid, struct p9_qid qid; - P9_DPRINTK(P9_DEBUG_9P, ">>> TATTACH afid %d uname %s aname %s\n", - afid ? afid->fid : -1, uname, aname); + p9_debug(P9_DEBUG_9P, ">>> TATTACH afid %d uname %s aname %s\n", + afid ? afid->fid : -1, uname, aname); fid = p9_fid_create(clnt); if (IS_ERR(fid)) { err = PTR_ERR(fid); @@ -1108,10 +1109,8 @@ struct p9_fid *p9_client_attach(struct p9_client *clnt, struct p9_fid *afid, goto error; } - P9_DPRINTK(P9_DEBUG_9P, "<<< RATTACH qid %x.%llx.%x\n", - qid.type, - (unsigned long long)qid.path, - qid.version); + p9_debug(P9_DEBUG_9P, "<<< RATTACH qid %x.%llx.%x\n", + qid.type, (unsigned long long)qid.path, qid.version); memmove(&fid->qid, &qid, sizeof(struct p9_qid)); @@ -1151,8 +1150,8 @@ struct p9_fid *p9_client_walk(struct p9_fid *oldfid, uint16_t nwname, fid = oldfid; - P9_DPRINTK(P9_DEBUG_9P, ">>> TWALK fids %d,%d nwname %ud wname[0] %s\n", - oldfid->fid, fid->fid, nwname, wnames ? wnames[0] : NULL); + p9_debug(P9_DEBUG_9P, ">>> TWALK fids %d,%d nwname %ud wname[0] %s\n", + oldfid->fid, fid->fid, nwname, wnames ? 
wnames[0] : NULL); req = p9_client_rpc(clnt, P9_TWALK, "ddT", oldfid->fid, fid->fid, nwname, wnames); @@ -1169,7 +1168,7 @@ struct p9_fid *p9_client_walk(struct p9_fid *oldfid, uint16_t nwname, } p9_free_req(clnt, req); - P9_DPRINTK(P9_DEBUG_9P, "<<< RWALK nwqid %d:\n", nwqids); + p9_debug(P9_DEBUG_9P, "<<< RWALK nwqid %d:\n", nwqids); if (nwqids != nwname) { err = -ENOENT; @@ -1177,7 +1176,7 @@ struct p9_fid *p9_client_walk(struct p9_fid *oldfid, uint16_t nwname, } for (count = 0; count < nwqids; count++) - P9_DPRINTK(P9_DEBUG_9P, "<<< [%d] %x.%llx.%x\n", + p9_debug(P9_DEBUG_9P, "<<< [%d] %x.%llx.%x\n", count, wqids[count].type, (unsigned long long)wqids[count].path, wqids[count].version); @@ -1212,7 +1211,7 @@ int p9_client_open(struct p9_fid *fid, int mode) int iounit; clnt = fid->clnt; - P9_DPRINTK(P9_DEBUG_9P, ">>> %s fid %d mode %d\n", + p9_debug(P9_DEBUG_9P, ">>> %s fid %d mode %d\n", p9_is_proto_dotl(clnt) ? "TLOPEN" : "TOPEN", fid->fid, mode); err = 0; @@ -1234,7 +1233,7 @@ int p9_client_open(struct p9_fid *fid, int mode) goto free_and_error; } - P9_DPRINTK(P9_DEBUG_9P, "<<< %s qid %x.%llx.%x iounit %x\n", + p9_debug(P9_DEBUG_9P, "<<< %s qid %x.%llx.%x iounit %x\n", p9_is_proto_dotl(clnt) ? "RLOPEN" : "ROPEN", qid.type, (unsigned long long)qid.path, qid.version, iounit); @@ -1256,7 +1255,7 @@ int p9_client_create_dotl(struct p9_fid *ofid, char *name, u32 flags, u32 mode, struct p9_req_t *req; int iounit; - P9_DPRINTK(P9_DEBUG_9P, + p9_debug(P9_DEBUG_9P, ">>> TLCREATE fid %d name %s flags %d mode %d gid %d\n", ofid->fid, name, flags, mode, gid); clnt = ofid->clnt; @@ -1277,7 +1276,7 @@ int p9_client_create_dotl(struct p9_fid *ofid, char *name, u32 flags, u32 mode, goto free_and_error; } - P9_DPRINTK(P9_DEBUG_9P, "<<< RLCREATE qid %x.%llx.%x iounit %x\n", + p9_debug(P9_DEBUG_9P, "<<< RLCREATE qid %x.%llx.%x iounit %x\n", qid->type, (unsigned long long)qid->path, qid->version, iounit); @@ -1301,7 +1300,7 @@ int p9_client_fcreate(struct p9_fid *fid, char *name, u32 perm, int mode, struct p9_qid qid; int iounit; - P9_DPRINTK(P9_DEBUG_9P, ">>> TCREATE fid %d name %s perm %d mode %d\n", + p9_debug(P9_DEBUG_9P, ">>> TCREATE fid %d name %s perm %d mode %d\n", fid->fid, name, perm, mode); err = 0; clnt = fid->clnt; @@ -1322,7 +1321,7 @@ int p9_client_fcreate(struct p9_fid *fid, char *name, u32 perm, int mode, goto free_and_error; } - P9_DPRINTK(P9_DEBUG_9P, "<<< RCREATE qid %x.%llx.%x iounit %x\n", + p9_debug(P9_DEBUG_9P, "<<< RCREATE qid %x.%llx.%x iounit %x\n", qid.type, (unsigned long long)qid.path, qid.version, iounit); @@ -1344,7 +1343,7 @@ int p9_client_symlink(struct p9_fid *dfid, char *name, char *symtgt, gid_t gid, struct p9_client *clnt; struct p9_req_t *req; - P9_DPRINTK(P9_DEBUG_9P, ">>> TSYMLINK dfid %d name %s symtgt %s\n", + p9_debug(P9_DEBUG_9P, ">>> TSYMLINK dfid %d name %s symtgt %s\n", dfid->fid, name, symtgt); clnt = dfid->clnt; @@ -1361,7 +1360,7 @@ int p9_client_symlink(struct p9_fid *dfid, char *name, char *symtgt, gid_t gid, goto free_and_error; } - P9_DPRINTK(P9_DEBUG_9P, "<<< RSYMLINK qid %x.%llx.%x\n", + p9_debug(P9_DEBUG_9P, "<<< RSYMLINK qid %x.%llx.%x\n", qid->type, (unsigned long long)qid->path, qid->version); free_and_error: @@ -1376,7 +1375,7 @@ int p9_client_link(struct p9_fid *dfid, struct p9_fid *oldfid, char *newname) struct p9_client *clnt; struct p9_req_t *req; - P9_DPRINTK(P9_DEBUG_9P, ">>> TLINK dfid %d oldfid %d newname %s\n", + p9_debug(P9_DEBUG_9P, ">>> TLINK dfid %d oldfid %d newname %s\n", dfid->fid, oldfid->fid, newname); clnt = dfid->clnt; req 
= p9_client_rpc(clnt, P9_TLINK, "dds", dfid->fid, oldfid->fid, @@ -1384,7 +1383,7 @@ int p9_client_link(struct p9_fid *dfid, struct p9_fid *oldfid, char *newname) if (IS_ERR(req)) return PTR_ERR(req); - P9_DPRINTK(P9_DEBUG_9P, "<<< RLINK\n"); + p9_debug(P9_DEBUG_9P, "<<< RLINK\n"); p9_free_req(clnt, req); return 0; } @@ -1396,7 +1395,7 @@ int p9_client_fsync(struct p9_fid *fid, int datasync) struct p9_client *clnt; struct p9_req_t *req; - P9_DPRINTK(P9_DEBUG_9P, ">>> TFSYNC fid %d datasync:%d\n", + p9_debug(P9_DEBUG_9P, ">>> TFSYNC fid %d datasync:%d\n", fid->fid, datasync); err = 0; clnt = fid->clnt; @@ -1407,7 +1406,7 @@ int p9_client_fsync(struct p9_fid *fid, int datasync) goto error; } - P9_DPRINTK(P9_DEBUG_9P, "<<< RFSYNC fid %d\n", fid->fid); + p9_debug(P9_DEBUG_9P, "<<< RFSYNC fid %d\n", fid->fid); p9_free_req(clnt, req); @@ -1423,12 +1422,13 @@ int p9_client_clunk(struct p9_fid *fid) struct p9_req_t *req; if (!fid) { - P9_EPRINTK(KERN_WARNING, "Trying to clunk with NULL fid\n"); + pr_warn("%s (%d): Trying to clunk with NULL fid\n", + __func__, task_pid_nr(current)); dump_stack(); return 0; } - P9_DPRINTK(P9_DEBUG_9P, ">>> TCLUNK fid %d\n", fid->fid); + p9_debug(P9_DEBUG_9P, ">>> TCLUNK fid %d\n", fid->fid); err = 0; clnt = fid->clnt; @@ -1438,7 +1438,7 @@ int p9_client_clunk(struct p9_fid *fid) goto error; } - P9_DPRINTK(P9_DEBUG_9P, "<<< RCLUNK fid %d\n", fid->fid); + p9_debug(P9_DEBUG_9P, "<<< RCLUNK fid %d\n", fid->fid); p9_free_req(clnt, req); error: @@ -1456,7 +1456,7 @@ int p9_client_remove(struct p9_fid *fid) struct p9_client *clnt; struct p9_req_t *req; - P9_DPRINTK(P9_DEBUG_9P, ">>> TREMOVE fid %d\n", fid->fid); + p9_debug(P9_DEBUG_9P, ">>> TREMOVE fid %d\n", fid->fid); err = 0; clnt = fid->clnt; @@ -1466,7 +1466,7 @@ int p9_client_remove(struct p9_fid *fid) goto error; } - P9_DPRINTK(P9_DEBUG_9P, "<<< RREMOVE fid %d\n", fid->fid); + p9_debug(P9_DEBUG_9P, "<<< RREMOVE fid %d\n", fid->fid); p9_free_req(clnt, req); error: @@ -1481,7 +1481,7 @@ int p9_client_unlinkat(struct p9_fid *dfid, const char *name, int flags) struct p9_req_t *req; struct p9_client *clnt; - P9_DPRINTK(P9_DEBUG_9P, ">>> TUNLINKAT fid %d %s %d\n", + p9_debug(P9_DEBUG_9P, ">>> TUNLINKAT fid %d %s %d\n", dfid->fid, name, flags); clnt = dfid->clnt; @@ -1490,7 +1490,7 @@ int p9_client_unlinkat(struct p9_fid *dfid, const char *name, int flags) err = PTR_ERR(req); goto error; } - P9_DPRINTK(P9_DEBUG_9P, "<<< RUNLINKAT fid %d %s\n", dfid->fid, name); + p9_debug(P9_DEBUG_9P, "<<< RUNLINKAT fid %d %s\n", dfid->fid, name); p9_free_req(clnt, req); error: @@ -1509,7 +1509,7 @@ p9_client_read(struct p9_fid *fid, char *data, char __user *udata, u64 offset, int err, rsize, non_zc = 0; - P9_DPRINTK(P9_DEBUG_9P, ">>> TREAD fid %d offset %llu %d\n", + p9_debug(P9_DEBUG_9P, ">>> TREAD fid %d offset %llu %d\n", fid->fid, (long long unsigned) offset, count); err = 0; clnt = fid->clnt; @@ -1552,7 +1552,7 @@ p9_client_read(struct p9_fid *fid, char *data, char __user *udata, u64 offset, goto free_and_error; } - P9_DPRINTK(P9_DEBUG_9P, "<<< RREAD count %d\n", count); + p9_debug(P9_DEBUG_9P, "<<< RREAD count %d\n", count); if (non_zc) { if (data) { @@ -1584,7 +1584,7 @@ p9_client_write(struct p9_fid *fid, char *data, const char __user *udata, struct p9_client *clnt; struct p9_req_t *req; - P9_DPRINTK(P9_DEBUG_9P, ">>> TWRITE fid %d offset %llu count %d\n", + p9_debug(P9_DEBUG_9P, ">>> TWRITE fid %d offset %llu count %d\n", fid->fid, (long long unsigned) offset, count); err = 0; clnt = fid->clnt; @@ -1626,7 +1626,7 @@ 
p9_client_write(struct p9_fid *fid, char *data, const char __user *udata, goto free_and_error; } - P9_DPRINTK(P9_DEBUG_9P, "<<< RWRITE count %d\n", count); + p9_debug(P9_DEBUG_9P, "<<< RWRITE count %d\n", count); p9_free_req(clnt, req); return count; @@ -1646,7 +1646,7 @@ struct p9_wstat *p9_client_stat(struct p9_fid *fid) struct p9_req_t *req; u16 ignored; - P9_DPRINTK(P9_DEBUG_9P, ">>> TSTAT fid %d\n", fid->fid); + p9_debug(P9_DEBUG_9P, ">>> TSTAT fid %d\n", fid->fid); if (!ret) return ERR_PTR(-ENOMEM); @@ -1667,7 +1667,7 @@ struct p9_wstat *p9_client_stat(struct p9_fid *fid) goto error; } - P9_DPRINTK(P9_DEBUG_9P, + p9_debug(P9_DEBUG_9P, "<<< RSTAT sz=%x type=%x dev=%x qid=%x.%llx.%x\n" "<<< mode=%8.8x atime=%8.8x mtime=%8.8x length=%llx\n" "<<< name=%s uid=%s gid=%s muid=%s extension=(%s)\n" @@ -1696,7 +1696,7 @@ struct p9_stat_dotl *p9_client_getattr_dotl(struct p9_fid *fid, GFP_KERNEL); struct p9_req_t *req; - P9_DPRINTK(P9_DEBUG_9P, ">>> TGETATTR fid %d, request_mask %lld\n", + p9_debug(P9_DEBUG_9P, ">>> TGETATTR fid %d, request_mask %lld\n", fid->fid, request_mask); if (!ret) @@ -1718,7 +1718,7 @@ struct p9_stat_dotl *p9_client_getattr_dotl(struct p9_fid *fid, goto error; } - P9_DPRINTK(P9_DEBUG_9P, + p9_debug(P9_DEBUG_9P, "<<< RGETATTR st_result_mask=%lld\n" "<<< qid=%x.%llx.%x\n" "<<< st_mode=%8.8x st_nlink=%llu\n" @@ -1784,8 +1784,8 @@ int p9_client_wstat(struct p9_fid *fid, struct p9_wstat *wst) err = 0; clnt = fid->clnt; wst->size = p9_client_statsize(wst, clnt->proto_version); - P9_DPRINTK(P9_DEBUG_9P, ">>> TWSTAT fid %d\n", fid->fid); - P9_DPRINTK(P9_DEBUG_9P, + p9_debug(P9_DEBUG_9P, ">>> TWSTAT fid %d\n", fid->fid); + p9_debug(P9_DEBUG_9P, " sz=%x type=%x dev=%x qid=%x.%llx.%x\n" " mode=%8.8x atime=%8.8x mtime=%8.8x length=%llx\n" " name=%s uid=%s gid=%s muid=%s extension=(%s)\n" @@ -1802,7 +1802,7 @@ int p9_client_wstat(struct p9_fid *fid, struct p9_wstat *wst) goto error; } - P9_DPRINTK(P9_DEBUG_9P, "<<< RWSTAT fid %d\n", fid->fid); + p9_debug(P9_DEBUG_9P, "<<< RWSTAT fid %d\n", fid->fid); p9_free_req(clnt, req); error: @@ -1818,8 +1818,8 @@ int p9_client_setattr(struct p9_fid *fid, struct p9_iattr_dotl *p9attr) err = 0; clnt = fid->clnt; - P9_DPRINTK(P9_DEBUG_9P, ">>> TSETATTR fid %d\n", fid->fid); - P9_DPRINTK(P9_DEBUG_9P, + p9_debug(P9_DEBUG_9P, ">>> TSETATTR fid %d\n", fid->fid); + p9_debug(P9_DEBUG_9P, " valid=%x mode=%x uid=%d gid=%d size=%lld\n" " atime_sec=%lld atime_nsec=%lld\n" " mtime_sec=%lld mtime_nsec=%lld\n", @@ -1833,7 +1833,7 @@ int p9_client_setattr(struct p9_fid *fid, struct p9_iattr_dotl *p9attr) err = PTR_ERR(req); goto error; } - P9_DPRINTK(P9_DEBUG_9P, "<<< RSETATTR fid %d\n", fid->fid); + p9_debug(P9_DEBUG_9P, "<<< RSETATTR fid %d\n", fid->fid); p9_free_req(clnt, req); error: return err; @@ -1849,7 +1849,7 @@ int p9_client_statfs(struct p9_fid *fid, struct p9_rstatfs *sb) err = 0; clnt = fid->clnt; - P9_DPRINTK(P9_DEBUG_9P, ">>> TSTATFS fid %d\n", fid->fid); + p9_debug(P9_DEBUG_9P, ">>> TSTATFS fid %d\n", fid->fid); req = p9_client_rpc(clnt, P9_TSTATFS, "d", fid->fid); if (IS_ERR(req)) { @@ -1866,7 +1866,7 @@ int p9_client_statfs(struct p9_fid *fid, struct p9_rstatfs *sb) goto error; } - P9_DPRINTK(P9_DEBUG_9P, "<<< RSTATFS fid %d type 0x%lx bsize %ld " + p9_debug(P9_DEBUG_9P, "<<< RSTATFS fid %d type 0x%lx bsize %ld " "blocks %llu bfree %llu bavail %llu files %llu ffree %llu " "fsid %llu namelen %ld\n", fid->fid, (long unsigned int)sb->type, (long int)sb->bsize, @@ -1889,7 +1889,7 @@ int p9_client_rename(struct p9_fid *fid, err = 0; clnt = 
fid->clnt; - P9_DPRINTK(P9_DEBUG_9P, ">>> TRENAME fid %d newdirfid %d name %s\n", + p9_debug(P9_DEBUG_9P, ">>> TRENAME fid %d newdirfid %d name %s\n", fid->fid, newdirfid->fid, name); req = p9_client_rpc(clnt, P9_TRENAME, "dds", fid->fid, @@ -1899,7 +1899,7 @@ int p9_client_rename(struct p9_fid *fid, goto error; } - P9_DPRINTK(P9_DEBUG_9P, "<<< RRENAME fid %d\n", fid->fid); + p9_debug(P9_DEBUG_9P, "<<< RRENAME fid %d\n", fid->fid); p9_free_req(clnt, req); error: @@ -1917,7 +1917,7 @@ int p9_client_renameat(struct p9_fid *olddirfid, const char *old_name, err = 0; clnt = olddirfid->clnt; - P9_DPRINTK(P9_DEBUG_9P, ">>> TRENAMEAT olddirfid %d old name %s" + p9_debug(P9_DEBUG_9P, ">>> TRENAMEAT olddirfid %d old name %s" " newdirfid %d new name %s\n", olddirfid->fid, old_name, newdirfid->fid, new_name); @@ -1928,7 +1928,7 @@ int p9_client_renameat(struct p9_fid *olddirfid, const char *old_name, goto error; } - P9_DPRINTK(P9_DEBUG_9P, "<<< RRENAMEAT newdirfid %d new name %s\n", + p9_debug(P9_DEBUG_9P, "<<< RRENAMEAT newdirfid %d new name %s\n", newdirfid->fid, new_name); p9_free_req(clnt, req); @@ -1956,7 +1956,7 @@ struct p9_fid *p9_client_xattrwalk(struct p9_fid *file_fid, attr_fid = NULL; goto error; } - P9_DPRINTK(P9_DEBUG_9P, + p9_debug(P9_DEBUG_9P, ">>> TXATTRWALK file_fid %d, attr_fid %d name %s\n", file_fid->fid, attr_fid->fid, attr_name); @@ -1973,7 +1973,7 @@ struct p9_fid *p9_client_xattrwalk(struct p9_fid *file_fid, goto clunk_fid; } p9_free_req(clnt, req); - P9_DPRINTK(P9_DEBUG_9P, "<<< RXATTRWALK fid %d size %llu\n", + p9_debug(P9_DEBUG_9P, "<<< RXATTRWALK fid %d size %llu\n", attr_fid->fid, *attr_size); return attr_fid; clunk_fid: @@ -1994,7 +1994,7 @@ int p9_client_xattrcreate(struct p9_fid *fid, const char *name, struct p9_req_t *req; struct p9_client *clnt; - P9_DPRINTK(P9_DEBUG_9P, + p9_debug(P9_DEBUG_9P, ">>> TXATTRCREATE fid %d name %s size %lld flag %d\n", fid->fid, name, (long long)attr_size, flags); err = 0; @@ -2005,7 +2005,7 @@ int p9_client_xattrcreate(struct p9_fid *fid, const char *name, err = PTR_ERR(req); goto error; } - P9_DPRINTK(P9_DEBUG_9P, "<<< RXATTRCREATE fid %d\n", fid->fid); + p9_debug(P9_DEBUG_9P, "<<< RXATTRCREATE fid %d\n", fid->fid); p9_free_req(clnt, req); error: return err; @@ -2019,7 +2019,7 @@ int p9_client_readdir(struct p9_fid *fid, char *data, u32 count, u64 offset) struct p9_req_t *req; char *dataptr; - P9_DPRINTK(P9_DEBUG_9P, ">>> TREADDIR fid %d offset %llu count %d\n", + p9_debug(P9_DEBUG_9P, ">>> TREADDIR fid %d offset %llu count %d\n", fid->fid, (long long unsigned) offset, count); err = 0; @@ -2056,7 +2056,7 @@ int p9_client_readdir(struct p9_fid *fid, char *data, u32 count, u64 offset) goto free_and_error; } - P9_DPRINTK(P9_DEBUG_9P, "<<< RREADDIR count %d\n", count); + p9_debug(P9_DEBUG_9P, "<<< RREADDIR count %d\n", count); if (non_zc) memmove(data, dataptr, count); @@ -2080,7 +2080,7 @@ int p9_client_mknod_dotl(struct p9_fid *fid, char *name, int mode, err = 0; clnt = fid->clnt; - P9_DPRINTK(P9_DEBUG_9P, ">>> TMKNOD fid %d name %s mode %d major %d " + p9_debug(P9_DEBUG_9P, ">>> TMKNOD fid %d name %s mode %d major %d " "minor %d\n", fid->fid, name, mode, MAJOR(rdev), MINOR(rdev)); req = p9_client_rpc(clnt, P9_TMKNOD, "dsdddd", fid->fid, name, mode, MAJOR(rdev), MINOR(rdev), gid); @@ -2092,7 +2092,7 @@ int p9_client_mknod_dotl(struct p9_fid *fid, char *name, int mode, trace_9p_protocol_dump(clnt, req->rc); goto error; } - P9_DPRINTK(P9_DEBUG_9P, "<<< RMKNOD qid %x.%llx.%x\n", qid->type, + p9_debug(P9_DEBUG_9P, "<<< RMKNOD qid 
%x.%llx.%x\n", qid->type, (unsigned long long)qid->path, qid->version); error: @@ -2111,7 +2111,7 @@ int p9_client_mkdir_dotl(struct p9_fid *fid, char *name, int mode, err = 0; clnt = fid->clnt; - P9_DPRINTK(P9_DEBUG_9P, ">>> TMKDIR fid %d name %s mode %d gid %d\n", + p9_debug(P9_DEBUG_9P, ">>> TMKDIR fid %d name %s mode %d gid %d\n", fid->fid, name, mode, gid); req = p9_client_rpc(clnt, P9_TMKDIR, "dsdd", fid->fid, name, mode, gid); @@ -2123,7 +2123,7 @@ int p9_client_mkdir_dotl(struct p9_fid *fid, char *name, int mode, trace_9p_protocol_dump(clnt, req->rc); goto error; } - P9_DPRINTK(P9_DEBUG_9P, "<<< RMKDIR qid %x.%llx.%x\n", qid->type, + p9_debug(P9_DEBUG_9P, "<<< RMKDIR qid %x.%llx.%x\n", qid->type, (unsigned long long)qid->path, qid->version); error: @@ -2141,7 +2141,7 @@ int p9_client_lock_dotl(struct p9_fid *fid, struct p9_flock *flock, u8 *status) err = 0; clnt = fid->clnt; - P9_DPRINTK(P9_DEBUG_9P, ">>> TLOCK fid %d type %i flags %d " + p9_debug(P9_DEBUG_9P, ">>> TLOCK fid %d type %i flags %d " "start %lld length %lld proc_id %d client_id %s\n", fid->fid, flock->type, flock->flags, flock->start, flock->length, flock->proc_id, flock->client_id); @@ -2158,7 +2158,7 @@ int p9_client_lock_dotl(struct p9_fid *fid, struct p9_flock *flock, u8 *status) trace_9p_protocol_dump(clnt, req->rc); goto error; } - P9_DPRINTK(P9_DEBUG_9P, "<<< RLOCK status %i\n", *status); + p9_debug(P9_DEBUG_9P, "<<< RLOCK status %i\n", *status); error: p9_free_req(clnt, req); return err; @@ -2174,7 +2174,7 @@ int p9_client_getlock_dotl(struct p9_fid *fid, struct p9_getlock *glock) err = 0; clnt = fid->clnt; - P9_DPRINTK(P9_DEBUG_9P, ">>> TGETLOCK fid %d, type %i start %lld " + p9_debug(P9_DEBUG_9P, ">>> TGETLOCK fid %d, type %i start %lld " "length %lld proc_id %d client_id %s\n", fid->fid, glock->type, glock->start, glock->length, glock->proc_id, glock->client_id); @@ -2191,7 +2191,7 @@ int p9_client_getlock_dotl(struct p9_fid *fid, struct p9_getlock *glock) trace_9p_protocol_dump(clnt, req->rc); goto error; } - P9_DPRINTK(P9_DEBUG_9P, "<<< RGETLOCK type %i start %lld length %lld " + p9_debug(P9_DEBUG_9P, "<<< RGETLOCK type %i start %lld length %lld " "proc_id %d client_id %s\n", glock->type, glock->start, glock->length, glock->proc_id, glock->client_id); error: @@ -2208,7 +2208,7 @@ int p9_client_readlink(struct p9_fid *fid, char **target) err = 0; clnt = fid->clnt; - P9_DPRINTK(P9_DEBUG_9P, ">>> TREADLINK fid %d\n", fid->fid); + p9_debug(P9_DEBUG_9P, ">>> TREADLINK fid %d\n", fid->fid); req = p9_client_rpc(clnt, P9_TREADLINK, "d", fid->fid); if (IS_ERR(req)) @@ -2219,7 +2219,7 @@ int p9_client_readlink(struct p9_fid *fid, char **target) trace_9p_protocol_dump(clnt, req->rc); goto error; } - P9_DPRINTK(P9_DEBUG_9P, "<<< RREADLINK target %s\n", *target); + p9_debug(P9_DEBUG_9P, "<<< RREADLINK target %s\n", *target); error: p9_free_req(clnt, req); return err; diff --git a/net/9p/error.c b/net/9p/error.c index 52518512a93..2ab2de76010 100644 --- a/net/9p/error.c +++ b/net/9p/error.c @@ -27,6 +27,8 @@ * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -237,8 +239,8 @@ int p9_errstr2errno(char *errstr, int len) if (errno == 0) { /* TODO: if error isn't found, add it dynamically */ errstr[len] = 0; - printk(KERN_ERR "%s: server reported unknown error %s\n", - __func__, errstr); + pr_err("%s: server reported unknown error %s\n", + __func__, errstr); errno = ESERVERFAULT; } diff --git a/net/9p/mod.c b/net/9p/mod.c index 2664d129229..6ab36aea772 100644 --- a/net/9p/mod.c +++ 
b/net/9p/mod.c @@ -24,7 +24,11 @@ * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include +#include +#include #include #include #include @@ -39,6 +43,29 @@ unsigned int p9_debug_level = 0; /* feature-rific global debug level */ EXPORT_SYMBOL(p9_debug_level); module_param_named(debug, p9_debug_level, uint, 0); MODULE_PARM_DESC(debug, "9P debugging level"); + +void _p9_debug(enum p9_debug_flags level, const char *func, + const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + if ((p9_debug_level & level) != level) + return; + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + if (level == P9_DEBUG_9P) + pr_notice("(%8.8d) %pV", task_pid_nr(current), &vaf); + else + pr_notice("-- %s (%d): %pV", func, task_pid_nr(current), &vaf); + + va_end(args); +} +EXPORT_SYMBOL(_p9_debug); #endif /* @@ -147,7 +174,7 @@ static int __init init_p9(void) int ret = 0; p9_error_init(); - printk(KERN_INFO "Installing 9P2000 support\n"); + pr_info("Installing 9P2000 support\n"); p9_trans_fd_init(); return ret; @@ -160,7 +187,7 @@ static int __init init_p9(void) static void __exit exit_p9(void) { - printk(KERN_INFO "Unloading 9P2000 support\n"); + pr_info("Unloading 9P2000 support\n"); p9_trans_fd_exit(); } diff --git a/net/9p/protocol.c b/net/9p/protocol.c index 55e10a96c90..9ee48cb3017 100644 --- a/net/9p/protocol.c +++ b/net/9p/protocol.c @@ -534,7 +534,7 @@ int p9stat_read(struct p9_client *clnt, char *buf, int len, struct p9_wstat *st) ret = p9pdu_readf(&fake_pdu, clnt->proto_version, "S", st); if (ret) { - P9_DPRINTK(P9_DEBUG_9P, "<<< p9stat_read failed: %d\n", ret); + p9_debug(P9_DEBUG_9P, "<<< p9stat_read failed: %d\n", ret); trace_9p_protocol_dump(clnt, &fake_pdu); } @@ -558,8 +558,8 @@ int p9pdu_finalize(struct p9_client *clnt, struct p9_fcall *pdu) pdu->size = size; trace_9p_protocol_dump(clnt, pdu); - P9_DPRINTK(P9_DEBUG_9P, ">>> size=%d type: %d tag: %d\n", pdu->size, - pdu->id, pdu->tag); + p9_debug(P9_DEBUG_9P, ">>> size=%d type: %d tag: %d\n", + pdu->size, pdu->id, pdu->tag); return err; } @@ -585,7 +585,7 @@ int p9dirent_read(struct p9_client *clnt, char *buf, int len, ret = p9pdu_readf(&fake_pdu, clnt->proto_version, "Qqbs", &dirent->qid, &dirent->d_off, &dirent->d_type, &nameptr); if (ret) { - P9_DPRINTK(P9_DEBUG_9P, "<<< p9dirent_read failed: %d\n", ret); + p9_debug(P9_DEBUG_9P, "<<< p9dirent_read failed: %d\n", ret); trace_9p_protocol_dump(clnt, &fake_pdu); goto out; } diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index fdfdb5747f6..fccae26fa67 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -25,6 +25,8 @@ * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -191,7 +193,7 @@ static void p9_conn_cancel(struct p9_conn *m, int err) unsigned long flags; LIST_HEAD(cancel_list); - P9_DPRINTK(P9_DEBUG_ERROR, "mux %p err %d\n", m, err); + p9_debug(P9_DEBUG_ERROR, "mux %p err %d\n", m, err); spin_lock_irqsave(&m->client->lock, flags); @@ -217,7 +219,7 @@ static void p9_conn_cancel(struct p9_conn *m, int err) spin_unlock_irqrestore(&m->client->lock, flags); list_for_each_entry_safe(req, rtmp, &cancel_list, req_list) { - P9_DPRINTK(P9_DEBUG_ERROR, "call back req %p\n", req); + p9_debug(P9_DEBUG_ERROR, "call back req %p\n", req); list_del(&req->req_list); p9_client_cb(m->client, req); } @@ -275,7 +277,7 @@ static int p9_fd_read(struct p9_client *client, void *v, int len) return -EREMOTEIO; if (!(ts->rd->f_flags & O_NONBLOCK)) - P9_DPRINTK(P9_DEBUG_ERROR, "blocking read ...\n"); + p9_debug(P9_DEBUG_ERROR, "blocking read ...\n"); ret 
= kernel_read(ts->rd, ts->rd->f_pos, v, len); if (ret <= 0 && ret != -ERESTARTSYS && ret != -EAGAIN) @@ -299,7 +301,7 @@ static void p9_read_work(struct work_struct *work) if (m->err < 0) return; - P9_DPRINTK(P9_DEBUG_TRANS, "start mux %p pos %d\n", m, m->rpos); + p9_debug(P9_DEBUG_TRANS, "start mux %p pos %d\n", m, m->rpos); if (!m->rbuf) { m->rbuf = m->tmp_buf; @@ -308,11 +310,11 @@ static void p9_read_work(struct work_struct *work) } clear_bit(Rpending, &m->wsched); - P9_DPRINTK(P9_DEBUG_TRANS, "read mux %p pos %d size: %d = %d\n", m, - m->rpos, m->rsize, m->rsize-m->rpos); + p9_debug(P9_DEBUG_TRANS, "read mux %p pos %d size: %d = %d\n", + m, m->rpos, m->rsize, m->rsize-m->rpos); err = p9_fd_read(m->client, m->rbuf + m->rpos, m->rsize - m->rpos); - P9_DPRINTK(P9_DEBUG_TRANS, "mux %p got %d bytes\n", m, err); + p9_debug(P9_DEBUG_TRANS, "mux %p got %d bytes\n", m, err); if (err == -EAGAIN) { clear_bit(Rworksched, &m->wsched); return; @@ -325,25 +327,25 @@ static void p9_read_work(struct work_struct *work) if ((!m->req) && (m->rpos == m->rsize)) { /* header read in */ u16 tag; - P9_DPRINTK(P9_DEBUG_TRANS, "got new header\n"); + p9_debug(P9_DEBUG_TRANS, "got new header\n"); n = le32_to_cpu(*(__le32 *) m->rbuf); /* read packet size */ if (n >= m->client->msize) { - P9_DPRINTK(P9_DEBUG_ERROR, - "requested packet size too big: %d\n", n); + p9_debug(P9_DEBUG_ERROR, + "requested packet size too big: %d\n", n); err = -EIO; goto error; } tag = le16_to_cpu(*(__le16 *) (m->rbuf+5)); /* read tag */ - P9_DPRINTK(P9_DEBUG_TRANS, - "mux %p pkt: size: %d bytes tag: %d\n", m, n, tag); + p9_debug(P9_DEBUG_TRANS, + "mux %p pkt: size: %d bytes tag: %d\n", m, n, tag); m->req = p9_tag_lookup(m->client, tag); if (!m->req || (m->req->status != REQ_STATUS_SENT && m->req->status != REQ_STATUS_FLSH)) { - P9_DPRINTK(P9_DEBUG_ERROR, "Unexpected packet tag %d\n", - tag); + p9_debug(P9_DEBUG_ERROR, "Unexpected packet tag %d\n", + tag); err = -EIO; goto error; } @@ -364,7 +366,7 @@ static void p9_read_work(struct work_struct *work) /* not an else because some packets (like clunk) have no payload */ if ((m->req) && (m->rpos == m->rsize)) { /* packet is read in */ - P9_DPRINTK(P9_DEBUG_TRANS, "got new packet\n"); + p9_debug(P9_DEBUG_TRANS, "got new packet\n"); spin_lock(&m->client->lock); if (m->req->status != REQ_STATUS_ERROR) m->req->status = REQ_STATUS_RCVD; @@ -384,7 +386,7 @@ static void p9_read_work(struct work_struct *work) n = p9_fd_poll(m->client, NULL); if (n & POLLIN) { - P9_DPRINTK(P9_DEBUG_TRANS, "sched read work %p\n", m); + p9_debug(P9_DEBUG_TRANS, "sched read work %p\n", m); schedule_work(&m->rq); } else clear_bit(Rworksched, &m->wsched); @@ -418,7 +420,7 @@ static int p9_fd_write(struct p9_client *client, void *v, int len) return -EREMOTEIO; if (!(ts->wr->f_flags & O_NONBLOCK)) - P9_DPRINTK(P9_DEBUG_ERROR, "blocking write ...\n"); + p9_debug(P9_DEBUG_ERROR, "blocking write ...\n"); oldfs = get_fs(); set_fs(get_ds()); @@ -460,7 +462,7 @@ static void p9_write_work(struct work_struct *work) req = list_entry(m->unsent_req_list.next, struct p9_req_t, req_list); req->status = REQ_STATUS_SENT; - P9_DPRINTK(P9_DEBUG_TRANS, "move req %p\n", req); + p9_debug(P9_DEBUG_TRANS, "move req %p\n", req); list_move_tail(&req->req_list, &m->req_list); m->wbuf = req->tc->sdata; @@ -469,11 +471,11 @@ static void p9_write_work(struct work_struct *work) spin_unlock(&m->client->lock); } - P9_DPRINTK(P9_DEBUG_TRANS, "mux %p pos %d size %d\n", m, m->wpos, - m->wsize); + p9_debug(P9_DEBUG_TRANS, "mux %p pos %d size %d\n", + m, 
m->wpos, m->wsize); clear_bit(Wpending, &m->wsched); err = p9_fd_write(m->client, m->wbuf + m->wpos, m->wsize - m->wpos); - P9_DPRINTK(P9_DEBUG_TRANS, "mux %p sent %d bytes\n", m, err); + p9_debug(P9_DEBUG_TRANS, "mux %p sent %d bytes\n", m, err); if (err == -EAGAIN) { clear_bit(Wworksched, &m->wsched); return; @@ -497,7 +499,7 @@ static void p9_write_work(struct work_struct *work) n = p9_fd_poll(m->client, NULL); if (n & POLLOUT) { - P9_DPRINTK(P9_DEBUG_TRANS, "sched write work %p\n", m); + p9_debug(P9_DEBUG_TRANS, "sched write work %p\n", m); schedule_work(&m->wq); } else clear_bit(Wworksched, &m->wsched); @@ -551,7 +553,7 @@ p9_pollwait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p) } if (!pwait) { - P9_DPRINTK(P9_DEBUG_ERROR, "not enough wait_address slots\n"); + p9_debug(P9_DEBUG_ERROR, "not enough wait_address slots\n"); return; } @@ -573,8 +575,7 @@ static struct p9_conn *p9_conn_create(struct p9_client *client) int n; struct p9_conn *m; - P9_DPRINTK(P9_DEBUG_TRANS, "client %p msize %d\n", client, - client->msize); + p9_debug(P9_DEBUG_TRANS, "client %p msize %d\n", client, client->msize); m = kzalloc(sizeof(struct p9_conn), GFP_KERNEL); if (!m) return ERR_PTR(-ENOMEM); @@ -591,12 +592,12 @@ static struct p9_conn *p9_conn_create(struct p9_client *client) n = p9_fd_poll(client, &m->pt); if (n & POLLIN) { - P9_DPRINTK(P9_DEBUG_TRANS, "mux %p can read\n", m); + p9_debug(P9_DEBUG_TRANS, "mux %p can read\n", m); set_bit(Rpending, &m->wsched); } if (n & POLLOUT) { - P9_DPRINTK(P9_DEBUG_TRANS, "mux %p can write\n", m); + p9_debug(P9_DEBUG_TRANS, "mux %p can write\n", m); set_bit(Wpending, &m->wsched); } @@ -618,7 +619,7 @@ static void p9_poll_mux(struct p9_conn *m) n = p9_fd_poll(m->client, NULL); if (n < 0 || n & (POLLERR | POLLHUP | POLLNVAL)) { - P9_DPRINTK(P9_DEBUG_TRANS, "error mux %p err %d\n", m, n); + p9_debug(P9_DEBUG_TRANS, "error mux %p err %d\n", m, n); if (n >= 0) n = -ECONNRESET; p9_conn_cancel(m, n); @@ -626,19 +627,19 @@ static void p9_poll_mux(struct p9_conn *m) if (n & POLLIN) { set_bit(Rpending, &m->wsched); - P9_DPRINTK(P9_DEBUG_TRANS, "mux %p can read\n", m); + p9_debug(P9_DEBUG_TRANS, "mux %p can read\n", m); if (!test_and_set_bit(Rworksched, &m->wsched)) { - P9_DPRINTK(P9_DEBUG_TRANS, "sched read work %p\n", m); + p9_debug(P9_DEBUG_TRANS, "sched read work %p\n", m); schedule_work(&m->rq); } } if (n & POLLOUT) { set_bit(Wpending, &m->wsched); - P9_DPRINTK(P9_DEBUG_TRANS, "mux %p can write\n", m); + p9_debug(P9_DEBUG_TRANS, "mux %p can write\n", m); if ((m->wsize || !list_empty(&m->unsent_req_list)) && !test_and_set_bit(Wworksched, &m->wsched)) { - P9_DPRINTK(P9_DEBUG_TRANS, "sched write work %p\n", m); + p9_debug(P9_DEBUG_TRANS, "sched write work %p\n", m); schedule_work(&m->wq); } } @@ -661,8 +662,8 @@ static int p9_fd_request(struct p9_client *client, struct p9_req_t *req) struct p9_trans_fd *ts = client->trans; struct p9_conn *m = ts->conn; - P9_DPRINTK(P9_DEBUG_TRANS, "mux %p task %p tcall %p id %d\n", m, - current, req->tc, req->tc->id); + p9_debug(P9_DEBUG_TRANS, "mux %p task %p tcall %p id %d\n", + m, current, req->tc, req->tc->id); if (m->err < 0) return m->err; @@ -686,7 +687,7 @@ static int p9_fd_cancel(struct p9_client *client, struct p9_req_t *req) { int ret = 1; - P9_DPRINTK(P9_DEBUG_TRANS, "client %p req %p\n", client, req); + p9_debug(P9_DEBUG_TRANS, "client %p req %p\n", client, req); spin_lock(&client->lock); @@ -726,8 +727,8 @@ static int parse_opts(char *params, struct p9_fd_opts *opts) tmp_options = kstrdup(params, GFP_KERNEL); 
if (!tmp_options) { - P9_DPRINTK(P9_DEBUG_ERROR, - "failed to allocate copy of option string\n"); + p9_debug(P9_DEBUG_ERROR, + "failed to allocate copy of option string\n"); return -ENOMEM; } options = tmp_options; @@ -741,8 +742,8 @@ static int parse_opts(char *params, struct p9_fd_opts *opts) if (token != Opt_err) { r = match_int(&args[0], &option); if (r < 0) { - P9_DPRINTK(P9_DEBUG_ERROR, - "integer field, but no integer?\n"); + p9_debug(P9_DEBUG_ERROR, + "integer field, but no integer?\n"); continue; } } @@ -801,7 +802,8 @@ static int p9_socket_open(struct p9_client *client, struct socket *csocket) csocket->sk->sk_allocation = GFP_NOIO; fd = sock_map_fd(csocket, 0); if (fd < 0) { - P9_EPRINTK(KERN_ERR, "p9_socket_open: failed to map fd\n"); + pr_err("%s (%d): failed to map fd\n", + __func__, task_pid_nr(current)); sock_release(csocket); kfree(p); return fd; @@ -837,8 +839,8 @@ static int p9_socket_open(struct p9_client *client, struct socket *csocket) static void p9_conn_destroy(struct p9_conn *m) { - P9_DPRINTK(P9_DEBUG_TRANS, "mux %p prev %p next %p\n", m, - m->mux_list.prev, m->mux_list.next); + p9_debug(P9_DEBUG_TRANS, "mux %p prev %p next %p\n", + m, m->mux_list.prev, m->mux_list.next); p9_mux_poll_stop(m); cancel_work_sync(&m->rq); @@ -919,7 +921,8 @@ p9_fd_create_tcp(struct p9_client *client, const char *addr, char *args) err = __sock_create(read_pnet(¤t->nsproxy->net_ns), PF_INET, SOCK_STREAM, IPPROTO_TCP, &csocket, 1); if (err) { - P9_EPRINTK(KERN_ERR, "p9_trans_tcp: problem creating socket\n"); + pr_err("%s (%d): problem creating socket\n", + __func__, task_pid_nr(current)); return err; } @@ -927,9 +930,8 @@ p9_fd_create_tcp(struct p9_client *client, const char *addr, char *args) (struct sockaddr *)&sin_server, sizeof(struct sockaddr_in), 0); if (err < 0) { - P9_EPRINTK(KERN_ERR, - "p9_trans_tcp: problem connecting socket to %s\n", - addr); + pr_err("%s (%d): problem connecting socket to %s\n", + __func__, task_pid_nr(current), addr); sock_release(csocket); return err; } @@ -947,8 +949,8 @@ p9_fd_create_unix(struct p9_client *client, const char *addr, char *args) csocket = NULL; if (strlen(addr) >= UNIX_PATH_MAX) { - P9_EPRINTK(KERN_ERR, "p9_trans_unix: address too long: %s\n", - addr); + pr_err("%s (%d): address too long: %s\n", + __func__, task_pid_nr(current), addr); return -ENAMETOOLONG; } @@ -957,15 +959,16 @@ p9_fd_create_unix(struct p9_client *client, const char *addr, char *args) err = __sock_create(read_pnet(¤t->nsproxy->net_ns), PF_UNIX, SOCK_STREAM, 0, &csocket, 1); if (err < 0) { - P9_EPRINTK(KERN_ERR, "p9_trans_unix: problem creating socket\n"); + pr_err("%s (%d): problem creating socket\n", + __func__, task_pid_nr(current)); + return err; } err = csocket->ops->connect(csocket, (struct sockaddr *)&sun_server, sizeof(struct sockaddr_un) - 1, 0); if (err < 0) { - P9_EPRINTK(KERN_ERR, - "p9_trans_unix: problem connecting socket: %s: %d\n", - addr, err); + pr_err("%s (%d): problem connecting socket: %s: %d\n", + __func__, task_pid_nr(current), addr, err); sock_release(csocket); return err; } @@ -983,7 +986,7 @@ p9_fd_create(struct p9_client *client, const char *addr, char *args) parse_opts(args, &opts); if (opts.rfd == ~0 || opts.wfd == ~0) { - printk(KERN_ERR "v9fs: Insufficient options for proto=fd\n"); + pr_err("Insufficient options for proto=fd\n"); return -ENOPROTOOPT; } @@ -1050,7 +1053,7 @@ static void p9_poll_workfn(struct work_struct *work) { unsigned long flags; - P9_DPRINTK(P9_DEBUG_TRANS, "start %p\n", current); + p9_debug(P9_DEBUG_TRANS, "start %p\n", 
current); spin_lock_irqsave(&p9_poll_lock, flags); while (!list_empty(&p9_poll_pending_list)) { @@ -1066,7 +1069,7 @@ static void p9_poll_workfn(struct work_struct *work) } spin_unlock_irqrestore(&p9_poll_lock, flags); - P9_DPRINTK(P9_DEBUG_TRANS, "finish\n"); + p9_debug(P9_DEBUG_TRANS, "finish\n"); } int p9_trans_fd_init(void) diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c index 159c50f1c6b..2c69ddd691a 100644 --- a/net/9p/trans_rdma.c +++ b/net/9p/trans_rdma.c @@ -26,6 +26,8 @@ * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -178,8 +180,8 @@ static int parse_opts(char *params, struct p9_rdma_opts *opts) tmp_options = kstrdup(params, GFP_KERNEL); if (!tmp_options) { - P9_DPRINTK(P9_DEBUG_ERROR, - "failed to allocate copy of option string\n"); + p9_debug(P9_DEBUG_ERROR, + "failed to allocate copy of option string\n"); return -ENOMEM; } options = tmp_options; @@ -192,8 +194,8 @@ static int parse_opts(char *params, struct p9_rdma_opts *opts) token = match_token(p, tokens, args); r = match_int(&args[0], &option); if (r < 0) { - P9_DPRINTK(P9_DEBUG_ERROR, - "integer field, but no integer?\n"); + p9_debug(P9_DEBUG_ERROR, + "integer field, but no integer?\n"); continue; } switch (token) { @@ -301,8 +303,7 @@ handle_recv(struct p9_client *client, struct p9_trans_rdma *rdma, return; err_out: - P9_DPRINTK(P9_DEBUG_ERROR, "req %p err %d status %d\n", - req, err, status); + p9_debug(P9_DEBUG_ERROR, "req %p err %d status %d\n", req, err, status); rdma->state = P9_RDMA_FLUSHING; client->status = Disconnected; } @@ -318,8 +319,8 @@ handle_send(struct p9_client *client, struct p9_trans_rdma *rdma, static void qp_event_handler(struct ib_event *event, void *context) { - P9_DPRINTK(P9_DEBUG_ERROR, "QP event %d context %p\n", event->event, - context); + p9_debug(P9_DEBUG_ERROR, "QP event %d context %p\n", + event->event, context); } static void cq_comp_handler(struct ib_cq *cq, void *cq_context) @@ -345,8 +346,7 @@ static void cq_comp_handler(struct ib_cq *cq, void *cq_context) break; default: - printk(KERN_ERR "9prdma: unexpected completion type, " - "c->wc_op=%d, wc.opcode=%d, status=%d\n", + pr_err("unexpected completion type, c->wc_op=%d, wc.opcode=%d, status=%d\n", c->wc_op, wc.opcode, wc.status); break; } @@ -356,7 +356,7 @@ static void cq_comp_handler(struct ib_cq *cq, void *cq_context) static void cq_event_handler(struct ib_event *e, void *v) { - P9_DPRINTK(P9_DEBUG_ERROR, "CQ event %d context %p\n", e->event, v); + p9_debug(P9_DEBUG_ERROR, "CQ event %d context %p\n", e->event, v); } static void rdma_destroy_trans(struct p9_trans_rdma *rdma) @@ -407,7 +407,7 @@ post_recv(struct p9_client *client, struct p9_rdma_context *c) return ib_post_recv(rdma->qp, &wr, &bad_wr); error: - P9_DPRINTK(P9_DEBUG_ERROR, "EIO\n"); + p9_debug(P9_DEBUG_ERROR, "EIO\n"); return -EIO; } @@ -500,7 +500,7 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req) kfree(c); kfree(rpl_context->rc); kfree(rpl_context); - P9_DPRINTK(P9_DEBUG_ERROR, "EIO\n"); + p9_debug(P9_DEBUG_ERROR, "EIO\n"); return -EIO; err_free1: kfree(rpl_context->rc); diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index 32aa9834229..330421e5471 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -26,6 +26,8 @@ * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -145,7 +147,7 @@ static void req_done(struct virtqueue *vq) struct p9_req_t *req; unsigned long flags; - P9_DPRINTK(P9_DEBUG_TRANS, ": request done\n"); + p9_debug(P9_DEBUG_TRANS, ": 
request done\n"); while (1) { spin_lock_irqsave(&chan->lock, flags); @@ -158,8 +160,8 @@ static void req_done(struct virtqueue *vq) spin_unlock_irqrestore(&chan->lock, flags); /* Wakeup if anyone waiting for VirtIO ring space. */ wake_up(chan->vc_wq); - P9_DPRINTK(P9_DEBUG_TRANS, ": rc %p\n", rc); - P9_DPRINTK(P9_DEBUG_TRANS, ": lookup tag %d\n", rc->tag); + p9_debug(P9_DEBUG_TRANS, ": rc %p\n", rc); + p9_debug(P9_DEBUG_TRANS, ": lookup tag %d\n", rc->tag); req = p9_tag_lookup(chan->client, rc->tag); req->status = REQ_STATUS_RCVD; p9_client_cb(chan->client, req); @@ -257,7 +259,7 @@ p9_virtio_request(struct p9_client *client, struct p9_req_t *req) unsigned long flags; struct virtio_chan *chan = client->trans; - P9_DPRINTK(P9_DEBUG_TRANS, "9p debug: virtio request\n"); + p9_debug(P9_DEBUG_TRANS, "9p debug: virtio request\n"); req->status = REQ_STATUS_SENT; req_retry: @@ -280,20 +282,19 @@ req_retry: if (err == -ERESTARTSYS) return err; - P9_DPRINTK(P9_DEBUG_TRANS, "9p:Retry virtio request\n"); + p9_debug(P9_DEBUG_TRANS, "Retry virtio request\n"); goto req_retry; } else { spin_unlock_irqrestore(&chan->lock, flags); - P9_DPRINTK(P9_DEBUG_TRANS, - "9p debug: " - "virtio rpc add_buf returned failure"); + p9_debug(P9_DEBUG_TRANS, + "virtio rpc add_buf returned failure\n"); return -EIO; } } virtqueue_kick(chan->vq); spin_unlock_irqrestore(&chan->lock, flags); - P9_DPRINTK(P9_DEBUG_TRANS, "9p debug: virtio request kicked\n"); + p9_debug(P9_DEBUG_TRANS, "virtio request kicked\n"); return 0; } @@ -354,7 +355,7 @@ p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req, struct page **in_pages = NULL, **out_pages = NULL; struct virtio_chan *chan = client->trans; - P9_DPRINTK(P9_DEBUG_TRANS, "9p debug: virtio request\n"); + p9_debug(P9_DEBUG_TRANS, "virtio request\n"); if (uodata) { out_nr_pages = p9_nr_pages(uodata, outlen); @@ -423,20 +424,19 @@ req_retry_pinned: if (err == -ERESTARTSYS) goto err_out; - P9_DPRINTK(P9_DEBUG_TRANS, "9p:Retry virtio request\n"); + p9_debug(P9_DEBUG_TRANS, "Retry virtio request\n"); goto req_retry_pinned; } else { spin_unlock_irqrestore(&chan->lock, flags); - P9_DPRINTK(P9_DEBUG_TRANS, - "9p debug: " - "virtio rpc add_buf returned failure"); + p9_debug(P9_DEBUG_TRANS, + "virtio rpc add_buf returned failure\n"); err = -EIO; goto err_out; } } virtqueue_kick(chan->vq); spin_unlock_irqrestore(&chan->lock, flags); - P9_DPRINTK(P9_DEBUG_TRANS, "9p debug: virtio request kicked\n"); + p9_debug(P9_DEBUG_TRANS, "virtio request kicked\n"); err = wait_event_interruptible(*req->wq, req->status >= REQ_STATUS_RCVD); /* @@ -491,7 +491,7 @@ static int p9_virtio_probe(struct virtio_device *vdev) chan = kmalloc(sizeof(struct virtio_chan), GFP_KERNEL); if (!chan) { - printk(KERN_ERR "9p: Failed to allocate virtio 9P channel\n"); + pr_err("Failed to allocate virtio 9P channel\n"); err = -ENOMEM; goto fail; } @@ -592,7 +592,7 @@ p9_virtio_create(struct p9_client *client, const char *devname, char *args) mutex_unlock(&virtio_9p_lock); if (!found) { - printk(KERN_ERR "9p: no channels available\n"); + pr_err("no channels available\n"); return ret; } diff --git a/net/9p/util.c b/net/9p/util.c index 9c1c9348ac3..6ceeeb384de 100644 --- a/net/9p/util.c +++ b/net/9p/util.c @@ -106,7 +106,7 @@ retry: else if (error) return -1; - P9_DPRINTK(P9_DEBUG_MUX, " id %d pool %p\n", i, p); + p9_debug(P9_DEBUG_MUX, " id %d pool %p\n", i, p); return i; } EXPORT_SYMBOL(p9_idpool_get); @@ -124,7 +124,7 @@ void p9_idpool_put(int id, struct p9_idpool *p) { unsigned long flags; - P9_DPRINTK(P9_DEBUG_MUX, " 
id %d pool %p\n", id, p); + p9_debug(P9_DEBUG_MUX, " id %d pool %p\n", id, p); spin_lock_irqsave(&p->lock, flags); idr_remove(&p->pool, id); -- cgit v1.2.3 From df345c674b6366952a21a67604c8f6d489bb7ea7 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 5 Jan 2012 13:39:49 +0530 Subject: fs/9p: v9fs_stat2inode should update suid/sgid bits. Create a new helper that updates the permission bits and use that, instead of opencoding the logic. Reported and bisected by: M. Mohan Kumar Signed-off-by: Aneesh Kumar K.V Signed-off-by: Eric Van Hensbergen --- fs/9p/vfs_inode.c | 40 ++++++++++++++++++++++++++++------------ 1 file changed, 28 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 85c29733d8b..cf3dd6bc537 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -94,6 +94,32 @@ static int unixmode2p9mode(struct v9fs_session_info *v9ses, int mode) return res; } +/** + * p9mode2perm- convert plan9 mode bits to unix permission bits + * @v9ses: v9fs session information + * @stat: p9_wstat from which mode need to be derived + * + */ +static int p9mode2perm(struct v9fs_session_info *v9ses, + struct p9_wstat *stat) +{ + int res; + int mode = stat->mode; + + res = mode & S_IALLUGO; + if (v9fs_proto_dotu(v9ses)) { + if ((mode & P9_DMSETUID) == P9_DMSETUID) + res |= S_ISUID; + + if ((mode & P9_DMSETGID) == P9_DMSETGID) + res |= S_ISGID; + + if ((mode & P9_DMSETVTX) == P9_DMSETVTX) + res |= S_ISVTX; + } + return res; +} + /** * p9mode2unixmode- convert plan9 mode bits to unix mode bits * @v9ses: v9fs session information @@ -107,8 +133,8 @@ static int p9mode2unixmode(struct v9fs_session_info *v9ses, int res; int mode = stat->mode; - res = mode & S_IALLUGO; *rdev = 0; + res = p9mode2perm(v9ses, stat); if ((mode & P9_DMDIR) == P9_DMDIR) res |= S_IFDIR; @@ -142,16 +168,6 @@ static int p9mode2unixmode(struct v9fs_session_info *v9ses, } else res |= S_IFREG; - if (v9fs_proto_dotu(v9ses)) { - if ((mode & P9_DMSETUID) == P9_DMSETUID) - res |= S_ISUID; - - if ((mode & P9_DMSETGID) == P9_DMSETGID) - res |= S_ISGID; - - if ((mode & P9_DMSETVTX) == P9_DMSETVTX) - res |= S_ISVTX; - } return res; } @@ -1168,7 +1184,7 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, set_nlink(inode, i_nlink); } } - mode = stat->mode & S_IALLUGO; + mode = p9mode2perm(v9ses, stat); mode |= inode->i_mode & ~S_IALLUGO; inode->i_mode = mode; i_size_write(inode, stat->length); -- cgit v1.2.3 From b6054793069bf08fcf220fff5fb33735d5493594 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 5 Jan 2012 10:42:17 -0600 Subject: fs/9p: We should not allocate a new inode when creating hardlinks. Don't do new_inode_from_fid in case of hardlink creation. This ensures that the link count for hardlink files gets updated properly. Earlier, the link count was not updated on removing a hardlink with cache mode enabled.
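As a rough standalone illustration of the problem (a toy model only, not the kernel patch itself; the toy_inode and helper names below are hypothetical), allocating a fresh in-memory inode for the new name leaves the original inode's i_nlink untouched, which is exactly the miscount this change avoids by skipping the walk/instantiate step for the P9_DMLINK case:

#include <stdio.h>

struct toy_inode { int i_nlink; };

/* Wrong model: build a brand-new inode for the hardlink name; the
 * original inode's i_nlink never moves, so a later unlink under-counts. */
static struct toy_inode new_inode_for_link(const struct toy_inode *target)
{
        struct toy_inode fresh = { .i_nlink = 1 };
        (void)target;
        return fresh;
}

/* The idea behind the patch: for a hardlink, reuse the target inode so
 * its link count is the one that gets bumped and later dropped. */
static struct toy_inode *reuse_inode_for_link(struct toy_inode *target)
{
        target->i_nlink++;
        return target;
}

int main(void)
{
        struct toy_inode file = { .i_nlink = 1 };

        new_inode_for_link(&file);
        printf("fresh inode: original i_nlink = %d (stale)\n", file.i_nlink);

        reuse_inode_for_link(&file);
        printf("reused inode: i_nlink = %d\n", file.i_nlink);
        return 0;
}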
Signed-off-by: Aneesh Kumar K.V Signed-off-by: Eric Van Hensbergen --- fs/9p/vfs_inode.c | 43 ++++++++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index cf3dd6bc537..c8fe480d0db 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -680,26 +680,31 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir, goto error; } - /* now walk from the parent so we can get unopened fid */ - fid = p9_client_walk(dfid, 1, &name, 1); - if (IS_ERR(fid)) { - err = PTR_ERR(fid); - p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); - fid = NULL; - goto error; - } - - /* instantiate inode and assign the unopened fid to the dentry */ - inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); - p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n", err); - goto error; + if (!(perm & P9_DMLINK)) { + /* now walk from the parent so we can get unopened fid */ + fid = p9_client_walk(dfid, 1, &name, 1); + if (IS_ERR(fid)) { + err = PTR_ERR(fid); + p9_debug(P9_DEBUG_VFS, + "p9_client_walk failed %d\n", err); + fid = NULL; + goto error; + } + /* + * instantiate inode and assign the unopened fid to the dentry + */ + inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + p9_debug(P9_DEBUG_VFS, + "inode creation failed %d\n", err); + goto error; + } + err = v9fs_fid_add(dentry, fid); + if (err < 0) + goto error; + d_instantiate(dentry, inode); } - err = v9fs_fid_add(dentry, fid); - if (err < 0) - goto error; - d_instantiate(dentry, inode); return ofid; error: if (ofid) -- cgit v1.2.3 From 0aaaf5c424c7ffd6b0c4253251356558b16ef3a2 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 6 Dec 2011 16:13:48 -0500 Subject: NFS: Cache state owners after files are closed Servers have a finite amount of memory to store NFSv4 open and lock owners. Moreover, servers may have a difficult time determining when they can reap their state owner table, thanks to gray areas in the NFSv4 protocol specification. Thus clients should be careful to reuse state owners when possible. Currently Linux is not too careful. When a user has closed all her files on one mount point, the state owner's reference count goes to zero, and it is released. The next OPEN allocates a new one. A workload that serially opens and closes files can run through a large number of open owners this way. When a state owner's reference count goes to zero, slap it onto a free list for that nfs_server, with an expiry time. Garbage collect before looking for a state owner. This makes state owners for active users available for re-use. Now that there can be unused state owners remaining at umount time, purge the state owner free list when a server is destroyed. Also be sure not to reclaim unused state owners during state recovery. This change has benefits for the client as well. For some workloads, this approach drops the number of OPEN_CONFIRM calls from the same as the number of OPEN calls, down to just one. This reduces wire traffic and thus open(2) latency. Before this patch, untarring a kernel source tarball shows the OPEN_CONFIRM call counter steadily increasing through the test. With the patch, the OPEN_CONFIRM count remains at 1 throughout the entire untar. As long as the expiry time is kept short, I don't think garbage collection should be terribly expensive, although it does bounce the clp->cl_lock around a bit. 
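The free-list-with-expiry scheme described above can be sketched in a few lines of standalone C (a simplified model, not the actual fs/nfs code; struct owner and the helper names are hypothetical, and the real implementation keys the cutoff to the lease time while holding clp->cl_lock): released owners are appended to the tail with a timestamp, so the oldest entry is always at the head, and the collector stops at the first entry that is still young.

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

struct owner {
        struct owner *next;
        time_t expires;         /* when it went onto the free list */
        int id;
};

static struct owner *lru_head, **lru_tail = &lru_head;

/* Release: append to the LRU tail so the oldest entry stays at the head. */
static void owner_release(struct owner *o)
{
        o->expires = time(NULL);
        o->next = NULL;
        *lru_tail = o;
        lru_tail = &o->next;
}

/* Garbage collect: reap entries at least lease_time old; the list is
 * sorted by insertion time, so stop at the first entry still in lease. */
static void owner_gc(time_t lease_time)
{
        time_t now = time(NULL);

        while (lru_head && now - lru_head->expires >= lease_time) {
                struct owner *o = lru_head;

                lru_head = o->next;
                if (!lru_head)
                        lru_tail = &lru_head;
                printf("reaping owner %d\n", o->id);
                free(o);
        }
}

int main(void)
{
        for (int i = 0; i < 3; i++) {
                struct owner *o = calloc(1, sizeof(*o));
                o->id = i;
                owner_release(o);
        }
        owner_gc(0);    /* lease time 0: everything on the list is stale */
        return 0;
}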
[ At some point we should rationalize the use of the nfs_server ->destroy method. ] Signed-off-by: Chuck Lever [Trond: Fixed a garbage collection race and a few efficiency issues] Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 8 +++++ fs/nfs/nfs4_fs.h | 3 ++ fs/nfs/nfs4state.c | 89 ++++++++++++++++++++++++++++++++++++++++++----- include/linux/nfs_fs_sb.h | 1 + 4 files changed, 92 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 32ea37198e9..41bd67f80d3 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -250,6 +250,11 @@ static void pnfs_init_server(struct nfs_server *server) rpc_init_wait_queue(&server->roc_rpcwaitq, "pNFS ROC"); } +static void nfs4_destroy_server(struct nfs_server *server) +{ + nfs4_purge_state_owners(server); +} + #else static void nfs4_shutdown_client(struct nfs_client *clp) { @@ -1065,6 +1070,7 @@ static struct nfs_server *nfs_alloc_server(void) INIT_LIST_HEAD(&server->master_link); INIT_LIST_HEAD(&server->delegations); INIT_LIST_HEAD(&server->layouts); + INIT_LIST_HEAD(&server->state_owners_lru); atomic_set(&server->active, 0); @@ -1538,6 +1544,7 @@ static int nfs4_server_common_setup(struct nfs_server *server, nfs_server_insert_lists(server); server->mount_time = jiffies; + server->destroy = nfs4_destroy_server; out: nfs_free_fattr(fattr); return error; @@ -1719,6 +1726,7 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source, /* Copy data from the source */ server->nfs_client = source->nfs_client; + server->destroy = source->destroy; atomic_inc(&server->nfs_client->cl_count); nfs_server_copy_userdata(server, source); diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 693ae22f873..4d7d0aedc10 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -94,6 +94,8 @@ struct nfs_unique_id { struct nfs4_state_owner { struct nfs_unique_id so_owner_id; struct nfs_server *so_server; + struct list_head so_lru; + unsigned long so_expires; struct rb_node so_server_node; struct rpc_cred *so_cred; /* Associated cred */ @@ -319,6 +321,7 @@ static inline void nfs4_schedule_session_recovery(struct nfs4_session *session) extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *); extern void nfs4_put_state_owner(struct nfs4_state_owner *); +extern void nfs4_purge_state_owners(struct nfs_server *); extern struct nfs4_state * nfs4_get_open_state(struct inode *, struct nfs4_state_owner *); extern void nfs4_put_open_state(struct nfs4_state *); extern void nfs4_close_state(struct nfs4_state *, fmode_t); diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 6354e4fcc82..a53f33b4ac3 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -49,6 +49,7 @@ #include #include #include +#include #include "nfs4_fs.h" #include "callback.h" @@ -388,6 +389,8 @@ nfs4_find_state_owner_locked(struct nfs_server *server, struct rpc_cred *cred) else if (cred > sp->so_cred) p = &parent->rb_right; else { + if (!list_empty(&sp->so_lru)) + list_del_init(&sp->so_lru); atomic_inc(&sp->so_count); return sp; } @@ -412,6 +415,8 @@ nfs4_insert_state_owner_locked(struct nfs4_state_owner *new) else if (new->so_cred > sp->so_cred) p = &parent->rb_right; else { + if (!list_empty(&sp->so_lru)) + list_del_init(&sp->so_lru); atomic_inc(&sp->so_count); return sp; } @@ -453,6 +458,7 @@ nfs4_alloc_state_owner(void) spin_lock_init(&sp->so_sequence.lock); INIT_LIST_HEAD(&sp->so_sequence.list); atomic_set(&sp->so_count, 1); + INIT_LIST_HEAD(&sp->so_lru); return sp; } @@ -470,6 +476,38 @@ 
nfs4_drop_state_owner(struct nfs4_state_owner *sp) } } +static void nfs4_free_state_owner(struct nfs4_state_owner *sp) +{ + rpc_destroy_wait_queue(&sp->so_sequence.wait); + put_rpccred(sp->so_cred); + kfree(sp); +} + +static void nfs4_gc_state_owners(struct nfs_server *server) +{ + struct nfs_client *clp = server->nfs_client; + struct nfs4_state_owner *sp, *tmp; + unsigned long time_min, time_max; + LIST_HEAD(doomed); + + spin_lock(&clp->cl_lock); + time_max = jiffies; + time_min = (long)time_max - (long)clp->cl_lease_time; + list_for_each_entry_safe(sp, tmp, &server->state_owners_lru, so_lru) { + /* NB: LRU is sorted so that oldest is at the head */ + if (time_in_range(sp->so_expires, time_min, time_max)) + break; + list_move(&sp->so_lru, &doomed); + nfs4_remove_state_owner_locked(sp); + } + spin_unlock(&clp->cl_lock); + + list_for_each_entry_safe(sp, tmp, &doomed, so_lru) { + list_del(&sp->so_lru); + nfs4_free_state_owner(sp); + } +} + /** * nfs4_get_state_owner - Look up a state owner given a credential * @server: nfs_server to search @@ -487,10 +525,10 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, sp = nfs4_find_state_owner_locked(server, cred); spin_unlock(&clp->cl_lock); if (sp != NULL) - return sp; + goto out; new = nfs4_alloc_state_owner(); if (new == NULL) - return NULL; + goto out; new->so_server = server; new->so_cred = cred; spin_lock(&clp->cl_lock); @@ -502,26 +540,58 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, rpc_destroy_wait_queue(&new->so_sequence.wait); kfree(new); } +out: + nfs4_gc_state_owners(server); return sp; } /** * nfs4_put_state_owner - Release a nfs4_state_owner * @sp: state owner data to release - * */ void nfs4_put_state_owner(struct nfs4_state_owner *sp) { - struct nfs_client *clp = sp->so_server->nfs_client; - struct rpc_cred *cred = sp->so_cred; + struct nfs_server *server = sp->so_server; + struct nfs_client *clp = server->nfs_client; if (!atomic_dec_and_lock(&sp->so_count, &clp->cl_lock)) return; - nfs4_remove_state_owner_locked(sp); + + if (!RB_EMPTY_NODE(&sp->so_server_node)) { + sp->so_expires = jiffies; + list_add_tail(&sp->so_lru, &server->state_owners_lru); + spin_unlock(&clp->cl_lock); + } else { + nfs4_remove_state_owner_locked(sp); + spin_unlock(&clp->cl_lock); + nfs4_free_state_owner(sp); + } +} + +/** + * nfs4_purge_state_owners - Release all cached state owners + * @server: nfs_server with cached state owners to release + * + * Called at umount time. Remaining state owners will be on + * the LRU with ref count of zero. 
+ */ +void nfs4_purge_state_owners(struct nfs_server *server) +{ + struct nfs_client *clp = server->nfs_client; + struct nfs4_state_owner *sp, *tmp; + LIST_HEAD(doomed); + + spin_lock(&clp->cl_lock); + list_for_each_entry_safe(sp, tmp, &server->state_owners_lru, so_lru) { + list_move(&sp->so_lru, &doomed); + nfs4_remove_state_owner_locked(sp); + } spin_unlock(&clp->cl_lock); - rpc_destroy_wait_queue(&sp->so_sequence.wait); - put_rpccred(cred); - kfree(sp); + + list_for_each_entry_safe(sp, tmp, &doomed, so_lru) { + list_del(&sp->so_lru); + nfs4_free_state_owner(sp); + } } static struct nfs4_state * @@ -1393,6 +1463,7 @@ static int nfs4_do_reclaim(struct nfs_client *clp, const struct nfs4_state_recov restart: rcu_read_lock(); list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + nfs4_purge_state_owners(server); spin_lock(&clp->cl_lock); for (pos = rb_first(&server->state_owners); pos != NULL; diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index b5479df8378..ba4d7656ecf 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -153,6 +153,7 @@ struct nfs_server { struct rb_root openowner_id; struct rb_root lockowner_id; #endif + struct list_head state_owners_lru; struct list_head layouts; struct list_head delegations; void (*destroy)(struct nfs_server *); -- cgit v1.2.3 From b8548894bde94ccee836e210274ff401225e9733 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 2 Jan 2012 17:49:12 -0500 Subject: nfsd4: be forgiving in the absence of the recovery directory If the recovery directory doesn't exist, then behavior after a reboot will be suboptimal. But it's unnecessarily harsh to then prevent the nfsv4 server from working at all. Instead just print a warning (already done in nfsd4_init_recdir()) and soldier on. Tested-by: Lior Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4recover.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 28712e20bb1..eb01fffdc2f 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -127,10 +127,11 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) dprintk("NFSD: nfsd4_create_clid_dir for \"%s\"\n", dname); - if (!rec_file || clp->cl_firststate) + if (clp->cl_firststate) return 0; - clp->cl_firststate = 1; + if (!rec_file) + return -ENOENT; status = nfs4_save_creds(&original_cred); if (status < 0) return status; -- cgit v1.2.3 From 9b4146e85536ebd6b989f43906986e34486efdba Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 4 Jan 2012 16:26:43 -0500 Subject: NFSD: Change name of extended attribute containing junction As of fedfs-utils-0.8.0, user space stores all NFS junction information in a single extended attribute: "trusted.junction.nfs". Both FedFS and NFS basic junctions are stored in this one attribute, and the intention is that all future forms of NFS junction metadata will be stored in this attribute. Other protocols may use a different extended attribute. Thus NFSD needs to look only for that one extended attribute. The "trusted.junction.type" xattr is deprecated. fedfs-utils-0.8.0 will continue to attach a "trusted.junction.type" xattr to junctions, but future fedfs-utils releases may no longer do that. Signed-off-by: Chuck Lever Signed-off-by: J. 
Bruce Fields --- fs/nfsd/vfs.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 7a2e442623c..89689130621 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -594,8 +594,19 @@ nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, struct nfs4_ac return error; } -#define NFSD_XATTR_JUNCTION_PREFIX XATTR_TRUSTED_PREFIX "junction." -#define NFSD_XATTR_JUNCTION_TYPE NFSD_XATTR_JUNCTION_PREFIX "type" +/* + * NFS junction information is stored in an extended attribute. + */ +#define NFSD_JUNCTION_XATTR_NAME XATTR_TRUSTED_PREFIX "junction.nfs" + +/** + * nfsd4_is_junction - Test if an object could be an NFS junction + * + * @dentry: object to test + * + * Returns 1 if "dentry" appears to contain NFS junction information. + * Otherwise 0 is returned. + */ int nfsd4_is_junction(struct dentry *dentry) { struct inode *inode = dentry->d_inode; @@ -606,7 +617,7 @@ int nfsd4_is_junction(struct dentry *dentry) return 0; if (!(inode->i_mode & S_ISVTX)) return 0; - if (vfs_getxattr(dentry, NFSD_XATTR_JUNCTION_TYPE, NULL, 0) <= 0) + if (vfs_getxattr(dentry, NFSD_JUNCTION_XATTR_NAME, NULL, 0) <= 0) return 0; return 1; } -- cgit v1.2.3 From 7a6ef8c72314f254c107c6a9ed7cb201961ee05a Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 5 Jan 2012 15:38:41 -0500 Subject: nfsd4: nfsd4_create_clid_dir return value is unused Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4recover.c | 10 ++++------ fs/nfsd/state.h | 2 +- 2 files changed, 5 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index eb01fffdc2f..a52f267f122 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -117,8 +117,7 @@ out_no_tfm: return status; } -int -nfsd4_create_clid_dir(struct nfs4_client *clp) +void nfsd4_create_clid_dir(struct nfs4_client *clp) { const struct cred *original_cred; char *dname = clp->cl_recdir; @@ -128,13 +127,13 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) dprintk("NFSD: nfsd4_create_clid_dir for \"%s\"\n", dname); if (clp->cl_firststate) - return 0; + return; clp->cl_firststate = 1; if (!rec_file) - return -ENOENT; + return; status = nfs4_save_creds(&original_cred); if (status < 0) - return status; + return; dir = rec_file->f_path.dentry; /* lock the parent */ @@ -172,7 +171,6 @@ out_unlock: " and is writeable", status, user_recovery_dirname); nfs4_reset_creds(original_cred); - return status; } typedef int (recdir_func)(struct dentry *, struct dentry *); diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 89c2cd84d79..ffb5df1db94 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -483,7 +483,7 @@ extern void nfsd4_shutdown_recdir(void); extern int nfs4_client_to_reclaim(const char *name); extern int nfs4_has_reclaimed_state(const char *name, bool use_exchange_id); extern void nfsd4_recdir_purge_old(void); -extern int nfsd4_create_clid_dir(struct nfs4_client *clp); +extern void nfsd4_create_clid_dir(struct nfs4_client *clp); extern void nfsd4_remove_clid_dir(struct nfs4_client *clp); extern void release_session_client(struct nfsd4_session *); extern __be32 nfs4_validate_stateid(struct nfs4_client *, stateid_t *); -- cgit v1.2.3 From 69f594a38967f4540ce7a29b3fd214e68a8330bd Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 12:25:15 -0500 Subject: ptrace: do not audit capability check when outputing /proc/pid/stat Reading /proc/pid/stat of another process checks if one has ptrace permissions on that process. 
If one does have permissions it outputs some data about the process which might have security and attack implications. If the current task does not have ptrace permissions the read still works, but those fields are filled with inocuous (0) values. Since this check and a subsequent denial is not a violation of the security policy we should not audit such denials. This can be quite useful to removing ptrace broadly across a system without flooding the logs when ps is run or something which harmlessly walks proc. Signed-off-by: Eric Paris Acked-by: Serge E. Hallyn --- fs/proc/array.c | 2 +- include/linux/ptrace.h | 5 +++-- kernel/ptrace.c | 12 ++++++++++-- security/selinux/hooks.c | 2 +- 4 files changed, 15 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/proc/array.c b/fs/proc/array.c index 3a1dafd228d..ddffd7a88b9 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -380,7 +380,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, state = *get_task_state(task); vsize = eip = esp = 0; - permitted = ptrace_may_access(task, PTRACE_MODE_READ); + permitted = ptrace_may_access(task, PTRACE_MODE_READ | PTRACE_MODE_NOAUDIT); mm = get_task_mm(task); if (mm) { vsize = task_vsize(mm); diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 800f113bea6..a27e56ca41a 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -127,8 +127,9 @@ extern void __ptrace_link(struct task_struct *child, struct task_struct *new_parent); extern void __ptrace_unlink(struct task_struct *child); extern void exit_ptrace(struct task_struct *tracer); -#define PTRACE_MODE_READ 1 -#define PTRACE_MODE_ATTACH 2 +#define PTRACE_MODE_READ 0x01 +#define PTRACE_MODE_ATTACH 0x02 +#define PTRACE_MODE_NOAUDIT 0x04 /* Returns 0 on success, -errno on denial. */ extern int __ptrace_may_access(struct task_struct *task, unsigned int mode); /* Returns true on success, false on denial. 
*/ diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 210bbf045ee..c890ac9a796 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -161,6 +161,14 @@ int ptrace_check_attach(struct task_struct *child, bool ignore_state) return ret; } +static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode) +{ + if (mode & PTRACE_MODE_NOAUDIT) + return has_ns_capability_noaudit(current, ns, CAP_SYS_PTRACE); + else + return has_ns_capability(current, ns, CAP_SYS_PTRACE); +} + int __ptrace_may_access(struct task_struct *task, unsigned int mode) { const struct cred *cred = current_cred(), *tcred; @@ -187,7 +195,7 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode) cred->gid == tcred->sgid && cred->gid == tcred->gid)) goto ok; - if (ns_capable(tcred->user->user_ns, CAP_SYS_PTRACE)) + if (ptrace_has_cap(tcred->user->user_ns, mode)) goto ok; rcu_read_unlock(); return -EPERM; @@ -196,7 +204,7 @@ ok: smp_rmb(); if (task->mm) dumpable = get_dumpable(task->mm); - if (!dumpable && !ns_capable(task_user_ns(task), CAP_SYS_PTRACE)) + if (!dumpable && !ptrace_has_cap(task_user_ns(task), mode)) return -EPERM; return security_ptrace_access_check(task, mode); diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index c9605c4a2e0..14f94cd29c8 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -1809,7 +1809,7 @@ static int selinux_ptrace_access_check(struct task_struct *child, if (rc) return rc; - if (mode == PTRACE_MODE_READ) { + if (mode & PTRACE_MODE_READ) { u32 sid = current_sid(); u32 csid = task_sid(child); return avc_has_perm(sid, csid, SECCLASS_FILE, FILE__READ, NULL); -- cgit v1.2.3 From 5c0b4129c07b902b27d3f3ebc087757f534a3abd Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Fri, 6 Jan 2012 09:28:12 +0200 Subject: pnfs-obj: pNFS errors are communicated on iodata->pnfs_error Some time along the way pNFS IO errors were switched to communicate with a special iodata->pnfs_error member instead of the regular RPC members. But objlayout was not switched over. Fix that! Without this fix any IO error is hanged, because IO is not switched to MDS and pages are never cleared or read. [Applies to 3.2.0. Same bug different patch for 3.1/0 Kernels] CC: Stable Tree Signed-off-by: Boaz Harrosh Signed-off-by: Trond Myklebust --- fs/nfs/objlayout/objlayout.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c index 72074e3a04f..b3c29039f5b 100644 --- a/fs/nfs/objlayout/objlayout.c +++ b/fs/nfs/objlayout/objlayout.c @@ -254,6 +254,8 @@ objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync) oir->status = rdata->task.tk_status = status; if (status >= 0) rdata->res.count = status; + else + rdata->pnfs_error = status; objlayout_iodone(oir); /* must not use oir after this point */ @@ -334,6 +336,8 @@ objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync) if (status >= 0) { wdata->res.count = status; wdata->verf.committed = oir->committed; + } else { + wdata->pnfs_error = status; } objlayout_iodone(oir); /* must not use oir after this point */ -- cgit v1.2.3 From fe0fe83585f88346557868a803a479dfaaa0688a Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Fri, 6 Jan 2012 09:31:20 +0200 Subject: pnfs-obj: Must return layout on IO error As mandated by the standard. In case of an IO error, a pNFS objects layout driver must return it's layout. This is because all device errors are reported to the server as part of the layout return buffer. 
This is implemented the same way PNFS_LAYOUTRET_ON_SETATTR is done, through a bit flag on the pnfs_layoutdriver_type->flags member. The flag is set by the layout driver that wants a layout_return preformed at pnfs_ld_{write,read}_done in case of an error. (Though I have not defined a wrapper like pnfs_ld_layoutret_on_setattr because this code is never called outside of pnfs.c and pnfs IO paths) Without this patch 3.[0-2] Kernels leak memory and have an annoying WARN_ON after every IO error utilizing the pnfs-obj driver. [This patch is for 3.2 Kernel. 3.1/0 Kernels need a different patch] CC: Stable Tree Signed-off-by: Boaz Harrosh Signed-off-by: Trond Myklebust --- fs/nfs/objlayout/objio_osd.c | 3 ++- fs/nfs/pnfs.c | 12 ++++++++++++ fs/nfs/pnfs.h | 1 + 3 files changed, 15 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index c807ab93140..55d01280a60 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -551,7 +551,8 @@ static const struct nfs_pageio_ops objio_pg_write_ops = { static struct pnfs_layoutdriver_type objlayout_type = { .id = LAYOUT_OSD2_OBJECTS, .name = "LAYOUT_OSD2_OBJECTS", - .flags = PNFS_LAYOUTRET_ON_SETATTR, + .flags = PNFS_LAYOUTRET_ON_SETATTR | + PNFS_LAYOUTRET_ON_ERROR, .alloc_layout_hdr = objlayout_alloc_layout_hdr, .free_layout_hdr = objlayout_free_layout_hdr, diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 8e672a2b2d6..f881a638794 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1178,6 +1178,15 @@ void pnfs_ld_write_done(struct nfs_write_data *data) put_lseg(data->lseg); data->lseg = NULL; dprintk("pnfs write error = %d\n", data->pnfs_error); + if (NFS_SERVER(data->inode)->pnfs_curr_ld->flags & + PNFS_LAYOUTRET_ON_ERROR) { + /* Don't lo_commit on error, Server will needs to + * preform a file recovery. + */ + clear_bit(NFS_INO_LAYOUTCOMMIT, + &NFS_I(data->inode)->flags); + pnfs_return_layout(data->inode); + } } data->mds_ops->rpc_release(data); } @@ -1267,6 +1276,9 @@ static void pnfs_ld_handle_read_error(struct nfs_read_data *data) put_lseg(data->lseg); data->lseg = NULL; dprintk("pnfs write error = %d\n", data->pnfs_error); + if (NFS_SERVER(data->inode)->pnfs_curr_ld->flags & + PNFS_LAYOUTRET_ON_ERROR) + pnfs_return_layout(data->inode); nfs_pageio_init_read_mds(&pgio, data->inode); diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 1509530cb11..53d593a0a4f 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -68,6 +68,7 @@ enum { enum layoutdriver_policy_flags { /* Should the pNFS client commit and return the layout upon a setattr */ PNFS_LAYOUTRET_ON_SETATTR = 1 << 0, + PNFS_LAYOUTRET_ON_ERROR = 1 << 1, }; struct nfs4_deviceid_node; -- cgit v1.2.3 From e2fecb215b321db0e4a5b2597349a63c07bec42f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 6 Jan 2012 08:57:46 -0500 Subject: NFS: Remove pNFS bloat from the generic write path We have no business doing any this in the standard write release path. Get rid of it, and put it in the pNFS layer. Also, while we're at it, get rid of the completely bogus unlock/relock semantics that were present in nfs_writeback_release_full(). It is not only unnecessary, but actually dangerous to release the write lock just in order to take it again in nfs_page_async_flush(). Better just to open code the pgio operations in a pnfs helper. 
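The resend helper introduced below has a familiar shape: feed each request to the fallback path, collect the ones that could not be queued on a local list, and splice that list back so the caller still sees the failure. Here is a user-space paraphrase of that shape; struct req, try_queue() and the every-third-request failure rule are invented for illustration.

/* Collect-failures-and-splice-back idiom, reduced to a toy list. */
#include <stdio.h>

struct req {
        struct req *next;
        int id;
};

static int try_queue(struct req *r)
{
        return r->id % 3 != 0;          /* pretend every third request fails */
}

/* Returns 0 if everything was queued, -1 if *head now holds the failures. */
static int resend_all(struct req **head)
{
        struct req *failed = NULL, *r;

        while ((r = *head) != NULL) {
                *head = r->next;
                if (!try_queue(r)) {
                        r->next = failed;       /* keep it for the caller */
                        failed = r;
                }
        }
        *head = failed;
        return failed ? -1 : 0;
}

int main(void)
{
        struct req a = { NULL, 3 }, b = { &a, 2 }, c = { &b, 1 };
        struct req *head = &c;

        printf("resend_all = %d\n", resend_all(&head)); /* -1, request 3 left */
        return 0;
}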
Signed-off-by: Trond Myklebust --- fs/nfs/internal.h | 2 ++ fs/nfs/pnfs.c | 30 ++++++++++++++++++++++++++++-- fs/nfs/write.c | 27 ++------------------------- 3 files changed, 32 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 3f4d95751d5..5ee92538b06 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -307,6 +307,8 @@ extern void nfs_readdata_release(struct nfs_read_data *rdata); /* write.c */ extern int nfs_generic_flush(struct nfs_pageio_descriptor *desc, struct list_head *head); +extern void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio, + struct inode *inode, int ioflags); extern void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio); extern void nfs_writedata_release(struct nfs_write_data *wdata); extern void nfs_commit_free(struct nfs_write_data *p); diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index f881a638794..17149a49006 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1166,6 +1166,33 @@ pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, } EXPORT_SYMBOL_GPL(pnfs_generic_pg_test); +static int pnfs_write_done_resend_to_mds(struct inode *inode, struct list_head *head) +{ + struct nfs_pageio_descriptor pgio; + LIST_HEAD(failed); + + /* Resend all requests through the MDS */ + nfs_pageio_init_write_mds(&pgio, inode, FLUSH_STABLE); + while (!list_empty(head)) { + struct nfs_page *req = nfs_list_entry(head->next); + + nfs_list_remove_request(req); + if (!nfs_pageio_add_request(&pgio, req)) + nfs_list_add_request(req, &failed); + } + nfs_pageio_complete(&pgio); + + if (!list_empty(&failed)) { + /* For some reason our attempt to resend pages. Mark the + * overall send request as having failed, and let + * nfs_writeback_release_full deal with the error. + */ + list_move(&failed, head); + return -EIO; + } + return 0; +} + /* * Called by non rpc-based layout drivers */ @@ -1175,8 +1202,6 @@ void pnfs_ld_write_done(struct nfs_write_data *data) pnfs_set_layoutcommit(data); data->mds_ops->rpc_call_done(&data->task, data); } else { - put_lseg(data->lseg); - data->lseg = NULL; dprintk("pnfs write error = %d\n", data->pnfs_error); if (NFS_SERVER(data->inode)->pnfs_curr_ld->flags & PNFS_LAYOUTRET_ON_ERROR) { @@ -1187,6 +1212,7 @@ void pnfs_ld_write_done(struct nfs_write_data *data) &NFS_I(data->inode)->flags); pnfs_return_layout(data->inode); } + data->task.tk_status = pnfs_write_done_resend_to_mds(data->inode, &data->pages); } data->mds_ops->rpc_release(data); } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 1dda78db6a7..0c3885255f9 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1052,7 +1052,7 @@ static const struct nfs_pageio_ops nfs_pageio_write_ops = { .pg_doio = nfs_generic_pg_writepages, }; -static void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio, +void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio, struct inode *inode, int ioflags) { nfs_pageio_init(pgio, inode, &nfs_pageio_write_ops, @@ -1166,13 +1166,7 @@ static void nfs_writeback_done_full(struct rpc_task *task, void *calldata) static void nfs_writeback_release_full(void *calldata) { struct nfs_write_data *data = calldata; - int ret, status = data->task.tk_status; - struct nfs_pageio_descriptor pgio; - - if (data->pnfs_error) { - nfs_pageio_init_write_mds(&pgio, data->inode, FLUSH_STABLE); - pgio.pg_recoalesce = 1; - } + int status = data->task.tk_status; /* Update attributes as result of writeback. 
*/ while (!list_empty(&data->pages)) { @@ -1188,11 +1182,6 @@ static void nfs_writeback_release_full(void *calldata) req->wb_bytes, (long long)req_offset(req)); - if (data->pnfs_error) { - dprintk(", pnfs error = %d\n", data->pnfs_error); - goto next; - } - if (status < 0) { nfs_set_pageerror(page); nfs_context_set_write_error(req->wb_context, status); @@ -1212,19 +1201,7 @@ remove_request: next: nfs_clear_page_tag_locked(req); nfs_end_page_writeback(page); - if (data->pnfs_error) { - lock_page(page); - nfs_pageio_cond_complete(&pgio, page->index); - ret = nfs_page_async_flush(&pgio, page, 0); - if (ret) { - nfs_set_pageerror(page); - dprintk("rewrite to MDS error = %d\n", ret); - } - unlock_page(page); - } } - if (data->pnfs_error) - nfs_pageio_complete(&pgio); nfs_writedata_release(calldata); } -- cgit v1.2.3 From 831c2dc5f47c1dc79c32229d75065ada1dcc66e1 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Tue, 29 Nov 2011 15:35:53 -0800 Subject: ore: FIX breakage when MISC_FILESYSTEMS is not set As Reported by Randy Dunlap When MISC_FILESYSTEMS is not enabled and NFS4.1 is: fs/built-in.o: In function `objio_alloc_io_state': objio_osd.c:(.text+0xcb525): undefined reference to `ore_get_rw_state' fs/built-in.o: In function `_write_done': objio_osd.c:(.text+0xcb58d): undefined reference to `ore_check_io' fs/built-in.o: In function `_read_done': ... When MISC_FILESYSTEMS, which is more of a GUI thing then anything else, is not selected. exofs/Kconfig is never examined during Kconfig, and it can not do it's magic stuff to automatically select everything needed. We must split exofs/Kconfig in two. The ore one is always included. And the exofs one is left in it's old place in the menu. [Needed for the 3.2.0 Kernel] CC: Stable Tree Reported-by: Randy Dunlap Signed-off-by: Boaz Harrosh --- fs/Kconfig | 2 ++ fs/exofs/Kconfig | 11 ----------- fs/exofs/Kconfig.ore | 12 ++++++++++++ 3 files changed, 14 insertions(+), 11 deletions(-) create mode 100644 fs/exofs/Kconfig.ore (limited to 'fs') diff --git a/fs/Kconfig b/fs/Kconfig index 5f4c45d4aa1..6ad58a59cf5 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -218,6 +218,8 @@ source "fs/exofs/Kconfig" endif # MISC_FILESYSTEMS +source "fs/exofs/Kconfig.ore" + menuconfig NETWORK_FILESYSTEMS bool "Network File Systems" default y diff --git a/fs/exofs/Kconfig b/fs/exofs/Kconfig index da42f32c49b..86194b2f799 100644 --- a/fs/exofs/Kconfig +++ b/fs/exofs/Kconfig @@ -1,14 +1,3 @@ -# Note ORE needs to "select ASYNC_XOR". So Not to force multiple selects -# for every ORE user we do it like this. Any user should add itself here -# at the "depends on EXOFS_FS || ..." with an ||. The dependencies are -# selected here, and we default to "ON". So in effect it is like been -# selected by any of the users. -config ORE - tristate - depends on EXOFS_FS || PNFS_OBJLAYOUT - select ASYNC_XOR - default SCSI_OSD_ULD - config EXOFS_FS tristate "exofs: OSD based file system support" depends on SCSI_OSD_ULD diff --git a/fs/exofs/Kconfig.ore b/fs/exofs/Kconfig.ore new file mode 100644 index 00000000000..1ca7fb7b6ba --- /dev/null +++ b/fs/exofs/Kconfig.ore @@ -0,0 +1,12 @@ +# ORE - Objects Raid Engine (libore.ko) +# +# Note ORE needs to "select ASYNC_XOR". So Not to force multiple selects +# for every ORE user we do it like this. Any user should add itself here +# at the "depends on EXOFS_FS || ..." with an ||. The dependencies are +# selected here, and we default to "ON". So in effect it is like been +# selected by any of the users. 
+config ORE + tristate + depends on EXOFS_FS || PNFS_OBJLAYOUT + select ASYNC_XOR + default SCSI_OSD_ULD -- cgit v1.2.3 From ffefb8eaa367e8a5c14f779233d9da1fbc23d164 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Tue, 27 Dec 2011 19:23:36 +0200 Subject: ore: Fix crash in case of an IO error. The users of ore_check_io() expect the reported device (In case of error) to be indexed relative to the passed-in ore_components table, and not the logical dev index. This causes a crash inside objlayoutdriver in case of an IO error. [Bug in 3.2.0 Kernel] CC: Stable Tree Signed-off-by: Boaz Harrosh --- fs/exofs/ore.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index d271ad83720..894f3e192e6 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -445,10 +445,10 @@ int ore_check_io(struct ore_io_state *ios, ore_on_dev_error on_dev_error) u64 residual = ios->reading ? or->in.residual : or->out.residual; u64 offset = (ios->offset + ios->length) - residual; - struct ore_dev *od = ios->oc->ods[ - per_dev->dev - ios->oc->first_dev]; + unsigned dev = per_dev->dev - ios->oc->first_dev; + struct ore_dev *od = ios->oc->ods[dev]; - on_dev_error(ios, od, per_dev->dev, osi.osd_err_pri, + on_dev_error(ios, od, dev, osi.osd_err_pri, offset, residual); } if (osi.osd_err_pri >= acumulated_osd_err) { -- cgit v1.2.3 From 361aba569f55dd159b850489a3538253afbb3973 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Wed, 28 Dec 2011 19:14:23 +0200 Subject: ore: fix BUG_ON, too few sgs when reading When reading RAID5 files, in rare cases, we calculated too few sg segments. There should be two extra for the beginning and end partial units. Also "too few sg segments" should not be a BUG_ON there is all the mechanics in place to handle it, as a short read. So just return -ENOMEM and the rest of the code will gracefully split the IO. [Bug in 3.2.0 Kernel] CC: Stable Tree Signed-off-by: Boaz Harrosh --- fs/exofs/ore.c | 2 +- fs/exofs/ore_raid.c | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index 894f3e192e6..49cf230554a 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -266,7 +266,7 @@ int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc, /* first/last seg is split */ num_raid_units += layout->group_width; - sgs_per_dev = div_u64(num_raid_units, data_devs); + sgs_per_dev = div_u64(num_raid_units, data_devs) + 2; } else { /* For Writes add parity pages array. */ max_par_pages = num_raid_units * pages_in_unit * diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c index 29c47e5c4a8..414a2dfd950 100644 --- a/fs/exofs/ore_raid.c +++ b/fs/exofs/ore_raid.c @@ -551,7 +551,11 @@ int _ore_add_parity_unit(struct ore_io_state *ios, unsigned cur_len) { if (ios->reading) { - BUG_ON(per_dev->cur_sg >= ios->sgs_per_dev); + if (per_dev->cur_sg >= ios->sgs_per_dev) { + ORE_DBGMSG("cur_sg(%d) >= sgs_per_dev(%d)\n" , + per_dev->cur_sg, ios->sgs_per_dev); + return -ENOMEM; + } _ore_add_sg_seg(per_dev, cur_len, true); } else { struct __stripe_pages_2d *sp2d = ios->sp2d; -- cgit v1.2.3 From f766619db2be059cd0dbba8a36176fe01a29d588 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Sun, 18 Dec 2011 23:03:13 +0530 Subject: fs/9p: iattr_valid flags are kernel internal flags map them to 9p values. Kernel internal values can change, add protocol values for these constant and use them. 
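The change below introduces a small translation table so that kernel-internal ATTR_* bit values never leak onto the wire: internal constants may be renumbered at will, protocol bits may not. The same table-driven idiom, reduced to a stand-alone example; the LOCAL_* and WIRE_* constants are made up.

/* Table-driven flag translation; only the idiom matches the patch below. */
#include <stdio.h>

#define LOCAL_MODE (1 << 0)             /* internal values, free to change */
#define LOCAL_UID  (1 << 1)
#define LOCAL_SIZE (1 << 5)

#define WIRE_MODE  (1 << 0)             /* protocol values, fixed forever */
#define WIRE_UID   (1 << 1)
#define WIRE_SIZE  (1 << 3)

static const struct {
        int local;
        int wire;
} flag_map[] = {
        { LOCAL_MODE, WIRE_MODE },
        { LOCAL_UID,  WIRE_UID  },
        { LOCAL_SIZE, WIRE_SIZE },
};

static int map_to_wire(int local)
{
        unsigned int i;
        int wire = 0;

        for (i = 0; i < sizeof(flag_map) / sizeof(flag_map[0]); i++)
                if (local & flag_map[i].local)
                        wire |= flag_map[i].wire;
        return wire;
}

int main(void)
{
        printf("0x%x\n", map_to_wire(LOCAL_MODE | LOCAL_SIZE));        /* 0x9 */
        return 0;
}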
Signed-off-by: Aneesh Kumar K.V Signed-off-by: Eric Van Hensbergen --- fs/9p/vfs_inode_dotl.c | 42 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 73488fb69d3..7ca7171273d 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -522,6 +522,46 @@ v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry, return 0; } +/* + * Attribute flags. + */ +#define P9_ATTR_MODE (1 << 0) +#define P9_ATTR_UID (1 << 1) +#define P9_ATTR_GID (1 << 2) +#define P9_ATTR_SIZE (1 << 3) +#define P9_ATTR_ATIME (1 << 4) +#define P9_ATTR_MTIME (1 << 5) +#define P9_ATTR_CTIME (1 << 6) +#define P9_ATTR_ATIME_SET (1 << 7) +#define P9_ATTR_MTIME_SET (1 << 8) + +struct dotl_iattr_map { + int iattr_valid; + int p9_iattr_valid; +}; + +static int v9fs_mapped_iattr_valid(int iattr_valid) +{ + int i; + int p9_iattr_valid = 0; + struct dotl_iattr_map dotl_iattr_map[] = { + { ATTR_MODE, P9_ATTR_MODE }, + { ATTR_UID, P9_ATTR_UID }, + { ATTR_GID, P9_ATTR_GID }, + { ATTR_SIZE, P9_ATTR_SIZE }, + { ATTR_ATIME, P9_ATTR_ATIME }, + { ATTR_MTIME, P9_ATTR_MTIME }, + { ATTR_CTIME, P9_ATTR_CTIME }, + { ATTR_ATIME_SET, P9_ATTR_ATIME_SET }, + { ATTR_MTIME_SET, P9_ATTR_MTIME_SET }, + }; + for (i = 0; i < ARRAY_SIZE(dotl_iattr_map); i++) { + if (iattr_valid & dotl_iattr_map[i].iattr_valid) + p9_iattr_valid |= dotl_iattr_map[i].p9_iattr_valid; + } + return p9_iattr_valid; +} + /** * v9fs_vfs_setattr_dotl - set file metadata * @dentry: file whose metadata to set @@ -542,7 +582,7 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr) if (retval) return retval; - p9attr.valid = iattr->ia_valid; + p9attr.valid = v9fs_mapped_iattr_valid(iattr->ia_valid); p9attr.mode = iattr->ia_mode; p9attr.uid = iattr->ia_uid; p9attr.gid = iattr->ia_gid; -- cgit v1.2.3 From 203bf287cb01a5dc26c20bd3737cecf3aeba1d48 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 6 Jan 2012 15:23:57 -0500 Subject: Btrfs: run chunk allocations while we do delayed refs Btrfs tries to batch extent allocation tree changes to improve performance and reduce metadata trashing. But it doesn't allocate new metadata chunks while it is doing allocations for the extent allocation tree. This commit changes the delayed refence code to do chunk allocations if we're getting low on room. It prevents crashes and improves performance. 
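The fix below is more a policy change than a new algorithm: top up the metadata chunk pool before and during the delayed-ref batch instead of letting an allocation fail half way through it. A toy model of that allocate-ahead loop follows; the pool size, refill amount and low-water mark are invented numbers.

/* Toy model of allocate-ahead with a "no force" refill. */
#include <stdio.h>

static unsigned long free_units = 4;

static void topup_no_force(void)
{
        if (free_units >= 8)            /* "no force": refill only when low */
                return;
        free_units += 16;
        printf("topped up, free = %lu\n", free_units);
}

int main(void)
{
        int i;

        topup_no_force();               /* up front, before the batch */
        for (i = 0; i < 40; i++) {
                free_units--;           /* each delayed ref consumes space */
                topup_no_force();       /* and may trigger another refill */
        }
        printf("done, free = %lu\n", free_units);
        return 0;
}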
Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 14 ++++++++++---- fs/btrfs/transaction.c | 9 +-------- 2 files changed, 11 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f5fbe576d2b..71549d11a09 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2267,9 +2267,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, BUG_ON(ret); kfree(extent_op); - cond_resched(); - spin_lock(&delayed_refs->lock); - continue; + goto next; } list_del_init(&locked_ref->cluster); @@ -2289,7 +2287,11 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, btrfs_put_delayed_ref(ref); kfree(extent_op); count++; - +next: + do_chunk_alloc(trans, root->fs_info->extent_root, + 2 * 1024 * 1024, + btrfs_get_alloc_profile(root, 0), + CHUNK_ALLOC_NO_FORCE); cond_resched(); spin_lock(&delayed_refs->lock); } @@ -2317,6 +2319,10 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, if (root == root->fs_info->extent_root) root = root->fs_info->tree_root; + do_chunk_alloc(trans, root->fs_info->extent_root, + 2 * 1024 * 1024, btrfs_get_alloc_profile(root, 0), + CHUNK_ALLOC_NO_FORCE); + delayed_refs = &trans->transaction->delayed_refs; INIT_LIST_HEAD(&cluster); again: diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 81376d94cd3..360c2dfd1ee 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -467,19 +467,12 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, btrfs_trans_release_metadata(trans, root); trans->block_rsv = NULL; - while (count < 4) { + while (count < 2) { unsigned long cur = trans->delayed_ref_updates; trans->delayed_ref_updates = 0; if (cur && trans->transaction->delayed_refs.num_heads_ready > 64) { trans->delayed_ref_updates = 0; - - /* - * do a full flush if the transaction is trying - * to close - */ - if (trans->transaction->delayed_refs.flushing) - cur = 0; btrfs_run_delayed_refs(trans, root, cur); } else { break; -- cgit v1.2.3 From cf1d72c9ceec391d34c48724da57282e97f01122 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 6 Jan 2012 15:41:34 -0500 Subject: Btrfs: lower the bar for chunk allocation The chunk allocation code has tried to keep a pretty tight lid on creating new metadata chunks. This is partially because in the past the reservation code didn't give us an accurate idea of how much space was being used. The new code is much more accurate, so we're able to get rid of some of these checks. Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 21 +++------------------ 1 file changed, 3 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 71549d11a09..247d2c94f8e 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3263,27 +3263,12 @@ static int should_alloc_chunk(struct btrfs_root *root, if (num_bytes - num_allocated < thresh) return 1; } - - /* - * we have two similar checks here, one based on percentage - * and once based on a hard number of 256MB. The idea - * is that if we have a good amount of free - * room, don't allocate a chunk. 
A good mount is - * less than 80% utilized of the chunks we have allocated, - * or more than 256MB free - */ - if (num_allocated + alloc_bytes + 256 * 1024 * 1024 < num_bytes) - return 0; - - if (num_allocated + alloc_bytes < div_factor(num_bytes, 8)) - return 0; - thresh = btrfs_super_total_bytes(root->fs_info->super_copy); - /* 256MB or 5% of the FS */ - thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5)); + /* 256MB or 2% of the FS */ + thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 2)); - if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 3)) + if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 8)) return 0; return 1; } -- cgit v1.2.3 From 1100373f8aa69e377386499350496e3d8565605f Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 6 Jan 2012 15:47:38 -0500 Subject: Btrfs: use bigger metadata chunks on bigger filesystems The 256MB chunk is a little small on a huge FS. This scales up the chunk size. Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f4b839fd3c9..ac00e3aa80a 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2441,7 +2441,11 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, max_stripe_size = 1024 * 1024 * 1024; max_chunk_size = 10 * max_stripe_size; } else if (type & BTRFS_BLOCK_GROUP_METADATA) { - max_stripe_size = 256 * 1024 * 1024; + /* for larger filesystems, use larger metadata chunks */ + if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024) + max_stripe_size = 1024 * 1024 * 1024; + else + max_stripe_size = 256 * 1024 * 1024; max_chunk_size = max_stripe_size; } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { max_stripe_size = 8 * 1024 * 1024; -- cgit v1.2.3 From a5f6f719a5cd7caeee8ed8137cf3f94c3bbebc65 Mon Sep 17 00:00:00 2001 From: Alexandre Oliva Date: Mon, 12 Dec 2011 04:48:19 -0200 Subject: Btrfs: test free space only for unclustered allocation Since the clustered allocation may be taking extents from a different block group, there's no point in spin-locking and testing the current block group free space before attempting to allocate space from a cluster, even more so when we might refrain from even trying the cluster in the current block group because, after the cluster was set up, not enough free space remained. Furthermore, cluster creation attempts fail fast when the block group doesn't have enough free space, so the test was completely superfluous. I've move the free space test past the cluster allocation attempt, where it is more useful, and arranged for a cluster in the current block group to be released before trying an unclustered allocation, when we reach the LOOP_NO_EMPTY_SIZE stage, so that the free space in the cluster stands a chance of being combined with additional free space in the block group so as to succeed in the allocation attempt. 
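To put numbers on the relaxed should_alloc_chunk() test in the hunk above (256MB or 2% of the filesystem, plus the 80% usage check): on a hypothetical 1 TiB filesystem the threshold works out to roughly 20.5 GiB, so a 30 GiB metadata pool only gets another chunk once more than about 24 GiB of it is in use. The same arithmetic as a small program; the 1 TiB and 30 GiB figures are invented.

/* Back-of-the-envelope version of the relaxed test. */
#include <stdio.h>

int main(void)
{
        unsigned long long fs_bytes = 1024ULL << 30;    /* 1 TiB filesystem */
        unsigned long long pool = 30ULL << 30;          /* metadata space so far */
        unsigned long long thresh = fs_bytes * 2 / 100; /* div_factor_fine(x, 2) */

        if (thresh < 256ULL << 20)
                thresh = 256ULL << 20;                  /* 256MB floor */
        printf("threshold   = %llu MiB\n", thresh >> 20);
        printf("80%% of pool = %llu MiB\n", pool * 8 / 10 >> 20);
        /* A new chunk is considered only once usage crosses that 80% line. */
        return 0;
}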
Signed-off-by: Alexandre Oliva Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 247d2c94f8e..5ea3acc5324 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5286,15 +5286,6 @@ alloc: if (unlikely(block_group->ro)) goto loop; - spin_lock(&block_group->free_space_ctl->tree_lock); - if (cached && - block_group->free_space_ctl->free_space < - num_bytes + empty_cluster + empty_size) { - spin_unlock(&block_group->free_space_ctl->tree_lock); - goto loop; - } - spin_unlock(&block_group->free_space_ctl->tree_lock); - /* * Ok we want to try and use the cluster allocator, so * lets look there @@ -5340,8 +5331,15 @@ refill_cluster: * plenty of times and not have found * anything, so we are likely way too * fragmented for the clustering stuff to find - * anything. */ - if (loop >= LOOP_NO_EMPTY_SIZE) { + * anything. + * + * However, if the cluster is taken from the + * current block group, release the cluster + * first, so that we stand a better chance of + * succeeding in the unclustered + * allocation. */ + if (loop >= LOOP_NO_EMPTY_SIZE && + last_ptr->block_group != block_group) { spin_unlock(&last_ptr->refill_lock); goto unclustered_alloc; } @@ -5352,6 +5350,11 @@ refill_cluster: */ btrfs_return_cluster_to_free_space(NULL, last_ptr); + if (loop >= LOOP_NO_EMPTY_SIZE) { + spin_unlock(&last_ptr->refill_lock); + goto unclustered_alloc; + } + /* allocate a cluster in this block group */ ret = btrfs_find_space_cluster(trans, root, block_group, last_ptr, @@ -5392,6 +5395,15 @@ refill_cluster: } unclustered_alloc: + spin_lock(&block_group->free_space_ctl->tree_lock); + if (cached && + block_group->free_space_ctl->free_space < + num_bytes + empty_cluster + empty_size) { + spin_unlock(&block_group->free_space_ctl->tree_lock); + goto loop; + } + spin_unlock(&block_group->free_space_ctl->tree_lock); + offset = btrfs_find_space_for_alloc(block_group, search_start, num_bytes, empty_size); /* -- cgit v1.2.3 From 6926afd1925a54a13684ebe05987868890665e2b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 7 Jan 2012 13:22:46 -0500 Subject: NFSv4: Save the owner/group name string when doing open ...so that we can do the uid/gid mapping outside the asynchronous RPC context. This fixes a bug in the current NFSv4 atomic open code where the client isn't able to determine what the true uid/gid fields of the file are, (because the asynchronous nature of the OPEN call denies it the ability to do an upcall) and so fills them with default values, marking the inode as needing revalidation. Unfortunately, in some cases, the VFS will do some additional sanity checks on the file, and may override the server's decision to allow the open because it sees the wrong owner/group fields. 
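The two-phase approach described above has a direct user-space analogue: the decode step must not block, so it only copies the owner string, and the numeric mapping happens later, in a context where a blocking lookup (getpwnam(3) here, standing in for the idmapper upcall) is allowed. The struct and function names below are invented for illustration.

/* Deferred name-to-id mapping: copy now, resolve later. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <pwd.h>

struct fattr {
        char *owner_name;               /* filled by the non-blocking phase */
        uid_t uid;
        int uid_valid;
};

static void decode_owner(struct fattr *f, const char *wire_name)
{
        f->owner_name = strdup(wire_name);      /* no lookups allowed here */
        f->uid_valid = 0;
}

static void map_and_free_owner(struct fattr *f)
{
        struct passwd *pw;

        pw = f->owner_name ? getpwnam(f->owner_name) : NULL;    /* may block */
        if (pw) {
                f->uid = pw->pw_uid;
                f->uid_valid = 1;
        }
        free(f->owner_name);
        f->owner_name = NULL;
}

int main(void)
{
        struct fattr f;

        decode_owner(&f, "root");
        map_and_free_owner(&f);
        if (f.uid_valid)
                printf("uid = %u\n", (unsigned)f.uid);
        return 0;
}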
Signed-off-by: Trond Myklebust --- fs/nfs/idmap.c | 83 ++++++++++++++++++++++++++++++++++++ fs/nfs/inode.c | 2 + fs/nfs/nfs4proc.c | 10 +++++ fs/nfs/nfs4xdr.c | 106 ++++++++++++++++++++-------------------------- include/linux/nfs_idmap.h | 8 ++++ include/linux/nfs_xdr.h | 17 +++++--- 6 files changed, 162 insertions(+), 64 deletions(-) (limited to 'fs') diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index 47d1c6ff2d8..2c05f1991e1 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -38,6 +38,89 @@ #include #include #include +#include + +/** + * nfs_fattr_init_names - initialise the nfs_fattr owner_name/group_name fields + * @fattr: fully initialised struct nfs_fattr + * @owner_name: owner name string cache + * @group_name: group name string cache + */ +void nfs_fattr_init_names(struct nfs_fattr *fattr, + struct nfs4_string *owner_name, + struct nfs4_string *group_name) +{ + fattr->owner_name = owner_name; + fattr->group_name = group_name; +} + +static void nfs_fattr_free_owner_name(struct nfs_fattr *fattr) +{ + fattr->valid &= ~NFS_ATTR_FATTR_OWNER_NAME; + kfree(fattr->owner_name->data); +} + +static void nfs_fattr_free_group_name(struct nfs_fattr *fattr) +{ + fattr->valid &= ~NFS_ATTR_FATTR_GROUP_NAME; + kfree(fattr->group_name->data); +} + +static bool nfs_fattr_map_owner_name(struct nfs_server *server, struct nfs_fattr *fattr) +{ + struct nfs4_string *owner = fattr->owner_name; + __u32 uid; + + if (!(fattr->valid & NFS_ATTR_FATTR_OWNER_NAME)) + return false; + if (nfs_map_name_to_uid(server, owner->data, owner->len, &uid) == 0) { + fattr->uid = uid; + fattr->valid |= NFS_ATTR_FATTR_OWNER; + } + return true; +} + +static bool nfs_fattr_map_group_name(struct nfs_server *server, struct nfs_fattr *fattr) +{ + struct nfs4_string *group = fattr->group_name; + __u32 gid; + + if (!(fattr->valid & NFS_ATTR_FATTR_GROUP_NAME)) + return false; + if (nfs_map_group_to_gid(server, group->data, group->len, &gid) == 0) { + fattr->gid = gid; + fattr->valid |= NFS_ATTR_FATTR_GROUP; + } + return true; +} + +/** + * nfs_fattr_free_names - free up the NFSv4 owner and group strings + * @fattr: a fully initialised nfs_fattr structure + */ +void nfs_fattr_free_names(struct nfs_fattr *fattr) +{ + if (fattr->valid & NFS_ATTR_FATTR_OWNER_NAME) + nfs_fattr_free_owner_name(fattr); + if (fattr->valid & NFS_ATTR_FATTR_GROUP_NAME) + nfs_fattr_free_group_name(fattr); +} + +/** + * nfs_fattr_map_and_free_names - map owner/group strings into uid/gid and free + * @server: pointer to the filesystem nfs_server structure + * @fattr: a fully initialised nfs_fattr structure + * + * This helper maps the cached NFSv4 owner/group strings in fattr into + * their numeric uid/gid equivalents, and then frees the cached strings. 
+ */ +void nfs_fattr_map_and_free_names(struct nfs_server *server, struct nfs_fattr *fattr) +{ + if (nfs_fattr_map_owner_name(server, fattr)) + nfs_fattr_free_owner_name(fattr); + if (nfs_fattr_map_group_name(server, fattr)) + nfs_fattr_free_group_name(fattr); +} static int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *res) { diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 50a15fa8cf9..f59cab12a8e 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1019,6 +1019,8 @@ void nfs_fattr_init(struct nfs_fattr *fattr) fattr->valid = 0; fattr->time_start = jiffies; fattr->gencount = nfs_inc_attr_generation_counter(); + fattr->owner_name = NULL; + fattr->group_name = NULL; } struct nfs_fattr *nfs_alloc_fattr(void) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 3b108011845..df3d3068242 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -52,6 +52,7 @@ #include #include #include +#include #include #include #include @@ -760,6 +761,8 @@ struct nfs4_opendata { struct nfs_openres o_res; struct nfs_open_confirmargs c_arg; struct nfs_open_confirmres c_res; + struct nfs4_string owner_name; + struct nfs4_string group_name; struct nfs_fattr f_attr; struct nfs_fattr dir_attr; struct dentry *dir; @@ -783,6 +786,7 @@ static void nfs4_init_opendata_res(struct nfs4_opendata *p) p->o_res.server = p->o_arg.server; nfs_fattr_init(&p->f_attr); nfs_fattr_init(&p->dir_attr); + nfs_fattr_init_names(&p->f_attr, &p->owner_name, &p->group_name); } static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, @@ -814,6 +818,7 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, p->o_arg.name = &dentry->d_name; p->o_arg.server = server; p->o_arg.bitmask = server->attr_bitmask; + p->o_arg.dir_bitmask = server->cache_consistency_bitmask; p->o_arg.claim = NFS4_OPEN_CLAIM_NULL; if (flags & O_CREAT) { u32 *s; @@ -850,6 +855,7 @@ static void nfs4_opendata_free(struct kref *kref) dput(p->dir); dput(p->dentry); nfs_sb_deactive(sb); + nfs_fattr_free_names(&p->f_attr); kfree(p); } @@ -1574,6 +1580,8 @@ static int _nfs4_recover_proc_open(struct nfs4_opendata *data) if (status != 0 || !data->rpc_done) return status; + nfs_fattr_map_and_free_names(NFS_SERVER(dir), &data->f_attr); + nfs_refresh_inode(dir, o_res->dir_attr); if (o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) { @@ -1606,6 +1614,8 @@ static int _nfs4_proc_open(struct nfs4_opendata *data) return status; } + nfs_fattr_map_and_free_names(server, &data->f_attr); + if (o_arg->open_flags & O_CREAT) { update_changeattr(dir, &o_res->cinfo); nfs_post_op_update_inode(dir, o_res->dir_attr); diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index dcaf69309d8..95e92e43840 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -2298,7 +2298,7 @@ static void nfs4_xdr_enc_open(struct rpc_rqst *req, struct xdr_stream *xdr, encode_getfh(xdr, &hdr); encode_getfattr(xdr, args->bitmask, &hdr); encode_restorefh(xdr, &hdr); - encode_getfattr(xdr, args->bitmask, &hdr); + encode_getfattr(xdr, args->dir_bitmask, &hdr); encode_nops(&hdr); } @@ -3792,7 +3792,8 @@ out_overflow: } static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, - const struct nfs_server *server, uint32_t *uid, int may_sleep) + const struct nfs_server *server, uint32_t *uid, + struct nfs4_string *owner_name) { uint32_t len; __be32 *p; @@ -3809,8 +3810,12 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, p = xdr_inline_decode(xdr, len); if (unlikely(!p)) goto out_overflow; - if (!may_sleep) { - /* do nothing */ + if (owner_name 
!= NULL) { + owner_name->data = kmemdup(p, len, GFP_NOWAIT); + if (owner_name->data != NULL) { + owner_name->len = len; + ret = NFS_ATTR_FATTR_OWNER_NAME; + } } else if (len < XDR_MAX_NETOBJ) { if (nfs_map_name_to_uid(server, (char *)p, len, uid) == 0) ret = NFS_ATTR_FATTR_OWNER; @@ -3830,7 +3835,8 @@ out_overflow: } static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, - const struct nfs_server *server, uint32_t *gid, int may_sleep) + const struct nfs_server *server, uint32_t *gid, + struct nfs4_string *group_name) { uint32_t len; __be32 *p; @@ -3847,8 +3853,12 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, p = xdr_inline_decode(xdr, len); if (unlikely(!p)) goto out_overflow; - if (!may_sleep) { - /* do nothing */ + if (group_name != NULL) { + group_name->data = kmemdup(p, len, GFP_NOWAIT); + if (group_name->data != NULL) { + group_name->len = len; + ret = NFS_ATTR_FATTR_GROUP_NAME; + } } else if (len < XDR_MAX_NETOBJ) { if (nfs_map_group_to_gid(server, (char *)p, len, gid) == 0) ret = NFS_ATTR_FATTR_GROUP; @@ -4285,7 +4295,7 @@ xdr_error: static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fattr *fattr, struct nfs_fh *fh, - const struct nfs_server *server, int may_sleep) + const struct nfs_server *server) { int status; umode_t fmode = 0; @@ -4352,12 +4362,12 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap, goto xdr_error; fattr->valid |= status; - status = decode_attr_owner(xdr, bitmap, server, &fattr->uid, may_sleep); + status = decode_attr_owner(xdr, bitmap, server, &fattr->uid, fattr->owner_name); if (status < 0) goto xdr_error; fattr->valid |= status; - status = decode_attr_group(xdr, bitmap, server, &fattr->gid, may_sleep); + status = decode_attr_group(xdr, bitmap, server, &fattr->gid, fattr->group_name); if (status < 0) goto xdr_error; fattr->valid |= status; @@ -4398,7 +4408,7 @@ xdr_error: } static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fattr, - struct nfs_fh *fh, const struct nfs_server *server, int may_sleep) + struct nfs_fh *fh, const struct nfs_server *server) { __be32 *savep; uint32_t attrlen, @@ -4417,7 +4427,7 @@ static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fat if (status < 0) goto xdr_error; - status = decode_getfattr_attrs(xdr, bitmap, fattr, fh, server, may_sleep); + status = decode_getfattr_attrs(xdr, bitmap, fattr, fh, server); if (status < 0) goto xdr_error; @@ -4428,9 +4438,9 @@ xdr_error: } static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, - const struct nfs_server *server, int may_sleep) + const struct nfs_server *server) { - return decode_getfattr_generic(xdr, fattr, NULL, server, may_sleep); + return decode_getfattr_generic(xdr, fattr, NULL, server); } /* @@ -5711,8 +5721,7 @@ static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp, status = decode_open_downgrade(xdr, res); if (status != 0) goto out; - decode_getfattr(xdr, res->fattr, res->server, - !RPC_IS_ASYNC(rqstp->rq_task)); + decode_getfattr(xdr, res->fattr, res->server); out: return status; } @@ -5738,8 +5747,7 @@ static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, struct xdr_stream *xdr, status = decode_access(xdr, res); if (status != 0) goto out; - decode_getfattr(xdr, res->fattr, res->server, - !RPC_IS_ASYNC(rqstp->rq_task)); + decode_getfattr(xdr, res->fattr, res->server); out: return status; } @@ -5768,8 +5776,7 @@ static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, struct xdr_stream *xdr, 
status = decode_getfh(xdr, res->fh); if (status) goto out; - status = decode_getfattr(xdr, res->fattr, res->server - ,!RPC_IS_ASYNC(rqstp->rq_task)); + status = decode_getfattr(xdr, res->fattr, res->server); out: return status; } @@ -5795,8 +5802,7 @@ static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp, goto out; status = decode_getfh(xdr, res->fh); if (status == 0) - status = decode_getfattr(xdr, res->fattr, res->server, - !RPC_IS_ASYNC(rqstp->rq_task)); + status = decode_getfattr(xdr, res->fattr, res->server); out: return status; } @@ -5822,8 +5828,7 @@ static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, struct xdr_stream *xdr, status = decode_remove(xdr, &res->cinfo); if (status) goto out; - decode_getfattr(xdr, res->dir_attr, res->server, - !RPC_IS_ASYNC(rqstp->rq_task)); + decode_getfattr(xdr, res->dir_attr, res->server); out: return status; } @@ -5856,14 +5861,12 @@ static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, struct xdr_stream *xdr, if (status) goto out; /* Current FH is target directory */ - if (decode_getfattr(xdr, res->new_fattr, res->server, - !RPC_IS_ASYNC(rqstp->rq_task)) != 0) + if (decode_getfattr(xdr, res->new_fattr, res->server)) goto out; status = decode_restorefh(xdr); if (status) goto out; - decode_getfattr(xdr, res->old_fattr, res->server, - !RPC_IS_ASYNC(rqstp->rq_task)); + decode_getfattr(xdr, res->old_fattr, res->server); out: return status; } @@ -5899,14 +5902,12 @@ static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, struct xdr_stream *xdr, * Note order: OP_LINK leaves the directory as the current * filehandle. */ - if (decode_getfattr(xdr, res->dir_attr, res->server, - !RPC_IS_ASYNC(rqstp->rq_task)) != 0) + if (decode_getfattr(xdr, res->dir_attr, res->server)) goto out; status = decode_restorefh(xdr); if (status) goto out; - decode_getfattr(xdr, res->fattr, res->server, - !RPC_IS_ASYNC(rqstp->rq_task)); + decode_getfattr(xdr, res->fattr, res->server); out: return status; } @@ -5938,14 +5939,12 @@ static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, struct xdr_stream *xdr, status = decode_getfh(xdr, res->fh); if (status) goto out; - if (decode_getfattr(xdr, res->fattr, res->server, - !RPC_IS_ASYNC(rqstp->rq_task)) != 0) + if (decode_getfattr(xdr, res->fattr, res->server)) goto out; status = decode_restorefh(xdr); if (status) goto out; - decode_getfattr(xdr, res->dir_fattr, res->server, - !RPC_IS_ASYNC(rqstp->rq_task)); + decode_getfattr(xdr, res->dir_fattr, res->server); out: return status; } @@ -5977,8 +5976,7 @@ static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, struct xdr_stream *xdr, status = decode_putfh(xdr); if (status) goto out; - status = decode_getfattr(xdr, res->fattr, res->server, - !RPC_IS_ASYNC(rqstp->rq_task)); + status = decode_getfattr(xdr, res->fattr, res->server); out: return status; } @@ -6076,8 +6074,7 @@ static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, struct xdr_stream *xdr, * an ESTALE error. Shouldn't be a problem, * though, since fattr->valid will remain unset. 
*/ - decode_getfattr(xdr, res->fattr, res->server, - !RPC_IS_ASYNC(rqstp->rq_task)); + decode_getfattr(xdr, res->fattr, res->server); out: return status; } @@ -6108,13 +6105,11 @@ static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, struct xdr_stream *xdr, goto out; if (decode_getfh(xdr, &res->fh) != 0) goto out; - if (decode_getfattr(xdr, res->f_attr, res->server, - !RPC_IS_ASYNC(rqstp->rq_task)) != 0) + if (decode_getfattr(xdr, res->f_attr, res->server) != 0) goto out; if (decode_restorefh(xdr) != 0) goto out; - decode_getfattr(xdr, res->dir_attr, res->server, - !RPC_IS_ASYNC(rqstp->rq_task)); + decode_getfattr(xdr, res->dir_attr, res->server); out: return status; } @@ -6162,8 +6157,7 @@ static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp, status = decode_open(xdr, res); if (status) goto out; - decode_getfattr(xdr, res->f_attr, res->server, - !RPC_IS_ASYNC(rqstp->rq_task)); + decode_getfattr(xdr, res->f_attr, res->server); out: return status; } @@ -6190,8 +6184,7 @@ static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp, status = decode_setattr(xdr); if (status) goto out; - decode_getfattr(xdr, res->fattr, res->server, - !RPC_IS_ASYNC(rqstp->rq_task)); + decode_getfattr(xdr, res->fattr, res->server); out: return status; } @@ -6371,8 +6364,7 @@ static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, struct xdr_stream *xdr, if (status) goto out; if (res->fattr) - decode_getfattr(xdr, res->fattr, res->server, - !RPC_IS_ASYNC(rqstp->rq_task)); + decode_getfattr(xdr, res->fattr, res->server); if (!status) status = res->count; out: @@ -6401,8 +6393,7 @@ static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, struct xdr_stream *xdr, if (status) goto out; if (res->fattr) - decode_getfattr(xdr, res->fattr, res->server, - !RPC_IS_ASYNC(rqstp->rq_task)); + decode_getfattr(xdr, res->fattr, res->server); out: return status; } @@ -6561,8 +6552,7 @@ static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, status = decode_delegreturn(xdr); if (status != 0) goto out; - decode_getfattr(xdr, res->fattr, res->server, - !RPC_IS_ASYNC(rqstp->rq_task)); + decode_getfattr(xdr, res->fattr, res->server); out: return status; } @@ -6591,8 +6581,7 @@ static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, goto out; xdr_enter_page(xdr, PAGE_SIZE); status = decode_getfattr(xdr, &res->fs_locations->fattr, - res->fs_locations->server, - !RPC_IS_ASYNC(req->rq_task)); + res->fs_locations->server); out: return status; } @@ -6841,8 +6830,7 @@ static int nfs4_xdr_dec_layoutcommit(struct rpc_rqst *rqstp, status = decode_layoutcommit(xdr, rqstp, res); if (status) goto out; - decode_getfattr(xdr, res->fattr, res->server, - !RPC_IS_ASYNC(rqstp->rq_task)); + decode_getfattr(xdr, res->fattr, res->server); out: return status; } @@ -6973,7 +6961,7 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, goto out_overflow; if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh, - entry->server, 1) < 0) + entry->server) < 0) goto out_overflow; if (entry->fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) entry->ino = entry->fattr->mounted_on_fileid; diff --git a/include/linux/nfs_idmap.h b/include/linux/nfs_idmap.h index ae7d6a380da..308c1887701 100644 --- a/include/linux/nfs_idmap.h +++ b/include/linux/nfs_idmap.h @@ -66,6 +66,8 @@ struct idmap_msg { /* Forward declaration to make this header independent of others */ struct nfs_client; struct nfs_server; +struct nfs_fattr; +struct nfs4_string; #ifdef CONFIG_NFS_USE_NEW_IDMAPPER @@ -97,6 +99,12 @@ void nfs_idmap_delete(struct nfs_client *); #endif /* 
CONFIG_NFS_USE_NEW_IDMAPPER */ +void nfs_fattr_init_names(struct nfs_fattr *fattr, + struct nfs4_string *owner_name, + struct nfs4_string *group_name); +void nfs_fattr_free_names(struct nfs_fattr *); +void nfs_fattr_map_and_free_names(struct nfs_server *, struct nfs_fattr *); + int nfs_map_name_to_uid(const struct nfs_server *, const char *, size_t, __u32 *); int nfs_map_group_to_gid(const struct nfs_server *, const char *, size_t, __u32 *); int nfs_map_uid_to_name(const struct nfs_server *, __u32, char *, size_t); diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 6c898afe609..a764cef06b7 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -18,6 +18,11 @@ /* Forward declaration for NFS v3 */ struct nfs4_secinfo_flavors; +struct nfs4_string { + unsigned int len; + char *data; +}; + struct nfs_fsid { uint64_t major; uint64_t minor; @@ -61,6 +66,8 @@ struct nfs_fattr { struct timespec pre_ctime; /* pre_op_attr.ctime */ unsigned long time_start; unsigned long gencount; + struct nfs4_string *owner_name; + struct nfs4_string *group_name; }; #define NFS_ATTR_FATTR_TYPE (1U << 0) @@ -85,6 +92,8 @@ struct nfs_fattr { #define NFS_ATTR_FATTR_V4_REFERRAL (1U << 19) /* NFSv4 referral */ #define NFS_ATTR_FATTR_MOUNTPOINT (1U << 20) /* Treat as mountpoint */ #define NFS_ATTR_FATTR_MOUNTED_ON_FILEID (1U << 21) +#define NFS_ATTR_FATTR_OWNER_NAME (1U << 22) +#define NFS_ATTR_FATTR_GROUP_NAME (1U << 23) #define NFS_ATTR_FATTR (NFS_ATTR_FATTR_TYPE \ | NFS_ATTR_FATTR_MODE \ @@ -324,6 +333,7 @@ struct nfs_openargs { const struct qstr * name; const struct nfs_server *server; /* Needed for ID mapping */ const u32 * bitmask; + const u32 * dir_bitmask; __u32 claim; struct nfs4_sequence_args seq_args; }; @@ -342,6 +352,8 @@ struct nfs_openres { __u32 do_recall; __u64 maxsize; __u32 attrset[NFS4_BITMAP_SIZE]; + struct nfs4_string *owner; + struct nfs4_string *group_owner; struct nfs4_sequence_res seq_res; }; @@ -778,11 +790,6 @@ struct nfs3_getaclres { struct posix_acl * acl_default; }; -struct nfs4_string { - unsigned int len; - char *data; -}; - #ifdef CONFIG_NFS_V4 typedef u64 clientid4; -- cgit v1.2.3 From fc7c1077ceb99c35e5f9d0ce03dc7740565bb2bf Mon Sep 17 00:00:00 2001 From: Alexandre Oliva Date: Mon, 28 Nov 2011 12:36:17 -0200 Subject: Btrfs: don't set up allocation result twice We store the allocation start and length twice in ins, once right after the other, but with intervening calls that may prevent the duplicate from being optimized out by the compiler. Remove one of the assignments. Signed-off-by: Alexandre Oliva Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5ea3acc5324..37594e4bf66 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5441,9 +5441,6 @@ checks: goto loop; } - ins->objectid = search_start; - ins->offset = num_bytes; - if (offset < search_start) btrfs_add_free_space(used_block_group, offset, search_start - offset); -- cgit v1.2.3 From 1bb91902dc90e25449893e693ad45605cb08fbe5 Mon Sep 17 00:00:00 2001 From: Alexandre Oliva Date: Fri, 14 Oct 2011 12:10:36 -0300 Subject: Btrfs: revamp clustered allocation logic Parameterize clusters on minimum total size, minimum chunk size and minimum contiguous size for at least one chunk, without limits on cluster, window or gap sizes. Don't tolerate any fragmentation for SSD_SPREAD; accept it for metadata, but try to keep data dense. 
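[Editorial aside, not part of the patch: the policy described above reduces to choosing two thresholds per allocation profile. A minimal sketch, assuming the real btrfs option/flag names (btrfs_test_opt, SSD_SPREAD, BTRFS_BLOCK_GROUP_METADATA) and a hypothetical helper name:]

	/* Sketch only: how cont1_bytes (largest required contiguous chunk)
	 * and min_bytes (smallest extent worth adding) could be chosen. */
	static void pick_cluster_thresholds(struct btrfs_root *root, u64 flags,
					    u64 bytes, u64 empty_size,
					    u32 sectorsize,
					    u64 *cont1_bytes, u64 *min_bytes)
	{
		if (btrfs_test_opt(root, SSD_SPREAD)) {
			/* SSD_SPREAD: no fragmentation, one extent covers it all */
			*cont1_bytes = *min_bytes = bytes + empty_size;
		} else if (flags & BTRFS_BLOCK_GROUP_METADATA) {
			/* metadata: small extents are fine, but one must hold 'bytes' */
			*cont1_bytes = bytes;
			*min_bytes = sectorsize;
		} else {
			/* data: keep allocations reasonably dense */
			*cont1_bytes = max(bytes, (bytes + empty_size) >> 2);
			*min_bytes = sectorsize;
		}
	}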
Signed-off-by: Alexandre Oliva Signed-off-by: Chris Mason --- fs/btrfs/free-space-cache.c | 112 +++++++++++++++++++------------------------- 1 file changed, 49 insertions(+), 63 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index ec23d43d0c3..ce40db59c70 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -2283,23 +2283,23 @@ out: static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group, struct btrfs_free_space *entry, struct btrfs_free_cluster *cluster, - u64 offset, u64 bytes, u64 min_bytes) + u64 offset, u64 bytes, + u64 cont1_bytes, u64 min_bytes) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; unsigned long next_zero; unsigned long i; - unsigned long search_bits; - unsigned long total_bits; + unsigned long want_bits; + unsigned long min_bits; unsigned long found_bits; unsigned long start = 0; unsigned long total_found = 0; int ret; - bool found = false; i = offset_to_bit(entry->offset, block_group->sectorsize, max_t(u64, offset, entry->offset)); - search_bits = bytes_to_bits(bytes, block_group->sectorsize); - total_bits = bytes_to_bits(min_bytes, block_group->sectorsize); + want_bits = bytes_to_bits(bytes, block_group->sectorsize); + min_bits = bytes_to_bits(min_bytes, block_group->sectorsize); again: found_bits = 0; @@ -2308,7 +2308,7 @@ again: i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) { next_zero = find_next_zero_bit(entry->bitmap, BITS_PER_BITMAP, i); - if (next_zero - i >= search_bits) { + if (next_zero - i >= min_bits) { found_bits = next_zero - i; break; } @@ -2318,10 +2318,9 @@ again: if (!found_bits) return -ENOSPC; - if (!found) { + if (!total_found) { start = i; cluster->max_size = 0; - found = true; } total_found += found_bits; @@ -2329,13 +2328,8 @@ again: if (cluster->max_size < found_bits * block_group->sectorsize) cluster->max_size = found_bits * block_group->sectorsize; - if (total_found < total_bits) { - i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, next_zero); - if (i - start > total_bits * 2) { - total_found = 0; - cluster->max_size = 0; - found = false; - } + if (total_found < want_bits || cluster->max_size < cont1_bytes) { + i = next_zero + 1; goto again; } @@ -2351,23 +2345,23 @@ again: /* * This searches the block group for just extents to fill the cluster with. + * Try to find a cluster with at least bytes total bytes, at least one + * extent of cont1_bytes, and other clusters of at least min_bytes. */ static noinline int setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, struct btrfs_free_cluster *cluster, struct list_head *bitmaps, u64 offset, u64 bytes, - u64 min_bytes) + u64 cont1_bytes, u64 min_bytes) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *first = NULL; struct btrfs_free_space *entry = NULL; - struct btrfs_free_space *prev = NULL; struct btrfs_free_space *last; struct rb_node *node; u64 window_start; u64 window_free; u64 max_extent; - u64 max_gap = 128 * 1024; entry = tree_search_offset(ctl, offset, 0, 1); if (!entry) @@ -2377,8 +2371,8 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, * We don't want bitmaps, so just move along until we find a normal * extent entry. 
*/ - while (entry->bitmap) { - if (list_empty(&entry->list)) + while (entry->bitmap || entry->bytes < min_bytes) { + if (entry->bitmap && list_empty(&entry->list)) list_add_tail(&entry->list, bitmaps); node = rb_next(&entry->offset_index); if (!node) @@ -2391,12 +2385,9 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, max_extent = entry->bytes; first = entry; last = entry; - prev = entry; - while (window_free <= min_bytes) { - node = rb_next(&entry->offset_index); - if (!node) - return -ENOSPC; + for (node = rb_next(&entry->offset_index); node; + node = rb_next(&entry->offset_index)) { entry = rb_entry(node, struct btrfs_free_space, offset_index); if (entry->bitmap) { @@ -2405,26 +2396,18 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, continue; } - /* - * we haven't filled the empty size and the window is - * very large. reset and try again - */ - if (entry->offset - (prev->offset + prev->bytes) > max_gap || - entry->offset - window_start > (min_bytes * 2)) { - first = entry; - window_start = entry->offset; - window_free = entry->bytes; - last = entry; + if (entry->bytes < min_bytes) + continue; + + last = entry; + window_free += entry->bytes; + if (entry->bytes > max_extent) max_extent = entry->bytes; - } else { - last = entry; - window_free += entry->bytes; - if (entry->bytes > max_extent) - max_extent = entry->bytes; - } - prev = entry; } + if (window_free < bytes || max_extent < cont1_bytes) + return -ENOSPC; + cluster->window_start = first->offset; node = &first->offset_index; @@ -2438,7 +2421,7 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, entry = rb_entry(node, struct btrfs_free_space, offset_index); node = rb_next(&entry->offset_index); - if (entry->bitmap) + if (entry->bitmap || entry->bytes < min_bytes) continue; rb_erase(&entry->offset_index, &ctl->free_space_offset); @@ -2460,7 +2443,7 @@ static noinline int setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, struct btrfs_free_cluster *cluster, struct list_head *bitmaps, u64 offset, u64 bytes, - u64 min_bytes) + u64 cont1_bytes, u64 min_bytes) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *entry; @@ -2485,7 +2468,7 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, if (entry->bytes < min_bytes) continue; ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset, - bytes, min_bytes); + bytes, cont1_bytes, min_bytes); if (!ret) return 0; } @@ -2499,7 +2482,7 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, /* * here we try to find a cluster of blocks in a block group. The goal - * is to find at least bytes free and up to empty_size + bytes free. + * is to find at least bytes+empty_size. * We might not find them all in one contiguous area. * * returns zero and sets up cluster if things worked out, otherwise @@ -2515,23 +2498,24 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, struct btrfs_free_space *entry, *tmp; LIST_HEAD(bitmaps); u64 min_bytes; + u64 cont1_bytes; int ret; - /* for metadata, allow allocates with more holes */ + /* + * Choose the minimum extent size we'll require for this + * cluster. For SSD_SPREAD, don't allow any fragmentation. + * For metadata, allow allocates with smaller extents. For + * data, keep it dense. 
+ */ if (btrfs_test_opt(root, SSD_SPREAD)) { - min_bytes = bytes + empty_size; + cont1_bytes = min_bytes = bytes + empty_size; } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { - /* - * we want to do larger allocations when we are - * flushing out the delayed refs, it helps prevent - * making more work as we go along. - */ - if (trans->transaction->delayed_refs.flushing) - min_bytes = max(bytes, (bytes + empty_size) >> 1); - else - min_bytes = max(bytes, (bytes + empty_size) >> 4); - } else - min_bytes = max(bytes, (bytes + empty_size) >> 2); + cont1_bytes = bytes; + min_bytes = block_group->sectorsize; + } else { + cont1_bytes = max(bytes, (bytes + empty_size) >> 2); + min_bytes = block_group->sectorsize; + } spin_lock(&ctl->tree_lock); @@ -2539,7 +2523,7 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, * If we know we don't have enough space to make a cluster don't even * bother doing all the work to try and find one. */ - if (ctl->free_space < min_bytes) { + if (ctl->free_space < bytes) { spin_unlock(&ctl->tree_lock); return -ENOSPC; } @@ -2553,10 +2537,12 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, } ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, - bytes, min_bytes); + bytes + empty_size, + cont1_bytes, min_bytes); if (ret) ret = setup_cluster_bitmap(block_group, cluster, &bitmaps, - offset, bytes, min_bytes); + offset, bytes + empty_size, + cont1_bytes, min_bytes); /* Clear our temporary list */ list_for_each_entry_safe(entry, tmp, &bitmaps, list) -- cgit v1.2.3 From bc31b86a5923fad5f3fbb6192f767f410241ba27 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Sat, 7 Jan 2012 20:41:55 -0600 Subject: writeback: move MIN_WRITEBACK_PAGES to fs-writeback.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix compile error fs/fs-writeback.c:515:33: error: ‘PAGE_CACHE_SHIFT’ undeclared (first use in this function) Reported-by: Randy Dunlap Acked-by: Randy Dunlap Signed-off-by: Wu Fengguang --- fs/fs-writeback.c | 6 ++++++ include/linux/writeback.h | 5 ----- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 92d353e069d..22e2d42742a 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -29,6 +30,11 @@ #include #include "internal.h" +/* + * 4MB minimal write chunk size + */ +#define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_CACHE_SHIFT - 10)) + /* * Passed into wb_writeback(), essentially a subset of writeback_control */ diff --git a/include/linux/writeback.h b/include/linux/writeback.h index b30419cd425..4e0a5549302 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -25,11 +25,6 @@ DECLARE_PER_CPU(int, dirty_throttle_leaks); #define DIRTY_SCOPE 8 #define DIRTY_FULL_SCOPE (DIRTY_SCOPE / 2) -/* - * 4MB minimal write chunk size - */ -#define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_CACHE_SHIFT - 10)) - struct backing_dev_info; /* -- cgit v1.2.3 From 724577ca355795b0a25c93ccbeee927871ca1a77 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Wed, 28 Dec 2011 19:21:45 +0200 Subject: ore: Must support none-PAGE-aligned IO NFS might send us offsets that are not PAGE aligned. So we must read in the reminder of the first/last pages, in cases we need it for Parity calculations. We only add an sg segments to read the partial page. But we don't mark it as read=true because it is a lock-for-write page. 
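[Editorial aside, not part of the patch: the partial-page lengths involved are simple remainders of the unaligned offsets. A sketch with hypothetical local names:]

	u64 start = ios_offset;                  /* unaligned I/O start */
	u64 end   = ios_offset + ios_length;     /* unaligned I/O end   */
	unsigned head = start % PAGE_SIZE;       /* bytes to read in before 'start' */
	unsigned tail = (end % PAGE_SIZE) ?
			PAGE_SIZE - (end % PAGE_SIZE) : 0;  /* bytes to read in after 'end' */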
TODO: In some cases (IO spans a single unit) we can just adjust the raid_unit offset/length, but this is left for later Kernels. [Bug in 3.2.0 Kernel] CC: Stable Tree Signed-off-by: Boaz Harrosh --- fs/exofs/ore_raid.c | 72 ++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 60 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c index 414a2dfd950..d222c77cfa1 100644 --- a/fs/exofs/ore_raid.c +++ b/fs/exofs/ore_raid.c @@ -328,8 +328,8 @@ static int _alloc_read_4_write(struct ore_io_state *ios) /* @si contains info of the to-be-inserted page. Update of @si should be * maintained by caller. Specificaly si->dev, si->obj_offset, ... */ -static int _add_to_read_4_write(struct ore_io_state *ios, - struct ore_striping_info *si, struct page *page) +static int _add_to_r4w(struct ore_io_state *ios, struct ore_striping_info *si, + struct page *page, unsigned pg_len) { struct request_queue *q; struct ore_per_dev_state *per_dev; @@ -366,17 +366,60 @@ static int _add_to_read_4_write(struct ore_io_state *ios, _ore_add_sg_seg(per_dev, gap, true); } q = osd_request_queue(ore_comp_dev(read_ios->oc, per_dev->dev)); - added_len = bio_add_pc_page(q, per_dev->bio, page, PAGE_SIZE, 0); - if (unlikely(added_len != PAGE_SIZE)) { + added_len = bio_add_pc_page(q, per_dev->bio, page, pg_len, + si->obj_offset % PAGE_SIZE); + if (unlikely(added_len != pg_len)) { ORE_DBGMSG("Failed to bio_add_pc_page bi_vcnt=%d\n", per_dev->bio->bi_vcnt); return -ENOMEM; } - per_dev->length += PAGE_SIZE; + per_dev->length += pg_len; return 0; } +/* read the beginning of an unaligned first page */ +static int _add_to_r4w_first_page(struct ore_io_state *ios, struct page *page) +{ + struct ore_striping_info si; + unsigned pg_len; + + ore_calc_stripe_info(ios->layout, ios->offset, 0, &si); + + pg_len = si.obj_offset % PAGE_SIZE; + si.obj_offset -= pg_len; + + ORE_DBGMSG("offset=0x%llx len=0x%x index=0x%lx dev=%x\n", + _LLU(si.obj_offset), pg_len, page->index, si.dev); + + return _add_to_r4w(ios, &si, page, pg_len); +} + +/* read the end of an incomplete last page */ +static int _add_to_r4w_last_page(struct ore_io_state *ios, u64 *offset) +{ + struct ore_striping_info si; + struct page *page; + unsigned pg_len, p, c; + + ore_calc_stripe_info(ios->layout, *offset, 0, &si); + + p = si.unit_off / PAGE_SIZE; + c = _dev_order(ios->layout->group_width * ios->layout->mirrors_p1, + ios->layout->mirrors_p1, si.par_dev, si.dev); + page = ios->sp2d->_1p_stripes[p].pages[c]; + + pg_len = PAGE_SIZE - (si.unit_off % PAGE_SIZE); + *offset += pg_len; + + ORE_DBGMSG("p=%d, c=%d next-offset=0x%llx len=0x%x dev=%x par_dev=%d\n", + p, c, _LLU(*offset), pg_len, si.dev, si.par_dev); + + BUG_ON(!page); + + return _add_to_r4w(ios, &si, page, pg_len); +} + static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret) { struct bio_vec *bv; @@ -444,9 +487,13 @@ static int _read_4_write(struct ore_io_state *ios) struct page **pp = &_1ps->pages[c]; bool uptodate; - if (*pp) + if (*pp) { + if (ios->offset % PAGE_SIZE) + /* Read the remainder of the page */ + _add_to_r4w_first_page(ios, *pp); /* to-be-written pages start here */ goto read_last_stripe; + } *pp = ios->r4w->get_page(ios->private, offset, &uptodate); @@ -454,7 +501,7 @@ static int _read_4_write(struct ore_io_state *ios) return -ENOMEM; if (!uptodate) - _add_to_read_4_write(ios, &read_si, *pp); + _add_to_r4w(ios, &read_si, *pp, PAGE_SIZE); /* Mark read-pages to be cache_released */ _1ps->page_is_read[c] = true; @@ -465,8 
+512,11 @@ static int _read_4_write(struct ore_io_state *ios) } read_last_stripe: - offset = ios->offset + (ios->length + PAGE_SIZE - 1) / - PAGE_SIZE * PAGE_SIZE; + offset = ios->offset + ios->length; + if (offset % PAGE_SIZE) + _add_to_r4w_last_page(ios, &offset); + /* offset will be aligned to next page */ + last_stripe_end = div_u64(offset + bytes_in_stripe - 1, bytes_in_stripe) * bytes_in_stripe; if (offset == last_stripe_end) /* Optimize for the aligned case */ @@ -503,7 +553,7 @@ read_last_stripe: /* Mark read-pages to be cache_released */ _1ps->page_is_read[c] = true; if (!uptodate) - _add_to_read_4_write(ios, &read_si, page); + _add_to_r4w(ios, &read_si, page, PAGE_SIZE); } offset += PAGE_SIZE; @@ -616,8 +666,6 @@ int _ore_post_alloc_raid_stuff(struct ore_io_state *ios) return -ENOMEM; } - BUG_ON(ios->offset % PAGE_SIZE); - /* Round io down to last full strip */ first_stripe = div_u64(ios->offset, stripe_size); last_stripe = div_u64(ios->offset + ios->length, stripe_size); -- cgit v1.2.3 From 98c7089c769048f941bd5c5285287f8fc301f8b1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 17 Nov 2011 01:00:31 -0500 Subject: btrfs: preparation to fixing mount/umount race We need fs_info and root to live until the moment when the victim superblock leaves the list, so we need to postpone free_fs_info() until after ->put_super(). The call is buried in close_ctree(), though, so we need to lift it into the callers (including btrfs_put_super()) first. Signed-off-by: Al Viro --- fs/btrfs/disk-io.c | 3 +-- fs/btrfs/super.c | 6 +++++- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index f99a099a774..dcb5d949b54 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2424,6 +2424,7 @@ retry_root_backup: up_read(&fs_info->cleanup_work_sem); if (err) { close_ctree(tree_root); + free_fs_info(fs_info); return ERR_PTR(err); } } @@ -3059,8 +3060,6 @@ int close_ctree(struct btrfs_root *root) bdi_destroy(&fs_info->bdi); cleanup_srcu_struct(&fs_info->subvol_srcu); - free_fs_info(fs_info); - return 0; } diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index ae488aa1966..a3f435e5898 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -151,6 +151,7 @@ static void btrfs_put_super(struct super_block *sb) int ret; ret = close_ctree(root); + free_fs_info(root->fs_info); sb->s_fs_info = NULL; (void)ret; /* FIXME: need to fix VFS to return error? */ @@ -589,6 +590,7 @@ static int btrfs_fill_super(struct super_block *sb, struct inode *inode; struct dentry *root_dentry; struct btrfs_root *tree_root; + struct btrfs_fs_info *fs_info; struct btrfs_key key; int err; @@ -609,12 +611,13 @@ static int btrfs_fill_super(struct super_block *sb, printk("btrfs: open_ctree failed\n"); return PTR_ERR(tree_root); } + fs_info = tree_root->fs_info; sb->s_fs_info = tree_root; key.objectid = BTRFS_FIRST_FREE_OBJECTID; key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; - inode = btrfs_iget(sb, &key, tree_root->fs_info->fs_root, NULL); + inode = btrfs_iget(sb, &key, fs_info->fs_root, NULL); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto fail_close; @@ -635,6 +638,7 @@ static int btrfs_fill_super(struct super_block *sb, fail_close: close_ctree(tree_root); + free_fs_info(fs_info); return err; } -- cgit v1.2.3 From aea52e19dd2085617dd7d247afb76a90cf2ea40c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 17 Nov 2011 01:22:46 -0500 Subject: btrfs: get ->kill_sb() of its own ... and move free_fs_info() to that, out of ->put_super(). 
Do NOT set ->s_fs_info to NULL in the latter; we need it for sget() to be able to see and wait for fs in the middle of umount if we get a mount/umount race. Signed-off-by: Al Viro --- fs/btrfs/super.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index a3f435e5898..eca48624a4f 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -147,14 +147,13 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, static void btrfs_put_super(struct super_block *sb) { - struct btrfs_root *root = btrfs_sb(sb); - int ret; - - ret = close_ctree(root); - free_fs_info(root->fs_info); - sb->s_fs_info = NULL; - - (void)ret; /* FIXME: need to fix VFS to return error? */ + (void)close_ctree(btrfs_sb(sb)); + /* FIXME: need to fix VFS to return error? */ + /* AV: return it _where_? ->put_super() can be triggered by any number + * of async events, up to and including delivery of SIGKILL to the + * last process that kept it busy. Or segfault in the aforementioned + * process... Whom would you report that to? + */ } enum { @@ -1223,11 +1222,21 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } +static void btrfs_kill_super(struct super_block *sb) +{ + struct btrfs_fs_info *fs_info = NULL; + if (sb->s_root) + fs_info = btrfs_sb(sb)->fs_info; + kill_anon_super(sb); + if (fs_info) + free_fs_info(fs_info); +} + static struct file_system_type btrfs_fs_type = { .owner = THIS_MODULE, .name = "btrfs", .mount = btrfs_mount, - .kill_sb = kill_anon_super, + .kill_sb = btrfs_kill_super, .fs_flags = FS_REQUIRES_DEV, }; -- cgit v1.2.3 From 6de1d09d9677dce7a04ef485d426821159ab7de9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 17 Nov 2011 01:29:09 -0500 Subject: btrfs: fix mount/umount race Do *NOT* skip doomed superblocks in btrfs_test_super(); we want sget() to wait for their shutdown to complete. Since we don't mutilate ->s_fs_info in ->put_super() anymore (or free what it used to point to until the superblock is past being findable by sget()), we can just DTRT there and report a match. Signed-off-by: Al Viro --- fs/btrfs/super.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index eca48624a4f..b9fd62a0fca 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -733,20 +733,15 @@ static int btrfs_test_super(struct super_block *s, void *data) struct btrfs_root *test_root = data; struct btrfs_root *root = btrfs_sb(s); - /* - * If this super block is going away, return false as it - * can't match as an existing super block. - */ - if (!atomic_read(&s->s_active)) - return 0; return root->fs_info->fs_devices == test_root->fs_info->fs_devices; } static int btrfs_set_super(struct super_block *s, void *data) { - s->s_fs_info = data; - - return set_anon_super(s, data); + int err = set_anon_super(s, data); + if (!err) + s->s_fs_info = data; + return err; } /* -- cgit v1.2.3 From 10f6327b5d26660e06120ba17cff993cb616b1d2 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 17 Nov 2011 15:05:22 -0500 Subject: btrfs: fix a deadlock in btrfs_scan_one_device() pathname resolution under a global mutex, taken on some paths in ->mount() is a Bad Idea(tm) - think what happens if said pathname resolution triggers automount of some btrfs instance and walks into attempt to grab the same mutex. Deadlock - we are waiting for daemon to finish walking the path, daemon is waiting for us to release the mutex... 
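[Editorial aside, not part of the patch: the fix is a lock-ordering change; path lookup now happens before uuid_mutex is taken, so an automount triggered during lookup can no longer block on the mutex we hold. A sketch of the resulting shape, with read_and_register_superblock() as a hypothetical placeholder:]

	static int scan_one_device_sketch(const char *path, fmode_t flags,
					  void *holder)
	{
		struct block_device *bdev;
		int ret;

		/* path resolution may re-enter ->mount(); do it unlocked */
		bdev = blkdev_get_by_path(path, flags | FMODE_EXCL, holder);
		if (IS_ERR(bdev))
			return PTR_ERR(bdev);

		mutex_lock(&uuid_mutex);      /* protect only the device-list update */
		ret = read_and_register_superblock(bdev);   /* hypothetical helper */
		mutex_unlock(&uuid_mutex);

		blkdev_put(bdev, flags | FMODE_EXCL);
		return ret;
	}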
Signed-off-by: Al Viro --- fs/btrfs/volumes.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f4b839fd3c9..96bb3c0a79b 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -706,8 +706,6 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, u64 devid; u64 transid; - mutex_lock(&uuid_mutex); - flags |= FMODE_EXCL; bdev = blkdev_get_by_path(path, flags, holder); @@ -716,6 +714,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, goto error; } + mutex_lock(&uuid_mutex); ret = set_blocksize(bdev, 4096); if (ret) goto error_close; @@ -737,9 +736,9 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, brelse(bh); error_close: + mutex_unlock(&uuid_mutex); blkdev_put(bdev, flags); error: - mutex_unlock(&uuid_mutex); return ret; } -- cgit v1.2.3 From 2eb34cd368b3e5cef960ae9d9971a037cd6fa9d3 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 17 Nov 2011 00:32:06 -0500 Subject: btrfs: sanitizing ->fs_info, part 1 take assignment of ->fs_info to callers of __setup_root() Signed-off-by: Al Viro --- fs/btrfs/disk-io.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index dcb5d949b54..04eba10af82 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1142,7 +1142,6 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, root->orphan_item_inserted = 0; root->orphan_cleanup_state = 0; - root->fs_info = fs_info; root->objectid = objectid; root->last_trans = 0; root->highest_objectid = 0; @@ -1193,6 +1192,7 @@ static int find_and_setup_root(struct btrfs_root *tree_root, u32 blocksize; u64 generation; + root->fs_info = fs_info; __setup_root(tree_root->nodesize, tree_root->leafsize, tree_root->sectorsize, tree_root->stripesize, root, fs_info, objectid); @@ -1227,6 +1227,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, if (!root) return ERR_PTR(-ENOMEM); + root->fs_info = fs_info; __setup_root(tree_root->nodesize, tree_root->leafsize, tree_root->sectorsize, tree_root->stripesize, root, fs_info, BTRFS_TREE_LOG_OBJECTID); @@ -1330,6 +1331,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, goto out; } + root->fs_info = fs_info; __setup_root(tree_root->nodesize, tree_root->leafsize, tree_root->sectorsize, tree_root->stripesize, root, fs_info, location->objectid); @@ -2055,6 +2057,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, init_waitqueue_head(&fs_info->transaction_blocked_wait); init_waitqueue_head(&fs_info->async_submit_wait); + tree_root->fs_info = fs_info; __setup_root(4096, 4096, 4096, 4096, tree_root, fs_info, BTRFS_ROOT_TREE_OBJECTID); @@ -2247,6 +2250,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, btrfs_super_chunk_root_level(disk_super)); generation = btrfs_super_chunk_root_generation(disk_super); + chunk_root->fs_info = fs_info; __setup_root(nodesize, leafsize, sectorsize, stripesize, chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID); @@ -2373,6 +2377,7 @@ retry_root_backup: goto fail_trans_kthread; } + log_tree_root->fs_info = fs_info; __setup_root(nodesize, leafsize, sectorsize, stripesize, log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID); -- cgit v1.2.3 From 1233f546ec29ac32424327baf6ae5df81e3d9eae Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 17 Nov 2011 00:34:00 -0500 Subject: btrfs: sanitizing ->fs_info, part 2 lift assignment to callers of 
find_and_setup_root() Signed-off-by: Al Viro --- fs/btrfs/disk-io.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 04eba10af82..b8dae517f80 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1192,7 +1192,6 @@ static int find_and_setup_root(struct btrfs_root *tree_root, u32 blocksize; u64 generation; - root->fs_info = fs_info; __setup_root(tree_root->nodesize, tree_root->leafsize, tree_root->sectorsize, tree_root->stripesize, root, fs_info, objectid); @@ -1322,6 +1321,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, if (!root) return ERR_PTR(-ENOMEM); if (location->offset == (u64)-1) { + root->fs_info = fs_info; ret = find_and_setup_root(tree_root, fs_info, location->objectid, root); if (ret) { @@ -2300,18 +2300,21 @@ retry_root_backup: btrfs_set_root_node(&tree_root->root_item, tree_root->node); tree_root->commit_root = btrfs_root_node(tree_root); + extent_root->fs_info = fs_info; ret = find_and_setup_root(tree_root, fs_info, BTRFS_EXTENT_TREE_OBJECTID, extent_root); if (ret) goto recovery_tree_root; extent_root->track_dirty = 1; + dev_root->fs_info = fs_info; ret = find_and_setup_root(tree_root, fs_info, BTRFS_DEV_TREE_OBJECTID, dev_root); if (ret) goto recovery_tree_root; dev_root->track_dirty = 1; + csum_root->fs_info = fs_info; ret = find_and_setup_root(tree_root, fs_info, BTRFS_CSUM_TREE_OBJECTID, csum_root); if (ret) -- cgit v1.2.3 From 38a77db49ad8f78369dcdfb693b8e5a818a60104 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 17 Nov 2011 00:38:54 -0500 Subject: btrfs: sanitizing ->fs_info, part 3 move assignments to ->fs_info in open_ctree() up, to the place just after the original allocations. Assignment for tree_root becomes a no-op - we'd obtained fs_info from tree_root->fs_info in the first place. 
Signed-off-by: Al Viro --- fs/btrfs/disk-io.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index b8dae517f80..06480fcf0fb 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1320,8 +1320,8 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, root = kzalloc(sizeof(*root), GFP_NOFS); if (!root) return ERR_PTR(-ENOMEM); + root->fs_info = fs_info; if (location->offset == (u64)-1) { - root->fs_info = fs_info; ret = find_and_setup_root(tree_root, fs_info, location->objectid, root); if (ret) { @@ -1331,7 +1331,6 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, goto out; } - root->fs_info = fs_info; __setup_root(tree_root->nodesize, tree_root->leafsize, tree_root->sectorsize, tree_root->stripesize, root, fs_info, location->objectid); @@ -1914,6 +1913,10 @@ struct btrfs_root *open_ctree(struct super_block *sb, err = -ENOMEM; goto fail; } + chunk_root->fs_info = fs_info; + extent_root->fs_info = fs_info; + dev_root->fs_info = fs_info; + csum_root->fs_info = fs_info; ret = init_srcu_struct(&fs_info->subvol_srcu); if (ret) { @@ -2057,7 +2060,6 @@ struct btrfs_root *open_ctree(struct super_block *sb, init_waitqueue_head(&fs_info->transaction_blocked_wait); init_waitqueue_head(&fs_info->async_submit_wait); - tree_root->fs_info = fs_info; __setup_root(4096, 4096, 4096, 4096, tree_root, fs_info, BTRFS_ROOT_TREE_OBJECTID); @@ -2250,7 +2252,6 @@ struct btrfs_root *open_ctree(struct super_block *sb, btrfs_super_chunk_root_level(disk_super)); generation = btrfs_super_chunk_root_generation(disk_super); - chunk_root->fs_info = fs_info; __setup_root(nodesize, leafsize, sectorsize, stripesize, chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID); @@ -2300,21 +2301,18 @@ retry_root_backup: btrfs_set_root_node(&tree_root->root_item, tree_root->node); tree_root->commit_root = btrfs_root_node(tree_root); - extent_root->fs_info = fs_info; ret = find_and_setup_root(tree_root, fs_info, BTRFS_EXTENT_TREE_OBJECTID, extent_root); if (ret) goto recovery_tree_root; extent_root->track_dirty = 1; - dev_root->fs_info = fs_info; ret = find_and_setup_root(tree_root, fs_info, BTRFS_DEV_TREE_OBJECTID, dev_root); if (ret) goto recovery_tree_root; dev_root->track_dirty = 1; - csum_root->fs_info = fs_info; ret = find_and_setup_root(tree_root, fs_info, BTRFS_CSUM_TREE_OBJECTID, csum_root); if (ret) -- cgit v1.2.3 From 6f07e42ee6fcc252a210781d7262f4051e9fd8f6 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 17 Nov 2011 00:46:16 -0500 Subject: btrfs: sanitizing ->fs_info, part 4 A new helper: btrfs_alloc_root(fs_info); allocates btrfs_root and sets ->fs_info. All places allocating the suckers converted to it. At that point we *never* reassign ->fs_info of btrfs_root; it's set before anyone sees the address of newly allocated struct btrfs_root and never assigned anywhere else. 
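[Editorial aside, not part of the patch: the point of the helper is that allocation and context binding happen in one step, so no caller ever touches ->fs_info afterwards. The before/after idiom, sketched:]

	/* before: two steps, and ->fs_info is visible as NULL in between */
	root = kzalloc(sizeof(*root), GFP_NOFS);
	if (root)
		root->fs_info = fs_info;

	/* after: one step; ->fs_info is set before the root escapes anywhere */
	root = btrfs_alloc_root(fs_info);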
Signed-off-by: Al Viro --- fs/btrfs/disk-io.c | 33 +++++++++++++++------------------ fs/btrfs/disk-io.h | 2 ++ fs/btrfs/super.c | 3 +-- 3 files changed, 18 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 06480fcf0fb..ee846ce5884 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1215,6 +1215,14 @@ static int find_and_setup_root(struct btrfs_root *tree_root, return 0; } +struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *root = kzalloc(sizeof(*root), GFP_NOFS); + if (root) + root->fs_info = fs_info; + return root; +} + static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info) { @@ -1222,11 +1230,10 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, struct btrfs_root *tree_root = fs_info->tree_root; struct extent_buffer *leaf; - root = kzalloc(sizeof(*root), GFP_NOFS); + root = btrfs_alloc_root(fs_info); if (!root) return ERR_PTR(-ENOMEM); - root->fs_info = fs_info; __setup_root(tree_root->nodesize, tree_root->leafsize, tree_root->sectorsize, tree_root->stripesize, root, fs_info, BTRFS_TREE_LOG_OBJECTID); @@ -1317,10 +1324,9 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, u32 blocksize; int ret = 0; - root = kzalloc(sizeof(*root), GFP_NOFS); + root = btrfs_alloc_root(fs_info); if (!root) return ERR_PTR(-ENOMEM); - root->fs_info = fs_info; if (location->offset == (u64)-1) { ret = find_and_setup_root(tree_root, fs_info, location->objectid, root); @@ -1900,23 +1906,15 @@ struct btrfs_root *open_ctree(struct super_block *sb, int num_backups_tried = 0; int backup_index = 0; - extent_root = fs_info->extent_root = - kzalloc(sizeof(struct btrfs_root), GFP_NOFS); - csum_root = fs_info->csum_root = - kzalloc(sizeof(struct btrfs_root), GFP_NOFS); - chunk_root = fs_info->chunk_root = - kzalloc(sizeof(struct btrfs_root), GFP_NOFS); - dev_root = fs_info->dev_root = - kzalloc(sizeof(struct btrfs_root), GFP_NOFS); + extent_root = fs_info->extent_root = btrfs_alloc_root(fs_info); + csum_root = fs_info->csum_root = btrfs_alloc_root(fs_info); + chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info); + dev_root = fs_info->dev_root = btrfs_alloc_root(fs_info); if (!extent_root || !csum_root || !chunk_root || !dev_root) { err = -ENOMEM; goto fail; } - chunk_root->fs_info = fs_info; - extent_root->fs_info = fs_info; - dev_root->fs_info = fs_info; - csum_root->fs_info = fs_info; ret = init_srcu_struct(&fs_info->subvol_srcu); if (ret) { @@ -2372,13 +2370,12 @@ retry_root_backup: btrfs_level_size(tree_root, btrfs_super_log_root_level(disk_super)); - log_tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS); + log_tree_root = btrfs_alloc_root(fs_info); if (!log_tree_root) { err = -ENOMEM; goto fail_trans_kthread; } - log_tree_root->fs_info = fs_info; __setup_root(nodesize, leafsize, sectorsize, stripesize, log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index c99d0a8f13f..2bb5f59ddf9 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -86,6 +86,8 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, int btrfs_add_log_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root); +struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info); + #ifdef CONFIG_DEBUG_LOCK_ALLOC void btrfs_init_lockdep(void); void btrfs_set_buffer_lockdep_class(u64 objectid, diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 
b9fd62a0fca..e9f876a1655 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -901,12 +901,11 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, if (!fs_info) return ERR_PTR(-ENOMEM); - fs_info->tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS); + fs_info->tree_root = btrfs_alloc_root(fs_info); if (!fs_info->tree_root) { error = -ENOMEM; goto error_fs_info; } - fs_info->tree_root->fs_info = fs_info; fs_info->fs_devices = fs_devices; fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS); -- cgit v1.2.3 From e3029d9fd426c8f582210ba35551ae5506218345 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 17 Nov 2011 00:56:18 -0500 Subject: btrfs: sanitizing ->fs_info, part 5 close_ctree() uses a weird mix of accesses to root->fs_info and its value at the beginning of function stored in local variable. Since ->fs_info *never* changes, let's just use the local variable to avoid confusion. Signed-off-by: Al Viro --- fs/btrfs/disk-io.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index ee846ce5884..9308c7f13c1 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2982,7 +2982,7 @@ int close_ctree(struct btrfs_root *root) (atomic_read(&fs_info->defrag_running) == 0)); /* clear out the rbtree of defraggable inodes */ - btrfs_run_defrag_inodes(root->fs_info); + btrfs_run_defrag_inodes(fs_info); /* * Here come 2 situations when btrfs is broken to flip readonly: @@ -3011,8 +3011,8 @@ int close_ctree(struct btrfs_root *root) btrfs_put_block_group_cache(fs_info); - kthread_stop(root->fs_info->transaction_kthread); - kthread_stop(root->fs_info->cleaner_kthread); + kthread_stop(fs_info->transaction_kthread); + kthread_stop(fs_info->cleaner_kthread); fs_info->closing = 2; smp_mb(); @@ -3030,14 +3030,14 @@ int close_ctree(struct btrfs_root *root) free_extent_buffer(fs_info->extent_root->commit_root); free_extent_buffer(fs_info->tree_root->node); free_extent_buffer(fs_info->tree_root->commit_root); - free_extent_buffer(root->fs_info->chunk_root->node); - free_extent_buffer(root->fs_info->chunk_root->commit_root); - free_extent_buffer(root->fs_info->dev_root->node); - free_extent_buffer(root->fs_info->dev_root->commit_root); - free_extent_buffer(root->fs_info->csum_root->node); - free_extent_buffer(root->fs_info->csum_root->commit_root); - - btrfs_free_block_groups(root->fs_info); + free_extent_buffer(fs_info->chunk_root->node); + free_extent_buffer(fs_info->chunk_root->commit_root); + free_extent_buffer(fs_info->dev_root->node); + free_extent_buffer(fs_info->dev_root->commit_root); + free_extent_buffer(fs_info->csum_root->node); + free_extent_buffer(fs_info->csum_root->commit_root); + + btrfs_free_block_groups(fs_info); del_fs_roots(fs_info); -- cgit v1.2.3 From ad2b2c802be2d3e8ed8364fef5ffaddabe448219 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 17 Nov 2011 01:10:02 -0500 Subject: btrfs: make open_ctree() return int It returns either ERR_PTR(-ve) or sb->s_fs_info. The latter can be found by caller just as well, TYVM, no need to return it. Just return -ve or 0... 
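[Editorial aside, not part of the patch: on the caller side the conversion looks like this -- the status comes back as an int, and the root is fetched from sb->s_fs_info instead of the return value:]

	err = open_ctree(sb, fs_devices, (char *)data);
	if (err)
		return err;
	tree_root = sb->s_fs_info;   /* what open_ctree() used to hand back */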
Signed-off-by: Al Viro --- fs/btrfs/disk-io.c | 12 ++++++------ fs/btrfs/disk-io.h | 6 +++--- fs/btrfs/super.c | 8 ++++---- 3 files changed, 13 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 9308c7f13c1..78247522c69 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1880,9 +1880,9 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root) } -struct btrfs_root *open_ctree(struct super_block *sb, - struct btrfs_fs_devices *fs_devices, - char *options) +int open_ctree(struct super_block *sb, + struct btrfs_fs_devices *fs_devices, + char *options) { u32 sectorsize; u32 nodesize; @@ -2428,11 +2428,11 @@ retry_root_backup: if (err) { close_ctree(tree_root); free_fs_info(fs_info); - return ERR_PTR(err); + return err; } } - return tree_root; + return 0; fail_trans_kthread: kthread_stop(fs_info->transaction_kthread); @@ -2479,7 +2479,7 @@ fail_srcu: fail: btrfs_close_devices(fs_info->fs_devices); free_fs_info(fs_info); - return ERR_PTR(err); + return err; recovery_tree_root: if (!btrfs_test_opt(tree_root, RECOVERY)) diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 2bb5f59ddf9..6f5f48778b0 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -46,9 +46,9 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize); int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf); -struct btrfs_root *open_ctree(struct super_block *sb, - struct btrfs_fs_devices *fs_devices, - char *options); +int open_ctree(struct super_block *sb, + struct btrfs_fs_devices *fs_devices, + char *options); int close_ctree(struct btrfs_root *root); int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root *root, int max_mirrors); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index e9f876a1655..56e007fd670 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -604,12 +604,12 @@ static int btrfs_fill_super(struct super_block *sb, sb->s_flags |= MS_POSIXACL; #endif - tree_root = open_ctree(sb, fs_devices, (char *)data); - - if (IS_ERR(tree_root)) { + err = open_ctree(sb, fs_devices, (char *)data); + if (err) { printk("btrfs: open_ctree failed\n"); - return PTR_ERR(tree_root); + return err; } + tree_root = sb->s_fs_info; fs_info = tree_root->fs_info; sb->s_fs_info = tree_root; -- cgit v1.2.3 From 29db78aa0ac82319b764b87a1c5030d74523e296 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 17 Nov 2011 01:12:38 -0500 Subject: btrfs: kill pointless reassignment of ->s_fs_info in btrfs_fill_super() We do not (fortunately) modify ->s_fs_info of superblock on the fly in btrfs_fill_super(); apparent assignment is a no-op. 
Signed-off-by: Al Viro --- fs/btrfs/super.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 56e007fd670..a381f9f9b0c 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -588,8 +588,8 @@ static int btrfs_fill_super(struct super_block *sb, { struct inode *inode; struct dentry *root_dentry; - struct btrfs_root *tree_root; - struct btrfs_fs_info *fs_info; + struct btrfs_root *tree_root = sb->s_fs_info; + struct btrfs_fs_info *fs_info = tree_root->fs_info; struct btrfs_key key; int err; @@ -609,9 +609,6 @@ static int btrfs_fill_super(struct super_block *sb, printk("btrfs: open_ctree failed\n"); return err; } - tree_root = sb->s_fs_info; - fs_info = tree_root->fs_info; - sb->s_fs_info = tree_root; key.objectid = BTRFS_FIRST_FREE_OBJECTID; key.type = BTRFS_INODE_ITEM_KEY; -- cgit v1.2.3 From be7e0950def403e90b5295ff2192c39967bf2aec Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 17 Nov 2011 01:40:05 -0500 Subject: btrfs: merge free_fs_info() calls on fill_super failures ... all the way up into btrfs_mount(). Signed-off-by: Al Viro --- fs/btrfs/disk-io.c | 2 -- fs/btrfs/super.c | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 78247522c69..b6d11eb4e43 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2427,7 +2427,6 @@ retry_root_backup: up_read(&fs_info->cleanup_work_sem); if (err) { close_ctree(tree_root); - free_fs_info(fs_info); return err; } } @@ -2478,7 +2477,6 @@ fail_srcu: cleanup_srcu_struct(&fs_info->subvol_srcu); fail: btrfs_close_devices(fs_info->fs_devices); - free_fs_info(fs_info); return err; recovery_tree_root: diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index a381f9f9b0c..8901b6c8526 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -634,7 +634,6 @@ static int btrfs_fill_super(struct super_block *sb, fail_close: close_ctree(tree_root); - free_fs_info(fs_info); return err; } @@ -947,6 +946,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, error = btrfs_fill_super(s, fs_devices, data, flags & MS_SILENT ? 1 : 0); if (error) { + free_fs_info(fs_info); deactivate_locked_super(s); return ERR_PTR(error); } -- cgit v1.2.3 From d22ca7de770e2a683eac000ba4db5a95e2f4c969 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 17 Nov 2011 01:46:50 -0500 Subject: btrfs: make free_fs_info() call ->kill_sb() unconditional ... and don't bother with it after btrfs_fill_super() failure - ->kill_sb() (unlike ->put_super()) will be called even if we have not got non-NULL ->s_root. Signed-off-by: Al Viro --- fs/btrfs/super.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 8901b6c8526..f628a6a79ca 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -946,7 +946,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, error = btrfs_fill_super(s, fs_devices, data, flags & MS_SILENT ? 
1 : 0); if (error) { - free_fs_info(fs_info); deactivate_locked_super(s); return ERR_PTR(error); } @@ -1215,12 +1214,9 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) static void btrfs_kill_super(struct super_block *sb) { - struct btrfs_fs_info *fs_info = NULL; - if (sb->s_root) - fs_info = btrfs_sb(sb)->fs_info; + struct btrfs_fs_info *fs_info = btrfs_sb(sb)->fs_info; kill_anon_super(sb); - if (fs_info) - free_fs_info(fs_info); + free_fs_info(fs_info); } static struct file_system_type btrfs_fs_type = { -- cgit v1.2.3 From 59553edf110e5576d91be9dd5bd53d110e0d0290 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 17 Nov 2011 01:56:28 -0500 Subject: btrfs: consolidate failure exits in btrfs_mount() a bit Signed-off-by: Al Viro --- fs/btrfs/super.c | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index f628a6a79ca..15c8ae571e4 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -630,6 +630,7 @@ static int btrfs_fill_super(struct super_block *sb, save_mount_options(sb, data); cleancache_init_fs(sb); + sb->s_flags |= MS_ACTIVE; return 0; fail_close: @@ -929,14 +930,10 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, } if (s->s_root) { - if ((flags ^ s->s_flags) & MS_RDONLY) { - deactivate_locked_super(s); - error = -EBUSY; - goto error_close_devices; - } - btrfs_close_devices(fs_devices); free_fs_info(fs_info); + if ((flags ^ s->s_flags) & MS_RDONLY) + error = -EBUSY; } else { char b[BDEVNAME_SIZE]; @@ -945,19 +942,11 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, btrfs_sb(s)->fs_info->bdev_holder = fs_type; error = btrfs_fill_super(s, fs_devices, data, flags & MS_SILENT ? 1 : 0); - if (error) { - deactivate_locked_super(s); - return ERR_PTR(error); - } - - s->s_flags |= MS_ACTIVE; } - root = get_default_root(s, subvol_objectid); - if (IS_ERR(root)) { + root = !error ? get_default_root(s, subvol_objectid) : ERR_PTR(error); + if (IS_ERR(root)) deactivate_locked_super(s); - return root; - } return root; -- cgit v1.2.3 From 815745cf3e46681241ad8025602ffbf2a452d514 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 17 Nov 2011 15:40:49 -0500 Subject: btrfs: let ->s_fs_info point to fs_info, not root... the latter can be obtained from the former (by looking as ->tree_root) just as cheaply as we currently are doing the other way round. 
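[Editorial aside, not part of the patch: after the inversion, the superblock accessor hands back the fs_info directly and the tree root sits one dereference away, mirroring how fs_info used to be reached from the root:]

	static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
	{
		return sb->s_fs_info;    /* now the fs_info, no longer the tree root */
	}

	/* and when a root is actually needed: */
	struct btrfs_root *root = btrfs_sb(sb)->tree_root;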
Signed-off-by: Al Viro --- fs/btrfs/ctree.h | 2 +- fs/btrfs/disk-io.c | 4 +-- fs/btrfs/export.c | 2 +- fs/btrfs/ioctl.c | 7 +++-- fs/btrfs/super.c | 75 +++++++++++++++++++++++++++--------------------------- 5 files changed, 45 insertions(+), 45 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 67385033323..dbb999a3827 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2196,7 +2196,7 @@ static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb, return btrfs_item_size(eb, e) - offset; } -static inline struct btrfs_root *btrfs_sb(struct super_block *sb) +static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) { return sb->s_fs_info; } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index b6d11eb4e43..f7d8b9ba519 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1894,8 +1894,8 @@ int open_ctree(struct super_block *sb, struct btrfs_key location; struct buffer_head *bh; struct btrfs_super_block *disk_super; - struct btrfs_root *tree_root = btrfs_sb(sb); - struct btrfs_fs_info *fs_info = tree_root->fs_info; + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + struct btrfs_root *tree_root = fs_info->tree_root; struct btrfs_root *extent_root; struct btrfs_root *csum_root; struct btrfs_root *chunk_root; diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 1b8dc33778f..5f77166fd01 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -67,7 +67,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, u64 root_objectid, u32 generation, int check_generation) { - struct btrfs_fs_info *fs_info = btrfs_sb(sb)->fs_info; + struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_root *root; struct inode *inode; struct btrfs_key key; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 5441ff1480f..0c55f8f3809 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -276,14 +276,13 @@ static int btrfs_ioctl_getversion(struct file *file, int __user *arg) static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) { - struct btrfs_root *root = fdentry(file)->d_sb->s_fs_info; - struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_fs_info *fs_info = btrfs_sb(fdentry(file)->d_sb); struct btrfs_device *device; struct request_queue *q; struct fstrim_range range; u64 minlen = ULLONG_MAX; u64 num_devices = 0; - u64 total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy); + u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy); int ret; if (!capable(CAP_SYS_ADMIN)) @@ -312,7 +311,7 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) range.len = min(range.len, total_bytes - range.start); range.minlen = max(range.minlen, minlen); - ret = btrfs_trim_fs(root, &range); + ret = btrfs_trim_fs(fs_info->tree_root, &range); if (ret < 0) return ret; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 15c8ae571e4..305adf6dc09 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -147,7 +147,7 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, static void btrfs_put_super(struct super_block *sb) { - (void)close_ctree(btrfs_sb(sb)); + (void)close_ctree(btrfs_sb(sb)->tree_root); /* FIXME: need to fix VFS to return error? */ /* AV: return it _where_? 
->put_super() can be triggered by any number * of async events, up to and including delivery of SIGKILL to the @@ -500,7 +500,8 @@ out: static struct dentry *get_default_root(struct super_block *sb, u64 subvol_objectid) { - struct btrfs_root *root = sb->s_fs_info; + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + struct btrfs_root *root = fs_info->tree_root; struct btrfs_root *new_root; struct btrfs_dir_item *di; struct btrfs_path *path; @@ -530,7 +531,7 @@ static struct dentry *get_default_root(struct super_block *sb, * will mount by default if we haven't been given a specific subvolume * to mount. */ - dir_id = btrfs_super_root_dir(root->fs_info->super_copy); + dir_id = btrfs_super_root_dir(fs_info->super_copy); di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0); if (IS_ERR(di)) { btrfs_free_path(path); @@ -544,7 +545,7 @@ static struct dentry *get_default_root(struct super_block *sb, */ btrfs_free_path(path); dir_id = BTRFS_FIRST_FREE_OBJECTID; - new_root = root->fs_info->fs_root; + new_root = fs_info->fs_root; goto setup_root; } @@ -552,7 +553,7 @@ static struct dentry *get_default_root(struct super_block *sb, btrfs_free_path(path); find_root: - new_root = btrfs_read_fs_root_no_name(root->fs_info, &location); + new_root = btrfs_read_fs_root_no_name(fs_info, &location); if (IS_ERR(new_root)) return ERR_CAST(new_root); @@ -588,8 +589,7 @@ static int btrfs_fill_super(struct super_block *sb, { struct inode *inode; struct dentry *root_dentry; - struct btrfs_root *tree_root = sb->s_fs_info; - struct btrfs_fs_info *fs_info = tree_root->fs_info; + struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_key key; int err; @@ -634,20 +634,21 @@ static int btrfs_fill_super(struct super_block *sb, return 0; fail_close: - close_ctree(tree_root); + close_ctree(fs_info->tree_root); return err; } int btrfs_sync_fs(struct super_block *sb, int wait) { struct btrfs_trans_handle *trans; - struct btrfs_root *root = btrfs_sb(sb); + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + struct btrfs_root *root = fs_info->tree_root; int ret; trace_btrfs_sync_fs(wait); if (!wait) { - filemap_flush(root->fs_info->btree_inode->i_mapping); + filemap_flush(fs_info->btree_inode->i_mapping); return 0; } @@ -663,8 +664,8 @@ int btrfs_sync_fs(struct super_block *sb, int wait) static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) { - struct btrfs_root *root = btrfs_sb(dentry->d_sb); - struct btrfs_fs_info *info = root->fs_info; + struct btrfs_fs_info *info = btrfs_sb(dentry->d_sb); + struct btrfs_root *root = info->tree_root; char *compress_type; if (btrfs_test_opt(root, DEGRADED)) @@ -727,10 +728,10 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) static int btrfs_test_super(struct super_block *s, void *data) { - struct btrfs_root *test_root = data; - struct btrfs_root *root = btrfs_sb(s); + struct btrfs_fs_info *p = data; + struct btrfs_fs_info *fs_info = btrfs_sb(s); - return root->fs_info->fs_devices == test_root->fs_info->fs_devices; + return fs_info->fs_devices == p->fs_devices; } static int btrfs_set_super(struct super_block *s, void *data) @@ -922,8 +923,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, } bdev = fs_devices->latest_bdev; - s = sget(fs_type, btrfs_test_super, btrfs_set_super, - fs_info->tree_root); + s = sget(fs_type, btrfs_test_super, btrfs_set_super, fs_info); if (IS_ERR(s)) { error = PTR_ERR(s); goto error_close_devices; @@ -939,7 +939,7 @@ static struct dentry *btrfs_mount(struct file_system_type 
*fs_type, int flags, s->s_flags = flags | MS_NOSEC; strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); - btrfs_sb(s)->fs_info->bdev_holder = fs_type; + btrfs_sb(s)->bdev_holder = fs_type; error = btrfs_fill_super(s, fs_devices, data, flags & MS_SILENT ? 1 : 0); } @@ -959,7 +959,8 @@ error_fs_info: static int btrfs_remount(struct super_block *sb, int *flags, char *data) { - struct btrfs_root *root = btrfs_sb(sb); + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + struct btrfs_root *root = fs_info->tree_root; int ret; ret = btrfs_parse_options(root, data); @@ -975,13 +976,13 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) ret = btrfs_commit_super(root); WARN_ON(ret); } else { - if (root->fs_info->fs_devices->rw_devices == 0) + if (fs_info->fs_devices->rw_devices == 0) return -EACCES; - if (btrfs_super_log_root(root->fs_info->super_copy) != 0) + if (btrfs_super_log_root(fs_info->super_copy) != 0) return -EINVAL; - ret = btrfs_cleanup_fs_roots(root->fs_info); + ret = btrfs_cleanup_fs_roots(fs_info); WARN_ON(ret); /* recover relocation */ @@ -1150,18 +1151,18 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) { - struct btrfs_root *root = btrfs_sb(dentry->d_sb); - struct btrfs_super_block *disk_super = root->fs_info->super_copy; - struct list_head *head = &root->fs_info->space_info; + struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb); + struct btrfs_super_block *disk_super = fs_info->super_copy; + struct list_head *head = &fs_info->space_info; struct btrfs_space_info *found; u64 total_used = 0; u64 total_free_data = 0; int bits = dentry->d_sb->s_blocksize_bits; - __be32 *fsid = (__be32 *)root->fs_info->fsid; + __be32 *fsid = (__be32 *)fs_info->fsid; int ret; /* holding chunk_muext to avoid allocating new chunks */ - mutex_lock(&root->fs_info->chunk_mutex); + mutex_lock(&fs_info->chunk_mutex); rcu_read_lock(); list_for_each_entry_rcu(found, head, list) { if (found->flags & BTRFS_BLOCK_GROUP_DATA) { @@ -1180,14 +1181,14 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bsize = dentry->d_sb->s_blocksize; buf->f_type = BTRFS_SUPER_MAGIC; buf->f_bavail = total_free_data; - ret = btrfs_calc_avail_data_space(root, &total_free_data); + ret = btrfs_calc_avail_data_space(fs_info->tree_root, &total_free_data); if (ret) { - mutex_unlock(&root->fs_info->chunk_mutex); + mutex_unlock(&fs_info->chunk_mutex); return ret; } buf->f_bavail += total_free_data; buf->f_bavail = buf->f_bavail >> bits; - mutex_unlock(&root->fs_info->chunk_mutex); + mutex_unlock(&fs_info->chunk_mutex); /* We treat it as constant endianness (it doesn't matter _which_) because we want the fsid to come out the same whether mounted @@ -1203,7 +1204,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) static void btrfs_kill_super(struct super_block *sb) { - struct btrfs_fs_info *fs_info = btrfs_sb(sb)->fs_info; + struct btrfs_fs_info *fs_info = btrfs_sb(sb); kill_anon_super(sb); free_fs_info(fs_info); } @@ -1246,17 +1247,17 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, static int btrfs_freeze(struct super_block *sb) { - struct btrfs_root *root = btrfs_sb(sb); - mutex_lock(&root->fs_info->transaction_kthread_mutex); - mutex_lock(&root->fs_info->cleaner_mutex); + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + mutex_lock(&fs_info->transaction_kthread_mutex); + mutex_lock(&fs_info->cleaner_mutex); return 0; } static int 
btrfs_unfreeze(struct super_block *sb) { - struct btrfs_root *root = btrfs_sb(sb); - mutex_unlock(&root->fs_info->cleaner_mutex); - mutex_unlock(&root->fs_info->transaction_kthread_mutex); + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + mutex_unlock(&fs_info->cleaner_mutex); + mutex_unlock(&fs_info->transaction_kthread_mutex); return 0; } -- cgit v1.2.3 From f84a8bd60e3ee49eacc9ba824babf149ba3dad7e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 17 Nov 2011 15:57:57 -0500 Subject: btrfs: take allocation of ->tree_root into open_ctree() now that we don't need it for sget() anymore... Signed-off-by: Al Viro --- fs/btrfs/disk-io.c | 8 +++++--- fs/btrfs/disk-io.h | 2 -- fs/btrfs/super.c | 5 ----- 3 files changed, 5 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index f7d8b9ba519..dd71be87593 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1215,7 +1215,7 @@ static int find_and_setup_root(struct btrfs_root *tree_root, return 0; } -struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info) +static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info) { struct btrfs_root *root = kzalloc(sizeof(*root), GFP_NOFS); if (root) @@ -1895,7 +1895,7 @@ int open_ctree(struct super_block *sb, struct buffer_head *bh; struct btrfs_super_block *disk_super; struct btrfs_fs_info *fs_info = btrfs_sb(sb); - struct btrfs_root *tree_root = fs_info->tree_root; + struct btrfs_root *tree_root; struct btrfs_root *extent_root; struct btrfs_root *csum_root; struct btrfs_root *chunk_root; @@ -1906,12 +1906,14 @@ int open_ctree(struct super_block *sb, int num_backups_tried = 0; int backup_index = 0; + tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info); extent_root = fs_info->extent_root = btrfs_alloc_root(fs_info); csum_root = fs_info->csum_root = btrfs_alloc_root(fs_info); chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info); dev_root = fs_info->dev_root = btrfs_alloc_root(fs_info); - if (!extent_root || !csum_root || !chunk_root || !dev_root) { + if (!tree_root || !extent_root || !csum_root || + !chunk_root || !dev_root) { err = -ENOMEM; goto fail; } diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 6f5f48778b0..e4bc4741319 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -86,8 +86,6 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, int btrfs_add_log_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root); -struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info); - #ifdef CONFIG_DEBUG_LOCK_ALLOC void btrfs_init_lockdep(void); void btrfs_set_buffer_lockdep_class(u64 objectid, diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 305adf6dc09..044652ee7ef 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -899,11 +899,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, if (!fs_info) return ERR_PTR(-ENOMEM); - fs_info->tree_root = btrfs_alloc_root(fs_info); - if (!fs_info->tree_root) { - error = -ENOMEM; - goto error_fs_info; - } fs_info->fs_devices = fs_devices; fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS); -- cgit v1.2.3 From 3850aba74873aa47fefe6900b99f42f5e656a6e7 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 8 Jan 2012 19:40:27 -0500 Subject: devpts: fix double-free on mount failure devpts_kill_sb() is called even if devpts_fill_super() fails; we should not do that kfree() in the latter, especially not with ->s_fs_info left pointing to freed object. Double kfree() is a Bad Thing(tm)... 
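To spell out the ownership rule the fix restores (a sketch with made-up example_* names, not the devpts code itself): once ->s_fs_info has been attached to the superblock, only the ->kill_sb() path may free it, so a failing fill_super must leave it alone:

	static int example_fill_super(struct super_block *s, void *data, int silent)
	{
		struct inode *inode;

		/* s->s_fs_info was already allocated and attached by the caller */
		inode = new_inode(s);
		if (!inode)
			return -ENOMEM;	/* unwind, but do NOT kfree(s->s_fs_info) */
		/* ... rest of the setup ... */
		return 0;
	}

	static void example_kill_sb(struct super_block *s)
	{
		void *fsi = s->s_fs_info;

		kill_anon_super(s);
		kfree(fsi);		/* freed exactly once, on the teardown path */
	}
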
Signed-off-by: Al Viro --- fs/devpts/inode.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 79673eb7115..c4e2a58a2e8 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -301,7 +301,7 @@ devpts_fill_super(struct super_block *s, void *data, int silent) inode = new_inode(s); if (!inode) - goto free_fsi; + goto fail; inode->i_ino = 1; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR; @@ -316,8 +316,6 @@ devpts_fill_super(struct super_block *s, void *data, int silent) printk(KERN_ERR "devpts: get root dentry failed\n"); iput(inode); -free_fsi: - kfree(s->s_fs_info); fail: return -ENOMEM; } -- cgit v1.2.3 From da01636a6511c3bd0c1cf546c47b8e92a837a613 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 8 Jan 2012 19:45:28 -0500 Subject: exofs: oops after late failure in mount We have already set ->s_root, so ->put_super() is going to be called. Freeing ->s_fs_info is a bloody bad idea when it's going to be dereferenced very shortly... Signed-off-by: Al Viro --- fs/exofs/super.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 8addfe314dc..d22cd168c6e 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -838,6 +838,8 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) ret = bdi_setup_and_register(&sbi->bdi, "exofs", BDI_CAP_MAP_COPY); if (ret) { EXOFS_DBGMSG("Failed to bdi_setup_and_register\n"); + dput(sb->s_root); + sb->s_root = NULL; goto free_sbi; } -- cgit v1.2.3 From 0ce8c0109f548ed75535d96ec5a347b410ed1472 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 8 Jan 2012 19:50:23 -0500 Subject: ext[34]: avoid i_nlink warnings triggered by drop_nlink/inc_nlink kludge in symlink() Both ext3 and ext4 put the half-created symlink inode into the orphan list for a while (see the comment in ext[34]_symlink() for gory details). Then, if everything went fine, they pull it out of the orphan list and bump the link count back to 1. The thing is, inc_nlink() is going to complain about seeing somebody changing i_nlink from 0 to 1. With a good reason, since normally something like that is a bug. Explicit set_nlink(inode, 1) does the same thing as inc_nlink() here, but it does *not* complain - exactly because it should be usable in strange situations like this one. Signed-off-by: Al Viro --- fs/ext3/namei.c | 2 +- fs/ext4/namei.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 4f35b2f315d..d269821203f 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -2272,7 +2272,7 @@ retry: err = PTR_ERR(handle); goto err_drop_inode; } - inc_nlink(inode); + set_nlink(inode, 1); err = ext3_orphan_del(handle, inode); if (err) { ext3_journal_stop(handle); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 86edc45b52a..2043f482375 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2315,7 +2315,7 @@ retry: err = PTR_ERR(handle); goto err_drop_inode; } - inc_nlink(inode); + set_nlink(inode, 1); err = ext4_orphan_del(handle, inode); if (err) { ext4_journal_stop(handle); -- cgit v1.2.3 From d03e1292c46721f60830c5d2e334966472002ed0 Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Mon, 5 Dec 2011 15:55:11 +0800 Subject: ext3: replace ll_rw_block with other functions ll_rw_block() is deprecated. Thus we replace it with other functions. 
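The hunks below all follow the same shape; as a simplified sketch (bh and err are assumed to come from the surrounding function, this is not a literal hunk), the deprecated ll_rw_block() call becomes either the bh_uptodate_or_lock()/bh_submit_read() pair or, where REQ_META | REQ_PRIO must be passed explicitly, an open-coded submit_bh():

	/*
	 * bh_uptodate_or_lock() returns 1 if the buffer is already uptodate,
	 * otherwise it locks the buffer and returns 0.
	 */
	if (!bh_uptodate_or_lock(bh))
		err = bh_submit_read(bh);	/* submits the read and waits; 0 on success, -EIO on error */

	/* Variant with explicit request flags (readahead paths skip the wait): */
	if (bh && !bh_uptodate_or_lock(bh)) {
		get_bh(bh);
		bh->b_end_io = end_buffer_read_sync;
		submit_bh(READ | REQ_META | REQ_PRIO, bh);
		wait_on_buffer(bh);
	}
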
CC: Jan Kara Signed-off-by: Zheng Liu Signed-off-by: Jan Kara --- fs/ext3/inode.c | 14 +++++++------- fs/ext3/namei.c | 9 ++++++--- fs/ext3/super.c | 10 +++++----- 3 files changed, 18 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 9da9125ba3e..a8d3217c5cf 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -1136,9 +1136,11 @@ struct buffer_head *ext3_bread(handle_t *handle, struct inode *inode, bh = ext3_getblk(handle, inode, block, create, err); if (!bh) return bh; - if (buffer_uptodate(bh)) + if (bh_uptodate_or_lock(bh)) return bh; - ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &bh); + get_bh(bh); + bh->b_end_io = end_buffer_read_sync; + submit_bh(READ | REQ_META | REQ_PRIO, bh); wait_on_buffer(bh); if (buffer_uptodate(bh)) return bh; @@ -2068,12 +2070,10 @@ static int ext3_block_truncate_page(struct inode *inode, loff_t from) if (PageUptodate(page)) set_buffer_uptodate(bh); - if (!buffer_uptodate(bh)) { - err = -EIO; - ll_rw_block(READ, 1, &bh); - wait_on_buffer(bh); + if (!bh_uptodate_or_lock(bh)) { + err = bh_submit_read(bh); /* Uhhuh. Read error. Complain and punt. */ - if (!buffer_uptodate(bh)) + if (err) goto unlock; } diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 642dc6d66df..337c3f3375c 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -921,9 +921,12 @@ restart: num++; bh = ext3_getblk(NULL, dir, b++, 0, &err); bh_use[ra_max] = bh; - if (bh) - ll_rw_block(READ | REQ_META | REQ_PRIO, - 1, &bh); + if (bh && !bh_uptodate_or_lock(bh)) { + get_bh(bh); + bh->b_end_io = end_buffer_read_sync; + submit_bh(READ | REQ_META | REQ_PRIO, + bh); + } } } if ((bh = bh_use[ra_ptr++]) == NULL) diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 767fa3a2bd1..662cef4569a 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -2231,11 +2231,11 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb, goto out_bdev; } journal->j_private = sb; - ll_rw_block(READ, 1, &journal->j_sb_buffer); - wait_on_buffer(journal->j_sb_buffer); - if (!buffer_uptodate(journal->j_sb_buffer)) { - ext3_msg(sb, KERN_ERR, "I/O error on journal device"); - goto out_journal; + if (!bh_uptodate_or_lock(journal->j_sb_buffer)) { + if (bh_submit_read(journal->j_sb_buffer)) { + ext3_msg(sb, KERN_ERR, "I/O error on journal device"); + goto out_journal; + } } if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) { ext3_msg(sb, KERN_ERR, -- cgit v1.2.3 From 1415dd8705394399d59a3df1ab48d149e1e41e77 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 8 Dec 2011 21:13:46 +0100 Subject: ext3: Fix error handling on inode bitmap corruption When insert_inode_locked() fails in ext3_new_inode() it most likely means inode bitmap got corrupted and we allocated again inode which is already in use. Also doing unlock_new_inode() during error recovery is wrong since inode does not have I_NEW set. Fix the problem by jumping to fail: (instead of fail_drop:) which declares filesystem error and does not call unlock_new_inode(). Reviewed-by: Eric Sandeen Signed-off-by: Jan Kara --- fs/ext3/ialloc.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c index 5c866e06e7a..adae962ee95 100644 --- a/fs/ext3/ialloc.c +++ b/fs/ext3/ialloc.c @@ -525,8 +525,12 @@ got: if (IS_DIRSYNC(inode)) handle->h_sync = 1; if (insert_inode_locked(inode) < 0) { - err = -EINVAL; - goto fail_drop; + /* + * Likely a bitmap corruption causing inode to be allocated + * twice. 
+ */ + err = -EIO; + goto fail; } spin_lock(&sbi->s_next_gen_lock); inode->i_generation = sbi->s_next_generation++; -- cgit v1.2.3 From ef6919c283257155def420bd247140e9fd2e9843 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 9 Dec 2011 00:08:58 +0100 Subject: ext2: Fix error handling on inode bitmap corruption When insert_inode_locked() fails in ext2_new_inode() it most likely means inode bitmap got corrupted and we allocated again inode which is already in use. Also doing unlock_new_inode() during error recovery is wrong since the inode does not have I_NEW set. Fix the problem by informing about filesystem error and jumping to fail: (instead of fail_drop:) which doesn't call unlock_new_inode(). Reviewed-by: Eric Sandeen Signed-off-by: Jan Kara --- fs/ext2/ialloc.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index c4e81dfb74b..78502c16681 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -573,8 +573,11 @@ got: inode->i_generation = sbi->s_next_generation++; spin_unlock(&sbi->s_next_gen_lock); if (insert_inode_locked(inode) < 0) { - err = -EINVAL; - goto fail_drop; + ext2_error(sb, "ext2_new_inode", + "inode number already in use - inode=%lu", + (unsigned long) ino); + err = -EIO; + goto fail; } dquot_initialize(inode); -- cgit v1.2.3 From 7b0b0933a3ff6052addf4d49ea99f75ab27df2d0 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sat, 10 Dec 2011 01:43:33 +0100 Subject: udf: Cleanup calling convention of inode_getblk() inode_getblk() always returned NULL and passed results in its parameters. Make the function return something useful - found block number. Signed-off-by: Jan Kara --- fs/udf/inode.c | 36 +++++++++++++++--------------------- 1 file changed, 15 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 4fd1d809738..1bd2c42f3b4 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -53,8 +53,7 @@ static int udf_update_inode(struct inode *, int); static void udf_fill_inode(struct inode *, struct buffer_head *); static int udf_sync_inode(struct inode *inode); static int udf_alloc_i_data(struct inode *inode, size_t size); -static struct buffer_head *inode_getblk(struct inode *, sector_t, int *, - sector_t *, int *); +static sector_t inode_getblk(struct inode *, sector_t, int *, int *); static int8_t udf_insert_aext(struct inode *, struct extent_position, struct kernel_lb_addr, uint32_t); static void udf_split_extents(struct inode *, int *, int, int, @@ -310,7 +309,6 @@ static int udf_get_block(struct inode *inode, sector_t block, struct buffer_head *bh_result, int create) { int err, new; - struct buffer_head *bh; sector_t phys = 0; struct udf_inode_info *iinfo; @@ -323,7 +321,6 @@ static int udf_get_block(struct inode *inode, sector_t block, err = -EIO; new = 0; - bh = NULL; iinfo = UDF_I(inode); down_write(&iinfo->i_data_sem); @@ -332,13 +329,10 @@ static int udf_get_block(struct inode *inode, sector_t block, iinfo->i_next_alloc_goal++; } - err = 0; - bh = inode_getblk(inode, block, &err, &phys, &new); - BUG_ON(bh); - if (err) + phys = inode_getblk(inode, block, &err, &new); + if (!phys) goto abort; - BUG_ON(!phys); if (new) set_buffer_new(bh_result); @@ -547,11 +541,10 @@ out: return err; } -static struct buffer_head *inode_getblk(struct inode *inode, sector_t block, - int *err, sector_t *phys, int *new) +static sector_t inode_getblk(struct inode *inode, sector_t block, + int *err, int *new) { static sector_t last_block; - struct buffer_head *result = NULL; 
struct kernel_long_ad laarr[EXTENT_MERGE_SIZE]; struct extent_position prev_epos, cur_epos, next_epos; int count = 0, startnum = 0, endnum = 0; @@ -566,6 +559,8 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block, int goal = 0, pgoal = iinfo->i_location.logicalBlockNum; int lastblock = 0; + *err = 0; + *new = 0; prev_epos.offset = udf_file_entry_alloc_offset(inode); prev_epos.block = iinfo->i_location; prev_epos.bh = NULL; @@ -635,8 +630,7 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block, brelse(cur_epos.bh); brelse(next_epos.bh); newblock = udf_get_lb_pblock(inode->i_sb, &eloc, offset); - *phys = newblock; - return NULL; + return newblock; } last_block = block; @@ -664,7 +658,7 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block, brelse(cur_epos.bh); brelse(next_epos.bh); *err = ret; - return NULL; + return 0; } c = 0; offset = 0; @@ -729,7 +723,7 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block, if (!newblocknum) { brelse(prev_epos.bh); *err = -ENOSPC; - return NULL; + return 0; } iinfo->i_lenExtents += inode->i_sb->s_blocksize; } @@ -761,10 +755,10 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block, newblock = udf_get_pblock(inode->i_sb, newblocknum, iinfo->i_location.partitionReferenceNum, 0); - if (!newblock) - return NULL; - *phys = newblock; - *err = 0; + if (!newblock) { + *err = -EIO; + return 0; + } *new = 1; iinfo->i_next_alloc_block = block; iinfo->i_next_alloc_goal = newblocknum; @@ -775,7 +769,7 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block, else mark_inode_dirty(inode); - return result; + return newblock; } static void udf_split_extents(struct inode *inode, int *c, int offset, -- cgit v1.2.3 From d2eb8c359309ec45d6bf5b147303ab8e13be86ea Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sat, 10 Dec 2011 02:30:48 +0100 Subject: udf: Fix deadlock when converting file from in-ICB one to normal one During BKL removal in 2.6.38, conversion of files from in-ICB format to normal format got broken. We call ->writepage with i_data_sem held but udf_get_block() also acquires i_data_sem thus creating A-A deadlock. We fix the problem by dropping i_data_sem before calling ->writepage() which is safe since i_mutex still protects us against any changes in the file. Also fix pagelock - i_data_sem lock inversion in udf_expand_file_adinicb() by dropping i_data_sem before calling find_or_create_page(). 
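Put differently, the ordering rule being restored is: the page lock and ->writepage() rank above i_data_sem, so i_data_sem has to be dropped before either is used, and it is i_mutex (held by the caller) that keeps the file from changing in the window. A rough sketch of the resulting shape (simplified; iinfo, page and err come from the surrounding function, the real code lives in udf_expand_file_adinicb()):

	/* entered with i_mutex and i_data_sem held for writing */
	up_write(&iinfo->i_data_sem);	/* page lock ranks above i_data_sem */
	page = find_or_create_page(inode->i_mapping, 0, GFP_NOFS);
	if (!page)
		return -ENOMEM;

	down_write(&iinfo->i_data_sem);	/* copy/clear the in-ICB data under the lock */
	/* ... switch i_alloc_type and a_ops ... */
	up_write(&iinfo->i_data_sem);	/* drop it again: ->writepage() may call
					 * udf_get_block(), which takes i_data_sem */
	err = inode->i_data.a_ops->writepage(page, &udf_wbc);
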
CC: stable@kernel.org Reported-by: Matthias Matiak Tested-by: Matthias Matiak Reviewed-by: Namjae Jeon Signed-off-by: Jan Kara --- fs/udf/file.c | 6 +++--- fs/udf/inode.c | 21 ++++++++++++++++++--- 2 files changed, 21 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/udf/file.c b/fs/udf/file.c index d8ffa7cc661..dca0c3881e8 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -125,7 +125,6 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov, err = udf_expand_file_adinicb(inode); if (err) { udf_debug("udf_expand_adinicb: err=%d\n", err); - up_write(&iinfo->i_data_sem); return err; } } else { @@ -133,9 +132,10 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov, iinfo->i_lenAlloc = pos + count; else iinfo->i_lenAlloc = inode->i_size; + up_write(&iinfo->i_data_sem); } - } - up_write(&iinfo->i_data_sem); + } else + up_write(&iinfo->i_data_sem); retval = generic_file_aio_write(iocb, iov, nr_segs, ppos); if (retval > 0) diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 1bd2c42f3b4..4f7b1ffd9e3 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -150,6 +150,12 @@ const struct address_space_operations udf_aops = { .bmap = udf_bmap, }; +/* + * Expand file stored in ICB to a normal one-block-file + * + * This function requires i_data_sem for writing and releases it. + * This function requires i_mutex held + */ int udf_expand_file_adinicb(struct inode *inode) { struct page *page; @@ -168,9 +174,15 @@ int udf_expand_file_adinicb(struct inode *inode) iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG; /* from now on we have normal address_space methods */ inode->i_data.a_ops = &udf_aops; + up_write(&iinfo->i_data_sem); mark_inode_dirty(inode); return 0; } + /* + * Release i_data_sem so that we can lock a page - page lock ranks + * above i_data_sem. i_mutex still protects us against file changes. + */ + up_write(&iinfo->i_data_sem); page = find_or_create_page(inode->i_mapping, 0, GFP_NOFS); if (!page) @@ -186,6 +198,7 @@ int udf_expand_file_adinicb(struct inode *inode) SetPageUptodate(page); kunmap(page); } + down_write(&iinfo->i_data_sem); memset(iinfo->i_ext.i_data + iinfo->i_lenEAttr, 0x00, iinfo->i_lenAlloc); iinfo->i_lenAlloc = 0; @@ -195,17 +208,20 @@ int udf_expand_file_adinicb(struct inode *inode) iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG; /* from now on we have normal address_space methods */ inode->i_data.a_ops = &udf_aops; + up_write(&iinfo->i_data_sem); err = inode->i_data.a_ops->writepage(page, &udf_wbc); if (err) { /* Restore everything back so that we don't lose data... 
*/ lock_page(page); kaddr = kmap(page); + down_write(&iinfo->i_data_sem); memcpy(iinfo->i_ext.i_data + iinfo->i_lenEAttr, kaddr, inode->i_size); kunmap(page); unlock_page(page); iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB; inode->i_data.a_ops = &udf_adinicb_aops; + up_write(&iinfo->i_data_sem); } page_cache_release(page); mark_inode_dirty(inode); @@ -1105,10 +1121,9 @@ int udf_setsize(struct inode *inode, loff_t newsize) if (bsize < (udf_file_entry_alloc_offset(inode) + newsize)) { err = udf_expand_file_adinicb(inode); - if (err) { - up_write(&iinfo->i_data_sem); + if (err) return err; - } + down_write(&iinfo->i_data_sem); } else iinfo->i_lenAlloc = newsize; } -- cgit v1.2.3 From fef2e9f3301934773e4f1b3cc5c7bffb119346b8 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 12 Dec 2011 15:13:50 +0100 Subject: udf: Treat symlink component of type 2 as / MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, we ignore symlink component of type 2. But mkisofs and other OS' seem to treat it as / so do the same for compatibility. Reported-by: "Gábor S." Signed-off-by: Jan Kara --- fs/udf/symlink.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c index b1d4488b0f1..d7c6dbe4194 100644 --- a/fs/udf/symlink.c +++ b/fs/udf/symlink.c @@ -41,10 +41,16 @@ static void udf_pc_to_char(struct super_block *sb, unsigned char *from, pc = (struct pathComponent *)(from + elen); switch (pc->componentType) { case 1: - if (pc->lengthComponentIdent == 0) { - p = to; - *p++ = '/'; - } + /* + * Symlink points to some place which should be agreed + * upon between originator and receiver of the media. Ignore. + */ + if (pc->lengthComponentIdent > 0) + break; + /* Fall through */ + case 2: + p = to; + *p++ = '/'; break; case 3: memcpy(p, "../", 3); -- cgit v1.2.3 From a06d789b424190e9f59da391681f908486db2554 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 21 Dec 2011 17:35:34 +0100 Subject: reiserfs: Fix quota mount option parsing When jqfmt mount option is not specified on remount, we mistakenly clear s_jquota_fmt value stored in superblock. Fix the problem. CC: stable@kernel.org CC: reiserfs-devel@vger.kernel.org Signed-off-by: Jan Kara --- fs/reiserfs/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 14363b96b6a..f9eaa4a4f5f 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -1164,7 +1164,8 @@ static void handle_quota_files(struct super_block *s, char **qf_names, kfree(REISERFS_SB(s)->s_qf_names[i]); REISERFS_SB(s)->s_qf_names[i] = qf_names[i]; } - REISERFS_SB(s)->s_jquota_fmt = *qfmt; + if (*qfmt) + REISERFS_SB(s)->s_jquota_fmt = *qfmt; } #endif -- cgit v1.2.3 From a9e36da655e54545c3289b2a0700b5c443de0edd Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Wed, 21 Dec 2011 21:18:43 +0100 Subject: reiserfs: Force inode evictions before umount to avoid crash This patch fixes a crash in reiserfs_delete_xattrs during umount. When shrink_dcache_for_umount clears the dcache from generic_shutdown_super, delayed evictions are forced to disk. If an evicted inode has extended attributes associated with it, it will need to walk the xattr tree to locate and remove them. But since shrink_dcache_for_umount will BUG if it encounters active dentries, the xattr tree must be released before it's called or it will crash during every umount. 
This patch forces the evictions to occur before generic_shutdown_super by calling shrink_dcache_sb first. The additional evictions caused by the removal of each associated xattr file and dir will be automatically handled as they're added to the LRU list. CC: reiserfs-devel@vger.kernel.org CC: stable@kernel.org Signed-off-by: Jeff Mahoney Signed-off-by: Jan Kara --- fs/reiserfs/super.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index f9eaa4a4f5f..5e3527be114 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -453,16 +453,20 @@ int remove_save_link(struct inode *inode, int truncate) static void reiserfs_kill_sb(struct super_block *s) { if (REISERFS_SB(s)) { - if (REISERFS_SB(s)->xattr_root) { - d_invalidate(REISERFS_SB(s)->xattr_root); - dput(REISERFS_SB(s)->xattr_root); - REISERFS_SB(s)->xattr_root = NULL; - } - if (REISERFS_SB(s)->priv_root) { - d_invalidate(REISERFS_SB(s)->priv_root); - dput(REISERFS_SB(s)->priv_root); - REISERFS_SB(s)->priv_root = NULL; - } + /* + * Force any pending inode evictions to occur now. Any + * inodes to be removed that have extended attributes + * associated with them need to clean them up before + * we can release the extended attribute root dentries. + * shrink_dcache_for_umount will BUG if we don't release + * those before it's called so ->put_super is too late. + */ + shrink_dcache_sb(s); + + dput(REISERFS_SB(s)->xattr_root); + REISERFS_SB(s)->xattr_root = NULL; + dput(REISERFS_SB(s)->priv_root); + REISERFS_SB(s)->priv_root = NULL; } kill_block_super(s); -- cgit v1.2.3 From 0048278552e9752fd578c3d8deee756987c10873 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 22 Dec 2011 14:52:21 +0100 Subject: jbd: Remove j_barrier mutex j_barrier mutex is used for serializing different journal lock operations. The problem with it is that e.g. FIFREEZE ioctl results in process leaving kernel with j_barrier mutex held which makes lockdep freak out. Also hibernation code wants to freeze filesystem but it cannot do so because it then cannot hibernate the system because of mutex being locked. So we remove j_barrier mutex and use direct wait on j_barrier_count instead. Since locking journal is a rare operation we don't have to care about fairness or such things. CC: Andrew Morton Acked-by: Joel Becker Signed-off-by: Jan Kara --- fs/jbd/journal.c | 1 - fs/jbd/transaction.c | 38 ++++++++++++++++++++++---------------- include/linux/jbd.h | 4 ---- 3 files changed, 22 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index fea8dd661d2..1656dc2e6a0 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -721,7 +721,6 @@ static journal_t * journal_init_common (void) init_waitqueue_head(&journal->j_wait_checkpoint); init_waitqueue_head(&journal->j_wait_commit); init_waitqueue_head(&journal->j_wait_updates); - mutex_init(&journal->j_barrier); mutex_init(&journal->j_checkpoint_mutex); spin_lock_init(&journal->j_revoke_lock); spin_lock_init(&journal->j_list_lock); diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index 7e59c6e66f9..7fce94b04bc 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -426,17 +426,34 @@ int journal_restart(handle_t *handle, int nblocks) * void journal_lock_updates () - establish a transaction barrier. * @journal: Journal to establish a barrier on. 
* - * This locks out any further updates from being started, and blocks - * until all existing updates have completed, returning only once the - * journal is in a quiescent state with no updates running. - * - * The journal lock should not be held on entry. + * This locks out any further updates from being started, and blocks until all + * existing updates have completed, returning only once the journal is in a + * quiescent state with no updates running. + * + * We do not use simple mutex for synchronization as there are syscalls which + * want to return with filesystem locked and that trips up lockdep. Also + * hibernate needs to lock filesystem but locked mutex then blocks hibernation. + * Since locking filesystem is rare operation, we use simple counter and + * waitqueue for locking. */ void journal_lock_updates(journal_t *journal) { DEFINE_WAIT(wait); +wait: + /* Wait for previous locked operation to finish */ + wait_event(journal->j_wait_transaction_locked, + journal->j_barrier_count == 0); + spin_lock(&journal->j_state_lock); + /* + * Check reliably under the lock whether we are the ones winning the race + * and locking the journal + */ + if (journal->j_barrier_count > 0) { + spin_unlock(&journal->j_state_lock); + goto wait; + } ++journal->j_barrier_count; /* Wait until there are no running updates */ @@ -460,14 +477,6 @@ void journal_lock_updates(journal_t *journal) spin_lock(&journal->j_state_lock); } spin_unlock(&journal->j_state_lock); - - /* - * We have now established a barrier against other normal updates, but - * we also need to barrier against other journal_lock_updates() calls - * to make sure that we serialise special journal-locked operations - * too. - */ - mutex_lock(&journal->j_barrier); } /** @@ -475,14 +484,11 @@ void journal_lock_updates(journal_t *journal) * @journal: Journal to release the barrier on. * * Release a transaction barrier obtained with journal_lock_updates(). - * - * Should be called without the journal lock held. */ void journal_unlock_updates (journal_t *journal) { J_ASSERT(journal->j_barrier_count != 0); - mutex_unlock(&journal->j_barrier); spin_lock(&journal->j_state_lock); --journal->j_barrier_count; spin_unlock(&journal->j_state_lock); diff --git a/include/linux/jbd.h b/include/linux/jbd.h index 492cc12244f..d211732b9e9 100644 --- a/include/linux/jbd.h +++ b/include/linux/jbd.h @@ -497,7 +497,6 @@ struct transaction_s * @j_format_version: Version of the superblock format * @j_state_lock: Protect the various scalars in the journal * @j_barrier_count: Number of processes waiting to create a barrier lock - * @j_barrier: The barrier lock itself * @j_running_transaction: The current running transaction.. * @j_committing_transaction: the transaction we are pushing to disk * @j_checkpoint_transactions: a linked circular list of all transactions @@ -580,9 +579,6 @@ struct journal_s */ int j_barrier_count; - /* The barrier lock itself */ - struct mutex j_barrier; - /* * Transactions: The current running transaction... * [j_state_lock] [caller holding open handle] -- cgit v1.2.3 From 33c104d415e92a51aaf638dc3d93920cfa601e5c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 22 Dec 2011 16:49:05 +0100 Subject: ext3: Don't warn from writepage when readonly inode is spotted after error WARN_ON_ONCE(IS_RDONLY(inode)) tends to trip when filesystem hits error and is remounted read-only. This unnecessarily scares users (well, they should be scared because of filesystem error, but the stack trace distracts them from the right source of their fear ;-). 
We could as well just remove the WARN_ON but it's not hard to fix it to not trip on filesystem with errors and not use more cycles in the common case so that's what we do. CC: stable@kernel.org Signed-off-by: Jan Kara --- fs/ext3/inode.c | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index a8d3217c5cf..fbf6599a1ce 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -1623,7 +1623,13 @@ static int ext3_ordered_writepage(struct page *page, int err; J_ASSERT(PageLocked(page)); - WARN_ON_ONCE(IS_RDONLY(inode)); + /* + * We don't want to warn for emergency remount. The condition is + * ordered to avoid dereferencing inode->i_sb in non-error case to + * avoid slow-downs. + */ + WARN_ON_ONCE(IS_RDONLY(inode) && + !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS)); /* * We give up here if we're reentered, because it might be for a @@ -1698,7 +1704,13 @@ static int ext3_writeback_writepage(struct page *page, int err; J_ASSERT(PageLocked(page)); - WARN_ON_ONCE(IS_RDONLY(inode)); + /* + * We don't want to warn for emergency remount. The condition is + * ordered to avoid dereferencing inode->i_sb in non-error case to + * avoid slow-downs. + */ + WARN_ON_ONCE(IS_RDONLY(inode) && + !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS)); if (ext3_journal_current_handle()) goto out_fail; @@ -1741,7 +1753,13 @@ static int ext3_journalled_writepage(struct page *page, int err; J_ASSERT(PageLocked(page)); - WARN_ON_ONCE(IS_RDONLY(inode)); + /* + * We don't want to warn for emergency remount. The condition is + * ordered to avoid dereferencing inode->i_sb in non-error case to + * avoid slow-downs. + */ + WARN_ON_ONCE(IS_RDONLY(inode) && + !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS)); if (ext3_journal_current_handle()) goto no_write; -- cgit v1.2.3 From 853a0c25baf96b028de1654bea1e0c8857eadf3d Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 23 Dec 2011 11:53:07 +0100 Subject: udf: Mark LVID buffer as uptodate before marking it dirty When we hit EIO while writing LVID, the buffer uptodate bit is cleared. This then results in an anoying warning from mark_buffer_dirty() when we write the buffer again. So just set uptodate flag unconditionally. Reviewed-by: Namjae Jeon Signed-off-by: Jan Kara --- fs/udf/super.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/udf/super.c b/fs/udf/super.c index e185253470d..87cb24a0ee7 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -1799,6 +1799,12 @@ static void udf_close_lvid(struct super_block *sb) le16_to_cpu(lvid->descTag.descCRCLength))); lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag); + /* + * We set buffer uptodate unconditionally here to avoid spurious + * warnings from mark_buffer_dirty() when previous EIO has marked + * the buffer as !uptodate + */ + set_buffer_uptodate(bh); mark_buffer_dirty(bh); sbi->s_lvid_dirty = 0; mutex_unlock(&sbi->s_alloc_mutex); -- cgit v1.2.3 From 6c2155b9cc5a193e85194bbeaae2e2e4512dd597 Mon Sep 17 00:00:00 2001 From: Djalal Harouni Date: Tue, 3 Jan 2012 02:31:52 +0100 Subject: ext{3,4}: Fix potential race when setversion ioctl updates inode The EXT{3,4}_IOC_SETVERSION ioctl() updates i_ctime and i_generation without i_mutex. This can lead to a race with the other operations that update i_ctime. This is not a big issue but let's make the ioctl consistent with how we handle e.g. other timestamp updates and use i_mutex to protect inode changes. 
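Condensed, the ordering the ext3 hunk below establishes (ext4 is symmetric) is i_mutex around the whole journalled update, taken before the handle is started:

	mutex_lock(&inode->i_mutex);
	handle = ext3_journal_start(inode, 1);
	if (IS_ERR(handle)) {
		err = PTR_ERR(handle);
		goto unlock_out;
	}
	/* ... bump i_ctime / i_generation, mark the iloc dirty ... */
	ext3_journal_stop(handle);
unlock_out:
	mutex_unlock(&inode->i_mutex);
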
Signed-off-by: Djalal Harouni Signed-off-by: Jan Kara --- fs/ext3/ioctl.c | 6 +++++- fs/ext4/ioctl.c | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c index ba1b54e23ca..e7b2ed9d36c 100644 --- a/fs/ext3/ioctl.c +++ b/fs/ext3/ioctl.c @@ -134,10 +134,11 @@ flags_out: goto setversion_out; } + mutex_lock(&inode->i_mutex); handle = ext3_journal_start(inode, 1); if (IS_ERR(handle)) { err = PTR_ERR(handle); - goto setversion_out; + goto unlock_out; } err = ext3_reserve_inode_write(handle, inode, &iloc); if (err == 0) { @@ -146,6 +147,9 @@ flags_out: err = ext3_mark_iloc_dirty(handle, inode, &iloc); } ext3_journal_stop(handle); + +unlock_out: + mutex_unlock(&inode->i_mutex); setversion_out: mnt_drop_write(filp->f_path.mnt); return err; diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index a56796814d6..46a8de6f208 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -158,10 +158,11 @@ flags_out: goto setversion_out; } + mutex_lock(&inode->i_mutex); handle = ext4_journal_start(inode, 1); if (IS_ERR(handle)) { err = PTR_ERR(handle); - goto setversion_out; + goto unlock_out; } err = ext4_reserve_inode_write(handle, inode, &iloc); if (err == 0) { @@ -170,6 +171,9 @@ flags_out: err = ext4_mark_iloc_dirty(handle, inode, &iloc); } ext4_journal_stop(handle); + +unlock_out: + mutex_unlock(&inode->i_mutex); setversion_out: mnt_drop_write(filp->f_path.mnt); return err; -- cgit v1.2.3 From 302bf2f3259948c93361d501b04a5ed69c3bd4f8 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Wed, 4 Jan 2012 15:59:47 -0500 Subject: ext2/3/4: delete unneeded includes of module.h Delete any instances of include module.h that were not strictly required. In the case of ext2, the declaration of MODULE_LICENSE etc. were in inode.c but the module_init/exit were in super.c, so relocate the MODULE_LICENCE/AUTHOR block to super.c which makes it consistent with ext3 and ext4 at the same time. 
Signed-off-by: Paul Gortmaker Signed-off-by: Jan Kara --- fs/ext2/inode.c | 5 ----- fs/ext2/super.c | 3 +++ fs/ext2/xattr.c | 1 - fs/ext2/xattr_security.c | 1 - fs/ext2/xattr_trusted.c | 1 - fs/ext2/xattr_user.c | 1 - fs/ext3/inode.c | 1 - fs/ext3/xattr_security.c | 1 - fs/ext3/xattr_trusted.c | 1 - fs/ext3/xattr_user.c | 1 - fs/ext4/block_validity.c | 1 - fs/ext4/extents.c | 1 - fs/ext4/indirect.c | 1 - fs/ext4/inode.c | 1 - fs/ext4/migrate.c | 1 - fs/ext4/page-io.c | 1 - fs/ext4/xattr_security.c | 1 - fs/ext4/xattr_trusted.c | 1 - fs/ext4/xattr_user.c | 1 - 19 files changed, 3 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 91a6945af6d..740cad8dcd8 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include @@ -36,10 +35,6 @@ #include "acl.h" #include "xip.h" -MODULE_AUTHOR("Remy Card and others"); -MODULE_DESCRIPTION("Second Extended Filesystem"); -MODULE_LICENSE("GPL"); - static int __ext2_write_inode(struct inode *inode, int do_sync); /* diff --git a/fs/ext2/super.c b/fs/ext2/super.c index bd8ac164a3b..9e7e203146b 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -1521,5 +1521,8 @@ static void __exit exit_ext2_fs(void) exit_ext2_xattr(); } +MODULE_AUTHOR("Remy Card and others"); +MODULE_DESCRIPTION("Second Extended Filesystem"); +MODULE_LICENSE("GPL"); module_init(init_ext2_fs) module_exit(exit_ext2_fs) diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index d27b71f1d18..6dcafc7efdf 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -54,7 +54,6 @@ */ #include -#include #include #include #include diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c index c922adc8ef4..be7a8d02c9a 100644 --- a/fs/ext2/xattr_security.c +++ b/fs/ext2/xattr_security.c @@ -3,7 +3,6 @@ * Handler for storing security labels as extended attributes. */ -#include #include #include #include diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c index 667e46a8d62..2989467d359 100644 --- a/fs/ext2/xattr_trusted.c +++ b/fs/ext2/xattr_trusted.c @@ -5,7 +5,6 @@ * Copyright (C) 2003 by Andreas Gruenbacher, */ -#include #include #include #include diff --git a/fs/ext2/xattr_user.c b/fs/ext2/xattr_user.c index 099d20f4716..f470e44c4b8 100644 --- a/fs/ext2/xattr_user.c +++ b/fs/ext2/xattr_user.c @@ -6,7 +6,6 @@ */ #include -#include #include #include "ext2.h" #include "xattr.h" diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index fbf6599a1ce..261979feb4b 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -22,7 +22,6 @@ * Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000 */ -#include #include #include #include diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c index 3c218b8a51d..ea26f2acab9 100644 --- a/fs/ext3/xattr_security.c +++ b/fs/ext3/xattr_security.c @@ -3,7 +3,6 @@ * Handler for storing security labels as extended attributes. 
*/ -#include #include #include #include diff --git a/fs/ext3/xattr_trusted.c b/fs/ext3/xattr_trusted.c index dc8edda9ffe..2526a8829de 100644 --- a/fs/ext3/xattr_trusted.c +++ b/fs/ext3/xattr_trusted.c @@ -5,7 +5,6 @@ * Copyright (C) 2003 by Andreas Gruenbacher, */ -#include #include #include #include diff --git a/fs/ext3/xattr_user.c b/fs/ext3/xattr_user.c index 7a321974d58..b32e473a1e3 100644 --- a/fs/ext3/xattr_user.c +++ b/fs/ext3/xattr_user.c @@ -5,7 +5,6 @@ * Copyright (C) 2001 by Andreas Gruenbacher, */ -#include #include #include #include diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index 8efb2f0a344..3f11656bd72 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 61fa9e1614a..130ffe4818f 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -29,7 +29,6 @@ * - smart tree reduction */ -#include #include #include #include diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 3cfc73fbca8..830e1b2bf14 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -20,7 +20,6 @@ * (sct@redhat.com), 1993, 1998 */ -#include #include "ext4_jbd2.h" #include "truncate.h" diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 240f6e2dc7e..e5385803336 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -18,7 +18,6 @@ * Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000 */ -#include #include #include #include diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 16ac228dbec..e7d6bb0acfa 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -12,7 +12,6 @@ * */ -#include #include #include "ext4_jbd2.h" diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 7ce1d0b19c9..b1758538b3b 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -6,7 +6,6 @@ * Written by Theodore Ts'o, 2010. */ -#include #include #include #include diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c index 34e4350dd4d..b60f9f81e33 100644 --- a/fs/ext4/xattr_security.c +++ b/fs/ext4/xattr_security.c @@ -3,7 +3,6 @@ * Handler for storing security labels as extended attributes. */ -#include #include #include #include diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c index 37e6ebca2cc..95f1f4ab59a 100644 --- a/fs/ext4/xattr_trusted.c +++ b/fs/ext4/xattr_trusted.c @@ -5,7 +5,6 @@ * Copyright (C) 2003 by Andreas Gruenbacher, */ -#include #include #include #include diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c index 98c375352d0..0edb7611ffb 100644 --- a/fs/ext4/xattr_user.c +++ b/fs/ext4/xattr_user.c @@ -5,7 +5,6 @@ * Copyright (C) 2001 by Andreas Gruenbacher, */ -#include #include #include #include "ext4_jbd2.h" -- cgit v1.2.3 From 8fdd8c49fe50394fef3e193db27222cb03c2b212 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 9 Jan 2012 10:48:11 -0500 Subject: isofs: inode leak on mount failure d_alloc_root() failure leaves root inode leaked... 
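The contract the one-hunk fix below relies on: d_alloc_root() takes over the inode reference only on success; when it returns NULL the caller still owns the reference and must iput() it before bailing out:

	s->s_root = d_alloc_root(inode);
	if (!(s->s_root)) {
		iput(inode);	/* d_alloc_root() did not consume the reference */
		error = -ENOMEM;
		goto out_no_inode;
	}
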
Signed-off-by: Al Viro --- fs/isofs/inode.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 7b99f5f460b..bd62c76fb5d 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -948,8 +948,11 @@ root_found: /* get the root dentry */ s->s_root = d_alloc_root(inode); - if (!(s->s_root)) - goto out_no_root; + if (!(s->s_root)) { + iput(inode); + error = -ENOMEM; + goto out_no_inode; + } kfree(opt.iocharset); -- cgit v1.2.3 From 53466710202900ce49e471f480cac11275e1d0c4 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Tue, 6 Dec 2011 17:06:06 -0600 Subject: jffs2: fix up error handling for insert_inode_locked after 250df6ed274d767da844a5d9f05720b804240197 (fs: protect inode->i_state with inode->i_lock), insert_inode_locked() no longer returns the inode with I_NEW set on failure. However, the error handler still calls unlock_new_inode() on failure, which does a WARN_ON if I_NEW is not set, so any failure spews a lot of warnings. We can just drop the unlock_new_inode() if insert_inode_locked() fails here. Signed-off-by: Eric Sandeen Signed-off-by: Artem Bityutskiy Signed-off-by: David Woodhouse --- fs/jffs2/fs.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index 4b8afe39a87..2e0123867cb 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -466,7 +466,6 @@ struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, struct jffs2_r if (insert_inode_locked(inode) < 0) { make_bad_inode(inode); - unlock_new_inode(inode); iput(inode); return ERR_PTR(-EINVAL); } -- cgit v1.2.3 From 48d3610268cef9cea0704119a74a00d1bf82f536 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Fri, 23 Dec 2011 15:44:14 +0200 Subject: logfs: rename functions starting with mtd_ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We are going to re-work the MTD interface and change 'mtd->write()' to 'mtd_write()', 'mtd->read()' to 'mtd_read()' and so forth for all functions in the 'struct mtd_info' structure. However, logfs has its own 'mtd_read()', 'mtd_write()', etc functions which collide with our changes. This patch renames these logfs functions to 'logfs_mtd_read()', 'logfs_mtd_write()', etc. Additionally, to make the 'fs/logfs/dev_mtd.c' file look consistent, rename similarly all the other functions starting with 'mtd_'. 
Cc: Jörn Engel Signed-off-by: Artem Bityutskiy Signed-off-by: David Woodhouse --- fs/logfs/dev_mtd.c | 61 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 32 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/logfs/dev_mtd.c b/fs/logfs/dev_mtd.c index 339e17e9133..eb423ebcf53 100644 --- a/fs/logfs/dev_mtd.c +++ b/fs/logfs/dev_mtd.c @@ -13,7 +13,8 @@ #define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1)) -static int mtd_read(struct super_block *sb, loff_t ofs, size_t len, void *buf) +static int logfs_mtd_read(struct super_block *sb, loff_t ofs, size_t len, + void *buf) { struct mtd_info *mtd = logfs_super(sb)->s_mtd; size_t retlen; @@ -31,7 +32,8 @@ static int mtd_read(struct super_block *sb, loff_t ofs, size_t len, void *buf) return 0; } -static int mtd_write(struct super_block *sb, loff_t ofs, size_t len, void *buf) +static int loffs_mtd_write(struct super_block *sb, loff_t ofs, size_t len, + void *buf) { struct logfs_super *super = logfs_super(sb); struct mtd_info *mtd = super->s_mtd; @@ -60,14 +62,15 @@ static int mtd_write(struct super_block *sb, loff_t ofs, size_t len, void *buf) * asynchronous properties. So just to prevent the first implementor of such * a thing from breaking logfs in 2350, we do the usual pointless dance to * declare a completion variable and wait for completion before returning - * from mtd_erase(). What an exercise in futility! + * from logfs_mtd_erase(). What an exercise in futility! */ static void logfs_erase_callback(struct erase_info *ei) { complete((struct completion *)ei->priv); } -static int mtd_erase_mapping(struct super_block *sb, loff_t ofs, size_t len) +static int logfs_mtd_erase_mapping(struct super_block *sb, loff_t ofs, + size_t len) { struct logfs_super *super = logfs_super(sb); struct address_space *mapping = super->s_mapping_inode->i_mapping; @@ -84,7 +87,7 @@ static int mtd_erase_mapping(struct super_block *sb, loff_t ofs, size_t len) return 0; } -static int mtd_erase(struct super_block *sb, loff_t ofs, size_t len, +static int logfs_mtd_erase(struct super_block *sb, loff_t ofs, size_t len, int ensure_write) { struct mtd_info *mtd = logfs_super(sb)->s_mtd; @@ -109,10 +112,10 @@ static int mtd_erase(struct super_block *sb, loff_t ofs, size_t len, wait_for_completion(&complete); if (ei.state != MTD_ERASE_DONE) return -EIO; - return mtd_erase_mapping(sb, ofs, len); + return logfs_mtd_erase_mapping(sb, ofs, len); } -static void mtd_sync(struct super_block *sb) +static void logfs_mtd_sync(struct super_block *sb) { struct mtd_info *mtd = logfs_super(sb)->s_mtd; @@ -120,12 +123,12 @@ static void mtd_sync(struct super_block *sb) mtd->sync(mtd); } -static int mtd_readpage(void *_sb, struct page *page) +static int logfs_mtd_readpage(void *_sb, struct page *page) { struct super_block *sb = _sb; int err; - err = mtd_read(sb, page->index << PAGE_SHIFT, PAGE_SIZE, + err = logfs_mtd_read(sb, page->index << PAGE_SHIFT, PAGE_SIZE, page_address(page)); if (err == -EUCLEAN || err == -EBADMSG) { /* -EBADMSG happens regularly on power failures */ @@ -143,11 +146,11 @@ static int mtd_readpage(void *_sb, struct page *page) return err; } -static struct page *mtd_find_first_sb(struct super_block *sb, u64 *ofs) +static struct page *logfs_mtd_find_first_sb(struct super_block *sb, u64 *ofs) { struct logfs_super *super = logfs_super(sb); struct address_space *mapping = super->s_mapping_inode->i_mapping; - filler_t *filler = mtd_readpage; + filler_t *filler = logfs_mtd_readpage; struct mtd_info *mtd = super->s_mtd; if (!mtd->block_isbad) @@ -163,11 
+166,11 @@ static struct page *mtd_find_first_sb(struct super_block *sb, u64 *ofs) return read_cache_page(mapping, *ofs >> PAGE_SHIFT, filler, sb); } -static struct page *mtd_find_last_sb(struct super_block *sb, u64 *ofs) +static struct page *logfs_mtd_find_last_sb(struct super_block *sb, u64 *ofs) { struct logfs_super *super = logfs_super(sb); struct address_space *mapping = super->s_mapping_inode->i_mapping; - filler_t *filler = mtd_readpage; + filler_t *filler = logfs_mtd_readpage; struct mtd_info *mtd = super->s_mtd; if (!mtd->block_isbad) @@ -184,7 +187,7 @@ static struct page *mtd_find_last_sb(struct super_block *sb, u64 *ofs) return read_cache_page(mapping, *ofs >> PAGE_SHIFT, filler, sb); } -static int __mtd_writeseg(struct super_block *sb, u64 ofs, pgoff_t index, +static int __logfs_mtd_writeseg(struct super_block *sb, u64 ofs, pgoff_t index, size_t nr_pages) { struct logfs_super *super = logfs_super(sb); @@ -196,8 +199,8 @@ static int __mtd_writeseg(struct super_block *sb, u64 ofs, pgoff_t index, page = find_lock_page(mapping, index + i); BUG_ON(!page); - err = mtd_write(sb, page->index << PAGE_SHIFT, PAGE_SIZE, - page_address(page)); + err = loffs_mtd_write(sb, page->index << PAGE_SHIFT, PAGE_SIZE, + page_address(page)); unlock_page(page); page_cache_release(page); if (err) @@ -206,7 +209,7 @@ static int __mtd_writeseg(struct super_block *sb, u64 ofs, pgoff_t index, return 0; } -static void mtd_writeseg(struct super_block *sb, u64 ofs, size_t len) +static void logfs_mtd_writeseg(struct super_block *sb, u64 ofs, size_t len) { struct logfs_super *super = logfs_super(sb); int head; @@ -227,15 +230,15 @@ static void mtd_writeseg(struct super_block *sb, u64 ofs, size_t len) len += head; } len = PAGE_ALIGN(len); - __mtd_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT); + __logfs_mtd_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT); } -static void mtd_put_device(struct logfs_super *s) +static void logfs_mtd_put_device(struct logfs_super *s) { put_mtd_device(s->s_mtd); } -static int mtd_can_write_buf(struct super_block *sb, u64 ofs) +static int logfs_mtd_can_write_buf(struct super_block *sb, u64 ofs) { struct logfs_super *super = logfs_super(sb); void *buf; @@ -244,7 +247,7 @@ static int mtd_can_write_buf(struct super_block *sb, u64 ofs) buf = kmalloc(super->s_writesize, GFP_KERNEL); if (!buf) return -ENOMEM; - err = mtd_read(sb, ofs, super->s_writesize, buf); + err = logfs_mtd_read(sb, ofs, super->s_writesize, buf); if (err) goto out; if (memchr_inv(buf, 0xff, super->s_writesize)) @@ -255,14 +258,14 @@ out: } static const struct logfs_device_ops mtd_devops = { - .find_first_sb = mtd_find_first_sb, - .find_last_sb = mtd_find_last_sb, - .readpage = mtd_readpage, - .writeseg = mtd_writeseg, - .erase = mtd_erase, - .can_write_buf = mtd_can_write_buf, - .sync = mtd_sync, - .put_device = mtd_put_device, + .find_first_sb = logfs_mtd_find_first_sb, + .find_last_sb = logfs_mtd_find_last_sb, + .readpage = logfs_mtd_readpage, + .writeseg = logfs_mtd_writeseg, + .erase = logfs_mtd_erase, + .can_write_buf = logfs_mtd_can_write_buf, + .sync = logfs_mtd_sync, + .put_device = logfs_mtd_put_device, }; int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr) -- cgit v1.2.3 From 7e1f0dc0551b99acb5e8fa161a7ac401994d57d8 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Fri, 23 Dec 2011 15:25:39 +0200 Subject: mtd: introduce mtd_erase interface This patch is part of a patch-set which changes the MTD interface from 'mtd->func()' form to 'mtd_func()' form. 
We need this because we want to add common code to to all drivers in the mtd core level, which is impossible with the current interface when MTD clients call driver functions like 'read()' or 'write()' directly. At this point we just introduce a new inline wrapper function, but later some of them are expected to gain more code. E.g., the input parameters check should be moved to the wrappers rather than be duplicated at many drivers. This particular patch introduced the 'mtd_erase()' interface. The following patches add all the other interfaces one by one. Signed-off-by: Artem Bityutskiy Signed-off-by: David Woodhouse --- drivers/mtd/ftl.c | 2 +- drivers/mtd/inftlmount.c | 4 ++-- drivers/mtd/mtdblock.c | 2 +- drivers/mtd/mtdchar.c | 2 +- drivers/mtd/mtdconcat.c | 2 +- drivers/mtd/mtdoops.c | 2 +- drivers/mtd/mtdpart.c | 2 +- drivers/mtd/mtdswap.c | 2 +- drivers/mtd/nftlmount.c | 2 +- drivers/mtd/rfd_ftl.c | 2 +- drivers/mtd/sm_ftl.c | 2 +- drivers/mtd/tests/mtd_oobtest.c | 2 +- drivers/mtd/tests/mtd_pagetest.c | 2 +- drivers/mtd/tests/mtd_speedtest.c | 4 ++-- drivers/mtd/tests/mtd_stresstest.c | 2 +- drivers/mtd/tests/mtd_subpagetest.c | 2 +- drivers/mtd/tests/mtd_torturetest.c | 2 +- drivers/mtd/ubi/io.c | 2 +- drivers/staging/spectra/lld_mtd.c | 2 +- fs/jffs2/erase.c | 2 +- fs/logfs/dev_mtd.c | 2 +- include/linux/mtd/mtd.h | 19 ++++++++++++++----- 22 files changed, 37 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/drivers/mtd/ftl.c b/drivers/mtd/ftl.c index c7382bb686c..a982889277c 100644 --- a/drivers/mtd/ftl.c +++ b/drivers/mtd/ftl.c @@ -355,7 +355,7 @@ static int erase_xfer(partition_t *part, erase->len = 1 << part->header.EraseUnitSize; erase->priv = (u_long)part; - ret = part->mbd.mtd->erase(part->mbd.mtd, erase); + ret = mtd_erase(part->mbd.mtd, erase); if (!ret) xfer->EraseCount++; diff --git a/drivers/mtd/inftlmount.c b/drivers/mtd/inftlmount.c index 2ff601f816c..0d946f10a68 100644 --- a/drivers/mtd/inftlmount.c +++ b/drivers/mtd/inftlmount.c @@ -220,7 +220,7 @@ static int find_boot_record(struct INFTLrecord *inftl) */ instr->addr = ip->Reserved0 * inftl->EraseSize; instr->len = inftl->EraseSize; - mtd->erase(mtd, instr); + mtd_erase(mtd, instr); } if ((ip->lastUnit - ip->firstUnit + 1) < ip->virtualUnits) { printk(KERN_WARNING "INFTL: Media Header " @@ -393,7 +393,7 @@ int INFTL_formatblock(struct INFTLrecord *inftl, int block) mark only the failed block in the bbt. */ for (physblock = 0; physblock < inftl->EraseSize; physblock += instr->len, instr->addr += instr->len) { - mtd->erase(inftl->mbd.mtd, instr); + mtd_erase(inftl->mbd.mtd, instr); if (instr->state == MTD_ERASE_FAILED) { printk(KERN_WARNING "INFTL: error while formatting block %d\n", diff --git a/drivers/mtd/mtdblock.c b/drivers/mtd/mtdblock.c index 7c1dc908a17..9b01cb0266e 100644 --- a/drivers/mtd/mtdblock.c +++ b/drivers/mtd/mtdblock.c @@ -85,7 +85,7 @@ static int erase_write (struct mtd_info *mtd, unsigned long pos, set_current_state(TASK_INTERRUPTIBLE); add_wait_queue(&wait_q, &wait); - ret = mtd->erase(mtd, &erase); + ret = mtd_erase(mtd, &erase); if (ret) { set_current_state(TASK_RUNNING); remove_wait_queue(&wait_q, &wait); diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c index 00423cc8580..41d64ff4c25 100644 --- a/drivers/mtd/mtdchar.c +++ b/drivers/mtd/mtdchar.c @@ -731,7 +731,7 @@ static int mtdchar_ioctl(struct file *file, u_int cmd, u_long arg) wq_head is no longer there when the callback routine tries to wake us up. 
*/ - ret = mtd->erase(mtd, erase); + ret = mtd_erase(mtd, erase); if (!ret) { set_current_state(TASK_UNINTERRUPTIBLE); add_wait_queue(&waitq, &wait); diff --git a/drivers/mtd/mtdconcat.c b/drivers/mtd/mtdconcat.c index 6df4d4d4eb9..76123bd4931 100644 --- a/drivers/mtd/mtdconcat.c +++ b/drivers/mtd/mtdconcat.c @@ -379,7 +379,7 @@ static int concat_dev_erase(struct mtd_info *mtd, struct erase_info *erase) * FIXME: Allow INTERRUPTIBLE. Which means * not having the wait_queue head on the stack. */ - err = mtd->erase(mtd, erase); + err = mtd_erase(mtd, erase); if (!err) { set_current_state(TASK_UNINTERRUPTIBLE); add_wait_queue(&waitq, &wait); diff --git a/drivers/mtd/mtdoops.c b/drivers/mtd/mtdoops.c index f3cdce9a85a..9b2d8632316 100644 --- a/drivers/mtd/mtdoops.c +++ b/drivers/mtd/mtdoops.c @@ -112,7 +112,7 @@ static int mtdoops_erase_block(struct mtdoops_context *cxt, int offset) set_current_state(TASK_INTERRUPTIBLE); add_wait_queue(&wait_q, &wait); - ret = mtd->erase(mtd, &erase); + ret = mtd_erase(mtd, &erase); if (ret) { set_current_state(TASK_RUNNING); remove_wait_queue(&wait_q, &wait); diff --git a/drivers/mtd/mtdpart.c b/drivers/mtd/mtdpart.c index a0bd2de4752..d318fee2859 100644 --- a/drivers/mtd/mtdpart.c +++ b/drivers/mtd/mtdpart.c @@ -257,7 +257,7 @@ static int part_erase(struct mtd_info *mtd, struct erase_info *instr) if (instr->addr >= mtd->size) return -EINVAL; instr->addr += part->offset; - ret = part->master->erase(part->master, instr); + ret = mtd_erase(part->master, instr); if (ret) { if (instr->fail_addr != MTD_FAIL_ADDR_UNKNOWN) instr->fail_addr -= part->offset; diff --git a/drivers/mtd/mtdswap.c b/drivers/mtd/mtdswap.c index bd9590c723e..4e12875a916 100644 --- a/drivers/mtd/mtdswap.c +++ b/drivers/mtd/mtdswap.c @@ -567,7 +567,7 @@ retry: erase.len = mtd->erasesize; erase.priv = (u_long)&wq; - ret = mtd->erase(mtd, &erase); + ret = mtd_erase(mtd, &erase); if (ret) { if (retries++ < MTDSWAP_ERASE_RETRIES) { dev_warn(d->dev, diff --git a/drivers/mtd/nftlmount.c b/drivers/mtd/nftlmount.c index ac4092591ae..9164a56fb5c 100644 --- a/drivers/mtd/nftlmount.c +++ b/drivers/mtd/nftlmount.c @@ -326,7 +326,7 @@ int NFTL_formatblock(struct NFTLrecord *nftl, int block) instr->mtd = nftl->mbd.mtd; instr->addr = block * nftl->EraseSize; instr->len = nftl->EraseSize; - mtd->erase(mtd, instr); + mtd_erase(mtd, instr); if (instr->state == MTD_ERASE_FAILED) { printk("Error while formatting block %d\n", block); diff --git a/drivers/mtd/rfd_ftl.c b/drivers/mtd/rfd_ftl.c index 73ae217a425..39de8727a52 100644 --- a/drivers/mtd/rfd_ftl.c +++ b/drivers/mtd/rfd_ftl.c @@ -342,7 +342,7 @@ static int erase_block(struct partition *part, int block) part->blocks[block].state = BLOCK_ERASING; part->blocks[block].free_sectors = 0; - rc = part->mbd.mtd->erase(part->mbd.mtd, erase); + rc = mtd_erase(part->mbd.mtd, erase); if (rc) { printk(KERN_ERR PREFIX "erase of region %llx,%llx on '%s' " diff --git a/drivers/mtd/sm_ftl.c b/drivers/mtd/sm_ftl.c index 1c9f307ae0a..2f1acb1ab5e 100644 --- a/drivers/mtd/sm_ftl.c +++ b/drivers/mtd/sm_ftl.c @@ -479,7 +479,7 @@ static int sm_erase_block(struct sm_ftl *ftl, int zone_num, uint16_t block, return -EIO; } - if (mtd->erase(mtd, &erase)) { + if (mtd_erase(mtd, &erase)) { sm_printk("erase of block %d in zone %d failed", block, zone_num); goto error; diff --git a/drivers/mtd/tests/mtd_oobtest.c b/drivers/mtd/tests/mtd_oobtest.c index 933f7e5f32d..7d52854c16d 100644 --- a/drivers/mtd/tests/mtd_oobtest.c +++ b/drivers/mtd/tests/mtd_oobtest.c @@ -78,7 +78,7 @@ static int 
erase_eraseblock(int ebnum) ei.addr = addr; ei.len = mtd->erasesize; - err = mtd->erase(mtd, &ei); + err = mtd_erase(mtd, &ei); if (err) { printk(PRINT_PREF "error %d while erasing EB %d\n", err, ebnum); return err; diff --git a/drivers/mtd/tests/mtd_pagetest.c b/drivers/mtd/tests/mtd_pagetest.c index afafb6935fd..271819fabb5 100644 --- a/drivers/mtd/tests/mtd_pagetest.c +++ b/drivers/mtd/tests/mtd_pagetest.c @@ -77,7 +77,7 @@ static int erase_eraseblock(int ebnum) ei.addr = addr; ei.len = mtd->erasesize; - err = mtd->erase(mtd, &ei); + err = mtd_erase(mtd, &ei); if (err) { printk(PRINT_PREF "error %d while erasing EB %d\n", err, ebnum); return err; diff --git a/drivers/mtd/tests/mtd_speedtest.c b/drivers/mtd/tests/mtd_speedtest.c index 493b367bdd3..f67a65e2104 100644 --- a/drivers/mtd/tests/mtd_speedtest.c +++ b/drivers/mtd/tests/mtd_speedtest.c @@ -79,7 +79,7 @@ static int erase_eraseblock(int ebnum) ei.addr = addr; ei.len = mtd->erasesize; - err = mtd->erase(mtd, &ei); + err = mtd_erase(mtd, &ei); if (err) { printk(PRINT_PREF "error %d while erasing EB %d\n", err, ebnum); return err; @@ -105,7 +105,7 @@ static int multiblock_erase(int ebnum, int blocks) ei.addr = addr; ei.len = mtd->erasesize * blocks; - err = mtd->erase(mtd, &ei); + err = mtd_erase(mtd, &ei); if (err) { printk(PRINT_PREF "error %d while erasing EB %d, blocks %d\n", err, ebnum, blocks); diff --git a/drivers/mtd/tests/mtd_stresstest.c b/drivers/mtd/tests/mtd_stresstest.c index 811642fea6b..a204a9f9052 100644 --- a/drivers/mtd/tests/mtd_stresstest.c +++ b/drivers/mtd/tests/mtd_stresstest.c @@ -112,7 +112,7 @@ static int erase_eraseblock(int ebnum) ei.addr = addr; ei.len = mtd->erasesize; - err = mtd->erase(mtd, &ei); + err = mtd_erase(mtd, &ei); if (unlikely(err)) { printk(PRINT_PREF "error %d while erasing EB %d\n", err, ebnum); return err; diff --git a/drivers/mtd/tests/mtd_subpagetest.c b/drivers/mtd/tests/mtd_subpagetest.c index 1a05bfac4ee..16d0c05024d 100644 --- a/drivers/mtd/tests/mtd_subpagetest.c +++ b/drivers/mtd/tests/mtd_subpagetest.c @@ -80,7 +80,7 @@ static int erase_eraseblock(int ebnum) ei.addr = addr; ei.len = mtd->erasesize; - err = mtd->erase(mtd, &ei); + err = mtd_erase(mtd, &ei); if (err) { printk(PRINT_PREF "error %d while erasing EB %d\n", err, ebnum); return err; diff --git a/drivers/mtd/tests/mtd_torturetest.c b/drivers/mtd/tests/mtd_torturetest.c index 03ab649a696..102c79b7ac6 100644 --- a/drivers/mtd/tests/mtd_torturetest.c +++ b/drivers/mtd/tests/mtd_torturetest.c @@ -105,7 +105,7 @@ static inline int erase_eraseblock(int ebnum) ei.addr = addr; ei.len = mtd->erasesize; - err = mtd->erase(mtd, &ei); + err = mtd_erase(mtd, &ei); if (err) { printk(PRINT_PREF "error %d while erasing EB %d\n", err, ebnum); return err; diff --git a/drivers/mtd/ubi/io.c b/drivers/mtd/ubi/io.c index f20b6f22f24..b6c8959e6c7 100644 --- a/drivers/mtd/ubi/io.c +++ b/drivers/mtd/ubi/io.c @@ -361,7 +361,7 @@ retry: ei.callback = erase_callback; ei.priv = (unsigned long)&wq; - err = ubi->mtd->erase(ubi->mtd, &ei); + err = mtd_erase(ubi->mtd, &ei); if (err) { if (retries++ < UBI_IO_RETRIES) { dbg_io("error %d while erasing PEB %d, retry", diff --git a/drivers/staging/spectra/lld_mtd.c b/drivers/staging/spectra/lld_mtd.c index a9c309a167c..d638fafab64 100644 --- a/drivers/staging/spectra/lld_mtd.c +++ b/drivers/staging/spectra/lld_mtd.c @@ -188,7 +188,7 @@ u16 mtd_Erase_Block(u32 block_add) erase.len = spectra_mtd->erasesize; erase.priv = (unsigned long)&comp; - ret = spectra_mtd->erase(spectra_mtd, &erase); + ret = 
mtd_erase(spectra_mtd, &erase); if (!ret) { wait_for_completion(&comp); if (erase.state != MTD_ERASE_DONE) diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c index e513f1913c1..540e8eca1b4 100644 --- a/fs/jffs2/erase.c +++ b/fs/jffs2/erase.c @@ -74,7 +74,7 @@ static void jffs2_erase_block(struct jffs2_sb_info *c, ((struct erase_priv_struct *)instr->priv)->jeb = jeb; ((struct erase_priv_struct *)instr->priv)->c = c; - ret = c->mtd->erase(c->mtd, instr); + ret = mtd_erase(c->mtd, instr); if (!ret) return; diff --git a/fs/logfs/dev_mtd.c b/fs/logfs/dev_mtd.c index eb423ebcf53..04636289435 100644 --- a/fs/logfs/dev_mtd.c +++ b/fs/logfs/dev_mtd.c @@ -105,7 +105,7 @@ static int logfs_mtd_erase(struct super_block *sb, loff_t ofs, size_t len, ei.len = len; ei.callback = logfs_erase_callback; ei.priv = (long)&complete; - ret = mtd->erase(mtd, &ei); + ret = mtd_erase(mtd, &ei); if (ret) return -EIO; diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h index 9f5b312af78..201bad55704 100644 --- a/include/linux/mtd/mtd.h +++ b/include/linux/mtd/mtd.h @@ -171,11 +171,8 @@ struct mtd_info { struct mtd_erase_region_info *eraseregions; /* - * Erase is an asynchronous operation. Device drivers are supposed - * to call instr->callback() whenever the operation completes, even - * if it completes with a failure. - * Callers are supposed to pass a callback function and wait for it - * to be called before writing to the block. + * Do not call via these pointers, use corresponding mtd_*() + * wrappers instead. */ int (*erase) (struct mtd_info *mtd, struct erase_info *instr); @@ -274,6 +271,18 @@ struct mtd_info { void (*put_device) (struct mtd_info *mtd); }; +/* + * Erase is an asynchronous operation. Device drivers are supposed + * to call instr->callback() whenever the operation completes, even + * if it completes with a failure. + * Callers are supposed to pass a callback function and wait for it + * to be called before writing to the block. + */ +static inline int mtd_erase(struct mtd_info *mtd, struct erase_info *instr) +{ + return mtd->erase(mtd, instr); +} + static inline struct mtd_info *dev_to_mtd(struct device *dev) { return dev ? 
dev_get_drvdata(dev) : NULL; -- cgit v1.2.3 From d35ea200c0fb5315f16fb2599a4bafd9c1a7b386 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Fri, 23 Dec 2011 17:00:37 +0200 Subject: mtd: introduce mtd_point interface Signed-off-by: Artem Bityutskiy Signed-off-by: David Woodhouse --- drivers/mtd/mtdpart.c | 4 ++-- fs/jffs2/erase.c | 4 ++-- fs/jffs2/readinode.c | 4 ++-- fs/jffs2/scan.c | 4 ++-- include/linux/mtd/mtd.h | 14 ++++++++++---- 5 files changed, 18 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/drivers/mtd/mtdpart.c b/drivers/mtd/mtdpart.c index d318fee2859..5b664722e5b 100644 --- a/drivers/mtd/mtdpart.c +++ b/drivers/mtd/mtdpart.c @@ -89,8 +89,8 @@ static int part_point(struct mtd_info *mtd, loff_t from, size_t len, len = 0; else if (from + len > mtd->size) len = mtd->size - from; - return part->master->point (part->master, from + part->offset, - len, retlen, virt, phys); + return mtd_point(part->master, from + part->offset, len, retlen, + virt, phys); } static void part_unpoint(struct mtd_info *mtd, loff_t from, size_t len) diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c index 540e8eca1b4..53f8794fda6 100644 --- a/fs/jffs2/erase.c +++ b/fs/jffs2/erase.c @@ -340,8 +340,8 @@ static int jffs2_block_check_erase(struct jffs2_sb_info *c, struct jffs2_erasebl if (c->mtd->point) { unsigned long *wordebuf; - ret = c->mtd->point(c->mtd, jeb->offset, c->sector_size, - &retlen, &ebuf, NULL); + ret = mtd_point(c->mtd, jeb->offset, c->sector_size, &retlen, + &ebuf, NULL); if (ret) { D1(printk(KERN_DEBUG "MTD point failed %d\n", ret)); goto do_flash_read; diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c index ee57bac1ba6..dde61effeda 100644 --- a/fs/jffs2/readinode.c +++ b/fs/jffs2/readinode.c @@ -63,8 +63,8 @@ static int check_node_data(struct jffs2_sb_info *c, struct jffs2_tmp_dnode_info /* TODO: instead, incapsulate point() stuff to jffs2_flash_read(), * adding and jffs2_flash_read_end() interface. */ if (c->mtd->point) { - err = c->mtd->point(c->mtd, ofs, len, &retlen, - (void **)&buffer, NULL); + err = mtd_point(c->mtd, ofs, len, &retlen, (void **)&buffer, + NULL); if (!err && retlen < len) { JFFS2_WARNING("MTD point returned len too short: %zu instead of %u.\n", retlen, tn->csize); c->mtd->unpoint(c->mtd, ofs, retlen); diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c index 28107ca136e..53e05c8e5b6 100644 --- a/fs/jffs2/scan.c +++ b/fs/jffs2/scan.c @@ -97,8 +97,8 @@ int jffs2_scan_medium(struct jffs2_sb_info *c) size_t pointlen, try_size; if (c->mtd->point) { - ret = c->mtd->point(c->mtd, 0, c->mtd->size, &pointlen, - (void **)&flashbuf, NULL); + ret = mtd_point(c->mtd, 0, c->mtd->size, &pointlen, + (void **)&flashbuf, NULL); if (!ret && pointlen < c->mtd->size) { /* Don't muck about if it won't let us point to the whole flash */ D1(printk(KERN_DEBUG "MTD point returned len too short: 0x%zx\n", pointlen)); diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h index 201bad55704..ca7bfdaf7a6 100644 --- a/include/linux/mtd/mtd.h +++ b/include/linux/mtd/mtd.h @@ -175,11 +175,8 @@ struct mtd_info { * wrappers instead. 
*/ int (*erase) (struct mtd_info *mtd, struct erase_info *instr); - - /* This stuff for eXecute-In-Place */ - /* phys is optional and may be set to NULL */ int (*point) (struct mtd_info *mtd, loff_t from, size_t len, - size_t *retlen, void **virt, resource_size_t *phys); + size_t *retlen, void **virt, resource_size_t *phys); /* We probably shouldn't allow XIP if the unpoint isn't a NULL */ void (*unpoint) (struct mtd_info *mtd, loff_t from, size_t len); @@ -283,6 +280,15 @@ static inline int mtd_erase(struct mtd_info *mtd, struct erase_info *instr) return mtd->erase(mtd, instr); } +/* + * This stuff for eXecute-In-Place. phys is optional and may be set to NULL. + */ +static inline int mtd_point(struct mtd_info *mtd, loff_t from, size_t len, + size_t *retlen, void **virt, resource_size_t *phys) +{ + return mtd->point(mtd, from, len, retlen, virt, phys); +} + static inline struct mtd_info *dev_to_mtd(struct device *dev) { return dev ? dev_get_drvdata(dev) : NULL; -- cgit v1.2.3 From 7219778ad9c18cc2c05c7fca0abe026afbc19dfb Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Fri, 23 Dec 2011 17:05:52 +0200 Subject: mtd: introduce mtd_unpoint interface Signed-off-by: Artem Bityutskiy Signed-off-by: David Woodhouse --- drivers/mtd/mtdpart.c | 2 +- fs/jffs2/erase.c | 4 ++-- fs/jffs2/readinode.c | 6 +++--- fs/jffs2/scan.c | 4 ++-- include/linux/mtd/mtd.h | 8 ++++++-- 5 files changed, 14 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/drivers/mtd/mtdpart.c b/drivers/mtd/mtdpart.c index 5b664722e5b..b09624a5497 100644 --- a/drivers/mtd/mtdpart.c +++ b/drivers/mtd/mtdpart.c @@ -97,7 +97,7 @@ static void part_unpoint(struct mtd_info *mtd, loff_t from, size_t len) { struct mtd_part *part = PART(mtd); - part->master->unpoint(part->master, from + part->offset, len); + mtd_unpoint(part->master, from + part->offset, len); } static unsigned long part_get_unmapped_area(struct mtd_info *mtd, diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c index 53f8794fda6..ffdf4fca9c5 100644 --- a/fs/jffs2/erase.c +++ b/fs/jffs2/erase.c @@ -349,7 +349,7 @@ static int jffs2_block_check_erase(struct jffs2_sb_info *c, struct jffs2_erasebl if (retlen < c->sector_size) { /* Don't muck about if it won't let us point to the whole erase sector */ D1(printk(KERN_DEBUG "MTD point returned len too short: 0x%zx\n", retlen)); - c->mtd->unpoint(c->mtd, jeb->offset, retlen); + mtd_unpoint(c->mtd, jeb->offset, retlen); goto do_flash_read; } wordebuf = ebuf-sizeof(*wordebuf); @@ -358,7 +358,7 @@ static int jffs2_block_check_erase(struct jffs2_sb_info *c, struct jffs2_erasebl if (*++wordebuf != ~0) break; } while(--retlen); - c->mtd->unpoint(c->mtd, jeb->offset, c->sector_size); + mtd_unpoint(c->mtd, jeb->offset, c->sector_size); if (retlen) { printk(KERN_WARNING "Newly-erased block contained word 0x%lx at offset 0x%08tx\n", *wordebuf, jeb->offset + c->sector_size-retlen*sizeof(*wordebuf)); diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c index dde61effeda..fca2f84e1ad 100644 --- a/fs/jffs2/readinode.c +++ b/fs/jffs2/readinode.c @@ -67,7 +67,7 @@ static int check_node_data(struct jffs2_sb_info *c, struct jffs2_tmp_dnode_info NULL); if (!err && retlen < len) { JFFS2_WARNING("MTD point returned len too short: %zu instead of %u.\n", retlen, tn->csize); - c->mtd->unpoint(c->mtd, ofs, retlen); + mtd_unpoint(c->mtd, ofs, retlen); } else if (err) JFFS2_WARNING("MTD point failed: error code %d.\n", err); else @@ -101,7 +101,7 @@ static int check_node_data(struct jffs2_sb_info *c, struct jffs2_tmp_dnode_info kfree(buffer); 
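The JFFS2 hunks above show the intended pairing: try mtd_point(), and give back exactly the length that was successfully mapped with mtd_unpoint() before falling back to a plain read. Condensed into one hypothetical helper (the name and structure are illustrative; the fallback still uses the raw ->read pointer because mtd_read() only appears later in this series):

#include <linux/mtd/mtd.h>

/* Sketch: prefer a direct (XIP-style) mapping, otherwise copy into a buffer. */
static int read_mapped_or_copy(struct mtd_info *mtd, loff_t ofs, size_t len,
			       u_char *copybuf)
{
	size_t retlen;
	void *virt;
	int err;

	if (mtd->point) {
		err = mtd_point(mtd, ofs, len, &retlen, &virt, NULL);
		if (!err && retlen == len) {
			/* ... use 'virt' in place ... */
			mtd_unpoint(mtd, ofs, len);
			return 0;
		}
		if (!err)
			mtd_unpoint(mtd, ofs, retlen);	/* short map: give it back */
	}
	/* No full mapping available: fall back to an ordinary copy. */
	err = mtd->read(mtd, ofs, len, &retlen, copybuf);
	if (!err && retlen != len)
		err = -EIO;
	return err;
}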
#ifndef __ECOS else - c->mtd->unpoint(c->mtd, ofs, len); + mtd_unpoint(c->mtd, ofs, len); #endif if (crc != tn->data_crc) { @@ -137,7 +137,7 @@ free_out: kfree(buffer); #ifndef __ECOS else - c->mtd->unpoint(c->mtd, ofs, len); + mtd_unpoint(c->mtd, ofs, len); #endif return err; } diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c index 53e05c8e5b6..72f3960f44a 100644 --- a/fs/jffs2/scan.c +++ b/fs/jffs2/scan.c @@ -102,7 +102,7 @@ int jffs2_scan_medium(struct jffs2_sb_info *c) if (!ret && pointlen < c->mtd->size) { /* Don't muck about if it won't let us point to the whole flash */ D1(printk(KERN_DEBUG "MTD point returned len too short: 0x%zx\n", pointlen)); - c->mtd->unpoint(c->mtd, 0, pointlen); + mtd_unpoint(c->mtd, 0, pointlen); flashbuf = NULL; } if (ret) @@ -273,7 +273,7 @@ int jffs2_scan_medium(struct jffs2_sb_info *c) kfree(flashbuf); #ifndef __ECOS else - c->mtd->unpoint(c->mtd, 0, c->mtd->size); + mtd_unpoint(c->mtd, 0, c->mtd->size); #endif kfree(s); return ret; diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h index ca7bfdaf7a6..a7d22b7fcb4 100644 --- a/include/linux/mtd/mtd.h +++ b/include/linux/mtd/mtd.h @@ -177,8 +177,6 @@ struct mtd_info { int (*erase) (struct mtd_info *mtd, struct erase_info *instr); int (*point) (struct mtd_info *mtd, loff_t from, size_t len, size_t *retlen, void **virt, resource_size_t *phys); - - /* We probably shouldn't allow XIP if the unpoint isn't a NULL */ void (*unpoint) (struct mtd_info *mtd, loff_t from, size_t len); /* Allow NOMMU mmap() to directly map the device (if not NULL) @@ -289,6 +287,12 @@ static inline int mtd_point(struct mtd_info *mtd, loff_t from, size_t len, return mtd->point(mtd, from, len, retlen, virt, phys); } +/* We probably shouldn't allow XIP if the unpoint isn't a NULL */ +static inline void mtd_unpoint(struct mtd_info *mtd, loff_t from, size_t len) +{ + return mtd->unpoint(mtd, from, len); +} + static inline struct mtd_info *dev_to_mtd(struct device *dev) { return dev ? 
dev_get_drvdata(dev) : NULL; -- cgit v1.2.3 From 04c601bfa4cb29c968dcb66e44c799c9c01d8675 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Fri, 23 Dec 2011 17:10:15 +0200 Subject: mtd: introduce mtd_get_unmapped_area interface Signed-off-by: Artem Bityutskiy Signed-off-by: David Woodhouse --- drivers/mtd/mtdchar.c | 2 +- drivers/mtd/mtdconcat.c | 4 ++-- drivers/mtd/mtdpart.c | 3 +-- fs/romfs/mmap-nommu.c | 2 +- include/linux/mtd/mtd.h | 18 +++++++++++++----- 5 files changed, 18 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c index 41d64ff4c25..c51f04a00af 100644 --- a/drivers/mtd/mtdchar.c +++ b/drivers/mtd/mtdchar.c @@ -1135,7 +1135,7 @@ static unsigned long mtdchar_get_unmapped_area(struct file *file, if (offset > mtd->size - len) return (unsigned long) -EINVAL; - return mtd->get_unmapped_area(mtd, len, offset, flags); + return mtd_get_unmapped_area(mtd, len, offset, flags); } /* can't map directly */ diff --git a/drivers/mtd/mtdconcat.c b/drivers/mtd/mtdconcat.c index 76123bd4931..b3895cf20bb 100644 --- a/drivers/mtd/mtdconcat.c +++ b/drivers/mtd/mtdconcat.c @@ -726,8 +726,8 @@ static unsigned long concat_get_unmapped_area(struct mtd_info *mtd, return (unsigned long) -EINVAL; if (subdev->get_unmapped_area) - return subdev->get_unmapped_area(subdev, len, offset, - flags); + return mtd_get_unmapped_area(subdev, len, offset, + flags); break; } diff --git a/drivers/mtd/mtdpart.c b/drivers/mtd/mtdpart.c index b09624a5497..55a9cb544fc 100644 --- a/drivers/mtd/mtdpart.c +++ b/drivers/mtd/mtdpart.c @@ -108,8 +108,7 @@ static unsigned long part_get_unmapped_area(struct mtd_info *mtd, struct mtd_part *part = PART(mtd); offset += part->offset; - return part->master->get_unmapped_area(part->master, len, offset, - flags); + return mtd_get_unmapped_area(part->master, len, offset, flags); } static int part_read_oob(struct mtd_info *mtd, loff_t from, diff --git a/fs/romfs/mmap-nommu.c b/fs/romfs/mmap-nommu.c index eed99428f10..d5168e8e7dc 100644 --- a/fs/romfs/mmap-nommu.c +++ b/fs/romfs/mmap-nommu.c @@ -53,7 +53,7 @@ static unsigned long romfs_get_unmapped_area(struct file *file, if (offset > mtd->size - len) return (unsigned long) -EINVAL; - return mtd->get_unmapped_area(mtd, len, offset, flags); + return mtd_get_unmapped_area(mtd, len, offset, flags); } cant_map_directly: diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h index a7d22b7fcb4..f38e8276b40 100644 --- a/include/linux/mtd/mtd.h +++ b/include/linux/mtd/mtd.h @@ -178,11 +178,6 @@ struct mtd_info { int (*point) (struct mtd_info *mtd, loff_t from, size_t len, size_t *retlen, void **virt, resource_size_t *phys); void (*unpoint) (struct mtd_info *mtd, loff_t from, size_t len); - - /* Allow NOMMU mmap() to directly map the device (if not NULL) - * - return the address to which the offset maps - * - return -ENOSYS to indicate refusal to do the mapping - */ unsigned long (*get_unmapped_area) (struct mtd_info *mtd, unsigned long len, unsigned long offset, @@ -293,6 +288,19 @@ static inline void mtd_unpoint(struct mtd_info *mtd, loff_t from, size_t len) return mtd->unpoint(mtd, from, len); } +/* + * Allow NOMMU mmap() to directly map the device (if not NULL) + * - return the address to which the offset maps + * - return -ENOSYS to indicate refusal to do the mapping + */ +static inline unsigned long mtd_get_unmapped_area(struct mtd_info *mtd, + unsigned long len, + unsigned long offset, + unsigned long flags) +{ + return mtd->get_unmapped_area(mtd, len, offset, flags); +} + 
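Callers such as mtdchar and romfs remain responsible for checking that the driver implements the method and that the request fits the device; the wrapper only forwards the call. Roughly, as an illustrative sketch rather than a hunk from this patch:

#include <linux/errno.h>
#include <linux/mtd/mtd.h>

/* Sketch of a NOMMU mmap() helper built on the new wrapper. */
static unsigned long map_mtd_for_nommu(struct mtd_info *mtd, unsigned long len,
				       unsigned long offset, unsigned long flags)
{
	if (!mtd->get_unmapped_area)
		return (unsigned long) -ENOSYS;	/* can't map directly */
	if (offset > mtd->size - len)
		return (unsigned long) -EINVAL;
	return mtd_get_unmapped_area(mtd, len, offset, flags);
}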
static inline struct mtd_info *dev_to_mtd(struct device *dev) { return dev ? dev_get_drvdata(dev) : NULL; -- cgit v1.2.3 From 329ad399a9b3adf52c90637b21ca029fcf7f8795 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Fri, 23 Dec 2011 17:30:16 +0200 Subject: mtd: introduce mtd_read interface Signed-off-by: Artem Bityutskiy Signed-off-by: David Woodhouse --- arch/arm/mach-davinci/board-da850-evm.c | 2 +- arch/cris/arch-v32/drivers/axisflashmap.c | 4 +-- drivers/mtd/afs.c | 4 +-- drivers/mtd/ar7part.c | 15 ++++++----- drivers/mtd/bcm63xxpart.c | 12 ++++----- drivers/mtd/ftl.c | 41 ++++++++++++++++--------------- drivers/mtd/inftlcore.c | 19 ++++++++------ drivers/mtd/inftlmount.c | 10 ++++---- drivers/mtd/mtdblock.c | 8 +++--- drivers/mtd/mtdblock_ro.c | 2 +- drivers/mtd/mtdchar.c | 2 +- drivers/mtd/mtdconcat.c | 2 +- drivers/mtd/mtdoops.c | 4 +-- drivers/mtd/mtdpart.c | 3 +-- drivers/mtd/mtdswap.c | 4 +-- drivers/mtd/nand/diskonchip.c | 4 +-- drivers/mtd/nand/nand_bbt.c | 6 ++--- drivers/mtd/nftlcore.c | 17 ++++++++----- drivers/mtd/nftlmount.c | 6 ++--- drivers/mtd/redboot.c | 4 +-- drivers/mtd/rfd_ftl.c | 24 +++++++++--------- drivers/mtd/ssfdc.c | 6 ++--- drivers/mtd/tests/mtd_pagetest.c | 28 ++++++++++----------- drivers/mtd/tests/mtd_readtest.c | 2 +- drivers/mtd/tests/mtd_speedtest.c | 8 +++--- drivers/mtd/tests/mtd_stresstest.c | 2 +- drivers/mtd/tests/mtd_subpagetest.c | 8 +++--- drivers/mtd/tests/mtd_torturetest.c | 2 +- drivers/mtd/ubi/debug.c | 2 +- drivers/mtd/ubi/io.c | 6 ++--- drivers/staging/spectra/lld_mtd.c | 8 +++--- fs/jffs2/erase.c | 2 +- fs/jffs2/wbuf.c | 9 ++++--- fs/logfs/dev_mtd.c | 2 +- include/linux/mtd/mtd.h | 9 ++++++- 35 files changed, 152 insertions(+), 135 deletions(-) (limited to 'fs') diff --git a/arch/arm/mach-davinci/board-da850-evm.c b/arch/arm/mach-davinci/board-da850-evm.c index 6659a90dbca..8b079f9d692 100644 --- a/arch/arm/mach-davinci/board-da850-evm.c +++ b/arch/arm/mach-davinci/board-da850-evm.c @@ -127,7 +127,7 @@ static void da850_evm_m25p80_notify_add(struct mtd_info *mtd) size_t retlen; if (!strcmp(mtd->name, "MAC-Address")) { - mtd->read(mtd, 0, ETH_ALEN, &retlen, mac_addr); + mtd_read(mtd, 0, ETH_ALEN, &retlen, mac_addr); if (retlen == ETH_ALEN) pr_info("Read MAC addr from SPI Flash: %pM\n", mac_addr); diff --git a/arch/cris/arch-v32/drivers/axisflashmap.c b/arch/cris/arch-v32/drivers/axisflashmap.c index a2bde374462..011bddbf073 100644 --- a/arch/cris/arch-v32/drivers/axisflashmap.c +++ b/arch/cris/arch-v32/drivers/axisflashmap.c @@ -413,8 +413,8 @@ static int __init init_axis_flash(void) } while (blockstat && ptable_sector); #endif if (ptable_sector) { - main_mtd->read(main_mtd, ptable_sector, PAGESIZE, - &len, page); + mtd_read(main_mtd, ptable_sector, PAGESIZE, &len, + page); ptable_head = &((struct partitiontable *) page)->head; } diff --git a/drivers/mtd/afs.c b/drivers/mtd/afs.c index 89a02f6f65d..5a3942bf109 100644 --- a/drivers/mtd/afs.c +++ b/drivers/mtd/afs.c @@ -75,7 +75,7 @@ afs_read_footer(struct mtd_info *mtd, u_int *img_start, u_int *iis_start, size_t sz; int ret; - ret = mtd->read(mtd, ptr, sizeof(fs), &sz, (u_char *) &fs); + ret = mtd_read(mtd, ptr, sizeof(fs), &sz, (u_char *)&fs); if (ret >= 0 && sz != sizeof(fs)) ret = -EINVAL; @@ -132,7 +132,7 @@ afs_read_iis(struct mtd_info *mtd, struct image_info_struct *iis, u_int ptr) int ret, i; memset(iis, 0, sizeof(*iis)); - ret = mtd->read(mtd, ptr, sizeof(*iis), &sz, (u_char *) iis); + ret = mtd_read(mtd, ptr, sizeof(*iis), &sz, (u_char *)iis); if (ret < 0) goto failed; diff 
--git a/drivers/mtd/ar7part.c b/drivers/mtd/ar7part.c index f40ea454755..94539312995 100644 --- a/drivers/mtd/ar7part.c +++ b/drivers/mtd/ar7part.c @@ -73,8 +73,8 @@ static int create_mtd_partitions(struct mtd_info *master, do { /* Try 10 blocks starting from master->erasesize */ offset = pre_size; - master->read(master, offset, - sizeof(header), &len, (uint8_t *)&header); + mtd_read(master, offset, sizeof(header), &len, + (uint8_t *)&header); if (!strncmp((char *)&header, "TIENV0.8", 8)) ar7_parts[1].offset = pre_size; if (header.checksum == LOADER_MAGIC1) @@ -95,16 +95,16 @@ static int create_mtd_partitions(struct mtd_info *master, case LOADER_MAGIC1: while (header.length) { offset += sizeof(header) + header.length; - master->read(master, offset, sizeof(header), - &len, (uint8_t *)&header); + mtd_read(master, offset, sizeof(header), &len, + (uint8_t *)&header); } root_offset = offset + sizeof(header) + 4; break; case LOADER_MAGIC2: while (header.length) { offset += sizeof(header) + header.length; - master->read(master, offset, sizeof(header), - &len, (uint8_t *)&header); + mtd_read(master, offset, sizeof(header), &len, + (uint8_t *)&header); } root_offset = offset + sizeof(header) + 4 + 0xff; root_offset &= ~(uint32_t)0xff; @@ -114,8 +114,7 @@ static int create_mtd_partitions(struct mtd_info *master, break; } - master->read(master, root_offset, - sizeof(header), &len, (u8 *)&header); + mtd_read(master, root_offset, sizeof(header), &len, (u8 *)&header); if (header.checksum != SQUASHFS_MAGIC) { root_offset += master->erasesize - 1; root_offset &= ~(master->erasesize - 1); diff --git a/drivers/mtd/bcm63xxpart.c b/drivers/mtd/bcm63xxpart.c index 9ee8bc426e9..608321ee056 100644 --- a/drivers/mtd/bcm63xxpart.c +++ b/drivers/mtd/bcm63xxpart.c @@ -48,8 +48,8 @@ static int bcm63xx_detect_cfe(struct mtd_info *master) int ret; size_t retlen; - ret = master->read(master, BCM963XX_CFE_VERSION_OFFSET, 5, &retlen, - (void *)buf); + ret = mtd_read(master, BCM963XX_CFE_VERSION_OFFSET, 5, &retlen, + (void *)buf); buf[retlen] = 0; if (ret) @@ -59,8 +59,8 @@ static int bcm63xx_detect_cfe(struct mtd_info *master) return 0; /* very old CFE's do not have the cfe-v string, so check for magic */ - ret = master->read(master, BCM63XX_CFE_MAGIC_OFFSET, 8, &retlen, - (void *)buf); + ret = mtd_read(master, BCM63XX_CFE_MAGIC_OFFSET, 8, &retlen, + (void *)buf); buf[retlen] = 0; return strncmp("CFE1CFE1", buf, 8); @@ -95,8 +95,8 @@ static int bcm63xx_parse_cfe_partitions(struct mtd_info *master, return -ENOMEM; /* Get the tag */ - ret = master->read(master, cfelen, sizeof(struct bcm_tag), &retlen, - (void *)buf); + ret = mtd_read(master, cfelen, sizeof(struct bcm_tag), &retlen, + (void *)buf); if (retlen != sizeof(struct bcm_tag)) { vfree(buf); diff --git a/drivers/mtd/ftl.c b/drivers/mtd/ftl.c index a982889277c..12fd7ebd3fd 100644 --- a/drivers/mtd/ftl.c +++ b/drivers/mtd/ftl.c @@ -168,8 +168,8 @@ static int scan_header(partition_t *part) (offset + sizeof(header)) < max_offset; offset += part->mbd.mtd->erasesize ? 
: 0x2000) { - err = part->mbd.mtd->read(part->mbd.mtd, offset, sizeof(header), &ret, - (unsigned char *)&header); + err = mtd_read(part->mbd.mtd, offset, sizeof(header), &ret, + (unsigned char *)&header); if (err) return err; @@ -224,8 +224,8 @@ static int build_maps(partition_t *part) for (i = 0; i < le16_to_cpu(part->header.NumEraseUnits); i++) { offset = ((i + le16_to_cpu(part->header.FirstPhysicalEUN)) << part->header.EraseUnitSize); - ret = part->mbd.mtd->read(part->mbd.mtd, offset, sizeof(header), &retval, - (unsigned char *)&header); + ret = mtd_read(part->mbd.mtd, offset, sizeof(header), &retval, + (unsigned char *)&header); if (ret) goto out_XferInfo; @@ -289,9 +289,9 @@ static int build_maps(partition_t *part) part->EUNInfo[i].Deleted = 0; offset = part->EUNInfo[i].Offset + le32_to_cpu(header.BAMOffset); - ret = part->mbd.mtd->read(part->mbd.mtd, offset, - part->BlocksPerUnit * sizeof(uint32_t), &retval, - (unsigned char *)part->bam_cache); + ret = mtd_read(part->mbd.mtd, offset, + part->BlocksPerUnit * sizeof(uint32_t), &retval, + (unsigned char *)part->bam_cache); if (ret) goto out_bam_cache; @@ -485,9 +485,9 @@ static int copy_erase_unit(partition_t *part, uint16_t srcunit, offset = eun->Offset + le32_to_cpu(part->header.BAMOffset); - ret = part->mbd.mtd->read(part->mbd.mtd, offset, - part->BlocksPerUnit * sizeof(uint32_t), - &retlen, (u_char *) (part->bam_cache)); + ret = mtd_read(part->mbd.mtd, offset, + part->BlocksPerUnit * sizeof(uint32_t), &retlen, + (u_char *)(part->bam_cache)); /* mark the cache bad, in case we get an error later */ part->bam_index = 0xffff; @@ -523,8 +523,8 @@ static int copy_erase_unit(partition_t *part, uint16_t srcunit, break; case BLOCK_DATA: case BLOCK_REPLACEMENT: - ret = part->mbd.mtd->read(part->mbd.mtd, src, SECTOR_SIZE, - &retlen, (u_char *) buf); + ret = mtd_read(part->mbd.mtd, src, SECTOR_SIZE, &retlen, + (u_char *)buf); if (ret) { printk(KERN_WARNING "ftl: Error reading old xfer unit in copy_erase_unit\n"); return ret; @@ -747,10 +747,11 @@ static uint32_t find_free(partition_t *part) /* Invalidate cache */ part->bam_index = 0xffff; - ret = part->mbd.mtd->read(part->mbd.mtd, - part->EUNInfo[eun].Offset + le32_to_cpu(part->header.BAMOffset), - part->BlocksPerUnit * sizeof(uint32_t), - &retlen, (u_char *) (part->bam_cache)); + ret = mtd_read(part->mbd.mtd, + part->EUNInfo[eun].Offset + le32_to_cpu(part->header.BAMOffset), + part->BlocksPerUnit * sizeof(uint32_t), + &retlen, + (u_char *)(part->bam_cache)); if (ret) { printk(KERN_WARNING"ftl: Error reading BAM in find_free\n"); @@ -810,8 +811,8 @@ static int ftl_read(partition_t *part, caddr_t buffer, else { offset = (part->EUNInfo[log_addr / bsize].Offset + (log_addr % bsize)); - ret = part->mbd.mtd->read(part->mbd.mtd, offset, SECTOR_SIZE, - &retlen, (u_char *) buffer); + ret = mtd_read(part->mbd.mtd, offset, SECTOR_SIZE, &retlen, + (u_char *)buffer); if (ret) { printk(KERN_WARNING "Error reading MTD device in ftl_read()\n"); @@ -849,8 +850,8 @@ static int set_bam_entry(partition_t *part, uint32_t log_addr, le32_to_cpu(part->header.BAMOffset)); #ifdef PSYCHO_DEBUG - ret = part->mbd.mtd->read(part->mbd.mtd, offset, sizeof(uint32_t), - &retlen, (u_char *)&old_addr); + ret = mtd_read(part->mbd.mtd, offset, sizeof(uint32_t), &retlen, + (u_char *)&old_addr); if (ret) { printk(KERN_WARNING"ftl: Error reading old_addr in set_bam_entry: %d\n",ret); return ret; diff --git a/drivers/mtd/inftlcore.c b/drivers/mtd/inftlcore.c index dd034efd187..0b038bed7b9 100644 --- a/drivers/mtd/inftlcore.c +++ 
b/drivers/mtd/inftlcore.c @@ -343,14 +343,17 @@ static u16 INFTL_foldchain(struct INFTLrecord *inftl, unsigned thisVUC, unsigned if (BlockMap[block] == BLOCK_NIL) continue; - ret = mtd->read(mtd, (inftl->EraseSize * BlockMap[block]) + - (block * SECTORSIZE), SECTORSIZE, &retlen, - movebuf); + ret = mtd_read(mtd, + (inftl->EraseSize * BlockMap[block]) + (block * SECTORSIZE), + SECTORSIZE, + &retlen, + movebuf); if (ret < 0 && !mtd_is_bitflip(ret)) { - ret = mtd->read(mtd, - (inftl->EraseSize * BlockMap[block]) + - (block * SECTORSIZE), SECTORSIZE, - &retlen, movebuf); + ret = mtd_read(mtd, + (inftl->EraseSize * BlockMap[block]) + (block * SECTORSIZE), + SECTORSIZE, + &retlen, + movebuf); if (ret != -EIO) pr_debug("INFTL: error went away on retry?\n"); } @@ -914,7 +917,7 @@ foundit: } else { size_t retlen; loff_t ptr = (thisEUN * inftl->EraseSize) + blockofs; - int ret = mtd->read(mtd, ptr, SECTORSIZE, &retlen, buffer); + int ret = mtd_read(mtd, ptr, SECTORSIZE, &retlen, buffer); /* Handle corrected bit flips gracefully */ if (ret < 0 && !mtd_is_bitflip(ret)) diff --git a/drivers/mtd/inftlmount.c b/drivers/mtd/inftlmount.c index 0d946f10a68..9bfbca5d88d 100644 --- a/drivers/mtd/inftlmount.c +++ b/drivers/mtd/inftlmount.c @@ -73,8 +73,8 @@ static int find_boot_record(struct INFTLrecord *inftl) * Check for BNAND header first. Then whinge if it's found * but later checks fail. */ - ret = mtd->read(mtd, block * inftl->EraseSize, - SECTORSIZE, &retlen, buf); + ret = mtd_read(mtd, block * inftl->EraseSize, SECTORSIZE, + &retlen, buf); /* We ignore ret in case the ECC of the MediaHeader is invalid (which is apparently acceptable) */ if (retlen != SECTORSIZE) { @@ -118,8 +118,8 @@ static int find_boot_record(struct INFTLrecord *inftl) memcpy(mh, buf, sizeof(struct INFTLMediaHeader)); /* Read the spare media header at offset 4096 */ - mtd->read(mtd, block * inftl->EraseSize + 4096, - SECTORSIZE, &retlen, buf); + mtd_read(mtd, block * inftl->EraseSize + 4096, SECTORSIZE, + &retlen, buf); if (retlen != SECTORSIZE) { printk(KERN_WARNING "INFTL: Unable to read spare " "Media Header\n"); @@ -342,7 +342,7 @@ static int check_free_sectors(struct INFTLrecord *inftl, unsigned int address, int i; for (i = 0; i < len; i += SECTORSIZE) { - if (mtd->read(mtd, address, SECTORSIZE, &retlen, buf)) + if (mtd_read(mtd, address, SECTORSIZE, &retlen, buf)) return -1; if (memcmpb(buf, 0xff, SECTORSIZE) != 0) return -1; diff --git a/drivers/mtd/mtdblock.c b/drivers/mtd/mtdblock.c index 9b01cb0266e..b0644d2d2a6 100644 --- a/drivers/mtd/mtdblock.c +++ b/drivers/mtd/mtdblock.c @@ -184,8 +184,8 @@ static int do_cached_write (struct mtdblk_dev *mtdblk, unsigned long pos, mtdblk->cache_offset != sect_start) { /* fill the cache with the current sector */ mtdblk->cache_state = STATE_EMPTY; - ret = mtd->read(mtd, sect_start, sect_size, - &retlen, mtdblk->cache_data); + ret = mtd_read(mtd, sect_start, sect_size, + &retlen, mtdblk->cache_data); if (ret) return ret; if (retlen != sect_size) @@ -222,7 +222,7 @@ static int do_cached_read (struct mtdblk_dev *mtdblk, unsigned long pos, mtd->name, pos, len); if (!sect_size) - return mtd->read(mtd, pos, len, &retlen, buf); + return mtd_read(mtd, pos, len, &retlen, buf); while (len > 0) { unsigned long sect_start = (pos/sect_size)*sect_size; @@ -241,7 +241,7 @@ static int do_cached_read (struct mtdblk_dev *mtdblk, unsigned long pos, mtdblk->cache_offset == sect_start) { memcpy (buf, mtdblk->cache_data + offset, size); } else { - ret = mtd->read(mtd, pos, size, &retlen, buf); + ret = 
mtd_read(mtd, pos, size, &retlen, buf); if (ret) return ret; if (retlen != size) diff --git a/drivers/mtd/mtdblock_ro.c b/drivers/mtd/mtdblock_ro.c index 0470a6e8630..f5737b1153f 100644 --- a/drivers/mtd/mtdblock_ro.c +++ b/drivers/mtd/mtdblock_ro.c @@ -30,7 +30,7 @@ static int mtdblock_readsect(struct mtd_blktrans_dev *dev, { size_t retlen; - if (dev->mtd->read(dev->mtd, (block * 512), 512, &retlen, buf)) + if (mtd_read(dev->mtd, (block * 512), 512, &retlen, buf)) return 1; return 0; } diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c index c51f04a00af..c7f484687fa 100644 --- a/drivers/mtd/mtdchar.c +++ b/drivers/mtd/mtdchar.c @@ -232,7 +232,7 @@ static ssize_t mtdchar_read(struct file *file, char __user *buf, size_t count, break; } default: - ret = mtd->read(mtd, *ppos, len, &retlen, kbuf); + ret = mtd_read(mtd, *ppos, len, &retlen, kbuf); } /* Nand returns -EBADMSG on ECC errors, but it returns * the data. For our userspace tools it is important diff --git a/drivers/mtd/mtdconcat.c b/drivers/mtd/mtdconcat.c index b3895cf20bb..45460349fd1 100644 --- a/drivers/mtd/mtdconcat.c +++ b/drivers/mtd/mtdconcat.c @@ -91,7 +91,7 @@ concat_read(struct mtd_info *mtd, loff_t from, size_t len, /* Entire transaction goes into this subdev */ size = len; - err = subdev->read(subdev, from, size, &retsize, buf); + err = mtd_read(subdev, from, size, &retsize, buf); /* Save information about bitflips! */ if (unlikely(err)) { diff --git a/drivers/mtd/mtdoops.c b/drivers/mtd/mtdoops.c index 9b2d8632316..23629ad0850 100644 --- a/drivers/mtd/mtdoops.c +++ b/drivers/mtd/mtdoops.c @@ -258,8 +258,8 @@ static void find_next_position(struct mtdoops_context *cxt) continue; /* Assume the page is used */ mark_page_used(cxt, page); - ret = mtd->read(mtd, page * record_size, MTDOOPS_HEADER_SIZE, - &retlen, (u_char *) &count[0]); + ret = mtd_read(mtd, page * record_size, MTDOOPS_HEADER_SIZE, + &retlen, (u_char *)&count[0]); if (retlen != MTDOOPS_HEADER_SIZE || (ret < 0 && !mtd_is_bitflip(ret))) { printk(KERN_ERR "mtdoops: read failure at %ld (%td of %d read), err %d\n", diff --git a/drivers/mtd/mtdpart.c b/drivers/mtd/mtdpart.c index 55a9cb544fc..59cd7974bc5 100644 --- a/drivers/mtd/mtdpart.c +++ b/drivers/mtd/mtdpart.c @@ -70,8 +70,7 @@ static int part_read(struct mtd_info *mtd, loff_t from, size_t len, len = 0; else if (from + len > mtd->size) len = mtd->size - from; - res = part->master->read(part->master, from + part->offset, - len, retlen, buf); + res = mtd_read(part->master, from + part->offset, len, retlen, buf); if (unlikely(res)) { if (mtd_is_bitflip(res)) mtd->ecc_stats.corrected += part->master->ecc_stats.corrected - stats.corrected; diff --git a/drivers/mtd/mtdswap.c b/drivers/mtd/mtdswap.c index 4e12875a916..b3282d2aa8f 100644 --- a/drivers/mtd/mtdswap.c +++ b/drivers/mtd/mtdswap.c @@ -736,7 +736,7 @@ static int mtdswap_move_block(struct mtdswap_dev *d, unsigned int oldblock, retries = 0; retry: - ret = mtd->read(mtd, readpos, PAGE_SIZE, &retlen, d->page_buf); + ret = mtd_read(mtd, readpos, PAGE_SIZE, &retlen, d->page_buf); if (ret < 0 && !mtd_is_bitflip(ret)) { oldeb = d->eb_data + oldblock / d->pages_per_eblk; @@ -1161,7 +1161,7 @@ static int mtdswap_readsect(struct mtd_blktrans_dev *dev, retries = 0; retry: - ret = mtd->read(mtd, readpos, PAGE_SIZE, &retlen, buf); + ret = mtd_read(mtd, readpos, PAGE_SIZE, &retlen, buf); d->mtd_read_count++; if (mtd_is_bitflip(ret)) { diff --git a/drivers/mtd/nand/diskonchip.c b/drivers/mtd/nand/diskonchip.c index 5780dbab611..df921e7a496 100644 --- 
a/drivers/mtd/nand/diskonchip.c +++ b/drivers/mtd/nand/diskonchip.c @@ -1072,7 +1072,7 @@ static int __init find_media_headers(struct mtd_info *mtd, u_char *buf, const ch size_t retlen; for (offs = 0; offs < mtd->size; offs += mtd->erasesize) { - ret = mtd->read(mtd, offs, mtd->writesize, &retlen, buf); + ret = mtd_read(mtd, offs, mtd->writesize, &retlen, buf); if (retlen != mtd->writesize) continue; if (ret) { @@ -1097,7 +1097,7 @@ static int __init find_media_headers(struct mtd_info *mtd, u_char *buf, const ch /* Only one mediaheader was found. We want buf to contain a mediaheader on return, so we'll have to re-read the one we found. */ offs = doc->mh0_page << this->page_shift; - ret = mtd->read(mtd, offs, mtd->writesize, &retlen, buf); + ret = mtd_read(mtd, offs, mtd->writesize, &retlen, buf); if (retlen != mtd->writesize) { /* Insanity. Give up. */ printk(KERN_ERR "Read DiskOnChip Media Header once, but can't reread it???\n"); diff --git a/drivers/mtd/nand/nand_bbt.c b/drivers/mtd/nand/nand_bbt.c index 69148ae3bf5..1bcd6bc6798 100644 --- a/drivers/mtd/nand/nand_bbt.c +++ b/drivers/mtd/nand/nand_bbt.c @@ -201,7 +201,7 @@ static int read_bbt(struct mtd_info *mtd, uint8_t *buf, int page, int num, from += marker_len; marker_len = 0; } - res = mtd->read(mtd, from, len, &retlen, buf); + res = mtd_read(mtd, from, len, &retlen, buf); if (res < 0) { if (mtd_is_eccerr(res)) { pr_info("nand_bbt: ECC error in BBT at " @@ -298,7 +298,7 @@ static int scan_read_raw_data(struct mtd_info *mtd, uint8_t *buf, loff_t offs, if (td->options & NAND_BBT_VERSION) len++; - return mtd->read(mtd, offs, len, &retlen, buf); + return mtd_read(mtd, offs, len, &retlen, buf); } /* Scan read raw data from flash */ @@ -756,7 +756,7 @@ static int write_bbt(struct mtd_info *mtd, uint8_t *buf, /* Make it block aligned */ to &= ~((loff_t)((1 << this->bbt_erase_shift) - 1)); len = 1 << this->bbt_erase_shift; - res = mtd->read(mtd, to, len, &retlen, buf); + res = mtd_read(mtd, to, len, &retlen, buf); if (res < 0) { if (retlen != len) { pr_info("nand_bbt: error reading block " diff --git a/drivers/mtd/nftlcore.c b/drivers/mtd/nftlcore.c index cda77b562ad..1a9d9c1d3a7 100644 --- a/drivers/mtd/nftlcore.c +++ b/drivers/mtd/nftlcore.c @@ -423,12 +423,17 @@ static u16 NFTL_foldchain (struct NFTLrecord *nftl, unsigned thisVUC, unsigned p if (BlockMap[block] == BLOCK_NIL) continue; - ret = mtd->read(mtd, (nftl->EraseSize * BlockMap[block]) + (block * 512), - 512, &retlen, movebuf); + ret = mtd_read(mtd, + (nftl->EraseSize * BlockMap[block]) + (block * 512), + 512, + &retlen, + movebuf); if (ret < 0 && !mtd_is_bitflip(ret)) { - ret = mtd->read(mtd, (nftl->EraseSize * BlockMap[block]) - + (block * 512), 512, &retlen, - movebuf); + ret = mtd_read(mtd, + (nftl->EraseSize * BlockMap[block]) + (block * 512), + 512, + &retlen, + movebuf); if (ret != -EIO) printk("Error went away on retry.\n"); } @@ -771,7 +776,7 @@ static int nftl_readblock(struct mtd_blktrans_dev *mbd, unsigned long block, } else { loff_t ptr = (lastgoodEUN * nftl->EraseSize) + blockofs; size_t retlen; - int res = mtd->read(mtd, ptr, 512, &retlen, buffer); + int res = mtd_read(mtd, ptr, 512, &retlen, buffer); if (res < 0 && !mtd_is_bitflip(res)) return -EIO; diff --git a/drivers/mtd/nftlmount.c b/drivers/mtd/nftlmount.c index 9164a56fb5c..b068dc8a366 100644 --- a/drivers/mtd/nftlmount.c +++ b/drivers/mtd/nftlmount.c @@ -63,8 +63,8 @@ static int find_boot_record(struct NFTLrecord *nftl) /* Check for ANAND header first. 
Then can whinge if it's found but later checks fail */ - ret = mtd->read(mtd, block * nftl->EraseSize, SECTORSIZE, - &retlen, buf); + ret = mtd_read(mtd, block * nftl->EraseSize, SECTORSIZE, + &retlen, buf); /* We ignore ret in case the ECC of the MediaHeader is invalid (which is apparently acceptable) */ if (retlen != SECTORSIZE) { @@ -274,7 +274,7 @@ static int check_free_sectors(struct NFTLrecord *nftl, unsigned int address, int int i; for (i = 0; i < len; i += SECTORSIZE) { - if (mtd->read(mtd, address, SECTORSIZE, &retlen, buf)) + if (mtd_read(mtd, address, SECTORSIZE, &retlen, buf)) return -1; if (memcmpb(buf, 0xff, SECTORSIZE) != 0) return -1; diff --git a/drivers/mtd/redboot.c b/drivers/mtd/redboot.c index e366b1d84ea..623d9b86d0d 100644 --- a/drivers/mtd/redboot.c +++ b/drivers/mtd/redboot.c @@ -104,8 +104,8 @@ static int parse_redboot_partitions(struct mtd_info *master, printk(KERN_NOTICE "Searching for RedBoot partition table in %s at offset 0x%lx\n", master->name, offset); - ret = master->read(master, offset, - master->erasesize, &retlen, (void *)buf); + ret = mtd_read(master, offset, master->erasesize, &retlen, + (void *)buf); if (ret) goto out; diff --git a/drivers/mtd/rfd_ftl.c b/drivers/mtd/rfd_ftl.c index 39de8727a52..d9fe2d0533d 100644 --- a/drivers/mtd/rfd_ftl.c +++ b/drivers/mtd/rfd_ftl.c @@ -200,9 +200,9 @@ static int scan_header(struct partition *part) part->sector_map[i] = -1; for (i=0, blocks_found=0; i<part->total_blocks; i++) { - rc = part->mbd.mtd->read(part->mbd.mtd, - i * part->block_size, part->header_size, - &retlen, (u_char*)part->header_cache); + rc = mtd_read(part->mbd.mtd, i * part->block_size, + part->header_size, &retlen, + (u_char *)part->header_cache); if (!rc && retlen != part->header_size) rc = -EIO; @@ -250,8 +250,8 @@ static int rfd_ftl_readsect(struct mtd_blktrans_dev *dev, u_long sector, char *b addr = part->sector_map[sector]; if (addr != -1) { - rc = part->mbd.mtd->read(part->mbd.mtd, addr, SECTOR_SIZE, - &retlen, (u_char*)buf); + rc = mtd_read(part->mbd.mtd, addr, SECTOR_SIZE, &retlen, + (u_char *)buf); if (!rc && retlen != SECTOR_SIZE) rc = -EIO; @@ -372,9 +372,8 @@ static int move_block_contents(struct partition *part, int block_no, u_long *old if (!map) goto err2; - rc = part->mbd.mtd->read(part->mbd.mtd, - part->blocks[block_no].offset, part->header_size, - &retlen, (u_char*)map); + rc = mtd_read(part->mbd.mtd, part->blocks[block_no].offset, + part->header_size, &retlen, (u_char *)map); if (!rc && retlen != part->header_size) rc = -EIO; @@ -413,8 +412,8 @@ static int move_block_contents(struct partition *part, int block_no, u_long *old } continue; } - rc = part->mbd.mtd->read(part->mbd.mtd, addr, - SECTOR_SIZE, &retlen, sector_data); + rc = mtd_read(part->mbd.mtd, addr, SECTOR_SIZE, &retlen, + sector_data); if (!rc && retlen != SECTOR_SIZE) rc = -EIO; @@ -563,8 +562,9 @@ static int find_writable_block(struct partition *part, u_long *old_sector) } } - rc = part->mbd.mtd->read(part->mbd.mtd, part->blocks[block].offset, - part->header_size, &retlen, (u_char*)part->header_cache); + rc = mtd_read(part->mbd.mtd, part->blocks[block].offset, + part->header_size, &retlen, + (u_char *)part->header_cache); if (!rc && retlen != part->header_size) rc = -EIO; diff --git a/drivers/mtd/ssfdc.c b/drivers/mtd/ssfdc.c index 976e3d28b96..293e22a5710 100644 --- a/drivers/mtd/ssfdc.c +++ b/drivers/mtd/ssfdc.c @@ -123,8 +123,8 @@ static int get_valid_cis_sector(struct mtd_info *mtd) */ for (k = 0, offset = 0; k < 4; k++, offset += mtd->erasesize) { if 
(!mtd->block_isbad(mtd, offset)) { - ret = mtd->read(mtd, offset, SECTOR_SIZE, &retlen, - sect_buf); + ret = mtd_read(mtd, offset, SECTOR_SIZE, &retlen, + sect_buf); /* CIS pattern match on the sector buffer */ if (ret < 0 || retlen != SECTOR_SIZE) { @@ -156,7 +156,7 @@ static int read_physical_sector(struct mtd_info *mtd, uint8_t *sect_buf, size_t retlen; loff_t offset = (loff_t)sect_no << SECTOR_SHIFT; - ret = mtd->read(mtd, offset, SECTOR_SIZE, &retlen, sect_buf); + ret = mtd_read(mtd, offset, SECTOR_SIZE, &retlen, sect_buf); if (ret < 0 || retlen != SECTOR_SIZE) return -1; diff --git a/drivers/mtd/tests/mtd_pagetest.c b/drivers/mtd/tests/mtd_pagetest.c index 271819fabb5..6d62e24a03e 100644 --- a/drivers/mtd/tests/mtd_pagetest.c +++ b/drivers/mtd/tests/mtd_pagetest.c @@ -127,7 +127,7 @@ static int verify_eraseblock(int ebnum) set_random_data(writebuf, mtd->erasesize); for (j = 0; j < pgcnt - 1; ++j, addr += pgsize) { /* Do a read to set the internal dataRAMs to different data */ - err = mtd->read(mtd, addr0, bufsize, &read, twopages); + err = mtd_read(mtd, addr0, bufsize, &read, twopages); if (mtd_is_bitflip(err)) err = 0; if (err || read != bufsize) { @@ -135,7 +135,7 @@ static int verify_eraseblock(int ebnum) (long long)addr0); return err; } - err = mtd->read(mtd, addrn - bufsize, bufsize, &read, twopages); + err = mtd_read(mtd, addrn - bufsize, bufsize, &read, twopages); if (mtd_is_bitflip(err)) err = 0; if (err || read != bufsize) { @@ -145,7 +145,7 @@ static int verify_eraseblock(int ebnum) } memset(twopages, 0, bufsize); read = 0; - err = mtd->read(mtd, addr, bufsize, &read, twopages); + err = mtd_read(mtd, addr, bufsize, &read, twopages); if (mtd_is_bitflip(err)) err = 0; if (err || read != bufsize) { @@ -163,7 +163,7 @@ static int verify_eraseblock(int ebnum) if (addr <= addrn - pgsize - pgsize && !bbt[ebnum + 1]) { unsigned long oldnext = next; /* Do a read to set the internal dataRAMs to different data */ - err = mtd->read(mtd, addr0, bufsize, &read, twopages); + err = mtd_read(mtd, addr0, bufsize, &read, twopages); if (mtd_is_bitflip(err)) err = 0; if (err || read != bufsize) { @@ -171,7 +171,7 @@ static int verify_eraseblock(int ebnum) (long long)addr0); return err; } - err = mtd->read(mtd, addrn - bufsize, bufsize, &read, twopages); + err = mtd_read(mtd, addrn - bufsize, bufsize, &read, twopages); if (mtd_is_bitflip(err)) err = 0; if (err || read != bufsize) { @@ -181,7 +181,7 @@ static int verify_eraseblock(int ebnum) } memset(twopages, 0, bufsize); read = 0; - err = mtd->read(mtd, addr, bufsize, &read, twopages); + err = mtd_read(mtd, addr, bufsize, &read, twopages); if (mtd_is_bitflip(err)) err = 0; if (err || read != bufsize) { @@ -230,7 +230,7 @@ static int crosstest(void) /* Read 2nd-to-last page to pp1 */ read = 0; addr = addrn - pgsize - pgsize; - err = mtd->read(mtd, addr, pgsize, &read, pp1); + err = mtd_read(mtd, addr, pgsize, &read, pp1); if (mtd_is_bitflip(err)) err = 0; if (err || read != pgsize) { @@ -243,7 +243,7 @@ static int crosstest(void) /* Read 3rd-to-last page to pp1 */ read = 0; addr = addrn - pgsize - pgsize - pgsize; - err = mtd->read(mtd, addr, pgsize, &read, pp1); + err = mtd_read(mtd, addr, pgsize, &read, pp1); if (mtd_is_bitflip(err)) err = 0; if (err || read != pgsize) { @@ -257,7 +257,7 @@ static int crosstest(void) read = 0; addr = addr0; printk(PRINT_PREF "reading page at %#llx\n", (long long)addr); - err = mtd->read(mtd, addr, pgsize, &read, pp2); + err = mtd_read(mtd, addr, pgsize, &read, pp2); if (mtd_is_bitflip(err)) err = 0; if (err || 
read != pgsize) { @@ -271,7 +271,7 @@ static int crosstest(void) read = 0; addr = addrn - pgsize; printk(PRINT_PREF "reading page at %#llx\n", (long long)addr); - err = mtd->read(mtd, addr, pgsize, &read, pp3); + err = mtd_read(mtd, addr, pgsize, &read, pp3); if (mtd_is_bitflip(err)) err = 0; if (err || read != pgsize) { @@ -285,7 +285,7 @@ static int crosstest(void) read = 0; addr = addr0; printk(PRINT_PREF "reading page at %#llx\n", (long long)addr); - err = mtd->read(mtd, addr, pgsize, &read, pp4); + err = mtd_read(mtd, addr, pgsize, &read, pp4); if (mtd_is_bitflip(err)) err = 0; if (err || read != pgsize) { @@ -344,7 +344,7 @@ static int erasecrosstest(void) printk(PRINT_PREF "reading 1st page of block %d\n", ebnum); memset(readbuf, 0, pgsize); - err = mtd->read(mtd, addr0, pgsize, &read, readbuf); + err = mtd_read(mtd, addr0, pgsize, &read, readbuf); if (mtd_is_bitflip(err)) err = 0; if (err || read != pgsize) { @@ -382,7 +382,7 @@ static int erasecrosstest(void) printk(PRINT_PREF "reading 1st page of block %d\n", ebnum); memset(readbuf, 0, pgsize); - err = mtd->read(mtd, addr0, pgsize, &read, readbuf); + err = mtd_read(mtd, addr0, pgsize, &read, readbuf); if (mtd_is_bitflip(err)) err = 0; if (err || read != pgsize) { @@ -438,7 +438,7 @@ static int erasetest(void) return err; printk(PRINT_PREF "reading 1st page of block %d\n", ebnum); - err = mtd->read(mtd, addr0, pgsize, &read, twopages); + err = mtd_read(mtd, addr0, pgsize, &read, twopages); if (mtd_is_bitflip(err)) err = 0; if (err || read != pgsize) { diff --git a/drivers/mtd/tests/mtd_readtest.c b/drivers/mtd/tests/mtd_readtest.c index 550fe51225a..0c58d2976c7 100644 --- a/drivers/mtd/tests/mtd_readtest.c +++ b/drivers/mtd/tests/mtd_readtest.c @@ -52,7 +52,7 @@ static int read_eraseblock_by_page(int ebnum) for (i = 0; i < pgcnt; i++) { memset(buf, 0 , pgcnt); - ret = mtd->read(mtd, addr, pgsize, &read, buf); + ret = mtd_read(mtd, addr, pgsize, &read, buf); if (ret == -EUCLEAN) ret = 0; if (ret || read != pgsize) { diff --git a/drivers/mtd/tests/mtd_speedtest.c b/drivers/mtd/tests/mtd_speedtest.c index f67a65e2104..3c9529bd0a6 100644 --- a/drivers/mtd/tests/mtd_speedtest.c +++ b/drivers/mtd/tests/mtd_speedtest.c @@ -214,7 +214,7 @@ static int read_eraseblock(int ebnum) int err = 0; loff_t addr = ebnum * mtd->erasesize; - err = mtd->read(mtd, addr, mtd->erasesize, &read, iobuf); + err = mtd_read(mtd, addr, mtd->erasesize, &read, iobuf); /* Ignore corrected ECC errors */ if (mtd_is_bitflip(err)) err = 0; @@ -235,7 +235,7 @@ static int read_eraseblock_by_page(int ebnum) void *buf = iobuf; for (i = 0; i < pgcnt; i++) { - err = mtd->read(mtd, addr, pgsize, &read, buf); + err = mtd_read(mtd, addr, pgsize, &read, buf); /* Ignore corrected ECC errors */ if (mtd_is_bitflip(err)) err = 0; @@ -261,7 +261,7 @@ static int read_eraseblock_by_2pages(int ebnum) void *buf = iobuf; for (i = 0; i < n; i++) { - err = mtd->read(mtd, addr, sz, &read, buf); + err = mtd_read(mtd, addr, sz, &read, buf); /* Ignore corrected ECC errors */ if (mtd_is_bitflip(err)) err = 0; @@ -276,7 +276,7 @@ static int read_eraseblock_by_2pages(int ebnum) buf += sz; } if (pgcnt % 2) { - err = mtd->read(mtd, addr, pgsize, &read, buf); + err = mtd_read(mtd, addr, pgsize, &read, buf); /* Ignore corrected ECC errors */ if (mtd_is_bitflip(err)) err = 0; diff --git a/drivers/mtd/tests/mtd_stresstest.c b/drivers/mtd/tests/mtd_stresstest.c index a204a9f9052..83a84372388 100644 --- a/drivers/mtd/tests/mtd_stresstest.c +++ b/drivers/mtd/tests/mtd_stresstest.c @@ -153,7 +153,7 @@ 
static int do_read(void) len = mtd->erasesize - offs; } addr = eb * mtd->erasesize + offs; - err = mtd->read(mtd, addr, len, &read, readbuf); + err = mtd_read(mtd, addr, len, &read, readbuf); if (mtd_is_bitflip(err)) err = 0; if (unlikely(err || read != len)) { diff --git a/drivers/mtd/tests/mtd_subpagetest.c b/drivers/mtd/tests/mtd_subpagetest.c index 16d0c05024d..d81f89a19da 100644 --- a/drivers/mtd/tests/mtd_subpagetest.c +++ b/drivers/mtd/tests/mtd_subpagetest.c @@ -196,7 +196,7 @@ static int verify_eraseblock(int ebnum) set_random_data(writebuf, subpgsize); clear_data(readbuf, subpgsize); read = 0; - err = mtd->read(mtd, addr, subpgsize, &read, readbuf); + err = mtd_read(mtd, addr, subpgsize, &read, readbuf); if (unlikely(err || read != subpgsize)) { if (mtd_is_bitflip(err) && read == subpgsize) { printk(PRINT_PREF "ECC correction at %#llx\n", @@ -224,7 +224,7 @@ static int verify_eraseblock(int ebnum) set_random_data(writebuf, subpgsize); clear_data(readbuf, subpgsize); read = 0; - err = mtd->read(mtd, addr, subpgsize, &read, readbuf); + err = mtd_read(mtd, addr, subpgsize, &read, readbuf); if (unlikely(err || read != subpgsize)) { if (mtd_is_bitflip(err) && read == subpgsize) { printk(PRINT_PREF "ECC correction at %#llx\n", @@ -262,7 +262,7 @@ static int verify_eraseblock2(int ebnum) set_random_data(writebuf, subpgsize * k); clear_data(readbuf, subpgsize * k); read = 0; - err = mtd->read(mtd, addr, subpgsize * k, &read, readbuf); + err = mtd_read(mtd, addr, subpgsize * k, &read, readbuf); if (unlikely(err || read != subpgsize * k)) { if (mtd_is_bitflip(err) && read == subpgsize * k) { printk(PRINT_PREF "ECC correction at %#llx\n", @@ -296,7 +296,7 @@ static int verify_eraseblock_ff(int ebnum) for (j = 0; j < mtd->erasesize / subpgsize; ++j) { clear_data(readbuf, subpgsize); read = 0; - err = mtd->read(mtd, addr, subpgsize, &read, readbuf); + err = mtd_read(mtd, addr, subpgsize, &read, readbuf); if (unlikely(err || read != subpgsize)) { if (mtd_is_bitflip(err) && read == subpgsize) { printk(PRINT_PREF "ECC correction at %#llx\n", diff --git a/drivers/mtd/tests/mtd_torturetest.c b/drivers/mtd/tests/mtd_torturetest.c index 102c79b7ac6..ecc68bf3f3f 100644 --- a/drivers/mtd/tests/mtd_torturetest.c +++ b/drivers/mtd/tests/mtd_torturetest.c @@ -137,7 +137,7 @@ static inline int check_eraseblock(int ebnum, unsigned char *buf) } retry: - err = mtd->read(mtd, addr, len, &read, check_buf); + err = mtd_read(mtd, addr, len, &read, check_buf); if (mtd_is_bitflip(err)) printk(PRINT_PREF "single bit flip occurred at EB %d " "MTD reported that it was fixed.\n", ebnum); diff --git a/drivers/mtd/ubi/debug.c b/drivers/mtd/ubi/debug.c index ab80c0debac..e2cdebf4084 100644 --- a/drivers/mtd/ubi/debug.c +++ b/drivers/mtd/ubi/debug.c @@ -216,7 +216,7 @@ void ubi_dbg_dump_flash(struct ubi_device *ubi, int pnum, int offset, int len) buf = vmalloc(len); if (!buf) return; - err = ubi->mtd->read(ubi->mtd, addr, len, &read, buf); + err = mtd_read(ubi->mtd, addr, len, &read, buf); if (err && err != -EUCLEAN) { ubi_err("error %d while reading %d bytes from PEB %d:%d, " "read %zd bytes", err, len, pnum, offset, read); diff --git a/drivers/mtd/ubi/io.c b/drivers/mtd/ubi/io.c index b6c8959e6c7..433382951d3 100644 --- a/drivers/mtd/ubi/io.c +++ b/drivers/mtd/ubi/io.c @@ -170,7 +170,7 @@ int ubi_io_read(const struct ubi_device *ubi, void *buf, int pnum, int offset, addr = (loff_t)pnum * ubi->peb_size + offset; retry: - err = ubi->mtd->read(ubi->mtd, addr, len, &read, buf); + err = mtd_read(ubi->mtd, addr, len, &read, 
buf); if (err) { const char *errstr = mtd_is_eccerr(err) ? " (ECC error)" : ""; @@ -1357,7 +1357,7 @@ int ubi_dbg_check_write(struct ubi_device *ubi, const void *buf, int pnum, return 0; } - err = ubi->mtd->read(ubi->mtd, addr, len, &read, buf1); + err = mtd_read(ubi->mtd, addr, len, &read, buf1); if (err && !mtd_is_bitflip(err)) goto out_free; @@ -1421,7 +1421,7 @@ int ubi_dbg_check_all_ff(struct ubi_device *ubi, int pnum, int offset, int len) return 0; } - err = ubi->mtd->read(ubi->mtd, addr, len, &read, buf); + err = mtd_read(ubi->mtd, addr, len, &read, buf); if (err && !mtd_is_bitflip(err)) { ubi_err("error %d while reading %d bytes from PEB %d:%d, " "read %zd bytes", err, len, pnum, offset, read); diff --git a/drivers/staging/spectra/lld_mtd.c b/drivers/staging/spectra/lld_mtd.c index d638fafab64..eccd08d0e00 100644 --- a/drivers/staging/spectra/lld_mtd.c +++ b/drivers/staging/spectra/lld_mtd.c @@ -283,9 +283,11 @@ u16 mtd_Read_Page_Main(u8 *read_data, u32 Block, while (PageCount) { - ret = spectra_mtd->read(spectra_mtd, - (Block * spectra_mtd->erasesize) + (Page * spectra_mtd->writesize), - DeviceInfo.wPageDataSize, &retlen, read_data); + ret = mtd_read(spectra_mtd, + (Block * spectra_mtd->erasesize) + (Page * spectra_mtd->writesize), + DeviceInfo.wPageDataSize, + &retlen, + read_data); if (ret) { printk(KERN_ERR "%s failed %d\n", __func__, ret); return FAIL; diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c index ffdf4fca9c5..c59d642cade 100644 --- a/fs/jffs2/erase.c +++ b/fs/jffs2/erase.c @@ -381,7 +381,7 @@ static int jffs2_block_check_erase(struct jffs2_sb_info *c, struct jffs2_erasebl *bad_offset = ofs; - ret = c->mtd->read(c->mtd, ofs, readlen, &retlen, ebuf); + ret = mtd_read(c->mtd, ofs, readlen, &retlen, ebuf); if (ret) { printk(KERN_WARNING "Read of newly-erased block at 0x%08x failed: %d. Putting on bad_list\n", ofs, ret); ret = -EIO; diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c index b09e51d2f81..a24d3d21b63 100644 --- a/fs/jffs2/wbuf.c +++ b/fs/jffs2/wbuf.c @@ -228,7 +228,7 @@ static int jffs2_verify_write(struct jffs2_sb_info *c, unsigned char *buf, size_t retlen; char *eccstr; - ret = c->mtd->read(c->mtd, ofs, c->wbuf_pagesize, &retlen, c->wbuf_verify); + ret = mtd_read(c->mtd, ofs, c->wbuf_pagesize, &retlen, c->wbuf_verify); if (ret && ret != -EUCLEAN && ret != -EBADMSG) { printk(KERN_WARNING "jffs2_verify_write(): Read back of page at %08x failed: %d\n", c->wbuf_ofs, ret); return ret; @@ -337,7 +337,8 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c) } /* Do the read... */ - ret = c->mtd->read(c->mtd, start, c->wbuf_ofs - start, &retlen, buf); + ret = mtd_read(c->mtd, start, c->wbuf_ofs - start, &retlen, + buf); /* ECC recovered ? 
*/ if ((ret == -EUCLEAN || ret == -EBADMSG) && @@ -948,11 +949,11 @@ int jffs2_flash_read(struct jffs2_sb_info *c, loff_t ofs, size_t len, size_t *re int ret; if (!jffs2_is_writebuffered(c)) - return c->mtd->read(c->mtd, ofs, len, retlen, buf); + return mtd_read(c->mtd, ofs, len, retlen, buf); /* Read flash */ down_read(&c->wbuf_sem); - ret = c->mtd->read(c->mtd, ofs, len, retlen, buf); + ret = mtd_read(c->mtd, ofs, len, retlen, buf); if ( (ret == -EBADMSG || ret == -EUCLEAN) && (*retlen == len) ) { if (ret == -EBADMSG) diff --git a/fs/logfs/dev_mtd.c b/fs/logfs/dev_mtd.c index 04636289435..3ee64351685 100644 --- a/fs/logfs/dev_mtd.c +++ b/fs/logfs/dev_mtd.c @@ -20,7 +20,7 @@ static int logfs_mtd_read(struct super_block *sb, loff_t ofs, size_t len, size_t retlen; int ret; - ret = mtd->read(mtd, ofs, len, &retlen, buf); + ret = mtd_read(mtd, ofs, len, &retlen, buf); BUG_ON(ret == -EINVAL); if (ret) return ret; diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h index f38e8276b40..56478eb4bbc 100644 --- a/include/linux/mtd/mtd.h +++ b/include/linux/mtd/mtd.h @@ -182,6 +182,8 @@ struct mtd_info { unsigned long len, unsigned long offset, unsigned long flags); + int (*read) (struct mtd_info *mtd, loff_t from, size_t len, + size_t *retlen, u_char *buf); /* Backing device capabilities for this device * - provides mmap capabilities @@ -189,7 +191,6 @@ struct mtd_info { struct backing_dev_info *backing_dev_info; - int (*read) (struct mtd_info *mtd, loff_t from, size_t len, size_t *retlen, u_char *buf); int (*write) (struct mtd_info *mtd, loff_t to, size_t len, size_t *retlen, const u_char *buf); /* In blackbox flight recorder like scenarios we want to make successful @@ -301,6 +302,12 @@ static inline unsigned long mtd_get_unmapped_area(struct mtd_info *mtd, return mtd->get_unmapped_area(mtd, len, offset, flags); } +static inline int mtd_read(struct mtd_info *mtd, loff_t from, size_t len, + size_t *retlen, u_char *buf) +{ + return mtd->read(mtd, from, len, retlen, buf); +} + static inline struct mtd_info *dev_to_mtd(struct device *dev) { return dev ? 
dev_get_drvdata(dev) : NULL; -- cgit v1.2.3 From eda95cbf75193808f62948fb0142ba0901d8bee2 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Fri, 23 Dec 2011 17:35:41 +0200 Subject: mtd: introduce mtd_write interface Signed-off-by: Artem Bityutskiy Signed-off-by: David Woodhouse --- drivers/mtd/chips/cfi_cmdset_0020.c | 8 +++++--- drivers/mtd/ftl.c | 35 ++++++++++++++++++----------------- drivers/mtd/mtdblock.c | 4 ++-- drivers/mtd/mtdblock_ro.c | 2 +- drivers/mtd/mtdchar.c | 2 +- drivers/mtd/mtdconcat.c | 2 +- drivers/mtd/mtdcore.c | 3 ++- drivers/mtd/mtdoops.c | 4 ++-- drivers/mtd/mtdpart.c | 3 +-- drivers/mtd/mtdswap.c | 2 +- drivers/mtd/rfd_ftl.c | 17 ++++++++--------- drivers/mtd/tests/mtd_pagetest.c | 8 ++++---- drivers/mtd/tests/mtd_speedtest.c | 8 ++++---- drivers/mtd/tests/mtd_stresstest.c | 2 +- drivers/mtd/tests/mtd_subpagetest.c | 6 +++--- drivers/mtd/tests/mtd_torturetest.c | 2 +- drivers/mtd/ubi/io.c | 7 +++---- drivers/staging/spectra/lld_mtd.c | 8 +++++--- fs/jffs2/wbuf.c | 18 +++++++++--------- fs/jffs2/writev.c | 5 +++-- fs/logfs/dev_mtd.c | 2 +- include/linux/mtd/mtd.h | 9 ++++++++- 22 files changed, 84 insertions(+), 73 deletions(-) (limited to 'fs') diff --git a/drivers/mtd/chips/cfi_cmdset_0020.c b/drivers/mtd/chips/cfi_cmdset_0020.c index 666c52f8bf8..85e80180b65 100644 --- a/drivers/mtd/chips/cfi_cmdset_0020.c +++ b/drivers/mtd/chips/cfi_cmdset_0020.c @@ -699,7 +699,8 @@ cfi_staa_writev(struct mtd_info *mtd, const struct kvec *vecs, continue; } memcpy(buffer+buflen, elem_base, ECCBUF_SIZE-buflen); - ret = mtd->write(mtd, to, ECCBUF_SIZE, &thislen, buffer); + ret = mtd_write(mtd, to, ECCBUF_SIZE, &thislen, + buffer); totlen += thislen; if (ret || thislen != ECCBUF_SIZE) goto write_error; @@ -708,7 +709,8 @@ cfi_staa_writev(struct mtd_info *mtd, const struct kvec *vecs, to += ECCBUF_SIZE; } if (ECCBUF_DIV(elem_len)) { /* write clean aligned data */ - ret = mtd->write(mtd, to, ECCBUF_DIV(elem_len), &thislen, elem_base); + ret = mtd_write(mtd, to, ECCBUF_DIV(elem_len), + &thislen, elem_base); totlen += thislen; if (ret || thislen != ECCBUF_DIV(elem_len)) goto write_error; @@ -722,7 +724,7 @@ cfi_staa_writev(struct mtd_info *mtd, const struct kvec *vecs, } if (buflen) { /* flush last page, even if not full */ /* This is sometimes intended behaviour, really */ - ret = mtd->write(mtd, to, buflen, &thislen, buffer); + ret = mtd_write(mtd, to, buflen, &thislen, buffer); totlen += thislen; if (ret || thislen != ECCBUF_SIZE) goto write_error; diff --git a/drivers/mtd/ftl.c b/drivers/mtd/ftl.c index 12fd7ebd3fd..d591b1d0a6c 100644 --- a/drivers/mtd/ftl.c +++ b/drivers/mtd/ftl.c @@ -422,8 +422,8 @@ static int prepare_xfer(partition_t *part, int i) header.LogicalEUN = cpu_to_le16(0xffff); header.EraseCount = cpu_to_le32(xfer->EraseCount); - ret = part->mbd.mtd->write(part->mbd.mtd, xfer->Offset, sizeof(header), - &retlen, (u_char *)&header); + ret = mtd_write(part->mbd.mtd, xfer->Offset, sizeof(header), &retlen, + (u_char *)&header); if (ret) { return ret; @@ -438,8 +438,8 @@ static int prepare_xfer(partition_t *part, int i) for (i = 0; i < nbam; i++, offset += sizeof(uint32_t)) { - ret = part->mbd.mtd->write(part->mbd.mtd, offset, sizeof(uint32_t), - &retlen, (u_char *)&ctl); + ret = mtd_write(part->mbd.mtd, offset, sizeof(uint32_t), &retlen, + (u_char *)&ctl); if (ret) return ret; @@ -503,8 +503,8 @@ static int copy_erase_unit(partition_t *part, uint16_t srcunit, offset = xfer->Offset + 20; /* Bad! 
*/ unit = cpu_to_le16(0x7fff); - ret = part->mbd.mtd->write(part->mbd.mtd, offset, sizeof(uint16_t), - &retlen, (u_char *) &unit); + ret = mtd_write(part->mbd.mtd, offset, sizeof(uint16_t), &retlen, + (u_char *)&unit); if (ret) { printk( KERN_WARNING "ftl: Failed to write back to BAM cache in copy_erase_unit()!\n"); @@ -531,8 +531,8 @@ static int copy_erase_unit(partition_t *part, uint16_t srcunit, } - ret = part->mbd.mtd->write(part->mbd.mtd, dest, SECTOR_SIZE, - &retlen, (u_char *) buf); + ret = mtd_write(part->mbd.mtd, dest, SECTOR_SIZE, &retlen, + (u_char *)buf); if (ret) { printk(KERN_WARNING "ftl: Error writing new xfer unit in copy_erase_unit\n"); return ret; @@ -550,9 +550,11 @@ static int copy_erase_unit(partition_t *part, uint16_t srcunit, } /* Write the BAM to the transfer unit */ - ret = part->mbd.mtd->write(part->mbd.mtd, xfer->Offset + le32_to_cpu(part->header.BAMOffset), - part->BlocksPerUnit * sizeof(int32_t), &retlen, - (u_char *)part->bam_cache); + ret = mtd_write(part->mbd.mtd, + xfer->Offset + le32_to_cpu(part->header.BAMOffset), + part->BlocksPerUnit * sizeof(int32_t), + &retlen, + (u_char *)part->bam_cache); if (ret) { printk( KERN_WARNING "ftl: Error writing BAM in copy_erase_unit\n"); return ret; @@ -560,8 +562,8 @@ static int copy_erase_unit(partition_t *part, uint16_t srcunit, /* All clear? Then update the LogicalEUN again */ - ret = part->mbd.mtd->write(part->mbd.mtd, xfer->Offset + 20, sizeof(uint16_t), - &retlen, (u_char *)&srcunitswap); + ret = mtd_write(part->mbd.mtd, xfer->Offset + 20, sizeof(uint16_t), + &retlen, (u_char *)&srcunitswap); if (ret) { printk(KERN_WARNING "ftl: Error writing new LogicalEUN in copy_erase_unit\n"); @@ -887,8 +889,8 @@ static int set_bam_entry(partition_t *part, uint32_t log_addr, #endif part->bam_cache[blk] = le_virt_addr; } - ret = part->mbd.mtd->write(part->mbd.mtd, offset, sizeof(uint32_t), - &retlen, (u_char *)&le_virt_addr); + ret = mtd_write(part->mbd.mtd, offset, sizeof(uint32_t), &retlen, + (u_char *)&le_virt_addr); if (ret) { printk(KERN_NOTICE "ftl_cs: set_bam_entry() failed!\n"); @@ -947,8 +949,7 @@ static int ftl_write(partition_t *part, caddr_t buffer, part->EUNInfo[part->bam_index].Deleted++; offset = (part->EUNInfo[part->bam_index].Offset + blk * SECTOR_SIZE); - ret = part->mbd.mtd->write(part->mbd.mtd, offset, SECTOR_SIZE, &retlen, - buffer); + ret = mtd_write(part->mbd.mtd, offset, SECTOR_SIZE, &retlen, buffer); if (ret) { printk(KERN_NOTICE "ftl_cs: block write failed!\n"); diff --git a/drivers/mtd/mtdblock.c b/drivers/mtd/mtdblock.c index b0644d2d2a6..ac7f1f1faa2 100644 --- a/drivers/mtd/mtdblock.c +++ b/drivers/mtd/mtdblock.c @@ -102,7 +102,7 @@ static int erase_write (struct mtd_info *mtd, unsigned long pos, * Next, write the data to flash. 
*/ - ret = mtd->write(mtd, pos, len, &retlen, buf); + ret = mtd_write(mtd, pos, len, &retlen, buf); if (ret) return ret; if (retlen != len) @@ -152,7 +152,7 @@ static int do_cached_write (struct mtdblk_dev *mtdblk, unsigned long pos, mtd->name, pos, len); if (!sect_size) - return mtd->write(mtd, pos, len, &retlen, buf); + return mtd_write(mtd, pos, len, &retlen, buf); while (len > 0) { unsigned long sect_start = (pos/sect_size)*sect_size; diff --git a/drivers/mtd/mtdblock_ro.c b/drivers/mtd/mtdblock_ro.c index f5737b1153f..92759a9d298 100644 --- a/drivers/mtd/mtdblock_ro.c +++ b/drivers/mtd/mtdblock_ro.c @@ -40,7 +40,7 @@ static int mtdblock_writesect(struct mtd_blktrans_dev *dev, { size_t retlen; - if (dev->mtd->write(dev->mtd, (block * 512), 512, &retlen, buf)) + if (mtd_write(dev->mtd, (block * 512), 512, &retlen, buf)) return 1; return 0; } diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c index c7f484687fa..922da31d2c6 100644 --- a/drivers/mtd/mtdchar.c +++ b/drivers/mtd/mtdchar.c @@ -331,7 +331,7 @@ static ssize_t mtdchar_write(struct file *file, const char __user *buf, size_t c } default: - ret = (*(mtd->write))(mtd, *ppos, len, &retlen, kbuf); + ret = mtd_write(mtd, *ppos, len, &retlen, kbuf); } if (!ret) { *ppos += retlen; diff --git a/drivers/mtd/mtdconcat.c b/drivers/mtd/mtdconcat.c index 45460349fd1..45215501c4c 100644 --- a/drivers/mtd/mtdconcat.c +++ b/drivers/mtd/mtdconcat.c @@ -148,7 +148,7 @@ concat_write(struct mtd_info *mtd, loff_t to, size_t len, if (!(subdev->flags & MTD_WRITEABLE)) err = -EROFS; else - err = subdev->write(subdev, to, size, &retsize, buf); + err = mtd_write(subdev, to, size, &retsize, buf); if (err) break; diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c index b01993ea260..e36191ab47c 100644 --- a/drivers/mtd/mtdcore.c +++ b/drivers/mtd/mtdcore.c @@ -699,7 +699,8 @@ int default_mtd_writev(struct mtd_info *mtd, const struct kvec *vecs, for (i=0; i<count; i++) { if (!vecs[i].iov_len) continue; - ret = mtd->write(mtd, to, vecs[i].iov_len, &thislen, vecs[i].iov_base); + ret = mtd_write(mtd, to, vecs[i].iov_len, &thislen, + vecs[i].iov_base); totlen += thislen; if (ret || thislen != vecs[i].iov_len) break; diff --git a/drivers/mtd/mtdoops.c b/drivers/mtd/mtdoops.c index 23629ad0850..9c9d58617c9 100644 --- a/drivers/mtd/mtdoops.c +++ b/drivers/mtd/mtdoops.c @@ -225,8 +225,8 @@ static void mtdoops_write(struct mtdoops_context *cxt, int panic) ret = mtd->panic_write(mtd, cxt->nextpage * record_size, record_size, &retlen, cxt->oops_buf); else - ret = mtd->write(mtd, cxt->nextpage * record_size, - record_size, &retlen, cxt->oops_buf); + ret = mtd_write(mtd, cxt->nextpage * record_size, + record_size, &retlen, cxt->oops_buf); if (retlen != record_size || ret < 0) printk(KERN_ERR "mtdoops: write failure at %ld (%td of %ld written), error %d\n", diff --git a/drivers/mtd/mtdpart.c b/drivers/mtd/mtdpart.c index 59cd7974bc5..96574a03656 100644 --- a/drivers/mtd/mtdpart.c +++ b/drivers/mtd/mtdpart.c @@ -188,8 +188,7 @@ static int part_write(struct mtd_info *mtd, loff_t to, size_t len, len = 0; else if (to + len > mtd->size) len = mtd->size - to; - return part->master->write(part->master, to + part->offset, - len, retlen, buf); + return mtd_write(part->master, to + part->offset, len, retlen, buf); } static int part_panic_write(struct mtd_info *mtd, loff_t to, size_t len, diff --git a/drivers/mtd/mtdswap.c b/drivers/mtd/mtdswap.c index b3282d2aa8f..6ff823e29c0 100644 --- a/drivers/mtd/mtdswap.c +++ b/drivers/mtd/mtdswap.c @@ -689,7 +689,7 @@ retry: return ret; writepos = (loff_t)*bp << PAGE_SHIFT; - ret =
mtd->write(mtd, writepos, PAGE_SIZE, &retlen, buf); + ret = mtd_write(mtd, writepos, PAGE_SIZE, &retlen, buf); if (ret == -EIO || mtd_is_eccerr(ret)) { d->curr_write_pos--; eb->active_count--; diff --git a/drivers/mtd/rfd_ftl.c b/drivers/mtd/rfd_ftl.c index d9fe2d0533d..c594bb7abfa 100644 --- a/drivers/mtd/rfd_ftl.c +++ b/drivers/mtd/rfd_ftl.c @@ -304,9 +304,8 @@ static void erase_callback(struct erase_info *erase) part->blocks[i].used_sectors = 0; part->blocks[i].erases++; - rc = part->mbd.mtd->write(part->mbd.mtd, - part->blocks[i].offset, sizeof(magic), &retlen, - (u_char*)&magic); + rc = mtd_write(part->mbd.mtd, part->blocks[i].offset, sizeof(magic), + &retlen, (u_char *)&magic); if (!rc && retlen != sizeof(magic)) rc = -EIO; @@ -595,8 +594,8 @@ static int mark_sector_deleted(struct partition *part, u_long old_addr) addr = part->blocks[block].offset + (HEADER_MAP_OFFSET + offset) * sizeof(u16); - rc = part->mbd.mtd->write(part->mbd.mtd, addr, - sizeof(del), &retlen, (u_char*)&del); + rc = mtd_write(part->mbd.mtd, addr, sizeof(del), &retlen, + (u_char *)&del); if (!rc && retlen != sizeof(del)) rc = -EIO; @@ -668,8 +667,8 @@ static int do_writesect(struct mtd_blktrans_dev *dev, u_long sector, char *buf, addr = (i + part->header_sectors_per_block) * SECTOR_SIZE + block->offset; - rc = part->mbd.mtd->write(part->mbd.mtd, - addr, SECTOR_SIZE, &retlen, (u_char*)buf); + rc = mtd_write(part->mbd.mtd, addr, SECTOR_SIZE, &retlen, + (u_char *)buf); if (!rc && retlen != SECTOR_SIZE) rc = -EIO; @@ -688,8 +687,8 @@ static int do_writesect(struct mtd_blktrans_dev *dev, u_long sector, char *buf, part->header_cache[i + HEADER_MAP_OFFSET] = entry; addr = block->offset + (HEADER_MAP_OFFSET + i) * sizeof(u16); - rc = part->mbd.mtd->write(part->mbd.mtd, addr, - sizeof(entry), &retlen, (u_char*)&entry); + rc = mtd_write(part->mbd.mtd, addr, sizeof(entry), &retlen, + (u_char *)&entry); if (!rc && retlen != sizeof(entry)) rc = -EIO; diff --git a/drivers/mtd/tests/mtd_pagetest.c b/drivers/mtd/tests/mtd_pagetest.c index 6d62e24a03e..83da97e54f9 100644 --- a/drivers/mtd/tests/mtd_pagetest.c +++ b/drivers/mtd/tests/mtd_pagetest.c @@ -100,7 +100,7 @@ static int write_eraseblock(int ebnum) set_random_data(writebuf, mtd->erasesize); cond_resched(); - err = mtd->write(mtd, addr, mtd->erasesize, &written, writebuf); + err = mtd_write(mtd, addr, mtd->erasesize, &written, writebuf); if (err || written != mtd->erasesize) printk(PRINT_PREF "error: write failed at %#llx\n", (long long)addr); @@ -335,7 +335,7 @@ static int erasecrosstest(void) printk(PRINT_PREF "writing 1st page of block %d\n", ebnum); set_random_data(writebuf, pgsize); strcpy(writebuf, "There is no data like this!"); - err = mtd->write(mtd, addr0, pgsize, &written, writebuf); + err = mtd_write(mtd, addr0, pgsize, &written, writebuf); if (err || written != pgsize) { printk(PRINT_PREF "error: write failed at %#llx\n", (long long)addr0); @@ -368,7 +368,7 @@ static int erasecrosstest(void) printk(PRINT_PREF "writing 1st page of block %d\n", ebnum); set_random_data(writebuf, pgsize); strcpy(writebuf, "There is no data like this!"); - err = mtd->write(mtd, addr0, pgsize, &written, writebuf); + err = mtd_write(mtd, addr0, pgsize, &written, writebuf); if (err || written != pgsize) { printk(PRINT_PREF "error: write failed at %#llx\n", (long long)addr0); @@ -425,7 +425,7 @@ static int erasetest(void) printk(PRINT_PREF "writing 1st page of block %d\n", ebnum); set_random_data(writebuf, pgsize); - err = mtd->write(mtd, addr0, pgsize, &written, writebuf); + err = 
mtd_write(mtd, addr0, pgsize, &written, writebuf); if (err || written != pgsize) { printk(PRINT_PREF "error: write failed at %#llx\n", (long long)addr0); diff --git a/drivers/mtd/tests/mtd_speedtest.c b/drivers/mtd/tests/mtd_speedtest.c index 3c9529bd0a6..c7b18e18908 100644 --- a/drivers/mtd/tests/mtd_speedtest.c +++ b/drivers/mtd/tests/mtd_speedtest.c @@ -143,7 +143,7 @@ static int write_eraseblock(int ebnum) int err = 0; loff_t addr = ebnum * mtd->erasesize; - err = mtd->write(mtd, addr, mtd->erasesize, &written, iobuf); + err = mtd_write(mtd, addr, mtd->erasesize, &written, iobuf); if (err || written != mtd->erasesize) { printk(PRINT_PREF "error: write failed at %#llx\n", addr); if (!err) @@ -161,7 +161,7 @@ static int write_eraseblock_by_page(int ebnum) void *buf = iobuf; for (i = 0; i < pgcnt; i++) { - err = mtd->write(mtd, addr, pgsize, &written, buf); + err = mtd_write(mtd, addr, pgsize, &written, buf); if (err || written != pgsize) { printk(PRINT_PREF "error: write failed at %#llx\n", addr); @@ -184,7 +184,7 @@ static int write_eraseblock_by_2pages(int ebnum) void *buf = iobuf; for (i = 0; i < n; i++) { - err = mtd->write(mtd, addr, sz, &written, buf); + err = mtd_write(mtd, addr, sz, &written, buf); if (err || written != sz) { printk(PRINT_PREF "error: write failed at %#llx\n", addr); @@ -196,7 +196,7 @@ static int write_eraseblock_by_2pages(int ebnum) buf += sz; } if (pgcnt % 2) { - err = mtd->write(mtd, addr, pgsize, &written, buf); + err = mtd_write(mtd, addr, pgsize, &written, buf); if (err || written != pgsize) { printk(PRINT_PREF "error: write failed at %#llx\n", addr); diff --git a/drivers/mtd/tests/mtd_stresstest.c b/drivers/mtd/tests/mtd_stresstest.c index 83a84372388..f8aac4b7e59 100644 --- a/drivers/mtd/tests/mtd_stresstest.c +++ b/drivers/mtd/tests/mtd_stresstest.c @@ -192,7 +192,7 @@ static int do_write(void) } } addr = eb * mtd->erasesize + offs; - err = mtd->write(mtd, addr, len, &written, writebuf); + err = mtd_write(mtd, addr, len, &written, writebuf); if (unlikely(err || written != len)) { printk(PRINT_PREF "error: write failed at 0x%llx\n", (long long)addr); diff --git a/drivers/mtd/tests/mtd_subpagetest.c b/drivers/mtd/tests/mtd_subpagetest.c index d81f89a19da..b90c01036b4 100644 --- a/drivers/mtd/tests/mtd_subpagetest.c +++ b/drivers/mtd/tests/mtd_subpagetest.c @@ -120,7 +120,7 @@ static int write_eraseblock(int ebnum) loff_t addr = ebnum * mtd->erasesize; set_random_data(writebuf, subpgsize); - err = mtd->write(mtd, addr, subpgsize, &written, writebuf); + err = mtd_write(mtd, addr, subpgsize, &written, writebuf); if (unlikely(err || written != subpgsize)) { printk(PRINT_PREF "error: write failed at %#llx\n", (long long)addr); @@ -134,7 +134,7 @@ static int write_eraseblock(int ebnum) addr += subpgsize; set_random_data(writebuf, subpgsize); - err = mtd->write(mtd, addr, subpgsize, &written, writebuf); + err = mtd_write(mtd, addr, subpgsize, &written, writebuf); if (unlikely(err || written != subpgsize)) { printk(PRINT_PREF "error: write failed at %#llx\n", (long long)addr); @@ -158,7 +158,7 @@ static int write_eraseblock2(int ebnum) if (addr + (subpgsize * k) > (ebnum + 1) * mtd->erasesize) break; set_random_data(writebuf, subpgsize * k); - err = mtd->write(mtd, addr, subpgsize * k, &written, writebuf); + err = mtd_write(mtd, addr, subpgsize * k, &written, writebuf); if (unlikely(err || written != subpgsize * k)) { printk(PRINT_PREF "error: write failed at %#llx\n", (long long)addr); diff --git a/drivers/mtd/tests/mtd_torturetest.c 
b/drivers/mtd/tests/mtd_torturetest.c index ecc68bf3f3f..dd34a519fa7 100644 --- a/drivers/mtd/tests/mtd_torturetest.c +++ b/drivers/mtd/tests/mtd_torturetest.c @@ -189,7 +189,7 @@ static inline int write_pattern(int ebnum, void *buf) addr = (ebnum + 1) * mtd->erasesize - pgcnt * pgsize; len = pgcnt * pgsize; } - err = mtd->write(mtd, addr, len, &written, buf); + err = mtd_write(mtd, addr, len, &written, buf); if (err) { printk(PRINT_PREF "error %d while writing EB %d, written %zd" " bytes\n", err, ebnum, written); diff --git a/drivers/mtd/ubi/io.c b/drivers/mtd/ubi/io.c index 433382951d3..8d832fc9e9e 100644 --- a/drivers/mtd/ubi/io.c +++ b/drivers/mtd/ubi/io.c @@ -289,7 +289,7 @@ int ubi_io_write(struct ubi_device *ubi, const void *buf, int pnum, int offset, } addr = (loff_t)pnum * ubi->peb_size + offset; - err = ubi->mtd->write(ubi->mtd, addr, len, &written, buf); + err = mtd_write(ubi->mtd, addr, len, &written, buf); if (err) { ubi_err("error %d while writing %d bytes to PEB %d:%d, written " "%zd bytes", err, len, pnum, offset, written); @@ -525,11 +525,10 @@ static int nor_erase_prepare(struct ubi_device *ubi, int pnum) * the header comment in scan.c for more information). */ addr = (loff_t)pnum * ubi->peb_size; - err = ubi->mtd->write(ubi->mtd, addr, 4, &written, (void *)&data); + err = mtd_write(ubi->mtd, addr, 4, &written, (void *)&data); if (!err) { addr += ubi->vid_hdr_aloffset; - err = ubi->mtd->write(ubi->mtd, addr, 4, &written, - (void *)&data); + err = mtd_write(ubi->mtd, addr, 4, &written, (void *)&data); if (!err) return 0; } diff --git a/drivers/staging/spectra/lld_mtd.c b/drivers/staging/spectra/lld_mtd.c index eccd08d0e00..2eb03213196 100644 --- a/drivers/staging/spectra/lld_mtd.c +++ b/drivers/staging/spectra/lld_mtd.c @@ -233,9 +233,11 @@ u16 mtd_Write_Page_Main(u8 *write_data, u32 Block, while (PageCount) { - ret = spectra_mtd->write(spectra_mtd, - (Block * spectra_mtd->erasesize) + (Page * spectra_mtd->writesize), - DeviceInfo.wPageDataSize, &retlen, write_data); + ret = mtd_write(spectra_mtd, + (Block * spectra_mtd->erasesize) + (Page * spectra_mtd->writesize), + DeviceInfo.wPageDataSize, + &retlen, + write_data); if (ret) { printk(KERN_ERR "%s failed %d\n", __func__, ret); return FAIL; diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c index a24d3d21b63..3ea2f8db935 100644 --- a/fs/jffs2/wbuf.c +++ b/fs/jffs2/wbuf.c @@ -414,13 +414,12 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c) if (breakme++ == 20) { printk(KERN_NOTICE "Faking write error at 0x%08x\n", ofs); breakme = 0; - c->mtd->write(c->mtd, ofs, towrite, &retlen, - brokenbuf); + mtd_write(c->mtd, ofs, towrite, &retlen, brokenbuf); ret = -EIO; } else #endif - ret = c->mtd->write(c->mtd, ofs, towrite, &retlen, - rewrite_buf); + ret = mtd_write(c->mtd, ofs, towrite, &retlen, + rewrite_buf); if (ret || retlen != towrite || jffs2_verify_write(c, rewrite_buf, ofs)) { /* Argh. We tried. Really we did. 
*/ @@ -620,13 +619,14 @@ static int __jffs2_flush_wbuf(struct jffs2_sb_info *c, int pad) if (breakme++ == 20) { printk(KERN_NOTICE "Faking write error at 0x%08x\n", c->wbuf_ofs); breakme = 0; - c->mtd->write(c->mtd, c->wbuf_ofs, c->wbuf_pagesize, &retlen, - brokenbuf); + mtd_write(c->mtd, c->wbuf_ofs, c->wbuf_pagesize, &retlen, + brokenbuf); ret = -EIO; } else #endif - ret = c->mtd->write(c->mtd, c->wbuf_ofs, c->wbuf_pagesize, &retlen, c->wbuf); + ret = mtd_write(c->mtd, c->wbuf_ofs, c->wbuf_pagesize, + &retlen, c->wbuf); if (ret) { printk(KERN_WARNING "jffs2_flush_wbuf(): Write failed with %d\n", ret); @@ -862,8 +862,8 @@ int jffs2_flash_writev(struct jffs2_sb_info *c, const struct kvec *invecs, v += wbuf_retlen; if (vlen >= c->wbuf_pagesize) { - ret = c->mtd->write(c->mtd, outvec_to, PAGE_DIV(vlen), - &wbuf_retlen, v); + ret = mtd_write(c->mtd, outvec_to, PAGE_DIV(vlen), + &wbuf_retlen, v); if (ret < 0 || wbuf_retlen != PAGE_DIV(vlen)) goto outfile; diff --git a/fs/jffs2/writev.c b/fs/jffs2/writev.c index b9276b11bac..b05710fd552 100644 --- a/fs/jffs2/writev.c +++ b/fs/jffs2/writev.c @@ -26,7 +26,8 @@ static inline int mtd_fake_writev(struct mtd_info *mtd, const struct kvec *vecs, for (i=0; i<count; i++) { if (!vecs[i].iov_len) continue; - ret = mtd->write(mtd, to, vecs[i].iov_len, &thislen, vecs[i].iov_base); + ret = mtd_write(mtd, to, vecs[i].iov_len, &thislen, + vecs[i].iov_base); totlen += thislen; if (ret || thislen != vecs[i].iov_len) break; @@ -61,7 +62,7 @@ int jffs2_flash_direct_write(struct jffs2_sb_info *c, loff_t ofs, size_t len, size_t *retlen, const u_char *buf) { int ret; - ret = c->mtd->write(c->mtd, ofs, len, retlen, buf); + ret = mtd_write(c->mtd, ofs, len, retlen, buf); if (jffs2_sum_active()) { struct kvec vecs[1]; diff --git a/fs/logfs/dev_mtd.c b/fs/logfs/dev_mtd.c index 3ee64351685..1842440d656 100644 --- a/fs/logfs/dev_mtd.c +++ b/fs/logfs/dev_mtd.c @@ -49,7 +49,7 @@ static int logfs_mtd_write(struct super_block *sb, loff_t ofs, size_t len, BUG_ON(len > PAGE_CACHE_SIZE); page_start = ofs & PAGE_CACHE_MASK; page_end = PAGE_CACHE_ALIGN(ofs + len) - 1; - ret = mtd->write(mtd, ofs, len, &retlen, buf); + ret = mtd_write(mtd, ofs, len, &retlen, buf); if (ret || (retlen != len)) return -EIO; diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h index 56478eb4bbc..1da7f4a6ef8 100644 --- a/include/linux/mtd/mtd.h +++ b/include/linux/mtd/mtd.h @@ -184,6 +184,8 @@ struct mtd_info { unsigned long flags); int (*read) (struct mtd_info *mtd, loff_t from, size_t len, size_t *retlen, u_char *buf); + int (*write) (struct mtd_info *mtd, loff_t to, size_t len, + size_t *retlen, const u_char *buf); /* Backing device capabilities for this device * - provides mmap capabilities @@ -191,7 +193,6 @@ struct mtd_info { struct backing_dev_info *backing_dev_info; - int (*write) (struct mtd_info *mtd, loff_t to, size_t len, size_t *retlen, const u_char *buf); /* In blackbox flight recorder like scenarios we want to make successful writes in interrupt context. panic_write() is only intended to be @@ -308,6 +309,12 @@ static inline int mtd_read(struct mtd_info *mtd, loff_t from, size_t len, size_t *retlen, u_char *buf) { return mtd->read(mtd, from, len, retlen, buf); } +static inline int mtd_write(struct mtd_info *mtd, loff_t to, size_t len, + size_t *retlen, const u_char *buf) +{ + return mtd->write(mtd, to, len, retlen, buf); +} + static inline struct mtd_info *dev_to_mtd(struct device *dev) { return dev ?
dev_get_drvdata(dev) : NULL; -- cgit v1.2.3 From fd2819bbc92fc98bed5d612e4acbe16b6326f6bf Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Fri, 23 Dec 2011 18:27:05 +0200 Subject: mtd: introduce mtd_read_oob interface Signed-off-by: Artem Bityutskiy Signed-off-by: David Woodhouse --- drivers/mtd/inftlcore.c | 2 +- drivers/mtd/mtdchar.c | 4 ++-- drivers/mtd/mtdconcat.c | 2 +- drivers/mtd/mtdpart.c | 2 +- drivers/mtd/mtdswap.c | 4 ++-- drivers/mtd/nand/nand_bbt.c | 6 +++--- drivers/mtd/nftlcore.c | 2 +- drivers/mtd/sm_ftl.c | 2 +- drivers/mtd/ssfdc.c | 2 +- drivers/mtd/tests/mtd_oobtest.c | 14 +++++++------- drivers/mtd/tests/mtd_readtest.c | 2 +- drivers/staging/spectra/lld_mtd.c | 12 ++++++------ fs/jffs2/wbuf.c | 4 ++-- include/linux/mtd/mtd.h | 10 ++++++++-- 14 files changed, 37 insertions(+), 31 deletions(-) (limited to 'fs') diff --git a/drivers/mtd/inftlcore.c b/drivers/mtd/inftlcore.c index 0b038bed7b9..07646e1273e 100644 --- a/drivers/mtd/inftlcore.c +++ b/drivers/mtd/inftlcore.c @@ -158,7 +158,7 @@ int inftl_read_oob(struct mtd_info *mtd, loff_t offs, size_t len, ops.oobbuf = buf; ops.datbuf = NULL; - res = mtd->read_oob(mtd, offs & ~(mtd->writesize - 1), &ops); + res = mtd_read_oob(mtd, offs & ~(mtd->writesize - 1), &ops); *retlen = ops.oobretlen; return res; } diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c index 922da31d2c6..e74f570a7b9 100644 --- a/drivers/mtd/mtdchar.c +++ b/drivers/mtd/mtdchar.c @@ -227,7 +227,7 @@ static ssize_t mtdchar_read(struct file *file, char __user *buf, size_t count, ops.oobbuf = NULL; ops.len = len; - ret = mtd->read_oob(mtd, *ppos, &ops); + ret = mtd_read_oob(mtd, *ppos, &ops); retlen = ops.retlen; break; } @@ -471,7 +471,7 @@ static int mtdchar_readoob(struct file *file, struct mtd_info *mtd, return -ENOMEM; start &= ~((uint64_t)mtd->writesize - 1); - ret = mtd->read_oob(mtd, start, &ops); + ret = mtd_read_oob(mtd, start, &ops); if (put_user(ops.oobretlen, retp)) ret = -EFAULT; diff --git a/drivers/mtd/mtdconcat.c b/drivers/mtd/mtdconcat.c index 45215501c4c..cf35642e5f4 100644 --- a/drivers/mtd/mtdconcat.c +++ b/drivers/mtd/mtdconcat.c @@ -273,7 +273,7 @@ concat_read_oob(struct mtd_info *mtd, loff_t from, struct mtd_oob_ops *ops) if (from + devops.len > subdev->size) devops.len = subdev->size - from; - err = subdev->read_oob(subdev, from, &devops); + err = mtd_read_oob(subdev, from, &devops); ops->retlen += devops.retlen; ops->oobretlen += devops.oobretlen; diff --git a/drivers/mtd/mtdpart.c b/drivers/mtd/mtdpart.c index 9ed58f7d746..6fdc74ef19c 100644 --- a/drivers/mtd/mtdpart.c +++ b/drivers/mtd/mtdpart.c @@ -138,7 +138,7 @@ static int part_read_oob(struct mtd_info *mtd, loff_t from, return -EINVAL; } - res = part->master->read_oob(part->master, from + part->offset, ops); + res = mtd_read_oob(part->master, from + part->offset, ops); if (unlikely(res)) { if (mtd_is_bitflip(res)) mtd->ecc_stats.corrected++; diff --git a/drivers/mtd/mtdswap.c b/drivers/mtd/mtdswap.c index 6ff823e29c0..0f0ab18d440 100644 --- a/drivers/mtd/mtdswap.c +++ b/drivers/mtd/mtdswap.c @@ -312,7 +312,7 @@ static int mtdswap_handle_write_error(struct mtdswap_dev *d, struct swap_eb *eb) static int mtdswap_read_oob(struct mtdswap_dev *d, loff_t from, struct mtd_oob_ops *ops) { - int ret = d->mtd->read_oob(d->mtd, from, ops); + int ret = mtd_read_oob(d->mtd, from, ops); if (mtd_is_bitflip(ret)) return ret; @@ -955,7 +955,7 @@ static unsigned int mtdswap_eblk_passes(struct mtdswap_dev *d, pos = base; for (i = 0; i < mtd_pages; i++) { - ret = mtd->read_oob(mtd, pos, 
&ops); + ret = mtd_read_oob(mtd, pos, &ops); if (ret) goto error; diff --git a/drivers/mtd/nand/nand_bbt.c b/drivers/mtd/nand/nand_bbt.c index 1bcd6bc6798..fcab50e80b9 100644 --- a/drivers/mtd/nand/nand_bbt.c +++ b/drivers/mtd/nand/nand_bbt.c @@ -317,7 +317,7 @@ static int scan_read_raw_oob(struct mtd_info *mtd, uint8_t *buf, loff_t offs, ops.len = min(len, (size_t)mtd->writesize); ops.oobbuf = buf + ops.len; - res = mtd->read_oob(mtd, offs, &ops); + res = mtd_read_oob(mtd, offs, &ops); if (res) return res; @@ -434,7 +434,7 @@ static int scan_block_fast(struct mtd_info *mtd, struct nand_bbt_descr *bd, * Read the full oob until read_oob is fixed to handle single * byte reads for 16 bit buswidth. */ - ret = mtd->read_oob(mtd, offs, &ops); + ret = mtd_read_oob(mtd, offs, &ops); /* Ignore ECC errors when checking for BBM */ if (ret && !mtd_is_bitflip_or_eccerr(ret)) return ret; @@ -769,7 +769,7 @@ static int write_bbt(struct mtd_info *mtd, uint8_t *buf, /* Read oob data */ ops.ooblen = (len >> this->page_shift) * mtd->oobsize; ops.oobbuf = &buf[len]; - res = mtd->read_oob(mtd, to + mtd->writesize, &ops); + res = mtd_read_oob(mtd, to + mtd->writesize, &ops); if (res < 0 || ops.oobretlen != ops.ooblen) goto outerr; diff --git a/drivers/mtd/nftlcore.c b/drivers/mtd/nftlcore.c index 1a9d9c1d3a7..7497f5efc26 100644 --- a/drivers/mtd/nftlcore.c +++ b/drivers/mtd/nftlcore.c @@ -153,7 +153,7 @@ int nftl_read_oob(struct mtd_info *mtd, loff_t offs, size_t len, ops.oobbuf = buf; ops.datbuf = NULL; - res = mtd->read_oob(mtd, offs & ~mask, &ops); + res = mtd_read_oob(mtd, offs & ~mask, &ops); *retlen = ops.oobretlen; return res; } diff --git a/drivers/mtd/sm_ftl.c b/drivers/mtd/sm_ftl.c index 2f1acb1ab5e..748aa441669 100644 --- a/drivers/mtd/sm_ftl.c +++ b/drivers/mtd/sm_ftl.c @@ -278,7 +278,7 @@ again: /* Unfortunately, oob read will _always_ succeed, despite card removal..... 
*/ - ret = mtd->read_oob(mtd, sm_mkoffset(ftl, zone, block, boffset), &ops); + ret = mtd_read_oob(mtd, sm_mkoffset(ftl, zone, block, boffset), &ops); /* Test for unknown errors */ if (ret != 0 && !mtd_is_bitflip_or_eccerr(ret)) { diff --git a/drivers/mtd/ssfdc.c b/drivers/mtd/ssfdc.c index 293e22a5710..0e688133835 100644 --- a/drivers/mtd/ssfdc.c +++ b/drivers/mtd/ssfdc.c @@ -175,7 +175,7 @@ static int read_raw_oob(struct mtd_info *mtd, loff_t offs, uint8_t *buf) ops.oobbuf = buf; ops.datbuf = NULL; - ret = mtd->read_oob(mtd, offs, &ops); + ret = mtd_read_oob(mtd, offs, &ops); if (ret < 0 || ops.oobretlen != OOB_SIZE) return -1; diff --git a/drivers/mtd/tests/mtd_oobtest.c b/drivers/mtd/tests/mtd_oobtest.c index 7d52854c16d..962d27a64e6 100644 --- a/drivers/mtd/tests/mtd_oobtest.c +++ b/drivers/mtd/tests/mtd_oobtest.c @@ -192,7 +192,7 @@ static int verify_eraseblock(int ebnum) ops.ooboffs = use_offset; ops.datbuf = NULL; ops.oobbuf = readbuf; - err = mtd->read_oob(mtd, addr, &ops); + err = mtd_read_oob(mtd, addr, &ops); if (err || ops.oobretlen != use_len) { printk(PRINT_PREF "error: readoob failed at %#llx\n", (long long)addr); @@ -219,7 +219,7 @@ static int verify_eraseblock(int ebnum) ops.ooboffs = 0; ops.datbuf = NULL; ops.oobbuf = readbuf; - err = mtd->read_oob(mtd, addr, &ops); + err = mtd_read_oob(mtd, addr, &ops); if (err || ops.oobretlen != mtd->ecclayout->oobavail) { printk(PRINT_PREF "error: readoob failed at " "%#llx\n", (long long)addr); @@ -284,7 +284,7 @@ static int verify_eraseblock_in_one_go(int ebnum) ops.ooboffs = 0; ops.datbuf = NULL; ops.oobbuf = readbuf; - err = mtd->read_oob(mtd, addr, &ops); + err = mtd_read_oob(mtd, addr, &ops); if (err || ops.oobretlen != len) { printk(PRINT_PREF "error: readoob failed at %#llx\n", (long long)addr); @@ -544,7 +544,7 @@ static int __init mtd_oobtest_init(void) ops.oobbuf = readbuf; printk(PRINT_PREF "attempting to start read past end of OOB\n"); printk(PRINT_PREF "an error is expected...\n"); - err = mtd->read_oob(mtd, addr0, &ops); + err = mtd_read_oob(mtd, addr0, &ops); if (err) { printk(PRINT_PREF "error occurred as expected\n"); err = 0; @@ -588,7 +588,7 @@ static int __init mtd_oobtest_init(void) ops.oobbuf = readbuf; printk(PRINT_PREF "attempting to read past end of device\n"); printk(PRINT_PREF "an error is expected...\n"); - err = mtd->read_oob(mtd, mtd->size - mtd->writesize, &ops); + err = mtd_read_oob(mtd, mtd->size - mtd->writesize, &ops); if (err) { printk(PRINT_PREF "error occurred as expected\n"); err = 0; @@ -632,7 +632,7 @@ static int __init mtd_oobtest_init(void) ops.oobbuf = readbuf; printk(PRINT_PREF "attempting to read past end of device\n"); printk(PRINT_PREF "an error is expected...\n"); - err = mtd->read_oob(mtd, mtd->size - mtd->writesize, &ops); + err = mtd_read_oob(mtd, mtd->size - mtd->writesize, &ops); if (err) { printk(PRINT_PREF "error occurred as expected\n"); err = 0; @@ -698,7 +698,7 @@ static int __init mtd_oobtest_init(void) ops.ooboffs = 0; ops.datbuf = NULL; ops.oobbuf = readbuf; - err = mtd->read_oob(mtd, addr, &ops); + err = mtd_read_oob(mtd, addr, &ops); if (err) goto out; if (memcmp(readbuf, writebuf, mtd->ecclayout->oobavail * 2)) { diff --git a/drivers/mtd/tests/mtd_readtest.c b/drivers/mtd/tests/mtd_readtest.c index 0c58d2976c7..5eaeada8428 100644 --- a/drivers/mtd/tests/mtd_readtest.c +++ b/drivers/mtd/tests/mtd_readtest.c @@ -74,7 +74,7 @@ static int read_eraseblock_by_page(int ebnum) ops.ooboffs = 0; ops.datbuf = NULL; ops.oobbuf = oobbuf; - ret = mtd->read_oob(mtd, addr, &ops); + ret 
= mtd_read_oob(mtd, addr, &ops); if ((ret && !mtd_is_bitflip(ret)) || ops.oobretlen != mtd->oobsize) { printk(PRINT_PREF "error: read oob failed at " diff --git a/drivers/staging/spectra/lld_mtd.c b/drivers/staging/spectra/lld_mtd.c index 2eb03213196..ed8e5f06708 100644 --- a/drivers/staging/spectra/lld_mtd.c +++ b/drivers/staging/spectra/lld_mtd.c @@ -351,9 +351,9 @@ u16 mtd_Read_Page_Main_Spare(u8 *read_data, u32 Block, ops.ooblen = BTSIG_BYTES; ops.ooboffs = 0; - ret = spectra_mtd->read_oob(spectra_mtd, - (Block * spectra_mtd->erasesize) + (Page * spectra_mtd->writesize), - &ops); + ret = mtd_read_oob(spectra_mtd, + (Block * spectra_mtd->erasesize) + (Page * spectra_mtd->writesize), + &ops); if (ret) { printk(KERN_ERR "%s failed %d\n", __func__, ret); return FAIL; @@ -484,9 +484,9 @@ u16 mtd_Read_Page_Spare(u8 *read_data, u32 Block, ops.ooblen = BTSIG_BYTES; ops.ooboffs = 0; - ret = spectra_mtd->read_oob(spectra_mtd, - (Block * spectra_mtd->erasesize) + (Page * spectra_mtd->writesize), - &ops); + ret = mtd_read_oob(spectra_mtd, + (Block * spectra_mtd->erasesize) + (Page * spectra_mtd->writesize), + &ops); if (ret) { printk(KERN_ERR "%s failed %d\n", __func__, ret); return FAIL; diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c index 3ea2f8db935..efc0cb37030 100644 --- a/fs/jffs2/wbuf.c +++ b/fs/jffs2/wbuf.c @@ -1032,7 +1032,7 @@ int jffs2_check_oob_empty(struct jffs2_sb_info *c, ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0; ops.datbuf = NULL; - ret = c->mtd->read_oob(c->mtd, jeb->offset, &ops); + ret = mtd_read_oob(c->mtd, jeb->offset, &ops); if (ret || ops.oobretlen != ops.ooblen) { printk(KERN_ERR "cannot read OOB for EB at %08x, requested %zd" " bytes, read %zd bytes, error %d\n", @@ -1075,7 +1075,7 @@ int jffs2_check_nand_cleanmarker(struct jffs2_sb_info *c, ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0; ops.datbuf = NULL; - ret = c->mtd->read_oob(c->mtd, jeb->offset, &ops); + ret = mtd_read_oob(c->mtd, jeb->offset, &ops); if (ret || ops.oobretlen != ops.ooblen) { printk(KERN_ERR "cannot read OOB for EB at %08x, requested %zd" " bytes, read %zd bytes, error %d\n", diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h index 2fb83cd3d26..0db8d87ce45 100644 --- a/include/linux/mtd/mtd.h +++ b/include/linux/mtd/mtd.h @@ -188,6 +188,8 @@ struct mtd_info { size_t *retlen, const u_char *buf); int (*panic_write) (struct mtd_info *mtd, loff_t to, size_t len, size_t *retlen, const u_char *buf); + int (*read_oob) (struct mtd_info *mtd, loff_t from, + struct mtd_oob_ops *ops); /* Backing device capabilities for this device * - provides mmap capabilities @@ -195,8 +197,6 @@ struct mtd_info { struct backing_dev_info *backing_dev_info; - int (*read_oob) (struct mtd_info *mtd, loff_t from, - struct mtd_oob_ops *ops); int (*write_oob) (struct mtd_info *mtd, loff_t to, struct mtd_oob_ops *ops); @@ -320,6 +320,12 @@ static inline int mtd_panic_write(struct mtd_info *mtd, loff_t to, size_t len, return mtd->panic_write(mtd, to, len, retlen, buf); } +static inline int mtd_read_oob(struct mtd_info *mtd, loff_t from, + struct mtd_oob_ops *ops) +{ + return mtd->read_oob(mtd, from, ops); +} + static inline struct mtd_info *dev_to_mtd(struct device *dev) { return dev ? 
dev_get_drvdata(dev) : NULL; -- cgit v1.2.3 From a2cc5ba075f9bc837d0b4d4ec7328dcefc11859d Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Fri, 23 Dec 2011 18:29:55 +0200 Subject: mtd: introduce mtd_write_oob interface Signed-off-by: Artem Bityutskiy Signed-off-by: David Woodhouse --- drivers/mtd/inftlcore.c | 4 ++-- drivers/mtd/mtdchar.c | 6 +++--- drivers/mtd/mtdconcat.c | 2 +- drivers/mtd/mtdpart.c | 2 +- drivers/mtd/mtdswap.c | 4 ++-- drivers/mtd/nand/nand_bbt.c | 2 +- drivers/mtd/nand/sm_common.c | 2 +- drivers/mtd/nftlcore.c | 4 ++-- drivers/mtd/sm_ftl.c | 2 +- drivers/mtd/tests/mtd_oobtest.c | 10 +++++----- drivers/staging/spectra/lld_mtd.c | 6 +++--- fs/jffs2/wbuf.c | 2 +- include/linux/mtd/mtd.h | 12 ++++++++---- 13 files changed, 31 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/drivers/mtd/inftlcore.c b/drivers/mtd/inftlcore.c index 07646e1273e..28646c95cfb 100644 --- a/drivers/mtd/inftlcore.c +++ b/drivers/mtd/inftlcore.c @@ -178,7 +178,7 @@ int inftl_write_oob(struct mtd_info *mtd, loff_t offs, size_t len, ops.oobbuf = buf; ops.datbuf = NULL; - res = mtd->write_oob(mtd, offs & ~(mtd->writesize - 1), &ops); + res = mtd_write_oob(mtd, offs & ~(mtd->writesize - 1), &ops); *retlen = ops.oobretlen; return res; } @@ -199,7 +199,7 @@ static int inftl_write(struct mtd_info *mtd, loff_t offs, size_t len, ops.datbuf = buf; ops.len = len; - res = mtd->write_oob(mtd, offs & ~(mtd->writesize - 1), &ops); + res = mtd_write_oob(mtd, offs & ~(mtd->writesize - 1), &ops); *retlen = ops.retlen; return res; } diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c index e74f570a7b9..234e3d27143 100644 --- a/drivers/mtd/mtdchar.c +++ b/drivers/mtd/mtdchar.c @@ -325,7 +325,7 @@ static ssize_t mtdchar_write(struct file *file, const char __user *buf, size_t c ops.ooboffs = 0; ops.len = len; - ret = mtd->write_oob(mtd, *ppos, &ops); + ret = mtd_write_oob(mtd, *ppos, &ops); retlen = ops.retlen; break; } @@ -426,7 +426,7 @@ static int mtdchar_writeoob(struct file *file, struct mtd_info *mtd, return PTR_ERR(ops.oobbuf); start &= ~((uint64_t)mtd->writesize - 1); - ret = mtd->write_oob(mtd, start, &ops); + ret = mtd_write_oob(mtd, start, &ops); if (ops.oobretlen > 0xFFFFFFFFU) ret = -EOVERFLOW; @@ -609,7 +609,7 @@ static int mtdchar_write_ioctl(struct mtd_info *mtd, ops.oobbuf = NULL; } - ret = mtd->write_oob(mtd, (loff_t)req.start, &ops); + ret = mtd_write_oob(mtd, (loff_t)req.start, &ops); kfree(ops.datbuf); kfree(ops.oobbuf); diff --git a/drivers/mtd/mtdconcat.c b/drivers/mtd/mtdconcat.c index cf35642e5f4..3d9c1ffdbbb 100644 --- a/drivers/mtd/mtdconcat.c +++ b/drivers/mtd/mtdconcat.c @@ -333,7 +333,7 @@ concat_write_oob(struct mtd_info *mtd, loff_t to, struct mtd_oob_ops *ops) if (to + devops.len > subdev->size) devops.len = subdev->size - to; - err = subdev->write_oob(subdev, to, &devops); + err = mtd_write_oob(subdev, to, &devops); ops->retlen += devops.oobretlen; if (err) return err; diff --git a/drivers/mtd/mtdpart.c b/drivers/mtd/mtdpart.c index 6fdc74ef19c..8a46cd2bb78 100644 --- a/drivers/mtd/mtdpart.c +++ b/drivers/mtd/mtdpart.c @@ -217,7 +217,7 @@ static int part_write_oob(struct mtd_info *mtd, loff_t to, return -EINVAL; if (ops->datbuf && to + ops->len > mtd->size) return -EINVAL; - return part->master->write_oob(part->master, to + part->offset, ops); + return mtd_write_oob(part->master, to + part->offset, ops); } static int part_write_user_prot_reg(struct mtd_info *mtd, loff_t from, diff --git a/drivers/mtd/mtdswap.c b/drivers/mtd/mtdswap.c index 
0f0ab18d440..85797390e3d 100644 --- a/drivers/mtd/mtdswap.c +++ b/drivers/mtd/mtdswap.c @@ -403,7 +403,7 @@ static int mtdswap_write_marker(struct mtdswap_dev *d, struct swap_eb *eb, offset = mtdswap_eb_offset(d, eb) + d->mtd->writesize; } - ret = d->mtd->write_oob(d->mtd, offset , &ops); + ret = mtd_write_oob(d->mtd, offset, &ops); if (ret) { dev_warn(d->dev, "Write OOB failed for block at %08llx " @@ -946,7 +946,7 @@ static unsigned int mtdswap_eblk_passes(struct mtdswap_dev *d, patt = mtdswap_test_patt(test + i); memset(d->page_buf, patt, mtd->writesize); memset(d->oob_buf, patt, mtd->ecclayout->oobavail); - ret = mtd->write_oob(mtd, pos, &ops); + ret = mtd_write_oob(mtd, pos, &ops); if (ret) goto error; diff --git a/drivers/mtd/nand/nand_bbt.c b/drivers/mtd/nand/nand_bbt.c index fcab50e80b9..20a112f591f 100644 --- a/drivers/mtd/nand/nand_bbt.c +++ b/drivers/mtd/nand/nand_bbt.c @@ -350,7 +350,7 @@ static int scan_write_bbt(struct mtd_info *mtd, loff_t offs, size_t len, ops.oobbuf = oob; ops.len = len; - return mtd->write_oob(mtd, offs, &ops); + return mtd_write_oob(mtd, offs, &ops); } static u32 bbt_get_ver_offs(struct mtd_info *mtd, struct nand_bbt_descr *td) diff --git a/drivers/mtd/nand/sm_common.c b/drivers/mtd/nand/sm_common.c index 32ae5af7444..774c3c26671 100644 --- a/drivers/mtd/nand/sm_common.c +++ b/drivers/mtd/nand/sm_common.c @@ -55,7 +55,7 @@ static int sm_block_markbad(struct mtd_info *mtd, loff_t ofs) ops.datbuf = NULL; - ret = mtd->write_oob(mtd, ofs, &ops); + ret = mtd_write_oob(mtd, ofs, &ops); if (ret < 0 || ops.oobretlen != SM_OOB_SIZE) { printk(KERN_NOTICE "sm_common: can't mark sector at %i as bad\n", diff --git a/drivers/mtd/nftlcore.c b/drivers/mtd/nftlcore.c index 7497f5efc26..8847e60ad16 100644 --- a/drivers/mtd/nftlcore.c +++ b/drivers/mtd/nftlcore.c @@ -174,7 +174,7 @@ int nftl_write_oob(struct mtd_info *mtd, loff_t offs, size_t len, ops.oobbuf = buf; ops.datbuf = NULL; - res = mtd->write_oob(mtd, offs & ~mask, &ops); + res = mtd_write_oob(mtd, offs & ~mask, &ops); *retlen = ops.oobretlen; return res; } @@ -198,7 +198,7 @@ static int nftl_write(struct mtd_info *mtd, loff_t offs, size_t len, ops.datbuf = buf; ops.len = len; - res = mtd->write_oob(mtd, offs & ~mask, &ops); + res = mtd_write_oob(mtd, offs & ~mask, &ops); *retlen = ops.retlen; return res; } diff --git a/drivers/mtd/sm_ftl.c b/drivers/mtd/sm_ftl.c index 748aa441669..4ec2af7fb84 100644 --- a/drivers/mtd/sm_ftl.c +++ b/drivers/mtd/sm_ftl.c @@ -343,7 +343,7 @@ static int sm_write_sector(struct sm_ftl *ftl, ops.ooblen = SM_OOB_SIZE; ops.oobbuf = (void *)oob; - ret = mtd->write_oob(mtd, sm_mkoffset(ftl, zone, block, boffset), &ops); + ret = mtd_write_oob(mtd, sm_mkoffset(ftl, zone, block, boffset), &ops); /* Now we assume that hardware will catch write bitflip errors */ /* If you are paranoid, use CONFIG_MTD_NAND_VERIFY_WRITE */ diff --git a/drivers/mtd/tests/mtd_oobtest.c b/drivers/mtd/tests/mtd_oobtest.c index 962d27a64e6..81113885e08 100644 --- a/drivers/mtd/tests/mtd_oobtest.c +++ b/drivers/mtd/tests/mtd_oobtest.c @@ -139,7 +139,7 @@ static int write_eraseblock(int ebnum) ops.ooboffs = use_offset; ops.datbuf = NULL; ops.oobbuf = writebuf; - err = mtd->write_oob(mtd, addr, &ops); + err = mtd_write_oob(mtd, addr, &ops); if (err || ops.oobretlen != use_len) { printk(PRINT_PREF "error: writeoob failed at %#llx\n", (long long)addr); @@ -524,7 +524,7 @@ static int __init mtd_oobtest_init(void) ops.oobbuf = writebuf; printk(PRINT_PREF "attempting to start write past end of OOB\n"); printk(PRINT_PREF "an 
error is expected...\n"); - err = mtd->write_oob(mtd, addr0, &ops); + err = mtd_write_oob(mtd, addr0, &ops); if (err) { printk(PRINT_PREF "error occurred as expected\n"); err = 0; @@ -568,7 +568,7 @@ static int __init mtd_oobtest_init(void) ops.oobbuf = writebuf; printk(PRINT_PREF "attempting to write past end of device\n"); printk(PRINT_PREF "an error is expected...\n"); - err = mtd->write_oob(mtd, mtd->size - mtd->writesize, &ops); + err = mtd_write_oob(mtd, mtd->size - mtd->writesize, &ops); if (err) { printk(PRINT_PREF "error occurred as expected\n"); err = 0; @@ -612,7 +612,7 @@ static int __init mtd_oobtest_init(void) ops.oobbuf = writebuf; printk(PRINT_PREF "attempting to write past end of device\n"); printk(PRINT_PREF "an error is expected...\n"); - err = mtd->write_oob(mtd, mtd->size - mtd->writesize, &ops); + err = mtd_write_oob(mtd, mtd->size - mtd->writesize, &ops); if (err) { printk(PRINT_PREF "error occurred as expected\n"); err = 0; @@ -670,7 +670,7 @@ static int __init mtd_oobtest_init(void) ops.ooboffs = 0; ops.datbuf = NULL; ops.oobbuf = writebuf; - err = mtd->write_oob(mtd, addr, &ops); + err = mtd_write_oob(mtd, addr, &ops); if (err) goto out; if (i % 256 == 0) diff --git a/drivers/staging/spectra/lld_mtd.c b/drivers/staging/spectra/lld_mtd.c index ed8e5f06708..4aa48ddf979 100644 --- a/drivers/staging/spectra/lld_mtd.c +++ b/drivers/staging/spectra/lld_mtd.c @@ -411,9 +411,9 @@ u16 mtd_Write_Page_Main_Spare(u8 *write_data, u32 Block, ops.ooblen = BTSIG_BYTES; ops.ooboffs = 0; - ret = spectra_mtd->write_oob(spectra_mtd, - (Block * spectra_mtd->erasesize) + (Page * spectra_mtd->writesize), - &ops); + ret = mtd_write_oob(spectra_mtd, + (Block * spectra_mtd->erasesize) + (Page * spectra_mtd->writesize), + &ops); if (ret) { printk(KERN_ERR "%s failed %d\n", __func__, ret); return FAIL; diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c index efc0cb37030..eae5be48368 100644 --- a/fs/jffs2/wbuf.c +++ b/fs/jffs2/wbuf.c @@ -1101,7 +1101,7 @@ int jffs2_write_nand_cleanmarker(struct jffs2_sb_info *c, ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0; ops.datbuf = NULL; - ret = c->mtd->write_oob(c->mtd, jeb->offset, &ops); + ret = mtd_write_oob(c->mtd, jeb->offset, &ops); if (ret || ops.oobretlen != ops.ooblen) { printk(KERN_ERR "cannot write OOB for EB at %08x, requested %zd" " bytes, read %zd bytes, error %d\n", diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h index 0db8d87ce45..abbc96ad3b2 100644 --- a/include/linux/mtd/mtd.h +++ b/include/linux/mtd/mtd.h @@ -190,16 +190,14 @@ struct mtd_info { size_t *retlen, const u_char *buf); int (*read_oob) (struct mtd_info *mtd, loff_t from, struct mtd_oob_ops *ops); + int (*write_oob) (struct mtd_info *mtd, loff_t to, + struct mtd_oob_ops *ops); /* Backing device capabilities for this device * - provides mmap capabilities */ struct backing_dev_info *backing_dev_info; - - int (*write_oob) (struct mtd_info *mtd, loff_t to, - struct mtd_oob_ops *ops); - /* * Methods to access the protection register area, present in some * flash devices. The user data is one time programmable but the @@ -326,6 +324,12 @@ static inline int mtd_read_oob(struct mtd_info *mtd, loff_t from, return mtd->read_oob(mtd, from, ops); } +static inline int mtd_write_oob(struct mtd_info *mtd, loff_t to, + struct mtd_oob_ops *ops) +{ + return mtd->write_oob(mtd, to, ops); +} + static inline struct mtd_info *dev_to_mtd(struct device *dev) { return dev ? 
dev_get_drvdata(dev) : NULL; -- cgit v1.2.3 From b0a31f7b2a668f00a8d0546dfeed65fac871b2da Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Fri, 23 Dec 2011 18:59:12 +0200 Subject: mtd: introduce mtd_writev interface Signed-off-by: Artem Bityutskiy Signed-off-by: David Woodhouse --- drivers/mtd/mtdconcat.c | 5 +++-- drivers/mtd/mtdpart.c | 4 ++-- fs/jffs2/writev.c | 2 +- include/linux/mtd/mtd.h | 18 ++++++++++++------ 4 files changed, 18 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/drivers/mtd/mtdconcat.c b/drivers/mtd/mtdconcat.c index 3d9c1ffdbbb..6fdae191e1b 100644 --- a/drivers/mtd/mtdconcat.c +++ b/drivers/mtd/mtdconcat.c @@ -227,8 +227,9 @@ concat_writev(struct mtd_info *mtd, const struct kvec *vecs, if (!(subdev->flags & MTD_WRITEABLE)) err = -EROFS; else - err = subdev->writev(subdev, &vecs_copy[entry_low], - entry_high - entry_low + 1, to, &retsize); + err = mtd_writev(subdev, &vecs_copy[entry_low], + entry_high - entry_low + 1, to, + &retsize); vecs_copy[entry_high].iov_len = old_iov_len - size; vecs_copy[entry_high].iov_base += size; diff --git a/drivers/mtd/mtdpart.c b/drivers/mtd/mtdpart.c index 0bb16d6ed08..c0bfa88c82f 100644 --- a/drivers/mtd/mtdpart.c +++ b/drivers/mtd/mtdpart.c @@ -238,8 +238,8 @@ static int part_writev(struct mtd_info *mtd, const struct kvec *vecs, struct mtd_part *part = PART(mtd); if (!(mtd->flags & MTD_WRITEABLE)) return -EROFS; - return part->master->writev(part->master, vecs, count, - to + part->offset, retlen); + return mtd_writev(part->master, vecs, count, to + part->offset, + retlen); } static int part_erase(struct mtd_info *mtd, struct erase_info *instr) diff --git a/fs/jffs2/writev.c b/fs/jffs2/writev.c index b05710fd552..d0ef068709a 100644 --- a/fs/jffs2/writev.c +++ b/fs/jffs2/writev.c @@ -52,7 +52,7 @@ int jffs2_flash_direct_writev(struct jffs2_sb_info *c, const struct kvec *vecs, } if (c->mtd->writev) - return c->mtd->writev(c->mtd, vecs, count, to, retlen); + return mtd_writev(c->mtd, vecs, count, to, retlen); else { return mtd_fake_writev(c->mtd, vecs, count, to, retlen); } diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h index b58e5e8746e..4129cb5c3de 100644 --- a/include/linux/mtd/mtd.h +++ b/include/linux/mtd/mtd.h @@ -204,18 +204,14 @@ struct mtd_info { size_t *retlen, u_char *buf); int (*lock_user_prot_reg) (struct mtd_info *mtd, loff_t from, size_t len); + int (*writev) (struct mtd_info *mtd, const struct kvec *vecs, + unsigned long count, loff_t to, size_t *retlen); /* Backing device capabilities for this device * - provides mmap capabilities */ struct backing_dev_info *backing_dev_info; - /* kvec-based read/write methods. - NB: The 'count' parameter is the number of _vectors_, each of - which contains an (ofs, len) tuple. - */ - int (*writev) (struct mtd_info *mtd, const struct kvec *vecs, unsigned long count, loff_t to, size_t *retlen); - /* Sync */ void (*sync) (struct mtd_info *mtd); @@ -375,6 +371,16 @@ static inline int mtd_lock_user_prot_reg(struct mtd_info *mtd, loff_t from, return mtd->lock_user_prot_reg(mtd, from, len); } +/* + * kvec-based read/write method. NB: The 'count' parameter is the number of + * _vectors_, each of which contains an (ofs, len) tuple. + */ +static inline int mtd_writev(struct mtd_info *mtd, const struct kvec *vecs, + unsigned long count, loff_t to, size_t *retlen) +{ + return mtd->writev(mtd, vecs, count, to, retlen); +} + static inline struct mtd_info *dev_to_mtd(struct device *dev) { return dev ? 
dev_get_drvdata(dev) : NULL; -- cgit v1.2.3 From 85f2f2a809d658c15b574df02ede92090f45a1f2 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Fri, 23 Dec 2011 19:03:12 +0200 Subject: mtd: introduce mtd_sync interface Signed-off-by: Artem Bityutskiy Signed-off-by: David Woodhouse --- drivers/mtd/ftl.c | 2 +- drivers/mtd/mtdblock.c | 4 ++-- drivers/mtd/mtdchar.c | 2 +- drivers/mtd/mtdconcat.c | 2 +- drivers/mtd/mtdpart.c | 2 +- drivers/mtd/mtdswap.c | 2 +- drivers/mtd/rfd_ftl.c | 2 +- drivers/mtd/ubi/kapi.c | 2 +- fs/jffs2/super.c | 2 +- fs/logfs/dev_mtd.c | 2 +- include/linux/mtd/mtd.h | 9 ++++++--- 11 files changed, 17 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/drivers/mtd/ftl.c b/drivers/mtd/ftl.c index d591b1d0a6c..c9c90299c9e 100644 --- a/drivers/mtd/ftl.c +++ b/drivers/mtd/ftl.c @@ -651,7 +651,7 @@ static int reclaim_block(partition_t *part) pr_debug("ftl_cs: waiting for transfer " "unit to be prepared...\n"); if (part->mbd.mtd->sync) - part->mbd.mtd->sync(part->mbd.mtd); + mtd_sync(part->mbd.mtd); } else { static int ne = 0; if (++ne < 5) diff --git a/drivers/mtd/mtdblock.c b/drivers/mtd/mtdblock.c index ac7f1f1faa2..496e1a6e802 100644 --- a/drivers/mtd/mtdblock.c +++ b/drivers/mtd/mtdblock.c @@ -323,7 +323,7 @@ static int mtdblock_release(struct mtd_blktrans_dev *mbd) if (!--mtdblk->count) { /* It was the last usage. Free the cache */ if (mbd->mtd->sync) - mbd->mtd->sync(mbd->mtd); + mtd_sync(mbd->mtd); vfree(mtdblk->cache_data); } @@ -343,7 +343,7 @@ static int mtdblock_flush(struct mtd_blktrans_dev *dev) mutex_unlock(&mtdblk->cache_mutex); if (dev->mtd->sync) - dev->mtd->sync(dev->mtd); + mtd_sync(dev->mtd); return 0; } diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c index 86308acb40e..b5722ecf19d 100644 --- a/drivers/mtd/mtdchar.c +++ b/drivers/mtd/mtdchar.c @@ -155,7 +155,7 @@ static int mtdchar_close(struct inode *inode, struct file *file) /* Only sync if opened RW */ if ((file->f_mode & FMODE_WRITE) && mtd->sync) - mtd->sync(mtd); + mtd_sync(mtd); iput(mfi->ino); diff --git a/drivers/mtd/mtdconcat.c b/drivers/mtd/mtdconcat.c index 6fdae191e1b..cc2336edfe2 100644 --- a/drivers/mtd/mtdconcat.c +++ b/drivers/mtd/mtdconcat.c @@ -620,7 +620,7 @@ static void concat_sync(struct mtd_info *mtd) for (i = 0; i < concat->num_subdev; i++) { struct mtd_info *subdev = concat->subdev[i]; - subdev->sync(subdev); + mtd_sync(subdev); } } diff --git a/drivers/mtd/mtdpart.c b/drivers/mtd/mtdpart.c index c0bfa88c82f..2b545052795 100644 --- a/drivers/mtd/mtdpart.c +++ b/drivers/mtd/mtdpart.c @@ -301,7 +301,7 @@ static int part_is_locked(struct mtd_info *mtd, loff_t ofs, uint64_t len) static void part_sync(struct mtd_info *mtd) { struct mtd_part *part = PART(mtd); - part->master->sync(part->master); + mtd_sync(part->master); } static int part_suspend(struct mtd_info *mtd) diff --git a/drivers/mtd/mtdswap.c b/drivers/mtd/mtdswap.c index 85797390e3d..cb794e76101 100644 --- a/drivers/mtd/mtdswap.c +++ b/drivers/mtd/mtdswap.c @@ -1048,7 +1048,7 @@ static int mtdswap_flush(struct mtd_blktrans_dev *dev) struct mtdswap_dev *d = MTDSWAP_MBD_TO_MTDSWAP(dev); if (d->mtd->sync) - d->mtd->sync(d->mtd); + mtd_sync(d->mtd); return 0; } diff --git a/drivers/mtd/rfd_ftl.c b/drivers/mtd/rfd_ftl.c index c594bb7abfa..5426d42cdea 100644 --- a/drivers/mtd/rfd_ftl.c +++ b/drivers/mtd/rfd_ftl.c @@ -449,7 +449,7 @@ static int reclaim_block(struct partition *part, u_long *old_sector) /* we have a race if sync doesn't exist */ if (part->mbd.mtd->sync) - part->mbd.mtd->sync(part->mbd.mtd); + 
part->mbd.mtd->sync(part->mbd.mtd); +
mtd_sync(part->mbd.mtd); score = 0x7fffffff; /* MAX_INT */ best_block = -1; diff --git a/drivers/mtd/ubi/kapi.c b/drivers/mtd/ubi/kapi.c index 1a35fc5e3b4..9f265cc1a0d 100644 --- a/drivers/mtd/ubi/kapi.c +++ b/drivers/mtd/ubi/kapi.c @@ -715,7 +715,7 @@ int ubi_sync(int ubi_num) return -ENODEV; if (ubi->mtd->sync) - ubi->mtd->sync(ubi->mtd); + mtd_sync(ubi->mtd); ubi_put_device(ubi); return 0; diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index e7e97445411..e78bf3cd1b7 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -337,7 +337,7 @@ static void jffs2_put_super (struct super_block *sb) kfree(c->inocache_list); jffs2_clear_xattr_subsystem(c); if (c->mtd->sync) - c->mtd->sync(c->mtd); + mtd_sync(c->mtd); D1(printk(KERN_DEBUG "jffs2_put_super returning\n")); } diff --git a/fs/logfs/dev_mtd.c b/fs/logfs/dev_mtd.c index 1842440d656..0ca7a07db6c 100644 --- a/fs/logfs/dev_mtd.c +++ b/fs/logfs/dev_mtd.c @@ -120,7 +120,7 @@ static void logfs_mtd_sync(struct super_block *sb) struct mtd_info *mtd = logfs_super(sb)->s_mtd; if (mtd->sync) - mtd->sync(mtd); + mtd_sync(mtd); } static int logfs_mtd_readpage(void *_sb, struct page *page) diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h index 4129cb5c3de..47ea19c1e52 100644 --- a/include/linux/mtd/mtd.h +++ b/include/linux/mtd/mtd.h @@ -206,15 +206,13 @@ struct mtd_info { size_t len); int (*writev) (struct mtd_info *mtd, const struct kvec *vecs, unsigned long count, loff_t to, size_t *retlen); + void (*sync) (struct mtd_info *mtd); /* Backing device capabilities for this device * - provides mmap capabilities */ struct backing_dev_info *backing_dev_info; - /* Sync */ - void (*sync) (struct mtd_info *mtd); - /* Chip-supported device locking */ int (*lock) (struct mtd_info *mtd, loff_t ofs, uint64_t len); int (*unlock) (struct mtd_info *mtd, loff_t ofs, uint64_t len); @@ -381,6 +379,11 @@ static inline int mtd_writev(struct mtd_info *mtd, const struct kvec *vecs, return mtd->writev(mtd, vecs, count, to, retlen); } +static inline void mtd_sync(struct mtd_info *mtd) +{ + mtd->sync(mtd); +} + static inline struct mtd_info *dev_to_mtd(struct device *dev) { return dev ? 
dev_get_drvdata(dev) : NULL; -- cgit v1.2.3 From 7086c19d07429d697057587caf1e5e0345442d16 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Fri, 23 Dec 2011 19:35:30 +0200 Subject: mtd: introduce mtd_block_isbad interface Signed-off-by: Artem Bityutskiy Signed-off-by: David Woodhouse --- arch/cris/arch-v32/drivers/axisflashmap.c | 3 +-- drivers/mtd/inftlmount.c | 3 ++- drivers/mtd/mtdchar.c | 2 +- drivers/mtd/mtdconcat.c | 2 +- drivers/mtd/mtdoops.c | 4 ++-- drivers/mtd/mtdpart.c | 5 ++--- drivers/mtd/mtdswap.c | 4 ++-- drivers/mtd/nftlmount.c | 3 ++- drivers/mtd/redboot.c | 4 ++-- drivers/mtd/ssfdc.c | 4 ++-- drivers/mtd/tests/mtd_oobtest.c | 2 +- drivers/mtd/tests/mtd_pagetest.c | 2 +- drivers/mtd/tests/mtd_readtest.c | 2 +- drivers/mtd/tests/mtd_speedtest.c | 2 +- drivers/mtd/tests/mtd_stresstest.c | 2 +- drivers/mtd/tests/mtd_subpagetest.c | 2 +- drivers/mtd/tests/mtd_torturetest.c | 3 +-- drivers/mtd/ubi/io.c | 2 +- fs/jffs2/scan.c | 2 +- fs/logfs/dev_mtd.c | 4 ++-- include/linux/mtd/mtd.h | 7 ++++++- 21 files changed, 34 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/arch/cris/arch-v32/drivers/axisflashmap.c b/arch/cris/arch-v32/drivers/axisflashmap.c index 011bddbf073..b34438e026b 100644 --- a/arch/cris/arch-v32/drivers/axisflashmap.c +++ b/arch/cris/arch-v32/drivers/axisflashmap.c @@ -404,8 +404,7 @@ static int __init init_axis_flash(void) */ int blockstat; do { - blockstat = main_mtd->block_isbad(main_mtd, - ptable_sector); + blockstat = mtd_block_isbad(main_mtd, ptable_sector); if (blockstat < 0) ptable_sector = 0; /* read error */ else if (blockstat) diff --git a/drivers/mtd/inftlmount.c b/drivers/mtd/inftlmount.c index 9bfbca5d88d..38519401196 100644 --- a/drivers/mtd/inftlmount.c +++ b/drivers/mtd/inftlmount.c @@ -306,7 +306,8 @@ static int find_boot_record(struct INFTLrecord *inftl) /* If any of the physical eraseblocks are bad, don't use the unit. 
*/ for (physblock = 0; physblock < inftl->EraseSize; physblock += inftl->mbd.mtd->erasesize) { - if (inftl->mbd.mtd->block_isbad(inftl->mbd.mtd, i * inftl->EraseSize + physblock)) + if (mtd_block_isbad(inftl->mbd.mtd, + i * inftl->EraseSize + physblock)) inftl->PUtable[i] = BLOCK_RESERVED; } } diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c index 6d598b23cf3..a499bf7a821 100644 --- a/drivers/mtd/mtdchar.c +++ b/drivers/mtd/mtdchar.c @@ -886,7 +886,7 @@ static int mtdchar_ioctl(struct file *file, u_int cmd, u_long arg) if (!mtd->block_isbad) ret = -EOPNOTSUPP; else - return mtd->block_isbad(mtd, offs); + return mtd_block_isbad(mtd, offs); break; } diff --git a/drivers/mtd/mtdconcat.c b/drivers/mtd/mtdconcat.c index 4b7f825ce01..d0db5e61d5a 100644 --- a/drivers/mtd/mtdconcat.c +++ b/drivers/mtd/mtdconcat.c @@ -667,7 +667,7 @@ static int concat_block_isbad(struct mtd_info *mtd, loff_t ofs) continue; } - res = subdev->block_isbad(subdev, ofs); + res = mtd_block_isbad(subdev, ofs); break; } diff --git a/drivers/mtd/mtdoops.c b/drivers/mtd/mtdoops.c index 7be2018ffbc..bc43d2f7272 100644 --- a/drivers/mtd/mtdoops.c +++ b/drivers/mtd/mtdoops.c @@ -170,7 +170,7 @@ static void mtdoops_workfunc_erase(struct work_struct *work) } while (mtd->block_isbad) { - ret = mtd->block_isbad(mtd, cxt->nextpage * record_size); + ret = mtd_block_isbad(mtd, cxt->nextpage * record_size); if (!ret) break; if (ret < 0) { @@ -254,7 +254,7 @@ static void find_next_position(struct mtdoops_context *cxt) for (page = 0; page < cxt->oops_pages; page++) { if (mtd->block_isbad && - mtd->block_isbad(mtd, page * record_size)) + mtd_block_isbad(mtd, page * record_size)) continue; /* Assume the page is used */ mark_page_used(cxt, page); diff --git a/drivers/mtd/mtdpart.c b/drivers/mtd/mtdpart.c index 8610750852a..0e7dfc79d33 100644 --- a/drivers/mtd/mtdpart.c +++ b/drivers/mtd/mtdpart.c @@ -322,7 +322,7 @@ static int part_block_isbad(struct mtd_info *mtd, loff_t ofs) if (ofs >= mtd->size) return -EINVAL; ofs += part->offset; - return part->master->block_isbad(part->master, ofs); + return mtd_block_isbad(part->master, ofs); } static int part_block_markbad(struct mtd_info *mtd, loff_t ofs) @@ -553,8 +553,7 @@ static struct mtd_part *allocate_partition(struct mtd_info *master, uint64_t offs = 0; while (offs < slave->mtd.size) { - if (master->block_isbad(master, - offs + slave->offset)) + if (mtd_block_isbad(master, offs + slave->offset)) slave->mtd.ecc_stats.badblocks++; offs += slave->mtd.erasesize; } diff --git a/drivers/mtd/mtdswap.c b/drivers/mtd/mtdswap.c index cb794e76101..87aa0a6323c 100644 --- a/drivers/mtd/mtdswap.c +++ b/drivers/mtd/mtdswap.c @@ -343,7 +343,7 @@ static int mtdswap_read_markers(struct mtdswap_dev *d, struct swap_eb *eb) offset = mtdswap_eb_offset(d, eb); /* Check first if the block is bad. 
*/ - if (d->mtd->block_isbad && d->mtd->block_isbad(d->mtd, offset)) + if (d->mtd->block_isbad && mtd_block_isbad(d->mtd, offset)) return MTDSWAP_SCANNED_BAD; ops.ooblen = 2 * d->mtd->ecclayout->oobavail; @@ -1061,7 +1061,7 @@ static unsigned int mtdswap_badblocks(struct mtd_info *mtd, uint64_t size) if (mtd->block_isbad) for (offset = 0; offset < size; offset += mtd->erasesize) - if (mtd->block_isbad(mtd, offset)) + if (mtd_block_isbad(mtd, offset)) badcnt++; return badcnt; diff --git a/drivers/mtd/nftlmount.c b/drivers/mtd/nftlmount.c index b068dc8a366..156af9f8796 100644 --- a/drivers/mtd/nftlmount.c +++ b/drivers/mtd/nftlmount.c @@ -242,7 +242,8 @@ The new DiskOnChip driver already scanned the bad block table. Just query it. if (buf[i & (SECTORSIZE - 1)] != 0xff) nftl->ReplUnitTable[i] = BLOCK_RESERVED; #endif - if (nftl->mbd.mtd->block_isbad(nftl->mbd.mtd, i * nftl->EraseSize)) + if (mtd_block_isbad(nftl->mbd.mtd, + i * nftl->EraseSize)) nftl->ReplUnitTable[i] = BLOCK_RESERVED; } diff --git a/drivers/mtd/redboot.c b/drivers/mtd/redboot.c index 623d9b86d0d..09bb81ea3a7 100644 --- a/drivers/mtd/redboot.c +++ b/drivers/mtd/redboot.c @@ -79,7 +79,7 @@ static int parse_redboot_partitions(struct mtd_info *master, if ( directory < 0 ) { offset = master->size + directory * master->erasesize; while (master->block_isbad && - master->block_isbad(master, offset)) { + mtd_block_isbad(master, offset)) { if (!offset) { nogood: printk(KERN_NOTICE "Failed to find a non-bad block to check for RedBoot partition table\n"); @@ -90,7 +90,7 @@ static int parse_redboot_partitions(struct mtd_info *master, } else { offset = directory * master->erasesize; while (master->block_isbad && - master->block_isbad(master, offset)) { + mtd_block_isbad(master, offset)) { offset += master->erasesize; if (offset == master->size) goto nogood; diff --git a/drivers/mtd/ssfdc.c b/drivers/mtd/ssfdc.c index 0e688133835..ab2a52a039c 100644 --- a/drivers/mtd/ssfdc.c +++ b/drivers/mtd/ssfdc.c @@ -122,7 +122,7 @@ static int get_valid_cis_sector(struct mtd_info *mtd) * is not SSFDC formatted */ for (k = 0, offset = 0; k < 4; k++, offset += mtd->erasesize) { - if (!mtd->block_isbad(mtd, offset)) { + if (mtd_block_isbad(mtd, offset)) { ret = mtd_read(mtd, offset, SECTOR_SIZE, &retlen, sect_buf); @@ -255,7 +255,7 @@ static int build_logical_block_map(struct ssfdcr_record *ssfdc) for (phys_block = ssfdc->cis_block + 1; phys_block < ssfdc->map_len; phys_block++) { offset = (unsigned long)phys_block * ssfdc->erase_size; - if (mtd->block_isbad(mtd, offset)) + if (mtd_block_isbad(mtd, offset)) continue; /* skip bad blocks */ ret = read_raw_oob(mtd, offset, oob_buf); diff --git a/drivers/mtd/tests/mtd_oobtest.c b/drivers/mtd/tests/mtd_oobtest.c index 81113885e08..ed9b62827f1 100644 --- a/drivers/mtd/tests/mtd_oobtest.c +++ b/drivers/mtd/tests/mtd_oobtest.c @@ -329,7 +329,7 @@ static int is_block_bad(int ebnum) int ret; loff_t addr = ebnum * mtd->erasesize; - ret = mtd->block_isbad(mtd, addr); + ret = mtd_block_isbad(mtd, addr); if (ret) printk(PRINT_PREF "block %d is bad\n", ebnum); return ret; diff --git a/drivers/mtd/tests/mtd_pagetest.c b/drivers/mtd/tests/mtd_pagetest.c index 83da97e54f9..8024eaf4c1a 100644 --- a/drivers/mtd/tests/mtd_pagetest.c +++ b/drivers/mtd/tests/mtd_pagetest.c @@ -469,7 +469,7 @@ static int is_block_bad(int ebnum) loff_t addr = ebnum * mtd->erasesize; int ret; - ret = mtd->block_isbad(mtd, addr); + ret = mtd_block_isbad(mtd, addr); if (ret) printk(PRINT_PREF "block %d is bad\n", ebnum); return ret; diff --git 
a/drivers/mtd/tests/mtd_readtest.c b/drivers/mtd/tests/mtd_readtest.c index 5eaeada8428..ad5fd0df86e 100644 --- a/drivers/mtd/tests/mtd_readtest.c +++ b/drivers/mtd/tests/mtd_readtest.c @@ -132,7 +132,7 @@ static int is_block_bad(int ebnum) loff_t addr = ebnum * mtd->erasesize; int ret; - ret = mtd->block_isbad(mtd, addr); + ret = mtd_block_isbad(mtd, addr); if (ret) printk(PRINT_PREF "block %d is bad\n", ebnum); return ret; diff --git a/drivers/mtd/tests/mtd_speedtest.c b/drivers/mtd/tests/mtd_speedtest.c index c7b18e18908..ecb28784750 100644 --- a/drivers/mtd/tests/mtd_speedtest.c +++ b/drivers/mtd/tests/mtd_speedtest.c @@ -296,7 +296,7 @@ static int is_block_bad(int ebnum) loff_t addr = ebnum * mtd->erasesize; int ret; - ret = mtd->block_isbad(mtd, addr); + ret = mtd_block_isbad(mtd, addr); if (ret) printk(PRINT_PREF "block %d is bad\n", ebnum); return ret; diff --git a/drivers/mtd/tests/mtd_stresstest.c b/drivers/mtd/tests/mtd_stresstest.c index f8aac4b7e59..4789c0ee3e9 100644 --- a/drivers/mtd/tests/mtd_stresstest.c +++ b/drivers/mtd/tests/mtd_stresstest.c @@ -132,7 +132,7 @@ static int is_block_bad(int ebnum) loff_t addr = ebnum * mtd->erasesize; int ret; - ret = mtd->block_isbad(mtd, addr); + ret = mtd_block_isbad(mtd, addr); if (ret) printk(PRINT_PREF "block %d is bad\n", ebnum); return ret; diff --git a/drivers/mtd/tests/mtd_subpagetest.c b/drivers/mtd/tests/mtd_subpagetest.c index b90c01036b4..4b873d49fe6 100644 --- a/drivers/mtd/tests/mtd_subpagetest.c +++ b/drivers/mtd/tests/mtd_subpagetest.c @@ -344,7 +344,7 @@ static int is_block_bad(int ebnum) loff_t addr = ebnum * mtd->erasesize; int ret; - ret = mtd->block_isbad(mtd, addr); + ret = mtd_block_isbad(mtd, addr); if (ret) printk(PRINT_PREF "block %d is bad\n", ebnum); return ret; diff --git a/drivers/mtd/tests/mtd_torturetest.c b/drivers/mtd/tests/mtd_torturetest.c index dd34a519fa7..30c4ed9855e 100644 --- a/drivers/mtd/tests/mtd_torturetest.c +++ b/drivers/mtd/tests/mtd_torturetest.c @@ -292,8 +292,7 @@ static int __init tort_init(void) memset(&bad_ebs[0], 0, sizeof(int) * ebcnt); if (mtd->block_isbad) { for (i = eb; i < eb + ebcnt; i++) { - err = mtd->block_isbad(mtd, - (loff_t)i * mtd->erasesize); + err = mtd_block_isbad(mtd, (loff_t)i * mtd->erasesize); if (err < 0) { printk(PRINT_PREF "block_isbad() returned %d " diff --git a/drivers/mtd/ubi/io.c b/drivers/mtd/ubi/io.c index 8d832fc9e9e..a1b683ad639 100644 --- a/drivers/mtd/ubi/io.c +++ b/drivers/mtd/ubi/io.c @@ -634,7 +634,7 @@ int ubi_io_is_bad(const struct ubi_device *ubi, int pnum) if (ubi->bad_allowed) { int ret; - ret = mtd->block_isbad(mtd, (loff_t)pnum * ubi->peb_size); + ret = mtd_block_isbad(mtd, (loff_t)pnum * ubi->peb_size); if (ret < 0) ubi_err("error %d while checking if PEB %d is bad", ret, pnum); diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c index 72f3960f44a..83e1665e257 100644 --- a/fs/jffs2/scan.c +++ b/fs/jffs2/scan.c @@ -455,7 +455,7 @@ static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblo if (jffs2_cleanmarker_oob(c)) { int ret; - if (c->mtd->block_isbad(c->mtd, jeb->offset)) + if (mtd_block_isbad(c->mtd, jeb->offset)) return BLK_STATE_BADBLOCK; ret = jffs2_check_nand_cleanmarker(c, jeb); diff --git a/fs/logfs/dev_mtd.c b/fs/logfs/dev_mtd.c index 0ca7a07db6c..136c7360a9b 100644 --- a/fs/logfs/dev_mtd.c +++ b/fs/logfs/dev_mtd.c @@ -157,7 +157,7 @@ static struct page *logfs_mtd_find_first_sb(struct super_block *sb, u64 *ofs) return NULL; *ofs = 0; - while (mtd->block_isbad(mtd, *ofs)) { + while (mtd_block_isbad(mtd, *ofs)) { 
*ofs += mtd->erasesize; if (*ofs >= mtd->size) return NULL; @@ -177,7 +177,7 @@ static struct page *logfs_mtd_find_last_sb(struct super_block *sb, u64 *ofs) return NULL; *ofs = mtd->size - mtd->erasesize; - while (mtd->block_isbad(mtd, *ofs)) { + while (mtd_block_isbad(mtd, *ofs)) { *ofs -= mtd->erasesize; if (*ofs <= 0) return NULL; diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h index d6b4aa17750..a307ad093a5 100644 --- a/include/linux/mtd/mtd.h +++ b/include/linux/mtd/mtd.h @@ -210,6 +210,7 @@ struct mtd_info { int (*lock) (struct mtd_info *mtd, loff_t ofs, uint64_t len); int (*unlock) (struct mtd_info *mtd, loff_t ofs, uint64_t len); int (*is_locked) (struct mtd_info *mtd, loff_t ofs, uint64_t len); + int (*block_isbad) (struct mtd_info *mtd, loff_t ofs); int (*suspend) (struct mtd_info *mtd); void (*resume) (struct mtd_info *mtd); @@ -219,7 +220,6 @@ struct mtd_info { struct backing_dev_info *backing_dev_info; /* Bad block management functions */ - int (*block_isbad) (struct mtd_info *mtd, loff_t ofs); int (*block_markbad) (struct mtd_info *mtd, loff_t ofs); struct notifier_block reboot_notifier; /* default mode before reboot */ @@ -406,6 +406,11 @@ static inline void mtd_resume(struct mtd_info *mtd) mtd->resume(mtd); } +static inline int mtd_block_isbad(struct mtd_info *mtd, loff_t ofs) +{ + return mtd->block_isbad(mtd, ofs); +} + static inline struct mtd_info *dev_to_mtd(struct device *dev) { return dev ? dev_get_drvdata(dev) : NULL; -- cgit v1.2.3 From 5942ddbc500d1c9b75e571b656be97f65b26adfe Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Fri, 23 Dec 2011 19:37:38 +0200 Subject: mtd: introduce mtd_block_markbad interface Signed-off-by: Artem Bityutskiy Signed-off-by: David Woodhouse --- drivers/mtd/inftlmount.c | 2 +- drivers/mtd/mtdchar.c | 2 +- drivers/mtd/mtdconcat.c | 2 +- drivers/mtd/mtdoops.c | 2 +- drivers/mtd/mtdpart.c | 2 +- drivers/mtd/mtdswap.c | 2 +- drivers/mtd/nand/nandsim.c | 2 +- drivers/mtd/nftlmount.c | 2 +- drivers/mtd/onenand/onenand_base.c | 2 +- drivers/mtd/ubi/io.c | 2 +- fs/jffs2/wbuf.c | 2 +- include/linux/mtd/mtd.h | 9 ++++++--- 12 files changed, 17 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/drivers/mtd/inftlmount.c b/drivers/mtd/inftlmount.c index 38519401196..4adc0374fb6 100644 --- a/drivers/mtd/inftlmount.c +++ b/drivers/mtd/inftlmount.c @@ -424,7 +424,7 @@ int INFTL_formatblock(struct INFTLrecord *inftl, int block) fail: /* could not format, update the bad block table (caller is responsible for setting the PUtable to BLOCK_RESERVED on failure) */ - inftl->mbd.mtd->block_markbad(inftl->mbd.mtd, instr->addr); + mtd_block_markbad(inftl->mbd.mtd, instr->addr); return -1; } diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c index a499bf7a821..15a3f6224be 100644 --- a/drivers/mtd/mtdchar.c +++ b/drivers/mtd/mtdchar.c @@ -899,7 +899,7 @@ static int mtdchar_ioctl(struct file *file, u_int cmd, u_long arg) if (!mtd->block_markbad) ret = -EOPNOTSUPP; else - return mtd->block_markbad(mtd, offs); + return mtd_block_markbad(mtd, offs); break; } diff --git a/drivers/mtd/mtdconcat.c b/drivers/mtd/mtdconcat.c index d0db5e61d5a..f694b51e785 100644 --- a/drivers/mtd/mtdconcat.c +++ b/drivers/mtd/mtdconcat.c @@ -693,7 +693,7 @@ static int concat_block_markbad(struct mtd_info *mtd, loff_t ofs) continue; } - err = subdev->block_markbad(subdev, ofs); + err = mtd_block_markbad(subdev, ofs); if (!err) mtd->ecc_stats.badblocks++; break; diff --git a/drivers/mtd/mtdoops.c b/drivers/mtd/mtdoops.c index bc43d2f7272..69532a34e56 
100644 --- a/drivers/mtd/mtdoops.c +++ b/drivers/mtd/mtdoops.c @@ -200,7 +200,7 @@ badblock: } if (mtd->block_markbad && ret == -EIO) { - ret = mtd->block_markbad(mtd, cxt->nextpage * record_size); + ret = mtd_block_markbad(mtd, cxt->nextpage * record_size); if (ret < 0) { printk(KERN_ERR "mtdoops: block_markbad failed, aborting\n"); return; diff --git a/drivers/mtd/mtdpart.c b/drivers/mtd/mtdpart.c index 0e7dfc79d33..a3d44c3416b 100644 --- a/drivers/mtd/mtdpart.c +++ b/drivers/mtd/mtdpart.c @@ -335,7 +335,7 @@ static int part_block_markbad(struct mtd_info *mtd, loff_t ofs) if (ofs >= mtd->size) return -EINVAL; ofs += part->offset; - res = part->master->block_markbad(part->master, ofs); + res = mtd_block_markbad(part->master, ofs); if (!res) mtd->ecc_stats.badblocks++; return res; diff --git a/drivers/mtd/mtdswap.c b/drivers/mtd/mtdswap.c index 87aa0a6323c..4441c08b082 100644 --- a/drivers/mtd/mtdswap.c +++ b/drivers/mtd/mtdswap.c @@ -279,7 +279,7 @@ static int mtdswap_handle_badblock(struct mtdswap_dev *d, struct swap_eb *eb) offset = mtdswap_eb_offset(d, eb); dev_warn(d->dev, "Marking bad block at %08llx\n", offset); - ret = d->mtd->block_markbad(d->mtd, offset); + ret = mtd_block_markbad(d->mtd, offset); if (ret) { dev_warn(d->dev, "Mark block bad failed for block at %08llx " diff --git a/drivers/mtd/nand/nandsim.c b/drivers/mtd/nand/nandsim.c index 34c03be7730..261f478f8cc 100644 --- a/drivers/mtd/nand/nandsim.c +++ b/drivers/mtd/nand/nandsim.c @@ -737,7 +737,7 @@ static int parse_badblocks(struct nandsim *ns, struct mtd_info *mtd) return -EINVAL; } offset = erase_block_no * ns->geom.secsz; - if (mtd->block_markbad(mtd, offset)) { + if (mtd_block_markbad(mtd, offset)) { NS_ERR("invalid badblocks.\n"); return -EINVAL; } diff --git a/drivers/mtd/nftlmount.c b/drivers/mtd/nftlmount.c index 156af9f8796..51b9d6af307 100644 --- a/drivers/mtd/nftlmount.c +++ b/drivers/mtd/nftlmount.c @@ -356,7 +356,7 @@ int NFTL_formatblock(struct NFTLrecord *nftl, int block) fail: /* could not format, update the bad block table (caller is responsible for setting the ReplUnitTable to BLOCK_RESERVED on failure) */ - nftl->mbd.mtd->block_markbad(nftl->mbd.mtd, instr->addr); + mtd_block_markbad(nftl->mbd.mtd, instr->addr); return -1; } diff --git a/drivers/mtd/onenand/onenand_base.c b/drivers/mtd/onenand/onenand_base.c index a8394730b4b..dd278a28413 100644 --- a/drivers/mtd/onenand/onenand_base.c +++ b/drivers/mtd/onenand/onenand_base.c @@ -2645,7 +2645,7 @@ static int onenand_block_markbad(struct mtd_info *mtd, loff_t ofs) } onenand_get_device(mtd, FL_WRITING); - ret = this->block_markbad(mtd, ofs); + ret = mtd_block_markbad(mtd, ofs); onenand_release_device(mtd); return ret; } diff --git a/drivers/mtd/ubi/io.c b/drivers/mtd/ubi/io.c index a1b683ad639..5cde4e5ca3e 100644 --- a/drivers/mtd/ubi/io.c +++ b/drivers/mtd/ubi/io.c @@ -669,7 +669,7 @@ int ubi_io_mark_bad(const struct ubi_device *ubi, int pnum) if (!ubi->bad_allowed) return 0; - err = mtd->block_markbad(mtd, (loff_t)pnum * ubi->peb_size); + err = mtd_block_markbad(mtd, (loff_t)pnum * ubi->peb_size); if (err) ubi_err("cannot mark PEB %d bad, error %d", pnum, err); return err; diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c index eae5be48368..fd96b757433 100644 --- a/fs/jffs2/wbuf.c +++ b/fs/jffs2/wbuf.c @@ -1134,7 +1134,7 @@ int jffs2_write_nand_badblock(struct jffs2_sb_info *c, struct jffs2_eraseblock * return 1; // What else can we do? 
printk(KERN_WARNING "JFFS2: marking eraseblock at %08x\n as bad", bad_offset); - ret = c->mtd->block_markbad(c->mtd, bad_offset); + ret = mtd_block_markbad(c->mtd, bad_offset); if (ret) { D1(printk(KERN_WARNING "jffs2_write_nand_badblock(): Write failed for block at %08x: error %d\n", jeb->offset, ret)); diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h index a307ad093a5..64aa54fba2d 100644 --- a/include/linux/mtd/mtd.h +++ b/include/linux/mtd/mtd.h @@ -211,6 +211,7 @@ struct mtd_info { int (*unlock) (struct mtd_info *mtd, loff_t ofs, uint64_t len); int (*is_locked) (struct mtd_info *mtd, loff_t ofs, uint64_t len); int (*block_isbad) (struct mtd_info *mtd, loff_t ofs); + int (*block_markbad) (struct mtd_info *mtd, loff_t ofs); int (*suspend) (struct mtd_info *mtd); void (*resume) (struct mtd_info *mtd); @@ -219,9 +220,6 @@ struct mtd_info { */ struct backing_dev_info *backing_dev_info; - /* Bad block management functions */ - int (*block_markbad) (struct mtd_info *mtd, loff_t ofs); - struct notifier_block reboot_notifier; /* default mode before reboot */ /* ECC status information */ @@ -411,6 +409,11 @@ static inline int mtd_block_isbad(struct mtd_info *mtd, loff_t ofs) return mtd->block_isbad(mtd, ofs); } +static inline int mtd_block_markbad(struct mtd_info *mtd, loff_t ofs) +{ + return mtd->block_markbad(mtd, ofs); +} + static inline struct mtd_info *dev_to_mtd(struct device *dev) { return dev ? dev_get_drvdata(dev) : NULL; -- cgit v1.2.3 From 4ccf2f1349e681401b5fae73efc87b8d2d70ce0e Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Fri, 30 Dec 2011 15:48:55 +0200 Subject: jffs: remove custom mtd_fake_writev function Instead, use 'default_mtd_writev()' function which MTD provides. Signed-off-by: Artem Bityutskiy Signed-off-by: David Woodhouse --- fs/jffs2/writev.c | 27 +-------------------------- 1 file changed, 1 insertion(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/jffs2/writev.c b/fs/jffs2/writev.c index d0ef068709a..8d704073f8b 100644 --- a/fs/jffs2/writev.c +++ b/fs/jffs2/writev.c @@ -13,31 +13,6 @@ #include #include "nodelist.h" -/* This ought to be in core MTD code. All registered MTD devices - without writev should have this put in place. Bug the MTD - maintainer */ -static inline int mtd_fake_writev(struct mtd_info *mtd, const struct kvec *vecs, - unsigned long count, loff_t to, size_t *retlen) -{ - unsigned long i; - size_t totlen = 0, thislen; - int ret = 0; - - for (i=0; imtd->writev) return mtd_writev(c->mtd, vecs, count, to, retlen); else { - return mtd_fake_writev(c->mtd, vecs, count, to, retlen); + return default_mtd_writev(c->mtd, vecs, count, to, retlen); } } -- cgit v1.2.3 From 10934478e44d9a5a7b16dadd89094fb608cf101e Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Wed, 28 Dec 2011 15:55:42 +0200 Subject: mtd: do use mtd->point directly Remove direct usage of the "mtd->point" function pointer. Instead, test the mtd_point() return code for '-EOPNOTSUPP'. 
Signed-off-by: Artem Bityutskiy Signed-off-by: David Woodhouse --- fs/jffs2/erase.c | 9 ++++----- fs/jffs2/readinode.c | 18 ++++++++---------- fs/jffs2/scan.c | 2 +- include/linux/mtd/mtd.h | 2 ++ 4 files changed, 15 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c index c59d642cade..a01cdad6aad 100644 --- a/fs/jffs2/erase.c +++ b/fs/jffs2/erase.c @@ -336,12 +336,11 @@ static int jffs2_block_check_erase(struct jffs2_sb_info *c, struct jffs2_erasebl uint32_t ofs; size_t retlen; int ret = -EIO; + unsigned long *wordebuf; - if (c->mtd->point) { - unsigned long *wordebuf; - - ret = mtd_point(c->mtd, jeb->offset, c->sector_size, &retlen, - &ebuf, NULL); + ret = mtd_point(c->mtd, jeb->offset, c->sector_size, &retlen, + &ebuf, NULL); + if (ret != -EOPNOTSUPP) { if (ret) { D1(printk(KERN_DEBUG "MTD point failed %d\n", ret)); goto do_flash_read; diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c index fca2f84e1ad..3093ac4fb24 100644 --- a/fs/jffs2/readinode.c +++ b/fs/jffs2/readinode.c @@ -62,17 +62,15 @@ static int check_node_data(struct jffs2_sb_info *c, struct jffs2_tmp_dnode_info #ifndef __ECOS /* TODO: instead, incapsulate point() stuff to jffs2_flash_read(), * adding and jffs2_flash_read_end() interface. */ - if (c->mtd->point) { - err = mtd_point(c->mtd, ofs, len, &retlen, (void **)&buffer, - NULL); - if (!err && retlen < len) { - JFFS2_WARNING("MTD point returned len too short: %zu instead of %u.\n", retlen, tn->csize); - mtd_unpoint(c->mtd, ofs, retlen); - } else if (err) + err = mtd_point(c->mtd, ofs, len, &retlen, (void **)&buffer, NULL); + if (!err && retlen < len) { + JFFS2_WARNING("MTD point returned len too short: %zu instead of %u.\n", retlen, tn->csize); + mtd_unpoint(c->mtd, ofs, retlen); + } else if (err) { + if (err != -EOPNOTSUPP) JFFS2_WARNING("MTD point failed: error code %d.\n", err); - else - pointed = 1; /* succefully pointed to device */ - } + } else + pointed = 1; /* succefully pointed to device */ #endif if (!pointed) { diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c index 83e1665e257..f99464833bb 100644 --- a/fs/jffs2/scan.c +++ b/fs/jffs2/scan.c @@ -105,7 +105,7 @@ int jffs2_scan_medium(struct jffs2_sb_info *c) mtd_unpoint(c->mtd, 0, pointlen); flashbuf = NULL; } - if (ret) + if (ret && ret != -EOPNOTSUPP) D1(printk(KERN_DEBUG "MTD point failed %d\n", ret)); } #endif diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h index 478701566ba..b355a83e7cc 100644 --- a/include/linux/mtd/mtd.h +++ b/include/linux/mtd/mtd.h @@ -259,6 +259,8 @@ static inline int mtd_point(struct mtd_info *mtd, loff_t from, size_t len, size_t *retlen, void **virt, resource_size_t *phys) { *retlen = 0; + if (!mtd->point) + return -EOPNOTSUPP; return mtd->point(mtd, from, len, retlen, virt, phys); } -- cgit v1.2.3 From 4991e7251ed951a5f33faf25912e9db416306309 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Wed, 28 Dec 2011 17:08:03 +0200 Subject: romfs: do not use mtd->get_unmapped_area directly Remove direct usage of mtd->get_unmapped_area. Instead, just call 'mtd_get_unmapped_area()' which will return -EOPNOTSUPP if the function is not implemented, and then test for this code. We also translate -EOPNOTSUPP to -ENOSYS because this return code is probably part of the kernel ABI which we do not want to break. 
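The wrapper being relied on here is not visible in these hunks; judging from the description above it is presumably a thin helper of roughly this shape (a reconstruction, with the signature inferred from the call site below, not a quote from mtd.h):

static inline unsigned long mtd_get_unmapped_area(struct mtd_info *mtd,
                                                  unsigned long len,
                                                  unsigned long offset,
                                                  unsigned long flags)
{
        /* No ->get_unmapped_area method: report "not supported" and let
         * callers such as romfs decide how to translate that for userspace. */
        if (!mtd->get_unmapped_area)
                return -EOPNOTSUPP;
        return mtd->get_unmapped_area(mtd, len, offset, flags);
}
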
Cc: linux-fsdevel@vger.kernel.org Signed-off-by: Artem Bityutskiy Signed-off-by: David Woodhouse --- fs/romfs/mmap-nommu.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/romfs/mmap-nommu.c b/fs/romfs/mmap-nommu.c index d5168e8e7dc..e1a7779dd3c 100644 --- a/fs/romfs/mmap-nommu.c +++ b/fs/romfs/mmap-nommu.c @@ -28,9 +28,10 @@ static unsigned long romfs_get_unmapped_area(struct file *file, struct inode *inode = file->f_mapping->host; struct mtd_info *mtd = inode->i_sb->s_mtd; unsigned long isize, offset, maxpages, lpages; + int ret; if (!mtd) - goto cant_map_directly; + return (unsigned long) -ENOSYS; /* the mapping mustn't extend beyond the EOF */ lpages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; @@ -41,23 +42,20 @@ static unsigned long romfs_get_unmapped_area(struct file *file, if ((pgoff >= maxpages) || (maxpages - pgoff < lpages)) return (unsigned long) -EINVAL; - /* we need to call down to the MTD layer to do the actual mapping */ - if (mtd->get_unmapped_area) { - if (addr != 0) - return (unsigned long) -EINVAL; - - if (len > mtd->size || pgoff >= (mtd->size >> PAGE_SHIFT)) - return (unsigned long) -EINVAL; + if (addr != 0) + return (unsigned long) -EINVAL; - offset += ROMFS_I(inode)->i_dataoffset; - if (offset > mtd->size - len) - return (unsigned long) -EINVAL; + if (len > mtd->size || pgoff >= (mtd->size >> PAGE_SHIFT)) + return (unsigned long) -EINVAL; - return mtd_get_unmapped_area(mtd, len, offset, flags); - } + offset += ROMFS_I(inode)->i_dataoffset; + if (offset > mtd->size - len) + return (unsigned long) -EINVAL; -cant_map_directly: - return (unsigned long) -ENOSYS; + ret = mtd_get_unmapped_area(mtd, len, offset, flags); + if (ret == -EOPNOTSUPP) + ret = -ENOSYS; + return (unsigned long) ret; } /* -- cgit v1.2.3 From 1dbebd32562b3c2caeca35960e5cb00bfcc12900 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Fri, 30 Dec 2011 16:23:41 +0200 Subject: mtd: harmonize mtd_writev usage This patch makes the 'mtd_writev()' function more usable and logical. We first teach it to fall-back to the 'default_mtd_writev()' function if the MTD driver does not define its own '->writev()' method. Then we make block2mtd and JFFS2 just 'mtd_writev()' instead of 'default_mtd_writev()' function. This means we can now stop exporting 'default_mtd_writev()' and instead, export 'mtd_writev()'. This is much cleaner and more logical, as well as allows us to get read of another direct 'mtd->writev' access in JFFS2. 
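As a usage illustration (function and variable names here are mine, not from the patch): after this change a caller can submit a vectored write without first checking whether the driver implements ->writev, since mtd_writev() now performs the fallback itself.

#include <linux/mtd/mtd.h>
#include <linux/uio.h>

/* Sketch: write a header and a payload in one vectored call. */
static int write_hdr_and_data(struct mtd_info *mtd, loff_t to,
                              void *hdr, size_t hdr_len,
                              void *data, size_t data_len)
{
        struct kvec vecs[2];
        size_t retlen;
        int ret;

        vecs[0].iov_base = hdr;
        vecs[0].iov_len  = hdr_len;
        vecs[1].iov_base = data;
        vecs[1].iov_len  = data_len;

        /* Uses the driver's ->writev if present, otherwise the default
         * kvec loop over mtd->write, as implemented in mtdcore.c below. */
        ret = mtd_writev(mtd, vecs, 2, to, &retlen);
        if (ret)
                return ret;
        return retlen == hdr_len + data_len ? 0 : -EIO;
}
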
Signed-off-by: Artem Bityutskiy Signed-off-by: David Woodhouse --- drivers/mtd/devices/block2mtd.c | 2 +- drivers/mtd/mtdcore.c | 26 +++++++++++++++++++++++--- fs/jffs2/writev.c | 6 +----- include/linux/mtd/mtd.h | 16 ++-------------- 4 files changed, 27 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/drivers/mtd/devices/block2mtd.c b/drivers/mtd/devices/block2mtd.c index b78f23169d4..c16f6b4e893 100644 --- a/drivers/mtd/devices/block2mtd.c +++ b/drivers/mtd/devices/block2mtd.c @@ -288,7 +288,7 @@ static struct block2mtd_dev *add_device(char *devname, int erase_size) dev->mtd.flags = MTD_CAP_RAM; dev->mtd.erase = block2mtd_erase; dev->mtd.write = block2mtd_write; - dev->mtd.writev = default_mtd_writev; + dev->mtd.writev = mtd_writev; dev->mtd.sync = block2mtd_sync; dev->mtd.read = block2mtd_read; dev->mtd.priv = dev; diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c index 53a200f722b..4d0f3e557bd 100644 --- a/drivers/mtd/mtdcore.c +++ b/drivers/mtd/mtdcore.c @@ -696,8 +696,8 @@ EXPORT_SYMBOL_GPL(__put_mtd_device); * This function returns zero in case of success and a negative error code in * case of failure. */ -int default_mtd_writev(struct mtd_info *mtd, const struct kvec *vecs, - unsigned long count, loff_t to, size_t *retlen) +static int default_mtd_writev(struct mtd_info *mtd, const struct kvec *vecs, + unsigned long count, loff_t to, size_t *retlen) { unsigned long i; size_t totlen = 0, thislen; @@ -716,7 +716,27 @@ int default_mtd_writev(struct mtd_info *mtd, const struct kvec *vecs, *retlen = totlen; return ret; } -EXPORT_SYMBOL_GPL(default_mtd_writev); + +/* + * mtd_writev - the vector-based MTD write method + * @mtd: mtd device description object pointer + * @vecs: the vectors to write + * @count: count of vectors in @vecs + * @to: the MTD device offset to write to + * @retlen: on exit contains the count of bytes written to the MTD device. + * + * This function returns zero in case of success and a negative error code in + * case of failure. + */ +int mtd_writev(struct mtd_info *mtd, const struct kvec *vecs, + unsigned long count, loff_t to, size_t *retlen) +{ + *retlen = 0; + if (!mtd->writev) + return default_mtd_writev(mtd, vecs, count, to, retlen); + return mtd->writev(mtd, vecs, count, to, retlen); +} +EXPORT_SYMBOL_GPL(mtd_writev); /** * mtd_kmalloc_up_to - allocate a contiguous buffer up to the specified size diff --git a/fs/jffs2/writev.c b/fs/jffs2/writev.c index 8d704073f8b..a1bda9dab3f 100644 --- a/fs/jffs2/writev.c +++ b/fs/jffs2/writev.c @@ -26,11 +26,7 @@ int jffs2_flash_direct_writev(struct jffs2_sb_info *c, const struct kvec *vecs, } } - if (c->mtd->writev) - return mtd_writev(c->mtd, vecs, count, to, retlen); - else { - return default_mtd_writev(c->mtd, vecs, count, to, retlen); - } + return mtd_writev(c->mtd, vecs, count, to, retlen); } int jffs2_flash_direct_write(struct jffs2_sb_info *c, loff_t ofs, size_t len, diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h index a994129ede5..a58ecf4d1f8 100644 --- a/include/linux/mtd/mtd.h +++ b/include/linux/mtd/mtd.h @@ -394,16 +394,8 @@ static inline int mtd_lock_user_prot_reg(struct mtd_info *mtd, loff_t from, return mtd->lock_user_prot_reg(mtd, from, len); } -/* - * kvec-based read/write method. NB: The 'count' parameter is the number of - * _vectors_, each of which contains an (ofs, len) tuple. 
- */ -static inline int mtd_writev(struct mtd_info *mtd, const struct kvec *vecs, - unsigned long count, loff_t to, size_t *retlen) -{ - *retlen = 0; - return mtd->writev(mtd, vecs, count, to, retlen); -} +int mtd_writev(struct mtd_info *mtd, const struct kvec *vecs, + unsigned long count, loff_t to, size_t *retlen); static inline void mtd_sync(struct mtd_info *mtd) { @@ -510,10 +502,6 @@ struct mtd_notifier { extern void register_mtd_user (struct mtd_notifier *new); extern int unregister_mtd_user (struct mtd_notifier *old); - -int default_mtd_writev(struct mtd_info *mtd, const struct kvec *vecs, - unsigned long count, loff_t to, size_t *retlen); - void *mtd_kmalloc_up_to(const struct mtd_info *mtd, size_t *size); void mtd_erase_callback(struct erase_info *instr); -- cgit v1.2.3 From 327cf2922b4edf0439b219469722d2a502e37349 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Fri, 30 Dec 2011 16:35:35 +0200 Subject: mtd: do not use mtd->sync directly This patch teaches 'mtd_sync()' to do nothing when the MTD driver does not have the '->sync()' method, which allows us to remove all direct 'mtd->sync' accesses. Signed-off-by: Artem Bityutskiy Signed-off-by: David Woodhouse --- drivers/mtd/ftl.c | 3 +-- drivers/mtd/mtdblock.c | 7 ++----- drivers/mtd/mtdchar.c | 2 +- drivers/mtd/mtdswap.c | 3 +-- drivers/mtd/rfd_ftl.c | 3 +-- drivers/mtd/ubi/kapi.c | 4 +--- fs/jffs2/super.c | 4 +--- fs/logfs/dev_mtd.c | 3 +-- include/linux/mtd/mtd.h | 3 ++- 9 files changed, 11 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/drivers/mtd/ftl.c b/drivers/mtd/ftl.c index c9c90299c9e..19d637266fc 100644 --- a/drivers/mtd/ftl.c +++ b/drivers/mtd/ftl.c @@ -650,8 +650,7 @@ static int reclaim_block(partition_t *part) if (queued) { pr_debug("ftl_cs: waiting for transfer " "unit to be prepared...\n"); - if (part->mbd.mtd->sync) - mtd_sync(part->mbd.mtd); + mtd_sync(part->mbd.mtd); } else { static int ne = 0; if (++ne < 5) diff --git a/drivers/mtd/mtdblock.c b/drivers/mtd/mtdblock.c index 496e1a6e802..af6591237b9 100644 --- a/drivers/mtd/mtdblock.c +++ b/drivers/mtd/mtdblock.c @@ -322,8 +322,7 @@ static int mtdblock_release(struct mtd_blktrans_dev *mbd) if (!--mtdblk->count) { /* It was the last usage. 
Free the cache */ - if (mbd->mtd->sync) - mtd_sync(mbd->mtd); + mtd_sync(mbd->mtd); vfree(mtdblk->cache_data); } @@ -341,9 +340,7 @@ static int mtdblock_flush(struct mtd_blktrans_dev *dev) mutex_lock(&mtdblk->cache_mutex); write_cached_data(mtdblk); mutex_unlock(&mtdblk->cache_mutex); - - if (dev->mtd->sync) - mtd_sync(dev->mtd); + mtd_sync(dev->mtd); return 0; } diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c index 2020a169ed9..23a51104aeb 100644 --- a/drivers/mtd/mtdchar.c +++ b/drivers/mtd/mtdchar.c @@ -154,7 +154,7 @@ static int mtdchar_close(struct inode *inode, struct file *file) pr_debug("MTD_close\n"); /* Only sync if opened RW */ - if ((file->f_mode & FMODE_WRITE) && mtd->sync) + if ((file->f_mode & FMODE_WRITE)) mtd_sync(mtd); iput(mfi->ino); diff --git a/drivers/mtd/mtdswap.c b/drivers/mtd/mtdswap.c index 4441c08b082..fe4426c1c73 100644 --- a/drivers/mtd/mtdswap.c +++ b/drivers/mtd/mtdswap.c @@ -1047,8 +1047,7 @@ static int mtdswap_flush(struct mtd_blktrans_dev *dev) { struct mtdswap_dev *d = MTDSWAP_MBD_TO_MTDSWAP(dev); - if (d->mtd->sync) - mtd_sync(d->mtd); + mtd_sync(d->mtd); return 0; } diff --git a/drivers/mtd/rfd_ftl.c b/drivers/mtd/rfd_ftl.c index 5426d42cdea..233b946e5d6 100644 --- a/drivers/mtd/rfd_ftl.c +++ b/drivers/mtd/rfd_ftl.c @@ -448,8 +448,7 @@ static int reclaim_block(struct partition *part, u_long *old_sector) int rc; /* we have a race if sync doesn't exist */ - if (part->mbd.mtd->sync) - mtd_sync(part->mbd.mtd); + mtd_sync(part->mbd.mtd); score = 0x7fffffff; /* MAX_INT */ best_block = -1; diff --git a/drivers/mtd/ubi/kapi.c b/drivers/mtd/ubi/kapi.c index 9f265cc1a0d..9fdb35367fe 100644 --- a/drivers/mtd/ubi/kapi.c +++ b/drivers/mtd/ubi/kapi.c @@ -714,9 +714,7 @@ int ubi_sync(int ubi_num) if (!ubi) return -ENODEV; - if (ubi->mtd->sync) - mtd_sync(ubi->mtd); - + mtd_sync(ubi->mtd); ubi_put_device(ubi); return 0; } diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index e78bf3cd1b7..5863a369d92 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -336,9 +336,7 @@ static void jffs2_put_super (struct super_block *sb) jffs2_flash_cleanup(c); kfree(c->inocache_list); jffs2_clear_xattr_subsystem(c); - if (c->mtd->sync) - mtd_sync(c->mtd); - + mtd_sync(c->mtd); D1(printk(KERN_DEBUG "jffs2_put_super returning\n")); } diff --git a/fs/logfs/dev_mtd.c b/fs/logfs/dev_mtd.c index 136c7360a9b..3f465882ee7 100644 --- a/fs/logfs/dev_mtd.c +++ b/fs/logfs/dev_mtd.c @@ -119,8 +119,7 @@ static void logfs_mtd_sync(struct super_block *sb) { struct mtd_info *mtd = logfs_super(sb)->s_mtd; - if (mtd->sync) - mtd_sync(mtd); + mtd_sync(mtd); } static int logfs_mtd_readpage(void *_sb, struct page *page) diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h index a58ecf4d1f8..305f12b940f 100644 --- a/include/linux/mtd/mtd.h +++ b/include/linux/mtd/mtd.h @@ -399,7 +399,8 @@ int mtd_writev(struct mtd_info *mtd, const struct kvec *vecs, static inline void mtd_sync(struct mtd_info *mtd) { - mtd->sync(mtd); + if (mtd->sync) + mtd->sync(mtd); } /* Chip-supported device locking */ -- cgit v1.2.3 From d58b27ed58a30faf376e40d19945f34301944b8d Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Mon, 2 Jan 2012 13:52:14 +0200 Subject: logfs: do not use 'mtd->block_isbad' directly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead, use the new 'mtd_can_have_bb()' helper. 
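The helper itself is not part of this patch; judging from the way it replaces the '!mtd->block_isbad' tests in the hunks below, it is presumably a one-line wrapper of roughly this shape (a reconstruction, not a quote from mtd.h):

static inline int mtd_can_have_bb(const struct mtd_info *mtd)
{
        /* A device can have bad blocks only if the driver can report them. */
        return !!mtd->block_isbad;
}
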
Cc: Jörn Engel Signed-off-by: Artem Bityutskiy Signed-off-by: David Woodhouse --- fs/logfs/dev_mtd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/logfs/dev_mtd.c b/fs/logfs/dev_mtd.c index 3f465882ee7..e97404d611e 100644 --- a/fs/logfs/dev_mtd.c +++ b/fs/logfs/dev_mtd.c @@ -152,7 +152,7 @@ static struct page *logfs_mtd_find_first_sb(struct super_block *sb, u64 *ofs) filler_t *filler = logfs_mtd_readpage; struct mtd_info *mtd = super->s_mtd; - if (!mtd->block_isbad) + if (!mtd_can_have_bb(mtd)) return NULL; *ofs = 0; @@ -172,7 +172,7 @@ static struct page *logfs_mtd_find_last_sb(struct super_block *sb, u64 *ofs) filler_t *filler = logfs_mtd_readpage; struct mtd_info *mtd = super->s_mtd; - if (!mtd->block_isbad) + if (!mtd_can_have_bb(mtd)) return NULL; *ofs = mtd->size - mtd->erasesize; -- cgit v1.2.3 From 800ffd3496987e91f599a135060ef49731e045ac Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Mon, 2 Jan 2012 13:59:12 +0200 Subject: mtd: do not use mtd->block_markbad directly Instead, use the new 'mtd_can_have_bb()', or just rely on 'mtd_block_markbad()' return code, which will be -EOPNOTSUPP if bad blocks are not supported. Signed-off-by: Artem Bityutskiy Signed-off-by: David Woodhouse --- drivers/mtd/mtdchar.c | 5 +---- drivers/mtd/mtdconcat.c | 2 +- drivers/mtd/mtdoops.c | 2 +- drivers/mtd/mtdswap.c | 2 +- fs/jffs2/wbuf.c | 3 --- include/linux/mtd/mtd.h | 2 ++ 6 files changed, 6 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c index 64efcbf087e..50c6a1e7f67 100644 --- a/drivers/mtd/mtdchar.c +++ b/drivers/mtd/mtdchar.c @@ -877,10 +877,7 @@ static int mtdchar_ioctl(struct file *file, u_int cmd, u_long arg) if (copy_from_user(&offs, argp, sizeof(loff_t))) return -EFAULT; - if (!mtd->block_markbad) - ret = -EOPNOTSUPP; - else - return mtd_block_markbad(mtd, offs); + return mtd_block_markbad(mtd, offs); break; } diff --git a/drivers/mtd/mtdconcat.c b/drivers/mtd/mtdconcat.c index fbf3cb124a9..1ed5103b219 100644 --- a/drivers/mtd/mtdconcat.c +++ b/drivers/mtd/mtdconcat.c @@ -673,7 +673,7 @@ static int concat_block_markbad(struct mtd_info *mtd, loff_t ofs) struct mtd_concat *concat = CONCAT(mtd); int i, err = -EINVAL; - if (!concat->subdev[0]->block_markbad) + if (!mtd_can_have_bb(concat->subdev[0])) return 0; if (ofs > mtd->size) diff --git a/drivers/mtd/mtdoops.c b/drivers/mtd/mtdoops.c index a4c8f67560e..db8e8272d69 100644 --- a/drivers/mtd/mtdoops.c +++ b/drivers/mtd/mtdoops.c @@ -199,7 +199,7 @@ badblock: return; } - if (mtd->block_markbad && ret == -EIO) { + if (mtd_can_have_bb(mtd) && ret == -EIO) { ret = mtd_block_markbad(mtd, cxt->nextpage * record_size); if (ret < 0) { printk(KERN_ERR "mtdoops: block_markbad failed, aborting\n"); diff --git a/drivers/mtd/mtdswap.c b/drivers/mtd/mtdswap.c index 3fc8cb2756c..c92f0f6bc13 100644 --- a/drivers/mtd/mtdswap.c +++ b/drivers/mtd/mtdswap.c @@ -274,7 +274,7 @@ static int mtdswap_handle_badblock(struct mtdswap_dev *d, struct swap_eb *eb) eb->root = NULL; /* badblocks not supported */ - if (!d->mtd->block_markbad) + if (!mtd_can_have_bb(d->mtd)) return 1; offset = mtdswap_eb_offset(d, eb); diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c index fd96b757433..30e8f47e8a2 100644 --- a/fs/jffs2/wbuf.c +++ b/fs/jffs2/wbuf.c @@ -1130,9 +1130,6 @@ int jffs2_write_nand_badblock(struct jffs2_sb_info *c, struct jffs2_eraseblock * if( ++jeb->bad_count < MAX_ERASE_FAILURES) return 0; - if (!c->mtd->block_markbad) - return 1; // What else can we do? 
- printk(KERN_WARNING "JFFS2: marking eraseblock at %08x\n as bad", bad_offset); ret = mtd_block_markbad(c->mtd, bad_offset); diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h index 7e35755f693..1a81fde8f33 100644 --- a/include/linux/mtd/mtd.h +++ b/include/linux/mtd/mtd.h @@ -447,6 +447,8 @@ static inline int mtd_block_isbad(struct mtd_info *mtd, loff_t ofs) static inline int mtd_block_markbad(struct mtd_info *mtd, loff_t ofs) { + if (!mtd->block_markbad) + return -EOPNOTSUPP; return mtd->block_markbad(mtd, ofs); } -- cgit v1.2.3 From 074b1d12fe2500d7d453902f9266e6674b30d84c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 9 Jan 2012 13:46:26 -0500 Subject: NFSv4: Change the default setting of the nfs4_disable_idmapping parameter Now that the use of numeric uids/gids is officially sanctioned in RFC3530bis, it is time to change the default here to 'enabled'. By doing so, we ensure that NFSv4 copies the behaviour of NFSv3 when we're using the default AUTH_SYS authentication (i.e. when the client uses the numeric uids/gids as authentication tokens), so that when new files are created, they will appear to have the correct user/group. It also fixes a number of backward compatibility issues when migrating from NFSv3 to NFSv4 on a platform where the server uses different uid/gid mappings than the client. Note also that this setting has been successfully tested against servers that do not support numeric uids/gids at several Connectathon/Bakeathon events at this point, and the fall back to using string names/groups has been shown to work well in all those test cases. Signed-off-by: Trond Myklebust --- Documentation/kernel-parameters.txt | 17 +++++++++++------ fs/nfs/client.c | 2 +- 2 files changed, 12 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 81c287fad79..dfd21cfdf57 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1630,12 +1630,17 @@ bytes respectively. Such letter suffixes can also be entirely omitted. The default is to return 64-bit inode numbers. nfs.nfs4_disable_idmapping= - [NFSv4] When set, this option disables the NFSv4 - idmapper on the client, but only if the mount - is using the 'sec=sys' security flavour. This may - make migration from legacy NFSv2/v3 systems easier - provided that the server has the appropriate support. - The default is to always enable NFSv4 idmapping. + [NFSv4] When set to the default of '1', this option + ensures that both the RPC level authentication + scheme and the NFS level operations agree to use + numeric uids/gids if the mount is using the + 'sec=sys' security flavour. In effect it is + disabling idmapping, which can make migration from + legacy NFSv2/v3 systems to NFSv4 easier. + Servers that do not support this mode of operation + will be autodetected by the client, and it will fall + back to using the idmapper. + To turn off this behaviour, set the value to '0'. nmi_debug= [KNL,AVR32,SH] Specify one or more actions to take when a NMI is triggered. 
diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 41bd67f80d3..277dfaf2e99 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -84,7 +84,7 @@ retry: /* * Turn off NFSv4 uid/gid mapping when using AUTH_SYS */ -static int nfs4_disable_idmapping = 0; +static int nfs4_disable_idmapping = 1; /* * RPC cruft for NFS -- cgit v1.2.3 From 94bf608a18fa4421315275a81c5489734599297a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 9 Jan 2012 15:53:24 -0500 Subject: ext4: fix failure exits a) leaking root dentry is bad b) in case of failed ext4_mb_init() we don't want to do ext4_mb_release() c) OTOH, in the same case we *do* want ext4_ext_release() Signed-off-by: Al Viro --- fs/ext4/super.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 64e2529ae9b..ed3ce82e2de 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3733,10 +3733,12 @@ no_journal: } if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck"); + iput(root); goto failed_mount4; } sb->s_root = d_alloc_root(root); if (!sb->s_root) { + iput(root); ext4_msg(sb, KERN_ERR, "get root dentry failed"); ret = -ENOMEM; goto failed_mount4; @@ -3773,7 +3775,7 @@ no_journal: if (err) { ext4_msg(sb, KERN_ERR, "failed to initialize system " "zone (%d)", err); - goto failed_mount4; + goto failed_mount4a; } ext4_ext_init(sb); @@ -3830,13 +3832,14 @@ cantfind_ext4: failed_mount7: ext4_unregister_li_request(sb); failed_mount6: - ext4_ext_release(sb); -failed_mount5: ext4_mb_release(sb); +failed_mount5: + ext4_ext_release(sb); ext4_release_system_zone(sb); -failed_mount4: - iput(root); +failed_mount4a: + dput(sb->s_root); sb->s_root = NULL; +failed_mount4: ext4_msg(sb, KERN_ERR, "mount failed"); destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq); failed_mount_wq: -- cgit v1.2.3 From 3c5184ef1216dd476c9c67f22a199d90ac4d5892 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 9 Jan 2012 16:34:32 -0500 Subject: ceph: d_alloc_root() may fail ... and ceph_init_dentry(NULL) will oops Signed-off-by: Al Viro --- fs/ceph/super.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 11bd0fc4853..48f61a12af6 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -636,19 +636,26 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc, req->r_num_caps = 2; err = ceph_mdsc_do_request(mdsc, NULL, req); if (err == 0) { + struct inode *inode = req->r_target_inode; + req->r_target_inode = NULL; dout("open_root_inode success\n"); - if (ceph_ino(req->r_target_inode) == CEPH_INO_ROOT && + if (ceph_ino(inode) == CEPH_INO_ROOT && fsc->sb->s_root == NULL) { - root = d_alloc_root(req->r_target_inode); + root = d_alloc_root(inode); + if (!root) { + iput(inode); + root = ERR_PTR(-ENOMEM); + goto out; + } ceph_init_dentry(root); } else { - root = d_obtain_alias(req->r_target_inode); + root = d_obtain_alias(inode); } - req->r_target_inode = NULL; dout("open_root_inode success, root dentry is %p\n", root); } else { root = ERR_PTR(err); } +out: ceph_mdsc_put_request(req); return root; } -- cgit v1.2.3 From b48f03b319ba78f3abf9a7044d1f436d8d90f4f9 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 23 Aug 2011 18:56:24 +1000 Subject: dcache: use a dispose list in select_parent select_parent currently abuses the dentry cache LRU to provide cleanup features for child dentries that need to be freed. 
It moves them to the tail of the LRU, then tells shrink_dcache_parent() to calls __shrink_dcache_sb to unconditionally move them to a dispose list (as DCACHE_REFERENCED is ignored). __shrink_dcache_sb() has to relock the dentries to move them off the LRU onto the dispose list, but otherwise does not touch the dentries that select_parent() moved to the tail of the LRU. It then passses the dispose list to shrink_dentry_list() which tries to free the dentries. IOWs, the use of __shrink_dcache_sb() is superfluous - we can build exactly the same list of dentries for disposal directly in select_parent() and call shrink_dentry_list() instead of calling __shrink_dcache_sb() to do that. This means that we avoid long holds on the lru lock walking the LRU moving dentries to the dispose list We also avoid the need to relock each dentry just to move it off the LRU, reducing the numebr of times we lock each dentry to dispose of them in shrink_dcache_parent() from 3 to 2 times. Further, we remove one of the two callers of __shrink_dcache_sb(). This also means that __shrink_dcache_sb can be moved into back into prune_dcache_sb() and we no longer have to handle referenced dentries conditionally, simplifying the code. Signed-off-by: Dave Chinner Signed-off-by: Linus Torvalds Signed-off-by: Al Viro --- fs/dcache.c | 63 +++++++++++++++++++++---------------------------------------- 1 file changed, 21 insertions(+), 42 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index 9791b1e7eee..b209d73f9a9 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -276,15 +276,15 @@ static void dentry_lru_prune(struct dentry *dentry) } } -static void dentry_lru_move_tail(struct dentry *dentry) +static void dentry_lru_move_list(struct dentry *dentry, struct list_head *list) { spin_lock(&dcache_lru_lock); if (list_empty(&dentry->d_lru)) { - list_add_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru); + list_add_tail(&dentry->d_lru, list); dentry->d_sb->s_nr_dentry_unused++; dentry_stat.nr_unused++; } else { - list_move_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru); + list_move_tail(&dentry->d_lru, list); } spin_unlock(&dcache_lru_lock); } @@ -770,14 +770,18 @@ static void shrink_dentry_list(struct list_head *list) } /** - * __shrink_dcache_sb - shrink the dentry LRU on a given superblock - * @sb: superblock to shrink dentry LRU. - * @count: number of entries to prune - * @flags: flags to control the dentry processing + * prune_dcache_sb - shrink the dcache + * @sb: superblock + * @count: number of entries to try to free + * + * Attempt to shrink the superblock dcache LRU by @count entries. This is + * done when we need more memory an called from the superblock shrinker + * function. * - * If flags contains DCACHE_REFERENCED reference dentries will not be pruned. + * This function may fail to free any resources if all the dentries are in + * use. */ -static void __shrink_dcache_sb(struct super_block *sb, int count, int flags) +void prune_dcache_sb(struct super_block *sb, int count) { struct dentry *dentry; LIST_HEAD(referenced); @@ -796,13 +800,7 @@ relock: goto relock; } - /* - * If we are honouring the DCACHE_REFERENCED flag and the - * dentry has this flag set, don't free it. Clear the flag - * and put it back on the LRU. 
- */ - if (flags & DCACHE_REFERENCED && - dentry->d_flags & DCACHE_REFERENCED) { + if (dentry->d_flags & DCACHE_REFERENCED) { dentry->d_flags &= ~DCACHE_REFERENCED; list_move(&dentry->d_lru, &referenced); spin_unlock(&dentry->d_lock); @@ -821,23 +819,6 @@ relock: shrink_dentry_list(&tmp); } -/** - * prune_dcache_sb - shrink the dcache - * @sb: superblock - * @nr_to_scan: number of entries to try to free - * - * Attempt to shrink the superblock dcache LRU by @nr_to_scan entries. This is - * done when we need more memory an called from the superblock shrinker - * function. - * - * This function may fail to free any resources if all the dentries are in - * use. - */ -void prune_dcache_sb(struct super_block *sb, int nr_to_scan) -{ - __shrink_dcache_sb(sb, nr_to_scan, DCACHE_REFERENCED); -} - /** * shrink_dcache_sb - shrink dcache for a superblock * @sb: superblock @@ -1092,7 +1073,7 @@ EXPORT_SYMBOL(have_submounts); * drop the lock and return early due to latency * constraints. */ -static int select_parent(struct dentry * parent) +static int select_parent(struct dentry *parent, struct list_head *dispose) { struct dentry *this_parent; struct list_head *next; @@ -1114,12 +1095,11 @@ resume: spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); - /* - * move only zero ref count dentries to the end - * of the unused list for prune_dcache + /* + * move only zero ref count dentries to the dispose list. */ if (!dentry->d_count) { - dentry_lru_move_tail(dentry); + dentry_lru_move_list(dentry, dispose); found++; } else { dentry_lru_del(dentry); @@ -1181,14 +1161,13 @@ rename_retry: * * Prune the dcache to remove unused children of the parent dentry. */ - void shrink_dcache_parent(struct dentry * parent) { - struct super_block *sb = parent->d_sb; + LIST_HEAD(dispose); int found; - while ((found = select_parent(parent)) != 0) - __shrink_dcache_sb(sb, found, 0); + while ((found = select_parent(parent, &dispose)) != 0) + shrink_dentry_list(&dispose); } EXPORT_SYMBOL(shrink_dcache_parent); -- cgit v1.2.3 From adc0e91ab142abe93f5b0d7980ada8a7676231fe Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 8 Jan 2012 16:49:21 -0500 Subject: vfs: new helper - d_make_root() d_alloc_root() with iput() in case of allocation failure... 
Signed-off-by: Al Viro --- fs/dcache.c | 17 +++++++++++++++++ include/linux/dcache.h | 1 + 2 files changed, 18 insertions(+) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index b209d73f9a9..3c6d3113a25 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1440,6 +1440,23 @@ struct dentry * d_alloc_root(struct inode * root_inode) } EXPORT_SYMBOL(d_alloc_root); +struct dentry *d_make_root(struct inode *root_inode) +{ + struct dentry *res = NULL; + + if (root_inode) { + static const struct qstr name = { .name = "/", .len = 1 }; + + res = __d_alloc(root_inode->i_sb, &name); + if (res) + d_instantiate(res, root_inode); + else + iput(root_inode); + } + return res; +} +EXPORT_SYMBOL(d_make_root); + static struct dentry * __d_find_any_alias(struct inode *inode) { struct dentry *alias; diff --git a/include/linux/dcache.h b/include/linux/dcache.h index ed9f74f6c51..a47bda5f76d 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -249,6 +249,7 @@ extern int d_invalidate(struct dentry *); /* only used at mount-time */ extern struct dentry * d_alloc_root(struct inode *); +extern struct dentry * d_make_root(struct inode *); /* - the ramfs-type tree */ extern void d_genocide(struct dentry *); -- cgit v1.2.3 From 0b2c4e39c014219ef73f05ab580c284bf8e6af0a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 10 Jan 2012 10:46:03 -0500 Subject: coda: deal correctly with allocation failure from coda_cnode_makectl() lookup should fail with ENOMEM, not silently make dentry negative. Switched to saner calling conventions, while we are at it. Signed-off-by: Al Viro --- fs/coda/cnode.c | 21 +++++++++------------ fs/coda/coda_fs_i.h | 2 +- fs/coda/dir.c | 4 ++-- 3 files changed, 12 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/coda/cnode.c b/fs/coda/cnode.c index 6475877b076..8af67c9c47d 100644 --- a/fs/coda/cnode.c +++ b/fs/coda/cnode.c @@ -156,19 +156,16 @@ struct inode *coda_fid_to_inode(struct CodaFid *fid, struct super_block *sb) } /* the CONTROL inode is made without asking attributes from Venus */ -int coda_cnode_makectl(struct inode **inode, struct super_block *sb) +struct inode *coda_cnode_makectl(struct super_block *sb) { - int error = -ENOMEM; - - *inode = new_inode(sb); - if (*inode) { - (*inode)->i_ino = CTL_INO; - (*inode)->i_op = &coda_ioctl_inode_operations; - (*inode)->i_fop = &coda_ioctl_operations; - (*inode)->i_mode = 0444; - error = 0; + struct inode *inode = new_inode(sb); + if (inode) { + inode->i_ino = CTL_INO; + inode->i_op = &coda_ioctl_inode_operations; + inode->i_fop = &coda_ioctl_operations; + inode->i_mode = 0444; + return inode; } - - return error; + return ERR_PTR(-ENOMEM); } diff --git a/fs/coda/coda_fs_i.h b/fs/coda/coda_fs_i.h index e35071b1de0..1c17446f1af 100644 --- a/fs/coda/coda_fs_i.h +++ b/fs/coda/coda_fs_i.h @@ -51,7 +51,7 @@ struct coda_file_info { int coda_cnode_make(struct inode **, struct CodaFid *, struct super_block *); struct inode *coda_iget(struct super_block *sb, struct CodaFid *fid, struct coda_vattr *attr); -int coda_cnode_makectl(struct inode **inode, struct super_block *sb); +struct inode *coda_cnode_makectl(struct super_block *sb); struct inode *coda_fid_to_inode(struct CodaFid *fid, struct super_block *sb); void coda_replace_fid(struct inode *, struct CodaFid *, struct CodaFid *); diff --git a/fs/coda/dir.c b/fs/coda/dir.c index 83d2fd8ec24..df0f9c1b01d 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -111,7 +111,7 @@ static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, struc /* control object, create 
inode on the fly */ if (coda_isroot(dir) && coda_iscontrol(name, length)) { - error = coda_cnode_makectl(&inode, dir->i_sb); + inode = coda_cnode_makectl(dir->i_sb); type = CODA_NOCACHE; goto exit; } @@ -125,7 +125,7 @@ static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, struc return ERR_PTR(error); exit: - if (inode && (type & CODA_NOCACHE)) + if (inode && !IS_ERR(inode) && (type & CODA_NOCACHE)) coda_flag_inode(inode, C_VATTR | C_PURGE); return d_splice_alias(inode, entry); -- cgit v1.2.3 From f4947fbce208990266920d51837e4e7ba9779db1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 10 Jan 2012 11:11:49 -0500 Subject: coda: switch coda_cnode_make() to sane API as well, clean coda_lookup() Signed-off-by: Al Viro --- fs/coda/cnode.c | 17 +++++++---------- fs/coda/coda_fs_i.h | 2 +- fs/coda/dir.c | 29 +++++++++++++---------------- fs/coda/inode.c | 10 ++++++---- 4 files changed, 27 insertions(+), 31 deletions(-) (limited to 'fs') diff --git a/fs/coda/cnode.c b/fs/coda/cnode.c index 8af67c9c47d..911cf30d057 100644 --- a/fs/coda/cnode.c +++ b/fs/coda/cnode.c @@ -88,24 +88,21 @@ struct inode * coda_iget(struct super_block * sb, struct CodaFid * fid, - link the two up if this is needed - fill in the attributes */ -int coda_cnode_make(struct inode **inode, struct CodaFid *fid, struct super_block *sb) +struct inode *coda_cnode_make(struct CodaFid *fid, struct super_block *sb) { struct coda_vattr attr; + struct inode *inode; int error; /* We get inode numbers from Venus -- see venus source */ error = venus_getattr(sb, fid, &attr); - if ( error ) { - *inode = NULL; - return error; - } + if (error) + return ERR_PTR(error); - *inode = coda_iget(sb, fid, &attr); - if ( IS_ERR(*inode) ) { + inode = coda_iget(sb, fid, &attr); + if (IS_ERR(inode)) printk("coda_cnode_make: coda_iget failed\n"); - return PTR_ERR(*inode); - } - return 0; + return inode; } diff --git a/fs/coda/coda_fs_i.h b/fs/coda/coda_fs_i.h index 1c17446f1af..b24fdfd8a3f 100644 --- a/fs/coda/coda_fs_i.h +++ b/fs/coda/coda_fs_i.h @@ -49,7 +49,7 @@ struct coda_file_info { #define C_DYING 0x4 /* from venus (which died) */ #define C_PURGE 0x8 -int coda_cnode_make(struct inode **, struct CodaFid *, struct super_block *); +struct inode *coda_cnode_make(struct CodaFid *, struct super_block *); struct inode *coda_iget(struct super_block *sb, struct CodaFid *fid, struct coda_vattr *attr); struct inode *coda_cnode_makectl(struct super_block *sb); struct inode *coda_fid_to_inode(struct CodaFid *fid, struct super_block *sb); diff --git a/fs/coda/dir.c b/fs/coda/dir.c index df0f9c1b01d..17751582906 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -96,12 +96,11 @@ const struct file_operations coda_dir_operations = { /* access routines: lookup, readlink, permission */ static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, struct nameidata *nd) { - struct inode *inode = NULL; - struct CodaFid resfid = { { 0, } }; - int type = 0; - int error = 0; + struct super_block *sb = dir->i_sb; const char *name = entry->d_name.name; size_t length = entry->d_name.len; + struct inode *inode; + int type = 0; if (length > CODA_MAXNAMLEN) { printk(KERN_ERR "name too long: lookup, %s (%*s)\n", @@ -111,23 +110,21 @@ static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, struc /* control object, create inode on the fly */ if (coda_isroot(dir) && coda_iscontrol(name, length)) { - inode = coda_cnode_makectl(dir->i_sb); + inode = coda_cnode_makectl(sb); type = CODA_NOCACHE; - goto exit; + } else { + struct CodaFid 
fid = { { 0, } }; + int error = venus_lookup(sb, coda_i2f(dir), name, length, + &type, &fid); + inode = !error ? coda_cnode_make(&fid, sb) : ERR_PTR(error); } - error = venus_lookup(dir->i_sb, coda_i2f(dir), name, length, - &type, &resfid); - if (!error) - error = coda_cnode_make(&inode, &resfid, dir->i_sb); - - if (error && error != -ENOENT) - return ERR_PTR(error); - -exit: - if (inode && !IS_ERR(inode) && (type & CODA_NOCACHE)) + if (!IS_ERR(inode) && (type & CODA_NOCACHE)) coda_flag_inode(inode, C_VATTR | C_PURGE); + if (inode == ERR_PTR(-ENOENT)) + inode = NULL; + return d_splice_alias(inode, entry); } diff --git a/fs/coda/inode.c b/fs/coda/inode.c index 1c08a8cd673..5e2e1b3f068 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -204,10 +204,12 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent) printk("coda_read_super: rootfid is %s\n", coda_f2s(&fid)); /* make root inode */ - error = coda_cnode_make(&root, &fid, sb); - if ( error || !root ) { - printk("Failure of coda_cnode_make for root: error %d\n", error); - goto error; + root = coda_cnode_make(&fid, sb); + if (IS_ERR(root)) { + error = PTR_ERR(root); + printk("Failure of coda_cnode_make for root: error %d\n", error); + root = NULL; + goto error; } printk("coda_read_super: rootinode is %ld dev %s\n", -- cgit v1.2.3 From d50f2ab6f050311dbf7b8f5501b25f0bf64a439b Mon Sep 17 00:00:00 2001 From: Xi Wang Date: Tue, 10 Jan 2012 11:51:10 -0500 Subject: ext4: fix undefined behavior in ext4_fill_flex_info() Commit 503358ae01b70ce6909d19dd01287093f6b6271c ("ext4: avoid divide by zero when trying to mount a corrupted file system") fixes CVE-2009-4307 by performing a sanity check on s_log_groups_per_flex, since it can be set to a bogus value by an attacker. sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; groups_per_flex = 1 << sbi->s_log_groups_per_flex; if (groups_per_flex < 2) { ... } This patch fixes two potential issues in the previous commit. 1) The sanity check might only work on architectures like PowerPC. On x86, 5 bits are used for the shifting amount. That means, given a large s_log_groups_per_flex value like 36, groups_per_flex = 1 << 36 is essentially 1 << 4 = 16, rather than 0. This will bypass the check, leaving s_log_groups_per_flex and groups_per_flex inconsistent. 2) The sanity check relies on undefined behavior, i.e., oversized shift. A standard-confirming C compiler could rewrite the check in unexpected ways. Consider the following equivalent form, assuming groups_per_flex is unsigned for simplicity. groups_per_flex = 1 << sbi->s_log_groups_per_flex; if (groups_per_flex == 0 || groups_per_flex == 1) { We compile the code snippet using Clang 3.0 and GCC 4.6. Clang will completely optimize away the check groups_per_flex == 0, leaving the patched code as vulnerable as the original. GCC keeps the check, but there is no guarantee that future versions will do the same. 
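A minimal userspace sketch of the pattern the fix adopts, with invented function and variable names (this is not the ext4 code): validate the shift exponent itself, so an oversized shift is never evaluated and there is nothing for a compiler to legally optimize away.

#include <stdio.h>

/*
 * Sketch only: never evaluate 1 << n for n >= 32 on a 32-bit quantity.
 * Checking the exponent avoids the undefined shift entirely.
 */
static int groups_per_flex_from_log(unsigned int log_groups_per_flex,
				    unsigned int *groups_per_flex)
{
	if (log_groups_per_flex < 1 || log_groups_per_flex > 31)
		return -1;		/* reject corrupt on-disk values */
	*groups_per_flex = 1U << log_groups_per_flex;
	return 0;
}

int main(void)
{
	unsigned int gpf;

	if (groups_per_flex_from_log(36, &gpf) < 0)
		printf("rejected bogus exponent\n");
	if (groups_per_flex_from_log(4, &gpf) == 0)
		printf("groups_per_flex = %u\n", gpf);	/* prints 16 */
	return 0;
}

Compiled as an ordinary C program, the bogus value 36 is rejected up front instead of silently aliasing to 1 << 4.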
Signed-off-by: Xi Wang Signed-off-by: "Theodore Ts'o" Cc: stable@vger.kernel.org --- fs/ext4/super.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 36570b7af7c..108c3af8617 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2006,17 +2006,16 @@ static int ext4_fill_flex_info(struct super_block *sb) struct ext4_group_desc *gdp = NULL; ext4_group_t flex_group_count; ext4_group_t flex_group; - int groups_per_flex = 0; + unsigned int groups_per_flex = 0; size_t size; int i; sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; - groups_per_flex = 1 << sbi->s_log_groups_per_flex; - - if (groups_per_flex < 2) { + if (sbi->s_log_groups_per_flex < 1 || sbi->s_log_groups_per_flex > 31) { sbi->s_log_groups_per_flex = 0; return 1; } + groups_per_flex = 1 << sbi->s_log_groups_per_flex; /* We allocate both existing and potentially added groups */ flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) + -- cgit v1.2.3 From 3d8eb7a94e8f25a33362f708974ac7daae9e84f8 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 11 Nov 2011 09:48:53 -0800 Subject: ceph: remove unnecessary d_fsdata conditional checks We now set d_fsdata unconditionally on all dentries prior to setting up the d_ops, so all of these checks are unnecessary. Signed-off-by: Sage Weil --- fs/ceph/dir.c | 48 ++++++++++++++++++++---------------------------- fs/ceph/mds_client.c | 4 ++-- 2 files changed, 22 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 98954003a8d..a421555b229 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -973,7 +973,7 @@ static int dentry_lease_is_valid(struct dentry *dentry) spin_lock(&dentry->d_lock); di = ceph_dentry(dentry); - if (di && di->lease_session) { + if (di->lease_session) { s = di->lease_session; spin_lock(&s->s_cap_lock); gen = s->s_cap_gen; @@ -1072,13 +1072,11 @@ static void ceph_d_release(struct dentry *dentry) struct ceph_dentry_info *di = ceph_dentry(dentry); dout("d_release %p\n", dentry); - if (di) { - ceph_dentry_lru_del(dentry); - if (di->lease_session) - ceph_put_mds_session(di->lease_session); - kmem_cache_free(ceph_dentry_cachep, di); - dentry->d_fsdata = NULL; - } + ceph_dentry_lru_del(dentry); + if (di->lease_session) + ceph_put_mds_session(di->lease_session); + kmem_cache_free(ceph_dentry_cachep, di); + dentry->d_fsdata = NULL; } static int ceph_snapdir_d_revalidate(struct dentry *dentry, @@ -1259,13 +1257,11 @@ void ceph_dentry_lru_add(struct dentry *dn) dout("dentry_lru_add %p %p '%.*s'\n", di, dn, dn->d_name.len, dn->d_name.name); - if (di) { - mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; - spin_lock(&mdsc->dentry_lru_lock); - list_add_tail(&di->lru, &mdsc->dentry_lru); - mdsc->num_dentry++; - spin_unlock(&mdsc->dentry_lru_lock); - } + mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; + spin_lock(&mdsc->dentry_lru_lock); + list_add_tail(&di->lru, &mdsc->dentry_lru); + mdsc->num_dentry++; + spin_unlock(&mdsc->dentry_lru_lock); } void ceph_dentry_lru_touch(struct dentry *dn) @@ -1275,12 +1271,10 @@ void ceph_dentry_lru_touch(struct dentry *dn) dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn, dn->d_name.len, dn->d_name.name, di->offset); - if (di) { - mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; - spin_lock(&mdsc->dentry_lru_lock); - list_move_tail(&di->lru, &mdsc->dentry_lru); - spin_unlock(&mdsc->dentry_lru_lock); - } + mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; + spin_lock(&mdsc->dentry_lru_lock); + list_move_tail(&di->lru, 
&mdsc->dentry_lru); + spin_unlock(&mdsc->dentry_lru_lock); } void ceph_dentry_lru_del(struct dentry *dn) @@ -1290,13 +1284,11 @@ void ceph_dentry_lru_del(struct dentry *dn) dout("dentry_lru_del %p %p '%.*s'\n", di, dn, dn->d_name.len, dn->d_name.name); - if (di) { - mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; - spin_lock(&mdsc->dentry_lru_lock); - list_del_init(&di->lru); - mdsc->num_dentry--; - spin_unlock(&mdsc->dentry_lru_lock); - } + mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; + spin_lock(&mdsc->dentry_lru_lock); + list_del_init(&di->lru); + mdsc->num_dentry--; + spin_unlock(&mdsc->dentry_lru_lock); } /* diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 6203d805eb4..23ab6a3f182 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2772,7 +2772,7 @@ static void handle_lease(struct ceph_mds_client *mdsc, di = ceph_dentry(dentry); switch (h->action) { case CEPH_MDS_LEASE_REVOKE: - if (di && di->lease_session == session) { + if (di->lease_session == session) { if (ceph_seq_cmp(di->lease_seq, seq) > 0) h->seq = cpu_to_le32(di->lease_seq); __ceph_mdsc_drop_dentry_lease(dentry); @@ -2781,7 +2781,7 @@ static void handle_lease(struct ceph_mds_client *mdsc, break; case CEPH_MDS_LEASE_RENEW: - if (di && di->lease_session == session && + if (di->lease_session == session && di->lease_gen == session->s_cap_gen && di->lease_renew_from && di->lease_renew_after == 0) { -- cgit v1.2.3 From b8cd952b51034ad9f20ca147507ee68dc641c98c Mon Sep 17 00:00:00 2001 From: Yehuda Sadeh Date: Tue, 13 Dec 2011 09:56:30 -0800 Subject: ceph: dereference pointer after checking for NULL moved dereference after BUG_ON Signed-off-by: Yehuda Sadeh --- fs/ceph/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 87fb132fb33..f556e76c72e 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -851,11 +851,12 @@ static void ceph_set_dentry_offset(struct dentry *dn) { struct dentry *dir = dn->d_parent; struct inode *inode = dir->d_inode; - struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_inode_info *ci; struct ceph_dentry_info *di; BUG_ON(!inode); + ci = ceph_inode(inode); di = ceph_dentry(dn); spin_lock(&ci->i_ceph_lock); -- cgit v1.2.3 From ee6b1baf67591b6d7ce1a6a07544343433d5ec9e Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 3 Jan 2012 08:20:40 -0800 Subject: ceph: avoid useless dget/dput in encode_fh Nothing we do here sleeps, so just do it under d_lock and avoid the dget/ dput entirely. 
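The trade-off behind this change can be sketched as a toy userspace model with invented names (nothing below is ceph or VFS code, and refcount handling is simplified): if the caller may block after dropping the lock, the object has to be pinned with a reference first; if all the work is short and non-sleeping, it can simply be done while the lock is held.

#include <pthread.h>
#include <stdio.h>

/* Toy analogue of a dentry: ->parent is protected by ->lock. */
struct node {
	pthread_mutex_t lock;
	int refcount;
	struct node *parent;
};

/* Pattern 1: the caller may block later, so pin the parent before
 * dropping the lock and release the reference when done. */
static struct node *grab_parent(struct node *n)
{
	struct node *p;

	pthread_mutex_lock(&n->lock);
	p = n->parent;
	p->refcount++;		/* keep the parent alive (simplified) */
	pthread_mutex_unlock(&n->lock);
	return p;		/* caller drops the reference later */
}

/* Pattern 2 (the direction this patch takes): everything we need from
 * the parent is quick and non-blocking, so just read it under the lock. */
static void print_parent(struct node *n)
{
	pthread_mutex_lock(&n->lock);
	printf("parent at %p\n", (void *)n->parent);
	pthread_mutex_unlock(&n->lock);
}

int main(void)
{
	struct node root = { PTHREAD_MUTEX_INITIALIZER, 1, NULL };
	struct node child = { PTHREAD_MUTEX_INITIALIZER, 1, &root };
	struct node *p = grab_parent(&child);

	print_parent(&child);
	p->refcount--;		/* release the pinned parent */
	return 0;
}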
Reported-by: Al Viro Signed-off-by: Sage Weil --- fs/ceph/export.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ceph/export.c b/fs/ceph/export.c index 9fbcdecaacc..fbb2a643ef1 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -56,9 +56,7 @@ static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len, return -EINVAL; spin_lock(&dentry->d_lock); - parent = dget(dentry->d_parent); - spin_unlock(&dentry->d_lock); - + parent = dentry->d_parent; if (*max_len >= connected_handle_length) { dout("encode_fh %p connectable\n", dentry); cfh->ino = ceph_ino(dentry->d_inode); @@ -81,7 +79,7 @@ static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len, *max_len = handle_length; type = 255; } - dput(parent); + spin_unlock(&dentry->d_lock); return type; } -- cgit v1.2.3 From 2ff179e650e95c2b21841b21dc46dc2edefd04cd Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 3 Jan 2012 10:09:07 -0800 Subject: ceph: avoid iput() while holding spinlock in ceph_dir_fsync ceph_mdsc_put_request() can call iput(), which can sleep. Don't do that. Fixes: #1812 Signed-off-by: Sage Weil --- fs/ceph/dir.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index a421555b229..974ef1e4d26 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -1218,6 +1218,7 @@ static int ceph_dir_fsync(struct file *file, loff_t start, loff_t end, do { ceph_mdsc_get_request(req); spin_unlock(&ci->i_unsafe_lock); + dout("dir_fsync %p wait on tid %llu (until %llu)\n", inode, req->r_tid, last_tid); if (req->r_timeout) { @@ -1230,9 +1231,9 @@ static int ceph_dir_fsync(struct file *file, loff_t start, loff_t end, } else { wait_for_completion(&req->r_safe_completion); } - spin_lock(&ci->i_unsafe_lock); ceph_mdsc_put_request(req); + spin_lock(&ci->i_unsafe_lock); if (ret || list_empty(head)) break; req = list_entry(head->next, -- cgit v1.2.3 From eaf5f9073533cde21c7121c136f1c3f072d9cf59 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 10 Jan 2012 18:22:25 +0100 Subject: fix shrink_dcache_parent() livelock Two (or more) concurrent calls of shrink_dcache_parent() on the same dentry may cause shrink_dcache_parent() to loop forever. Here's what appears to happen: 1 - CPU0: select_parent(P) finds C and puts it on dispose list, returns 1 2 - CPU1: select_parent(P) locks P->d_lock 3 - CPU0: shrink_dentry_list() locks C->d_lock dentry_kill(C) tries to lock P->d_lock but fails, unlocks C->d_lock 4 - CPU1: select_parent(P) locks C->d_lock, moves C from dispose list being processed on CPU0 to the new dispose list, returns 1 5 - CPU0: shrink_dentry_list() finds dispose list empty, returns 6 - Goto 2 with CPU0 and CPU1 switched Basically select_parent() steals the dentry from shrink_dentry_list() and thinks it found a new one, causing shrink_dentry_list() to think it's making progress and loop over and over. One way to trigger this is to make udev calls stat() on the sysfs file while it is going away. 
Having a file in /lib/udev/rules.d/ with only this one rule seems to do the trick:

ATTR{vendor}=="0x8086", ATTR{device}=="0x10ca", ENV{PCI_SLOT_NAME}="%k", ENV{MATCHADDR}="$attr{address}", RUN+="/bin/true"

Then execute the following loop:

while true; do
	echo -bond0 > /sys/class/net/bonding_masters
	echo +bond0 > /sys/class/net/bonding_masters
	echo -bond1 > /sys/class/net/bonding_masters
	echo +bond1 > /sys/class/net/bonding_masters
done

One fix would be to check all callers and prevent concurrent calls to shrink_dcache_parent().  But I think a better solution is to stop the stealing behavior.

This patch adds a new dentry flag that is set when the dentry is added to the dispose list.  The flag is cleared in dentry_lru_del() in case the dentry gets a new reference just before being pruned.

If the dentry has this flag, select_parent() will skip it and let shrink_dentry_list() retry pruning it.  With select_parent() skipping those dentries there will not be the appearance of progress (new dentries found) when there is none, hence shrink_dcache_parent() will not loop forever.

The flag is also set in prune_dcache_sb() for consistency, as suggested by Linus.
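A schematic, single-threaded userspace model of the new selection rule, with invented names and no locking (it is not the dcache code): entries already claimed for disposal are skipped rather than stolen, so a scanner cannot mistake someone else's pending work for progress of its own.

#include <stdio.h>

#define FLAG_SHRINK_LIST 0x1	/* analogue of DCACHE_SHRINK_LIST */

struct entry {
	const char *name;
	int count;		/* analogue of d_count */
	unsigned int flags;
};

/* Analogue of the select_parent() loop after the patch: a claimed entry
 * is neither counted as "found" nor moved to the dispose list. */
static int select_for_disposal(struct entry *e, int n)
{
	int found = 0;

	for (int i = 0; i < n; i++) {
		if (e[i].count) {
			/* still referenced: drop from LRU, clear the flag */
			e[i].flags &= ~FLAG_SHRINK_LIST;
		} else if (!(e[i].flags & FLAG_SHRINK_LIST)) {
			e[i].flags |= FLAG_SHRINK_LIST;	/* claim it */
			found++;
			printf("dispose %s\n", e[i].name);
		}
		/* else: another shrinker owns it -- skip, do not steal */
	}
	return found;
}

int main(void)
{
	struct entry entries[] = {
		{ "A", 0, 0 },			/* free: gets claimed */
		{ "B", 0, FLAG_SHRINK_LIST },	/* claimed elsewhere: skipped */
		{ "C", 1, FLAG_SHRINK_LIST },	/* re-referenced: flag cleared */
	};

	printf("found %d\n", select_for_disposal(entries, 3));
	return 0;
}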
We'll be coming back to find diff --git a/include/linux/dcache.h b/include/linux/dcache.h index a47bda5f76d..31f73220e7d 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -203,6 +203,7 @@ struct dentry_operations { #define DCACHE_CANT_MOUNT 0x0100 #define DCACHE_GENOCIDE 0x0200 +#define DCACHE_SHRINK_LIST 0x0400 #define DCACHE_NFSFS_RENAMED 0x1000 /* this dentry has been "silly renamed" and has to be deleted on the last -- cgit v1.2.3 From ace8577aeb438025ecf642f5eda3aa551d251951 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Tue, 10 Jan 2012 02:43:59 +0300 Subject: block_dev: Suppress bdev_cache_init() kmemleak warninig Kmemleak reports the following warning in bdev_cache_init() [ 0.003738] kmemleak: Object 0xffff880153035200 (size 256): [ 0.003823] kmemleak: comm "swapper/0", pid 0, jiffies 4294667299 [ 0.003909] kmemleak: min_count = 1 [ 0.003988] kmemleak: count = 0 [ 0.004066] kmemleak: flags = 0x1 [ 0.004144] kmemleak: checksum = 0 [ 0.004224] kmemleak: backtrace: [ 0.004303] [] kmemleak_alloc+0x21/0x3e [ 0.004446] [] kmem_cache_alloc+0xca/0x1dc [ 0.004592] [] alloc_vfsmnt+0x1f/0x198 [ 0.004736] [] vfs_kern_mount+0x36/0xd2 [ 0.004879] [] kern_mount_data+0x18/0x32 [ 0.005025] [] bdev_cache_init+0x51/0x81 [ 0.005169] [] vfs_caches_init+0x101/0x10d [ 0.005313] [] start_kernel+0x344/0x383 [ 0.005456] [] x86_64_start_reservations+0xae/0xb2 [ 0.005602] [] x86_64_start_kernel+0x102/0x111 [ 0.005747] [] 0xffffffffffffffff [ 0.008653] kmemleak: Trying to color unknown object at 0xffff880153035220 as Grey [ 0.008754] Pid: 0, comm: swapper/0 Not tainted 3.3.0-rc0-dbg-04200-g8180888-dirty #888 [ 0.008856] Call Trace: [ 0.008934] [] ? find_and_get_object+0x44/0x118 [ 0.009023] [] paint_ptr+0x57/0x8f [ 0.009109] [] kmemleak_not_leak+0x23/0x42 [ 0.009195] [] bdev_cache_init+0x72/0x81 [ 0.009282] [] vfs_caches_init+0x101/0x10d [ 0.009368] [] start_kernel+0x344/0x383 [ 0.009466] [] x86_64_start_reservations+0xae/0xb2 [ 0.009555] [] ? early_idt_handlers+0x140/0x140 [ 0.009643] [] x86_64_start_kernel+0x102/0x111 due to attempt to mark pointer to `struct vfsmount' as a gray object, which is embedded into `struct mount' returned from alloc_vfsmnt(). Make `bd_mnt' static, avoiding need to tell kmemleak to mark it gray, as suggested by Al Viro. Signed-off-by: Sergey Senozhatsky Signed-off-by: Al Viro --- fs/block_dev.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index 69a5b6fbee2..afe74dda632 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include #include "internal.h" @@ -521,7 +520,7 @@ static struct super_block *blockdev_superblock __read_mostly; void __init bdev_cache_init(void) { int err; - struct vfsmount *bd_mnt; + static struct vfsmount *bd_mnt; bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode), 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| @@ -533,12 +532,7 @@ void __init bdev_cache_init(void) bd_mnt = kern_mount(&bd_type); if (IS_ERR(bd_mnt)) panic("Cannot create bdev pseudo-fs"); - /* - * This vfsmount structure is only used to obtain the - * blockdev_superblock, so tell kmemleak not to report it. 
- */ - kmemleak_not_leak(bd_mnt); - blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */ + blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */ } /* -- cgit v1.2.3 From b3f2a92447b8443360ac117a3d7c06689562a70c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 10 Jan 2012 17:48:52 -0500 Subject: hfsplus: creation of hidden dir on mount can fail Signed-off-by: Al Viro --- fs/hfsplus/super.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index edf0a801446..427682ca9e4 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -499,9 +499,16 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) if (!sbi->hidden_dir) { mutex_lock(&sbi->vh_mutex); sbi->hidden_dir = hfsplus_new_inode(sb, S_IFDIR); - hfsplus_create_cat(sbi->hidden_dir->i_ino, root, &str, - sbi->hidden_dir); + if (!sbi->hidden_dir) { + mutex_unlock(&sbi->vh_mutex); + err = -ENOMEM; + goto out_put_root; + } + err = hfsplus_create_cat(sbi->hidden_dir->i_ino, root, + &str, sbi->hidden_dir); mutex_unlock(&sbi->vh_mutex); + if (err) + goto out_put_hidden_dir; hfsplus_mark_inode_dirty(sbi->hidden_dir, HFSPLUS_I_CAT_DIRTY); -- cgit v1.2.3 From 5f8aefd44e64ed2f6950a1dcc77309b7dd9979f4 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Tue, 10 Jan 2012 15:07:18 -0800 Subject: mm: account reaped page cache on inode cache pruning Inode cache pruning indirectly reclaims page-cache by invalidating mapping pages. Let's account them into reclaim-state to notice this progress in memory reclaimer. Signed-off-by: Konstantin Khlebnikov Cc: Dave Chinner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/inode.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/inode.c b/fs/inode.c index 87535753ab0..4fa4f0916af 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -776,6 +776,8 @@ void prune_icache_sb(struct super_block *sb, int nr_to_scan) else __count_vm_events(PGINODESTEAL, reap); spin_unlock(&sb->s_inode_lru_lock); + if (current->reclaim_state) + current->reclaim_state->reclaimed_slab += reap; dispose_list(&freeable); } -- cgit v1.2.3 From e3a41a5ba9c2ab988b9f1442925109dca2382fd9 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 10 Jan 2012 15:07:55 -0800 Subject: btrfs: pass __GFP_WRITE for buffered write page allocations Tell the page allocator that pages allocated for a buffered write are expected to become dirty soon. Signed-off-by: Johannes Weiner Reviewed-by: Rik van Riel Acked-by: Mel Gorman Cc: Minchan Kim Cc: Michal Hocko Cc: KAMEZAWA Hiroyuki Cc: Christoph Hellwig Cc: Wu Fengguang Cc: Dave Chinner Cc: Jan Kara Cc: Shaohua Li Cc: Chris Mason Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/btrfs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 97fbe939c05..20375e6691c 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1081,7 +1081,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file, again: for (i = 0; i < num_pages; i++) { pages[i] = find_or_create_page(inode->i_mapping, index + i, - mask); + mask | __GFP_WRITE); if (!pages[i]) { faili = i - 1; err = -ENOMEM; -- cgit v1.2.3 From 43d2b113241d6797b890318767e0af78e313414b Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 10 Jan 2012 15:08:09 -0800 Subject: tracepoint: add tracepoints for debugging oom_score_adj oom_score_adj is used for guarding processes from OOM-Killer. 
One of problem is that it's inherited at fork(). When a daemon set oom_score_adj and make children, it's hard to know where the value is set. This patch adds some tracepoints useful for debugging. This patch adds 3 trace points. - creating new task - renaming a task (exec) - set oom_score_adj To debug, users need to enable some trace pointer. Maybe filtering is useful as # EVENT=/sys/kernel/debug/tracing/events/task/ # echo "oom_score_adj != 0" > $EVENT/task_newtask/filter # echo "oom_score_adj != 0" > $EVENT/task_rename/filter # echo 1 > $EVENT/enable # EVENT=/sys/kernel/debug/tracing/events/oom/ # echo 1 > $EVENT/enable output will be like this. # grep oom /sys/kernel/debug/tracing/trace bash-7699 [007] d..3 5140.744510: oom_score_adj_update: pid=7699 comm=bash oom_score_adj=-1000 bash-7699 [007] ...1 5151.818022: task_newtask: pid=7729 comm=bash clone_flags=1200011 oom_score_adj=-1000 ls-7729 [003] ...2 5151.818504: task_rename: pid=7729 oldcomm=bash newcomm=ls oom_score_adj=-1000 bash-7699 [002] ...1 5175.701468: task_newtask: pid=7730 comm=bash clone_flags=1200011 oom_score_adj=-1000 grep-7730 [007] ...2 5175.701993: task_rename: pid=7730 oldcomm=bash newcomm=grep oom_score_adj=-1000 Signed-off-by: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 4 +++ fs/proc/base.c | 3 +++ include/trace/events/oom.h | 33 ++++++++++++++++++++++++ include/trace/events/task.h | 61 +++++++++++++++++++++++++++++++++++++++++++++ kernel/fork.c | 6 +++++ mm/oom_kill.c | 6 +++++ 6 files changed, 113 insertions(+) create mode 100644 include/trace/events/oom.h create mode 100644 include/trace/events/task.h (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 3f64b9f26e7..aeb135c7ff5 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -59,6 +59,8 @@ #include #include #include + +#include #include "internal.h" int core_uses_pid; @@ -1054,6 +1056,8 @@ void set_task_comm(struct task_struct *tsk, char *buf) { task_lock(tsk); + trace_task_rename(tsk, buf); + /* * Threads may access current->comm without holding * the task lock, so write the string carefully. diff --git a/fs/proc/base.c b/fs/proc/base.c index a1dddda999f..1aab5fe05a1 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -86,6 +86,7 @@ #ifdef CONFIG_HARDWALL #include #endif +#include #include "internal.h" /* NOTE: @@ -1010,6 +1011,7 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, else task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE; + trace_oom_score_adj_update(task); err_sighand: unlock_task_sighand(task, &flags); err_task_lock: @@ -1097,6 +1099,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, task->signal->oom_score_adj = oom_score_adj; if (has_capability_noaudit(current, CAP_SYS_RESOURCE)) task->signal->oom_score_adj_min = oom_score_adj; + trace_oom_score_adj_update(task); /* * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is * always attainable. 
diff --git a/include/trace/events/oom.h b/include/trace/events/oom.h new file mode 100644 index 00000000000..dd4ba3b9200 --- /dev/null +++ b/include/trace/events/oom.h @@ -0,0 +1,33 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM oom + +#if !defined(_TRACE_OOM_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_OOM_H +#include + +TRACE_EVENT(oom_score_adj_update, + + TP_PROTO(struct task_struct *task), + + TP_ARGS(task), + + TP_STRUCT__entry( + __field( pid_t, pid) + __array( char, comm, TASK_COMM_LEN ) + __field( int, oom_score_adj) + ), + + TP_fast_assign( + __entry->pid = task->pid; + memcpy(__entry->comm, task->comm, TASK_COMM_LEN); + __entry->oom_score_adj = task->signal->oom_score_adj; + ), + + TP_printk("pid=%d comm=%s oom_score_adj=%d", + __entry->pid, __entry->comm, __entry->oom_score_adj) +); + +#endif + +/* This part must be outside protection */ +#include diff --git a/include/trace/events/task.h b/include/trace/events/task.h new file mode 100644 index 00000000000..b53add02e92 --- /dev/null +++ b/include/trace/events/task.h @@ -0,0 +1,61 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM task + +#if !defined(_TRACE_TASK_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_TASK_H +#include + +TRACE_EVENT(task_newtask, + + TP_PROTO(struct task_struct *task, unsigned long clone_flags), + + TP_ARGS(task, clone_flags), + + TP_STRUCT__entry( + __field( pid_t, pid) + __array( char, comm, TASK_COMM_LEN) + __field( unsigned long, clone_flags) + __field( int, oom_score_adj) + ), + + TP_fast_assign( + __entry->pid = task->pid; + memcpy(__entry->comm, task->comm, TASK_COMM_LEN); + __entry->clone_flags = clone_flags; + __entry->oom_score_adj = task->signal->oom_score_adj; + ), + + TP_printk("pid=%d comm=%s clone_flags=%lx oom_score_adj=%d", + __entry->pid, __entry->comm, + __entry->clone_flags, __entry->oom_score_adj) +); + +TRACE_EVENT(task_rename, + + TP_PROTO(struct task_struct *task, char *comm), + + TP_ARGS(task, comm), + + TP_STRUCT__entry( + __field( pid_t, pid) + __array( char, oldcomm, TASK_COMM_LEN) + __array( char, newcomm, TASK_COMM_LEN) + __field( int, oom_score_adj) + ), + + TP_fast_assign( + __entry->pid = task->pid; + memcpy(entry->oldcomm, task->comm, TASK_COMM_LEN); + memcpy(entry->newcomm, comm, TASK_COMM_LEN); + __entry->oom_score_adj = task->signal->oom_score_adj; + ), + + TP_printk("pid=%d oldcomm=%s newcomm=%s oom_score_adj=%d", + __entry->pid, __entry->oldcomm, + __entry->newcomm, __entry->oom_score_adj) +); + +#endif + +/* This part must be outside protection */ +#include diff --git a/kernel/fork.c b/kernel/fork.c index b00711ce7c1..5e1391b5ade 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -76,6 +76,9 @@ #include +#define CREATE_TRACE_POINTS +#include + /* * Protected counters by write_lock_irq(&tasklist_lock) */ @@ -1370,6 +1373,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (clone_flags & CLONE_THREAD) threadgroup_change_end(current); perf_event_fork(p); + + trace_task_newtask(p, clone_flags); + return p; bad_fork_free_pid: diff --git a/mm/oom_kill.c b/mm/oom_kill.c index eeb27e27dce..7c122faa05c 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -33,6 +33,10 @@ #include #include #include +#include + +#define CREATE_TRACE_POINTS +#include int sysctl_panic_on_oom; int sysctl_oom_kill_allocating_task; @@ -55,6 +59,7 @@ void compare_swap_oom_score_adj(int old_val, int new_val) spin_lock_irq(&sighand->siglock); if (current->signal->oom_score_adj == old_val) current->signal->oom_score_adj = new_val; + trace_oom_score_adj_update(current); 
spin_unlock_irq(&sighand->siglock); } @@ -74,6 +79,7 @@ int test_set_oom_score_adj(int new_val) spin_lock_irq(&sighand->siglock); old_val = current->signal->oom_score_adj; current->signal->oom_score_adj = new_val; + trace_oom_score_adj_update(current); spin_unlock_irq(&sighand->siglock); return old_val; -- cgit v1.2.3 From e39f560239984c3098237ad94c9449b1494163f8 Mon Sep 17 00:00:00 2001 From: David Daney Date: Tue, 10 Jan 2012 15:10:21 -0800 Subject: fs: binfmt_elf: create Kconfig variable for PIE randomization Randomization of PIE load address is hard coded in binfmt_elf.c for X86 and ARM. Create a new Kconfig variable (CONFIG_ARCH_BINFMT_ELF_RANDOMIZE_PIE) for this and use it instead. Thus architecture specific policy is pushed out of the generic binfmt_elf.c and into the architecture Kconfig files. X86 and ARM Kconfigs are modified to select the new variable so there is no change in behavior. A follow on patch will select it for MIPS too. Signed-off-by: David Daney Cc: Russell King Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Alexander Viro Acked-by: H. Peter Anvin Cc: Ralf Baechle Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/Kconfig | 1 + arch/x86/Kconfig | 1 + fs/Kconfig.binfmt | 3 +++ fs/binfmt_elf.c | 2 +- 4 files changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 9d66dfc33a5..98a6459cd39 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -16,6 +16,7 @@ config ARM select HAVE_FTRACE_MCOUNT_RECORD if (!XIP_KERNEL) select HAVE_DYNAMIC_FTRACE if (!XIP_KERNEL) select HAVE_FUNCTION_GRAPH_TRACER if (!THUMB2_KERNEL) + select ARCH_BINFMT_ELF_RANDOMIZE_PIE select HAVE_GENERIC_DMA_COHERENT select HAVE_KERNEL_GZIP select HAVE_KERNEL_LZO diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 1d2a69dd36d..d6ddc0bfe36 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -62,6 +62,7 @@ config X86 select ANON_INODES select HAVE_ARCH_KMEMCHECK select HAVE_USER_RETURN_NOTIFIER + select ARCH_BINFMT_ELF_RANDOMIZE_PIE select HAVE_ARCH_JUMP_LABEL select HAVE_TEXT_POKE_SMP select HAVE_GENERIC_HARDIRQS diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index 79e2ca7973b..e95d1b64082 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -27,6 +27,9 @@ config COMPAT_BINFMT_ELF bool depends on COMPAT && BINFMT_ELF +config ARCH_BINFMT_ELF_RANDOMIZE_PIE + bool + config BINFMT_ELF_FDPIC bool "Kernel support for FDPIC ELF binaries" default y diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 21ac5ee4b43..bcb884e2d61 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -794,7 +794,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) * default mmap base, as well as whatever program they * might try to exec. This is because the brk will * follow the loader, and is not movable. */ -#if defined(CONFIG_X86) || defined(CONFIG_ARM) +#ifdef CONFIG_ARCH_BINFMT_ELF_RANDOMIZE_PIE /* Memory randomization might have been switched off * in runtime via sysctl. 
* If that is the case, retain the original non-zero -- cgit v1.2.3 From b18c1c6e0c90cbcd38ba879bd63a44c94e4f7301 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 10 Jan 2012 15:11:05 -0800 Subject: reiserfs: delete comments referring to the BKL Signed-off-by: Davidlohr Bueso Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/journal.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index eb711060a6f..cce8e8710df 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -2896,14 +2896,13 @@ int journal_transaction_should_end(struct reiserfs_transaction_handle *th, journal->j_cnode_free < (journal->j_trans_max * 3)) { return 1; } - /* protected by the BKL here */ + journal->j_len_alloc += new_alloc; th->t_blocks_allocated += new_alloc ; return 0; } -/* this must be called inside a transaction, and requires the -** kernel_lock to be held +/* this must be called inside a transaction */ void reiserfs_block_writes(struct reiserfs_transaction_handle *th) { @@ -2914,8 +2913,7 @@ void reiserfs_block_writes(struct reiserfs_transaction_handle *th) return; } -/* this must be called without a transaction started, and does not -** require BKL +/* this must be called without a transaction started */ void reiserfs_allow_writes(struct super_block *s) { @@ -2924,8 +2922,7 @@ void reiserfs_allow_writes(struct super_block *s) wake_up(&journal->j_join_wait); } -/* this must be called without a transaction started, and does not -** require BKL +/* this must be called without a transaction started */ void reiserfs_wait_on_write_block(struct super_block *s) { -- cgit v1.2.3 From f32485be8397ad811312bc055d2e2a5906bc7576 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 10 Jan 2012 15:11:07 -0800 Subject: reiserfs: delay reiserfs lock until journal initialization In the mount path, transactions that are made before journal initialization don't involve the filesystem. We can delay the reiserfs lock until we play with the journal. Signed-off-by: Frederic Weisbecker Cc: Al Viro Cc: Christoph Hellwig Cc: Jeff Mahoney Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/bitmap.c | 3 --- fs/reiserfs/super.c | 43 ++++++++++++++++++++++++------------------- 2 files changed, 24 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c index a945cd26522..70de42f09f1 100644 --- a/fs/reiserfs/bitmap.c +++ b/fs/reiserfs/bitmap.c @@ -1364,10 +1364,7 @@ int reiserfs_init_bitmap_cache(struct super_block *sb) struct reiserfs_bitmap_info *bitmap; unsigned int bmap_nr = reiserfs_bmap_count(sb); - /* Avoid lock recursion in fault case */ - reiserfs_write_unlock(sb); bitmap = vmalloc(sizeof(*bitmap) * bmap_nr); - reiserfs_write_lock(sb); if (bitmap == NULL) return -ENOMEM; diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 1d42e707d5f..620dd5d9e1b 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -1746,22 +1746,11 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) mutex_init(&REISERFS_SB(s)->lock); REISERFS_SB(s)->lock_depth = -1; - /* - * This function is called with the bkl, which also was the old - * locking used here. - * do_journal_begin() will soon check if we hold the lock (ie: was the - * bkl). 
This is likely because do_journal_begin() has several another - * callers because at this time, it doesn't seem to be necessary to - * protect against anything. - * Anyway, let's be conservative and lock for now. - */ - reiserfs_write_lock(s); - jdev_name = NULL; if (reiserfs_parse_options (s, (char *)data, &(sbi->s_mount_opt), &blocks, &jdev_name, &commit_max_age, qf_names, &qfmt) == 0) { - goto error; + goto error_unlocked; } if (jdev_name && jdev_name[0]) { REISERFS_SB(s)->s_jdev = kstrdup(jdev_name, GFP_KERNEL); @@ -1777,7 +1766,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) if (blocks) { SWARN(silent, s, "jmacd-7", "resize option for remount only"); - goto error; + goto error_unlocked; } /* try old format (undistributed bitmap, super block in 8-th 1k block of a device) */ @@ -1787,7 +1776,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) else if (read_super_block(s, REISERFS_DISK_OFFSET_IN_BYTES)) { SWARN(silent, s, "sh-2021", "can not find reiserfs on %s", reiserfs_bdevname(s)); - goto error; + goto error_unlocked; } rs = SB_DISK_SUPER_BLOCK(s); @@ -1803,7 +1792,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) "or increase size of your LVM partition"); SWARN(silent, s, "", "Or may be you forgot to " "reboot after fdisk when it told you to"); - goto error; + goto error_unlocked; } sbi->s_mount_state = SB_REISERFS_STATE(s); @@ -1811,8 +1800,9 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) if ((errval = reiserfs_init_bitmap_cache(s))) { SWARN(silent, s, "jmacd-8", "unable to read bitmap"); - goto error; + goto error_unlocked; } + errval = -EINVAL; #ifdef CONFIG_REISERFS_CHECK SWARN(silent, s, "", "CONFIG_REISERFS_CHECK is set ON"); @@ -1835,6 +1825,17 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) if (reiserfs_barrier_flush(s)) { printk("reiserfs: using flush barriers\n"); } + + /* + * This path assumed to be called with the BKL in the old times. + * Now we have inherited the big reiserfs lock from it and many + * reiserfs helpers called in the mount path and elsewhere require + * this lock to be held even if it's not always necessary. Let's be + * conservative and hold it early. The window can be reduced after + * careful review of the code. + */ + reiserfs_write_lock(s); + // set_device_ro(s->s_dev, 1) ; if (journal_init(s, jdev_name, old_format, commit_max_age)) { SWARN(silent, s, "sh-2022", @@ -1995,12 +1996,16 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) return (0); error: - if (jinit_done) { /* kill the commit thread, free journal ram */ + reiserfs_write_unlock(s); + +error_unlocked: + /* kill the commit thread, free journal ram */ + if (jinit_done) { + reiserfs_write_lock(s); journal_release_error(NULL, s); + reiserfs_write_unlock(s); } - reiserfs_write_unlock(s); - reiserfs_free_bitmap_cache(s); if (SB_BUFFER_WITH_SB(s)) brelse(SB_BUFFER_WITH_SB(s)); -- cgit v1.2.3 From 37c69b98d0dca54d9eb72226bbf2e211aaaf126e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 10 Jan 2012 15:11:09 -0800 Subject: reiserfs: don't lock journal_init() journal_init() doesn't need the lock since no operation on the filesystem is involved there. journal_read() and get_list_bitmap() have yet to be reviewed carefully though before removing the lock there. Just keep the it around these two calls for safety. 
Signed-off-by: Frederic Weisbecker Cc: Al Viro Cc: Christoph Hellwig Cc: Jeff Mahoney Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/journal.c | 53 +++++++++++++++++++-------------------------------- fs/reiserfs/super.c | 21 ++++++++++---------- 2 files changed, 31 insertions(+), 43 deletions(-) (limited to 'fs') diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index cce8e8710df..c3cf54fd4de 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -2678,16 +2678,10 @@ int journal_init(struct super_block *sb, const char *j_dev_name, char b[BDEVNAME_SIZE]; int ret; - /* - * Unlock here to avoid various RECLAIM-FS-ON <-> IN-RECLAIM-FS - * dependency inversion warnings. - */ - reiserfs_write_unlock(sb); journal = SB_JOURNAL(sb) = vzalloc(sizeof(struct reiserfs_journal)); if (!journal) { reiserfs_warning(sb, "journal-1256", "unable to get memory for journal structure"); - reiserfs_write_lock(sb); return 1; } INIT_LIST_HEAD(&journal->j_bitmap_nodes); @@ -2695,10 +2689,8 @@ int journal_init(struct super_block *sb, const char *j_dev_name, INIT_LIST_HEAD(&journal->j_working_list); INIT_LIST_HEAD(&journal->j_journal_list); journal->j_persistent_trans = 0; - ret = reiserfs_allocate_list_bitmaps(sb, journal->j_list_bitmap, - reiserfs_bmap_count(sb)); - reiserfs_write_lock(sb); - if (ret) + if (reiserfs_allocate_list_bitmaps(sb, journal->j_list_bitmap, + reiserfs_bmap_count(sb))) goto free_and_return; allocate_bitmap_nodes(sb); @@ -2727,27 +2719,11 @@ int journal_init(struct super_block *sb, const char *j_dev_name, goto free_and_return; } - /* - * We need to unlock here to avoid creating the following - * dependency: - * reiserfs_lock -> sysfs_mutex - * Because the reiserfs mmap path creates the following dependency: - * mm->mmap -> reiserfs_lock, hence we have - * mm->mmap -> reiserfs_lock ->sysfs_mutex - * This would ends up in a circular dependency with sysfs readdir path - * which does sysfs_mutex -> mm->mmap_sem - * This is fine because the reiserfs lock is useless in mount path, - * at least until we call journal_begin. We keep it for paranoid - * reasons. - */ - reiserfs_write_unlock(sb); if (journal_init_dev(sb, journal, j_dev_name) != 0) { - reiserfs_write_lock(sb); reiserfs_warning(sb, "sh-462", "unable to initialize jornal device"); goto free_and_return; } - reiserfs_write_lock(sb); rs = SB_DISK_SUPER_BLOCK(sb); @@ -2829,9 +2805,7 @@ int journal_init(struct super_block *sb, const char *j_dev_name, journal->j_mount_id = 10; journal->j_state = 0; atomic_set(&(journal->j_jlock), 0); - reiserfs_write_unlock(sb); journal->j_cnode_free_list = allocate_cnodes(num_cnodes); - reiserfs_write_lock(sb); journal->j_cnode_free_orig = journal->j_cnode_free_list; journal->j_cnode_free = journal->j_cnode_free_list ? num_cnodes : 0; journal->j_cnode_used = 0; @@ -2848,24 +2822,37 @@ int journal_init(struct super_block *sb, const char *j_dev_name, init_journal_hash(sb); jl = journal->j_current_jl; + + /* + * get_list_bitmap() may call flush_commit_list() which + * requires the lock. Calling flush_commit_list() shouldn't happen + * this early but I like to be paranoid. + */ + reiserfs_write_lock(sb); jl->j_list_bitmap = get_list_bitmap(sb, jl); + reiserfs_write_unlock(sb); if (!jl->j_list_bitmap) { reiserfs_warning(sb, "journal-2005", "get_list_bitmap failed for journal list 0"); goto free_and_return; } - if (journal_read(sb) < 0) { + + /* + * Journal_read needs to be inspected in order to push down + * the lock further inside (or even remove it). 
+ */ + reiserfs_write_lock(sb); + ret = journal_read(sb); + reiserfs_write_unlock(sb); + if (ret < 0) { reiserfs_warning(sb, "reiserfs-2006", "Replay Failure, unable to mount"); goto free_and_return; } reiserfs_mounted_fs_count++; - if (reiserfs_mounted_fs_count <= 1) { - reiserfs_write_unlock(sb); + if (reiserfs_mounted_fs_count <= 1) commit_wq = alloc_workqueue("reiserfs", WQ_MEM_RECLAIM, 0); - reiserfs_write_lock(sb); - } INIT_DELAYED_WORK(&journal->j_work, flush_async_commits); journal->j_work_sb = sb; diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 620dd5d9e1b..61b60380d2c 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -1826,6 +1826,17 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) printk("reiserfs: using flush barriers\n"); } + // set_device_ro(s->s_dev, 1) ; + if (journal_init(s, jdev_name, old_format, commit_max_age)) { + SWARN(silent, s, "sh-2022", + "unable to initialize journal space"); + goto error_unlocked; + } else { + jinit_done = 1; /* once this is set, journal_release must be called + ** if we error out of the mount + */ + } + /* * This path assumed to be called with the BKL in the old times. * Now we have inherited the big reiserfs lock from it and many @@ -1836,16 +1847,6 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) */ reiserfs_write_lock(s); - // set_device_ro(s->s_dev, 1) ; - if (journal_init(s, jdev_name, old_format, commit_max_age)) { - SWARN(silent, s, "sh-2022", - "unable to initialize journal space"); - goto error; - } else { - jinit_done = 1; /* once this is set, journal_release must be called - ** if we error out of the mount - */ - } if (reread_meta_blocks(s)) { SWARN(silent, s, "jmacd-9", "unable to reread meta blocks after journal init"); -- cgit v1.2.3 From 9b467e6ebebbe75288aeb7e816ffbb5d35d6eaa3 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 10 Jan 2012 15:11:11 -0800 Subject: reiserfs: don't lock root inode searching Nothing requires that we lock the filesystem until the root inode is provided. Also iget5_locked() triggers a warning because we are holding the filesystem lock while allocating the inode, which result in a lockdep suspicion that we have a lock inversion against the reclaim path: [ 1986.896979] ================================= [ 1986.896990] [ INFO: inconsistent lock state ] [ 1986.896997] 3.1.1-main #8 [ 1986.897001] --------------------------------- [ 1986.897007] inconsistent {RECLAIM_FS-ON-W} -> {IN-RECLAIM_FS-W} usage. 
[ 1986.897016] kswapd0/16 [HC0[0]:SC0[0]:HE1:SE1] takes: [ 1986.897023] (&REISERFS_SB(s)->lock){+.+.?.}, at: [] reiserfs_write_lock+0x20/0x2a [ 1986.897044] {RECLAIM_FS-ON-W} state was registered at: [ 1986.897050] [] mark_held_locks+0xae/0xd0 [ 1986.897060] [] lockdep_trace_alloc+0x7d/0x91 [ 1986.897068] [] kmem_cache_alloc+0x1a/0x93 [ 1986.897078] [] reiserfs_alloc_inode+0x13/0x3d [ 1986.897088] [] alloc_inode+0x14/0x5f [ 1986.897097] [] iget5_locked+0x62/0x13a [ 1986.897106] [] reiserfs_fill_super+0x410/0x8b9 [ 1986.897114] [] mount_bdev+0x10b/0x159 [ 1986.897123] [] get_super_block+0x10/0x12 [ 1986.897131] [] mount_fs+0x59/0x12d [ 1986.897138] [] vfs_kern_mount+0x45/0x7a [ 1986.897147] [] do_kern_mount+0x2f/0xb0 [ 1986.897155] [] do_mount+0x5c2/0x612 [ 1986.897163] [] sys_mount+0x61/0x8f [ 1986.897170] [] sysenter_do_call+0x12/0x32 [ 1986.897181] irq event stamp: 7509691 [ 1986.897186] hardirqs last enabled at (7509691): [] kmem_cache_alloc+0x6e/0x93 [ 1986.897197] hardirqs last disabled at (7509690): [] kmem_cache_alloc+0x24/0x93 [ 1986.897209] softirqs last enabled at (7508896): [] __do_softirq+0xee/0xfd [ 1986.897222] softirqs last disabled at (7508859): [] do_softirq+0x50/0x9d [ 1986.897234] [ 1986.897235] other info that might help us debug this: [ 1986.897242] Possible unsafe locking scenario: [ 1986.897244] [ 1986.897250] CPU0 [ 1986.897254] ---- [ 1986.897257] lock(&REISERFS_SB(s)->lock); [ 1986.897265] [ 1986.897269] lock(&REISERFS_SB(s)->lock); [ 1986.897276] [ 1986.897277] *** DEADLOCK *** [ 1986.897278] [ 1986.897286] no locks held by kswapd0/16. [ 1986.897291] [ 1986.897292] stack backtrace: [ 1986.897299] Pid: 16, comm: kswapd0 Not tainted 3.1.1-main #8 [ 1986.897306] Call Trace: [ 1986.897314] [] ? printk+0xf/0x11 [ 1986.897324] [] print_usage_bug+0x20e/0x21a [ 1986.897332] [] ? print_irq_inversion_bug+0x172/0x172 [ 1986.897341] [] mark_lock+0x27f/0x483 [ 1986.897349] [] __lock_acquire+0x628/0x1472 [ 1986.897358] [] lock_acquire+0x47/0x5e [ 1986.897366] [] ? reiserfs_write_lock+0x20/0x2a [ 1986.897384] [] ? reiserfs_write_lock+0x20/0x2a [ 1986.897397] [] mutex_lock_nested+0x35/0x26f [ 1986.897409] [] ? reiserfs_write_lock+0x20/0x2a [ 1986.897421] [] reiserfs_write_lock+0x20/0x2a [ 1986.897433] [] map_block_for_writepage+0xc9/0x590 [ 1986.897448] [] ? create_empty_buffers+0x33/0x8f [ 1986.897461] [] ? get_parent_ip+0xb/0x31 [ 1986.897472] [] ? sub_preempt_count+0x81/0x8e [ 1986.897485] [] ? _raw_spin_unlock+0x27/0x3d [ 1986.897496] [] ? get_parent_ip+0xb/0x31 [ 1986.897508] [] reiserfs_writepage+0x1b9/0x3e7 [ 1986.897521] [] ? clear_page_dirty_for_io+0xcb/0xde [ 1986.897533] [] ? trace_hardirqs_on_caller+0x108/0x138 [ 1986.897546] [] ? trace_hardirqs_on+0xb/0xd [ 1986.897559] [] shrink_page_list+0x34f/0x5e2 [ 1986.897572] [] shrink_inactive_list+0x172/0x22c [ 1986.897585] [] shrink_zone+0x303/0x3b1 [ 1986.897597] [] ? _raw_spin_unlock+0x27/0x3d [ 1986.897611] [] kswapd+0x3b7/0x5f2 The deadlock shouldn't happen since we are doing that allocation in the mount path, the filesystem is not available for any reclaim. Still the warning is annoying. To solve this, acquire the lock later only where we need it, right before calling reiserfs_read_locked_inode() that wants to lock to walk the tree. 
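The dependency behind the warning can be sketched with a toy userspace model using invented names (this is not reiserfs code): an allocation made while holding a lock may, under memory pressure, recurse into a reclaim path that wants the same lock, so the lock is taken only after the allocation.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

/* Toy stand-ins: fs_lock ~ the reiserfs write lock, reclaim() ~ the
 * writeback path that also needs it, alloc_inode() ~ the allocation in
 * the inode lookup that may dive into reclaim under memory pressure. */
static pthread_mutex_t fs_lock = PTHREAD_MUTEX_INITIALIZER;

static void reclaim(void)
{
	pthread_mutex_lock(&fs_lock);	/* reclaim wants the fs lock too */
	/* ... write back and free cached objects ... */
	pthread_mutex_unlock(&fs_lock);
}

static void *alloc_inode(void)
{
	void *p = malloc(64);

	if (!p)
		reclaim();	/* allocation can recurse into reclaim */
	return p;
}

int main(void)
{
	/*
	 * Problematic order (what lockdep flagged): take fs_lock, then
	 * allocate -- reclaim would need fs_lock a second time.
	 *
	 * Fixed order: allocate first, then lock only for the work that
	 * really needs it (the tree walk).
	 */
	void *inode = alloc_inode();

	pthread_mutex_lock(&fs_lock);
	/* ... read the inode from the tree ... */
	pthread_mutex_unlock(&fs_lock);

	free(inode);
	return 0;
}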
Reported-by: Knut Petersen Signed-off-by: Frederic Weisbecker Cc: Al Viro Cc: Christoph Hellwig Cc: Jeff Mahoney Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/super.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 61b60380d2c..e12d8b97cd4 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -1519,9 +1519,7 @@ static int read_super_block(struct super_block *s, int offset) static int reread_meta_blocks(struct super_block *s) { ll_rw_block(READ, 1, &(SB_BUFFER_WITH_SB(s))); - reiserfs_write_unlock(s); wait_on_buffer(SB_BUFFER_WITH_SB(s)); - reiserfs_write_lock(s); if (!buffer_uptodate(SB_BUFFER_WITH_SB(s))) { reiserfs_warning(s, "reiserfs-2504", "error reading the super"); return 1; @@ -1837,24 +1835,14 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) */ } - /* - * This path assumed to be called with the BKL in the old times. - * Now we have inherited the big reiserfs lock from it and many - * reiserfs helpers called in the mount path and elsewhere require - * this lock to be held even if it's not always necessary. Let's be - * conservative and hold it early. The window can be reduced after - * careful review of the code. - */ - reiserfs_write_lock(s); - if (reread_meta_blocks(s)) { SWARN(silent, s, "jmacd-9", "unable to reread meta blocks after journal init"); - goto error; + goto error_unlocked; } if (replay_only(s)) - goto error; + goto error_unlocked; if (bdev_read_only(s->s_bdev) && !(s->s_flags & MS_RDONLY)) { SWARN(silent, s, "clm-7000", @@ -1868,9 +1856,19 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) reiserfs_init_locked_inode, (void *)(&args)); if (!root_inode) { SWARN(silent, s, "jmacd-10", "get root inode failed"); - goto error; + goto error_unlocked; } + /* + * This path assumed to be called with the BKL in the old times. + * Now we have inherited the big reiserfs lock from it and many + * reiserfs helpers called in the mount path and elsewhere require + * this lock to be held even if it's not always necessary. Let's be + * conservative and hold it early. The window can be reduced after + * careful review of the code. + */ + reiserfs_write_lock(s); + if (root_inode->i_state & I_NEW) { reiserfs_read_locked_inode(root_inode, &args); unlock_new_inode(root_inode); -- cgit v1.2.3 From 7773fbc54182a90cd248656619c7d33859e5f91d Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Tue, 10 Jan 2012 15:11:20 -0800 Subject: procfs: make proc_get_link to use dentry instead of inode Prepare the ground for the next "map_files" patch which needs a name of a link file to analyse. Signed-off-by: Cyrill Gorcunov Cc: Pavel Emelyanov Cc: Tejun Heo Cc: Vasiliy Kulikov Cc: "Kirill A. 
Shutemov" Cc: Alexey Dobriyan Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 20 ++++++++++---------- include/linux/proc_fs.h | 2 +- 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index 1aab5fe05a1..e31d95055c6 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -166,9 +166,9 @@ static int get_task_root(struct task_struct *task, struct path *root) return result; } -static int proc_cwd_link(struct inode *inode, struct path *path) +static int proc_cwd_link(struct dentry *dentry, struct path *path) { - struct task_struct *task = get_proc_task(inode); + struct task_struct *task = get_proc_task(dentry->d_inode); int result = -ENOENT; if (task) { @@ -183,9 +183,9 @@ static int proc_cwd_link(struct inode *inode, struct path *path) return result; } -static int proc_root_link(struct inode *inode, struct path *path) +static int proc_root_link(struct dentry *dentry, struct path *path) { - struct task_struct *task = get_proc_task(inode); + struct task_struct *task = get_proc_task(dentry->d_inode); int result = -ENOENT; if (task) { @@ -1456,13 +1456,13 @@ static const struct file_operations proc_pid_set_comm_operations = { .release = single_release, }; -static int proc_exe_link(struct inode *inode, struct path *exe_path) +static int proc_exe_link(struct dentry *dentry, struct path *exe_path) { struct task_struct *task; struct mm_struct *mm; struct file *exe_file; - task = get_proc_task(inode); + task = get_proc_task(dentry->d_inode); if (!task) return -ENOENT; mm = get_task_mm(task); @@ -1492,7 +1492,7 @@ static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd) if (!proc_fd_access_allowed(inode)) goto out; - error = PROC_I(inode)->op.proc_get_link(inode, &nd->path); + error = PROC_I(inode)->op.proc_get_link(dentry, &nd->path); out: return ERR_PTR(error); } @@ -1531,7 +1531,7 @@ static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int b if (!proc_fd_access_allowed(inode)) goto out; - error = PROC_I(inode)->op.proc_get_link(inode, &path); + error = PROC_I(inode)->op.proc_get_link(dentry, &path); if (error) goto out; @@ -1823,9 +1823,9 @@ static int proc_fd_info(struct inode *inode, struct path *path, char *info) return -ENOENT; } -static int proc_fd_link(struct inode *inode, struct path *path) +static int proc_fd_link(struct dentry *dentry, struct path *path) { - return proc_fd_info(inode, path, NULL); + return proc_fd_info(dentry->d_inode, path, NULL); } static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd) diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 6d9e575519c..85c50730623 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -253,7 +253,7 @@ extern const struct proc_ns_operations utsns_operations; extern const struct proc_ns_operations ipcns_operations; union proc_op { - int (*proc_get_link)(struct inode *, struct path *); + int (*proc_get_link)(struct dentry *, struct path *); int (*proc_read)(struct task_struct *task, char *page); int (*proc_show)(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, -- cgit v1.2.3 From 640708a2cff7f81e246243b0073c66e6ece7e53e Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Tue, 10 Jan 2012 15:11:23 -0800 Subject: procfs: introduce the /proc//map_files/ directory This one behaves similarly to the /proc//fd/ one - it contains symlinks one for each mapping with file, the name of a symlink is "vma->vm_start-vma->vm_end", the target is the 
file. Opening a symlink results in a file that point exactly to the same inode as them vma's one. For example the ls -l of some arbitrary /proc//map_files/ | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80403000-7f8f80404000 -> /lib64/libc-2.5.so | lr-x------ 1 root root 64 Aug 26 06:40 7f8f8061e000-7f8f80620000 -> /lib64/libselinux.so.1 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80826000-7f8f80827000 -> /lib64/libacl.so.1.1.0 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80a2f000-7f8f80a30000 -> /lib64/librt-2.5.so | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80a30000-7f8f80a4c000 -> /lib64/ld-2.5.so This *helps* checkpointing process in three ways: 1. When dumping a task mappings we do know exact file that is mapped by particular region. We do this by opening /proc/$pid/map_files/$address symlink the way we do with file descriptors. 2. This also helps in determining which anonymous shared mappings are shared with each other by comparing the inodes of them. 3. When restoring a set of processes in case two of them has a mapping shared, we map the memory by the 1st one and then open its /proc/$pid/map_files/$address file and map it by the 2nd task. Using /proc/$pid/maps for this is quite inconvenient since it brings repeatable re-reading and reparsing for this text file which slows down restore procedure significantly. Also as being pointed in (3) it is a way easier to use top level shared mapping in children as /proc/$pid/map_files/$address when needed. [akpm@linux-foundation.org: coding-style fixes] [gorcunov@openvz.org: make map_files depend on CHECKPOINT_RESTORE] Signed-off-by: Pavel Emelyanov Signed-off-by: Cyrill Gorcunov Reviewed-by: Vasiliy Kulikov Reviewed-by: "Kirill A. Shutemov" Cc: Tejun Heo Cc: Alexey Dobriyan Cc: Al Viro Cc: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 355 +++++++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/mm.h | 12 ++ 2 files changed, 367 insertions(+) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index e31d95055c6..4d755fed3ec 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -83,6 +83,7 @@ #include #include #include +#include #ifdef CONFIG_HARDWALL #include #endif @@ -134,6 +135,8 @@ struct pid_entry { NULL, &proc_single_file_operations, \ { .proc_show = show } ) +static int proc_fd_permission(struct inode *inode, int mask); + /* * Count the number of hardlinks for the pid_entry table, excluding the . * and .. links. @@ -2046,6 +2049,355 @@ static const struct file_operations proc_fd_operations = { .llseek = default_llseek, }; +#ifdef CONFIG_CHECKPOINT_RESTORE + +/* + * dname_to_vma_addr - maps a dentry name into two unsigned longs + * which represent vma start and end addresses. 
+ */ +static int dname_to_vma_addr(struct dentry *dentry, + unsigned long *start, unsigned long *end) +{ + if (sscanf(dentry->d_name.name, "%lx-%lx", start, end) != 2) + return -EINVAL; + + return 0; +} + +static int map_files_d_revalidate(struct dentry *dentry, struct nameidata *nd) +{ + unsigned long vm_start, vm_end; + bool exact_vma_exists = false; + struct mm_struct *mm = NULL; + struct task_struct *task; + const struct cred *cred; + struct inode *inode; + int status = 0; + + if (nd && nd->flags & LOOKUP_RCU) + return -ECHILD; + + if (!capable(CAP_SYS_ADMIN)) { + status = -EACCES; + goto out_notask; + } + + inode = dentry->d_inode; + task = get_proc_task(inode); + if (!task) + goto out_notask; + + if (!ptrace_may_access(task, PTRACE_MODE_READ)) + goto out; + + mm = get_task_mm(task); + if (!mm) + goto out; + + if (!dname_to_vma_addr(dentry, &vm_start, &vm_end)) { + down_read(&mm->mmap_sem); + exact_vma_exists = !!find_exact_vma(mm, vm_start, vm_end); + up_read(&mm->mmap_sem); + } + + mmput(mm); + + if (exact_vma_exists) { + if (task_dumpable(task)) { + rcu_read_lock(); + cred = __task_cred(task); + inode->i_uid = cred->euid; + inode->i_gid = cred->egid; + rcu_read_unlock(); + } else { + inode->i_uid = 0; + inode->i_gid = 0; + } + security_task_to_inode(task, inode); + status = 1; + } + +out: + put_task_struct(task); + +out_notask: + if (status <= 0) + d_drop(dentry); + + return status; +} + +static const struct dentry_operations tid_map_files_dentry_operations = { + .d_revalidate = map_files_d_revalidate, + .d_delete = pid_delete_dentry, +}; + +static int proc_map_files_get_link(struct dentry *dentry, struct path *path) +{ + unsigned long vm_start, vm_end; + struct vm_area_struct *vma; + struct task_struct *task; + struct mm_struct *mm; + int rc; + + rc = -ENOENT; + task = get_proc_task(dentry->d_inode); + if (!task) + goto out; + + mm = get_task_mm(task); + put_task_struct(task); + if (!mm) + goto out; + + rc = dname_to_vma_addr(dentry, &vm_start, &vm_end); + if (rc) + goto out_mmput; + + down_read(&mm->mmap_sem); + vma = find_exact_vma(mm, vm_start, vm_end); + if (vma && vma->vm_file) { + *path = vma->vm_file->f_path; + path_get(path); + rc = 0; + } + up_read(&mm->mmap_sem); + +out_mmput: + mmput(mm); +out: + return rc; +} + +struct map_files_info { + struct file *file; + unsigned long len; + unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */ +}; + +static struct dentry * +proc_map_files_instantiate(struct inode *dir, struct dentry *dentry, + struct task_struct *task, const void *ptr) +{ + const struct file *file = ptr; + struct proc_inode *ei; + struct inode *inode; + + if (!file) + return ERR_PTR(-ENOENT); + + inode = proc_pid_make_inode(dir->i_sb, task); + if (!inode) + return ERR_PTR(-ENOENT); + + ei = PROC_I(inode); + ei->op.proc_get_link = proc_map_files_get_link; + + inode->i_op = &proc_pid_link_inode_operations; + inode->i_size = 64; + inode->i_mode = S_IFLNK; + + if (file->f_mode & FMODE_READ) + inode->i_mode |= S_IRUSR; + if (file->f_mode & FMODE_WRITE) + inode->i_mode |= S_IWUSR; + + d_set_d_op(dentry, &tid_map_files_dentry_operations); + d_add(dentry, inode); + + return NULL; +} + +static struct dentry *proc_map_files_lookup(struct inode *dir, + struct dentry *dentry, struct nameidata *nd) +{ + unsigned long vm_start, vm_end; + struct vm_area_struct *vma; + struct task_struct *task; + struct dentry *result; + struct mm_struct *mm; + + result = ERR_PTR(-EACCES); + if (!capable(CAP_SYS_ADMIN)) + goto out; + + result = ERR_PTR(-ENOENT); + task = get_proc_task(dir); 
+ if (!task) + goto out; + + result = ERR_PTR(-EACCES); + if (lock_trace(task)) + goto out_put_task; + + result = ERR_PTR(-ENOENT); + if (dname_to_vma_addr(dentry, &vm_start, &vm_end)) + goto out_unlock; + + mm = get_task_mm(task); + if (!mm) + goto out_unlock; + + down_read(&mm->mmap_sem); + vma = find_exact_vma(mm, vm_start, vm_end); + if (!vma) + goto out_no_vma; + + result = proc_map_files_instantiate(dir, dentry, task, vma->vm_file); + +out_no_vma: + up_read(&mm->mmap_sem); + mmput(mm); +out_unlock: + unlock_trace(task); +out_put_task: + put_task_struct(task); +out: + return result; +} + +static const struct inode_operations proc_map_files_inode_operations = { + .lookup = proc_map_files_lookup, + .permission = proc_fd_permission, + .setattr = proc_setattr, +}; + +static int +proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + struct dentry *dentry = filp->f_path.dentry; + struct inode *inode = dentry->d_inode; + struct vm_area_struct *vma; + struct task_struct *task; + struct mm_struct *mm; + ino_t ino; + int ret; + + ret = -EACCES; + if (!capable(CAP_SYS_ADMIN)) + goto out; + + ret = -ENOENT; + task = get_proc_task(inode); + if (!task) + goto out; + + ret = -EACCES; + if (lock_trace(task)) + goto out_put_task; + + ret = 0; + switch (filp->f_pos) { + case 0: + ino = inode->i_ino; + if (filldir(dirent, ".", 1, 0, ino, DT_DIR) < 0) + goto out_unlock; + filp->f_pos++; + case 1: + ino = parent_ino(dentry); + if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0) + goto out_unlock; + filp->f_pos++; + default: + { + unsigned long nr_files, pos, i; + struct flex_array *fa = NULL; + struct map_files_info info; + struct map_files_info *p; + + mm = get_task_mm(task); + if (!mm) + goto out_unlock; + down_read(&mm->mmap_sem); + + nr_files = 0; + + /* + * We need two passes here: + * + * 1) Collect vmas of mapped files with mmap_sem taken + * 2) Release mmap_sem and instantiate entries + * + * otherwise we get lockdep complained, since filldir() + * routine might require mmap_sem taken in might_fault(). + */ + + for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) { + if (vma->vm_file && ++pos > filp->f_pos) + nr_files++; + } + + if (nr_files) { + fa = flex_array_alloc(sizeof(info), nr_files, + GFP_KERNEL); + if (!fa || flex_array_prealloc(fa, 0, nr_files, + GFP_KERNEL)) { + ret = -ENOMEM; + if (fa) + flex_array_free(fa); + up_read(&mm->mmap_sem); + mmput(mm); + goto out_unlock; + } + for (i = 0, vma = mm->mmap, pos = 2; vma; + vma = vma->vm_next) { + if (!vma->vm_file) + continue; + if (++pos <= filp->f_pos) + continue; + + get_file(vma->vm_file); + info.file = vma->vm_file; + info.len = snprintf(info.name, + sizeof(info.name), "%lx-%lx", + vma->vm_start, vma->vm_end); + if (flex_array_put(fa, i++, &info, GFP_KERNEL)) + BUG(); + } + } + up_read(&mm->mmap_sem); + + for (i = 0; i < nr_files; i++) { + p = flex_array_get(fa, i); + ret = proc_fill_cache(filp, dirent, filldir, + p->name, p->len, + proc_map_files_instantiate, + task, p->file); + if (ret) + break; + filp->f_pos++; + fput(p->file); + } + for (; i < nr_files; i++) { + /* + * In case of error don't forget + * to put rest of file refs. 
+ */ + p = flex_array_get(fa, i); + fput(p->file); + } + if (fa) + flex_array_free(fa); + mmput(mm); + } + } + +out_unlock: + unlock_trace(task); +out_put_task: + put_task_struct(task); +out: + return ret; +} + +static const struct file_operations proc_map_files_operations = { + .read = generic_read_dir, + .readdir = proc_map_files_readdir, + .llseek = default_llseek, +}; + +#endif /* CONFIG_CHECKPOINT_RESTORE */ + /* * /proc/pid/fd needs a special permission handler so that a process can still * access /proc/self/fd after it has executed a setuid(). @@ -2661,6 +3013,9 @@ static const struct inode_operations proc_task_inode_operations; static const struct pid_entry tgid_base_stuff[] = { DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations), DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), +#ifdef CONFIG_CHECKPOINT_RESTORE + DIR("map_files", S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations), +#endif DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations), DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations), #ifdef CONFIG_NET diff --git a/include/linux/mm.h b/include/linux/mm.h index 5568553a41f..6eba2cc016c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1482,6 +1482,18 @@ static inline unsigned long vma_pages(struct vm_area_struct *vma) return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; } +/* Look up the first VMA which exactly match the interval vm_start ... vm_end */ +static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm, + unsigned long vm_start, unsigned long vm_end) +{ + struct vm_area_struct *vma = find_vma(mm, vm_start); + + if (vma && (vma->vm_start != vm_start || vma->vm_end != vm_end)) + vma = NULL; + + return vma; +} + #ifdef CONFIG_MMU pgprot_t vm_get_page_prot(unsigned long vm_flags); #else -- cgit v1.2.3 From 97412950b10e64f347aec4a9b759395c2465adf6 Mon Sep 17 00:00:00 2001 From: Vasiliy Kulikov Date: Tue, 10 Jan 2012 15:11:27 -0800 Subject: procfs: parse mount options Add support for procfs mount options. Actual mount options are coming in the next patches. Signed-off-by: Vasiliy Kulikov Cc: Alexey Dobriyan Cc: Al Viro Cc: Randy Dunlap Cc: "H. 
Peter Anvin" Cc: Greg KH Cc: Theodore Tso Cc: Alan Cox Cc: James Morris Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/inode.c | 10 ++++++++++ fs/proc/internal.h | 1 + fs/proc/root.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 64 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 51a176622b8..27c762f3487 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -17,7 +18,9 @@ #include #include #include +#include #include +#include #include #include @@ -101,12 +104,19 @@ void __init proc_init_inodecache(void) init_once); } +static int proc_show_options(struct seq_file *seq, struct dentry *root) +{ + return 0; +} + static const struct super_operations proc_sops = { .alloc_inode = proc_alloc_inode, .destroy_inode = proc_destroy_inode, .drop_inode = generic_delete_inode, .evict_inode = proc_evict_inode, .statfs = simple_statfs, + .remount_fs = proc_remount, + .show_options = proc_show_options, }; static void __pde_users_dec(struct proc_dir_entry *pde) diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 7838e5cfec1..292577531ad 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -117,6 +117,7 @@ void pde_put(struct proc_dir_entry *pde); int proc_fill_super(struct super_block *); struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *); +int proc_remount(struct super_block *sb, int *flags, char *data); /* * These are generic /proc routines that use the internal diff --git a/fs/proc/root.c b/fs/proc/root.c index 03102d97818..6a8ac1d361a 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "internal.h" @@ -36,6 +37,48 @@ static int proc_set_super(struct super_block *sb, void *data) return err; } +enum { + Opt_err, +}; + +static const match_table_t tokens = { + {Opt_err, NULL}, +}; + +static int proc_parse_options(char *options, struct pid_namespace *pid) +{ + char *p; + substring_t args[MAX_OPT_ARGS]; + + pr_debug("proc: options = %s\n", options); + + if (!options) + return 1; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + if (!*p) + continue; + + args[0].to = args[0].from = 0; + token = match_token(p, tokens, args); + switch (token) { + default: + pr_err("proc: unrecognized mount option \"%s\" " + "or missing value\n", p); + return 0; + } + } + + return 1; +} + +int proc_remount(struct super_block *sb, int *flags, char *data) +{ + struct pid_namespace *pid = sb->s_fs_info; + return !proc_parse_options(data, pid); +} + static struct dentry *proc_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { @@ -43,11 +86,15 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, struct super_block *sb; struct pid_namespace *ns; struct proc_inode *ei; + char *options; - if (flags & MS_KERNMOUNT) + if (flags & MS_KERNMOUNT) { ns = (struct pid_namespace *)data; - else + options = NULL; + } else { ns = current->nsproxy->pid_ns; + options = data; + } sb = sget(fs_type, proc_test_super, proc_set_super, ns); if (IS_ERR(sb)) @@ -55,6 +102,10 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, if (!sb->s_root) { sb->s_flags = flags; + if (!proc_parse_options(options, ns)) { + deactivate_locked_super(sb); + return ERR_PTR(-EINVAL); + } err = proc_fill_super(sb); if (err) { deactivate_locked_super(sb); -- cgit v1.2.3 From 
0499680a42141d86417a8fbaa8c8db806bea1201 Mon Sep 17 00:00:00 2001 From: Vasiliy Kulikov Date: Tue, 10 Jan 2012 15:11:31 -0800 Subject: procfs: add hidepid= and gid= mount options Add support for mount options to restrict access to /proc/PID/ directories. The default backward-compatible "relaxed" behaviour is left untouched. The first mount option is called "hidepid" and its value defines how much info about processes we want to be available for non-owners: hidepid=0 (default) means the old behavior - anybody may read all world-readable /proc/PID/* files. hidepid=1 means users may not access any /proc// directories, but their own. Sensitive files like cmdline, sched*, status are now protected against other users. As permission checking done in proc_pid_permission() and files' permissions are left untouched, programs expecting specific files' modes are not confused. hidepid=2 means hidepid=1 plus all /proc/PID/ will be invisible to other users. It doesn't mean that it hides whether a process exists (it can be learned by other means, e.g. by kill -0 $PID), but it hides process' euid and egid. It compicates intruder's task of gathering info about running processes, whether some daemon runs with elevated privileges, whether another user runs some sensitive program, whether other users run any program at all, etc. gid=XXX defines a group that will be able to gather all processes' info (as in hidepid=0 mode). This group should be used instead of putting nonroot user in sudoers file or something. However, untrusted users (like daemons, etc.) which are not supposed to monitor the tasks in the whole system should not be added to the group. hidepid=1 or higher is designed to restrict access to procfs files, which might reveal some sensitive private information like precise keystrokes timings: http://www.openwall.com/lists/oss-security/2011/11/05/3 hidepid=1/2 doesn't break monitoring userspace tools. ps, top, pgrep, and conky gracefully handle EPERM/ENOENT and behave as if the current user is the only user running processes. pstree shows the process subtree which contains "pstree" process. Note: the patch doesn't deal with setuid/setgid issues of keeping preopened descriptors of procfs files (like https://lkml.org/lkml/2011/2/7/368). We rely on that the leaked information like the scheduling counters of setuid apps doesn't threaten anybody's privacy - only the user started the setuid program may read the counters. Signed-off-by: Vasiliy Kulikov Cc: Alexey Dobriyan Cc: Al Viro Cc: Randy Dunlap Cc: "H. Peter Anvin" Cc: Greg KH Cc: Theodore Tso Cc: Alan Cox Cc: James Morris Cc: Oleg Nesterov Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 39 +++++++++++++++++++++ fs/proc/base.c | 69 +++++++++++++++++++++++++++++++++++++- fs/proc/inode.c | 8 +++++ fs/proc/root.c | 21 ++++++++++-- include/linux/pid_namespace.h | 2 ++ 5 files changed, 135 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 0ec91f03422..12fee132fbe 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -41,6 +41,8 @@ Table of Contents 3.5 /proc//mountinfo - Information about mounts 3.6 /proc//comm & /proc//task//comm + 4 Configuring procfs + 4.1 Mount options ------------------------------------------------------------------------------ Preface @@ -1542,3 +1544,40 @@ a task to set its own or one of its thread siblings comm value. 
The comm value is limited in size compared to the cmdline value, so writing anything longer then the kernel's TASK_COMM_LEN (currently 16 chars) will result in a truncated comm value. + + +------------------------------------------------------------------------------ +Configuring procfs +------------------------------------------------------------------------------ + +4.1 Mount options +--------------------- + +The following mount options are supported: + + hidepid= Set /proc// access mode. + gid= Set the group authorized to learn processes information. + +hidepid=0 means classic mode - everybody may access all /proc// directories +(default). + +hidepid=1 means users may not access any /proc// directories but their +own. Sensitive files like cmdline, sched*, status are now protected against +other users. This makes it impossible to learn whether any user runs +specific program (given the program doesn't reveal itself by its behaviour). +As an additional bonus, as /proc//cmdline is unaccessible for other users, +poorly written programs passing sensitive information via program arguments are +now protected against local eavesdroppers. + +hidepid=2 means hidepid=1 plus all /proc// will be fully invisible to other +users. It doesn't mean that it hides a fact whether a process with a specific +pid value exists (it can be learned by other means, e.g. by "kill -0 $PID"), +but it hides process' uid and gid, which may be learned by stat()'ing +/proc// otherwise. It greatly complicates an intruder's task of gathering +information about running processes, whether some daemon runs with elevated +privileges, whether other user runs some sensitive program, whether other users +run any program at all, etc. + +gid= defines a group authorized to learn processes information otherwise +prohibited by hidepid=. If you use some daemon like identd which needs to learn +information about processes information, just add identd to this group. diff --git a/fs/proc/base.c b/fs/proc/base.c index 4d755fed3ec..8173dfd89cb 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -631,6 +631,50 @@ int proc_setattr(struct dentry *dentry, struct iattr *attr) return 0; } +/* + * May current process learn task's sched/cmdline info (for hide_pid_min=1) + * or euid/egid (for hide_pid_min=2)? + */ +static bool has_pid_permissions(struct pid_namespace *pid, + struct task_struct *task, + int hide_pid_min) +{ + if (pid->hide_pid < hide_pid_min) + return true; + if (in_group_p(pid->pid_gid)) + return true; + return ptrace_may_access(task, PTRACE_MODE_READ); +} + + +static int proc_pid_permission(struct inode *inode, int mask) +{ + struct pid_namespace *pid = inode->i_sb->s_fs_info; + struct task_struct *task; + bool has_perms; + + task = get_proc_task(inode); + has_perms = has_pid_permissions(pid, task, 1); + put_task_struct(task); + + if (!has_perms) { + if (pid->hide_pid == 2) { + /* + * Let's make getdents(), stat(), and open() + * consistent with each other. If a process + * may not stat() a file, it shouldn't be seen + * in procfs at all. 
+ */ + return -ENOENT; + } + + return -EPERM; + } + return generic_permission(inode, mask); +} + + + static const struct inode_operations proc_def_inode_operations = { .setattr = proc_setattr, }; @@ -1615,6 +1659,7 @@ int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) struct inode *inode = dentry->d_inode; struct task_struct *task; const struct cred *cred; + struct pid_namespace *pid = dentry->d_sb->s_fs_info; generic_fillattr(inode, stat); @@ -1623,6 +1668,14 @@ int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) stat->gid = 0; task = pid_task(proc_pid(inode), PIDTYPE_PID); if (task) { + if (!has_pid_permissions(pid, task, 2)) { + rcu_read_unlock(); + /* + * This doesn't prevent learning whether PID exists, + * it only makes getattr() consistent with readdir(). + */ + return -ENOENT; + } if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) || task_dumpable(task)) { cred = __task_cred(task); @@ -3119,6 +3172,7 @@ static const struct inode_operations proc_tgid_base_inode_operations = { .lookup = proc_tgid_base_lookup, .getattr = pid_getattr, .setattr = proc_setattr, + .permission = proc_pid_permission, }; static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid) @@ -3322,6 +3376,12 @@ static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldi proc_pid_instantiate, iter.task, NULL); } +static int fake_filldir(void *buf, const char *name, int namelen, + loff_t offset, u64 ino, unsigned d_type) +{ + return 0; +} + /* for the /proc/ directory itself, after non-process stuff has been done */ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) { @@ -3329,6 +3389,7 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) struct task_struct *reaper; struct tgid_iter iter; struct pid_namespace *ns; + filldir_t __filldir; if (filp->f_pos >= PID_MAX_LIMIT + TGID_OFFSET) goto out_no_task; @@ -3350,8 +3411,13 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) for (iter = next_tgid(ns, iter); iter.task; iter.tgid += 1, iter = next_tgid(ns, iter)) { + if (has_pid_permissions(ns, iter.task, 2)) + __filldir = filldir; + else + __filldir = fake_filldir; + filp->f_pos = iter.tgid + TGID_OFFSET; - if (proc_pid_fill_cache(filp, dirent, filldir, iter) < 0) { + if (proc_pid_fill_cache(filp, dirent, __filldir, iter) < 0) { put_task_struct(iter.task); goto out; } @@ -3686,6 +3752,7 @@ static const struct inode_operations proc_task_inode_operations = { .lookup = proc_task_lookup, .getattr = proc_task_getattr, .setattr = proc_setattr, + .permission = proc_pid_permission, }; static const struct file_operations proc_task_operations = { diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 27c762f3487..84fd3235a59 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -106,6 +106,14 @@ void __init proc_init_inodecache(void) static int proc_show_options(struct seq_file *seq, struct dentry *root) { + struct super_block *sb = root->d_sb; + struct pid_namespace *pid = sb->s_fs_info; + + if (pid->pid_gid) + seq_printf(seq, ",gid=%lu", (unsigned long)pid->pid_gid); + if (pid->hide_pid != 0) + seq_printf(seq, ",hidepid=%u", pid->hide_pid); + return 0; } diff --git a/fs/proc/root.c b/fs/proc/root.c index 6a8ac1d361a..46a15d8a29c 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -38,10 +38,12 @@ static int proc_set_super(struct super_block *sb, void *data) } enum { - Opt_err, + Opt_gid, Opt_hidepid, Opt_err, }; static const match_table_t tokens = { + {Opt_hidepid, 
"hidepid=%u"}, + {Opt_gid, "gid=%u"}, {Opt_err, NULL}, }; @@ -49,8 +51,7 @@ static int proc_parse_options(char *options, struct pid_namespace *pid) { char *p; substring_t args[MAX_OPT_ARGS]; - - pr_debug("proc: options = %s\n", options); + int option; if (!options) return 1; @@ -63,6 +64,20 @@ static int proc_parse_options(char *options, struct pid_namespace *pid) args[0].to = args[0].from = 0; token = match_token(p, tokens, args); switch (token) { + case Opt_gid: + if (match_int(&args[0], &option)) + return 0; + pid->pid_gid = option; + break; + case Opt_hidepid: + if (match_int(&args[0], &option)) + return 0; + if (option < 0 || option > 2) { + pr_err("proc: hidepid value must be between 0 and 2.\n"); + return 0; + } + pid->hide_pid = option; + break; default: pr_err("proc: unrecognized mount option \"%s\" " "or missing value\n", p); diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index 38d10326246..e7cf6669ac3 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -30,6 +30,8 @@ struct pid_namespace { #ifdef CONFIG_BSD_PROCESS_ACCT struct bsd_acct_struct *bacct; #endif + gid_t pid_gid; + int hide_pid; }; extern struct pid_namespace init_pid_ns; -- cgit v1.2.3 From db804f23a72bada58f083dfad6a65d019ddb3bd4 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 10 Jan 2012 16:41:01 +0800 Subject: Btrfs: add pinned extents to on-disk free space cache correctly I got this while running xfstests: [24256.836098] block group 317849600 has an wrong amount of free space [24256.836100] btrfs: failed to load free space cache for block group 317849600 We should clamp the extent returned by find_first_extent_bit(), so the start of the extent won't smaller than the start of the block group. Signed-off-by: Li Zefan --- fs/btrfs/free-space-cache.c | 41 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index ec23d43d0c3..01840ef95a3 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -838,7 +838,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, struct io_ctl io_ctl; struct list_head bitmap_list; struct btrfs_key key; - u64 start, end, len; + u64 start, extent_start, extent_end, len; int entries = 0; int bitmaps = 0; int ret; @@ -857,25 +857,12 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, struct btrfs_free_cluster, block_group_list); - /* - * We shouldn't have switched the pinned extents yet so this is the - * right one - */ - unpin = root->fs_info->pinned_extents; - /* Lock all pages first so we can lock the extent safely. */ io_ctl_prepare_pages(&io_ctl, inode, 0); lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, 0, &cached_state, GFP_NOFS); - /* - * When searching for pinned extents, we need to start at our start - * offset. 
- */ - if (block_group) - start = block_group->key.objectid; - node = rb_first(&ctl->free_space_offset); if (!node && cluster) { node = rb_first(&cluster->root); @@ -918,9 +905,20 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, * We want to add any pinned extents to our free space cache * so we don't leak the space */ + + /* + * We shouldn't have switched the pinned extents yet so this is the + * right one + */ + unpin = root->fs_info->pinned_extents; + + if (block_group) + start = block_group->key.objectid; + while (block_group && (start < block_group->key.objectid + block_group->key.offset)) { - ret = find_first_extent_bit(unpin, start, &start, &end, + ret = find_first_extent_bit(unpin, start, + &extent_start, &extent_end, EXTENT_DIRTY); if (ret) { ret = 0; @@ -928,20 +926,21 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, } /* This pinned extent is out of our range */ - if (start >= block_group->key.objectid + + if (extent_start >= block_group->key.objectid + block_group->key.offset) break; - len = block_group->key.objectid + - block_group->key.offset - start; - len = min(len, end + 1 - start); + extent_start = max(extent_start, start); + extent_end = min(block_group->key.objectid + + block_group->key.offset, extent_end + 1); + len = extent_end - extent_start; entries++; - ret = io_ctl_add_entry(&io_ctl, start, len, NULL); + ret = io_ctl_add_entry(&io_ctl, extent_start, len, NULL); if (ret) goto out_nospc; - start = end + 1; + start = extent_end; } /* Write out the bitmaps */ -- cgit v1.2.3 From a1ee5a45818acc7f9c13e560827cf3e8735ac919 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 9 Jan 2012 14:27:42 +0800 Subject: Btrfs: avoid possible NULL deref in io_ctl_drop_pages() If we run into some failure path in io_ctl_prepare_pages(), io_ctl->pages[] array may have some NULL pointers. Signed-off-by: Li Zefan --- fs/btrfs/free-space-cache.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 01840ef95a3..4e55af333e1 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -319,9 +319,11 @@ static void io_ctl_drop_pages(struct io_ctl *io_ctl) io_ctl_unmap_page(io_ctl); for (i = 0; i < io_ctl->num_pages; i++) { - ClearPageChecked(io_ctl->pages[i]); - unlock_page(io_ctl->pages[i]); - page_cache_release(io_ctl->pages[i]); + if (io_ctl->pages[i]) { + ClearPageChecked(io_ctl->pages[i]); + unlock_page(io_ctl->pages[i]); + page_cache_release(io_ctl->pages[i]); + } } } -- cgit v1.2.3 From 706efc6630c2722602541a6a2fc5900a4e38456a Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 9 Jan 2012 14:36:28 +0800 Subject: Btrfs: check the return value of io_ctl_init() It can return -ENOMEM. 
Signed-off-by: Li Zefan --- fs/btrfs/free-space-cache.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 4e55af333e1..e4eb222147c 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -637,7 +637,10 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, if (!num_entries) return 0; - io_ctl_init(&io_ctl, inode, root); + ret = io_ctl_init(&io_ctl, inode, root); + if (ret) + return ret; + ret = readahead_cache(inode); if (ret) goto out; @@ -851,7 +854,9 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, if (!i_size_read(inode)) return -1; - io_ctl_init(&io_ctl, inode, root); + ret = io_ctl_init(&io_ctl, inode, root); + if (ret) + return -1; /* Get the cluster for this block_group if it exists */ if (block_group && !list_empty(&block_group->cluster_list)) -- cgit v1.2.3 From f062abf089ff705e09bbaa6fa1e2fd7688a0f2ea Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 29 Dec 2011 13:36:45 +0800 Subject: Btrfs: remove BUG_ON()s in btrfs_ioctl_setflags() We can recover from errors and return -errno to user space. Signed-off-by: Li Zefan --- fs/btrfs/ioctl.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index c04f02c7d5b..9619fb03a5d 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -176,6 +176,8 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) struct btrfs_trans_handle *trans; unsigned int flags, oldflags; int ret; + u64 ip_oldflags; + unsigned int i_oldflags; if (btrfs_root_readonly(root)) return -EROFS; @@ -192,6 +194,9 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) mutex_lock(&inode->i_mutex); + ip_oldflags = ip->flags; + i_oldflags = inode->i_flags; + flags = btrfs_mask_flags(inode->i_mode, flags); oldflags = btrfs_flags_to_ioctl(ip->flags); if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { @@ -250,18 +255,23 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) } trans = btrfs_join_transaction(root); - BUG_ON(IS_ERR(trans)); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out_drop; + } btrfs_update_iflags(inode); inode->i_ctime = CURRENT_TIME; ret = btrfs_update_inode(trans, root, inode); - BUG_ON(ret); btrfs_end_transaction(trans, root); + out_drop: + if (ret) { + ip->flags = ip_oldflags; + inode->i_flags = i_oldflags; + } mnt_drop_write(file->f_path.mnt); - - ret = 0; out_unlock: mutex_unlock(&inode->i_mutex); return ret; -- cgit v1.2.3 From 4da6f1a332f6c16b6594c7892f13c31459b9b1c8 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 29 Dec 2011 13:39:50 +0800 Subject: Btrfs: reserve metadata space in btrfs_ioctl_setflags() Check and reserve space for btrfs_update_inode(). 
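The reason the join becomes a start is that btrfs_join_transaction() attaches to an already-running transaction without reserving any metadata space, while btrfs_start_transaction(root, 1) checks for and reserves room for the single tree item that btrfs_update_inode() will modify. A condensed sketch of the resulting sequence in btrfs_ioctl_setflags(), assembled from this hunk and the previous patch (the surrounding ioctl code and the out_drop/out_unlock cleanup are omitted here):

	trans = btrfs_start_transaction(root, 1);	/* reserve space for one item */
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out_drop;			/* restore the saved flag values */
	}

	btrfs_update_iflags(inode);
	inode->i_ctime = CURRENT_TIME;
	ret = btrfs_update_inode(trans, root, inode);

	btrfs_end_transaction(trans, root);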
Signed-off-by: Li Zefan --- fs/btrfs/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 9619fb03a5d..fe8a60c865e 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -254,7 +254,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS); } - trans = btrfs_join_transaction(root); + trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out_drop; -- cgit v1.2.3 From 125ccb0ae6806dbec31abf4a85448971df3b4e39 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 8 Dec 2011 15:07:24 +0800 Subject: Btrfs: don't pass a trans handle unnecessarily in volumes.c Some functions never use the transaction handle passed to them. Signed-off-by: Li Zefan --- fs/btrfs/extent-tree.c | 2 +- fs/btrfs/volumes.c | 18 +++++++----------- fs/btrfs/volumes.h | 3 +-- 3 files changed, 9 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 8603ee4e3df..5b53479ce07 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -7084,7 +7084,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) * space to fit our block group in. */ if (device->total_bytes > device->bytes_used + min_free) { - ret = find_free_dev_extent(NULL, device, min_free, + ret = find_free_dev_extent(device, min_free, &dev_offset, NULL); if (!ret) dev_nr++; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f4b839fd3c9..73f673c8d8d 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -829,7 +829,6 @@ out: /* * find_free_dev_extent - find free space in the specified device - * @trans: transaction handler * @device: the device which we search the free space in * @num_bytes: the size of the free space that we need * @start: store the start of the free space. @@ -848,8 +847,7 @@ out: * But if we don't find suitable free space, it is used to store the size of * the max free space. */ -int find_free_dev_extent(struct btrfs_trans_handle *trans, - struct btrfs_device *device, u64 num_bytes, +int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, u64 *start, u64 *len) { struct btrfs_key key; @@ -893,7 +891,7 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans, key.offset = search_start; key.type = BTRFS_DEV_EXTENT_KEY; - ret = btrfs_search_slot(trans, root, &key, path, 0, 0); + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto out; if (ret > 0) { @@ -1469,8 +1467,7 @@ error_undo: /* * does all the dirty work required for changing file system's UUID. 
*/ -static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans, - struct btrfs_root *root) +static int btrfs_prepare_sprout(struct btrfs_root *root) { struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; struct btrfs_fs_devices *old_devices; @@ -1695,7 +1692,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) if (seeding_dev) { sb->s_flags &= ~MS_RDONLY; - ret = btrfs_prepare_sprout(trans, root); + ret = btrfs_prepare_sprout(root); BUG_ON(ret); } @@ -2323,8 +2320,7 @@ done: return ret; } -static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +static int btrfs_add_system_chunk(struct btrfs_root *root, struct btrfs_key *key, struct btrfs_chunk *chunk, int item_size) { @@ -2496,7 +2492,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, if (total_avail == 0) continue; - ret = find_free_dev_extent(trans, device, + ret = find_free_dev_extent(device, max_stripe_size * dev_stripes, &dev_offset, &max_avail); if (ret && ret != -ENOSPC) @@ -2687,7 +2683,7 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, BUG_ON(ret); if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { - ret = btrfs_add_system_chunk(trans, chunk_root, &key, chunk, + ret = btrfs_add_system_chunk(chunk_root, &key, chunk, item_size); BUG_ON(ret); } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 78f2d4d4f37..c1701ec9d49 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -230,7 +230,6 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); int btrfs_init_new_device(struct btrfs_root *root, char *path); int btrfs_balance(struct btrfs_root *dev_root); int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); -int find_free_dev_extent(struct btrfs_trans_handle *trans, - struct btrfs_device *device, u64 num_bytes, +int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, u64 *start, u64 *max_avail); #endif -- cgit v1.2.3 From de11cc12df17337979e0929d2831887432f236ca Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 1 Dec 2011 12:55:47 +0800 Subject: Btrfs: don't pre-allocate btrfs bio We pre-allocate a btrfs bio with fixed size, and then may re-allocate memory if we find stripes are bigger than the fixed size. But this pre-allocation is not necessary. Also we don't have to calcuate the stripe number twice. 
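The rework replaces a guess-allocate-retry cycle with one allocation made after num_stripes is known, the usual header-plus-flexible-array idiom that btrfs_bio_size(num_stripes) supports. A rough, self-contained illustration of that idiom (struct and helper names below are hypothetical stand-ins, not the real btrfs_bio layout):

	struct stripe_map {
		int num_stripes;
		atomic_t error;
		struct {
			u64 physical;
			u64 length;
		} stripes[];			/* sized at allocation time */
	};

	static struct stripe_map *stripe_map_alloc(int num_stripes)
	{
		struct stripe_map *sm;

		/* one allocation, exactly as large as needed */
		sm = kzalloc(sizeof(*sm) +
			     num_stripes * sizeof(sm->stripes[0]), GFP_NOFS);
		if (!sm)
			return NULL;
		sm->num_stripes = num_stripes;
		atomic_set(&sm->error, 0);
		return sm;
	}

Allocating once the count is known also removes the free_extent_map()/kfree()/goto again path the old code needed when its initial guess of 8 stripes turned out to be too small.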
Signed-off-by: Li Zefan --- fs/btrfs/volumes.c | 67 +++++++++++++++++------------------------------------- 1 file changed, 21 insertions(+), 46 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 73f673c8d8d..540fdd25fb5 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2897,26 +2897,13 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, u64 stripe_nr; u64 stripe_nr_orig; u64 stripe_nr_end; - int stripes_allocated = 8; - int stripes_required = 1; int stripe_index; int i; + int ret = 0; int num_stripes; int max_errors = 0; struct btrfs_bio *bbio = NULL; - if (bbio_ret && !(rw & (REQ_WRITE | REQ_DISCARD))) - stripes_allocated = 1; -again: - if (bbio_ret) { - bbio = kzalloc(btrfs_bio_size(stripes_allocated), - GFP_NOFS); - if (!bbio) - return -ENOMEM; - - atomic_set(&bbio->error, 0); - } - read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, logical, *length); read_unlock(&em_tree->lock); @@ -2935,32 +2922,6 @@ again: if (mirror_num > map->num_stripes) mirror_num = 0; - /* if our btrfs_bio struct is too small, back off and try again */ - if (rw & REQ_WRITE) { - if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_DUP)) { - stripes_required = map->num_stripes; - max_errors = 1; - } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { - stripes_required = map->sub_stripes; - max_errors = 1; - } - } - if (rw & REQ_DISCARD) { - if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | - BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_DUP | - BTRFS_BLOCK_GROUP_RAID10)) { - stripes_required = map->num_stripes; - } - } - if (bbio_ret && (rw & (REQ_WRITE | REQ_DISCARD)) && - stripes_allocated < stripes_required) { - stripes_allocated = map->num_stripes; - free_extent_map(em); - kfree(bbio); - goto again; - } stripe_nr = offset; /* * stripe_nr counts the total number of stripes we have to stride @@ -3055,6 +3016,13 @@ again: } BUG_ON(stripe_index >= map->num_stripes); + bbio = kzalloc(btrfs_bio_size(num_stripes), GFP_NOFS); + if (!bbio) { + ret = -ENOMEM; + goto out; + } + atomic_set(&bbio->error, 0); + if (rw & REQ_DISCARD) { for (i = 0; i < num_stripes; i++) { bbio->stripes[i].physical = @@ -3151,15 +3119,22 @@ again: stripe_index++; } } - if (bbio_ret) { - *bbio_ret = bbio; - bbio->num_stripes = num_stripes; - bbio->max_errors = max_errors; - bbio->mirror_num = mirror_num; + + if (rw & REQ_WRITE) { + if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_DUP)) { + max_errors = 1; + } } + + *bbio_ret = bbio; + bbio->num_stripes = num_stripes; + bbio->max_errors = max_errors; + bbio->mirror_num = mirror_num; out: free_extent_map(em); - return 0; + return ret; } int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, -- cgit v1.2.3 From ec9ef7a13be4dcce964c8503e8999087945e5b9e Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 1 Dec 2011 14:06:42 +0800 Subject: Btrfs: simplfy calculation of stripe length for discard operation For btrfs raid, while discarding a range of space, we'll need to know the start offset and length to discard for each device, and it's done in btrfs_map_block(). However the calculation is a bit complex for raid0 and raid10, so I reimplement it based on a fact that: dev1 dev2 dev3 (raid0) ----------------------------------- s0 s3 s6 s1 s4 s7 s2 s5 Each device has (total_stripes / nr_dev) stripes, or plus one. 
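Put differently: a discard covering (stripe_nr_end - stripe_nr_orig) stripes spread over factor = num_stripes / sub_stripes device groups gives every group the quotient of that division, and the first "remainder" groups one stripe more. A tiny worked sketch of the arithmetic (the helper is hypothetical; the patch itself uses div_u64_rem() for the same split):

	/* How a run of stripes divides across 'factor' device groups. */
	static void split_stripes(u64 total_stripes, u32 factor,
				  u64 *stripes_per_dev, u32 *remaining)
	{
		*stripes_per_dev = total_stripes / factor;  /* full rounds for everyone */
		*remaining = total_stripes % factor;        /* first groups get one extra */
	}

	/*
	 * Example matching the figure above: 8 stripes on 3 raid0 devices
	 * (factor = 3) gives stripes_per_dev = 2 and remaining = 2, so two
	 * devices hold 3 stripes each (s0-s2 and s3-s5) and the last one
	 * holds 2 (s6-s7).
	 */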
Signed-off-by: Li Zefan --- fs/btrfs/volumes.c | 95 ++++++++++++++++++------------------------------------ 1 file changed, 31 insertions(+), 64 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 540fdd25fb5..563ef650850 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3024,80 +3024,47 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, atomic_set(&bbio->error, 0); if (rw & REQ_DISCARD) { + int factor = 0; + int sub_stripes = 0; + u64 stripes_per_dev = 0; + u32 remaining_stripes = 0; + + if (map->type & + (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) { + if (map->type & BTRFS_BLOCK_GROUP_RAID0) + sub_stripes = 1; + else + sub_stripes = map->sub_stripes; + + factor = map->num_stripes / sub_stripes; + stripes_per_dev = div_u64_rem(stripe_nr_end - + stripe_nr_orig, + factor, + &remaining_stripes); + } + for (i = 0; i < num_stripes; i++) { bbio->stripes[i].physical = map->stripes[stripe_index].physical + stripe_offset + stripe_nr * map->stripe_len; bbio->stripes[i].dev = map->stripes[stripe_index].dev; - if (map->type & BTRFS_BLOCK_GROUP_RAID0) { - u64 stripes; - u32 last_stripe = 0; - int j; - - div_u64_rem(stripe_nr_end - 1, - map->num_stripes, - &last_stripe); - - for (j = 0; j < map->num_stripes; j++) { - u32 test; - - div_u64_rem(stripe_nr_end - 1 - j, - map->num_stripes, &test); - if (test == stripe_index) - break; - } - stripes = stripe_nr_end - 1 - j; - do_div(stripes, map->num_stripes); - bbio->stripes[i].length = map->stripe_len * - (stripes - stripe_nr + 1); - - if (i == 0) { + if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID10)) { + bbio->stripes[i].length = stripes_per_dev * + map->stripe_len; + if (i / sub_stripes < remaining_stripes) + bbio->stripes[i].length += + map->stripe_len; + if (i < sub_stripes) bbio->stripes[i].length -= stripe_offset; - stripe_offset = 0; - } - if (stripe_index == last_stripe) - bbio->stripes[i].length -= - stripe_end_offset; - } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { - u64 stripes; - int j; - int factor = map->num_stripes / - map->sub_stripes; - u32 last_stripe = 0; - - div_u64_rem(stripe_nr_end - 1, - factor, &last_stripe); - last_stripe *= map->sub_stripes; - - for (j = 0; j < factor; j++) { - u32 test; - - div_u64_rem(stripe_nr_end - 1 - j, - factor, &test); - - if (test == - stripe_index / map->sub_stripes) - break; - } - stripes = stripe_nr_end - 1 - j; - do_div(stripes, factor); - bbio->stripes[i].length = map->stripe_len * - (stripes - stripe_nr + 1); - - if (i < map->sub_stripes) { - bbio->stripes[i].length -= - stripe_offset; - if (i == map->sub_stripes - 1) - stripe_offset = 0; - } - if (stripe_index >= last_stripe && - stripe_index <= (last_stripe + - map->sub_stripes - 1)) { + if ((i / sub_stripes + 1) % + sub_stripes == remaining_stripes) bbio->stripes[i].length -= stripe_end_offset; - } + if (i == sub_stripes - 1) + stripe_offset = 0; } else bbio->stripes[i].length = *length; -- cgit v1.2.3 From 7fe1e641502616220437079258506196bc4d8cbf Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 29 Dec 2011 14:47:27 +0800 Subject: Btrfs: rewrite btrfs_trim_block_group() There are various bugs in block group trimming: - It may trim from offset smaller than user-specified offset. - It may trim beyond user-specified range. - It may leak free space for extents smaller than specified minlen. - It may truncate the last trimmed extent thus leak free space. - With mixed extents+bitmaps, some extents may not be trimmed. 
- With mixed extents+bitmaps, some bitmaps may not be trimmed (even none will be trimmed). Even for those trimmed, not all the free space in the bitmaps will be trimmed. I rewrite btrfs_trim_block_group() and break it into two functions. One is to trim extents only, and the other is to trim bitmaps only. Before patching: # fstrim -v /mnt/ /mnt/: 1496465408 bytes were trimmed After patching: # fstrim -v /mnt/ /mnt/: 2193768448 bytes were trimmed And this matches the total free space: # btrfs fi df /mnt Data: total=3.58GB, used=1.79GB System, DUP: total=8.00MB, used=4.00KB System: total=4.00MB, used=0.00 Metadata, DUP: total=205.12MB, used=97.14MB Metadata: total=8.00MB, used=0.00 Signed-off-by: Li Zefan --- fs/btrfs/free-space-cache.c | 235 +++++++++++++++++++++++++++++++------------- 1 file changed, 164 insertions(+), 71 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index e4eb222147c..b3cbb8939fa 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -2594,17 +2594,57 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster) cluster->block_group = NULL; } -int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, - u64 *trimmed, u64 start, u64 end, u64 minlen) +static int do_trimming(struct btrfs_block_group_cache *block_group, + u64 *total_trimmed, u64 start, u64 bytes, + u64 reserved_start, u64 reserved_bytes) { - struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; - struct btrfs_free_space *entry = NULL; + struct btrfs_space_info *space_info = block_group->space_info; struct btrfs_fs_info *fs_info = block_group->fs_info; - u64 bytes = 0; - u64 actually_trimmed; - int ret = 0; + int ret; + int update = 0; + u64 trimmed = 0; - *trimmed = 0; + spin_lock(&space_info->lock); + spin_lock(&block_group->lock); + if (!block_group->ro) { + block_group->reserved += reserved_bytes; + space_info->bytes_reserved += reserved_bytes; + update = 1; + } + spin_unlock(&block_group->lock); + spin_unlock(&space_info->lock); + + ret = btrfs_error_discard_extent(fs_info->extent_root, + start, bytes, &trimmed); + if (!ret) + *total_trimmed += trimmed; + + btrfs_add_free_space(block_group, reserved_start, reserved_bytes); + + if (update) { + spin_lock(&space_info->lock); + spin_lock(&block_group->lock); + if (block_group->ro) + space_info->bytes_readonly += reserved_bytes; + block_group->reserved -= reserved_bytes; + space_info->bytes_reserved -= reserved_bytes; + spin_unlock(&space_info->lock); + spin_unlock(&block_group->lock); + } + + return ret; +} + +static int trim_no_bitmap(struct btrfs_block_group_cache *block_group, + u64 *total_trimmed, u64 start, u64 end, u64 minlen) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_free_space *entry; + struct rb_node *node; + int ret = 0; + u64 extent_start; + u64 extent_bytes; + u64 bytes; while (start < end) { spin_lock(&ctl->tree_lock); @@ -2615,81 +2655,118 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, } entry = tree_search_offset(ctl, start, 0, 1); - if (!entry) - entry = tree_search_offset(ctl, - offset_to_bitmap(ctl, start), - 1, 1); - - if (!entry || entry->offset >= end) { + if (!entry) { spin_unlock(&ctl->tree_lock); break; } - if (entry->bitmap) { - ret = search_bitmap(ctl, entry, &start, &bytes); - if (!ret) { - if (start >= end) { - spin_unlock(&ctl->tree_lock); - break; - } - bytes = min(bytes, end - start); - bitmap_clear_bits(ctl, entry, start, bytes); - if (entry->bytes == 0) 
- free_bitmap(ctl, entry); - } else { - start = entry->offset + BITS_PER_BITMAP * - block_group->sectorsize; + /* skip bitmaps */ + while (entry->bitmap) { + node = rb_next(&entry->offset_index); + if (!node) { spin_unlock(&ctl->tree_lock); - ret = 0; - continue; + goto out; } - } else { - start = entry->offset; - bytes = min(entry->bytes, end - start); - unlink_free_space(ctl, entry); - kmem_cache_free(btrfs_free_space_cachep, entry); + entry = rb_entry(node, struct btrfs_free_space, + offset_index); + } + + if (entry->offset >= end) { + spin_unlock(&ctl->tree_lock); + break; + } + + extent_start = entry->offset; + extent_bytes = entry->bytes; + start = max(start, extent_start); + bytes = min(extent_start + extent_bytes, end) - start; + if (bytes < minlen) { + spin_unlock(&ctl->tree_lock); + goto next; } + unlink_free_space(ctl, entry); + kmem_cache_free(btrfs_free_space_cachep, entry); + spin_unlock(&ctl->tree_lock); - if (bytes >= minlen) { - struct btrfs_space_info *space_info; - int update = 0; - - space_info = block_group->space_info; - spin_lock(&space_info->lock); - spin_lock(&block_group->lock); - if (!block_group->ro) { - block_group->reserved += bytes; - space_info->bytes_reserved += bytes; - update = 1; - } - spin_unlock(&block_group->lock); - spin_unlock(&space_info->lock); - - ret = btrfs_error_discard_extent(fs_info->extent_root, - start, - bytes, - &actually_trimmed); - - btrfs_add_free_space(block_group, start, bytes); - if (update) { - spin_lock(&space_info->lock); - spin_lock(&block_group->lock); - if (block_group->ro) - space_info->bytes_readonly += bytes; - block_group->reserved -= bytes; - space_info->bytes_reserved -= bytes; - spin_unlock(&space_info->lock); - spin_unlock(&block_group->lock); - } + ret = do_trimming(block_group, total_trimmed, start, bytes, + extent_start, extent_bytes); + if (ret) + break; +next: + start += bytes; - if (ret) - break; - *trimmed += actually_trimmed; + if (fatal_signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + + cond_resched(); + } +out: + return ret; +} + +static int trim_bitmaps(struct btrfs_block_group_cache *block_group, + u64 *total_trimmed, u64 start, u64 end, u64 minlen) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_free_space *entry; + int ret = 0; + int ret2; + u64 bytes; + u64 offset = offset_to_bitmap(ctl, start); + + while (offset < end) { + bool next_bitmap = false; + + spin_lock(&ctl->tree_lock); + + if (ctl->free_space < minlen) { + spin_unlock(&ctl->tree_lock); + break; + } + + entry = tree_search_offset(ctl, offset, 1, 0); + if (!entry) { + spin_unlock(&ctl->tree_lock); + next_bitmap = true; + goto next; + } + + bytes = minlen; + ret2 = search_bitmap(ctl, entry, &start, &bytes); + if (ret2 || start >= end) { + spin_unlock(&ctl->tree_lock); + next_bitmap = true; + goto next; + } + + bytes = min(bytes, end - start); + if (bytes < minlen) { + spin_unlock(&ctl->tree_lock); + goto next; + } + + bitmap_clear_bits(ctl, entry, start, bytes); + if (entry->bytes == 0) + free_bitmap(ctl, entry); + + spin_unlock(&ctl->tree_lock); + + ret = do_trimming(block_group, total_trimmed, start, bytes, + start, bytes); + if (ret) + break; +next: + if (next_bitmap) { + offset += BITS_PER_BITMAP * ctl->unit; + } else { + start += bytes; + if (start >= offset + BITS_PER_BITMAP * ctl->unit) + offset += BITS_PER_BITMAP * ctl->unit; } - start += bytes; - bytes = 0; if (fatal_signal_pending(current)) { ret = -ERESTARTSYS; @@ -2702,6 +2779,22 @@ int btrfs_trim_block_group(struct 
btrfs_block_group_cache *block_group, return ret; } +int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, + u64 *trimmed, u64 start, u64 end, u64 minlen) +{ + int ret; + + *trimmed = 0; + + ret = trim_no_bitmap(block_group, trimmed, start, end, minlen); + if (ret) + return ret; + + ret = trim_bitmaps(block_group, trimmed, start, end, minlen); + + return ret; +} + /* * Find the left-most item in the cache tree, and then return the * smallest inode number in the item. -- cgit v1.2.3 From c7c144db531fda414e532adac56e965ce332e2a5 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 7 Dec 2011 10:39:22 +0800 Subject: Btrfs: update global block_rsv when creating a new block group A bug was triggered while using seed device: # mkfs.btrfs /dev/loop1 # btrfstune -S 1 /dev/loop1 # mount -o /dev/loop1 /mnt # btrfs dev add /dev/loop2 /mnt btrfs: block rsv returned -28 ------------[ cut here ]------------ WARNING: at fs/btrfs/extent-tree.c:5969 btrfs_alloc_free_block+0x166/0x396 [btrfs]() ... Call Trace: ... [] btrfs_cow_block+0x101/0x147 [btrfs] [] btrfs_search_slot+0x1b8/0x55f [btrfs] [] btrfs_insert_empty_items+0x42/0x7f [btrfs] [] btrfs_insert_item+0x40/0x7e [btrfs] [] btrfs_make_block_group+0x243/0x2aa [btrfs] [] __btrfs_alloc_chunk+0x672/0x70e [btrfs] [] init_first_rw_device+0x77/0x13c [btrfs] [] btrfs_init_new_device+0x664/0x9fd [btrfs] [] btrfs_ioctl+0x694/0xdbe [btrfs] [] do_vfs_ioctl+0x496/0x4cc [] sys_ioctl+0x33/0x4f [] sysenter_do_call+0x12/0x38 ---[ end trace 906adac595facc7d ]--- Since seed device is readonly, there's no usable space in the filesystem. Afterwards we add a sprout device to it, and the kernel creates a METADATA block group and a SYSTEM block group where comes free space we can reserve, but we still get revervation failure because the global block_rsv hasn't been updated accordingly. Signed-off-by: Li Zefan --- fs/btrfs/extent-tree.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5b53479ce07..bf30f670cda 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -7446,6 +7446,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, &cache->space_info); BUG_ON(ret); + update_global_block_rsv(root->fs_info); spin_lock(&cache->space_info->lock); cache->space_info->bytes_readonly += cache->bytes_super; -- cgit v1.2.3 From b367e47fb3a70f5d24ebd6faf7d42436d485fb2d Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 7 Dec 2011 11:38:24 +0800 Subject: Btrfs: fix possible deadlock when opening a seed device The correct lock order is uuid_mutex -> volume_mutex -> chunk_mutex, but when we mount a filesystem which has backing seed devices, we have this lock chain: open_ctree() lock(chunk_mutex); read_chunk_tree(); read_one_dev(); open_seed_devices(); lock(uuid_mutex); and then we hit a lockdep splat. 
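The fix follows directly from that ordering rule: rather than open_ctree() wrapping the call in chunk_mutex, btrfs_read_chunk_tree() now takes uuid_mutex first and then the chunk lock itself, and open_seed_devices() only asserts that uuid_mutex is already held. A schematic view of the fixed shape (walk_chunk_tree() is a hypothetical stand-in for the existing device/chunk item walk, which the patch does not change):

	int btrfs_read_chunk_tree(struct btrfs_root *root)
	{
		int ret;

		mutex_lock(&uuid_mutex);	/* outermost lock, taken first */
		lock_chunks(root);		/* then chunk_mutex */

		/*
		 * Read device items, then chunk items. read_one_dev() may
		 * reach open_seed_devices(), which after this patch merely
		 * does BUG_ON(!mutex_is_locked(&uuid_mutex)) instead of
		 * taking uuid_mutex and inverting the order.
		 */
		ret = walk_chunk_tree(root);

		unlock_chunks(root);
		mutex_unlock(&uuid_mutex);
		return ret;
	}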
Signed-off-by: Li Zefan --- fs/btrfs/disk-io.c | 2 -- fs/btrfs/volumes.c | 9 +++++++-- 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 3f9d5551e58..858ab347413 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2270,9 +2270,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node), BTRFS_UUID_SIZE); - mutex_lock(&fs_info->chunk_mutex); ret = btrfs_read_chunk_tree(chunk_root); - mutex_unlock(&fs_info->chunk_mutex); if (ret) { printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n", sb->s_id); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 563ef650850..fbb493b28d5 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3506,7 +3506,7 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid) struct btrfs_fs_devices *fs_devices; int ret; - mutex_lock(&uuid_mutex); + BUG_ON(!mutex_is_locked(&uuid_mutex)); fs_devices = root->fs_info->fs_devices->seed; while (fs_devices) { @@ -3544,7 +3544,6 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid) fs_devices->seed = root->fs_info->fs_devices->seed; root->fs_info->fs_devices->seed = fs_devices; out: - mutex_unlock(&uuid_mutex); return ret; } @@ -3687,6 +3686,9 @@ int btrfs_read_chunk_tree(struct btrfs_root *root) if (!path) return -ENOMEM; + mutex_lock(&uuid_mutex); + lock_chunks(root); + /* first we search for all of the device items, and then we * read in all of the chunk items. This way we can create chunk * mappings that reference all of the devices that are afound @@ -3737,6 +3739,9 @@ again: } ret = 0; error: + unlock_chunks(root); + mutex_unlock(&uuid_mutex); + btrfs_free_path(path); return ret; } -- cgit v1.2.3 From 4041bcdc7bef06a2fb29c57394c713a74bd13b08 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 10 Jan 2012 22:20:12 -0500 Subject: autofs4: autofs4_wait() vs. autofs4_catatonic_mode() race We need to recheck ->catatonic after autofs4_wait() got ->wq_mutex for good, or we might end up with wq inserted into queue after autofs4_catatonic_mode() had done its thing. It will stick there forever, since there won't be anything to clear its ->name.name. A bit of a complication: validate_request() drops and regains ->wq_mutex. It actually ends up the most convenient place to stick the check into... 
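The general pattern here is the standard one for sleeping locks: any condition that can change while the mutex is dropped has to be re-tested once the mutex is re-acquired, before the caller commits to queueing a wait that nobody would ever complete. A simplified illustration (the real validate_request() intentionally returns with wq_mutex still held on -ENOENT so that autofs4_wait() can finish up under the lock; the unlock below exists only to keep the sketch self-contained):

	if (mutex_lock_interruptible(&sbi->wq_mutex))
		return -EINTR;

	/*
	 * wq_mutex was dropped while this task slept, so the daemon may
	 * have gone catatonic in the meantime; check again under the lock.
	 */
	if (sbi->catatonic) {
		mutex_unlock(&sbi->wq_mutex);	/* simplified, see caller */
		return -ENOENT;
	}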
Acked-by: Ian Kent Signed-off-by: Al Viro --- fs/autofs4/waitq.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index e1fbdeef85d..c13273afd54 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -257,6 +257,9 @@ static int validate_request(struct autofs_wait_queue **wait, struct autofs_wait_queue *wq; struct autofs_info *ino; + if (sbi->catatonic) + return -ENOENT; + /* Wait in progress, continue; */ wq = autofs4_find_wait(sbi, qstr); if (wq) { @@ -289,6 +292,9 @@ static int validate_request(struct autofs_wait_queue **wait, if (mutex_lock_interruptible(&sbi->wq_mutex)) return -EINTR; + if (sbi->catatonic) + return -ENOENT; + wq = autofs4_find_wait(sbi, qstr); if (wq) { *wait = wq; @@ -389,7 +395,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, ret = validate_request(&wq, sbi, &qstr, dentry, notify); if (ret <= 0) { - if (ret == 0) + if (ret != -EINTR) mutex_unlock(&sbi->wq_mutex); kfree(qstr.name); return ret; -- cgit v1.2.3 From 8753333266be67ff3a984ac1f6566d31c260bee4 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 10 Jan 2012 22:24:48 -0500 Subject: autofs4: catatonic_mode vs. notify_daemon race we need to hold ->wq_mutex while we are forming the packet to send, lest we have autofs4_catatonic_mode() setting wq->name.name to NULL just as autofs4_notify_daemon() decides to memcpy() from it... We do have check for catatonic mode immediately after that (under ->wq_mutex, as it ought to be) and packet won't be actually sent, but it'll be too late for us if we oops on that memcpy() from NULL... Fix is obvious - just extend the area covered by ->wq_mutex over that switch and check whether it's catatonic *before* doing anything else. Acked-by: Ian Kent Signed-off-by: Al Viro --- fs/autofs4/waitq.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index c13273afd54..9a0256da5d5 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -110,6 +110,13 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, pkt.hdr.proto_version = sbi->version; pkt.hdr.type = type; + mutex_lock(&sbi->wq_mutex); + + /* Check if we have become catatonic */ + if (sbi->catatonic) { + mutex_unlock(&sbi->wq_mutex); + return; + } switch (type) { /* Kernel protocol v4 missing and expire packets */ case autofs_ptype_missing: @@ -163,22 +170,18 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, } default: printk("autofs4_notify_daemon: bad type %d!\n", type); + mutex_unlock(&sbi->wq_mutex); return; } - /* Check if we have become catatonic */ - mutex_lock(&sbi->wq_mutex); - if (!sbi->catatonic) { - pipe = sbi->pipe; - get_file(pipe); - } + pipe = sbi->pipe; + get_file(pipe); + mutex_unlock(&sbi->wq_mutex); - if (pipe) { - if (autofs4_write(pipe, &pkt, pktsz)) - autofs4_catatonic_mode(sbi); - fput(pipe); - } + if (autofs4_write(pipe, &pkt, pktsz)) + autofs4_catatonic_mode(sbi); + fput(pipe); } static int autofs4_getpath(struct autofs_sb_info *sbi, -- cgit v1.2.3 From d668dc56631da067540b2494d2a1f29ff7b5f15a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 10 Jan 2012 22:35:38 -0500 Subject: autofs4: deal with autofs4_write/autofs4_write races Just serialize the actual writing of packets into pipe on a new mutex, independent from everything else in the locking hierarchy. 
As soon as something has started feeding a piece of packet into the pipe to daemon, we *want* everything else about to try the same to wait until we are done. Acked-by: Ian Kent Signed-off-by: Al Viro --- fs/autofs4/autofs_i.h | 1 + fs/autofs4/inode.c | 1 + fs/autofs4/waitq.c | 9 +++++---- 3 files changed, 7 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index 5869d4e974a..d8d8e7ba6a1 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -116,6 +116,7 @@ struct autofs_sb_info { int needs_reghost; struct super_block *sb; struct mutex wq_mutex; + struct mutex pipe_mutex; spinlock_t fs_lock; struct autofs_wait_queue *queues; /* Wait queue pointer */ spinlock_t lookup_lock; diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index 2ba44c79d54..e16980b00b8 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -225,6 +225,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) sbi->min_proto = 0; sbi->max_proto = 0; mutex_init(&sbi->wq_mutex); + mutex_init(&sbi->pipe_mutex); spin_lock_init(&sbi->fs_lock); sbi->queues = NULL; spin_lock_init(&sbi->lookup_lock); diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 9a0256da5d5..9ef5b291440 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -56,26 +56,27 @@ void autofs4_catatonic_mode(struct autofs_sb_info *sbi) mutex_unlock(&sbi->wq_mutex); } -static int autofs4_write(struct file *file, const void *addr, int bytes) +static int autofs4_write(struct autofs_sb_info *sbi, + struct file *file, const void *addr, int bytes) { unsigned long sigpipe, flags; mm_segment_t fs; const char *data = (const char *)addr; ssize_t wr = 0; - /** WARNING: this is not safe for writing more than PIPE_BUF bytes! **/ - sigpipe = sigismember(&current->pending.signal, SIGPIPE); /* Save pointer to user space and point back to kernel space */ fs = get_fs(); set_fs(KERNEL_DS); + mutex_lock(&sbi->pipe_mutex); while (bytes && (wr = file->f_op->write(file,data,bytes,&file->f_pos)) > 0) { data += wr; bytes -= wr; } + mutex_unlock(&sbi->pipe_mutex); set_fs(fs); @@ -179,7 +180,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, mutex_unlock(&sbi->wq_mutex); - if (autofs4_write(pipe, &pkt, pktsz)) + if (autofs4_write(sbi, pipe, &pkt, pktsz)) autofs4_catatonic_mode(sbi); fput(pipe); } -- cgit v1.2.3 From e0c2a9aa1e68455dc3439e95d85cabcaff073666 Mon Sep 17 00:00:00 2001 From: David Teigland Date: Mon, 9 Jan 2012 17:18:05 -0500 Subject: GFS2: dlm based recovery coordination This new method of managing recovery is an alternative to the previous approach of using the userland gfs_controld.
- use dlm slot numbers to assign journal id's - use dlm recovery callbacks to initiate journal recovery - use a dlm lock to determine the first node to mount fs - use a dlm lock to track journals that need recovery Signed-off-by: David Teigland Signed-off-by: Steven Whitehouse --- fs/gfs2/glock.c | 2 +- fs/gfs2/glock.h | 7 +- fs/gfs2/incore.h | 58 ++- fs/gfs2/lock_dlm.c | 993 +++++++++++++++++++++++++++++++++++++++++++- fs/gfs2/main.c | 10 + fs/gfs2/ops_fstype.c | 29 +- fs/gfs2/recovery.c | 4 + fs/gfs2/sys.c | 33 +- fs/gfs2/sys.h | 2 + include/linux/gfs2_ondisk.h | 2 + 10 files changed, 1098 insertions(+), 42 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 88e8a23d002..376816fcd04 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1353,7 +1353,7 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret) spin_lock(&gl->gl_spin); gl->gl_reply = ret; - if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags))) { + if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags))) { if (gfs2_should_freeze(gl)) { set_bit(GLF_FROZEN, &gl->gl_flags); spin_unlock(&gl->gl_spin); diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index 2553b858a72..307ac31df78 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -121,8 +121,11 @@ enum { struct lm_lockops { const char *lm_proto_name; - int (*lm_mount) (struct gfs2_sbd *sdp, const char *fsname); - void (*lm_unmount) (struct gfs2_sbd *sdp); + int (*lm_mount) (struct gfs2_sbd *sdp, const char *table); + void (*lm_first_done) (struct gfs2_sbd *sdp); + void (*lm_recovery_result) (struct gfs2_sbd *sdp, unsigned int jid, + unsigned int result); + void (*lm_unmount) (struct gfs2_sbd *sdp); void (*lm_withdraw) (struct gfs2_sbd *sdp); void (*lm_put_lock) (struct gfs2_glock *gl); int (*lm_lock) (struct gfs2_glock *gl, unsigned int req_state, diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index e1d3bb59945..b9422bc8e2f 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -139,8 +139,45 @@ struct gfs2_bufdata { #define GDLM_STRNAME_BYTES 25 #define GDLM_LVB_SIZE 32 +/* + * ls_recover_flags: + * + * DFL_BLOCK_LOCKS: dlm is in recovery and will grant locks that had been + * held by failed nodes whose journals need recovery. Those locks should + * only be used for journal recovery until the journal recovery is done. + * This is set by the dlm recover_prep callback and cleared by the + * gfs2_control thread when journal recovery is complete. To avoid + * races between recover_prep setting and gfs2_control clearing, recover_spin + * is held while changing this bit and reading/writing recover_block + * and recover_start. + * + * DFL_NO_DLM_OPS: dlm lockspace ops/callbacks are not being used. + * + * DFL_FIRST_MOUNT: this node is the first to mount this fs and is doing + * recovery of all journals before allowing other nodes to mount the fs. + * This is cleared when FIRST_MOUNT_DONE is set. + * + * DFL_FIRST_MOUNT_DONE: this node was the first mounter, and has finished + * recovery of all journals, and now allows other nodes to mount the fs. + * + * DFL_MOUNT_DONE: gdlm_mount has completed successfully and cleared + * BLOCK_LOCKS for the first time. The gfs2_control thread should now + * control clearing BLOCK_LOCKS for further recoveries. + * + * DFL_UNMOUNT: gdlm_unmount sets to keep sdp off gfs2_control_wq. + * + * DFL_DLM_RECOVERY: set while dlm is in recovery, between recover_prep() + * and recover_done(), i.e. set while recover_block == recover_start. 
+ */ + enum { DFL_BLOCK_LOCKS = 0, + DFL_NO_DLM_OPS = 1, + DFL_FIRST_MOUNT = 2, + DFL_FIRST_MOUNT_DONE = 3, + DFL_MOUNT_DONE = 4, + DFL_UNMOUNT = 5, + DFL_DLM_RECOVERY = 6, }; struct lm_lockname { @@ -499,14 +536,26 @@ struct gfs2_sb_host { struct lm_lockstruct { int ls_jid; unsigned int ls_first; - unsigned int ls_first_done; unsigned int ls_nodir; const struct lm_lockops *ls_ops; - unsigned long ls_flags; dlm_lockspace_t *ls_dlm; - int ls_recover_jid_done; - int ls_recover_jid_status; + int ls_recover_jid_done; /* These two are deprecated, */ + int ls_recover_jid_status; /* used previously by gfs_controld */ + + struct dlm_lksb ls_mounted_lksb; /* mounted_lock */ + struct dlm_lksb ls_control_lksb; /* control_lock */ + char ls_control_lvb[GDLM_LVB_SIZE]; /* control_lock lvb */ + struct completion ls_sync_wait; /* {control,mounted}_{lock,unlock} */ + + spinlock_t ls_recover_spin; /* protects following fields */ + unsigned long ls_recover_flags; /* DFL_ */ + uint32_t ls_recover_mount; /* gen in first recover_done cb */ + uint32_t ls_recover_start; /* gen in last recover_done cb */ + uint32_t ls_recover_block; /* copy recover_start in last recover_prep */ + uint32_t ls_recover_size; /* size of recover_submit, recover_result */ + uint32_t *ls_recover_submit; /* gen in last recover_slot cb per jid */ + uint32_t *ls_recover_result; /* result of last jid recovery */ }; struct gfs2_sbd { @@ -544,6 +593,7 @@ struct gfs2_sbd { wait_queue_head_t sd_glock_wait; atomic_t sd_glock_disposal; struct completion sd_locking_init; + struct delayed_work sd_control_work; /* Inode Stuff */ diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index ce85b62bc0a..8944d1e32ab 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -1,6 +1,6 @@ /* * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2009 Red Hat, Inc. All rights reserved. + * Copyright 2004-2011 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions @@ -11,12 +11,15 @@ #include #include #include +#include #include #include "incore.h" #include "glock.h" #include "util.h" +#include "sys.h" +extern struct workqueue_struct *gfs2_control_wq; static void gdlm_ast(void *arg) { @@ -185,34 +188,1002 @@ static void gdlm_cancel(struct gfs2_glock *gl) dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_CANCEL, NULL, gl); } -static int gdlm_mount(struct gfs2_sbd *sdp, const char *fsname) +/* + * dlm/gfs2 recovery coordination using dlm_recover callbacks + * + * 1. dlm_controld sees lockspace members change + * 2. dlm_controld blocks dlm-kernel locking activity + * 3. dlm_controld within dlm-kernel notifies gfs2 (recover_prep) + * 4. dlm_controld starts and finishes its own user level recovery + * 5. dlm_controld starts dlm-kernel dlm_recoverd to do kernel recovery + * 6. dlm_recoverd notifies gfs2 of failed nodes (recover_slot) + * 7. dlm_recoverd does its own lock recovery + * 8. dlm_recoverd unblocks dlm-kernel locking activity + * 9. dlm_recoverd notifies gfs2 when done (recover_done with new generation) + * 10. gfs2_control updates control_lock lvb with new generation and jid bits + * 11. gfs2_control enqueues journals for gfs2_recover to recover (maybe none) + * 12. gfs2_recover dequeues and recovers journals of failed nodes + * 13. gfs2_recover provides recovery results to gfs2_control (recovery_result) + * 14. gfs2_control updates control_lock lvb jid bits for recovered journals + * 15. 
gfs2_control unblocks normal locking when all journals are recovered + * + * - failures during recovery + * + * recover_prep() may set BLOCK_LOCKS (step 3) again before gfs2_control + * clears BLOCK_LOCKS (step 15), e.g. another node fails while still + * recovering for a prior failure. gfs2_control needs a way to detect + * this so it can leave BLOCK_LOCKS set in step 15. This is managed using + * the recover_block and recover_start values. + * + * recover_done() provides a new lockspace generation number each time it + * is called (step 9). This generation number is saved as recover_start. + * When recover_prep() is called, it sets BLOCK_LOCKS and sets + * recover_block = recover_start. So, while recover_block is equal to + * recover_start, BLOCK_LOCKS should remain set. (recover_spin must + * be held around the BLOCK_LOCKS/recover_block/recover_start logic.) + * + * - more specific gfs2 steps in sequence above + * + * 3. recover_prep sets BLOCK_LOCKS and sets recover_block = recover_start + * 6. recover_slot records any failed jids (maybe none) + * 9. recover_done sets recover_start = new generation number + * 10. gfs2_control sets control_lock lvb = new gen + bits for failed jids + * 12. gfs2_recover does journal recoveries for failed jids identified above + * 14. gfs2_control clears control_lock lvb bits for recovered jids + * 15. gfs2_control checks if recover_block == recover_start (step 3 occured + * again) then do nothing, otherwise if recover_start > recover_block + * then clear BLOCK_LOCKS. + * + * - parallel recovery steps across all nodes + * + * All nodes attempt to update the control_lock lvb with the new generation + * number and jid bits, but only the first to get the control_lock EX will + * do so; others will see that it's already done (lvb already contains new + * generation number.) + * + * . All nodes get the same recover_prep/recover_slot/recover_done callbacks + * . All nodes attempt to set control_lock lvb gen + bits for the new gen + * . One node gets control_lock first and writes the lvb, others see it's done + * . All nodes attempt to recover jids for which they see control_lock bits set + * . One node succeeds for a jid, and that one clears the jid bit in the lvb + * . All nodes will eventually see all lvb bits clear and unblock locks + * + * - is there a problem with clearing an lvb bit that should be set + * and missing a journal recovery? + * + * 1. jid fails + * 2. lvb bit set for step 1 + * 3. jid recovered for step 1 + * 4. jid taken again (new mount) + * 5. jid fails (for step 4) + * 6. lvb bit set for step 5 (will already be set) + * 7. lvb bit cleared for step 3 + * + * This is not a problem because the failure in step 5 does not + * require recovery, because the mount in step 4 could not have + * progressed far enough to unblock locks and access the fs. The + * control_mount() function waits for all recoveries to be complete + * for the latest lockspace generation before ever unblocking locks + * and returning. The mount in step 4 waits until the recovery in + * step 1 is done. + * + * - special case of first mounter: first node to mount the fs + * + * The first node to mount a gfs2 fs needs to check all the journals + * and recover any that need recovery before other nodes are allowed + * to mount the fs. (Others may begin mounting, but they must wait + * for the first mounter to be done before taking locks on the fs + * or accessing the fs.) This has two parts: + * + * 1. The mounted_lock tells a node it's the first to mount the fs. 
+ * Each node holds the mounted_lock in PR while it's mounted. + * Each node tries to acquire the mounted_lock in EX when it mounts. + * If a node is granted the mounted_lock EX it means there are no + * other mounted nodes (no PR locks exist), and it is the first mounter. + * The mounted_lock is demoted to PR when first recovery is done, so + * others will fail to get an EX lock, but will get a PR lock. + * + * 2. The control_lock blocks others in control_mount() while the first + * mounter is doing first mount recovery of all journals. + * A mounting node needs to acquire control_lock in EX mode before + * it can proceed. The first mounter holds control_lock in EX while doing + * the first mount recovery, blocking mounts from other nodes, then demotes + * control_lock to NL when it's done (others_may_mount/first_done), + * allowing other nodes to continue mounting. + * + * first mounter: + * control_lock EX/NOQUEUE success + * mounted_lock EX/NOQUEUE success (no other PR, so no other mounters) + * set first=1 + * do first mounter recovery + * mounted_lock EX->PR + * control_lock EX->NL, write lvb generation + * + * other mounter: + * control_lock EX/NOQUEUE success (if fail -EAGAIN, retry) + * mounted_lock EX/NOQUEUE fail -EAGAIN (expected due to other mounters PR) + * mounted_lock PR/NOQUEUE success + * read lvb generation + * control_lock EX->NL + * set first=0 + * + * - mount during recovery + * + * If a node mounts while others are doing recovery (not first mounter), + * the mounting node will get its initial recover_done() callback without + * having seen any previous failures/callbacks. + * + * It must wait for all recoveries preceding its mount to be finished + * before it unblocks locks. It does this by repeating the "other mounter" + * steps above until the lvb generation number is >= its mount generation + * number (from initial recover_done) and all lvb bits are clear. + * + * - control_lock lvb format + * + * 4 bytes generation number: the latest dlm lockspace generation number + * from recover_done callback. Indicates the jid bitmap has been updated + * to reflect all slot failures through that generation. + * 4 bytes unused. + * GDLM_LVB_SIZE-8 bytes of jid bit map. If bit N is set, it indicates + * that jid N needs recovery. 
+ */ + +#define JID_BITMAP_OFFSET 8 /* 4 byte generation number + 4 byte unused */ + +static void control_lvb_read(struct lm_lockstruct *ls, uint32_t *lvb_gen, + char *lvb_bits) +{ + uint32_t gen; + memcpy(lvb_bits, ls->ls_control_lvb, GDLM_LVB_SIZE); + memcpy(&gen, lvb_bits, sizeof(uint32_t)); + *lvb_gen = le32_to_cpu(gen); +} + +static void control_lvb_write(struct lm_lockstruct *ls, uint32_t lvb_gen, + char *lvb_bits) +{ + uint32_t gen; + memcpy(ls->ls_control_lvb, lvb_bits, GDLM_LVB_SIZE); + gen = cpu_to_le32(lvb_gen); + memcpy(ls->ls_control_lvb, &gen, sizeof(uint32_t)); +} + +static int all_jid_bits_clear(char *lvb) +{ + int i; + for (i = JID_BITMAP_OFFSET; i < GDLM_LVB_SIZE; i++) { + if (lvb[i]) + return 0; + } + return 1; +} + +static void sync_wait_cb(void *arg) +{ + struct lm_lockstruct *ls = arg; + complete(&ls->ls_sync_wait); +} + +static int sync_unlock(struct gfs2_sbd *sdp, struct dlm_lksb *lksb, char *name) { struct lm_lockstruct *ls = &sdp->sd_lockstruct; int error; - if (fsname == NULL) { - fs_info(sdp, "no fsname found\n"); - return -EINVAL; + error = dlm_unlock(ls->ls_dlm, lksb->sb_lkid, 0, lksb, ls); + if (error) { + fs_err(sdp, "%s lkid %x error %d\n", + name, lksb->sb_lkid, error); + return error; + } + + wait_for_completion(&ls->ls_sync_wait); + + if (lksb->sb_status != -DLM_EUNLOCK) { + fs_err(sdp, "%s lkid %x status %d\n", + name, lksb->sb_lkid, lksb->sb_status); + return -1; + } + return 0; +} + +static int sync_lock(struct gfs2_sbd *sdp, int mode, uint32_t flags, + unsigned int num, struct dlm_lksb *lksb, char *name) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + char strname[GDLM_STRNAME_BYTES]; + int error, status; + + memset(strname, 0, GDLM_STRNAME_BYTES); + snprintf(strname, GDLM_STRNAME_BYTES, "%8x%16x", LM_TYPE_NONDISK, num); + + error = dlm_lock(ls->ls_dlm, mode, lksb, flags, + strname, GDLM_STRNAME_BYTES - 1, + 0, sync_wait_cb, ls, NULL); + if (error) { + fs_err(sdp, "%s lkid %x flags %x mode %d error %d\n", + name, lksb->sb_lkid, flags, mode, error); + return error; + } + + wait_for_completion(&ls->ls_sync_wait); + + status = lksb->sb_status; + + if (status && status != -EAGAIN) { + fs_err(sdp, "%s lkid %x flags %x mode %d status %d\n", + name, lksb->sb_lkid, flags, mode, status); + } + + return status; +} + +static int mounted_unlock(struct gfs2_sbd *sdp) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + return sync_unlock(sdp, &ls->ls_mounted_lksb, "mounted_lock"); +} + +static int mounted_lock(struct gfs2_sbd *sdp, int mode, uint32_t flags) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + return sync_lock(sdp, mode, flags, GFS2_MOUNTED_LOCK, + &ls->ls_mounted_lksb, "mounted_lock"); +} + +static int control_unlock(struct gfs2_sbd *sdp) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + return sync_unlock(sdp, &ls->ls_control_lksb, "control_lock"); +} + +static int control_lock(struct gfs2_sbd *sdp, int mode, uint32_t flags) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + return sync_lock(sdp, mode, flags, GFS2_CONTROL_LOCK, + &ls->ls_control_lksb, "control_lock"); +} + +static void gfs2_control_func(struct work_struct *work) +{ + struct gfs2_sbd *sdp = container_of(work, struct gfs2_sbd, sd_control_work.work); + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + char lvb_bits[GDLM_LVB_SIZE]; + uint32_t block_gen, start_gen, lvb_gen, flags; + int recover_set = 0; + int write_lvb = 0; + int recover_size; + int i, error; + + spin_lock(&ls->ls_recover_spin); + /* + * No MOUNT_DONE means we're still mounting; control_mount() + 
* will set this flag, after which this thread will take over + * all further clearing of BLOCK_LOCKS. + * + * FIRST_MOUNT means this node is doing first mounter recovery, + * for which recovery control is handled by + * control_mount()/control_first_done(), not this thread. + */ + if (!test_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags) || + test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags)) { + spin_unlock(&ls->ls_recover_spin); + return; + } + block_gen = ls->ls_recover_block; + start_gen = ls->ls_recover_start; + spin_unlock(&ls->ls_recover_spin); + + /* + * Equal block_gen and start_gen implies we are between + * recover_prep and recover_done callbacks, which means + * dlm recovery is in progress and dlm locking is blocked. + * There's no point trying to do any work until recover_done. + */ + + if (block_gen == start_gen) + return; + + /* + * Propagate recover_submit[] and recover_result[] to lvb: + * dlm_recoverd adds to recover_submit[] jids needing recovery + * gfs2_recover adds to recover_result[] journal recovery results + * + * set lvb bit for jids in recover_submit[] if the lvb has not + * yet been updated for the generation of the failure + * + * clear lvb bit for jids in recover_result[] if the result of + * the journal recovery is SUCCESS + */ + + error = control_lock(sdp, DLM_LOCK_EX, DLM_LKF_CONVERT|DLM_LKF_VALBLK); + if (error) { + fs_err(sdp, "control lock EX error %d\n", error); + return; + } + + control_lvb_read(ls, &lvb_gen, lvb_bits); + + spin_lock(&ls->ls_recover_spin); + if (block_gen != ls->ls_recover_block || + start_gen != ls->ls_recover_start) { + fs_info(sdp, "recover generation %u block1 %u %u\n", + start_gen, block_gen, ls->ls_recover_block); + spin_unlock(&ls->ls_recover_spin); + control_lock(sdp, DLM_LOCK_NL, DLM_LKF_CONVERT); + return; + } + + recover_size = ls->ls_recover_size; + + if (lvb_gen <= start_gen) { + /* + * Clear lvb bits for jids we've successfully recovered. + * Because all nodes attempt to recover failed journals, + * a journal can be recovered multiple times successfully + * in succession. Only the first will really do recovery, + * the others find it clean, but still report a successful + * recovery. So, another node may have already recovered + * the jid and cleared the lvb bit for it. + */ + for (i = 0; i < recover_size; i++) { + if (ls->ls_recover_result[i] != LM_RD_SUCCESS) + continue; + + ls->ls_recover_result[i] = 0; + + if (!test_bit_le(i, lvb_bits + JID_BITMAP_OFFSET)) + continue; + + __clear_bit_le(i, lvb_bits + JID_BITMAP_OFFSET); + write_lvb = 1; + } + } + + if (lvb_gen == start_gen) { + /* + * Failed slots before start_gen are already set in lvb. + */ + for (i = 0; i < recover_size; i++) { + if (!ls->ls_recover_submit[i]) + continue; + if (ls->ls_recover_submit[i] < lvb_gen) + ls->ls_recover_submit[i] = 0; + } + } else if (lvb_gen < start_gen) { + /* + * Failed slots before start_gen are not yet set in lvb. 
+ */ + for (i = 0; i < recover_size; i++) { + if (!ls->ls_recover_submit[i]) + continue; + if (ls->ls_recover_submit[i] < start_gen) { + ls->ls_recover_submit[i] = 0; + __set_bit_le(i, lvb_bits + JID_BITMAP_OFFSET); + } + } + /* even if there are no bits to set, we need to write the + latest generation to the lvb */ + write_lvb = 1; + } else { + /* + * we should be getting a recover_done() for lvb_gen soon + */ + } + spin_unlock(&ls->ls_recover_spin); + + if (write_lvb) { + control_lvb_write(ls, start_gen, lvb_bits); + flags = DLM_LKF_CONVERT | DLM_LKF_VALBLK; + } else { + flags = DLM_LKF_CONVERT; + } + + error = control_lock(sdp, DLM_LOCK_NL, flags); + if (error) { + fs_err(sdp, "control lock NL error %d\n", error); + return; + } + + /* + * Everyone will see jid bits set in the lvb, run gfs2_recover_set(), + * and clear a jid bit in the lvb if the recovery is a success. + * Eventually all journals will be recovered, all jid bits will + * be cleared in the lvb, and everyone will clear BLOCK_LOCKS. + */ + + for (i = 0; i < recover_size; i++) { + if (test_bit_le(i, lvb_bits + JID_BITMAP_OFFSET)) { + fs_info(sdp, "recover generation %u jid %d\n", + start_gen, i); + gfs2_recover_set(sdp, i); + recover_set++; + } + } + if (recover_set) + return; + + /* + * No more jid bits set in lvb, all recovery is done, unblock locks + * (unless a new recover_prep callback has occured blocking locks + * again while working above) + */ + + spin_lock(&ls->ls_recover_spin); + if (ls->ls_recover_block == block_gen && + ls->ls_recover_start == start_gen) { + clear_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags); + spin_unlock(&ls->ls_recover_spin); + fs_info(sdp, "recover generation %u done\n", start_gen); + gfs2_glock_thaw(sdp); + } else { + fs_info(sdp, "recover generation %u block2 %u %u\n", + start_gen, block_gen, ls->ls_recover_block); + spin_unlock(&ls->ls_recover_spin); + } +} + +static int control_mount(struct gfs2_sbd *sdp) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + char lvb_bits[GDLM_LVB_SIZE]; + uint32_t start_gen, block_gen, mount_gen, lvb_gen; + int mounted_mode; + int retries = 0; + int error; + + memset(&ls->ls_mounted_lksb, 0, sizeof(struct dlm_lksb)); + memset(&ls->ls_control_lksb, 0, sizeof(struct dlm_lksb)); + memset(&ls->ls_control_lvb, 0, GDLM_LVB_SIZE); + ls->ls_control_lksb.sb_lvbptr = ls->ls_control_lvb; + init_completion(&ls->ls_sync_wait); + + set_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags); + + error = control_lock(sdp, DLM_LOCK_NL, DLM_LKF_VALBLK); + if (error) { + fs_err(sdp, "control_mount control_lock NL error %d\n", error); + return error; + } + + error = mounted_lock(sdp, DLM_LOCK_NL, 0); + if (error) { + fs_err(sdp, "control_mount mounted_lock NL error %d\n", error); + control_unlock(sdp); + return error; + } + mounted_mode = DLM_LOCK_NL; + +restart: + if (retries++ && signal_pending(current)) { + error = -EINTR; + goto fail; + } + + /* + * We always start with both locks in NL. control_lock is + * demoted to NL below so we don't need to do it here. + */ + + if (mounted_mode != DLM_LOCK_NL) { + error = mounted_lock(sdp, DLM_LOCK_NL, DLM_LKF_CONVERT); + if (error) + goto fail; + mounted_mode = DLM_LOCK_NL; + } + + /* + * Other nodes need to do some work in dlm recovery and gfs2_control + * before the recover_done and control_lock will be ready for us below. + * A delay here is not required but often avoids having to retry. + */ + + msleep_interruptible(500); + + /* + * Acquire control_lock in EX and mounted_lock in either EX or PR. 
+ * control_lock lvb keeps track of any pending journal recoveries. + * mounted_lock indicates if any other nodes have the fs mounted. + */ + + error = control_lock(sdp, DLM_LOCK_EX, DLM_LKF_CONVERT|DLM_LKF_NOQUEUE|DLM_LKF_VALBLK); + if (error == -EAGAIN) { + goto restart; + } else if (error) { + fs_err(sdp, "control_mount control_lock EX error %d\n", error); + goto fail; + } + + error = mounted_lock(sdp, DLM_LOCK_EX, DLM_LKF_CONVERT|DLM_LKF_NOQUEUE); + if (!error) { + mounted_mode = DLM_LOCK_EX; + goto locks_done; + } else if (error != -EAGAIN) { + fs_err(sdp, "control_mount mounted_lock EX error %d\n", error); + goto fail; + } + + error = mounted_lock(sdp, DLM_LOCK_PR, DLM_LKF_CONVERT|DLM_LKF_NOQUEUE); + if (!error) { + mounted_mode = DLM_LOCK_PR; + goto locks_done; + } else { + /* not even -EAGAIN should happen here */ + fs_err(sdp, "control_mount mounted_lock PR error %d\n", error); + goto fail; + } + +locks_done: + /* + * If we got both locks above in EX, then we're the first mounter. + * If not, then we need to wait for the control_lock lvb to be + * updated by other mounted nodes to reflect our mount generation. + * + * In simple first mounter cases, first mounter will see zero lvb_gen, + * but in cases where all existing nodes leave/fail before mounting + * nodes finish control_mount, then all nodes will be mounting and + * lvb_gen will be non-zero. + */ + + control_lvb_read(ls, &lvb_gen, lvb_bits); + + if (lvb_gen == 0xFFFFFFFF) { + /* special value to force mount attempts to fail */ + fs_err(sdp, "control_mount control_lock disabled\n"); + error = -EINVAL; + goto fail; + } + + if (mounted_mode == DLM_LOCK_EX) { + /* first mounter, keep both EX while doing first recovery */ + spin_lock(&ls->ls_recover_spin); + clear_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags); + set_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags); + set_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags); + spin_unlock(&ls->ls_recover_spin); + fs_info(sdp, "first mounter control generation %u\n", lvb_gen); + return 0; + } + + error = control_lock(sdp, DLM_LOCK_NL, DLM_LKF_CONVERT); + if (error) + goto fail; + + /* + * We are not first mounter, now we need to wait for the control_lock + * lvb generation to be >= the generation from our first recover_done + * and all lvb bits to be clear (no pending journal recoveries.) 
+ */ + + if (!all_jid_bits_clear(lvb_bits)) { + /* journals need recovery, wait until all are clear */ + fs_info(sdp, "control_mount wait for journal recovery\n"); + goto restart; + } + + spin_lock(&ls->ls_recover_spin); + block_gen = ls->ls_recover_block; + start_gen = ls->ls_recover_start; + mount_gen = ls->ls_recover_mount; + + if (lvb_gen < mount_gen) { + /* wait for mounted nodes to update control_lock lvb to our + generation, which might include new recovery bits set */ + fs_info(sdp, "control_mount wait1 block %u start %u mount %u " + "lvb %u flags %lx\n", block_gen, start_gen, mount_gen, + lvb_gen, ls->ls_recover_flags); + spin_unlock(&ls->ls_recover_spin); + goto restart; + } + + if (lvb_gen != start_gen) { + /* wait for mounted nodes to update control_lock lvb to the + latest recovery generation */ + fs_info(sdp, "control_mount wait2 block %u start %u mount %u " + "lvb %u flags %lx\n", block_gen, start_gen, mount_gen, + lvb_gen, ls->ls_recover_flags); + spin_unlock(&ls->ls_recover_spin); + goto restart; + } + + if (block_gen == start_gen) { + /* dlm recovery in progress, wait for it to finish */ + fs_info(sdp, "control_mount wait3 block %u start %u mount %u " + "lvb %u flags %lx\n", block_gen, start_gen, mount_gen, + lvb_gen, ls->ls_recover_flags); + spin_unlock(&ls->ls_recover_spin); + goto restart; } - error = dlm_new_lockspace(fsname, NULL, - DLM_LSFL_FS | DLM_LSFL_NEWEXCL | - (ls->ls_nodir ? DLM_LSFL_NODIR : 0), - GDLM_LVB_SIZE, NULL, NULL, NULL, &ls->ls_dlm); + clear_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags); + set_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags); + memset(ls->ls_recover_submit, 0, ls->ls_recover_size*sizeof(uint32_t)); + memset(ls->ls_recover_result, 0, ls->ls_recover_size*sizeof(uint32_t)); + spin_unlock(&ls->ls_recover_spin); + return 0; + +fail: + mounted_unlock(sdp); + control_unlock(sdp); + return error; +} + +static int dlm_recovery_wait(void *word) +{ + schedule(); + return 0; +} + +static int control_first_done(struct gfs2_sbd *sdp) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + char lvb_bits[GDLM_LVB_SIZE]; + uint32_t start_gen, block_gen; + int error; + +restart: + spin_lock(&ls->ls_recover_spin); + start_gen = ls->ls_recover_start; + block_gen = ls->ls_recover_block; + + if (test_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags) || + !test_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags) || + !test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags)) { + /* sanity check, should not happen */ + fs_err(sdp, "control_first_done start %u block %u flags %lx\n", + start_gen, block_gen, ls->ls_recover_flags); + spin_unlock(&ls->ls_recover_spin); + control_unlock(sdp); + return -1; + } + + if (start_gen == block_gen) { + /* + * Wait for the end of a dlm recovery cycle to switch from + * first mounter recovery. We can ignore any recover_slot + * callbacks between the recover_prep and next recover_done + * because we are still the first mounter and any failed nodes + * have not fully mounted, so they don't need recovery. 
+ */ + spin_unlock(&ls->ls_recover_spin); + fs_info(sdp, "control_first_done wait gen %u\n", start_gen); + + wait_on_bit(&ls->ls_recover_flags, DFL_DLM_RECOVERY, + dlm_recovery_wait, TASK_UNINTERRUPTIBLE); + goto restart; + } + + clear_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags); + set_bit(DFL_FIRST_MOUNT_DONE, &ls->ls_recover_flags); + memset(ls->ls_recover_submit, 0, ls->ls_recover_size*sizeof(uint32_t)); + memset(ls->ls_recover_result, 0, ls->ls_recover_size*sizeof(uint32_t)); + spin_unlock(&ls->ls_recover_spin); + + memset(lvb_bits, 0, sizeof(lvb_bits)); + control_lvb_write(ls, start_gen, lvb_bits); + + error = mounted_lock(sdp, DLM_LOCK_PR, DLM_LKF_CONVERT); + if (error) + fs_err(sdp, "control_first_done mounted PR error %d\n", error); + + error = control_lock(sdp, DLM_LOCK_NL, DLM_LKF_CONVERT|DLM_LKF_VALBLK); if (error) - printk(KERN_ERR "dlm_new_lockspace error %d", error); + fs_err(sdp, "control_first_done control NL error %d\n", error); return error; } +/* + * Expand static jid arrays if necessary (by increments of RECOVER_SIZE_INC) + * to accomodate the largest slot number. (NB dlm slot numbers start at 1, + * gfs2 jids start at 0, so jid = slot - 1) + */ + +#define RECOVER_SIZE_INC 16 + +static int set_recover_size(struct gfs2_sbd *sdp, struct dlm_slot *slots, + int num_slots) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + uint32_t *submit = NULL; + uint32_t *result = NULL; + uint32_t old_size, new_size; + int i, max_jid; + + max_jid = 0; + for (i = 0; i < num_slots; i++) { + if (max_jid < slots[i].slot - 1) + max_jid = slots[i].slot - 1; + } + + old_size = ls->ls_recover_size; + + if (old_size >= max_jid + 1) + return 0; + + new_size = old_size + RECOVER_SIZE_INC; + + submit = kzalloc(new_size * sizeof(uint32_t), GFP_NOFS); + result = kzalloc(new_size * sizeof(uint32_t), GFP_NOFS); + if (!submit || !result) { + kfree(submit); + kfree(result); + return -ENOMEM; + } + + spin_lock(&ls->ls_recover_spin); + memcpy(submit, ls->ls_recover_submit, old_size * sizeof(uint32_t)); + memcpy(result, ls->ls_recover_result, old_size * sizeof(uint32_t)); + kfree(ls->ls_recover_submit); + kfree(ls->ls_recover_result); + ls->ls_recover_submit = submit; + ls->ls_recover_result = result; + ls->ls_recover_size = new_size; + spin_unlock(&ls->ls_recover_spin); + return 0; +} + +static void free_recover_size(struct lm_lockstruct *ls) +{ + kfree(ls->ls_recover_submit); + kfree(ls->ls_recover_result); + ls->ls_recover_submit = NULL; + ls->ls_recover_result = NULL; + ls->ls_recover_size = 0; +} + +/* dlm calls before it does lock recovery */ + +static void gdlm_recover_prep(void *arg) +{ + struct gfs2_sbd *sdp = arg; + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + + spin_lock(&ls->ls_recover_spin); + ls->ls_recover_block = ls->ls_recover_start; + set_bit(DFL_DLM_RECOVERY, &ls->ls_recover_flags); + + if (!test_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags) || + test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags)) { + spin_unlock(&ls->ls_recover_spin); + return; + } + set_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags); + spin_unlock(&ls->ls_recover_spin); +} + +/* dlm calls after recover_prep has been completed on all lockspace members; + identifies slot/jid of failed member */ + +static void gdlm_recover_slot(void *arg, struct dlm_slot *slot) +{ + struct gfs2_sbd *sdp = arg; + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + int jid = slot->slot - 1; + + spin_lock(&ls->ls_recover_spin); + if (ls->ls_recover_size < jid + 1) { + fs_err(sdp, "recover_slot jid %d gen %u short size %d", + jid, 
ls->ls_recover_block, ls->ls_recover_size); + spin_unlock(&ls->ls_recover_spin); + return; + } + + if (ls->ls_recover_submit[jid]) { + fs_info(sdp, "recover_slot jid %d gen %u prev %u", + jid, ls->ls_recover_block, ls->ls_recover_submit[jid]); + } + ls->ls_recover_submit[jid] = ls->ls_recover_block; + spin_unlock(&ls->ls_recover_spin); +} + +/* dlm calls after recover_slot and after it completes lock recovery */ + +static void gdlm_recover_done(void *arg, struct dlm_slot *slots, int num_slots, + int our_slot, uint32_t generation) +{ + struct gfs2_sbd *sdp = arg; + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + + /* ensure the ls jid arrays are large enough */ + set_recover_size(sdp, slots, num_slots); + + spin_lock(&ls->ls_recover_spin); + ls->ls_recover_start = generation; + + if (!ls->ls_recover_mount) { + ls->ls_recover_mount = generation; + ls->ls_jid = our_slot - 1; + } + + if (!test_bit(DFL_UNMOUNT, &ls->ls_recover_flags)) + queue_delayed_work(gfs2_control_wq, &sdp->sd_control_work, 0); + + clear_bit(DFL_DLM_RECOVERY, &ls->ls_recover_flags); + smp_mb__after_clear_bit(); + wake_up_bit(&ls->ls_recover_flags, DFL_DLM_RECOVERY); + spin_unlock(&ls->ls_recover_spin); +} + +/* gfs2_recover thread has a journal recovery result */ + +static void gdlm_recovery_result(struct gfs2_sbd *sdp, unsigned int jid, + unsigned int result) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + + if (test_bit(DFL_NO_DLM_OPS, &ls->ls_recover_flags)) + return; + + /* don't care about the recovery of own journal during mount */ + if (jid == ls->ls_jid) + return; + + spin_lock(&ls->ls_recover_spin); + if (test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags)) { + spin_unlock(&ls->ls_recover_spin); + return; + } + if (ls->ls_recover_size < jid + 1) { + fs_err(sdp, "recovery_result jid %d short size %d", + jid, ls->ls_recover_size); + spin_unlock(&ls->ls_recover_spin); + return; + } + + fs_info(sdp, "recover jid %d result %s\n", jid, + result == LM_RD_GAVEUP ? "busy" : "success"); + + ls->ls_recover_result[jid] = result; + + /* GAVEUP means another node is recovering the journal; delay our + next attempt to recover it, to give the other node a chance to + finish before trying again */ + + if (!test_bit(DFL_UNMOUNT, &ls->ls_recover_flags)) + queue_delayed_work(gfs2_control_wq, &sdp->sd_control_work, + result == LM_RD_GAVEUP ? 
HZ : 0); + spin_unlock(&ls->ls_recover_spin); +} + +const struct dlm_lockspace_ops gdlm_lockspace_ops = { + .recover_prep = gdlm_recover_prep, + .recover_slot = gdlm_recover_slot, + .recover_done = gdlm_recover_done, +}; + +static int gdlm_mount(struct gfs2_sbd *sdp, const char *table) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + char cluster[GFS2_LOCKNAME_LEN]; + const char *fsname; + uint32_t flags; + int error, ops_result; + + /* + * initialize everything + */ + + INIT_DELAYED_WORK(&sdp->sd_control_work, gfs2_control_func); + spin_lock_init(&ls->ls_recover_spin); + ls->ls_recover_flags = 0; + ls->ls_recover_mount = 0; + ls->ls_recover_start = 0; + ls->ls_recover_block = 0; + ls->ls_recover_size = 0; + ls->ls_recover_submit = NULL; + ls->ls_recover_result = NULL; + + error = set_recover_size(sdp, NULL, 0); + if (error) + goto fail; + + /* + * prepare dlm_new_lockspace args + */ + + fsname = strchr(table, ':'); + if (!fsname) { + fs_info(sdp, "no fsname found\n"); + error = -EINVAL; + goto fail_free; + } + memset(cluster, 0, sizeof(cluster)); + memcpy(cluster, table, strlen(table) - strlen(fsname)); + fsname++; + + flags = DLM_LSFL_FS | DLM_LSFL_NEWEXCL; + if (ls->ls_nodir) + flags |= DLM_LSFL_NODIR; + + /* + * create/join lockspace + */ + + error = dlm_new_lockspace(fsname, cluster, flags, GDLM_LVB_SIZE, + &gdlm_lockspace_ops, sdp, &ops_result, + &ls->ls_dlm); + if (error) { + fs_err(sdp, "dlm_new_lockspace error %d\n", error); + goto fail_free; + } + + if (ops_result < 0) { + /* + * dlm does not support ops callbacks, + * old dlm_controld/gfs_controld are used, try without ops. + */ + fs_info(sdp, "dlm lockspace ops not used\n"); + free_recover_size(ls); + set_bit(DFL_NO_DLM_OPS, &ls->ls_recover_flags); + return 0; + } + + if (!test_bit(SDF_NOJOURNALID, &sdp->sd_flags)) { + fs_err(sdp, "dlm lockspace ops disallow jid preset\n"); + error = -EINVAL; + goto fail_release; + } + + /* + * control_mount() uses control_lock to determine first mounter, + * and for later mounts, waits for any recoveries to be cleared. 
+ */ + + error = control_mount(sdp); + if (error) { + fs_err(sdp, "mount control error %d\n", error); + goto fail_release; + } + + ls->ls_first = !!test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags); + clear_bit(SDF_NOJOURNALID, &sdp->sd_flags); + smp_mb__after_clear_bit(); + wake_up_bit(&sdp->sd_flags, SDF_NOJOURNALID); + return 0; + +fail_release: + dlm_release_lockspace(ls->ls_dlm, 2); +fail_free: + free_recover_size(ls); +fail: + return error; +} + +static void gdlm_first_done(struct gfs2_sbd *sdp) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + int error; + + if (test_bit(DFL_NO_DLM_OPS, &ls->ls_recover_flags)) + return; + + error = control_first_done(sdp); + if (error) + fs_err(sdp, "mount first_done error %d\n", error); +} + static void gdlm_unmount(struct gfs2_sbd *sdp) { struct lm_lockstruct *ls = &sdp->sd_lockstruct; + if (test_bit(DFL_NO_DLM_OPS, &ls->ls_recover_flags)) + goto release; + + /* wait for gfs2_control_wq to be done with this mount */ + + spin_lock(&ls->ls_recover_spin); + set_bit(DFL_UNMOUNT, &ls->ls_recover_flags); + spin_unlock(&ls->ls_recover_spin); + flush_delayed_work_sync(&sdp->sd_control_work); + + /* mounted_lock and control_lock will be purged in dlm recovery */ +release: if (ls->ls_dlm) { dlm_release_lockspace(ls->ls_dlm, 2); ls->ls_dlm = NULL; } + + free_recover_size(ls); } static const match_table_t dlm_tokens = { @@ -226,6 +1197,8 @@ static const match_table_t dlm_tokens = { const struct lm_lockops gfs2_dlm_ops = { .lm_proto_name = "lock_dlm", .lm_mount = gdlm_mount, + .lm_first_done = gdlm_first_done, + .lm_recovery_result = gdlm_recovery_result, .lm_unmount = gdlm_unmount, .lm_put_lock = gdlm_put_lock, .lm_lock = gdlm_lock, diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index c150298e2d8..a8d9bcd0e19 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -28,6 +28,8 @@ #include "recovery.h" #include "dir.h" +struct workqueue_struct *gfs2_control_wq; + static struct shrinker qd_shrinker = { .shrink = gfs2_shrink_qd_memory, .seeks = DEFAULT_SEEKS, @@ -146,12 +148,19 @@ static int __init init_gfs2_fs(void) if (!gfs_recovery_wq) goto fail_wq; + gfs2_control_wq = alloc_workqueue("gfs2_control", + WQ_NON_REENTRANT | WQ_UNBOUND | WQ_FREEZABLE, 0); + if (!gfs2_control_wq) + goto fail_control; + gfs2_register_debugfs(); printk("GFS2 installed\n"); return 0; +fail_control: + destroy_workqueue(gfs_recovery_wq); fail_wq: unregister_filesystem(&gfs2meta_fs_type); fail_unregister: @@ -195,6 +204,7 @@ static void __exit exit_gfs2_fs(void) unregister_filesystem(&gfs2_fs_type); unregister_filesystem(&gfs2meta_fs_type); destroy_workqueue(gfs_recovery_wq); + destroy_workqueue(gfs2_control_wq); rcu_barrier(); diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index fe72e79e6ff..b01573b7ad9 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -562,8 +562,12 @@ static void gfs2_others_may_mount(struct gfs2_sbd *sdp) { char *message = "FIRSTMOUNT=Done"; char *envp[] = { message, NULL }; - struct lm_lockstruct *ls = &sdp->sd_lockstruct; - ls->ls_first_done = 1; + + fs_info(sdp, "first mount done, others may mount\n"); + + if (sdp->sd_lockstruct.ls_ops->lm_first_done) + sdp->sd_lockstruct.ls_ops->lm_first_done(sdp); + kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp); } @@ -944,7 +948,6 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent) struct gfs2_args *args = &sdp->sd_args; const char *proto = sdp->sd_proto_name; const char *table = sdp->sd_table_name; - const char *fsname; char *o, *options; int ret; @@ -1004,21 +1007,12 @@ hostdata_error: } 
} - if (sdp->sd_args.ar_spectator) - snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.s", table); - else - snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.%u", table, - sdp->sd_lockstruct.ls_jid); - - fsname = strchr(table, ':'); - if (fsname) - fsname++; if (lm->lm_mount == NULL) { fs_info(sdp, "Now mounting FS...\n"); complete_all(&sdp->sd_locking_init); return 0; } - ret = lm->lm_mount(sdp, fsname); + ret = lm->lm_mount(sdp, table); if (ret == 0) fs_info(sdp, "Joined cluster. Now mounting FS...\n"); complete_all(&sdp->sd_locking_init); @@ -1124,6 +1118,8 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent if (error) goto fail; + snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s", sdp->sd_table_name); + gfs2_create_debugfs_file(sdp); error = gfs2_sys_fs_add(sdp); @@ -1160,6 +1156,13 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent goto fail_sb; } + if (sdp->sd_args.ar_spectator) + snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.s", + sdp->sd_table_name); + else + snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.%u", + sdp->sd_table_name, sdp->sd_lockstruct.ls_jid); + error = init_inodes(sdp, DO); if (error) goto fail_sb; diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index f2a02edcac8..af49e8f432f 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -436,12 +436,16 @@ static void gfs2_recovery_done(struct gfs2_sbd *sdp, unsigned int jid, char env_status[20]; char *envp[] = { env_jid, env_status, NULL }; struct lm_lockstruct *ls = &sdp->sd_lockstruct; + ls->ls_recover_jid_done = jid; ls->ls_recover_jid_status = message; sprintf(env_jid, "JID=%d", jid); sprintf(env_status, "RECOVERY=%s", message == LM_RD_SUCCESS ? "Done" : "Failed"); kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp); + + if (sdp->sd_lockstruct.ls_ops->lm_recovery_result) + sdp->sd_lockstruct.ls_ops->lm_recovery_result(sdp, jid, message); } void gfs2_recover_func(struct work_struct *work) diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index 443cabcfcd2..d33172c291b 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -298,7 +298,7 @@ static ssize_t block_show(struct gfs2_sbd *sdp, char *buf) ssize_t ret; int val = 0; - if (test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags)) + if (test_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags)) val = 1; ret = sprintf(buf, "%d\n", val); return ret; @@ -313,9 +313,9 @@ static ssize_t block_store(struct gfs2_sbd *sdp, const char *buf, size_t len) val = simple_strtol(buf, NULL, 0); if (val == 1) - set_bit(DFL_BLOCK_LOCKS, &ls->ls_flags); + set_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags); else if (val == 0) { - clear_bit(DFL_BLOCK_LOCKS, &ls->ls_flags); + clear_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags); smp_mb__after_clear_bit(); gfs2_glock_thaw(sdp); } else { @@ -350,8 +350,8 @@ static ssize_t lkfirst_store(struct gfs2_sbd *sdp, const char *buf, size_t len) goto out; if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL) goto out; - sdp->sd_lockstruct.ls_first = first; - rv = 0; + sdp->sd_lockstruct.ls_first = first; + rv = 0; out: spin_unlock(&sdp->sd_jindex_spin); return rv ? 
rv : len; @@ -360,19 +360,14 @@ out: static ssize_t first_done_show(struct gfs2_sbd *sdp, char *buf) { struct lm_lockstruct *ls = &sdp->sd_lockstruct; - return sprintf(buf, "%d\n", ls->ls_first_done); + return sprintf(buf, "%d\n", !!test_bit(DFL_FIRST_MOUNT_DONE, &ls->ls_recover_flags)); } -static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len) +int gfs2_recover_set(struct gfs2_sbd *sdp, unsigned jid) { - unsigned jid; struct gfs2_jdesc *jd; int rv; - rv = sscanf(buf, "%u", &jid); - if (rv != 1) - return -EINVAL; - rv = -ESHUTDOWN; spin_lock(&sdp->sd_jindex_spin); if (test_bit(SDF_NORECOVERY, &sdp->sd_flags)) @@ -389,6 +384,20 @@ static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len) } out: spin_unlock(&sdp->sd_jindex_spin); + return rv; +} + +static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len) +{ + unsigned jid; + int rv; + + rv = sscanf(buf, "%u", &jid); + if (rv != 1) + return -EINVAL; + + rv = gfs2_recover_set(sdp, jid); + return rv ? rv : len; } diff --git a/fs/gfs2/sys.h b/fs/gfs2/sys.h index e94560e836d..79182d6ad6a 100644 --- a/fs/gfs2/sys.h +++ b/fs/gfs2/sys.h @@ -19,5 +19,7 @@ void gfs2_sys_fs_del(struct gfs2_sbd *sdp); int gfs2_sys_init(void); void gfs2_sys_uninit(void); +int gfs2_recover_set(struct gfs2_sbd *sdp, unsigned jid); + #endif /* __SYS_DOT_H__ */ diff --git a/include/linux/gfs2_ondisk.h b/include/linux/gfs2_ondisk.h index 4f4462974c1..b148087f49a 100644 --- a/include/linux/gfs2_ondisk.h +++ b/include/linux/gfs2_ondisk.h @@ -22,6 +22,8 @@ #define GFS2_LIVE_LOCK 1 #define GFS2_TRANS_LOCK 2 #define GFS2_RENAME_LOCK 3 +#define GFS2_CONTROL_LOCK 4 +#define GFS2_MOUNTED_LOCK 5 /* Format numbers for various metadata types */ -- cgit v1.2.3 From 49528b4e479195e5db4fe51fcd5ddd97901efc16 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Fri, 6 Jan 2012 16:48:50 -0500 Subject: GFS2: Fix a use-after-free that coverity spotted In function gfs2_inplace_release it was trying to unlock a gfs2_holder structure associated with a reservation, after said reservation was freed. The problem is that the statements have the wrong order. This patch corrects the order so that the reservation is freed after the gfs2_holder is unlocked. Signed-off-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/rgrp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 22234627f68..981bfa32121 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1108,9 +1108,9 @@ void gfs2_inplace_release(struct gfs2_inode *ip) { struct gfs2_blkreserv *rs = ip->i_res; - gfs2_blkrsv_put(ip); if (rs->rs_rgd_gh.gh_gl) gfs2_glock_dq_uninit(&rs->rs_rgd_gh); + gfs2_blkrsv_put(ip); } /** -- cgit v1.2.3 From e8ca5cc571a60339491f8c273a01093096ff8704 Mon Sep 17 00:00:00 2001 From: David Teigland Date: Mon, 9 Jan 2012 14:40:06 -0500 Subject: GFS2: let spectator mount do read only recovery Previously, a spectator mount would not even attempt to do journal recovery for a failed node. This meant that if all mounted nodes were spectators, everyone would be stuck after a node failed, all waiting for recovery to be performed. This is unnecessary since the failed node had a clean journal. Instead, allow a spectator mount to do a partial "read only" recovery, which means it will check if the failed journal is clean, and if so, report a successful recovery. If the failed journal is not clean, it reports that journal recovery failed. 
This makes it work the same as a read only mount on a read only block device. Signed-off-by: David Teigland Signed-off-by: Steven Whitehouse --- fs/gfs2/incore.h | 1 + fs/gfs2/ops_fstype.c | 2 +- fs/gfs2/recovery.c | 4 +++- 3 files changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index b9422bc8e2f..e5701c70f6f 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -498,6 +498,7 @@ enum { SDF_NORECOVERY = 4, SDF_DEMOTE = 5, SDF_NOJOURNALID = 6, + SDF_RORECOVERY = 7, /* read only recovery */ }; #define GFS2_FSNAME_LEN 256 diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index b01573b7ad9..6aacf3f230a 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -1078,7 +1078,7 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent if (sdp->sd_args.ar_spectator) { sb->s_flags |= MS_RDONLY; - set_bit(SDF_NORECOVERY, &sdp->sd_flags); + set_bit(SDF_RORECOVERY, &sdp->sd_flags); } if (sdp->sd_args.ar_posix_acl) sb->s_flags |= MS_POSIXACL; diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index af49e8f432f..80701d1566a 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -516,7 +516,9 @@ void gfs2_recover_func(struct work_struct *work) if (error) goto fail_gunlock_ji; - if (test_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags)) { + if (test_bit(SDF_RORECOVERY, &sdp->sd_flags)) { + ro = 1; + } else if (test_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags)) { if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) ro = 1; } else { -- cgit v1.2.3 From 376d37788b56bc2800e5bd56b7a36b3544d89f97 Mon Sep 17 00:00:00 2001 From: David Teigland Date: Mon, 9 Jan 2012 15:29:20 -0500 Subject: GFS2: fail mount if journal recovery fails If the first mounter fails to recover one of the journals during mount, the mount should fail. Signed-off-by: David Teigland Signed-off-by: Steven Whitehouse --- fs/gfs2/incore.h | 1 + fs/gfs2/recovery.c | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index e5701c70f6f..97742a7ea9c 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -429,6 +429,7 @@ struct gfs2_jdesc { #define JDF_RECOVERY 1 unsigned int jd_jid; unsigned int jd_blocks; + int jd_recover_error; }; struct gfs2_statfs_change_host { diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index 80701d1566a..963b2d75200 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -583,6 +583,7 @@ fail_gunlock_j: fs_info(sdp, "jid=%u: %s\n", jd->jd_jid, (error) ? "Failed" : "Done"); fail: + jd->jd_recover_error = error; gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP); done: clear_bit(JDF_RECOVERY, &jd->jd_flags); @@ -611,6 +612,6 @@ int gfs2_recover_journal(struct gfs2_jdesc *jd, bool wait) wait_on_bit(&jd->jd_flags, JDF_RECOVERY, gfs2_recovery_wait, TASK_UNINTERRUPTIBLE); - return 0; + return wait ? jd->jd_recover_error : 0; } -- cgit v1.2.3 From 66ad863b410efb7f537719006f9ac52400c1a5c5 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Wed, 11 Jan 2012 12:35:05 +0000 Subject: GFS2: Fix nlink setting on inode creation Since the nlink count will be 0, we need to use set_nlink rather than inc_nlink in order to avoid triggering the inc_nlink warning which was added recently. 
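A minimal sketch of the resulting pattern (assuming a freshly created inode whose link count is still zero at this point):

	/*
	 * i_nlink is 0 on the new inode, so inc_nlink() would trigger the
	 * recently added warning; set the count explicitly instead:
	 * 2 for a directory ("." plus the entry in the parent), 1 otherwise.
	 */
	set_nlink(&ip->i_inode, S_ISDIR(ip->i_inode.i_mode) ? 2 : 1);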
Signed-off-by: Steven Whitehouse --- fs/gfs2/inode.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 017960cf1d7..a7d611b93f0 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -599,9 +599,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name, error = gfs2_meta_inode_buffer(ip, &dibh); if (error) goto fail_end_trans; - inc_nlink(&ip->i_inode); - if (S_ISDIR(ip->i_inode.i_mode)) - inc_nlink(&ip->i_inode); + set_nlink(&ip->i_inode, S_ISDIR(ip->i_inode.i_mode) ? 2 : 1); gfs2_trans_add_bh(ip->i_gl, dibh, 1); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); -- cgit v1.2.3 From 353b67d8ced4dc53281c88150ad295e24bc4b4c5 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sat, 26 Nov 2011 00:35:39 +0100 Subject: jbd: Issue cache flush after checkpointing When we reach cleanup_journal_tail(), there is no guarantee that checkpointed buffers are on a stable storage - especially if buffers were written out by log_do_checkpoint(), they are likely to be only in disk's caches. Thus when we update journal superblock, effectively removing old transaction from journal, this write of superblock can get to stable storage before those checkpointed buffers which can result in filesystem corruption after a crash. A similar problem can happen if we replay the journal and wipe it before flushing disk's caches. Thus we must unconditionally issue a cache flush before we update journal superblock in these cases. The fix is slightly complicated by the fact that we have to get log tail before we issue cache flush but we can store it in the journal superblock only after the cache flush. Otherwise we risk races where new tail is written before appropriate cache flush is finished. I managed to reproduce the corruption using somewhat tweaked Chris Mason's barrier-test scheduler. Also this should fix occasional reports of 'Bit already freed' filesystem errors which are totally unreproducible but inspection of several fs images I've gathered over time points to a problem like this. CC: stable@kernel.org Signed-off-by: Jan Kara --- fs/jbd/checkpoint.c | 27 ++++++++++++++++++++++----- fs/jbd/recovery.c | 4 ++++ 2 files changed, 26 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c index 5d1a00a5041..05f0754f2b4 100644 --- a/fs/jbd/checkpoint.c +++ b/fs/jbd/checkpoint.c @@ -453,8 +453,6 @@ out: * * Return <0 on error, 0 on success, 1 if there was nothing to clean up. * - * Called with the journal lock held. - * * This is the only part of the journaling code which really needs to be * aware of transaction aborts. Checkpointing involves writing to the * main filesystem area rather than to the journal, so it can proceed @@ -472,13 +470,14 @@ int cleanup_journal_tail(journal_t *journal) if (is_journal_aborted(journal)) return 1; - /* OK, work out the oldest transaction remaining in the log, and + /* + * OK, work out the oldest transaction remaining in the log, and * the log block it starts at. * * If the log is now empty, we need to work out which is the * next transaction ID we will write, and where it will - * start. */ - + * start. 
+ */ spin_lock(&journal->j_state_lock); spin_lock(&journal->j_list_lock); transaction = journal->j_checkpoint_transactions; @@ -504,7 +503,25 @@ int cleanup_journal_tail(journal_t *journal) spin_unlock(&journal->j_state_lock); return 1; } + spin_unlock(&journal->j_state_lock); + + /* + * We need to make sure that any blocks that were recently written out + * --- perhaps by log_do_checkpoint() --- are flushed out before we + * drop the transactions from the journal. It's unlikely this will be + * necessary, especially with an appropriately sized journal, but we + * need this to guarantee correctness. Fortunately + * cleanup_journal_tail() doesn't get called all that often. + */ + if (journal->j_flags & JFS_BARRIER) + blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); + spin_lock(&journal->j_state_lock); + if (!tid_gt(first_tid, journal->j_tail_sequence)) { + spin_unlock(&journal->j_state_lock); + /* Someone else cleaned up journal so return 0 */ + return 0; + } /* OK, update the superblock to recover the freed space. * Physical blocks come first: have we wrapped beyond the end of * the log? */ diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c index 5b43e96788e..008bf062fd2 100644 --- a/fs/jbd/recovery.c +++ b/fs/jbd/recovery.c @@ -20,6 +20,7 @@ #include #include #include +#include #endif /* @@ -263,6 +264,9 @@ int journal_recover(journal_t *journal) err2 = sync_blockdev(journal->j_fs_dev); if (!err) err = err2; + /* Flush disk caches to get replayed data on the permanent storage */ + if (journal->j_flags & JFS_BARRIER) + blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); return err; } -- cgit v1.2.3 From 34b07840565004cfa444e165e88bf77a5cbcdb25 Mon Sep 17 00:00:00 2001 From: Djalal Harouni Date: Mon, 9 Jan 2012 15:58:37 +0100 Subject: ext2: protect inode changes in the SETVERSION and SETFLAGS ioctls Unlock mutex after i_flags and i_ctime updates in the EXT2_IOC_SETFLAGS ioctl. Use i_mutex in the EXT2_IOC_SETVERSION ioctl to protect i_ctime and i_generation updates and make the ioctl consistent since i_mutex is also used in other places to protect timestamps and inode changes. 
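Condensed, the intended ordering inside each ioctl is roughly the following (a simplified sketch; the mnt_want_write_file()/mnt_drop_write_file() pairing and error handling are omitted):

	mutex_lock(&inode->i_mutex);
	/* EXT2_IOC_SETFLAGS updates ei->i_flags and calls ext2_set_inode_flags();
	 * EXT2_IOC_SETVERSION updates i_generation; both bump i_ctime. */
	inode->i_ctime = CURRENT_TIME_SEC;
	inode->i_generation = generation;
	mutex_unlock(&inode->i_mutex);

	mark_inode_dirty(inode);	/* dirty the inode only after i_mutex is dropped */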
Cc: Andreas Dilger Cc: Jan Kara Signed-off-by: Djalal Harouni Signed-off-by: Jan Kara --- fs/ext2/ioctl.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c index 1089f760c84..2de655f5d62 100644 --- a/fs/ext2/ioctl.c +++ b/fs/ext2/ioctl.c @@ -77,10 +77,11 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) flags = flags & EXT2_FL_USER_MODIFIABLE; flags |= oldflags & ~EXT2_FL_USER_MODIFIABLE; ei->i_flags = flags; - mutex_unlock(&inode->i_mutex); ext2_set_inode_flags(inode); inode->i_ctime = CURRENT_TIME_SEC; + mutex_unlock(&inode->i_mutex); + mark_inode_dirty(inode); setflags_out: mnt_drop_write_file(filp); @@ -88,20 +89,29 @@ setflags_out: } case EXT2_IOC_GETVERSION: return put_user(inode->i_generation, (int __user *) arg); - case EXT2_IOC_SETVERSION: + case EXT2_IOC_SETVERSION: { + __u32 generation; + if (!inode_owner_or_capable(inode)) return -EPERM; ret = mnt_want_write_file(filp); if (ret) return ret; - if (get_user(inode->i_generation, (int __user *) arg)) { + if (get_user(generation, (int __user *) arg)) { ret = -EFAULT; - } else { - inode->i_ctime = CURRENT_TIME_SEC; - mark_inode_dirty(inode); + goto setversion_out; } + + mutex_lock(&inode->i_mutex); + inode->i_ctime = CURRENT_TIME_SEC; + inode->i_generation = generation; + mutex_unlock(&inode->i_mutex); + + mark_inode_dirty(inode); +setversion_out: mnt_drop_write_file(filp); return ret; + } case EXT2_IOC_GETRSVSZ: if (test_opt(inode->i_sb, RESERVATION) && S_ISREG(inode->i_mode) -- cgit v1.2.3 From 1f5d78dc4823a85f112aaa2d0f17624f8c2a6c52 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Wed, 11 Jan 2012 15:13:27 +0200 Subject: UBIFS: make debugging messages light again We switch to dynamic debugging in commit 56e46742e846e4de167dde0e1e1071ace1c882a5 but did not take into account that now we do not control anymore whether a specific message is enabled or not. So now we lock the "dbg_lock" and release it in every debugging macro, which make them not so light-weight. This commit removes the "dbg_lock" protection from the debugging macros to fix the issue. The downside is that now our DBGKEY() stuff is broken, but this is not critical at all and will be fixed later. Signed-off-by: Artem Bityutskiy Cc: stable@kernel.org [3.0+] --- fs/ubifs/debug.h | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h index 8d9c4681018..164fd48c583 100644 --- a/fs/ubifs/debug.h +++ b/fs/ubifs/debug.h @@ -175,19 +175,17 @@ const char *dbg_key_str1(const struct ubifs_info *c, const union ubifs_key *key); /* - * DBGKEY macros require @dbg_lock to be held, which it is in the dbg message - * macros. + * TODO: these macros are now broken because there is no locking around them + * and we use a global buffer for the key string. This means that in case of + * concurrent execution we will end up with incorrect and messy key strings. */ #define DBGKEY(key) dbg_key_str0(c, (key)) #define DBGKEY1(key) dbg_key_str1(c, (key)) extern spinlock_t dbg_lock; -#define ubifs_dbg_msg(type, fmt, ...) do { \ - spin_lock(&dbg_lock); \ - pr_debug("UBIFS DBG " type ": " fmt "\n", ##__VA_ARGS__); \ - spin_unlock(&dbg_lock); \ -} while (0) +#define ubifs_dbg_msg(type, fmt, ...) \ + pr_debug("UBIFS DBG " type ": " fmt "\n", ##__VA_ARGS__) /* Just a debugging messages not related to any specific UBIFS subsystem */ #define dbg_msg(fmt, ...) 
ubifs_dbg_msg("msg", fmt, ##__VA_ARGS__) -- cgit v1.2.3 From d34315da9146253351146140ea4b277193ee5e5f Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Tue, 10 Jan 2012 19:32:30 +0200 Subject: UBIFS: fix debugging messages Patch 56e46742e846e4de167dde0e1e1071ace1c882a5 broke UBIFS debugging messages: before that commit when UBIFS debugging was enabled, users saw few useful debugging messages after mount. However, that patch turned 'dbg_msg()' into 'pr_debug()', so to enable the debugging messages users have to enable them first via /sys/kernel/debug/dynamic_debug/control, which is very impractical. This commit makes 'dbg_msg()' to use 'printk()' instead of 'pr_debug()', just as it was before the breakage. Signed-off-by: Artem Bityutskiy Cc: stable@kernel.org [3.0+] --- fs/ubifs/debug.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h index 164fd48c583..c9d294111a5 100644 --- a/fs/ubifs/debug.h +++ b/fs/ubifs/debug.h @@ -188,7 +188,10 @@ extern spinlock_t dbg_lock; pr_debug("UBIFS DBG " type ": " fmt "\n", ##__VA_ARGS__) /* Just a debugging messages not related to any specific UBIFS subsystem */ -#define dbg_msg(fmt, ...) ubifs_dbg_msg("msg", fmt, ##__VA_ARGS__) +#define dbg_msg(fmt, ...) \ + printk(KERN_DEBUG "UBIFS DBG (pid %d): %s: " fmt "\n", current->pid, \ + __func__, ##__VA_ARGS__) + /* General messages */ #define dbg_gen(fmt, ...) ubifs_dbg_msg("gen", fmt, ##__VA_ARGS__) /* Additional journal messages */ -- cgit v1.2.3 From d46cfba5363a163851dc768f717f34185527a472 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 4 Jan 2012 16:30:15 -0600 Subject: ceph: always initialize the dentry in open_root_dentry() When open_root_dentry() gets a dentry via d_obtain_alias() it does not get initialized. If the dentry obtained came from the cache, this is OK. But if not, the result is an improperly initialized dentry. To fix this, call ceph_init_dentry() regardless of which path produced the dentry. That function returns immediately for a dentry that is already initialized, it is safe to use either way. (Credit to Sage, who suggested this fix.) Signed-off-by: Alex Elder --- fs/ceph/super.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ceph/super.c b/fs/ceph/super.c index b48f15f101a..ec74313e901 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -638,12 +638,11 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc, if (err == 0) { dout("open_root_inode success\n"); if (ceph_ino(req->r_target_inode) == CEPH_INO_ROOT && - fsc->sb->s_root == NULL) { + fsc->sb->s_root == NULL) root = d_alloc_root(req->r_target_inode); - ceph_init_dentry(root); - } else { + else root = d_obtain_alias(req->r_target_inode); - } + ceph_init_dentry(root); req->r_target_inode = NULL; dout("open_root_inode success, root dentry is %p\n", root); } else { -- cgit v1.2.3 From 0b4156eb27214e81f7012458bb15d1e038db9a00 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Thu, 12 Jan 2012 09:11:56 +0100 Subject: fs: remove unneeded plug in mpage_readpages() The block plug in mpage_readpages() duplicates the one in read_pages(). 
Signed-off-by: Namjae Jeon Signed-off-by: Amit Sahrawat Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe --- fs/mpage.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'fs') diff --git a/fs/mpage.c b/fs/mpage.c index fdfae9fa98c..643e9f55ef2 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -371,9 +371,6 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages, sector_t last_block_in_bio = 0; struct buffer_head map_bh; unsigned long first_logical_block = 0; - struct blk_plug plug; - - blk_start_plug(&plug); map_bh.b_state = 0; map_bh.b_size = 0; @@ -395,7 +392,6 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages, BUG_ON(!list_empty(pages)); if (bio) mpage_bio_submit(READ, bio); - blk_finish_plug(&plug); return 0; } EXPORT_SYMBOL(mpage_readpages); -- cgit v1.2.3 From 46fe44ce8777f087aa8ad4a2605fdcfb9c2d63af Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 16 Nov 2011 15:03:59 +0100 Subject: quota: Pass information that quota is stored in system file to userspace Quota tools need to know whether quota is stored in a system file or in classical aquota.{user|group} files. So pass this information as a flag in GETINFO quotactl. Signed-off-by: Jan Kara --- fs/quota/dquot.c | 8 +++++--- include/linux/quota.h | 6 +++++- 2 files changed, 10 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 5ec59b20cf7..46741970371 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2125,6 +2125,8 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, mutex_unlock(&dqopt->dqio_mutex); goto out_file_init; } + if (dqopt->flags & DQUOT_QUOTA_SYS_FILE) + dqopt->info[type].dqi_flags |= DQF_SYS_FILE; mutex_unlock(&dqopt->dqio_mutex); spin_lock(&dq_state_lock); dqopt->flags |= dquot_state_flag(flags, type); @@ -2464,7 +2466,7 @@ int dquot_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii) spin_lock(&dq_data_lock); ii->dqi_bgrace = mi->dqi_bgrace; ii->dqi_igrace = mi->dqi_igrace; - ii->dqi_flags = mi->dqi_flags & DQF_MASK; + ii->dqi_flags = mi->dqi_flags & DQF_GETINFO_MASK; ii->dqi_valid = IIF_ALL; spin_unlock(&dq_data_lock); mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); @@ -2490,8 +2492,8 @@ int dquot_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii) if (ii->dqi_valid & IIF_IGRACE) mi->dqi_igrace = ii->dqi_igrace; if (ii->dqi_valid & IIF_FLAGS) - mi->dqi_flags = (mi->dqi_flags & ~DQF_MASK) | - (ii->dqi_flags & DQF_MASK); + mi->dqi_flags = (mi->dqi_flags & ~DQF_SETINFO_MASK) | + (ii->dqi_flags & DQF_SETINFO_MASK); spin_unlock(&dq_data_lock); mark_info_dirty(sb, type); /* Force write to disk */ diff --git a/include/linux/quota.h b/include/linux/quota.h index cb785569903..c09fa042b5e 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -230,7 +230,11 @@ struct mem_dqinfo { struct super_block; #define DQF_MASK 0xffff /* Mask for format specific flags */ -#define DQF_INFO_DIRTY_B 16 +#define DQF_GETINFO_MASK 0x1ffff /* Mask for flags passed to userspace */ +#define DQF_SETINFO_MASK 0xffff /* Mask for flags modifiable from userspace */ +#define DQF_SYS_FILE_B 16 +#define DQF_SYS_FILE (1 << DQF_SYS_FILE_B) /* Quota file stored as system file */ +#define DQF_INFO_DIRTY_B 31 #define DQF_INFO_DIRTY (1 << DQF_INFO_DIRTY_B) /* Is info dirty? 
*/ extern void mark_info_dirty(struct super_block *sb, int type); -- cgit v1.2.3 From 46f72b349290d2bd7aecea38f02609d814332df6 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 10 Jan 2012 09:04:37 -0800 Subject: vfs: export symbol d_find_any_alias() Ceph needs this. Reviewed-by: Christoph Hellwig Signed-off-by: Sage Weil --- fs/dcache.c | 11 +++++++++-- include/linux/dcache.h | 1 + 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index 89509b5a090..ba960051dfb 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1471,7 +1471,14 @@ static struct dentry * __d_find_any_alias(struct inode *inode) return alias; } -static struct dentry * d_find_any_alias(struct inode *inode) +/** + * d_find_any_alias - find any alias for a given inode + * @inode: inode to find an alias for + * + * If any aliases exist for the given inode, take and return a + * reference for one of them. If no aliases exist, return %NULL. + */ +struct dentry *d_find_any_alias(struct inode *inode) { struct dentry *de; @@ -1480,7 +1487,7 @@ static struct dentry * d_find_any_alias(struct inode *inode) spin_unlock(&inode->i_lock); return de; } - +EXPORT_SYMBOL(d_find_any_alias); /** * d_obtain_alias - find or allocate a dentry for a given inode diff --git a/include/linux/dcache.h b/include/linux/dcache.h index ed9f74f6c51..3871ba743b4 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -241,6 +241,7 @@ extern struct dentry * d_alloc(struct dentry *, const struct qstr *); extern struct dentry * d_alloc_pseudo(struct super_block *, const struct qstr *); extern struct dentry * d_splice_alias(struct inode *, struct dentry *); extern struct dentry * d_add_ci(struct dentry *, struct inode *, struct qstr *); +extern struct dentry *d_find_any_alias(struct inode *inode); extern struct dentry * d_obtain_alias(struct inode *); extern void shrink_dcache_sb(struct super_block *); extern void shrink_dcache_parent(struct dentry *); -- cgit v1.2.3 From a40dc6cc2e121abcbd1b22583ef5447763df510c Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 10 Jan 2012 09:12:55 -0800 Subject: ceph: enable/disable dentry complete flags via mount option Enable/disable use of the dentry dir 'complete' flag via a mount option. This lets the admin control whether ceph uses the dcache to satisfy negative lookups or readdir when it has the entire directory contents in its cache. This is purely a performance optimization; correctness is guaranteed whether it is enabled or not. Reviewed-by: Christoph Hellwig Signed-off-by: Sage Weil --- Documentation/filesystems/ceph.txt | 18 +++++++++++++----- fs/ceph/dir.c | 25 ++++++++++++++++++++++--- fs/ceph/super.c | 14 ++++++++++++++ fs/ceph/super.h | 1 + 4 files changed, 50 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/Documentation/filesystems/ceph.txt b/Documentation/filesystems/ceph.txt index 763d8ebbbeb..d6030aa3337 100644 --- a/Documentation/filesystems/ceph.txt +++ b/Documentation/filesystems/ceph.txt @@ -119,12 +119,20 @@ Mount Options must rely on TCP's error correction to detect data corruption in the data payload. - noasyncreaddir - Disable client's use its local cache to satisfy readdir - requests. (This does not change correctness; the client uses - cached metadata only when a lease or capability ensures it is - valid.) + dcache + Use the dcache contents to perform negative lookups and + readdir when the client has the entire directory contents in + its cache. 
(This does not change correctness; the client uses + cached metadata only when a lease or capability ensures it is + valid.) + + nodcache + Do not use the dcache as above. This avoids a significant amount of + complex code, sacrificing performance without affecting correctness, + and is useful for tracking down bugs. + noasyncreaddir + Do not use the dcache as above for readdir. More Information ================ diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 974ef1e4d26..5259abfb5dd 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -1094,17 +1094,36 @@ static int ceph_snapdir_d_revalidate(struct dentry *dentry, */ void ceph_dir_set_complete(struct inode *inode) { - /* not yet implemented */ + struct dentry *dentry = d_find_any_alias(inode); + + if (dentry && ceph_dentry(dentry) && + ceph_test_mount_opt(ceph_sb_to_client(dentry->d_sb), DCACHE)) { + dout(" marking %p (%p) complete\n", inode, dentry); + set_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags); + } + dput(dentry); } void ceph_dir_clear_complete(struct inode *inode) { - /* not yet implemented */ + struct dentry *dentry = d_find_any_alias(inode); + + if (dentry && ceph_dentry(dentry)) { + dout(" marking %p (%p) complete\n", inode, dentry); + set_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags); + } + dput(dentry); } bool ceph_dir_test_complete(struct inode *inode) { - /* not yet implemented */ + struct dentry *dentry = d_find_any_alias(inode); + + if (dentry && ceph_dentry(dentry)) { + dout(" marking %p (%p) NOT complete\n", inode, dentry); + clear_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags); + } + dput(dentry); return false; } diff --git a/fs/ceph/super.c b/fs/ceph/super.c index ec74313e901..9c62fe02ce0 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -131,6 +131,8 @@ enum { Opt_rbytes, Opt_norbytes, Opt_noasyncreaddir, + Opt_dcache, + Opt_nodcache, Opt_ino32, }; @@ -152,6 +154,8 @@ static match_table_t fsopt_tokens = { {Opt_rbytes, "rbytes"}, {Opt_norbytes, "norbytes"}, {Opt_noasyncreaddir, "noasyncreaddir"}, + {Opt_dcache, "dcache"}, + {Opt_nodcache, "nodcache"}, {Opt_ino32, "ino32"}, {-1, NULL} }; @@ -231,6 +235,12 @@ static int parse_fsopt_token(char *c, void *private) case Opt_noasyncreaddir: fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR; break; + case Opt_dcache: + fsopt->flags |= CEPH_MOUNT_OPT_DCACHE; + break; + case Opt_nodcache: + fsopt->flags &= ~CEPH_MOUNT_OPT_DCACHE; + break; case Opt_ino32: fsopt->flags |= CEPH_MOUNT_OPT_INO32; break; @@ -377,6 +387,10 @@ static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt) seq_puts(m, ",norbytes"); if (fsopt->flags & CEPH_MOUNT_OPT_NOASYNCREADDIR) seq_puts(m, ",noasyncreaddir"); + if (fsopt->flags & CEPH_MOUNT_OPT_DCACHE) + seq_puts(m, ",dcache"); + else + seq_puts(m, ",nodcache"); if (fsopt->wsize) seq_printf(m, ",wsize=%d", fsopt->wsize); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index edcbf3774a5..140f99f978c 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -28,6 +28,7 @@ #define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */ #define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */ #define CEPH_MOUNT_OPT_INO32 (1<<8) /* 32 bit inos */ +#define CEPH_MOUNT_OPT_DCACHE (1<<9) /* use dcache for readdir etc */ #define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES) -- cgit v1.2.3 From 83eb26af0db71f2dfe551405c55d982288fa6178 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 11 Jan 2012 17:41:01 -0800 Subject: ceph: ensure prealloc_blob is in place when removing xattr In __ceph_build_xattrs_blob(), if a ceph inode's 
extended attributes are marked dirty, all attributes recorded in its rb_tree index are formatted into a "blob" buffer. The target buffer is recorded in ceph_inode->i_xattrs.prealloc_blob, and it is expected to exist and be of sufficient size to hold the attributes. The extended attributes are marked dirty in two cases: when a new attribute is added to the inode; or when one is removed. In the former case work is done to ensure the prealloc_blob buffer is properly set up, but in the latter it is not. Change the logic in ceph_removexattr() so it matches what is done in ceph_setxattr(). Note that this is done in a way that keeps the two blocks of code nearly identical, in anticipation of a subsequent patch that encapsulates some of this logic into one or more helper routines. Signed-off-by: Alex Elder Signed-off-by: Sage Weil --- fs/ceph/xattr.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'fs') diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index a5e36e4488a..857214ae8c0 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -818,6 +818,7 @@ int ceph_removexattr(struct dentry *dentry, const char *name) struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode); int issued; int err; + int required_blob_size; int dirty; if (ceph_snap(inode) != CEPH_NOSNAP) @@ -833,14 +834,34 @@ int ceph_removexattr(struct dentry *dentry, const char *name) return -EOPNOTSUPP; } + err = -ENOMEM; spin_lock(&ci->i_ceph_lock); __build_xattrs(inode); +retry: issued = __ceph_caps_issued(ci, NULL); dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued)); if (!(issued & CEPH_CAP_XATTR_EXCL)) goto do_sync; + required_blob_size = __get_required_blob_size(ci, 0, 0); + + if (!ci->i_xattrs.prealloc_blob || + required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) { + struct ceph_buffer *blob; + + spin_unlock(&ci->i_ceph_lock); + dout(" preaallocating new blob size=%d\n", required_blob_size); + blob = ceph_buffer_new(required_blob_size, GFP_NOFS); + if (!blob) + goto out; + spin_lock(&ci->i_ceph_lock); + if (ci->i_xattrs.prealloc_blob) + ceph_buffer_put(ci->i_xattrs.prealloc_blob); + ci->i_xattrs.prealloc_blob = blob; + goto retry; + } + err = __remove_xattr_by_name(ceph_inode(inode), name); dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL); ci->i_xattrs.dirty = true; @@ -853,6 +874,7 @@ int ceph_removexattr(struct dentry *dentry, const char *name) do_sync: spin_unlock(&ci->i_ceph_lock); err = ceph_send_removexattr(dentry, name); +out: return err; } -- cgit v1.2.3 From 7250170c9ed00f3b74b11b98afefab45020672dd Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 11 Jan 2012 10:46:27 +0300 Subject: cifs: integer overflow in parse_dacl() On 32 bit systems num_aces * sizeof(struct cifs_ace *) could overflow leading to a smaller ppace buffer than we expected. 
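The guard added for parse_dacl() is the standard check for multiplication overflow before an allocation: compare the count against ULONG_MAX divided by the element size instead of multiplying first. A self-contained userspace illustration of the same check (alloc_array() is an invented helper, not cifs code):

#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

/* Reject the element count if count * elem_size cannot be represented,
 * instead of letting the multiplication wrap and under-allocating. */
static void *alloc_array(unsigned long count, size_t elem_size)
{
    if (elem_size && count > ULONG_MAX / elem_size) {
        fprintf(stderr, "count %lu would overflow the allocation size\n",
                count);
        return NULL;
    }
    return malloc(count * elem_size);
}

int main(void)
{
    /* One element more than can ever fit: always rejected by the check,
     * whereas the unchecked multiply would wrap to a tiny size. */
    unsigned long count = ULONG_MAX / sizeof(void *) + 1;
    void *p = alloc_array(count, sizeof(void *));

    printf("allocation %s\n", p ? "succeeded" : "rejected");
    free(p);
    return 0;
}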
Signed-off-by: Dan Carpenter Acked-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsacl.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 72ddf23ef6f..c1b25448738 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -909,6 +909,8 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl, umode_t group_mask = S_IRWXG; umode_t other_mask = S_IRWXU | S_IRWXG | S_IRWXO; + if (num_aces > ULONG_MAX / sizeof(struct cifs_ace *)) + return; ppace = kmalloc(num_aces * sizeof(struct cifs_ace *), GFP_KERNEL); if (!ppace) { -- cgit v1.2.3 From 0e0243dc35a2349b3946e54f90e874be396fdb8b Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 12 Jan 2012 10:06:05 +0300 Subject: NFS: add an endian notation This function returns a big endian value. The implementation in fs/nfs/callback_proc.c is declared with "__be32" but the .h file uses "unsigned" instead. It makes sparse complain: fs/nfs/callback_proc.c:232:8: error: symbol 'nfs4_callback_layoutrecall' redeclared with different type (originally declared at fs/nfs/callback.h:165) - different base types Signed-off-by: Dan Carpenter Signed-off-by: Trond Myklebust --- fs/nfs/callback.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h index 07df5f1d85e..c89d3b9e483 100644 --- a/fs/nfs/callback.h +++ b/fs/nfs/callback.h @@ -162,7 +162,7 @@ struct cb_layoutrecallargs { }; }; -extern unsigned nfs4_callback_layoutrecall( +extern __be32 nfs4_callback_layoutrecall( struct cb_layoutrecallargs *args, void *dummy, struct cb_process_state *cps); -- cgit v1.2.3 From 13fff2f35fd21d69ee84ef6a78610420e1a42818 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 12 Jan 2012 10:07:35 +0300 Subject: NFS: cleanup endian type in decode_ds_addr() port is supposed to be a __be16 here. The existing code should work fine, but this is a cleanup. Signed-off-by: Dan Carpenter Signed-off-by: Trond Myklebust --- fs/nfs/nfs4filelayoutdev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c index ed388aae968..8ae91908f5a 100644 --- a/fs/nfs/nfs4filelayoutdev.c +++ b/fs/nfs/nfs4filelayoutdev.c @@ -382,7 +382,7 @@ decode_ds_addr(struct xdr_stream *streamp, gfp_t gfp_flags) { struct nfs4_pnfs_ds_addr *da = NULL; char *buf, *portstr; - u32 port; + __be16 port; int nlen, rlen; int tmp[2]; __be32 *p; -- cgit v1.2.3 From 363e0df057ea8da539645fe4c3c227e3d44054cc Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 12 Jan 2012 10:16:14 +0300 Subject: nfs: check for integer overflow in decode_devicenotify_args() On 32 bit, if n is too large then "n * sizeof(*args->devs)" could overflow and args->devs would be smaller than expected. 
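The devicenotify bound check that follows applies the same overflow pattern as the cifs fix shown above. The other two cleanups here concern sparse endianness annotations (a __be32 return type, a __be16 port), which keep big-endian wire values in a distinct type from CPU-order integers. A small userspace analogue using htons()/ntohs() (the kernel types and sparse checking do not exist in userspace; this only demonstrates the byte-order difference those annotations guard against):

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint16_t host_port = 2049;              /* CPU byte order */
    uint16_t wire_port = htons(host_port);  /* big-endian, as __be16 would be */

    /* On a little-endian machine the raw wire value reinterpreted as a
     * host integer is wrong (264, not 2049); only the converted value
     * is meaningful.  sparse's __be16/__be32 annotations catch this
     * kind of mix-up at build time in the kernel. */
    printf("host: %u  raw wire value: %u  converted back: %u\n",
           (unsigned)host_port, (unsigned)wire_port,
           (unsigned)ntohs(wire_port));
    return 0;
}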
Signed-off-by: Dan Carpenter Signed-off-by: Trond Myklebust --- fs/nfs/callback_xdr.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 726e59a9e50..d50b2742f23 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -305,6 +305,10 @@ __be32 decode_devicenotify_args(struct svc_rqst *rqstp, n = ntohl(*p++); if (n <= 0) goto out; + if (n > ULONG_MAX / sizeof(*args->devs)) { + status = htonl(NFS4ERR_BADXDR); + goto out; + } args->devs = kmalloc(n * sizeof(*args->devs), GFP_KERNEL); if (!args->devs) { -- cgit v1.2.3 From de040beccd52bb5fcac90031505384d037b1111c Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Tue, 10 Jan 2012 22:42:47 +0800 Subject: NFS4: fix compile warnings in nfs4proc.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit compile in nfs-for-3.3 branch shows following warnings. Fix it here. fs/nfs/nfs4proc.c: In function ‘__nfs4_get_acl_uncached’: fs/nfs/nfs4proc.c:3589: warning: format ‘%ld’ expects type ‘long int’, but argument 4 has type ‘size_t’ fs/nfs/nfs4proc.c:3589: warning: format ‘%ld’ expects type ‘long int’, but argument 6 has type ‘size_t’ Signed-off-by: Peng Tao Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 75366dc8968..f0c849c98fe 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3587,7 +3587,7 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu res.acl_flags |= NFS4_ACL_LEN_REQUEST; resp_buf = page_address(pages[0]); - dprintk("%s buf %p buflen %ld npages %d args.acl_len %ld\n", + dprintk("%s buf %p buflen %zu npages %d args.acl_len %zu\n", __func__, buf, buflen, npages, args.acl_len); ret = nfs4_call_sync(NFS_SERVER(inode)->client, NFS_SERVER(inode), &msg, &args.seq_args, &res.seq_res, 0); -- cgit v1.2.3 From 39e567ae36fe03c2b446e1b83ee3d39bea08f90b Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Thu, 12 Jan 2012 23:18:41 +0800 Subject: pnfsblock: acquire im_lock in _preload_range When calling _add_entry, we should take the im_lock to protect agains other modifiers. Cc: #3.1+ Signed-off-by: Peng Tao Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/extents.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c index 19fa7b0b8c0..c69682a4262 100644 --- a/fs/nfs/blocklayout/extents.c +++ b/fs/nfs/blocklayout/extents.c @@ -139,11 +139,13 @@ static int _set_range(struct my_tree *tree, int32_t tag, u64 s, u64 length) } /* Ensure that future operations on given range of tree will not malloc */ -static int _preload_range(struct my_tree *tree, u64 offset, u64 length) +static int _preload_range(struct pnfs_inval_markings *marks, + u64 offset, u64 length) { u64 start, end, s; int count, i, used = 0, status = -ENOMEM; struct pnfs_inval_tracking **storage; + struct my_tree *tree = &marks->im_tree; dprintk("%s(%llu, %llu) enter\n", __func__, offset, length); start = normalize(offset, tree->mtt_step_size); @@ -161,12 +163,11 @@ static int _preload_range(struct my_tree *tree, u64 offset, u64 length) goto out_cleanup; } - /* Now need lock - HOW??? */ - + spin_lock(&marks->im_lock); for (s = start; s < end; s += tree->mtt_step_size) used += _add_entry(tree, s, INTERNAL_EXISTS, storage[used]); + spin_unlock(&marks->im_lock); - /* Unlock - HOW??? 
*/ status = 0; out_cleanup: @@ -286,7 +287,7 @@ int bl_mark_sectors_init(struct pnfs_inval_markings *marks, start = normalize(offset, marks->im_block_size); end = normalize_up(offset + length, marks->im_block_size); - if (_preload_range(&marks->im_tree, start, end - start)) + if (_preload_range(marks, start, end - start)) goto outerr; spin_lock(&marks->im_lock); -- cgit v1.2.3 From 82b906d6550ee5fe0d5553359b3c9692dd0aed31 Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Thu, 12 Jan 2012 23:18:43 +0800 Subject: pnfsblock: set read/write tk_status to pnfs_error To pass the IO status to upper layer. Signed-off-by: Peng Tao Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/blocklayout.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 281ae95932c..06fe0802118 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -216,6 +216,7 @@ bl_end_par_io_read(void *data) { struct nfs_read_data *rdata = data; + rdata->task.tk_status = rdata->pnfs_error; INIT_WORK(&rdata->task.u.tk_work, bl_read_cleanup); schedule_work(&rdata->task.u.tk_work); } @@ -405,7 +406,7 @@ static void bl_end_par_io_write(void *data) { struct nfs_write_data *wdata = data; - wdata->task.tk_status = 0; + wdata->task.tk_status = wdata->pnfs_error; wdata->verf.committed = NFS_FILE_SYNC; INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup); schedule_work(&wdata->task.u.tk_work); -- cgit v1.2.3 From 57582b372f63d0f655b1a35b0d306d73d1a46775 Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Thu, 12 Jan 2012 23:18:45 +0800 Subject: pnfsblock: clean up _add_entry It is wrong to kmalloc in _add_entry() as it is inside spinlock. memory should be already allocated _add_entry() is called. Signed-off-by: Peng Tao Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/extents.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c index c69682a4262..369ef53e89e 100644 --- a/fs/nfs/blocklayout/extents.c +++ b/fs/nfs/blocklayout/extents.c @@ -110,13 +110,7 @@ static int _add_entry(struct my_tree *tree, u64 s, int32_t tag, return 0; } else { struct pnfs_inval_tracking *new; - if (storage) - new = storage; - else { - new = kmalloc(sizeof(*new), GFP_NOFS); - if (!new) - return -ENOMEM; - } + new = storage; new->it_sector = s; new->it_tags = (1 << tag); list_add(&new->it_link, &pos->it_link); -- cgit v1.2.3 From 93a3844ee0f843b05a1df4b52e1a19ff26b98d24 Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Thu, 12 Jan 2012 23:18:47 +0800 Subject: pnfsblock: don't spinlock when freeing block_dev bl_free_block_dev() may sleep. We can not call it with spinlock held. Besides, there is no need to take bm_lock as we are last user freeing bm_devlist. 
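The free_blk_mountid() change just described (its diff follows) relies on two facts: teardown runs when nothing else can reach bm_devlist, so no lock is needed, and the walk must cache the next pointer before each node is freed, which is exactly what list_for_each_entry_safe() provides. A stand-alone sketch of that teardown shape with an ordinary singly linked list (dev_node and free_devlist() are invented names, not pnfs code):

#include <stdio.h>
#include <stdlib.h>

/* Mirrors the shape of the patched teardown: no lock, and the next
 * pointer is cached before the current node is freed. */
struct dev_node {
    int id;
    struct dev_node *next;
};

static void free_devlist(struct dev_node *head)
{
    struct dev_node *cur, *next;

    for (cur = head; cur; cur = next) {
        next = cur->next;        /* cache before the node disappears */
        printf("releasing dev %d\n", cur->id);
        free(cur);               /* analogue of bl_free_block_dev() */
    }
}

int main(void)
{
    struct dev_node *head = NULL;

    for (int i = 0; i < 3; i++) {
        struct dev_node *n = malloc(sizeof(*n));

        if (!n)
            break;
        n->id = i;
        n->next = head;
        head = n;
    }
    free_devlist(head);
    return 0;
}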
Cc: #3.1+ Signed-off-by: Peng Tao Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/blocklayout.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 06fe0802118..6d39e9ab1e6 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -780,16 +780,13 @@ bl_cleanup_layoutcommit(struct nfs4_layoutcommit_data *lcdata) static void free_blk_mountid(struct block_mount_id *mid) { if (mid) { - struct pnfs_block_dev *dev; - spin_lock(&mid->bm_lock); - while (!list_empty(&mid->bm_devlist)) { - dev = list_first_entry(&mid->bm_devlist, - struct pnfs_block_dev, - bm_node); + struct pnfs_block_dev *dev, *tmp; + + /* No need to take bm_lock as we are last user freeing bm_devlist */ + list_for_each_entry_safe(dev, tmp, &mid->bm_devlist, bm_node) { list_del(&dev->bm_node); bl_free_block_dev(dev); } - spin_unlock(&mid->bm_lock); kfree(mid); } } -- cgit v1.2.3 From 74a6eeb44ca6174d9cc93b9b8b4d58211c57bc80 Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Thu, 12 Jan 2012 23:18:48 +0800 Subject: pnfsblock: limit bio page count One bio can have at most BIO_MAX_PAGES pages. We should limit it bec otherwise bio_alloc will fail when there are many pages in one read/write_pagelist. Cc: #3.1+ Signed-off-by: Peng Tao Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/blocklayout.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 6d39e9ab1e6..baf0bf2acbd 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -146,14 +146,19 @@ static struct bio *bl_alloc_init_bio(int npg, sector_t isect, { struct bio *bio; + npg = min(npg, BIO_MAX_PAGES); bio = bio_alloc(GFP_NOIO, npg); - if (!bio) - return NULL; + if (!bio && (current->flags & PF_MEMALLOC)) { + while (!bio && (npg /= 2)) + bio = bio_alloc(GFP_NOIO, npg); + } - bio->bi_sector = isect - be->be_f_offset + be->be_v_offset; - bio->bi_bdev = be->be_mdev; - bio->bi_end_io = end_io; - bio->bi_private = par; + if (bio) { + bio->bi_sector = isect - be->be_f_offset + be->be_v_offset; + bio->bi_bdev = be->be_mdev; + bio->bi_end_io = end_io; + bio->bi_private = par; + } return bio; } -- cgit v1.2.3 From 60c52e3a72fda10e82f38b6f979956eb2dcb3d4e Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Thu, 12 Jan 2012 23:18:40 +0800 Subject: pnfsblock: cleanup bl_mark_sectors_init It does not need to manipulate on partial initialized blocks. Writeback code takes care of it. 
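The "limit bio page count" patch above caps the request at BIO_MAX_PAGES and then halves it until bio_alloc() cooperates. The shape of that fallback is sketched below with plain malloc(); the PF_MEMALLOC condition and real bio sizing are kernel-specific and omitted, and MAX_SEGS is an arbitrary stand-in for BIO_MAX_PAGES:

#include <stdio.h>
#include <stdlib.h>

#define MAX_SEGS 256   /* arbitrary cap, standing in for BIO_MAX_PAGES */

/* Same fallback shape as the patched bl_alloc_init_bio(): cap the
 * request, try once, then halve until the allocator gives in.  With
 * plain malloc() the first attempt almost always succeeds, so the loop
 * is here to show the pattern rather than to exercise it. */
static void *alloc_batch(int *nsegs)
{
    int n = *nsegs;
    void *batch;

    if (n > MAX_SEGS)
        n = MAX_SEGS;

    batch = malloc((size_t)n * 64);
    while (!batch && (n /= 2))
        batch = malloc((size_t)n * 64);

    *nsegs = batch ? n : 0;
    return batch;
}

int main(void)
{
    int n = 1000;
    void *batch = alloc_batch(&n);

    printf("got a batch of %d segments\n", n);
    free(batch);
    return 0;
}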
Signed-off-by: Peng Tao Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/blocklayout.c | 6 ++-- fs/nfs/blocklayout/blocklayout.h | 3 +- fs/nfs/blocklayout/extents.c | 76 +++------------------------------------- 3 files changed, 8 insertions(+), 77 deletions(-) (limited to 'fs') diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index baf0bf2acbd..a263810803c 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -577,8 +577,7 @@ fill_invalid_ext: unlock_page(page); ret = bl_mark_sectors_init(be->be_inval, isect, - PAGE_CACHE_SECTORS, - NULL); + PAGE_CACHE_SECTORS); if (unlikely(ret)) { dprintk("%s bl_mark_sectors_init fail %d\n", __func__, ret); @@ -627,8 +626,7 @@ next_page: } if (be->be_state == PNFS_BLOCK_INVALID_DATA) { ret = bl_mark_sectors_init(be->be_inval, isect, - PAGE_CACHE_SECTORS, - NULL); + PAGE_CACHE_SECTORS); if (unlikely(ret)) { dprintk("%s bl_mark_sectors_init fail %d\n", __func__, ret); diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h index 42acf7ef599..60728acc7b9 100644 --- a/fs/nfs/blocklayout/blocklayout.h +++ b/fs/nfs/blocklayout/blocklayout.h @@ -186,8 +186,7 @@ struct pnfs_block_extent * bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect, struct pnfs_block_extent **cow_read); int bl_mark_sectors_init(struct pnfs_inval_markings *marks, - sector_t offset, sector_t length, - sector_t **pages); + sector_t offset, sector_t length); void bl_put_extent(struct pnfs_block_extent *be); struct pnfs_block_extent *bl_alloc_extent(void); int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect); diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c index 369ef53e89e..d0f52ed2242 100644 --- a/fs/nfs/blocklayout/extents.c +++ b/fs/nfs/blocklayout/extents.c @@ -174,33 +174,6 @@ static int _preload_range(struct pnfs_inval_markings *marks, return status; } -static void set_needs_init(sector_t *array, sector_t offset) -{ - sector_t *p = array; - - dprintk("%s enter\n", __func__); - if (!p) - return; - while (*p < offset) - p++; - if (*p == offset) - return; - else if (*p == ~0) { - *p++ = offset; - *p = ~0; - return; - } else { - sector_t *save = p; - dprintk("%s Adding %llu\n", __func__, (u64)offset); - while (*p != ~0) - p++; - p++; - memmove(save + 1, save, (char *)p - (char *)save); - *save = offset; - return; - } -} - /* We are relying on page lock to serialize this */ int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect) { @@ -256,28 +229,15 @@ static int is_range_written(struct pnfs_inval_markings *marks, /* Marks sectors in [offest, offset_length) as having been initialized. * All lengths are step-aligned, where step is min(pagesize, blocksize). - * Notes where partial block is initialized, and helps prepare it for - * complete initialization later. 
+ * Currently assumes offset is page-aligned */ -/* Currently assumes offset is page-aligned */ int bl_mark_sectors_init(struct pnfs_inval_markings *marks, - sector_t offset, sector_t length, - sector_t **pages) + sector_t offset, sector_t length) { - sector_t s, start, end; - sector_t *array = NULL; /* Pages to mark */ + sector_t start, end; dprintk("%s(offset=%llu,len=%llu) enter\n", __func__, (u64)offset, (u64)length); - s = max((sector_t) 3, - 2 * (marks->im_block_size / (PAGE_CACHE_SECTORS))); - dprintk("%s set max=%llu\n", __func__, (u64)s); - if (pages) { - array = kmalloc(s * sizeof(sector_t), GFP_NOFS); - if (!array) - goto outerr; - array[0] = ~0; - } start = normalize(offset, marks->im_block_size); end = normalize_up(offset + length, marks->im_block_size); @@ -285,41 +245,15 @@ int bl_mark_sectors_init(struct pnfs_inval_markings *marks, goto outerr; spin_lock(&marks->im_lock); - - for (s = normalize_up(start, PAGE_CACHE_SECTORS); - s < offset; s += PAGE_CACHE_SECTORS) { - dprintk("%s pre-area pages\n", __func__); - /* Portion of used block is not initialized */ - if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED)) - set_needs_init(array, s); - } if (_set_range(&marks->im_tree, EXTENT_INITIALIZED, offset, length)) goto out_unlock; - for (s = normalize_up(offset + length, PAGE_CACHE_SECTORS); - s < end; s += PAGE_CACHE_SECTORS) { - dprintk("%s post-area pages\n", __func__); - if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED)) - set_needs_init(array, s); - } - spin_unlock(&marks->im_lock); - if (pages) { - if (array[0] == ~0) { - kfree(array); - *pages = NULL; - } else - *pages = array; - } return 0; - out_unlock: +out_unlock: spin_unlock(&marks->im_lock); - outerr: - if (pages) { - kfree(array); - *pages = NULL; - } +outerr: return -ENOMEM; } -- cgit v1.2.3 From 72c508879979522de347bcec706507e00d7c443d Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Thu, 12 Jan 2012 23:18:42 +0800 Subject: pnfsblock: move find lock page logic out of bl_write_pagelist Also avoid unnecessary lock_page if page is handled by others. Signed-off-by: Peng Tao Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/blocklayout.c | 78 +++++++++++++++++++++++++++------------- 1 file changed, 54 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index a263810803c..23427362185 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -490,6 +490,55 @@ cleanup: return ret; } +/* Find or create a zeroing page marked being writeback. + * Return ERR_PTR on error, NULL to indicate skip this page and page itself + * to indicate write out. 
+ */ +static struct page * +bl_find_get_zeroing_page(struct inode *inode, pgoff_t index, + struct pnfs_block_extent *cow_read) +{ + struct page *page; + int locked = 0; + page = find_get_page(inode->i_mapping, index); + if (page) + goto check_page; + + page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); + if (unlikely(!page)) { + dprintk("%s oom\n", __func__); + return ERR_PTR(-ENOMEM); + } + locked = 1; + +check_page: + /* PageDirty: Other will write this out + * PageWriteback: Other is writing this out + * PageUptodate: It was read before + */ + if (PageDirty(page) || PageWriteback(page)) { + print_page(page); + if (locked) + unlock_page(page); + page_cache_release(page); + return NULL; + } + + if (!locked) { + lock_page(page); + locked = 1; + goto check_page; + } + if (!PageUptodate(page)) { + /* New page, readin or zero it */ + init_page_for_write(page, cow_read); + } + set_page_writeback(page); + unlock_page(page); + + return page; +} + static enum pnfs_try_status bl_write_pagelist(struct nfs_write_data *wdata, int sync) { @@ -549,32 +598,13 @@ fill_invalid_ext: dprintk("%s zero %dth page: index %lu isect %llu\n", __func__, npg_zero, index, (unsigned long long)isect); - page = - find_or_create_page(wdata->inode->i_mapping, index, - GFP_NOFS); - if (!page) { - dprintk("%s oom\n", __func__); - wdata->pnfs_error = -ENOMEM; + page = bl_find_get_zeroing_page(wdata->inode, index, + cow_read); + if (unlikely(IS_ERR(page))) { + wdata->pnfs_error = PTR_ERR(page); goto out; - } - - /* PageDirty: Other will write this out - * PageWriteback: Other is writing this out - * PageUptodate: It was read before - * sector_initialized: already written out - */ - if (PageDirty(page) || PageWriteback(page)) { - print_page(page); - unlock_page(page); - page_cache_release(page); + } else if (page == NULL) goto next_page; - } - if (!PageUptodate(page)) { - /* New page, readin or zero it */ - init_page_for_write(page, cow_read); - } - set_page_writeback(page); - unlock_page(page); ret = bl_mark_sectors_init(be->be_inval, isect, PAGE_CACHE_SECTORS); -- cgit v1.2.3 From c0411a94a8f318379464e29dd81db806249dbca6 Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Thu, 12 Jan 2012 23:18:44 +0800 Subject: pnfsblock: remove rpc_call_ops from struct parallel_io block layout can just make use of generic read/write_done. Signed-off-by: Peng Tao Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/blocklayout.c | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'fs') diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 23427362185..9215c6644a3 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -90,7 +90,6 @@ static int is_writable(struct pnfs_block_extent *be, sector_t isect) */ struct parallel_io { struct kref refcnt; - struct rpc_call_ops call_ops; void (*pnfs_callback) (void *data); void *data; }; @@ -226,14 +225,6 @@ bl_end_par_io_read(void *data) schedule_work(&rdata->task.u.tk_work); } -/* We don't want normal .rpc_call_done callback used, so we replace it - * with this stub. 
- */ -static void bl_rpc_do_nothing(struct rpc_task *task, void *calldata) -{ - return; -} - static enum pnfs_try_status bl_read_pagelist(struct nfs_read_data *rdata) { @@ -253,8 +244,6 @@ bl_read_pagelist(struct nfs_read_data *rdata) par = alloc_parallel(rdata); if (!par) goto use_mds; - par->call_ops = *rdata->mds_ops; - par->call_ops.rpc_call_done = bl_rpc_do_nothing; par->pnfs_callback = bl_end_par_io_read; /* At this point, we can no longer jump to use_mds */ @@ -564,8 +553,6 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync) par = alloc_parallel(wdata); if (!par) return PNFS_NOT_ATTEMPTED; - par->call_ops = *wdata->mds_ops; - par->call_ops.rpc_call_done = bl_rpc_do_nothing; par->pnfs_callback = bl_end_par_io_write; /* At this point, have to be more careful with error handling */ -- cgit v1.2.3 From 7c5465d6ccd759caa959828e2add5603518dafc4 Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Thu, 12 Jan 2012 23:18:46 +0800 Subject: pnfsblock: alloc short extent before submit bio As discussed earlier, it is better for block client to allocate memory for tracking extents state before submitting bio. So the patch does it by allocating a short_extent for every INVALID extent touched by write pagelist and for every zeroing page we created, saving them in layout header. Then in end_io we can just use them to create commit list items and avoid memory allocation there. Signed-off-by: Peng Tao Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/blocklayout.c | 74 ++++++++++++++++++++++++++-------- fs/nfs/blocklayout/blocklayout.h | 9 ++++- fs/nfs/blocklayout/extents.c | 85 ++++++++++++++++++++++++++++++---------- 3 files changed, 131 insertions(+), 37 deletions(-) (limited to 'fs') diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 9215c6644a3..48cfac31f64 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -90,8 +90,9 @@ static int is_writable(struct pnfs_block_extent *be, sector_t isect) */ struct parallel_io { struct kref refcnt; - void (*pnfs_callback) (void *data); + void (*pnfs_callback) (void *data, int num_se); void *data; + int bse_count; }; static inline struct parallel_io *alloc_parallel(void *data) @@ -102,6 +103,7 @@ static inline struct parallel_io *alloc_parallel(void *data) if (rv) { rv->data = data; kref_init(&rv->refcnt); + rv->bse_count = 0; } return rv; } @@ -116,7 +118,7 @@ static void destroy_parallel(struct kref *kref) struct parallel_io *p = container_of(kref, struct parallel_io, refcnt); dprintk("%s enter\n", __func__); - p->pnfs_callback(p->data); + p->pnfs_callback(p->data, p->bse_count); kfree(p); } @@ -216,7 +218,7 @@ static void bl_read_cleanup(struct work_struct *work) } static void -bl_end_par_io_read(void *data) +bl_end_par_io_read(void *data, int unused) { struct nfs_read_data *rdata = data; @@ -317,6 +319,7 @@ static void mark_extents_written(struct pnfs_block_layout *bl, { sector_t isect, end; struct pnfs_block_extent *be; + struct pnfs_block_short_extent *se; dprintk("%s(%llu, %u)\n", __func__, offset, count); if (count == 0) @@ -329,8 +332,11 @@ static void mark_extents_written(struct pnfs_block_layout *bl, be = bl_find_get_extent(bl, isect, NULL); BUG_ON(!be); /* FIXME */ len = min(end, be->be_f_offset + be->be_length) - isect; - if (be->be_state == PNFS_BLOCK_INVALID_DATA) - bl_mark_for_commit(be, isect, len); /* What if fails? 
*/ + if (be->be_state == PNFS_BLOCK_INVALID_DATA) { + se = bl_pop_one_short_extent(be->be_inval); + BUG_ON(!se); + bl_mark_for_commit(be, isect, len, se); + } isect += len; bl_put_extent(be); } @@ -352,7 +358,8 @@ static void bl_end_io_write_zero(struct bio *bio, int err) end_page_writeback(page); page_cache_release(page); } while (bvec >= bio->bi_io_vec); - if (!uptodate) { + + if (unlikely(!uptodate)) { if (!wdata->pnfs_error) wdata->pnfs_error = -EIO; pnfs_set_lo_fail(wdata->lseg); @@ -361,7 +368,6 @@ static void bl_end_io_write_zero(struct bio *bio, int err) put_parallel(par); } -/* This is basically copied from mpage_end_io_read */ static void bl_end_io_write(struct bio *bio, int err) { struct parallel_io *par = bio->bi_private; @@ -387,7 +393,7 @@ static void bl_write_cleanup(struct work_struct *work) dprintk("%s enter\n", __func__); task = container_of(work, struct rpc_task, u.tk_work); wdata = container_of(task, struct nfs_write_data, task); - if (!wdata->pnfs_error) { + if (likely(!wdata->pnfs_error)) { /* Marks for LAYOUTCOMMIT */ mark_extents_written(BLK_LSEG2EXT(wdata->lseg), wdata->args.offset, wdata->args.count); @@ -396,10 +402,15 @@ static void bl_write_cleanup(struct work_struct *work) } /* Called when last of bios associated with a bl_write_pagelist call finishes */ -static void bl_end_par_io_write(void *data) +static void bl_end_par_io_write(void *data, int num_se) { struct nfs_write_data *wdata = data; + if (unlikely(wdata->pnfs_error)) { + bl_free_short_extents(&BLK_LSEG2EXT(wdata->lseg)->bl_inval, + num_se); + } + wdata->task.tk_status = wdata->pnfs_error; wdata->verf.committed = NFS_FILE_SYNC; INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup); @@ -552,7 +563,7 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync) */ par = alloc_parallel(wdata); if (!par) - return PNFS_NOT_ATTEMPTED; + goto out_mds; par->pnfs_callback = bl_end_par_io_write; /* At this point, have to be more careful with error handling */ @@ -560,12 +571,15 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync) be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg), isect, &cow_read); if (!be || !is_writable(be, isect)) { dprintk("%s no matching extents!\n", __func__); - wdata->pnfs_error = -EINVAL; - goto out; + goto out_mds; } /* First page inside INVALID extent */ if (be->be_state == PNFS_BLOCK_INVALID_DATA) { + if (likely(!bl_push_one_short_extent(be->be_inval))) + par->bse_count++; + else + goto out_mds; temp = offset >> PAGE_CACHE_SHIFT; npg_zero = do_div(temp, npg_per_block); isect = (sector_t) (((offset - npg_zero * PAGE_CACHE_SIZE) & @@ -603,6 +617,19 @@ fill_invalid_ext: wdata->pnfs_error = ret; goto out; } + if (likely(!bl_push_one_short_extent(be->be_inval))) + par->bse_count++; + else { + end_page_writeback(page); + page_cache_release(page); + wdata->pnfs_error = -ENOMEM; + goto out; + } + /* FIXME: This should be done in bi_end_io */ + mark_extents_written(BLK_LSEG2EXT(wdata->lseg), + page->index << PAGE_CACHE_SHIFT, + PAGE_CACHE_SIZE); + bio = bl_add_page_to_bio(bio, npg_zero, WRITE, isect, page, be, bl_end_io_write_zero, par); @@ -611,10 +638,6 @@ fill_invalid_ext: bio = NULL; goto out; } - /* FIXME: This should be done in bi_end_io */ - mark_extents_written(BLK_LSEG2EXT(wdata->lseg), - page->index << PAGE_CACHE_SHIFT, - PAGE_CACHE_SIZE); next_page: isect += PAGE_CACHE_SECTORS; extent_length -= PAGE_CACHE_SECTORS; @@ -638,6 +661,15 @@ next_page: wdata->pnfs_error = -EINVAL; goto out; } + if (be->be_state == PNFS_BLOCK_INVALID_DATA) { + if (likely(!bl_push_one_short_extent( + 
be->be_inval))) + par->bse_count++; + else { + wdata->pnfs_error = -ENOMEM; + goto out; + } + } extent_length = be->be_length - (isect - be->be_f_offset); } @@ -685,6 +717,10 @@ out: bl_submit_bio(WRITE, bio); put_parallel(par); return PNFS_ATTEMPTED; +out_mds: + bl_put_extent(be); + kfree(par); + return PNFS_NOT_ATTEMPTED; } /* FIXME - range ignored */ @@ -711,11 +747,17 @@ static void release_inval_marks(struct pnfs_inval_markings *marks) { struct pnfs_inval_tracking *pos, *temp; + struct pnfs_block_short_extent *se, *stemp; list_for_each_entry_safe(pos, temp, &marks->im_tree.mtt_stub, it_link) { list_del(&pos->it_link); kfree(pos); } + + list_for_each_entry_safe(se, stemp, &marks->im_extents, bse_node) { + list_del(&se->bse_node); + kfree(se); + } return; } diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h index 60728acc7b9..e31a2df28e7 100644 --- a/fs/nfs/blocklayout/blocklayout.h +++ b/fs/nfs/blocklayout/blocklayout.h @@ -70,6 +70,7 @@ struct pnfs_inval_markings { spinlock_t im_lock; struct my_tree im_tree; /* Sectors that need LAYOUTCOMMIT */ sector_t im_block_size; /* Server blocksize in sectors */ + struct list_head im_extents; /* Short extents for INVAL->RW conversion */ }; struct pnfs_inval_tracking { @@ -105,6 +106,7 @@ BL_INIT_INVAL_MARKS(struct pnfs_inval_markings *marks, sector_t blocksize) { spin_lock_init(&marks->im_lock); INIT_LIST_HEAD(&marks->im_tree.mtt_stub); + INIT_LIST_HEAD(&marks->im_extents); marks->im_block_size = blocksize; marks->im_tree.mtt_step_size = min((sector_t)PAGE_CACHE_SECTORS, blocksize); @@ -199,6 +201,11 @@ void clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, int bl_add_merge_extent(struct pnfs_block_layout *bl, struct pnfs_block_extent *new); int bl_mark_for_commit(struct pnfs_block_extent *be, - sector_t offset, sector_t length); + sector_t offset, sector_t length, + struct pnfs_block_short_extent *new); +int bl_push_one_short_extent(struct pnfs_inval_markings *marks); +struct pnfs_block_short_extent * +bl_pop_one_short_extent(struct pnfs_inval_markings *marks); +void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free); #endif /* FS_NFS_NFS4BLOCKLAYOUT_H */ diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c index d0f52ed2242..1abac09f7cd 100644 --- a/fs/nfs/blocklayout/extents.c +++ b/fs/nfs/blocklayout/extents.c @@ -157,10 +157,10 @@ static int _preload_range(struct pnfs_inval_markings *marks, goto out_cleanup; } - spin_lock(&marks->im_lock); + spin_lock_bh(&marks->im_lock); for (s = start; s < end; s += tree->mtt_step_size) used += _add_entry(tree, s, INTERNAL_EXISTS, storage[used]); - spin_unlock(&marks->im_lock); + spin_unlock_bh(&marks->im_lock); status = 0; @@ -179,9 +179,9 @@ int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect) { int rv; - spin_lock(&marks->im_lock); + spin_lock_bh(&marks->im_lock); rv = _has_tag(&marks->im_tree, isect, EXTENT_INITIALIZED); - spin_unlock(&marks->im_lock); + spin_unlock_bh(&marks->im_lock); return rv; } @@ -221,9 +221,9 @@ static int is_range_written(struct pnfs_inval_markings *marks, { int rv; - spin_lock(&marks->im_lock); + spin_lock_bh(&marks->im_lock); rv = _range_has_tag(&marks->im_tree, start, end, EXTENT_WRITTEN); - spin_unlock(&marks->im_lock); + spin_unlock_bh(&marks->im_lock); return rv; } @@ -244,15 +244,15 @@ int bl_mark_sectors_init(struct pnfs_inval_markings *marks, if (_preload_range(marks, start, end - start)) goto outerr; - spin_lock(&marks->im_lock); + spin_lock_bh(&marks->im_lock); if 
(_set_range(&marks->im_tree, EXTENT_INITIALIZED, offset, length)) goto out_unlock; - spin_unlock(&marks->im_lock); + spin_unlock_bh(&marks->im_lock); return 0; out_unlock: - spin_unlock(&marks->im_lock); + spin_unlock_bh(&marks->im_lock); outerr: return -ENOMEM; } @@ -267,9 +267,9 @@ static int mark_written_sectors(struct pnfs_inval_markings *marks, dprintk("%s(offset=%llu,len=%llu) enter\n", __func__, (u64)offset, (u64)length); - spin_lock(&marks->im_lock); + spin_lock_bh(&marks->im_lock); status = _set_range(&marks->im_tree, EXTENT_WRITTEN, offset, length); - spin_unlock(&marks->im_lock); + spin_unlock_bh(&marks->im_lock); return status; } @@ -369,20 +369,18 @@ static void add_to_commitlist(struct pnfs_block_layout *bl, /* Note the range described by offset, length is guaranteed to be contained * within be. + * new will be freed, either by this function or add_to_commitlist if they + * decide not to use it, or after LAYOUTCOMMIT uses it in the commitlist. */ int bl_mark_for_commit(struct pnfs_block_extent *be, - sector_t offset, sector_t length) + sector_t offset, sector_t length, + struct pnfs_block_short_extent *new) { sector_t new_end, end = offset + length; - struct pnfs_block_short_extent *new; struct pnfs_block_layout *bl = container_of(be->be_inval, struct pnfs_block_layout, bl_inval); - new = kmalloc(sizeof(*new), GFP_NOFS); - if (!new) - return -ENOMEM; - mark_written_sectors(be->be_inval, offset, length); /* We want to add the range to commit list, but it must be * block-normalized, and verified that the normalized range has @@ -412,9 +410,6 @@ int bl_mark_for_commit(struct pnfs_block_extent *be, new->bse_mdev = be->be_mdev; spin_lock(&bl->bl_ext_lock); - /* new will be freed, either by add_to_commitlist if it decides not - * to use it, or after LAYOUTCOMMIT uses it in the commitlist. - */ add_to_commitlist(bl, new); spin_unlock(&bl->bl_ext_lock); return 0; @@ -862,3 +857,53 @@ clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, } } } + +int bl_push_one_short_extent(struct pnfs_inval_markings *marks) +{ + struct pnfs_block_short_extent *new; + + new = kmalloc(sizeof(*new), GFP_NOFS); + if (unlikely(!new)) + return -ENOMEM; + + spin_lock_bh(&marks->im_lock); + list_add(&new->bse_node, &marks->im_extents); + spin_unlock_bh(&marks->im_lock); + + return 0; +} + +struct pnfs_block_short_extent * +bl_pop_one_short_extent(struct pnfs_inval_markings *marks) +{ + struct pnfs_block_short_extent *rv = NULL; + + spin_lock_bh(&marks->im_lock); + if (!list_empty(&marks->im_extents)) { + rv = list_entry((&marks->im_extents)->next, + struct pnfs_block_short_extent, bse_node); + list_del_init(&rv->bse_node); + } + spin_unlock_bh(&marks->im_lock); + + return rv; +} + +void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free) +{ + struct pnfs_block_short_extent *se = NULL, *tmp; + + if (num_to_free <= 0) + return; + + spin_lock(&marks->im_lock); + list_for_each_entry_safe(se, tmp, &marks->im_extents, bse_node) { + list_del(&se->bse_node); + kfree(se); + if (--num_to_free == 0) + break; + } + spin_unlock(&marks->im_lock); + + BUG_ON(num_to_free > 0); +} -- cgit v1.2.3 From 69116f279a9eaf4c540934269342d9149538fc79 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 13 Jan 2012 09:32:17 +1030 Subject: module_param: avoid bool abuse, add bint for special cases. For historical reasons, we allow module_param(bool) to take an int (or an unsigned int). That's going away. A few drivers really want an int: they set it to -1 and a parameter will set it to 0 or 1. 
This sucks: reading them from sysfs will give 'Y' for both -1 and 1, but if we change it to an int, then the users might be broken (if they did "param" instead of "param=1"). Use a new 'bint' parser for them. (ntfs has a different problem: it needs an int for debug_msgs because it's also exposed via sysctl.) Cc: Steve Glendinning Cc: Jean Delvare Cc: Guenter Roeck Cc: Hoang-Nam Nguyen Cc: Christoph Raisch Cc: Roland Dreier Cc: Sean Hefty Cc: Hal Rosenstock Cc: linux390@de.ibm.com Cc: Anton Altaparmakov Cc: Jaroslav Kysela Cc: Takashi Iwai Cc: lm-sensors@lm-sensors.org Cc: linux-rdma@vger.kernel.org Cc: linux-s390@vger.kernel.org Cc: linux-ntfs-dev@lists.sourceforge.net Cc: alsa-devel@alsa-project.org Acked-by: Takashi Iwai (For the sound part) Acked-by: Guenter Roeck (For the hwmon driver) Signed-off-by: Rusty Russell --- drivers/hwmon/emc2103.c | 2 +- drivers/infiniband/hw/ehca/ehca_main.c | 2 +- drivers/s390/cio/cmf.c | 2 +- fs/ntfs/super.c | 2 +- include/linux/moduleparam.h | 6 ++++++ kernel/params.c | 24 ++++++++++++++++++++++++ sound/pci/intel8x0.c | 4 ++-- 7 files changed, 36 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/drivers/hwmon/emc2103.c b/drivers/hwmon/emc2103.c index 848a2b0bc83..865063914d7 100644 --- a/drivers/hwmon/emc2103.c +++ b/drivers/hwmon/emc2103.c @@ -55,7 +55,7 @@ static const u8 REG_TEMP_MAX[4] = { 0x34, 0x30, 0x31, 0x32 }; * it. Default is to leave the device in the state it's already in (-1). * This parameter allows APD mode to be optionally forced on or off */ static int apd = -1; -module_param(apd, bool, 0); +module_param(apd, bint, 0); MODULE_PARM_DESC(init, "Set to zero to disable anti-parallel diode mode"); struct temperature { diff --git a/drivers/infiniband/hw/ehca/ehca_main.c b/drivers/infiniband/hw/ehca/ehca_main.c index c240e9972cb..8af8d4f7bdb 100644 --- a/drivers/infiniband/hw/ehca/ehca_main.c +++ b/drivers/infiniband/hw/ehca/ehca_main.c @@ -82,7 +82,7 @@ module_param_named(port_act_time, ehca_port_act_time, int, S_IRUGO); module_param_named(poll_all_eqs, ehca_poll_all_eqs, bool, S_IRUGO); module_param_named(static_rate, ehca_static_rate, int, S_IRUGO); module_param_named(scaling_code, ehca_scaling_code, bool, S_IRUGO); -module_param_named(lock_hcalls, ehca_lock_hcalls, bool, S_IRUGO); +module_param_named(lock_hcalls, ehca_lock_hcalls, bint, S_IRUGO); module_param_named(number_of_cqs, ehca_max_cq, int, S_IRUGO); module_param_named(number_of_qps, ehca_max_qp, int, S_IRUGO); diff --git a/drivers/s390/cio/cmf.c b/drivers/s390/cio/cmf.c index 2985eb43948..204ca728e7f 100644 --- a/drivers/s390/cio/cmf.c +++ b/drivers/s390/cio/cmf.c @@ -98,7 +98,7 @@ enum cmb_format { * enum cmb_format. 
*/ static int format = CMF_AUTODETECT; -module_param(format, bool, 0444); +module_param(format, bint, 0444); /** * struct cmb_operations - functions to use depending on cmb_format diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 608be451609..5a4a8af5c40 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -3198,7 +3198,7 @@ MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2011 Anton Altaparm MODULE_VERSION(NTFS_VERSION); MODULE_LICENSE("GPL"); #ifdef DEBUG -module_param(debug_msgs, bool, 0); +module_param(debug_msgs, bint, 0); MODULE_PARM_DESC(debug_msgs, "Enable debug messages."); #endif diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index 794d4b0f121..6bdde0c3bcc 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -367,6 +367,12 @@ extern int param_set_invbool(const char *val, const struct kernel_param *kp); extern int param_get_invbool(char *buffer, const struct kernel_param *kp); #define param_check_invbool(name, p) __param_check(name, p, bool) +/* An int, which can only be set like a bool (though it shows as an int). */ +extern struct kernel_param_ops param_ops_bint; +extern int param_set_bint(const char *val, const struct kernel_param *kp); +#define param_get_bint param_get_int +#define param_check_bint param_check_int + /** * module_param_array - a parameter which is an array of some type * @name: the name of the array variable diff --git a/kernel/params.c b/kernel/params.c index 9240664af11..32ee0430828 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -363,6 +363,30 @@ struct kernel_param_ops param_ops_invbool = { }; EXPORT_SYMBOL(param_ops_invbool); +int param_set_bint(const char *val, const struct kernel_param *kp) +{ + struct kernel_param boolkp; + bool v; + int ret; + + /* Match bool exactly, by re-using it. */ + boolkp = *kp; + boolkp.arg = &v; + boolkp.flags |= KPARAM_ISBOOL; + + ret = param_set_bool(val, &boolkp); + if (ret == 0) + *(int *)kp->arg = v; + return ret; +} +EXPORT_SYMBOL(param_set_bint); + +struct kernel_param_ops param_ops_bint = { + .set = param_set_bint, + .get = param_get_int, +}; +EXPORT_SYMBOL(param_ops_bint); + /* We break the rule and mangle the string. */ static int param_array(const char *name, const char *val, diff --git a/sound/pci/intel8x0.c b/sound/pci/intel8x0.c index 40b181bab93..9f3b01bb72c 100644 --- a/sound/pci/intel8x0.c +++ b/sound/pci/intel8x0.c @@ -95,13 +95,13 @@ module_param(ac97_quirk, charp, 0444); MODULE_PARM_DESC(ac97_quirk, "AC'97 workaround for strange hardware."); module_param(buggy_semaphore, bool, 0444); MODULE_PARM_DESC(buggy_semaphore, "Enable workaround for hardwares with problematic codec semaphores."); -module_param(buggy_irq, bool, 0444); +module_param(buggy_irq, bint, 0444); MODULE_PARM_DESC(buggy_irq, "Enable workaround for buggy interrupts on some motherboards."); module_param(xbox, bool, 0444); MODULE_PARM_DESC(xbox, "Set to 1 for Xbox, if you have problems with the AC'97 codec detection."); module_param(spdif_aclink, int, 0444); MODULE_PARM_DESC(spdif_aclink, "S/PDIF over AC-link."); -module_param(inside_vm, bool, 0444); +module_param(inside_vm, bint, 0444); MODULE_PARM_DESC(inside_vm, "KVM/Parallels optimization."); /* just for backward compatibility */ -- cgit v1.2.3 From 90ab5ee94171b3e28de6bb42ee30b527014e0be7 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 13 Jan 2012 09:32:20 +1030 Subject: module_param: make bool parameters really bool (drivers & misc) module_param(bool) used to counter-intuitively take an int. 
In fddd5201 (mid-2009) we allowed bool or int/unsigned int using a messy trick. It's time to remove the int/unsigned int option. For this version it'll simply give a warning, but it'll break next kernel version. Acked-by: Mauro Carvalho Chehab Signed-off-by: Rusty Russell --- drivers/accessibility/braille/braille_console.c | 2 +- drivers/acpi/acpica/acglobal.h | 2 +- drivers/acpi/apei/ghes.c | 2 +- drivers/acpi/apei/hest.c | 2 +- drivers/acpi/dock.c | 2 +- drivers/acpi/pci_slot.c | 2 +- drivers/acpi/video.c | 6 +++--- drivers/ata/sata_nv.c | 6 +++--- drivers/ata/sata_sil24.c | 2 +- drivers/atm/he.c | 6 +++--- drivers/block/drbd/drbd_int.h | 4 ++-- drivers/block/drbd/drbd_main.c | 4 ++-- drivers/block/paride/bpck6.c | 5 ++--- drivers/block/paride/pd.c | 3 ++- drivers/block/paride/pf.c | 4 +++- drivers/block/paride/pg.c | 3 ++- drivers/block/paride/pt.c | 4 +++- drivers/block/xd.c | 2 +- drivers/bluetooth/btusb.c | 12 ++++++------ drivers/bluetooth/hci_bcsp.c | 4 ++-- drivers/bluetooth/hci_ldisc.c | 2 +- drivers/cdrom/cdrom.c | 12 ++++++------ drivers/char/agp/amd64-agp.c | 2 +- drivers/char/agp/sis-agp.c | 2 +- drivers/char/i8k.c | 8 ++++---- drivers/char/ipmi/ipmi_si_intf.c | 2 +- drivers/char/lp.c | 2 +- drivers/char/nwflash.c | 2 +- drivers/char/pcmcia/synclink_cs.c | 2 +- drivers/char/random.c | 2 +- drivers/char/tpm/tpm_tis.c | 6 +++--- drivers/edac/r82600_edac.c | 2 +- drivers/firewire/sbp2.c | 2 +- drivers/hid/hid-prodikeys.c | 2 +- drivers/hwmon/abituguru.c | 2 +- drivers/hwmon/abituguru3.c | 4 ++-- drivers/hwmon/acpi_power_meter.c | 2 +- drivers/hwmon/adm1021.c | 2 +- drivers/hwmon/ads7828.c | 4 ++-- drivers/hwmon/dme1737.c | 4 ++-- drivers/hwmon/it87.c | 4 ++-- drivers/hwmon/lm93.c | 4 ++-- drivers/hwmon/max1668.c | 2 +- drivers/hwmon/w83627hf.c | 2 +- drivers/hwmon/w83781d.c | 4 ++-- drivers/hwmon/w83791d.c | 4 ++-- drivers/hwmon/w83792d.c | 2 +- drivers/hwmon/w83793.c | 2 +- drivers/hwmon/w83795.c | 2 +- drivers/hwmon/w83l786ng.c | 2 +- drivers/i2c/busses/i2c-highlander.c | 2 +- drivers/i2c/busses/i2c-ibm_iic.c | 4 ++-- drivers/i2c/busses/i2c-sis630.c | 4 ++-- drivers/i2c/busses/i2c-viapro.c | 2 +- drivers/ide/ali14xx.c | 2 +- drivers/ide/cmd640.c | 2 +- drivers/ide/dtc2278.c | 2 +- drivers/ide/gayle.c | 2 +- drivers/ide/ht6560b.c | 2 +- drivers/ide/ide-4drives.c | 2 +- drivers/ide/ide-acpi.c | 6 +++--- drivers/ide/ide-pci-generic.c | 2 +- drivers/ide/qd65xx.c | 2 +- drivers/ide/umc8672.c | 2 +- drivers/infiniband/hw/ehca/ehca_classes.h | 4 ++-- drivers/infiniband/hw/ehca/ehca_main.c | 8 ++++---- drivers/infiniband/hw/nes/nes.c | 2 +- drivers/input/joystick/xpad.c | 6 +++--- drivers/input/misc/wistron_btns.c | 2 +- drivers/input/mouse/psmouse-base.c | 2 +- drivers/input/mouse/synaptics_i2c.c | 6 +++--- drivers/input/serio/hp_sdc.c | 2 +- drivers/input/touchscreen/eeti_ts.c | 4 ++-- drivers/input/touchscreen/htcpen.c | 4 ++-- drivers/input/touchscreen/ucb1400_ts.c | 2 +- drivers/input/touchscreen/usbtouchscreen.c | 4 ++-- drivers/isdn/hardware/avm/b1dma.c | 2 +- drivers/isdn/hardware/avm/c4.c | 2 +- drivers/isdn/sc/init.c | 2 +- drivers/leds/leds-clevo-mail.c | 2 +- drivers/leds/leds-ss4200.c | 2 +- drivers/macintosh/ams/ams-core.c | 2 +- drivers/macintosh/ams/ams-input.c | 4 ++-- drivers/macintosh/therm_adt746x.c | 2 +- drivers/media/dvb/dvb-usb/af9005.c | 2 +- drivers/media/dvb/dvb-usb/af9005.h | 2 +- drivers/media/radio/radio-gemtek.c | 10 +++++----- drivers/media/radio/radio-miropcm20.c | 2 +- drivers/media/rc/lirc_dev.c | 2 +- drivers/media/rc/mceusb.c | 4 ++-- 
drivers/media/rc/streamzap.c | 4 ++-- drivers/media/rc/winbond-cir.c | 4 ++-- drivers/media/video/c-qcam.c | 2 +- drivers/media/video/cs5345.c | 2 +- drivers/media/video/cs53l32a.c | 2 +- drivers/media/video/cx18/cx18-driver.c | 2 +- drivers/media/video/cx25821/cx25821-alsa.c | 2 +- drivers/media/video/cx88/cx88-alsa.c | 2 +- drivers/media/video/gspca/m5602/m5602_core.c | 4 ++-- drivers/media/video/gspca/m5602/m5602_mt9m111.h | 2 +- drivers/media/video/gspca/m5602/m5602_ov7660.h | 2 +- drivers/media/video/gspca/m5602/m5602_ov9650.h | 2 +- drivers/media/video/gspca/m5602/m5602_po1030.h | 2 +- drivers/media/video/gspca/m5602/m5602_s5k4aa.h | 2 +- drivers/media/video/gspca/m5602/m5602_s5k83a.h | 2 +- drivers/media/video/gspca/stv06xx/stv06xx.c | 4 ++-- drivers/media/video/hdpvr/hdpvr-core.c | 2 +- drivers/media/video/ivtv/ivtv-driver.c | 2 +- drivers/media/video/ivtv/ivtvfb.c | 2 +- drivers/media/video/marvell-ccic/mcam-core.c | 6 +++--- drivers/media/video/msp3400-driver.c | 6 +++--- drivers/media/video/msp3400-driver.h | 6 +++--- drivers/media/video/omap/omap_vout.c | 6 +++--- drivers/media/video/omap/omap_vout_vrfb.c | 2 +- drivers/media/video/ov7670.c | 2 +- drivers/media/video/saa7115.c | 2 +- drivers/media/video/stk-webcam.c | 4 ++-- drivers/media/video/tm6000/tm6000-alsa.c | 2 +- drivers/media/video/tvp514x.c | 2 +- drivers/media/video/tvp7002.c | 2 +- drivers/media/video/upd64083.c | 2 +- drivers/media/video/via-camera.c | 4 ++-- drivers/media/video/zoran/zoran_device.c | 2 +- drivers/media/video/zoran/zr36060.c | 2 +- drivers/memstick/host/jmb38x_ms.c | 2 +- drivers/memstick/host/r592.c | 2 +- drivers/memstick/host/tifm_ms.c | 2 +- drivers/misc/iwmc3200top/main.c | 12 ++++++------ drivers/mmc/core/core.c | 6 +++--- drivers/mmc/core/core.h | 2 +- drivers/mmc/host/tifm_sd.c | 4 ++-- drivers/mmc/host/vub300.c | 10 +++++----- drivers/mtd/nand/pxa3xx_nand.c | 2 +- drivers/mtd/nand/r852.c | 2 +- drivers/parport/parport_ip32.c | 2 +- drivers/pci/hotplug/acpi_pcihp.c | 2 +- drivers/pci/hotplug/acpiphp_core.c | 2 +- drivers/pci/hotplug/acpiphp_ibm.c | 2 +- drivers/pci/hotplug/cpcihp_zt5550.c | 4 ++-- drivers/pci/hotplug/cpqphp_core.c | 4 ++-- drivers/pci/hotplug/ibmphp_core.c | 2 +- drivers/pci/hotplug/pci_hotplug_core.c | 2 +- drivers/pci/hotplug/pciehp.h | 6 +++--- drivers/pci/hotplug/pciehp_core.c | 6 +++--- drivers/pci/hotplug/pcihp_skeleton.c | 2 +- drivers/pci/hotplug/rpaphp.h | 2 +- drivers/pci/hotplug/rpaphp_core.c | 2 +- drivers/pci/hotplug/shpchp.h | 4 ++-- drivers/pci/hotplug/shpchp_core.c | 4 ++-- drivers/pci/pcie/aer/aer_inject.c | 2 +- drivers/pci/pcie/aer/aerdrv_core.c | 4 ++-- drivers/pcmcia/yenta_socket.c | 6 +++--- drivers/platform/x86/compal-laptop.c | 2 +- drivers/platform/x86/intel_oaktrail.c | 2 +- drivers/platform/x86/msi-laptop.c | 2 +- drivers/platform/x86/samsung-laptop.c | 4 ++-- drivers/platform/x86/thinkpad_acpi.c | 16 ++++++++-------- drivers/platform/x86/wmi.c | 4 ++-- drivers/power/ds2760_battery.c | 2 +- drivers/s390/char/raw3270.c | 2 +- drivers/s390/char/vmwatchdog.c | 4 ++-- drivers/scsi/aha1542.c | 2 +- drivers/scsi/dc395x.c | 2 +- drivers/scsi/nsp32.c | 4 ++-- drivers/scsi/pcmcia/nsp_cs.c | 2 +- drivers/staging/comedi/comedi_fops.c | 2 +- drivers/staging/comedi/comedi_fops.h | 3 ++- drivers/staging/media/go7007/snd-go7007.c | 2 +- drivers/staging/media/lirc/lirc_bt829.c | 2 +- drivers/staging/media/lirc/lirc_igorplugusb.c | 4 ++-- drivers/staging/media/lirc/lirc_parallel.c | 4 ++-- drivers/staging/media/lirc/lirc_serial.c | 10 +++++----- 
drivers/staging/media/lirc/lirc_sir.c | 2 +- drivers/staging/media/lirc/lirc_zilog.c | 4 ++-- drivers/staging/quatech_usb2/quatech_usb2.c | 2 +- drivers/staging/serqt_usb2/serqt_usb2.c | 2 +- drivers/staging/speakup/speakup.h | 2 +- drivers/staging/speakup/synth.c | 2 +- drivers/staging/vme/bridges/vme_tsi148.c | 2 +- drivers/tty/rocket.c | 2 +- drivers/tty/synclink.c | 2 +- drivers/tty/synclinkmp.c | 2 +- drivers/usb/atm/speedtch.c | 6 +++--- drivers/usb/atm/ueagle-atm.c | 2 +- drivers/usb/core/devio.c | 2 +- drivers/usb/core/hub.c | 8 ++++---- drivers/usb/core/usb.c | 2 +- drivers/usb/gadget/amd5536udc.c | 8 ++++---- drivers/usb/gadget/ether.c | 4 ++-- drivers/usb/gadget/file_storage.c | 10 +++++----- drivers/usb/gadget/net2272.c | 2 +- drivers/usb/gadget/net2280.c | 6 +++--- drivers/usb/gadget/omap_udc.c | 2 +- drivers/usb/gadget/pch_udc.c | 2 +- drivers/usb/gadget/serial.c | 4 ++-- drivers/usb/gadget/zero.c | 2 +- drivers/usb/host/ehci-hcd.c | 2 +- drivers/usb/host/ohci-hcd.c | 4 ++-- drivers/usb/host/oxu210hp-hcd.c | 2 +- drivers/usb/host/u132-hcd.c | 2 +- drivers/usb/host/uhci-hcd.c | 2 +- drivers/usb/misc/ftdi-elan.c | 2 +- drivers/usb/misc/iowarrior.c | 2 +- drivers/usb/musb/cppi_dma.c | 2 +- drivers/usb/musb/musb_core.c | 2 +- drivers/usb/serial/aircable.c | 2 +- drivers/usb/serial/ark3116.c | 2 +- drivers/usb/serial/belkin_sa.c | 2 +- drivers/usb/serial/ch341.c | 2 +- drivers/usb/serial/cp210x.c | 2 +- drivers/usb/serial/cyberjack.c | 2 +- drivers/usb/serial/cypress_m8.c | 6 +++--- drivers/usb/serial/digi_acceleport.c | 2 +- drivers/usb/serial/empeg.c | 2 +- drivers/usb/serial/ftdi_sio.c | 2 +- drivers/usb/serial/funsoft.c | 2 +- drivers/usb/serial/garmin_gps.c | 2 +- drivers/usb/serial/io_edgeport.c | 2 +- drivers/usb/serial/io_ti.c | 4 ++-- drivers/usb/serial/ipaq.c | 2 +- drivers/usb/serial/ipw.c | 2 +- drivers/usb/serial/ir-usb.c | 2 +- drivers/usb/serial/iuu_phoenix.c | 6 +++--- drivers/usb/serial/keyspan.c | 2 +- drivers/usb/serial/keyspan_pda.c | 2 +- drivers/usb/serial/kl5kusb105.c | 2 +- drivers/usb/serial/mct_u232.c | 2 +- drivers/usb/serial/mos7720.c | 2 +- drivers/usb/serial/mos7840.c | 2 +- drivers/usb/serial/navman.c | 2 +- drivers/usb/serial/omninet.c | 2 +- drivers/usb/serial/opticon.c | 2 +- drivers/usb/serial/option.c | 2 +- drivers/usb/serial/oti6858.c | 2 +- drivers/usb/serial/pl2303.c | 2 +- drivers/usb/serial/qcserial.c | 2 +- drivers/usb/serial/safe_serial.c | 6 +++--- drivers/usb/serial/sierra.c | 4 ++-- drivers/usb/serial/spcp8x5.c | 2 +- drivers/usb/serial/ssu100.c | 2 +- drivers/usb/serial/symbolserial.c | 2 +- drivers/usb/serial/ti_usb_3410_5052.c | 2 +- drivers/usb/serial/usb-serial.c | 2 +- drivers/usb/serial/usb_wwan.c | 2 +- drivers/usb/serial/visor.c | 2 +- drivers/usb/serial/whiteheat.c | 2 +- drivers/video/aty/atyfb_base.c | 4 ++-- drivers/video/aty/radeon_base.c | 18 +++++++++--------- drivers/video/cirrusfb.c | 2 +- drivers/video/hgafb.c | 2 +- drivers/video/intelfb/intelfbdrv.c | 16 ++++++++-------- drivers/video/logo/logo.c | 2 +- drivers/video/neofb.c | 10 +++++----- drivers/video/omap/omapfb_main.c | 4 ++-- drivers/video/omap2/dss/core.c | 2 +- drivers/video/omap2/dss/dsi.c | 4 ++-- drivers/video/omap2/dss/dss.h | 2 +- drivers/video/omap2/omapfb/omapfb-main.c | 8 ++++---- drivers/video/omap2/omapfb/omapfb.h | 2 +- drivers/video/pm2fb.c | 8 ++++---- drivers/video/pm3fb.c | 4 ++-- drivers/video/riva/fbdev.c | 6 +++--- drivers/video/smscufx.c | 4 ++-- drivers/video/sstfb.c | 6 +++--- drivers/video/tdfxfb.c | 2 +- drivers/video/udlfb.c | 6 
+++--- drivers/video/uvesafb.c | 6 +++--- drivers/video/vfb.c | 2 +- drivers/watchdog/f71808e_wdt.c | 2 +- drivers/watchdog/mpc8xxx_wdt.c | 2 +- drivers/xen/xen-pciback/conf_space.c | 2 +- drivers/xen/xen-pciback/xenbus.c | 2 +- fs/lockd/mon.c | 2 +- fs/nfs/client.c | 2 +- fs/nfs/inode.c | 2 +- include/acpi/acpixf.h | 2 +- include/acpi/apei.h | 4 ++-- include/linux/console.h | 2 +- include/linux/lockd/lockd.h | 2 +- include/linux/mmc/host.h | 2 +- security/apparmor/include/apparmor.h | 10 +++++----- security/apparmor/lsm.c | 12 ++++++------ virt/kvm/iommu.c | 2 +- 283 files changed, 471 insertions(+), 465 deletions(-) (limited to 'fs') diff --git a/drivers/accessibility/braille/braille_console.c b/drivers/accessibility/braille/braille_console.c index cb423f5aef2..c339a0880e6 100644 --- a/drivers/accessibility/braille/braille_console.c +++ b/drivers/accessibility/braille/braille_console.c @@ -44,7 +44,7 @@ MODULE_LICENSE("GPL"); */ /* Emit various sounds */ -static int sound; +static bool sound; module_param(sound, bool, 0); MODULE_PARM_DESC(sound, "emit sounds"); diff --git a/drivers/acpi/acpica/acglobal.h b/drivers/acpi/acpica/acglobal.h index 76dc02f1557..e6652d716e4 100644 --- a/drivers/acpi/acpica/acglobal.h +++ b/drivers/acpi/acpica/acglobal.h @@ -108,7 +108,7 @@ u8 ACPI_INIT_GLOBAL(acpi_gbl_use_default_register_widths, TRUE); /* * Optionally enable output from the AML Debug Object. */ -u32 ACPI_INIT_GLOBAL(acpi_gbl_enable_aml_debug_object, FALSE); +bool ACPI_INIT_GLOBAL(acpi_gbl_enable_aml_debug_object, FALSE); /* * Optionally copy the entire DSDT to local memory (instead of simply diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index b8e08cb67a1..ebaf037a787 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -118,7 +118,7 @@ struct ghes_estatus_cache { struct rcu_head rcu; }; -int ghes_disable; +bool ghes_disable; module_param_named(disable, ghes_disable, bool, 0); static int ghes_panic_timeout __read_mostly = 30; diff --git a/drivers/acpi/apei/hest.c b/drivers/acpi/apei/hest.c index 05fee06f4d6..ee7fddc4665 100644 --- a/drivers/acpi/apei/hest.c +++ b/drivers/acpi/apei/hest.c @@ -41,7 +41,7 @@ #define HEST_PFX "HEST: " -int hest_disable; +bool hest_disable; EXPORT_SYMBOL_GPL(hest_disable); /* HEST table parsing */ diff --git a/drivers/acpi/dock.c b/drivers/acpi/dock.c index 19a61136d84..88eb1430466 100644 --- a/drivers/acpi/dock.c +++ b/drivers/acpi/dock.c @@ -43,7 +43,7 @@ MODULE_AUTHOR("Kristen Carlson Accardi"); MODULE_DESCRIPTION(ACPI_DOCK_DRIVER_DESCRIPTION); MODULE_LICENSE("GPL"); -static int immediate_undock = 1; +static bool immediate_undock = 1; module_param(immediate_undock, bool, 0644); MODULE_PARM_DESC(immediate_undock, "1 (default) will cause the driver to " "undock immediately when the undock button is pressed, 0 will cause" diff --git a/drivers/acpi/pci_slot.c b/drivers/acpi/pci_slot.c index 07f7fea8a4e..e50e31a518a 100644 --- a/drivers/acpi/pci_slot.c +++ b/drivers/acpi/pci_slot.c @@ -34,7 +34,7 @@ #include #include -static int debug; +static bool debug; static int check_sta_before_sun; #define DRIVER_VERSION "0.1" diff --git a/drivers/acpi/video.c b/drivers/acpi/video.c index 08a44b532f7..eaef02afc7c 100644 --- a/drivers/acpi/video.c +++ b/drivers/acpi/video.c @@ -69,21 +69,21 @@ MODULE_AUTHOR("Bruno Ducrot"); MODULE_DESCRIPTION("ACPI Video Driver"); MODULE_LICENSE("GPL"); -static int brightness_switch_enabled = 1; +static bool brightness_switch_enabled = 1; module_param(brightness_switch_enabled, bool, 0644); /* * By default, 
we don't allow duplicate ACPI video bus devices * under the same VGA controller */ -static int allow_duplicates; +static bool allow_duplicates; module_param(allow_duplicates, bool, 0644); /* * Some BIOSes claim they use minimum backlight at boot, * and this may bring dimming screen after boot */ -static int use_bios_initial_backlight = 1; +static bool use_bios_initial_backlight = 1; module_param(use_bios_initial_backlight, bool, 0644); static int register_count = 0; diff --git a/drivers/ata/sata_nv.c b/drivers/ata/sata_nv.c index e0bc9646a38..55d6179dde5 100644 --- a/drivers/ata/sata_nv.c +++ b/drivers/ata/sata_nv.c @@ -599,9 +599,9 @@ MODULE_LICENSE("GPL"); MODULE_DEVICE_TABLE(pci, nv_pci_tbl); MODULE_VERSION(DRV_VERSION); -static int adma_enabled; -static int swncq_enabled = 1; -static int msi_enabled; +static bool adma_enabled; +static bool swncq_enabled = 1; +static bool msi_enabled; static void nv_adma_register_mode(struct ata_port *ap) { diff --git a/drivers/ata/sata_sil24.c b/drivers/ata/sata_sil24.c index 1e9140626a8..e7e610aa9a7 100644 --- a/drivers/ata/sata_sil24.c +++ b/drivers/ata/sata_sil24.c @@ -417,7 +417,7 @@ static struct ata_port_operations sil24_ops = { #endif }; -static int sata_sil24_msi; /* Disable MSI */ +static bool sata_sil24_msi; /* Disable MSI */ module_param_named(msi, sata_sil24_msi, bool, S_IRUGO); MODULE_PARM_DESC(msi, "Enable MSI (Default: false)"); diff --git a/drivers/atm/he.c b/drivers/atm/he.c index 9a51df4f5b7..b182c2f7d77 100644 --- a/drivers/atm/he.c +++ b/drivers/atm/he.c @@ -112,12 +112,12 @@ static u8 read_prom_byte(struct he_dev *he_dev, int addr); /* globals */ static struct he_dev *he_devs; -static int disable64; +static bool disable64; static short nvpibits = -1; static short nvcibits = -1; static short rx_skb_reserve = 16; -static int irq_coalesce = 1; -static int sdh = 0; +static bool irq_coalesce = 1; +static bool sdh = 0; /* Read from EEPROM = 0000 0011b */ static unsigned int readtab[] = { diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 9cf20355cee..8d680562ba7 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -59,8 +59,8 @@ /* module parameter, defined in drbd_main.c */ extern unsigned int minor_count; -extern int disable_sendpage; -extern int allow_oos; +extern bool disable_sendpage; +extern bool allow_oos; extern unsigned int cn_idx; #ifdef CONFIG_DRBD_FAULT_INJECTION diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 0358e55356c..211fc44f84b 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -117,8 +117,8 @@ module_param(fault_devs, int, 0644); /* module parameter, defined */ unsigned int minor_count = DRBD_MINOR_COUNT_DEF; -int disable_sendpage; -int allow_oos; +bool disable_sendpage; +bool allow_oos; unsigned int cn_idx = CN_IDX_DRBD; int proc_details; /* Detail level in proc drbd*/ diff --git a/drivers/block/paride/bpck6.c b/drivers/block/paride/bpck6.c index ad124525ac2..ec64e7f5d1c 100644 --- a/drivers/block/paride/bpck6.c +++ b/drivers/block/paride/bpck6.c @@ -20,9 +20,6 @@ */ -/* PARAMETERS */ -static int verbose; /* set this to 1 to see debugging messages and whatnot */ - #define BACKPACK_VERSION "2.0.2" #include @@ -36,6 +33,8 @@ static int verbose; /* set this to 1 to see debugging messages and whatnot */ #include "ppc6lnx.c" #include "paride.h" +/* PARAMETERS */ +static bool verbose; /* set this to 1 to see debugging messages and whatnot */ #define PPCSTRUCT(pi) ((Interface *)(pi->private)) diff 
--git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c index 869e7676d46..831e3ac156e 100644 --- a/drivers/block/paride/pd.c +++ b/drivers/block/paride/pd.c @@ -124,8 +124,9 @@ by default. */ +#include -static int verbose = 0; +static bool verbose = 0; static int major = PD_MAJOR; static char *name = PD_NAME; static int cluster = 64; diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c index f21b520ef41..ec8f9ed6326 100644 --- a/drivers/block/paride/pf.c +++ b/drivers/block/paride/pf.c @@ -118,13 +118,15 @@ #define PF_NAME "pf" #define PF_UNITS 4 +#include + /* Here are things one can override from the insmod command. Most are autoprobed by paride unless set here. Verbose is off by default. */ -static int verbose = 0; +static bool verbose = 0; static int major = PF_MAJOR; static char *name = PF_NAME; static int cluster = 64; diff --git a/drivers/block/paride/pg.c b/drivers/block/paride/pg.c index a79fb4f7ff6..4a27b1de5fc 100644 --- a/drivers/block/paride/pg.c +++ b/drivers/block/paride/pg.c @@ -130,13 +130,14 @@ #define PI_PG 4 #endif +#include /* Here are things one can override from the insmod command. Most are autoprobed by paride unless set here. Verbose is 0 by default. */ -static int verbose = 0; +static bool verbose = 0; static int major = PG_MAJOR; static char *name = PG_NAME; static int disable = 0; diff --git a/drivers/block/paride/pt.c b/drivers/block/paride/pt.c index 7179f79d746..2596042eb98 100644 --- a/drivers/block/paride/pt.c +++ b/drivers/block/paride/pt.c @@ -109,13 +109,15 @@ #define PT_NAME "pt" #define PT_UNITS 4 +#include + /* Here are things one can override from the insmod command. Most are autoprobed by paride unless set here. Verbose is on by default. */ -static int verbose = 0; +static bool verbose = 0; static int major = PT_MAJOR; static char *name = PT_NAME; static int disable = 0; diff --git a/drivers/block/xd.c b/drivers/block/xd.c index 4abd2bcd20f..51a972704db 100644 --- a/drivers/block/xd.c +++ b/drivers/block/xd.c @@ -148,7 +148,7 @@ static volatile int xdc_busy; static struct timer_list xd_watchdog_int; static volatile u_char xd_error; -static int nodma = XD_DONT_USE_DMA; +static bool nodma = XD_DONT_USE_DMA; static struct request_queue *xd_queue; diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c index 55ac349695c..f00f596c102 100644 --- a/drivers/bluetooth/btusb.c +++ b/drivers/bluetooth/btusb.c @@ -37,13 +37,13 @@ #define VERSION "0.6" -static int ignore_dga; -static int ignore_csr; -static int ignore_sniffer; -static int disable_scofix; -static int force_scofix; +static bool ignore_dga; +static bool ignore_csr; +static bool ignore_sniffer; +static bool disable_scofix; +static bool force_scofix; -static int reset = 1; +static bool reset = 1; static struct usb_driver btusb_driver; diff --git a/drivers/bluetooth/hci_bcsp.c b/drivers/bluetooth/hci_bcsp.c index 9c5b2dc38e2..a767d4de45a 100644 --- a/drivers/bluetooth/hci_bcsp.c +++ b/drivers/bluetooth/hci_bcsp.c @@ -49,8 +49,8 @@ #define VERSION "0.3" -static int txcrc = 1; -static int hciextn = 1; +static bool txcrc = 1; +static bool hciextn = 1; #define BCSP_TXWINSIZE 4 diff --git a/drivers/bluetooth/hci_ldisc.c b/drivers/bluetooth/hci_ldisc.c index 48ad2a7ab08..07114489994 100644 --- a/drivers/bluetooth/hci_ldisc.c +++ b/drivers/bluetooth/hci_ldisc.c @@ -48,7 +48,7 @@ #define VERSION "2.2" -static int reset = 0; +static bool reset = 0; static struct hci_uart_proto *hup[HCI_UART_MAX_PROTO]; diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c index 
2118211aff9..1bbf7645a97 100644 --- a/drivers/cdrom/cdrom.c +++ b/drivers/cdrom/cdrom.c @@ -285,17 +285,17 @@ #include /* used to tell the module to turn on full debugging messages */ -static int debug; +static bool debug; /* used to keep tray locked at all times */ static int keeplocked; /* default compatibility mode */ -static int autoclose=1; -static int autoeject; -static int lockdoor = 1; +static bool autoclose=1; +static bool autoeject; +static bool lockdoor = 1; /* will we ever get to use this... sigh. */ -static int check_media_type; +static bool check_media_type; /* automatically restart mrw format */ -static int mrw_format_restart = 1; +static bool mrw_format_restart = 1; module_param(debug, bool, 0); module_param(autoclose, bool, 0); module_param(autoeject, bool, 0); diff --git a/drivers/char/agp/amd64-agp.c b/drivers/char/agp/amd64-agp.c index 780498d7658..444f8b6ab41 100644 --- a/drivers/char/agp/amd64-agp.c +++ b/drivers/char/agp/amd64-agp.c @@ -33,7 +33,7 @@ #define ULI_X86_64_ENU_SCR_REG 0x54 static struct resource *aperture_resource; -static int __initdata agp_try_unsupported = 1; +static bool __initdata agp_try_unsupported = 1; static int agp_bridges_found; static void amd64_tlbflush(struct agp_memory *temp) diff --git a/drivers/char/agp/sis-agp.c b/drivers/char/agp/sis-agp.c index 29aacd81de7..08704ae5395 100644 --- a/drivers/char/agp/sis-agp.c +++ b/drivers/char/agp/sis-agp.c @@ -17,7 +17,7 @@ #define PCI_DEVICE_ID_SI_662 0x0662 #define PCI_DEVICE_ID_SI_671 0x0671 -static int __devinitdata agp_sis_force_delay = 0; +static bool __devinitdata agp_sis_force_delay = 0; static int __devinitdata agp_sis_agp_spec = -1; static int sis_fetch_size(void) diff --git a/drivers/char/i8k.c b/drivers/char/i8k.c index 6e40072fbf6..40cc0cf2ded 100644 --- a/drivers/char/i8k.c +++ b/drivers/char/i8k.c @@ -69,19 +69,19 @@ MODULE_AUTHOR("Massimo Dal Zotto (dz@debian.org)"); MODULE_DESCRIPTION("Driver for accessing SMM BIOS on Dell laptops"); MODULE_LICENSE("GPL"); -static int force; +static bool force; module_param(force, bool, 0); MODULE_PARM_DESC(force, "Force loading without checking for supported models"); -static int ignore_dmi; +static bool ignore_dmi; module_param(ignore_dmi, bool, 0); MODULE_PARM_DESC(ignore_dmi, "Continue probing hardware even if DMI data does not match"); -static int restricted; +static bool restricted; module_param(restricted, bool, 0); MODULE_PARM_DESC(restricted, "Allow fan control if SYS_ADMIN capability set"); -static int power_status; +static bool power_status; module_param(power_status, bool, 0600); MODULE_PARM_DESC(power_status, "Report power status in /proc/i8k"); diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c index 9397ab49b72..50fcf9c0456 100644 --- a/drivers/char/ipmi/ipmi_si_intf.c +++ b/drivers/char/ipmi/ipmi_si_intf.c @@ -1227,7 +1227,7 @@ static int smi_num; /* Used to sequence the SMIs */ #define DEFAULT_REGSPACING 1 #define DEFAULT_REGSIZE 1 -static int si_trydefaults = 1; +static bool si_trydefaults = 1; static char *si_type[SI_MAX_PARMS]; #define MAX_SI_TYPE_STR 30 static char si_type_str[MAX_SI_TYPE_STR]; diff --git a/drivers/char/lp.c b/drivers/char/lp.c index 97c3edb95ae..f4348560706 100644 --- a/drivers/char/lp.c +++ b/drivers/char/lp.c @@ -829,7 +829,7 @@ static struct console lpcons = { static int parport_nr[LP_NO] = { [0 ... 
LP_NO-1] = LP_PARPORT_UNSPEC }; static char *parport[LP_NO]; -static int reset; +static bool reset; module_param_array(parport, charp, NULL, 0); module_param(reset, bool, 0); diff --git a/drivers/char/nwflash.c b/drivers/char/nwflash.c index a12f52400db..bf586ae1ee8 100644 --- a/drivers/char/nwflash.c +++ b/drivers/char/nwflash.c @@ -51,7 +51,7 @@ static int write_block(unsigned long p, const char __user *buf, int count); #define KFLASH_ID 0x89A6 //Intel flash #define KFLASH_ID4 0xB0D4 //Intel flash 4Meg -static int flashdebug; //if set - we will display progress msgs +static bool flashdebug; //if set - we will display progress msgs static int gbWriteEnable; static int gbWriteBase64Enable; diff --git a/drivers/char/pcmcia/synclink_cs.c b/drivers/char/pcmcia/synclink_cs.c index 15781396af2..07f6a5abe37 100644 --- a/drivers/char/pcmcia/synclink_cs.c +++ b/drivers/char/pcmcia/synclink_cs.c @@ -439,7 +439,7 @@ static int mgslpc_device_count = 0; * .text section address and breakpoint on module load. * This is useful for use with gdb and add-symbol-file command. */ -static int break_on_load=0; +static bool break_on_load=0; /* * Driver major number, defaults to zero to get auto diff --git a/drivers/char/random.c b/drivers/char/random.c index 85da8740586..732215b805c 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -387,7 +387,7 @@ static DECLARE_WAIT_QUEUE_HEAD(random_write_wait); static struct fasync_struct *fasync; #if 0 -static int debug; +static bool debug; module_param(debug, bool, 0644); #define DEBUG_ENT(fmt, arg...) do { \ if (debug) \ diff --git a/drivers/char/tpm/tpm_tis.c b/drivers/char/tpm/tpm_tis.c index 10cc44ceb5d..a1748621111 100644 --- a/drivers/char/tpm/tpm_tis.c +++ b/drivers/char/tpm/tpm_tis.c @@ -255,7 +255,7 @@ out: return size; } -static int itpm; +static bool itpm; module_param(itpm, bool, 0444); MODULE_PARM_DESC(itpm, "Force iTPM workarounds (found on some Lenovo laptops)"); @@ -500,7 +500,7 @@ static irqreturn_t tis_int_handler(int dummy, void *dev_id) return IRQ_HANDLED; } -static int interrupts = 1; +static bool interrupts = 1; module_param(interrupts, bool, 0444); MODULE_PARM_DESC(interrupts, "Enable interrupts"); @@ -828,7 +828,7 @@ static struct platform_driver tis_drv = { static struct platform_device *pdev; -static int force; +static bool force; module_param(force, bool, 0444); MODULE_PARM_DESC(force, "Force device probe rather than using ACPI entry"); static int __init init_tis(void) diff --git a/drivers/edac/r82600_edac.c b/drivers/edac/r82600_edac.c index b153674431f..e294e1b3616 100644 --- a/drivers/edac/r82600_edac.c +++ b/drivers/edac/r82600_edac.c @@ -131,7 +131,7 @@ struct r82600_error_info { u32 eapr; }; -static unsigned int disable_hardware_scrub; +static bool disable_hardware_scrub; static struct edac_pci_ctl_info *r82600_pci; diff --git a/drivers/firewire/sbp2.c b/drivers/firewire/sbp2.c index 68375bc3aef..80e95aa3bf1 100644 --- a/drivers/firewire/sbp2.c +++ b/drivers/firewire/sbp2.c @@ -66,7 +66,7 @@ * * Concurrent logins are useful together with cluster filesystems. 
*/ -static int sbp2_param_exclusive_login = 1; +static bool sbp2_param_exclusive_login = 1; module_param_named(exclusive_login, sbp2_param_exclusive_login, bool, 0644); MODULE_PARM_DESC(exclusive_login, "Exclusive login to sbp2 device " "(default = Y, use N for concurrent initiators)"); diff --git a/drivers/hid/hid-prodikeys.c b/drivers/hid/hid-prodikeys.c index f779009104e..b71b77ab0dc 100644 --- a/drivers/hid/hid-prodikeys.c +++ b/drivers/hid/hid-prodikeys.c @@ -90,7 +90,7 @@ static const char longname[] = "Prodikeys PC-MIDI Keyboard"; static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX; static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR; -static int enable[SNDRV_CARDS] = SNDRV_DEFAULT_ENABLE_PNP; +static bool enable[SNDRV_CARDS] = SNDRV_DEFAULT_ENABLE_PNP; module_param_array(index, int, NULL, 0444); module_param_array(id, charp, NULL, 0444); diff --git a/drivers/hwmon/abituguru.c b/drivers/hwmon/abituguru.c index 65a35cf5b3c..3b728e8f169 100644 --- a/drivers/hwmon/abituguru.c +++ b/drivers/hwmon/abituguru.c @@ -145,7 +145,7 @@ static const u8 abituguru_pwm_max[5] = { 0, 255, 255, 75, 75 }; /* Insmod parameters */ -static int force; +static bool force; module_param(force, bool, 0); MODULE_PARM_DESC(force, "Set to one to force detection."); static int bank1_types[ABIT_UGURU_MAX_BANK1_SENSORS] = { -1, -1, -1, -1, -1, diff --git a/drivers/hwmon/abituguru3.c b/drivers/hwmon/abituguru3.c index d30855a7578..34a14a77e00 100644 --- a/drivers/hwmon/abituguru3.c +++ b/drivers/hwmon/abituguru3.c @@ -603,11 +603,11 @@ static const struct abituguru3_motherboard_info abituguru3_motherboards[] = { /* Insmod parameters */ -static int force; +static bool force; module_param(force, bool, 0); MODULE_PARM_DESC(force, "Set to one to force detection."); /* Default verbose is 1, since this driver is still in the testing phase */ -static int verbose = 1; +static bool verbose = 1; module_param(verbose, bool, 0644); MODULE_PARM_DESC(verbose, "Enable/disable verbose error reporting"); diff --git a/drivers/hwmon/acpi_power_meter.c b/drivers/hwmon/acpi_power_meter.c index 522860ab6ce..554f046bcf2 100644 --- a/drivers/hwmon/acpi_power_meter.c +++ b/drivers/hwmon/acpi_power_meter.c @@ -58,7 +58,7 @@ ACPI_MODULE_NAME(ACPI_POWER_METER_NAME); #define POWER_ALARM_NAME "power1_alarm" static int cap_in_hardware; -static int force_cap_on; +static bool force_cap_on; static int can_cap_in_hardware(void) { diff --git a/drivers/hwmon/adm1021.c b/drivers/hwmon/adm1021.c index 1ad0a885c5a..0158cc35cb2 100644 --- a/drivers/hwmon/adm1021.c +++ b/drivers/hwmon/adm1021.c @@ -103,7 +103,7 @@ static int adm1021_remove(struct i2c_client *client); static struct adm1021_data *adm1021_update_device(struct device *dev); /* (amalysh) read only mode, otherwise any limit's writing confuse BIOS */ -static int read_only; +static bool read_only; static const struct i2c_device_id adm1021_id[] = { diff --git a/drivers/hwmon/ads7828.c b/drivers/hwmon/ads7828.c index cfcc3b6fb6b..ed60242d6a0 100644 --- a/drivers/hwmon/ads7828.c +++ b/drivers/hwmon/ads7828.c @@ -48,8 +48,8 @@ static const unsigned short normal_i2c[] = { 0x48, 0x49, 0x4a, 0x4b, I2C_CLIENT_END }; /* Module parameters */ -static int se_input = 1; /* Default is SE, 0 == diff */ -static int int_vref = 1; /* Default is internal ref ON */ +static bool se_input = 1; /* Default is SE, 0 == diff */ +static bool int_vref = 1; /* Default is internal ref ON */ static int vref_mv = ADS7828_INT_VREF_MV; /* set if vref != 2.5V */ module_param(se_input, bool, S_IRUGO); module_param(int_vref, bool, 
S_IRUGO); diff --git a/drivers/hwmon/dme1737.c b/drivers/hwmon/dme1737.c index d9803958e49..ffb229af786 100644 --- a/drivers/hwmon/dme1737.c +++ b/drivers/hwmon/dme1737.c @@ -45,7 +45,7 @@ static struct platform_device *pdev; /* Module load parameters */ -static int force_start; +static bool force_start; module_param(force_start, bool, 0); MODULE_PARM_DESC(force_start, "Force the chip to start monitoring inputs"); @@ -53,7 +53,7 @@ static unsigned short force_id; module_param(force_id, ushort, 0); MODULE_PARM_DESC(force_id, "Override the detected device ID"); -static int probe_all_addr; +static bool probe_all_addr; module_param(probe_all_addr, bool, 0); MODULE_PARM_DESC(probe_all_addr, "Include probing of non-standard LPC " "addresses"); diff --git a/drivers/hwmon/it87.c b/drivers/hwmon/it87.c index 38c0b87676d..603ef2af270 100644 --- a/drivers/hwmon/it87.c +++ b/drivers/hwmon/it87.c @@ -146,10 +146,10 @@ static inline void superio_exit(void) #define IT87_SIO_BEEP_PIN_REG 0xf6 /* Beep pin mapping */ /* Update battery voltage after every reading if true */ -static int update_vbat; +static bool update_vbat; /* Not all BIOSes properly configure the PWM registers */ -static int fix_pwm_polarity; +static bool fix_pwm_polarity; /* Many IT87 constants specified below */ diff --git a/drivers/hwmon/lm93.c b/drivers/hwmon/lm93.c index 3b43df41861..8bd6c5c9e05 100644 --- a/drivers/hwmon/lm93.c +++ b/drivers/hwmon/lm93.c @@ -151,12 +151,12 @@ static const unsigned short normal_i2c[] = { 0x2c, 0x2d, 0x2e, I2C_CLIENT_END }; /* Insmod parameters */ -static int disable_block; +static bool disable_block; module_param(disable_block, bool, 0); MODULE_PARM_DESC(disable_block, "Set to non-zero to disable SMBus block data transactions."); -static int init; +static bool init; module_param(init, bool, 0); MODULE_PARM_DESC(init, "Set to non-zero to force chip initialization."); diff --git a/drivers/hwmon/max1668.c b/drivers/hwmon/max1668.c index 6914195cfd3..88953f99e91 100644 --- a/drivers/hwmon/max1668.c +++ b/drivers/hwmon/max1668.c @@ -59,7 +59,7 @@ static unsigned short max1668_addr_list[] = { #define DEV_ID_MAX1989 0xb /* read only mode module parameter */ -static int read_only; +static bool read_only; module_param(read_only, bool, 0); MODULE_PARM_DESC(read_only, "Don't set any values, read only mode"); diff --git a/drivers/hwmon/w83627hf.c b/drivers/hwmon/w83627hf.c index bde50e34d01..374118f2b9f 100644 --- a/drivers/hwmon/w83627hf.c +++ b/drivers/hwmon/w83627hf.c @@ -71,7 +71,7 @@ module_param(force_i2c, byte, 0); MODULE_PARM_DESC(force_i2c, "Initialize the i2c address of the sensors"); -static int init = 1; +static bool init = 1; module_param(init, bool, 0); MODULE_PARM_DESC(init, "Set to zero to bypass chip initialization"); diff --git a/drivers/hwmon/w83781d.c b/drivers/hwmon/w83781d.c index 65b685e2c7b..17a8fa2d9ae 100644 --- a/drivers/hwmon/w83781d.c +++ b/drivers/hwmon/w83781d.c @@ -67,11 +67,11 @@ module_param_array(force_subclients, short, NULL, 0); MODULE_PARM_DESC(force_subclients, "List of subclient addresses: " "{bus, clientaddr, subclientaddr1, subclientaddr2}"); -static int reset; +static bool reset; module_param(reset, bool, 0); MODULE_PARM_DESC(reset, "Set to one to reset chip on load"); -static int init = 1; +static bool init = 1; module_param(init, bool, 0); MODULE_PARM_DESC(init, "Set to zero to bypass chip initialization"); diff --git a/drivers/hwmon/w83791d.c b/drivers/hwmon/w83791d.c index 6e5d0ae594b..35aa5149307 100644 --- a/drivers/hwmon/w83791d.c +++ b/drivers/hwmon/w83791d.c @@ 
-58,11 +58,11 @@ module_param_array(force_subclients, short, NULL, 0); MODULE_PARM_DESC(force_subclients, "List of subclient addresses: " "{bus, clientaddr, subclientaddr1, subclientaddr2}"); -static int reset; +static bool reset; module_param(reset, bool, 0); MODULE_PARM_DESC(reset, "Set to one to force a hardware chip reset"); -static int init; +static bool init; module_param(init, bool, 0); MODULE_PARM_DESC(init, "Set to one to force extra software initialization"); diff --git a/drivers/hwmon/w83792d.c b/drivers/hwmon/w83792d.c index 9ded133e43f..d3100eab6b2 100644 --- a/drivers/hwmon/w83792d.c +++ b/drivers/hwmon/w83792d.c @@ -56,7 +56,7 @@ module_param_array(force_subclients, short, NULL, 0); MODULE_PARM_DESC(force_subclients, "List of subclient addresses: " "{bus, clientaddr, subclientaddr1, subclientaddr2}"); -static int init; +static bool init; module_param(init, bool, 0); MODULE_PARM_DESC(init, "Set to one to force chip initialization"); diff --git a/drivers/hwmon/w83793.c b/drivers/hwmon/w83793.c index 3cc6fef2208..45ec7e7c3c2 100644 --- a/drivers/hwmon/w83793.c +++ b/drivers/hwmon/w83793.c @@ -61,7 +61,7 @@ module_param_array(force_subclients, short, NULL, 0); MODULE_PARM_DESC(force_subclients, "List of subclient addresses: " "{bus, clientaddr, subclientaddr1, subclientaddr2}"); -static int reset; +static bool reset; module_param(reset, bool, 0); MODULE_PARM_DESC(reset, "Set to 1 to reset chip, not recommended"); diff --git a/drivers/hwmon/w83795.c b/drivers/hwmon/w83795.c index 3ee398d0e4c..aa58b25565b 100644 --- a/drivers/hwmon/w83795.c +++ b/drivers/hwmon/w83795.c @@ -42,7 +42,7 @@ static const unsigned short normal_i2c[] = { }; -static int reset; +static bool reset; module_param(reset, bool, 0); MODULE_PARM_DESC(reset, "Set to 1 to reset chip, not recommended"); diff --git a/drivers/hwmon/w83l786ng.c b/drivers/hwmon/w83l786ng.c index 0254e181893..063bd9508d8 100644 --- a/drivers/hwmon/w83l786ng.c +++ b/drivers/hwmon/w83l786ng.c @@ -39,7 +39,7 @@ static const unsigned short normal_i2c[] = { 0x2e, 0x2f, I2C_CLIENT_END }; /* Insmod parameters */ -static int reset; +static bool reset; module_param(reset, bool, 0); MODULE_PARM_DESC(reset, "Set to 1 to reset chip, not recommended"); diff --git a/drivers/i2c/busses/i2c-highlander.c b/drivers/i2c/busses/i2c-highlander.c index 63bb1cc2a04..fa88868cb55 100644 --- a/drivers/i2c/busses/i2c-highlander.c +++ b/drivers/i2c/busses/i2c-highlander.c @@ -52,7 +52,7 @@ struct highlander_i2c_dev { size_t buf_len; }; -static int iic_force_poll, iic_force_normal; +static bool iic_force_poll, iic_force_normal; static int iic_timeout = 1000, iic_read_delay; static inline void highlander_i2c_irq_enable(struct highlander_i2c_dev *dev) diff --git a/drivers/i2c/busses/i2c-ibm_iic.c b/drivers/i2c/busses/i2c-ibm_iic.c index 3c110fbc409..c08ceb957aa 100644 --- a/drivers/i2c/busses/i2c-ibm_iic.c +++ b/drivers/i2c/busses/i2c-ibm_iic.c @@ -51,11 +51,11 @@ MODULE_DESCRIPTION("IBM IIC driver v" DRIVER_VERSION); MODULE_LICENSE("GPL"); -static int iic_force_poll; +static bool iic_force_poll; module_param(iic_force_poll, bool, 0); MODULE_PARM_DESC(iic_force_poll, "Force polling mode"); -static int iic_force_fast; +static bool iic_force_fast; module_param(iic_force_fast, bool, 0); MODULE_PARM_DESC(iic_force_fast, "Force fast mode (400 kHz)"); diff --git a/drivers/i2c/busses/i2c-sis630.c b/drivers/i2c/busses/i2c-sis630.c index e6f539e26f6..58893772c3d 100644 --- a/drivers/i2c/busses/i2c-sis630.c +++ b/drivers/i2c/busses/i2c-sis630.c @@ -93,8 +93,8 @@ static struct 
pci_driver sis630_driver; /* insmod parameters */ -static int high_clock; -static int force; +static bool high_clock; +static bool force; module_param(high_clock, bool, 0); MODULE_PARM_DESC(high_clock, "Set Host Master Clock to 56KHz (default 14KHz)."); module_param(force, bool, 0); diff --git a/drivers/i2c/busses/i2c-viapro.c b/drivers/i2c/busses/i2c-viapro.c index 0b012f1f8ac..2a62c998044 100644 --- a/drivers/i2c/busses/i2c-viapro.c +++ b/drivers/i2c/busses/i2c-viapro.c @@ -91,7 +91,7 @@ static unsigned short SMBHSTCFG = 0xD2; /* If force is set to anything different from 0, we forcibly enable the VT596. DANGEROUS! */ -static int force; +static bool force; module_param(force, bool, 0); MODULE_PARM_DESC(force, "Forcibly enable the SMBus. DANGEROUS!"); diff --git a/drivers/ide/ali14xx.c b/drivers/ide/ali14xx.c index 25b9fe3a9f8..d3be99fb415 100644 --- a/drivers/ide/ali14xx.c +++ b/drivers/ide/ali14xx.c @@ -221,7 +221,7 @@ static int __init ali14xx_probe(void) return ide_legacy_device_add(&ali14xx_port_info, 0); } -static int probe_ali14xx; +static bool probe_ali14xx; module_param_named(probe, probe_ali14xx, bool, 0); MODULE_PARM_DESC(probe, "probe for ALI M14xx chipsets"); diff --git a/drivers/ide/cmd640.c b/drivers/ide/cmd640.c index a81bd757579..14717304b38 100644 --- a/drivers/ide/cmd640.c +++ b/drivers/ide/cmd640.c @@ -111,7 +111,7 @@ #define DRV_NAME "cmd640" -static int cmd640_vlb; +static bool cmd640_vlb; /* * CMD640 specific registers definition. diff --git a/drivers/ide/dtc2278.c b/drivers/ide/dtc2278.c index 6929f7fce93..46af4743b3e 100644 --- a/drivers/ide/dtc2278.c +++ b/drivers/ide/dtc2278.c @@ -130,7 +130,7 @@ static int __init dtc2278_probe(void) return ide_legacy_device_add(&dtc2278_port_info, 0); } -static int probe_dtc2278; +static bool probe_dtc2278; module_param_named(probe, probe_dtc2278, bool, 0); MODULE_PARM_DESC(probe, "probe for DTC2278xx chipsets"); diff --git a/drivers/ide/gayle.c b/drivers/ide/gayle.c index 3feaa26410b..51beb85250d 100644 --- a/drivers/ide/gayle.c +++ b/drivers/ide/gayle.c @@ -50,7 +50,7 @@ GAYLE_NUM_HWIFS-1) #define GAYLE_HAS_CONTROL_REG (!ide_doubler) -static int ide_doubler; +static bool ide_doubler; module_param_named(doubler, ide_doubler, bool, 0); MODULE_PARM_DESC(doubler, "enable support for IDE doublers"); diff --git a/drivers/ide/ht6560b.c b/drivers/ide/ht6560b.c index 808bcdcbf8e..986f2513eab 100644 --- a/drivers/ide/ht6560b.c +++ b/drivers/ide/ht6560b.c @@ -317,7 +317,7 @@ static void __init ht6560b_init_dev(ide_drive_t *drive) ide_set_drivedata(drive, (void *)t); } -static int probe_ht6560b; +static bool probe_ht6560b; module_param_named(probe, probe_ht6560b, bool, 0); MODULE_PARM_DESC(probe, "probe for HT6560B chipset"); diff --git a/drivers/ide/ide-4drives.c b/drivers/ide/ide-4drives.c index 979d342c338..547d7cf2e01 100644 --- a/drivers/ide/ide-4drives.c +++ b/drivers/ide/ide-4drives.c @@ -6,7 +6,7 @@ #define DRV_NAME "ide-4drives" -static int probe_4drives; +static bool probe_4drives; module_param_named(probe, probe_4drives, bool, 0); MODULE_PARM_DESC(probe, "probe for generic IDE chipset with 4 drives/port"); diff --git a/drivers/ide/ide-acpi.c b/drivers/ide/ide-acpi.c index f22edc66b03..f1a6796b165 100644 --- a/drivers/ide/ide-acpi.c +++ b/drivers/ide/ide-acpi.c @@ -53,15 +53,15 @@ struct ide_acpi_hwif_link { #define DEBPRINT(fmt, args...) 
do {} while (0) #endif /* DEBUGGING */ -static int ide_noacpi; +static bool ide_noacpi; module_param_named(noacpi, ide_noacpi, bool, 0); MODULE_PARM_DESC(noacpi, "disable IDE ACPI support"); -static int ide_acpigtf; +static bool ide_acpigtf; module_param_named(acpigtf, ide_acpigtf, bool, 0); MODULE_PARM_DESC(acpigtf, "enable IDE ACPI _GTF support"); -static int ide_acpionboot; +static bool ide_acpionboot; module_param_named(acpionboot, ide_acpionboot, bool, 0); MODULE_PARM_DESC(acpionboot, "call IDE ACPI methods on boot"); diff --git a/drivers/ide/ide-pci-generic.c b/drivers/ide/ide-pci-generic.c index a743e68a890..7f56b738d76 100644 --- a/drivers/ide/ide-pci-generic.c +++ b/drivers/ide/ide-pci-generic.c @@ -28,7 +28,7 @@ #define DRV_NAME "ide_pci_generic" -static int ide_generic_all; /* Set to claim all devices */ +static bool ide_generic_all; /* Set to claim all devices */ module_param_named(all_generic_ide, ide_generic_all, bool, 0444); MODULE_PARM_DESC(all_generic_ide, "IDE generic will claim all unknown PCI IDE storage controllers."); diff --git a/drivers/ide/qd65xx.c b/drivers/ide/qd65xx.c index 3f0244fd8e6..8bbfe5557c7 100644 --- a/drivers/ide/qd65xx.c +++ b/drivers/ide/qd65xx.c @@ -417,7 +417,7 @@ static int __init qd_probe(int base) return rc; } -static int probe_qd65xx; +static bool probe_qd65xx; module_param_named(probe, probe_qd65xx, bool, 0); MODULE_PARM_DESC(probe, "probe for QD65xx chipsets"); diff --git a/drivers/ide/umc8672.c b/drivers/ide/umc8672.c index 47adcd09cb2..5cfb7812066 100644 --- a/drivers/ide/umc8672.c +++ b/drivers/ide/umc8672.c @@ -160,7 +160,7 @@ static int __init umc8672_probe(void) return ide_legacy_device_add(&umc8672_port_info, 0); } -static int probe_umc8672; +static bool probe_umc8672; module_param_named(probe, probe_umc8672, bool, 0); MODULE_PARM_DESC(probe, "probe for UMC8672 chipset"); diff --git a/drivers/infiniband/hw/ehca/ehca_classes.h b/drivers/infiniband/hw/ehca/ehca_classes.h index aaf6023a483..f08f6eaf3fa 100644 --- a/drivers/infiniband/hw/ehca/ehca_classes.h +++ b/drivers/infiniband/hw/ehca/ehca_classes.h @@ -379,8 +379,8 @@ extern spinlock_t shca_list_lock; extern int ehca_static_rate; extern int ehca_port_act_time; -extern int ehca_use_hp_mr; -extern int ehca_scaling_code; +extern bool ehca_use_hp_mr; +extern bool ehca_scaling_code; extern int ehca_lock_hcalls; extern int ehca_nr_ports; extern int ehca_max_cq; diff --git a/drivers/infiniband/hw/ehca/ehca_main.c b/drivers/infiniband/hw/ehca/ehca_main.c index 8af8d4f7bdb..832e7a7d0ae 100644 --- a/drivers/infiniband/hw/ehca/ehca_main.c +++ b/drivers/infiniband/hw/ehca/ehca_main.c @@ -59,16 +59,16 @@ MODULE_AUTHOR("Christoph Raisch "); MODULE_DESCRIPTION("IBM eServer HCA InfiniBand Device Driver"); MODULE_VERSION(HCAD_VERSION); -static int ehca_open_aqp1 = 0; +static bool ehca_open_aqp1 = 0; static int ehca_hw_level = 0; -static int ehca_poll_all_eqs = 1; +static bool ehca_poll_all_eqs = 1; int ehca_debug_level = 0; int ehca_nr_ports = -1; -int ehca_use_hp_mr = 0; +bool ehca_use_hp_mr = 0; int ehca_port_act_time = 30; int ehca_static_rate = -1; -int ehca_scaling_code = 0; +bool ehca_scaling_code = 0; int ehca_lock_hcalls = -1; int ehca_max_cq = -1; int ehca_max_qp = -1; diff --git a/drivers/infiniband/hw/nes/nes.c b/drivers/infiniband/hw/nes/nes.c index 5965b3df8f2..7013da5e9ed 100644 --- a/drivers/infiniband/hw/nes/nes.c +++ b/drivers/infiniband/hw/nes/nes.c @@ -96,7 +96,7 @@ unsigned int wqm_quanta = 0x10000; module_param(wqm_quanta, int, 0644); MODULE_PARM_DESC(wqm_quanta, "WQM quanta"); 
-static unsigned int limit_maxrdreqsz; +static bool limit_maxrdreqsz; module_param(limit_maxrdreqsz, bool, 0644); MODULE_PARM_DESC(limit_maxrdreqsz, "Limit max read request size to 256 Bytes"); diff --git a/drivers/input/joystick/xpad.c b/drivers/input/joystick/xpad.c index 32bbd4c77b7..fd7a0d5bc94 100644 --- a/drivers/input/joystick/xpad.c +++ b/drivers/input/joystick/xpad.c @@ -98,15 +98,15 @@ #define XTYPE_XBOX360W 2 #define XTYPE_UNKNOWN 3 -static int dpad_to_buttons; +static bool dpad_to_buttons; module_param(dpad_to_buttons, bool, S_IRUGO); MODULE_PARM_DESC(dpad_to_buttons, "Map D-PAD to buttons rather than axes for unknown pads"); -static int triggers_to_buttons; +static bool triggers_to_buttons; module_param(triggers_to_buttons, bool, S_IRUGO); MODULE_PARM_DESC(triggers_to_buttons, "Map triggers to buttons rather than axes for unknown pads"); -static int sticks_to_null; +static bool sticks_to_null; module_param(sticks_to_null, bool, S_IRUGO); MODULE_PARM_DESC(sticks_to_null, "Do not map sticks at all for unknown pads"); diff --git a/drivers/input/misc/wistron_btns.c b/drivers/input/misc/wistron_btns.c index 52b41934898..e2bdfd4bea7 100644 --- a/drivers/input/misc/wistron_btns.c +++ b/drivers/input/misc/wistron_btns.c @@ -48,7 +48,7 @@ MODULE_DESCRIPTION("Wistron laptop button driver"); MODULE_LICENSE("GPL v2"); MODULE_VERSION("0.3"); -static int force; /* = 0; */ +static bool force; /* = 0; */ module_param(force, bool, 0); MODULE_PARM_DESC(force, "Load even if computer is not in database"); diff --git a/drivers/input/mouse/psmouse-base.c b/drivers/input/mouse/psmouse-base.c index de7e8bc17b1..e6c9931f02c 100644 --- a/drivers/input/mouse/psmouse-base.c +++ b/drivers/input/mouse/psmouse-base.c @@ -60,7 +60,7 @@ static unsigned int psmouse_rate = 100; module_param_named(rate, psmouse_rate, uint, 0644); MODULE_PARM_DESC(rate, "Report rate, in reports per second."); -static unsigned int psmouse_smartscroll = 1; +static bool psmouse_smartscroll = 1; module_param_named(smartscroll, psmouse_smartscroll, bool, 0644); MODULE_PARM_DESC(smartscroll, "Logitech Smartscroll autorepeat, 1 = enabled (default), 0 = disabled."); diff --git a/drivers/input/mouse/synaptics_i2c.c b/drivers/input/mouse/synaptics_i2c.c index 4b755cb5b38..1c58aafa523 100644 --- a/drivers/input/mouse/synaptics_i2c.c +++ b/drivers/input/mouse/synaptics_i2c.c @@ -185,17 +185,17 @@ #define NO_DATA_SLEEP_MSECS (MSEC_PER_SEC / 4) /* Control touchpad's No Deceleration option */ -static int no_decel = 1; +static bool no_decel = 1; module_param(no_decel, bool, 0644); MODULE_PARM_DESC(no_decel, "No Deceleration. Default = 1 (on)"); /* Control touchpad's Reduced Reporting option */ -static int reduce_report; +static bool reduce_report; module_param(reduce_report, bool, 0644); MODULE_PARM_DESC(reduce_report, "Reduced Reporting. Default = 0 (off)"); /* Control touchpad's No Filter option */ -static int no_filter; +static bool no_filter; module_param(no_filter, bool, 0644); MODULE_PARM_DESC(no_filter, "No Filter. 
Default = 0 (off)"); diff --git a/drivers/input/serio/hp_sdc.c b/drivers/input/serio/hp_sdc.c index 979c443bf1e..be3316073ae 100644 --- a/drivers/input/serio/hp_sdc.c +++ b/drivers/input/serio/hp_sdc.c @@ -105,7 +105,7 @@ EXPORT_SYMBOL(__hp_sdc_enqueue_transaction); EXPORT_SYMBOL(hp_sdc_enqueue_transaction); EXPORT_SYMBOL(hp_sdc_dequeue_transaction); -static unsigned int hp_sdc_disabled; +static bool hp_sdc_disabled; module_param_named(no_hpsdc, hp_sdc_disabled, bool, 0); MODULE_PARM_DESC(no_hpsdc, "Do not enable HP SDC driver."); diff --git a/drivers/input/touchscreen/eeti_ts.c b/drivers/input/touchscreen/eeti_ts.c index 7f8f538a980..1df19bb8534 100644 --- a/drivers/input/touchscreen/eeti_ts.c +++ b/drivers/input/touchscreen/eeti_ts.c @@ -35,11 +35,11 @@ #include #include -static int flip_x; +static bool flip_x; module_param(flip_x, bool, 0644); MODULE_PARM_DESC(flip_x, "flip x coordinate"); -static int flip_y; +static bool flip_y; module_param(flip_y, bool, 0644); MODULE_PARM_DESC(flip_y, "flip y coordinate"); diff --git a/drivers/input/touchscreen/htcpen.c b/drivers/input/touchscreen/htcpen.c index 81e33862394..d13143b68b3 100644 --- a/drivers/input/touchscreen/htcpen.c +++ b/drivers/input/touchscreen/htcpen.c @@ -40,10 +40,10 @@ MODULE_LICENSE("GPL"); #define X_AXIS_MAX 2040 #define Y_AXIS_MAX 2040 -static int invert_x; +static bool invert_x; module_param(invert_x, bool, 0644); MODULE_PARM_DESC(invert_x, "If set, X axis is inverted"); -static int invert_y; +static bool invert_y; module_param(invert_y, bool, 0644); MODULE_PARM_DESC(invert_y, "If set, Y axis is inverted"); diff --git a/drivers/input/touchscreen/ucb1400_ts.c b/drivers/input/touchscreen/ucb1400_ts.c index d2b57536fee..46e83ad53f4 100644 --- a/drivers/input/touchscreen/ucb1400_ts.c +++ b/drivers/input/touchscreen/ucb1400_ts.c @@ -30,7 +30,7 @@ #define UCB1400_TS_POLL_PERIOD 10 /* ms */ -static int adcsync; +static bool adcsync; static int ts_delay = 55; /* us */ static int ts_delay_pressure; /* us */ diff --git a/drivers/input/touchscreen/usbtouchscreen.c b/drivers/input/touchscreen/usbtouchscreen.c index 06cef3ccc63..3a5ebf452e8 100644 --- a/drivers/input/touchscreen/usbtouchscreen.c +++ b/drivers/input/touchscreen/usbtouchscreen.c @@ -60,11 +60,11 @@ #define DRIVER_AUTHOR "Daniel Ritz " #define DRIVER_DESC "USB Touchscreen Driver" -static int swap_xy; +static bool swap_xy; module_param(swap_xy, bool, 0644); MODULE_PARM_DESC(swap_xy, "If set X and Y axes are swapped."); -static int hwcalib_xy; +static bool hwcalib_xy; module_param(hwcalib_xy, bool, 0644); MODULE_PARM_DESC(hwcalib_xy, "If set hw-calibrated X/Y are used if available"); diff --git a/drivers/isdn/hardware/avm/b1dma.c b/drivers/isdn/hardware/avm/b1dma.c index 9c8d7aa053c..a0ed668d4d2 100644 --- a/drivers/isdn/hardware/avm/b1dma.c +++ b/drivers/isdn/hardware/avm/b1dma.c @@ -40,7 +40,7 @@ MODULE_DESCRIPTION("CAPI4Linux: DMA support for active AVM cards"); MODULE_AUTHOR("Carsten Paeth"); MODULE_LICENSE("GPL"); -static int suppress_pollack = 0; +static bool suppress_pollack = 0; module_param(suppress_pollack, bool, 0); /* ------------------------------------------------------------- */ diff --git a/drivers/isdn/hardware/avm/c4.c b/drivers/isdn/hardware/avm/c4.c index d3530f6e811..9743b24ef9d 100644 --- a/drivers/isdn/hardware/avm/c4.c +++ b/drivers/isdn/hardware/avm/c4.c @@ -40,7 +40,7 @@ static char *revision = "$Revision: 1.1.2.2 $"; /* ------------------------------------------------------------- */ -static int suppress_pollack; +static bool suppress_pollack; 
static struct pci_device_id c4_pci_tbl[] = { { PCI_VENDOR_ID_DEC, PCI_DEVICE_ID_DEC_21285, PCI_VENDOR_ID_AVM, PCI_DEVICE_ID_AVM_C4, 0, 0, (unsigned long)4 }, diff --git a/drivers/isdn/sc/init.c b/drivers/isdn/sc/init.c index ca710ab278e..023de789f25 100644 --- a/drivers/isdn/sc/init.c +++ b/drivers/isdn/sc/init.c @@ -30,7 +30,7 @@ static const char *boardname[] = { "DataCommute/BRI", "DataCommute/PRI", "TeleCo static unsigned int io[] = {0,0,0,0}; static unsigned char irq[] = {0,0,0,0}; static unsigned long ram[] = {0,0,0,0}; -static int do_reset = 0; +static bool do_reset = 0; module_param_array(io, int, NULL, 0); module_param_array(irq, int, NULL, 0); diff --git a/drivers/leds/leds-clevo-mail.c b/drivers/leds/leds-clevo-mail.c index a498135a4e8..1ed1677c916 100644 --- a/drivers/leds/leds-clevo-mail.c +++ b/drivers/leds/leds-clevo-mail.c @@ -18,7 +18,7 @@ MODULE_AUTHOR("Márton Németh "); MODULE_DESCRIPTION("Clevo mail LED driver"); MODULE_LICENSE("GPL"); -static unsigned int __initdata nodetect; +static bool __initdata nodetect; module_param_named(nodetect, nodetect, bool, 0); MODULE_PARM_DESC(nodetect, "Skip DMI hardware detection"); diff --git a/drivers/leds/leds-ss4200.c b/drivers/leds/leds-ss4200.c index 614ebebaaa2..57371e1485a 100644 --- a/drivers/leds/leds-ss4200.c +++ b/drivers/leds/leds-ss4200.c @@ -79,7 +79,7 @@ static int __init ss4200_led_dmi_callback(const struct dmi_system_id *id) return 1; } -static unsigned int __initdata nodetect; +static bool __initdata nodetect; module_param_named(nodetect, nodetect, bool, 0); MODULE_PARM_DESC(nodetect, "Skip DMI-based hardware detection"); diff --git a/drivers/macintosh/ams/ams-core.c b/drivers/macintosh/ams/ams-core.c index 399beb1638d..5c6a2d87656 100644 --- a/drivers/macintosh/ams/ams-core.c +++ b/drivers/macintosh/ams/ams-core.c @@ -31,7 +31,7 @@ /* There is only one motion sensor per machine */ struct ams ams_info; -static unsigned int verbose; +static bool verbose; module_param(verbose, bool, 0644); MODULE_PARM_DESC(verbose, "Show free falls and shocks in kernel output"); diff --git a/drivers/macintosh/ams/ams-input.c b/drivers/macintosh/ams/ams-input.c index 8a712392cd3..b27e530a87a 100644 --- a/drivers/macintosh/ams/ams-input.c +++ b/drivers/macintosh/ams/ams-input.c @@ -19,11 +19,11 @@ #include "ams.h" -static unsigned int joystick; +static bool joystick; module_param(joystick, bool, S_IRUGO); MODULE_PARM_DESC(joystick, "Enable the input class device on module load"); -static unsigned int invert; +static bool invert; module_param(invert, bool, S_IWUSR | S_IRUGO); MODULE_PARM_DESC(invert, "Invert input data on X and Y axis"); diff --git a/drivers/macintosh/therm_adt746x.c b/drivers/macintosh/therm_adt746x.c index 02367308ff2..c60d025044e 100644 --- a/drivers/macintosh/therm_adt746x.c +++ b/drivers/macintosh/therm_adt746x.c @@ -52,7 +52,7 @@ static const char *sensor_location[3]; static int limit_adjust; static int fan_speed = -1; -static int verbose; +static bool verbose; MODULE_AUTHOR("Colin Leroy "); MODULE_DESCRIPTION("Driver for ADT746x thermostat in iBook G4 and " diff --git a/drivers/media/dvb/dvb-usb/af9005.c b/drivers/media/dvb/dvb-usb/af9005.c index bd51a764351..4fc024d7704 100644 --- a/drivers/media/dvb/dvb-usb/af9005.c +++ b/drivers/media/dvb/dvb-usb/af9005.c @@ -30,7 +30,7 @@ MODULE_PARM_DESC(debug, "set debugging level (1=info,xfer=2,rc=4,reg=8,i2c=16,fw=32 (or-able))." 
DVB_USB_DEBUG_STATUS); /* enable obnoxious led */ -int dvb_usb_af9005_led = 1; +bool dvb_usb_af9005_led = 1; module_param_named(led, dvb_usb_af9005_led, bool, 0644); MODULE_PARM_DESC(led, "enable led (default: 1)."); diff --git a/drivers/media/dvb/dvb-usb/af9005.h b/drivers/media/dvb/dvb-usb/af9005.h index c71c77bd7f4..6a2bf3de845 100644 --- a/drivers/media/dvb/dvb-usb/af9005.h +++ b/drivers/media/dvb/dvb-usb/af9005.h @@ -35,7 +35,7 @@ extern int dvb_usb_af9005_debug; #define deb_i2c(args...) dprintk(dvb_usb_af9005_debug,0x10,args) #define deb_fw(args...) dprintk(dvb_usb_af9005_debug,0x20,args) -extern int dvb_usb_af9005_led; +extern bool dvb_usb_af9005_led; /* firmware */ #define FW_BULKOUT_SIZE 250 diff --git a/drivers/media/radio/radio-gemtek.c b/drivers/media/radio/radio-gemtek.c index edadc8449a3..36ce0611c03 100644 --- a/drivers/media/radio/radio-gemtek.c +++ b/drivers/media/radio/radio-gemtek.c @@ -47,11 +47,11 @@ MODULE_VERSION("0.0.4"); #endif static int io = CONFIG_RADIO_GEMTEK_PORT; -static int probe = CONFIG_RADIO_GEMTEK_PROBE; -static int hardmute; -static int shutdown = 1; -static int keepmuted = 1; -static int initmute = 1; +static bool probe = CONFIG_RADIO_GEMTEK_PROBE; +static bool hardmute; +static bool shutdown = 1; +static bool keepmuted = 1; +static bool initmute = 1; static int radio_nr = -1; module_param(io, int, 0444); diff --git a/drivers/media/radio/radio-miropcm20.c b/drivers/media/radio/radio-miropcm20.c index 3fb76e3834c..87c1ee13b05 100644 --- a/drivers/media/radio/radio-miropcm20.c +++ b/drivers/media/radio/radio-miropcm20.c @@ -23,7 +23,7 @@ static int radio_nr = -1; module_param(radio_nr, int, 0); MODULE_PARM_DESC(radio_nr, "Set radio device number (/dev/radioX). Default: -1 (autodetect)"); -static int mono; +static bool mono; module_param(mono, bool, 0); MODULE_PARM_DESC(mono, "Force tuner into mono mode."); diff --git a/drivers/media/rc/lirc_dev.c b/drivers/media/rc/lirc_dev.c index 27997a9ceb0..ca12d3289bf 100644 --- a/drivers/media/rc/lirc_dev.c +++ b/drivers/media/rc/lirc_dev.c @@ -38,7 +38,7 @@ #include #include -static int debug; +static bool debug; #define IRCTL_DEV_NAME "BaseRemoteCtl" #define NOPLUG -1 diff --git a/drivers/media/rc/mceusb.c b/drivers/media/rc/mceusb.c index 20bb12d6fbb..21105bf9594 100644 --- a/drivers/media/rc/mceusb.c +++ b/drivers/media/rc/mceusb.c @@ -156,9 +156,9 @@ /* module parameters */ #ifdef CONFIG_USB_DEBUG -static int debug = 1; +static bool debug = 1; #else -static int debug; +static bool debug; #endif #define mce_dbg(dev, fmt, ...) 
\ diff --git a/drivers/media/rc/streamzap.c b/drivers/media/rc/streamzap.c index b1d29d09eea..d6f4bfe0939 100644 --- a/drivers/media/rc/streamzap.c +++ b/drivers/media/rc/streamzap.c @@ -43,9 +43,9 @@ #define DRIVER_DESC "Streamzap Remote Control driver" #ifdef CONFIG_USB_DEBUG -static int debug = 1; +static bool debug = 1; #else -static int debug; +static bool debug; #endif #define USB_STREAMZAP_VENDOR_ID 0x0e9c diff --git a/drivers/media/rc/winbond-cir.c b/drivers/media/rc/winbond-cir.c index e7f7a57bf68..b09c5fae489 100644 --- a/drivers/media/rc/winbond-cir.c +++ b/drivers/media/rc/winbond-cir.c @@ -226,11 +226,11 @@ module_param(protocol, uint, 0444); MODULE_PARM_DESC(protocol, "IR protocol to use for the power-on command " "(0 = RC5, 1 = NEC, 2 = RC6A, default)"); -static int invert; /* default = 0 */ +static bool invert; /* default = 0 */ module_param(invert, bool, 0444); MODULE_PARM_DESC(invert, "Invert the signal from the IR receiver"); -static int txandrx; /* default = 0 */ +static bool txandrx; /* default = 0 */ module_param(txandrx, bool, 0444); MODULE_PARM_DESC(invert, "Allow simultaneous TX and RX"); diff --git a/drivers/media/video/c-qcam.c b/drivers/media/video/c-qcam.c index cd8ff047318..fda32f52554 100644 --- a/drivers/media/video/c-qcam.c +++ b/drivers/media/video/c-qcam.c @@ -72,7 +72,7 @@ struct qcam { static int parport[MAX_CAMS] = { [1 ... MAX_CAMS-1] = -1 }; static int probe = 2; -static int force_rgb; +static bool force_rgb; static int video_nr = -1; /* FIXME: parport=auto would never have worked, surely? --RR */ diff --git a/drivers/media/video/cs5345.c b/drivers/media/video/cs5345.c index 5909f2557ab..1d64af9adf7 100644 --- a/drivers/media/video/cs5345.c +++ b/drivers/media/video/cs5345.c @@ -31,7 +31,7 @@ MODULE_DESCRIPTION("i2c device driver for cs5345 Audio ADC"); MODULE_AUTHOR("Hans Verkuil"); MODULE_LICENSE("GPL"); -static int debug; +static bool debug; module_param(debug, bool, 0644); diff --git a/drivers/media/video/cs53l32a.c b/drivers/media/video/cs53l32a.c index d93e5ab45fd..51c5b9ad67d 100644 --- a/drivers/media/video/cs53l32a.c +++ b/drivers/media/video/cs53l32a.c @@ -35,7 +35,7 @@ MODULE_DESCRIPTION("i2c device driver for cs53l32a Audio ADC"); MODULE_AUTHOR("Martin Vaughan"); MODULE_LICENSE("GPL"); -static int debug; +static bool debug; module_param(debug, bool, 0644); diff --git a/drivers/media/video/cx18/cx18-driver.c b/drivers/media/video/cx18/cx18-driver.c index c6ff32a6137..349bd9c2aff 100644 --- a/drivers/media/video/cx18/cx18-driver.c +++ b/drivers/media/video/cx18/cx18-driver.c @@ -75,7 +75,7 @@ static int radio[CX18_MAX_CARDS] = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }; static unsigned cardtype_c = 1; static unsigned tuner_c = 1; -static unsigned radio_c = 1; +static bool radio_c = 1; static char pal[] = "--"; static char secam[] = "--"; static char ntsc[] = "-"; diff --git a/drivers/media/video/cx25821/cx25821-alsa.c b/drivers/media/video/cx25821/cx25821-alsa.c index 09e99de5fd2..58be4f3bb3c 100644 --- a/drivers/media/video/cx25821/cx25821-alsa.c +++ b/drivers/media/video/cx25821/cx25821-alsa.c @@ -102,7 +102,7 @@ struct cx25821_audio_dev { static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX; /* Index 0-MAX */ static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR; /* ID for this card */ -static int enable[SNDRV_CARDS] = { 1, [1 ... (SNDRV_CARDS - 1)] = 1 }; +static bool enable[SNDRV_CARDS] = { 1, [1 ... 
(SNDRV_CARDS - 1)] = 1 }; module_param_array(enable, bool, NULL, 0444); MODULE_PARM_DESC(enable, "Enable cx25821 soundcard. default enabled."); diff --git a/drivers/media/video/cx88/cx88-alsa.c b/drivers/media/video/cx88/cx88-alsa.c index 68d1240f493..04bf6627d36 100644 --- a/drivers/media/video/cx88/cx88-alsa.c +++ b/drivers/media/video/cx88/cx88-alsa.c @@ -96,7 +96,7 @@ typedef struct cx88_audio_dev snd_cx88_card_t; static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX; /* Index 0-MAX */ static const char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR; /* ID for this card */ -static int enable[SNDRV_CARDS] = {1, [1 ... (SNDRV_CARDS - 1)] = 1}; +static bool enable[SNDRV_CARDS] = {1, [1 ... (SNDRV_CARDS - 1)] = 1}; module_param_array(enable, bool, NULL, 0444); MODULE_PARM_DESC(enable, "Enable cx88x soundcard. default enabled."); diff --git a/drivers/media/video/gspca/m5602/m5602_core.c b/drivers/media/video/gspca/m5602/m5602_core.c index 9fe3816b2aa..0c449367543 100644 --- a/drivers/media/video/gspca/m5602/m5602_core.c +++ b/drivers/media/video/gspca/m5602/m5602_core.c @@ -27,8 +27,8 @@ /* Kernel module parameters */ int force_sensor; -static int dump_bridge; -int dump_sensor; +static bool dump_bridge; +bool dump_sensor; static const struct usb_device_id m5602_table[] = { {USB_DEVICE(0x0402, 0x5602)}, diff --git a/drivers/media/video/gspca/m5602/m5602_mt9m111.h b/drivers/media/video/gspca/m5602/m5602_mt9m111.h index b1f0c492036..8c672b5c8c6 100644 --- a/drivers/media/video/gspca/m5602/m5602_mt9m111.h +++ b/drivers/media/video/gspca/m5602/m5602_mt9m111.h @@ -106,7 +106,7 @@ /* Kernel module parameters */ extern int force_sensor; -extern int dump_sensor; +extern bool dump_sensor; int mt9m111_probe(struct sd *sd); int mt9m111_init(struct sd *sd); diff --git a/drivers/media/video/gspca/m5602/m5602_ov7660.h b/drivers/media/video/gspca/m5602/m5602_ov7660.h index 2efd607987e..2b6a13b508f 100644 --- a/drivers/media/video/gspca/m5602/m5602_ov7660.h +++ b/drivers/media/video/gspca/m5602/m5602_ov7660.h @@ -86,7 +86,7 @@ /* Kernel module parameters */ extern int force_sensor; -extern int dump_sensor; +extern bool dump_sensor; int ov7660_probe(struct sd *sd); int ov7660_init(struct sd *sd); diff --git a/drivers/media/video/gspca/m5602/m5602_ov9650.h b/drivers/media/video/gspca/m5602/m5602_ov9650.h index da9a129b739..f7aa5bf6898 100644 --- a/drivers/media/video/gspca/m5602/m5602_ov9650.h +++ b/drivers/media/video/gspca/m5602/m5602_ov9650.h @@ -135,7 +135,7 @@ /* Kernel module parameters */ extern int force_sensor; -extern int dump_sensor; +extern bool dump_sensor; int ov9650_probe(struct sd *sd); int ov9650_init(struct sd *sd); diff --git a/drivers/media/video/gspca/m5602/m5602_po1030.h b/drivers/media/video/gspca/m5602/m5602_po1030.h index 33835959639..81a2bcb88fe 100644 --- a/drivers/media/video/gspca/m5602/m5602_po1030.h +++ b/drivers/media/video/gspca/m5602/m5602_po1030.h @@ -147,7 +147,7 @@ /* Kernel module parameters */ extern int force_sensor; -extern int dump_sensor; +extern bool dump_sensor; int po1030_probe(struct sd *sd); int po1030_init(struct sd *sd); diff --git a/drivers/media/video/gspca/m5602/m5602_s5k4aa.h b/drivers/media/video/gspca/m5602/m5602_s5k4aa.h index 8cc7a3f6da7..8e0035e731c 100644 --- a/drivers/media/video/gspca/m5602/m5602_s5k4aa.h +++ b/drivers/media/video/gspca/m5602/m5602_s5k4aa.h @@ -65,7 +65,7 @@ /* Kernel module parameters */ extern int force_sensor; -extern int dump_sensor; +extern bool dump_sensor; int s5k4aa_probe(struct sd *sd); int s5k4aa_init(struct sd *sd); diff --git 
a/drivers/media/video/gspca/m5602/m5602_s5k83a.h b/drivers/media/video/gspca/m5602/m5602_s5k83a.h index 80a63a236e2..79952247b53 100644 --- a/drivers/media/video/gspca/m5602/m5602_s5k83a.h +++ b/drivers/media/video/gspca/m5602/m5602_s5k83a.h @@ -41,7 +41,7 @@ /* Kernel module parameters */ extern int force_sensor; -extern int dump_sensor; +extern bool dump_sensor; int s5k83a_probe(struct sd *sd); int s5k83a_init(struct sd *sd); diff --git a/drivers/media/video/gspca/stv06xx/stv06xx.c b/drivers/media/video/gspca/stv06xx/stv06xx.c index 0ab425fbea9..6f878f6c6e9 100644 --- a/drivers/media/video/gspca/stv06xx/stv06xx.c +++ b/drivers/media/video/gspca/stv06xx/stv06xx.c @@ -36,8 +36,8 @@ MODULE_AUTHOR("Erik Andrén"); MODULE_DESCRIPTION("STV06XX USB Camera Driver"); MODULE_LICENSE("GPL"); -static int dump_bridge; -static int dump_sensor; +static bool dump_bridge; +static bool dump_sensor; int stv06xx_write_bridge(struct sd *sd, u16 address, u16 i2c_data) { diff --git a/drivers/media/video/hdpvr/hdpvr-core.c b/drivers/media/video/hdpvr/hdpvr-core.c index 3f1a5b1beeb..e5eb56a5b61 100644 --- a/drivers/media/video/hdpvr/hdpvr-core.c +++ b/drivers/media/video/hdpvr/hdpvr-core.c @@ -49,7 +49,7 @@ module_param(default_audio_input, uint, S_IRUGO|S_IWUSR); MODULE_PARM_DESC(default_audio_input, "default audio input: 0=RCA back / " "1=RCA front / 2=S/PDIF"); -static int boost_audio; +static bool boost_audio; module_param(boost_audio, bool, S_IRUGO|S_IWUSR); MODULE_PARM_DESC(boost_audio, "boost the audio signal"); diff --git a/drivers/media/video/ivtv/ivtv-driver.c b/drivers/media/video/ivtv/ivtv-driver.c index 41108a9a195..544af91cbdc 100644 --- a/drivers/media/video/ivtv/ivtv-driver.c +++ b/drivers/media/video/ivtv/ivtv-driver.c @@ -99,7 +99,7 @@ static int i2c_clock_period[IVTV_MAX_CARDS] = { -1, -1, -1, -1, -1, -1, -1, -1, static unsigned int cardtype_c = 1; static unsigned int tuner_c = 1; -static unsigned int radio_c = 1; +static bool radio_c = 1; static unsigned int i2c_clock_period_c = 1; static char pal[] = "---"; static char secam[] = "--"; diff --git a/drivers/media/video/ivtv/ivtvfb.c b/drivers/media/video/ivtv/ivtvfb.c index 6b7c9c82333..d0fbfcf7133 100644 --- a/drivers/media/video/ivtv/ivtvfb.c +++ b/drivers/media/video/ivtv/ivtvfb.c @@ -58,7 +58,7 @@ /* card parameters */ static int ivtvfb_card_id = -1; static int ivtvfb_debug = 0; -static int osd_laced; +static bool osd_laced; static int osd_depth; static int osd_upper; static int osd_left; diff --git a/drivers/media/video/marvell-ccic/mcam-core.c b/drivers/media/video/marvell-ccic/mcam-core.c index 80ec64d2d6d..2c8fc0f6d69 100644 --- a/drivers/media/video/marvell-ccic/mcam-core.c +++ b/drivers/media/video/marvell-ccic/mcam-core.c @@ -51,7 +51,7 @@ static int delivered; * sense. 
*/ -static int alloc_bufs_at_read; +static bool alloc_bufs_at_read; module_param(alloc_bufs_at_read, bool, 0444); MODULE_PARM_DESC(alloc_bufs_at_read, "Non-zero value causes DMA buffers to be allocated when the " @@ -73,11 +73,11 @@ MODULE_PARM_DESC(dma_buf_size, "parameters require larger buffers, an attempt to reallocate " "will be made."); #else /* MCAM_MODE_VMALLOC */ -static const int alloc_bufs_at_read = 0; +static const bool alloc_bufs_at_read = 0; static const int n_dma_bufs = 3; /* Used by S/G_PARM */ #endif /* MCAM_MODE_VMALLOC */ -static int flip; +static bool flip; module_param(flip, bool, 0444); MODULE_PARM_DESC(flip, "If set, the sensor will be instructed to flip the image " diff --git a/drivers/media/video/msp3400-driver.c b/drivers/media/video/msp3400-driver.c index d0f53885728..d7cd0f633f6 100644 --- a/drivers/media/video/msp3400-driver.c +++ b/drivers/media/video/msp3400-driver.c @@ -69,12 +69,12 @@ MODULE_LICENSE("GPL"); /* module parameters */ static int opmode = OPMODE_AUTO; int msp_debug; /* msp_debug output */ -int msp_once; /* no continuous stereo monitoring */ -int msp_amsound; /* hard-wire AM sound at 6.5 Hz (france), +bool msp_once; /* no continuous stereo monitoring */ +bool msp_amsound; /* hard-wire AM sound at 6.5 Hz (france), the autoscan seems work well only with FM... */ int msp_standard = 1; /* Override auto detect of audio msp_standard, if needed. */ -int msp_dolby; +bool msp_dolby; int msp_stereo_thresh = 0x190; /* a2 threshold for stereo/bilingual (msp34xxg only) 0x00a0-0x03c0 */ diff --git a/drivers/media/video/msp3400-driver.h b/drivers/media/video/msp3400-driver.h index 831e8db4368..fbe5e0715f9 100644 --- a/drivers/media/video/msp3400-driver.h +++ b/drivers/media/video/msp3400-driver.h @@ -44,10 +44,10 @@ /* module parameters */ extern int msp_debug; -extern int msp_once; -extern int msp_amsound; +extern bool msp_once; +extern bool msp_amsound; extern int msp_standard; -extern int msp_dolby; +extern bool msp_dolby; extern int msp_stereo_thresh; struct msp_state { diff --git a/drivers/media/video/omap/omap_vout.c b/drivers/media/video/omap/omap_vout.c index ee0d0b39cd1..0de598bf66b 100644 --- a/drivers/media/video/omap/omap_vout.c +++ b/drivers/media/video/omap/omap_vout.c @@ -70,9 +70,9 @@ static u32 video1_numbuffers = 3; static u32 video2_numbuffers = 3; static u32 video1_bufsize = OMAP_VOUT_MAX_BUF_SIZE; static u32 video2_bufsize = OMAP_VOUT_MAX_BUF_SIZE; -static u32 vid1_static_vrfb_alloc; -static u32 vid2_static_vrfb_alloc; -static int debug; +static bool vid1_static_vrfb_alloc; +static bool vid2_static_vrfb_alloc; +static bool debug; /* Module parameters */ module_param(video1_numbuffers, uint, S_IRUGO); diff --git a/drivers/media/video/omap/omap_vout_vrfb.c b/drivers/media/video/omap/omap_vout_vrfb.c index ebebcac4922..4be26abf6ce 100644 --- a/drivers/media/video/omap/omap_vout_vrfb.c +++ b/drivers/media/video/omap/omap_vout_vrfb.c @@ -84,7 +84,7 @@ void omap_vout_free_vrfb_buffers(struct omap_vout_device *vout) } int omap_vout_setup_vrfb_bufs(struct platform_device *pdev, int vid_num, - u32 static_vrfb_allocation) + bool static_vrfb_allocation) { int ret = 0, i, j; struct omap_vout_device *vout; diff --git a/drivers/media/video/ov7670.c b/drivers/media/video/ov7670.c index 8aa05853128..6a564964853 100644 --- a/drivers/media/video/ov7670.c +++ b/drivers/media/video/ov7670.c @@ -25,7 +25,7 @@ MODULE_AUTHOR("Jonathan Corbet "); MODULE_DESCRIPTION("A low-level driver for OmniVision ov7670 sensors"); MODULE_LICENSE("GPL"); -static int debug; 
+static bool debug; module_param(debug, bool, 0644); MODULE_PARM_DESC(debug, "Debug level (0-1)"); diff --git a/drivers/media/video/saa7115.c b/drivers/media/video/saa7115.c index 5cfdbc78b91..0ef5484696b 100644 --- a/drivers/media/video/saa7115.c +++ b/drivers/media/video/saa7115.c @@ -57,7 +57,7 @@ MODULE_AUTHOR( "Maxim Yevtyushkin, Kevin Thayer, Chris Kennedy, " "Hans Verkuil, Mauro Carvalho Chehab"); MODULE_LICENSE("GPL"); -static int debug; +static bool debug; module_param(debug, bool, 0644); MODULE_PARM_DESC(debug, "Debug level (0-1)"); diff --git a/drivers/media/video/stk-webcam.c b/drivers/media/video/stk-webcam.c index b7fb5a5cad7..3c61aec517a 100644 --- a/drivers/media/video/stk-webcam.c +++ b/drivers/media/video/stk-webcam.c @@ -38,11 +38,11 @@ #include "stk-webcam.h" -static int hflip = 1; +static bool hflip = 1; module_param(hflip, bool, 0444); MODULE_PARM_DESC(hflip, "Horizontal image flip (mirror). Defaults to 1"); -static int vflip = 1; +static bool vflip = 1; module_param(vflip, bool, 0444); MODULE_PARM_DESC(vflip, "Vertical image flip. Defaults to 1"); diff --git a/drivers/media/video/tm6000/tm6000-alsa.c b/drivers/media/video/tm6000/tm6000-alsa.c index 7d675c72fd4..bb2047c1035 100644 --- a/drivers/media/video/tm6000/tm6000-alsa.c +++ b/drivers/media/video/tm6000/tm6000-alsa.c @@ -42,7 +42,7 @@ static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX; /* Index 0-MAX */ -static int enable[SNDRV_CARDS] = {1, [1 ... (SNDRV_CARDS - 1)] = 1}; +static bool enable[SNDRV_CARDS] = {1, [1 ... (SNDRV_CARDS - 1)] = 1}; module_param_array(enable, bool, NULL, 0444); MODULE_PARM_DESC(enable, "Enable tm6000x soundcard. default enabled."); diff --git a/drivers/media/video/tvp514x.c b/drivers/media/video/tvp514x.c index 926f0393115..dd26cacd055 100644 --- a/drivers/media/video/tvp514x.c +++ b/drivers/media/video/tvp514x.c @@ -52,7 +52,7 @@ #define LOCK_RETRY_DELAY (200) /* Debug functions */ -static int debug; +static bool debug; module_param(debug, bool, 0644); MODULE_PARM_DESC(debug, "Debug level (0-1)"); diff --git a/drivers/media/video/tvp7002.c b/drivers/media/video/tvp7002.c index 7875e80cb2f..236c559d5f5 100644 --- a/drivers/media/video/tvp7002.c +++ b/drivers/media/video/tvp7002.c @@ -63,7 +63,7 @@ MODULE_LICENSE("GPL"); #define TVP7002_CL_MASK 0x0f /* Debug functions */ -static int debug; +static bool debug; module_param(debug, bool, 0644); MODULE_PARM_DESC(debug, "Debug level (0-2)"); diff --git a/drivers/media/video/upd64083.c b/drivers/media/video/upd64083.c index 9bbe61700fd..65d065aa609 100644 --- a/drivers/media/video/upd64083.c +++ b/drivers/media/video/upd64083.c @@ -34,7 +34,7 @@ MODULE_DESCRIPTION("uPD64083 driver"); MODULE_AUTHOR("T. 
Adachi, Takeru KOMORIYA, Hans Verkuil"); MODULE_LICENSE("GPL"); -static int debug; +static bool debug; module_param(debug, bool, 0644); MODULE_PARM_DESC(debug, "Debug level (0-1)"); diff --git a/drivers/media/video/via-camera.c b/drivers/media/video/via-camera.c index cbf13d09b4a..bfae41ba53c 100644 --- a/drivers/media/video/via-camera.c +++ b/drivers/media/video/via-camera.c @@ -34,13 +34,13 @@ MODULE_AUTHOR("Jonathan Corbet "); MODULE_DESCRIPTION("VIA framebuffer-based camera controller driver"); MODULE_LICENSE("GPL"); -static int flip_image; +static bool flip_image; module_param(flip_image, bool, 0444); MODULE_PARM_DESC(flip_image, "If set, the sensor will be instructed to flip the image " "vertically."); -static int override_serial; +static bool override_serial; module_param(override_serial, bool, 0444); MODULE_PARM_DESC(override_serial, "The camera driver will normally refuse to load if " diff --git a/drivers/media/video/zoran/zoran_device.c b/drivers/media/video/zoran/zoran_device.c index e8a27844bf3..e86173bd132 100644 --- a/drivers/media/video/zoran/zoran_device.c +++ b/drivers/media/video/zoran/zoran_device.c @@ -57,7 +57,7 @@ ZR36057_ISR_GIRQ1 | \ ZR36057_ISR_JPEGRepIRQ ) -static int lml33dpath; /* default = 0 +static bool lml33dpath; /* default = 0 * 1 will use digital path in capture * mode instead of analog. It can be * used for picture adjustments using diff --git a/drivers/media/video/zoran/zr36060.c b/drivers/media/video/zoran/zr36060.c index 5e4f57cbf31..f08546fe223 100644 --- a/drivers/media/video/zoran/zr36060.c +++ b/drivers/media/video/zoran/zr36060.c @@ -50,7 +50,7 @@ /* amount of chips attached via this driver */ static int zr36060_codecs; -static int low_bitrate; +static bool low_bitrate; module_param(low_bitrate, bool, 0); MODULE_PARM_DESC(low_bitrate, "Buz compatibility option, halves bitrate"); diff --git a/drivers/memstick/host/jmb38x_ms.c b/drivers/memstick/host/jmb38x_ms.c index 6ce70e9615d..5319e9b6584 100644 --- a/drivers/memstick/host/jmb38x_ms.c +++ b/drivers/memstick/host/jmb38x_ms.c @@ -21,7 +21,7 @@ #define DRIVER_NAME "jmb38x_ms" -static int no_dma; +static bool no_dma; module_param(no_dma, bool, 0644); enum { diff --git a/drivers/memstick/host/r592.c b/drivers/memstick/host/r592.c index 668f5c6a039..29b2172ae18 100644 --- a/drivers/memstick/host/r592.c +++ b/drivers/memstick/host/r592.c @@ -23,7 +23,7 @@ #include #include "r592.h" -static int r592_enable_dma = 1; +static bool r592_enable_dma = 1; static int debug; static const char *tpc_names[] = { diff --git a/drivers/memstick/host/tifm_ms.c b/drivers/memstick/host/tifm_ms.c index b7aacf47703..6902b83eb1b 100644 --- a/drivers/memstick/host/tifm_ms.c +++ b/drivers/memstick/host/tifm_ms.c @@ -22,7 +22,7 @@ #define DRIVER_NAME "tifm_ms" -static int no_dma; +static bool no_dma; module_param(no_dma, bool, 0644); /* diff --git a/drivers/misc/iwmc3200top/main.c b/drivers/misc/iwmc3200top/main.c index b1f4563be9a..701eb600b12 100644 --- a/drivers/misc/iwmc3200top/main.c +++ b/drivers/misc/iwmc3200top/main.c @@ -376,20 +376,20 @@ static int blocks; module_param(blocks, int, 0604); MODULE_PARM_DESC(blocks, "max_blocks_to_send"); -static int dump; +static bool dump; module_param(dump, bool, 0604); MODULE_PARM_DESC(dump, "dump_hex_content"); -static int jump = 1; +static bool jump = 1; module_param(jump, bool, 0604); -static int direct = 1; +static bool direct = 1; module_param(direct, bool, 0604); -static int checksum = 1; +static bool checksum = 1; module_param(checksum, bool, 0604); -static int fw_download = 
1; +static bool fw_download = 1; module_param(fw_download, bool, 0604); static int block_size = IWMC_SDIO_BLK_SIZE; @@ -398,7 +398,7 @@ module_param(block_size, int, 0404); static int download_trans_blks = IWMC_DEFAULT_TR_BLK; module_param(download_trans_blks, int, 0604); -static int rubbish_barker; +static bool rubbish_barker; module_param(rubbish_barker, bool, 0604); #ifdef CONFIG_IWMC3200TOP_DEBUG diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c index 950b97d7412..75d7d7e1736 100644 --- a/drivers/mmc/core/core.c +++ b/drivers/mmc/core/core.c @@ -48,7 +48,7 @@ static struct workqueue_struct *workqueue; * performance cost, and for other reasons may not always be desired. * So we allow it it to be disabled. */ -int use_spi_crc = 1; +bool use_spi_crc = 1; module_param(use_spi_crc, bool, 0); /* @@ -58,9 +58,9 @@ module_param(use_spi_crc, bool, 0); * overridden if necessary. */ #ifdef CONFIG_MMC_UNSAFE_RESUME -int mmc_assume_removable; +bool mmc_assume_removable; #else -int mmc_assume_removable = 1; +bool mmc_assume_removable = 1; #endif EXPORT_SYMBOL(mmc_assume_removable); module_param_named(removable, mmc_assume_removable, bool, 0644); diff --git a/drivers/mmc/core/core.h b/drivers/mmc/core/core.h index 14664f1fb16..afa6bd2b7b7 100644 --- a/drivers/mmc/core/core.h +++ b/drivers/mmc/core/core.h @@ -64,7 +64,7 @@ int mmc_attach_sd(struct mmc_host *host); int mmc_attach_sdio(struct mmc_host *host); /* Module parameters */ -extern int use_spi_crc; +extern bool use_spi_crc; /* Debugfs information for hosts and cards */ void mmc_add_host_debugfs(struct mmc_host *host); diff --git a/drivers/mmc/host/tifm_sd.c b/drivers/mmc/host/tifm_sd.c index f70d04664ca..69d249f51d6 100644 --- a/drivers/mmc/host/tifm_sd.c +++ b/drivers/mmc/host/tifm_sd.c @@ -22,8 +22,8 @@ #define DRIVER_NAME "tifm_sd" #define DRIVER_VERSION "0.8" -static int no_dma = 0; -static int fixed_timeout = 0; +static bool no_dma = 0; +static bool fixed_timeout = 0; module_param(no_dma, bool, 0644); module_param(fixed_timeout, bool, 0644); diff --git a/drivers/mmc/host/vub300.c b/drivers/mmc/host/vub300.c index 2ec978bc32b..3135a1a5d75 100644 --- a/drivers/mmc/host/vub300.c +++ b/drivers/mmc/host/vub300.c @@ -223,25 +223,25 @@ enum SD_RESPONSE_TYPE { #define FUN(c) (0x000007 & (c->arg>>28)) #define REG(c) (0x01FFFF & (c->arg>>9)) -static int limit_speed_to_24_MHz; +static bool limit_speed_to_24_MHz; module_param(limit_speed_to_24_MHz, bool, 0644); MODULE_PARM_DESC(limit_speed_to_24_MHz, "Limit Max SDIO Clock Speed to 24 MHz"); -static int pad_input_to_usb_pkt; +static bool pad_input_to_usb_pkt; module_param(pad_input_to_usb_pkt, bool, 0644); MODULE_PARM_DESC(pad_input_to_usb_pkt, "Pad USB data input transfers to whole USB Packet"); -static int disable_offload_processing; +static bool disable_offload_processing; module_param(disable_offload_processing, bool, 0644); MODULE_PARM_DESC(disable_offload_processing, "Disable Offload Processing"); -static int force_1_bit_data_xfers; +static bool force_1_bit_data_xfers; module_param(force_1_bit_data_xfers, bool, 0644); MODULE_PARM_DESC(force_1_bit_data_xfers, "Force SDIO Data Transfers to 1-bit Mode"); -static int force_polling_for_irqs; +static bool force_polling_for_irqs; module_param(force_polling_for_irqs, bool, 0644); MODULE_PARM_DESC(force_polling_for_irqs, "Force Polling for SDIO interrupts"); diff --git a/drivers/mtd/nand/pxa3xx_nand.c b/drivers/mtd/nand/pxa3xx_nand.c index 8544d6bf50a..5c3d719c37e 100644 --- a/drivers/mtd/nand/pxa3xx_nand.c +++ 
b/drivers/mtd/nand/pxa3xx_nand.c @@ -185,7 +185,7 @@ struct pxa3xx_nand_info { uint32_t ndcb2; }; -static int use_dma = 1; +static bool use_dma = 1; module_param(use_dma, bool, 0444); MODULE_PARM_DESC(use_dma, "enable DMA for data transferring to/from NAND HW"); diff --git a/drivers/mtd/nand/r852.c b/drivers/mtd/nand/r852.c index f20f393bfda..769a4e096b3 100644 --- a/drivers/mtd/nand/r852.c +++ b/drivers/mtd/nand/r852.c @@ -22,7 +22,7 @@ #include "r852.h" -static int r852_enable_dma = 1; +static bool r852_enable_dma = 1; module_param(r852_enable_dma, bool, S_IRUGO); MODULE_PARM_DESC(r852_enable_dma, "Enable usage of the DMA (default)"); diff --git a/drivers/parport/parport_ip32.c b/drivers/parport/parport_ip32.c index 0dc34f12f92..d4716273651 100644 --- a/drivers/parport/parport_ip32.c +++ b/drivers/parport/parport_ip32.c @@ -135,7 +135,7 @@ #define PARPORT_IP32_ENABLE_EPP (1U << 3) #define PARPORT_IP32_ENABLE_ECP (1U << 4) static unsigned int features = ~0U; -static int verbose_probing = DEFAULT_VERBOSE_PROBING; +static bool verbose_probing = DEFAULT_VERBOSE_PROBING; /* We do not support more than one port. */ static struct parport *this_port = NULL; diff --git a/drivers/pci/hotplug/acpi_pcihp.c b/drivers/pci/hotplug/acpi_pcihp.c index 095f29e1373..2a47e82821d 100644 --- a/drivers/pci/hotplug/acpi_pcihp.c +++ b/drivers/pci/hotplug/acpi_pcihp.c @@ -44,7 +44,7 @@ #define METHOD_NAME__SUN "_SUN" #define METHOD_NAME_OSHP "OSHP" -static int debug_acpi; +static bool debug_acpi; static acpi_status decode_type0_hpx_record(union acpi_object *record, struct hotplug_params *hpx) diff --git a/drivers/pci/hotplug/acpiphp_core.c b/drivers/pci/hotplug/acpiphp_core.c index efa9f2de51c..aa41631e9e0 100644 --- a/drivers/pci/hotplug/acpiphp_core.c +++ b/drivers/pci/hotplug/acpiphp_core.c @@ -47,7 +47,7 @@ /* name size which is used for entries in pcihpfs */ #define SLOT_NAME_SIZE 21 /* {_SUN} */ -static int debug; +static bool debug; int acpiphp_debug; /* local variables */ diff --git a/drivers/pci/hotplug/acpiphp_ibm.c b/drivers/pci/hotplug/acpiphp_ibm.c index e525263210e..c35e8ad6db0 100644 --- a/drivers/pci/hotplug/acpiphp_ibm.c +++ b/drivers/pci/hotplug/acpiphp_ibm.c @@ -43,7 +43,7 @@ #define DRIVER_AUTHOR "Irene Zubarev , Vernon Mauery " #define DRIVER_DESC "ACPI Hot Plug PCI Controller Driver IBM extension" -static int debug; +static bool debug; MODULE_AUTHOR(DRIVER_AUTHOR); MODULE_DESCRIPTION(DRIVER_DESC); diff --git a/drivers/pci/hotplug/cpcihp_zt5550.c b/drivers/pci/hotplug/cpcihp_zt5550.c index 41f6a8d79c8..6bf8d2ab164 100644 --- a/drivers/pci/hotplug/cpcihp_zt5550.c +++ b/drivers/pci/hotplug/cpcihp_zt5550.c @@ -57,8 +57,8 @@ #define warn(format, arg...) 
printk(KERN_WARNING "%s: " format "\n", MY_NAME , ## arg) /* local variables */ -static int debug; -static int poll; +static bool debug; +static bool poll; static struct cpci_hp_controller_ops zt5550_hpc_ops; static struct cpci_hp_controller zt5550_hpc; diff --git a/drivers/pci/hotplug/cpqphp_core.c b/drivers/pci/hotplug/cpqphp_core.c index f1ce99cceac..187a199da93 100644 --- a/drivers/pci/hotplug/cpqphp_core.c +++ b/drivers/pci/hotplug/cpqphp_core.c @@ -57,8 +57,8 @@ struct irq_routing_table *cpqhp_routing_table; static void __iomem *smbios_table; static void __iomem *smbios_start; static void __iomem *cpqhp_rom_start; -static int power_mode; -static int debug; +static bool power_mode; +static bool debug; static int initialized; #define DRIVER_VERSION "0.9.8" diff --git a/drivers/pci/hotplug/ibmphp_core.c b/drivers/pci/hotplug/ibmphp_core.c index d934dd4fa87..5506e0e8fbc 100644 --- a/drivers/pci/hotplug/ibmphp_core.c +++ b/drivers/pci/hotplug/ibmphp_core.c @@ -49,7 +49,7 @@ int ibmphp_debug; -static int debug; +static bool debug; module_param(debug, bool, S_IRUGO | S_IWUSR); MODULE_PARM_DESC (debug, "Debugging mode enabled or not"); MODULE_LICENSE ("GPL"); diff --git a/drivers/pci/hotplug/pci_hotplug_core.c b/drivers/pci/hotplug/pci_hotplug_core.c index 6d2eea93298..202f4a969eb 100644 --- a/drivers/pci/hotplug/pci_hotplug_core.c +++ b/drivers/pci/hotplug/pci_hotplug_core.c @@ -51,7 +51,7 @@ /* local variables */ -static int debug; +static bool debug; #define DRIVER_VERSION "0.5" #define DRIVER_AUTHOR "Greg Kroah-Hartman , Scott Murray " diff --git a/drivers/pci/hotplug/pciehp.h b/drivers/pci/hotplug/pciehp.h index 9a33fdde2d1..4b7cce1de6e 100644 --- a/drivers/pci/hotplug/pciehp.h +++ b/drivers/pci/hotplug/pciehp.h @@ -40,10 +40,10 @@ #define MY_NAME "pciehp" -extern int pciehp_poll_mode; +extern bool pciehp_poll_mode; extern int pciehp_poll_time; -extern int pciehp_debug; -extern int pciehp_force; +extern bool pciehp_debug; +extern bool pciehp_force; extern struct workqueue_struct *pciehp_wq; #define dbg(format, arg...) \ diff --git a/drivers/pci/hotplug/pciehp_core.c b/drivers/pci/hotplug/pciehp_core.c index b8c99d35ac9..365c6b96c64 100644 --- a/drivers/pci/hotplug/pciehp_core.c +++ b/drivers/pci/hotplug/pciehp_core.c @@ -38,10 +38,10 @@ #include /* Global variables */ -int pciehp_debug; -int pciehp_poll_mode; +bool pciehp_debug; +bool pciehp_poll_mode; int pciehp_poll_time; -int pciehp_force; +bool pciehp_force; struct workqueue_struct *pciehp_wq; #define DRIVER_VERSION "0.4" diff --git a/drivers/pci/hotplug/pcihp_skeleton.c b/drivers/pci/hotplug/pcihp_skeleton.c index 5175d9b26f0..b20ceaaa31f 100644 --- a/drivers/pci/hotplug/pcihp_skeleton.c +++ b/drivers/pci/hotplug/pcihp_skeleton.c @@ -59,7 +59,7 @@ static LIST_HEAD(slot_list); #define warn(format, arg...) printk(KERN_WARNING "%s: " format "\n", MY_NAME , ## arg) /* local variables */ -static int debug; +static bool debug; static int num_slots; #define DRIVER_VERSION "0.3" diff --git a/drivers/pci/hotplug/rpaphp.h b/drivers/pci/hotplug/rpaphp.h index 419919a87b0..df5677440a0 100644 --- a/drivers/pci/hotplug/rpaphp.h +++ b/drivers/pci/hotplug/rpaphp.h @@ -46,7 +46,7 @@ #define PRESENT 1 /* Card in slot */ #define MY_NAME "rpaphp" -extern int rpaphp_debug; +extern bool rpaphp_debug; #define dbg(format, arg...) 
\ do { \ if (rpaphp_debug) \ diff --git a/drivers/pci/hotplug/rpaphp_core.c b/drivers/pci/hotplug/rpaphp_core.c index 758adb5f47f..127d6e60018 100644 --- a/drivers/pci/hotplug/rpaphp_core.c +++ b/drivers/pci/hotplug/rpaphp_core.c @@ -37,7 +37,7 @@ /* and pci_do_scan_bus */ #include "rpaphp.h" -int rpaphp_debug; +bool rpaphp_debug; LIST_HEAD(rpaphp_slot_head); #define DRIVER_VERSION "0.1" diff --git a/drivers/pci/hotplug/shpchp.h b/drivers/pci/hotplug/shpchp.h index e0c90e643b5..ca64932e658 100644 --- a/drivers/pci/hotplug/shpchp.h +++ b/drivers/pci/hotplug/shpchp.h @@ -43,9 +43,9 @@ #define MY_NAME THIS_MODULE->name #endif -extern int shpchp_poll_mode; +extern bool shpchp_poll_mode; extern int shpchp_poll_time; -extern int shpchp_debug; +extern bool shpchp_debug; extern struct workqueue_struct *shpchp_wq; extern struct workqueue_struct *shpchp_ordered_wq; diff --git a/drivers/pci/hotplug/shpchp_core.c b/drivers/pci/hotplug/shpchp_core.c index dd7e0c51a33..7414fd9ad1d 100644 --- a/drivers/pci/hotplug/shpchp_core.c +++ b/drivers/pci/hotplug/shpchp_core.c @@ -36,8 +36,8 @@ #include "shpchp.h" /* Global variables */ -int shpchp_debug; -int shpchp_poll_mode; +bool shpchp_debug; +bool shpchp_poll_mode; int shpchp_poll_time; struct workqueue_struct *shpchp_wq; struct workqueue_struct *shpchp_ordered_wq; diff --git a/drivers/pci/pcie/aer/aer_inject.c b/drivers/pci/pcie/aer/aer_inject.c index 95489cd9a55..52229863e9f 100644 --- a/drivers/pci/pcie/aer/aer_inject.c +++ b/drivers/pci/pcie/aer/aer_inject.c @@ -28,7 +28,7 @@ #include "aerdrv.h" /* Override the existing corrected and uncorrected error masks */ -static int aer_mask_override; +static bool aer_mask_override; module_param(aer_mask_override, bool, 0); struct aer_error_inj { diff --git a/drivers/pci/pcie/aer/aerdrv_core.c b/drivers/pci/pcie/aer/aerdrv_core.c index 9674e9f30d4..0ca05353814 100644 --- a/drivers/pci/pcie/aer/aerdrv_core.c +++ b/drivers/pci/pcie/aer/aerdrv_core.c @@ -27,8 +27,8 @@ #include #include "aerdrv.h" -static int forceload; -static int nosourceid; +static bool forceload; +static bool nosourceid; module_param(forceload, bool, 0); module_param(nosourceid, bool, 0); diff --git a/drivers/pcmcia/yenta_socket.c b/drivers/pcmcia/yenta_socket.c index 9dc565c615b..849c0c11d2a 100644 --- a/drivers/pcmcia/yenta_socket.c +++ b/drivers/pcmcia/yenta_socket.c @@ -24,15 +24,15 @@ #include "yenta_socket.h" #include "i82365.h" -static int disable_clkrun; +static bool disable_clkrun; module_param(disable_clkrun, bool, 0444); MODULE_PARM_DESC(disable_clkrun, "If PC card doesn't function properly, please try this option"); -static int isa_probe = 1; +static bool isa_probe = 1; module_param(isa_probe, bool, 0444); MODULE_PARM_DESC(isa_probe, "If set ISA interrupts are probed (default). Set to N to disable probing"); -static int pwr_irqs_off; +static bool pwr_irqs_off; module_param(pwr_irqs_off, bool, 0644); MODULE_PARM_DESC(pwr_irqs_off, "Force IRQs off during power-on of slot. 
Use only when seeing IRQ storms!"); diff --git a/drivers/platform/x86/compal-laptop.c b/drivers/platform/x86/compal-laptop.c index 8877b836d27..d9673447832 100644 --- a/drivers/platform/x86/compal-laptop.c +++ b/drivers/platform/x86/compal-laptop.c @@ -189,7 +189,7 @@ struct compal_data{ /* =============== */ /* General globals */ /* =============== */ -static int force; +static bool force; module_param(force, bool, 0); MODULE_PARM_DESC(force, "Force driver load, ignore DMI data"); diff --git a/drivers/platform/x86/intel_oaktrail.c b/drivers/platform/x86/intel_oaktrail.c index 7f88c7923fc..6ee0b5c9093 100644 --- a/drivers/platform/x86/intel_oaktrail.c +++ b/drivers/platform/x86/intel_oaktrail.c @@ -95,7 +95,7 @@ #define OT_EC_BL_CONTROL_ON_DATA 0x1A -static int force; +static bool force; module_param(force, bool, 0); MODULE_PARM_DESC(force, "Force driver load, ignore DMI data"); diff --git a/drivers/platform/x86/msi-laptop.c b/drivers/platform/x86/msi-laptop.c index f204643c505..bb5132128b3 100644 --- a/drivers/platform/x86/msi-laptop.c +++ b/drivers/platform/x86/msi-laptop.c @@ -89,7 +89,7 @@ static int msi_laptop_resume(struct platform_device *device); #define MSI_STANDARD_EC_DEVICES_EXISTS_ADDRESS 0x2f -static int force; +static bool force; module_param(force, bool, 0); MODULE_PARM_DESC(force, "Force driver load, ignore DMI data"); diff --git a/drivers/platform/x86/samsung-laptop.c b/drivers/platform/x86/samsung-laptop.c index 09e26bfd464..fd73ea89b85 100644 --- a/drivers/platform/x86/samsung-laptop.c +++ b/drivers/platform/x86/samsung-laptop.c @@ -228,12 +228,12 @@ static struct platform_device *sdev; static struct rfkill *rfk; static bool has_stepping_quirk; -static int force; +static bool force; module_param(force, bool, 0); MODULE_PARM_DESC(force, "Disable the DMI check and forces the driver to be loaded"); -static int debug; +static bool debug; module_param(debug, bool, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(debug, "Debug enabled or not"); diff --git a/drivers/platform/x86/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c index 62533c105da..ea0c6075b72 100644 --- a/drivers/platform/x86/thinkpad_acpi.c +++ b/drivers/platform/x86/thinkpad_acpi.c @@ -378,13 +378,13 @@ static unsigned int bright_maxlvl; /* 0 = unknown */ #ifdef CONFIG_THINKPAD_ACPI_DEBUGFACILITIES static int dbg_wlswemul; -static int tpacpi_wlsw_emulstate; +static bool tpacpi_wlsw_emulstate; static int dbg_bluetoothemul; -static int tpacpi_bluetooth_emulstate; +static bool tpacpi_bluetooth_emulstate; static int dbg_wwanemul; -static int tpacpi_wwan_emulstate; +static bool tpacpi_wwan_emulstate; static int dbg_uwbemul; -static int tpacpi_uwb_emulstate; +static bool tpacpi_uwb_emulstate; #endif @@ -6444,7 +6444,7 @@ static struct ibm_struct brightness_driver_data = { static int alsa_index = ~((1 << (SNDRV_CARDS - 3)) - 1); /* last three slots */ static char *alsa_id = "ThinkPadEC"; -static int alsa_enable = SNDRV_DEFAULT_ENABLE1; +static bool alsa_enable = SNDRV_DEFAULT_ENABLE1; struct tpacpi_alsa_data { struct snd_card *card; @@ -6487,7 +6487,7 @@ static enum tpacpi_volume_access_mode volume_mode = TPACPI_VOL_MODE_MAX; static enum tpacpi_volume_capabilities volume_capabilities; -static int volume_control_allowed; +static bool volume_control_allowed; /* * Used to syncronize writers to TP_EC_AUDIO and @@ -7265,7 +7265,7 @@ enum fan_control_commands { * and also watchdog cmd */ }; -static int fan_control_allowed; +static bool fan_control_allowed; static enum fan_status_access_mode fan_status_access_mode; static enum 
fan_control_access_mode fan_control_access_mode; @@ -8437,7 +8437,7 @@ static struct proc_dir_entry *proc_dir; * Module and infrastructure proble, init and exit handling */ -static int force_load; +static bool force_load; #ifdef CONFIG_THINKPAD_ACPI_DEBUG static const char * __init str_supported(int is_supported) diff --git a/drivers/platform/x86/wmi.c b/drivers/platform/x86/wmi.c index a134c26870b..42a4dcc25f9 100644 --- a/drivers/platform/x86/wmi.c +++ b/drivers/platform/x86/wmi.c @@ -82,12 +82,12 @@ struct wmi_block { #define ACPI_WMI_STRING 0x4 /* GUID takes & returns a string */ #define ACPI_WMI_EVENT 0x8 /* GUID is an event */ -static int debug_event; +static bool debug_event; module_param(debug_event, bool, 0444); MODULE_PARM_DESC(debug_event, "Log WMI Events [0/1]"); -static int debug_dump_wdg; +static bool debug_dump_wdg; module_param(debug_dump_wdg, bool, 0444); MODULE_PARM_DESC(debug_dump_wdg, "Dump available WMI interfaces [0/1]"); diff --git a/drivers/power/ds2760_battery.c b/drivers/power/ds2760_battery.c index 545874b1df9..076e211a40b 100644 --- a/drivers/power/ds2760_battery.c +++ b/drivers/power/ds2760_battery.c @@ -64,7 +64,7 @@ static unsigned int cache_time = 1000; module_param(cache_time, uint, 0644); MODULE_PARM_DESC(cache_time, "cache time in milliseconds"); -static unsigned int pmod_enabled; +static bool pmod_enabled; module_param(pmod_enabled, bool, 0644); MODULE_PARM_DESC(pmod_enabled, "PMOD enable bit"); diff --git a/drivers/s390/char/raw3270.c b/drivers/s390/char/raw3270.c index e5cb9248a44..f3b8bb84faf 100644 --- a/drivers/s390/char/raw3270.c +++ b/drivers/s390/char/raw3270.c @@ -75,7 +75,7 @@ static LIST_HEAD(raw3270_devices); static int raw3270_registered; /* Module parameters */ -static int tubxcorrect = 0; +static bool tubxcorrect = 0; module_param(tubxcorrect, bool, 0); /* diff --git a/drivers/s390/char/vmwatchdog.c b/drivers/s390/char/vmwatchdog.c index 11312f401c7..2211277a107 100644 --- a/drivers/s390/char/vmwatchdog.c +++ b/drivers/s390/char/vmwatchdog.c @@ -28,9 +28,9 @@ #define MAX_CMDLEN 240 #define MIN_INTERVAL 15 static char vmwdt_cmd[MAX_CMDLEN] = "IPL"; -static int vmwdt_conceal; +static bool vmwdt_conceal; -static int vmwdt_nowayout = WATCHDOG_NOWAYOUT; +static bool vmwdt_nowayout = WATCHDOG_NOWAYOUT; MODULE_LICENSE("GPL"); MODULE_AUTHOR("Arnd Bergmann "); diff --git a/drivers/scsi/aha1542.c b/drivers/scsi/aha1542.c index 195823a51aa..ed119cedaae 100644 --- a/drivers/scsi/aha1542.c +++ b/drivers/scsi/aha1542.c @@ -102,7 +102,7 @@ static int setup_dmaspeed[MAXBOARDS] __initdata = { -1, -1, -1, -1 }; */ #if defined(MODULE) -static int isapnp = 0; +static bool isapnp = 0; static int aha1542[] = {0x330, 11, 4, -1}; module_param_array(aha1542, int, NULL, 0); module_param(isapnp, bool, 0); diff --git a/drivers/scsi/dc395x.c b/drivers/scsi/dc395x.c index f5b718d3c31..13aeca3d51f 100644 --- a/drivers/scsi/dc395x.c +++ b/drivers/scsi/dc395x.c @@ -546,7 +546,7 @@ static struct ParameterData __devinitdata cfg_data[] = { * command line overrides will be used. If set to 1 then safe and * slow settings will be used. */ -static int use_safe_settings = 0; +static bool use_safe_settings = 0; module_param_named(safe, use_safe_settings, bool, 0); MODULE_PARM_DESC(safe, "Use safe and slow settings only. 
Default: false"); diff --git a/drivers/scsi/nsp32.c b/drivers/scsi/nsp32.c index f6a50c98c36..002924963cd 100644 --- a/drivers/scsi/nsp32.c +++ b/drivers/scsi/nsp32.c @@ -59,11 +59,11 @@ MODULE_PARM_DESC(trans_mode, "transfer mode (0: BIOS(default) 1: Async 2: Ultra2 #define ASYNC_MODE 1 #define ULTRA20M_MODE 2 -static int auto_param = 0; /* default: ON */ +static bool auto_param = 0; /* default: ON */ module_param (auto_param, bool, 0); MODULE_PARM_DESC(auto_param, "AutoParameter mode (0: ON(default) 1: OFF)"); -static int disc_priv = 1; /* default: OFF */ +static bool disc_priv = 1; /* default: OFF */ module_param (disc_priv, bool, 0); MODULE_PARM_DESC(disc_priv, "disconnection privilege mode (0: ON 1: OFF(default))"); diff --git a/drivers/scsi/pcmcia/nsp_cs.c b/drivers/scsi/pcmcia/nsp_cs.c index ca86721a71b..b61a753eb89 100644 --- a/drivers/scsi/pcmcia/nsp_cs.c +++ b/drivers/scsi/pcmcia/nsp_cs.c @@ -70,7 +70,7 @@ module_param(nsp_burst_mode, int, 0); MODULE_PARM_DESC(nsp_burst_mode, "Burst transfer mode (0=io8, 1=io32, 2=mem32(default))"); /* Release IO ports after configuration? */ -static int free_ports = 0; +static bool free_ports = 0; module_param(free_ports, bool, 0); MODULE_PARM_DESC(free_ports, "Release IO ports after configuration? (default: 0 (=no))"); diff --git a/drivers/staging/comedi/comedi_fops.c b/drivers/staging/comedi/comedi_fops.c index 0d18d80bcd2..9bcf87ae4c0 100644 --- a/drivers/staging/comedi/comedi_fops.c +++ b/drivers/staging/comedi/comedi_fops.c @@ -61,7 +61,7 @@ EXPORT_SYMBOL(comedi_debug); module_param(comedi_debug, int, 0644); #endif -int comedi_autoconfig = 1; +bool comedi_autoconfig = 1; module_param(comedi_autoconfig, bool, 0444); static int comedi_num_legacy_minors; diff --git a/drivers/staging/comedi/comedi_fops.h b/drivers/staging/comedi/comedi_fops.h index da4b4f5553f..006cf14c577 100644 --- a/drivers/staging/comedi/comedi_fops.h +++ b/drivers/staging/comedi/comedi_fops.h @@ -1,10 +1,11 @@ #ifndef _COMEDI_FOPS_H #define _COMEDI_FOPS_H +#include extern struct class *comedi_class; extern const struct file_operations comedi_fops; -extern int comedi_autoconfig; +extern bool comedi_autoconfig; extern struct comedi_driver *comedi_drivers; #endif /* _COMEDI_FOPS_H */ diff --git a/drivers/staging/media/go7007/snd-go7007.c b/drivers/staging/media/go7007/snd-go7007.c index deac938d850..d071c838ac2 100644 --- a/drivers/staging/media/go7007/snd-go7007.c +++ b/drivers/staging/media/go7007/snd-go7007.c @@ -38,7 +38,7 @@ static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX; static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR; -static int enable[SNDRV_CARDS] = SNDRV_DEFAULT_ENABLE_PNP; +static bool enable[SNDRV_CARDS] = SNDRV_DEFAULT_ENABLE_PNP; module_param_array(index, int, NULL, 0444); module_param_array(id, charp, NULL, 0444); diff --git a/drivers/staging/media/lirc/lirc_bt829.c b/drivers/staging/media/lirc/lirc_bt829.c index c5a0d27a02d..4d20e9f7411 100644 --- a/drivers/staging/media/lirc/lirc_bt829.c +++ b/drivers/staging/media/lirc/lirc_bt829.c @@ -53,7 +53,7 @@ static unsigned char do_get_bits(void); #define DRIVER_NAME "lirc_bt829" -static int debug; +static bool debug; #define dprintk(fmt, args...) 
\ do { \ if (debug) \ diff --git a/drivers/staging/media/lirc/lirc_igorplugusb.c b/drivers/staging/media/lirc/lirc_igorplugusb.c index 6cd4cd67a1d..7a250177667 100644 --- a/drivers/staging/media/lirc/lirc_igorplugusb.c +++ b/drivers/staging/media/lirc/lirc_igorplugusb.c @@ -62,9 +62,9 @@ /* debugging support */ #ifdef CONFIG_USB_DEBUG -static int debug = 1; +static bool debug = 1; #else -static int debug; +static bool debug; #endif #define dprintk(fmt, args...) \ diff --git a/drivers/staging/media/lirc/lirc_parallel.c b/drivers/staging/media/lirc/lirc_parallel.c index 02b07a6c177..dd2bca7b56f 100644 --- a/drivers/staging/media/lirc/lirc_parallel.c +++ b/drivers/staging/media/lirc/lirc_parallel.c @@ -63,8 +63,8 @@ /*** Global Variables ***/ -static int debug; -static int check_pselecd; +static bool debug; +static bool check_pselecd; unsigned int irq = LIRC_IRQ; unsigned int io = LIRC_PORT; diff --git a/drivers/staging/media/lirc/lirc_serial.c b/drivers/staging/media/lirc/lirc_serial.c index 8a060a8a722..2aac67c9828 100644 --- a/drivers/staging/media/lirc/lirc_serial.c +++ b/drivers/staging/media/lirc/lirc_serial.c @@ -107,13 +107,13 @@ struct lirc_serial { static int type; static int io; static int irq; -static int iommap; +static bool iommap; static int ioshift; -static int softcarrier = 1; -static int share_irq; -static int debug; +static bool softcarrier = 1; +static bool share_irq; +static bool debug; static int sense = -1; /* -1 = auto, 0 = active high, 1 = active low */ -static int txsense; /* 0 = active high, 1 = active low */ +static bool txsense; /* 0 = active high, 1 = active low */ #define dprintk(fmt, args...) \ do { \ diff --git a/drivers/staging/media/lirc/lirc_sir.c b/drivers/staging/media/lirc/lirc_sir.c index 6903d3992ec..c94382b917a 100644 --- a/drivers/staging/media/lirc/lirc_sir.c +++ b/drivers/staging/media/lirc/lirc_sir.c @@ -173,7 +173,7 @@ static DEFINE_SPINLOCK(hardware_lock); static int rx_buf[RBUF_LEN]; static unsigned int rx_tail, rx_head; -static int debug; +static bool debug; #define dprintk(fmt, args...) \ do { \ if (debug) \ diff --git a/drivers/staging/media/lirc/lirc_zilog.c b/drivers/staging/media/lirc/lirc_zilog.c index 0302d82a12f..76ea4a8f2c7 100644 --- a/drivers/staging/media/lirc/lirc_zilog.c +++ b/drivers/staging/media/lirc/lirc_zilog.c @@ -155,8 +155,8 @@ static struct mutex tx_data_lock; #define zilog_info(s, args...) printk(KERN_INFO KBUILD_MODNAME ": " s, ## args) /* module parameters */ -static int debug; /* debug output */ -static int tx_only; /* only handle the IR Tx function */ +static bool debug; /* debug output */ +static bool tx_only; /* only handle the IR Tx function */ static int minor = -1; /* minor number */ #define dprintk(fmt, args...) 
\ diff --git a/drivers/staging/quatech_usb2/quatech_usb2.c b/drivers/staging/quatech_usb2/quatech_usb2.c index 02fafecd477..897a3a99c79 100644 --- a/drivers/staging/quatech_usb2/quatech_usb2.c +++ b/drivers/staging/quatech_usb2/quatech_usb2.c @@ -16,7 +16,7 @@ #include #include -static int debug; +static bool debug; /* Version Information */ #define DRIVER_VERSION "v2.00" diff --git a/drivers/staging/serqt_usb2/serqt_usb2.c b/drivers/staging/serqt_usb2/serqt_usb2.c index c44e41af288..1c5780f1571 100644 --- a/drivers/staging/serqt_usb2/serqt_usb2.c +++ b/drivers/staging/serqt_usb2/serqt_usb2.c @@ -16,7 +16,7 @@ #include #include -static int debug; +static bool debug; /* Version Information */ #define DRIVER_VERSION "v2.14" diff --git a/drivers/staging/speakup/speakup.h b/drivers/staging/speakup/speakup.h index 412b87947f6..e66579e6147 100644 --- a/drivers/staging/speakup/speakup.h +++ b/drivers/staging/speakup/speakup.h @@ -116,7 +116,7 @@ extern int bleep_time, bell_pos; extern int spell_delay, key_echo; extern short punc_mask; extern short pitch_shift, synth_flags; -extern int quiet_boot; +extern bool quiet_boot; extern char *synth_name; extern struct bleep unprocessed_sound; diff --git a/drivers/staging/speakup/synth.c b/drivers/staging/speakup/synth.c index c241074a4b5..2222d6919ef 100644 --- a/drivers/staging/speakup/synth.c +++ b/drivers/staging/speakup/synth.c @@ -22,7 +22,7 @@ static struct spk_synth *synths[MAXSYNTHS]; struct spk_synth *synth; char pitch_buff[32] = ""; static int module_status; -int quiet_boot; +bool quiet_boot; struct speakup_info_t speakup_info = { .spinlock = __SPIN_LOCK_UNLOCKED(speakup_info.spinlock), diff --git a/drivers/staging/vme/bridges/vme_tsi148.c b/drivers/staging/vme/bridges/vme_tsi148.c index 08a449b4abf..f50582169b2 100644 --- a/drivers/staging/vme/bridges/vme_tsi148.c +++ b/drivers/staging/vme/bridges/vme_tsi148.c @@ -41,7 +41,7 @@ static void __exit tsi148_exit(void); /* Module parameter */ -static int err_chk; +static bool err_chk; static int geoid; static const char driver_name[] = "vme_tsi148"; diff --git a/drivers/tty/rocket.c b/drivers/tty/rocket.c index 6a1241c7f84..de88aa5566e 100644 --- a/drivers/tty/rocket.c +++ b/drivers/tty/rocket.c @@ -118,7 +118,7 @@ static unsigned long board2; static unsigned long board3; static unsigned long board4; static unsigned long controller; -static int support_low_speed; +static bool support_low_speed; static unsigned long modem1; static unsigned long modem2; static unsigned long modem3; diff --git a/drivers/tty/synclink.c b/drivers/tty/synclink.c index e67fb20490d..ff8017f8791 100644 --- a/drivers/tty/synclink.c +++ b/drivers/tty/synclink.c @@ -850,7 +850,7 @@ static int mgsl_device_count; * .text section address and breakpoint on module load. * This is useful for use with gdb and add-symbol-file command. */ -static int break_on_load; +static bool break_on_load; /* * Driver major number, defaults to zero to get auto diff --git a/drivers/tty/synclinkmp.c b/drivers/tty/synclinkmp.c index 0f6b796c95c..a7efe538df0 100644 --- a/drivers/tty/synclinkmp.c +++ b/drivers/tty/synclinkmp.c @@ -456,7 +456,7 @@ static int synclinkmp_device_count = 0; * .text section address and breakpoint on module load. * This is useful for use with gdb and add-symbol-file command. 
*/ -static int break_on_load = 0; +static bool break_on_load = 0; /* * Driver major number, defaults to zero to get auto diff --git a/drivers/usb/atm/speedtch.c b/drivers/usb/atm/speedtch.c index b42092e1f16..98dd9e49b68 100644 --- a/drivers/usb/atm/speedtch.c +++ b/drivers/usb/atm/speedtch.c @@ -73,9 +73,9 @@ static const char speedtch_driver_name[] = "speedtch"; #define DEFAULT_SW_BUFFERING 0 static unsigned int altsetting = 0; /* zero means: use the default */ -static int dl_512_first = DEFAULT_DL_512_FIRST; -static int enable_isoc = DEFAULT_ENABLE_ISOC; -static int sw_buffering = DEFAULT_SW_BUFFERING; +static bool dl_512_first = DEFAULT_DL_512_FIRST; +static bool enable_isoc = DEFAULT_ENABLE_ISOC; +static bool sw_buffering = DEFAULT_SW_BUFFERING; #define DEFAULT_B_MAX_DSL 8128 #define DEFAULT_MODEM_MODE 11 diff --git a/drivers/usb/atm/ueagle-atm.c b/drivers/usb/atm/ueagle-atm.c index 00f171a7a8a..01ea5d7421d 100644 --- a/drivers/usb/atm/ueagle-atm.c +++ b/drivers/usb/atm/ueagle-atm.c @@ -542,7 +542,7 @@ static int modem_index; static unsigned int debug; static unsigned int altsetting[NB_MODEM] = { [0 ... (NB_MODEM - 1)] = FASTEST_ISO_INTF}; -static int sync_wait[NB_MODEM]; +static bool sync_wait[NB_MODEM]; static char *cmv_file[NB_MODEM]; static int annex[NB_MODEM]; diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c index 3af5e2dd1d8..8df4b76465a 100644 --- a/drivers/usb/core/devio.c +++ b/drivers/usb/core/devio.c @@ -93,7 +93,7 @@ struct async { u8 bulk_status; }; -static int usbfs_snoop; +static bool usbfs_snoop; module_param(usbfs_snoop, bool, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(usbfs_snoop, "true to log all usbfs traffic"); diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c index 79d339e2e70..a0613d8f9be 100644 --- a/drivers/usb/core/hub.c +++ b/drivers/usb/core/hub.c @@ -102,7 +102,7 @@ static DECLARE_WAIT_QUEUE_HEAD(khubd_wait); static struct task_struct *khubd_task; /* cycle leds on hubs that aren't blinking for attention */ -static int blinkenlights = 0; +static bool blinkenlights = 0; module_param (blinkenlights, bool, S_IRUGO); MODULE_PARM_DESC (blinkenlights, "true to cycle leds on hubs"); @@ -131,12 +131,12 @@ MODULE_PARM_DESC(initial_descriptor_timeout, * otherwise the new scheme is used. If that fails and "use_both_schemes" * is set, then the driver will make another attempt, using the other scheme. 
*/ -static int old_scheme_first = 0; +static bool old_scheme_first = 0; module_param(old_scheme_first, bool, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(old_scheme_first, "start with the old device initialization scheme"); -static int use_both_schemes = 1; +static bool use_both_schemes = 1; module_param(use_both_schemes, bool, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(use_both_schemes, "try the other device initialization scheme if the " @@ -2026,7 +2026,7 @@ static unsigned hub_is_wusb(struct usb_hub *hub) #define SET_ADDRESS_TRIES 2 #define GET_DESCRIPTOR_TRIES 2 #define SET_CONFIG_TRIES (2 * (use_both_schemes + 1)) -#define USE_NEW_SCHEME(i) ((i) / 2 == old_scheme_first) +#define USE_NEW_SCHEME(i) ((i) / 2 == (int)old_scheme_first) #define HUB_ROOT_RESET_TIME 50 /* times are in msec */ #define HUB_SHORT_RESET_TIME 10 diff --git a/drivers/usb/core/usb.c b/drivers/usb/core/usb.c index 1382c90d083..8ca9f994a28 100644 --- a/drivers/usb/core/usb.c +++ b/drivers/usb/core/usb.c @@ -47,7 +47,7 @@ const char *usbcore_name = "usbcore"; -static int nousb; /* Disable USB when built into kernel image */ +static bool nousb; /* Disable USB when built into kernel image */ #ifdef CONFIG_USB_SUSPEND static int usb_autosuspend_delay = 2; /* Default delay value, diff --git a/drivers/usb/gadget/amd5536udc.c b/drivers/usb/gadget/amd5536udc.c index e9a2c5c4445..c16ff55a74e 100644 --- a/drivers/usb/gadget/amd5536udc.c +++ b/drivers/usb/gadget/amd5536udc.c @@ -152,15 +152,15 @@ static const char *ep_string[] = { }; /* DMA usage flag */ -static int use_dma = 1; +static bool use_dma = 1; /* packet per buffer dma */ -static int use_dma_ppb = 1; +static bool use_dma_ppb = 1; /* with per descr. update */ -static int use_dma_ppb_du; +static bool use_dma_ppb_du; /* buffer fill mode */ static int use_dma_bufferfill_mode; /* full speed only mode */ -static int use_fullspeed; +static bool use_fullspeed; /* tx buffer size for high speed */ static unsigned long hs_tx_buf = UDC_EPIN_BUFF_SIZE; diff --git a/drivers/usb/gadget/ether.c b/drivers/usb/gadget/ether.c index 0cd764d5935..a28f6ffcd0f 100644 --- a/drivers/usb/gadget/ether.c +++ b/drivers/usb/gadget/ether.c @@ -250,9 +250,9 @@ static struct usb_configuration rndis_config_driver = { /*-------------------------------------------------------------------------*/ #ifdef CONFIG_USB_ETH_EEM -static int use_eem = 1; +static bool use_eem = 1; #else -static int use_eem; +static bool use_eem; #endif module_param(use_eem, bool, 0); MODULE_PARM_DESC(use_eem, "use CDC EEM mode"); diff --git a/drivers/usb/gadget/file_storage.c b/drivers/usb/gadget/file_storage.c index e0f30fc70e4..47766f0e7ca 100644 --- a/drivers/usb/gadget/file_storage.c +++ b/drivers/usb/gadget/file_storage.c @@ -303,16 +303,16 @@ MODULE_LICENSE("Dual BSD/GPL"); static struct { char *file[FSG_MAX_LUNS]; char *serial; - int ro[FSG_MAX_LUNS]; - int nofua[FSG_MAX_LUNS]; + bool ro[FSG_MAX_LUNS]; + bool nofua[FSG_MAX_LUNS]; unsigned int num_filenames; unsigned int num_ros; unsigned int num_nofuas; unsigned int nluns; - int removable; - int can_stall; - int cdrom; + bool removable; + bool can_stall; + bool cdrom; char *transport_parm; char *protocol_parm; diff --git a/drivers/usb/gadget/net2272.c b/drivers/usb/gadget/net2272.c index 4c81d540bc2..7322d293213 100644 --- a/drivers/usb/gadget/net2272.c +++ b/drivers/usb/gadget/net2272.c @@ -69,7 +69,7 @@ static const char * const ep_name[] = { * * If use_dma is disabled, pio will be used instead. 
*/ -static int use_dma = 0; +static bool use_dma = 0; module_param(use_dma, bool, 0644); /* diff --git a/drivers/usb/gadget/net2280.c b/drivers/usb/gadget/net2280.c index cf1f36454d0..cdedd133674 100644 --- a/drivers/usb/gadget/net2280.c +++ b/drivers/usb/gadget/net2280.c @@ -90,8 +90,8 @@ static const char *const ep_name [] = { * Some gadget drivers work better with the dma support here than others. * These two parameters let you use PIO or more aggressive DMA. */ -static int use_dma = 1; -static int use_dma_chaining = 0; +static bool use_dma = 1; +static bool use_dma_chaining = 0; /* "modprobe net2280 use_dma=n" etc */ module_param (use_dma, bool, S_IRUGO); @@ -112,7 +112,7 @@ module_param (fifo_mode, ushort, 0644); * USB suspend requests will be ignored. This is acceptable for * self-powered devices */ -static int enable_suspend = 0; +static bool enable_suspend = 0; /* "modprobe net2280 enable_suspend=1" etc */ module_param (enable_suspend, bool, S_IRUGO); diff --git a/drivers/usb/gadget/omap_udc.c b/drivers/usb/gadget/omap_udc.c index 7db5bbe6251..576cd8578b4 100644 --- a/drivers/usb/gadget/omap_udc.c +++ b/drivers/usb/gadget/omap_udc.c @@ -98,7 +98,7 @@ module_param (fifo_mode, uint, 0); MODULE_PARM_DESC (fifo_mode, "endpoint configuration"); #ifdef USE_DMA -static unsigned use_dma = 1; +static bool use_dma = 1; /* "modprobe omap_udc use_dma=y", or else as a kernel * boot parameter "omap_udc:use_dma=y" diff --git a/drivers/usb/gadget/pch_udc.c b/drivers/usb/gadget/pch_udc.c index dd2313cce1d..a3fcaae4bc2 100644 --- a/drivers/usb/gadget/pch_udc.c +++ b/drivers/usb/gadget/pch_udc.c @@ -359,7 +359,7 @@ struct pch_udc_dev { static const char ep0_string[] = "ep0in"; static DEFINE_SPINLOCK(udc_stall_spinlock); /* stall spin lock */ struct pch_udc_dev *pch_udc; /* pointer to device object */ -static int speed_fs; +static bool speed_fs; module_param_named(speed_fs, speed_fs, bool, S_IRUGO); MODULE_PARM_DESC(speed_fs, "true for Full speed operation"); diff --git a/drivers/usb/gadget/serial.c b/drivers/usb/gadget/serial.c index ed1b816e58d..ad9e5b2df64 100644 --- a/drivers/usb/gadget/serial.c +++ b/drivers/usb/gadget/serial.c @@ -123,11 +123,11 @@ MODULE_AUTHOR("Al Borchers"); MODULE_AUTHOR("David Brownell"); MODULE_LICENSE("GPL"); -static int use_acm = true; +static bool use_acm = true; module_param(use_acm, bool, 0); MODULE_PARM_DESC(use_acm, "Use CDC ACM, default=yes"); -static int use_obex = false; +static bool use_obex = false; module_param(use_obex, bool, 0); MODULE_PARM_DESC(use_obex, "Use CDC OBEX, default=no"); diff --git a/drivers/usb/gadget/zero.c b/drivers/usb/gadget/zero.c index 20697cc132d..31d34832907 100644 --- a/drivers/usb/gadget/zero.c +++ b/drivers/usb/gadget/zero.c @@ -81,7 +81,7 @@ module_param(buflen, uint, 0); * work better with hosts where config changes are problematic or * controllers (like original superh) that only support one config. 
*/ -static int loopdefault = 0; +static bool loopdefault = 0; module_param(loopdefault, bool, S_IRUGO|S_IWUSR); /*-------------------------------------------------------------------------*/ diff --git a/drivers/usb/host/ehci-hcd.c b/drivers/usb/host/ehci-hcd.c index e311a511529..a007a9fe0f8 100644 --- a/drivers/usb/host/ehci-hcd.c +++ b/drivers/usb/host/ehci-hcd.c @@ -112,7 +112,7 @@ module_param (park, uint, S_IRUGO); MODULE_PARM_DESC (park, "park setting; 1-3 back-to-back async packets"); /* for flakey hardware, ignore overcurrent indicators */ -static int ignore_oc = 0; +static bool ignore_oc = 0; module_param (ignore_oc, bool, S_IRUGO); MODULE_PARM_DESC (ignore_oc, "ignore bogus hardware overcurrent indications"); diff --git a/drivers/usb/host/ohci-hcd.c b/drivers/usb/host/ohci-hcd.c index 5f5a6324143..34b9edd8665 100644 --- a/drivers/usb/host/ohci-hcd.c +++ b/drivers/usb/host/ohci-hcd.c @@ -115,13 +115,13 @@ static inline void sb800_prefetch(struct ohci_hcd *ohci, int on) /* Some boards misreport power switching/overcurrent */ -static int distrust_firmware = 1; +static bool distrust_firmware = 1; module_param (distrust_firmware, bool, 0); MODULE_PARM_DESC (distrust_firmware, "true to distrust firmware power/overcurrent setup"); /* Some boards leave IR set wrongly, since they fail BIOS/SMM handshakes */ -static int no_handshake = 0; +static bool no_handshake = 0; module_param (no_handshake, bool, 0); MODULE_PARM_DESC (no_handshake, "true (not default) disables BIOS handshake"); diff --git a/drivers/usb/host/oxu210hp-hcd.c b/drivers/usb/host/oxu210hp-hcd.c index 6f62de5c6e3..015c7c62ed4 100644 --- a/drivers/usb/host/oxu210hp-hcd.c +++ b/drivers/usb/host/oxu210hp-hcd.c @@ -233,7 +233,7 @@ module_param(park, uint, S_IRUGO); MODULE_PARM_DESC(park, "park setting; 1-3 back-to-back async packets"); /* For flakey hardware, ignore overcurrent indicators */ -static int ignore_oc; +static bool ignore_oc; module_param(ignore_oc, bool, S_IRUGO); MODULE_PARM_DESC(ignore_oc, "ignore bogus hardware overcurrent indications"); diff --git a/drivers/usb/host/u132-hcd.c b/drivers/usb/host/u132-hcd.c index 533d12cca37..16dd6a6abf0 100644 --- a/drivers/usb/host/u132-hcd.c +++ b/drivers/usb/host/u132-hcd.c @@ -74,7 +74,7 @@ MODULE_LICENSE("GPL"); #define INT_MODULE_PARM(n, v) static int n = v;module_param(n, int, 0444) INT_MODULE_PARM(testing, 0); /* Some boards misreport power switching/overcurrent*/ -static int distrust_firmware = 1; +static bool distrust_firmware = 1; module_param(distrust_firmware, bool, 0); MODULE_PARM_DESC(distrust_firmware, "true to distrust firmware power/overcurren" "t setup"); diff --git a/drivers/usb/host/uhci-hcd.c b/drivers/usb/host/uhci-hcd.c index c8ae199cfbb..6b5eb1017e2 100644 --- a/drivers/usb/host/uhci-hcd.c +++ b/drivers/usb/host/uhci-hcd.c @@ -59,7 +59,7 @@ #define DRIVER_DESC "USB Universal Host Controller Interface driver" /* for flakey hardware, ignore overcurrent indicators */ -static int ignore_oc; +static bool ignore_oc; module_param(ignore_oc, bool, S_IRUGO); MODULE_PARM_DESC(ignore_oc, "ignore hardware overcurrent indications"); diff --git a/drivers/usb/misc/ftdi-elan.c b/drivers/usb/misc/ftdi-elan.c index 2dbe600fbc1..a4a3c7cd4a1 100644 --- a/drivers/usb/misc/ftdi-elan.c +++ b/drivers/usb/misc/ftdi-elan.c @@ -53,7 +53,7 @@ MODULE_AUTHOR("Tony Olech"); MODULE_DESCRIPTION("FTDI ELAN driver"); MODULE_LICENSE("GPL"); #define INT_MODULE_PARM(n, v) static int n = v;module_param(n, int, 0444) -static int distrust_firmware = 1; +static bool distrust_firmware = 1; 
+static bool distrust_firmware = 1;
module_param(distrust_firmware, bool, 0); MODULE_PARM_DESC(distrust_firmware, "true to distrust firmware power/overcurren" "t setup"); diff --git a/drivers/usb/misc/iowarrior.c b/drivers/usb/misc/iowarrior.c index 2453a39b479..4fd0dc835ae 100644 --- a/drivers/usb/misc/iowarrior.c +++ b/drivers/usb/misc/iowarrior.c @@ -62,7 +62,7 @@ MODULE_LICENSE("GPL"); /* Module parameters */ static DEFINE_MUTEX(iowarrior_mutex); -static int debug = 0; +static bool debug = 0; module_param(debug, bool, 0644); MODULE_PARM_DESC(debug, "debug=1 enables debugging messages"); diff --git a/drivers/usb/musb/cppi_dma.c b/drivers/usb/musb/cppi_dma.c index 53be7aef630..66bc376005d 100644 --- a/drivers/usb/musb/cppi_dma.c +++ b/drivers/usb/musb/cppi_dma.c @@ -750,7 +750,7 @@ cppi_next_tx_segment(struct musb *musb, struct cppi_channel *tx) * So this module parameter lets the heuristic be disabled. When using * gadgetfs, the heuristic will probably need to be disabled. */ -static int cppi_rx_rndis = 1; +static bool cppi_rx_rndis = 1; module_param(cppi_rx_rndis, bool, 0); MODULE_PARM_DESC(cppi_rx_rndis, "enable/disable RX RNDIS heuristic"); diff --git a/drivers/usb/musb/musb_core.c b/drivers/usb/musb/musb_core.c index f6ff7923048..56cf0243979 100644 --- a/drivers/usb/musb/musb_core.c +++ b/drivers/usb/musb/musb_core.c @@ -1586,7 +1586,7 @@ irqreturn_t musb_interrupt(struct musb *musb) EXPORT_SYMBOL_GPL(musb_interrupt); #ifndef CONFIG_MUSB_PIO_ONLY -static int __initdata use_dma = 1; +static bool __initdata use_dma = 1; /* "modprobe ... use_dma=0" etc */ module_param(use_dma, bool, 0); diff --git a/drivers/usb/serial/aircable.c b/drivers/usb/serial/aircable.c index b43d07df4c4..123bf915533 100644 --- a/drivers/usb/serial/aircable.c +++ b/drivers/usb/serial/aircable.c @@ -52,7 +52,7 @@ #include #include -static int debug; +static bool debug; /* Vendor and Product ID */ #define AIRCABLE_VID 0x16CA diff --git a/drivers/usb/serial/ark3116.c b/drivers/usb/serial/ark3116.c index 18e875b92e0..69328dcfd91 100644 --- a/drivers/usb/serial/ark3116.c +++ b/drivers/usb/serial/ark3116.c @@ -37,7 +37,7 @@ #include #include -static int debug; +static bool debug; /* * Version information */ diff --git a/drivers/usb/serial/belkin_sa.c b/drivers/usb/serial/belkin_sa.c index f9f29b289f2..29ffeb6279c 100644 --- a/drivers/usb/serial/belkin_sa.c +++ b/drivers/usb/serial/belkin_sa.c @@ -37,7 +37,7 @@ #include #include "belkin_sa.h" -static int debug; +static bool debug; /* * Version Information diff --git a/drivers/usb/serial/ch341.c b/drivers/usb/serial/ch341.c index 0e77511060c..5e53cc59e65 100644 --- a/drivers/usb/serial/ch341.c +++ b/drivers/usb/serial/ch341.c @@ -70,7 +70,7 @@ #define CH341_NBREAK_BITS_REG2 0x40 -static int debug; +static bool debug; static const struct usb_device_id id_table[] = { { USB_DEVICE(0x4348, 0x5523) }, diff --git a/drivers/usb/serial/cp210x.c b/drivers/usb/serial/cp210x.c index adfe660ed00..fba1147ed91 100644 --- a/drivers/usb/serial/cp210x.c +++ b/drivers/usb/serial/cp210x.c @@ -49,7 +49,7 @@ static void cp210x_break_ctl(struct tty_struct *, int); static int cp210x_startup(struct usb_serial *); static void cp210x_dtr_rts(struct usb_serial_port *p, int on); -static int debug; +static bool debug; static const struct usb_device_id id_table[] = { { USB_DEVICE(0x045B, 0x0053) }, /* Renesas RX610 RX-Stick */ diff --git a/drivers/usb/serial/cyberjack.c b/drivers/usb/serial/cyberjack.c index 98bf8334983..6bc3802a581 100644 --- a/drivers/usb/serial/cyberjack.c +++ b/drivers/usb/serial/cyberjack.c @@ -43,7 +43,7 @@ 
#define CYBERJACK_LOCAL_BUF_SIZE 32 -static int debug; +static bool debug; /* * Version Information diff --git a/drivers/usb/serial/cypress_m8.c b/drivers/usb/serial/cypress_m8.c index 07680d6b792..3bdeafa29c2 100644 --- a/drivers/usb/serial/cypress_m8.c +++ b/drivers/usb/serial/cypress_m8.c @@ -46,10 +46,10 @@ #include "cypress_m8.h" -static int debug; -static int stats; +static bool debug; +static bool stats; static int interval; -static int unstable_bauds; +static bool unstable_bauds; /* * Version Information diff --git a/drivers/usb/serial/digi_acceleport.c b/drivers/usb/serial/digi_acceleport.c index 6d26a77d0f2..b23bebd721a 100644 --- a/drivers/usb/serial/digi_acceleport.c +++ b/drivers/usb/serial/digi_acceleport.c @@ -251,7 +251,7 @@ static int digi_read_oob_callback(struct urb *urb); /* Statics */ -static int debug; +static bool debug; static const struct usb_device_id id_table_combined[] = { { USB_DEVICE(DIGI_VENDOR_ID, DIGI_2_ID) }, diff --git a/drivers/usb/serial/empeg.c b/drivers/usb/serial/empeg.c index 504b5585ea4..aced6817bf9 100644 --- a/drivers/usb/serial/empeg.c +++ b/drivers/usb/serial/empeg.c @@ -28,7 +28,7 @@ #include #include -static int debug; +static bool debug; /* * Version Information diff --git a/drivers/usb/serial/ftdi_sio.c b/drivers/usb/serial/ftdi_sio.c index c290df97108..01b6404df39 100644 --- a/drivers/usb/serial/ftdi_sio.c +++ b/drivers/usb/serial/ftdi_sio.c @@ -55,7 +55,7 @@ #define DRIVER_AUTHOR "Greg Kroah-Hartman , Bill Ryder , Kuba Ober , Andreas Mohr, Johan Hovold " #define DRIVER_DESC "USB FTDI Serial Converters Driver" -static int debug; +static bool debug; static __u16 vendor = FTDI_VID; static __u16 product; diff --git a/drivers/usb/serial/funsoft.c b/drivers/usb/serial/funsoft.c index e21ce9ddfc6..5d4b099dcf8 100644 --- a/drivers/usb/serial/funsoft.c +++ b/drivers/usb/serial/funsoft.c @@ -16,7 +16,7 @@ #include #include -static int debug; +static bool debug; static const struct usb_device_id id_table[] = { { USB_DEVICE(0x1404, 0xcddc) }, diff --git a/drivers/usb/serial/garmin_gps.c b/drivers/usb/serial/garmin_gps.c index bf12565f8e8..21343378c32 100644 --- a/drivers/usb/serial/garmin_gps.c +++ b/drivers/usb/serial/garmin_gps.c @@ -42,7 +42,7 @@ static int initial_mode = 1; /* debug flag */ -static int debug; +static bool debug; #define GARMIN_VENDOR_ID 0x091E diff --git a/drivers/usb/serial/io_edgeport.c b/drivers/usb/serial/io_edgeport.c index abd2ee2b2f9..0497575e479 100644 --- a/drivers/usb/serial/io_edgeport.c +++ b/drivers/usb/serial/io_edgeport.c @@ -191,7 +191,7 @@ static const struct divisor_table_entry divisor_table[] = { }; /* local variables */ -static int debug; +static bool debug; static atomic_t CmdUrbs; /* Number of outstanding Command Write Urbs */ diff --git a/drivers/usb/serial/io_ti.c b/drivers/usb/serial/io_ti.c index e44d375edaa..65bf06aa591 100644 --- a/drivers/usb/serial/io_ti.c +++ b/drivers/usb/serial/io_ti.c @@ -210,10 +210,10 @@ static unsigned char OperationalMajorVersion; static unsigned char OperationalMinorVersion; static unsigned short OperationalBuildNumber; -static int debug; +static bool debug; static int closing_wait = EDGE_CLOSING_WAIT; -static int ignore_cpu_rev; +static bool ignore_cpu_rev; static int default_uart_mode; /* RS232 */ static void edge_tty_recv(struct device *dev, struct tty_struct *tty, diff --git a/drivers/usb/serial/ipaq.c b/drivers/usb/serial/ipaq.c index 36f5cbe9048..06053a920dd 100644 --- a/drivers/usb/serial/ipaq.c +++ b/drivers/usb/serial/ipaq.c @@ -34,7 +34,7 @@ #define DRIVER_DESC 
"USB PocketPC PDA driver" static __u16 product, vendor; -static int debug; +static bool debug; static int connect_retries = KP_RETRIES; static int initial_wait; diff --git a/drivers/usb/serial/ipw.c b/drivers/usb/serial/ipw.c index 5170799d6e9..6f9356f3f99 100644 --- a/drivers/usb/serial/ipw.c +++ b/drivers/usb/serial/ipw.c @@ -147,7 +147,7 @@ static struct usb_driver usb_ipw_driver = { .no_dynamic_id = 1, }; -static int debug; +static bool debug; static int ipw_open(struct tty_struct *tty, struct usb_serial_port *port) { diff --git a/drivers/usb/serial/ir-usb.c b/drivers/usb/serial/ir-usb.c index 0c537da0d3c..84a396e8367 100644 --- a/drivers/usb/serial/ir-usb.c +++ b/drivers/usb/serial/ir-usb.c @@ -45,7 +45,7 @@ #define DRIVER_AUTHOR "Greg Kroah-Hartman , Johan Hovold " #define DRIVER_DESC "USB IR Dongle driver" -static int debug; +static bool debug; /* if overridden by the user, then use their value for the size of the read and * write urbs */ diff --git a/drivers/usb/serial/iuu_phoenix.c b/drivers/usb/serial/iuu_phoenix.c index 64d0ffd4440..3077a443697 100644 --- a/drivers/usb/serial/iuu_phoenix.c +++ b/drivers/usb/serial/iuu_phoenix.c @@ -34,9 +34,9 @@ #ifdef CONFIG_USB_SERIAL_DEBUG -static int debug = 1; +static bool debug = 1; #else -static int debug; +static bool debug; #endif /* @@ -65,7 +65,7 @@ static int clockmode = 1; static int cdmode = 1; static int iuu_cardin; static int iuu_cardout; -static int xmas; +static bool xmas; static int vcc_default = 5; static void read_rxcmd_callback(struct urb *urb); diff --git a/drivers/usb/serial/keyspan.c b/drivers/usb/serial/keyspan.c index bc8dc203e81..4cc36c76180 100644 --- a/drivers/usb/serial/keyspan.c +++ b/drivers/usb/serial/keyspan.c @@ -45,7 +45,7 @@ #include #include "keyspan.h" -static int debug; +static bool debug; /* * Version Information diff --git a/drivers/usb/serial/keyspan_pda.c b/drivers/usb/serial/keyspan_pda.c index a40615674a6..7c62a704830 100644 --- a/drivers/usb/serial/keyspan_pda.c +++ b/drivers/usb/serial/keyspan_pda.c @@ -31,7 +31,7 @@ #include #include -static int debug; +static bool debug; /* make a simple define to handle if we are compiling keyspan_pda or xircom support */ #if defined(CONFIG_USB_SERIAL_KEYSPAN_PDA) || defined(CONFIG_USB_SERIAL_KEYSPAN_PDA_MODULE) diff --git a/drivers/usb/serial/kl5kusb105.c b/drivers/usb/serial/kl5kusb105.c index 19373cb7c5b..fc064e1442c 100644 --- a/drivers/usb/serial/kl5kusb105.c +++ b/drivers/usb/serial/kl5kusb105.c @@ -49,7 +49,7 @@ #include #include "kl5kusb105.h" -static int debug; +static bool debug; /* * Version Information diff --git a/drivers/usb/serial/mct_u232.c b/drivers/usb/serial/mct_u232.c index a975bb80303..27fa9c8a77b 100644 --- a/drivers/usb/serial/mct_u232.c +++ b/drivers/usb/serial/mct_u232.c @@ -45,7 +45,7 @@ #define DRIVER_AUTHOR "Wolfgang Grandegger " #define DRIVER_DESC "Magic Control Technology USB-RS232 converter driver" -static int debug; +static bool debug; /* * Function prototypes diff --git a/drivers/usb/serial/mos7720.c b/drivers/usb/serial/mos7720.c index 19d112f51b9..4554ee49e63 100644 --- a/drivers/usb/serial/mos7720.c +++ b/drivers/usb/serial/mos7720.c @@ -71,7 +71,7 @@ struct moschip_port { struct urb *write_urb_pool[NUM_URBS]; }; -static int debug; +static bool debug; static struct usb_serial_driver moschip7720_2port_driver; diff --git a/drivers/usb/serial/mos7840.c b/drivers/usb/serial/mos7840.c index 55cfd6265b9..03b5e249e95 100644 --- a/drivers/usb/serial/mos7840.c +++ b/drivers/usb/serial/mos7840.c @@ -263,7 +263,7 @@ struct moschip_port 
{ }; -static int debug; +static bool debug; /* * mos7840_set_reg_sync diff --git a/drivers/usb/serial/navman.c b/drivers/usb/serial/navman.c index 1f00f243c26..b28f1db0195 100644 --- a/drivers/usb/serial/navman.c +++ b/drivers/usb/serial/navman.c @@ -21,7 +21,7 @@ #include #include -static int debug; +static bool debug; static const struct usb_device_id id_table[] = { { USB_DEVICE(0x0a99, 0x0001) }, /* Talon Technology device */ diff --git a/drivers/usb/serial/omninet.c b/drivers/usb/serial/omninet.c index 45a8c55881d..8b8d58a2ac1 100644 --- a/drivers/usb/serial/omninet.c +++ b/drivers/usb/serial/omninet.c @@ -23,7 +23,7 @@ #include #include -static int debug; +static bool debug; /* * Version Information diff --git a/drivers/usb/serial/opticon.c b/drivers/usb/serial/opticon.c index 691f57a9d71..262ded9e076 100644 --- a/drivers/usb/serial/opticon.c +++ b/drivers/usb/serial/opticon.c @@ -32,7 +32,7 @@ * an examples of 1D barcode types are EAN, UPC, Code39, IATA etc.. */ #define DRIVER_DESC "Opticon USB barcode to serial driver (1D)" -static int debug; +static bool debug; static const struct usb_device_id id_table[] = { { USB_DEVICE(0x065a, 0x0009) }, diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index c96b6b6509f..420d9857394 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -1234,7 +1234,7 @@ static struct usb_serial_driver option_1port_device = { #endif }; -static int debug; +static bool debug; /* per port private data */ diff --git a/drivers/usb/serial/oti6858.c b/drivers/usb/serial/oti6858.c index 2161d1c3c08..e287fd32682 100644 --- a/drivers/usb/serial/oti6858.c +++ b/drivers/usb/serial/oti6858.c @@ -74,7 +74,7 @@ static struct usb_driver oti6858_driver = { .no_dynamic_id = 1, }; -static int debug; +static bool debug; /* requests */ #define OTI6858_REQ_GET_STATUS (USB_DIR_IN | USB_TYPE_VENDOR | 0x00) diff --git a/drivers/usb/serial/pl2303.c b/drivers/usb/serial/pl2303.c index 329295615d0..3d8cda57ce7 100644 --- a/drivers/usb/serial/pl2303.c +++ b/drivers/usb/serial/pl2303.c @@ -36,7 +36,7 @@ */ #define DRIVER_DESC "Prolific PL2303 USB to serial adaptor driver" -static int debug; +static bool debug; #define PL2303_CLOSING_WAIT (30*HZ) diff --git a/drivers/usb/serial/qcserial.c b/drivers/usb/serial/qcserial.c index aa9367f5b42..1d5deee3be5 100644 --- a/drivers/usb/serial/qcserial.c +++ b/drivers/usb/serial/qcserial.c @@ -22,7 +22,7 @@ #define DRIVER_AUTHOR "Qualcomm Inc" #define DRIVER_DESC "Qualcomm USB Serial driver" -static int debug; +static bool debug; static const struct usb_device_id id_table[] = { {USB_DEVICE(0x05c6, 0x9211)}, /* Acer Gobi QDL device */ diff --git a/drivers/usb/serial/safe_serial.c b/drivers/usb/serial/safe_serial.c index a36e2313eed..d074b3740dc 100644 --- a/drivers/usb/serial/safe_serial.c +++ b/drivers/usb/serial/safe_serial.c @@ -81,9 +81,9 @@ #define CONFIG_USB_SERIAL_SAFE_PADDED 0 #endif -static int debug; -static int safe = 1; -static int padded = CONFIG_USB_SERIAL_SAFE_PADDED; +static bool debug; +static bool safe = 1; +static bool padded = CONFIG_USB_SERIAL_SAFE_PADDED; #define DRIVER_VERSION "v0.1" #define DRIVER_AUTHOR "sl@lineo.com, tbr@lineo.com, Johan Hovold " diff --git a/drivers/usb/serial/sierra.c b/drivers/usb/serial/sierra.c index f2485429172..fdae0a4407c 100644 --- a/drivers/usb/serial/sierra.c +++ b/drivers/usb/serial/sierra.c @@ -46,8 +46,8 @@ allocations > PAGE_SIZE and the number of packets in a page is an integer 512 is the largest possible packet on EHCI */ -static int debug; -static 
int nmea; +static bool debug; +static bool nmea; /* Used in interface blacklisting */ struct sierra_iface_info { diff --git a/drivers/usb/serial/spcp8x5.c b/drivers/usb/serial/spcp8x5.c index 180ea6c7911..d7f5eee18f0 100644 --- a/drivers/usb/serial/spcp8x5.c +++ b/drivers/usb/serial/spcp8x5.c @@ -33,7 +33,7 @@ #define DRIVER_VERSION "v0.10" #define DRIVER_DESC "SPCP8x5 USB to serial adaptor driver" -static int debug; +static bool debug; #define SPCP8x5_007_VID 0x04FC #define SPCP8x5_007_PID 0x0201 diff --git a/drivers/usb/serial/ssu100.c b/drivers/usb/serial/ssu100.c index 87362e48796..7697858d885 100644 --- a/drivers/usb/serial/ssu100.c +++ b/drivers/usb/serial/ssu100.c @@ -46,7 +46,7 @@ #define FULLPWRBIT 0x00000080 #define NEXT_BOARD_POWER_BIT 0x00000004 -static int debug; +static bool debug; /* Version Information */ #define DRIVER_VERSION "v0.1" diff --git a/drivers/usb/serial/symbolserial.c b/drivers/usb/serial/symbolserial.c index c70cc012d03..50651cf4fc6 100644 --- a/drivers/usb/serial/symbolserial.c +++ b/drivers/usb/serial/symbolserial.c @@ -20,7 +20,7 @@ #include #include -static int debug; +static bool debug; static const struct usb_device_id id_table[] = { { USB_DEVICE(0x05e0, 0x0600) }, diff --git a/drivers/usb/serial/ti_usb_3410_5052.c b/drivers/usb/serial/ti_usb_3410_5052.c index 4af21f46096..8468eb769a2 100644 --- a/drivers/usb/serial/ti_usb_3410_5052.c +++ b/drivers/usb/serial/ti_usb_3410_5052.c @@ -150,7 +150,7 @@ static int ti_download_firmware(struct ti_device *tdev); /* Data */ /* module parameters */ -static int debug; +static bool debug; static int closing_wait = TI_DEFAULT_CLOSING_WAIT; static ushort vendor_3410[TI_EXTRA_VID_PID_COUNT]; static unsigned int vendor_3410_count; diff --git a/drivers/usb/serial/usb-serial.c b/drivers/usb/serial/usb-serial.c index ce6c1a65a54..611b206591c 100644 --- a/drivers/usb/serial/usb-serial.c +++ b/drivers/usb/serial/usb-serial.c @@ -61,7 +61,7 @@ static struct usb_driver usb_serial_driver = { drivers depend on it. */ -static int debug; +static bool debug; /* initially all NULL */ static struct usb_serial *serial_table[SERIAL_TTY_MINORS]; static DEFINE_MUTEX(table_lock); diff --git a/drivers/usb/serial/usb_wwan.c b/drivers/usb/serial/usb_wwan.c index d555ca9567b..c88657dd31c 100644 --- a/drivers/usb/serial/usb_wwan.c +++ b/drivers/usb/serial/usb_wwan.c @@ -37,7 +37,7 @@ #include #include "usb-wwan.h" -static int debug; +static bool debug; void usb_wwan_dtr_rts(struct usb_serial_port *port, int on) { diff --git a/drivers/usb/serial/visor.c b/drivers/usb/serial/visor.c index 1c11959a7d5..210e4b10dc1 100644 --- a/drivers/usb/serial/visor.c +++ b/drivers/usb/serial/visor.c @@ -52,7 +52,7 @@ static int palm_os_4_probe(struct usb_serial *serial, const struct usb_device_id *id); /* Parameters that may be passed into the module. 
*/ -static int debug; +static bool debug; static __u16 vendor; static __u16 product; diff --git a/drivers/usb/serial/whiteheat.c b/drivers/usb/serial/whiteheat.c index 11af903cb09..7e0acf5c8e3 100644 --- a/drivers/usb/serial/whiteheat.c +++ b/drivers/usb/serial/whiteheat.c @@ -36,7 +36,7 @@ #include #include "whiteheat.h" /* WhiteHEAT specific commands */ -static int debug; +static bool debug; #ifndef CMSPAR #define CMSPAR 0 diff --git a/drivers/video/aty/atyfb_base.c b/drivers/video/aty/atyfb_base.c index 44bdce4242a..622f12b62a4 100644 --- a/drivers/video/aty/atyfb_base.c +++ b/drivers/video/aty/atyfb_base.c @@ -301,9 +301,9 @@ static struct fb_ops atyfb_ops = { .fb_sync = atyfb_sync, }; -static int noaccel; +static bool noaccel; #ifdef CONFIG_MTRR -static int nomtrr; +static bool nomtrr; #endif static int vram; static int pll; diff --git a/drivers/video/aty/radeon_base.c b/drivers/video/aty/radeon_base.c index 150684882ef..ce1506b75ad 100644 --- a/drivers/video/aty/radeon_base.c +++ b/drivers/video/aty/radeon_base.c @@ -263,19 +263,19 @@ static reg_val common_regs[] = { static char *mode_option; static char *monitor_layout; -static int noaccel = 0; +static bool noaccel = 0; static int default_dynclk = -2; -static int nomodeset = 0; -static int ignore_edid = 0; -static int mirror = 0; +static bool nomodeset = 0; +static bool ignore_edid = 0; +static bool mirror = 0; static int panel_yres = 0; -static int force_dfp = 0; -static int force_measure_pll = 0; +static bool force_dfp = 0; +static bool force_measure_pll = 0; #ifdef CONFIG_MTRR -static int nomtrr = 0; +static bool nomtrr = 0; #endif -static int force_sleep; -static int ignore_devlist; +static bool force_sleep; +static bool ignore_devlist; #ifdef CONFIG_PMAC_BACKLIGHT static int backlight = 1; #else diff --git a/drivers/video/cirrusfb.c b/drivers/video/cirrusfb.c index 6df7c54db0a..6fb499e7678 100644 --- a/drivers/video/cirrusfb.c +++ b/drivers/video/cirrusfb.c @@ -350,7 +350,7 @@ struct cirrusfb_info { void (*unmap)(struct fb_info *info); }; -static int noaccel __devinitdata; +static bool noaccel __devinitdata; static char *mode_option __devinitdata = "640x480@60"; /****************************************************************************/ diff --git a/drivers/video/hgafb.c b/drivers/video/hgafb.c index 4394389caf6..c645f928265 100644 --- a/drivers/video/hgafb.c +++ b/drivers/video/hgafb.c @@ -133,7 +133,7 @@ static struct fb_fix_screeninfo hga_fix __devinitdata = { /* Don't assume that tty1 will be the initial current console. 
*/ static int release_io_port = 0; static int release_io_ports = 0; -static int nologo = 0; +static bool nologo = 0; /* ------------------------------------------------------------------------- * diff --git a/drivers/video/intelfb/intelfbdrv.c b/drivers/video/intelfb/intelfbdrv.c index 5ba39999105..c94c91fd866 100644 --- a/drivers/video/intelfb/intelfbdrv.c +++ b/drivers/video/intelfb/intelfbdrv.c @@ -230,15 +230,15 @@ MODULE_DESCRIPTION("Framebuffer driver for Intel(R) " SUPPORTED_CHIPSETS MODULE_LICENSE("Dual BSD/GPL"); MODULE_DEVICE_TABLE(pci, intelfb_pci_table); -static int accel = 1; +static bool accel = 1; static int vram = 4; -static int hwcursor = 0; -static int mtrr = 1; -static int fixed = 0; -static int noinit = 0; -static int noregister = 0; -static int probeonly = 0; -static int idonly = 0; +static bool hwcursor = 0; +static bool mtrr = 1; +static bool fixed = 0; +static bool noinit = 0; +static bool noregister = 0; +static bool probeonly = 0; +static bool idonly = 0; static int bailearly = 0; static int voffset = 48; static char *mode = NULL; diff --git a/drivers/video/logo/logo.c b/drivers/video/logo/logo.c index ea7a8ccc830..080c35b34bb 100644 --- a/drivers/video/logo/logo.c +++ b/drivers/video/logo/logo.c @@ -21,7 +21,7 @@ #include #endif -static int nologo; +static bool nologo; module_param(nologo, bool, 0); MODULE_PARM_DESC(nologo, "Disables startup logo"); diff --git a/drivers/video/neofb.c b/drivers/video/neofb.c index feea7b1dc38..fb3f6739110 100644 --- a/drivers/video/neofb.c +++ b/drivers/video/neofb.c @@ -84,11 +84,11 @@ /* --------------------------------------------------------------------- */ -static int internal; -static int external; -static int libretto; -static int nostretch; -static int nopciburst; +static bool internal; +static bool external; +static bool libretto; +static bool nostretch; +static bool nopciburst; static char *mode_option __devinitdata = NULL; #ifdef MODULE diff --git a/drivers/video/omap/omapfb_main.c b/drivers/video/omap/omapfb_main.c index 25d8e510319..b291bfaac80 100644 --- a/drivers/video/omap/omapfb_main.c +++ b/drivers/video/omap/omapfb_main.c @@ -47,9 +47,9 @@ static unsigned int def_rotate; static unsigned int def_mirror; #ifdef CONFIG_FB_OMAP_MANUAL_UPDATE -static int manual_update = 1; +static bool manual_update = 1; #else -static int manual_update; +static bool manual_update; #endif static struct platform_device *fbdev_pdev; diff --git a/drivers/video/omap2/dss/core.c b/drivers/video/omap2/dss/core.c index 86ec12e16c7..da7b1857654 100644 --- a/drivers/video/omap2/dss/core.c +++ b/drivers/video/omap2/dss/core.c @@ -50,7 +50,7 @@ module_param_named(def_disp, def_disp_name, charp, 0); MODULE_PARM_DESC(def_disp, "default display name"); #ifdef DEBUG -unsigned int dss_debug; +bool dss_debug; module_param_named(debug, dss_debug, bool, 0644); #endif diff --git a/drivers/video/omap2/dss/dsi.c b/drivers/video/omap2/dss/dsi.c index 5abf8e7e745..46f37883e49 100644 --- a/drivers/video/omap2/dss/dsi.c +++ b/drivers/video/omap2/dss/dsi.c @@ -340,8 +340,8 @@ struct dsi_packet_sent_handler_data { static struct platform_device *dsi_pdev_map[MAX_NUM_DSI]; #ifdef DEBUG -static unsigned int dsi_perf; -module_param_named(dsi_perf, dsi_perf, bool, 0644); +static bool dsi_perf; +module_param(dsi_perf, bool, 0644); #endif static inline struct dsi_data *dsi_get_dsidrv_data(struct platform_device *dsidev) diff --git a/drivers/video/omap2/dss/dss.h b/drivers/video/omap2/dss/dss.h index 6308fc59fc9..57a52eecee9 100644 --- a/drivers/video/omap2/dss/dss.h 
+++ b/drivers/video/omap2/dss/dss.h @@ -28,7 +28,7 @@ #endif #ifdef DEBUG -extern unsigned int dss_debug; +extern bool dss_debug; #ifdef DSS_SUBSYS_NAME #define DSSDBG(format, ...) \ if (dss_debug) \ diff --git a/drivers/video/omap2/omapfb/omapfb-main.c b/drivers/video/omap2/omapfb/omapfb-main.c index 70aa47de714..68ba1f80008 100644 --- a/drivers/video/omap2/omapfb/omapfb-main.c +++ b/drivers/video/omap2/omapfb/omapfb-main.c @@ -43,18 +43,18 @@ static char *def_mode; static char *def_vram; -static int def_vrfb; +static bool def_vrfb; static int def_rotate; -static int def_mirror; +static bool def_mirror; static bool auto_update; static unsigned int auto_update_freq; module_param(auto_update, bool, 0); module_param(auto_update_freq, uint, 0644); #ifdef DEBUG -unsigned int omapfb_debug; +bool omapfb_debug; module_param_named(debug, omapfb_debug, bool, 0644); -static unsigned int omapfb_test_pattern; +static bool omapfb_test_pattern; module_param_named(test, omapfb_test_pattern, bool, 0644); #endif diff --git a/drivers/video/omap2/omapfb/omapfb.h b/drivers/video/omap2/omapfb/omapfb.h index fdf0edeccf4..e12d384ea52 100644 --- a/drivers/video/omap2/omapfb/omapfb.h +++ b/drivers/video/omap2/omapfb/omapfb.h @@ -32,7 +32,7 @@ #include
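All of the hunks above apply one rule: a variable registered with module_param(..., bool, ...) or module_param_named(..., bool, ...) must itself be declared as C bool rather than int or unsigned int. The following is a minimal, self-contained sketch of the resulting pattern; the module and parameter names (example_params, use_dma, debug) are illustrative only and are not taken from any of the drivers touched by this patch.

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/moduleparam.h>

/* Storage type matches the "bool" handler named in module_param(). */
static bool use_dma = true;	/* illustrative parameter, default on */
module_param(use_dma, bool, 0644);
MODULE_PARM_DESC(use_dma, "enable DMA (default: true)");

static bool debug;		/* defaults to false */
module_param(debug, bool, 0444);
MODULE_PARM_DESC(debug, "enable verbose debugging messages");

static int __init example_params_init(void)
{
	pr_info("example_params: use_dma=%d debug=%d\n", use_dma, debug);
	return 0;
}

static void __exit example_params_exit(void)
{
}

module_init(example_params_init);
module_exit(example_params_exit);

MODULE_DESCRIPTION("illustrative bool module parameters");
MODULE_LICENSE("GPL");

As with the drivers above, the values can still be set at load time ("modprobe example_params use_dma=N debug=1"); the bool handler accepts y/n/1/0, so existing command lines keep working after the int-to-bool conversion.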