From 6cb52147b254373364a2fef5df8b4aa3739c8bb6 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Tue, 31 Jul 2007 19:15:08 +0900
Subject: sysfs: fix locking in sysfs_lookup() and sysfs_rename_dir()

sd children list walking in sysfs_lookup() and sd renaming in
sysfs_rename_dir() were left out during i_mutex -> sysfs_mutex
conversion.  Fix them.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Acked-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/sysfs/dir.c | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 048e6054c2f..83e76b3813c 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -762,12 +762,15 @@ static int sysfs_count_nlink(struct sysfs_dirent *sd)
 static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry,
 				struct nameidata *nd)
 {
+	struct dentry *ret = NULL;
 	struct sysfs_dirent * parent_sd = dentry->d_parent->d_fsdata;
 	struct sysfs_dirent * sd;
 	struct bin_attribute *bin_attr;
 	struct inode *inode;
 	int found = 0;
 
+	mutex_lock(&sysfs_mutex);
+
 	for (sd = parent_sd->s_children; sd; sd = sd->s_sibling) {
 		if (sysfs_type(sd) &&
 		    !strcmp(sd->s_name, dentry->d_name.name)) {
@@ -778,14 +781,14 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry,
 
 	/* no such entry */
 	if (!found)
-		return NULL;
+		goto out_unlock;
 
 	/* attach dentry and inode */
 	inode = sysfs_get_inode(sd);
-	if (!inode)
-		return ERR_PTR(-ENOMEM);
-
-	mutex_lock(&sysfs_mutex);
+	if (!inode) {
+		ret = ERR_PTR(-ENOMEM);
+		goto out_unlock;
+	}
 
 	if (inode->i_state & I_NEW) {
 		/* initialize inode according to type */
@@ -815,9 +818,9 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry,
 	sysfs_instantiate(dentry, inode);
 	sysfs_attach_dentry(sd, dentry);
 
+ out_unlock:
 	mutex_unlock(&sysfs_mutex);
-
-	return NULL;
+	return ret;
 }
 
 const struct inode_operations sysfs_dir_inode_operations = {
@@ -942,6 +945,8 @@ int sysfs_rename_dir(struct kobject *kobj, struct sysfs_dirent *new_parent_sd,
 	if (error)
 		goto out_drop;
 
+	mutex_lock(&sysfs_mutex);
+
 	dup_name = sd->s_name;
 	sd->s_name = new_name;
 
@@ -949,8 +954,6 @@ int sysfs_rename_dir(struct kobject *kobj, struct sysfs_dirent *new_parent_sd,
 	d_add(new_dentry, NULL);
 	d_move(sd->s_dentry, new_dentry);
 
-	mutex_lock(&sysfs_mutex);
-
 	sysfs_unlink_sibling(sd);
 	sysfs_get(new_parent_sd);
 	sysfs_put(sd->s_parent);
-- 
cgit v1.2.3


From 5f1835da79df8607ecbd69f648b5b140b7a0b8ba Mon Sep 17 00:00:00 2001
From: Alan Stern <stern@rowland.harvard.edu>
Date: Thu, 16 Aug 2007 16:13:06 -0400
Subject: sysfs: don't warn on removal of a nonexistent binary file

This patch (as960) removes the error message and stack dump logged by
sysfs_remove_bin_file() when someone tries to remove a nonexistent
file.  The warning doesn't seem to be needed, since none of the other
file-, symlink-, or directory-removal routines in sysfs complain in a
comparable way.

Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
Acked-by: Tejun Heo <htejun@gmail.com>
Acked-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/sysfs/bin.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index 135353f8a29..5afe2a26f5d 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -248,12 +248,7 @@ int sysfs_create_bin_file(struct kobject * kobj, struct bin_attribute * attr)
 
 void sysfs_remove_bin_file(struct kobject * kobj, struct bin_attribute * attr)
 {
-	if (sysfs_hash_and_remove(kobj->sd, attr->attr.name) < 0) {
-		printk(KERN_ERR "%s: "
-			"bad dentry or inode or no such file: \"%s\"\n",
-			__FUNCTION__, attr->attr.name);
-		dump_stack();
-	}
+	sysfs_hash_and_remove(kobj->sd, attr->attr.name);
 }
 
 EXPORT_SYMBOL_GPL(sysfs_create_bin_file);
-- 
cgit v1.2.3


From efe567fc8281661524ffa75477a7c4ca9b466c63 Mon Sep 17 00:00:00 2001
From: Christian Borntraeger <borntraeger@de.ibm.com>
Date: Thu, 23 Aug 2007 15:18:02 +0200
Subject: sched: accounting regression since rc1

Fix the accounting regression for CONFIG_VIRT_CPU_ACCOUNTING.  It
reverts parts of commit b27f03d4bdc145a09fb7b0c0e004b29f1ee555fa by
converting fs/proc/array.c back to cputime_t.  The new functions
task_utime and task_stime now return cputime_t instead of clock_t.  If
CONFIG_VIRT_CPU_ACCOUTING is set, task->utime and task->stime are
returned directly instead of using sum_exec_runtime.

Patch is tested on s390x with and without VIRT_CPU_ACCOUTING as well as
on i386.

[ mingo@elte.hu: cleanups, comments. ]

Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 fs/proc/array.c | 44 +++++++++++++++++++++++++++++---------------
 1 file changed, 29 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/array.c b/fs/proc/array.c
index 965625a0977..ee4814dd98f 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -320,7 +320,21 @@ int proc_pid_status(struct task_struct *task, char *buffer)
 	return buffer - orig;
 }
 
-static clock_t task_utime(struct task_struct *p)
+/*
+ * Use precise platform statistics if available:
+ */
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+static cputime_t task_utime(struct task_struct *p)
+{
+	return p->utime;
+}
+
+static cputime_t task_stime(struct task_struct *p)
+{
+	return p->stime;
+}
+#else
+static cputime_t task_utime(struct task_struct *p)
 {
 	clock_t utime = cputime_to_clock_t(p->utime),
 		total = utime + cputime_to_clock_t(p->stime);
@@ -337,10 +351,10 @@ static clock_t task_utime(struct task_struct *p)
 	}
 	utime = (clock_t)temp;
 
-	return utime;
+	return clock_t_to_cputime(utime);
 }
 
-static clock_t task_stime(struct task_struct *p)
+static cputime_t task_stime(struct task_struct *p)
 {
 	clock_t stime;
 
@@ -349,10 +363,12 @@ static clock_t task_stime(struct task_struct *p)
 	 * the total, to make sure the total observed by userspace
 	 * grows monotonically - apps rely on that):
 	 */
-	stime = nsec_to_clock_t(p->se.sum_exec_runtime) - task_utime(p);
+	stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
+			cputime_to_clock_t(task_utime(p));
 
-	return stime;
+	return clock_t_to_cputime(stime);
 }
+#endif
 
 static int do_task_stat(struct task_struct *task, char *buffer, int whole)
 {
@@ -368,8 +384,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole)
 	unsigned long long start_time;
 	unsigned long cmin_flt = 0, cmaj_flt = 0;
 	unsigned long  min_flt = 0,  maj_flt = 0;
-	cputime_t cutime, cstime;
-	clock_t utime, stime;
+	cputime_t cutime, cstime, utime, stime;
 	unsigned long rsslim = 0;
 	char tcomm[sizeof(task->comm)];
 	unsigned long flags;
@@ -387,8 +402,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole)
 
 	sigemptyset(&sigign);
 	sigemptyset(&sigcatch);
-	cutime = cstime = cputime_zero;
-	utime = stime = 0;
+	cutime = cstime = utime = stime = cputime_zero;
 
 	rcu_read_lock();
 	if (lock_task_sighand(task, &flags)) {
@@ -414,15 +428,15 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole)
 			do {
 				min_flt += t->min_flt;
 				maj_flt += t->maj_flt;
-				utime += task_utime(t);
-				stime += task_stime(t);
+				utime = cputime_add(utime, task_utime(t));
+				stime = cputime_add(stime, task_stime(t));
 				t = next_thread(t);
 			} while (t != task);
 
 			min_flt += sig->min_flt;
 			maj_flt += sig->maj_flt;
-			utime += cputime_to_clock_t(sig->utime);
-			stime += cputime_to_clock_t(sig->stime);
+			utime = cputime_add(utime, sig->utime);
+			stime = cputime_add(stime, sig->stime);
 		}
 
 		sid = signal_session(sig);
@@ -471,8 +485,8 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole)
 		cmin_flt,
 		maj_flt,
 		cmaj_flt,
-		utime,
-		stime,
+		cputime_to_clock_t(utime),
+		cputime_to_clock_t(stime),
 		cputime_to_clock_t(cutime),
 		cputime_to_clock_t(cstime),
 		priority,
-- 
cgit v1.2.3


From fbcb7599e411309cf47a2b834d3546469c153cf4 Mon Sep 17 00:00:00 2001
From: Eric Van Hensbergen <ericvh@ericvh-laptop.austin.ibm.com>
Date: Thu, 23 Aug 2007 10:08:45 -0500
Subject: 9p: remove deprecated v9fs_fid_lookup_remove()

This patch removes the v9fs_fid_lookup_remove which is no longer used.

Based on original patch from Adrian Bunk <bunk@stusta.de> which
used #if 0 to isolate the code.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/fid.c | 17 -----------------
 fs/9p/fid.h |  1 -
 2 files changed, 18 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index 08fa320b7e6..15e05a15b57 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -92,23 +92,6 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
 	return fid;
 }
 
-struct p9_fid *v9fs_fid_lookup_remove(struct dentry *dentry)
-{
-	struct p9_fid *fid;
-	struct v9fs_dentry *dent;
-
-	dent = dentry->d_fsdata;
-	fid = v9fs_fid_lookup(dentry);
-	if (!IS_ERR(fid)) {
-		spin_lock(&dent->lock);
-		list_del(&fid->dlist);
-		spin_unlock(&dent->lock);
-	}
-
-	return fid;
-}
-
-
 /**
  * v9fs_fid_clone - lookup the fid for a dentry, clone a private copy and
  * 	release it
diff --git a/fs/9p/fid.h b/fs/9p/fid.h
index 47a0ba74287..26e07df783b 100644
--- a/fs/9p/fid.h
+++ b/fs/9p/fid.h
@@ -28,6 +28,5 @@ struct v9fs_dentry {
 };
 
 struct p9_fid *v9fs_fid_lookup(struct dentry *dentry);
-struct p9_fid *v9fs_fid_lookup_remove(struct dentry *dentry);
 struct p9_fid *v9fs_fid_clone(struct dentry *dentry);
 int v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid);
-- 
cgit v1.2.3


From 060d11b0b35d13eb2251fd08403d900b71df5791 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Thu, 30 Aug 2007 23:56:16 -0700
Subject: revert "eCryptfs: fix lookup error for special files"

This patch got appied twice.

Cc: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Cc: Michael Halcrow <mhalcrow@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ecryptfs/inode.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 5d40ad13ab5..131954b3fb9 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -357,10 +357,6 @@ static struct dentry *ecryptfs_lookup(struct inode *dir, struct dentry *dentry,
 		ecryptfs_printk(KERN_DEBUG, "Is a special file; returning\n");
 		goto out;
 	}
-	if (special_file(lower_inode->i_mode)) {
-		ecryptfs_printk(KERN_DEBUG, "Is a special file; returning\n");
-		goto out;
-	}
 	if (!nd) {
 		ecryptfs_printk(KERN_DEBUG, "We have a NULL nd, just leave"
 				"as we *think* we are about to unlink\n");
-- 
cgit v1.2.3


From bcec44770cc65660369ae17b4e44be027a64a46c Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 30 Aug 2007 23:56:22 -0700
Subject: UDF: handle wrong superblock better

If UDF superblock is incorrect, we can fail to find a table of free /
allocated space and consequently Oops.  Handle this situation more
gracefully by ignoring the broken UDF partition.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/udf/super.c | 26 ++++++++++++++++++++++----
 1 file changed, 22 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/udf/super.c b/fs/udf/super.c
index 382be7be5ae..c68a6e730b9 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -89,7 +89,7 @@ static int udf_find_fileset(struct super_block *, kernel_lb_addr *,
 static void udf_load_pvoldesc(struct super_block *, struct buffer_head *);
 static void udf_load_fileset(struct super_block *, struct buffer_head *,
 			     kernel_lb_addr *);
-static void udf_load_partdesc(struct super_block *, struct buffer_head *);
+static int udf_load_partdesc(struct super_block *, struct buffer_head *);
 static void udf_open_lvid(struct super_block *);
 static void udf_close_lvid(struct super_block *);
 static unsigned int udf_count_free(struct super_block *);
@@ -877,7 +877,7 @@ static void udf_load_fileset(struct super_block *sb, struct buffer_head *bh,
 		  root->logicalBlockNum, root->partitionReferenceNum);
 }
 
-static void udf_load_partdesc(struct super_block *sb, struct buffer_head *bh)
+static int udf_load_partdesc(struct super_block *sb, struct buffer_head *bh)
 {
 	struct partitionDesc *p;
 	int i;
@@ -912,6 +912,11 @@ static void udf_load_partdesc(struct super_block *sb, struct buffer_head *bh)
 
 					UDF_SB_PARTMAPS(sb)[i].s_uspace.s_table =
 						udf_iget(sb, loc);
+					if (!UDF_SB_PARTMAPS(sb)[i].s_uspace.s_table) {
+						udf_debug("cannot load unallocSpaceTable (part %d)\n",
+							i);
+						return 1;
+					}
 					UDF_SB_PARTFLAGS(sb,i) |= UDF_PART_FLAG_UNALLOC_TABLE;
 					udf_debug("unallocSpaceTable (part %d) @ %ld\n",
 						  i, UDF_SB_PARTMAPS(sb)[i].s_uspace.s_table->i_ino);
@@ -938,6 +943,11 @@ static void udf_load_partdesc(struct super_block *sb, struct buffer_head *bh)
 
 					UDF_SB_PARTMAPS(sb)[i].s_fspace.s_table =
 						udf_iget(sb, loc);
+					if (!UDF_SB_PARTMAPS(sb)[i].s_fspace.s_table) {
+						udf_debug("cannot load freedSpaceTable (part %d)\n",
+							i);
+						return 1;
+					}
 					UDF_SB_PARTFLAGS(sb,i) |= UDF_PART_FLAG_FREED_TABLE;
 					udf_debug("freedSpaceTable (part %d) @ %ld\n",
 						  i, UDF_SB_PARTMAPS(sb)[i].s_fspace.s_table->i_ino);
@@ -966,6 +976,7 @@ static void udf_load_partdesc(struct super_block *sb, struct buffer_head *bh)
 			  le16_to_cpu(p->partitionNumber), i, UDF_SB_PARTTYPE(sb,i),
 			  UDF_SB_PARTROOT(sb,i), UDF_SB_PARTLEN(sb,i));
 	}
+	return 0;
 }
 
 static int udf_load_logicalvol(struct super_block *sb, struct buffer_head *bh,
@@ -1177,12 +1188,19 @@ static int udf_process_sequence(struct super_block *sb, long block, long lastblo
 				udf_load_logicalvol(sb, bh, fileset);
 			} else if (i == VDS_POS_PARTITION_DESC) {
 				struct buffer_head *bh2 = NULL;
-				udf_load_partdesc(sb, bh);
+				if (udf_load_partdesc(sb, bh)) {
+					brelse(bh);
+					return 1;
+				}
 				for (j = vds[i].block + 1; j <  vds[VDS_POS_TERMINATING_DESC].block; j++) {
 					bh2 = udf_read_tagged(sb, j, j, &ident);
 					gd = (struct generic_desc *)bh2->b_data;
 					if (ident == TAG_IDENT_PD)
-						udf_load_partdesc(sb, bh2);
+						if (udf_load_partdesc(sb, bh2)) {
+							brelse(bh);
+							brelse(bh2);
+							return 1;
+						}
 					brelse(bh2);
 				}
 			}
-- 
cgit v1.2.3


From f5cc15dac55d4943176f84681f37aa48094ffa8b Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 30 Aug 2007 23:56:22 -0700
Subject: Fix possible NULL pointer dereference in udf_table_free_blocks()

Fix possible NULL pointer dereference when freeing blocks in case table of
free space is used.  Also fix handling of the case when we need to move
extent from one block to another one to make space for indirect extent.
BTW: Nobody seem to have ever used this code.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/udf/balloc.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index 276f7207a56..87e87dcd3f9 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -540,26 +540,24 @@ static void udf_table_free_blocks(struct super_block *sb,
 			if (epos.offset + adsize > sb->s_blocksize) {
 				loffset = epos.offset;
 				aed->lengthAllocDescs = cpu_to_le32(adsize);
-				sptr = UDF_I_DATA(inode) + epos.offset -
-					udf_file_entry_alloc_offset(inode) +
-					UDF_I_LENEATTR(inode) - adsize;
+				sptr = UDF_I_DATA(table) + epos.offset - adsize;
 				dptr = epos.bh->b_data + sizeof(struct allocExtDesc);
 				memcpy(dptr, sptr, adsize);
 				epos.offset = sizeof(struct allocExtDesc) + adsize;
 			} else {
 				loffset = epos.offset + adsize;
 				aed->lengthAllocDescs = cpu_to_le32(0);
-				sptr = oepos.bh->b_data + epos.offset;
-				epos.offset = sizeof(struct allocExtDesc);
-
 				if (oepos.bh) {
+					sptr = oepos.bh->b_data + epos.offset;
 					aed = (struct allocExtDesc *)oepos.bh->b_data;
 					aed->lengthAllocDescs =
 						cpu_to_le32(le32_to_cpu(aed->lengthAllocDescs) + adsize);
 				} else {
+					sptr = UDF_I_DATA(table) + epos.offset;
 					UDF_I_LENALLOC(table) += adsize;
 					mark_inode_dirty(table);
 				}
+				epos.offset = sizeof(struct allocExtDesc);
 			}
 			if (UDF_SB_UDFREV(sb) >= 0x0200)
 				udf_new_tag(epos.bh->b_data, TAG_IDENT_AED, 3, 1,
-- 
cgit v1.2.3


From 2aeb3db17fc33443d21b11d7121c5627d55717c6 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Thu, 30 Aug 2007 23:56:33 -0700
Subject: eCryptfs: fix possible fault in ecryptfs_sync_page

This will avoid a possible fault in ecryptfs_sync_page().

In the function, eCryptfs calls sync_page() method of a lower filesystem
without checking its existence.  However, there are many filesystems that
don't have this method including network filesystems such as NFS, AFS, and
so forth.  They may fail when an eCryptfs page is waiting for lock.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Acked-by: Michael Halcrow <mhalcrow@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ecryptfs/mmap.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index e4ab7bc14ef..fd3f94d4a66 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -834,7 +834,8 @@ static void ecryptfs_sync_page(struct page *page)
 		ecryptfs_printk(KERN_DEBUG, "find_lock_page failed\n");
 		return;
 	}
-	lower_page->mapping->a_ops->sync_page(lower_page);
+	if (lower_page->mapping->a_ops->sync_page)
+		lower_page->mapping->a_ops->sync_page(lower_page);
 	ecryptfs_printk(KERN_DEBUG, "Unlocking page with index = [0x%.16x]\n",
 			lower_page->index);
 	unlock_page(lower_page);
-- 
cgit v1.2.3


From dec4ad86c2fbea062e9ef9caa6d6e79f7c5e0b12 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Thu, 30 Aug 2007 23:56:40 -0700
Subject: hugepage: fix broken check for offset alignment in hugepage mappings

For hugepage mappings, the file offset, like the address and size, needs to
be aligned to the size of a hugepage.

In commit 68589bc353037f233fe510ad9ff432338c95db66, the check for this was
moved into prepare_hugepage_range() along with the address and size checks.
 But since BenH's rework of the get_unmapped_area() paths leading up to
commit 4b1d89290b62bb2db476c94c82cf7442aab440c8, prepare_hugepage_range()
is only called for MAP_FIXED mappings, not for other mappings.  This means
we're no longer ever checking for an aligned offset - I've confirmed that
mmap() will (apparently) succeed with a misaligned offset on both powerpc
and i386 at least.

This patch restores the check, removing it from prepare_hugepage_range()
and putting it back into hugetlbfs_file_mmap().  I'm putting it there,
rather than in the get_unmapped_area() path so it only needs to go in one
place, than separately in the half-dozen or so arch-specific
implementations of hugetlb_get_unmapped_area().

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Cc: Adam Litke <agl@us.ibm.com>
Cc: Andi Kleen <ak@suse.de>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/i386/mm/hugetlbpage.c    |  2 +-
 arch/ia64/mm/hugetlbpage.c    |  6 ++----
 arch/sparc64/mm/hugetlbpage.c |  2 +-
 fs/hugetlbfs/inode.c          | 15 ++++++++++-----
 include/linux/hugetlb.h       | 10 +++-------
 5 files changed, 17 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/arch/i386/mm/hugetlbpage.c b/arch/i386/mm/hugetlbpage.c
index efdf95ac803..6c06d9c0488 100644
--- a/arch/i386/mm/hugetlbpage.c
+++ b/arch/i386/mm/hugetlbpage.c
@@ -367,7 +367,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 		return -ENOMEM;
 
 	if (flags & MAP_FIXED) {
-		if (prepare_hugepage_range(addr, len, pgoff))
+		if (prepare_hugepage_range(addr, len))
 			return -EINVAL;
 		return addr;
 	}
diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c
index d22861c5b04..a9ff685aea2 100644
--- a/arch/ia64/mm/hugetlbpage.c
+++ b/arch/ia64/mm/hugetlbpage.c
@@ -75,10 +75,8 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
  * Don't actually need to do any preparation, but need to make sure
  * the address is in the right region.
  */
-int prepare_hugepage_range(unsigned long addr, unsigned long len, pgoff_t pgoff)
+int prepare_hugepage_range(unsigned long addr, unsigned long len)
 {
-	if (pgoff & (~HPAGE_MASK >> PAGE_SHIFT))
-		return -EINVAL;
 	if (len & ~HPAGE_MASK)
 		return -EINVAL;
 	if (addr & ~HPAGE_MASK)
@@ -151,7 +149,7 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, u
 
 	/* Handle MAP_FIXED */
 	if (flags & MAP_FIXED) {
-		if (prepare_hugepage_range(addr, len, pgoff))
+		if (prepare_hugepage_range(addr, len))
 			return -EINVAL;
 		return addr;
 	}
diff --git a/arch/sparc64/mm/hugetlbpage.c b/arch/sparc64/mm/hugetlbpage.c
index eaba9b70b18..6cfab2e4d34 100644
--- a/arch/sparc64/mm/hugetlbpage.c
+++ b/arch/sparc64/mm/hugetlbpage.c
@@ -175,7 +175,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 		return -ENOMEM;
 
 	if (flags & MAP_FIXED) {
-		if (prepare_hugepage_range(addr, len, pgoff))
+		if (prepare_hugepage_range(addr, len))
 			return -EINVAL;
 		return addr;
 	}
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index c848a191525..950c2fbb815 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -82,14 +82,19 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 	int ret;
 
 	/*
-	 * vma alignment has already been checked by prepare_hugepage_range.
-	 * If you add any error returns here, do so after setting VM_HUGETLB,
-	 * so is_vm_hugetlb_page tests below unmap_region go the right way
-	 * when do_mmap_pgoff unwinds (may be important on powerpc and ia64).
+	 * vma address alignment (but not the pgoff alignment) has
+	 * already been checked by prepare_hugepage_range.  If you add
+	 * any error returns here, do so after setting VM_HUGETLB, so
+	 * is_vm_hugetlb_page tests below unmap_region go the right
+	 * way when do_mmap_pgoff unwinds (may be important on powerpc
+	 * and ia64).
 	 */
 	vma->vm_flags |= VM_HUGETLB | VM_RESERVED;
 	vma->vm_ops = &hugetlb_vm_ops;
 
+	if (vma->vm_pgoff & ~(HPAGE_MASK >> PAGE_SHIFT))
+		return -EINVAL;
+
 	vma_len = (loff_t)(vma->vm_end - vma->vm_start);
 
 	mutex_lock(&inode->i_mutex);
@@ -132,7 +137,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 		return -ENOMEM;
 
 	if (flags & MAP_FIXED) {
-		if (prepare_hugepage_range(addr, len, pgoff))
+		if (prepare_hugepage_range(addr, len))
 			return -EINVAL;
 		return addr;
 	}
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index e6a71c82d20..3a19b032c0e 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -66,11 +66,8 @@ void hugetlb_free_pgd_range(struct mmu_gather **tlb, unsigned long addr,
  * If the arch doesn't supply something else, assume that hugepage
  * size aligned regions are ok without further preparation.
  */
-static inline int prepare_hugepage_range(unsigned long addr, unsigned long len,
-						pgoff_t pgoff)
+static inline int prepare_hugepage_range(unsigned long addr, unsigned long len)
 {
-	if (pgoff & (~HPAGE_MASK >> PAGE_SHIFT))
-		return -EINVAL;
 	if (len & ~HPAGE_MASK)
 		return -EINVAL;
 	if (addr & ~HPAGE_MASK)
@@ -78,8 +75,7 @@ static inline int prepare_hugepage_range(unsigned long addr, unsigned long len,
 	return 0;
 }
 #else
-int prepare_hugepage_range(unsigned long addr, unsigned long len,
-						pgoff_t pgoff);
+int prepare_hugepage_range(unsigned long addr, unsigned long len);
 #endif
 
 #ifndef ARCH_HAS_SETCLEAR_HUGE_PTE
@@ -117,7 +113,7 @@ static inline unsigned long hugetlb_total_pages(void)
 #define hugetlb_report_meminfo(buf)		0
 #define hugetlb_report_node_meminfo(n, buf)	0
 #define follow_huge_pmd(mm, addr, pmd, write)	NULL
-#define prepare_hugepage_range(addr,len,pgoff)	(-EINVAL)
+#define prepare_hugepage_range(addr,len)	(-EINVAL)
 #define pmd_huge(x)	0
 #define is_hugepage_only_range(mm, addr, len)	0
 #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; })
-- 
cgit v1.2.3


From e89a5a43b95cdc4305b7c8e8121a380f02476636 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 31 Aug 2007 10:45:17 -0400
Subject: NFS: Fix the mount regression

This avoids the recent NFS mount regression (returning EBUSY when
mounting the same filesystem twice with different parameters).

The best I can do given the constraints appears to be to have the kernel
first look for a superblock that matches both the fsid and the
user-specified mount options, and then spawn off a new superblock if
that search fails.

Note that this is not the same as specifying nosharecache everywhere
since nosharecache will never attempt to match an existing superblock.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Tested-by: Hua Zhong <hzhong@gmail.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/nfs/super.c | 110 +++++++++++++++++++++++++++++++++------------------------
 1 file changed, 64 insertions(+), 46 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index b2a851c1b8c..46139003ea0 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1303,34 +1303,6 @@ static void nfs_clone_super(struct super_block *sb,
  	nfs_initialise_sb(sb);
 }
 
-static int nfs_set_super(struct super_block *s, void *_server)
-{
-	struct nfs_server *server = _server;
-	int ret;
-
-	s->s_fs_info = server;
-	ret = set_anon_super(s, server);
-	if (ret == 0)
-		server->s_dev = s->s_dev;
-	return ret;
-}
-
-static int nfs_compare_super(struct super_block *sb, void *data)
-{
-	struct nfs_server *server = data, *old = NFS_SB(sb);
-
-	if (memcmp(&old->nfs_client->cl_addr,
-				&server->nfs_client->cl_addr,
-				sizeof(old->nfs_client->cl_addr)) != 0)
-		return 0;
-	/* Note: NFS_MOUNT_UNSHARED == NFS4_MOUNT_UNSHARED */
-	if (old->flags & NFS_MOUNT_UNSHARED)
-		return 0;
-	if (memcmp(&old->fsid, &server->fsid, sizeof(old->fsid)) != 0)
-		return 0;
-	return 1;
-}
-
 #define NFS_MS_MASK (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_SYNCHRONOUS)
 
 static int nfs_compare_mount_options(const struct super_block *s, const struct nfs_server *b, int flags)
@@ -1359,9 +1331,46 @@ static int nfs_compare_mount_options(const struct super_block *s, const struct n
 		goto Ebusy;
 	if (clnt_a->cl_auth->au_flavor != clnt_b->cl_auth->au_flavor)
 		goto Ebusy;
-	return 0;
+	return 1;
 Ebusy:
-	return -EBUSY;
+	return 0;
+}
+
+struct nfs_sb_mountdata {
+	struct nfs_server *server;
+	int mntflags;
+};
+
+static int nfs_set_super(struct super_block *s, void *data)
+{
+	struct nfs_sb_mountdata *sb_mntdata = data;
+	struct nfs_server *server = sb_mntdata->server;
+	int ret;
+
+	s->s_flags = sb_mntdata->mntflags;
+	s->s_fs_info = server;
+	ret = set_anon_super(s, server);
+	if (ret == 0)
+		server->s_dev = s->s_dev;
+	return ret;
+}
+
+static int nfs_compare_super(struct super_block *sb, void *data)
+{
+	struct nfs_sb_mountdata *sb_mntdata = data;
+	struct nfs_server *server = sb_mntdata->server, *old = NFS_SB(sb);
+	int mntflags = sb_mntdata->mntflags;
+
+	if (memcmp(&old->nfs_client->cl_addr,
+				&server->nfs_client->cl_addr,
+				sizeof(old->nfs_client->cl_addr)) != 0)
+		return 0;
+	/* Note: NFS_MOUNT_UNSHARED == NFS4_MOUNT_UNSHARED */
+	if (old->flags & NFS_MOUNT_UNSHARED)
+		return 0;
+	if (memcmp(&old->fsid, &server->fsid, sizeof(old->fsid)) != 0)
+		return 0;
+	return nfs_compare_mount_options(sb, server, mntflags);
 }
 
 static int nfs_get_sb(struct file_system_type *fs_type,
@@ -1373,6 +1382,9 @@ static int nfs_get_sb(struct file_system_type *fs_type,
 	struct nfs_mount_data *data = raw_data;
 	struct dentry *mntroot;
 	int (*compare_super)(struct super_block *, void *) = nfs_compare_super;
+	struct nfs_sb_mountdata sb_mntdata = {
+		.mntflags = flags,
+	};
 	int error;
 
 	/* Validate the mount data */
@@ -1386,28 +1398,25 @@ static int nfs_get_sb(struct file_system_type *fs_type,
 		error = PTR_ERR(server);
 		goto out;
 	}
+	sb_mntdata.server = server;
 
 	if (server->flags & NFS_MOUNT_UNSHARED)
 		compare_super = NULL;
 
 	/* Get a superblock - note that we may end up sharing one that already exists */
-	s = sget(fs_type, compare_super, nfs_set_super, server);
+	s = sget(fs_type, compare_super, nfs_set_super, &sb_mntdata);
 	if (IS_ERR(s)) {
 		error = PTR_ERR(s);
 		goto out_err_nosb;
 	}
 
 	if (s->s_fs_info != server) {
-		error = nfs_compare_mount_options(s, server, flags);
 		nfs_free_server(server);
 		server = NULL;
-		if (error < 0)
-			goto error_splat_super;
 	}
 
 	if (!s->s_root) {
 		/* initial superblock/root creation */
-		s->s_flags = flags;
 		nfs_fill_super(s, data);
 	}
 
@@ -1460,6 +1469,9 @@ static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags,
 	struct nfs_server *server;
 	struct dentry *mntroot;
 	int (*compare_super)(struct super_block *, void *) = nfs_compare_super;
+	struct nfs_sb_mountdata sb_mntdata = {
+		.mntflags = flags,
+	};
 	int error;
 
 	dprintk("--> nfs_xdev_get_sb()\n");
@@ -1470,28 +1482,25 @@ static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags,
 		error = PTR_ERR(server);
 		goto out_err_noserver;
 	}
+	sb_mntdata.server = server;
 
 	if (server->flags & NFS_MOUNT_UNSHARED)
 		compare_super = NULL;
 
 	/* Get a superblock - note that we may end up sharing one that already exists */
-	s = sget(&nfs_fs_type, compare_super, nfs_set_super, server);
+	s = sget(&nfs_fs_type, compare_super, nfs_set_super, &sb_mntdata);
 	if (IS_ERR(s)) {
 		error = PTR_ERR(s);
 		goto out_err_nosb;
 	}
 
 	if (s->s_fs_info != server) {
-		error = nfs_compare_mount_options(s, server, flags);
 		nfs_free_server(server);
 		server = NULL;
-		if (error < 0)
-			goto error_splat_super;
 	}
 
 	if (!s->s_root) {
 		/* initial superblock/root creation */
-		s->s_flags = flags;
 		nfs_clone_super(s, data->sb);
 	}
 
@@ -1729,6 +1738,9 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
 	struct dentry *mntroot;
 	char *mntpath = NULL, *hostname = NULL, *ip_addr = NULL;
 	int (*compare_super)(struct super_block *, void *) = nfs_compare_super;
+	struct nfs_sb_mountdata sb_mntdata = {
+		.mntflags = flags,
+	};
 	int error;
 
 	/* Validate the mount data */
@@ -1744,12 +1756,13 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
 		error = PTR_ERR(server);
 		goto out;
 	}
+	sb_mntdata.server = server;
 
 	if (server->flags & NFS4_MOUNT_UNSHARED)
 		compare_super = NULL;
 
 	/* Get a superblock - note that we may end up sharing one that already exists */
-	s = sget(fs_type, compare_super, nfs_set_super, server);
+	s = sget(fs_type, compare_super, nfs_set_super, &sb_mntdata);
 	if (IS_ERR(s)) {
 		error = PTR_ERR(s);
 		goto out_free;
@@ -1762,7 +1775,6 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
 
 	if (!s->s_root) {
 		/* initial superblock/root creation */
-		s->s_flags = flags;
 		nfs4_fill_super(s);
 	}
 
@@ -1816,6 +1828,9 @@ static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags,
 	struct nfs_server *server;
 	struct dentry *mntroot;
 	int (*compare_super)(struct super_block *, void *) = nfs_compare_super;
+	struct nfs_sb_mountdata sb_mntdata = {
+		.mntflags = flags,
+	};
 	int error;
 
 	dprintk("--> nfs4_xdev_get_sb()\n");
@@ -1826,12 +1841,13 @@ static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags,
 		error = PTR_ERR(server);
 		goto out_err_noserver;
 	}
+	sb_mntdata.server = server;
 
 	if (server->flags & NFS4_MOUNT_UNSHARED)
 		compare_super = NULL;
 
 	/* Get a superblock - note that we may end up sharing one that already exists */
-	s = sget(&nfs_fs_type, compare_super, nfs_set_super, server);
+	s = sget(&nfs_fs_type, compare_super, nfs_set_super, &sb_mntdata);
 	if (IS_ERR(s)) {
 		error = PTR_ERR(s);
 		goto out_err_nosb;
@@ -1844,7 +1860,6 @@ static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags,
 
 	if (!s->s_root) {
 		/* initial superblock/root creation */
-		s->s_flags = flags;
 		nfs4_clone_super(s, data->sb);
 	}
 
@@ -1887,6 +1902,9 @@ static int nfs4_referral_get_sb(struct file_system_type *fs_type, int flags,
 	struct dentry *mntroot;
 	struct nfs_fh mntfh;
 	int (*compare_super)(struct super_block *, void *) = nfs_compare_super;
+	struct nfs_sb_mountdata sb_mntdata = {
+		.mntflags = flags,
+	};
 	int error;
 
 	dprintk("--> nfs4_referral_get_sb()\n");
@@ -1897,12 +1915,13 @@ static int nfs4_referral_get_sb(struct file_system_type *fs_type, int flags,
 		error = PTR_ERR(server);
 		goto out_err_noserver;
 	}
+	sb_mntdata.server = server;
 
 	if (server->flags & NFS4_MOUNT_UNSHARED)
 		compare_super = NULL;
 
 	/* Get a superblock - note that we may end up sharing one that already exists */
-	s = sget(&nfs_fs_type, compare_super, nfs_set_super, server);
+	s = sget(&nfs_fs_type, compare_super, nfs_set_super, &sb_mntdata);
 	if (IS_ERR(s)) {
 		error = PTR_ERR(s);
 		goto out_err_nosb;
@@ -1915,7 +1934,6 @@ static int nfs4_referral_get_sb(struct file_system_type *fs_type, int flags,
 
 	if (!s->s_root) {
 		/* initial superblock/root creation */
-		s->s_flags = flags;
 		nfs4_fill_super(s);
 	}
 
-- 
cgit v1.2.3


From 560aef74503e928f44ddbf481b8b02d9cef37dbf Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 27 Aug 2007 09:14:56 -0400
Subject: NFS: Fix use of cancel_delayed_work_sync in
 nfs_release_automount_timer

Doh! We can't use cancel_delayed_work_sync because we may have been called
from an unmount that was being performed by nfs_automount_task.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/namespace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index aea76d0e5fb..acfc56f9edc 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -176,7 +176,7 @@ static void nfs_expire_automounts(struct work_struct *work)
 void nfs_release_automount_timer(void)
 {
 	if (list_empty(&nfs_automount_list))
-		cancel_delayed_work_sync(&nfs_automount_task);
+		cancel_delayed_work(&nfs_automount_task);
 }
 
 /*
-- 
cgit v1.2.3


From 65bbf6bdbba0f74e254f706bfb00fe533974f4f1 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 27 Aug 2007 09:57:46 -0400
Subject: NFSv4: Fix a typo in _nfs4_do_open_reclaim

This should fix the following Oops reported by Jeff Garzik:

kernel BUG at fs/nfs/nfs4xdr.c:1040!
invalid opcode: 0000 [1] SMP
CPU 0
Modules linked in: nfs lockd sunrpc af_packet
ipv6 cpufreq_ondemand acpi_cpufreq battery floppy nvram sg snd_hda_intel
ata_generic snd_pcm_oss snd_mixer_oss snd_pcm i2c_i801 snd_page_alloc e1000
firewire_ohci ata_piix i2c_core sr_mod cdrom sata_sil ahci libata sd_mod
scsi_mod ext3 jbd ehci_hcd uhci_hcd
Pid: 16353, comm: 10.10.10.1-recl Not tainted 2.6.23-rc3 #1
RIP: 0010:[<ffffffff88240980>] [<ffffffff88240980>] :nfs:encode_open+0x1c0/0x330
RSP: 0018:ffff8100467c5c60  EFLAGS: 00010202
RAX: ffff81000f89b8b8 RBX: 00000000697a6f6d RCX: ffff81000f89b8b8
RDX: 0000000000000004 RSI: 0000000000000004 RDI: ffff8100467c5c80
RBP: ffff8100467c5c80 R08: ffff81000f89bc30 R09: ffff81000f89b83f
R10: 0000000000000001 R11: ffffffff881e79e0 R12: ffff81003cbd1808
R13: ffff81000f89b860 R14: ffff81005fc984e0 R15: ffffffff88240af0
FS:  0000000000000000(0000) GS:ffffffff8052a000(0000) knlGS:0000000000000000
CS:  0010 DS: 0018 ES: 0018 CR0: 000000008005003b
CR2: 00002adb9e51a030 CR3: 000000007ea7e000 CR4: 00000000000006e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Process 10.10.10.1-recl (pid: 16353, threadinfo ffff8100467c4000, task ffff8100038ce780)
Stack:  ffff81004aeb6a40 ffff81003cbd1808 ffff81003cbd1808 ffffffff88240b5d
 ffff81000f89b8bc ffff81005fc984e8 ffff81000f89bc30 ffff81005fc984e8
 0000000300000000 0000000000000000 0000000000000000 ffff81003cbd1800
Call Trace:
 [<ffffffff88240b5d>] :nfs:nfs4_xdr_enc_open_noattr+0x6d/0x90
 [<ffffffff881e74b7>] :sunrpc:rpcauth_wrap_req+0x97/0xf0
 [<ffffffff88240af0>] :nfs:nfs4_xdr_enc_open_noattr+0x0/0x90
 [<ffffffff881df57a>] :sunrpc:call_transmit+0x18a/0x290
 [<ffffffff881e5e7b>] :sunrpc:__rpc_execute+0x6b/0x290
 [<ffffffff881dff76>] :sunrpc:rpc_do_run_task+0x76/0xd0
 [<ffffffff882373f6>] :nfs:_nfs4_proc_open+0x76/0x230
 [<ffffffff88237a2e>] :nfs:nfs4_open_recover_helper+0x5e/0xc0
 [<ffffffff88237b74>] :nfs:nfs4_open_recover+0xe4/0x120
 [<ffffffff88238e14>] :nfs:nfs4_open_reclaim+0xa4/0xf0
 [<ffffffff882413c5>] :nfs:nfs4_reclaim_open_state+0x55/0x1b0
 [<ffffffff882417ea>] :nfs:reclaimer+0x2ca/0x390
 [<ffffffff88241520>] :nfs:reclaimer+0x0/0x390
 [<ffffffff8024e59b>] kthread+0x4b/0x80
 [<ffffffff8020cad8>] child_rip+0xa/0x12
 [<ffffffff8024e550>] kthread+0x0/0x80
 [<ffffffff8020cace>] child_rip+0x0/0x12


Code: 0f 0b eb fe 48 89 ef c7 00 00 00 00 02 be 08 00 00 00 e8 79
RIP  [<ffffffff88240980>] :nfs:encode_open+0x1c0/0x330
 RSP <ffff8100467c5c60>

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 62b3ae28031..036d8625b4f 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -646,7 +646,7 @@ static int _nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state
 	rcu_read_lock();
 	delegation = rcu_dereference(NFS_I(state->inode)->delegation);
 	if (delegation != NULL && (delegation->flags & NFS_DELEGATION_NEED_RECLAIM) != 0)
-		delegation_type = delegation->flags;
+		delegation_type = delegation->type;
 	rcu_read_unlock();
 	opendata->o_arg.u.delegation_type = delegation_type;
 	status = nfs4_open_recover(opendata, state);
-- 
cgit v1.2.3


From deee9369b993b52a8e2f25683b4a67be7a65e8ae Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 27 Aug 2007 11:33:00 -0400
Subject: NFSv4: Ensure that we pass the correct dentry to nfs4_intent_set_file

This patch fixes an Oops that was reported by Gabriel Barazer.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 036d8625b4f..4b90e17555a 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1434,7 +1434,7 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
 	}
 	res = d_add_unique(dentry, igrab(state->inode));
 	if (res != NULL)
-		dentry = res;
+		path.dentry = res;
 	nfs4_intent_set_file(nd, &path, state);
 	return res;
 }
-- 
cgit v1.2.3


From fdb66ff4ace3c4e18809b9b71dfcce1692143147 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Wed, 29 Aug 2007 17:58:57 -0400
Subject: NFS: mount option parser chokes on proto=

The new text-based NFS mount option parsing logic doesn't recognize any
valid transport protocols due to a silly mistake in the protocol token
matching logic.  This prevents basic mount requests such as:

   mount.nfs server:/export /mnt -o proto=tcp

from working with the new text-based NFS mount API.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/super.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 46139003ea0..66a223b22a8 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -911,13 +911,13 @@ static int nfs_parse_mount_options(char *raw,
 			kfree(string);
 
 			switch (token) {
-			case Opt_udp:
+			case Opt_xprt_udp:
 				mnt->flags &= ~NFS_MOUNT_TCP;
 				mnt->nfs_server.protocol = IPPROTO_UDP;
 				mnt->timeo = 7;
 				mnt->retrans = 5;
 				break;
-			case Opt_tcp:
+			case Opt_xprt_tcp:
 				mnt->flags |= NFS_MOUNT_TCP;
 				mnt->nfs_server.protocol = IPPROTO_TCP;
 				mnt->timeo = 600;
@@ -936,10 +936,10 @@ static int nfs_parse_mount_options(char *raw,
 			kfree(string);
 
 			switch (token) {
-			case Opt_udp:
+			case Opt_xprt_udp:
 				mnt->mount_server.protocol = IPPROTO_UDP;
 				break;
-			case Opt_tcp:
+			case Opt_xprt_tcp:
 				mnt->mount_server.protocol = IPPROTO_TCP;
 				break;
 			default:
-- 
cgit v1.2.3


From fdc6e2c8c0dc0ac702fca0b802f5d9ae99a54bb6 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Wed, 29 Aug 2007 17:58:59 -0400
Subject: NFS: Return a real error code from mount(2)

Don't filter the return code from the in-kernel rpcbind or NFS mount
clients.  Return the real error code so that callers of the new NFS
text-based mount API can apply a useful retry strategy.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 66a223b22a8..9cd0828010c 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1166,7 +1166,7 @@ static int nfs_validate_mount_data(struct nfs_mount_data **options,
 
 		status = nfs_try_mount(&args, mntfh);
 		if (status)
-			return -EINVAL;
+			return status;
 
 		/*
 		 * Translate to nfs_mount_data, which nfs_fill_super
-- 
cgit v1.2.3


From 350c73af6af51ae7654dad91874c0d30dd13bbbe Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Wed, 29 Aug 2007 17:59:01 -0400
Subject: NFS: Off-by-one length error in string handling

The hostname was getting truncated in the new text-based NFS mount API.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 9cd0828010c..ef3643284f7 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1153,7 +1153,7 @@ static int nfs_validate_mount_data(struct nfs_mount_data **options,
 		c = strchr(dev_name, ':');
 		if (c == NULL)
 			return -EINVAL;
-		len = c - dev_name - 1;
+		len = c - dev_name;
 		if (len > sizeof(data->hostname))
 			return -EINVAL;
 		strncpy(data->hostname, dev_name, len);
-- 
cgit v1.2.3


From 7d1cca72994c0e910ca443076dcfcfd473871dda Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Wed, 29 Aug 2007 17:59:03 -0400
Subject: NFS: change NFS mount error return when hostname/pathname too long

According to the mount(2) man page, the proper error return code for the
mount(2) system call when the special device name or the mounted-on
directory name is too long is ENAMETOOLONG.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/super.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index ef3643284f7..8ed593766f1 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1155,13 +1155,13 @@ static int nfs_validate_mount_data(struct nfs_mount_data **options,
 			return -EINVAL;
 		len = c - dev_name;
 		if (len > sizeof(data->hostname))
-			return -EINVAL;
+			return -ENAMETOOLONG;
 		strncpy(data->hostname, dev_name, len);
 		args.nfs_server.hostname = data->hostname;
 
 		c++;
 		if (strlen(c) > NFS_MAXPATHLEN)
-			return -EINVAL;
+			return -ENAMETOOLONG;
 		args.nfs_server.export_path = c;
 
 		status = nfs_try_mount(&args, mntfh);
@@ -1677,7 +1677,7 @@ static int nfs4_validate_mount_data(struct nfs4_mount_data **options,
 		/* while calculating len, pretend ':' is '\0' */
 		len = c - dev_name;
 		if (len > NFS4_MAXNAMLEN)
-			return -EINVAL;
+			return -ENAMETOOLONG;
 		*hostname = kzalloc(len, GFP_KERNEL);
 		if (*hostname == NULL)
 			return -ENOMEM;
@@ -1686,7 +1686,7 @@ static int nfs4_validate_mount_data(struct nfs4_mount_data **options,
 		c++;			/* step over the ':' */
 		len = strlen(c);
 		if (len > NFS4_MAXPATHLEN)
-			return -EINVAL;
+			return -ENAMETOOLONG;
 		*mntpath = kzalloc(len + 1, GFP_KERNEL);
 		if (*mntpath == NULL)
 			return -ENOMEM;
-- 
cgit v1.2.3


From 1b3b4a1a2deb7d3e5d66063bd76304d840c966b3 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 28 Aug 2007 10:29:36 -0400
Subject: NFS: Fix a write request leak in nfs_invalidate_page()

Ryusuke Konishi says:

The recent truncate_complete_page() clears the dirty flag from a page
before calling a_ops->invalidatepage(),
^^^^^^
static void
truncate_complete_page(struct address_space *mapping, struct page *page)
{
        ...
        cancel_dirty_page(page, PAGE_CACHE_SIZE);  <--- Inserted here at
kernel 2.6.20

        if (PagePrivate(page))
                do_invalidatepage(page, 0);   ---> will call
a_ops->invalidatepage()
        ...
}

and this is disturbing nfs_wb_page_priority() from calling
nfs_writepage_locked() that is expected to handle the pending
request (=nfs_page) associated with the page.

int nfs_wb_page_priority(struct inode *inode, struct page *page, int how)
{
        ...
        if (clear_page_dirty_for_io(page)) {
                ret = nfs_writepage_locked(page, &wbc);
                if (ret < 0)
                        goto out;
        }
        ...
}

Since truncate_complete_page() will get rid of the page after
a_ops->invalidatepage() returns, the request (=nfs_page) associated
with the page becomes a garbage in nfs_inode->nfs_page_tree.
------------------------

Fix this by ensuring that nfs_wb_page_priority() recognises that it may
also need to clear out non-dirty pages that have an nfs_page associated
with them.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/file.c          |  2 +-
 fs/nfs/write.c         | 44 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/nfs_fs.h |  1 +
 3 files changed, 46 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index c87dc713b5d..579cf8a7d4a 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -316,7 +316,7 @@ static void nfs_invalidate_page(struct page *page, unsigned long offset)
 	if (offset != 0)
 		return;
 	/* Cancel any unstarted writes on this page */
-	nfs_wb_page_priority(page->mapping->host, page, FLUSH_INVALIDATE);
+	nfs_wb_page_cancel(page->mapping->host, page);
 }
 
 static int nfs_release_page(struct page *page, gfp_t gfp)
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index ef97e0c0f5b..0d7a77cc394 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1396,6 +1396,50 @@ out:
 	return ret;
 }
 
+int nfs_wb_page_cancel(struct inode *inode, struct page *page)
+{
+	struct nfs_page *req;
+	loff_t range_start = page_offset(page);
+	loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1);
+	struct writeback_control wbc = {
+		.bdi = page->mapping->backing_dev_info,
+		.sync_mode = WB_SYNC_ALL,
+		.nr_to_write = LONG_MAX,
+		.range_start = range_start,
+		.range_end = range_end,
+	};
+	int ret = 0;
+
+	BUG_ON(!PageLocked(page));
+	for (;;) {
+		req = nfs_page_find_request(page);
+		if (req == NULL)
+			goto out;
+		if (test_bit(PG_NEED_COMMIT, &req->wb_flags)) {
+			nfs_release_request(req);
+			break;
+		}
+		if (nfs_lock_request_dontget(req)) {
+			nfs_inode_remove_request(req);
+			/*
+			 * In case nfs_inode_remove_request has marked the
+			 * page as being dirty
+			 */
+			cancel_dirty_page(page, PAGE_CACHE_SIZE);
+			nfs_unlock_request(req);
+			break;
+		}
+		ret = nfs_wait_on_request(req);
+		if (ret < 0)
+			goto out;
+	}
+	if (!PagePrivate(page))
+		return 0;
+	ret = nfs_sync_mapping_wait(page->mapping, &wbc, FLUSH_INVALIDATE);
+out:
+	return ret;
+}
+
 int nfs_wb_page_priority(struct inode *inode, struct page *page, int how)
 {
 	loff_t range_start = page_offset(page);
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 157dcb055b5..7250eeadd7b 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -431,6 +431,7 @@ extern int nfs_sync_mapping_range(struct address_space *, loff_t, loff_t, int);
 extern int nfs_wb_all(struct inode *inode);
 extern int nfs_wb_page(struct inode *inode, struct page* page);
 extern int nfs_wb_page_priority(struct inode *inode, struct page* page, int how);
+extern int nfs_wb_page_cancel(struct inode *inode, struct page* page);
 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
 extern int  nfs_commit_inode(struct inode *, int);
 extern struct nfs_write_data *nfs_commit_alloc(void);
-- 
cgit v1.2.3


From fc0e01974ccccc7530b7634a63ee3fcc57b845ea Mon Sep 17 00:00:00 2001
From: Jason Lunz <lunz@falooley.org>
Date: Sat, 1 Sep 2007 12:06:03 -0700
Subject: [JFFS2] fix write deadlock regression

I've bisected the deadlock when many small appends are done on jffs2 down to
this commit:

commit 6fe6900e1e5b6fa9e5c59aa5061f244fe3f467e2
Author: Nick Piggin <npiggin@suse.de>
Date:   Sun May 6 14:49:04 2007 -0700

    mm: make read_cache_page synchronous

    Ensure pages are uptodate after returning from read_cache_page, which allows
    us to cut out most of the filesystem-internal PageUptodate calls.

    I didn't have a great look down the call chains, but this appears to fixes 7
    possible use-before uptodate in hfs, 2 in hfsplus, 1 in jfs, a few in
    ecryptfs, 1 in jffs2, and a possible cleared data overwritten with readpage in
    block2mtd.  All depending on whether the filler is async and/or can return
    with a !uptodate page.

It introduced a wait to read_cache_page, as well as a
read_cache_page_async function equivalent to the old read_cache_page
without any callers.

Switching jffs2_gc_fetch_page to read_cache_page_async for the old
behavior makes the deadlocks go away, but maybe reintroduces the
use-before-uptodate problem? I don't understand the mm/fs interaction
well enough to say.

[It's fine. dwmw2.]

Signed-off-by: Jason Lunz <lunz@falooley.org>
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/jffs2/fs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 1d3b7a9fc82..8bc727b7169 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -627,7 +627,7 @@ unsigned char *jffs2_gc_fetch_page(struct jffs2_sb_info *c,
 	struct inode *inode = OFNI_EDONI_2SFFJ(f);
 	struct page *pg;
 
-	pg = read_cache_page(inode->i_mapping, offset >> PAGE_CACHE_SHIFT,
+	pg = read_cache_page_async(inode->i_mapping, offset >> PAGE_CACHE_SHIFT,
 			     (void *)jffs2_do_readpage_unlock, inode);
 	if (IS_ERR(pg))
 		return (void *)pg;
-- 
cgit v1.2.3


From 8da22d7a3690818f6d340baa0ea585e71f0c506f Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 16 Aug 2007 15:20:56 +1000
Subject: [XFS] Set filestreams object timeout to something sane.

SGI-PV: 968554
SGI-Modid: xfs-linux-melb:xfs-kern:29303a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Tim Shimmin <tes@sgi.com>
---
 fs/xfs/linux-2.6/xfs_globals.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_globals.c b/fs/xfs/linux-2.6/xfs_globals.c
index bb72c3d4141..81565dea9af 100644
--- a/fs/xfs/linux-2.6/xfs_globals.c
+++ b/fs/xfs/linux-2.6/xfs_globals.c
@@ -46,7 +46,7 @@ xfs_param_t xfs_params = {
 	.inherit_nosym	= {	0,		0,		1	},
 	.rotorstep	= {	1,		1,		255	},
 	.inherit_nodfrg	= {	0,		1,		1	},
-	.fstrm_timer	= {	1,		50,		3600*100},
+	.fstrm_timer	= {	1,		30*100,		3600*100},
 };
 
 /*
-- 
cgit v1.2.3


From 4b80916b29170744632356dd2e801f7c374676eb Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Thu, 16 Aug 2007 15:37:36 +1000
Subject: [XFS] Fix sparse NULL vs 0 warnings

Sparse now warns about comparing pointers to 0, so change all instance
where that happens to NULL instead.

SGI-PV: 968555
SGI-Modid: xfs-linux-melb:xfs-kern:29308a

Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Tim Shimmin <tes@sgi.com>
---
 fs/xfs/xfs_log.c         | 12 ++++++------
 fs/xfs/xfs_log_recover.c | 12 ++++++------
 2 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 9d4c4fbeb3e..9bfb69e1e88 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -2185,13 +2185,13 @@ xlog_state_do_callback(
 			}
 			cb = iclog->ic_callback;
 
-			while (cb != 0) {
+			while (cb) {
 				iclog->ic_callback_tail = &(iclog->ic_callback);
 				iclog->ic_callback = NULL;
 				LOG_UNLOCK(log, s);
 
 				/* perform callbacks in the order given */
-				for (; cb != 0; cb = cb_next) {
+				for (; cb; cb = cb_next) {
 					cb_next = cb->cb_next;
 					cb->cb_func(cb->cb_arg, aborted);
 				}
@@ -2202,7 +2202,7 @@ xlog_state_do_callback(
 			loopdidcallbacks++;
 			funcdidcallbacks++;
 
-			ASSERT(iclog->ic_callback == 0);
+			ASSERT(iclog->ic_callback == NULL);
 			if (!(iclog->ic_state & XLOG_STATE_IOERROR))
 				iclog->ic_state = XLOG_STATE_DIRTY;
 
@@ -3242,10 +3242,10 @@ xlog_ticket_put(xlog_t		*log,
 #else
 	/* When we debug, it is easier if tickets are cycled */
 	ticket->t_next     = NULL;
-	if (log->l_tail != 0) {
+	if (log->l_tail) {
 		log->l_tail->t_next = ticket;
 	} else {
-		ASSERT(log->l_freelist == 0);
+		ASSERT(log->l_freelist == NULL);
 		log->l_freelist = ticket;
 	}
 	log->l_tail	    = ticket;
@@ -3463,7 +3463,7 @@ xlog_verify_iclog(xlog_t	 *log,
 	s = LOG_LOCK(log);
 	icptr = log->l_iclog;
 	for (i=0; i < log->l_iclog_bufs; i++) {
-		if (icptr == 0)
+		if (icptr == NULL)
 			xlog_panic("xlog_verify_iclog: invalid ptr");
 		icptr = icptr->ic_next;
 	}
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index fddbb091a86..8ae6e8e5f3d 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1366,7 +1366,7 @@ xlog_recover_add_to_cont_trans(
 	int			old_len;
 
 	item = trans->r_itemq;
-	if (item == 0) {
+	if (item == NULL) {
 		/* finish copying rest of trans header */
 		xlog_recover_add_item(&trans->r_itemq);
 		ptr = (xfs_caddr_t) &trans->r_theader +
@@ -1412,7 +1412,7 @@ xlog_recover_add_to_trans(
 	if (!len)
 		return 0;
 	item = trans->r_itemq;
-	if (item == 0) {
+	if (item == NULL) {
 		ASSERT(*(uint *)dp == XFS_TRANS_HEADER_MAGIC);
 		if (len == sizeof(xfs_trans_header_t))
 			xlog_recover_add_item(&trans->r_itemq);
@@ -1467,12 +1467,12 @@ xlog_recover_unlink_tid(
 	xlog_recover_t		*tp;
 	int			found = 0;
 
-	ASSERT(trans != 0);
+	ASSERT(trans != NULL);
 	if (trans == *q) {
 		*q = (*q)->r_next;
 	} else {
 		tp = *q;
-		while (tp != 0) {
+		while (tp) {
 			if (tp->r_next == trans) {
 				found = 1;
 				break;
@@ -1495,7 +1495,7 @@ xlog_recover_insert_item_backq(
 	xlog_recover_item_t	**q,
 	xlog_recover_item_t	*item)
 {
-	if (*q == 0) {
+	if (*q == NULL) {
 		item->ri_prev = item->ri_next = item;
 		*q = item;
 	} else {
@@ -1899,7 +1899,7 @@ xlog_recover_do_reg_buffer(
 			break;
 		nbits = xfs_contig_bits(data_map, map_size, bit);
 		ASSERT(nbits > 0);
-		ASSERT(item->ri_buf[i].i_addr != 0);
+		ASSERT(item->ri_buf[i].i_addr != NULL);
 		ASSERT(item->ri_buf[i].i_len % XFS_BLI_CHUNK == 0);
 		ASSERT(XFS_BUF_COUNT(bp) >=
 		       ((uint)bit << XFS_BLI_SHIFT)+(nbits<<XFS_BLI_SHIFT));
-- 
cgit v1.2.3


From 34521c5e4971d01f6ef650fdee59e07be6c2c5e3 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Thu, 16 Aug 2007 15:37:57 +1000
Subject: [XFS] Fix sparse warning in kmem_shake_allow

We can't return a masked result of a __bitwise type. Compare it to 0 first
to keep the behaviour without the warning.

SGI-PV: 968555
SGI-Modid: xfs-linux-melb:xfs-kern:29309a

Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Tim Shimmin <tes@sgi.com>
---
 fs/xfs/linux-2.6/kmem.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/kmem.h b/fs/xfs/linux-2.6/kmem.h
index b4acc7f3c37..e6ea293f303 100644
--- a/fs/xfs/linux-2.6/kmem.h
+++ b/fs/xfs/linux-2.6/kmem.h
@@ -103,7 +103,7 @@ extern void *kmem_zone_zalloc(kmem_zone_t *, unsigned int __nocast);
 static inline int
 kmem_shake_allow(gfp_t gfp_mask)
 {
-	return (gfp_mask & __GFP_WAIT);
+	return (gfp_mask & __GFP_WAIT) != 0;
 }
 
 #endif /* __XFS_SUPPORT_KMEM_H__ */
-- 
cgit v1.2.3


From ee5c80239d5f152d99f69165afbd115518353563 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Thu, 16 Aug 2007 15:38:08 +1000
Subject: [XFS] fix ASSERT and ASSERT_ALWAYS

- remove the != 0 inside the unlikely in ASSERT_ALWAYS because sparse now
  complains about comparisons between pointers and 0
- add a standalone ASSERT implementation because defining it to
  ASSERT_ALWAYS means the string is expanded before the token passing
  stringification. This way we get the actual content of the
  assertion in the assfail message and don't overflow sparse's
  stringification buffer leading to sparse error messages.

SGI-PV: 968555
SGI-Modid: xfs-linux-melb:xfs-kern:29310a

Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Tim Shimmin <tes@sgi.com>
---
 fs/xfs/support/debug.h | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/support/debug.h b/fs/xfs/support/debug.h
index a27a7c8c052..855da040864 100644
--- a/fs/xfs/support/debug.h
+++ b/fs/xfs/support/debug.h
@@ -34,10 +34,10 @@ extern void cmn_err(int, char *, ...)
 extern void assfail(char *expr, char *f, int l);
 
 #define ASSERT_ALWAYS(expr)	\
-	(unlikely((expr) != 0) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
+	(unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
 
 #ifndef DEBUG
-# define ASSERT(expr)	((void)0)
+#define ASSERT(expr)	((void)0)
 
 #ifndef STATIC
 # define STATIC static noinline
@@ -49,8 +49,10 @@ extern void assfail(char *expr, char *f, int l);
 
 #else /* DEBUG */
 
-# define ASSERT(expr)	ASSERT_ALWAYS(expr)
-# include <linux/random.h>
+#include <linux/random.h>
+
+#define ASSERT(expr)	\
+	(unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
 
 #ifndef STATIC
 # define STATIC noinline
-- 
cgit v1.2.3


From 265c1fac38e37e828df09965406e9cc20bfa3588 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Thu, 16 Aug 2007 15:38:19 +1000
Subject: [XFS] fix sparse shadowed variable warnings

- in xfs_probe_cluster rename the inner len to pg_len. There's no harm
  here because the outer len isn't used after the inner len comes into
  existence but it keeps the code clean.
- in xfs_da_do_buf remove the inner i because they don't overlap
  and they are both the same type.

SGI-PV: 968555
SGI-Modid: xfs-linux-melb:xfs-kern:29311a

Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Tim Shimmin <tes@sgi.com>
---
 fs/xfs/linux-2.6/xfs_aops.c | 8 ++++----
 fs/xfs/xfs_da_btree.c       | 1 -
 2 files changed, 4 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index fd4105d662e..d9c40fe6419 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -652,7 +652,7 @@ xfs_probe_cluster(
 
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
-			size_t pg_offset, len = 0;
+			size_t pg_offset, pg_len = 0;
 
 			if (tindex == tlast) {
 				pg_offset =
@@ -665,16 +665,16 @@ xfs_probe_cluster(
 				pg_offset = PAGE_CACHE_SIZE;
 
 			if (page->index == tindex && !TestSetPageLocked(page)) {
-				len = xfs_probe_page(page, pg_offset, mapped);
+				pg_len = xfs_probe_page(page, pg_offset, mapped);
 				unlock_page(page);
 			}
 
-			if (!len) {
+			if (!pg_len) {
 				done = 1;
 				break;
 			}
 
-			total += len;
+			total += pg_len;
 			tindex++;
 		}
 
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index aea37df4aa6..26d09e2e1a7 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -1975,7 +1975,6 @@ xfs_da_do_buf(
 		error = mappedbno == -2 ? 0 : XFS_ERROR(EFSCORRUPTED);
 		if (unlikely(error == EFSCORRUPTED)) {
 			if (xfs_error_level >= XFS_ERRLEVEL_LOW) {
-				int	i;
 				cmn_err(CE_ALERT, "xfs_da_do_buf: bno %lld\n",
 					(long long)bno);
 				cmn_err(CE_ALERT, "dir: inode %lld\n",
-- 
cgit v1.2.3


From 5995cb7d805496362e5af73235145667096fbc6f Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@sandeen.net>
Date: Thu, 16 Aug 2007 16:49:11 +1000
Subject: [XFS] fix nasty quota hashtable allocation bug

This git mod: 77e4635ae191774526ed695482a151ac986f3806
converted to a "greedy" allocation interface, but for the quota hashtables
it switched from allocating XFS_QM_HASHSIZE (nr of elements)
xfs_dqhash_t's to allocating only XFS_QM_HASHSIZE *bytes* - quite a lot
smaller! Then when we converted hsize "back" to nr of elements (the
division line) hsize went to 0. This was leading to oopses when running
any quota tests on the Fedora 8 test kernel, but the problem has been
there for almost a year.

SGI-PV: 968837
SGI-Modid: xfs-linux-melb:xfs-kern:29354a

Signed-off-by: Eric Sandeen <sandeen@sandeen.net>
Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Tim Shimmin <tes@sgi.com>
---
 fs/xfs/quota/xfs_qm.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 2d274b23ade..6ff0f4de163 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -120,7 +120,8 @@ xfs_Gqm_init(void)
 	 * Initialize the dquot hash tables.
 	 */
 	udqhash = kmem_zalloc_greedy(&hsize,
-				     XFS_QM_HASHSIZE_LOW, XFS_QM_HASHSIZE_HIGH,
+				     XFS_QM_HASHSIZE_LOW * sizeof(xfs_dqhash_t),
+				     XFS_QM_HASHSIZE_HIGH * sizeof(xfs_dqhash_t),
 				     KM_SLEEP | KM_MAYFAIL | KM_LARGE);
 	gdqhash = kmem_zalloc(hsize, KM_SLEEP | KM_LARGE);
 	hsize /= sizeof(xfs_dqhash_t);
-- 
cgit v1.2.3


From a1033be72cdb053e182bb41e302a1c11e72b68bb Mon Sep 17 00:00:00 2001
From: Neil Brown <neilb@suse.de>
Date: Wed, 5 Sep 2007 17:22:12 -0400
Subject: knfsd: Fixed problem with NFS exporting directories which are mounted
 on.

Recent changes in NFSd cause a directory which is mounted-on
to not appear properly when the filesystem containing it is exported.

*exp_get* now returns -ENOENT rather than NULL and when
  commit 5d3dbbeaf56d0365ac6b5c0a0da0bd31cc4781e1
removed the NULL checks, it didn't add a check for -ENOENT.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/nfsd/vfs.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index a0c2b253818..7867151ebb8 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -115,7 +115,8 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
 
 	exp2 = rqst_exp_get_by_name(rqstp, mnt, mounts);
 	if (IS_ERR(exp2)) {
-		err = PTR_ERR(exp2);
+		if (PTR_ERR(exp2) != -ENOENT)
+			err = PTR_ERR(exp2);
 		dput(mounts);
 		mntput(mnt);
 		goto out;
-- 
cgit v1.2.3


From b8da0d1c27f144bce999c653467106f3f0d5a308 Mon Sep 17 00:00:00 2001
From: Neil Brown <neilb@suse.de>
Date: Wed, 5 Sep 2007 17:22:13 -0400
Subject: knfsd: Validate filehandle type in fsid_source

fsid_source decided where to get the 'fsid' number to
return for a GETATTR based on the type of filehandle.
It can be from the device, from the fsid, or from the
UUID.

It is possible for the filehandle to be inconsistent
with the export information, so make sure the export information
actually has the info implied by the value returned by
fsid_source.

Signed-off-by: Neil Brown <neilb@suse.de>
Cc: "Luiz Fernando N. Capitulino" <lcapitulino@gmail.com>
Signed-off-by: "J. Bruce Fields" <bfields@citi.umich.edu>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/nfsd/nfsfh.c | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 0eb464a39aa..7011d62acfc 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -566,13 +566,23 @@ enum fsid_source fsid_source(struct svc_fh *fhp)
 	case FSID_DEV:
 	case FSID_ENCODE_DEV:
 	case FSID_MAJOR_MINOR:
-		return FSIDSOURCE_DEV;
+		if (fhp->fh_export->ex_dentry->d_inode->i_sb->s_type->fs_flags
+		    & FS_REQUIRES_DEV)
+			return FSIDSOURCE_DEV;
+		break;
 	case FSID_NUM:
-		return FSIDSOURCE_FSID;
-	default:
 		if (fhp->fh_export->ex_flags & NFSEXP_FSID)
 			return FSIDSOURCE_FSID;
-		else
-			return FSIDSOURCE_UUID;
+		break;
+	default:
+		break;
 	}
+	/* either a UUID type filehandle, or the filehandle doesn't
+	 * match the export.
+	 */
+	if (fhp->fh_export->ex_flags & NFSEXP_FSID)
+		return FSIDSOURCE_FSID;
+	if (fhp->fh_export->ex_uuid)
+		return FSIDSOURCE_UUID;
+	return FSIDSOURCE_DEV;
 }
-- 
cgit v1.2.3


From 10b0845bed2b93f88d9758880a0a0e53f50c5139 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Thu, 23 Aug 2007 11:17:55 -0700
Subject: ocfs2: update docs for new features

Update documentation listing ocfs2 features to reflect the current state of
the file system. Add missing descriptions for some mount options which ocfs2
supports.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 Documentation/filesystems/ocfs2.txt | 13 +++++++++----
 fs/Kconfig                          |  3 ---
 2 files changed, 9 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/Documentation/filesystems/ocfs2.txt b/Documentation/filesystems/ocfs2.txt
index 8ccf0c1b58e..ed55238023a 100644
--- a/Documentation/filesystems/ocfs2.txt
+++ b/Documentation/filesystems/ocfs2.txt
@@ -28,11 +28,7 @@ Manish Singh  <manish.singh@oracle.com>
 Caveats
 =======
 Features which OCFS2 does not support yet:
-	- sparse files
 	- extended attributes
-	- shared writable mmap
-	- loopback is supported, but data written will not
-	  be cluster coherent.
 	- quotas
 	- cluster aware flock
 	- cluster aware lockf
@@ -57,3 +53,12 @@ nointr			Do not allow signals to interrupt cluster
 atime_quantum=60(*)	OCFS2 will not update atime unless this number
 			of seconds has passed since the last update.
 			Set to zero to always update atime.
+data=ordered	(*)	All data are forced directly out to the main file
+			system prior to its metadata being committed to the
+			journal.
+data=writeback		Data ordering is not preserved, data may be written
+			into the main file system after its metadata has been
+			committed to the journal.
+preferred_slot=0(*)	During mount, try to use this filesystem slot first. If
+			it is in use by another node, the first empty one found
+			will be chosen. Invalid values will be ignored.
diff --git a/fs/Kconfig b/fs/Kconfig
index 58a0650293e..f9eed6d7906 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -441,9 +441,6 @@ config OCFS2_FS
 
 	  Note: Features which OCFS2 does not support yet:
 	          - extended attributes
-		  - shared writeable mmap
-	          - loopback is supported, but data written will not
-	            be cluster coherent.
 	          - quotas
 	          - cluster aware flock
 	          - Directory change notification (F_NOTIFY)
-- 
cgit v1.2.3


From c0123adef626607535f3c2c93b530c36780885e0 Mon Sep 17 00:00:00 2001
From: Tiger Yang <tiger.yang@oracle.com>
Date: Sat, 8 Sep 2007 00:16:10 +0800
Subject: [PATCH] ocfs2: fix mount option parsing

For some mount option types, ocfs2_parse_options() will try to access
sb->s_fs_info to get at the ocfs2 private superblock. Unfortunately, that
hasn't been allocated yet and will cause a kernel crash.

Fix this by storing options in a struct which can then get pushed into the
ocfs2_super once it's been allocated later. If we need more options which
store to the ocfs2_super in the future, we can just fields to this struct.

Signed-off-by: Tiger Yang <tiger.yang@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/super.c | 69 ++++++++++++++++++++++++++++++--------------------------
 1 file changed, 37 insertions(+), 32 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index f2fc9a795de..c034b5129c1 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -81,8 +81,15 @@ static struct dentry *ocfs2_debugfs_root = NULL;
 MODULE_AUTHOR("Oracle");
 MODULE_LICENSE("GPL");
 
+struct mount_options
+{
+	unsigned long	mount_opt;
+	unsigned int	atime_quantum;
+	signed short	slot;
+};
+
 static int ocfs2_parse_options(struct super_block *sb, char *options,
-			       unsigned long *mount_opt, s16 *slot,
+			       struct mount_options *mopt,
 			       int is_remount);
 static void ocfs2_put_super(struct super_block *sb);
 static int ocfs2_mount_volume(struct super_block *sb);
@@ -367,24 +374,23 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
 {
 	int incompat_features;
 	int ret = 0;
-	unsigned long parsed_options;
-	s16 slot;
+	struct mount_options parsed_options;
 	struct ocfs2_super *osb = OCFS2_SB(sb);
 
-	if (!ocfs2_parse_options(sb, data, &parsed_options, &slot, 1)) {
+	if (!ocfs2_parse_options(sb, data, &parsed_options, 1)) {
 		ret = -EINVAL;
 		goto out;
 	}
 
 	if ((osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) !=
-	    (parsed_options & OCFS2_MOUNT_HB_LOCAL)) {
+	    (parsed_options.mount_opt & OCFS2_MOUNT_HB_LOCAL)) {
 		ret = -EINVAL;
 		mlog(ML_ERROR, "Cannot change heartbeat mode on remount\n");
 		goto out;
 	}
 
 	if ((osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK) !=
-	    (parsed_options & OCFS2_MOUNT_DATA_WRITEBACK)) {
+	    (parsed_options.mount_opt & OCFS2_MOUNT_DATA_WRITEBACK)) {
 		ret = -EINVAL;
 		mlog(ML_ERROR, "Cannot change data mode on remount\n");
 		goto out;
@@ -435,7 +441,9 @@ unlock_osb:
 
 		/* Only save off the new mount options in case of a successful
 		 * remount. */
-		osb->s_mount_opt = parsed_options;
+		osb->s_mount_opt = parsed_options.mount_opt;
+		osb->s_atime_quantum = parsed_options.atime_quantum;
+		osb->preferred_slot = parsed_options.slot;
 	}
 out:
 	return ret;
@@ -547,8 +555,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
 {
 	struct dentry *root;
 	int status, sector_size;
-	unsigned long parsed_opt;
-	s16 slot;
+	struct mount_options parsed_options;
 	struct inode *inode = NULL;
 	struct ocfs2_super *osb = NULL;
 	struct buffer_head *bh = NULL;
@@ -556,14 +563,14 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
 
 	mlog_entry("%p, %p, %i", sb, data, silent);
 
-	if (!ocfs2_parse_options(sb, data, &parsed_opt, &slot, 0)) {
+	if (!ocfs2_parse_options(sb, data, &parsed_options, 0)) {
 		status = -EINVAL;
 		goto read_super_error;
 	}
 
 	/* for now we only have one cluster/node, make sure we see it
 	 * in the heartbeat universe */
-	if (parsed_opt & OCFS2_MOUNT_HB_LOCAL) {
+	if (parsed_options.mount_opt & OCFS2_MOUNT_HB_LOCAL) {
 		if (!o2hb_check_local_node_heartbeating()) {
 			status = -EINVAL;
 			goto read_super_error;
@@ -585,8 +592,9 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
 	}
 	brelse(bh);
 	bh = NULL;
-	osb->s_mount_opt = parsed_opt;
-	osb->preferred_slot = slot;
+	osb->s_mount_opt = parsed_options.mount_opt;
+	osb->s_atime_quantum = parsed_options.atime_quantum;
+	osb->preferred_slot = parsed_options.slot;
 
 	sb->s_magic = OCFS2_SUPER_MAGIC;
 
@@ -728,8 +736,7 @@ static struct file_system_type ocfs2_fs_type = {
 
 static int ocfs2_parse_options(struct super_block *sb,
 			       char *options,
-			       unsigned long *mount_opt,
-			       s16 *slot,
+			       struct mount_options *mopt,
 			       int is_remount)
 {
 	int status;
@@ -738,8 +745,9 @@ static int ocfs2_parse_options(struct super_block *sb,
 	mlog_entry("remount: %d, options: \"%s\"\n", is_remount,
 		   options ? options : "(none)");
 
-	*mount_opt = 0;
-	*slot = OCFS2_INVALID_SLOT;
+	mopt->mount_opt = 0;
+	mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM;
+	mopt->slot = OCFS2_INVALID_SLOT;
 
 	if (!options) {
 		status = 1;
@@ -749,7 +757,6 @@ static int ocfs2_parse_options(struct super_block *sb,
 	while ((p = strsep(&options, ",")) != NULL) {
 		int token, option;
 		substring_t args[MAX_OPT_ARGS];
-		struct ocfs2_super * osb = OCFS2_SB(sb);
 
 		if (!*p)
 			continue;
@@ -757,10 +764,10 @@ static int ocfs2_parse_options(struct super_block *sb,
 		token = match_token(p, tokens, args);
 		switch (token) {
 		case Opt_hb_local:
-			*mount_opt |= OCFS2_MOUNT_HB_LOCAL;
+			mopt->mount_opt |= OCFS2_MOUNT_HB_LOCAL;
 			break;
 		case Opt_hb_none:
-			*mount_opt &= ~OCFS2_MOUNT_HB_LOCAL;
+			mopt->mount_opt &= ~OCFS2_MOUNT_HB_LOCAL;
 			break;
 		case Opt_barrier:
 			if (match_int(&args[0], &option)) {
@@ -768,27 +775,27 @@ static int ocfs2_parse_options(struct super_block *sb,
 				goto bail;
 			}
 			if (option)
-				*mount_opt |= OCFS2_MOUNT_BARRIER;
+				mopt->mount_opt |= OCFS2_MOUNT_BARRIER;
 			else
-				*mount_opt &= ~OCFS2_MOUNT_BARRIER;
+				mopt->mount_opt &= ~OCFS2_MOUNT_BARRIER;
 			break;
 		case Opt_intr:
-			*mount_opt &= ~OCFS2_MOUNT_NOINTR;
+			mopt->mount_opt &= ~OCFS2_MOUNT_NOINTR;
 			break;
 		case Opt_nointr:
-			*mount_opt |= OCFS2_MOUNT_NOINTR;
+			mopt->mount_opt |= OCFS2_MOUNT_NOINTR;
 			break;
 		case Opt_err_panic:
-			*mount_opt |= OCFS2_MOUNT_ERRORS_PANIC;
+			mopt->mount_opt |= OCFS2_MOUNT_ERRORS_PANIC;
 			break;
 		case Opt_err_ro:
-			*mount_opt &= ~OCFS2_MOUNT_ERRORS_PANIC;
+			mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_PANIC;
 			break;
 		case Opt_data_ordered:
-			*mount_opt &= ~OCFS2_MOUNT_DATA_WRITEBACK;
+			mopt->mount_opt &= ~OCFS2_MOUNT_DATA_WRITEBACK;
 			break;
 		case Opt_data_writeback:
-			*mount_opt |= OCFS2_MOUNT_DATA_WRITEBACK;
+			mopt->mount_opt |= OCFS2_MOUNT_DATA_WRITEBACK;
 			break;
 		case Opt_atime_quantum:
 			if (match_int(&args[0], &option)) {
@@ -796,9 +803,7 @@ static int ocfs2_parse_options(struct super_block *sb,
 				goto bail;
 			}
 			if (option >= 0)
-				osb->s_atime_quantum = option;
-			else
-				osb->s_atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM;
+				mopt->atime_quantum = option;
 			break;
 		case Opt_slot:
 			option = 0;
@@ -807,7 +812,7 @@ static int ocfs2_parse_options(struct super_block *sb,
 				goto bail;
 			}
 			if (option)
-				*slot = (s16)option;
+				mopt->slot = (s16)option;
 			break;
 		default:
 			mlog(ML_ERROR,
-- 
cgit v1.2.3


From 30b8548f2c270c0205558fe4826a6ab8e7fe51ad Mon Sep 17 00:00:00 2001
From: "tao.ma@oracle.com" <tao.ma@oracle.com>
Date: Thu, 6 Sep 2007 08:02:25 +0800
Subject: [PATCH] ocfs2: Fix a wrong cluster calculation.

In ocfs2_alloc_write_write_ctxt, the written clusters length is calculated
by the byte length only. This may cause some problems if we start to write
at some position in the end of one cluster and last to a second cluster
while the "len" is smaller than a cluster size. In that case, we have to
write 2 clusters actually.
So we have to take the start position into consideration also.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/aops.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 460d440310f..50cd8a20901 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -855,6 +855,7 @@ static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
 				  struct ocfs2_super *osb, loff_t pos,
 				  unsigned len, struct buffer_head *di_bh)
 {
+	u32 cend;
 	struct ocfs2_write_ctxt *wc;
 
 	wc = kzalloc(sizeof(struct ocfs2_write_ctxt), GFP_NOFS);
@@ -862,7 +863,8 @@ static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
 		return -ENOMEM;
 
 	wc->w_cpos = pos >> osb->s_clustersize_bits;
-	wc->w_clen = ocfs2_clusters_for_bytes(osb->sb, len);
+	cend = (pos + len - 1) >> osb->s_clustersize_bits;
+	wc->w_clen = cend - wc->w_cpos + 1;
 	get_bh(di_bh);
 	wc->w_di_bh = di_bh;
 
-- 
cgit v1.2.3


From e535e2efd295c3990bb9f654c8bb6bd176ebdc2b Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Fri, 31 Aug 2007 10:23:41 -0700
Subject: ocfs2: Fix calculation of i_blocks during truncate

We were setting i_blocks too early - before truncating any allocation.
Correct things to set i_blocks after the allocation change.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/alloc.c | 1 +
 fs/ocfs2/file.c  | 1 -
 2 files changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 4f517665c9a..778a850b463 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -5602,6 +5602,7 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
 				      clusters_to_del;
 	spin_unlock(&OCFS2_I(inode)->ip_lock);
 	le32_add_cpu(&fe->i_clusters, -clusters_to_del);
+	inode->i_blocks = ocfs2_inode_sector_count(inode);
 
 	status = ocfs2_trim_tree(inode, path, handle, tc,
 				 clusters_to_del, &delete_blk);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 4ffa715be09..7e34e66159c 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -314,7 +314,6 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
 	}
 
 	i_size_write(inode, new_i_size);
-	inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size);
 	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
 
 	di = (struct ocfs2_dinode *) fe_bh->b_data;
-- 
cgit v1.2.3


From 9c3013e9b91ad23ecae88e45405e98208cce455d Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 11 Sep 2007 15:23:29 -0700
Subject: quota: fix infinite loop

If we fail to start a transaction when releasing dquot, we have to call
dquot_release() anyway to mark dquot structure as inactive.  Otherwise we
end in an infinite loop inside dqput().

Signed-off-by: Jan Kara <jack@suse.cz>
Cc: xb <xavier.bru@bull.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ext3/super.c     | 11 ++++++++++-
 fs/ext4/super.c     | 11 ++++++++++-
 fs/reiserfs/super.c | 13 +++++++++++--
 3 files changed, 31 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 22cfdd61c06..9537316a071 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -2578,8 +2578,11 @@ static int ext3_release_dquot(struct dquot *dquot)
 
 	handle = ext3_journal_start(dquot_to_inode(dquot),
 					EXT3_QUOTA_DEL_BLOCKS(dquot->dq_sb));
-	if (IS_ERR(handle))
+	if (IS_ERR(handle)) {
+		/* Release dquot anyway to avoid endless cycle in dqput() */
+		dquot_release(dquot);
 		return PTR_ERR(handle);
+	}
 	ret = dquot_release(dquot);
 	err = ext3_journal_stop(handle);
 	if (!ret)
@@ -2712,6 +2715,12 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type,
 	struct buffer_head *bh;
 	handle_t *handle = journal_current_handle();
 
+	if (!handle) {
+		printk(KERN_WARNING "EXT3-fs: Quota write (off=%Lu, len=%Lu)"
+			" cancelled because transaction is not started.\n",
+			(unsigned long long)off, (unsigned long long)len);
+		return -EIO;
+	}
 	mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
 	while (towrite > 0) {
 		tocopy = sb->s_blocksize - offset < towrite ?
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 4550b83ab1c..3c1397fa83d 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2698,8 +2698,11 @@ static int ext4_release_dquot(struct dquot *dquot)
 
 	handle = ext4_journal_start(dquot_to_inode(dquot),
 					EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb));
-	if (IS_ERR(handle))
+	if (IS_ERR(handle)) {
+		/* Release dquot anyway to avoid endless cycle in dqput() */
+		dquot_release(dquot);
 		return PTR_ERR(handle);
+	}
 	ret = dquot_release(dquot);
 	err = ext4_journal_stop(handle);
 	if (!ret)
@@ -2832,6 +2835,12 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
 	struct buffer_head *bh;
 	handle_t *handle = journal_current_handle();
 
+	if (!handle) {
+		printk(KERN_WARNING "EXT4-fs: Quota write (off=%Lu, len=%Lu)"
+			" cancelled because transaction is not started.\n",
+			(unsigned long long)off, (unsigned long long)len);
+		return -EIO;
+	}
 	mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
 	while (towrite > 0) {
 		tocopy = sb->s_blocksize - offset < towrite ?
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 5b68dd3f191..a005451930b 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -1915,8 +1915,11 @@ static int reiserfs_release_dquot(struct dquot *dquot)
 	ret =
 	    journal_begin(&th, dquot->dq_sb,
 			  REISERFS_QUOTA_DEL_BLOCKS(dquot->dq_sb));
-	if (ret)
+	if (ret) {
+		/* Release dquot anyway to avoid endless cycle in dqput() */
+		dquot_release(dquot);
 		goto out;
+	}
 	ret = dquot_release(dquot);
 	err =
 	    journal_end(&th, dquot->dq_sb,
@@ -2067,6 +2070,12 @@ static ssize_t reiserfs_quota_write(struct super_block *sb, int type,
 	size_t towrite = len;
 	struct buffer_head tmp_bh, *bh;
 
+	if (!current->journal_info) {
+		printk(KERN_WARNING "reiserfs: Quota write (off=%Lu, len=%Lu)"
+			" cancelled because transaction is not started.\n",
+			(unsigned long long)off, (unsigned long long)len);
+		return -EIO;
+	}
 	mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
 	while (towrite > 0) {
 		tocopy = sb->s_blocksize - offset < towrite ?
@@ -2098,7 +2107,7 @@ static ssize_t reiserfs_quota_write(struct super_block *sb, int type,
 		data += tocopy;
 		blk++;
 	}
-      out:
+out:
 	if (len == towrite)
 		return err;
 	if (inode->i_size < off + len - towrite)
-- 
cgit v1.2.3


From 1a1a1a758bf0107d1f78ff1d622f45987803d894 Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruen@suse.de>
Date: Tue, 11 Sep 2007 15:23:37 -0700
Subject: afs: mntput called before dput

dput must be called before mntput here.

Signed-off-by: Andreas Gruenbacher <agruen@suse.de>
Acked-By: David Howells <dhowells@redhat.com>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/afs/mntpt.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index a3684dcc76e..6f8c96fb29e 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -235,8 +235,8 @@ static void *afs_mntpt_follow_link(struct dentry *dentry, struct nameidata *nd)
 	err = do_add_mount(newmnt, nd, MNT_SHRINKABLE, &afs_vfsmounts);
 	switch (err) {
 	case 0:
-		mntput(nd->mnt);
 		dput(nd->dentry);
+		mntput(nd->mnt);
 		nd->mnt = newmnt;
 		nd->dentry = dget(newmnt->mnt_root);
 		schedule_delayed_work(&afs_mntpt_expiry_timer,
-- 
cgit v1.2.3


From dd23aae4f5edf4e1dbd8f7f8013a754ba3253f48 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 11 Sep 2007 15:23:55 -0700
Subject: Fix select on /proc files without ->poll
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Taneli Vähäkangas <vahakang@cs.helsinki.fi> reported that commit
786d7e1612f0b0adb6046f19b906609e4fe8b1ba aka "Fix rmmod/read/write races
in /proc entries" broke SBCL + SLIME combo.

The old code in do_select() used DEFAULT_POLLMASK, if couldn't find
->poll handler.  The new code makes ->poll always there and returns 0 by
default, which is not correct.  Return DEFAULT_POLLMASK instead.

Steps to reproduce:

	install emacs, SBCL, SLIME
	emacs
	M-x slime	in *inferior-lisp* buffer
	[watch it doing "Connecting to Swank on port X.."]

Please, apply before 2.6.23.

P.S.: why SBCL can't just read(2) /proc/cpuinfo is a mystery.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: T Taneli Vahakangas <vahakang@cs.helsinki.fi>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/inode.c      | 3 ++-
 fs/select.c          | 2 --
 include/linux/poll.h | 2 ++
 3 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index a5b0dfd89a1..0e4d37c93ee 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -11,6 +11,7 @@
 #include <linux/string.h>
 #include <linux/stat.h>
 #include <linux/completion.h>
+#include <linux/poll.h>
 #include <linux/file.h>
 #include <linux/limits.h>
 #include <linux/init.h>
@@ -232,7 +233,7 @@ static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t
 static unsigned int proc_reg_poll(struct file *file, struct poll_table_struct *pts)
 {
 	struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
-	unsigned int rv = 0;
+	unsigned int rv = DEFAULT_POLLMASK;
 	unsigned int (*poll)(struct file *, struct poll_table_struct *);
 
 	spin_lock(&pde->pde_unload_lock);
diff --git a/fs/select.c b/fs/select.c
index a974082b082..46dca31c607 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -26,8 +26,6 @@
 
 #include <asm/uaccess.h>
 
-#define DEFAULT_POLLMASK (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)
-
 struct poll_table_page {
 	struct poll_table_page * next;
 	struct poll_table_entry * entry;
diff --git a/include/linux/poll.h b/include/linux/poll.h
index 27690798623..16d813b364e 100644
--- a/include/linux/poll.h
+++ b/include/linux/poll.h
@@ -21,6 +21,8 @@
 #define WQUEUES_STACK_ALLOC	(MAX_STACK_ALLOC - FRONTEND_STACK_ALLOC)
 #define N_INLINE_POLL_ENTRIES	(WQUEUES_STACK_ALLOC / sizeof(struct poll_table_entry))
 
+#define DEFAULT_POLLMASK (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)
+
 struct poll_table_struct;
 
 /* 
-- 
cgit v1.2.3


From 0e2f6db88a6900bc9db576d6b478b12ee60d61f7 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@openvz.org>
Date: Tue, 11 Sep 2007 15:24:01 -0700
Subject: Leases can be hidden by flocks

The inode->i_flock list contains the leases, flocks and posix
locks in the specified order. However, the flocks are added in
the head of this list thus hiding the leases from F_GETLEASE
command, from time_out_leases() and other code that expects
the leases to come first.

The following example will demonstrate this:

#define _GNU_SOURCE

#include <unistd.h>
#include <fcntl.h>
#include <stdio.h>
#include <sys/file.h>

static void show_lease(int fd)
{
        int res;

        res = fcntl(fd, F_GETLEASE);
        switch (res) {
                case F_RDLCK:
                        printf("Read lease\n");
                        break;
                case F_WRLCK:
                        printf("Write lease\n");
                        break;
                case F_UNLCK:
                        printf("No leases\n");
                        break;
                default:
                        printf("Some shit\n");
                        break;
        }
}

int main(int argc, char **argv)
{
        int fd, res;

        fd = open(argv[1], O_RDONLY);
        if (fd == -1) {
                perror("Can't open file");
                return 1;
        }

        res = fcntl(fd, F_SETLEASE, F_WRLCK);
        if (res == -1) {
                perror("Can't set lease");
                return 1;
        }

        show_lease(fd);

        if (flock(fd, LOCK_SH) == -1) {
                perror("Can't flock shared");
                return 1;
        }

        show_lease(fd);

        return 0;
}

The first call to show_lease() will show the write lease set, but
the second will show no leases.

Fix the flock adding so that the leases always stay in the head
of this list.

Found during making the flocks pid-namespaces aware.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Acked-by: "J. Bruce Fields" <bfields@fieldses.org>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/locks.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/locks.c b/fs/locks.c
index 50857d2d340..c795eaaf6c4 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -782,7 +782,7 @@ find_conflict:
 	if (request->fl_flags & FL_ACCESS)
 		goto out;
 	locks_copy_lock(new_fl, request);
-	locks_insert_lock(&inode->i_flock, new_fl);
+	locks_insert_lock(before, new_fl);
 	new_fl = NULL;
 	error = 0;
 
-- 
cgit v1.2.3


From 53c5725581cce8a29925afd4eae71fa8c7ce551f Mon Sep 17 00:00:00 2001
From: Masakazu Mokuno <mokuno@sm.sony.co.jp>
Date: Fri, 14 Sep 2007 14:35:38 -0400
Subject: As struct iw_point is bi-directional payload, we should copy back the
 content on return from ioctl calls

Signed-off-by: Masakazu Mokuno <mokuno@sm.sony.co.jp>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 fs/compat_ioctl.c | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index a6c9078af12..5a5b7116cef 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -2311,8 +2311,10 @@ static int do_wireless_ioctl(unsigned int fd, unsigned int cmd, unsigned long ar
 	struct iwreq __user *iwr_u;
 	struct iw_point __user *iwp;
 	struct compat_iw_point __user *iwp_u;
-	compat_caddr_t pointer;
+	compat_caddr_t pointer_u;
+	void __user *pointer;
 	__u16 length, flags;
+	int ret;
 
 	iwr_u = compat_ptr(arg);
 	iwp_u = (struct compat_iw_point __user *) &iwr_u->u.data;
@@ -2330,17 +2332,29 @@ static int do_wireless_ioctl(unsigned int fd, unsigned int cmd, unsigned long ar
 			   sizeof(iwr->ifr_ifrn.ifrn_name)))
 		return -EFAULT;
 
-	if (__get_user(pointer, &iwp_u->pointer) ||
+	if (__get_user(pointer_u, &iwp_u->pointer) ||
 	    __get_user(length, &iwp_u->length) ||
 	    __get_user(flags, &iwp_u->flags))
 		return -EFAULT;
 
-	if (__put_user(compat_ptr(pointer), &iwp->pointer) ||
+	if (__put_user(compat_ptr(pointer_u), &iwp->pointer) ||
 	    __put_user(length, &iwp->length) ||
 	    __put_user(flags, &iwp->flags))
 		return -EFAULT;
 
-	return sys_ioctl(fd, cmd, (unsigned long) iwr);
+	ret = sys_ioctl(fd, cmd, (unsigned long) iwr);
+
+	if (__get_user(pointer, &iwp->pointer) ||
+	    __get_user(length, &iwp->length) ||
+	    __get_user(flags, &iwp->flags))
+		return -EFAULT;
+
+	if (__put_user(ptr_to_compat(pointer), &iwp_u->pointer) ||
+	    __put_user(length, &iwp_u->length) ||
+	    __put_user(flags, &iwp_u->flags))
+		return -EFAULT;
+
+	return ret;
 }
 
 /* Since old style bridge ioctl's endup using SIOCDEVPRIVATE
-- 
cgit v1.2.3


From 65de5567564e70edd01b6d4e95e548d7ba284872 Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 16 Aug 2007 15:21:11 +1000
Subject: [XFS] On-demand reaping of the MRU cache

Instead of running the mru cache reaper all the time based on a timeout,
we should only run it when the cache has active objects. This allows CPUs
to sleep when there is no activity rather than be woken repeatedly just to
check if there is anything to do.

SGI-PV: 968554
SGI-Modid: xfs-linux-melb:xfs-kern:29305a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Donald Douwsma <donaldd@sgi.com>
Signed-off-by: Tim Shimmin <tes@sgi.com>
---
 fs/xfs/xfs_filestream.c |  3 +--
 fs/xfs/xfs_mru_cache.c  | 72 +++++++++++++++++++------------------------------
 fs/xfs/xfs_mru_cache.h  |  6 ++---
 3 files changed, 31 insertions(+), 50 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index ce2278611bb..16f8e175167 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -467,8 +467,7 @@ void
 xfs_filestream_flush(
 	xfs_mount_t	*mp)
 {
-	/* point in time flush, so keep the reaper running */
-	xfs_mru_cache_flush(mp->m_filestream, 1);
+	xfs_mru_cache_flush(mp->m_filestream);
 }
 
 /*
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index 7deb9e3cbbd..e0b358c1c53 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -206,8 +206,11 @@ _xfs_mru_cache_list_insert(
 	 */
 	if (!_xfs_mru_cache_migrate(mru, now)) {
 		mru->time_zero = now;
-		if (!mru->next_reap)
-			mru->next_reap = mru->grp_count * mru->grp_time;
+		if (!mru->queued) {
+			mru->queued = 1;
+			queue_delayed_work(xfs_mru_reap_wq, &mru->work,
+			                   mru->grp_count * mru->grp_time);
+		}
 	} else {
 		grp = (now - mru->time_zero) / mru->grp_time;
 		grp = (mru->lru_grp + grp) % mru->grp_count;
@@ -271,29 +274,26 @@ _xfs_mru_cache_reap(
 	struct work_struct	*work)
 {
 	xfs_mru_cache_t		*mru = container_of(work, xfs_mru_cache_t, work.work);
-	unsigned long		now;
+	unsigned long		now, next;
 
 	ASSERT(mru && mru->lists);
 	if (!mru || !mru->lists)
 		return;
 
 	mutex_spinlock(&mru->lock);
-	now = jiffies;
-	if (mru->reap_all ||
-	    (mru->next_reap && time_after(now, mru->next_reap))) {
-		if (mru->reap_all)
-			now += mru->grp_count * mru->grp_time * 2;
-		mru->next_reap = _xfs_mru_cache_migrate(mru, now);
-		_xfs_mru_cache_clear_reap_list(mru);
+	next = _xfs_mru_cache_migrate(mru, jiffies);
+	_xfs_mru_cache_clear_reap_list(mru);
+
+	mru->queued = next;
+	if ((mru->queued > 0)) {
+		now = jiffies;
+		if (next <= now)
+			next = 0;
+		else
+			next -= now;
+		queue_delayed_work(xfs_mru_reap_wq, &mru->work, next);
 	}
 
-	/*
-	 * the process that triggered the reap_all is responsible
-	 * for restating the periodic reap if it is required.
-	 */
-	if (!mru->reap_all)
-		queue_delayed_work(xfs_mru_reap_wq, &mru->work, mru->grp_time);
-	mru->reap_all = 0;
 	mutex_spinunlock(&mru->lock, 0);
 }
 
@@ -352,7 +352,7 @@ xfs_mru_cache_create(
 
 	/* An extra list is needed to avoid reaping up to a grp_time early. */
 	mru->grp_count = grp_count + 1;
-	mru->lists = kmem_alloc(mru->grp_count * sizeof(*mru->lists), KM_SLEEP);
+	mru->lists = kmem_zalloc(mru->grp_count * sizeof(*mru->lists), KM_SLEEP);
 
 	if (!mru->lists) {
 		err = ENOMEM;
@@ -374,11 +374,6 @@ xfs_mru_cache_create(
 	mru->grp_time  = grp_time;
 	mru->free_func = free_func;
 
-	/* start up the reaper event */
-	mru->next_reap = 0;
-	mru->reap_all = 0;
-	queue_delayed_work(xfs_mru_reap_wq, &mru->work, mru->grp_time);
-
 	*mrup = mru;
 
 exit:
@@ -394,35 +389,25 @@ exit:
  * Call xfs_mru_cache_flush() to flush out all cached entries, calling their
  * free functions as they're deleted.  When this function returns, the caller is
  * guaranteed that all the free functions for all the elements have finished
- * executing.
- *
- * While we are flushing, we stop the periodic reaper event from triggering.
- * Normally, we want to restart this periodic event, but if we are shutting
- * down the cache we do not want it restarted. hence the restart parameter
- * where 0 = do not restart reaper and 1 = restart reaper.
+ * executing and the reaper is not running.
  */
 void
 xfs_mru_cache_flush(
-	xfs_mru_cache_t		*mru,
-	int			restart)
+	xfs_mru_cache_t		*mru)
 {
 	if (!mru || !mru->lists)
 		return;
 
-	cancel_rearming_delayed_workqueue(xfs_mru_reap_wq, &mru->work);
-
 	mutex_spinlock(&mru->lock);
-	mru->reap_all = 1;
-	mutex_spinunlock(&mru->lock, 0);
+	if (mru->queued) {
+		mutex_spinunlock(&mru->lock, 0);
+		cancel_rearming_delayed_workqueue(xfs_mru_reap_wq, &mru->work);
+		mutex_spinlock(&mru->lock);
+	}
 
-	queue_work(xfs_mru_reap_wq, &mru->work.work);
-	flush_workqueue(xfs_mru_reap_wq);
+	_xfs_mru_cache_migrate(mru, jiffies + mru->grp_count * mru->grp_time);
+	_xfs_mru_cache_clear_reap_list(mru);
 
-	mutex_spinlock(&mru->lock);
-	WARN_ON_ONCE(mru->reap_all != 0);
-	mru->reap_all = 0;
-	if (restart)
-		queue_delayed_work(xfs_mru_reap_wq, &mru->work, mru->grp_time);
 	mutex_spinunlock(&mru->lock, 0);
 }
 
@@ -433,8 +418,7 @@ xfs_mru_cache_destroy(
 	if (!mru || !mru->lists)
 		return;
 
-	/* we don't want the reaper to restart here */
-	xfs_mru_cache_flush(mru, 0);
+	xfs_mru_cache_flush(mru);
 
 	kmem_free(mru->lists, mru->grp_count * sizeof(*mru->lists));
 	kmem_free(mru, sizeof(*mru));
diff --git a/fs/xfs/xfs_mru_cache.h b/fs/xfs/xfs_mru_cache.h
index 624fd10ee8e..dd58ea1bbeb 100644
--- a/fs/xfs/xfs_mru_cache.h
+++ b/fs/xfs/xfs_mru_cache.h
@@ -32,11 +32,9 @@ typedef struct xfs_mru_cache
 	unsigned int		grp_time;  /* Time period spanned by grps.  */
 	unsigned int		lru_grp;   /* Group containing time zero.   */
 	unsigned long		time_zero; /* Time first element was added. */
-	unsigned long		next_reap; /* Time that the reaper should
-					      next do something. */
-	unsigned int		reap_all;  /* if set, reap all lists */
 	xfs_mru_cache_free_func_t free_func; /* Function pointer for freeing. */
 	struct delayed_work	work;      /* Workqueue data for reaping.   */
+	unsigned int		queued;	   /* work has been queued */
 } xfs_mru_cache_t;
 
 int xfs_mru_cache_init(void);
@@ -44,7 +42,7 @@ void xfs_mru_cache_uninit(void);
 int xfs_mru_cache_create(struct xfs_mru_cache **mrup, unsigned int lifetime_ms,
 			     unsigned int grp_count,
 			     xfs_mru_cache_free_func_t free_func);
-void xfs_mru_cache_flush(xfs_mru_cache_t *mru, int restart);
+void xfs_mru_cache_flush(xfs_mru_cache_t *mru);
 void xfs_mru_cache_destroy(struct xfs_mru_cache *mru);
 int xfs_mru_cache_insert(struct xfs_mru_cache *mru, unsigned long key,
 				void *value);
-- 
cgit v1.2.3


From 776a75fa5cfb8f3602d3ca9d221dc34497133f4b Mon Sep 17 00:00:00 2001
From: Lachlan McIlroy <lachlan@sgi.com>
Date: Fri, 14 Sep 2007 15:22:50 +1000
Subject: [XFS] Ensure file size updates have been completed before writing
 inode to disk.

SGI-PV: 968767
SGI-Modid: xfs-linux-melb:xfs-kern:29675a

Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Tim Shimmin <tes@sgi.com>
---
 fs/xfs/linux-2.6/xfs_aops.c  |  1 +
 fs/xfs/linux-2.6/xfs_super.c |  4 +++-
 fs/xfs/xfs_vnodeops.c        | 20 ++++++++++++--------
 3 files changed, 16 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index d9c40fe6419..5f152f60d74 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -181,6 +181,7 @@ xfs_setfilesize(
 		ip->i_d.di_size = isize;
 		ip->i_update_core = 1;
 		ip->i_update_size = 1;
+		mark_inode_dirty_sync(vn_to_inode(ioend->io_vnode));
 	}
 
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 4528f9a3f30..491d1f4f202 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -415,8 +415,10 @@ xfs_fs_write_inode(
 
 	if (vp) {
 		vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
-		if (sync)
+		if (sync) {
+			filemap_fdatawait(inode->i_mapping);
 			flags |= FLUSH_SYNC;
+		}
 		error = bhv_vop_iflush(vp, flags);
 		if (error == EAGAIN)
 			error = sync? bhv_vop_iflush(vp, flags | FLUSH_LOG) : 0;
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 1a5ad8cd97b..60345922990 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -1082,6 +1082,9 @@ xfs_fsync(
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 		return XFS_ERROR(EIO);
 
+	if (flag & FSYNC_DATA)
+		filemap_fdatawait(vn_to_inode(XFS_ITOV(ip))->i_mapping);
+
 	/*
 	 * We always need to make sure that the required inode state
 	 * is safe on disk.  The vnode might be clean but because
@@ -3769,12 +3772,16 @@ xfs_inode_flush(
 			sync_lsn = log->l_last_sync_lsn;
 			GRANT_UNLOCK(log, s);
 
-			if ((XFS_LSN_CMP(iip->ili_last_lsn, sync_lsn) <= 0))
-				return 0;
+			if ((XFS_LSN_CMP(iip->ili_last_lsn, sync_lsn) > 0)) {
+				if (flags & FLUSH_SYNC)
+					log_flags |= XFS_LOG_SYNC;
+				error = xfs_log_force(mp, iip->ili_last_lsn, log_flags);
+				if (error)
+					return error;
+			}
 
-			if (flags & FLUSH_SYNC)
-				log_flags |= XFS_LOG_SYNC;
-			return xfs_log_force(mp, iip->ili_last_lsn, log_flags);
+			if (ip->i_update_core == 0)
+				return 0;
 		}
 	}
 
@@ -3788,9 +3795,6 @@ xfs_inode_flush(
 	if (flags & FLUSH_INODE) {
 		int	flush_flags;
 
-		if (xfs_ipincount(ip))
-			return EAGAIN;
-
 		if (flags & FLUSH_SYNC) {
 			xfs_ilock(ip, XFS_ILOCK_SHARED);
 			xfs_iflock(ip);
-- 
cgit v1.2.3


From b394e43e995d08821588a22561c6a71a63b4ff27 Mon Sep 17 00:00:00 2001
From: Lachlan McIlroy <lachlan@sgi.com>
Date: Fri, 14 Sep 2007 15:23:04 +1000
Subject: [XFS] Avoid replaying inode buffer initialisation log items if
 on-disk version is newer.

SGI-PV: 969656
SGI-Modid: xfs-linux-melb:xfs-kern:29676a

Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Tim Shimmin <tes@sgi.com>
---
 fs/xfs/xfs_buf_item.h    |  5 +++++
 fs/xfs/xfs_log_recover.c | 51 +++++++++++++++++++++++++++++++++++++++++++++---
 fs/xfs/xfs_trans_buf.c   |  1 +
 3 files changed, 54 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index d7e13614306..fa25b7dcc6c 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -52,6 +52,11 @@ typedef struct xfs_buf_log_format_t {
 #define	XFS_BLI_UDQUOT_BUF	0x4
 #define XFS_BLI_PDQUOT_BUF	0x8
 #define	XFS_BLI_GDQUOT_BUF	0x10
+/*
+ * This flag indicates that the buffer contains newly allocated
+ * inodes.
+ */
+#define	XFS_BLI_INODE_NEW_BUF	0x20
 
 #define	XFS_BLI_CHUNK		128
 #define	XFS_BLI_SHIFT		7
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 8ae6e8e5f3d..dacb19739cc 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1874,6 +1874,7 @@ xlog_recover_do_inode_buffer(
 /*ARGSUSED*/
 STATIC void
 xlog_recover_do_reg_buffer(
+	xfs_mount_t		*mp,
 	xlog_recover_item_t	*item,
 	xfs_buf_t		*bp,
 	xfs_buf_log_format_t	*buf_f)
@@ -1884,6 +1885,50 @@ xlog_recover_do_reg_buffer(
 	unsigned int		*data_map = NULL;
 	unsigned int		map_size = 0;
 	int                     error;
+	int			stale_buf = 1;
+
+	/*
+	 * Scan through the on-disk inode buffer and attempt to
+	 * determine if it has been written to since it was logged.
+	 *
+	 * - If any of the magic numbers are incorrect then the buffer is stale
+	 * - If any of the modes are non-zero then the buffer is not stale
+	 * - If all of the modes are zero and at least one of the generation
+	 *   counts is non-zero then the buffer is stale
+	 *
+	 * If the end result is a stale buffer then the log buffer is replayed
+	 * otherwise it is skipped.
+	 *
+	 * This heuristic is not perfect.  It can be improved by scanning the
+	 * entire inode chunk for evidence that any of the inode clusters have
+	 * been updated.  To fix this problem completely we will need a major
+	 * architectural change to the logging system.
+	 */
+	if (buf_f->blf_flags & XFS_BLI_INODE_NEW_BUF) {
+		xfs_dinode_t    *dip;
+		int             inodes_per_buf;
+		int		mode_count = 0;
+		int		gen_count = 0;
+
+		stale_buf = 0;
+		inodes_per_buf = XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog;
+		for (i = 0; i < inodes_per_buf; i++) {
+			dip = (xfs_dinode_t *)xfs_buf_offset(bp,
+				i * mp->m_sb.sb_inodesize);
+			if (be16_to_cpu(dip->di_core.di_magic) !=
+					XFS_DINODE_MAGIC) {
+				stale_buf = 1;
+				break;
+			}
+			if (be16_to_cpu(dip->di_core.di_mode))
+				mode_count++;
+			if (be16_to_cpu(dip->di_core.di_gen))
+				gen_count++;
+		}
+
+		if (!mode_count && gen_count)
+			stale_buf = 1;
+	}
 
 	switch (buf_f->blf_type) {
 	case XFS_LI_BUF:
@@ -1917,7 +1962,7 @@ xlog_recover_do_reg_buffer(
 					       -1, 0, XFS_QMOPT_DOWARN,
 					       "dquot_buf_recover");
 		}
-		if (!error)
+		if (!error && stale_buf)
 			memcpy(xfs_buf_offset(bp,
 				(uint)bit << XFS_BLI_SHIFT),	/* dest */
 				item->ri_buf[i].i_addr,		/* source */
@@ -2089,7 +2134,7 @@ xlog_recover_do_dquot_buffer(
 	if (log->l_quotaoffs_flag & type)
 		return;
 
-	xlog_recover_do_reg_buffer(item, bp, buf_f);
+	xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
 }
 
 /*
@@ -2190,7 +2235,7 @@ xlog_recover_do_buffer_trans(
 		  (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) {
 		xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
 	} else {
-		xlog_recover_do_reg_buffer(item, bp, buf_f);
+		xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
 	}
 	if (error)
 		return XFS_ERROR(error);
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 60b6b898022..95fff6872a2 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -966,6 +966,7 @@ xfs_trans_inode_alloc_buf(
 	ASSERT(atomic_read(&bip->bli_refcount) > 0);
 
 	bip->bli_flags |= XFS_BLI_INODE_ALLOC_BUF;
+	bip->bli_format.blf_flags |= XFS_BLI_INODE_NEW_BUF;
 }
 
 
-- 
cgit v1.2.3


From 3d82abae9523c33d4a16fdfdfd2bdde316d7b56a Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Tue, 18 Sep 2007 22:46:38 -0700
Subject: dir_index: error out instead of BUG on corrupt dx dirs

Convert asserts (BUGs) in dx_probe from bad on-disk data to recoverable
errors with helpful warnings.  With help catching other asserts from Duane
Griffin <duaneg@dghda.com>

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Acked-by: Duane Griffin <duaneg@dghda.com>
Acked-by: Theodore Ts'o <tytso@mit.edu>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ext3/namei.c | 34 ++++++++++++++++++++++++++++++----
 fs/ext4/namei.c | 34 ++++++++++++++++++++++++++++++----
 2 files changed, 60 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 1586807b817..9d4a89820e1 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -379,13 +379,28 @@ dx_probe(struct dentry *dentry, struct inode *dir,
 
 	entries = (struct dx_entry *) (((char *)&root->info) +
 				       root->info.info_length);
-	assert(dx_get_limit(entries) == dx_root_limit(dir,
-						      root->info.info_length));
+
+	if (dx_get_limit(entries) != dx_root_limit(dir,
+						   root->info.info_length)) {
+		ext3_warning(dir->i_sb, __FUNCTION__,
+			     "dx entry: limit != root limit");
+		brelse(bh);
+		*err = ERR_BAD_DX_DIR;
+		goto fail;
+	}
+
 	dxtrace (printk("Look up %x", hash));
 	while (1)
 	{
 		count = dx_get_count(entries);
-		assert (count && count <= dx_get_limit(entries));
+		if (!count || count > dx_get_limit(entries)) {
+			ext3_warning(dir->i_sb, __FUNCTION__,
+				     "dx entry: no count or count > limit");
+			brelse(bh);
+			*err = ERR_BAD_DX_DIR;
+			goto fail2;
+		}
+
 		p = entries + 1;
 		q = entries + count - 1;
 		while (p <= q)
@@ -423,8 +438,15 @@ dx_probe(struct dentry *dentry, struct inode *dir,
 		if (!(bh = ext3_bread (NULL,dir, dx_get_block(at), 0, err)))
 			goto fail2;
 		at = entries = ((struct dx_node *) bh->b_data)->entries;
-		assert (dx_get_limit(entries) == dx_node_limit (dir));
+		if (dx_get_limit(entries) != dx_node_limit (dir)) {
+			ext3_warning(dir->i_sb, __FUNCTION__,
+				     "dx entry: limit != node limit");
+			brelse(bh);
+			*err = ERR_BAD_DX_DIR;
+			goto fail2;
+		}
 		frame++;
+		frame->bh = NULL;
 	}
 fail2:
 	while (frame >= frame_in) {
@@ -432,6 +454,10 @@ fail2:
 		frame--;
 	}
 fail:
+	if (*err == ERR_BAD_DX_DIR)
+		ext3_warning(dir->i_sb, __FUNCTION__,
+			     "Corrupt dir inode %ld, running e2fsck is "
+			     "recommended.", dir->i_ino);
 	return NULL;
 }
 
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index da224974af7..9468289637a 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -379,13 +379,28 @@ dx_probe(struct dentry *dentry, struct inode *dir,
 
 	entries = (struct dx_entry *) (((char *)&root->info) +
 				       root->info.info_length);
-	assert(dx_get_limit(entries) == dx_root_limit(dir,
-						      root->info.info_length));
+
+	if (dx_get_limit(entries) != dx_root_limit(dir,
+						   root->info.info_length)) {
+		ext4_warning(dir->i_sb, __FUNCTION__,
+			     "dx entry: limit != root limit");
+		brelse(bh);
+		*err = ERR_BAD_DX_DIR;
+		goto fail;
+	}
+
 	dxtrace (printk("Look up %x", hash));
 	while (1)
 	{
 		count = dx_get_count(entries);
-		assert (count && count <= dx_get_limit(entries));
+		if (!count || count > dx_get_limit(entries)) {
+			ext4_warning(dir->i_sb, __FUNCTION__,
+				     "dx entry: no count or count > limit");
+			brelse(bh);
+			*err = ERR_BAD_DX_DIR;
+			goto fail2;
+		}
+
 		p = entries + 1;
 		q = entries + count - 1;
 		while (p <= q)
@@ -423,8 +438,15 @@ dx_probe(struct dentry *dentry, struct inode *dir,
 		if (!(bh = ext4_bread (NULL,dir, dx_get_block(at), 0, err)))
 			goto fail2;
 		at = entries = ((struct dx_node *) bh->b_data)->entries;
-		assert (dx_get_limit(entries) == dx_node_limit (dir));
+		if (dx_get_limit(entries) != dx_node_limit (dir)) {
+			ext4_warning(dir->i_sb, __FUNCTION__,
+				     "dx entry: limit != node limit");
+			brelse(bh);
+			*err = ERR_BAD_DX_DIR;
+			goto fail2;
+		}
 		frame++;
+		frame->bh = NULL;
 	}
 fail2:
 	while (frame >= frame_in) {
@@ -432,6 +454,10 @@ fail2:
 		frame--;
 	}
 fail:
+	if (*err == ERR_BAD_DX_DIR)
+		ext4_warning(dir->i_sb, __FUNCTION__,
+			     "Corrupt dir inode %ld, running e2fsck is "
+			     "recommended.", dir->i_ino);
 	return NULL;
 }
 
-- 
cgit v1.2.3


From 49af7ee181f4f516ac99eba85d3f70ed42cabe76 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 18 Sep 2007 22:46:40 -0700
Subject: nfs: fix oops re sysctls and V4 support

NFS unregisters sysctls only if V4 support is compiled in.  However, sysctl
table is not V4 specific, so unregister it always.

Steps to reproduce:

	[build nfs.ko with CONFIG_NFS_V4=n]
	modrobe nfs
	rmmod nfs
	ls /proc/sys

Unable to handle kernel paging request at ffffffff880661c0 RIP:
 [<ffffffff802af8e3>] proc_sys_readdir+0xd3/0x350
PGD 203067 PUD 207063 PMD 7e216067 PTE 0
Oops: 0000 [1] SMP
CPU 1
Modules linked in: lockd nfs_acl sunrpc
Pid: 3335, comm: ls Not tainted 2.6.23-rc3-bloat #2
RIP: 0010:[<ffffffff802af8e3>]  [<ffffffff802af8e3>] proc_sys_readdir+0xd3/0x350
RSP: 0018:ffff81007fd93e78  EFLAGS: 00010286
RAX: ffffffff880661c0 RBX: ffffffff80466370 RCX: ffffffff880661c0
RDX: 00000000000014c0 RSI: ffff81007f3ad020 RDI: ffff81007efd8b40
RBP: 0000000000000018 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000001 R11: ffffffff802a8570 R12: ffffffff880661c0
R13: ffff81007e219640 R14: ffff81007efd8b40 R15: ffff81007ded7280
FS:  00002ba25ef03060(0000) GS:ffff81007ff81258(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
CR2: ffffffff880661c0 CR3: 000000007dfaf000 CR4: 00000000000006e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Process ls (pid: 3335, threadinfo ffff81007fd92000, task ffff81007d8a0000)
Stack:  ffff81007f3ad150 ffffffff80283f30 ffff81007fd93f48 ffff81007efd8b40
 ffff81007ee00440 0000000422222222 0000000200035593 ffffffff88037e9a
 2222222222222222 ffffffff80466500 ffff81007e416400 ffff81007e219640
Call Trace:
 [<ffffffff80283f30>] filldir+0x0/0xf0
 [<ffffffff80283f30>] filldir+0x0/0xf0
 [<ffffffff802840c7>] vfs_readdir+0xa7/0xc0
 [<ffffffff80284376>] sys_getdents+0x96/0xe0
 [<ffffffff8020bb3e>] system_call+0x7e/0x83

Code: 41 8b 14 24 85 d2 74 dc 49 8b 44 24 08 48 85 c0 74 e7 49 3b
RIP  [<ffffffff802af8e3>] proc_sys_readdir+0xd3/0x350
 RSP <ffff81007fd93e78>
CR2: ffffffff880661c0
Kernel panic - not syncing: Fatal exception

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Acked-by: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: "J. Bruce Fields" <bfields@fieldses.org>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/nfs/super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 8ed593766f1..b878528b64c 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -345,8 +345,8 @@ void __exit unregister_nfs_fs(void)
 	unregister_shrinker(&acl_shrinker);
 #ifdef CONFIG_NFS_V4
 	unregister_filesystem(&nfs4_fs_type);
-	nfs_unregister_sysctl();
 #endif
+	nfs_unregister_sysctl();
 	unregister_filesystem(&nfs_fs_type);
 }
 
-- 
cgit v1.2.3


From ef2b02d3e617cb0400eedf2668f86215e1b0e6af Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Tue, 18 Sep 2007 22:46:42 -0700
Subject: ext34: ensure do_split leaves enough free space in both blocks

The do_split() function for htree dir blocks is intended to split a leaf
block to make room for a new entry.  It sorts the entries in the original
block by hash value, then moves the last half of the entries to the new
block - without accounting for how much space this actually moves.  (IOW,
it moves half of the entry *count* not half of the entry *space*).  If by
chance we have both large & small entries, and we move only the smallest
entries, and we have a large new entry to insert, we may not have created
enough space for it.

The patch below stores each record size when calculating the dx_map, and
then walks the hash-sorted dx_map, calculating how many entries must be
moved to more evenly split the existing entries between the old block and
the new block, guaranteeing enough space for the new entry.

The dx_map "offs" member is reduced to u16 so that the overall map size
does not change - it is temporarily stored at the end of the new block, and
if it grows too large it may be overwritten.  By making offs and size both
u16, we won't grow the map size.

Also add a few comments to the functions involved.

This fixes the testcase reported by hooanon05@yahoo.co.jp on the
linux-ext4 list, "ext3 dir_index causes an error"

Thanks to Andreas Dilger for discussing the problem & solution with me.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: Andreas Dilger <adilger@clusterfs.com>
Tested-by: Junjiro Okajima <hooanon05@yahoo.co.jp>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: <linux-ext4@vger.kernel.org>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ext3/namei.c | 39 +++++++++++++++++++++++++++++++++++----
 fs/ext4/namei.c | 39 +++++++++++++++++++++++++++++++++++----
 2 files changed, 70 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 9d4a89820e1..c1fa1908dba 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -140,7 +140,8 @@ struct dx_frame
 struct dx_map_entry
 {
 	u32 hash;
-	u32 offs;
+	u16 offs;
+	u16 size;
 };
 
 #ifdef CONFIG_EXT3_INDEX
@@ -697,6 +698,10 @@ errout:
  * Directory block splitting, compacting
  */
 
+/*
+ * Create map of hash values, offsets, and sizes, stored at end of block.
+ * Returns number of entries mapped.
+ */
 static int dx_make_map (struct ext3_dir_entry_2 *de, int size,
 			struct dx_hash_info *hinfo, struct dx_map_entry *map_tail)
 {
@@ -710,7 +715,8 @@ static int dx_make_map (struct ext3_dir_entry_2 *de, int size,
 			ext3fs_dirhash(de->name, de->name_len, &h);
 			map_tail--;
 			map_tail->hash = h.hash;
-			map_tail->offs = (u32) ((char *) de - base);
+			map_tail->offs = (u16) ((char *) de - base);
+			map_tail->size = le16_to_cpu(de->rec_len);
 			count++;
 			cond_resched();
 		}
@@ -720,6 +726,7 @@ static int dx_make_map (struct ext3_dir_entry_2 *de, int size,
 	return count;
 }
 
+/* Sort map by hash value */
 static void dx_sort_map (struct dx_map_entry *map, unsigned count)
 {
         struct dx_map_entry *p, *q, *top = map + count - 1;
@@ -1117,6 +1124,10 @@ static inline void ext3_set_de_type(struct super_block *sb,
 }
 
 #ifdef CONFIG_EXT3_INDEX
+/*
+ * Move count entries from end of map between two memory locations.
+ * Returns pointer to last entry moved.
+ */
 static struct ext3_dir_entry_2 *
 dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
 {
@@ -1135,6 +1146,10 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
 	return (struct ext3_dir_entry_2 *) (to - rec_len);
 }
 
+/*
+ * Compact each dir entry in the range to the minimal rec_len.
+ * Returns pointer to last entry in range.
+ */
 static struct ext3_dir_entry_2* dx_pack_dirents(char *base, int size)
 {
 	struct ext3_dir_entry_2 *next, *to, *prev, *de = (struct ext3_dir_entry_2 *) base;
@@ -1157,6 +1172,11 @@ static struct ext3_dir_entry_2* dx_pack_dirents(char *base, int size)
 	return prev;
 }
 
+/*
+ * Split a full leaf block to make room for a new dir entry.
+ * Allocate a new block, and move entries so that they are approx. equally full.
+ * Returns pointer to de in block into which the new entry will be inserted.
+ */
 static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
 			struct buffer_head **bh,struct dx_frame *frame,
 			struct dx_hash_info *hinfo, int *error)
@@ -1168,7 +1188,7 @@ static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
 	u32 hash2;
 	struct dx_map_entry *map;
 	char *data1 = (*bh)->b_data, *data2;
-	unsigned split;
+	unsigned split, move, size, i;
 	struct ext3_dir_entry_2 *de = NULL, *de2;
 	int	err = 0;
 
@@ -1196,8 +1216,19 @@ static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
 	count = dx_make_map ((struct ext3_dir_entry_2 *) data1,
 			     blocksize, hinfo, map);
 	map -= count;
-	split = count/2; // need to adjust to actual middle
 	dx_sort_map (map, count);
+	/* Split the existing block in the middle, size-wise */
+	size = 0;
+	move = 0;
+	for (i = count-1; i >= 0; i--) {
+		/* is more than half of this entry in 2nd half of the block? */
+		if (size + map[i].size/2 > blocksize/2)
+			break;
+		size += map[i].size;
+		move++;
+	}
+	/* map index at which we will split */
+	split = count - move;
 	hash2 = map[split].hash;
 	continued = hash2 == map[split - 1].hash;
 	dxtrace(printk("Split block %i at %x, %i/%i\n",
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 9468289637a..5fdb862e71c 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -140,7 +140,8 @@ struct dx_frame
 struct dx_map_entry
 {
 	u32 hash;
-	u32 offs;
+	u16 offs;
+	u16 size;
 };
 
 #ifdef CONFIG_EXT4_INDEX
@@ -697,6 +698,10 @@ errout:
  * Directory block splitting, compacting
  */
 
+/*
+ * Create map of hash values, offsets, and sizes, stored at end of block.
+ * Returns number of entries mapped.
+ */
 static int dx_make_map (struct ext4_dir_entry_2 *de, int size,
 			struct dx_hash_info *hinfo, struct dx_map_entry *map_tail)
 {
@@ -710,7 +715,8 @@ static int dx_make_map (struct ext4_dir_entry_2 *de, int size,
 			ext4fs_dirhash(de->name, de->name_len, &h);
 			map_tail--;
 			map_tail->hash = h.hash;
-			map_tail->offs = (u32) ((char *) de - base);
+			map_tail->offs = (u16) ((char *) de - base);
+			map_tail->size = le16_to_cpu(de->rec_len);
 			count++;
 			cond_resched();
 		}
@@ -720,6 +726,7 @@ static int dx_make_map (struct ext4_dir_entry_2 *de, int size,
 	return count;
 }
 
+/* Sort map by hash value */
 static void dx_sort_map (struct dx_map_entry *map, unsigned count)
 {
 	struct dx_map_entry *p, *q, *top = map + count - 1;
@@ -1115,6 +1122,10 @@ static inline void ext4_set_de_type(struct super_block *sb,
 }
 
 #ifdef CONFIG_EXT4_INDEX
+/*
+ * Move count entries from end of map between two memory locations.
+ * Returns pointer to last entry moved.
+ */
 static struct ext4_dir_entry_2 *
 dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
 {
@@ -1133,6 +1144,10 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
 	return (struct ext4_dir_entry_2 *) (to - rec_len);
 }
 
+/*
+ * Compact each dir entry in the range to the minimal rec_len.
+ * Returns pointer to last entry in range.
+ */
 static struct ext4_dir_entry_2* dx_pack_dirents(char *base, int size)
 {
 	struct ext4_dir_entry_2 *next, *to, *prev, *de = (struct ext4_dir_entry_2 *) base;
@@ -1155,6 +1170,11 @@ static struct ext4_dir_entry_2* dx_pack_dirents(char *base, int size)
 	return prev;
 }
 
+/*
+ * Split a full leaf block to make room for a new dir entry.
+ * Allocate a new block, and move entries so that they are approx. equally full.
+ * Returns pointer to de in block into which the new entry will be inserted.
+ */
 static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
 			struct buffer_head **bh,struct dx_frame *frame,
 			struct dx_hash_info *hinfo, int *error)
@@ -1166,7 +1186,7 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
 	u32 hash2;
 	struct dx_map_entry *map;
 	char *data1 = (*bh)->b_data, *data2;
-	unsigned split;
+	unsigned split, move, size, i;
 	struct ext4_dir_entry_2 *de = NULL, *de2;
 	int	err = 0;
 
@@ -1194,8 +1214,19 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
 	count = dx_make_map ((struct ext4_dir_entry_2 *) data1,
 			     blocksize, hinfo, map);
 	map -= count;
-	split = count/2; // need to adjust to actual middle
 	dx_sort_map (map, count);
+	/* Split the existing block in the middle, size-wise */
+	size = 0;
+	move = 0;
+	for (i = count-1; i >= 0; i--) {
+		/* is more than half of this entry in 2nd half of the block? */
+		if (size + map[i].size/2 > blocksize/2)
+			break;
+		size += map[i].size;
+		move++;
+	}
+	/* map index at which we will split */
+	split = count - move;
 	hash2 = map[split].hash;
 	continued = hash2 == map[split - 1].hash;
 	dxtrace(printk("Split block %i at %x, %i/%i\n",
-- 
cgit v1.2.3


From bcc7b445eff295664a3a3ab14e742b3c9d88e6e3 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@sandeen.net>
Date: Thu, 30 Aug 2007 17:21:38 +1000
Subject: [XFS] fix filestreams on 32-bit boxes

xfs_filestream_mount() sets up an mru cache with:
  err = xfs_mru_cache_create(&mp->m_filestream, lifetime, grp_count,
  (xfs_mru_cache_free_func_t)xfs_fstrm_free_func);
but that cast is causing problems...
  typedef void (*xfs_mru_cache_free_func_t)(unsigned long, void*);
but:
  void xfs_fstrm_free_func( xfs_ino_t ino, fstrm_item_t *item)
so on a 32-bit box, it's casting (32, 32) args into (64, 32) and I assume
it's getting garbage for *item, which subsequently causes an explosion.
With this change the filestreams xfsqa tests don't oops on my 32-bit box.

SGI-PV: 967795
SGI-Modid: xfs-linux-melb:xfs-kern:29510a

Signed-off-by: Eric Sandeen <sandeen@sandeen.net>
Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Tim Shimmin <tes@sgi.com>
---
 fs/xfs/xfs_filestream.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index 16f8e175167..36d8f6aa11a 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -350,9 +350,10 @@ _xfs_filestream_update_ag(
 /* xfs_fstrm_free_func(): callback for freeing cached stream items. */
 void
 xfs_fstrm_free_func(
-	xfs_ino_t	ino,
-	fstrm_item_t	*item)
+	unsigned long	ino,
+	void		*data)
 {
+	fstrm_item_t	*item  = (fstrm_item_t *)data;
 	xfs_inode_t	*ip = item->ip;
 	int ref;
 
@@ -438,7 +439,7 @@ xfs_filestream_mount(
 	grp_count = 10;
 
 	err = xfs_mru_cache_create(&mp->m_filestream, lifetime, grp_count,
-	                     (xfs_mru_cache_free_func_t)xfs_fstrm_free_func);
+	                     xfs_fstrm_free_func);
 
 	return err;
 }
-- 
cgit v1.2.3


From 1bc5858d0d40e07697b5eda47ed8628b8a934235 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Wed, 19 Sep 2007 15:27:30 +1000
Subject: [XFS] fix valid but harmless sparse warning

The new xlog_recover_do_reg_buffer checks call be16_to_cpu on di_gen which
is a 32bit value so sparse rightly complains. Fortunately the warning is
harmless because we don't care for the value, but only whether it's
non-NULL. Due to that fact we can simply kill the endian swaps on this and
the previous di_mode check entirely.

SGI-PV: 969656
SGI-Modid: xfs-linux-melb:xfs-kern:29709a

Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
Signed-off-by: Tim Shimmin <tes@sgi.com>
---
 fs/xfs/xfs_log_recover.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index dacb19739cc..7174991f4be 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1920,9 +1920,9 @@ xlog_recover_do_reg_buffer(
 				stale_buf = 1;
 				break;
 			}
-			if (be16_to_cpu(dip->di_core.di_mode))
+			if (dip->di_core.di_mode)
 				mode_count++;
-			if (be16_to_cpu(dip->di_core.di_gen))
+			if (dip->di_core.di_gen)
 				gen_count++;
 		}
 
-- 
cgit v1.2.3


From b8fceee17a310f189188599a8fa5e9beaff57eb0 Mon Sep 17 00:00:00 2001
From: Davide Libenzi <davidel@xmailserver.org>
Date: Thu, 20 Sep 2007 12:40:16 -0700
Subject: signalfd simplification

This simplifies signalfd code, by avoiding it to remain attached to the
sighand during its lifetime.

In this way, the signalfd remain attached to the sighand only during
poll(2) (and select and epoll) and read(2).  This also allows to remove
all the custom "tsk == current" checks in kernel/signal.c, since
dequeue_signal() will only be called by "current".

I think this is also what Ben was suggesting time ago.

The external effect of this, is that a thread can extract only its own
private signals and the group ones.  I think this is an acceptable
behaviour, in that those are the signals the thread would be able to
fetch w/out signalfd.

Signed-off-by: Davide Libenzi <davidel@xmailserver.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/exec.c                 |   3 -
 fs/signalfd.c             | 190 +++++++---------------------------------------
 include/linux/init_task.h |   2 +-
 include/linux/sched.h     |   2 +-
 include/linux/signalfd.h  |  40 +---------
 kernel/exit.c             |   9 ---
 kernel/fork.c             |   2 +-
 kernel/signal.c           |   8 +-
 8 files changed, 39 insertions(+), 217 deletions(-)

(limited to 'fs')

diff --git a/fs/exec.c b/fs/exec.c
index c21a8cc0627..073b0b8c6d0 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -50,7 +50,6 @@
 #include <linux/tsacct_kern.h>
 #include <linux/cn_proc.h>
 #include <linux/audit.h>
-#include <linux/signalfd.h>
 
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
@@ -784,7 +783,6 @@ static int de_thread(struct task_struct *tsk)
 	 * and we can just re-use it all.
 	 */
 	if (atomic_read(&oldsighand->count) <= 1) {
-		signalfd_detach(tsk);
 		exit_itimers(sig);
 		return 0;
 	}
@@ -923,7 +921,6 @@ static int de_thread(struct task_struct *tsk)
 	sig->flags = 0;
 
 no_thread_group:
-	signalfd_detach(tsk);
 	exit_itimers(sig);
 	if (leader)
 		release_task(leader);
diff --git a/fs/signalfd.c b/fs/signalfd.c
index a8e293d3003..aefb0be0794 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -11,8 +11,10 @@
  *      Now using anonymous inode source.
  *      Thanks to Oleg Nesterov for useful code review and suggestions.
  *      More comments and suggestions from Arnd Bergmann.
- * Sat May 19, 2007: Davi E. M. Arnaut <davi@haxent.com.br>
+ *  Sat May 19, 2007: Davi E. M. Arnaut <davi@haxent.com.br>
  *      Retrieve multiple signals with one read() call
+ *  Sun Jul 15, 2007: Davide Libenzi <davidel@xmailserver.org>
+ *      Attach to the sighand only during read() and poll().
  */
 
 #include <linux/file.h>
@@ -27,102 +29,12 @@
 #include <linux/signalfd.h>
 
 struct signalfd_ctx {
-	struct list_head lnk;
-	wait_queue_head_t wqh;
 	sigset_t sigmask;
-	struct task_struct *tsk;
 };
 
-struct signalfd_lockctx {
-	struct task_struct *tsk;
-	unsigned long flags;
-};
-
-/*
- * Tries to acquire the sighand lock. We do not increment the sighand
- * use count, and we do not even pin the task struct, so we need to
- * do it inside an RCU read lock, and we must be prepared for the
- * ctx->tsk going to NULL (in signalfd_deliver()), and for the sighand
- * being detached. We return 0 if the sighand has been detached, or
- * 1 if we were able to pin the sighand lock.
- */
-static int signalfd_lock(struct signalfd_ctx *ctx, struct signalfd_lockctx *lk)
-{
-	struct sighand_struct *sighand = NULL;
-
-	rcu_read_lock();
-	lk->tsk = rcu_dereference(ctx->tsk);
-	if (likely(lk->tsk != NULL))
-		sighand = lock_task_sighand(lk->tsk, &lk->flags);
-	rcu_read_unlock();
-
-	if (!sighand)
-		return 0;
-
-	if (!ctx->tsk) {
-		unlock_task_sighand(lk->tsk, &lk->flags);
-		return 0;
-	}
-
-	if (lk->tsk->tgid == current->tgid)
-		lk->tsk = current;
-
-	return 1;
-}
-
-static void signalfd_unlock(struct signalfd_lockctx *lk)
-{
-	unlock_task_sighand(lk->tsk, &lk->flags);
-}
-
-/*
- * This must be called with the sighand lock held.
- */
-void signalfd_deliver(struct task_struct *tsk, int sig)
-{
-	struct sighand_struct *sighand = tsk->sighand;
-	struct signalfd_ctx *ctx, *tmp;
-
-	BUG_ON(!sig);
-	list_for_each_entry_safe(ctx, tmp, &sighand->signalfd_list, lnk) {
-		/*
-		 * We use a negative signal value as a way to broadcast that the
-		 * sighand has been orphaned, so that we can notify all the
-		 * listeners about this. Remember the ctx->sigmask is inverted,
-		 * so if the user is interested in a signal, that corresponding
-		 * bit will be zero.
-		 */
-		if (sig < 0) {
-			if (ctx->tsk == tsk) {
-				ctx->tsk = NULL;
-				list_del_init(&ctx->lnk);
-				wake_up(&ctx->wqh);
-			}
-		} else {
-			if (!sigismember(&ctx->sigmask, sig))
-				wake_up(&ctx->wqh);
-		}
-	}
-}
-
-static void signalfd_cleanup(struct signalfd_ctx *ctx)
-{
-	struct signalfd_lockctx lk;
-
-	/*
-	 * This is tricky. If the sighand is gone, we do not need to remove
-	 * context from the list, the list itself won't be there anymore.
-	 */
-	if (signalfd_lock(ctx, &lk)) {
-		list_del(&ctx->lnk);
-		signalfd_unlock(&lk);
-	}
-	kfree(ctx);
-}
-
 static int signalfd_release(struct inode *inode, struct file *file)
 {
-	signalfd_cleanup(file->private_data);
+	kfree(file->private_data);
 	return 0;
 }
 
@@ -130,23 +42,15 @@ static unsigned int signalfd_poll(struct file *file, poll_table *wait)
 {
 	struct signalfd_ctx *ctx = file->private_data;
 	unsigned int events = 0;
-	struct signalfd_lockctx lk;
 
-	poll_wait(file, &ctx->wqh, wait);
+	poll_wait(file, &current->sighand->signalfd_wqh, wait);
 
-	/*
-	 * Let the caller get a POLLIN in this case, ala socket recv() when
-	 * the peer disconnects.
-	 */
-	if (signalfd_lock(ctx, &lk)) {
-		if ((lk.tsk == current &&
-		     next_signal(&lk.tsk->pending, &ctx->sigmask) > 0) ||
-		    next_signal(&lk.tsk->signal->shared_pending,
-				&ctx->sigmask) > 0)
-			events |= POLLIN;
-		signalfd_unlock(&lk);
-	} else
+	spin_lock_irq(&current->sighand->siglock);
+	if (next_signal(&current->pending, &ctx->sigmask) ||
+	    next_signal(&current->signal->shared_pending,
+			&ctx->sigmask))
 		events |= POLLIN;
+	spin_unlock_irq(&current->sighand->siglock);
 
 	return events;
 }
@@ -219,59 +123,46 @@ static ssize_t signalfd_dequeue(struct signalfd_ctx *ctx, siginfo_t *info,
 				int nonblock)
 {
 	ssize_t ret;
-	struct signalfd_lockctx lk;
 	DECLARE_WAITQUEUE(wait, current);
 
-	if (!signalfd_lock(ctx, &lk))
-		return 0;
-
-	ret = dequeue_signal(lk.tsk, &ctx->sigmask, info);
+	spin_lock_irq(&current->sighand->siglock);
+	ret = dequeue_signal(current, &ctx->sigmask, info);
 	switch (ret) {
 	case 0:
 		if (!nonblock)
 			break;
 		ret = -EAGAIN;
 	default:
-		signalfd_unlock(&lk);
+		spin_unlock_irq(&current->sighand->siglock);
 		return ret;
 	}
 
-	add_wait_queue(&ctx->wqh, &wait);
+	add_wait_queue(&current->sighand->signalfd_wqh, &wait);
 	for (;;) {
 		set_current_state(TASK_INTERRUPTIBLE);
-		ret = dequeue_signal(lk.tsk, &ctx->sigmask, info);
-		signalfd_unlock(&lk);
+		ret = dequeue_signal(current, &ctx->sigmask, info);
 		if (ret != 0)
 			break;
 		if (signal_pending(current)) {
 			ret = -ERESTARTSYS;
 			break;
 		}
+		spin_unlock_irq(&current->sighand->siglock);
 		schedule();
-		ret = signalfd_lock(ctx, &lk);
-		if (unlikely(!ret)) {
-			/*
-			 * Let the caller read zero byte, ala socket
-			 * recv() when the peer disconnect. This test
-			 * must be done before doing a dequeue_signal(),
-			 * because if the sighand has been orphaned,
-			 * the dequeue_signal() call is going to crash
-			 * because ->sighand will be long gone.
-			 */
-			 break;
-		}
+		spin_lock_irq(&current->sighand->siglock);
 	}
+	spin_unlock_irq(&current->sighand->siglock);
 
-	remove_wait_queue(&ctx->wqh, &wait);
+	remove_wait_queue(&current->sighand->signalfd_wqh, &wait);
 	__set_current_state(TASK_RUNNING);
 
 	return ret;
 }
 
 /*
- * Returns either the size of a "struct signalfd_siginfo", or zero if the
- * sighand we are attached to, has been orphaned. The "count" parameter
- * must be at least the size of a "struct signalfd_siginfo".
+ * Returns a multiple of the size of a "struct signalfd_siginfo", or a negative
+ * error code. The "count" parameter must be at least the size of a
+ * "struct signalfd_siginfo".
  */
 static ssize_t signalfd_read(struct file *file, char __user *buf, size_t count,
 			     loff_t *ppos)
@@ -287,7 +178,6 @@ static ssize_t signalfd_read(struct file *file, char __user *buf, size_t count,
 		return -EINVAL;
 
 	siginfo = (struct signalfd_siginfo __user *) buf;
-
 	do {
 		ret = signalfd_dequeue(ctx, &info, nonblock);
 		if (unlikely(ret <= 0))
@@ -300,7 +190,7 @@ static ssize_t signalfd_read(struct file *file, char __user *buf, size_t count,
 		nonblock = 1;
 	} while (--count);
 
-	return total ? total : ret;
+	return total ? total: ret;
 }
 
 static const struct file_operations signalfd_fops = {
@@ -309,20 +199,13 @@ static const struct file_operations signalfd_fops = {
 	.read		= signalfd_read,
 };
 
-/*
- * Create a file descriptor that is associated with our signal
- * state. We can pass it around to others if we want to, but
- * it will always be _our_ signal state.
- */
 asmlinkage long sys_signalfd(int ufd, sigset_t __user *user_mask, size_t sizemask)
 {
 	int error;
 	sigset_t sigmask;
 	struct signalfd_ctx *ctx;
-	struct sighand_struct *sighand;
 	struct file *file;
 	struct inode *inode;
-	struct signalfd_lockctx lk;
 
 	if (sizemask != sizeof(sigset_t) ||
 	    copy_from_user(&sigmask, user_mask, sizeof(sigmask)))
@@ -335,17 +218,7 @@ asmlinkage long sys_signalfd(int ufd, sigset_t __user *user_mask, size_t sizemas
 		if (!ctx)
 			return -ENOMEM;
 
-		init_waitqueue_head(&ctx->wqh);
 		ctx->sigmask = sigmask;
-		ctx->tsk = current->group_leader;
-
-		sighand = current->sighand;
-		/*
-		 * Add this fd to the list of signal listeners.
-		 */
-		spin_lock_irq(&sighand->siglock);
-		list_add_tail(&ctx->lnk, &sighand->signalfd_list);
-		spin_unlock_irq(&sighand->siglock);
 
 		/*
 		 * When we call this, the initialization must be complete, since
@@ -364,23 +237,18 @@ asmlinkage long sys_signalfd(int ufd, sigset_t __user *user_mask, size_t sizemas
 			fput(file);
 			return -EINVAL;
 		}
-		/*
-		 * We need to be prepared of the fact that the sighand this fd
-		 * is attached to, has been detched. In that case signalfd_lock()
-		 * will return 0, and we'll just skip setting the new mask.
-		 */
-		if (signalfd_lock(ctx, &lk)) {
-			ctx->sigmask = sigmask;
-			signalfd_unlock(&lk);
-		}
-		wake_up(&ctx->wqh);
+		spin_lock_irq(&current->sighand->siglock);
+		ctx->sigmask = sigmask;
+		spin_unlock_irq(&current->sighand->siglock);
+
+		wake_up(&current->sighand->signalfd_wqh);
 		fput(file);
 	}
 
 	return ufd;
 
 err_fdalloc:
-	signalfd_cleanup(ctx);
+	kfree(ctx);
 	return error;
 }
 
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index cab741c2d60..f8abfa349ef 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -86,7 +86,7 @@ extern struct nsproxy init_nsproxy;
 	.count		= ATOMIC_INIT(1), 				\
 	.action		= { { { .sa_handler = NULL, } }, },		\
 	.siglock	= __SPIN_LOCK_UNLOCKED(sighand.siglock),	\
-	.signalfd_list	= LIST_HEAD_INIT(sighand.signalfd_list),	\
+	.signalfd_wqh	= __WAIT_QUEUE_HEAD_INITIALIZER(sighand.signalfd_wqh),	\
 }
 
 extern struct group_info init_groups;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3de79016f2a..a01ac6dd5f5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -438,7 +438,7 @@ struct sighand_struct {
 	atomic_t		count;
 	struct k_sigaction	action[_NSIG];
 	spinlock_t		siglock;
-	struct list_head        signalfd_list;
+	wait_queue_head_t	signalfd_wqh;
 };
 
 struct pacct_struct {
diff --git a/include/linux/signalfd.h b/include/linux/signalfd.h
index 51042949569..4c9ff0910ae 100644
--- a/include/linux/signalfd.h
+++ b/include/linux/signalfd.h
@@ -45,49 +45,17 @@ struct signalfd_siginfo {
 #ifdef CONFIG_SIGNALFD
 
 /*
- * Deliver the signal to listening signalfd. This must be called
- * with the sighand lock held. Same are the following that end up
- * calling signalfd_deliver().
- */
-void signalfd_deliver(struct task_struct *tsk, int sig);
-
-/*
- * No need to fall inside signalfd_deliver() if no signal listeners
- * are available.
+ * Deliver the signal to listening signalfd.
  */
 static inline void signalfd_notify(struct task_struct *tsk, int sig)
 {
-	if (unlikely(!list_empty(&tsk->sighand->signalfd_list)))
-		signalfd_deliver(tsk, sig);
-}
-
-/*
- * The signal -1 is used to notify the signalfd that the sighand
- * is on its way to be detached.
- */
-static inline void signalfd_detach_locked(struct task_struct *tsk)
-{
-	if (unlikely(!list_empty(&tsk->sighand->signalfd_list)))
-		signalfd_deliver(tsk, -1);
-}
-
-static inline void signalfd_detach(struct task_struct *tsk)
-{
-	struct sighand_struct *sighand = tsk->sighand;
-
-	if (unlikely(!list_empty(&sighand->signalfd_list))) {
-		spin_lock_irq(&sighand->siglock);
-		signalfd_deliver(tsk, -1);
-		spin_unlock_irq(&sighand->siglock);
-	}
+	if (unlikely(waitqueue_active(&tsk->sighand->signalfd_wqh)))
+		wake_up(&tsk->sighand->signalfd_wqh);
 }
 
 #else /* CONFIG_SIGNALFD */
 
-#define signalfd_deliver(t, s) do { } while (0)
-#define signalfd_notify(t, s) do { } while (0)
-#define signalfd_detach_locked(t) do { } while (0)
-#define signalfd_detach(t) do { } while (0)
+static inline void signalfd_notify(struct task_struct *tsk, int sig) { }
 
 #endif /* CONFIG_SIGNALFD */
 
diff --git a/kernel/exit.c b/kernel/exit.c
index 06b24b3aa37..993369ee94d 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -24,7 +24,6 @@
 #include <linux/pid_namespace.h>
 #include <linux/ptrace.h>
 #include <linux/profile.h>
-#include <linux/signalfd.h>
 #include <linux/mount.h>
 #include <linux/proc_fs.h>
 #include <linux/kthread.h>
@@ -86,14 +85,6 @@ static void __exit_signal(struct task_struct *tsk)
 	sighand = rcu_dereference(tsk->sighand);
 	spin_lock(&sighand->siglock);
 
-	/*
-	 * Notify that this sighand has been detached. This must
-	 * be called with the tsk->sighand lock held. Also, this
-	 * access tsk->sighand internally, so it must be called
-	 * before tsk->sighand is reset.
-	 */
-	signalfd_detach_locked(tsk);
-
 	posix_cpu_timers_exit(tsk);
 	if (atomic_dec_and_test(&sig->count))
 		posix_cpu_timers_exit_group(tsk);
diff --git a/kernel/fork.c b/kernel/fork.c
index 7332e236d36..33f12f48684 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1438,7 +1438,7 @@ static void sighand_ctor(void *data, struct kmem_cache *cachep,
 	struct sighand_struct *sighand = data;
 
 	spin_lock_init(&sighand->siglock);
-	INIT_LIST_HEAD(&sighand->signalfd_list);
+	init_waitqueue_head(&sighand->signalfd_wqh);
 }
 
 void __init proc_caches_init(void)
diff --git a/kernel/signal.c b/kernel/signal.c
index 3169bed0b4d..9fb91a32edd 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -378,8 +378,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
 	/* We only dequeue private signals from ourselves, we don't let
 	 * signalfd steal them
 	 */
-	if (likely(tsk == current))
-		signr = __dequeue_signal(&tsk->pending, mask, info);
+	signr = __dequeue_signal(&tsk->pending, mask, info);
 	if (!signr) {
 		signr = __dequeue_signal(&tsk->signal->shared_pending,
 					 mask, info);
@@ -407,8 +406,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
 			}
 		}
 	}
-	if (likely(tsk == current))
-		recalc_sigpending();
+	recalc_sigpending();
 	if (signr && unlikely(sig_kernel_stop(signr))) {
 		/*
 		 * Set a marker that we have dequeued a stop signal.  Our
@@ -425,7 +423,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
 		if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT))
 			tsk->signal->flags |= SIGNAL_STOP_DEQUEUED;
 	}
-	if (signr && likely(tsk == current) &&
+	if (signr &&
 	     ((info->si_code & __SI_MASK) == __SI_TIMER) &&
 	     info->si_sys_private){
 		/*
-- 
cgit v1.2.3


From 415cb800375cc4e89fb5a6a454e484bd4adbffb4 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Sun, 16 Sep 2007 20:10:16 -0700
Subject: ocfs2: Allow smaller allocations during large writes

The ocfs2 write code loops through a page much like the block code, except
that ocfs2 allocation units can be any size, including larger than page
size. Typically it's equal to or larger than page size - most kernels run 4k
pages, the minimum ocfs2 allocation (cluster) size.

Some changes introduced during 2.6.23 changed the way writes to pages are
handled, and inadvertantly broke support for > 4k page size. Instead of just
writing one cluster at a time, we now handle the whole page in one pass.

This means that multiple (small) seperate allocations might happen in the
same pass. The allocation code howver typically optimizes by getting the
maximum which was reserved. This triggered a BUG_ON in the extend code where
it'd ask for a single bit (for one part of a > 4k page) and get back more
than it asked for.

Fix this by providing a variant of the high level allocation function which
allows the caller to specify a maximum. The traditional function remains and
just calls the new one with a maximum determined from the initial
reservation.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/file.c       |  4 ++--
 fs/ocfs2/localalloc.c |  4 +---
 fs/ocfs2/localalloc.h |  2 +-
 fs/ocfs2/suballoc.c   | 29 +++++++++++++++++++++--------
 fs/ocfs2/suballoc.h   | 11 +++++++++++
 5 files changed, 36 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 7e34e66159c..f3bc3658e7a 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -491,8 +491,8 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
 		goto leave;
 	}
 
-	status = ocfs2_claim_clusters(osb, handle, data_ac, 1,
-				      &bit_off, &num_bits);
+	status = __ocfs2_claim_clusters(osb, handle, data_ac, 1,
+					clusters_to_add, &bit_off, &num_bits);
 	if (status < 0) {
 		if (status != -ENOSPC)
 			mlog_errno(status);
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 545f7892cdf..de984d27257 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -524,13 +524,12 @@ bail:
 int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
 				 handle_t *handle,
 				 struct ocfs2_alloc_context *ac,
-				 u32 min_bits,
+				 u32 bits_wanted,
 				 u32 *bit_off,
 				 u32 *num_bits)
 {
 	int status, start;
 	struct inode *local_alloc_inode;
-	u32 bits_wanted;
 	void *bitmap;
 	struct ocfs2_dinode *alloc;
 	struct ocfs2_local_alloc *la;
@@ -538,7 +537,6 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
 	mlog_entry_void();
 	BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL);
 
-	bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
 	local_alloc_inode = ac->ac_inode;
 	alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
 	la = OCFS2_LOCAL_ALLOC(alloc);
diff --git a/fs/ocfs2/localalloc.h b/fs/ocfs2/localalloc.h
index 385a10152f9..3f76631e110 100644
--- a/fs/ocfs2/localalloc.h
+++ b/fs/ocfs2/localalloc.h
@@ -48,7 +48,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
 int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
 				 handle_t *handle,
 				 struct ocfs2_alloc_context *ac,
-				 u32 min_bits,
+				 u32 bits_wanted,
 				 u32 *bit_off,
 				 u32 *num_bits);
 
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index d9c5c9fcb30..8f09f5235e3 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -1486,21 +1486,21 @@ static inline void ocfs2_block_to_cluster_group(struct inode *inode,
  * contig. allocation, set to '1' to indicate we can deal with extents
  * of any size.
  */
-int ocfs2_claim_clusters(struct ocfs2_super *osb,
-			 handle_t *handle,
-			 struct ocfs2_alloc_context *ac,
-			 u32 min_clusters,
-			 u32 *cluster_start,
-			 u32 *num_clusters)
+int __ocfs2_claim_clusters(struct ocfs2_super *osb,
+			   handle_t *handle,
+			   struct ocfs2_alloc_context *ac,
+			   u32 min_clusters,
+			   u32 max_clusters,
+			   u32 *cluster_start,
+			   u32 *num_clusters)
 {
 	int status;
-	unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
+	unsigned int bits_wanted = max_clusters;
 	u64 bg_blkno = 0;
 	u16 bg_bit_off;
 
 	mlog_entry_void();
 
-	BUG_ON(!ac);
 	BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
 
 	BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL
@@ -1557,6 +1557,19 @@ bail:
 	return status;
 }
 
+int ocfs2_claim_clusters(struct ocfs2_super *osb,
+			 handle_t *handle,
+			 struct ocfs2_alloc_context *ac,
+			 u32 min_clusters,
+			 u32 *cluster_start,
+			 u32 *num_clusters)
+{
+	unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
+
+	return __ocfs2_claim_clusters(osb, handle, ac, min_clusters,
+				      bits_wanted, cluster_start, num_clusters);
+}
+
 static inline int ocfs2_block_group_clear_bits(handle_t *handle,
 					       struct inode *alloc_inode,
 					       struct ocfs2_group_desc *bg,
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index f212dc01a84..cafe9370309 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -85,6 +85,17 @@ int ocfs2_claim_clusters(struct ocfs2_super *osb,
 			 u32 min_clusters,
 			 u32 *cluster_start,
 			 u32 *num_clusters);
+/*
+ * Use this variant of ocfs2_claim_clusters to specify a maxiumum
+ * number of clusters smaller than the allocation reserved.
+ */
+int __ocfs2_claim_clusters(struct ocfs2_super *osb,
+			   handle_t *handle,
+			   struct ocfs2_alloc_context *ac,
+			   u32 min_clusters,
+			   u32 max_clusters,
+			   u32 *cluster_start,
+			   u32 *num_clusters);
 
 int ocfs2_free_suballoc_bits(handle_t *handle,
 			     struct inode *alloc_inode,
-- 
cgit v1.2.3


From db56246c6980e376b02d2da568d119da71f82fb9 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Mon, 17 Sep 2007 09:06:29 -0700
Subject: ocfs2: Fix pos/len passed to ocfs2_write_cluster

This was broken for file systems whose cluster size is greater than page
size. Pos needs to be incremented as we loop through the descriptors, and
len needs to be capped to the size of a single cluster.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/aops.c | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 50cd8a20901..fa43810e597 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1211,18 +1211,33 @@ static int ocfs2_write_cluster_by_desc(struct address_space *mapping,
 				       loff_t pos, unsigned len)
 {
 	int ret, i;
+	loff_t cluster_off;
+	unsigned int local_len = len;
 	struct ocfs2_write_cluster_desc *desc;
+	struct ocfs2_super *osb = OCFS2_SB(mapping->host->i_sb);
 
 	for (i = 0; i < wc->w_clen; i++) {
 		desc = &wc->w_desc[i];
 
+		/*
+		 * We have to make sure that the total write passed in
+		 * doesn't extend past a single cluster.
+		 */
+		local_len = len;
+		cluster_off = pos & (osb->s_clustersize - 1);
+		if ((cluster_off + local_len) > osb->s_clustersize)
+			local_len = osb->s_clustersize - cluster_off;
+
 		ret = ocfs2_write_cluster(mapping, desc->c_phys,
 					  desc->c_unwritten, data_ac, meta_ac,
-					  wc, desc->c_cpos, pos, len);
+					  wc, desc->c_cpos, pos, local_len);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
 		}
+
+		len -= local_len;
+		pos += local_len;
 	}
 
 	ret = 0;
-- 
cgit v1.2.3


From 5c26a7b70f89c36e8d9acc95cb896c3cd205fc8d Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Tue, 18 Sep 2007 17:49:29 -0700
Subject: ocfs2: Don't double set write parameters

The target page offsets were being incorrectly set a second time in
ocfs2_prepare_page_for_write(), which was causing problems on a 16k page
size kernel. Additionally, ocfs2_write_failure() was incorrectly using those
parameters instead of the parameters for the individual page being cleaned
up.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/aops.c | 16 +++-------------
 1 file changed, 3 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index fa43810e597..f37f25c931f 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -930,18 +930,11 @@ static void ocfs2_write_failure(struct inode *inode,
 				loff_t user_pos, unsigned user_len)
 {
 	int i;
-	unsigned from, to;
+	unsigned from = user_pos & (PAGE_CACHE_SIZE - 1),
+		to = user_pos + user_len;
 	struct page *tmppage;
 
-	ocfs2_zero_new_buffers(wc->w_target_page, user_pos, user_len);
-
-	if (wc->w_large_pages) {
-		from = wc->w_target_from;
-		to = wc->w_target_to;
-	} else {
-		from = 0;
-		to = PAGE_CACHE_SIZE;
-	}
+	ocfs2_zero_new_buffers(wc->w_target_page, from, to);
 
 	for(i = 0; i < wc->w_num_pages; i++) {
 		tmppage = wc->w_pages[i];
@@ -991,9 +984,6 @@ static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno,
 			map_from = cluster_start;
 			map_to = cluster_end;
 		}
-
-		wc->w_target_from = map_from;
-		wc->w_target_to = map_to;
 	} else {
 		/*
 		 * If we haven't allocated the new page yet, we
-- 
cgit v1.2.3


From 813d974c53a2b353566a86bb127625b403696dae Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.mushran@oracle.com>
Date: Thu, 20 Sep 2007 10:59:48 -0700
Subject: ocfs2: Pack vote message and response structures

The ocfs2_vote_msg and ocfs2_response_msg structs needed to be
packed to ensure similar sizeofs in 32-bit and 64-bit arches. Without this,
we had inadvertantly broken 32/64 bit cross mounts.

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/vote.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/vote.c b/fs/ocfs2/vote.c
index 66a13ee63d4..c05358538f2 100644
--- a/fs/ocfs2/vote.c
+++ b/fs/ocfs2/vote.c
@@ -66,7 +66,7 @@ struct ocfs2_vote_msg
 {
 	struct ocfs2_msg_hdr v_hdr;
 	__be32 v_reserved1;
-};
+} __attribute__ ((packed));
 
 /* Responses are given these values to maintain backwards
  * compatibility with older ocfs2 versions */
@@ -78,7 +78,7 @@ struct ocfs2_response_msg
 {
 	struct ocfs2_msg_hdr r_hdr;
 	__be32 r_response;
-};
+} __attribute__ ((packed));
 
 struct ocfs2_vote_work {
 	struct list_head   w_list;
-- 
cgit v1.2.3


From d59952d532ed8fc93ccb98186f73d9ce5cfcb93d Mon Sep 17 00:00:00 2001
From: Jean Tourrilhes <jt@hpl.hp.com>
Date: Thu, 20 Sep 2007 11:08:18 -0700
Subject: [PATCH] WE : Add missing auth compat-ioctl

Johannes just found that we are missing a compat-ioctl
declaration. The fix is trivial. As previous patches for compat-ioctl,
this should also go to stable.

More info :
	http://marc.info/?l=linux-wireless&m=119029667902588&w=2

Signed-off-by: Jean Tourrilhes <jt@hpl.hp.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 fs/compat_ioctl.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 5a5b7116cef..37310b0e810 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -3190,6 +3190,8 @@ COMPATIBLE_IOCTL(SIOCSIWRETRY)
 COMPATIBLE_IOCTL(SIOCGIWRETRY)
 COMPATIBLE_IOCTL(SIOCSIWPOWER)
 COMPATIBLE_IOCTL(SIOCGIWPOWER)
+COMPATIBLE_IOCTL(SIOCSIWAUTH)
+COMPATIBLE_IOCTL(SIOCGIWAUTH)
 /* hiddev */
 COMPATIBLE_IOCTL(HIDIOCGVERSION)
 COMPATIBLE_IOCTL(HIDIOCAPPLICATION)
-- 
cgit v1.2.3


From f9720205d1f847cb59e197e851b5276425363f6b Mon Sep 17 00:00:00 2001
From: Bernd Schmidt <bernd.schmidt@analog.com>
Date: Wed, 3 Oct 2007 23:41:43 +0800
Subject: Binfmt_flat: Add minimum support for the Blackfin relocations

Add minimum support for the Blackfin relocations, since we don't have
enough space in each reloc.  The idea is to store a value with one
relocation so that subsequent ones can access it.

Actually, this patch is required for Blackfin.  Currently if BINFMT_FLAT is
enabled, git-tree kernel will fail to compile.

Signed-off-by: Bernd Schmidt <bernd.schmidt@analog.com>
Signed-off-by: Bryan Wu <bryan.wu@analog.com>
Cc: David McCullough <davidm@snapgear.com>
Cc: Greg Ungerer <gerg@snapgear.com>
Cc: Miles Bader <miles.bader@necel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/binfmt_flat.c             | 5 ++++-
 include/asm-h8300/flat.h     | 3 ++-
 include/asm-m32r/flat.h      | 3 ++-
 include/asm-m68knommu/flat.h | 3 ++-
 include/asm-sh/flat.h        | 3 ++-
 include/asm-v850/flat.h      | 4 +++-
 6 files changed, 15 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 861141b4f6d..34e9b06a744 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -742,6 +742,7 @@ static int load_flat_file(struct linux_binprm * bprm,
 	 * __start to address 4 so that is okay).
 	 */
 	if (rev > OLD_FLAT_VERSION) {
+		unsigned long persistent = 0;
 		for (i=0; i < relocs; i++) {
 			unsigned long addr, relval;
 
@@ -749,6 +750,8 @@ static int load_flat_file(struct linux_binprm * bprm,
 			   relocated (of course, the address has to be
 			   relocated first).  */
 			relval = ntohl(reloc[i]);
+			if (flat_set_persistent (relval, &persistent))
+				continue;
 			addr = flat_get_relocate_addr(relval);
 			rp = (unsigned long *) calc_reloc(addr, libinfo, id, 1);
 			if (rp == (unsigned long *)RELOC_FAILED) {
@@ -757,7 +760,7 @@ static int load_flat_file(struct linux_binprm * bprm,
 			}
 
 			/* Get the pointer's value.  */
-			addr = flat_get_addr_from_rp(rp, relval, flags);
+			addr = flat_get_addr_from_rp(rp, relval, flags, &persistent);
 			if (addr != 0) {
 				/*
 				 * Do the relocation.  PIC relocs in the data section are
diff --git a/include/asm-h8300/flat.h b/include/asm-h8300/flat.h
index c20eee767d6..2a873508a9a 100644
--- a/include/asm-h8300/flat.h
+++ b/include/asm-h8300/flat.h
@@ -9,6 +9,7 @@
 #define	flat_argvp_envp_on_stack()		1
 #define	flat_old_ram_flag(flags)		1
 #define	flat_reloc_valid(reloc, size)		((reloc) <= (size))
+#define	flat_set_persistent(relval, p)		0
 
 /*
  * on the H8 a couple of the relocations have an instruction in the
@@ -18,7 +19,7 @@
  */
 
 #define	flat_get_relocate_addr(rel)		(rel)
-#define flat_get_addr_from_rp(rp, relval, flags) \
+#define flat_get_addr_from_rp(rp, relval, flags, persistent) \
         (get_unaligned(rp) & ((flags & FLAT_FLAG_GOTPIC) ? 0xffffffff: 0x00ffffff))
 #define flat_put_addr_at_rp(rp, addr, rel) \
 	put_unaligned (((*(char *)(rp)) << 24) | ((addr) & 0x00ffffff), rp)
diff --git a/include/asm-m32r/flat.h b/include/asm-m32r/flat.h
index 1b285f65cab..d851cf0c4aa 100644
--- a/include/asm-m32r/flat.h
+++ b/include/asm-m32r/flat.h
@@ -15,9 +15,10 @@
 #define	flat_stack_align(sp)		(*sp += (*sp & 3 ? (4 - (*sp & 3)): 0))
 #define	flat_argvp_envp_on_stack()		0
 #define	flat_old_ram_flag(flags)		(flags)
+#define	flat_set_persistent(relval, p)		0
 #define	flat_reloc_valid(reloc, size)		\
 	(((reloc) - textlen_for_m32r_lo16_data) <= (size))
-#define flat_get_addr_from_rp(rp, relval, flags) \
+#define flat_get_addr_from_rp(rp, relval, flags, persistent) \
 	m32r_flat_get_addr_from_rp(rp, relval, (text_len) )
 
 #define flat_put_addr_at_rp(rp, addr, relval) \
diff --git a/include/asm-m68knommu/flat.h b/include/asm-m68knommu/flat.h
index 2d836edc434..814b5174a8e 100644
--- a/include/asm-m68knommu/flat.h
+++ b/include/asm-m68knommu/flat.h
@@ -9,8 +9,9 @@
 #define	flat_argvp_envp_on_stack()		1
 #define	flat_old_ram_flag(flags)		(flags)
 #define	flat_reloc_valid(reloc, size)		((reloc) <= (size))
-#define	flat_get_addr_from_rp(rp, relval, flags)	get_unaligned(rp)
+#define	flat_get_addr_from_rp(rp, relval, flags, p)	get_unaligned(rp)
 #define	flat_put_addr_at_rp(rp, val, relval)	put_unaligned(val,rp)
 #define	flat_get_relocate_addr(rel)		(rel)
+#define	flat_set_persistent(relval, p)		0
 
 #endif /* __M68KNOMMU_FLAT_H__ */
diff --git a/include/asm-sh/flat.h b/include/asm-sh/flat.h
index 0d5cc04ab00..dc4f5950daf 100644
--- a/include/asm-sh/flat.h
+++ b/include/asm-sh/flat.h
@@ -16,8 +16,9 @@
 #define	flat_argvp_envp_on_stack()		0
 #define	flat_old_ram_flag(flags)		(flags)
 #define	flat_reloc_valid(reloc, size)		((reloc) <= (size))
-#define	flat_get_addr_from_rp(rp, relval, flags)	get_unaligned(rp)
+#define	flat_get_addr_from_rp(rp, relval, flags, p)	get_unaligned(rp)
 #define	flat_put_addr_at_rp(rp, val, relval)	put_unaligned(val,rp)
 #define	flat_get_relocate_addr(rel)		(rel)
+#define	flat_set_persistent(relval, p)		0
 
 #endif /* __ASM_SH_FLAT_H */
diff --git a/include/asm-v850/flat.h b/include/asm-v850/flat.h
index 3888f59d688..17f0ea56661 100644
--- a/include/asm-v850/flat.h
+++ b/include/asm-v850/flat.h
@@ -25,6 +25,7 @@
 #define	flat_stack_align(sp)		/* nothing needed */
 #define	flat_argvp_envp_on_stack()	0
 #define	flat_old_ram_flag(flags)	(flags)
+#define	flat_set_persistent(relval, p)	0
 
 /* We store the type of relocation in the top 4 bits of the `relval.' */
 
@@ -46,7 +47,8 @@ flat_get_relocate_addr (unsigned long relval)
    For the v850, RP should always be half-word aligned.  */
 static inline unsigned long flat_get_addr_from_rp (unsigned long *rp,
 						   unsigned long relval,
-						   unsigned long flags)
+						   unsigned long flags,
+						   unsigned long *persistent)
 {
 	short *srp = (short *)rp;
 
-- 
cgit v1.2.3


From 576bb9ced2d274446639d7fbeee7125e24daf012 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Wed, 3 Oct 2007 23:43:57 +0800
Subject: binfmt_flat: checkpatch fixing minimum support for the blackfin
 relocations

Cc: Bernd Schmidt <bernd.schmidt@analog.com>
Cc: David McCullough <davidm@snapgear.com>
Cc: Greg Ungerer <gerg@snapgear.com>
Cc: Miles Bader <miles.bader@necel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Bryan Wu <bryan.wu@analog.com>
---
 fs/binfmt_flat.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 34e9b06a744..fcb3405bb14 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -760,7 +760,8 @@ static int load_flat_file(struct linux_binprm * bprm,
 			}
 
 			/* Get the pointer's value.  */
-			addr = flat_get_addr_from_rp(rp, relval, flags, &persistent);
+			addr = flat_get_addr_from_rp(rp, relval, flags,
+							&persistent);
 			if (addr != 0) {
 				/*
 				 * Do the relocation.  PIC relocs in the data section are
-- 
cgit v1.2.3


From f9b7cba1b8a74c10b0771ca28d4c554aeb9803fc Mon Sep 17 00:00:00 2001
From: Evgeniy Dushistov <dushistov@mail.ru>
Date: Mon, 24 Sep 2007 21:24:39 -0700
Subject: ufs: fix sun state

Different types of ufs hold state in different places, to hide complexity
of this, there is ufs_get_fs_state, it returns state according to
"UFS_SB(sb)->s_flags", but during mount ufs_get_fs_state is called, before
setting s_flags, this cause message for ufs types like sun ufs: "fs need
fsck", and remount in readonly state.

Signed-off-by: Evgeniy Dushistov <dushistov@mail.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ufs/super.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 73402c5eeb8..38eb0b7a1f3 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -894,7 +894,7 @@ magic_found:
 		goto again;
 	}
 
-
+	sbi->s_flags = flags;/*after that line some functions use s_flags*/
 	ufs_print_super_stuff(sb, usb1, usb2, usb3);
 
 	/*
@@ -1025,8 +1025,6 @@ magic_found:
 	    UFS_MOUNT_UFSTYPE_44BSD)
 		uspi->s_maxsymlinklen =
 		    fs32_to_cpu(sb, usb3->fs_un2.fs_44.fs_maxsymlinklen);
-	
-	sbi->s_flags = flags;
 
 	inode = iget(sb, UFS_ROOTINO);
 	if (!inode || is_bad_inode(inode))
-- 
cgit v1.2.3


From 255129d1e9ca0ed3d69d5517fae3e03d7ab4b806 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 25 Sep 2007 15:55:03 -0400
Subject: NLM: Fix a circular lock dependency in lockd

The problem is that the garbage collector for the 'host' structures
nlm_gc_hosts(), holds nlm_host_mutex while calling down to
nlmsvc_mark_resources, which, eventually takes the file->f_mutex.

We cannot therefore call nlmsvc_lookup_host() from within
nlmsvc_create_block, since the caller will already hold file->f_mutex, so
the attempt to grab nlm_host_mutex may deadlock.

Fix the problem by calling nlmsvc_lookup_host() outside the file->f_mutex.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/lockd/svclock.c | 29 ++++++++++++++++++-----------
 1 file changed, 18 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index a21e4bc5444..d098c7af0d2 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -171,19 +171,14 @@ found:
  * GRANTED_RES message by cookie, without having to rely on the client's IP
  * address. --okir
  */
-static inline struct nlm_block *
-nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_file *file,
-		struct nlm_lock *lock, struct nlm_cookie *cookie)
+static struct nlm_block *
+nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_host *host,
+		    struct nlm_file *file, struct nlm_lock *lock,
+		    struct nlm_cookie *cookie)
 {
 	struct nlm_block	*block;
-	struct nlm_host		*host;
 	struct nlm_rqst		*call = NULL;
 
-	/* Create host handle for callback */
-	host = nlmsvc_lookup_host(rqstp, lock->caller, lock->len);
-	if (host == NULL)
-		return NULL;
-
 	call = nlm_alloc_call(host);
 	if (call == NULL)
 		return NULL;
@@ -366,6 +361,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
 			struct nlm_lock *lock, int wait, struct nlm_cookie *cookie)
 {
 	struct nlm_block	*block = NULL;
+	struct nlm_host		*host;
 	int			error;
 	__be32			ret;
 
@@ -377,6 +373,10 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
 				(long long)lock->fl.fl_end,
 				wait);
 
+	/* Create host handle for callback */
+	host = nlmsvc_lookup_host(rqstp, lock->caller, lock->len);
+	if (host == NULL)
+		return nlm_lck_denied_nolocks;
 
 	/* Lock file against concurrent access */
 	mutex_lock(&file->f_mutex);
@@ -385,7 +385,8 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
 	 */
 	block = nlmsvc_lookup_block(file, lock);
 	if (block == NULL) {
-		block = nlmsvc_create_block(rqstp, file, lock, cookie);
+		block = nlmsvc_create_block(rqstp, nlm_get_host(host), file,
+				lock, cookie);
 		ret = nlm_lck_denied_nolocks;
 		if (block == NULL)
 			goto out;
@@ -449,6 +450,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
 out:
 	mutex_unlock(&file->f_mutex);
 	nlmsvc_release_block(block);
+	nlm_release_host(host);
 	dprintk("lockd: nlmsvc_lock returned %u\n", ret);
 	return ret;
 }
@@ -477,10 +479,15 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file,
 
 	if (block == NULL) {
 		struct file_lock *conf = kzalloc(sizeof(*conf), GFP_KERNEL);
+		struct nlm_host	*host;
 
 		if (conf == NULL)
 			return nlm_granted;
-		block = nlmsvc_create_block(rqstp, file, lock, cookie);
+		/* Create host handle for callback */
+		host = nlmsvc_lookup_host(rqstp, lock->caller, lock->len);
+		if (host == NULL)
+			return nlm_lck_denied_nolocks;
+		block = nlmsvc_create_block(rqstp, host, file, lock, cookie);
 		if (block == NULL) {
 			kfree(conf);
 			return nlm_granted;
-- 
cgit v1.2.3


From 54af3bb543c071769141387a42deaaab5074da55 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 28 Sep 2007 12:27:41 -0400
Subject: NFS: Fix an Oops in encode_lookup()

It doesn't look as if the NFS file name limit is being initialised correctly
in the struct nfs_server. Make sure that we limit whatever is being set in
nfs_probe_fsinfo() and nfs_init_server().

Also ensure that readdirplus and nfs4_path_walk respect our file name
limits.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/nfs/client.c  | 29 +++++++++++++++++++----------
 fs/nfs/dir.c     |  2 ++
 fs/nfs/getroot.c |  3 +++
 3 files changed, 24 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index a49f9feff77..a204484072f 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -588,16 +588,6 @@ static int nfs_init_server(struct nfs_server *server, const struct nfs_mount_dat
 	server->namelen  = data->namlen;
 	/* Create a client RPC handle for the NFSv3 ACL management interface */
 	nfs_init_server_aclclient(server);
-	if (clp->cl_nfsversion == 3) {
-		if (server->namelen == 0 || server->namelen > NFS3_MAXNAMLEN)
-			server->namelen = NFS3_MAXNAMLEN;
-		if (!(data->flags & NFS_MOUNT_NORDIRPLUS))
-			server->caps |= NFS_CAP_READDIRPLUS;
-	} else {
-		if (server->namelen == 0 || server->namelen > NFS2_MAXNAMLEN)
-			server->namelen = NFS2_MAXNAMLEN;
-	}
-
 	dprintk("<-- nfs_init_server() = 0 [new %p]\n", clp);
 	return 0;
 
@@ -794,6 +784,16 @@ struct nfs_server *nfs_create_server(const struct nfs_mount_data *data,
 	error = nfs_probe_fsinfo(server, mntfh, &fattr);
 	if (error < 0)
 		goto error;
+	if (server->nfs_client->rpc_ops->version == 3) {
+		if (server->namelen == 0 || server->namelen > NFS3_MAXNAMLEN)
+			server->namelen = NFS3_MAXNAMLEN;
+		if (!(data->flags & NFS_MOUNT_NORDIRPLUS))
+			server->caps |= NFS_CAP_READDIRPLUS;
+	} else {
+		if (server->namelen == 0 || server->namelen > NFS2_MAXNAMLEN)
+			server->namelen = NFS2_MAXNAMLEN;
+	}
+
 	if (!(fattr.valid & NFS_ATTR_FATTR)) {
 		error = server->nfs_client->rpc_ops->getattr(server, mntfh, &fattr);
 		if (error < 0) {
@@ -984,6 +984,9 @@ struct nfs_server *nfs4_create_server(const struct nfs4_mount_data *data,
 	if (error < 0)
 		goto error;
 
+	if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN)
+		server->namelen = NFS4_MAXNAMLEN;
+
 	BUG_ON(!server->nfs_client);
 	BUG_ON(!server->nfs_client->rpc_ops);
 	BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
@@ -1056,6 +1059,9 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
 	if (error < 0)
 		goto error;
 
+	if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN)
+		server->namelen = NFS4_MAXNAMLEN;
+
 	dprintk("Referral FSID: %llx:%llx\n",
 		(unsigned long long) server->fsid.major,
 		(unsigned long long) server->fsid.minor);
@@ -1115,6 +1121,9 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source,
 	if (error < 0)
 		goto out_free_server;
 
+	if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN)
+		server->namelen = NFS4_MAXNAMLEN;
+
 	dprintk("Cloned FSID: %llx:%llx\n",
 		(unsigned long long) server->fsid.major,
 		(unsigned long long) server->fsid.minor);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index ea97408e423..e4a04d16b8b 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1162,6 +1162,8 @@ static struct dentry *nfs_readdir_lookup(nfs_readdir_descriptor_t *desc)
 	}
 	if (!desc->plus || !(entry->fattr->valid & NFS_ATTR_FATTR))
 		return NULL;
+	if (name.len > NFS_SERVER(dir)->namelen)
+		return NULL;
 	/* Note: caller is already holding the dir->i_mutex! */
 	dentry = d_alloc(parent, &name);
 	if (dentry == NULL)
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index d1cbf0a0fbb..522e5ad4d8a 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -175,6 +175,9 @@ next_component:
 		path++;
 	name.len = path - (const char *) name.name;
 
+	if (name.len > NFS4_MAXNAMLEN)
+		return -ENAMETOOLONG;
+
 eat_dot_dir:
 	while (*path == '/')
 		path++;
-- 
cgit v1.2.3


From 564256c9e06d75e16d894a2cd30604bd6582cbba Mon Sep 17 00:00:00 2001
From: Tim Shimmin <tes@chook.melbourne.sgi.com>
Date: Mon, 1 Oct 2007 16:39:37 +1000
Subject: Revert "[XFS] Avoid replaying inode buffer initialisation log items
 if on-disk version is newer."

This reverts commit b394e43e995d08821588a22561c6a71a63b4ff27.

Lachlan McIlroy says:
    It tried to fix an issue where log replay is replaying an inode cluster
    initialisation transaction that should not be replayed because the inode
    cluster on disk is more up to date.  Since we don't log file sizes (we
    rely on inode flushing to get them to disk) then we can't just replay
    all the transations in the log and expect the inode to be completely
    restored.  We lose file size updates.  Unfortunately this fix is causing
    more (serious) problems than it is fixing.

SGI-PV: 969656
SGI-Modid: xfs-linux-melb:xfs-kern:29804a

Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
Signed-off-by: Tim Shimmin <tes@sgi.com>
---
 fs/xfs/xfs_buf_item.h    |  5 -----
 fs/xfs/xfs_log_recover.c | 51 +++---------------------------------------------
 fs/xfs/xfs_trans_buf.c   |  1 -
 3 files changed, 3 insertions(+), 54 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index fa25b7dcc6c..d7e13614306 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -52,11 +52,6 @@ typedef struct xfs_buf_log_format_t {
 #define	XFS_BLI_UDQUOT_BUF	0x4
 #define XFS_BLI_PDQUOT_BUF	0x8
 #define	XFS_BLI_GDQUOT_BUF	0x10
-/*
- * This flag indicates that the buffer contains newly allocated
- * inodes.
- */
-#define	XFS_BLI_INODE_NEW_BUF	0x20
 
 #define	XFS_BLI_CHUNK		128
 #define	XFS_BLI_SHIFT		7
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 7174991f4be..8ae6e8e5f3d 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1874,7 +1874,6 @@ xlog_recover_do_inode_buffer(
 /*ARGSUSED*/
 STATIC void
 xlog_recover_do_reg_buffer(
-	xfs_mount_t		*mp,
 	xlog_recover_item_t	*item,
 	xfs_buf_t		*bp,
 	xfs_buf_log_format_t	*buf_f)
@@ -1885,50 +1884,6 @@ xlog_recover_do_reg_buffer(
 	unsigned int		*data_map = NULL;
 	unsigned int		map_size = 0;
 	int                     error;
-	int			stale_buf = 1;
-
-	/*
-	 * Scan through the on-disk inode buffer and attempt to
-	 * determine if it has been written to since it was logged.
-	 *
-	 * - If any of the magic numbers are incorrect then the buffer is stale
-	 * - If any of the modes are non-zero then the buffer is not stale
-	 * - If all of the modes are zero and at least one of the generation
-	 *   counts is non-zero then the buffer is stale
-	 *
-	 * If the end result is a stale buffer then the log buffer is replayed
-	 * otherwise it is skipped.
-	 *
-	 * This heuristic is not perfect.  It can be improved by scanning the
-	 * entire inode chunk for evidence that any of the inode clusters have
-	 * been updated.  To fix this problem completely we will need a major
-	 * architectural change to the logging system.
-	 */
-	if (buf_f->blf_flags & XFS_BLI_INODE_NEW_BUF) {
-		xfs_dinode_t    *dip;
-		int             inodes_per_buf;
-		int		mode_count = 0;
-		int		gen_count = 0;
-
-		stale_buf = 0;
-		inodes_per_buf = XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog;
-		for (i = 0; i < inodes_per_buf; i++) {
-			dip = (xfs_dinode_t *)xfs_buf_offset(bp,
-				i * mp->m_sb.sb_inodesize);
-			if (be16_to_cpu(dip->di_core.di_magic) !=
-					XFS_DINODE_MAGIC) {
-				stale_buf = 1;
-				break;
-			}
-			if (dip->di_core.di_mode)
-				mode_count++;
-			if (dip->di_core.di_gen)
-				gen_count++;
-		}
-
-		if (!mode_count && gen_count)
-			stale_buf = 1;
-	}
 
 	switch (buf_f->blf_type) {
 	case XFS_LI_BUF:
@@ -1962,7 +1917,7 @@ xlog_recover_do_reg_buffer(
 					       -1, 0, XFS_QMOPT_DOWARN,
 					       "dquot_buf_recover");
 		}
-		if (!error && stale_buf)
+		if (!error)
 			memcpy(xfs_buf_offset(bp,
 				(uint)bit << XFS_BLI_SHIFT),	/* dest */
 				item->ri_buf[i].i_addr,		/* source */
@@ -2134,7 +2089,7 @@ xlog_recover_do_dquot_buffer(
 	if (log->l_quotaoffs_flag & type)
 		return;
 
-	xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
+	xlog_recover_do_reg_buffer(item, bp, buf_f);
 }
 
 /*
@@ -2235,7 +2190,7 @@ xlog_recover_do_buffer_trans(
 		  (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) {
 		xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
 	} else {
-		xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
+		xlog_recover_do_reg_buffer(item, bp, buf_f);
 	}
 	if (error)
 		return XFS_ERROR(error);
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 95fff6872a2..60b6b898022 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -966,7 +966,6 @@ xfs_trans_inode_alloc_buf(
 	ASSERT(atomic_read(&bip->bli_refcount) > 0);
 
 	bip->bli_flags |= XFS_BLI_INODE_ALLOC_BUF;
-	bip->bli_format.blf_flags |= XFS_BLI_INODE_NEW_BUF;
 }
 
 
-- 
cgit v1.2.3


From 75723957673bfa10c98b735259f891cc79cf0450 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@woody.linux-foundation.org>
Date: Mon, 1 Oct 2007 13:17:28 -0700
Subject: Fix possible splice() mmap_sem deadlock

Nick Piggin points out that splice isn't being good about the mmap
semaphore: while two readers can nest inside each others, it does leave
a possible deadlock if a writer (ie a new mmap()) comes in during that
nesting.

Original "just move the locking" patch by Nick, replaced by one by me
based on an optimistic pagefault_disable().  And then Jens tested and
updated that patch.

Reported-by: Nick Piggin <npiggin@suse.de>
Tested-by: Jens Axboe <jens.axboe@oracle.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/splice.c | 46 ++++++++++++++++++++++++++++++++++------------
 1 file changed, 34 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/splice.c b/fs/splice.c
index c010a72ca2d..e95a3622886 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1223,6 +1223,33 @@ static long do_splice(struct file *in, loff_t __user *off_in,
 	return -EINVAL;
 }
 
+/*
+ * Do a copy-from-user while holding the mmap_semaphore for reading, in a
+ * manner safe from deadlocking with simultaneous mmap() (grabbing mmap_sem
+ * for writing) and page faulting on the user memory pointed to by src.
+ * This assumes that we will very rarely hit the partial != 0 path, or this
+ * will not be a win.
+ */
+static int copy_from_user_mmap_sem(void *dst, const void __user *src, size_t n)
+{
+	int partial;
+
+	pagefault_disable();
+	partial = __copy_from_user_inatomic(dst, src, n);
+	pagefault_enable();
+
+	/*
+	 * Didn't copy everything, drop the mmap_sem and do a faulting copy
+	 */
+	if (unlikely(partial)) {
+		up_read(&current->mm->mmap_sem);
+		partial = copy_from_user(dst, src, n);
+		down_read(&current->mm->mmap_sem);
+	}
+
+	return partial;
+}
+
 /*
  * Map an iov into an array of pages and offset/length tupples. With the
  * partial_page structure, we can map several non-contiguous ranges into
@@ -1236,31 +1263,26 @@ static int get_iovec_page_array(const struct iovec __user *iov,
 {
 	int buffers = 0, error = 0;
 
-	/*
-	 * It's ok to take the mmap_sem for reading, even
-	 * across a "get_user()".
-	 */
 	down_read(&current->mm->mmap_sem);
 
 	while (nr_vecs) {
 		unsigned long off, npages;
+		struct iovec entry;
 		void __user *base;
 		size_t len;
 		int i;
 
-		/*
-		 * Get user address base and length for this iovec.
-		 */
-		error = get_user(base, &iov->iov_base);
-		if (unlikely(error))
-			break;
-		error = get_user(len, &iov->iov_len);
-		if (unlikely(error))
+		error = -EFAULT;
+		if (copy_from_user_mmap_sem(&entry, iov, sizeof(entry)))
 			break;
 
+		base = entry.iov_base;
+		len = entry.iov_len;
+
 		/*
 		 * Sanity check this iovec. 0 read succeeds.
 		 */
+		error = 0;
 		if (unlikely(!len))
 			break;
 		error = -EFAULT;
-- 
cgit v1.2.3


From bda0233b89c10ae46ccecb78bffdaf0fd7833d17 Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.mushran@oracle.com>
Date: Fri, 21 Sep 2007 11:41:43 -0700
Subject: ocfs2: Unlock mutex in local alloc failure case

The fs was not unlocking the local alloc inode mutex in the code path in
which it failed to find a window of free bits in the global bitmap.

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/localalloc.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index de984d27257..d272847d5a0 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -514,8 +514,10 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
 	ac->ac_bh = osb->local_alloc_bh;
 	status = 0;
 bail:
-	if (status < 0 && local_alloc_inode)
+	if (status < 0 && local_alloc_inode) {
+		mutex_unlock(&local_alloc_inode->i_mutex);
 		iput(local_alloc_inode);
+	}
 
 	mlog_exit(status);
 	return status;
-- 
cgit v1.2.3


From 87e2831c3fa39cbf6f7ab676bb5aef039b9659e2 Mon Sep 17 00:00:00 2001
From: Yan Zheng <yanzheng@21cn.com>
Date: Mon, 8 Oct 2007 12:16:20 -0700
Subject: AIO: fix cleanup in io_submit_one(...)

When IOCB_FLAG_RESFD flag is set and iocb->aio_resfd is incorrect,
statement 'goto out_put_req' is executed. At label 'out_put_req',
aio_put_req(..) is called, which requires 'req->ki_filp' set.

Signed-off-by: Yan Zheng<yanzheng@21cn.com>
Cc: Zach Brown <zach.brown@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/aio.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/aio.c b/fs/aio.c
index dbe699e9828..ea2e1982038 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1562,6 +1562,7 @@ int fastcall io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 		fput(file);
 		return -EAGAIN;
 	}
+	req->ki_filp = file;
 	if (iocb->aio_flags & IOCB_FLAG_RESFD) {
 		/*
 		 * If the IOCB_FLAG_RESFD flag of aio_flags is set, get an
@@ -1576,7 +1577,6 @@ int fastcall io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 		}
 	}
 
-	req->ki_filp = file;
 	ret = put_user(req->ki_key, &user_iocb->aio_key);
 	if (unlikely(ret)) {
 		dprintk("EFAULT: aio_key\n");
-- 
cgit v1.2.3


From a6d85430424d44e946e0946bfaad607115510989 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 9 Oct 2007 11:04:57 -0400
Subject: NLM: Fix a memory leak in nlmsvc_testlock

The recent fix for a circular lock dependency unfortunately introduced a
potential memory leak in the event where the call to nlmsvc_lookup_host
fails for some reason.

Thanks to Roel Kluin for spotting this.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/lockd/svclock.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index d098c7af0d2..d120ec39bcb 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -485,8 +485,10 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file,
 			return nlm_granted;
 		/* Create host handle for callback */
 		host = nlmsvc_lookup_host(rqstp, lock->caller, lock->len);
-		if (host == NULL)
+		if (host == NULL) {
+			kfree(conf);
 			return nlm_lck_denied_nolocks;
+		}
 		block = nlmsvc_create_block(rqstp, host, file, lock, cookie);
 		if (block == NULL) {
 			kfree(conf);
-- 
cgit v1.2.3