From dc61b1d65e353d638b2445f71fb8e5b5630f2415 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 8 Jun 2010 11:40:42 +0200
Subject: sched: Fix PROVE_RCU vs cpu_cgroup

PROVE_RCU has a few issues with the cpu_cgroup because the scheduler
typically holds rq->lock around the css rcu derefs but the generic
cgroup code doesn't (and can't) know about that lock.

Provide means to add extra checks to the css dereference and use that
in the scheduler to annotate its users.

The addition of rq->lock to these checks is correct because the
cgroup_subsys::attach() method takes the rq->lock for each task it
moves, therefore by holding that lock, we ensure the task is pinned to
the current cgroup and the RCU derefence is valid.

That leaves one genuine race in __sched_setscheduler() where we used
task_group() without holding any of the required locks and thus raced
with the cgroup code. Solve this by moving the check under the
appropriate lock.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 115 +++++++++++++++++++++++++++++----------------------------
 1 file changed, 59 insertions(+), 56 deletions(-)

(limited to 'kernel')
diff --git a/kernel/sched.c b/kernel/sched.c
index f8b8996228d..2aaceebd484 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -306,52 +306,6 @@ static int init_task_group_load = INIT_TASK_GROUP_LOAD;
  */
 struct task_group init_task_group;
 
-/* return group to which a task belongs */
-static inline struct task_group *task_group(struct task_struct *p)
-{
-	struct task_group *tg;
-
-#ifdef CONFIG_CGROUP_SCHED
-	tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
-				struct task_group, css);
-#else
-	tg = &init_task_group;
-#endif
-	return tg;
-}
-
-/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
-static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
-{
-	/*
-	 * Strictly speaking this rcu_read_lock() is not needed since the
-	 * task_group is tied to the cgroup, which in turn can never go away
-	 * as long as there are tasks attached to it.
-	 *
-	 * However since task_group() uses task_subsys_state() which is an
-	 * rcu_dereference() user, this quiets CONFIG_PROVE_RCU.
-	 */
-	rcu_read_lock();
-#ifdef CONFIG_FAIR_GROUP_SCHED
-	p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
-	p->se.parent = task_group(p)->se[cpu];
-#endif
-
-#ifdef CONFIG_RT_GROUP_SCHED
-	p->rt.rt_rq  = task_group(p)->rt_rq[cpu];
-	p->rt.parent = task_group(p)->rt_se[cpu];
-#endif
-	rcu_read_unlock();
-}
-
-#else
-
-static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
-static inline struct task_group *task_group(struct task_struct *p)
-{
-	return NULL;
-}
-
 #endif	/* CONFIG_CGROUP_SCHED */
 
 /* CFS-related fields in a runqueue */
@@ -644,6 +598,49 @@ static inline int cpu_of(struct rq *rq)
 #define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
 #define raw_rq()		(&__raw_get_cpu_var(runqueues))
 
+#ifdef CONFIG_CGROUP_SCHED
+
+/*
+ * Return the group to which this tasks belongs.
+ *
+ * We use task_subsys_state_check() and extend the RCU verification
+ * with lockdep_is_held(&task_rq(p)->lock) because cpu_cgroup_attach()
+ * holds that lock for each task it moves into the cgroup. Therefore
+ * by holding that lock, we pin the task to the current cgroup.
+ */
+static inline struct task_group *task_group(struct task_struct *p)
+{
+	struct cgroup_subsys_state *css;
+
+	css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
+			lockdep_is_held(&task_rq(p)->lock));
+	return container_of(css, struct task_group, css);
+}
+
+/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
+static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
+{
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
+	p->se.parent = task_group(p)->se[cpu];
+#endif
+
+#ifdef CONFIG_RT_GROUP_SCHED
+	p->rt.rt_rq  = task_group(p)->rt_rq[cpu];
+	p->rt.parent = task_group(p)->rt_se[cpu];
+#endif
+}
+
+#else /* CONFIG_CGROUP_SCHED */
+
+static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
+static inline struct task_group *task_group(struct task_struct *p)
+{
+	return NULL;
+}
+
+#endif /* CONFIG_CGROUP_SCHED */
+
 inline void update_rq_clock(struct rq *rq)
 {
 	if (!rq->skip_clock_update)
@@ -4465,16 +4462,6 @@ recheck:
 	}
 
 	if (user) {
-#ifdef CONFIG_RT_GROUP_SCHED
-		/*
-		 * Do not allow realtime tasks into groups that have no runtime
-		 * assigned.
-		 */
-		if (rt_bandwidth_enabled() && rt_policy(policy) &&
-				task_group(p)->rt_bandwidth.rt_runtime == 0)
-			return -EPERM;
-#endif
-
 		retval = security_task_setscheduler(p, policy, param);
 		if (retval)
 			return retval;
@@ -4490,6 +4477,22 @@ recheck:
 	 * runqueue lock must be held.
 	 */
 	rq = __task_rq_lock(p);
+
+#ifdef CONFIG_RT_GROUP_SCHED
+	if (user) {
+		/*
+		 * Do not allow realtime tasks into groups that have no runtime
+		 * assigned.
+		 */
+		if (rt_bandwidth_enabled() && rt_policy(policy) &&
+				task_group(p)->rt_bandwidth.rt_runtime == 0) {
+			__task_rq_unlock(rq);
+			raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+			return -EPERM;
+		}
+	}
+#endif
+
 	/* recheck policy now with rq lock held */
 	if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
 		policy = oldpolicy = -1;
-- 
cgit v1.2.3


From 4673247562e39a17e09440fa1400819522ccd446 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 7 Jun 2010 17:53:51 +0200
Subject: genirq: Deal with desc->set_type() changing desc->chip

The set_type() function can change the chip implementation when the
trigger mode changes. That might result in using an non-initialized
irq chip when called from __setup_irq() or when called via
set_irq_type() on an already enabled irq.

The set_irq_type() function should not be called on an enabled irq,
but because we forgot to put a check into it, we have a bunch of users
which grew the habit of doing that and it never blew up as the
function is serialized via desc->lock against all users of desc->chip
and they never hit the non-initialized irq chip issue.

The easy fix for the __setup_irq() issue would be to move the
irq_chip_set_defaults(desc->chip) call after the trigger setting to
make sure that a chip change is covered.

But as we have already users, which do the type setting after
request_irq(), the safe fix for now is to call irq_chip_set_defaults()
from __irq_set_trigger() when desc->set_type() changed the irq chip.

It needs a deeper analysis whether we should refuse to change the chip
on an already enabled irq, but that'd be a large scale change to fix
all the existing users. So that's neither stable nor 2.6.35 material.

Reported-by: Esben Haabendal <eha@doredevelopment.dk>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: linuxppc-dev <linuxppc-dev@ozlabs.org>
Cc: stable@kernel.org
---
 kernel/irq/manage.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 3164ba7ce15..e1497481fe8 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -456,6 +456,9 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
 		/* note that IRQF_TRIGGER_MASK == IRQ_TYPE_SENSE_MASK */
 		desc->status &= ~(IRQ_LEVEL | IRQ_TYPE_SENSE_MASK);
 		desc->status |= flags;
+
+		if (chip != desc->chip)
+			irq_chip_set_defaults(desc->chip);
 	}
 
 	return ret;
-- 
cgit v1.2.3


From dd4c4f17d722ffeb2515bf781400675a30fcead7 Mon Sep 17 00:00:00 2001
From: Matthew Garrett <mjg@redhat.com>
Date: Fri, 28 May 2010 16:32:14 -0400
Subject: suspend: Move NVS save/restore code to generic suspend functionality

Saving platform non-volatile state may be required for suspend to RAM as
well as hibernation. Move it to more generic code.

Signed-off-by: Matthew Garrett <mjg@redhat.com>
Acked-by: Rafael J. Wysocki <rjw@sisk.pl>
Tested-by: Maxim Levitsky <maximlevitsky@gmail.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 arch/x86/kernel/e820.c       |   2 +-
 drivers/acpi/sleep.c         |  12 ++--
 include/linux/suspend.h      |  26 ++++-----
 kernel/power/Kconfig         |   9 +--
 kernel/power/Makefile        |   2 +-
 kernel/power/hibernate_nvs.c | 136 -------------------------------------------
 kernel/power/nvs.c           | 136 +++++++++++++++++++++++++++++++++++++++++++
 kernel/power/suspend.c       |   6 ++
 8 files changed, 168 insertions(+), 161 deletions(-)
 delete mode 100644 kernel/power/hibernate_nvs.c
 create mode 100644 kernel/power/nvs.c

(limited to 'kernel')

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 7bca3c6a02f..0d6fc71bedb 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -729,7 +729,7 @@ static int __init e820_mark_nvs_memory(void)
 		struct e820entry *ei = &e820.map[i];
 
 		if (ei->type == E820_NVS)
-			hibernate_nvs_register(ei->addr, ei->size);
+			suspend_nvs_register(ei->addr, ei->size);
 	}
 
 	return 0;
diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c
index 4ab2275b446..bcaa6efa813 100644
--- a/drivers/acpi/sleep.c
+++ b/drivers/acpi/sleep.c
@@ -393,7 +393,7 @@ static int acpi_hibernation_begin(void)
 {
 	int error;
 
-	error = s4_no_nvs ? 0 : hibernate_nvs_alloc();
+	error = s4_no_nvs ? 0 : suspend_nvs_alloc();
 	if (!error) {
 		acpi_target_sleep_state = ACPI_STATE_S4;
 		acpi_sleep_tts_switch(acpi_target_sleep_state);
@@ -407,7 +407,7 @@ static int acpi_hibernation_pre_snapshot(void)
 	int error = acpi_pm_prepare();
 
 	if (!error)
-		hibernate_nvs_save();
+		suspend_nvs_save();
 
 	return error;
 }
@@ -432,7 +432,7 @@ static int acpi_hibernation_enter(void)
 
 static void acpi_hibernation_finish(void)
 {
-	hibernate_nvs_free();
+	suspend_nvs_free();
 	acpi_pm_finish();
 }
 
@@ -452,7 +452,7 @@ static void acpi_hibernation_leave(void)
 		panic("ACPI S4 hardware signature mismatch");
 	}
 	/* Restore the NVS memory area */
-	hibernate_nvs_restore();
+	suspend_nvs_restore();
 }
 
 static int acpi_pm_pre_restore(void)
@@ -501,7 +501,7 @@ static int acpi_hibernation_begin_old(void)
 
 	if (!error) {
 		if (!s4_no_nvs)
-			error = hibernate_nvs_alloc();
+			error = suspend_nvs_alloc();
 		if (!error)
 			acpi_target_sleep_state = ACPI_STATE_S4;
 	}
@@ -513,7 +513,7 @@ static int acpi_hibernation_pre_snapshot_old(void)
 	int error = acpi_pm_disable_gpes();
 
 	if (!error)
-		hibernate_nvs_save();
+		suspend_nvs_save();
 
 	return error;
 }
diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index 5e781d824e6..bc7d6bb4cd8 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -256,22 +256,22 @@ static inline int hibernate(void) { return -ENOSYS; }
 static inline bool system_entering_hibernation(void) { return false; }
 #endif /* CONFIG_HIBERNATION */
 
-#ifdef CONFIG_HIBERNATION_NVS
-extern int hibernate_nvs_register(unsigned long start, unsigned long size);
-extern int hibernate_nvs_alloc(void);
-extern void hibernate_nvs_free(void);
-extern void hibernate_nvs_save(void);
-extern void hibernate_nvs_restore(void);
-#else /* CONFIG_HIBERNATION_NVS */
-static inline int hibernate_nvs_register(unsigned long a, unsigned long b)
+#ifdef CONFIG_SUSPEND_NVS
+extern int suspend_nvs_register(unsigned long start, unsigned long size);
+extern int suspend_nvs_alloc(void);
+extern void suspend_nvs_free(void);
+extern void suspend_nvs_save(void);
+extern void suspend_nvs_restore(void);
+#else /* CONFIG_SUSPEND_NVS */
+static inline int suspend_nvs_register(unsigned long a, unsigned long b)
 {
 	return 0;
 }
-static inline int hibernate_nvs_alloc(void) { return 0; }
-static inline void hibernate_nvs_free(void) {}
-static inline void hibernate_nvs_save(void) {}
-static inline void hibernate_nvs_restore(void) {}
-#endif /* CONFIG_HIBERNATION_NVS */
+static inline int suspend_nvs_alloc(void) { return 0; }
+static inline void suspend_nvs_free(void) {}
+static inline void suspend_nvs_save(void) {}
+static inline void suspend_nvs_restore(void) {}
+#endif /* CONFIG_SUSPEND_NVS */
 
 #ifdef CONFIG_PM_SLEEP
 void save_processor_state(void);
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 5c36ea9d55d..ca6066a6952 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -99,9 +99,13 @@ config PM_SLEEP_ADVANCED_DEBUG
 	depends on PM_ADVANCED_DEBUG
 	default n
 
+config SUSPEND_NVS
+       bool
+
 config SUSPEND
 	bool "Suspend to RAM and standby"
 	depends on PM && ARCH_SUSPEND_POSSIBLE
+	select SUSPEND_NVS if HAS_IOMEM
 	default y
 	---help---
 	  Allow the system to enter sleep states in which main memory is
@@ -130,13 +134,10 @@ config SUSPEND_FREEZER
 
 	  Turning OFF this setting is NOT recommended! If in doubt, say Y.
 
-config HIBERNATION_NVS
-	bool
-
 config HIBERNATION
 	bool "Hibernation (aka 'suspend to disk')"
 	depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE
-	select HIBERNATION_NVS if HAS_IOMEM
+	select SUSPEND_NVS if HAS_IOMEM
 	---help---
 	  Enable the suspend to disk (STD) functionality, which is usually
 	  called "hibernation" in user interfaces.  STD checkpoints the
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 524e058dcf0..f9063c6b185 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -10,6 +10,6 @@ obj-$(CONFIG_SUSPEND)		+= suspend.o
 obj-$(CONFIG_PM_TEST_SUSPEND)	+= suspend_test.o
 obj-$(CONFIG_HIBERNATION)	+= hibernate.o snapshot.o swap.o user.o \
 				   block_io.o
-obj-$(CONFIG_HIBERNATION_NVS)	+= hibernate_nvs.o
+obj-$(CONFIG_SUSPEND_NVS)	+= nvs.o
 
 obj-$(CONFIG_MAGIC_SYSRQ)	+= poweroff.o
diff --git a/kernel/power/hibernate_nvs.c b/kernel/power/hibernate_nvs.c
deleted file mode 100644
index fdcad9ed5a7..00000000000
--- a/kernel/power/hibernate_nvs.c
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- * linux/kernel/power/hibernate_nvs.c - Routines for handling NVS memory
- *
- * Copyright (C) 2008,2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
- *
- * This file is released under the GPLv2.
- */
-
-#include <linux/io.h>
-#include <linux/kernel.h>
-#include <linux/list.h>
-#include <linux/mm.h>
-#include <linux/slab.h>
-#include <linux/suspend.h>
-
-/*
- * Platforms, like ACPI, may want us to save some memory used by them during
- * hibernation and to restore the contents of this memory during the subsequent
- * resume.  The code below implements a mechanism allowing us to do that.
- */
-
-struct nvs_page {
-	unsigned long phys_start;
-	unsigned int size;
-	void *kaddr;
-	void *data;
-	struct list_head node;
-};
-
-static LIST_HEAD(nvs_list);
-
-/**
- *	hibernate_nvs_register - register platform NVS memory region to save
- *	@start - physical address of the region
- *	@size - size of the region
- *
- *	The NVS region need not be page-aligned (both ends) and we arrange
- *	things so that the data from page-aligned addresses in this region will
- *	be copied into separate RAM pages.
- */
-int hibernate_nvs_register(unsigned long start, unsigned long size)
-{
-	struct nvs_page *entry, *next;
-
-	while (size > 0) {
-		unsigned int nr_bytes;
-
-		entry = kzalloc(sizeof(struct nvs_page), GFP_KERNEL);
-		if (!entry)
-			goto Error;
-
-		list_add_tail(&entry->node, &nvs_list);
-		entry->phys_start = start;
-		nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK);
-		entry->size = (size < nr_bytes) ? size : nr_bytes;
-
-		start += entry->size;
-		size -= entry->size;
-	}
-	return 0;
-
- Error:
-	list_for_each_entry_safe(entry, next, &nvs_list, node) {
-		list_del(&entry->node);
-		kfree(entry);
-	}
-	return -ENOMEM;
-}
-
-/**
- *	hibernate_nvs_free - free data pages allocated for saving NVS regions
- */
-void hibernate_nvs_free(void)
-{
-	struct nvs_page *entry;
-
-	list_for_each_entry(entry, &nvs_list, node)
-		if (entry->data) {
-			free_page((unsigned long)entry->data);
-			entry->data = NULL;
-			if (entry->kaddr) {
-				iounmap(entry->kaddr);
-				entry->kaddr = NULL;
-			}
-		}
-}
-
-/**
- *	hibernate_nvs_alloc - allocate memory necessary for saving NVS regions
- */
-int hibernate_nvs_alloc(void)
-{
-	struct nvs_page *entry;
-
-	list_for_each_entry(entry, &nvs_list, node) {
-		entry->data = (void *)__get_free_page(GFP_KERNEL);
-		if (!entry->data) {
-			hibernate_nvs_free();
-			return -ENOMEM;
-		}
-	}
-	return 0;
-}
-
-/**
- *	hibernate_nvs_save - save NVS memory regions
- */
-void hibernate_nvs_save(void)
-{
-	struct nvs_page *entry;
-
-	printk(KERN_INFO "PM: Saving platform NVS memory\n");
-
-	list_for_each_entry(entry, &nvs_list, node)
-		if (entry->data) {
-			entry->kaddr = ioremap(entry->phys_start, entry->size);
-			memcpy(entry->data, entry->kaddr, entry->size);
-		}
-}
-
-/**
- *	hibernate_nvs_restore - restore NVS memory regions
- *
- *	This function is going to be called with interrupts disabled, so it
- *	cannot iounmap the virtual addresses used to access the NVS region.
- */
-void hibernate_nvs_restore(void)
-{
-	struct nvs_page *entry;
-
-	printk(KERN_INFO "PM: Restoring platform NVS memory\n");
-
-	list_for_each_entry(entry, &nvs_list, node)
-		if (entry->data)
-			memcpy(entry->kaddr, entry->data, entry->size);
-}
diff --git a/kernel/power/nvs.c b/kernel/power/nvs.c
new file mode 100644
index 00000000000..1836db60bbb
--- /dev/null
+++ b/kernel/power/nvs.c
@@ -0,0 +1,136 @@
+/*
+ * linux/kernel/power/hibernate_nvs.c - Routines for handling NVS memory
+ *
+ * Copyright (C) 2008,2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
+ *
+ * This file is released under the GPLv2.
+ */
+
+#include <linux/io.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/suspend.h>
+
+/*
+ * Platforms, like ACPI, may want us to save some memory used by them during
+ * suspend and to restore the contents of this memory during the subsequent
+ * resume.  The code below implements a mechanism allowing us to do that.
+ */
+
+struct nvs_page {
+	unsigned long phys_start;
+	unsigned int size;
+	void *kaddr;
+	void *data;
+	struct list_head node;
+};
+
+static LIST_HEAD(nvs_list);
+
+/**
+ *	suspend_nvs_register - register platform NVS memory region to save
+ *	@start - physical address of the region
+ *	@size - size of the region
+ *
+ *	The NVS region need not be page-aligned (both ends) and we arrange
+ *	things so that the data from page-aligned addresses in this region will
+ *	be copied into separate RAM pages.
+ */
+int suspend_nvs_register(unsigned long start, unsigned long size)
+{
+	struct nvs_page *entry, *next;
+
+	while (size > 0) {
+		unsigned int nr_bytes;
+
+		entry = kzalloc(sizeof(struct nvs_page), GFP_KERNEL);
+		if (!entry)
+			goto Error;
+
+		list_add_tail(&entry->node, &nvs_list);
+		entry->phys_start = start;
+		nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK);
+		entry->size = (size < nr_bytes) ? size : nr_bytes;
+
+		start += entry->size;
+		size -= entry->size;
+	}
+	return 0;
+
+ Error:
+	list_for_each_entry_safe(entry, next, &nvs_list, node) {
+		list_del(&entry->node);
+		kfree(entry);
+	}
+	return -ENOMEM;
+}
+
+/**
+ *	suspend_nvs_free - free data pages allocated for saving NVS regions
+ */
+void suspend_nvs_free(void)
+{
+	struct nvs_page *entry;
+
+	list_for_each_entry(entry, &nvs_list, node)
+		if (entry->data) {
+			free_page((unsigned long)entry->data);
+			entry->data = NULL;
+			if (entry->kaddr) {
+				iounmap(entry->kaddr);
+				entry->kaddr = NULL;
+			}
+		}
+}
+
+/**
+ *	suspend_nvs_alloc - allocate memory necessary for saving NVS regions
+ */
+int suspend_nvs_alloc(void)
+{
+	struct nvs_page *entry;
+
+	list_for_each_entry(entry, &nvs_list, node) {
+		entry->data = (void *)__get_free_page(GFP_KERNEL);
+		if (!entry->data) {
+			suspend_nvs_free();
+			return -ENOMEM;
+		}
+	}
+	return 0;
+}
+
+/**
+ *	suspend_nvs_save - save NVS memory regions
+ */
+void suspend_nvs_save(void)
+{
+	struct nvs_page *entry;
+
+	printk(KERN_INFO "PM: Saving platform NVS memory\n");
+
+	list_for_each_entry(entry, &nvs_list, node)
+		if (entry->data) {
+			entry->kaddr = ioremap(entry->phys_start, entry->size);
+			memcpy(entry->data, entry->kaddr, entry->size);
+		}
+}
+
+/**
+ *	suspend_nvs_restore - restore NVS memory regions
+ *
+ *	This function is going to be called with interrupts disabled, so it
+ *	cannot iounmap the virtual addresses used to access the NVS region.
+ */
+void suspend_nvs_restore(void)
+{
+	struct nvs_page *entry;
+
+	printk(KERN_INFO "PM: Restoring platform NVS memory\n");
+
+	list_for_each_entry(entry, &nvs_list, node)
+		if (entry->data)
+			memcpy(entry->kaddr, entry->data, entry->size);
+}
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 56e7dbb8b99..f37cb7dd440 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -16,6 +16,12 @@
 #include <linux/cpu.h>
 #include <linux/syscalls.h>
 #include <linux/gfp.h>
+#include <linux/io.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/suspend.h>
 
 #include "power.h"
 
-- 
cgit v1.2.3


From a8fb2608053547bc3152ea61a5ec7cdfce5d942c Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 10 Jun 2010 14:53:16 -0400
Subject: perf/tracing: Fix regression of perf losing kprobe events

With the addition of the code to shrink the kernel tracepoint
infrastructure, we lost kprobes being traced by perf. The reason
is that I tested if the "tp_event->class->perf_probe" existed before
enabling it. This prevents "ftrace only" events (like the function
trace events) from being enabled by perf.

Unfortunately, kprobe events do not use perf_probe. This causes
kprobes to be missed by perf. To fix this, we add the test to
see if "tp_event->class->reg" exists as well as perf_probe.

Normal trace events have only "perf_probe" but no "reg" function,
and kprobes and syscalls have the "reg" but no "perf_probe".
The ftrace unique events do not have either, so this is a valid
test. If a kprobe or syscall is not to be probed by perf, the
"reg" function is called anyway, and will return a failure and
prevent perf from probing it.

Reported-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Tested-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_event_perf.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index e6f65887842..8a2b73f7c06 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -96,7 +96,9 @@ int perf_trace_init(struct perf_event *p_event)
 	mutex_lock(&event_mutex);
 	list_for_each_entry(tp_event, &ftrace_events, list) {
 		if (tp_event->event.type == event_id &&
-		    tp_event->class && tp_event->class->perf_probe &&
+		    tp_event->class &&
+		    (tp_event->class->perf_probe ||
+		     tp_event->class->reg) &&
 		    try_module_get(tp_event->mod)) {
 			ret = perf_trace_event_init(tp_event, p_event);
 			break;
-- 
cgit v1.2.3


From 3310d4d38fbc514e7b18bd3b1eea8effdd63b5aa Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 17 Jun 2010 18:02:37 +0200
Subject: nohz: Fix nohz ratelimit

Chris Wedgwood reports that 39c0cbe (sched: Rate-limit nohz) causes a
serial console regression, unresponsiveness, and indeed it does. The
reason is that the nohz code is skipped even when the tick was already
stopped before the nohz_ratelimit(cpu) condition changed.

Move the nohz_ratelimit() check to the other conditions which prevent
long idle sleeps.

Reported-by: Chris Wedgwood <cw@f00f.org>
Tested-by: Brian Bloniarz <bmb@athenacr.com>
Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Jiri Kosina <jkosina@suse.cz>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Greg KH <gregkh@suse.de>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Cc: Jef Driesen <jefdriesen@telenet.be>
LKML-Reference: <1276790557.27822.516.camel@twins>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/tick-sched.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 1d7b9bc1c03..783fbadf220 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -315,9 +315,6 @@ void tick_nohz_stop_sched_tick(int inidle)
 		goto end;
 	}
 
-	if (nohz_ratelimit(cpu))
-		goto end;
-
 	ts->idle_calls++;
 	/* Read jiffies and the time when jiffies were updated last */
 	do {
@@ -328,7 +325,7 @@ void tick_nohz_stop_sched_tick(int inidle)
 	} while (read_seqretry(&xtime_lock, seq));
 
 	if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) ||
-	    arch_needs_cpu(cpu)) {
+	    arch_needs_cpu(cpu) || nohz_ratelimit(cpu)) {
 		next_jiffies = last_jiffies + 1;
 		delta_jiffies = 1;
 	} else {
-- 
cgit v1.2.3


From 3c93717cfa51316e4dbb471e7c0f9d243359d5f8 Mon Sep 17 00:00:00 2001
From: "Alex,Shi" <alex.shi@intel.com>
Date: Thu, 17 Jun 2010 14:08:13 +0800
Subject: sched: Fix over-scheduling bug

Commit e70971591 ("sched: Optimize unused cgroup configuration") introduced
an imbalanced scheduling bug.

If we do not use CGROUP, function update_h_load won't update h_load. When the
system has a large number of tasks far more than logical CPU number, the
incorrect cfs_rq[cpu]->h_load value will cause load_balance() to pull too
many tasks to the local CPU from the busiest CPU. So the busiest CPU keeps
going in a round robin. That will hurt performance.

The issue was found originally by a scientific calculation workload that
developed by Yanmin. With that commit, the workload performance drops
about 40%.

 CPU  before    after

 00   : 2       : 7
 01   : 1       : 7
 02   : 11      : 6
 03   : 12      : 7
 04   : 6       : 6
 05   : 11      : 7
 06   : 10      : 6
 07   : 12      : 7
 08   : 11      : 6
 09   : 12      : 6
 10   : 1       : 6
 11   : 1       : 6
 12   : 6       : 6
 13   : 2       : 6
 14   : 2       : 6
 15   : 1       : 6

Reviewed-by: Yanmin zhang <yanmin.zhang@intel.com>
Signed-off-by: Alex Shi <alex.shi@intel.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1276754893.9452.5442.camel@debian>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 2aaceebd484..6c9e7c8735b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1657,9 +1657,6 @@ static void update_shares(struct sched_domain *sd)
 
 static void update_h_load(long cpu)
 {
-	if (root_task_group_empty())
-		return;
-
 	walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
 }
 
-- 
cgit v1.2.3


From f3b577dec1f2ce32d2db6d2ca6badff7002512af Mon Sep 17 00:00:00 2001
From: Daniel J Blueman <daniel.blueman@gmail.com>
Date: Tue, 1 Jun 2010 14:06:13 +0100
Subject: rcu: apply RCU protection to wake_affine()

The task_group() function returns a pointer that must be protected
by either RCU, the ->alloc_lock, or the cgroup lock (see the
rcu_dereference_check() in task_subsys_state(), which is invoked by
task_group()).  The wake_affine() function currently does none of these,
which means that a concurrent update would be within its rights to free
the structure returned by task_group().  Because wake_affine() uses this
structure only to compute load-balancing heuristics, there is no reason
to acquire either of the two locks.

Therefore, this commit introduces an RCU read-side critical section that
starts before the first call to task_group() and ends after the last use
of the "tg" pointer returned from task_group().  Thanks to Li Zefan for
pointing out the need to extend the RCU read-side critical section from
that proposed by the original patch.

Signed-off-by: Daniel J Blueman <daniel.blueman@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/sched_fair.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index eed35eded60..a878b5332da 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1240,6 +1240,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
 	 * effect of the currently running task from the load
 	 * of the current CPU:
 	 */
+	rcu_read_lock();
 	if (sync) {
 		tg = task_group(current);
 		weight = current->se.load.weight;
@@ -1275,6 +1276,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
 		balanced = this_eff_load <= prev_eff_load;
 	} else
 		balanced = true;
+	rcu_read_unlock();
 
 	/*
 	 * If the currently running task will sleep within
-- 
cgit v1.2.3


From 8695159967957015f8dfb49315d6f88e111d90e0 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 22 Jun 2010 11:44:53 +0200
Subject: sched: silence PROVE_RCU in sched_fork()

Because cgroup_fork() is ran before sched_fork() [ from copy_process() ]
and the child's pid is not yet visible the child is pinned to its
cgroup. Therefore we can silence this warning.

A nicer solution would be moving cgroup_fork() to right after
dup_task_struct() and exclude PF_STARTING from task_subsys_state().

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Reviewed-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/sched.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index f8b8996228d..a2d215d132f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2494,7 +2494,16 @@ void sched_fork(struct task_struct *p, int clone_flags)
 	if (p->sched_class->task_fork)
 		p->sched_class->task_fork(p);
 
+	/*
+	 * The child is not yet in the pid-hash so no cgroup attach races,
+	 * and the cgroup is pinned to this child due to cgroup_fork()
+	 * is ran before sched_fork().
+	 *
+	 * Silence PROVE_RCU.
+	 */
+	rcu_read_lock();
 	set_task_cpu(p, cpu);
+	rcu_read_unlock();
 
 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
 	if (likely(sched_info_on()))
-- 
cgit v1.2.3


From 0d98bb2656e9bd2dfda2d089db1fe1dbdab41504 Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Mon, 24 May 2010 12:11:43 -0700
Subject: sched: Prevent compiler from optimising the sched_avg_update() loop

GCC 4.4.1 on ARM has been observed to replace the while loop in
sched_avg_update with a call to uldivmod, resulting in the
following build failure at link-time:

kernel/built-in.o: In function `sched_avg_update':
 kernel/sched.c:1261: undefined reference to `__aeabi_uldivmod'
 kernel/sched.c:1261: undefined reference to `__aeabi_uldivmod'
make: *** [.tmp_vmlinux1] Error 1

This patch introduces a fake data hazard to the loop body to
prevent the compiler optimising the loop away.

Signed-off-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: <stable@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 6c9e7c8735b..a24d6d5d83f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1254,6 +1254,12 @@ static void sched_avg_update(struct rq *rq)
 	s64 period = sched_avg_period();
 
 	while ((s64)(rq->clock - rq->age_stamp) > period) {
+		/*
+		 * Inline assembly required to prevent the compiler
+		 * optimising this loop into a divmod call.
+		 * See __iter_div_u64_rem() for another example of this.
+		 */
+		asm("" : "+rm" (rq->age_stamp));
 		rq->age_stamp += period;
 		rq->rt_avg /= 2;
 	}
-- 
cgit v1.2.3


From e05bd3367bd3d88715b53766f95bb3a8ec7ab59e Mon Sep 17 00:00:00 2001
From: Pavan Naregundi <pavan@linux.vnet.ibm.com>
Date: Tue, 29 Jun 2010 15:05:28 -0700
Subject: kexec: fix Oops in crash_shrink_memory()

When crashkernel is not enabled, "echo 0 > /sys/kernel/kexec_crash_size"
OOPSes the kernel in crash_shrink_memory.  This happens when
crash_shrink_memory tries to release the 'crashk_res' resource which are
not reserved.  Also value of "/sys/kernel/kexec_crash_size" shows as 1,
which should be 0.

This patch fixes the OOPS in crash_shrink_memory and shows
"/sys/kernel/kexec_crash_size" as 0 when crash kernel memory is not
reserved.

Signed-off-by: Pavan Naregundi <pavan@linux.vnet.ibm.com>
Reviewed-by: WANG Cong <xiyou.wangcong@gmail.com>
Cc: Simon Horman <horms@verge.net.au>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kexec.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kexec.c b/kernel/kexec.c
index 474a84715ea..131b1703936 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1089,9 +1089,10 @@ void crash_kexec(struct pt_regs *regs)
 
 size_t crash_get_memory_size(void)
 {
-	size_t size;
+	size_t size = 0;
 	mutex_lock(&kexec_mutex);
-	size = crashk_res.end - crashk_res.start + 1;
+	if (crashk_res.end != crashk_res.start)
+		size = crashk_res.end - crashk_res.start + 1;
 	mutex_unlock(&kexec_mutex);
 	return size;
 }
@@ -1134,7 +1135,7 @@ int crash_shrink_memory(unsigned long new_size)
 
 	free_reserved_phys_range(end, crashk_res.end);
 
-	if (start == end)
+	if ((start == end) && (crashk_res.parent != NULL))
 		release_resource(&crashk_res);
 	crashk_res.end = end - 1;
 
-- 
cgit v1.2.3


From 7a0ea09ad5352efce8fe79ed853150449903b9f5 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.cz>
Date: Wed, 30 Jun 2010 09:51:19 +0200
Subject: futex: futex_find_get_task remove credentails check

futex_find_get_task is currently used (through lookup_pi_state) from two
contexts, futex_requeue and futex_lock_pi_atomic.  None of the paths
looks it needs the credentials check, though.  Different (e)uids
shouldn't matter at all because the only thing that is important for
shared futex is the accessibility of the shared memory.

The credentail check results in glibc assert failure or process hang (if
glibc is compiled without assert support) for shared robust pthread
mutex with priority inheritance if a process tries to lock already held
lock owned by a process with a different euid:

pthread_mutex_lock.c:312: __pthread_mutex_lock_full: Assertion `(-(e)) != 3 || !robust' failed.

The problem is that futex_lock_pi_atomic which is called when we try to
lock already held lock checks the current holder (tid is stored in the
futex value) to get the PI state.  It uses lookup_pi_state which in turn
gets task struct from futex_find_get_task.  ESRCH is returned either
when the task is not found or if credentials check fails.

futex_lock_pi_atomic simply returns if it gets ESRCH.  glibc code,
however, doesn't expect that robust lock returns with ESRCH because it
should get either success or owner died.

Signed-off-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Darren Hart <dvhltc@us.ibm.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/futex.c | 17 ++++-------------
 1 file changed, 4 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index e7a35f1039e..6a3a5fa1526 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -429,20 +429,11 @@ static void free_pi_state(struct futex_pi_state *pi_state)
 static struct task_struct * futex_find_get_task(pid_t pid)
 {
 	struct task_struct *p;
-	const struct cred *cred = current_cred(), *pcred;
 
 	rcu_read_lock();
 	p = find_task_by_vpid(pid);
-	if (!p) {
-		p = ERR_PTR(-ESRCH);
-	} else {
-		pcred = __task_cred(p);
-		if (cred->euid != pcred->euid &&
-		    cred->euid != pcred->uid)
-			p = ERR_PTR(-ESRCH);
-		else
-			get_task_struct(p);
-	}
+	if (p)
+		get_task_struct(p);
 
 	rcu_read_unlock();
 
@@ -564,8 +555,8 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
 	if (!pid)
 		return -ESRCH;
 	p = futex_find_get_task(pid);
-	if (IS_ERR(p))
-		return PTR_ERR(p);
+	if (!p)
+		return -ESRCH;
 
 	/*
 	 * We need to look at the task state flags to figure out,
-- 
cgit v1.2.3


From 8c215bd3890c347dfb6a2db4779755f8b9c298a9 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 1 Jul 2010 09:07:17 +0200
Subject: sched: Cure nr_iowait_cpu() users

Commit 0224cf4c5e (sched: Intoduce get_cpu_iowait_time_us())
broke things by not making sure preemption was indeed disabled
by the callers of nr_iowait_cpu() which took the iowait value of
the current cpu.

This resulted in a heap of preempt warnings. Cure this by making
nr_iowait_cpu() take a cpu number and fix up the callers to pass
in the right number.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arjan van de Ven <arjan@infradead.org>
Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: Rafael J. Wysocki <rjw@sisk.pl>
Cc: Maxim Levitsky <maximlevitsky@gmail.com>
Cc: Len Brown <len.brown@intel.com>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: Jiri Slaby <jslaby@suse.cz>
Cc: linux-pm@lists.linux-foundation.org
LKML-Reference: <1277968037.1868.120.camel@laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 drivers/cpuidle/governors/menu.c |  4 ++--
 include/linux/sched.h            |  2 +-
 kernel/sched.c                   |  4 ++--
 kernel/time/tick-sched.c         | 16 ++++++++--------
 4 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index 52ff8aa63f8..1b128702d30 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -143,7 +143,7 @@ static inline int which_bucket(unsigned int duration)
 	 * This allows us to calculate
 	 * E(duration)|iowait
 	 */
-	if (nr_iowait_cpu())
+	if (nr_iowait_cpu(smp_processor_id()))
 		bucket = BUCKETS/2;
 
 	if (duration < 10)
@@ -175,7 +175,7 @@ static inline int performance_multiplier(void)
 	mult += 2 * get_loadavg();
 
 	/* for IO wait tasks (per cpu!) we add 5x each */
-	mult += 10 * nr_iowait_cpu();
+	mult += 10 * nr_iowait_cpu(smp_processor_id());
 
 	return mult;
 }
diff --git a/include/linux/sched.h b/include/linux/sched.h
index f118809c953..747fcaedddb 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -139,7 +139,7 @@ extern int nr_processes(void);
 extern unsigned long nr_running(void);
 extern unsigned long nr_uninterruptible(void);
 extern unsigned long nr_iowait(void);
-extern unsigned long nr_iowait_cpu(void);
+extern unsigned long nr_iowait_cpu(int cpu);
 extern unsigned long this_cpu_load(void);
 
 
diff --git a/kernel/sched.c b/kernel/sched.c
index a24d6d5d83f..f87abe3b017 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2864,9 +2864,9 @@ unsigned long nr_iowait(void)
 	return sum;
 }
 
-unsigned long nr_iowait_cpu(void)
+unsigned long nr_iowait_cpu(int cpu)
 {
-	struct rq *this = this_rq();
+	struct rq *this = cpu_rq(cpu);
 	return atomic_read(&this->nr_iowait);
 }
 
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 1d7b9bc1c03..1a6f828e57a 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -154,14 +154,14 @@ static void tick_nohz_update_jiffies(ktime_t now)
  * Updates the per cpu time idle statistics counters
  */
 static void
-update_ts_time_stats(struct tick_sched *ts, ktime_t now, u64 *last_update_time)
+update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time)
 {
 	ktime_t delta;
 
 	if (ts->idle_active) {
 		delta = ktime_sub(now, ts->idle_entrytime);
 		ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
-		if (nr_iowait_cpu() > 0)
+		if (nr_iowait_cpu(cpu) > 0)
 			ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
 		ts->idle_entrytime = now;
 	}
@@ -175,19 +175,19 @@ static void tick_nohz_stop_idle(int cpu, ktime_t now)
 {
 	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
 
-	update_ts_time_stats(ts, now, NULL);
+	update_ts_time_stats(cpu, ts, now, NULL);
 	ts->idle_active = 0;
 
 	sched_clock_idle_wakeup_event(0);
 }
 
-static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
+static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts)
 {
 	ktime_t now;
 
 	now = ktime_get();
 
-	update_ts_time_stats(ts, now, NULL);
+	update_ts_time_stats(cpu, ts, now, NULL);
 
 	ts->idle_entrytime = now;
 	ts->idle_active = 1;
@@ -216,7 +216,7 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
 	if (!tick_nohz_enabled)
 		return -1;
 
-	update_ts_time_stats(ts, ktime_get(), last_update_time);
+	update_ts_time_stats(cpu, ts, ktime_get(), last_update_time);
 
 	return ktime_to_us(ts->idle_sleeptime);
 }
@@ -242,7 +242,7 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
 	if (!tick_nohz_enabled)
 		return -1;
 
-	update_ts_time_stats(ts, ktime_get(), last_update_time);
+	update_ts_time_stats(cpu, ts, ktime_get(), last_update_time);
 
 	return ktime_to_us(ts->iowait_sleeptime);
 }
@@ -284,7 +284,7 @@ void tick_nohz_stop_sched_tick(int inidle)
 	 */
 	ts->inidle = 1;
 
-	now = tick_nohz_start_idle(ts);
+	now = tick_nohz_start_idle(cpu, ts);
 
 	/*
 	 * If this cpu is offline and it is the one which updates
-- 
cgit v1.2.3


From ff49d74ad383f54041378144ca1a229ee9aeaa59 Mon Sep 17 00:00:00 2001
From: Yehuda Sadeh <yehuda@hq.newdream.net>
Date: Sat, 3 Jul 2010 13:07:35 +1000
Subject: module: initialize module dynamic debug later

We should initialize the module dynamic debug datastructures
only after determining that the module is not loaded yet. This
fixes a bug that introduced in 2.6.35-rc2, where when a trying
to load a module twice, we also load it's dynamic printing data
twice which causes all sorts of nasty issues. Also handle
the dynamic debug cleanup later on failure.

Signed-off-by: Yehuda Sadeh <yehuda@hq.newdream.net>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> (removed a #ifdef)
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/dynamic_debug.h |  4 ++--
 kernel/module.c               | 23 +++++++++++++++--------
 lib/dynamic_debug.c           |  2 +-
 3 files changed, 18 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h
index b3cd4de9432..52c0da4bdd1 100644
--- a/include/linux/dynamic_debug.h
+++ b/include/linux/dynamic_debug.h
@@ -40,7 +40,7 @@ int ddebug_add_module(struct _ddebug *tab, unsigned int n,
 				const char *modname);
 
 #if defined(CONFIG_DYNAMIC_DEBUG)
-extern int ddebug_remove_module(char *mod_name);
+extern int ddebug_remove_module(const char *mod_name);
 
 #define __dynamic_dbg_enabled(dd)  ({	     \
 	int __ret = 0;							     \
@@ -73,7 +73,7 @@ extern int ddebug_remove_module(char *mod_name);
 
 #else
 
-static inline int ddebug_remove_module(char *mod)
+static inline int ddebug_remove_module(const char *mod)
 {
 	return 0;
 }
diff --git a/kernel/module.c b/kernel/module.c
index 8c6b42840dd..5d2d28197c8 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2062,6 +2062,12 @@ static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num)
 #endif
 }
 
+static void dynamic_debug_remove(struct _ddebug *debug)
+{
+	if (debug)
+		ddebug_remove_module(debug->modname);
+}
+
 static void *module_alloc_update_bounds(unsigned long size)
 {
 	void *ret = module_alloc(size);
@@ -2124,6 +2130,8 @@ static noinline struct module *load_module(void __user *umod,
 	void *ptr = NULL; /* Stops spurious gcc warning */
 	unsigned long symoffs, stroffs, *strmap;
 	void __percpu *percpu;
+	struct _ddebug *debug = NULL;
+	unsigned int num_debug = 0;
 
 	mm_segment_t old_fs;
 
@@ -2476,15 +2484,9 @@ static noinline struct module *load_module(void __user *umod,
 	kfree(strmap);
 	strmap = NULL;
 
-	if (!mod->taints) {
-		struct _ddebug *debug;
-		unsigned int num_debug;
-
+	if (!mod->taints)
 		debug = section_objs(hdr, sechdrs, secstrings, "__verbose",
 				     sizeof(*debug), &num_debug);
-		if (debug)
-			dynamic_debug_setup(debug, num_debug);
-	}
 
 	err = module_finalize(hdr, sechdrs, mod);
 	if (err < 0)
@@ -2526,10 +2528,13 @@ static noinline struct module *load_module(void __user *umod,
 		goto unlock;
 	}
 
+	if (debug)
+		dynamic_debug_setup(debug, num_debug);
+
 	/* Find duplicate symbols */
 	err = verify_export_symbols(mod);
 	if (err < 0)
-		goto unlock;
+		goto ddebug;
 
 	list_add_rcu(&mod->list, &modules);
 	mutex_unlock(&module_mutex);
@@ -2557,6 +2562,8 @@ static noinline struct module *load_module(void __user *umod,
 	mutex_lock(&module_mutex);
 	/* Unlink carefully: kallsyms could be walking list. */
 	list_del_rcu(&mod->list);
+ ddebug:
+	dynamic_debug_remove(debug);
  unlock:
 	mutex_unlock(&module_mutex);
 	synchronize_sched();
diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c
index 3df8eb17a60..02afc253372 100644
--- a/lib/dynamic_debug.c
+++ b/lib/dynamic_debug.c
@@ -692,7 +692,7 @@ static void ddebug_table_free(struct ddebug_table *dt)
  * Called in response to a module being unloaded.  Removes
  * any ddebug_table's which point at the module.
  */
-int ddebug_remove_module(char *mod_name)
+int ddebug_remove_module(const char *mod_name)
 {
 	struct ddebug_table *dt, *nextdt;
 	int ret = -ENOENT;
-- 
cgit v1.2.3


From 9078370c0d2cfe4a905aa34f398bbb0d65921a2b Mon Sep 17 00:00:00 2001
From: Catalin Marinas <catalin.marinas@arm.com>
Date: Mon, 19 Jul 2010 11:54:15 +0100
Subject: kmemleak: Add support for NO_BOOTMEM configurations

With commits 08677214 and 59be5a8e, alloc_bootmem()/free_bootmem() and
friends use the early_res functions for memory management when
NO_BOOTMEM is enabled. This patch adds the kmemleak calls in the
corresponding code paths for bootmem allocations.

Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
Acked-by: Pekka Enberg <penberg@cs.helsinki.fi>
Acked-by: Yinghai Lu <yinghai@kernel.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: stable@kernel.org
---
 kernel/early_res.c | 6 ++++++
 mm/page_alloc.c    | 5 +++++
 2 files changed, 11 insertions(+)

(limited to 'kernel')

diff --git a/kernel/early_res.c b/kernel/early_res.c
index 31aa9332ef3..7bfae887f21 100644
--- a/kernel/early_res.c
+++ b/kernel/early_res.c
@@ -7,6 +7,8 @@
 #include <linux/bootmem.h>
 #include <linux/mm.h>
 #include <linux/early_res.h>
+#include <linux/slab.h>
+#include <linux/kmemleak.h>
 
 /*
  * Early reserved memory areas.
@@ -319,6 +321,8 @@ void __init free_early(u64 start, u64 end)
 	struct early_res *r;
 	int i;
 
+	kmemleak_free_part(__va(start), end - start);
+
 	i = find_overlapped_early(start, end);
 	r = &early_res[i];
 	if (i >= max_early_res || r->end != end || r->start != start)
@@ -333,6 +337,8 @@ void __init free_early_partial(u64 start, u64 end)
 	struct early_res *r;
 	int i;
 
+	kmemleak_free_part(__va(start), end - start);
+
 	if (start == end)
 		return;
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 431214b941a..68319dd20be 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3659,6 +3659,11 @@ void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
 		ptr = phys_to_virt(addr);
 		memset(ptr, 0, size);
 		reserve_early_without_check(addr, addr + size, "BOOTMEM");
+		/*
+		 * The min_count is set to 0 so that bootmem allocated blocks
+		 * are never reported as leaks.
+		 */
+		kmemleak_alloc(ptr, size, 0, 0);
 		return ptr;
 	}
 
-- 
cgit v1.2.3


From 1396a21ba0d4ec381db19bc9cd5b6f25a89cf633 Mon Sep 17 00:00:00 2001
From: Martin Hicks <mort@sgi.com>
Date: Wed, 21 Jul 2010 19:27:05 -0500
Subject: kdb: break out of kdb_ll() when command is terminated

Without this patch the "ll" linked-list traversal command won't
terminate when you hit q/Q.

Signed-off-by: Martin Hicks <mort@sgi.com>
Signed-off-by: Jason Wessel <jason.wessel@windriver.com>
---
 kernel/debug/kdb/kdb_main.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 184cd8209c3..a7fe2e98d61 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -2291,6 +2291,9 @@ static int kdb_ll(int argc, const char **argv)
 	while (va) {
 		char buf[80];
 
+		if (KDB_FLAG(CMD_INTERRUPT))
+			return 0;
+
 		sprintf(buf, "%s " kdb_machreg_fmt "\n", command, va);
 		diag = kdb_parse(buf);
 		if (diag)
-- 
cgit v1.2.3


From fb82c0ff27b2c40c6f7a3d1a94cafb154591fa80 Mon Sep 17 00:00:00 2001
From: Jason Wessel <jason.wessel@windriver.com>
Date: Wed, 21 Jul 2010 19:27:05 -0500
Subject: repair gdbstub to match the gdbserial protocol specification

The gdbserial protocol handler should return an empty packet instead
of an error string when ever it responds to a command it does not
implement.

The problem cases come from a debugger client sending
qTBuffer, qTStatus, qSearch, qSupported.

The incorrect response from the gdbstub leads the debugger clients to
not function correctly.  Recent versions of gdb will not detach correctly as a result of this behavior.

Signed-off-by: Jason Wessel <jason.wessel@windriver.com>
Signed-off-by: Dongdong Deng <dongdong.deng@windriver.com>
---
 kernel/debug/gdbstub.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index 4b17b326952..e8fd6868682 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -621,10 +621,8 @@ static void gdb_cmd_query(struct kgdb_state *ks)
 	switch (remcom_in_buffer[1]) {
 	case 's':
 	case 'f':
-		if (memcmp(remcom_in_buffer + 2, "ThreadInfo", 10)) {
-			error_packet(remcom_out_buffer, -EINVAL);
+		if (memcmp(remcom_in_buffer + 2, "ThreadInfo", 10))
 			break;
-		}
 
 		i = 0;
 		remcom_out_buffer[0] = 'm';
@@ -665,10 +663,9 @@ static void gdb_cmd_query(struct kgdb_state *ks)
 		pack_threadid(remcom_out_buffer + 2, thref);
 		break;
 	case 'T':
-		if (memcmp(remcom_in_buffer + 1, "ThreadExtraInfo,", 16)) {
-			error_packet(remcom_out_buffer, -EINVAL);
+		if (memcmp(remcom_in_buffer + 1, "ThreadExtraInfo,", 16))
 			break;
-		}
+
 		ks->threadid = 0;
 		ptr = remcom_in_buffer + 17;
 		kgdb_hex2long(&ptr, &ks->threadid);
-- 
cgit v1.2.3


From 9e8b624fcaebf9c237b5be9116f4424bf168e6d1 Mon Sep 17 00:00:00 2001
From: Jason Wessel <jason.wessel@windriver.com>
Date: Wed, 21 Jul 2010 19:27:06 -0500
Subject: Fix merge regression from external kdb to upstream kdb

In the process of merging kdb to the mainline, the kdb lsmod command
stopped printing the base load address of kernel modules.  This is
needed for using kdb in conjunction with external tools such as gdb.

Simply restore the functionality by adding a kdb_printf for the base
load address of the kernel modules.

Signed-off-by: Jason Wessel <jason.wessel@windriver.com>
---
 kernel/debug/kdb/kdb_main.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index a7fe2e98d61..7e9bfd54a0d 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -1883,6 +1883,7 @@ static int kdb_lsmod(int argc, const char **argv)
 			kdb_printf(" (Loading)");
 		else
 			kdb_printf(" (Live)");
+		kdb_printf(" 0x%p", mod->module_core);
 
 #ifdef CONFIG_MODULE_UNLOAD
 		{
-- 
cgit v1.2.3


From b0679c63db655fa12007558e267bc0eb1d486fdb Mon Sep 17 00:00:00 2001
From: Jason Wessel <jason.wessel@windriver.com>
Date: Wed, 21 Jul 2010 19:27:07 -0500
Subject: debug_core,kdb: fix kgdb_connected bit set in the wrong place

Immediately following an exit from the kdb shell the kgdb_connected
variable should be set to zero, unless there are breakpoints planted.
If the kgdb_connected variable is not zeroed out with kdb, it is
impossible to turn off kdb.

This patch is merely a work around for now, the real fix will check
for the breakpoints.

Signed-off-by: Jason Wessel <jason.wessel@windriver.com>
---
 kernel/debug/debug_core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 5cb7cd1de10..8bc5eeffec8 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -605,13 +605,13 @@ cpu_master_loop:
 		if (dbg_kdb_mode) {
 			kgdb_connected = 1;
 			error = kdb_stub(ks);
+			kgdb_connected = 0;
 		} else {
 			error = gdb_serial_stub(ks);
 		}
 
 		if (error == DBG_PASS_EVENT) {
 			dbg_kdb_mode = !dbg_kdb_mode;
-			kgdb_connected = 0;
 		} else if (error == DBG_SWITCH_CPU_EVENT) {
 			dbg_cpu_switch(cpu, dbg_switch_cpu);
 			goto cpu_loop;
-- 
cgit v1.2.3


From edd63cb6b91024332d6983fc51058ac1ef0c081e Mon Sep 17 00:00:00 2001
From: Jason Wessel <jason.wessel@windriver.com>
Date: Wed, 21 Jul 2010 19:27:07 -0500
Subject: sysrq,kdb: Use __handle_sysrq() for kdb's sysrq function

The kdb code should not toggle the sysrq state in case an end user
wants to try and resume the normal kernel execution.

Signed-off-by: Jason Wessel <jason.wessel@windriver.com>
Acked-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 drivers/char/sysrq.c        | 2 +-
 include/linux/sysrq.h       | 1 +
 kernel/debug/kdb/kdb_main.c | 3 +--
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c
index 5d64e3acb00..878ac0c2cc6 100644
--- a/drivers/char/sysrq.c
+++ b/drivers/char/sysrq.c
@@ -493,7 +493,7 @@ static void __sysrq_put_key_op(int key, struct sysrq_key_op *op_p)
                 sysrq_key_table[i] = op_p;
 }
 
-static void __handle_sysrq(int key, struct tty_struct *tty, int check_mask)
+void __handle_sysrq(int key, struct tty_struct *tty, int check_mask)
 {
 	struct sysrq_key_op *op_p;
 	int orig_log_level;
diff --git a/include/linux/sysrq.h b/include/linux/sysrq.h
index 4496322e28d..609e8ca5f53 100644
--- a/include/linux/sysrq.h
+++ b/include/linux/sysrq.h
@@ -45,6 +45,7 @@ struct sysrq_key_op {
  */
 
 void handle_sysrq(int key, struct tty_struct *tty);
+void __handle_sysrq(int key, struct tty_struct *tty, int check_mask);
 int register_sysrq_key(int key, struct sysrq_key_op *op);
 int unregister_sysrq_key(int key, struct sysrq_key_op *op);
 struct sysrq_key_op *__sysrq_get_key_op(int key);
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 7e9bfd54a0d..ebe4a287419 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -1820,9 +1820,8 @@ static int kdb_sr(int argc, const char **argv)
 {
 	if (argc != 1)
 		return KDB_ARGCOUNT;
-	sysrq_toggle_support(1);
 	kdb_trap_printk++;
-	handle_sysrq(*argv[1], NULL);
+	__handle_sysrq(*argv[1], NULL, 0);
 	kdb_trap_printk--;
 
 	return 0;
-- 
cgit v1.2.3


From b82bab4bbe9efa7bc7177fc20620fff19bd95484 Mon Sep 17 00:00:00 2001
From: Jason Baron <jbaron@redhat.com>
Date: Tue, 27 Jul 2010 13:18:01 -0700
Subject: dynamic debug: move ddebug_remove_module() down into free_module()

The command

	echo "file ec.c +p" >/sys/kernel/debug/dynamic_debug/control

causes an oops.

Move the call to ddebug_remove_module() down into free_module().  In this
way it should be called from all error paths.  Currently, we are missing
the remove if the module init routine fails.

Signed-off-by: Jason Baron <jbaron@redhat.com>
Reported-by: Thomas Renninger <trenn@suse.de>
Tested-by: Thomas Renninger <trenn@suse.de>
Cc: <stable@kernel.org>		[2.6.32+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/module.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 5d2d28197c8..6c562828c85 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -787,7 +787,6 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
 
 	/* Store the name of the last unloaded module for diagnostic purposes */
 	strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module));
-	ddebug_remove_module(mod->name);
 
 	free_module(mod);
 	return 0;
@@ -1550,6 +1549,9 @@ static void free_module(struct module *mod)
 	remove_sect_attrs(mod);
 	mod_kobject_remove(mod);
 
+	/* Remove dynamic debug info */
+	ddebug_remove_module(mod->name);
+
 	/* Arch-specific cleanup. */
 	module_arch_cleanup(mod);
 
-- 
cgit v1.2.3


From de09a9771a5346029f4d11e4ac886be7f9bfdd75 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 29 Jul 2010 12:45:49 +0100
Subject: CRED: Fix get_task_cred() and task_state() to not resurrect dead
 credentials

It's possible for get_task_cred() as it currently stands to 'corrupt' a set of
credentials by incrementing their usage count after their replacement by the
task being accessed.

What happens is that get_task_cred() can race with commit_creds():

	TASK_1			TASK_2			RCU_CLEANER
	-->get_task_cred(TASK_2)
	rcu_read_lock()
	__cred = __task_cred(TASK_2)
				-->commit_creds()
				old_cred = TASK_2->real_cred
				TASK_2->real_cred = ...
				put_cred(old_cred)
				  call_rcu(old_cred)
		[__cred->usage == 0]
	get_cred(__cred)
		[__cred->usage == 1]
	rcu_read_unlock()
							-->put_cred_rcu()
							[__cred->usage == 1]
							panic()

However, since a tasks credentials are generally not changed very often, we can
reasonably make use of a loop involving reading the creds pointer and using
atomic_inc_not_zero() to attempt to increment it if it hasn't already hit zero.

If successful, we can safely return the credentials in the knowledge that, even
if the task we're accessing has released them, they haven't gone to the RCU
cleanup code.

We then change task_state() in procfs to use get_task_cred() rather than
calling get_cred() on the result of __task_cred(), as that suffers from the
same problem.

Without this change, a BUG_ON in __put_cred() or in put_cred_rcu() can be
tripped when it is noticed that the usage count is not zero as it ought to be,
for example:

kernel BUG at kernel/cred.c:168!
invalid opcode: 0000 [#1] SMP
last sysfs file: /sys/kernel/mm/ksm/run
CPU 0
Pid: 2436, comm: master Not tainted 2.6.33.3-85.fc13.x86_64 #1 0HR330/OptiPlex
745
RIP: 0010:[<ffffffff81069881>]  [<ffffffff81069881>] __put_cred+0xc/0x45
RSP: 0018:ffff88019e7e9eb8  EFLAGS: 00010202
RAX: 0000000000000001 RBX: ffff880161514480 RCX: 00000000ffffffff
RDX: 00000000ffffffff RSI: ffff880140c690c0 RDI: ffff880140c690c0
RBP: ffff88019e7e9eb8 R08: 00000000000000d0 R09: 0000000000000000
R10: 0000000000000001 R11: 0000000000000040 R12: ffff880140c690c0
R13: ffff88019e77aea0 R14: 00007fff336b0a5c R15: 0000000000000001
FS:  00007f12f50d97c0(0000) GS:ffff880007400000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f8f461bc000 CR3: 00000001b26ce000 CR4: 00000000000006f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Process master (pid: 2436, threadinfo ffff88019e7e8000, task ffff88019e77aea0)
Stack:
 ffff88019e7e9ec8 ffffffff810698cd ffff88019e7e9ef8 ffffffff81069b45
<0> ffff880161514180 ffff880161514480 ffff880161514180 0000000000000000
<0> ffff88019e7e9f28 ffffffff8106aace 0000000000000001 0000000000000246
Call Trace:
 [<ffffffff810698cd>] put_cred+0x13/0x15
 [<ffffffff81069b45>] commit_creds+0x16b/0x175
 [<ffffffff8106aace>] set_current_groups+0x47/0x4e
 [<ffffffff8106ac89>] sys_setgroups+0xf6/0x105
 [<ffffffff81009b02>] system_call_fastpath+0x16/0x1b
Code: 48 8d 71 ff e8 7e 4e 15 00 85 c0 78 0b 8b 75 ec 48 89 df e8 ef 4a 15 00
48 83 c4 18 5b c9 c3 55 8b 07 8b 07 48 89 e5 85 c0 74 04 <0f> 0b eb fe 65 48 8b
04 25 00 cc 00 00 48 3b b8 58 04 00 00 75
RIP  [<ffffffff81069881>] __put_cred+0xc/0x45
 RSP <ffff88019e7e9eb8>
---[ end trace df391256a100ebdd ]---

Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: Jiri Olsa <jolsa@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/array.c      |  2 +-
 include/linux/cred.h | 21 +--------------------
 kernel/cred.c        | 25 +++++++++++++++++++++++++
 3 files changed, 27 insertions(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/fs/proc/array.c b/fs/proc/array.c
index 9b58d38bc91..fff6572676a 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -176,7 +176,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
 		if (tracer)
 			tpid = task_pid_nr_ns(tracer, ns);
 	}
-	cred = get_cred((struct cred *) __task_cred(p));
+	cred = get_task_cred(p);
 	seq_printf(m,
 		"State:\t%s\n"
 		"Tgid:\t%d\n"
diff --git a/include/linux/cred.h b/include/linux/cred.h
index 75c0fa88130..ce40cbc791e 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -153,6 +153,7 @@ struct cred {
 extern void __put_cred(struct cred *);
 extern void exit_creds(struct task_struct *);
 extern int copy_creds(struct task_struct *, unsigned long);
+extern const struct cred *get_task_cred(struct task_struct *);
 extern struct cred *cred_alloc_blank(void);
 extern struct cred *prepare_creds(void);
 extern struct cred *prepare_exec_creds(void);
@@ -281,26 +282,6 @@ static inline void put_cred(const struct cred *_cred)
 #define __task_cred(task) \
 	((const struct cred *)(rcu_dereference_check((task)->real_cred, rcu_read_lock_held() || lockdep_tasklist_lock_is_held())))
 
-/**
- * get_task_cred - Get another task's objective credentials
- * @task: The task to query
- *
- * Get the objective credentials of a task, pinning them so that they can't go
- * away.  Accessing a task's credentials directly is not permitted.
- *
- * The caller must make sure task doesn't go away, either by holding a ref on
- * task or by holding tasklist_lock to prevent it from being unlinked.
- */
-#define get_task_cred(task)				\
-({							\
-	struct cred *__cred;				\
-	rcu_read_lock();				\
-	__cred = (struct cred *) __task_cred((task));	\
-	get_cred(__cred);				\
-	rcu_read_unlock();				\
-	__cred;						\
-})
-
 /**
  * get_current_cred - Get the current task's subjective credentials
  *
diff --git a/kernel/cred.c b/kernel/cred.c
index a2d5504fbcc..60bc8b1e32e 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -209,6 +209,31 @@ void exit_creds(struct task_struct *tsk)
 	}
 }
 
+/**
+ * get_task_cred - Get another task's objective credentials
+ * @task: The task to query
+ *
+ * Get the objective credentials of a task, pinning them so that they can't go
+ * away.  Accessing a task's credentials directly is not permitted.
+ *
+ * The caller must also make sure task doesn't get deleted, either by holding a
+ * ref on task or by holding tasklist_lock to prevent it from being unlinked.
+ */
+const struct cred *get_task_cred(struct task_struct *task)
+{
+	const struct cred *cred;
+
+	rcu_read_lock();
+
+	do {
+		cred = __task_cred((task));
+		BUG_ON(!cred);
+	} while (!atomic_inc_not_zero(&((struct cred *)cred)->usage));
+
+	rcu_read_unlock();
+	return cred;
+}
+
 /*
  * Allocate blank credentials, such that the credentials can be filled in at a
  * later date without risk of ENOMEM.
-- 
cgit v1.2.3