From 051a1d1afa47206e23ae03f781c6795ce870e3d5 Mon Sep 17 00:00:00 2001
From: Dmitry Adamushko
Date: Tue, 18 Dec 2007 15:21:13 +0100
Subject: sched: fix crash on ia64, introduce task_current()

Some services (e.g. sched_setscheduler(), rt_mutex_setprio() and
sched_move_task()) must handle a given task differently in case it is the
'rq->curr' task on its run-queue.  The task_running() interface is not
suitable for determining such tasks on platforms with one of the following
options:

#define __ARCH_WANT_UNLOCKED_CTXSW
#define __ARCH_WANT_INTERRUPTS_ON_CTXSW

It uses 'p->oncpu == 1' as its criterion, but such a task is not
necessarily 'rq->curr'.

The detailed explanation is available here:

  https://lists.linux-foundation.org/pipermail/containers/2007-December/009262.html

Signed-off-by: Dmitry Adamushko
Signed-off-by: Ingo Molnar
Tested-by: Dhaval Giani
Tested-by: KAMEZAWA Hiroyuki
---
 kernel/sched.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index c6e551de795b..5ae0d4296e7c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -508,10 +508,15 @@ EXPORT_SYMBOL_GPL(cpu_clock);
 # define finish_arch_switch(prev)	do { } while (0)
 #endif
 
+static inline int task_current(struct rq *rq, struct task_struct *p)
+{
+	return rq->curr == p;
+}
+
 #ifndef __ARCH_WANT_UNLOCKED_CTXSW
 static inline int task_running(struct rq *rq, struct task_struct *p)
 {
-	return rq->curr == p;
+	return task_current(rq, p);
 }
 
 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
@@ -540,7 +545,7 @@ static inline int task_running(struct rq *rq, struct task_struct *p)
 #ifdef CONFIG_SMP
 	return p->oncpu;
 #else
-	return rq->curr == p;
+	return task_current(rq, p);
 #endif
 }
 
@@ -3334,7 +3339,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
 
 	rq = task_rq_lock(p, &flags);
 	ns = p->se.sum_exec_runtime;
-	if (rq->curr == p) {
+	if (task_current(rq, p)) {
 		update_rq_clock(rq);
 		delta_exec = rq->clock - p->se.exec_start;
 		if ((s64)delta_exec > 0)
@@ -4021,7 +4026,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 
 	oldprio = p->prio;
 	on_rq = p->se.on_rq;
-	running = task_running(rq, p);
+	running = task_current(rq, p);
 	if (on_rq) {
 		dequeue_task(rq, p, 0);
 		if (running)
@@ -4332,7 +4337,7 @@ recheck:
 	}
 	update_rq_clock(rq);
 	on_rq = p->se.on_rq;
-	running = task_running(rq, p);
+	running = task_current(rq, p);
 	if (on_rq) {
 		deactivate_task(rq, p, 0);
 		if (running)
@@ -7101,7 +7106,7 @@ void sched_move_task(struct task_struct *tsk)
 
 	update_rq_clock(rq);
 
-	running = task_running(rq, tsk);
+	running = task_current(rq, tsk);
 	on_rq = tsk->se.on_rq;
 
 	if (on_rq) {
--
cgit v1.2.3
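To make the race concrete, here is a stand-alone sketch (not kernel code:
the field names mirror the kernel's, but the types and the switch sequence
are simplified for illustration) of why 'p->oncpu' and 'rq->curr == p' can
disagree during an unlocked context switch, which is the case the patch
above addresses:

#include <stdio.h>

struct task { int oncpu; };
struct rq { struct task *curr; };

/* the predicate the patch introduces */
static int task_current(struct rq *rq, struct task *p) { return rq->curr == p; }
/* the __ARCH_WANT_UNLOCKED_CTXSW + SMP flavour of task_running() */
static int task_running(struct rq *rq, struct task *p) { return p->oncpu; }

int main(void)
{
	struct task prev = { .oncpu = 1 }, next = { .oncpu = 0 };
	struct rq rq = { .curr = &prev };

	/* Unlocked context switch: rq->curr is moved to 'next' first ... */
	rq.curr = &next;
	next.oncpu = 1;
	/* ... but 'prev' has not finished switching out, so prev.oncpu is still 1. */

	printf("task_running(prev)=%d task_current(prev)=%d\n",
	       task_running(&rq, &prev), task_current(&rq, &prev));
	/* Prints 1 and 0 respectively: only task_current() identifies the
	 * rq->curr task, which is what sched_setscheduler(), rt_mutex_setprio()
	 * and sched_move_task() actually need to know. */
	return 0;
}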
From c7af77b584b02d3e321b00203a618a9c93782121 Mon Sep 17 00:00:00 2001
From: Livio Soares
Date: Tue, 18 Dec 2007 15:21:13 +0100
Subject: sched: mark rwsem functions as __sched for wchan/profiling

The following commit:

  http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commitdiff;h=fdf8cb0909b531f9ae8f9b9d7e4eb35ba3505f07

un-inlined a low-level rwsem function, but did not mark it as __sched.  The
result is that it now shows up as a thread's wchan (which also affects
/proc/profile stats).  This patch fixes that by properly marking
rwsem_down_failed_common() as a __sched function.

The patch also marks down_read() and down_write() proper as __sched; that
part is up for discussion.  For profiling, it is pretty much useless to know
that a semaphore is being held - it is necessary to know _which_ one.  By
going up another frame on the stack, the information becomes much more
useful.

In summary, the change to lib/rwsem.c should be applied; the changes to
kernel/rwsem.c could be applied if other kernel hackers agree with my
proposal that seeing only down_read()/down_write() in the profile is not
enough.

[ akpm@linux-foundation.org: build fix ]

Signed-off-by: Livio Soares
Signed-off-by: Andrew Morton
Signed-off-by: Ingo Molnar
---
 kernel/rwsem.c | 5 +++--
 lib/rwsem.c    | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/kernel/rwsem.c b/kernel/rwsem.c
index 1ec620c03064..cae050b05f5e 100644
--- a/kernel/rwsem.c
+++ b/kernel/rwsem.c
@@ -6,6 +6,7 @@
 
 #include <linux/types.h>
 #include <linux/kernel.h>
+#include <linux/sched.h>
 #include <linux/module.h>
 #include <linux/rwsem.h>
 
@@ -15,7 +16,7 @@
 /*
  * lock for reading
  */
-void down_read(struct rw_semaphore *sem)
+void __sched down_read(struct rw_semaphore *sem)
 {
 	might_sleep();
 	rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
@@ -42,7 +43,7 @@ EXPORT_SYMBOL(down_read_trylock);
 /*
  * lock for writing
  */
-void down_write(struct rw_semaphore *sem)
+void __sched down_write(struct rw_semaphore *sem)
 {
 	might_sleep();
 	rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
diff --git a/lib/rwsem.c b/lib/rwsem.c
index cdb4e3d05607..7d02700a4b0e 100644
--- a/lib/rwsem.c
+++ b/lib/rwsem.c
@@ -146,7 +146,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, int downgrading)
 /*
  * wait for a lock to be granted
  */
-static struct rw_semaphore *
+static struct rw_semaphore __sched *
 rwsem_down_failed_common(struct rw_semaphore *sem,
 			 struct rwsem_waiter *waiter, signed long adjustment)
 {
--
cgit v1.2.3

From 73c4efd2c88a41c8a4810904266a34423b5584e5 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Tue, 18 Dec 2007 15:21:13 +0100
Subject: sched: sysctl, proc_dointvec_minmax() expects int values for

min_sched_granularity_ns, max_sched_granularity_ns,
min_wakeup_granularity_ns and max_wakeup_granularity_ns are declared
"unsigned long".

This is incorrect since proc_dointvec_minmax() expects plain "int" guard
values.

This bug only triggers on big endian 64 bit arches.

Signed-off-by: Eric Dumazet
Signed-off-by: Ingo Molnar
---
 kernel/sysctl.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 1135de730872..c68f68dcc605 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -225,10 +225,10 @@ static struct ctl_table root_table[] = {
 };
 
 #ifdef CONFIG_SCHED_DEBUG
-static unsigned long min_sched_granularity_ns = 100000;		/* 100 usecs */
-static unsigned long max_sched_granularity_ns = NSEC_PER_SEC;	/* 1 second */
-static unsigned long min_wakeup_granularity_ns;			/* 0 usecs */
-static unsigned long max_wakeup_granularity_ns = NSEC_PER_SEC;	/* 1 second */
+static int min_sched_granularity_ns = 100000;			/* 100 usecs */
+static int max_sched_granularity_ns = NSEC_PER_SEC;		/* 1 second */
+static int min_wakeup_granularity_ns;				/* 0 usecs */
+static int max_wakeup_granularity_ns = NSEC_PER_SEC;		/* 1 second */
 #endif
 
 static struct ctl_table kern_table[] = {
--
cgit v1.2.3
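To make the endianness failure above concrete, here is a small user-space
sketch (not kernel code; the variable name is borrowed from the patch and
the constant is NSEC_PER_SEC) of what happens when an 8-byte guard value is
read through the 'int *' pointers proc_dointvec_minmax() takes from
table->extra1/extra2:

#include <stdio.h>

int main(void)
{
	/* Guard declared as in the buggy version of kernel/sysctl.c. */
	unsigned long max_sched_granularity_ns = 1000000000UL;	/* NSEC_PER_SEC */

	/* proc_dointvec_minmax() dereferences ->extra2 as an 'int *'. */
	int seen = *(int *)&max_sched_granularity_ns;

	/*
	 * On a little endian 64-bit machine the low four bytes come first and
	 * this prints 1000000000; on a big endian one the high (all-zero)
	 * bytes come first and it prints 0, so the sysctl rejects every write
	 * as being above its apparent maximum.
	 */
	printf("guard value seen by the handler: %d\n", seen);
	return 0;
}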
From 2bacec8c318ca0418c0ee9ac662ee44207765dd4 Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Tue, 18 Dec 2007 15:21:13 +0100
Subject: sched: touch softlockup watchdog after idling

Touch the softlockup watchdog after idling.

Signed-off-by: Ingo Molnar
---
 kernel/sched.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/sched.c b/kernel/sched.c
index 5ae0d4296e7c..3df84ea6aba9 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -668,6 +668,7 @@ void sched_clock_idle_wakeup_event(u64 delta_ns)
 	struct rq *rq = cpu_rq(smp_processor_id());
 	u64 now = sched_clock();
 
+	touch_softlockup_watchdog();
 	rq->idle_clock += delta_ns;
 	/*
 	 * Override the previous timestamp and ignore all
--
cgit v1.2.3

From 6cbf1c126cf6a727287d61b122fde00a8b827bfe Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Tue, 18 Dec 2007 15:21:13 +0100
Subject: sched: do not hurt SCHED_BATCH on wakeup

Measurements by Yanmin Zhang have shown that SCHED_BATCH tasks benefit if
they run the same place_entity() logic as SCHED_OTHER tasks - so unify
behavior in this area.

Signed-off-by: Ingo Molnar
---
 kernel/sched_fair.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index c33f0ceb3de9..da7c061e7206 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -511,8 +511,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 
 	if (!initial) {
 		/* sleeps upto a single latency don't count. */
-		if (sched_feat(NEW_FAIR_SLEEPERS) && entity_is_task(se) &&
-				task_of(se)->policy != SCHED_BATCH)
+		if (sched_feat(NEW_FAIR_SLEEPERS) && entity_is_task(se))
 			vruntime -= sysctl_sched_latency;
 
 		/* ensure we never gain time by being placed backwards. */
--
cgit v1.2.3
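As an aside, the wakeup-placement rule the last patch touches can be
sketched in a few lines of stand-alone C (the constants and the helper
below are illustrative, not kernel code; only the removed SCHED_BATCH
exception corresponds to the actual diff):

#include <stdio.h>

#define NSEC_PER_SEC	1000000000ULL

/* illustrative value for the sched_latency_ns tunable */
static unsigned long long sysctl_sched_latency = 20000000ULL;	/* 20 ms */

/* Simplified wakeup branch of place_entity(): after the patch it no longer
 * asks whether the entity belongs to a SCHED_BATCH task. */
static unsigned long long place_on_wakeup(unsigned long long min_vruntime,
					  unsigned long long se_vruntime)
{
	unsigned long long vruntime = min_vruntime;

	/* sleeps up to a single latency don't count */
	vruntime -= sysctl_sched_latency;

	/* ensure we never gain time by being placed backwards */
	return vruntime > se_vruntime ? vruntime : se_vruntime;
}

int main(void)
{
	/* A task that slept for a while is placed one latency before
	 * min_vruntime - now regardless of SCHED_OTHER vs SCHED_BATCH. */
	printf("%llu\n", place_on_wakeup(5 * NSEC_PER_SEC, 1 * NSEC_PER_SEC));
	return 0;
}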