From 051a1d1afa47206e23ae03f781c6795ce870e3d5 Mon Sep 17 00:00:00 2001
From: Dmitry Adamushko
Date: Tue, 18 Dec 2007 15:21:13 +0100
Subject: sched: fix crash on ia64, introduce task_current()

Some services (e.g. sched_setscheduler(), rt_mutex_setprio() and
sched_move_task()) must handle a given task differently in case it is the
'rq->curr' task on its run-queue.  The task_running() interface is not
suitable for determining such tasks on platforms with one of the following
options:

#define __ARCH_WANT_UNLOCKED_CTXSW
#define __ARCH_WANT_INTERRUPTS_ON_CTXSW

It uses 'p->oncpu == 1' as its criterion, but such a task is not
necessarily 'rq->curr'.

The detailed explanation is available here:

  https://lists.linux-foundation.org/pipermail/containers/2007-December/009262.html

Signed-off-by: Dmitry Adamushko
Signed-off-by: Ingo Molnar
Tested-by: Dhaval Giani
Tested-by: KAMEZAWA Hiroyuki
---
 kernel/sched.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index c6e551de795b..5ae0d4296e7c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -508,10 +508,15 @@ EXPORT_SYMBOL_GPL(cpu_clock);
 # define finish_arch_switch(prev)	do { } while (0)
 #endif
 
+static inline int task_current(struct rq *rq, struct task_struct *p)
+{
+	return rq->curr == p;
+}
+
 #ifndef __ARCH_WANT_UNLOCKED_CTXSW
 static inline int task_running(struct rq *rq, struct task_struct *p)
 {
-	return rq->curr == p;
+	return task_current(rq, p);
 }
 
 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
@@ -540,7 +545,7 @@ static inline int task_running(struct rq *rq, struct task_struct *p)
 #ifdef CONFIG_SMP
 	return p->oncpu;
 #else
-	return rq->curr == p;
+	return task_current(rq, p);
 #endif
 }
 
@@ -3334,7 +3339,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
 
 	rq = task_rq_lock(p, &flags);
 	ns = p->se.sum_exec_runtime;
-	if (rq->curr == p) {
+	if (task_current(rq, p)) {
 		update_rq_clock(rq);
 		delta_exec = rq->clock - p->se.exec_start;
 		if ((s64)delta_exec > 0)
@@ -4021,7 +4026,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 
 	oldprio = p->prio;
 	on_rq = p->se.on_rq;
-	running = task_running(rq, p);
+	running = task_current(rq, p);
 	if (on_rq) {
 		dequeue_task(rq, p, 0);
 		if (running)
@@ -4332,7 +4337,7 @@ recheck:
 	}
 	update_rq_clock(rq);
 	on_rq = p->se.on_rq;
-	running = task_running(rq, p);
+	running = task_current(rq, p);
 	if (on_rq) {
 		deactivate_task(rq, p, 0);
 		if (running)
@@ -7101,7 +7106,7 @@ void sched_move_task(struct task_struct *tsk)
 
 	update_rq_clock(rq);
 
-	running = task_running(rq, tsk);
+	running = task_current(rq, tsk);
 	on_rq = tsk->se.on_rq;
 
 	if (on_rq) {
--
cgit v1.2.3
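To make the race concrete, here is a stand-alone sketch (not kernel code:
the field names mirror the kernel's, but the types and the switch sequence
are simplified for illustration) of why 'p->oncpu' and 'rq->curr == p' can
disagree during an unlocked context switch, which is the case the patch
above addresses:

#include <stdio.h>

struct task { int oncpu; };
struct rq { struct task *curr; };

/* the predicate the patch introduces */
static int task_current(struct rq *rq, struct task *p) { return rq->curr == p; }
/* the __ARCH_WANT_UNLOCKED_CTXSW + SMP flavour of task_running() */
static int task_running(struct rq *rq, struct task *p) { return p->oncpu; }

int main(void)
{
	struct task prev = { .oncpu = 1 }, next = { .oncpu = 0 };
	struct rq rq = { .curr = &prev };

	/* Unlocked context switch: rq->curr is moved to 'next' first ... */
	rq.curr = &next;
	next.oncpu = 1;
	/* ... but 'prev' has not finished switching out, so prev.oncpu is still 1. */

	printf("task_running(prev)=%d task_current(prev)=%d\n",
	       task_running(&rq, &prev), task_current(&rq, &prev));
	/* Prints 1 and 0 respectively: only task_current() identifies the
	 * rq->curr task, which is what sched_setscheduler(), rt_mutex_setprio()
	 * and sched_move_task() actually need to know. */
	return 0;
}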
From c7af77b584b02d3e321b00203a618a9c93782121 Mon Sep 17 00:00:00 2001
From: Livio Soares
Date: Tue, 18 Dec 2007 15:21:13 +0100
Subject: sched: mark rwsem functions as __sched for wchan/profiling

The following commit:

  http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commitdiff;h=fdf8cb0909b531f9ae8f9b9d7e4eb35ba3505f07

un-inlined a low-level rwsem function, but did not mark it as __sched.  The
result is that it now shows up as a thread's wchan (which also affects
/proc/profile stats).  This patch fixes that by properly marking
rwsem_down_failed_common() as a __sched function.

The patch also marks down_read() and down_write() proper as __sched; that
part is up for discussion.  For profiling, it is pretty much useless to know
that a semaphore is being held - it is necessary to know _which_ one.  By
going up another frame on the stack, the information becomes much more
useful.

In summary, the change to lib/rwsem.c should be applied; the changes to
kernel/rwsem.c could be applied if other kernel hackers agree with my
proposal that seeing only down_read()/down_write() in the profile is not
enough.

[ akpm@linux-foundation.org: build fix ]

Signed-off-by: Livio Soares
Signed-off-by: Andrew Morton
Signed-off-by: Ingo Molnar
---
 kernel/rwsem.c | 5 +++--
 lib/rwsem.c    | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/kernel/rwsem.c b/kernel/rwsem.c
index 1ec620c03064..cae050b05f5e 100644
--- a/kernel/rwsem.c
+++ b/kernel/rwsem.c
@@ -6,6 +6,7 @@
 
 #include <linux/types.h>
 #include <linux/kernel.h>
+#include <linux/sched.h>
 #include <linux/module.h>
 #include <linux/rwsem.h>
 
@@ -15,7 +16,7 @@
 /*
  * lock for reading
  */
-void down_read(struct rw_semaphore *sem)
+void __sched down_read(struct rw_semaphore *sem)
 {
 	might_sleep();
 	rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
@@ -42,7 +43,7 @@ EXPORT_SYMBOL(down_read_trylock);
 /*
  * lock for writing
  */
-void down_write(struct rw_semaphore *sem)
+void __sched down_write(struct rw_semaphore *sem)
 {
 	might_sleep();
 	rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
diff --git a/lib/rwsem.c b/lib/rwsem.c
index cdb4e3d05607..7d02700a4b0e 100644
--- a/lib/rwsem.c
+++ b/lib/rwsem.c
@@ -146,7 +146,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, int downgrading)
 /*
  * wait for a lock to be granted
  */
-static struct rw_semaphore *
+static struct rw_semaphore __sched *
 rwsem_down_failed_common(struct rw_semaphore *sem,
 			 struct rwsem_waiter *waiter, signed long adjustment)
 {
--
cgit v1.2.3

From 73c4efd2c88a41c8a4810904266a34423b5584e5 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Tue, 18 Dec 2007 15:21:13 +0100
Subject: sched: sysctl, proc_dointvec_minmax() expects int values for

min_sched_granularity_ns, max_sched_granularity_ns,
min_wakeup_granularity_ns and max_wakeup_granularity_ns are declared
"unsigned long".

This is incorrect since proc_dointvec_minmax() expects plain "int" guard
values.

This bug only triggers on big endian 64 bit arches.

Signed-off-by: Eric Dumazet
Signed-off-by: Ingo Molnar
---
 kernel/sysctl.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 1135de730872..c68f68dcc605 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -225,10 +225,10 @@ static struct ctl_table root_table[] = {
 };
 
 #ifdef CONFIG_SCHED_DEBUG
-static unsigned long min_sched_granularity_ns = 100000;		/* 100 usecs */
-static unsigned long max_sched_granularity_ns = NSEC_PER_SEC;	/* 1 second */
-static unsigned long min_wakeup_granularity_ns;			/* 0 usecs */
-static unsigned long max_wakeup_granularity_ns = NSEC_PER_SEC;	/* 1 second */
+static int min_sched_granularity_ns = 100000;			/* 100 usecs */
+static int max_sched_granularity_ns = NSEC_PER_SEC;		/* 1 second */
+static int min_wakeup_granularity_ns;				/* 0 usecs */
+static int max_wakeup_granularity_ns = NSEC_PER_SEC;		/* 1 second */
 #endif
 
 static struct ctl_table kern_table[] = {
--
cgit v1.2.3
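To make the endianness failure above concrete, here is a small user-space
sketch (not kernel code; the variable name is borrowed from the patch and
the constant is NSEC_PER_SEC) of what happens when an 8-byte guard value is
read through the 'int *' pointers proc_dointvec_minmax() takes from
table->extra1/extra2:

#include <stdio.h>

int main(void)
{
	/* Guard declared as in the buggy version of kernel/sysctl.c. */
	unsigned long max_sched_granularity_ns = 1000000000UL;	/* NSEC_PER_SEC */

	/* proc_dointvec_minmax() dereferences ->extra2 as an 'int *'. */
	int seen = *(int *)&max_sched_granularity_ns;

	/*
	 * On a little endian 64-bit machine the low four bytes come first and
	 * this prints 1000000000; on a big endian one the high (all-zero)
	 * bytes come first and it prints 0, so the sysctl rejects every write
	 * as being above its apparent maximum.
	 */
	printf("guard value seen by the handler: %d\n", seen);
	return 0;
}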
From 2bacec8c318ca0418c0ee9ac662ee44207765dd4 Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Tue, 18 Dec 2007 15:21:13 +0100
Subject: sched: touch softlockup watchdog after idling

Touch the softlockup watchdog after idling.

Signed-off-by: Ingo Molnar
---
 kernel/sched.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/sched.c b/kernel/sched.c
index 5ae0d4296e7c..3df84ea6aba9 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -668,6 +668,7 @@ void sched_clock_idle_wakeup_event(u64 delta_ns)
 	struct rq *rq = cpu_rq(smp_processor_id());
 	u64 now = sched_clock();
 
+	touch_softlockup_watchdog();
 	rq->idle_clock += delta_ns;
 	/*
 	 * Override the previous timestamp and ignore all
--
cgit v1.2.3

From 6cbf1c126cf6a727287d61b122fde00a8b827bfe Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Tue, 18 Dec 2007 15:21:13 +0100
Subject: sched: do not hurt SCHED_BATCH on wakeup

Measurements by Yanmin Zhang have shown that SCHED_BATCH tasks benefit if
they run the same place_entity() logic as SCHED_OTHER tasks - so unify
behavior in this area.

Signed-off-by: Ingo Molnar
---
 kernel/sched_fair.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index c33f0ceb3de9..da7c061e7206 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -511,8 +511,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 
 	if (!initial) {
 		/* sleeps upto a single latency don't count. */
-		if (sched_feat(NEW_FAIR_SLEEPERS) && entity_is_task(se) &&
-				task_of(se)->policy != SCHED_BATCH)
+		if (sched_feat(NEW_FAIR_SLEEPERS) && entity_is_task(se))
 			vruntime -= sysctl_sched_latency;
 
 		/* ensure we never gain time by being placed backwards. */
--
cgit v1.2.3
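As an aside, the wakeup-placement rule the last patch touches can be
sketched in a few lines of stand-alone C (the constants and the helper
below are illustrative, not kernel code; only the removed SCHED_BATCH
exception corresponds to the actual diff):

#include <stdio.h>

#define NSEC_PER_SEC	1000000000ULL

/* illustrative value for the sched_latency_ns tunable */
static unsigned long long sysctl_sched_latency = 20000000ULL;	/* 20 ms */

/* Simplified wakeup branch of place_entity(): after the patch it no longer
 * asks whether the entity belongs to a SCHED_BATCH task. */
static unsigned long long place_on_wakeup(unsigned long long min_vruntime,
					  unsigned long long se_vruntime)
{
	unsigned long long vruntime = min_vruntime;

	/* sleeps up to a single latency don't count */
	vruntime -= sysctl_sched_latency;

	/* ensure we never gain time by being placed backwards */
	return vruntime > se_vruntime ? vruntime : se_vruntime;
}

int main(void)
{
	/* A task that slept for a while is placed one latency before
	 * min_vruntime - now regardless of SCHED_OTHER vs SCHED_BATCH. */
	printf("%llu\n", place_on_wakeup(5 * NSEC_PER_SEC, 1 * NSEC_PER_SEC));
	return 0;
}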