From e05510d01ad1565e5e086a939261084d67ba2b10 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 5 May 2008 23:56:17 +0200 Subject: sched: optimize calc_delta_mine() Joel noticed that the !lw->inv_weight condition isn't unlikely anymore so remove the unlikely annotation. Also, remove the two div64_u64() inv_weight calculations, which makes them rely on the calc_delta_mine() path as well. Signed-off-by: Peter Zijlstra CC: Joel Schopp Signed-off-by: Ingo Molnar --- kernel/sched.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 34bcc5bc120..00c1ba706a5 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1438,8 +1438,8 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight, { u64 tmp; - if (unlikely(!lw->inv_weight)) - lw->inv_weight = (WMULT_CONST-lw->weight/2) / (lw->weight+1); + if (!lw->inv_weight) + lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)/(lw->weight+1); tmp = (u64)delta_exec * weight; /* @@ -8025,7 +8025,7 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, se->my_q = cfs_rq; se->load.weight = tg->shares; - se->load.inv_weight = div64_u64(1ULL<<32, se->load.weight); + se->load.inv_weight = 0; se->parent = parent; } #endif @@ -8692,7 +8692,7 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares) dequeue_entity(cfs_rq, se, 0); se->load.weight = shares; - se->load.inv_weight = div64_u64((1ULL<<32), shares); + se->load.inv_weight = 0; if (on_rq) enqueue_entity(cfs_rq, se, 0); -- cgit v1.2.3 From d478c2cfaa2476f8b6876f9eb4d8fddcfa986479 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 26 Apr 2008 11:30:34 -0700 Subject: sched: add debug checks to idle functions Cc: Venkatesh Pallipadi Cc: "Justin Mattock" Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 00c1ba706a5..ed3caf26990 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1124,6 +1124,7 @@ void sched_clock_idle_sleep_event(void) { struct rq *rq = cpu_rq(smp_processor_id()); + WARN_ON(!irqs_disabled()); spin_lock(&rq->lock); __update_rq_clock(rq); spin_unlock(&rq->lock); @@ -1139,6 +1140,7 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) struct rq *rq = cpu_rq(smp_processor_id()); u64 now = sched_clock(); + WARN_ON(!irqs_disabled()); rq->idle_clock += delta_ns; /* * Override the previous timestamp and ignore all -- cgit v1.2.3 From 983ed7a66bcec9dc307d89dc7af47cdf209e56af Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Thu, 24 Apr 2008 18:17:55 -0700 Subject: sched: add statics, don't return void expressions Noticed by sparse: kernel/sched.c:760:20: warning: symbol 'sched_feat_names' was not declared. Should it be static? kernel/sched.c:767:5: warning: symbol 'sched_feat_open' was not declared. Should it be static?
kernel/sched_fair.c:845:3: warning: returning void-valued expression kernel/sched.c:4386:3: warning: returning void-valued expression Signed-off-by: Harvey Harrison Signed-off-by: Ingo Molnar --- kernel/sched.c | 10 ++++++---- kernel/sched_fair.c | 6 ++++-- 2 files changed, 10 insertions(+), 6 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index ed3caf26990..d941ddc9ec1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -757,14 +757,14 @@ const_debug unsigned int sysctl_sched_features = #define SCHED_FEAT(name, enabled) \ #name , -__read_mostly char *sched_feat_names[] = { +static __read_mostly char *sched_feat_names[] = { #include "sched_features.h" NULL }; #undef SCHED_FEAT -int sched_feat_open(struct inode *inode, struct file *filp) +static int sched_feat_open(struct inode *inode, struct file *filp) { filp->private_data = inode->i_private; return 0; @@ -4341,8 +4341,10 @@ void account_system_time(struct task_struct *p, int hardirq_offset, struct rq *rq = this_rq(); cputime64_t tmp; - if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) - return account_guest_time(p, cputime); + if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { + account_guest_time(p, cputime); + return; + } p->stime = cputime_add(p->stime, cputime); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 1295ddc5656..e8e5ad2614b 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -841,8 +841,10 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) * queued ticks are scheduled to match the slice, so don't bother * validating it and just reschedule. */ - if (queued) - return resched_task(rq_of(cfs_rq)->curr); + if (queued) { + resched_task(rq_of(cfs_rq)->curr); + return; + } /* * don't let the period tick interfere with the hrtick preemption */ -- cgit v1.2.3 From b328ca182f01c2a04b85e0ee8a410720b104fbcc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 29 Apr 2008 10:02:46 +0200 Subject: sched: fix hrtick_start_fair and CPU-Hotplug Gautham R Shenoy reported: > While running the usual CPU-Hotplug stress tests on linux-2.6.25, > I noticed the following in the console logs. > > This is a wee bit difficult to reproduce. In the past 10 runs I hit this > only once. > > ------------[ cut here ]------------ > > WARNING: at kernel/sched.c:962 hrtick+0x2e/0x65() > > Just wondering if we are doing a good job at handling the cancellation > of any per-cpu scheduler timers during CPU-Hotplug. It looks like the timer is indeed not cancelled at all and instead migrates to another cpu. Fix it via a proper hotplug notifier mechanism.
Reported-by: Gautham R Shenoy Signed-off-by: Peter Zijlstra Cc: stable@kernel.org Signed-off-by: Ingo Molnar --- kernel/sched.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 65 insertions(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index d941ddc9ec1..bee9cbe13c1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1191,6 +1191,7 @@ static inline void resched_rq(struct rq *rq) enum { HRTICK_SET, /* re-programm hrtick_timer */ HRTICK_RESET, /* not a new slice */ + HRTICK_BLOCK, /* stop hrtick operations */ }; /* @@ -1202,6 +1203,8 @@ static inline int hrtick_enabled(struct rq *rq) { if (!sched_feat(HRTICK)) return 0; + if (unlikely(test_bit(HRTICK_BLOCK, &rq->hrtick_flags))) + return 0; return hrtimer_is_hres_active(&rq->hrtick_timer); } @@ -1284,7 +1287,63 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer) return HRTIMER_NORESTART; } -static inline void init_rq_hrtick(struct rq *rq) +static void hotplug_hrtick_disable(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + spin_lock_irqsave(&rq->lock, flags); + rq->hrtick_flags = 0; + __set_bit(HRTICK_BLOCK, &rq->hrtick_flags); + spin_unlock_irqrestore(&rq->lock, flags); + + hrtick_clear(rq); +} + +static void hotplug_hrtick_enable(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + spin_lock_irqsave(&rq->lock, flags); + __clear_bit(HRTICK_BLOCK, &rq->hrtick_flags); + spin_unlock_irqrestore(&rq->lock, flags); +} + +static int +hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu) +{ + int cpu = (int)(long)hcpu; + + switch (action) { + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + case CPU_DEAD: + case CPU_DEAD_FROZEN: + hotplug_hrtick_disable(cpu); + return NOTIFY_OK; + + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + case CPU_DOWN_FAILED: + case CPU_DOWN_FAILED_FROZEN: + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + hotplug_hrtick_enable(cpu); + return NOTIFY_OK; + } + + return NOTIFY_DONE; +} + +static void init_hrtick(void) +{ + hotcpu_notifier(hotplug_hrtick, 0); +} + +static void init_rq_hrtick(struct rq *rq) { rq->hrtick_flags = 0; hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); @@ -1321,6 +1380,10 @@ static inline void init_rq_hrtick(struct rq *rq) void hrtick_resched(void) { } + +static inline void init_hrtick(void) +{ +} #endif /* @@ -7943,6 +8006,7 @@ void __init sched_init_smp(void) put_online_cpus(); /* XXX: Theoretical race here - CPU may be hotplugged now */ hotcpu_notifier(update_sched_domains, 0); + init_hrtick(); /* Move init over to a non-isolated CPU */ if (set_cpus_allowed_ptr(current, &non_isolated_cpus) < 0) -- cgit v1.2.3 From 673a90a1e05c8127886f7659d1a457169378371f Mon Sep 17 00:00:00 2001 From: David Simner Date: Tue, 29 Apr 2008 10:08:59 +0100 Subject: sched: fix sched_info_switch not being called according to documentation http://bugzilla.kernel.org/show_bug.cgi?id=10545 sched_stats.h says that __sched_info_switch is "called when prev != next" in the comment. sched.c should therefore do that. 
Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index bee9cbe13c1..3ac3d7af04a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4662,9 +4662,9 @@ need_resched_nonpreemptible: prev->sched_class->put_prev_task(rq, prev); next = pick_next_task(rq, prev); - sched_info_switch(prev, next); - if (likely(prev != next)) { + sched_info_switch(prev, next); + rq->nr_switches++; rq->curr = next; ++*switch_count; -- cgit v1.2.3 From 690229a0912ca2fef8b542fe4d8b73acfcdc6e24 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 23 Apr 2008 09:31:35 +0200 Subject: sched: make clock sync tunable by architecture code make time_sync_thresh tunable to architecture code. Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 ++ kernel/sched.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/include/linux/sched.h b/include/linux/sched.h index 698b5a4d25a..54c9ca26b7d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -158,6 +158,8 @@ print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) } #endif +extern unsigned long long time_sync_thresh; + /* * Task state bitmask. NOTE! These bits are also * encoded in fs/proc/array.c: get_task_state(). diff --git a/kernel/sched.c b/kernel/sched.c index 3ac3d7af04a..8f433fedfcb 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -899,7 +899,7 @@ static inline u64 global_rt_runtime(void) return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; } -static const unsigned long long time_sync_thresh = 100000; +unsigned long long time_sync_thresh = 100000; static DEFINE_PER_CPU(unsigned long long, time_offset); static DEFINE_PER_CPU(unsigned long long, prev_cpu_time); -- cgit v1.2.3 From 712555ee4f873515612f89554ad1a3fda5fa887e Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 28 Apr 2008 11:33:07 +0200 Subject: sched: fix missing locking in sched_domains code Concurrent calls to detach_destroy_domains and arch_init_sched_domains were prevented by the old scheduler subsystem cpu hotplug mutex. When this got converted to get_online_cpus() the locking got broken. Unlike before now several processes can concurrently enter the critical sections that were protected by the old lock. So use the already present doms_cur_mutex to protect these sections again. Cc: Gautham R Shenoy Cc: Paul Jackson Signed-off-by: Heiko Carstens Signed-off-by: Ingo Molnar --- kernel/sched.c | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 8f433fedfcb..561b3b39bdb 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -242,6 +242,12 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) } #endif +/* + * sched_domains_mutex serializes calls to arch_init_sched_domains, + * detach_destroy_domains and partition_sched_domains. 
+ */ +static DEFINE_MUTEX(sched_domains_mutex); + #ifdef CONFIG_GROUP_SCHED #include @@ -308,9 +314,6 @@ static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; */ static DEFINE_SPINLOCK(task_group_lock); -/* doms_cur_mutex serializes access to doms_cur[] array */ -static DEFINE_MUTEX(doms_cur_mutex); - #ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_USER_SCHED # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) @@ -358,21 +361,9 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) #endif } -static inline void lock_doms_cur(void) -{ - mutex_lock(&doms_cur_mutex); -} - -static inline void unlock_doms_cur(void) -{ - mutex_unlock(&doms_cur_mutex); -} - #else static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } -static inline void lock_doms_cur(void) { } -static inline void unlock_doms_cur(void) { } #endif /* CONFIG_GROUP_SCHED */ @@ -7822,7 +7813,7 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, { int i, j; - lock_doms_cur(); + mutex_lock(&sched_domains_mutex); /* always unregister in case we don't destroy any domains */ unregister_sched_domain_sysctl(); @@ -7871,7 +7862,7 @@ match2: register_sched_domain_sysctl(); - unlock_doms_cur(); + mutex_unlock(&sched_domains_mutex); } #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) @@ -7880,8 +7871,10 @@ int arch_reinit_sched_domains(void) int err; get_online_cpus(); + mutex_lock(&sched_domains_mutex); detach_destroy_domains(&cpu_online_map); err = arch_init_sched_domains(&cpu_online_map); + mutex_unlock(&sched_domains_mutex); put_online_cpus(); return err; @@ -7999,10 +7992,12 @@ void __init sched_init_smp(void) BUG_ON(sched_group_nodes_bycpu == NULL); #endif get_online_cpus(); + mutex_lock(&sched_domains_mutex); arch_init_sched_domains(&cpu_online_map); cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); if (cpus_empty(non_isolated_cpus)) cpu_set(smp_processor_id(), non_isolated_cpus); + mutex_unlock(&sched_domains_mutex); put_online_cpus(); /* XXX: Theoretical race here - CPU may be hotplugged now */ hotcpu_notifier(update_sched_domains, 0); -- cgit v1.2.3 From cb4ad1ffc7c0d8ea7dc8cd8ba303d83551716d46 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 28 Apr 2008 12:54:56 +0800 Subject: sched: fair-group: fix a Div0 error of the fair group scheduler When I echoed 0 into the "cpu.shares" file, a Div0 error occurred. We found it is caused by the following call chain: sched_group_set_shares(tg, shares) set_se_shares(tg->se[i], shares/nr_cpu_ids) __set_se_shares(se, shares) div64_64((1ULL<<32), shares) When the echoed value was less than the number of processors, the result of the expression "shares/nr_cpu_ids" was 0; the system then passed that zero to div64_64() as the divisor, and the Div0 error occurred. It is unnecessary to divide the shares value by nr_cpu_ids, I think, because in the functions __update_group_shares_cpu() and init_tg_cfs_entry() the shares value isn't divided by nr_cpu_ids when setting the shares of the sched entity. This patch fixes this bug. Echoing the ULONG_MAX value into cpu.shares also causes a Div0 error, so we add a macro MAX_SHARES to limit the max value of shares.
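As an illustration only (not part of the patch), here is a minimal userspace sketch of the failure mode and of the clamping introduced here; the CPU count, the echoed value and the helper name are assumptions:

#include <stdio.h>

#define MIN_SHARES 2UL
#define MAX_SHARES (~0UL - 1)

/* Sketch of the clamp this patch applies instead of pre-dividing. */
static unsigned long clamp_shares(unsigned long shares)
{
        if (shares < MIN_SHARES)
                shares = MIN_SHARES;
        else if (shares > MAX_SHARES)
                shares = MAX_SHARES;
        return shares;
}

int main(void)
{
        unsigned long nr_cpu_ids = 8;   /* assumed number of CPUs */
        unsigned long echoed = 4;       /* echo 4 > cpu.shares */

        /* old scheme: 4/8 == 0, later used as divisor in div64_64(1ULL<<32, 0) */
        unsigned long old_per_se = echoed / nr_cpu_ids;
        /* new scheme: pass the clamped value through unchanged, no Div0 */
        unsigned long new_per_se = clamp_shares(echoed);

        printf("old: %lu new: %lu\n", old_per_se, new_per_se);
        return 0;
}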
Signed-off-by: Miao Xie Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 561b3b39bdb..f98f75f3c70 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -321,7 +321,13 @@ static DEFINE_SPINLOCK(task_group_lock); # define INIT_TASK_GROUP_LOAD NICE_0_LOAD #endif +/* + * A weight of 0, 1 or ULONG_MAX can cause arithmetics problems. + * (The default weight is 1024 - so there's no practical + * limitation from this.) + */ #define MIN_SHARES 2 +#define MAX_SHARES (ULONG_MAX - 1) static int init_task_group_load = INIT_TASK_GROUP_LOAD; #endif @@ -1804,6 +1810,8 @@ __update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd, if (shares < MIN_SHARES) shares = MIN_SHARES; + else if (shares > MAX_SHARES) + shares = MAX_SHARES; __set_se_shares(tg->se[tcpu], shares); } @@ -8785,13 +8793,10 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) if (!tg->se[0]) return -EINVAL; - /* - * A weight of 0 or 1 can cause arithmetics problems. - * (The default weight is 1024 - so there's no practical - * limitation from this.) - */ if (shares < MIN_SHARES) shares = MIN_SHARES; + else if (shares > MAX_SHARES) + shares = MAX_SHARES; mutex_lock(&shares_mutex); if (tg->shares == shares) @@ -8816,7 +8821,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) * force a rebalance */ cfs_rq_set_shares(tg->cfs_rq[i], 0); - set_se_shares(tg->se[i], shares/nr_cpu_ids); + set_se_shares(tg->se[i], shares); } /* -- cgit v1.2.3 From dfbf4a1bc319f0f9a31e39b2da1fa5c55e85af89 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 23 Apr 2008 09:24:06 +0200 Subject: sched: fix cpu clock David Miller pointed out that nothing in cpu_clock() sets prev_cpu_time. This caused __sync_cpu_clock() to be called all the time - against the intention of this code. The result was that in practice we hit a global spinlock every time cpu_clock() is called - which, even though cpu_clock() is used for tracing and debugging, is suboptimal. While at it, also: - move the irq disabling to the outermost layer; this should make cpu_clock() warp-free when called with irqs enabled. - use long long instead of cycles_t - for platforms where cycles_t is 32-bit.
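As an illustration only (not from the patch), a small userspace model of the resync logic; the threshold value and the names are assumptions. It shows why forgetting to store the synchronized time back into prev_time makes the locked slow path run on almost every call, while the fixed version takes it only about once per threshold interval:

#include <stdio.h>

#define TIME_SYNC_THRESH 100000ULL      /* assumed threshold, in ns */

static unsigned long long prev_time;    /* per-cpu in the kernel */
static unsigned long long slow_calls;   /* stands in for the global-lock path */

static unsigned long long sync_clock(unsigned long long time)
{
        slow_calls++;
        return time;
}

static unsigned long long clock_read(unsigned long long raw)
{
        if (raw - prev_time > TIME_SYNC_THRESH) {
                raw = sync_clock(raw);
                prev_time = raw;        /* the update the old code was missing */
        }
        return raw;
}

int main(void)
{
        unsigned long long t;

        for (t = 0; t < 1000000; t += 1000)
                clock_read(t);
        printf("slow path taken %llu times\n", slow_calls);    /* ~9, not ~900 */
        return 0;
}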
Reported-by: David Miller Signed-off-by: Ingo Molnar --- kernel/sched.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index f98f75f3c70..9457106b18a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -910,11 +910,14 @@ static DEFINE_PER_CPU(unsigned long long, prev_cpu_time); static DEFINE_SPINLOCK(time_sync_lock); static unsigned long long prev_global_time; -static unsigned long long __sync_cpu_clock(cycles_t time, int cpu) +static unsigned long long __sync_cpu_clock(unsigned long long time, int cpu) { - unsigned long flags; - - spin_lock_irqsave(&time_sync_lock, flags); + /* + * We want this inlined, to not get tracer function calls + * in this critical section: + */ + spin_acquire(&time_sync_lock.dep_map, 0, 0, _THIS_IP_); + __raw_spin_lock(&time_sync_lock.raw_lock); if (time < prev_global_time) { per_cpu(time_offset, cpu) += prev_global_time - time; @@ -923,7 +926,8 @@ static unsigned long long __sync_cpu_clock(cycles_t time, int cpu) prev_global_time = time; } - spin_unlock_irqrestore(&time_sync_lock, flags); + __raw_spin_unlock(&time_sync_lock.raw_lock); + spin_release(&time_sync_lock.dep_map, 1, _THIS_IP_); return time; } @@ -931,7 +935,6 @@ static unsigned long long __cpu_clock(int cpu) static unsigned long long __cpu_clock(int cpu) { unsigned long long now; - unsigned long flags; struct rq *rq; /* @@ -941,11 +944,9 @@ static unsigned long long __cpu_clock(int cpu) if (unlikely(!scheduler_running)) return 0; - local_irq_save(flags); rq = cpu_rq(cpu); update_rq_clock(rq); now = rq->clock; - local_irq_restore(flags); return now; } @@ -957,13 +958,18 @@ static unsigned long long __cpu_clock(int cpu) unsigned long long cpu_clock(int cpu) { unsigned long long prev_cpu_time, time, delta_time; + unsigned long flags; + local_irq_save(flags); prev_cpu_time = per_cpu(prev_cpu_time, cpu); time = __cpu_clock(cpu) + per_cpu(time_offset, cpu); delta_time = time-prev_cpu_time; - if (unlikely(delta_time > time_sync_thresh)) + if (unlikely(delta_time > time_sync_thresh)) { time = __sync_cpu_clock(time, cpu); + per_cpu(prev_cpu_time, cpu) = time; + } + local_irq_restore(flags); return time; } -- cgit v1.2.3 From 3e51f33fcc7f55e6df25d15b55ed10c8b4da84cd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 3 May 2008 18:29:28 +0200 Subject: sched: add optional support for CONFIG_HAVE_UNSTABLE_SCHED_CLOCK this replaces the rq->clock stuff (and possibly cpu_clock()). - architectures that have an 'imperfect' hardware clock can set CONFIG_HAVE_UNSTABLE_SCHED_CLOCK - the 'jiffie' window might be superfluous when we update tick_gtod before the __update_sched_clock() call in sched_clock_tick() - cpu_clock() might be implemented as: sched_clock_cpu(smp_processor_id()) if the accuracy proves good enough - how far can TSC drift in a single jiffie when considering the filtering and idle hooks?
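As an illustration only (not from the patch), a userspace model of the per-cpu filter that sched_clock_cpu() applies: the raw sched_clock() delta is clipped into a [min_clock, max_clock] window derived from gtod plus elapsed jiffies, which keeps the result monotonic and within roughly one tick of the stable clock. The helper name, the tick length and the test values are assumptions:

#include <stdio.h>

#define TICK_NSEC 1000000ULL            /* assumed 1000 HZ tick, in ns */

struct scd {                            /* trimmed-down sched_clock_data */
        unsigned long long prev_raw, tick_gtod, clock;
        unsigned long prev_jiffies;
};

static unsigned long long update_clock(struct scd *scd,
                                       unsigned long long now,
                                       unsigned long now_jiffies)
{
        long dj = (long)(now_jiffies - scd->prev_jiffies);
        unsigned long long min_clock = scd->tick_gtod + dj * TICK_NSEC;
        unsigned long long max_clock = min_clock + TICK_NSEC;
        long long delta = (long long)(now - scd->prev_raw);
        unsigned long long clock = scd->clock;

        if (delta < 0)                          /* raw clock went backwards */
                clock++;
        else if (clock + delta > max_clock)     /* too large a forward jump */
                clock = clock < max_clock ? max_clock : clock + 1;
        else
                clock += delta;

        if (clock < min_clock)                  /* never fall behind the window */
                clock = min_clock;

        scd->prev_raw = now;
        scd->prev_jiffies = now_jiffies;
        scd->clock = clock;
        return clock;
}

int main(void)
{
        struct scd scd = { 0, 0, 0, 0 };

        /* a wild 50ms jump of the raw clock within one jiffy is clipped
         * to the top of the window, i.e. about two ticks (2000000 ns) */
        printf("%llu\n", update_clock(&scd, 50000000ULL, 1));
        return 0;
}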
[ mingo@elte.hu: various fixes and cleanups ] Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/sched.h | 29 +++++++ init/main.c | 1 + kernel/Makefile | 2 +- kernel/sched.c | 165 +++-------------------------------- kernel/sched_clock.c | 236 ++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/sched_debug.c | 7 -- kernel/sched_fair.c | 2 +- 7 files changed, 281 insertions(+), 161 deletions(-) create mode 100644 kernel/sched_clock.c (limited to 'kernel/sched.c') diff --git a/include/linux/sched.h b/include/linux/sched.h index 54c9ca26b7d..0c35b0343a7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1553,6 +1553,35 @@ static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) extern unsigned long long sched_clock(void); +#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK +static inline void sched_clock_init(void) +{ +} + +static inline u64 sched_clock_cpu(int cpu) +{ + return sched_clock(); +} + +static inline void sched_clock_tick(void) +{ +} + +static inline void sched_clock_idle_sleep_event(void) +{ +} + +static inline void sched_clock_idle_wakeup_event(u64 delta_ns) +{ +} +#else +extern void sched_clock_init(void); +extern u64 sched_clock_cpu(int cpu); +extern void sched_clock_tick(void); +extern void sched_clock_idle_sleep_event(void); +extern void sched_clock_idle_wakeup_event(u64 delta_ns); +#endif + /* * For kernel-internal use: high-speed (but slightly incorrect) per-cpu * clock constructed from sched_clock(): diff --git a/init/main.c b/init/main.c index a87d4ca5c36..ddada7acf36 100644 --- a/init/main.c +++ b/init/main.c @@ -602,6 +602,7 @@ asmlinkage void __init start_kernel(void) softirq_init(); timekeeping_init(); time_init(); + sched_clock_init(); profile_init(); if (!irqs_disabled()) printk("start_kernel(): bug: interrupts were enabled early\n"); diff --git a/kernel/Makefile b/kernel/Makefile index 188c43223f5..1c9938addb9 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -9,7 +9,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ rcupdate.o extable.o params.o posix-timers.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ - notifier.o ksysfs.o pm_qos_params.o + notifier.o ksysfs.o pm_qos_params.o sched_clock.o obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o obj-$(CONFIG_STACKTRACE) += stacktrace.o diff --git a/kernel/sched.c b/kernel/sched.c index 9457106b18a..58fb8af1577 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -74,16 +74,6 @@ #include #include -/* - * Scheduler clock - returns current time in nanosec units. - * This is default implementation. - * Architectures and sub-architectures can override this. - */ -unsigned long long __attribute__((weak)) sched_clock(void) -{ - return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ); -} - /* * Convert user-nice values [ -20 ... 0 ... 
19 ] * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], @@ -557,13 +547,7 @@ struct rq { unsigned long next_balance; struct mm_struct *prev_mm; - u64 clock, prev_clock_raw; - s64 clock_max_delta; - - unsigned int clock_warps, clock_overflows, clock_underflows; - u64 idle_clock; - unsigned int clock_deep_idle_events; - u64 tick_timestamp; + u64 clock; atomic_t nr_iowait; @@ -628,82 +612,6 @@ static inline int cpu_of(struct rq *rq) #endif } -#ifdef CONFIG_NO_HZ -static inline bool nohz_on(int cpu) -{ - return tick_get_tick_sched(cpu)->nohz_mode != NOHZ_MODE_INACTIVE; -} - -static inline u64 max_skipped_ticks(struct rq *rq) -{ - return nohz_on(cpu_of(rq)) ? jiffies - rq->last_tick_seen + 2 : 1; -} - -static inline void update_last_tick_seen(struct rq *rq) -{ - rq->last_tick_seen = jiffies; -} -#else -static inline u64 max_skipped_ticks(struct rq *rq) -{ - return 1; -} - -static inline void update_last_tick_seen(struct rq *rq) -{ -} -#endif - -/* - * Update the per-runqueue clock, as finegrained as the platform can give - * us, but without assuming monotonicity, etc.: - */ -static void __update_rq_clock(struct rq *rq) -{ - u64 prev_raw = rq->prev_clock_raw; - u64 now = sched_clock(); - s64 delta = now - prev_raw; - u64 clock = rq->clock; - -#ifdef CONFIG_SCHED_DEBUG - WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); -#endif - /* - * Protect against sched_clock() occasionally going backwards: - */ - if (unlikely(delta < 0)) { - clock++; - rq->clock_warps++; - } else { - /* - * Catch too large forward jumps too: - */ - u64 max_jump = max_skipped_ticks(rq) * TICK_NSEC; - u64 max_time = rq->tick_timestamp + max_jump; - - if (unlikely(clock + delta > max_time)) { - if (clock < max_time) - clock = max_time; - else - clock++; - rq->clock_overflows++; - } else { - if (unlikely(delta > rq->clock_max_delta)) - rq->clock_max_delta = delta; - clock += delta; - } - } - - rq->prev_clock_raw = now; - rq->clock = clock; -} - -static void update_rq_clock(struct rq *rq) -{ - if (likely(smp_processor_id() == cpu_of(rq))) - __update_rq_clock(rq); -} - /* * The domain tree (rq->sd) is protected by RCU's quiescent state transition. * See detach_destroy_domains: synchronize_sched for details. 
@@ -719,6 +627,11 @@ static void update_rq_clock(struct rq *rq) #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) +static inline void update_rq_clock(struct rq *rq) +{ + rq->clock = sched_clock_cpu(cpu_of(rq)); +} + /* * Tunables that become constants when CONFIG_SCHED_DEBUG is off: */ @@ -935,7 +848,6 @@ static unsigned long long __sync_cpu_clock(unsigned long long time, int cpu) static unsigned long long __cpu_clock(int cpu) { unsigned long long now; - struct rq *rq; /* * Only call sched_clock() if the scheduler has already been @@ -944,9 +856,7 @@ static unsigned long long __cpu_clock(int cpu) if (unlikely(!scheduler_running)) return 0; - rq = cpu_rq(cpu); - update_rq_clock(rq); - now = rq->clock; + now = sched_clock_cpu(cpu); return now; } @@ -1120,45 +1030,6 @@ static struct rq *this_rq_lock(void) return rq; } -/* - * We are going deep-idle (irqs are disabled): - */ -void sched_clock_idle_sleep_event(void) -{ - struct rq *rq = cpu_rq(smp_processor_id()); - - WARN_ON(!irqs_disabled()); - spin_lock(&rq->lock); - __update_rq_clock(rq); - spin_unlock(&rq->lock); - rq->clock_deep_idle_events++; -} -EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event); - -/* - * We just idled delta nanoseconds (called with irqs disabled): - */ -void sched_clock_idle_wakeup_event(u64 delta_ns) -{ - struct rq *rq = cpu_rq(smp_processor_id()); - u64 now = sched_clock(); - - WARN_ON(!irqs_disabled()); - rq->idle_clock += delta_ns; - /* - * Override the previous timestamp and ignore all - * sched_clock() deltas that occured while we idled, - * and use the PM-provided delta_ns to advance the - * rq clock: - */ - spin_lock(&rq->lock); - rq->prev_clock_raw = now; - rq->clock += delta_ns; - spin_unlock(&rq->lock); - touch_softlockup_watchdog(); -} -EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); - static void __resched_task(struct task_struct *p, int tif_bit); static inline void resched_task(struct task_struct *p) @@ -1283,7 +1154,7 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer) WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); spin_lock(&rq->lock); - __update_rq_clock(rq); + update_rq_clock(rq); rq->curr->sched_class->task_tick(rq, rq->curr, 1); spin_unlock(&rq->lock); @@ -4476,19 +4347,11 @@ void scheduler_tick(void) int cpu = smp_processor_id(); struct rq *rq = cpu_rq(cpu); struct task_struct *curr = rq->curr; - u64 next_tick = rq->tick_timestamp + TICK_NSEC; + + sched_clock_tick(); spin_lock(&rq->lock); - __update_rq_clock(rq); - /* - * Let rq->clock advance by at least TICK_NSEC: - */ - if (unlikely(rq->clock < next_tick)) { - rq->clock = next_tick; - rq->clock_underflows++; - } - rq->tick_timestamp = rq->clock; - update_last_tick_seen(rq); + update_rq_clock(rq); update_cpu_load(rq); curr->sched_class->task_tick(rq, curr, 0); spin_unlock(&rq->lock); @@ -4642,7 +4505,7 @@ need_resched_nonpreemptible: * Do the rq-clock update outside the rq lock: */ local_irq_disable(); - __update_rq_clock(rq); + update_rq_clock(rq); spin_lock(&rq->lock); clear_tsk_need_resched(prev); @@ -8226,8 +8089,6 @@ void __init sched_init(void) spin_lock_init(&rq->lock); lockdep_set_class(&rq->lock, &rq->rq_lock_key); rq->nr_running = 0; - rq->clock = 1; - update_last_tick_seen(rq); init_cfs_rq(&rq->cfs, rq); init_rt_rq(&rq->rt, rq); #ifdef CONFIG_FAIR_GROUP_SCHED @@ -8371,6 +8232,7 @@ EXPORT_SYMBOL(__might_sleep); static void normalize_task(struct rq *rq, struct task_struct *p) { int on_rq; + update_rq_clock(rq); on_rq = p->se.on_rq; if (on_rq) @@ -8402,7 +8264,6 @@ void normalize_rt_tasks(void) 
p->se.sleep_start = 0; p->se.block_start = 0; #endif - task_rq(p)->clock = 0; if (!rt_task(p)) { /* diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c new file mode 100644 index 00000000000..9c597e37f7d --- /dev/null +++ b/kernel/sched_clock.c @@ -0,0 +1,236 @@ +/* + * sched_clock for unstable cpu clocks + * + * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra + * + * Based on code by: + * Ingo Molnar + * Guillaume Chazarain + * + * Create a semi stable clock from a mixture of other events, including: + * - gtod + * - jiffies + * - sched_clock() + * - explicit idle events + * + * We use gtod as base and the unstable clock deltas. The deltas are filtered, + * making it monotonic and keeping it within an expected window. This window + * is set up using jiffies. + * + * Furthermore, explicit sleep and wakeup hooks allow us to account for time + * that is otherwise invisible (TSC gets stopped). + * + * The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat + * consistent between cpus (never more than 1 jiffies difference). + */ +#include +#include +#include +#include +#include + + +#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK + +struct sched_clock_data { + /* + * Raw spinlock - this is a special case: this might be called + * from within instrumentation code so we dont want to do any + * instrumentation ourselves. + */ + raw_spinlock_t lock; + + unsigned long prev_jiffies; + u64 prev_raw; + u64 tick_raw; + u64 tick_gtod; + u64 clock; +}; + +static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data); + +static inline struct sched_clock_data *this_scd(void) +{ + return &__get_cpu_var(sched_clock_data); +} + +static inline struct sched_clock_data *cpu_sdc(int cpu) +{ + return &per_cpu(sched_clock_data, cpu); +} + +void sched_clock_init(void) +{ + u64 ktime_now = ktime_to_ns(ktime_get()); + u64 now = 0; + int cpu; + + for_each_possible_cpu(cpu) { + struct sched_clock_data *scd = cpu_sdc(cpu); + + scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; + scd->prev_jiffies = jiffies; + scd->prev_raw = now; + scd->tick_raw = now; + scd->tick_gtod = ktime_now; + scd->clock = ktime_now; + } +} + +/* + * update the percpu scd from the raw @now value + * + * - filter out backward motion + * - use jiffies to generate a min,max window to clip the raw values + */ +static void __update_sched_clock(struct sched_clock_data *scd, u64 now) +{ + unsigned long now_jiffies = jiffies; + long delta_jiffies = now_jiffies - scd->prev_jiffies; + u64 clock = scd->clock; + u64 min_clock, max_clock; + s64 delta = now - scd->prev_raw; + + WARN_ON_ONCE(!irqs_disabled()); + min_clock = scd->tick_gtod + delta_jiffies * TICK_NSEC; + + if (unlikely(delta < 0)) { + clock++; + goto out; + } + + max_clock = min_clock + TICK_NSEC; + + if (unlikely(clock + delta > max_clock)) { + if (clock < max_clock) + clock = max_clock; + else + clock++; + } else { + clock += delta; + } + + out: + if (unlikely(clock < min_clock)) + clock = min_clock; + + scd->prev_raw = now; + scd->prev_jiffies = now_jiffies; + scd->clock = clock; +} + +static void lock_double_clock(struct sched_clock_data *data1, + struct sched_clock_data *data2) +{ + if (data1 < data2) { + __raw_spin_lock(&data1->lock); + __raw_spin_lock(&data2->lock); + } else { + __raw_spin_lock(&data2->lock); + __raw_spin_lock(&data1->lock); + } +} + +u64 sched_clock_cpu(int cpu) +{ + struct sched_clock_data *scd = cpu_sdc(cpu); + u64 now, clock; + + WARN_ON_ONCE(!irqs_disabled()); + now = sched_clock(); + + if (cpu != raw_smp_processor_id()) { + /* + * 
in order to update a remote cpu's clock based on our + * unstable raw time rebase it against: + * tick_raw (offset between raw counters) + * tick_gotd (tick offset between cpus) + */ + struct sched_clock_data *my_scd = this_scd(); + + lock_double_clock(scd, my_scd); + + now -= my_scd->tick_raw; + now += scd->tick_raw; + + now -= my_scd->tick_gtod; + now += scd->tick_gtod; + + __raw_spin_unlock(&my_scd->lock); + } else { + __raw_spin_lock(&scd->lock); + } + + __update_sched_clock(scd, now); + clock = scd->clock; + + __raw_spin_unlock(&scd->lock); + + return clock; +} + +void sched_clock_tick(void) +{ + struct sched_clock_data *scd = this_scd(); + u64 now, now_gtod; + + WARN_ON_ONCE(!irqs_disabled()); + + now = sched_clock(); + now_gtod = ktime_to_ns(ktime_get()); + + __raw_spin_lock(&scd->lock); + __update_sched_clock(scd, now); + /* + * update tick_gtod after __update_sched_clock() because that will + * already observe 1 new jiffy; adding a new tick_gtod to that would + * increase the clock 2 jiffies. + */ + scd->tick_raw = now; + scd->tick_gtod = now_gtod; + __raw_spin_unlock(&scd->lock); +} + +/* + * We are going deep-idle (irqs are disabled): + */ +void sched_clock_idle_sleep_event(void) +{ + sched_clock_cpu(smp_processor_id()); +} +EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event); + +/* + * We just idled delta nanoseconds (called with irqs disabled): + */ +void sched_clock_idle_wakeup_event(u64 delta_ns) +{ + struct sched_clock_data *scd = this_scd(); + u64 now = sched_clock(); + + /* + * Override the previous timestamp and ignore all + * sched_clock() deltas that occured while we idled, + * and use the PM-provided delta_ns to advance the + * rq clock: + */ + __raw_spin_lock(&scd->lock); + scd->prev_raw = now; + scd->clock += delta_ns; + __raw_spin_unlock(&scd->lock); + + touch_softlockup_watchdog(); +} +EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); + +#endif + +/* + * Scheduler clock - returns current time in nanosec units. + * This is default implementation. + * Architectures and sub-architectures can override this. + */ +unsigned long long __attribute__((weak)) sched_clock(void) +{ + return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ); +} diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 6b4a12558e8..5f06118fbc3 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -204,13 +204,6 @@ static void print_cpu(struct seq_file *m, int cpu) PN(next_balance); P(curr->pid); PN(clock); - PN(idle_clock); - PN(prev_clock_raw); - P(clock_warps); - P(clock_overflows); - P(clock_underflows); - P(clock_deep_idle_events); - PN(clock_max_delta); P(cpu_load[0]); P(cpu_load[1]); P(cpu_load[2]); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index d99e01f6929..c863663d204 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -959,7 +959,7 @@ static void yield_task_fair(struct rq *rq) return; if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) { - __update_rq_clock(rq); + update_rq_clock(rq); /* * Update run-time statistics of the 'current'. */ -- cgit v1.2.3