From 597d0275736dad9c3bda6f0a00a1c477dc0f37b1 Mon Sep 17 00:00:00 2001 From: Arun R Bharadwaj <arun@linux.vnet.ibm.com> Date: Thu, 16 Apr 2009 12:13:26 +0530 Subject: timers: Framework for identifying pinned timers * Arun R Bharadwaj <arun@linux.vnet.ibm.com> [2009-04-16 12:11:36]: This patch creates a new framework for identifying cpu-pinned timers and hrtimers. This framework is needed because pinned timers are expected to fire on the same CPU on which they are queued. So it is essential to identify these and not migrate them, in case there are any. For regular timers, the currently existing add_timer_on() can be used queue pinned timers and subsequently mod_timer_pinned() can be used to modify the 'expires' field. For hrtimers, new modes HRTIMER_ABS_PINNED and HRTIMER_REL_PINNED are added to queue cpu-pinned hrtimer. [ tglx: use .._PINNED mode argument instead of creating tons of new functions ] Signed-off-by: Arun R Bharadwaj <arun@linux.vnet.ibm.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- include/linux/hrtimer.h | 7 +++++-- include/linux/timer.h | 3 +++ 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 0d2f7c8a33d6..7400900de94a 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -30,8 +30,11 @@ struct hrtimer_cpu_base; * Mode arguments of xxx_hrtimer functions: */ enum hrtimer_mode { - HRTIMER_MODE_ABS, /* Time value is absolute */ - HRTIMER_MODE_REL, /* Time value is relative to now */ + HRTIMER_MODE_ABS = 0x0, /* Time value is absolute */ + HRTIMER_MODE_REL = 0x1, /* Time value is relative to now */ + HRTIMER_MODE_PINNED = 0x02, /* Timer is bound to CPU */ + HRTIMER_MODE_ABS_PINNED = 0x02, + HRTIMER_MODE_REL_PINNED = 0x03, }; /* diff --git a/include/linux/timer.h b/include/linux/timer.h index 6cdb6f3331f1..ccf882eed8f8 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -163,7 +163,10 @@ extern void add_timer_on(struct timer_list *timer, int cpu); extern int del_timer(struct timer_list * timer); extern int mod_timer(struct timer_list *timer, unsigned long expires); extern int mod_timer_pending(struct timer_list *timer, unsigned long expires); +extern int mod_timer_pinned(struct timer_list *timer, unsigned long expires); +#define TIMER_NOT_PINNED 0 +#define TIMER_PINNED 1 /* * The jiffies value which is added to now, when there is no timer * in the timer wheel: -- cgit v1.2.3 From cd1bb94b4a0531e8211a3774f17de831f8285f76 Mon Sep 17 00:00:00 2001 From: Arun R Bharadwaj <arun@linux.vnet.ibm.com> Date: Thu, 16 Apr 2009 12:15:34 +0530 Subject: timers: /proc/sys sysctl hook to enable timer migration * Arun R Bharadwaj <arun@linux.vnet.ibm.com> [2009-04-16 12:11:36]: This patch creates the /proc/sys sysctl interface at /proc/sys/kernel/timer_migration Timer migration is enabled by default. To disable timer migration, when CONFIG_SCHED_DEBUG = y, echo 0 > /proc/sys/kernel/timer_migration Signed-off-by: Arun R Bharadwaj <arun@linux.vnet.ibm.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- include/linux/sched.h | 1 + kernel/sched.c | 2 ++ kernel/sysctl.c | 8 ++++++++ 3 files changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index b4c38bc8049c..618504010400 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1766,6 +1766,7 @@ extern unsigned int sysctl_sched_child_runs_first; extern unsigned int sysctl_sched_features; extern unsigned int sysctl_sched_migration_cost; extern unsigned int sysctl_sched_nr_migrate; +extern unsigned int sysctl_timer_migration; int sched_nr_latency_handler(struct ctl_table *table, int write, struct file *file, void __user *buffer, size_t *length, diff --git a/kernel/sched.c b/kernel/sched.c index 9c5b4d3f97ab..7f1dd56af863 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8731,6 +8731,8 @@ void __init sched_init_smp(void) } #endif /* CONFIG_SMP */ +const_debug unsigned int sysctl_timer_migration = 1; + int in_sched_functions(unsigned long addr) { return in_lock_functions(addr) || diff --git a/kernel/sysctl.c b/kernel/sysctl.c index e3d2c7dd59b9..b3ce58137303 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -324,6 +324,14 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "timer_migration", + .data = &sysctl_timer_migration, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, #endif { .ctl_name = CTL_UNNUMBERED, -- cgit v1.2.3 From eea08f32adb3f97553d49a4f79a119833036000a Mon Sep 17 00:00:00 2001 From: Arun R Bharadwaj <arun@linux.vnet.ibm.com> Date: Thu, 16 Apr 2009 12:16:41 +0530 Subject: timers: Logic to move non pinned timers * Arun R Bharadwaj <arun@linux.vnet.ibm.com> [2009-04-16 12:11:36]: This patch migrates all non pinned timers and hrtimers to the current idle load balancer, from all the idle CPUs. Timers firing on busy CPUs are not migrated. While migrating hrtimers, care should be taken to check if migrating a hrtimer would result in a latency or not. So we compare the expiry of the hrtimer with the next timer interrupt on the target cpu and migrate the hrtimer only if it expires *after* the next interrupt on the target cpu. So, added a clockevents_get_next_event() helper function to return the next_event on the target cpu's clock_event_device. [ tglx: cleanups and simplifications ] Signed-off-by: Arun R Bharadwaj <arun@linux.vnet.ibm.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- include/linux/clockchips.h | 9 ++++++++ include/linux/sched.h | 12 +++++++++++ kernel/hrtimer.c | 51 ++++++++++++++++++++++++++++++++++++++++++++-- kernel/sched.c | 5 +++++ kernel/time/clockevents.c | 12 +++++++++++ kernel/timer.c | 17 +++++++++++++--- 6 files changed, 101 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index 3a1dbba4d3ae..20a100fe2b4f 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -143,3 +143,12 @@ extern void clockevents_notify(unsigned long reason, void *arg); #endif #endif + +#ifdef CONFIG_GENERIC_CLOCKEVENTS +extern ktime_t clockevents_get_next_event(int cpu); +#else +static inline ktime_t clockevents_get_next_event(int cpu) +{ + return (ktime_t) { .tv64 = KTIME_MAX }; +} +#endif diff --git a/include/linux/sched.h b/include/linux/sched.h index 618504010400..311dec123974 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -257,6 +257,7 @@ extern void task_rq_unlock_wait(struct task_struct *p); extern cpumask_var_t nohz_cpu_mask; #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ) extern int select_nohz_load_balancer(int cpu); +extern int get_nohz_load_balancer(void); #else static inline int select_nohz_load_balancer(int cpu) { @@ -1772,6 +1773,17 @@ int sched_nr_latency_handler(struct ctl_table *table, int write, struct file *file, void __user *buffer, size_t *length, loff_t *ppos); #endif +#ifdef CONFIG_SCHED_DEBUG +static inline unsigned int get_sysctl_timer_migration(void) +{ + return sysctl_timer_migration; +} +#else +static inline unsigned int get_sysctl_timer_migration(void) +{ + return 1; +} +#endif extern unsigned int sysctl_sched_rt_period; extern int sysctl_sched_rt_runtime; diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index c71bcd549241..b675a67c9ac3 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -43,6 +43,8 @@ #include <linux/seq_file.h> #include <linux/err.h> #include <linux/debugobjects.h> +#include <linux/sched.h> +#include <linux/timer.h> #include <asm/uaccess.h> @@ -198,8 +200,19 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, { struct hrtimer_clock_base *new_base; struct hrtimer_cpu_base *new_cpu_base; + int cpu, preferred_cpu = -1; + + cpu = smp_processor_id(); +#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) + if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) { + preferred_cpu = get_nohz_load_balancer(); + if (preferred_cpu >= 0) + cpu = preferred_cpu; + } +#endif - new_cpu_base = &__get_cpu_var(hrtimer_bases); +again: + new_cpu_base = &per_cpu(hrtimer_bases, cpu); new_base = &new_cpu_base->clock_base[base->index]; if (base != new_base) { @@ -219,6 +232,40 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, timer->base = NULL; spin_unlock(&base->cpu_base->lock); spin_lock(&new_base->cpu_base->lock); + + /* Optimized away for NOHZ=n SMP=n */ + if (cpu == preferred_cpu) { + /* Calculate clock monotonic expiry time */ +#ifdef CONFIG_HIGH_RES_TIMERS + ktime_t expires = ktime_sub(hrtimer_get_expires(timer), + new_base->offset); +#else + ktime_t expires = hrtimer_get_expires(timer); +#endif + + /* + * Get the next event on target cpu from the + * clock events layer. + * This covers the highres=off nohz=on case as well. + */ + ktime_t next = clockevents_get_next_event(cpu); + + ktime_t delta = ktime_sub(expires, next); + + /* + * We do not migrate the timer when it is expiring + * before the next event on the target cpu because + * we cannot reprogram the target cpu hardware and + * we would cause it to fire late. + */ + if (delta.tv64 < 0) { + cpu = smp_processor_id(); + spin_unlock(&new_base->cpu_base->lock); + spin_lock(&base->cpu_base->lock); + timer->base = base; + goto again; + } + } timer->base = new_base; } return new_base; @@ -236,7 +283,7 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) return base; } -# define switch_hrtimer_base(t, b) (b) +# define switch_hrtimer_base(t, b, p) (b) #endif /* !CONFIG_SMP */ diff --git a/kernel/sched.c b/kernel/sched.c index 7f1dd56af863..9fe3774a0fd3 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4244,6 +4244,11 @@ static struct { .load_balancer = ATOMIC_INIT(-1), }; +int get_nohz_load_balancer(void) +{ + return atomic_read(&nohz.load_balancer); +} + /* * This routine will try to nominate the ilb (idle load balancing) * owner among the cpus whose ticks are stopped. ilb owner will do the idle diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index d13be216a790..ab20ded013bd 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -18,6 +18,7 @@ #include <linux/notifier.h> #include <linux/smp.h> #include <linux/sysdev.h> +#include <linux/tick.h> /* The registered clock event devices */ static LIST_HEAD(clockevent_devices); @@ -251,4 +252,15 @@ void clockevents_notify(unsigned long reason, void *arg) spin_unlock(&clockevents_lock); } EXPORT_SYMBOL_GPL(clockevents_notify); + +ktime_t clockevents_get_next_event(int cpu) +{ + struct tick_device *td; + struct clock_event_device *dev; + + td = &per_cpu(tick_cpu_device, cpu); + dev = td->evtdev; + + return dev->next_event; +} #endif diff --git a/kernel/timer.c b/kernel/timer.c index 3424dfd11d50..3f841db5edf9 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -37,6 +37,7 @@ #include <linux/delay.h> #include <linux/tick.h> #include <linux/kallsyms.h> +#include <linux/sched.h> #include <asm/uaccess.h> #include <asm/unistd.h> @@ -609,9 +610,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, { struct tvec_base *base, *new_base; unsigned long flags; - int ret; - - ret = 0; + int ret = 0 , cpu; timer_stats_timer_set_start_info(timer); BUG_ON(!timer->function); @@ -630,6 +629,18 @@ __mod_timer(struct timer_list *timer, unsigned long expires, new_base = __get_cpu_var(tvec_bases); + cpu = smp_processor_id(); + +#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) + if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) { + int preferred_cpu = get_nohz_load_balancer(); + + if (preferred_cpu >= 0) + cpu = preferred_cpu; + } +#endif + new_base = per_cpu(tvec_bases, cpu); + if (base != new_base) { /* * We are trying to schedule the timer on the local CPU. -- cgit v1.2.3